diff --git a/CMakeLists.txt b/CMakeLists.txt
index b57e6087..a63c4902 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,21 +73,18 @@
 option(KAMINPAR_COMPRESSION_EDGE_WEIGHTS "Whether to compress edge weights." ON)
 option(KAMINPAR_COMPRESSION_HIGH_DEGREE_ENCODING "Use high-degree encoding for the compressed graph." ON)
 option(KAMINPAR_COMPRESSION_INTERVAL_ENCODING "Use interval encoding for the compressed graph." ON)
 option(KAMINPAR_COMPRESSION_RUN_LENGTH_ENCODING "Use run-length encoding for the compressed graph." OFF)
-option(KAMINPAR_COMPRESSION_STREAM_ENCODING "Use stream encoding for the compressed graph." OFF)
+option(KAMINPAR_COMPRESSION_STREAMVBYTE_ENCODING "Use StreamVByte encoding for the compressed graph." OFF)
 option(KAMINPAR_COMPRESSION_FAST_DECODING "Use fast decoding for the compressed graph." OFF)
-option(KAMINPAR_COMPRESSION_ISOLATED_NODES_SEPARATION "Whether all isolated nodes are the last nodes of the input graph" OFF)
 
 if (KAMINPAR_COMPRESSION_RUN_LENGTH_ENCODING AND KAMINPAR_COMPRESSION_STREAM_ENCODING)
-  message(FATAL_ERROR "Either run-length or stream encoding can be used for varints but not both.")
+  message(FATAL_ERROR "Either run-length or StreamVByte encoding can be used for varints but not both.")
 endif ()
 
 if (KAMINPAR_64BIT_NODE_IDS AND KAMINPAR_COMPRESSION_STREAM_ENCODING)
-  message(FATAL_ERROR "Stream encoding cannot be used with 64-bit NodeIDs.")
+  message(FATAL_ERROR "StreamVByte encoding cannot be used with 64-bit NodeIDs.")
 endif ()
 
-if (KAMINPAR_COMPRESSION_EDGE_WEIGHTS AND KAMINPAR_COMPRESSION_STREAM_ENCODING)
-  message(FATAL_ERROR "Stream encoding cannot be used together with compressed edge weights.")
-elseif (KAMINPAR_COMPRESSION_EDGE_WEIGHTS AND KAMINPAR_COMPRESSION_RUN_LENGTH_ENCODING)
+if (KAMINPAR_COMPRESSION_EDGE_WEIGHTS AND KAMINPAR_COMPRESSION_RUN_LENGTH_ENCODING)
   message(FATAL_ERROR "Run-length encoding cannot be used together with compressed edge weights.")
 endif ()
 
@@ -245,11 +242,11 @@
 else ()
   message(" Run-length encoding: disabled")
 endif ()
 
-if (KAMINPAR_COMPRESSION_STREAM_ENCODING)
-  list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_STREAM_ENCODING")
-  message(" Stream encoding: enabled")
+if (KAMINPAR_COMPRESSION_STREAMVBYTE_ENCODING)
+  list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_STREAMVBYTE_ENCODING")
+  message(" StreamVByte encoding: enabled")
 else ()
-  message(" Stream encoding: disabled")
+  message(" StreamVByte encoding: disabled")
 endif ()
 
 if (KAMINPAR_COMPRESSION_FAST_DECODING)
@@ -260,13 +257,6 @@
 else ()
   message(" Fast decoding: disabled")
 endif ()
 
-if (KAMINPAR_COMPRESSION_ISOLATED_NODES_SEPARATION)
-  list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_ISOLATED_NODES_SEPARATION")
-  message(" Isolated nodes separation: enabled")
-else ()
-  message(" Isolated nodes separation: disabled")
-endif ()
-
 if (KAMINPAR_64BIT_NODE_IDS OR KAMINPAR_64BIT_IDS)
   list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_64BIT_NODE_IDS")
   set(KAMINPAR_SHM_NODE_ID_STR "std::uint64_t")
diff --git a/apps/benchmarks/shm_variable_length_codec_benchmark.cc b/apps/benchmarks/shm_variable_length_codec_benchmark.cc
index 746adc97..3bbc858a 100644
--- a/apps/benchmarks/shm_variable_length_codec_benchmark.cc
+++ b/apps/benchmarks/shm_variable_length_codec_benchmark.cc
@@ -13,9 +13,9 @@
 #include "kaminpar-cli/CLI11.h"
 #include "kaminpar-common/console_io.h"
-#include "kaminpar-common/graph-compression/varint_codec.h"
-#include "kaminpar-common/graph-compression/varint_run_length_codec.h"
-#include
"kaminpar-common/graph-compression/varint_stream_codec.h" +#include "kaminpar-common/graph-compression/streamvbyte.h" +#include "kaminpar-common/graph-compression/varint.h" +#include "kaminpar-common/graph-compression/varint_rle.h" #include "kaminpar-common/logger.h" #include "kaminpar-common/timer.h" @@ -112,7 +112,7 @@ sv_encode_values(std::string_view name, const std::size_t count, Lambda &&l) { auto encoded_values = std::make_unique(count * sizeof(Int) + count); TIMED_SCOPE(name) { - VarIntStreamEncoder encoder(encoded_values.get(), count); + streamvbyte::StreamVByteEncoder encoder(count, encoded_values.get()); for (std::size_t i = 0; i < count; ++i) { const std::size_t bytes_written = encoder.add(l(i)); @@ -218,9 +218,7 @@ void benchmark( SCOPED_TIMER(name); for (std::size_t i = 0; i < count; ++i) { - const auto [value, bytes_decoded] = l(values_ptr); - values_ptr += bytes_decoded; - + const auto value = l(&values_ptr); do_not_optimize(value); } } @@ -229,7 +227,7 @@ template void benchmark_rle(std::string_view name, const std::size_t count, const std::uint8_t *values_ptr) { SCOPED_TIMER(name); - VarIntRunLengthDecoder decoder(values_ptr, count); + VarIntRunLengthDecoder decoder(count, values_ptr); decoder.decode([](const Int value) { do_not_optimize(value); }); } @@ -237,7 +235,7 @@ template void benchmark_sve(std::string_view name, const std::size_t count, const std::uint8_t *values_ptr) { SCOPED_TIMER(name); - VarIntStreamDecoder decoder(values_ptr, count); + streamvbyte::StreamVByteDecoder decoder(count, values_ptr); decoder.decode([](const Int value) { do_not_optimize(value); }); } @@ -299,7 +297,7 @@ template void run_benchmark(std::size_t count) { encoded_zero_values.get(), encoded_max_values.get(), encoded_random_values.get(), - [](const std::uint8_t *ptr) { return varint_decode_general(ptr); } + [](const std::uint8_t **ptr) { return varint_decode_loop(ptr); } ); benchmark( @@ -308,9 +306,10 @@ template void run_benchmark(std::size_t count) { encoded_zero_values.get(), encoded_max_values.get(), encoded_random_values.get(), - [](const std::uint8_t *ptr) { return varint_decode(ptr); } + [](const std::uint8_t **ptr) { return varint_decode_pext_unrolled(ptr); } ); + /* std::vector> random_signed_values = generate_random_values>(count); @@ -336,6 +335,7 @@ template void run_benchmark(std::size_t count) { encoded_random_signed_values.get(), [](const std::uint8_t *ptr) { return signed_varint_decode>(ptr); } ); + */ const auto [rl_encoded_zero_values, rl_encoded_max_values, rl_encoded_random_values] = rl_encode_values(count, random_values); diff --git a/apps/io/shm_compressed_graph_binary.cc b/apps/io/shm_compressed_graph_binary.cc index 7e5fcf9c..1a61caac 100644 --- a/apps/io/shm_compressed_graph_binary.cc +++ b/apps/io/shm_compressed_graph_binary.cc @@ -31,8 +31,7 @@ struct CompressedBinaryHeader { bool use_high_degree_encoding; bool use_interval_encoding; bool use_run_length_encoding; - bool use_stream_vbyte_encoding; - bool use_isolated_nodes_separation; + bool use_streamvbyte_encoding; std::uint64_t high_degree_threshold; std::uint64_t high_degree_part_length; @@ -66,8 +65,7 @@ CompressedBinaryHeader create_header(const CompressedGraph &graph) { CompressedGraph::kHighDegreeEncoding, CompressedGraph::kIntervalEncoding, CompressedGraph::kRunLengthEncoding, - CompressedGraph::kStreamEncoding, - CompressedGraph::kIsolatedNodesSeparation, + CompressedGraph::kStreamVByteEncoding, CompressedGraph::kHighDegreeThreshold, CompressedGraph::kHighDegreePartLength, @@ -91,12 +89,12 @@ template 
static void write_int(std::ofstream &out, const T id) { static void write_header(std::ofstream &out, const CompressedBinaryHeader header) { const std::uint16_t boolean_values = - (header.use_isolated_nodes_separation << 12) | (header.use_stream_vbyte_encoding << 11) | - (header.use_run_length_encoding << 10) | (header.use_interval_encoding << 9) | - (header.use_high_degree_encoding << 8) | (header.compress_edge_weights << 7) | - (header.use_degree_bucket_order << 6) | (header.has_64_bit_edge_weight << 5) | - (header.has_64_bit_node_weight << 4) | (header.has_64_bit_edge_id << 3) | - (header.has_64_bit_node_id << 2) | (header.has_edge_weights << 1) | (header.has_node_weights); + (header.use_streamvbyte_encoding << 11) | (header.use_run_length_encoding << 10) | + (header.use_interval_encoding << 9) | (header.use_high_degree_encoding << 8) | + (header.compress_edge_weights << 7) | (header.use_degree_bucket_order << 6) | + (header.has_64_bit_edge_weight << 5) | (header.has_64_bit_node_weight << 4) | + (header.has_64_bit_edge_id << 3) | (header.has_64_bit_node_id << 2) | + (header.has_edge_weights << 1) | (header.has_node_weights); write_int(out, boolean_values); write_int(out, header.high_degree_threshold); @@ -155,14 +153,14 @@ template static T read_int(std::ifstream &in) { CompressedBinaryHeader read_header(std::ifstream &in) { const auto boolean_values = read_int(in); return { - (boolean_values & 1) != 0, (boolean_values & 2) != 0, (boolean_values & 4) != 0, - (boolean_values & 8) != 0, (boolean_values & 16) != 0, (boolean_values & 32) != 0, - (boolean_values & 64) != 0, (boolean_values & 128) != 0, (boolean_values & 256) != 0, - (boolean_values & 512) != 0, (boolean_values & 1024) != 0, (boolean_values & 2048) != 0, - (boolean_values & 4096) != 0, read_int(in), read_int(in), - read_int(in), read_int(in), read_int(in), - read_int(in), read_int(in), read_int(in), - read_int(in), read_int(in), + (boolean_values & 1) != 0, (boolean_values & 2) != 0, (boolean_values & 4) != 0, + (boolean_values & 8) != 0, (boolean_values & 16) != 0, (boolean_values & 32) != 0, + (boolean_values & 64) != 0, (boolean_values & 128) != 0, (boolean_values & 256) != 0, + (boolean_values & 512) != 0, (boolean_values & 1024) != 0, (boolean_values & 2048) != 0, + read_int(in), read_int(in), read_int(in), + read_int(in), read_int(in), read_int(in), + read_int(in), read_int(in), read_int(in), + read_int(in), }; } @@ -263,8 +261,8 @@ void verify_header(const CompressedBinaryHeader header) { std::exit(1); } - if (header.use_stream_vbyte_encoding != CompressedGraph::kStreamEncoding) { - if (header.use_stream_vbyte_encoding) { + if (header.use_streamvbyte_encoding != CompressedGraph::kStreamVByteEncoding) { + if (header.use_streamvbyte_encoding) { LOG_ERROR << "The stored compressed graph uses stream encoding but this build does not."; } else { LOG_ERROR << "The stored compressed graph does not use stream encoding but this build does."; @@ -272,17 +270,6 @@ void verify_header(const CompressedBinaryHeader header) { std::exit(1); } - if (header.use_isolated_nodes_separation != CompressedGraph::kIsolatedNodesSeparation) { - if (header.use_isolated_nodes_separation) { - LOG_ERROR - << "The stored compressed graph uses isolated nodes separation but this build does not."; - } else { - LOG_ERROR << "The stored compressed graph does not use isolated nodes separation but this " - "build does."; - } - std::exit(1); - } - if (header.high_degree_threshold != CompressedGraph::kHighDegreeThreshold) { LOG_ERROR << "The stored compressed 
graph uses " << header.high_degree_threshold << " as the high degree threshold but this build uses " diff --git a/apps/io/shm_parhip_parser.cc b/apps/io/shm_parhip_parser.cc index 45d2c74a..1f6af24a 100644 --- a/apps/io/shm_parhip_parser.cc +++ b/apps/io/shm_parhip_parser.cc @@ -9,7 +9,6 @@ #include #include -#include #include #include @@ -105,7 +104,7 @@ class ParHIPHeader { (has_node_weights ? num_nodes * _node_weight_width : 0); } - [[nodiscard]] NodeID map_edge_offset(const EdgeID edge_offset) const { + [[nodiscard]] EdgeID map_edge_offset(const EdgeID edge_offset) const { return (edge_offset - _nodes_offset_base) / _node_id_width; } @@ -120,13 +119,13 @@ class ParHIPHeader { std::exit(1); } - if (has_64_bit_node_weight && sizeof(NodeWeight) == 4) { + if (has_node_weights && has_64_bit_node_weight && sizeof(NodeWeight) == 4) { LOG_ERROR << "The stored graph uses 64-Bit node weights but this build uses 32-Bit node weights."; std::exit(1); } - if (has_64_bit_edge_weight && sizeof(EdgeWeight) == 4) { + if (has_edge_weights && has_64_bit_edge_weight && sizeof(EdgeWeight) == 4) { LOG_ERROR << "The stored graph uses 64-Bit edge weights but this build uses 32-Bit edge weights."; std::exit(1); @@ -351,32 +350,29 @@ CompressedGraph compressed_read_parallel(const std::string &filename, const Node const bool sort_by_degree_bucket = ordering == NodeOrdering::DEGREE_BUCKETS; if (sort_by_degree_bucket) { - RECORD("degrees") StaticArray degrees(header.num_nodes, static_array::noinit); - TIMED_SCOPE("Read degrees") { - tbb::parallel_for(tbb::blocked_range(0, header.num_nodes), [&](const auto &r) { - for (NodeID u = r.begin(); u != r.end(); ++u) { - degrees[u] = header.map_edge_offset(node(u + 1)) - header.map_edge_offset(node(u)); - } - }); + const auto degree = [&](const NodeID u) { + return static_cast( + header.map_edge_offset(node(u + 1)) - header.map_edge_offset(node(u)) + ); }; - const auto [perm, inv_perm] = - graph::sort_by_degree_buckets(header.num_nodes, [&](const NodeID u) { - return degrees[u]; - }); - return parallel_compress( + auto [perm, inv_perm] = graph::sort_by_degree_buckets(header.num_nodes, degree); + CompressedGraph compressed_graph = parallel_compress( header.num_nodes, header.num_edges, header.has_node_weights, header.has_edge_weights, true, [&](const NodeID u) { return inv_perm[u]; }, - [&](const NodeID u) { return degrees[u]; }, + degree, [&](const NodeID u) { return header.map_edge_offset(node(u)); }, [&](const EdgeID e) { return perm[edge(e)]; }, [&](const NodeID u) { return node_weight(u); }, [&](const EdgeID e) { return edge_weight(e); } ); + + compressed_graph.set_permutation(std::move(perm)); + return compressed_graph; } else { return parallel_compress( header.num_nodes, diff --git a/apps/tools/shm_graph_attach_weights_tool.cc b/apps/tools/shm_graph_attach_weights_tool.cc index 8f9ea906..5dfcb2c0 100644 --- a/apps/tools/shm_graph_attach_weights_tool.cc +++ b/apps/tools/shm_graph_attach_weights_tool.cc @@ -35,16 +35,22 @@ namespace { enum class WeightDistribution { UNIFORM, - ALTERNATING + ALTERNATING, + EXPONENTIAL, }; [[nodiscard]] std::unordered_map get_weight_distributions() { return { {"uniform", WeightDistribution::UNIFORM}, {"alternating", WeightDistribution::ALTERNATING}, + {"exponential", WeightDistribution::EXPONENTIAL}, }; } +[[nodiscard]] int local_seed(const int cpu, const int seed) { + return seed + (cpu + 42) * 3; +} + struct EdgeHasher { using Edge = std::pair; @@ -112,8 +118,7 @@ generate_edge_weights(const CSRGraph &graph, Lambda 
&&edge_weight_generator_fact const CSRGraph &graph, const int seed, const EdgeWeight min, const EdgeWeight max ) { return generate_edge_weights(graph, [&](const int cpu, auto &&edge_weight_fetcher) { - const int local_seed = seed + cpu; - std::mt19937 gen(local_seed); + std::mt19937 gen(local_seed(seed, cpu)); std::uniform_int_distribution dist(min, max); edge_weight_fetcher([&](const EdgeID, const NodeID, const NodeID) { @@ -132,8 +137,7 @@ generate_edge_weights(const CSRGraph &graph, Lambda &&edge_weight_generator_fact const EdgeWeight max_large_weights ) { return generate_edge_weights(graph, [&](const int cpu, auto &&edge_weight_fetcher) { - const int local_seed = seed + cpu; - std::mt19937 gen(local_seed); + std::mt19937 gen(local_seed(seed, cpu)); std::uniform_int_distribution small_dist(min_small_weights, max_small_weights); std::uniform_int_distribution large_dist(min_large_weights, max_large_weights); @@ -151,6 +155,19 @@ generate_edge_weights(const CSRGraph &graph, Lambda &&edge_weight_generator_fact }); } +[[nodiscard]] StaticArray +generate_exponential_edge_weights(const CSRGraph &graph, const int seed, const double lambda) { + return generate_edge_weights(graph, [&](const int cpu, auto &&edge_weight_fetcher) { + std::mt19937 gen(local_seed(seed, cpu)); + std::exponential_distribution dist(lambda); + + edge_weight_fetcher([&](const EdgeID e, const NodeID, const NodeID) { + const EdgeWeight weight = static_cast(dist(gen)) + 1; + return weight; + }); + }); +} + }; // namespace int main(int argc, char *argv[]) { @@ -186,12 +203,13 @@ int main(int argc, char *argv[]) { ->transform(CLI::CheckedTransformer(get_weight_distributions()).description("")) ->description(R"(Distribution used for generating edge weights: - uniform - - alternating)") + - alternating + - exponential)") ->required() ->capture_default_str(); EdgeWeight uniform_min_weight = 1; - EdgeWeight uniform_max_weight = 32768; + EdgeWeight uniform_max_weight = 65536; auto *uniform_group = app.add_option_group("Uniform Distribution"); uniform_group->add_option("--u-min", uniform_min_weight, "Minimum weight value.") ->capture_default_str(); @@ -199,10 +217,10 @@ int main(int argc, char *argv[]) { ->capture_default_str(); EdgeWeight alt_min_small_weights = 1; - EdgeWeight alt_max_small_weights = 128; - EdgeWeight alt_min_large_weights = 32768; - EdgeWeight alt_max_large_weights = 8388608; - auto *alt_group = app.add_option_group("Uniform Distribution"); + EdgeWeight alt_max_small_weights = 1; + EdgeWeight alt_min_large_weights = 65536; + EdgeWeight alt_max_large_weights = 65536; + auto *alt_group = app.add_option_group("Alternating Distribution"); alt_group ->add_option("--a-min-small", alt_min_small_weights, "Minimum weight value of small weights.") ->capture_default_str(); @@ -216,6 +234,10 @@ int main(int argc, char *argv[]) { ->add_option("--a-max-large", alt_max_large_weights, "Maximum weight value of large weights.") ->capture_default_str(); + double lambda = 0.0001; + auto *exp_group = app.add_option_group("Exponential Distribution"); + exp_group->add_option("--e-lambda", lambda, "Rate parameter.")->capture_default_str(); + CLI11_PARSE(app, argc, argv); tbb::global_control gc(tbb::global_control::max_allowed_parallelism, num_threads); @@ -238,6 +260,8 @@ int main(int argc, char *argv[]) { alt_min_large_weights, alt_max_large_weights ); + case WeightDistribution::EXPONENTIAL: + return generate_exponential_edge_weights(csr_graph, seed, lambda); default: __builtin_unreachable(); } diff --git 
a/kaminpar-cli/kaminpar_arguments.cc b/kaminpar-cli/kaminpar_arguments.cc index b653dfaf..de421cdf 100644 --- a/kaminpar-cli/kaminpar_arguments.cc +++ b/kaminpar-cli/kaminpar_arguments.cc @@ -205,6 +205,16 @@ CLI::Option_group *create_lp_coarsening_options(CLI::App *app, Context &ctx) { ) ->capture_default_str(); + lp->add_option("--c-lp-tie-breaking-strategy", ctx.coarsening.clustering.lp.tie_breaking_strategy) + ->transform(CLI::CheckedTransformer(get_tie_breaking_strategies()).description("")) + ->description( + R"(Determines the tie breaking strategy. +Options are: + - geometric: Prefer nodes with same rating located at the end of a neighborhood + - uniform: Select nodes with same rating uniformly at random + )" + ) + ->capture_default_str(); lp->add_option( "--c-lp-cluster-weights-struct", ctx.coarsening.clustering.lp.cluster_weights_structure ) @@ -322,6 +332,13 @@ Options are: "The fraction of the total edges with which to fill the edge buffer" ) ->capture_default_str(); + contraction + ->add_option( + "--c-con-use-growing-hash-tables", + ctx.coarsening.contraction.use_growing_hash_tables, + "Whether to use growing hash tables to collect coarse edges (only for unbuffered mode)" + ) + ->capture_default_str(); return contraction; } @@ -396,6 +413,16 @@ Options are: ) ->capture_default_str(); + lp->add_option("--r-lp-tie-breaking-strategy", ctx.refinement.lp.tie_breaking_strategy) + ->transform(CLI::CheckedTransformer(get_tie_breaking_strategies()).description("")) + ->description( + R"(Determines the tie breaking strategy. +Options are: + - geometric: Prefer nodes with same rating located at the end of a neighborhood + - uniform: Select nodes with same rating uniformly at random + )" + ) + ->capture_default_str(); lp->add_option( "--r-lp-second-phase-selection-strategy", ctx.refinement.lp.second_phase_selection_strategy ) diff --git a/kaminpar-common/datastructures/bitvector_rank.h b/kaminpar-common/datastructures/bitvector_rank.h index b3403909..6378e42f 100644 --- a/kaminpar-common/datastructures/bitvector_rank.h +++ b/kaminpar-common/datastructures/bitvector_rank.h @@ -64,6 +64,16 @@ class RankCombinedBitVector { } public: + /*! + * Constructs an empty bit vector. + */ + explicit RankCombinedBitVector() + : _length(0), + _num_blocks(0), + _data(0), + _num_superblocks(0), + _superblock_data(0) {} + /*! * Constructs an uninitialized bit vector. * diff --git a/kaminpar-common/datastructures/compact_static_array.h b/kaminpar-common/datastructures/compact_static_array.h index bd7136eb..898775e6 100644 --- a/kaminpar-common/datastructures/compact_static_array.h +++ b/kaminpar-common/datastructures/compact_static_array.h @@ -37,7 +37,7 @@ template class CompactStaticArray { using difference_type = std::ptrdiff_t; CompactStaticArrayIterator( - const std::uint8_t byte_width, const Int read_mask, const std::uint8_t *data + const std::size_t byte_width, const Int read_mask, const std::uint8_t *data ) : _byte_width(byte_width), _mask(read_mask), @@ -125,7 +125,7 @@ template class CompactStaticArray { } private: - const std::uint8_t _byte_width; + const std::size_t _byte_width; const Int _mask; const std::uint8_t *_data; }; @@ -141,7 +141,12 @@ template class CompactStaticArray { /*! * Constructs an unitialized CompactStaticArray. 
*/ - CompactStaticArray() : _byte_width(0), _size(0), _unrestricted_size(0), _num_values(0) { + CompactStaticArray() + : _byte_width(0), + _size(0), + _num_values(0), + _unrestricted_size(0), + _unrestricted_num_values(0) { RECORD_DATA_STRUCT(0, _struct); } @@ -151,7 +156,7 @@ template class CompactStaticArray { * @param byte_width The number of bytes needed to store the largest integer in the array. * @param size num_values number of values to store. */ - CompactStaticArray(const std::uint8_t byte_width, const std::size_t num_values) { + CompactStaticArray(const std::size_t byte_width, const std::size_t num_values) { RECORD_DATA_STRUCT(0, _struct); resize(byte_width, num_values); } @@ -164,17 +169,18 @@ template class CompactStaticArray { * @param data The pointer to the memory location where the data is compactly stored. */ CompactStaticArray( - const std::uint8_t byte_width, + const std::size_t byte_width, const std::size_t actual_size, std::unique_ptr data ) : _byte_width(byte_width), _size(actual_size), - _unrestricted_size(actual_size), _num_values((_size - (sizeof(Int) - _byte_width)) / _byte_width), _values(std::move(data)), - _read_mask(std::numeric_limits::max() << (byte_width * 8)), - _write_mask(std::numeric_limits::max() << (byte_width * 8)) { + _read_mask(std::numeric_limits::max() >> ((sizeof(Int) - byte_width) * 8)), + _write_mask(std::numeric_limits::max() << (byte_width * 8)), + _unrestricted_size(_size), + _unrestricted_num_values(_num_values) { RECORD_DATA_STRUCT(0, _struct); KASSERT(actual_size >= sizeof(Int) - _byte_width); KASSERT(byte_width >= 1); @@ -193,13 +199,12 @@ template class CompactStaticArray { * @param byte_width The number of bytes needed to store the largest integer in the array. * @param num_values The number of values to store. */ - void resize(const std::uint8_t byte_width, const std::size_t num_values) { + void resize(const std::size_t byte_width, const std::size_t num_values) { KASSERT(byte_width >= 1); KASSERT(byte_width <= 8); _byte_width = byte_width; _size = num_values * byte_width + sizeof(Int) - byte_width; - _unrestricted_size = _size; _num_values = num_values; _values = std::make_unique(_size); @@ -207,6 +212,9 @@ template class CompactStaticArray { _read_mask = std::numeric_limits::max() >> ((sizeof(Int) - byte_width) * 8); _write_mask = std::numeric_limits::max() << (byte_width * 8); + _unrestricted_size = _size; + _unrestricted_num_values = num_values; + IF_HEAP_PROFILING(_struct->size = std::max(_struct->size, _size)); } @@ -218,10 +226,11 @@ template class CompactStaticArray { void restrict(const std::size_t new_num_values) { KASSERT(new_num_values <= _num_values); - _num_values = new_num_values; - _unrestricted_size = _size; _size = new_num_values * _byte_width + sizeof(Int) - _byte_width; + + _unrestricted_num_values = _num_values; + _num_values = new_num_values; } /*! @@ -230,6 +239,7 @@ template class CompactStaticArray { */ void unrestrict() { _size = _unrestricted_size; + _num_values = _unrestricted_num_values; } /*! @@ -238,12 +248,15 @@ template class CompactStaticArray { * @param pos The position in the array at which the integer is to be stored. * @param value The value to store. 
*/ - void write(const std::size_t pos, const Int value) { + void write(const std::size_t pos, Int value) { KASSERT(pos < _num_values); - KASSERT(math::byte_width(value) <= _byte_width); + KASSERT(math::byte_width(value) <= _byte_width); - Int *data = reinterpret_cast(_values.get() + pos * _byte_width); - *data = value | (*data & _write_mask); + std::uint8_t *data = _values.get() + pos * _byte_width; + for (std::size_t i = 0; i < _byte_width; ++i) { + *data++ = value & 0b11111111; + value >>= 8; + } } /*! @@ -322,9 +335,8 @@ template class CompactStaticArray { } private: - std::uint8_t _byte_width; + std::size_t _byte_width; std::size_t _size; - std::size_t _unrestricted_size; std::size_t _num_values; std::unique_ptr _values; @@ -332,6 +344,9 @@ template class CompactStaticArray { Int _read_mask; Int _write_mask; + std::size_t _unrestricted_size; + std::size_t _unrestricted_num_values; + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; diff --git a/kaminpar-common/datastructures/concurrent_fast_reset_array.h b/kaminpar-common/datastructures/concurrent_fast_reset_array.h index c0d97292..c56cd832 100644 --- a/kaminpar-common/datastructures/concurrent_fast_reset_array.h +++ b/kaminpar-common/datastructures/concurrent_fast_reset_array.h @@ -18,6 +18,7 @@ #include "kaminpar-common/datastructures/static_array.h" #include "kaminpar-common/heap_profiler.h" #include "kaminpar-common/parallel/aligned_element.h" +#include "kaminpar-common/ranges.h" namespace kaminpar { @@ -102,13 +103,25 @@ template class ConcurrentFastReset */ template void iterate_and_reset(Lambda &&l) { tbb::parallel_for(0, _used_entries_tls.size(), [&](const auto i) { - l(i, _used_entries_tls[i]); + auto &local_used_entries = _used_entries_tls[i].vec; + if (local_used_entries.empty()) { + return; + } - for (const size_type pos : _used_entries_tls[i]) { + auto local_entries = TransformedIotaRange( + static_cast(0), + local_used_entries.size(), + [this, &local_used_entries](const std::size_t j) { + const std::size_t pos = local_used_entries[j]; + return std::make_pair(pos, _data[pos]); + } + ); + l(i, local_entries); + + for (const size_type pos : local_used_entries) { _data[pos] = Value(); } - - _used_entries_tls[i].clear(); + local_used_entries.clear(); }); } diff --git a/kaminpar-common/datastructures/dynamic_map.h b/kaminpar-common/datastructures/dynamic_map.h index 9425d702..bd0e94ae 100644 --- a/kaminpar-common/datastructures/dynamic_map.h +++ b/kaminpar-common/datastructures/dynamic_map.h @@ -5,11 +5,14 @@ #include #include #include -#include -#include + +#include "kaminpar-common/datastructures/scalable_vector.h" +#include "kaminpar-common/parallel/tbb_malloc.h" namespace kaminpar { template class DynamicMapBase { + static constexpr std::size_t kTHPThreshold = 1024 * 1024 * 16; + public: DynamicMapBase(const DynamicMapBase &) = delete; DynamicMapBase &operator=(const DynamicMapBase &other) = delete; @@ -82,8 +85,8 @@ template class DynamicMapBase { _size = 0; _capacity = align_to_next_power_of_two(capacity); - const size_t alloc_size = static_cast(this)->size_in_bytes_impl(); - _data = std::make_unique(alloc_size); + const std::size_t alloc_size = static_cast(this)->size_in_bytes_impl(); + _data = parallel::make_unique(alloc_size, alloc_size >= kTHPThreshold); std::memset(_data.get(), 0, alloc_size); static_cast(this)->initialize_impl(); @@ -98,7 +101,7 @@ template class DynamicMapBase { const std::size_t old_size = _size; const std::size_t old_capacity = _capacity; const std::size_t new_capacity = 2UL * 
_capacity; - const std::unique_ptr old_data = std::move(_data); + const parallel::tbb_unique_ptr old_data = std::move(_data); const std::uint8_t *old_data_begin = old_data.get(); initialize(new_capacity); @@ -118,7 +121,7 @@ template class DynamicMapBase { std::size_t _capacity = 0; std::size_t _size = 0; - std::unique_ptr _data = nullptr; + parallel::tbb_unique_ptr _data = nullptr; }; template @@ -215,18 +218,18 @@ class DynamicFlatMap final : public DynamicMapBase +template class DynamicRememberingFlatMap final - : public DynamicMapBase> { - using Base = DynamicMapBase>; + : public DynamicMapBase> { + using Base = DynamicMapBase>; using Base::INVALID_POS_MASK; friend Base; struct MapElement { + Timestamp timestamp; Key key; Value value; - std::size_t timestamp; }; public: @@ -243,15 +246,17 @@ class DynamicRememberingFlatMap final ~DynamicRememberingFlatMap() = default; template void for_each(Lambda &&lambda) const { - for (const std::size_t pos : _positions) { - lambda(_elements[pos].key, _elements[pos].value); + for (const std::size_t pos : _used_elements) { + const MapElement element = _elements[pos]; + lambda(element.key, element.value); } } [[nodiscard]] auto entries() const { return TransformedIotaRange(static_cast(0), _size, [this](const std::size_t i) { - const std::size_t pos = _positions[i]; - return std::make_pair(_elements[pos].key, _elements[pos].value); + const std::size_t pos = _used_elements[i]; + const MapElement element = _elements[pos]; + return std::make_pair(element.key, element.value); }); } @@ -262,8 +267,10 @@ class DynamicRememberingFlatMap final std::size_t find_impl(const Key key) const { std::size_t hash = key & (_capacity - 1); - while (_elements[hash].timestamp == _timestamp) { - if (_elements[hash].key == key) { + + MapElement element; + while ((element = _elements[hash]).timestamp == _timestamp) { + if (element.key == key) { return hash; } hash = (hash + 1) & (_capacity - 1); @@ -277,15 +284,14 @@ class DynamicRememberingFlatMap final Value &add_element_impl(Key key, Value value, const std::size_t pos) { _size++; - _positions.push_back(pos); + _used_elements.push_back(pos); - _elements[pos] = MapElement{key, value, _timestamp}; + _elements[pos] = MapElement{_timestamp, key, value}; return _elements[pos].value; } void initialize_impl() { _elements = reinterpret_cast(_data.get()); - _old_timestamp = _timestamp; _timestamp = 1; } @@ -296,29 +302,29 @@ class DynamicRememberingFlatMap final const auto *elements = reinterpret_cast(old_data_begin); for (std::size_t i = 0; i < old_size; ++i) { - const std::size_t pos = _positions[i]; + const std::size_t pos = _used_elements[i]; + const MapElement element = elements[pos]; const Key key = elements[pos].key; const std::size_t new_pos = find_impl(key) & ~INVALID_POS_MASK; - _positions[i] = new_pos; - _elements[new_pos] = MapElement{key, elements[pos].value, _timestamp}; + _used_elements[i] = new_pos; + _elements[new_pos] = MapElement{_timestamp, key, element.value}; } } void clear_impl() { ++_timestamp; - _positions.clear(); + _used_elements.clear(); } using Base::_capacity; using Base::_data; using Base::_size; - std::size_t _old_timestamp = 0; - std::size_t _timestamp = 1; + Timestamp _timestamp = 1; MapElement *_elements = nullptr; - std::vector _positions; + ScalableVector _used_elements; }; } // namespace kaminpar diff --git a/kaminpar-common/graph-compression/compressed_edges_builder.h b/kaminpar-common/graph-compression/compressed_edges_builder.h index 2b499270..370f45b3 100644 --- 
a/kaminpar-common/graph-compression/compressed_edges_builder.h +++ b/kaminpar-common/graph-compression/compressed_edges_builder.h @@ -13,26 +13,44 @@ #include "kaminpar-common/graph-compression/compressed_neighborhoods.h" #include "kaminpar-common/heap_profiler.h" -#include "kaminpar-common/logger.h" namespace kaminpar { -SET_DEBUG(false); +/*! + * A builder to construct compressed edges. + * + * @tparam NodeID The type of integer to use to identify a node. + * @tparam EdgeID The type of integer to use to identify an edge. + * @tparam EdgeWeight The type of integer to use for edge weights. + */ template class CompressedEdgesBuilder { using CompressedNeighborhoods = kaminpar::CompressedNeighborhoods; - using SignedID = CompressedNeighborhoods::SignedID; static constexpr bool kCompressEdgeWeights = CompressedNeighborhoods::kCompressEdgeWeights; + static constexpr bool kHighDegreeEncoding = CompressedNeighborhoods::kHighDegreeEncoding; static constexpr NodeID kHighDegreeThreshold = CompressedNeighborhoods::kHighDegreeThreshold; static constexpr NodeID kHighDegreePartLength = CompressedNeighborhoods::kHighDegreePartLength; + static constexpr NodeID kIntervalEncoding = CompressedNeighborhoods::kIntervalEncoding; static constexpr NodeID kIntervalLengthTreshold = CompressedNeighborhoods::kIntervalLengthTreshold; + static constexpr bool kRunLengthEncoding = CompressedNeighborhoods::kRunLengthEncoding; - static constexpr bool kStreamEncoding = CompressedNeighborhoods::kStreamEncoding; - static constexpr bool kIsolatedNodesSeparation = - CompressedNeighborhoods::kIsolatedNodesSeparation; + + static constexpr bool kStreamVByteEncoding = CompressedNeighborhoods::kStreamVByteEncoding; + static constexpr NodeID kStreamVByteThreshold = CompressedNeighborhoods::kStreamVByteThreshold; + + static constexpr NodeID kInvalidNodeID = std::numeric_limits::max(); + + using SignedNodeID = std::int64_t; + using SignedEdgeWeight = std::make_signed_t; + + using StreamVByteGapEncoder = + streamvbyte::StreamVByteEncoder; + + using StreamVByteGapAndWeightEncoder = + streamvbyte::StreamVByteEncoder; public: /*! 
@@ -48,6 +66,7 @@ template class Compresse [[nodiscard]] static std::size_t compressed_edge_array_max_size( const NodeID num_nodes, const EdgeID num_edges, const bool has_edge_weights ) { + std::size_t node_id_width = signed_varint_length(num_nodes); std::size_t edge_id_width; if constexpr (kActualNumEdges) { if constexpr (kIntervalEncoding) { @@ -59,19 +78,14 @@ template class Compresse edge_id_width = varint_max_length(); } - std::size_t max_size = num_nodes * edge_id_width + num_edges * varint_length(num_nodes); + std::size_t max_size = (num_nodes + 1) * edge_id_width + num_edges * node_id_width; if constexpr (kHighDegreeEncoding) { - if constexpr (kIntervalEncoding) { - max_size += 2 * num_nodes * varint_max_length(); - } else { - max_size += num_nodes * varint_max_length(); - } - - max_size += (num_edges / kHighDegreePartLength) * varint_max_length(); + max_size += num_nodes * varint_max_length() + + (num_edges / kHighDegreePartLength) * varint_max_length(); } - if (has_edge_weights) { + if (kCompressEdgeWeights && has_edge_weights) { max_size += num_edges * varint_max_length(); } @@ -97,8 +111,8 @@ template class Compresse _edge_weights(edge_weights) { const std::size_t max_size = compressed_edge_array_max_size(num_nodes, num_edges, has_edge_weights); - _compressed_data_start = heap_profiler::overcommit_memory(max_size); - _compressed_data = _compressed_data_start.get(); + _compressed_edges = heap_profiler::overcommit_memory(max_size); + _cur_compressed_edges = _compressed_edges.get(); _compressed_data_max_size = 0; } @@ -124,8 +138,8 @@ template class Compresse _edge_weights(edge_weights) { const std::size_t max_size = compressed_edge_array_max_size(num_nodes, max_degree, has_edge_weights); - _compressed_data_start = heap_profiler::overcommit_memory(max_size); - _compressed_data = _compressed_data_start.get(); + _compressed_edges = heap_profiler::overcommit_memory(max_size); + _cur_compressed_edges = _compressed_edges.get(); _compressed_data_max_size = 0; } @@ -135,14 +149,13 @@ template class Compresse */ ~CompressedEdgesBuilder() { if constexpr (kHeapProfiling) { - if (_compressed_data_start) { - const auto prev_compressed_data_size = - static_cast(_compressed_data - _compressed_data_start.get()); + if (_compressed_edges) { + const auto prev_compressed_data_size = size(); const std::size_t compressed_data_size = std::max(_compressed_data_max_size, prev_compressed_data_size); heap_profiler::HeapProfiler::global().record_alloc( - _compressed_data_start.get(), compressed_data_size + _compressed_edges.get(), compressed_data_size ); } } @@ -155,20 +168,19 @@ template class Compresse CompressedEdgesBuilder &operator=(CompressedEdgesBuilder &&) noexcept = delete; /*! - * Initializes/resets the builder. + * Initializes the builder. * * @param first_edge The first edge ID of the first node to be added. */ void init(const EdgeID first_edge) { - const auto prev_compressed_data_size = - static_cast(_compressed_data - _compressed_data_start.get()); + const auto prev_compressed_data_size = size(); _compressed_data_max_size = std::max(_compressed_data_max_size, prev_compressed_data_size); - _compressed_data = _compressed_data_start.get(); + _cur_compressed_edges = _compressed_edges.get(); - _edge = first_edge; + _cur_edge = first_edge; _max_degree = 0; _total_edge_weight = 0; - _cur_edge_weight = 0; + _cur_edge_weight = first_edge; _num_high_degree_nodes = 0; _num_high_degree_parts = 0; @@ -184,16 +196,88 @@ template class Compresse * @param neighbourhood The neighbourhood of the node to add. 
* @return The offset into the compressed edge array of the node. */ - template EdgeID add(const NodeID node, Container &neighbourhood) { - if constexpr (std::is_same_v>) { - std::sort(neighbourhood.begin(), neighbourhood.end(), [](const auto &a, const auto &b) { + template EdgeID add(const NodeID node, Container &neighborhood) { + using Neighbor = std::remove_reference_t::value_type; + constexpr bool kIsNeighbor = std::is_same_v; + constexpr bool kIsWeightedNeighbor = std::is_same_v>; + static_assert(kIsNeighbor || kIsWeightedNeighbor); + + const EdgeID offset = current_offset(); + NodeID degree = neighborhood.size(); + if (degree == 0) [[unlikely]] { + return offset; + } + + if constexpr (kIsWeightedNeighbor) { + std::sort(neighborhood.begin(), neighborhood.end(), [](const auto &a, const auto &b) { return a.first < b.first; }); } else { - std::sort(neighbourhood.begin(), neighbourhood.end()); + std::sort(neighborhood.begin(), neighborhood.end()); + } + + NodeID num_intervals; + if constexpr (kIntervalEncoding) { + bool has_intervals; + if (kHighDegreeEncoding && degree >= kHighDegreeThreshold) { + has_intervals = false; + } else { + num_intervals = count_intervals(neighborhood); + has_intervals = num_intervals > 0; + _num_interval_nodes += has_intervals ? 1 : 0; + } + + marked_varint_encode(_cur_edge, has_intervals, &_cur_compressed_edges); + } else { + varint_encode(_cur_edge, &_cur_compressed_edges); + } + + _cur_edge += degree; + + if constexpr (kHighDegreeEncoding) { + const bool split_neighbourhood = degree >= kHighDegreeThreshold; + + if (split_neighbourhood) { + const NodeID num_parts = math::div_ceil(degree, kHighDegreePartLength); + const NodeID last_part_length = math::mod_ceil(degree, kHighDegreePartLength); + + std::uint8_t *part_ptr = _cur_compressed_edges; + _cur_compressed_edges += sizeof(NodeID) * num_parts; + + bool has_intervals = false; + for (NodeID i = 0; i < num_parts; ++i) { + const bool last_part = (i + 1) == num_parts; + const NodeID part_length = last_part ? last_part_length : kHighDegreePartLength; + + auto part_begin = neighborhood.begin() + i * kHighDegreePartLength; + auto part_end = part_begin + part_length; + auto part_neighborhood = std::span(part_begin, part_end); + + NodeID *cur_part_ptr = reinterpret_cast(part_ptr) + i; + *cur_part_ptr = static_cast(_cur_compressed_edges - part_ptr); + + NodeID num_intervals; + if constexpr (kIntervalEncoding) { + num_intervals = count_intervals(part_neighborhood); + + if (num_intervals > 0) { + *cur_part_ptr |= math::kSetMSB; + has_intervals = true; + } + } + + add_edges(node, num_intervals, part_neighborhood); + } + + _num_high_degree_nodes += 1; + _num_high_degree_parts += num_parts; + _num_interval_nodes += has_intervals ? 1 : 0; + return offset; + } } - return add_node(node, neighbourhood); + add_edges(node, num_intervals, neighborhood); + return offset; } /*! @@ -202,7 +286,7 @@ template class Compresse * @return The number of bytes that the compressed data of the added neighborhoods take up. */ [[nodiscard]] std::size_t size() const { - return static_cast(_compressed_data - _compressed_data_start.get()); + return static_cast(current_offset()); } /*! @@ -211,7 +295,7 @@ template class Compresse * @return A pointer to the start of the compressed data. */ [[nodiscard]] const std::uint8_t *compressed_data() const { - return _compressed_data_start.get(); + return _compressed_edges.get(); } /*! @@ -220,7 +304,7 @@ template class Compresse * @return Ownership of the compressed data. 
*/ [[nodiscard]] heap_profiler::unique_ptr take_compressed_data() { - return std::move(_compressed_data_start); + return std::move(_compressed_edges); } /*! @@ -278,314 +362,300 @@ template class Compresse } private: - heap_profiler::unique_ptr _compressed_data_start; - std::uint8_t *_compressed_data; - std::size_t _compressed_data_max_size; - - bool _has_edge_weights; - EdgeWeight _total_edge_weight; - EdgeID _cur_edge_weight; - StaticArray &_edge_weights; - - EdgeID _edge; - NodeID _max_degree; - - // Graph compression statistics - std::size_t _num_high_degree_nodes; - std::size_t _num_high_degree_parts; - std::size_t _num_interval_nodes; - std::size_t _num_intervals; - - // Debug graph compression statistics - std::size_t _num_adjacent_node_bytes; - std::size_t _num_edge_weights_bytes; + [[nodiscard]] std::uint64_t current_offset() const { + return static_cast(_cur_compressed_edges - _compressed_edges.get()); + } -private: - template EdgeID add_node(const NodeID node, Container &neighbourhood) { - // The offset into the compressed edge array to the start of the neighbourhood. - const auto offset = static_cast(_compressed_data - _compressed_data_start.get()); + template + static void + set_adjacent_node(Container &neighborhood, const NodeID num_neighbor, const NodeID value) { + using Neighbor = std::remove_reference_t::value_type; + constexpr bool kIsWeightedNeighbor = std::is_same_v>; - const NodeID degree = neighbourhood.size(); - if (degree == 0) { - return offset; + if constexpr (kIsWeightedNeighbor) { + neighborhood[num_neighbor].first = value; + } else { + neighborhood[num_neighbor] = value; } + } - _max_degree = std::max(_max_degree, degree); - - // Store a pointer to the first byte of the first edge of this neighborhood. This byte encodes - // in one of its bits whether interval encoding is used for this node, i.e., whether the nodes - // has intervals in its neighbourhood. - std::uint8_t *marked_byte = _compressed_data; + template + [[nodiscard]] static NodeID + get_adjacent_node(const Container &neighborhood, const NodeID num_neighbor) { + using Neighbor = std::remove_reference_t::value_type; + constexpr bool kIsWeightedNeighbor = std::is_same_v>; - // Store only the first edge for the source node. The degree can be obtained by determining the - // difference between the first edge ids of a node and the next node. Additionally, store the - // first edge as a gap when the isolated nodes are continuously stored at the end of the nodes - // array. - const EdgeID first_edge = _edge; - if constexpr (kIntervalEncoding) { - _compressed_data += marked_varint_encode(first_edge, false, _compressed_data); + if constexpr (kIsWeightedNeighbor) { + return neighborhood[num_neighbor].first; } else { - _compressed_data += varint_encode(first_edge, _compressed_data); + return neighborhood[num_neighbor]; } + } - _edge += degree; + template + [[nodiscard]] static EdgeWeight + get_edge_weight(const Container &neighborhood, const NodeID num_neighbor) { + using Neighbor = std::remove_reference_t::value_type; + constexpr bool kIsWeightedNeighbor = std::is_same_v>; + static_assert(kIsWeightedNeighbor); - // If high-degree encoding is used then split the neighborhood if the degree crosses a - // threshold. The neighborhood is split into equally sized parts (except possible the last part) - // and each part is encoded independently. Furthermore, the offset at which the part is encoded - // is also stored. 
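// A minimal sketch of the splitting described above, assuming kHighDegreePartLength = 1000 and
// kHighDegreeThreshold = 10000 as defined in this file; the concrete degree is an invented
// example and this snippet is illustrative only, not part of the patch:
//
//   const NodeID degree = 12345;                                                   // >= kHighDegreeThreshold
//   const NodeID num_parts = math::div_ceil(degree, kHighDegreePartLength);        // == 13
//   const NodeID last_part_length = math::mod_ceil(degree, kHighDegreePartLength); // == 345
//   // Parts cover neighbors [0, 1000), [1000, 2000), ..., [12000, 12345). Before the encoded
//   // parts, one NodeID-sized offset per part is stored so each part can be decoded independently.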
- if constexpr (kHighDegreeEncoding) { - const bool split_neighbourhood = degree >= kHighDegreeThreshold; + return neighborhood[num_neighbor].second; + } - if (split_neighbourhood) { - const NodeID part_count = math::div_ceil(degree, kHighDegreePartLength); - const NodeID last_part_length = ((degree % kHighDegreePartLength) == 0) - ? kHighDegreePartLength - : (degree % kHighDegreePartLength); + void encode_edge_weight(const EdgeWeight edge_weight, EdgeWeight &prev_edge_weight) { + if (!_has_edge_weights) { + return; + } - uint8_t *part_ptr = _compressed_data; - _compressed_data += sizeof(NodeID) * part_count; + _total_edge_weight += edge_weight; - for (NodeID i = 0; i < part_count; ++i) { - const bool last_part = (i + 1) == part_count; - const NodeID part_length = last_part ? last_part_length : kHighDegreePartLength; + if constexpr (kCompressEdgeWeights) { + const SignedEdgeWeight edge_weight_gap = + edge_weight - static_cast(prev_edge_weight); - auto part_begin = neighbourhood.begin() + i * kHighDegreePartLength; - auto part_end = part_begin + part_length; - - std::uint8_t *cur_part_ptr = part_ptr + sizeof(NodeID) * i; - *((NodeID *)cur_part_ptr) = static_cast(_compressed_data - part_ptr); + signed_varint_encode(edge_weight_gap, &_cur_compressed_edges); + prev_edge_weight = edge_weight; + } else { + _edge_weights[_cur_edge_weight++] = edge_weight; + } + } - using Neighbour = typename Container::value_type; - add_edges(node, nullptr, std::span(part_begin, part_end)); - } + template + void add_edges(const NodeID node, const NodeID num_intervals, Container &neighborhood) { + NodeID degree = neighborhood.size(); + EdgeWeight prev_edge_weight = 0; - _num_high_degree_nodes += 1; - _num_high_degree_parts += part_count; - return offset; - } + if constexpr (kIntervalEncoding) { + const NodeID num_remaining_nodes = + encode_intervals(num_intervals, prev_edge_weight, neighborhood); + degree = num_remaining_nodes; } - add_edges(node, marked_byte, std::forward(neighbourhood)); - return offset; + encode_gaps(node, degree, prev_edge_weight, neighborhood); } - template - void add_edges(const NodeID node, std::uint8_t *marked_byte, Container &&neighbourhood) { - using Neighbour = std::remove_reference_t::value_type; - constexpr bool kHasEdgeWeights = std::is_same_v>; + template + void parse_intervals(const Container &neighborhood, Lambda &&l) const { + const NodeID degree = neighborhood.size(); + if (degree < kIntervalLengthTreshold) { + return; + } - const auto fetch_adjacent_node = [&](const NodeID i) { - if constexpr (kHasEdgeWeights) { - return neighbourhood[i].first; - } else { - return neighbourhood[i]; - } - }; + NodeID interval_len = 1; + NodeID prev_adjacent_node = get_adjacent_node(neighborhood, 0); + for (NodeID i = 1; i < degree; ++i) { + const NodeID adjacent_node = get_adjacent_node(neighborhood, i); - const auto set_adjacent_node = [&](const NodeID i, const NodeID value) { - if constexpr (kHasEdgeWeights) { - neighbourhood[i].first = value; - } else { - neighbourhood[i] = value; + const bool not_successive_increment = prev_adjacent_node + 1 != adjacent_node; + prev_adjacent_node = adjacent_node; + if (not_successive_increment) { + continue; } - }; - EdgeWeight prev_edge_weight = 0; - const auto add_edge_weight = [&](const NodeID i) { - if (!_has_edge_weights) { - return; + interval_len += 1; + if ((i + 1 < degree) && (adjacent_node + 1 == get_adjacent_node(neighborhood, i + 1))) { + continue; } - if constexpr (kHasEdgeWeights) { - const EdgeWeight edge_weight = neighbourhood[i].second; - 
_total_edge_weight += edge_weight; + if (interval_len >= kIntervalLengthTreshold) { + const NodeID right_extreme = adjacent_node; + const NodeID left_extreme = right_extreme - (interval_len - 1); + l(left_extreme, right_extreme, interval_len, i - (interval_len - 1)); + } - if constexpr (kCompressEdgeWeights) { - const EdgeWeight edge_weight_gap = edge_weight - prev_edge_weight; + interval_len = 1; + } + } - const std::size_t edge_weight_gap_len = - signed_varint_encode(edge_weight_gap, _compressed_data); - _compressed_data += edge_weight_gap_len; - IF_DBG _num_edge_weights_bytes += edge_weight_gap_len; + template + [[nodiscard]] NodeID count_intervals(const Container &neighborhood) const { + NodeID num_intervals = 0; - prev_edge_weight = edge_weight; - } else { - _edge_weights[_cur_edge_weight++] = edge_weight; - } - } else { - _edge_weights[_cur_edge_weight++] = 1; - _total_edge_weight += 1; - } - }; + parse_intervals(neighborhood, [&](const NodeID, const NodeID, const NodeID, const NodeID) { + num_intervals += 1; + }); - NodeID local_degree = neighbourhood.size(); + return num_intervals; + } - // Find intervals [i, j] of consecutive adjacent nodes i, i + 1, ..., j - 1, j of length at - // least kIntervalLengthTreshold. Instead of storing all nodes, only encode the left extreme i - // and the length j - i + 1. Left extremes are stored using the differences between each left - // extreme and the previous right extreme minus 2 (because there must be at least one integer - // between the end of an interval and the beginning of the next one), except the first left - // extreme, which is stored directly. The lengths are decremented by kIntervalLengthTreshold, - // the minimum length of an interval. - if constexpr (kIntervalEncoding) { - NodeID interval_count = 0; - - // Save the pointer to the interval count and skip the amount of bytes needed to store the - // interval count as we can only determine the amount of intervals after finding all of - // them. - std::uint8_t *interval_count_ptr = _compressed_data; - _compressed_data += sizeof(NodeID); - - if (local_degree >= kIntervalLengthTreshold) { - NodeID interval_len = 1; - NodeID previous_right_extreme = 2; - NodeID prev_adjacent_node = fetch_adjacent_node(0); - - for (NodeID i = 1; i < neighbourhood.size(); ++i) { - const NodeID adjacent_node = fetch_adjacent_node(i); - - if (prev_adjacent_node + 1 == adjacent_node) { - ++interval_len; - - // The interval ends if there are no more nodes or the next node is not the increment of - // the current node. - if (i + 1 == neighbourhood.size() || fetch_adjacent_node(i + 1) != adjacent_node + 1) { - if (interval_len >= kIntervalLengthTreshold) { - const NodeID left_extreme = adjacent_node + 1 - interval_len; - const NodeID left_extreme_gap = left_extreme + 2 - previous_right_extreme; - const NodeID interval_length_gap = interval_len - kIntervalLengthTreshold; - - const std::size_t left_extreme_gap_len = - varint_encode(left_extreme_gap, _compressed_data); - _compressed_data += left_extreme_gap_len; - IF_DBG _num_adjacent_node_bytes += left_extreme_gap_len; - - const std::size_t interval_length_gap_len = - varint_encode(interval_length_gap, _compressed_data); - _compressed_data += interval_length_gap_len; - IF_DBG _num_adjacent_node_bytes += interval_length_gap_len; - - for (NodeID j = 0; j < interval_len; ++j) { - const NodeID k = i + 1 + j - interval_len; - - // Set the adjacent node to a special value, which indicates for the gap encoder - // that the node has been encoded through an interval. 
- set_adjacent_node(k, std::numeric_limits::max()); - add_edge_weight(k); - } - - previous_right_extreme = adjacent_node; - - local_degree -= interval_len; - interval_count += 1; + template + NodeID encode_intervals( + const NodeID num_intervals, EdgeWeight &prev_edge_weight, Container &neighborhood + ) { + using Neighbor = std::remove_reference_t::value_type; + constexpr bool kHasEdgeWeights = std::is_same_v>; + + NodeID num_remaining_nodes = neighborhood.size(); + if (num_intervals > 0) { + varint_encode(num_intervals - 1, &_cur_compressed_edges); + _num_intervals += num_intervals; + + NodeID prev_right_extreme = 0; + parse_intervals( + neighborhood, + [&](const NodeID left_extreme, + const NodeID right_extreme, + const NodeID interval_len, + const NodeID index) { + const NodeID left_extreme_gap = left_extreme - prev_right_extreme; + const NodeID interval_len_gap = interval_len - kIntervalLengthTreshold; + + varint_encode(left_extreme_gap, &_cur_compressed_edges); + varint_encode(interval_len_gap, &_cur_compressed_edges); + + prev_right_extreme = right_extreme + 2; + num_remaining_nodes -= interval_len; + for (NodeID i = 0; i < interval_len; ++i) { + const NodeID pos = index + i; + + // Set the adjacent node to a special value, which indicates to the gap encoder + // that the node has been encoded through an interval. + set_adjacent_node(neighborhood, pos, kInvalidNodeID); + + if constexpr (kHasEdgeWeights) { + const EdgeWeight edge_weight = get_edge_weight(neighborhood, pos); + encode_edge_weight(edge_weight, prev_edge_weight); } - - interval_len = 1; } } + ); + } - prev_adjacent_node = adjacent_node; - } - } - - // If intervals have been encoded store the interval count and set the bit in the marked byte - // indicating that interval encoding has been used for the neighbourhood if the marked byte is - // given. Otherwise, fix the amount of bytes stored as we don't store the interval count if no - // intervals have been encoded. - if (marked_byte == nullptr) { - *((NodeID *)interval_count_ptr) = interval_count; - _num_adjacent_node_bytes += sizeof(NodeID); - } else if (interval_count > 0) { - *((NodeID *)interval_count_ptr) = interval_count; - *marked_byte |= 0b01000000; - _num_adjacent_node_bytes += sizeof(NodeID); - } else { - _compressed_data -= sizeof(NodeID); - } + return num_remaining_nodes; + } - if (interval_count > 0) { - _num_interval_nodes += 1; - _num_intervals += interval_count; - } + template + void encode_gaps( + const NodeID node, const NodeID degree, EdgeWeight &prev_edge_weight, Container &neighborhood + ) { + using Neighbor = std::remove_reference_t::value_type; + constexpr bool kHasEdgeWeights = std::is_same_v>; - // If all incident edges have been compressed using intervals then gap encoding cannot be - // applied. - if (local_degree == 0) { - return; - } + if (degree == 0) { + return; } - // Store the remaining adjacent nodes using gap encoding. That is instead of directly storing - // the nodes v_1, v_2, ..., v_{k - 1}, v_k, store the gaps v_1 - u, v_2 - v_1 - 1, ..., v_k - - // v_{k - 1} - 1 between the nodes, where u is the source node. Note that all gaps except the - // first one have to be positive as we sorted the nodes in ascending order. Thus, only for the - // first gap the sign is additionally stored. 
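// A small worked example of the gap encoding described above (values invented for illustration;
// varint_encode and signed_varint_encode are the helpers used throughout this file):
//
//   // Source node u = 100 with sorted neighbors {97, 101, 104, 105}:
//   //   first gap (signed):   97 - 100      = -3   -> signed_varint_encode(-3, ...)
//   //   remaining gaps:       101 - 97 - 1  =  3   -> varint_encode(3, ...)
//   //                         104 - 101 - 1 =  2   -> varint_encode(2, ...)
//   //                         105 - 104 - 1 =  0   -> varint_encode(0, ...)
//   // Decoding adds each gap (plus one) onto the previously decoded neighbor to recover
//   // {97, 101, 104, 105}.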
NodeID i = 0; + while (get_adjacent_node(neighborhood, i) == kInvalidNodeID) { + i += 1; + } + + const NodeID first_adjacent_node = get_adjacent_node(neighborhood, i); + const SignedNodeID first_gap = first_adjacent_node - static_cast(node); + signed_varint_encode(first_gap, &_cur_compressed_edges); + if constexpr (kHasEdgeWeights) { + const EdgeWeight edge_weight = get_edge_weight(neighborhood, i); + encode_edge_weight(edge_weight, prev_edge_weight); + } + + i += 1; + + if constexpr (kRunLengthEncoding) { + VarIntRunLengthEncoder rl_encoder(_cur_compressed_edges); + + NodeID prev_adjacent_node = first_adjacent_node; + while (i < neighborhood.size()) { + const NodeID adjacent_node = get_adjacent_node(neighborhood, i); + if (adjacent_node == kInvalidNodeID) { + i += 1; + continue; + } + + const NodeID gap = adjacent_node - prev_adjacent_node - 1; + prev_adjacent_node = adjacent_node; + + _cur_compressed_edges += rl_encoder.add(gap); + if constexpr (kHasEdgeWeights) { + const EdgeWeight edge_weight = get_edge_weight(neighborhood, i); + encode_edge_weight(edge_weight, prev_edge_weight); + } - // Go to the first adjacent node that has not been encoded through an interval. - if constexpr (kIntervalEncoding) { - while (fetch_adjacent_node(i) == std::numeric_limits::max()) { i += 1; } - } - const NodeID first_adjacent_node = fetch_adjacent_node(i); - const SignedID first_gap = first_adjacent_node - static_cast(node); + rl_encoder.flush(); + return; + } else if constexpr (kStreamVByteEncoding) { + const NodeID num_remaining_gaps = degree - 1; + + if (num_remaining_gaps >= kStreamVByteThreshold) [[likely]] { + if constexpr (kHasEdgeWeights) { + if (_has_edge_weights) { + StreamVByteGapAndWeightEncoder encoder(num_remaining_gaps * 2, _cur_compressed_edges); + + while (i < neighborhood.size()) { + const NodeID adjacent_node = get_adjacent_node(neighborhood, i); + if (adjacent_node == kInvalidNodeID) { + i += 1; + continue; + } - const std::size_t first_gap_len = signed_varint_encode(first_gap, _compressed_data); - _compressed_data += first_gap_len; - IF_DBG _num_adjacent_node_bytes += first_gap_len; + const EdgeWeight weight = get_edge_weight(neighborhood, i); + _cur_compressed_edges += encoder.add(adjacent_node); + _cur_compressed_edges += encoder.add(weight); - add_edge_weight(i); - i += 1; + i += 1; + } - const auto encode_gaps = [&](const auto &&encode_gap) { - NodeID prev_adjacent_node = first_adjacent_node; - while (i < neighbourhood.size()) { - const NodeID adjacent_node = fetch_adjacent_node(i); + encoder.flush(); + return; + } + } - // Skip the adjacent node if it has been encoded through an interval. 
- if constexpr (kIntervalEncoding) { - if (adjacent_node == std::numeric_limits::max()) { - i += 1; + StreamVByteGapEncoder encoder(num_remaining_gaps, _cur_compressed_edges); + while (i < neighborhood.size()) { + const NodeID adjacent_node = get_adjacent_node(neighborhood, i++); + if (adjacent_node == kInvalidNodeID) { continue; } + + _cur_compressed_edges += encoder.add(adjacent_node); } - const NodeID gap = adjacent_node - prev_adjacent_node - 1; - encode_gap(gap); - add_edge_weight(i); + encoder.flush(); + return; + } + } - prev_adjacent_node = adjacent_node; + NodeID prev_adjacent_node = first_adjacent_node; + while (i < neighborhood.size()) { + const NodeID adjacent_node = get_adjacent_node(neighborhood, i); + if (adjacent_node == kInvalidNodeID) { i += 1; + continue; } - }; - if constexpr (kRunLengthEncoding) { - VarIntRunLengthEncoder rl_encoder(_compressed_data); - encode_gaps([&](const NodeID gap) { - const std::size_t gap_len = rl_encoder.add(gap); - _compressed_data += gap_len; - IF_DBG _num_adjacent_node_bytes += gap_len; - }); - rl_encoder.flush(); - } else if constexpr (kStreamEncoding) { - VarIntStreamEncoder sv_encoder(_compressed_data, local_degree - 1); - encode_gaps([&](const NodeID gap) { - const std::size_t gap_len = sv_encoder.add(gap); - _compressed_data += gap_len; - IF_DBG _num_adjacent_node_bytes += gap_len; - }); - sv_encoder.flush(); - } else { - encode_gaps([&](const NodeID gap) { - const std::size_t gap_len = varint_encode(gap, _compressed_data); - _compressed_data += gap_len; - IF_DBG _num_adjacent_node_bytes += gap_len; - }); + const NodeID gap = adjacent_node - prev_adjacent_node - 1; + prev_adjacent_node = adjacent_node; + + varint_encode(gap, &_cur_compressed_edges); + if constexpr (kHasEdgeWeights) { + const EdgeWeight edge_weight = get_edge_weight(neighborhood, i); + encode_edge_weight(edge_weight, prev_edge_weight); + } + + i += 1; } } + +private: + heap_profiler::unique_ptr _compressed_edges; + std::uint8_t *_cur_compressed_edges; + std::size_t _compressed_data_max_size; + + bool _has_edge_weights; + EdgeWeight _total_edge_weight; + EdgeID _cur_edge_weight; + StaticArray &_edge_weights; + + EdgeID _cur_edge; + NodeID _max_degree; + + // Graph compression statistics + std::size_t _num_high_degree_nodes; + std::size_t _num_high_degree_parts; + std::size_t _num_interval_nodes; + std::size_t _num_intervals; }; } // namespace kaminpar diff --git a/kaminpar-common/graph-compression/compressed_neighborhoods.h b/kaminpar-common/graph-compression/compressed_neighborhoods.h index e6e78c5a..0d085969 100644 --- a/kaminpar-common/graph-compression/compressed_neighborhoods.h +++ b/kaminpar-common/graph-compression/compressed_neighborhoods.h @@ -10,29 +10,76 @@ #include "kaminpar-common/constexpr_utils.h" #include "kaminpar-common/datastructures/compact_static_array.h" #include "kaminpar-common/datastructures/static_array.h" -#include "kaminpar-common/graph-compression/varint_codec.h" -#include "kaminpar-common/graph-compression/varint_run_length_codec.h" -#include "kaminpar-common/graph-compression/varint_stream_codec.h" +#include "kaminpar-common/graph-compression/streamvbyte.h" +#include "kaminpar-common/graph-compression/varint.h" +#include "kaminpar-common/graph-compression/varint_rle.h" #include "kaminpar-common/math.h" #include "kaminpar-common/ranges.h" +#define INVOKE_CALLBACKU(edge, adjacent_node) \ + if constexpr (kNonStoppable) { \ + callback(edge, adjacent_node); \ + } else { \ + const bool stop = callback(edge, adjacent_node); \ + if (stop) 
[[unlikely]] { \ + return true; \ + } \ + } + +#define INVOKE_CALLBACKW(edge, adjacent_node) \ + EdgeWeight edge_weight; \ + if constexpr (kCompressEdgeWeights) { \ + const SignedEdgeWeight edge_weight_gap = signed_varint_decode(&node_data); \ + edge_weight = static_cast(edge_weight_gap + prev_edge_weight); \ + } else { \ + edge_weight = _edge_weights[edge]; \ + } \ + \ + if constexpr (kNonStoppable) { \ + callback(edge, adjacent_node, edge_weight); \ + } else { \ + const bool stop = callback(edge, adjacent_node, edge_weight); \ + if (stop) [[unlikely]] { \ + return true; \ + } \ + } \ + \ + prev_edge_weight = edge_weight; + +#define INVOKE_CALLBACK(edge, adjacent_node) \ + if constexpr (kHasEdgeWeights) { \ + INVOKE_CALLBACKW(edge, adjacent_node); \ + } else { \ + INVOKE_CALLBACKU(edge, adjacent_node); \ + } + namespace kaminpar { +/*! + * The neighborhoods of a graph, which are stored in compressed format through variable-length + * encoding, gap encoding, interval encoding and high-degree encoding. + * + * @tparam NodeID The type of integer to use to identify a node. + * @tparam EdgeID The type of integer to use to identify an edge. + * @tparam EdgeWeight The type of integer to use for edge weights. + */ template class CompressedNeighborhoods { static_assert(std::numeric_limits::is_integer); static_assert(std::numeric_limits::is_integer); static_assert(std::numeric_limits::is_integer); - struct NeighborhoodHeader { - EdgeID first_edge; - NodeID degree; - bool uses_intervals; - std::size_t length; - }; + using SignedNodeID = std::int64_t; + using SignedEdgeWeight = std::make_signed_t; -public: - using SignedID = std::int64_t; + using StreamVByteGapDecoder = + streamvbyte::StreamVByteDecoder; + + using StreamVByteGapAndWeightsDecoder = + streamvbyte::StreamVByteDecoder; + + static constexpr EdgeWeight kDefaultEdgeWeight = 1; +public: /*! * Whether edge weights are compressed. */ @@ -43,7 +90,7 @@ template class Compresse #endif /*! - * Whether high degree encoding is used. + * Whether high-degree encoding is used. */ #ifdef KAMINPAR_COMPRESSION_HIGH_DEGREE_ENCODING static constexpr bool kHighDegreeEncoding = true; @@ -57,7 +104,7 @@ template class Compresse static constexpr NodeID kHighDegreeThreshold = 10000; /*! - * The length of a part when splitting the neighbourhood of a high degree + * The length of each part when splitting the neighbourhood of a high degree * node. */ static constexpr NodeID kHighDegreePartLength = 1000; @@ -86,40 +133,47 @@ template class Compresse #endif /*! - * Whether stream encoding is used. + * Whether StreamVByte encoding is used. */ -#ifdef KAMINPAR_COMPRESSION_STREAM_ENCODING - static constexpr bool kStreamEncoding = true; +#ifdef KAMINPAR_COMPRESSION_STREAMVBYTE_ENCODING + static constexpr bool kStreamVByteEncoding = true; #else - static constexpr bool kStreamEncoding = false; + static constexpr bool kStreamVByteEncoding = false; #endif + /*! + * The minimum number of adjacent nodes required to use StreamVByte encoding. + */ + static constexpr NodeID kStreamVByteThreshold = 3; + static_assert( - !kRunLengthEncoding || !kStreamEncoding, - "Either run-length or stream encoding can be used for varints " + !kRunLengthEncoding || !kStreamVByteEncoding, + "Either run-length or StreamVByte encoding can be used for varints " "but not both." ); - /*! - * Whether the isolated nodes of the compressed graph are continuously stored - * at the end of the nodes array. 
- */ -#ifdef KAMINPAR_COMPRESSION_ISOLATED_NODES_SEPARATION - static constexpr bool kIsolatedNodesSeparation = true; -#else - static constexpr bool kIsolatedNodesSeparation = false; -#endif + static_assert( + !kRunLengthEncoding || !kCompressEdgeWeights, + "Run-length cannot be used together with compressed edge weights." + ); - /** + static_assert( + !kStreamVByteEncoding || !kCompressEdgeWeights || sizeof(NodeID) == sizeof(EdgeWeight), + "StreamVByte together with compressed edge weights can only be used when the node IDs and " + "edge weights have the same width." + ); + + /*! * Constructs a new CompressedNeighborhoods. * - * @param nodes The nodes of the compressed neighborhoods. - * @param compressed_edges The edges and edge weights of the compressed neighborhoods. + * @param nodes The offsets for each node into the compressed edges where the corresponding + * adjacent nodes and edge weights are encoded. + * @param compressed_edges The edges and edge weights in compresed format. * @param edge_weights The edge weights of the graph, which is only used when the graph has edge - * weights and graph compression is disabled. + * weights and edg weight compression is disabled. * @param max_degree The maximum degree of the nodes. * @param num_edges The number of edges. - * @param has_edge_weights Whether edge weights are stored + * @param has_edge_weights Whether edge weights are stored. * @param total_edge_weight The total edge weight. * @param num_high_degree_nodes The number of nodes that have high degree. * @param num_high_degree_parts The total number of parts that result from splitting high degree @@ -155,6 +209,8 @@ template class Compresse KASSERT(kHighDegreeEncoding || _num_high_degree_parts == 0); KASSERT(kIntervalEncoding || _num_interval_nodes == 0); KASSERT(kIntervalEncoding || _num_intervals == 0); + KASSERT(!has_edge_weights || edge_weights.size() == num_edges); + KASSERT(has_edge_weights || edge_weights.empty()); } CompressedNeighborhoods(const CompressedNeighborhoods &) = delete; @@ -163,7 +219,43 @@ template class Compresse CompressedNeighborhoods(CompressedNeighborhoods &&) noexcept = default; CompressedNeighborhoods &operator=(CompressedNeighborhoods &&) noexcept = default; - /** + /*! + * Returns the number of nodes. + * + * @return The number of nodes. + */ + [[nodiscard]] EdgeID num_nodes() const { + return _nodes.size() - 1; + } + + /*! + * Returns the number of edges. + * + * @return The number of edges. + */ + [[nodiscard]] EdgeID num_edges() const { + return _num_edges; + } + + /*! + * Returns whether the edges are weighted. + * + * @return Whether the edges are weighted. + */ + [[nodiscard]] bool has_edge_weights() const { + return _has_edge_weights; + } + + /*! + * Returns the total edge weight. + * + * @return The total edge weight. + */ + [[nodiscard]] EdgeWeight total_edge_weight() const { + return _total_edge_weight; + } + + /*! * Returns the maximum degree of the nodes. * * @return The maximum degree of the nodes. @@ -172,169 +264,133 @@ template class Compresse return _max_degree; } - /** + /*! * Returns the degree of a node. * * @param node The node whose degree is to be returned. * @return The degree of the node. 
*/ [[nodiscard]] NodeID degree(const NodeID node) const { - const std::uint8_t *data = _compressed_edges.data(); - - const std::uint8_t *node_data = data + _nodes[node]; - const std::uint8_t *next_node_data = data + _nodes[node + 1]; - - const bool is_isolated_node = node_data == next_node_data; - if (is_isolated_node) [[unlikely]] { - return 0; - } - - const auto header = decode_header(node, node_data, next_node_data); - return header.degree; + return static_cast(first_invalid_edge(node) - first_edge(node)); } - /** + /*! * Returns incident edges of a nodes. * - * @param node The node whose incident edges is to be returned. + * @param node The node whose incident edges are to be returned. * @return The incident edges of the node. */ [[nodiscard]] IotaRange incident_edges(const NodeID node) const { - const std::uint8_t *data = _compressed_edges.data(); - - const std::uint8_t *node_data = data + _nodes[node]; - const std::uint8_t *next_node_data = data + _nodes[node + 1]; - - const bool is_isolated_node = node_data == next_node_data; - if (is_isolated_node) [[unlikely]] { - return {0, 0}; - } - - const auto header = decode_header(node, node_data, next_node_data); - return {header.first_edge, header.first_edge + header.degree}; + return {first_edge(node), first_invalid_edge(node)}; } - /** - * Decodes a neighborhood and invokes a caller with each adjacent node and corresponding edge - * weight. + /*! + * Decodes the adjacent nodes of a node. * - * @tparam kParallelDecoding Whether to decode the neighborhood in parallel. - * @tparam Lambda The type of the caller to invoke. - * @param u The node whose neighborhood is to be decoded. - * @param l The caller to invoke. + * @tparam The type of callback to invoke with the adjacent nodes. + * @param node The node whose adjacent nodes are to be decoded. + * @param callback The function to invoke with each adjacent node. */ - template - void decode(const NodeID u, Lambda &&l) const { - KASSERT(u < num_nodes()); - constexpr bool kInvokeDirectly = std::is_invocable_v; - - if (_has_edge_weights) [[unlikely]] { - decode_neighborhood(u, std::forward(l)); - } else { - if constexpr (kInvokeDirectly) { - decode_neighborhood(u, [&](const EdgeID e, const NodeID v) { - return l(e, v, 1); - }); - } else { - decode_neighborhood(u, [&](auto &&l2) { - l([&](auto &&l3) { l2([&](const EdgeID e, const NodeID v) { return l3(e, v, 1); }); }); - }); - } - } + template void adjacent_nodes(const NodeID node, Callback &&callback) const { + decode_adjacent_nodes(node, std::forward(callback)); } - /** - * Decodes the leading edges of a neighborhood and invokes a caller with each adjacent node and - * corresponding edge weight. + /*! + * Decodes the neighbors of a node. * - * @tparam Lambda The type of the caller to invoke. - * @param u The node whose neighborhood is to be decoded. - * @param max_num_neighbors The number of neighbors to decode. - * @param l The caller to invoke. + * @tparam The type of callback to invoke with the neighbor. + * @param node The node whose neighbors are to be decoded. + * @param callback The function to invoke with each neighbor. */ - template - void decode(const NodeID u, const NodeID max_num_neighbors, Lambda &&l) const { - KASSERT(u < num_nodes()); - KASSERT(max_num_neighbors > 0); + template void neighbors(const NodeID node, Callback &&callback) const { + decode_neighbors(node, std::forward(callback)); + } - static_assert(std::is_invocable_v); + /*! + * Decodes a part of the neighbors of a node. 
+ * + * @tparam The type of callback to invoke with the neighbor. + * @param node The node whose neighbors are to be decoded. + * @param callback The function to invoke with each neighbor. + */ + template + void neighbors(const NodeID node, const NodeID max_num_neighbors, Callback &&callback) const { + static_assert(std::is_invocable_v); constexpr bool kNonStoppable = - std::is_void_v>; + std::is_void_v>; NodeID num_neighbors_visited = 1; const auto invoke_and_check = [&](const EdgeID e, const NodeID v, const EdgeWeight w) { bool abort = num_neighbors_visited++ >= max_num_neighbors; if constexpr (kNonStoppable) { - l(e, v, w); + callback(e, v, w); } else { - abort |= l(e, v, w); + abort |= callback(e, v, w); } return abort; }; if (_has_edge_weights) [[unlikely]] { - decode_neighborhood(u, [&](const EdgeID e, const NodeID v, const EdgeWeight w) { + decode(node, [&](const EdgeID e, const NodeID v, const EdgeWeight w) { return invoke_and_check(e, v, w); }); } else { - decode_neighborhood(u, [&](const EdgeID e, const NodeID v) { - return invoke_and_check(e, v, 1); + decode(node, [&](const EdgeID e, const NodeID v) { + return invoke_and_check(e, v, kDefaultEdgeWeight); }); } } - /** - * Restricts the node array to a specific number of nodes. + /*! + * Decodes the adjacent nodes of a node in parallel. * - * @param new_n The new number of nodes. + * @tparam The type of callback to invoke with the adjacent nodes. + * @param node The node whose adjacent nodes are to be decoded. + * @param callback The function to invoke with each adjacent node. */ - void restrict_nodes(const NodeID new_n) { - _nodes.restrict(new_n); + template + void parallel_adjacent_nodes(const NodeID node, Callback &&callback) const { + decode_adjacent_nodes(node, std::forward(callback)); } - /** - * Unrestricts the node array. - */ - void unrestrict_nodes() { - _nodes.unrestrict(); - } - - /** - * Returns the number of nodes. + /*! + * Decodes the neighbors of a node in parallel. * - * @return The number of nodes. + * @tparam The type of callback to invoke with the neighbor. + * @param node The node whose neighbors are to be decoded. + * @param callback The function to invoke with each neighbor. */ - [[nodiscard]] EdgeID num_nodes() const { - return _nodes.size() - 1; + template + void parallel_neighbors(const NodeID node, Callback &&callback) const { + decode_neighbors(node, std::forward(callback)); } - /** - * Returns the number of edges. + /*! + * Restricts the node array to a specific number of nodes. * - * @return The number of edges. + * @param new_num_nodes The new number of nodes. */ - [[nodiscard]] EdgeID num_edges() const { - return _num_edges; + void restrict_nodes(const NodeID new_num_nodes) { + _nodes.restrict(new_num_nodes); } - /** - * Returns whether the edges are weighted. - * - * @return Whether the edges are weighted. + /*! + * Unrestricts the node array. */ - [[nodiscard]] bool has_edge_weights() const { - return _has_edge_weights; + void unrestrict_nodes() { + _nodes.unrestrict(); } - /** - * Returns the total edge weight. + /*! + * Returns the used memory space in bytes. * - * @return The total edge weight. + * @return The used memory space in bytes. */ - [[nodiscard]] EdgeWeight total_edge_weight() const { - return _total_edge_weight; + [[nodiscard]] std::size_t memory_space() const { + return _nodes.memory_space() + _compressed_edges.size() + + _edge_weights.size() * sizeof(EdgeWeight); } /*! 
@@ -373,17 +429,7 @@ template class Compresse return _num_intervals; } - /** - * Returns the used memory space in bytes. - * - * @return The used memory space in bytes. - */ - [[nodiscard]] std::size_t memory_space() const { - return _nodes.memory_space() + _compressed_edges.size() + - _edge_weights.size() * sizeof(EdgeWeight); - } - - /** + /*! * Returns ownership of the raw node array. * * @return Ownership of the raw node array. @@ -392,7 +438,7 @@ template class Compresse return std::move(_nodes); } - /** + /*! * Returns a reference to the raw node array. * * @return A reference to the raw node array. @@ -401,7 +447,7 @@ template class Compresse return _nodes; } - /** + /*! * Returns a reference to the raw node array. * * @return A reference to the raw node array. @@ -410,7 +456,7 @@ template class Compresse return _nodes; } - /** + /*! * Returns a reference to the raw compressed edges. * * @return A reference to the raw compressed edges. @@ -419,7 +465,7 @@ template class Compresse return _compressed_edges; } - /** + /*! * Returns a reference to the raw edge weights. * * Note that the weights are only valid when edge weight compression is enabled and when the @@ -432,136 +478,169 @@ template class Compresse } private: - CompactStaticArray _nodes; - StaticArray _compressed_edges; - StaticArray _edge_weights; + [[nodiscard]] EdgeID first_edge(const NodeID node) const { + const std::uint8_t *node_data = _compressed_edges.data() + _nodes[node]; - EdgeID _num_edges; - NodeID _max_degree; + if constexpr (kIntervalEncoding) { + const auto [first_edge, _] = marked_varint_decode(node_data); + return first_edge; + } else { + return varint_decode(node_data); + } + } - bool _has_edge_weights; - EdgeWeight _total_edge_weight; + [[nodiscard]] EdgeID first_invalid_edge(const NodeID node) const { + return first_edge(node + 1); + } - std::size_t _num_high_degree_nodes; - std::size_t _num_high_degree_parts; - std::size_t _num_interval_nodes; - std::size_t _num_intervals; + template + void decode_adjacent_nodes(const NodeID node, Callback &&callback) const { + constexpr bool kInvokeDirectly = std::is_invocable_v; -private: - template - void decode_neighborhood(const NodeID node, Lambda &&l) const { - constexpr bool kInvokeDirectly = []() { - if constexpr (kHasEdgeWeights) { - return std::is_invocable_v; + if (_has_edge_weights) [[unlikely]] { + decode(node, [&](const EdgeID, const NodeID v, const EdgeWeight w) { + return callback(v, w); + }); + } else { + if constexpr (kInvokeDirectly) { + decode(node, [&](const EdgeID, const NodeID v) { + return callback(v, kDefaultEdgeWeight); + }); } else { - return std::is_invocable_v; + decode(node, [&](auto &&local_decode) { + callback([&](auto &&actual_callback) { + local_decode([&](const EdgeID, const NodeID v) { + return actual_callback(v, kDefaultEdgeWeight); + }); + }); + }); } - }(); + } + } - const std::uint8_t *data = _compressed_edges.data(); + template + void decode_neighbors(const NodeID node, Callback &&callback) const { + constexpr bool kInvokeDirectly = std::is_invocable_v; + + if (_has_edge_weights) [[unlikely]] { + decode(node, std::forward(callback)); + } else { + if constexpr (kInvokeDirectly) { + decode(node, [&](const EdgeID e, const NodeID v) { + return callback(e, v, kDefaultEdgeWeight); + }); + } else { + decode(node, [&](auto &&local_decode) { + callback([&](auto &&actual_callback) { + local_decode([&](const EdgeID e, const NodeID v) { + return actual_callback(e, v, kDefaultEdgeWeight); + }); + }); + }); + } + } + } + + template + void 
decode(const NodeID node, Callback &&callback) const { + constexpr bool kInvokeDirectly = std::conditional_t< + kHasEdgeWeights, + std::is_invocable, + std::is_invocable>::value; + const std::uint8_t *data = _compressed_edges.data(); const std::uint8_t *node_data = data + _nodes[node]; const std::uint8_t *next_node_data = data + _nodes[node + 1]; - - const bool is_isolated_node = node_data == next_node_data; - if (is_isolated_node) [[unlikely]] { + if (node_data == next_node_data) [[unlikely]] { return; } - const auto header = decode_header(node, node_data, next_node_data); - node_data += header.length; + EdgeID edge; + EdgeID last_edge; + bool has_intervals; + if constexpr (kIntervalEncoding) { + const auto header = marked_varint_decode(&node_data); + edge = header.first; + has_intervals = header.second; + last_edge = marked_varint_decode(next_node_data).first; + } else { + edge = varint_decode(&node_data); + last_edge = varint_decode(next_node_data); + } if constexpr (kHighDegreeEncoding) { - if (header.degree >= kHighDegreeThreshold) { - decode_parts( - node_data, node, header.degree, header.first_edge, std::forward(l) + const NodeID degree = static_cast(last_edge - edge); + const bool split_neighbourhood = degree >= kHighDegreeThreshold; + + if (split_neighbourhood) [[unlikely]] { + decode_parts( + node_data, node, degree, edge, last_edge, std::forward(callback) ); return; } } - invoke_indirect(std::forward(l), [&](auto &&l2) { + invoke_indirect(std::forward(callback), [&](auto &&actual_callback) { decode_edges( node_data, node, - header.degree, - header.first_edge, - header.uses_intervals, - std::forward(l2) + edge, + last_edge, + has_intervals, + std::forward(actual_callback) ); }); } - [[nodiscard]] NeighborhoodHeader decode_header( - const NodeID node, - const std::uint8_t *const node_data, - const std::uint8_t *const next_node_data - ) const { - const auto [first_edge, next_first_edge, uses_intervals, len] = [&] { - if constexpr (kIntervalEncoding) { - const auto [first_edge, uses_intervals, len] = marked_varint_decode(node_data); - const auto [next_first_edge, _, __] = marked_varint_decode(next_node_data); - - return std::make_tuple(first_edge, next_first_edge, uses_intervals, len); - } else { - const auto [first_edge, len] = varint_decode(node_data); - const auto [next_first_edge, _] = varint_decode(next_node_data); - - return std::make_tuple(first_edge, next_first_edge, false, len); - } - }(); - - if constexpr (kIsolatedNodesSeparation) { - const EdgeID ungapped_first_edge = first_edge + node; - const NodeID degree = static_cast(1 + next_first_edge - first_edge); - return {ungapped_first_edge, degree, uses_intervals, len}; - } else { - const NodeID degree = static_cast(next_first_edge - first_edge); - return {first_edge, degree, uses_intervals, len}; - } - } - - template + template void decode_parts( - const std::uint8_t *data, + const std::uint8_t *node_data, const NodeID node, const NodeID degree, const EdgeID edge, - Lambda &&l + const EdgeID last_edge, + Callback &&callback ) const { - constexpr bool kInvokeDirectly = []() { - if constexpr (kHasEdgeWeights) { - return std::is_invocable_v; - } else { - return std::is_invocable_v; - } - }(); - - const NodeID part_count = math::div_ceil(degree, kHighDegreePartLength); + constexpr bool kInvokeDirectly = std::conditional_t< + kHasEdgeWeights, + std::is_invocable, + std::is_invocable>::value; - const auto iterate_part = [&](const NodeID part) { - const NodeID part_offset = *((NodeID *)(data + sizeof(NodeID) * part)); - const 
std::uint8_t *part_data = data + part_offset; + const NodeID num_parts = math::div_ceil(degree, kHighDegreePartLength); + const auto decode_part = [&](const NodeID part) { + NodeID part_offset = *(reinterpret_cast(node_data) + part); - const NodeID part_count_m1 = part_count - 1; - const bool last_part = part == part_count_m1; + bool has_intervals; + if constexpr (kIntervalEncoding) { + has_intervals = math::is_msb_set(part_offset); + part_offset &= ~math::kSetMSB; + } const EdgeID part_edge = edge + kHighDegreePartLength * part; - const NodeID part_degree = - last_part ? (degree - kHighDegreePartLength * part_count_m1) : kHighDegreePartLength; - - return invoke_indirect2(std::forward(l), [&](auto &&l2) { - return decode_edges( - part_data, node, part_degree, part_edge, true, std::forward(l2) - ); - }); + const EdgeID part_last_edge = + ((part + 1) == num_parts) ? last_edge : part_edge + kHighDegreePartLength; + + const std::uint8_t *part_data = node_data + part_offset; + return invoke_indirect2( + std::forward(callback), + [&](auto &&actual_callback) { + return decode_edges( + part_data, + node, + part_edge, + part_last_edge, + has_intervals, + std::forward(actual_callback) + ); + } + ); }; - if constexpr (kParallelDecoding) { - tbb::parallel_for(0, part_count, iterate_part); + if constexpr (kParallel) { + tbb::parallel_for(0, num_parts, decode_part); } else { - for (NodeID part = 0; part < part_count; ++part) { - const bool stop = iterate_part(part); + for (NodeID part = 0; part < num_parts; ++part) { + const bool stop = decode_part(part); if (stop) [[unlikely]] { return; } @@ -569,189 +648,152 @@ template class Compresse } } - template + template bool decode_edges( - const std::uint8_t *data, + const std::uint8_t *node_data, const NodeID node, - const NodeID degree, EdgeID edge, - bool uses_intervals, - Lambda &&l + const EdgeID last_edge, + const bool has_intervals, + Callback &&callback ) const { - const EdgeID max_edge = edge + degree; - EdgeWeight prev_edge_weight = 0; + using CallbackReturnType = std::conditional_t< + kHasEdgeWeights, + std::invoke_result, + std::invoke_result>::type; + constexpr bool kNonStoppable = std::is_void_v; + EdgeWeight prev_edge_weight = 0; if constexpr (kIntervalEncoding) { - if (uses_intervals) { - const bool stop = decode_intervals( - data, edge, prev_edge_weight, std::forward(l) - ); - if (stop) [[unlikely]] { - return true; - } + if (has_intervals) { + NodeID num_intervals = varint_decode(&node_data) + 1; + NodeID prev_right_extreme = 0; - if (edge == max_edge) [[unlikely]] { - return false; - } - } - } + do { + const NodeID left_extreme_gap = varint_decode(&node_data); + const NodeID length_gap = varint_decode(&node_data); - return decode_gaps( - data, node, edge, prev_edge_weight, max_edge, std::forward(l) - ); - } + const NodeID left_extreme = left_extreme_gap + prev_right_extreme; + const NodeID length = length_gap + kIntervalLengthTreshold; + prev_right_extreme = left_extreme + (length - 1) + 2; - template - bool decode_intervals( - const std::uint8_t *&data, EdgeID &edge, EdgeWeight &prev_edge_weight, Lambda &&l - ) const { - using LambdaReturnType = std::conditional_t< - kHasEdgeWeights, - std::invoke_result, - std::invoke_result>::type; - constexpr bool kNonStoppable = std::is_void_v; + static_assert(kIntervalLengthTreshold == 3, "Optimized for length threshold = 3."); + INVOKE_CALLBACK(edge, left_extreme); + INVOKE_CALLBACK(edge + 1, left_extreme + 1); + INVOKE_CALLBACK(edge + 2, left_extreme + 2); + edge += kIntervalLengthTreshold; + + for 
(NodeID j = kIntervalLengthTreshold; j < length; ++j) { + const NodeID adjacent_node = left_extreme + j; - const auto invoke_caller = [&](const NodeID adjacent_node) { - if constexpr (kHasEdgeWeights) { - if constexpr (kCompressEdgeWeights) { - const auto [edge_weight_gap, length] = signed_varint_decode(data); - data += length; + INVOKE_CALLBACK(edge, adjacent_node); + edge += 1; + } - const EdgeWeight edge_weight = edge_weight_gap + prev_edge_weight; - prev_edge_weight = edge_weight; + num_intervals -= 1; + } while (num_intervals > 0); - return l(edge, adjacent_node, edge_weight); - } else { - return l(edge, adjacent_node, _edge_weights[edge]); + if (edge == last_edge) [[unlikely]] { + return false; } - } else { - return l(edge, adjacent_node); } - }; + } - const NodeID interval_count = *((NodeID *)data); - data += sizeof(NodeID); + const SignedNodeID first_gap = signed_varint_decode(&node_data); + const NodeID first_adjacent_node = static_cast(first_gap + node); + INVOKE_CALLBACK(edge, first_adjacent_node); + edge += 1; - NodeID previous_right_extreme = 2; - for (NodeID i = 0; i < interval_count; ++i) { - const auto [left_extreme_gap, left_extreme_gap_len] = varint_decode(data); - data += left_extreme_gap_len; + if constexpr (kRunLengthEncoding) { + const NodeID num_remaining_gaps = static_cast(last_edge - edge); + VarIntRunLengthDecoder rl_decoder(num_remaining_gaps, node_data); - const auto [interval_length_gap, interval_length_gap_len] = varint_decode(data); - data += interval_length_gap_len; + bool stop = false; + NodeID prev_adjacent_node = first_adjacent_node; + rl_decoder.decode([&](const NodeID gap) { + const NodeID adjacent_node = gap + prev_adjacent_node + 1; + prev_adjacent_node = adjacent_node; - const NodeID cur_left_extreme = left_extreme_gap + previous_right_extreme - 2; - const NodeID cur_interval_len = interval_length_gap + kIntervalLengthTreshold; - previous_right_extreme = cur_left_extreme + cur_interval_len - 1; + if constexpr (kHasEdgeWeights) { + EdgeWeight edge_weight = _edge_weights[edge]; - for (NodeID j = 0; j < cur_interval_len; ++j) { - if constexpr (kNonStoppable) { - invoke_caller(cur_left_extreme + j); + if constexpr (kNonStoppable) { + callback(edge++, adjacent_node, edge_weight); + } else { + stop = callback(edge++, adjacent_node, edge_weight); + return stop; + } } else { - const bool stop = invoke_caller(cur_left_extreme + j); - if (stop) [[unlikely]] { - return true; + if constexpr (kNonStoppable) { + callback(edge++, adjacent_node); + } else { + stop = callback(edge++, adjacent_node); + return stop; } } + }); - edge += 1; - } - } - - return false; - } - - template - bool decode_gaps( - const std::uint8_t *data, - NodeID node, - EdgeID &edge, - EdgeWeight &prev_edge_weight, - const EdgeID max_edge, - Lambda &&l - ) const { - using LambdaReturnType = std::conditional_t< - kHasEdgeWeights, - std::invoke_result, - std::invoke_result>::type; - constexpr bool kNonStoppable = std::is_void_v; - - const auto invoke_caller = [&](const NodeID adjacent_node) { - if constexpr (kHasEdgeWeights) { - if constexpr (kCompressEdgeWeights) { - const auto [edge_weight_gap, length] = signed_varint_decode(data); - data += length; - - const EdgeWeight edge_weight = edge_weight_gap + prev_edge_weight; - prev_edge_weight = edge_weight; - return l(edge, adjacent_node, edge_weight); + return stop; + } else if constexpr (kStreamVByteEncoding) { + const NodeID num_remaining_gaps = static_cast(last_edge - edge); + + if (num_remaining_gaps >= kStreamVByteThreshold) { + bool stop = 
false; + + if constexpr (kHasEdgeWeights) { + StreamVByteGapAndWeightsDecoder decoder(num_remaining_gaps * 2, node_data); + decoder.decode([&](const NodeID adjacent_node, const EdgeWeight edge_weight) { + if constexpr (kNonStoppable) { + callback(edge++, adjacent_node, edge_weight); + } else { + stop = callback(edge++, adjacent_node, edge_weight); + return stop; + } + }); } else { - return l(edge, adjacent_node, _edge_weights[edge]); + StreamVByteGapDecoder decoder(num_remaining_gaps, node_data); + decoder.decode([&](const NodeID adjacent_node) { + if constexpr (kNonStoppable) { + callback(edge++, adjacent_node); + } else { + stop = callback(edge++, adjacent_node); + return stop; + } + }); } - } else { - return l(edge, adjacent_node); - } - }; - - const auto [first_gap, first_gap_len] = signed_varint_decode(data); - data += first_gap_len; - const NodeID first_adjacent_node = static_cast(first_gap + node); - NodeID prev_adjacent_node = first_adjacent_node; - - if constexpr (kNonStoppable) { - invoke_caller(first_adjacent_node); - } else { - const bool stop = invoke_caller(first_adjacent_node); - if (stop) [[unlikely]] { - return true; + return stop; } } - edge += 1; - const auto handle_gap = [&](const NodeID gap) { + NodeID prev_adjacent_node = first_adjacent_node; + while (edge < last_edge) { + const NodeID gap = varint_decode(&node_data); const NodeID adjacent_node = gap + prev_adjacent_node + 1; - prev_adjacent_node = adjacent_node; - if constexpr (kNonStoppable) { - invoke_caller(adjacent_node); - edge += 1; - } else { - const bool stop = invoke_caller(adjacent_node); - edge += 1; - return stop; - } - }; + INVOKE_CALLBACK(edge, adjacent_node); + prev_adjacent_node = adjacent_node; + edge += 1; + } - if constexpr (kRunLengthEncoding) { - VarIntRunLengthDecoder rl_decoder(data, max_edge - edge); - rl_decoder.decode(handle_gap); - } else if constexpr (kStreamEncoding) { - VarIntStreamDecoder sv_encoder(data, max_edge - edge); - sv_encoder.decode(handle_gap); - } else { - while (edge != max_edge) { - const auto [gap, gap_len] = varint_decode(data); - data += gap_len; + return false; + } - const NodeID adjacent_node = gap + prev_adjacent_node + 1; - prev_adjacent_node = adjacent_node; +private: + CompactStaticArray _nodes; + StaticArray _compressed_edges; + StaticArray _edge_weights; - if constexpr (kNonStoppable) { - invoke_caller(adjacent_node); - } else { - const bool stop = invoke_caller(adjacent_node); - if (stop) [[unlikely]] { - return true; - } - } + EdgeID _num_edges; + NodeID _max_degree; - edge += 1; - } - } + bool _has_edge_weights; + EdgeWeight _total_edge_weight; - return false; - } + std::size_t _num_high_degree_nodes; + std::size_t _num_high_degree_parts; + std::size_t _num_interval_nodes; + std::size_t _num_intervals; }; } // namespace kaminpar diff --git a/kaminpar-common/graph-compression/compressed_neighborhoods_builder.h b/kaminpar-common/graph-compression/compressed_neighborhoods_builder.h index 229f10e8..a9e584e9 100644 --- a/kaminpar-common/graph-compression/compressed_neighborhoods_builder.h +++ b/kaminpar-common/graph-compression/compressed_neighborhoods_builder.h @@ -89,14 +89,14 @@ class CompressedNeighborhoodsBuilder { const EdgeID last_edge = _num_edges; std::uint8_t *compressed_edges_end = compressed_edges.get() + compressed_edges_size; if constexpr (CompressedNeighborhoods::kIntervalEncoding) { - compressed_edges_size += marked_varint_encode(last_edge, false, compressed_edges_end); + marked_varint_encode(last_edge, false, &compressed_edges_end); } else { - 
compressed_edges_size += varint_encode(last_edge, compressed_edges_end); + varint_encode(last_edge, &compressed_edges_end); } // Add an additional 15 bytes to the compressed edge array when stream encoding is enabled to // avoid a possible segmentation fault as the stream decoder reads 16-byte chunks. - if constexpr (CompressedNeighborhoods::kStreamEncoding) { + if constexpr (CompressedNeighborhoods::kStreamVByteEncoding) { compressed_edges_size += 15; } @@ -257,14 +257,14 @@ class ParallelCompressedNeighborhoodsBuilder { std::uint8_t *_compressed_edges_end = _compressed_edges.get() + _compressed_edges_size; const EdgeID last_edge = _num_edges; if constexpr (CompressedNeighborhoods::kIntervalEncoding) { - _compressed_edges_size += marked_varint_encode(last_edge, false, _compressed_edges_end); + marked_varint_encode(last_edge, false, &_compressed_edges_end); } else { - _compressed_edges_size += varint_encode(last_edge, _compressed_edges_end); + varint_encode(last_edge, &_compressed_edges_end); } // Add an additional 15 bytes to the compressed edge array when stream encoding is enabled to // avoid a possible segmentation fault as the stream decoder reads 16-byte chunks. - if constexpr (CompressedNeighborhoods::kStreamEncoding) { + if constexpr (CompressedNeighborhoods::kStreamVByteEncoding) { _compressed_edges_size += 15; } @@ -281,7 +281,7 @@ class ParallelCompressedNeighborhoodsBuilder { _max_degree, _num_edges, _has_edge_weights, - _total_edge_weight, + _has_edge_weights ? _total_edge_weight : _num_edges, _num_high_degree_nodes, _num_high_degree_parts, _num_interval_nodes, diff --git a/kaminpar-common/graph-compression/streamvbyte.h b/kaminpar-common/graph-compression/streamvbyte.h new file mode 100644 index 00000000..3d0d3f9c --- /dev/null +++ b/kaminpar-common/graph-compression/streamvbyte.h @@ -0,0 +1,899 @@ +/******************************************************************************* + * Endoder and decoder for StreamVByte. + * + * @file: streamvbyte.h + * @author: Daniel Salwasser + * @date: 29.12.2023 + ******************************************************************************/ +#pragma once + +#include +#include +#include +#include +#include + +#if defined(__x86_64__) +#include +#elif defined(__aarch64__) +#include +#endif + +#include "kaminpar-common/constexpr_utils.h" +#include "kaminpar-common/math.h" + +namespace kaminpar::streamvbyte { + +enum class DifferentialCodingKind { + NONE, + D1, + D2, + DM, + D4, +}; + +template +class StreamVByteEncoder { + static constexpr std::size_t kIntByteWidth = sizeof(Int); + static_assert( + kIntByteWidth == 4 || kIntByteWidth == 8, + "StreamVByte only supports 32-bit or 64-bit integers." 
+ ); + + [[nodiscard]] static std::size_t required_byte_width(const Int value) { + if constexpr (kIntByteWidth == 4) { + return math::byte_width(value); + } else if constexpr (kIntByteWidth == 8) { + switch (math::byte_width(value)) { + case 1: + return 1; + case 2: + return 2; + case 3: + [[fallthrough]]; + case 4: + return 4; + case 5: + [[fallthrough]]; + case 6: + [[fallthrough]]; + case 7: + [[fallthrough]]; + case 8: + return 8; + default: + __builtin_unreachable(); + } + } else { + static_assert("Unexpected integer width."); + } + } + + [[nodiscard]] static std::uint8_t encoded_byte_width(const Int value) { + if constexpr (kIntByteWidth == 4) { + return required_byte_width(value) - 1; + } else if constexpr (kIntByteWidth == 8) { + switch (required_byte_width(value)) { + case 1: + return 0; + case 2: + return 1; + case 4: + return 2; + case 8: + return 3; + default: + __builtin_unreachable(); + } + } else { + static_assert("Unexpected integer width."); + } + } + +public: + explicit StreamVByteEncoder(const std::size_t num_values, std::uint8_t *ptr) + : _num_values(num_values), + _control_bytes_ptr(ptr), + _data_ptr(ptr + math::div_ceil(num_values, 4)), + _num_buffered(0), + _prev_value(0), + _prev2_value(0), + _prev3_value(0), + _prev4_value(0), + _prev_max_value(0), + _next_max_value(0) { + std::fill(std::begin(_buffer), std::end(_buffer), 0); + } + + std::size_t add(Int value) { + if constexpr (GapKind == DifferentialCodingKind::D1) { + const Int next_prev_value = value; + value = value - _prev_value; + + _prev_value = next_prev_value; + } else if constexpr (GapKind == DifferentialCodingKind::D2) { + const Int next_prev_value = value; + value = value - _prev2_value; + + _prev2_value = _prev_value; + _prev_value = next_prev_value; + } else if constexpr (GapKind == DifferentialCodingKind::DM) { + _next_max_value = std::max(_prev_max_value, value); + value = value - _prev_max_value; + } else if constexpr (GapKind == DifferentialCodingKind::D4) { + const Int next_prev_value = value; + value = value - _prev4_value; + + _prev4_value = _prev3_value; + _prev3_value = _prev2_value; + _prev2_value = _prev_value; + _prev_value = next_prev_value; + } + + _buffer[_num_buffered] = value; + + if (_num_buffered == 3) { + if constexpr (GapKind == DifferentialCodingKind::DM) { + _prev_max_value = _next_max_value; + _next_max_value = 0; + } + + unchecked_flush(); + return required_byte_width(value); + } + + const bool first_element = _num_buffered++ == 0; + return required_byte_width(value) + (first_element ? 
1 : 0); + } + + std::uint8_t *flush() { + if (_num_buffered > 0) [[likely]] { + unchecked_flush(_num_buffered); + } + + return _data_ptr; + } + +private: + std::size_t _num_values; + std::uint8_t *_control_bytes_ptr; + std::uint8_t *_data_ptr; + + std::size_t _num_buffered; + std::array _buffer; + + Int _prev_value; + Int _prev2_value; + Int _prev3_value; + Int _prev4_value; + + Int _prev_max_value; + Int _next_max_value; + +private: + void unchecked_flush(const std::size_t num_values = 4) { + const std::uint8_t control_byte = + (encoded_byte_width(_buffer[3]) << 6) | (encoded_byte_width(_buffer[2]) << 4) | + (encoded_byte_width(_buffer[1]) << 2) | encoded_byte_width(_buffer[0]); + *_control_bytes_ptr++ = control_byte; + + for (std::size_t i = 0; i < num_values; ++i) { + Int value = _buffer[i]; + + do { + *_data_ptr++ = static_cast(value); + value >>= 8; + } while (value > 0); + + if constexpr (kIntByteWidth == 8) { + std::size_t num_padding_bytes = required_byte_width(value) - math::byte_width(value); + while (num_padding_bytes > 0) { + *_data_ptr++ = static_cast(0); + num_padding_bytes -= 1; + } + } + } + + _num_buffered = 0; + std::fill(std::begin(_buffer), std::end(_buffer), 0); + } +}; + +template < + std::integral Int, + bool PassPairs = false, + DifferentialCodingKind GapKind = DifferentialCodingKind::NONE> +class StreamVByteDecoder { + static constexpr std::size_t kIntByteWidth = sizeof(Int); + static_assert( + kIntByteWidth == 4 || kIntByteWidth == 8, + "StreamVByte only supports 32-bit or 64-bit integers." + ); + + static constexpr bool k32BitInts = kIntByteWidth == 4; + using LengthTable = + std::conditional_t, std::array>; + using ShuffleTable = std::conditional_t< + k32BitInts, + std::array, 256>, + std::array, 16>>; + + [[nodiscard]] static consteval LengthTable create_length_table() { + LengthTable length_table{}; + + if constexpr (k32BitInts) { + constexpr_for<256>([&](const std::uint8_t control_byte) { + length_table[control_byte] = 0; + + constexpr_for<4>([&](const std::uint8_t i) { + const std::uint8_t header = (control_byte >> (2 * i)) & 0b11; + const std::uint8_t length = header + 1; + length_table[control_byte] += length; + }); + }); + } else { + const auto actual_length = [&](const std::uint8_t header) { + switch (header) { + case 0: + return 1; + case 1: + return 2; + case 2: + return 4; + case 3: + return 8; + default: + __builtin_unreachable(); + } + }; + + constexpr_for<16>([&](const std::uint8_t control_byte) { + length_table[control_byte] = 0; + + constexpr_for<2>([&](const std::uint8_t i) { + const std::uint8_t header = (control_byte >> (2 * i)) & 0b11; + const std::uint8_t length = actual_length(header); + length_table[control_byte] += length; + }); + }); + } + + return length_table; + } + + [[nodiscard]] static consteval ShuffleTable create_shuffle_table() { + ShuffleTable shuffle_table{}; + + if constexpr (k32BitInts) { + constexpr_for<256>([&](const std::uint8_t control_byte) { + std::uint8_t byte = 0; + std::uint8_t pos = 0; + + constexpr_for<4>([&](const std::uint8_t i) { + const std::uint8_t header = (control_byte >> (2 * i)) & 0b11; + const std::uint8_t length = header + 1; + + std::uint8_t j = 0; + while (j < length) { + shuffle_table[control_byte][pos++] = byte++; + j += 1; + } + + while (j < 4) { + shuffle_table[control_byte][pos++] = 0b11111111; + j += 1; + } + }); + }); + } else { + const auto actual_length = [&](const std::uint8_t value) { + switch (value) { + case 0: + return 1; + case 1: + return 2; + case 2: + return 4; + case 3: + return 8; + 
default: + __builtin_unreachable(); + } + }; + + constexpr_for<16>([&](const std::uint8_t control_byte) { + std::uint8_t byte = 0; + std::uint8_t pos = 0; + + constexpr_for<2>([&](const std::uint8_t i) { + const std::uint8_t header = (control_byte >> (2 * i)) & 0b11; + const std::uint8_t length = actual_length(header); + + std::uint8_t j = 0; + while (j < length) { + shuffle_table[control_byte][pos++] = byte++; + j += 1; + } + + while (j < 8) { + shuffle_table[control_byte][pos++] = 0b11111111; + j += 1; + } + }); + }); + } + + return shuffle_table; + } + + static constexpr const LengthTable kLengthTable = create_length_table(); + static constexpr const ShuffleTable kShuffleTable = create_shuffle_table(); + +public: + explicit StreamVByteDecoder(const std::size_t num_values, const std::uint8_t *ptr) + : _num_control_bytes(num_values / 4), + _control_bytes_ptr(ptr), + _num_values(num_values), + _data_ptr(ptr + _num_control_bytes + ((num_values % 4) != 0)) {} + + template void decode(Lambda &&l) { + if constexpr (k32BitInts) { + decode32(std::forward(l)); + } else { + decode64(std::forward(l)); + } + } + + [[nodiscard]] const std::uint8_t *get() { + return _data_ptr; + } + +private: +#if defined(__x86_64__) + template void decode32(Lambda &&l) { + static_assert(std::is_invocable_v || PassPairs && std::is_invocable_v); + + using LambdaReturnType = std::conditional_t< + PassPairs, + std::invoke_result, + std::invoke_result>::type; + constexpr bool kNonStoppable = std::is_void_v; + + __m128i prev = _mm_setzero_si128(); + const auto decode_gaps = [&](__m128i data) { + if constexpr (GapKind == DifferentialCodingKind::NONE) { + prev = data; + return; + } + + if constexpr (GapKind == DifferentialCodingKind::D1) { + const __m128i temp = _mm_add_epi32(_mm_slli_si128(data, 8), data); + prev = _mm_add_epi32( + _mm_add_epi32(_mm_slli_si128(temp, 4), temp), _mm_shuffle_epi32(prev, 0xff) + ); + } else if constexpr (GapKind == DifferentialCodingKind::D2) { + prev = _mm_add_epi32( + _mm_add_epi32(_mm_slli_si128(data, 8), data), + _mm_shuffle_epi32(prev, _MM_SHUFFLE(3, 2, 3, 2)) + ); + } else if constexpr (GapKind == DifferentialCodingKind::DM) { + prev = _mm_add_epi32(data, _mm_shuffle_epi32(prev, 0xff)); + } else if constexpr (GapKind == DifferentialCodingKind::D4) { + prev = _mm_add_epi32(data, prev); + } else { + static_assert("Unexpected differential coding kind."); + } + }; + + for (std::size_t i = 0; i < _num_control_bytes; ++i) { + const std::uint8_t control_byte = _control_bytes_ptr[i]; + const std::uint8_t length = kLengthTable[control_byte]; + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + _data_ptr += length; + + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + const __m128i mask = _mm_loadu_si128((const __m128i *)shuffle_mask); + + data = _mm_shuffle_epi8(data, mask); + decode_gaps(data); + + if constexpr (kNonStoppable) { + if constexpr (PassPairs) { + l(_mm_extract_epi32(prev, 0), _mm_extract_epi32(prev, 1)); + l(_mm_extract_epi32(prev, 2), _mm_extract_epi32(prev, 3)); + } else { + l(_mm_extract_epi32(prev, 0)); + l(_mm_extract_epi32(prev, 1)); + l(_mm_extract_epi32(prev, 2)); + l(_mm_extract_epi32(prev, 3)); + } + } else { + if constexpr (PassPairs) { + if (l(_mm_extract_epi32(prev, 0), _mm_extract_epi32(prev, 1))) [[unlikely]] { + return; + } + + if (l(_mm_extract_epi32(prev, 2), _mm_extract_epi32(prev, 3))) [[unlikely]] { + return; + } + } else { + if (l(_mm_extract_epi32(prev, 0))) [[unlikely]] { + return; + } + + if (l(_mm_extract_epi32(prev, 1))) 
[[unlikely]] { + return; + } + + if (l(_mm_extract_epi32(prev, 2))) [[unlikely]] { + return; + } + + if (l(_mm_extract_epi32(prev, 3))) [[unlikely]] { + return; + } + } + } + } + + if constexpr (PassPairs) { + if (_num_values % 4 == 2) { + const std::uint8_t control_byte = _control_bytes_ptr[_num_control_bytes]; + const std::uint8_t length = kLengthTable[control_byte]; + + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + const __m128i mask = _mm_loadu_si128((const __m128i *)shuffle_mask); + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + _data_ptr += length - 2; + + data = _mm_shuffle_epi8(data, mask); + decode_gaps(data); + + if constexpr (kNonStoppable) { + l(_mm_extract_epi32(prev, 0), _mm_extract_epi32(prev, 1)); + } else { + if (l(_mm_extract_epi32(prev, 0), _mm_extract_epi32(prev, 1))) [[unlikely]] { + return; + } + } + } + } else { + switch (_num_values % 4) { + case 1: { + const std::uint8_t control_byte = _control_bytes_ptr[_num_control_bytes]; + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + const __m128i mask = _mm_loadu_si128((const __m128i *)shuffle_mask); + + data = _mm_shuffle_epi8(data, mask); + decode_gaps(data); + + if constexpr (kNonStoppable) { + l(_mm_extract_epi32(prev, 0)); + } else { + if (l(_mm_extract_epi32(prev, 0))) [[unlikely]] { + return; + } + } + break; + } + case 2: { + const std::uint8_t control_byte = _control_bytes_ptr[_num_control_bytes]; + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + const __m128i mask = _mm_loadu_si128((const __m128i *)shuffle_mask); + + data = _mm_shuffle_epi8(data, mask); + decode_gaps(data); + + if constexpr (kNonStoppable) { + l(_mm_extract_epi32(prev, 0)); + l(_mm_extract_epi32(prev, 1)); + } else { + if (l(_mm_extract_epi32(prev, 0))) [[unlikely]] { + return; + } + + if (l(_mm_extract_epi32(prev, 1))) [[unlikely]] { + return; + } + } + break; + } + case 3: { + const std::uint8_t control_byte = _control_bytes_ptr[_num_control_bytes]; + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + const __m128i mask = _mm_loadu_si128((const __m128i *)shuffle_mask); + + data = _mm_shuffle_epi8(data, mask); + decode_gaps(data); + + if constexpr (kNonStoppable) { + l(_mm_extract_epi32(prev, 0)); + l(_mm_extract_epi32(prev, 1)); + l(_mm_extract_epi32(prev, 2)); + } else { + if (l(_mm_extract_epi32(prev, 0))) [[unlikely]] { + return; + } + + if (l(_mm_extract_epi32(prev, 1))) [[unlikely]] { + return; + } + + if (l(_mm_extract_epi32(prev, 2))) [[unlikely]] { + return; + } + } + break; + } + } + } + } +#elif defined(__aarch64__) + template void decode32(Lambda &&l) { + static_assert(std::is_invocable_v || PassPairs && std::is_invocable_v); + + using LambdaReturnType = std::conditional_t< + PassPairs, + std::invoke_result, + std::invoke_result>::type; + constexpr bool kNonStoppable = std::is_void_v; + + uint32x4_t prev = vmovq_n_u32(0); + const auto decode_gaps = [&](__m128i data) { + if constexpr (GapKind == DifferentialCodingKind::NONE) { + prev = data; + return; + } else { + static_assert("Unsupported differential coding kind (ARM)."); + } + }; + + for (std::size_t i = 0; i < _num_control_bytes; ++i) { + const std::uint8_t control_byte = _control_bytes_ptr[i]; + const std::uint8_t length = kLengthTable[control_byte]; + + uint32x4_t data = 
vld1q_u32(reinterpret_cast(_data_ptr)); + _data_ptr += length; + + const uint8x16_t shuffle_mask = vld1q_u8(kShuffleTable[control_byte].data()); + data = vqtbl1q_u8(data, shuffle_mask); + + decode_gaps(data); + + if constexpr (kNonStoppable) { + if constexpr (PassPairs) { + l(vgetq_lane_u32(prev, 0), vgetq_lane_u32(prev, 1)); + l(vgetq_lane_u32(prev, 2), vgetq_lane_u32(prev, 3)); + } else { + l(vgetq_lane_u32(prev, 0)); + l(vgetq_lane_u32(prev, 1)); + l(vgetq_lane_u32(prev, 2)); + l(vgetq_lane_u32(prev, 3)); + } + } else { + if constexpr (PassPairs) { + if (l(vgetq_lane_u32(prev, 0), vgetq_lane_u32(prev, 1))) [[unlikely]] { + return; + } + + if (l(vgetq_lane_u32(prev, 2), vgetq_lane_u32(prev, 3))) [[unlikely]] { + return; + } + } else { + if (l(vgetq_lane_u32(prev, 0))) [[unlikely]] { + return; + } + + if (l(vgetq_lane_u32(prev, 1))) [[unlikely]] { + return; + } + + if (l(vgetq_lane_u32(prev, 2))) [[unlikely]] { + return; + } + + if (l(vgetq_lane_u32(prev, 3))) [[unlikely]] { + return; + } + } + } + } + + if constexpr (PassPairs) { + if (_num_values % 4 == 2) { + const std::uint8_t control_byte = _control_bytes_ptr[_num_control_bytes]; + const std::uint8_t length = kLengthTable[control_byte]; + + uint32x4_t data = vld1q_u32(reinterpret_cast(_data_ptr)); + _data_ptr += length; + + const uint8x16_t shuffle_mask = vld1q_u8(kShuffleTable[control_byte].data()); + data = vqtbl1q_u8(data, shuffle_mask); + + decode_gaps(data); + + if constexpr (kNonStoppable) { + l(vgetq_lane_u32(prev, 0), vgetq_lane_u32(prev, 1)); + } else { + if (l(vgetq_lane_u32(prev, 0), vgetq_lane_u32(prev, 1))) [[unlikely]] { + return; + } + } + } + } else { + switch (_num_values % 4) { + case 1: { + const std::uint8_t control_byte = _control_bytes_ptr[_num_control_bytes]; + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + + uint32x4_t data = vld1q_u32(reinterpret_cast(_data_ptr)); + _data_ptr += length; + + const uint8x16_t shuffle_mask = vld1q_u8(kShuffleTable[control_byte].data()); + data = vqtbl1q_u8(data, shuffle_mask); + + decode_gaps(data); + + if constexpr (kNonStoppable) { + l(vgetq_lane_u32(prev, 0)); + } else { + if (l(vgetq_lane_u32(prev, 0))) [[unlikely]] { + return; + } + } + break; + } + case 2: { + const std::uint8_t control_byte = _control_bytes_ptr[_num_control_bytes]; + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + + uint32x4_t data = vld1q_u32(reinterpret_cast(_data_ptr)); + _data_ptr += length; + + const uint8x16_t shuffle_mask = vld1q_u8(kShuffleTable[control_byte].data()); + data = vqtbl1q_u8(data, shuffle_mask); + + decode_gaps(data); + + if constexpr (kNonStoppable) { + l(vgetq_lane_u32(prev, 0)); + l(vgetq_lane_u32(prev, 1)); + } else { + if (l(vgetq_lane_u32(prev, 0))) [[unlikely]] { + return; + } + + if (l(vgetq_lane_u32(prev, 1))) [[unlikely]] { + return; + } + } + break; + } + case 3: { + const std::uint8_t control_byte = _control_bytes_ptr[_num_control_bytes]; + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + + uint32x4_t data = vld1q_u32(reinterpret_cast(_data_ptr)); + _data_ptr += length; + + const uint8x16_t shuffle_mask = vld1q_u8(kShuffleTable[control_byte].data()); + data = vqtbl1q_u8(data, shuffle_mask); + + decode_gaps(data); + + if constexpr (kNonStoppable) { + l(vgetq_lane_u32(prev, 0)); + l(vgetq_lane_u32(prev, 1)); + l(vgetq_lane_u32(prev, 2)); + } else { + if (l(vgetq_lane_u32(prev, 0))) [[unlikely]] { + return; + } + + if (l(vgetq_lane_u32(prev, 1))) [[unlikely]] { + return; + } + + if 
(l(vgetq_lane_u32(prev, 2))) [[unlikely]] { + return; + } + } + break; + } + } + } + } +#else +#error "Only x64 and ARM are supported" +#endif + +#if defined(__x86_64__) + template void decode64(Lambda &&l) { + static_assert(std::is_invocable_v); + constexpr bool kNonStoppable = std::is_void_v>; + + __m128i prev = _mm_setzero_si128(); + const auto decode_gaps = [&](__m128i data) { + if constexpr (GapKind == DifferentialCodingKind::NONE) { + prev = data; + return; + } + + if constexpr (GapKind == DifferentialCodingKind::D1) { + const __m128i temp = _mm_add_epi64(_mm_slli_si128(data, 8), data); + prev = _mm_add_epi32( + _mm_add_epi32(_mm_slli_si128(temp, 4), temp), _mm_shuffle_epi32(prev, 0xff) + ); + } else if constexpr (GapKind == DifferentialCodingKind::D2) { + prev = _mm_add_epi32( + _mm_add_epi32(_mm_slli_si128(data, 8), data), + _mm_shuffle_epi32(prev, _MM_SHUFFLE(3, 2, 3, 2)) + ); + } else if constexpr (GapKind == DifferentialCodingKind::DM) { + prev = _mm_add_epi32(data, _mm_shuffle_epi32(prev, 0xff)); + } else if constexpr (GapKind == DifferentialCodingKind::D4) { + prev = _mm_add_epi32(data, prev); + } else { + static_assert("Unexpected differential coding kind."); + } + }; + + for (std::size_t i = 0; i < _num_control_bytes; ++i) { + const std::uint8_t control_byte = _control_bytes_ptr[i]; + const std::uint8_t control_byte_lh = control_byte & 0b1111; + const std::uint8_t control_byte_uh = control_byte >> 4; + + const std::uint8_t length1 = kLengthTable[control_byte_lh]; + const std::uint8_t length2 = kLengthTable[control_byte_uh]; + + __m128i data1 = _mm_loadu_si128((const __m128i *)_data_ptr); + _data_ptr += length1; + + __m128i data2 = _mm_loadu_si128((const __m128i *)_data_ptr); + _data_ptr += length2; + + const std::uint8_t *shuffle_mask1 = kShuffleTable[control_byte_lh].data(); + const __m128i mask1 = _mm_loadu_si128((const __m128i *)shuffle_mask1); + + const std::uint8_t *shuffle_mask2 = kShuffleTable[control_byte_uh].data(); + const __m128i mask2 = _mm_loadu_si128((const __m128i *)shuffle_mask2); + + data1 = _mm_shuffle_epi8(data1, mask1); + data2 = _mm_shuffle_epi8(data2, mask2); + + if constexpr (GapKind == DifferentialCodingKind::NONE) { + if constexpr (kNonStoppable) { + l(_mm_extract_epi64(data1, 0)); + l(_mm_extract_epi64(data1, 1)); + l(_mm_extract_epi64(data2, 0)); + l(_mm_extract_epi64(data2, 1)); + } else { + if (l(_mm_extract_epi64(data1, 0))) [[unlikely]] { + return; + } + + if (l(_mm_extract_epi64(data1, 1))) [[unlikely]] { + return; + } + + if (l(_mm_extract_epi64(data2, 0))) [[unlikely]] { + return; + } + + if (l(_mm_extract_epi64(data2, 1))) [[unlikely]] { + return; + } + } + } else { + decode_gaps(data1); + } + } + + switch (_num_values % 4) { + case 1: { + const std::uint8_t control_byte = _control_bytes_ptr[_num_control_bytes]; + const std::uint8_t control_byte_lh = control_byte & 0b1111; + + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte_lh].data(); + const __m128i mask = _mm_loadu_si128((const __m128i *)shuffle_mask); + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + data = _mm_shuffle_epi8(data, mask); + + if constexpr (kNonStoppable) { + l(_mm_extract_epi64(data, 0)); + } else { + if (l(_mm_extract_epi64(data, 0))) [[unlikely]] { + return; + } + } + break; + } + case 2: { + const std::uint8_t control_byte = _control_bytes_ptr[_num_control_bytes]; + const std::uint8_t control_byte_lh = control_byte & 0b1111; + + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte_lh].data(); + const __m128i mask = 
_mm_loadu_si128((const __m128i *)shuffle_mask); + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + data = _mm_shuffle_epi8(data, mask); + + if constexpr (kNonStoppable) { + l(_mm_extract_epi64(data, 0)); + l(_mm_extract_epi64(data, 1)); + } else { + if (l(_mm_extract_epi64(data, 0))) [[unlikely]] { + return; + } + + if (l(_mm_extract_epi64(data, 1))) [[unlikely]] { + return; + } + } + break; + } + case 3: { + const std::uint8_t control_byte = _control_bytes_ptr[_num_control_bytes]; + const std::uint8_t control_byte_lh = control_byte & 0b1111; + const std::uint8_t control_byte_uh = control_byte >> 4; + + const std::uint8_t length1 = kLengthTable[control_byte_lh]; + __m128i data1 = _mm_loadu_si128((const __m128i *)_data_ptr); + + _data_ptr += length1; + __m128i data2 = _mm_loadu_si128((const __m128i *)_data_ptr); + + const std::uint8_t *shuffle_mask1 = kShuffleTable[control_byte_lh].data(); + const __m128i mask1 = _mm_loadu_si128((const __m128i *)shuffle_mask1); + + const std::uint8_t *shuffle_mask2 = kShuffleTable[control_byte_uh].data(); + const __m128i mask2 = _mm_loadu_si128((const __m128i *)shuffle_mask2); + + data1 = _mm_shuffle_epi8(data1, mask1); + data2 = _mm_shuffle_epi8(data2, mask2); + + if constexpr (kNonStoppable) { + l(_mm_extract_epi64(data1, 0)); + l(_mm_extract_epi64(data1, 1)); + l(_mm_extract_epi64(data2, 0)); + } else { + if (l(_mm_extract_epi64(data1, 0))) [[unlikely]] { + return; + } + + if (l(_mm_extract_epi64(data1, 1))) [[unlikely]] { + return; + } + + if (l(_mm_extract_epi64(data2, 0))) [[unlikely]] { + return; + } + } + break; + } + } + } +#elif defined(__aarch64__) + template void decode64(Lambda &&l) { + static_assert("Unsupported streamvbyte configuration (ARM)."); + } +#endif + +private: + const std::size_t _num_control_bytes; + const std::uint8_t *_control_bytes_ptr; + + const std::size_t _num_values; + const std::uint8_t *_data_ptr; +}; + +} // namespace kaminpar::streamvbyte diff --git a/kaminpar-common/graph-compression/varint.h b/kaminpar-common/graph-compression/varint.h new file mode 100644 index 00000000..0a45a158 --- /dev/null +++ b/kaminpar-common/graph-compression/varint.h @@ -0,0 +1,511 @@ +/******************************************************************************* + * Encoding and decoding methods for VarInts. + * + * @file: varint.h + * @author: Daniel Salwasser + * @date: 11.11.2023 + ******************************************************************************/ +#pragma once + +#include +#include +#include +#include + +#ifdef KAMINPAR_COMPRESSION_FAST_DECODING +#include +#endif + +namespace kaminpar { + +/*! + * Returns the maximum number of bytes that a VarInt needs to be stored. + * + * @tparam Int The type of integer whose encoded maximum length is returned. + */ +template [[nodiscard]] constexpr std::size_t varint_max_length() { + return (sizeof(Int) * 8) / 7 + 1; +} + +/*! + * Returns the number of bytes a VarInt needs to be stored. + * + * @tparam Int The type of integer whose encoded length is returned. + * @param Int The integer to store. + * @return The number of bytes the integer needs to be stored. + */ +template [[nodiscard]] std::size_t varint_length(Int i) { + std::size_t len = 1; + + while (i > 0b01111111) { + i >>= 7; + len++; + } + + return len; +} + +/*! + * Writes an integer to a memory location as a VarInt. + * + * @tparam Int The type of integer to encode. + * @param Int The integer to store. + * @param ptr The pointer to the memory location to write the integer to. 
+ * @return The number of bytes that the integer occupies at the memory location. + */ +template std::size_t varint_encode(Int i, std::uint8_t *ptr) { + std::size_t len = 1; + + while (i > 0b01111111) { + const std::uint8_t octet = (i & 0b01111111) | 0b10000000; + *ptr = octet; + + i >>= 7; + ptr += 1; + len += 1; + } + + const std::uint8_t last_octet = i & 0b01111111; + *ptr = last_octet; + + return len; +} + +/*! + * Writes an integer to a memory location as a VarInt. + * + * @tparam Int The type of integer to encode. + * @param Int The integer to store. + * @param ptr A pointer to the pointer to the memory location to write the integer to, which is + * incremented accordingly. + */ +template void varint_encode(Int i, std::uint8_t **ptr) { + while (i > 0b01111111) { + const std::uint8_t octet = (i & 0b01111111) | 0b10000000; + **ptr = octet; + + i >>= 7; + *ptr += 1; + } + + const std::uint8_t last_octet = i & 0b01111111; + **ptr = last_octet; + *ptr += 1; +} + +/*! + * Reads an integer encoded as a VarInt from a memory location. + * + * @tparam Int The type of integer to decode. + * @param ptr A pointer to the memory location to read the integer from. + * @return The decoded integer. + */ +template [[nodiscard]] Int varint_decode(const std::uint8_t *data) { + Int value = 0; + + Int shift = 0; + while (true) { + const std::uint8_t byte = *data; + + if ((byte & 0b10000000) == 0) { + value |= static_cast(byte) << shift; + break; + } else { + value |= static_cast(byte & 0b01111111) << shift; + } + + shift += 7; + data += 1; + } + + return value; +} + +/*! + * Reads an integer encoded as a VarInt from a memory location. + * + * @tparam Int The type of integer to decode. + * @param ptr A pointer to the pointer to the memory location to read the integer from, which is + * incremented accordingly. + * @return The decoded integer. + */ +template [[nodiscard]] Int varint_decode_loop(const std::uint8_t **data) { + Int value = 0; + + Int shift = 0; + while (true) { + const std::uint8_t octet = **data; + *data += 1; + + if ((octet & 0b10000000) == 0) { + value |= static_cast(octet) << shift; + break; + } else { + value |= static_cast(octet & 0b01111111) << shift; + } + + shift += 7; + } + + return value; +} + +/*! + * Reads an integer encoded as a VarInt from a memory location. + * + * @tparam Int The type of integer to decode. + * @param ptr A pointer to the pointer to the memory location to read the integer from, which is + * incremented accordingly. + * @return The decoded integer. 
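As a standalone illustration of the 7-bit-per-byte scheme that varint_encode and varint_decode_loop implement (a minimal sketch, not part of the patch; toy_encode and toy_decode are invented names), each emitted byte carries seven payload bits and uses the high bit as a continuation flag:

#include <cstdint>
#include <cstdio>
#include <vector>

// Minimal LEB128-style VarInt round trip mirroring the scheme above.
static std::vector<std::uint8_t> toy_encode(std::uint32_t value) {
  std::vector<std::uint8_t> out;
  while (value > 0x7F) {
    out.push_back(static_cast<std::uint8_t>((value & 0x7F) | 0x80)); // continuation bit set
    value >>= 7;
  }
  out.push_back(static_cast<std::uint8_t>(value)); // last byte: continuation bit clear
  return out;
}

static std::uint32_t toy_decode(const std::uint8_t *ptr) {
  std::uint32_t value = 0;
  std::uint32_t shift = 0;
  while (true) {
    const std::uint8_t byte = *ptr++;
    value |= static_cast<std::uint32_t>(byte & 0x7F) << shift;
    if ((byte & 0x80) == 0) {
      return value;
    }
    shift += 7;
  }
}

int main() {
  const auto bytes = toy_encode(300); // encodes as 0xAC 0x02
  std::printf("length=%zu decoded=%u\n", bytes.size(), toy_decode(bytes.data()));
}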
+ */ +template [[nodiscard]] Int varint_decode_pext_unrolled(const std::uint8_t **data) { +#ifdef KAMINPAR_COMPRESSION_FAST_DECODING + if constexpr (sizeof(Int) == 4) { + const std::uint8_t *data_ptr = *data; + if ((data_ptr[0] & 0b10000000) == 0) { + const std::uint32_t result = *data_ptr & 0b01111111; + *data += 1; + return result; + } + + if ((data_ptr[1] & 0b10000000) == 0) { + const std::uint32_t result = + _pext_u32(*reinterpret_cast(data_ptr), 0x7F7F); + *data += 2; + return result; + } + + if ((data_ptr[2] & 0b10000000) == 0) { + const std::uint32_t result = + _pext_u32(*reinterpret_cast(data_ptr), 0x7F7F7F); + *data += 3; + return result; + } + + if ((data_ptr[3] & 0b10000000) == 0) { + const std::uint32_t result = + _pext_u32(*reinterpret_cast(data_ptr), 0x7F7F7F7F); + *data += 4; + return result; + } + + const std::uint32_t result = static_cast( + _pext_u64(*reinterpret_cast(data_ptr), 0x7F7F7F7F7F) + ); + *data += 5; + return result; + } else if constexpr (sizeof(Int) == 8) { + if ((ptr[0] & 0b10000000) == 0) { + const std::uint64_t result = *ptr & 0b01111111; + *data += 1; + return result; + } + + if ((ptr[1] & 0b10000000) == 0) { + const std::uint64_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F); + *data += 2; + return result; + } + + if ((ptr[2] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F); + *data += 3; + return result; + } + + if ((ptr[3] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F7F); + *data += 4; + return result; + } + + if ((ptr[4] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F); + *data += 5; + return result; + } + + if ((ptr[5] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F); + *data += 6; + return result; + } + + if ((ptr[6] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F); + *data += 7; + return result; + } + + if ((ptr[7] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F7F); + *data += 8; + return result; + } + + if ((ptr[8] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F7F) | + (static_cast(ptr[8] & 0b01111111) << 56); + *data += 9; + return result; + } + + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F7F) | + (static_cast(ptr[8] & 0b01111111) << 56) | + (static_cast(ptr[9]) << 63); + *data += 10; + return result; + } +#else + return varint_decode_loop(data); +#endif +} + +/*! + * Reads an integer encoded as a VarInt from a memory location. + * + * @tparam Int The type of integer to decode. + * @param ptr A pointer to the pointer to the memory location to read the integer from, which is + * incremented accordingly. + * @return The decoded integer. 
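The unrolled fast path above relies on BMI2's parallel bit extract: once the encoded bytes are loaded as one little-endian word, a single PEXT drops the continuation bits and concatenates the 7-bit payloads. A standalone sketch of the two-byte case (assuming a BMI2-capable x86-64 CPU; the scalar branch is an equivalent fallback and the sample word is illustrative):

#include <cstdint>
#include <cstdio>
#ifdef __BMI2__
#include <immintrin.h>
#endif

// Why _pext_u32 with mask 0x7F7F decodes a two-byte VarInt: the two encoded
// bytes of 300 are 0xAC 0x02; read little-endian they form the word 0x02AC.
// PEXT keeps exactly the bits selected by the mask (the two 7-bit payloads)
// and packs them contiguously, yielding the decoded value in one instruction.
int main() {
  const std::uint32_t word = 0x02AC;
#ifdef __BMI2__
  const std::uint32_t decoded = _pext_u32(word, 0x7F7F);
#else
  // Scalar equivalent of the extraction for machines without BMI2.
  const std::uint32_t decoded = (word & 0x7F) | (((word >> 8) & 0x7F) << 7);
#endif
  std::printf("decoded=%u\n", decoded); // prints 300
}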
+ */ +template [[nodiscard]] Int varint_decode_pext_branchless(const std::uint8_t **data) { +#ifdef KAMINPAR_COMPRESSION_FAST_DECODING + if constexpr (sizeof(Int) == 4) { + const std::uint8_t *data_ptr = *data; + + const std::uint64_t word = *reinterpret_cast(data_ptr); + const std::uint64_t continuation_bits = ~word & 0x8080808080; + const std::uint64_t mask = continuation_bits ^ (continuation_bits - 1); + const std::uint64_t length = (std::countr_zero(continuation_bits) + 1) / 8; + + const Int result = _pext_u64(word & mask, 0x7F7F7F7F7F); + *data += length; + return result; + } +#else + return varint_decode_loop(data); +#endif +} + +/*! + * Reads an integer encoded as a VarInt from a memory location. + * + * @tparam Int The type of integer to decode. + * @param ptr A pointer to the pointer to the memory location to read the integer from, which is + * incremented accordingly. + * @return The decoded integer. + */ +template [[nodiscard]] Int varint_decode(const std::uint8_t **data) { + return varint_decode_pext_unrolled(data); +} + +/*! + * Returns the number of bytes a marked VarInt needs to be stored. + * + * @tparam Int The type of integer whose encoded length is returned. + * @param Int The integer to store. + * @return The number of bytes the integer needs to be stored. + */ +template [[nodiscard]] std::size_t marked_varint_length(Int i) { + std::size_t len = 1; + i >>= 6; + + if (i > 0) { + len += varint_length(i); + } + + return len; +} + +/*! + * Writes an integer to a memory location as a marked VarInt. + * + * @tparam Int The type of integer to encode. + * @param Int The integer to store. + * @param marker_set Whether the integer is marked. + * @param ptr The pointer to the memory location to write the integer to. + */ +template void marked_varint_encode(Int i, const bool marked, std::uint8_t **ptr) { + std::uint8_t first_octet = i & 0b00111111; + if (marked) { + first_octet |= 0b01000000; + } + + i >>= 6; + + if (i == 0) { + **ptr = first_octet; + *ptr += 1; + return; + } + + first_octet |= 0b10000000; + **ptr = first_octet; + *ptr += 1; + + varint_encode(i, ptr); +} + +/*! + * Reads an integer encoded as a marked VarInt from a memory location. + * + * @tparam Int The type of integer to decode. + * @param ptr A pointer to the memory location to read the integer from. + * @return A pair consisting of the decoded integer and whether the marker is set. + */ +template +[[nodiscard]] std::pair marked_varint_decode(const std::uint8_t *ptr) { + const std::uint8_t first_octet = *ptr; + ptr += 1; + + const bool is_continuation_bit_set = (first_octet & 0b10000000) != 0; + const bool is_marked = (first_octet & 0b01000000) != 0; + + Int result = first_octet & 0b00111111; + if (is_continuation_bit_set) { + Int shift = 6; + + while (true) { + const std::uint8_t octet = *ptr; + ptr += 1; + + if ((octet & 0b10000000) == 0) { + result |= static_cast(octet) << shift; + break; + } else { + result |= static_cast(octet & 0b01111111) << shift; + } + + shift += 7; + } + } + + return std::make_pair(result, is_marked); +} + +/*! + * Reads an integer encoded as a marked VarInt from a memory location. + * + * @tparam Int The type of integer to decode. + * @param ptr A pointer to the pointer to the memory location to read the integer from, which is + * incremented accordingly. + * @return A pair consisting of the decoded integer and whether the markes is set. 
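A small sketch of the first octet of a marked VarInt as documented above (illustrative only; the constants mirror the masks used by the marked_varint_* functions): bit 7 is the continuation flag, bit 6 the marker, and bits 0-5 hold the low six value bits, so values that fit into six bits need a single octet.

#include <cstdint>
#include <cstdio>

int main() {
  const std::uint32_t value = 45; // fits into 6 bits, so no continuation byte follows
  const bool marked = true;

  const std::uint8_t first_octet =
      static_cast<std::uint8_t>(value & 0x3F) | (marked ? 0x40 : 0x00);

  const bool decoded_marker = (first_octet & 0x40) != 0;
  const std::uint32_t decoded_value = first_octet & 0x3F;
  std::printf(
      "octet=0x%02X value=%u marker=%d\n",
      static_cast<unsigned>(first_octet),
      decoded_value,
      decoded_marker
  );
}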
+ */ +template +[[nodiscard]] std::pair marked_varint_decode(const std::uint8_t **ptr) { + const std::uint8_t first_octet = **ptr; + *ptr += 1; + + const bool is_continuation_bit_set = (first_octet & 0b10000000) != 0; + const bool is_marked = (first_octet & 0b01000000) != 0; + + Int result = first_octet & 0b00111111; + if (is_continuation_bit_set) { + Int shift = 6; + + while (true) { + const std::uint8_t octet = **ptr; + *ptr += 1; + + if ((octet & 0b10000000) == 0) { + result |= static_cast(octet) << shift; + break; + } else { + result |= static_cast(octet & 0b01111111) << shift; + } + + shift += 7; + } + } + + return std::make_pair(result, is_marked); +} + +/*! + * Encodes a signed integer using zigzag encoding. + * + * @param i The signed integer to encode. + * @return The encoded integer. + */ +template [[nodiscard]] std::make_unsigned_t zigzag_encode(const Int i) { + return (i >> (sizeof(Int) * 8 - 1)) ^ (i << 1); +} + +/*! + * Decodes a zigzag encoded integer. + * + * @param i The zigzag encoded integer to decode. + * @return The decoded integer. + */ +template [[nodiscard]] std::make_signed_t zigzag_decode(const Int i) { + return (i >> 1) ^ -(i & 1); +} + +/*! + * Returns the number of bytes a signed VarInt needs to be stored. + * + * @tparam Int The type of integer whose encoded length is returned. + * @param Int The integer to store. + * @return The number of bytes the integer needs to be stored. + */ +template [[nodiscard]] std::size_t signed_varint_length(const Int i) { + return varint_length(zigzag_encode(i)); +} + +/*! + * Writes an integer to a memory location as a signed VarInt. + * + * @tparam Int The type of integer to encode. + * @param Int The integer to store. + * @param ptr The pointer to the memory location to write the integer to. + * @return The number of bytes that the integer occupies at the memory location. + */ +template std::size_t signed_varint_encode(const Int i, std::uint8_t *ptr) { + return varint_encode(zigzag_encode(i), ptr); +} + +/*! + * Writes an integer to a memory location as a signed VarInt. + * + * @tparam Int The type of integer to encode. + * @param Int The integer to store. + * @param ptr A pointer to the pointer to the memory location to write the integer to, which is + * incremented accordingly. + */ +template void signed_varint_encode(const Int i, std::uint8_t **ptr) { + varint_encode(zigzag_encode(i), ptr); +} + +/*! + * Reads an integer encoded as a signed VarInt from a memory location. + * + * @tparam Int The type of integer to decode. + * @param ptr A pointer to the memory location to read the integer from. + * @return The decoded integer. + */ +template [[nodiscard]] Int signed_varint_decode(const std::uint8_t *data) { + return zigzag_decode(varint_decode>(data)); +} + +/*! + * Reads an integer encoded as a signed VarInt from a memory location. + * + * @tparam Int The type of integer to decode. + * @param ptr A pointer to the pointer to the memory location to read the integer from, which is + * incremented accordingly. + * @return The decoded integer. 
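A quick standalone check of the zigzag mapping used by the signed helpers (zz_encode and zz_decode are invented names mirroring zigzag_encode and zigzag_decode above): 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, so values of small magnitude stay cheap to encode as unsigned VarInts.

#include <cstdint>
#include <cstdio>

static std::uint32_t zz_encode(std::int32_t i) {
  return static_cast<std::uint32_t>((i >> 31) ^ (i << 1));
}

static std::int32_t zz_decode(std::uint32_t u) {
  return static_cast<std::int32_t>((u >> 1) ^ -static_cast<std::int32_t>(u & 1));
}

int main() {
  for (const std::int32_t i : {0, -1, 1, -2, 2, -64, 1000}) {
    std::printf("%d -> %u -> %d\n", i, zz_encode(i), zz_decode(zz_encode(i)));
  }
}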
+ */ +template [[nodiscard]] Int signed_varint_decode(const std::uint8_t **data) { + return zigzag_decode(varint_decode>(data)); +} + +} // namespace kaminpar diff --git a/kaminpar-common/graph-compression/varint_codec.cc b/kaminpar-common/graph-compression/varint_codec.cc deleted file mode 100644 index 0905c592..00000000 --- a/kaminpar-common/graph-compression/varint_codec.cc +++ /dev/null @@ -1,32 +0,0 @@ -/******************************************************************************* - * Encoding and decoding methods for VarInts. - * - * @file: varint_codec.cc - * @author: Daniel Salwasser - * @date: 26.12.2023 - ******************************************************************************/ -#include "kaminpar-common/graph-compression/varint_codec.h" - -namespace kaminpar { - -namespace debug { - -static VarIntStats stats = {0, 0, 0, 0, 0, 0}; - -void varint_stats_reset() { - stats.varint_count = 0; - stats.signed_varint_count = 0; - stats.marked_varint_count = 0; - - stats.varint_bytes = 0; - stats.signed_varint_bytes = 0; - stats.marked_varint_bytes = 0; -} - -VarIntStats &varint_stats_global() { - return stats; -} - -} // namespace debug - -} // namespace kaminpar diff --git a/kaminpar-common/graph-compression/varint_codec.h b/kaminpar-common/graph-compression/varint_codec.h deleted file mode 100644 index 98d279e1..00000000 --- a/kaminpar-common/graph-compression/varint_codec.h +++ /dev/null @@ -1,558 +0,0 @@ -/******************************************************************************* - * Encoding and decoding methods for VarInts. - * - * @file: varint_codec.h - * @author: Daniel Salwasser - * @date: 11.11.2023 - ******************************************************************************/ -#pragma once - -#include -#include -#include - -#ifdef KAMINPAR_COMPRESSION_FAST_DECODING -#include -#endif // KAMINPAR_COMPRESSION_FAST_DECODING - -namespace kaminpar { - -namespace debug { - -/*! - * Whether to track statistics on encoded VarInts. - */ -static constexpr bool kTrackVarintStats = false; - -/*! - * Statistics about encoded VarInts. - */ -struct VarIntStats { - std::size_t varint_count; - std::size_t signed_varint_count; - std::size_t marked_varint_count; - - std::size_t varint_bytes; - std::size_t signed_varint_bytes; - std::size_t marked_varint_bytes; -}; - -/*! - * Reset the global statistics on encoded VarInts. - */ -void varint_stats_reset(); - -/*! - * Returns a reference to the global statistics on encoded VarInts. - * - * @return A reference to the global statistics on encoded VarInts. - */ -VarIntStats &varint_stats_global(); - -} // namespace debug - -/*! - * Encodes a signed integer using zigzag encoding. - * - * @param i The signed integer to encode. - * @return The encoded integer. - */ -template [[nodiscard]] std::make_unsigned_t zigzag_encode(Int i) { - return (i >> (sizeof(Int) * 8 - 1)) ^ (i << 1); -} - -/*! - * Decodes a zigzag encoded integer. - * - * @param i The zigzag encoded integer to decode. - * @return The decoded integer. - */ -template [[nodiscard]] std::make_signed_t zigzag_decode(Int i) { - return (i >> 1) ^ -(i & 1); -} - -/*! - * Returns the maximum number of bytes that a VarInt needs to be stored. - * - * @tparam Int The type of integer whose encoded maximum length is returned. - */ -template [[nodiscard]] constexpr std::size_t varint_max_length() { - return (sizeof(Int) * 8) / 7 + 1; -} - -/*! - * Returns the number of bytes a VarInt needs to be stored. - * - * @tparam Int The type of integer whose encoded length is returned. 
- * @param Int The integer to store. - * @return The number of bytes the integer needs to be stored. - */ -template [[nodiscard]] std::size_t varint_length(Int i) { - std::size_t len = 1; - - while (i > 0b01111111) { - i >>= 7; - len++; - } - - return len; -} - -/*! - * Returns the number of bytes a signed VarInt needs to be stored. - * - * @tparam Int The type of integer whose encoded length is returned. - * @param Int The integer to store. - * @return The number of bytes the integer needs to be stored. - */ -template [[nodiscard]] std::size_t signed_varint_length(Int i) { - return varint_length(zigzag_encode(i)); -} - -/*! - * Returns the number of bytes a marked VarInt needs to be stored. - * - * @tparam Int The type of integer whose encoded length is returned. - * @param Int The integer to store. - * @return The number of bytes the integer needs to be stored. - */ -template [[nodiscard]] std::size_t marked_varint_length(Int i) { - std::size_t len = 1; - - i >>= 6; - if (i > 0) { - len += varint_length(i); - } - - return len; -} - -/*! - * Writes an integer to a memory location as a VarInt. - * - * @tparam Int The type of integer to encode. - * @param Int The integer to store. - * @param ptr The pointer to the memory location to write the integer to. - * @return The number of bytes that the integer occupies at the memory location. - */ -template std::size_t varint_encode(Int i, std::uint8_t *ptr) { - std::size_t len = 1; - - while (i > 0b01111111) { - std::uint8_t octet = (i & 0b01111111) | 0b10000000; - *ptr = octet; - - i >>= 7; - ptr++; - len++; - } - - std::uint8_t last_octet = i & 0b01111111; - *ptr = last_octet; - - if (debug::kTrackVarintStats) { - debug::varint_stats_global().varint_count++; - debug::varint_stats_global().varint_bytes += len; - } - - return len; -} - -/*! - * Writes an integer to a memory location as a signed VarInt. - * - * @tparam Int The type of integer to encode. - * @param Int The integer to store. - * @param ptr The pointer to the memory location to write the integer to. - * @return The number of bytes that the integer occupies at the memory location. - */ -template std::size_t signed_varint_encode(Int i, std::uint8_t *ptr) { - const std::size_t len = varint_encode(zigzag_encode(i), ptr); - - if (debug::kTrackVarintStats) { - debug::varint_stats_global().signed_varint_count++; - debug::varint_stats_global().signed_varint_bytes += len; - } - - return len; -} - -/*! - * Writes an integer to a memory location as a marked VarInt. - * - * @tparam Int The type of integer to encode. - * @param Int The integer to store. - * @param marker_set Whether the integer is marked. - * @param ptr The pointer to the memory location to write the integer to. - * @return The number of bytes that the integer occupies at the memory location. - */ -template -std::size_t marked_varint_encode(Int i, bool marker_set, std::uint8_t *ptr) { - std::uint8_t first_octet; - - if (marker_set) { - first_octet = (i & 0b00111111) | 0b01000000; - } else { - first_octet = (i & 0b00111111); - } - - i >>= 6; - - if (i > 0) { - first_octet |= 0b10000000; - *ptr = first_octet; - - std::size_t len = varint_encode(i, ptr + 1) + 1; - - if (debug::kTrackVarintStats) { - debug::varint_stats_global().marked_varint_count++; - debug::varint_stats_global().marked_varint_bytes += len; - } - - return len; - } - - if (debug::kTrackVarintStats) { - debug::varint_stats_global().marked_varint_count++; - debug::varint_stats_global().marked_varint_bytes++; - } - - *ptr = first_octet; - return 1; -} - -/*! 
- * Reads an integer encoded as a VarInt from a memory location. The decoding is implemented as a - * loop with non intrinsic operations. - * - * @tparam Int The type of integer to decode. - * @param ptr The pointer to the memory location to read the integer from. - * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer - * occupied at the memory location. - */ -template -[[nodiscard]] std::pair varint_decode_general(const std::uint8_t *ptr) { - Int result = 0; - std::size_t shift = 0; - std::size_t position = 0; - - while (true) { - const std::uint8_t byte = ptr[position++]; - - if ((byte & 0b10000000) == 0) { - result |= static_cast(byte) << shift; - break; - } else { - result |= static_cast(byte & 0b01111111) << shift; - } - - shift += 7; - } - - return std::make_pair(result, position); -} - -/*! - * Reads an integer encoded as a VarInt from a memory location. - * - * @tparam Int The type of integer to decode. - * @param ptr The pointer to the memory location to read the integer from. - * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer - * occupied at the memory location. - */ -template -[[nodiscard]] std::pair varint_decode(const std::uint8_t *ptr) { - return varint_decode_general(ptr); -} - -#ifdef KAMINPAR_COMPRESSION_FAST_DECODING -/*! - * Reads a 32-bit integer encoded as a VarInt from a memory location. The decoding is implemented - * as an unrolled loop with intrinsic operations. - * - * @param ptr The pointer to the memory location to read the integer from. - * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer - * occupied at the memory location. - */ -template <> -inline std::pair varint_decode(const std::uint8_t *ptr) { - if ((ptr[0] & 0b10000000) == 0) { - const std::uint32_t result = *ptr & 0b01111111; - return std::make_pair(result, 1); - } - - if ((ptr[1] & 0b10000000) == 0) { - const std::uint32_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F); - return std::make_pair(result, 2); - } - - if ((ptr[2] & 0b10000000) == 0) { - const std::uint32_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F); - return std::make_pair(result, 3); - } - - if ((ptr[3] & 0b10000000) == 0) { - const std::uint32_t result = - _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F7F); - return std::make_pair(result, 4); - } - - const std::uint32_t result = static_cast( - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F) - ); - return std::make_pair(result, 5); -} - -/*! - * Reads a 64-bit integer encoded as a VarInt from a memory location. The decoding is implemented - * as an unrolled loop with intrinsic operations. - * - * @param ptr The pointer to the memory location to read the integer from. - * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer - * occupied at the memory location. 
- */ -template <> -inline std::pair varint_decode(const std::uint8_t *ptr) { - if ((ptr[0] & 0b10000000) == 0) { - const std::uint64_t result = *ptr & 0b01111111; - return std::make_pair(result, 1); - } - - if ((ptr[1] & 0b10000000) == 0) { - const std::uint64_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F); - return std::make_pair(result, 2); - } - - if ((ptr[2] & 0b10000000) == 0) { - const std::uint64_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F); - return std::make_pair(result, 3); - } - - if ((ptr[3] & 0b10000000) == 0) { - const std::uint64_t result = - _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F7F); - return std::make_pair(result, 4); - } - - if ((ptr[4] & 0b10000000) == 0) { - const std::uint64_t result = - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F); - return std::make_pair(result, 5); - } - - if ((ptr[5] & 0b10000000) == 0) { - const std::uint64_t result = - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F); - return std::make_pair(result, 6); - } - - if ((ptr[6] & 0b10000000) == 0) { - const std::uint64_t result = - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F); - return std::make_pair(result, 7); - } - - if ((ptr[7] & 0b10000000) == 0) { - const std::uint64_t result = - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F7F); - return std::make_pair(result, 8); - } - - if ((ptr[8] & 0b10000000) == 0) { - const std::uint64_t result = - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F7F) | - (static_cast(ptr[8] & 0b01111111) << 56); - return std::make_pair(result, 9); - } - - const std::uint64_t result = - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F7F) | - (static_cast(ptr[8] & 0b01111111) << 56) | - (static_cast(ptr[9]) << 63); - return std::make_pair(result, 10); -} -#endif - -/*! - * Reads an integer encoded as a signed VarInt from a memory location. The decoding is implemented - * as a loop with non intrinsic operations. - * - * @tparam Int The type of integer to decode. - * @param ptr The pointer to the memory location to read the integer from. - * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer - * occupied at the memory location. - */ -template -[[nodiscard]] std::pair signed_varint_decode_general(const std::uint8_t *ptr) { - const auto [unsigned_value, len] = varint_decode_general>(ptr); - return std::make_pair(zigzag_decode(unsigned_value), len); -} - -/*! - * Reads an integer encoded as a signed VarInt from a memory location. - * - * @tparam Int The type of integer to decode. - * @param ptr The pointer to the memory location to read the integer from. - * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer - * occupied at the memory location. - */ -template -[[nodiscard]] std::pair signed_varint_decode(const std::uint8_t *ptr) { - const auto [unsigned_value, len] = varint_decode>(ptr); - return std::make_pair(zigzag_decode(unsigned_value), len); -} - -/*! - * Reads an integer encoded as a marked VarInt from a memory location. The decoding is implemented - * as a loop with non intrinsic operations. - * - * @tparam Int The type of integer to decode. - * @param ptr The pointer to the memory location to read the integer from. - * @return A tuple consisting of the decoded integer, whether the markes is set and the number of - * bytes that the encoded integer occupied at the memory location. 
- */ -template -[[nodiscard]] std::tuple marked_varint_decode(const std::uint8_t *ptr) { - const std::uint8_t first_byte = *ptr; - const bool is_continuation_bit_set = (first_byte & 0b10000000) != 0; - const bool is_marker_set = (first_byte & 0b01000000) != 0; - - Int result = first_byte & 0b00111111; - std::size_t shift = 0; - std::size_t position = 1; - - if (is_continuation_bit_set) { - while (true) { - const std::uint8_t byte = ptr[position++]; - - if ((byte & 0b10000000) == 0) { - result |= static_cast(byte) << (shift + 6); - break; - } else { - result |= static_cast(byte & 0b01111111) << (shift + 6); - } - - shift += 7; - } - } - - return std::make_tuple(result, is_marker_set, position); -} - -#ifdef KAMINPAR_COMPRESSION_FAST_DECODING -/*! - * Reads a 32-bit integer encoded as a marked VarInt from a memory location. The decoding is - * implemented as an unrolled loop with intrinsic operations. - * - * @tparam Int The type of integer to decode. - * @param ptr The pointer to the memory location to read the integer from. - * @return A tuple consisting of the decoded integer, whether the markes is set and the number of - * bytes that the encoded integer occupied at the memory location. - */ -template <> -inline std::tuple -marked_varint_decode(const std::uint8_t *ptr) { - const bool is_marker_set = (*ptr & 0b01000000) != 0; - - if ((ptr[0] & 0b10000000) == 0) { - const std::uint32_t result = *ptr & 0b00111111; - return std::make_tuple(result, is_marker_set, 1); - } - - if ((ptr[1] & 0b10000000) == 0) { - const std::uint32_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F3F); - return std::make_tuple(result, is_marker_set, 2); - } - - if ((ptr[2] & 0b10000000) == 0) { - const std::uint32_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F3F); - return std::make_tuple(result, is_marker_set, 3); - } - - if ((ptr[3] & 0b10000000) == 0) { - const std::uint32_t result = - _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F3F); - return std::make_tuple(result, is_marker_set, 4); - } - - const std::uint32_t result = static_cast( - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F3F) - ); - return std::make_tuple(result, is_marker_set, 5); -} - -/*! - * Reads a 64-bit integer encoded as a marked VarInt from a memory location. The decoding is - * implemented as an unrolled loop with intrinsic operations. - * - * @tparam Int The type of integer to decode. - * @param ptr The pointer to the memory location to read the integer from. - * @return A tuple consisting of the decoded integer, whether the markes is set and the number of - * bytes that the encoded integer occupied at the memory location. 
- */ -template <> -inline std::tuple -marked_varint_decode(const std::uint8_t *ptr) { - const bool is_marker_set = (*ptr & 0b01000000) != 0; - - if ((ptr[0] & 0b10000000) == 0) { - const std::uint64_t result = *ptr & 0b00111111; - return std::make_tuple(result, is_marker_set, 1); - } - - if ((ptr[1] & 0b10000000) == 0) { - const std::uint64_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F3F); - return std::make_tuple(result, is_marker_set, 2); - } - - if ((ptr[2] & 0b10000000) == 0) { - const std::uint64_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F3F); - return std::make_tuple(result, is_marker_set, 3); - } - - if ((ptr[3] & 0b10000000) == 0) { - const std::uint64_t result = - _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F3F); - return std::make_tuple(result, is_marker_set, 4); - } - - if ((ptr[4] & 0b10000000) == 0) { - const std::uint64_t result = - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F3F); - return std::make_tuple(result, is_marker_set, 5); - } - - if ((ptr[5] & 0b10000000) == 0) { - const std::uint64_t result = - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F3F); - return std::make_tuple(result, is_marker_set, 6); - } - - if ((ptr[6] & 0b10000000) == 0) { - const std::uint64_t result = - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F3F); - return std::make_tuple(result, is_marker_set, 7); - } - - if ((ptr[7] & 0b10000000) == 0) { - const std::uint64_t result = - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F3F); - return std::make_tuple(result, is_marker_set, 8); - } - - if ((ptr[8] & 0b10000000) == 0) { - const std::uint64_t result = - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F3F) | - (static_cast(ptr[8] & 0b01111111) << 55); - return std::make_tuple(result, is_marker_set, 9); - } - - const std::uint64_t result = - _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F3F) | - (static_cast(ptr[8] & 0b01111111) << 55) | - (static_cast(ptr[9]) << 62); - return std::make_tuple(result, is_marker_set, 10); -} -#endif - -} // namespace kaminpar diff --git a/kaminpar-common/graph-compression/varint_run_length_codec.h b/kaminpar-common/graph-compression/varint_rle.h similarity index 75% rename from kaminpar-common/graph-compression/varint_run_length_codec.h rename to kaminpar-common/graph-compression/varint_rle.h index 17c7b84b..ffbb41ce 100644 --- a/kaminpar-common/graph-compression/varint_run_length_codec.h +++ b/kaminpar-common/graph-compression/varint_rle.h @@ -1,7 +1,7 @@ /******************************************************************************* * Encoding and decoding methods for run-length VarInts. * - * @file: varint_run_length_codec.h + * @file: varint_rle.h * @author: Daniel Salwasser * @date: 29.12.2023 ******************************************************************************/ @@ -12,6 +12,8 @@ #include #include +#include "kaminpar-common/math.h" + namespace kaminpar { /*! @@ -40,13 +42,13 @@ template class VarIntRunLengthEncoder { * includes the control byte if it is the first integer of a block. 
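For the 32-bit case, the run header written by this encoder packs the run length and the per-value byte width into one byte. A minimal sketch of that layout (assumed from the shifts in the code below; the sample values are illustrative):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Header layout for 32-bit runs: upper six bits store the run length minus one,
// lower two bits store the per-value byte width minus one.
int main() {
  const std::size_t run_length = 17; // 17 values in this run
  const std::size_t byte_width = 3;  // each value stored in 3 little-endian bytes

  const std::uint8_t header =
      static_cast<std::uint8_t>(((run_length - 1) << 2) | ((byte_width - 1) & 0b11));

  const std::size_t decoded_length = (header >> 2) + 1;
  const std::size_t decoded_width = (header & 0b11) + 1;
  std::printf(
      "header=0x%02X length=%zu width=%zu\n",
      static_cast<unsigned>(header),
      decoded_length,
      decoded_width
  );
}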
*/ std::size_t add(Int i) { - std::uint8_t size = needed_bytes(i); + std::uint8_t size = math::byte_width(i); if (_buffer.empty()) { - _buffered_size = size++; - } else if (_buffer.size() == kBufferSize || _buffered_size != size) { + _num_buffered = size++; + } else if (_buffer.size() == kBufferSize || _num_buffered != size) { flush(); - _buffered_size = size++; + _num_buffered = size++; } _buffer.push_back(i); @@ -64,17 +66,17 @@ template class VarIntRunLengthEncoder { const std::uint8_t *begin = _ptr; if constexpr (sizeof(Int) == 4) { - const std::uint8_t header = (static_cast(_buffer.size() - 1) << 2) | - ((_buffered_size - 1) & 0b00000011); + const std::uint8_t header = + (static_cast(_buffer.size() - 1) << 2) | ((_num_buffered - 1) & 0b00000011); *_ptr++ = header; } else if constexpr (sizeof(Int) == 8) { - const std::uint8_t header = (static_cast(_buffer.size() - 1) << 3) | - ((_buffered_size - 1) & 0b00000111); + const std::uint8_t header = + (static_cast(_buffer.size() - 1) << 3) | ((_num_buffered - 1) & 0b00000111); *_ptr++ = header; } for (Int value : _buffer) { - for (std::uint8_t i = 0; i < _buffered_size; ++i) { + for (std::uint8_t i = 0; i < _num_buffered; ++i) { *_ptr++ = static_cast(value); value >>= 8; } @@ -86,19 +88,8 @@ template class VarIntRunLengthEncoder { private: std::uint8_t *_ptr; - std::uint8_t _buffered_size; + std::uint8_t _num_buffered; std::vector _buffer; - - std::uint8_t needed_bytes(Int i) const { - std::size_t len = 1; - - while (i > 0b11111111) { - i >>= 8; - len++; - } - - return len; - } }; /*! @@ -113,12 +104,12 @@ template class VarIntRunLengthDecoder { /*! * Constructs a new VarIntRunLengthDecoder. * + * @param num_values The number of integers that are encoded. * @param ptr The pointer to the memory location where the encoded integers are stored. - * @param count The number of integers that are encoded. */ - VarIntRunLengthDecoder(const std::uint8_t *ptr, const std::size_t count) - : _ptr(ptr), - _count(count) {} + VarIntRunLengthDecoder(const std::size_t num_values, const std::uint8_t *ptr) + : _num_values(num_values), + _ptr(ptr) {} /*! * Decodes the encoded integers. @@ -127,19 +118,19 @@ template class VarIntRunLengthDecoder { * parameter of type Int. 
*/ template void decode(Lambda &&l) { - constexpr bool non_stoppable = std::is_void_v>; + constexpr bool kNonStoppable = std::is_void_v>; - std::size_t decoded = 0; - while (decoded < _count) { + std::size_t num_decoded = 0; + while (num_decoded < _num_values) { const std::uint8_t run_header = *_ptr++; if constexpr (sizeof(Int) == 4) { - const std::uint8_t run_length = (run_header >> 2) + 1; - const std::uint8_t run_size = (run_header & 0b00000011) + 1; + const std::size_t run_length = (run_header >> 2) + 1; + const std::size_t run_size = (run_header & 0b00000011) + 1; - decoded += run_length; + num_decoded += run_length; - if constexpr (non_stoppable) { + if constexpr (kNonStoppable) { decode32(run_length, run_size, std::forward(l)); } else { const bool stop = decode32(run_length, run_size, std::forward(l)); @@ -148,12 +139,12 @@ template class VarIntRunLengthDecoder { } } } else if constexpr (sizeof(Int) == 8) { - const std::uint8_t run_length = (run_header >> 3) + 1; - const std::uint8_t run_size = (run_header & 0b00000111) + 1; + const std::size_t run_length = (run_header >> 3) + 1; + const std::size_t run_size = (run_header & 0b00000111) + 1; - decoded += run_length; + num_decoded += run_length; - if constexpr (non_stoppable) { + if constexpr (kNonStoppable) { decode64(run_length, run_size, std::forward(l)); } else { const bool stop = decode64(run_length, run_size, std::forward(l)); @@ -166,16 +157,13 @@ template class VarIntRunLengthDecoder { } private: - const std::uint8_t *_ptr; - const std::size_t _count; - template - bool decode32(const std::uint8_t run_length, const std::uint8_t run_size, Lambda &&l) { + bool decode32(const std::size_t run_length, const std::size_t run_size, Lambda &&l) { constexpr bool kNonStoppable = std::is_void_v>; switch (run_size) { case 1: - for (std::uint8_t i = 0; i < run_length; ++i) { + for (std::size_t i = 0; i < run_length; ++i) { const std::uint32_t value = static_cast(*_ptr); _ptr += 1; @@ -190,7 +178,7 @@ template class VarIntRunLengthDecoder { } break; case 2: - for (std::uint8_t i = 0; i < run_length; ++i) { + for (std::size_t i = 0; i < run_length; ++i) { const std::uint32_t value = *((std::uint16_t *)_ptr); _ptr += 2; @@ -205,7 +193,7 @@ template class VarIntRunLengthDecoder { } break; case 3: - for (std::uint8_t i = 0; i < run_length; ++i) { + for (std::size_t i = 0; i < run_length; ++i) { const std::uint32_t value = *((std::uint32_t *)_ptr) & 0xFFFFFF; _ptr += 3; @@ -220,7 +208,7 @@ template class VarIntRunLengthDecoder { } break; case 4: - for (std::uint8_t i = 0; i < run_length; ++i) { + for (std::size_t i = 0; i < run_length; ++i) { const std::uint32_t value = *((std::uint32_t *)_ptr); _ptr += 4; @@ -242,12 +230,12 @@ template class VarIntRunLengthDecoder { } template - bool decode64(const std::uint8_t run_length, const std::uint8_t run_size, Lambda &&l) { + bool decode64(const std::size_t run_length, const std::size_t run_size, Lambda &&l) { constexpr bool kNonStoppable = std::is_void_v>; switch (run_size) { case 1: - for (std::uint8_t i = 0; i < run_length; ++i) { + for (std::size_t i = 0; i < run_length; ++i) { const std::uint64_t value = static_cast(*_ptr); _ptr += 1; @@ -262,7 +250,7 @@ template class VarIntRunLengthDecoder { } break; case 2: - for (std::uint8_t i = 0; i < run_length; ++i) { + for (std::size_t i = 0; i < run_length; ++i) { const std::uint64_t value = *((std::uint16_t *)_ptr); _ptr += 2; @@ -277,7 +265,7 @@ template class VarIntRunLengthDecoder { } break; case 3: - for (std::uint8_t i = 0; i < run_length; ++i) { + 
for (std::size_t i = 0; i < run_length; ++i) { const std::uint64_t value = *((std::uint32_t *)_ptr) & 0xFFFFFF; _ptr += 3; @@ -292,7 +280,7 @@ template class VarIntRunLengthDecoder { } break; case 4: - for (std::uint8_t i = 0; i < run_length; ++i) { + for (std::size_t i = 0; i < run_length; ++i) { const std::uint64_t value = *((std::uint32_t *)_ptr); _ptr += 4; @@ -307,7 +295,7 @@ template class VarIntRunLengthDecoder { } break; case 5: - for (std::uint8_t i = 0; i < run_length; ++i) { + for (std::size_t i = 0; i < run_length; ++i) { const std::uint64_t value = *((std::uint64_t *)_ptr) & 0xFFFFFFFFFF; _ptr += 5; @@ -322,7 +310,7 @@ template class VarIntRunLengthDecoder { } break; case 6: - for (std::uint8_t i = 0; i < run_length; ++i) { + for (std::size_t i = 0; i < run_length; ++i) { const std::uint64_t value = *((std::uint64_t *)_ptr) & 0xFFFFFFFFFFFF; _ptr += 6; @@ -337,7 +325,7 @@ template class VarIntRunLengthDecoder { } break; case 7: - for (std::uint8_t i = 0; i < run_length; ++i) { + for (std::size_t i = 0; i < run_length; ++i) { const std::uint64_t value = *((std::uint64_t *)_ptr) & 0xFFFFFFFFFFFFFF; _ptr += 7; @@ -352,7 +340,7 @@ template class VarIntRunLengthDecoder { } break; case 8: - for (std::uint8_t i = 0; i < run_length; ++i) { + for (std::size_t i = 0; i < run_length; ++i) { const std::uint64_t value = *((std::uint64_t *)_ptr); _ptr += 8; @@ -372,6 +360,10 @@ template class VarIntRunLengthDecoder { return false; } + +private: + const std::size_t _num_values; + const std::uint8_t *_ptr; }; }; // namespace kaminpar diff --git a/kaminpar-common/graph-compression/varint_stream_codec.h b/kaminpar-common/graph-compression/varint_stream_codec.h deleted file mode 100644 index b38639d4..00000000 --- a/kaminpar-common/graph-compression/varint_stream_codec.h +++ /dev/null @@ -1,440 +0,0 @@ -/******************************************************************************* - * Encoding and decoding methods for the StreamVByte codec. - * - * @file: varint_stream_codec.h - * @author: Daniel Salwasser - * @date: 29.12.2023 - ******************************************************************************/ -#pragma once - -#include -#include -#include - -#if defined(__x86_64__) -#include -#elif defined(__aarch64__) -#include -#endif - -#include "kaminpar-common/constexpr_utils.h" - -namespace kaminpar { - -/*! - * An encoder for writing variable length integers with the StreamVByte codec. - * - * @tparam Int The type of integer to encode. - */ -template class VarIntStreamEncoder { - static_assert(sizeof(Int) == 4); - -public: - /*! - * Constructs a new VarIntStreamEncoder. - * - * @param ptr The pointer to the memory location where the encoded integers are written. - * @param count The amount of integers to encode. - */ - VarIntStreamEncoder(std::uint8_t *ptr, std::size_t count) - : _control_bytes_ptr(ptr), - _data_ptr(ptr + count / 4 + ((count % 4) != 0)), - _count(count), - _buffered(0) {} - - /*! - * Encodes an integer. - * - * @param i The integer to encode. - * @return The number of bytes that the integer requires to be stored in encoded format. It - * includes the control byte if it is the last integer of a block. - */ - std::size_t add(Int i) { - if (_buffered == 3) { - _buffer[3] = i; - write_stream(); - - _buffered = 0; - return needed_bytes(i); - } - - _buffer[_buffered] = i; - return needed_bytes(i) + (_buffered++ == 0); - } - - /*! - * Writes the remaining integers added to the encoder which do not form a complete block to - * memory. 
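Both the VarIntStreamEncoder being removed here and its StreamVByte replacement describe a block of four 32-bit values with one control byte: each value contributes a 2-bit field holding its byte length minus one, and the data stream stores only that many little-endian bytes per value. A standalone sketch (bytes_needed and the sample block are illustrative):

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>

static std::uint8_t bytes_needed(std::uint32_t value) {
  std::uint8_t length = 1;
  while (value > 0xFF) {
    value >>= 8;
    ++length;
  }
  return length;
}

int main() {
  const std::array<std::uint32_t, 4> block = {3, 1024, 70000, 16777216};

  std::uint8_t control_byte = 0;
  std::size_t data_bytes = 0;
  for (std::size_t i = 0; i < block.size(); ++i) {
    const std::uint8_t length = bytes_needed(block[i]);
    control_byte |= static_cast<std::uint8_t>((length - 1) << (2 * i));
    data_bytes += length;
  }

  // Lengths 1, 2, 3, 4 -> control byte 0xE4, ten data bytes for the whole block.
  std::printf(
      "control=0x%02X data bytes=%zu\n", static_cast<unsigned>(control_byte), data_bytes
  );
}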
- */ - void flush() { - if (_buffered == 0) { - return; - } - - const std::uint8_t control_byte = - ((needed_bytes(_buffer[3]) - 1) << 6) | (((needed_bytes(_buffer[2]) - 1) & 0b11) << 4) | - (((needed_bytes(_buffer[1]) - 1) & 0b11) << 2) | ((needed_bytes(_buffer[0]) - 1) & 0b11); - *_control_bytes_ptr++ = control_byte; - - for (std::size_t i = 0; i < _buffered; ++i) { - Int value = _buffer[i]; - do { - *_data_ptr++ = static_cast(value); - value >>= 8; - } while (value > 0); - } - } - -private: - std::uint8_t *_control_bytes_ptr; - std::uint8_t *_data_ptr; - const std::size_t _count; - - std::size_t _buffered; - std::array _buffer; - - void write_stream() { - const std::uint8_t control_byte = - ((needed_bytes(_buffer[3]) - 1) << 6) | (((needed_bytes(_buffer[2]) - 1) & 0b11) << 4) | - (((needed_bytes(_buffer[1]) - 1) & 0b11) << 2) | ((needed_bytes(_buffer[0]) - 1) & 0b11); - *_control_bytes_ptr++ = control_byte; - - for (Int value : _buffer) { - do { - *_data_ptr++ = static_cast(value); - value >>= 8; - } while (value > 0); - } - } - - std::uint8_t needed_bytes(Int i) const { - std::size_t len = 1; - - while (i > 0b11111111) { - i >>= 8; - len++; - } - - return len; - } -}; - -/*! - * A decoder for reading variable length integers stored with the StreamVByte codec. - * - * @tparam Int The type of integer to decode. - */ -template class VarIntStreamDecoder { - static_assert(sizeof(Int) == 4); - - static constexpr std::array create_length_table() { - std::array length_table{}; - - constexpr_for<256>([&](const std::uint8_t control_byte) { - length_table[control_byte] = 0; - - constexpr_for<4>([&](const std::uint8_t i) { - const std::uint8_t length = ((control_byte >> (2 * i)) & 0b11) + 1; - length_table[control_byte] += length; - }); - }); - - return length_table; - } - - static constexpr std::array, 256> create_shuffle_table() { - std::array, 256> shuffle_table{}; - - constexpr_for<256>([&](const std::uint8_t control_byte) { - std::uint8_t byte = 0; - std::uint8_t pos = 0; - - constexpr_for<4>([&](const std::uint8_t i) { - std::uint8_t c = (control_byte >> (2 * i)) & 0b11; - - std::uint8_t j = 0; - while (j <= c) { - shuffle_table[control_byte][pos++] = byte++; - j += 1; - } - - while (j < 4) { - shuffle_table[control_byte][pos++] = 0b11111111; - j += 1; - } - }); - }); - - return shuffle_table; - } - - static constexpr const std::array kLengthTable = create_length_table(); - - static constexpr const std::array, 256> kShuffleTable = - create_shuffle_table(); - -public: - /*! - * Constructs a new VarIntStreamDecoder. - * - * @param ptr The pointer to the memory location where the encoded integers are stored. - * @param count The amount of integers that are stored at the memory location. - */ - VarIntStreamDecoder(const std::uint8_t *ptr, const std::size_t count) - : _control_bytes_ptr(ptr), - _control_bytes(count / 4), - _data_ptr(ptr + _control_bytes + ((count % 4) != 0)), - _count(count) {} - - /*! - * Decodes the encoded integers. - * - * @param l The function to be called with the decoded integers, i.e. the function has one - * parameter of type Int. 
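Equivalently, a scalar decoder can reconstruct one block directly from the control byte; the SSE and NEON paths in this patch obtain the same result by loading 16 data bytes at once and applying a precomputed per-control-byte shuffle mask and length. A scalar sketch (the sample bytes encode 3, 1024, 70000 and 16777216):

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  const std::uint8_t control_byte = 0xE4;               // 2-bit length fields: 1, 2, 3, 4 bytes
  const std::uint8_t data[] = {0x03, 0x00, 0x04, 0x70,
                               0x11, 0x01, 0x00, 0x00,
                               0x00, 0x01};              // packed little-endian payloads

  const std::uint8_t *ptr = data;
  for (std::size_t i = 0; i < 4; ++i) {
    const std::size_t length = ((control_byte >> (2 * i)) & 0b11) + 1;

    std::uint32_t value = 0;
    for (std::size_t byte = 0; byte < length; ++byte) {
      value |= static_cast<std::uint32_t>(ptr[byte]) << (8 * byte);
    }
    ptr += length;

    std::printf("value[%zu] = %u\n", i, value);
  }
}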
- */ -#if defined(__x86_64__) - template void decode(Lambda &&l) { - constexpr bool kNonStoppable = std::is_void_v>; - - for (std::size_t i = 0; i < _control_bytes; ++i) { - const std::uint8_t control_byte = _control_bytes_ptr[i]; - const std::uint8_t length = kLengthTable[control_byte]; - - __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); - _data_ptr += length; - - const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); - data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); - - if constexpr (kNonStoppable) { - l(_mm_extract_epi32(data, 0)); - l(_mm_extract_epi32(data, 1)); - l(_mm_extract_epi32(data, 2)); - l(_mm_extract_epi32(data, 3)); - } else { - if (l(_mm_extract_epi32(data, 0))) [[unlikely]] { - return; - } - - if (l(_mm_extract_epi32(data, 1))) [[unlikely]] { - return; - } - - if (l(_mm_extract_epi32(data, 2))) [[unlikely]] { - return; - } - - if (l(_mm_extract_epi32(data, 3))) [[unlikely]] { - return; - } - } - } - - switch (_count % 4) { - case 1: { - const std::uint8_t control_byte = _control_bytes_ptr[_control_bytes]; - const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); - - __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); - data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); - - if constexpr (kNonStoppable) { - l(_mm_extract_epi32(data, 0)); - } else { - if (l(_mm_extract_epi32(data, 0))) [[unlikely]] { - return; - } - } - break; - } - case 2: { - const std::uint8_t control_byte = _control_bytes_ptr[_control_bytes]; - const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); - - __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); - data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); - - if constexpr (kNonStoppable) { - l(_mm_extract_epi32(data, 0)); - l(_mm_extract_epi32(data, 1)); - } else { - if (l(_mm_extract_epi32(data, 0))) [[unlikely]] { - return; - } - - if (l(_mm_extract_epi32(data, 1))) [[unlikely]] { - return; - } - } - break; - } - case 3: { - const std::uint8_t control_byte = _control_bytes_ptr[_control_bytes]; - const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); - - __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); - data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); - - if constexpr (kNonStoppable) { - l(_mm_extract_epi32(data, 0)); - l(_mm_extract_epi32(data, 1)); - l(_mm_extract_epi32(data, 2)); - } else { - if (l(_mm_extract_epi32(data, 0))) [[unlikely]] { - return; - } - - if (l(_mm_extract_epi32(data, 1))) [[unlikely]] { - return; - } - - if (l(_mm_extract_epi32(data, 2))) [[unlikely]] { - return; - } - } - break; - } - } - } -#elif defined(__aarch64__) - template void decode(Lambda &&l) { - constexpr bool kNonStoppable = std::is_void_v>; - - for (std::size_t i = 0; i < _control_bytes; ++i) { - const std::uint8_t control_byte = _control_bytes_ptr[i]; - const std::uint8_t length = kLengthTable[control_byte]; - - //__m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); - uint8x16_t data = vld1q_u8(_data_ptr); - _data_ptr += length; - - // const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); - // data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); - const uint8x16_t shuffle_mask = vld1q_u8(kShuffleTable[control_byte].data()); - data = vqtbl1q_u8(data, shuffle_mask); - - std::array out; - vst1q_u8(reinterpret_cast(out.data()), data); - - if constexpr (kNonStoppable) { - l(out[0]); - l(out[1]); - l(out[2]); - l(out[3]); - } else { - if (l(out[0])) [[unlikely]] { - return; - } - - 
if (l(out[1])) [[unlikely]] { - return; - } - - if (l(out[2])) [[unlikely]] { - return; - } - - if (l(out[3])) [[unlikely]] { - return; - } - } - } - - switch (_count % 4) { - case 1: { - const std::uint8_t control_byte = _control_bytes_ptr[_control_bytes]; - // const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); - const uint8x16_t shuffle_mask = vld1q_u8(kShuffleTable[control_byte].data()); - - // __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); - // data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); - uint8x16_t data = vld1q_u8(_data_ptr); - data = vqtbl1q_u8(data, shuffle_mask); - - std::array out; - vst1q_u8(reinterpret_cast(out.data()), data); - - if constexpr (kNonStoppable) { - l(out[0]); - } else { - if (l(out[0])) [[unlikely]] { - return; - } - } - break; - } - case 2: { - const std::uint8_t control_byte = _control_bytes_ptr[_control_bytes]; - // const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); - const uint8x16_t shuffle_mask = vld1q_u8(kShuffleTable[control_byte].data()); - - // __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); - // data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); - uint8x16_t data = vld1q_u8(_data_ptr); - data = vqtbl1q_u8(data, shuffle_mask); - - std::array out; - vst1q_u8(reinterpret_cast(out.data()), data); - - if constexpr (kNonStoppable) { - l(out[0]); - l(out[1]); - } else { - if (l(out[0])) [[unlikely]] { - return; - } - - if (l(out[1])) [[unlikely]] { - return; - } - } - break; - } - case 3: { - const std::uint8_t control_byte = _control_bytes_ptr[_control_bytes]; - // const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); - const uint8x16_t shuffle_mask = vld1q_u8(kShuffleTable[control_byte].data()); - - // __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); - // data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); - uint8x16_t data = vld1q_u8(_data_ptr); - data = vqtbl1q_u8(data, shuffle_mask); - - std::array out; - vst1q_u8(reinterpret_cast(out.data()), data); - - if constexpr (kNonStoppable) { - l(out[0]); - l(out[1]); - l(out[2]); - } else { - if (l(out[0])) [[unlikely]] { - return; - } - - if (l(out[1])) [[unlikely]] { - return; - } - - if (l(out[2])) [[unlikely]] { - return; - } - } - break; - } - } - } -#else - template void decode(Lambda &&l) { - throw std::runtime_error("not implemented"); - } -#endif - -private: - const std::uint8_t *_control_bytes_ptr; - const std::size_t _control_bytes; - const std::uint8_t *_data_ptr; - const std::size_t _count; -}; - -} // namespace kaminpar diff --git a/kaminpar-common/math.h b/kaminpar-common/math.h index 00171ffd..7b73e8a5 100644 --- a/kaminpar-common/math.h +++ b/kaminpar-common/math.h @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -18,6 +19,12 @@ namespace kaminpar::math { +template constexpr Int kSetMSB = static_cast(1) << (sizeof(Int) * 8 - 1); + +template [[nodiscard]] constexpr bool is_msb_set(const Int x) { + return (x & kSetMSB) != 0; +} + /*! * Divides two integers with ceil rounding. * @@ -55,6 +62,12 @@ template constexpr Int1 div_ceil(const Int1 x, co return x / y + (x % y != 0); } +template +[[nodiscard]] constexpr Int1 mod_ceil(const Int1 x, const Int2 y) { + const Int1 mod = x % y; + return mod == 0 ? 
y : mod; +} + template bool is_square(const Int value) { const Int sqrt = std::sqrt(value); return sqrt * sqrt == value; @@ -101,7 +114,7 @@ template constexpr Int byte_width(const Int i) { return 1; } - const Int bit_width = 1 + floor_log2(i); + const Int bit_width = std::bit_width(i); return div_ceil(bit_width, 8); } diff --git a/kaminpar-common/parallel/aligned_element.h b/kaminpar-common/parallel/aligned_element.h index 92c871e2..9da9fd0c 100644 --- a/kaminpar-common/parallel/aligned_element.h +++ b/kaminpar-common/parallel/aligned_element.h @@ -7,8 +7,7 @@ #pragma once #include - -#include "kaminpar-common/ranges.h" +#include namespace kaminpar::parallel { @@ -38,16 +37,19 @@ template struct alignas(64) Aligned { }; template struct alignas(64) AlignedVec { + using value_type = typename Vector::value_type; + using size_type = typename Vector::size_type; + Vector vec; AlignedVec() : vec() {} AlignedVec(Vector vec) : vec(std::move(vec)) {} - decltype(auto) operator[](std::size_t pos) { + decltype(auto) operator[](size_type pos) { return vec[pos]; } - decltype(auto) operator[](std::size_t pos) const { + decltype(auto) operator[](size_type pos) const { return vec[pos]; } @@ -67,20 +69,20 @@ template struct alignas(64) AlignedVec { return vec.end(); } + decltype(auto) size() const { + return vec.size(); + } + void clear() noexcept { vec.clear(); } - void resize(std::size_t count) { + void resize(size_type count) { vec.resize(count); } - [[nodiscard]] decltype(auto) entries() const { - return TransformedIotaRange( - static_cast(0), - vec.size(), - [this](const std::size_t pos) { return std::make_pair(pos, vec[pos]); } - ); + void resize(size_type count, const value_type &value) { + vec.resize(count, value); } }; diff --git a/kaminpar-common/ranges.h b/kaminpar-common/ranges.h index 50e3a41d..7ab848b2 100644 --- a/kaminpar-common/ranges.h +++ b/kaminpar-common/ranges.h @@ -66,6 +66,8 @@ template class IotaRange { }; template class TransformedIotaRange { + using Self = TransformedIotaRange; + public: class iterator { public: @@ -123,6 +125,10 @@ template class TransformedIotaRange { return _end; } + const Self &entries() const { + return *this; + } + private: iterator _begin; iterator _end; diff --git a/kaminpar-dist/context_io.cc b/kaminpar-dist/context_io.cc index 8c82f9ff..374d2946 100644 --- a/kaminpar-dist/context_io.cc +++ b/kaminpar-dist/context_io.cc @@ -353,7 +353,7 @@ void print(const ChunksContext &ctx, const ParallelContext ¶llel, std::ostre << (ctx.scale_chunks_with_threads ? std::string(" / ") + std::to_string(parallel.num_threads) : "") - << "]\n"; + << ")]\n"; } else { out << " Number of chunks: " << ctx.fixed_num_chunks << "\n"; } @@ -380,8 +380,8 @@ void print( out << "Enabled: " << (ctx.enabled ? 
"yes" : "no") << "\n"; if (ctx.enabled) { out << "Compression Scheme: Gap Encoding + "; - if constexpr (Compression::kStreamEncoding) { - out << "VarInt Stream Encoding\n"; + if constexpr (Compression::kStreamVByteEncoding) { + out << "StreamVByte Encoding\n"; } else if constexpr (Compression::kRunLengthEncoding) { out << "VarInt Run-Length Encoding\n"; } else { @@ -399,9 +399,6 @@ void print( out << " Length Threshold: " << Compression::kIntervalLengthTreshold << "\n"; } - out << " Isolated Nodes Separation: " << yeyornay(Compression::kIsolatedNodesSeparation) - << "\n"; - out << "Compression ratio: [Min=" << round(ctx.min_compression_ratio) << " | Mean=" << round(ctx.avg_compression_ratio) << " | Max=" << round(ctx.max_compression_ratio) << "]" diff --git a/kaminpar-dist/datastructures/distributed_compressed_graph.h b/kaminpar-dist/datastructures/distributed_compressed_graph.h index 854c1052..9c7370df 100644 --- a/kaminpar-dist/datastructures/distributed_compressed_graph.h +++ b/kaminpar-dist/datastructures/distributed_compressed_graph.h @@ -283,7 +283,7 @@ class DistributedCompressedGraph : public AbstractDistributedGraph { constexpr bool kDecodeEdgeWeights = std::is_invocable_v; static_assert(kDontDecodeEdgeWeights || kDecodeEdgeWeights); - _compressed_neighborhoods.decode(u, [&](const EdgeID, const NodeID v, const EdgeWeight w) { + _compressed_neighborhoods.adjacent_nodes(u, [&](const NodeID v, const EdgeWeight w) { if constexpr (kDecodeEdgeWeights) { return l(v, w); } else { @@ -297,7 +297,7 @@ class DistributedCompressedGraph : public AbstractDistributedGraph { constexpr bool kDecodeEdgeWeights = std::is_invocable_v; static_assert(kDontDecodeEdgeWeights || kDecodeEdgeWeights); - _compressed_neighborhoods.decode(u, [&](const EdgeID e, const NodeID v, const EdgeWeight w) { + _compressed_neighborhoods.neighbors(u, [&](const EdgeID e, const NodeID v, const EdgeWeight w) { if constexpr (kDecodeEdgeWeights) { return l(e, v, w); } else { @@ -313,7 +313,7 @@ class DistributedCompressedGraph : public AbstractDistributedGraph { static_assert(kDontDecodeEdgeWeights || kDecodeEdgeWeights); _compressed_neighborhoods - .decode(u, max_num_neighbors, [&](const EdgeID e, const NodeID v, const EdgeWeight w) { + .neighbors(u, max_num_neighbors, [&](const EdgeID e, const NodeID v, const EdgeWeight w) { if constexpr (kDecodeEdgeWeights) { return l(e, v, w); } else { diff --git a/kaminpar-dist/datastructures/ghost_node_mapper.h b/kaminpar-dist/datastructures/ghost_node_mapper.h index 31e07bb2..a4e0177b 100644 --- a/kaminpar-dist/datastructures/ghost_node_mapper.h +++ b/kaminpar-dist/datastructures/ghost_node_mapper.h @@ -34,18 +34,45 @@ class CompactGhostNodeMapping { ) : _num_nodes(num_nodes), _num_ghost_nodes(num_ghost_nodes), + _use_dense_global_to_ghost(true), _global_to_ghost_bitmap(std::move(global_to_ghost_bitmap)), _dense_global_to_ghost(std::move(dense_global_to_ghost)), _ghost_to_global(std::move(ghost_to_global)), _ghost_owner(std::move(ghost_owner)) {} + explicit CompactGhostNodeMapping( + const NodeID num_nodes, + const NodeID num_ghost_nodes, + growt::StaticGhostNodeMapping sparse_global_to_ghost, + CompactStaticArray ghost_to_global, + CompactStaticArray ghost_owner + ) + : _num_nodes(num_nodes), + _num_ghost_nodes(num_ghost_nodes), + _use_dense_global_to_ghost(false), + _sparse_global_to_ghost(std::move(sparse_global_to_ghost)), + _ghost_to_global(std::move(ghost_to_global)), + _ghost_owner(std::move(ghost_owner)) {} + + [[nodiscard]] NodeID num_ghost_nodes() const { + return 
_num_ghost_nodes; + } + [[nodiscard]] bool contains_global_as_ghost(const GlobalNodeID global_node) const { - return _global_to_ghost_bitmap.is_set(global_node); + if (_use_dense_global_to_ghost) [[likely]] { + return _global_to_ghost_bitmap.is_set(global_node); + } else { + return _sparse_global_to_ghost.find(global_node + 1) != _sparse_global_to_ghost.end(); + } } [[nodiscard]] NodeID global_to_ghost(const GlobalNodeID global_node) const { - const NodeID dense_index = _global_to_ghost_bitmap.rank(global_node); - return _dense_global_to_ghost[dense_index] + _num_nodes; + if (_use_dense_global_to_ghost) [[likely]] { + const NodeID dense_index = _global_to_ghost_bitmap.rank(global_node); + return _dense_global_to_ghost[dense_index] + _num_nodes; + } else { + return (*_sparse_global_to_ghost.find(global_node + 1)).second; + } } [[nodiscard]] GlobalNodeID ghost_to_global(const NodeID ghost_node) const { @@ -56,15 +83,16 @@ class CompactGhostNodeMapping { return _ghost_owner[ghost_node]; } - [[nodiscard]] NodeID num_ghost_nodes() const { - return _num_ghost_nodes; - } - private: NodeID _num_nodes; NodeID _num_ghost_nodes; + + bool _use_dense_global_to_ghost; + growt::StaticGhostNodeMapping _sparse_global_to_ghost; + RankCombinedBitVector<> _global_to_ghost_bitmap; CompactStaticArray _dense_global_to_ghost; + CompactStaticArray _ghost_to_global; CompactStaticArray _ghost_owner; }; @@ -81,15 +109,13 @@ class CompactGhostNodeMappingBuilder { ) : _num_nodes(static_cast(node_distribution[rank + 1] - node_distribution[rank])), _node_distribution(node_distribution.begin(), node_distribution.end()), - _next_ghost_node(_num_nodes), - _global_to_ghost_bitmap(node_distribution.back()) {} + _next_ghost_node(_num_nodes) {} NodeID new_ghost_node(const GlobalNodeID global_node) { GhostNodeMap::accessor entry; if (_global_to_ghost.insert(entry, global_node)) { const NodeID ghost_node = _next_ghost_node++; entry->second = ghost_node; - _global_to_ghost_bitmap.set(global_node); } else { [[maybe_unused]] const bool found = _global_to_ghost.find(entry, global_node); KASSERT(found); @@ -108,11 +134,6 @@ class CompactGhostNodeMappingBuilder { const GlobalNodeID num_global_nodes = _node_distribution.back(); const std::size_t num_processes = _node_distribution.size() - 1; - RECORD("dense_global_to_ghost") - CompactStaticArray dense_global_to_ghost( - math::byte_width(num_ghost_nodes - 1), num_ghost_nodes - ); - RECORD("ghost_to_global") CompactStaticArray ghost_to_global( math::byte_width(num_global_nodes - 1), num_ghost_nodes @@ -121,33 +142,76 @@ class CompactGhostNodeMappingBuilder { RECORD("ghost_owner") CompactStaticArray ghost_owner(math::byte_width(num_processes - 1), num_ghost_nodes); - _global_to_ghost_bitmap.update(); - for (const auto [global_node, local_node] : _global_to_ghost) { - const NodeID local_ghost = local_node - _num_nodes; + const auto foreach_global_to_ghost = [&](auto &&l) { + for (const auto [global_node, local_node] : _global_to_ghost) { + const NodeID local_ghost = local_node - _num_nodes; - const auto owner_it = - std::upper_bound(_node_distribution.begin() + 1, _node_distribution.end(), global_node); - const auto owner = static_cast(std::distance(_node_distribution.begin(), owner_it) - 1); + const auto owner_it = + std::upper_bound(_node_distribution.begin() + 1, _node_distribution.end(), global_node); + const auto owner = + static_cast(std::distance(_node_distribution.begin(), owner_it) - 1); - KASSERT(local_ghost < dense_global_to_ghost.size()); - KASSERT(local_ghost < 
ghost_to_global.size()); - KASSERT(local_ghost < ghost_owner.size()); + l(global_node, local_node, local_ghost, owner); + } + }; - const std::size_t dense_index = _global_to_ghost_bitmap.rank(global_node); - dense_global_to_ghost.write(dense_index, local_ghost); + const std::size_t sparse_size = + num_ghost_nodes * sizeof(growt::StaticGhostNodeMapping::atomic_slot_type); + const std::size_t dense_size = + num_global_nodes / 8 + num_ghost_nodes * math::byte_width(num_ghost_nodes - 1); + + if (sparse_size >= dense_size) { + RankCombinedBitVector global_to_ghost_bitmap(_node_distribution.back()); + foreach_global_to_ghost([&](const GlobalNodeID global_node, + const NodeID local_node, + const NodeID local_ghost, + const PEID owner) { global_to_ghost_bitmap.set(global_node); }); + global_to_ghost_bitmap.update(); + + RECORD("dense_global_to_ghost") + CompactStaticArray dense_global_to_ghost( + math::byte_width(num_ghost_nodes - 1), num_ghost_nodes + ); + foreach_global_to_ghost([&](const GlobalNodeID global_node, + const NodeID local_node, + const NodeID local_ghost, + const PEID owner) { + const std::size_t dense_index = global_to_ghost_bitmap.rank(global_node); + dense_global_to_ghost.write(dense_index, local_ghost); + + ghost_to_global.write(local_ghost, global_node); + ghost_owner.write(local_ghost, owner); + }); + + return CompactGhostNodeMapping( + _num_nodes, + num_ghost_nodes, + std::move(global_to_ghost_bitmap), + std::move(dense_global_to_ghost), + std::move(ghost_to_global), + std::move(ghost_owner) + ); + } else { + growt::StaticGhostNodeMapping global_to_ghost(num_ghost_nodes); + foreach_global_to_ghost([&](const GlobalNodeID global_node, + const NodeID local_node, + const NodeID local_ghost, + const PEID owner) { + DBG << "Map global node " << global_node << " to local ghost node " << local_node; + global_to_ghost.insert(global_node + 1, local_node); - ghost_to_global.write(local_ghost, global_node); - ghost_owner.write(local_ghost, owner); + ghost_to_global.write(local_ghost, global_node); + ghost_owner.write(local_ghost, owner); + }); + + return CompactGhostNodeMapping( + _num_nodes, + num_ghost_nodes, + std::move(global_to_ghost), + std::move(ghost_to_global), + std::move(ghost_owner) + ); } - - return CompactGhostNodeMapping( - _num_nodes, - num_ghost_nodes, - std::move(_global_to_ghost_bitmap), - std::move(dense_global_to_ghost), - std::move(ghost_to_global), - std::move(ghost_owner) - ); } private: @@ -156,7 +220,6 @@ class CompactGhostNodeMappingBuilder { NodeID _next_ghost_node; GhostNodeMap _global_to_ghost; - RankCombinedBitVector<> _global_to_ghost_bitmap; }; namespace graph { diff --git a/kaminpar-shm/coarsening/cluster_coarsener.cc b/kaminpar-shm/coarsening/cluster_coarsener.cc index 42f534fe..6caed5e1 100644 --- a/kaminpar-shm/coarsening/cluster_coarsener.cc +++ b/kaminpar-shm/coarsening/cluster_coarsener.cc @@ -95,6 +95,18 @@ PartitionedGraph ClusteringCoarsener::uncoarsen(PartitionedGraph &&p_graph) { return {current(), p_graph_k, std::move(partition)}; } +void ClusteringCoarsener::release_allocated_memory() { + SCOPED_HEAP_PROFILER("Deallocation"); + SCOPED_TIMER("Deallocation"); + + _clustering_algorithm.reset(); + + _contraction_m_ctx.buckets.free(); + _contraction_m_ctx.buckets_index.free(); + _contraction_m_ctx.leader_mapping.free(); + _contraction_m_ctx.all_buffered_nodes.free(); +} + std::unique_ptr ClusteringCoarsener::pop_hierarchy(PartitionedGraph &&p_graph) { KASSERT(!empty(), "cannot pop from an empty graph hierarchy", assert::light); diff --git 
a/kaminpar-shm/coarsening/cluster_coarsener.h b/kaminpar-shm/coarsening/cluster_coarsener.h index 6f443a02..84355429 100644 --- a/kaminpar-shm/coarsening/cluster_coarsener.h +++ b/kaminpar-shm/coarsening/cluster_coarsener.h @@ -38,6 +38,8 @@ class ClusteringCoarsener : public Coarsener { return _hierarchy.size(); } + void release_allocated_memory() final; + private: std::unique_ptr pop_hierarchy(PartitionedGraph &&p_graph); diff --git a/kaminpar-shm/coarsening/clustering/lp_clusterer.cc b/kaminpar-shm/coarsening/clustering/lp_clusterer.cc index 3e88ae72..aff20e7a 100644 --- a/kaminpar-shm/coarsening/clustering/lp_clusterer.cc +++ b/kaminpar-shm/coarsening/clustering/lp_clusterer.cc @@ -49,6 +49,7 @@ class LPClusteringImpl final Base::set_max_degree(_lp_ctx.large_degree_threshold); Base::set_max_num_neighbors(_lp_ctx.max_num_neighbors); Base::set_implementation(_lp_ctx.impl); + Base::set_tie_breaking_strategy(_lp_ctx.tie_breaking_strategy); Base::set_second_phase_selection_strategy(_lp_ctx.second_phase_selection_strategy); Base::set_second_phase_aggregation_strategy(_lp_ctx.second_phase_aggregation_strategy); Base::set_relabel_before_second_phase(_lp_ctx.relabel_before_second_phase); @@ -246,55 +247,100 @@ class LPClusteringImpl final template [[nodiscard]] ClusterID select_best_cluster( const bool store_favored_cluster, + const EdgeWeight gain_delta, Base::ClusterSelectionState &state, RatingMap &map, - ScalableVector &tie_breaking_clusters + ScalableVector &tie_breaking_clusters, + ScalableVector &tie_breaking_favored_clusters ) { + const bool use_uniform_tie_breaking = _tie_breaking_strategy == TieBreakingStrategy::UNIFORM; + ClusterID favored_cluster = state.initial_cluster; + if (use_uniform_tie_breaking) { + const auto accept_cluster = [&] { + return state.current_cluster_weight + state.u_weight <= + max_cluster_weight(state.current_cluster) || + state.current_cluster == state.initial_cluster; + }; + + for (const auto [cluster, rating] : map.entries()) { + state.current_cluster = cluster; + state.current_gain = rating - gain_delta; + state.current_cluster_weight = cluster_weight(cluster); + + if (state.current_gain > state.best_gain) { + if (store_favored_cluster) { + tie_breaking_favored_clusters.clear(); + tie_breaking_favored_clusters.push_back(state.current_cluster); + + favored_cluster = state.current_cluster; + } - const EdgeWeight gain_delta = (Config::kUseActualGain) ? 
map[state.initial_cluster] : 0; - for (const auto [cluster, rating] : map.entries()) { - state.current_cluster = cluster; - state.current_gain = rating - gain_delta; - state.current_cluster_weight = cluster_weight(cluster); + if (accept_cluster()) { + tie_breaking_clusters.clear(); + tie_breaking_clusters.push_back(state.current_cluster); - if (state.current_gain > state.best_gain) { - if (store_favored_cluster) { - favored_cluster = state.current_cluster; + state.best_cluster = state.current_cluster; + state.best_gain = state.current_gain; + } + } else if (state.current_gain == state.best_gain) { + if (store_favored_cluster) { + tie_breaking_favored_clusters.push_back(state.current_cluster); + } + + if (accept_cluster()) { + tie_breaking_clusters.push_back(state.current_cluster); + } } + } - if (accept_cluster(state)) { - tie_breaking_clusters.clear(); - tie_breaking_clusters.push_back(state.current_cluster); + if (tie_breaking_clusters.size() > 1) { + const ClusterID i = state.local_rand.random_index(0, tie_breaking_clusters.size()); + const ClusterID best_cluster = tie_breaking_clusters[i]; + state.best_cluster = best_cluster; + } + tie_breaking_clusters.clear(); + if (tie_breaking_favored_clusters.size() > 1) { + const ClusterID i = state.local_rand.random_index(0, tie_breaking_favored_clusters.size()); + const ClusterID best_favored_cluster = tie_breaking_favored_clusters[i]; + favored_cluster = best_favored_cluster; + } + tie_breaking_favored_clusters.clear(); + + return favored_cluster; + } else { + const auto accept_cluster = [&] { + return (state.current_gain > state.best_gain || + (state.current_gain == state.best_gain && state.local_rand.random_bool())) && + (state.current_cluster_weight + state.u_weight <= + max_cluster_weight(state.current_cluster) || + state.current_cluster == state.initial_cluster); + }; + + for (const auto [cluster, rating] : map.entries()) { + state.current_cluster = cluster; + state.current_gain = rating - gain_delta; + state.current_cluster_weight = cluster_weight(cluster); + + if (store_favored_cluster && state.current_gain > state.best_gain) { + favored_cluster = state.current_cluster; + } + + if (accept_cluster()) { state.best_cluster = state.current_cluster; + state.best_cluster_weight = state.current_cluster_weight; state.best_gain = state.current_gain; } - } else if (state.current_gain == state.best_gain && accept_cluster(state)) { - tie_breaking_clusters.push_back(state.current_cluster); } - } - if (tie_breaking_clusters.size() > 1) { - const ClusterID index = state.local_rand.random_index(0, tie_breaking_clusters.size()); - const ClusterID best_cluster = tie_breaking_clusters[index]; - state.best_cluster = best_cluster; + return favored_cluster; } - - tie_breaking_clusters.clear(); - return favored_cluster; - } - - [[nodiscard]] bool accept_cluster(const Base::ClusterSelectionState &state) { - return (state.current_gain > state.best_gain || - (state.current_gain == state.best_gain && state.local_rand.random_bool())) && - (state.current_cluster_weight + state.u_weight <= - max_cluster_weight(state.current_cluster) || - state.current_cluster == state.initial_cluster); } using Base::_current_num_clusters; using Base::_graph; + using Base::_tie_breaking_strategy; const LabelPropagationCoarseningContext &_lp_ctx; NodeWeight _max_cluster_weight = kInvalidBlockWeight; diff --git a/kaminpar-shm/coarsening/coarsener.h b/kaminpar-shm/coarsening/coarsener.h index e3e608e1..929e0a14 100644 --- a/kaminpar-shm/coarsening/coarsener.h +++ 
b/kaminpar-shm/coarsening/coarsener.h @@ -67,5 +67,11 @@ class Coarsener { * @return partition of the *new* coarsest graph. */ virtual PartitionedGraph uncoarsen(PartitionedGraph &&p_graph) = 0; + + /** + * Releases the memory reserved for coarsening; afterwards, coarsen() can no longer be + * called. + */ + virtual void release_allocated_memory() = 0; }; } // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/contraction/unbuffered_cluster_contraction.cc b/kaminpar-shm/coarsening/contraction/unbuffered_cluster_contraction.cc index 665966d5..d9423e40 100644 --- a/kaminpar-shm/coarsening/contraction/unbuffered_cluster_contraction.cc +++ b/kaminpar-shm/coarsening/contraction/unbuffered_cluster_contraction.cc @@ -12,6 +12,7 @@ #include "kaminpar-shm/kaminpar.h" #include "kaminpar-common/datastructures/compact_static_array.h" +#include "kaminpar-common/datastructures/dynamic_map.h" #include "kaminpar-common/datastructures/rating_map.h" #include "kaminpar-common/datastructures/static_array.h" #include "kaminpar-common/heap_profiler.h" @@ -19,7 +20,7 @@ namespace kaminpar::shm::contraction { namespace { -class ConstantSizeNeighborhoodsBuffer { +class NeighborhoodsBuffer { static constexpr NodeID kSize = 30000; // Chosen such that its about 1 MB in size public: @@ -27,7 +28,7 @@ return degree >= kSize; } - ConstantSizeNeighborhoodsBuffer( + NeighborhoodsBuffer( EdgeID *nodes, NodeID *edges, NodeWeight *node_weights, @@ -123,29 +124,18 @@ std::unique_ptr contract_clustering_unbuffered( // Overcomit memory for the edge and edge weight array as we only know the amount of edges of // the coarse graph afterwards. - const EdgeID edge_count = graph.m(); - auto c_edges = heap_profiler::overcommit_memory(edge_count); - auto c_edge_weights = heap_profiler::overcommit_memory(edge_count); + const EdgeID num_fine_edges = graph.m(); + auto c_edges = heap_profiler::overcommit_memory(num_fine_edges); + auto c_edge_weights = heap_profiler::overcommit_memory(num_fine_edges); START_HEAP_PROFILER("Construct coarse graph"); START_TIMER("Construct coarse graph"); - CompactStaticArray remapping(static_cast(math::byte_width(c_n)), c_n); - - tbb::enumerable_thread_specific> collector{[&] { return RatingMap(c_n); }}; - - tbb::enumerable_thread_specific neighborhoods_buffer_ets{[&] { return ConstantSizeNeighborhoodsBuffer( c_nodes.data(), c_edges.get(), c_node_weights.data(), c_edge_weights.get(), remapping ); }}; - + CompactStaticArray remapping(math::byte_width(c_n), c_n); const auto write_neighbourhood = [&](const NodeID c_u, + const NodeWeight c_u_weight, const NodeID new_c_u, EdgeID edge, - const NodeWeight c_u_weight, auto &map) { remapping.write(c_u, new_c_u); @@ -160,86 +150,117 @@ std::unique_ptr contract_clustering_unbuffered( }; __uint128_t next_coarse_node_info = 0; - const auto &atomic_fetch_next_coarse_node_info = [&](std::uint64_t nodes, std::uint64_t degree) { - std::uint64_t old_c_v; - std::uint64_t old_edge; + const auto atomic_fetch_next_coarse_node_info = [&](const std::uint64_t nodes, + const std::uint64_t degree) { + std::uint64_t c_v; + std::uint64_t edge; bool success; do { - __uint128_t expected = next_coarse_node_info; - old_c_v = (expected >> 64) & 0xFFFFFFFFFFFFFFFF; - old_edge = expected & 0xFFFFFFFFFFFFFFFF; + const __uint128_t expected = next_coarse_node_info; + c_v = (expected >> 64) & 0xFFFFFFFFFFFFFFFF; + edge = expected & 0xFFFFFFFFFFFFFFFF; - __uint128_t desired = (static_cast<__uint128_t>(old_c_v + nodes) << 64) |
- static_cast<__uint128_t>(old_edge + degree); + const __uint128_t desired = + (static_cast<__uint128_t>(c_v + nodes) << 64) | static_cast<__uint128_t>(edge + degree); success = __sync_bool_compare_and_swap(&next_coarse_node_info, expected, desired); } while (!success); - return std::make_pair(old_c_v, old_edge); + return std::make_pair(c_v, edge); }; - tbb::parallel_for(tbb::blocked_range(0, c_n), [&](const auto &r) { - auto &local_collector = collector.local(); - auto &local_buffer = neighborhoods_buffer_ets.local(); - - for (NodeID c_u = r.begin(); c_u != r.end(); ++c_u) { - const NodeID first = buckets_index[c_u]; - const NodeID last = buckets_index[c_u + 1]; + const auto aggregate_edges = [&](const NodeID c_u, + const NodeID first, + const NodeID last, + auto &edge_collector, + auto &neighborhood_buffer) { + NodeWeight c_u_weight = 0; + for (NodeID i = first; i < last; ++i) { + const NodeID u = buckets[i]; + KASSERT(mapping[u] == c_u); + + c_u_weight += graph.node_weight(u); + + graph.adjacent_nodes(u, [&](const NodeID v, const EdgeWeight w) { + const NodeID c_v = mapping[v]; + if (c_u != c_v) { + edge_collector[c_v] += w; + } + }); + } - // Build coarse graph - const auto collect_edges = [&](auto &map) { - NodeWeight c_u_weight = 0; - for (NodeID i = first; i < last; ++i) { - const NodeID u = buckets[i]; - KASSERT(mapping[u] == c_u); + const std::size_t degree = edge_collector.size(); + if (NeighborhoodsBuffer::exceeds_capacity(degree)) { + auto [new_c_u, edge] = atomic_fetch_next_coarse_node_info(1, degree); + write_neighbourhood(c_u, c_u_weight, new_c_u, edge, edge_collector); + } else if (neighborhood_buffer.overfills(degree)) { + const NodeID num_buffered_nodes = neighborhood_buffer.num_buffered_nodes(); + const EdgeID num_buffered_edges = neighborhood_buffer.num_buffered_edges(); + const auto [new_c_u, edge] = + atomic_fetch_next_coarse_node_info(num_buffered_nodes + 1, num_buffered_edges + degree); + neighborhood_buffer.flush(new_c_u, edge); + write_neighbourhood( + c_u, c_u_weight, new_c_u + num_buffered_nodes, edge + num_buffered_edges, edge_collector + ); + } else { + neighborhood_buffer.add(c_u, degree, c_u_weight, [&](auto &&l) { + for (const auto [c_v, weight] : edge_collector.entries()) { + l(c_v, weight); + } + }); + } - c_u_weight += graph.node_weight(u); + edge_collector.clear(); + }; - graph.adjacent_nodes(u, [&](const NodeID v, const EdgeWeight w) { - const NodeID c_v = mapping[v]; - if (c_u != c_v) { - map[c_v] += w; - } - }); - } + tbb::enumerable_thread_specific neighborhoods_buffer_ets{[&] { + return NeighborhoodsBuffer( + c_nodes.data(), c_edges.get(), c_node_weights.data(), c_edge_weights.get(), remapping + ); + }}; - const std::size_t degree = map.size(); - if (ConstantSizeNeighborhoodsBuffer::exceeds_capacity(degree)) { - auto [new_c_u, edge] = atomic_fetch_next_coarse_node_info(1, degree); - write_neighbourhood(c_u, new_c_u, edge, c_u_weight, map); - } else if (local_buffer.overfills(degree)) { - const NodeID num_buffered_nodes = local_buffer.num_buffered_nodes(); - const EdgeID num_buffered_edges = local_buffer.num_buffered_edges(); - const auto [new_c_u, edge] = atomic_fetch_next_coarse_node_info( - num_buffered_nodes + 1, num_buffered_edges + degree - ); - local_buffer.flush(new_c_u, edge); - write_neighbourhood( - c_u, new_c_u + num_buffered_nodes, edge + num_buffered_edges, c_u_weight, map - ); - } else { - local_buffer.add(c_u, degree, c_u_weight, [&](auto &&l) { - for (const auto [c_v, weight] : map.entries()) { - l(c_v, weight); - } - }); - } + if 
(con_ctx.use_growing_hash_tables) { + using EdgeCollector = DynamicRememberingFlatMap; + tbb::enumerable_thread_specific edge_collector_ets; - map.clear(); - }; + tbb::parallel_for(tbb::blocked_range(0, c_n), [&](const auto &r) { + auto &local_collector = edge_collector_ets.local(); + auto &local_buffer = neighborhoods_buffer_ets.local(); - // To select the right map, we need a upper bound on the coarse node degree. If we - // previously split the coarse nodes into chunks, we have already computed them and stored - // them in the c_nodes array. - NodeID upper_bound_degree = 0; - for (NodeID i = first; i < last; ++i) { - const NodeID u = buckets[i]; - upper_bound_degree += graph.degree(u); + for (NodeID c_u = r.begin(); c_u != r.end(); ++c_u) { + const NodeID first = buckets_index[c_u]; + const NodeID last = buckets_index[c_u + 1]; + aggregate_edges(c_u, first, last, local_collector, local_buffer); } + }); + } else { + using EdgeCollector = RatingMap; + tbb::enumerable_thread_specific edge_collector_ets{[&] { + return EdgeCollector(c_n); + }}; + + tbb::parallel_for(tbb::blocked_range(0, c_n), [&](const auto &r) { + auto &local_collector = edge_collector_ets.local(); + auto &local_buffer = neighborhoods_buffer_ets.local(); + + for (NodeID c_u = r.begin(); c_u != r.end(); ++c_u) { + const NodeID first = buckets_index[c_u]; + const NodeID last = buckets_index[c_u + 1]; + + // To select the right map, we compute an upper bound on the coarse node degree by summing + // the degrees of all fine nodes. + NodeID upper_bound_degree = 0; + for (NodeID i = first; i < last; ++i) { + const NodeID u = buckets[i]; + upper_bound_degree += graph.degree(u); } - local_collector.execute(upper_bound_degree, collect_edges); - } - }); + local_collector.execute(upper_bound_degree, [&](auto &edge_collector) { + aggregate_edges(c_u, first, last, edge_collector, local_buffer); + }); + } + }); + } tbb::parallel_for(neighborhoods_buffer_ets.range(), [&](auto &r) { for (auto &buffer : r) { diff --git a/kaminpar-shm/coarsening/noop_coarsener.h b/kaminpar-shm/coarsening/noop_coarsener.h index 243580c3..1b6f8bfa 100644 --- a/kaminpar-shm/coarsening/noop_coarsener.h +++ b/kaminpar-shm/coarsening/noop_coarsener.h @@ -35,6 +35,8 @@ class NoopCoarsener : public Coarsener { return std::move(p_graph); } + void release_allocated_memory() final {}; + private: const Graph *_graph = nullptr; }; diff --git a/kaminpar-shm/context.cc b/kaminpar-shm/context.cc index bbe090cd..8dd9c975 100644 --- a/kaminpar-shm/context.cc +++ b/kaminpar-shm/context.cc @@ -22,8 +22,7 @@ void GraphCompressionContext::setup(const Graph &graph) { interval_encoding = CompressedGraph::kIntervalEncoding; interval_length_treshold = CompressedGraph::kIntervalLengthTreshold; run_length_encoding = CompressedGraph::kRunLengthEncoding; - stream_encoding = CompressedGraph::kStreamEncoding; - isolated_nodes_separation = CompressedGraph::kIsolatedNodesSeparation; + streamvbyte_encoding = CompressedGraph::kStreamVByteEncoding; if (enabled) { if (const auto *compressed_graph = diff --git a/kaminpar-shm/context_io.cc b/kaminpar-shm/context_io.cc index 63106ecc..4c967029 100644 --- a/kaminpar-shm/context_io.cc +++ b/kaminpar-shm/context_io.cc @@ -15,7 +15,6 @@ #include "kaminpar-common/asserting_cast.h" #include "kaminpar-common/console_io.h" -#include "kaminpar-common/graph-compression/varint_codec.h" #include "kaminpar-common/random.h" #include "kaminpar-common/strutils.h" @@ -288,6 +287,24 @@ std::ostream &operator<<(std::ostream &out, const GainCacheStrategy strategy) {
return out << ""; } +std::unordered_map get_tie_breaking_strategies() { + return { + {"geometric", TieBreakingStrategy::GEOMETRIC}, + {"uniform", TieBreakingStrategy::UNIFORM}, + }; +} + +std::ostream &operator<<(std::ostream &out, const TieBreakingStrategy strategy) { + switch (strategy) { + case TieBreakingStrategy::GEOMETRIC: + return out << "geometric"; + case TieBreakingStrategy::UNIFORM: + return out << "uniform"; + } + + return out << ""; +} + std::ostream &operator<<(std::ostream &out, const TwoHopStrategy strategy) { switch (strategy) { case TwoHopStrategy::DISABLE: @@ -393,8 +410,8 @@ void print(const GraphCompressionContext &c_ctx, std::ostream &out) { out << "Compression Scheme: Gap Encoding + "; if (c_ctx.run_length_encoding) { out << "VarInt Run-Length Encoding\n"; - } else if (c_ctx.stream_encoding) { - out << "VarInt Stream Encoding\n"; + } else if (c_ctx.streamvbyte_encoding) { + out << "StreamVByte Encoding\n"; } else { out << "VarInt Encoding\n"; } @@ -410,8 +427,6 @@ void print(const GraphCompressionContext &c_ctx, std::ostream &out) { if (c_ctx.interval_encoding) { out << " Length Threshold: " << c_ctx.interval_length_treshold << "\n"; } - out << " Isolated Nodes Separation: " << (c_ctx.isolated_nodes_separation ? "yes" : "no") - << "\n"; out << "Compresion Ratio: "; if (c_ctx.dismissed) { @@ -424,29 +439,6 @@ void print(const GraphCompressionContext &c_ctx, std::ostream &out) { out << " High Degree Part Count: " << c_ctx.num_high_degree_parts << "\n"; out << " Interval Node Count: " << c_ctx.num_interval_nodes << "\n"; out << " Interval Count: " << c_ctx.num_intervals << "\n"; - - if (debug::kTrackVarintStats) { - const auto &stats = debug::varint_stats_global(); - - const float avg_varint_len = - (stats.varint_count == 0) ? 0 : (stats.varint_bytes / (float)stats.varint_count); - out << "Average Varint Length: " << avg_varint_len - << " [count: " << stats.varint_count << "]\n"; - - const float avg_signed_varint_len = - (stats.signed_varint_count == 0) - ? 0 - : (stats.signed_varint_bytes / (float)stats.signed_varint_count); - out << "Average Signed Varint Length: " << avg_signed_varint_len - << " [count: " << stats.signed_varint_count << "]\n"; - - const float avg_marked_varint_len = - (stats.marked_varint_count == 0) - ? 0 - : (stats.marked_varint_bytes / (float)stats.marked_varint_count); - out << "Average Marked Varint Length: " << avg_marked_varint_len - << " [count: " << stats.marked_varint_count << "]\n"; - } } } } @@ -494,6 +486,9 @@ void print(const CoarseningContext &c_ctx, std::ostream &out) { out << "Contraction mode: " << c_ctx.contraction.mode << '\n'; if (c_ctx.contraction.mode == ContractionMode::BUFFERED) { out << " Edge buffer fill fraction: " << c_ctx.contraction.edge_buffer_fill_fraction << "\n"; + } else if (c_ctx.contraction.mode == ContractionMode::UNBUFFERED) { + out << " Use growing hash tables: " + << (c_ctx.contraction.use_growing_hash_tables ? 
"yes" : "no") << "\n"; } } @@ -501,6 +496,7 @@ void print(const LabelPropagationCoarseningContext &lp_ctx, std::ostream &out) { out << " Number of iterations: " << lp_ctx.num_iterations << "\n"; out << " High degree threshold: " << lp_ctx.large_degree_threshold << "\n"; out << " Max degree: " << lp_ctx.max_num_neighbors << "\n"; + out << " Tie breaking strategy: " << lp_ctx.tie_breaking_strategy << "\n"; out << " Cluster weights struct: " << lp_ctx.cluster_weights_structure << "\n"; out << " Implementation: " << lp_ctx.impl << "\n"; if (lp_ctx.impl == LabelPropagationImplementation::TWO_PHASE) { @@ -524,6 +520,7 @@ void print(const RefinementContext &r_ctx, std::ostream &out) { if (r_ctx.includes_algorithm(RefinementAlgorithm::LABEL_PROPAGATION)) { out << "Label propagation:\n"; out << " Number of iterations: " << r_ctx.lp.num_iterations << "\n"; + out << " Tie breaking strategy: " << r_ctx.lp.tie_breaking_strategy << "\n"; out << " Implementation: " << r_ctx.lp.impl << "\n"; if (r_ctx.lp.impl == LabelPropagationImplementation::TWO_PHASE) { out << " Selection strategy: " << r_ctx.lp.second_phase_selection_strategy << '\n'; diff --git a/kaminpar-shm/context_io.h b/kaminpar-shm/context_io.h index 25f21b1f..467ec963 100644 --- a/kaminpar-shm/context_io.h +++ b/kaminpar-shm/context_io.h @@ -59,6 +59,10 @@ std::unordered_map get_initial_partitionin std::ostream &operator<<(std::ostream &out, GainCacheStrategy strategy); +std::unordered_map get_tie_breaking_strategies(); + +std::ostream &operator<<(std::ostream &out, TieBreakingStrategy strategy); + std::ostream &operator<<(std::ostream &out, SecondPhaseSelectionStrategy strategy); std::unordered_map diff --git a/kaminpar-shm/datastructures/compressed_graph.h b/kaminpar-shm/datastructures/compressed_graph.h index 883c5705..ba89f8e3 100644 --- a/kaminpar-shm/datastructures/compressed_graph.h +++ b/kaminpar-shm/datastructures/compressed_graph.h @@ -74,16 +74,9 @@ class CompressedGraph : public AbstractGraph { static constexpr bool kRunLengthEncoding = CompressedNeighborhoods::kRunLengthEncoding; /*! - * Whether stream encoding is used. + * Whether StreamVByte encoding is used. */ - static constexpr bool kStreamEncoding = CompressedNeighborhoods::kStreamEncoding; - - /*! - * Whether the isolated nodes of the compressed graph are continuously stored - * at the end of the nodes array. - */ - static constexpr bool kIsolatedNodesSeparation = - CompressedNeighborhoods::kIsolatedNodesSeparation; + static constexpr bool kStreamVByteEncoding = CompressedNeighborhoods::kStreamVByteEncoding; /*! * Constructs a new compressed graph. 
@@ -144,7 +137,7 @@ class CompressedGraph : public AbstractGraph { } [[nodiscard]] inline EdgeWeight total_edge_weight() const final { - return _total_edge_weight; + return _compressed_neighborhoods.total_edge_weight(); } // @@ -184,7 +177,7 @@ class CompressedGraph : public AbstractGraph { constexpr bool kDecodeEdgeWeights = std::is_invocable_v; static_assert(kDontDecodeEdgeWeights || kDecodeEdgeWeights); - _compressed_neighborhoods.decode(u, [&](const EdgeID, const NodeID v, const EdgeWeight w) { + _compressed_neighborhoods.adjacent_nodes(u, [&](const NodeID v, const EdgeWeight w) { if constexpr (kDecodeEdgeWeights) { return l(v, w); } else { @@ -198,7 +191,7 @@ class CompressedGraph : public AbstractGraph { constexpr bool kDecodeEdgeWeights = std::is_invocable_v; static_assert(kDontDecodeEdgeWeights || kDecodeEdgeWeights); - _compressed_neighborhoods.decode(u, [&](const EdgeID e, const NodeID v, const EdgeWeight w) { + _compressed_neighborhoods.neighbors(u, [&](const EdgeID e, const NodeID v, const EdgeWeight w) { if constexpr (kDecodeEdgeWeights) { return l(e, v, w); } else { @@ -214,7 +207,7 @@ class CompressedGraph : public AbstractGraph { static_assert(kDontDecodeEdgeWeights || kDecodeEdgeWeights); _compressed_neighborhoods - .decode(u, max_num_neighbors, [&](const EdgeID e, const NodeID v, const EdgeWeight w) { + .neighbors(u, max_num_neighbors, [&](const EdgeID e, const NodeID v, const EdgeWeight w) { if constexpr (kDecodeEdgeWeights) { return l(e, v, w); } else { @@ -239,8 +232,7 @@ class CompressedGraph : public AbstractGraph { inline void pfor_neighbors( const NodeID u, const NodeID max_num_neighbors, const NodeID grainsize, Lambda &&l ) const { - constexpr bool kParallelDecoding = true; - _compressed_neighborhoods.decode(u, std::forward(l)); + _compressed_neighborhoods.parallel_neighbors(u, std::forward(l)); } // @@ -428,7 +420,6 @@ class CompressedGraph : public AbstractGraph { NodeWeight _max_node_weight = kInvalidNodeWeight; NodeWeight _total_node_weight = kInvalidNodeWeight; - EdgeWeight _total_edge_weight = kInvalidEdgeWeight; StaticArray _permutation; bool _sorted; diff --git a/kaminpar-shm/graphutils/parallel_compressed_graph_builder.h b/kaminpar-shm/graphutils/parallel_compressed_graph_builder.h index 12f798e4..32a5eb6f 100644 --- a/kaminpar-shm/graphutils/parallel_compressed_graph_builder.h +++ b/kaminpar-shm/graphutils/parallel_compressed_graph_builder.h @@ -210,7 +210,7 @@ template < }); using CompressedEdgesBuilder = kaminpar::CompressedEdgesBuilder; - tbb::enumerable_thread_specific neighbourhood_builder_ets([&] { + tbb::enumerable_thread_specific edges_builder_ets([&] { return CompressedEdgesBuilder( num_nodes, num_edges, max_degree, kHasEdgeWeights, builder.edge_weights() ); @@ -227,13 +227,13 @@ template < auto &offsets = offsets_ets.local(); auto &neighbourhood = neighbourhood_ets.local(); - auto &neighbourhood_builder = neighbourhood_builder_ets.local(); + auto &edges_builder = edges_builder_ets.local(); const NodeID chunk = buffer.next(); const auto [start, end, first_edge] = chunks[chunk]; NodeWeight local_node_weight = 0; - neighbourhood_builder.init(first_edge); + edges_builder.init(first_edge); // Compress the neighborhoods of the nodes in the fetched chunk. 
debug::scoped_time(dbg.compression_time, [&] { @@ -256,7 +256,7 @@ template < edge += 1; } - const EdgeID local_offset = neighbourhood_builder.add(i, neighbourhood); + const EdgeID local_offset = edges_builder.add(i, neighbourhood); offsets.push_back(local_offset); neighbourhood.clear(); @@ -265,7 +265,7 @@ template < // Wait for the parallel tasks that process the previous chunks to finish. const EdgeID offset = debug::scoped_time(dbg.sync_time, [&] { - const EdgeID compressed_neighborhoods_size = neighbourhood_builder.size(); + const EdgeID compressed_neighborhoods_size = edges_builder.size(); return buffer.fetch_and_update(chunk, compressed_neighborhoods_size); }); @@ -287,23 +287,20 @@ template < } offsets.clear(); - builder.add_compressed_edges( - offset, neighbourhood_builder.size(), neighbourhood_builder.compressed_data() - ); - + builder.add_compressed_edges(offset, edges_builder.size(), edges_builder.compressed_data()); builder.record_local_statistics( - neighbourhood_builder.max_degree(), - neighbourhood_builder.total_edge_weight(), - neighbourhood_builder.num_high_degree_nodes(), - neighbourhood_builder.num_high_degree_parts(), - neighbourhood_builder.num_interval_nodes(), - neighbourhood_builder.num_intervals() + edges_builder.max_degree(), + edges_builder.total_edge_weight(), + edges_builder.num_high_degree_nodes(), + edges_builder.num_high_degree_parts(), + edges_builder.num_interval_nodes(), + edges_builder.num_intervals() ); }); }); IF_DBG debug::print_graph_compression_stats(dbg_ets); - IF_DBG debug::print_compressed_graph_stats(neighbourhood_builder_ets); + IF_DBG debug::print_compressed_graph_stats(edges_builder_ets); return CompressedGraph(builder.build(), std::move(node_weights_array), sorted); } diff --git a/kaminpar-shm/graphutils/permutator.h b/kaminpar-shm/graphutils/permutator.h index 83ec7f7c..460254cc 100644 --- a/kaminpar-shm/graphutils/permutator.h +++ b/kaminpar-shm/graphutils/permutator.h @@ -39,7 +39,7 @@ template