diff --git a/CMakeLists.txt b/CMakeLists.txt index 99f57f751..b05ff222a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,6 +147,7 @@ include_directories(SYSTEM ${gtest_SOURCE_DIR}/../googlemock/include ${gtest_SOU include_directories(external_tools/kahypar-shared-resources) include_directories(external_tools/growt) include_directories(external_tools/WHFC) +include_directories(external_tools/pcg) if(KAHYPAR_DOWNLOAD_BOOST) # Download Boost diff --git a/README.md b/README.md index 79eb31c07..1c2891da1 100644 --- a/README.md +++ b/README.md @@ -489,7 +489,7 @@ If you use Mt-KaHyPar in an academic setting please cite the appropriate papers. @inproceedings{MT-KAHYPAR-SDET, author = {Lars Gottesb{\"{u}}ren and Michael Hamann}, - title = {{Deterministic Parallel Hypergraph Partitioning}}, + title = {Deterministic Parallel Hypergraph Partitioning}, booktitle = {European Conference on Parallel Processing (Euro-Par)}, volume = {13440}, pages = {301--316}, @@ -498,6 +498,17 @@ If you use Mt-KaHyPar in an academic setting please cite the appropriate papers. doi = {10.1007/978-3-031-12597-3\_19}, } + // Unconstrained Refinement (Under Review) + @article{MT-KAHYPAR-UNCONSTRAINED, + title = {Parallel Unconstrained Local Search for Partitioning Irregular Graphs}, + author = {Nikolai Maas and + Lars Gottesb{\"{u}}ren and + Daniel Seemaier}, + institution = {Karlsruhe Institute of Technology}, + year = {2023}, + url = {https://arxiv.org/abs/2308.15494} + } + // Dissertation of Lars Gottesbüren @phdthesis{MT-KAHYPAR-DIS-GOTTESBUEREN, author = {Lars Gottesb\"{u}ren}, @@ -523,11 +534,11 @@ If you use Mt-KaHyPar in an academic setting please cite the appropriate papers. title = {Scalable High-Quality Hypergraph Partitioning}, author = {Lars Gottesb{\"u}ren and Tobias Heuer and - Nikolai Mass and + Nikolai Maas and Peter Sanders and Sebastian Schlag}, institution = {Karlsruhe Institute of Technology}, - year = {2023} + year = {2023}, url = {https://arxiv.org/abs/2303.17679} } diff --git a/config/default_preset.ini b/config/default_preset.ini index f7d303465..35f2e3278 100644 --- a/config/default_preset.ini +++ b/config/default_preset.ini @@ -57,22 +57,29 @@ i-r-fm-iter-moves-on-recalc=true # main -> initial_partitioning -> refinement -> flows i-r-flow-algo=do_nothing # main -> refinement -r-rebalancer-type=simple_rebalancer +r-rebalancer-type=advanced_rebalancer r-refine-until-no-improvement=false # main -> refinement -> label_propagation r-lp-type=label_propagation +r-lp-unconstrained=true r-lp-maximum-iterations=5 -r-lp-rebalancing=true +r-lp-rebalancing=false r-lp-he-size-activation-threshold=100 +r-lp-relative-improvement-threshold=0.001 # main -> refinement -> fm -r-fm-type=kway_fm +r-fm-type=unconstrained_fm r-fm-multitry-rounds=10 +r-fm-unconstrained-rounds=8 r-fm-perform-moves-global=false r-fm-rollback-parallel=true -r-fm-rollback-balance-violation-factor=1.25 +r-fm-rollback-balance-violation-factor=1.0 +r-fm-threshold-border-node-inclusion=0.7 +r-fm-imbalance-penalty-min=0.2 +r-fm-imbalance-penalty-max=1.0 r-fm-seed-nodes=25 r-fm-release-nodes=true r-fm-min-improvement=-1.0 +r-fm-unconstrained-min-improvement=0.002 r-fm-obey-minimal-parallelism=true r-fm-time-limit-factor=0.25 r-fm-iter-moves-on-recalc=true diff --git a/config/deterministic_preset.ini b/config/deterministic_preset.ini index d05f7be7c..1270b0e34 100644 --- a/config/deterministic_preset.ini +++ b/config/deterministic_preset.ini @@ -40,7 +40,7 @@ i-fm-refinement-rounds=3 i-lp-maximum-iterations=20 i-lp-initial-block-size=5 # main 
-> initial_partitioning -> refinement -r-rebalancer-type=simple_rebalancer +r-rebalancer-type=advanced_rebalancer i-r-refine-until-no-improvement=false # main -> initial_partitioning -> refinement -> label_propagation i-r-lp-type=deterministic diff --git a/config/highest_quality_preset.ini b/config/highest_quality_preset.ini index 728da8efe..0b290d81d 100644 --- a/config/highest_quality_preset.ini +++ b/config/highest_quality_preset.ini @@ -62,7 +62,7 @@ i-r-use-global-fm=false # main -> initial_partitioning -> refinement -> flows i-r-flow-algo=do_nothing # main -> refinement -r-rebalancer-type=simple_rebalancer +r-rebalancer-type=advanced_rebalancer r-refine-until-no-improvement=true r-relative-improvement-threshold=0.0025 r-max-batch-size=1000 diff --git a/config/large_k_preset.ini b/config/large_k_preset.ini index 2ca80e473..deda3580c 100644 --- a/config/large_k_preset.ini +++ b/config/large_k_preset.ini @@ -58,7 +58,7 @@ i-r-fm-type=do_nothing # main -> initial_partitioning -> refinement -> flows i-r-flow-algo=do_nothing # main -> refinement -r-rebalancer-type=simple_rebalancer +r-rebalancer-type=advanced_rebalancer r-refine-until-no-improvement=false # main -> refinement -> label_propagation r-lp-type=label_propagation diff --git a/config/quality_preset.ini b/config/quality_preset.ini index 74c924709..4ee85e6ea 100644 --- a/config/quality_preset.ini +++ b/config/quality_preset.ini @@ -58,23 +58,30 @@ i-r-fm-iter-moves-on-recalc=true # main -> initial_partitioning -> refinement -> flows i-r-flow-algo=do_nothing # main -> refinement -r-rebalancer-type=simple_rebalancer +r-rebalancer-type=advanced_rebalancer r-refine-until-no-improvement=true r-relative-improvement-threshold=0.0025 # main -> refinement -> label_propagation r-lp-type=label_propagation +r-lp-unconstrained=true r-lp-maximum-iterations=5 r-lp-rebalancing=true r-lp-he-size-activation-threshold=100 +r-lp-relative-improvement-threshold=0.001 # main -> refinement -> fm -r-fm-type=kway_fm +r-fm-type=unconstrained_fm r-fm-multitry-rounds=10 +r-fm-unconstrained-rounds=8 r-fm-perform-moves-global=false r-fm-rollback-parallel=true -r-fm-rollback-balance-violation-factor=1.25 +r-fm-rollback-balance-violation-factor=1.0 +r-fm-threshold-border-node-inclusion=0.7 +r-fm-imbalance-penalty-min=0.2 +r-fm-imbalance-penalty-max=1.0 r-fm-seed-nodes=25 r-fm-release-nodes=true r-fm-min-improvement=-1.0 +r-fm-unconstrained-min-improvement=0.002 r-fm-obey-minimal-parallelism=true r-fm-time-limit-factor=0.25 r-fm-iter-moves-on-recalc=true diff --git a/external_tools/pcg/pcg_extras.hpp b/external_tools/pcg/pcg_extras.hpp new file mode 100644 index 000000000..23a416fde --- /dev/null +++ b/external_tools/pcg/pcg_extras.hpp @@ -0,0 +1,637 @@ +/* + * PCG Random Number Generation for C++ + * + * Copyright 2014 Melissa O'Neill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ * For additional information about the PCG random number generation scheme,
+ * including its license and other licensing options, visit
+ *
+ *     http://www.pcg-random.org
+ */
+
+/*
+ * This file provides support code that is useful for random-number generation
+ * but not specific to the PCG generation scheme, including:
+ *      - 128-bit int support for platforms where it isn't available natively
+ *      - bit twiddling operations
+ *      - I/O of 128-bit and 8-bit integers
+ *      - Handling the evilness of SeedSeq
+ *      - Support for efficiently producing random numbers less than a given
+ *        bound
+ */
+
+#ifndef PCG_EXTRAS_HPP_INCLUDED
+#define PCG_EXTRAS_HPP_INCLUDED 1
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef __GNUC__
+    #include
+#endif
+
+/*
+ * Abstractions for compiler-specific directives
+ */
+
+#ifdef __GNUC__
+    #define PCG_NOINLINE __attribute__((noinline))
+#else
+    #define PCG_NOINLINE
+#endif
+
+/*
+ * Some members of the PCG library use 128-bit math. When compiling on 64-bit
+ * platforms, both GCC and Clang provide 128-bit integer types that are ideal
+ * for the job.
+ *
+ * On 32-bit platforms (or with other compilers), we fall back to a C++
+ * class that provides 128-bit unsigned integers instead. It may seem
+ * like we're reinventing the wheel here, because libraries already exist
+ * that support large integers, but most existing libraries provide very
+ * generic multiprecision code, whereas here we're operating at a fixed size.
+ * Also, most other libraries are fairly heavyweight. So we use a direct
+ * implementation. Sadly, it's much slower than hand-coded assembly or
+ * direct CPU support.
+ *
+ */
+#if __SIZEOF_INT128__
+    namespace pcg_extras {
+        typedef __uint128_t pcg128_t;
+    }
+    #define PCG_128BIT_CONSTANT(high,low) \
+            ((pcg128_t(high) << 64) + low)
+#else
+    #include "pcg_uint128.hpp"
+    namespace pcg_extras {
+        typedef pcg_extras::uint_x4 pcg128_t;
+    }
+    #define PCG_128BIT_CONSTANT(high,low) \
+            pcg128_t(high,low)
+    #define PCG_EMULATED_128BIT_MATH 1
+#endif
+
+
+namespace pcg_extras {
+
+/*
+ * We often need to represent a "number of bits". When used normally, these
+ * numbers are never greater than 128, so an unsigned char is plenty.
+ * If you're using a nonstandard generator of a larger size, you can set
+ * PCG_BITCOUNT_T to have it define it as a larger size. (Some compilers
+ * might produce faster code if you set it to an unsigned int.)
+ */
+
+#ifndef PCG_BITCOUNT_T
+    typedef uint8_t bitcount_t;
+#else
+    typedef PCG_BITCOUNT_T bitcount_t;
+#endif
+
+/*
+ * C++ requires us to be able to serialize RNG state by printing or reading
+ * it from a stream. Because we use 128-bit ints, we also need to be able
+ * to print them, so here is code to do so.
+ *
+ * This code provides enough functionality to print 128-bit ints in decimal
+ * and zero-padded in hex. It's not a full-featured implementation.
+ */
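As a usage sketch (an editorial illustration, not part of the upstream header; pcg64 is a typedef added later in this diff in pcg_random.hpp, and the function name is hypothetical), these operators are what let full generator state round-trip through ordinary iostreams:

#include <cassert>
#include <cstdint>
#include <sstream>
#include "pcg_random.hpp"

void roundtrip_demo() {
    pcg64 rng(42u);             // 128-bit state exercises the I/O below
    std::stringstream saved;
    saved << rng;               // serialize multiplier, increment and state
    uint64_t expected = rng();  // advance past the saved point
    saved >> rng;               // restore the saved state
    assert(rng() == expected);  // replaying yields the same output
}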
+
+template <typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits>& out, pcg128_t value)
+{
+    auto desired_base = out.flags() & out.basefield;
+    bool want_hex = desired_base == out.hex;
+
+    if (want_hex) {
+        uint64_t highpart = uint64_t(value >> 64);
+        uint64_t lowpart  = uint64_t(value);
+        auto desired_width = out.width();
+        if (desired_width > 16) {
+            out.width(desired_width - 16);
+        }
+        if (highpart != 0 || desired_width > 16)
+            out << highpart;
+        CharT oldfill;
+        if (highpart != 0) {
+            out.width(16);
+            oldfill = out.fill('0');
+        }
+        auto oldflags = out.setf(decltype(desired_base){}, out.showbase);
+        out << lowpart;
+        out.setf(oldflags);
+        if (highpart != 0) {
+            out.fill(oldfill);
+        }
+        return out;
+    }
+    constexpr size_t MAX_CHARS_128BIT = 40;
+
+    char buffer[MAX_CHARS_128BIT];
+    char* pos = buffer+sizeof(buffer);
+    *(--pos) = '\0';
+    constexpr auto BASE = pcg128_t(10ULL);
+    do {
+        auto div = value / BASE;
+        auto mod = uint32_t(value - (div * BASE));
+        *(--pos) = '0' + mod;
+        value = div;
+    } while(value != pcg128_t(0ULL));
+    return out << pos;
+}
+
+template <typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits>& in, pcg128_t& value)
+{
+    typename std::basic_istream<CharT,Traits>::sentry s(in);
+
+    if (!s)
+        return in;
+
+    constexpr auto BASE = pcg128_t(10ULL);
+    pcg128_t current(0ULL);
+    bool did_nothing = true;
+    bool overflow = false;
+    for(;;) {
+        CharT wide_ch = in.get();
+        if (!in.good())
+            break;
+        auto ch = in.narrow(wide_ch, '\0');
+        if (ch < '0' || ch > '9') {
+            in.unget();
+            break;
+        }
+        did_nothing = false;
+        pcg128_t digit(uint32_t(ch - '0'));
+        pcg128_t timesbase = current*BASE;
+        overflow = overflow || timesbase < current;
+        current = timesbase + digit;
+        overflow = overflow || current < digit;
+    }
+
+    if (did_nothing || overflow) {
+        in.setstate(std::ios::failbit);
+        if (overflow)
+            current = ~pcg128_t(0ULL);
+    }
+
+    value = current;
+
+    return in;
+}
+
+/*
+ * Likewise, if people use tiny rngs, we'll be serializing uint8_t.
+ * If we just used the provided IO operators, they'd read/write chars,
+ * not ints, so we need to define our own. We *can* redefine this operator
+ * here because we're in our own namespace.
+ */
+
+template <typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits>& out, uint8_t value)
+{
+    return out << uint32_t(value);
+}
+
+template <typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits>& in, uint8_t& target)
+{
+    uint32_t value = 0xdecea5edU;
+    in >> value;
+    if (!in && value == 0xdecea5edU)
+        return in;
+    if (value > uint8_t(~0)) {
+        in.setstate(std::ios::failbit);
+        value = ~0U;
+    }
+    target = uint8_t(value);
+    return in;
+}
+
+/* Unfortunately, the above functions don't get found in preference to the
+ * built-in ones, so we create some more specific overloads that will.
+ * Ugh.
+ */
+
+inline std::ostream& operator<<(std::ostream& out, uint8_t value)
+{
+    return pcg_extras::operator<< <char>(out, value);
+}
+
+inline std::istream& operator>>(std::istream& in, uint8_t& value)
+{
+    return pcg_extras::operator>> <char>(in, value);
+}
+
+
+
+/*
+ * Useful bitwise operations.
+ */
+
+/*
+ * XorShifts are invertible, but they are something of a pain to invert.
+ * This function backs them out. It's used by the whacky "inside out"
+ * generator defined later.
+ */
+
+template <typename itype>
+inline itype unxorshift(itype x, bitcount_t bits, bitcount_t shift)
+{
+    if (2*shift >= bits) {
+        return x ^ (x >> shift);
+    }
+    itype lowmask1 = (itype(1U) << (bits - shift*2)) - 1;
+    itype highmask1 = ~lowmask1;
+    itype top1 = x;
+    itype bottom1 = x & lowmask1;
+    top1 ^= top1 >> shift;
+    top1 &= highmask1;
+    x = top1 | bottom1;
+    itype lowmask2 = (itype(1U) << (bits - shift)) - 1;
+    itype bottom2 = x & lowmask2;
+    bottom2 = unxorshift(bottom2, bits - shift, shift);
+    bottom2 &= lowmask1;
+    return top1 | bottom2;
+}
+
+/*
+ * Rotate left and right.
+ *
+ * In an ideal world, compilers would spot idiomatic rotate code and convert
+ * it to a rotate instruction. Of course, opinions vary on what the correct
+ * idiom is and how to spot it. For clang, sometimes it generates better
+ * (but still crappy) code if you define PCG_USE_ZEROCHECK_ROTATE_IDIOM.
+ */
+
+template <typename itype>
+inline itype rotl(itype value, bitcount_t rot)
+{
+    constexpr bitcount_t bits = sizeof(itype) * 8;
+    constexpr bitcount_t mask = bits - 1;
+#if PCG_USE_ZEROCHECK_ROTATE_IDIOM
+    return rot ? (value << rot) | (value >> (bits - rot)) : value;
+#else
+    return (value << rot) | (value >> ((- rot) & mask));
+#endif
+}
+
+template <typename itype>
+inline itype rotr(itype value, bitcount_t rot)
+{
+    constexpr bitcount_t bits = sizeof(itype) * 8;
+    constexpr bitcount_t mask = bits - 1;
+#if PCG_USE_ZEROCHECK_ROTATE_IDIOM
+    return rot ? (value >> rot) | (value << (bits - rot)) : value;
+#else
+    return (value >> rot) | (value << ((- rot) & mask));
+#endif
+}
+
+/* Unfortunately, both Clang and GCC sometimes perform poorly when it comes
+ * to properly recognizing idiomatic rotate code, so for now we also provide
+ * assembler directives (enabled with PCG_USE_INLINE_ASM). Boo, hiss.
+ * (I hope that these compilers get better so that this code can die.)
+ *
+ * These overloads will be preferred over the general template code above.
+ */
+#if PCG_USE_INLINE_ASM && __GNUC__ && (__x86_64__ || __i386__)
+
+inline uint8_t rotr(uint8_t value, bitcount_t rot)
+{
+    asm ("rorb %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
+    return value;
+}
+
+inline uint16_t rotr(uint16_t value, bitcount_t rot)
+{
+    asm ("rorw %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
+    return value;
+}
+
+inline uint32_t rotr(uint32_t value, bitcount_t rot)
+{
+    asm ("rorl %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
+    return value;
+}
+
+#if __x86_64__
+inline uint64_t rotr(uint64_t value, bitcount_t rot)
+{
+    asm ("rorq %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
+    return value;
+}
+#endif // __x86_64__
+
+#endif // PCG_USE_INLINE_ASM
+
+
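A small editorial check of the two helpers above (a sketch, not upstream code; the constants are arbitrary):

#include <cassert>
#include <cstdint>
#include "pcg_extras.hpp"

void bit_ops_demo() {
    using pcg_extras::bitcount_t;
    // The portable rotate idiom: rotr(0x80000001, 1) == 0xC0000000
    assert(pcg_extras::rotr(uint32_t(0x80000001u), bitcount_t(1)) == 0xC0000000u);
    // unxorshift inverts y = x ^ (x >> s); when 2*s >= bits, a single
    // re-application is already the inverse (the fast path above):
    uint64_t x = 0x0123456789abcdefULL;
    uint64_t y = x ^ (x >> 33);
    assert(pcg_extras::unxorshift(y, bitcount_t(64), bitcount_t(33)) == x);
}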
+/*
+ * The C++ SeedSeq concept (modelled by seed_seq) can fill an array of
+ * 32-bit integers with seed data, but sometimes we want to produce
+ * larger or smaller integers.
+ *
+ * The following code handles this annoyance.
+ *
+ * uneven_copy will copy an array of 32-bit ints to an array of larger or
+ * smaller ints (actually, the code is general, needing only forward
+ * iterators). The copy is identical to the one that would be performed if
+ * we just did memcpy on a standard little-endian machine, but works
+ * regardless of the endian of the machine (or the weirdness of the ints
+ * involved).
+ *
+ * generate_to initializes an array of integers using a SeedSeq
+ * object. It is given the size as a static constant at compile time and
+ * tries to avoid memory allocation. If we're filling in 32-bit constants
+ * we just do it directly. If we need a separate buffer and it's small,
+ * we allocate it on the stack. Otherwise, we fall back to heap allocation.
+ * Ugh.
+ *
+ * generate_one produces a single value of some integral type using a
+ * SeedSeq object.
+ */
+
+ /* uneven_copy helper, case where destination ints are less than 32 bit. */
+
+template <typename SrcIter, typename DestIter>
+SrcIter uneven_copy_impl(
+    SrcIter src_first, DestIter dest_first, DestIter dest_last,
+    std::true_type)
+{
+    typedef typename std::iterator_traits<SrcIter>::value_type  src_t;
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+
+    constexpr bitcount_t SRC_SIZE  = sizeof(src_t);
+    constexpr bitcount_t DEST_SIZE = sizeof(dest_t);
+    constexpr bitcount_t DEST_BITS = DEST_SIZE * 8;
+    constexpr bitcount_t SCALE     = SRC_SIZE / DEST_SIZE;
+
+    size_t count = 0;
+    src_t value;
+
+    while (dest_first != dest_last) {
+        if ((count++ % SCALE) == 0)
+            value = *src_first++;       // Get more bits
+        else
+            value >>= DEST_BITS;        // Move down bits
+
+        *dest_first++ = dest_t(value);  // Truncates, ignores high bits.
+    }
+    return src_first;
+}
+
+ /* uneven_copy helper, case where destination ints are more than 32 bit. */
+
+template <typename SrcIter, typename DestIter>
+SrcIter uneven_copy_impl(
+    SrcIter src_first, DestIter dest_first, DestIter dest_last,
+    std::false_type)
+{
+    typedef typename std::iterator_traits<SrcIter>::value_type  src_t;
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+
+    constexpr auto SRC_SIZE  = sizeof(src_t);
+    constexpr auto SRC_BITS  = SRC_SIZE * 8;
+    constexpr auto DEST_SIZE = sizeof(dest_t);
+    constexpr auto SCALE     = (DEST_SIZE+SRC_SIZE-1) / SRC_SIZE;
+
+    while (dest_first != dest_last) {
+        dest_t value(0UL);
+        unsigned int shift = 0;
+
+        for (size_t i = 0; i < SCALE; ++i) {
+            value |= dest_t(*src_first++) << shift;
+            shift += SRC_BITS;
+        }
+
+        *dest_first++ = value;
+    }
+    return src_first;
+}
+
+/* uneven_copy, call the right code for larger vs. smaller */
+
+template <typename SrcIter, typename DestIter>
+inline SrcIter uneven_copy(SrcIter src_first,
+                           DestIter dest_first, DestIter dest_last)
+{
+    typedef typename std::iterator_traits<SrcIter>::value_type  src_t;
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+
+    constexpr bool DEST_IS_SMALLER = sizeof(dest_t) < sizeof(src_t);
+
+    return uneven_copy_impl(src_first, dest_first, dest_last,
+                            std::integral_constant<bool, DEST_IS_SMALLER>{});
+}
+
+/* generate_to, fill in a fixed-size array of integral type using a SeedSeq
+ * (actually works for any random-access iterator)
+ */
+
+template <size_t size, typename SeedSeq, typename DestIter>
+inline void generate_to_impl(SeedSeq&& generator, DestIter dest,
+                             std::true_type)
+{
+    generator.generate(dest, dest+size);
+}
+
+template <size_t size, typename SeedSeq, typename DestIter>
+void generate_to_impl(SeedSeq&& generator, DestIter dest,
+                      std::false_type)
+{
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+    constexpr auto DEST_SIZE = sizeof(dest_t);
+    constexpr auto GEN_SIZE  = sizeof(uint32_t);
+
+    constexpr bool GEN_IS_SMALLER = GEN_SIZE < DEST_SIZE;
+    constexpr size_t FROM_ELEMS =
+        GEN_IS_SMALLER
+            ? size * ((DEST_SIZE+GEN_SIZE-1) / GEN_SIZE)
+            : (size + (GEN_SIZE / DEST_SIZE) - 1)
+                / ((GEN_SIZE / DEST_SIZE) + GEN_IS_SMALLER);
+                    //  this odd code ^^^^^^^^^^^^^^^^^ is a workaround for
+                    //  a bug: http://llvm.org/bugs/show_bug.cgi?id=21287
+
+    if (FROM_ELEMS <= 1024) {
+        uint32_t buffer[FROM_ELEMS];
+        generator.generate(buffer, buffer+FROM_ELEMS);
+        uneven_copy(buffer, dest, dest+size);
+    } else {
+        uint32_t* buffer = (uint32_t*) malloc(GEN_SIZE * FROM_ELEMS);
+        generator.generate(buffer, buffer+FROM_ELEMS);
+        uneven_copy(buffer, dest, dest+size);
+        free(buffer);
+    }
+}
+
+template <size_t size, typename SeedSeq, typename DestIter>
+inline void generate_to(SeedSeq&& generator, DestIter dest)
+{
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+    constexpr bool IS_32BIT = sizeof(dest_t) == sizeof(uint32_t);
+
+    generate_to_impl<size>(std::forward<SeedSeq>(generator), dest,
+                           std::integral_constant<bool, IS_32BIT>{});
+}
+
+/* generate_one, produce a value of integral type using a SeedSeq
+ * (optionally, we can have it produce more than one and pick which one
+ * we want)
+ */
+
+template <typename UInt, size_t i = 0UL, size_t N = i+1, typename SeedSeq>
+inline UInt generate_one(SeedSeq&& generator)
+{
+    UInt result[N];
+    generate_to<N>(std::forward<SeedSeq>(generator), result);
+    return result[i];
+}
+
+template <typename RngType>
+auto bounded_rand(RngType& rng, typename RngType::result_type upper_bound)
+        -> typename RngType::result_type
+{
+    typedef typename RngType::result_type rtype;
+    rtype threshold = (RngType::max() - RngType::min() + rtype(1) - upper_bound)
+                    % upper_bound;
+    for (;;) {
+        rtype r = rng() - RngType::min();
+        if (r >= threshold)
+            return r % upper_bound;
+    }
+}
+
+template <typename Iter, typename RandType>
+void shuffle(Iter from, Iter to, RandType&& rng)
+{
+    typedef typename std::iterator_traits<Iter>::difference_type delta_t;
+    auto count = to - from;
+    while (count > 1) {
+        delta_t chosen(bounded_rand(rng, count));
+        --count;
+        --to;
+        using std::swap;
+        swap(*(from+chosen), *to);
+    }
+}
+
+/*
+ * Although std::seed_seq is useful, it isn't everything. Often we want to
+ * initialize a random-number generator some other way, such as from a random
+ * device.
+ *
+ * Technically, it does not meet the requirements of a SeedSequence because
+ * it lacks some of the rarely-used member functions (some of which would
+ * be impossible to provide). However, the C++ standard is quite specific
+ * that actual engines only call the generate method, so it ought not to be
+ * a problem in practice.
+ */
+
+template <typename RngType>
+class seed_seq_from {
+private:
+    RngType rng_;
+
+    typedef uint_least32_t result_type;
+
+public:
+    template <typename... Args>
+    seed_seq_from(Args&&... args) :
+        rng_(std::forward<Args>(args)...)
+    {
+        // Nothing (else) to do...
+    }
+
+    template <typename Iter>
+    void generate(Iter start, Iter finish)
+    {
+        for (auto i = start; i != finish; ++i)
+            *i = result_type(rng_());
+    }
+
+    constexpr size_t size() const
+    {
+        return (sizeof(typename RngType::result_type) > sizeof(result_type)
+                && RngType::max() > ~size_t(0UL))
+                    ? ~size_t(0UL)
+                    : size_t(RngType::max());
+    }
+};
+
+/*
+ * Sometimes you might want a distinct seed based on when the program
+ * was compiled. That way, a particular instance of the program will
+ * behave the same way, but when recompiled it'll produce a different
+ * value.
+ */
+
+template <typename IntType>
+struct static_arbitrary_seed {
+private:
+    static constexpr IntType fnv(IntType hash, const char* pos) {
+        return *pos == '\0'
+             ? hash
+             : fnv((hash * IntType(16777619U)) ^ *pos, (pos+1));
+    }
+
+public:
+    static constexpr IntType value = fnv(IntType(2166136261U ^ sizeof(IntType)),
+                                         __DATE__ __TIME__ __FILE__);
+};
+
+// Sometimes, when debugging or testing, it's handy to be able to print the
+// name of a type (in human-readable form). This code allows the idiom:
+//
+//      cout << printable_typename<my_foo_type_t>()
+//
+// to print out my_foo_type_t (or its concrete type if it is a synonym)
+
+template <typename T>
+struct printable_typename {};
+
+template <typename T>
+std::ostream& operator<<(std::ostream& out, printable_typename<T>) {
+    const char *implementation_typename = typeid(T).name();
+#ifdef __GNUC__
+    int status;
+    const char* pretty_name =
+        abi::__cxa_demangle(implementation_typename, NULL, NULL, &status);
+    if (status == 0)
+        out << pretty_name;
+    free((void*) pretty_name);
+    if (status == 0)
+        return out;
+#endif
+    out << implementation_typename;
+    return out;
+}
+
+} // namespace pcg_extras
+
+#endif // PCG_EXTRAS_HPP_INCLUDED
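Taken together, a hedged sketch of how these helpers are typically used (editorial, not shipped with the library; pcg32 is a typedef from pcg_random.hpp below, and the function name is illustrative):

#include <cstdint>
#include <numeric>
#include <vector>
#include "pcg_random.hpp"   // pulls in pcg_extras.hpp

void extras_demo() {
    pcg32 rng(7u);
    // bounded_rand: unbiased draw from [0, 10) by rejection sampling
    uint32_t roll = pcg_extras::bounded_rand(rng, 10u);
    // shuffle: Fisher-Yates permutation driven by bounded_rand
    std::vector<int> deck(52);
    std::iota(deck.begin(), deck.end(), 0);
    pcg_extras::shuffle(deck.begin(), deck.end(), rng);
    (void) roll;
}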
diff --git a/external_tools/pcg/pcg_random.hpp b/external_tools/pcg/pcg_random.hpp
new file mode 100644
index 000000000..3f04d854e
--- /dev/null
+++ b/external_tools/pcg/pcg_random.hpp
@@ -0,0 +1,1751 @@
+/*
+ * PCG Random Number Generation for C++
+ *
+ * Copyright 2014 Melissa O'Neill
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * including its license and other licensing options, visit
+ *
+ *     http://www.pcg-random.org
+ */
+
+/*
+ * This code provides the reference implementation of the PCG family of
+ * random number generators. The code is complex because it implements
+ *
+ *      - several members of the PCG family, specifically members corresponding
+ *        to the output functions:
+ *             - XSH RR         (good for 64-bit state, 32-bit output)
+ *             - XSH RS         (good for 64-bit state, 32-bit output)
+ *             - XSL RR         (good for 128-bit state, 64-bit output)
+ *             - RXS M XS       (statistically most powerful generator)
+ *             - XSL RR RR      (good for 128-bit state, 128-bit output)
+ *             - and RXS, RXS M, XSH, XSL       (mostly for testing)
+ *      - at potentially *arbitrary* bit sizes
+ *      - with four different techniques for random streams (MCG, one-stream
+ *        LCG, settable-stream LCG, unique-stream LCG)
+ *      - and the extended generation schemes allowing arbitrary periods
+ *      - with all features of C++11 random number generation (and more),
+ *        some of which are somewhat painful, including
+ *            - initializing with a SeedSequence which writes 32-bit values
+ *              to memory, even though the state of the generator may not
+ *              use 32-bit values (it might use smaller or larger integers)
+ *            - I/O for RNGs and a prescribed format, which needs to handle
+ *              the issue that 8-bit and 128-bit integers don't have working
+ *              I/O routines (e.g., normally 8-bit = char, not integer)
+ *            - equality and inequality for RNGs
+ *      - and a number of convenience typedefs to mask all the complexity
+ *
+ * The code employs a fairly heavy level of abstraction, and has to deal
+ * with various C++ minutiae. If you're looking to learn about how the PCG
+ * scheme works, you're probably best off starting with one of the other
+ * codebases (see www.pcg-random.org). But if you're curious about the
+ * constants for the various output functions used in those other, simpler,
+ * codebases, this code shows how they are calculated.
+ *
+ * On the positive side, at least there are convenience typedefs so that you
+ * can say
+ *
+ *      pcg32 myRNG;
+ *
+ * rather than:
+ *
+ *      pcg_detail::engine<
+ *          uint32_t,                                           // Output Type
+ *          uint64_t,                                           // State Type
+ *          pcg_detail::xsh_rr_mixin<uint32_t, uint64_t>, true, // Output Func
+ *          pcg_detail::specific_stream<uint64_t>,              // Stream Kind
+ *          pcg_detail::default_multiplier<uint64_t>            // LCG Mult
+ *      > myRNG;
+ *
+ */
+
+#ifndef PCG_RAND_HPP_INCLUDED
+#define PCG_RAND_HPP_INCLUDED 1
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * The pcg_extras namespace contains some support code that is likely to
+ * be useful for a variety of RNGs, including:
+ *      - 128-bit int support for platforms where it isn't available natively
+ *      - bit twiddling operations
+ *      - I/O of 128-bit and 8-bit integers
+ *      - Handling the evilness of SeedSeq
+ *      - Support for efficiently producing random numbers less than a given
+ *        bound
+ */
+
+#include "pcg_extras.hpp"
+
+namespace pcg_detail {
+
+using namespace pcg_extras;
+
+/*
+ * The LCG generators need some constants to function. This code lets you
+ * look up the constant by *type*. For example
+ *
+ *      default_multiplier<uint32_t>::multiplier()
+ *
+ * gives you the default multiplier for 32-bit integers. We use the name
+ * of the constant and not a generic word like value to allow these classes
+ * to be used as mixins.
+ */
+
+template <typename T>
+struct default_multiplier {
+    // Not defined for an arbitrary type
+};
+
+template <typename T>
+struct default_increment {
+    // Not defined for an arbitrary type
+};
+
+#define PCG_DEFINE_CONSTANT(type, what, kind, constant) \
+        template <>                                     \
+        struct what ## _ ## kind<type> {                \
+            static constexpr type kind() {              \
+                return constant;                        \
+            }                                           \
+        };
+
+PCG_DEFINE_CONSTANT(uint8_t, default, multiplier, 141U)
+PCG_DEFINE_CONSTANT(uint8_t, default, increment, 77U)
+
+PCG_DEFINE_CONSTANT(uint16_t, default, multiplier, 12829U)
+PCG_DEFINE_CONSTANT(uint16_t, default, increment, 47989U)
+
+PCG_DEFINE_CONSTANT(uint32_t, default, multiplier, 747796405U)
+PCG_DEFINE_CONSTANT(uint32_t, default, increment, 2891336453U)
+
+PCG_DEFINE_CONSTANT(uint64_t, default, multiplier, 6364136223846793005ULL)
+PCG_DEFINE_CONSTANT(uint64_t, default, increment, 1442695040888963407ULL)
+
+PCG_DEFINE_CONSTANT(pcg128_t, default, multiplier,
+        PCG_128BIT_CONSTANT(2549297995355413924ULL,4865540595714422341ULL))
+PCG_DEFINE_CONSTANT(pcg128_t, default, increment,
+        PCG_128BIT_CONSTANT(6364136223846793005ULL,1442695040888963407ULL))
+
+
+/*
+ * Each PCG generator is available in four variants, based on how it applies
+ * the additive constant for its underlying LCG; the variations are:
+ *
+ *     single stream   - all instances use the same fixed constant, thus
+ *                       the RNG is always somewhere in the same sequence
+ *     mcg             - adds zero, resulting in a single stream and reduced
+ *                       period
+ *     specific stream - the constant can be changed at any time, selecting
+ *                       a different random sequence
+ *     unique stream   - the constant is based on the memory address of the
+ *                       object, thus every RNG has its own unique sequence
+ *
+ * This variation is provided through mixin classes which define a function
+ * called increment() that returns the necessary additive constant.
+ */
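For illustration (an editorial sketch; pcg32 is the settable-stream typedef declared near the end of this header, and the function name is hypothetical), streams can be selected at construction or changed later:

#include "pcg_random.hpp"

void streams_demo() {
    pcg32 a(42u, 54u);    // seed 42 on stream 54
    pcg32 b(42u, 63u);    // same seed, different stream: unrelated output
    b.set_stream(54u);    // switch b to stream 54 from its current state
    bool same = (a == b); // equality compares multiplier, increment and state
    (void) same;
}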
+
+
+
+/*
+ * unique stream
+ */
+
+
+template <typename itype>
+class unique_stream {
+protected:
+    static constexpr bool is_mcg = false;
+
+    // Is never called, but is provided for symmetry with specific_stream
+    void set_stream(...)
+    {
+        abort();
+    }
+
+public:
+    typedef itype state_type;
+
+    constexpr itype increment() const {
+        return itype(reinterpret_cast<uintptr_t>(this) | 1);
+    }
+
+    constexpr itype stream() const
+    {
+        return increment() >> 1;
+    }
+
+    static constexpr bool can_specify_stream = false;
+
+    static constexpr size_t streams_pow2()
+    {
+        return (sizeof(itype) < sizeof(size_t) ? sizeof(itype)
+                                               : sizeof(size_t))*8 - 1u;
+    }
+
+protected:
+    constexpr unique_stream() = default;
+};
+
+
+/*
+ * no stream (mcg)
+ */
+
+template <typename itype>
+class no_stream {
+protected:
+    static constexpr bool is_mcg = true;
+
+    // Is never called, but is provided for symmetry with specific_stream
+    void set_stream(...)
+    {
+        abort();
+    }
+
+public:
+    typedef itype state_type;
+
+    static constexpr itype increment() {
+        return 0;
+    }
+
+    static constexpr bool can_specify_stream = false;
+
+    static constexpr size_t streams_pow2()
+    {
+        return 0u;
+    }
+
+protected:
+    constexpr no_stream() = default;
+};
+
+
+/*
+ * single stream/sequence (oneseq)
+ */
+
+template <typename itype>
+class oneseq_stream : public default_increment<itype> {
+protected:
+    static constexpr bool is_mcg = false;
+
+    // Is never called, but is provided for symmetry with specific_stream
+    void set_stream(...)
+    {
+        abort();
+    }
+
+public:
+    typedef itype state_type;
+
+    static constexpr itype stream()
+    {
+        return default_increment<itype>::increment() >> 1;
+    }
+
+    static constexpr bool can_specify_stream = false;
+
+    static constexpr size_t streams_pow2()
+    {
+        return 0u;
+    }
+
+protected:
+    constexpr oneseq_stream() = default;
+};
+
+
+/*
+ * specific stream
+ */
+
+template <typename itype>
+class specific_stream {
+protected:
+    static constexpr bool is_mcg = false;
+
+    itype inc_ = default_increment<itype>::increment();
+
+public:
+    typedef itype state_type;
+    typedef itype stream_state;
+
+    constexpr itype increment() const {
+        return inc_;
+    }
+
+    itype stream()
+    {
+        return inc_ >> 1;
+    }
+
+    void set_stream(itype specific_seq)
+    {
+        inc_ = (specific_seq << 1) | 1;
+    }
+
+    static constexpr bool can_specify_stream = true;
+
+    static constexpr size_t streams_pow2()
+    {
+        return (sizeof(itype)*8) - 1u;
+    }
+
+protected:
+    specific_stream() = default;
+
+    specific_stream(itype specific_seq)
+        : inc_((specific_seq << 1) | itype(1U))
+    {
+        // Nothing (else) to do.
+    }
+};
+
+
+/*
+ * This is where it all comes together. This class template joins together
+ * three mixin classes which define
+ *    - the LCG additive constant (the stream)
+ *    - the LCG multiplier
+ *    - the output function
+ * in addition, we specify the type of the LCG state, and the result type,
+ * and whether to use the pre-advance version of the state for the output
+ * (increasing instruction-level parallelism) or the post-advance version
+ * (reducing register pressure).
+ *
+ * Given the high level of parameterization, the code has to use some
+ * template-metaprogramming tricks to handle some of the subtle variations
+ * involved.
+ */ + +template , + typename multiplier_mixin = default_multiplier > +class engine : protected output_mixin, + public stream_mixin, + protected multiplier_mixin { +protected: + itype state_; + + struct can_specify_stream_tag {}; + struct no_specifiable_stream_tag {}; + + using stream_mixin::increment; + using multiplier_mixin::multiplier; + +public: + typedef xtype result_type; + typedef itype state_type; + + static constexpr size_t period_pow2() + { + return sizeof(state_type)*8 - 2*stream_mixin::is_mcg; + } + + // It would be nice to use std::numeric_limits for these, but + // we can't be sure that it'd be defined for the 128-bit types. + + static constexpr result_type min() + { + return result_type(0UL); + } + + static constexpr result_type max() + { + return ~result_type(0UL); + } + +protected: + itype bump(itype state) + { + return state * multiplier() + increment(); + } + + itype base_generate() + { + return state_ = bump(state_); + } + + itype base_generate0() + { + itype old_state = state_; + state_ = bump(state_); + return old_state; + } + +public: + result_type operator()() + { + if (output_previous) + return this->output(base_generate0()); + else + return this->output(base_generate()); + } + + result_type operator()(result_type upper_bound) + { + return bounded_rand(*this, upper_bound); + } + +protected: + static itype advance(itype state, itype delta, + itype cur_mult, itype cur_plus); + + static itype distance(itype cur_state, itype newstate, itype cur_mult, + itype cur_plus, itype mask = ~itype(0U)); + + itype distance(itype newstate, itype mask = ~itype(0U)) const + { + return distance(state_, newstate, multiplier(), increment(), mask); + } + +public: + void advance(itype delta) + { + state_ = advance(state_, delta, this->multiplier(), this->increment()); + } + + void backstep(itype delta) + { + advance(-delta); + } + + void discard(itype delta) + { + advance(delta); + } + + bool wrapped() + { + if (stream_mixin::is_mcg) { + // For MCGs, the low order two bits never change. In this + // implementation, we keep them fixed at 3 to make this test + // easier. + return state_ == 3; + } else { + return state_ == 0; + } + } + + engine(itype state = itype(0xcafef00dd15ea5e5ULL)) + : state_(this->is_mcg ? state|state_type(3U) + : bump(state + this->increment())) + { + // Nothing else to do. + } + + // This function may or may not exist. It thus has to be a template + // to use SFINAE; users don't have to worry about its template-ness. + + template + engine(itype state, typename sm::stream_state stream_seed) + : stream_mixin(stream_seed), + state_(this->is_mcg ? state|state_type(3U) + : bump(state + this->increment())) + { + // Nothing else to do. + } + + template + engine(SeedSeq&& seedSeq, typename std::enable_if< + !stream_mixin::can_specify_stream + && !std::is_convertible::value + && !std::is_convertible::value, + no_specifiable_stream_tag>::type = {}) + : engine(generate_one(std::forward(seedSeq))) + { + // Nothing else to do. + } + + template + engine(SeedSeq&& seedSeq, typename std::enable_if< + stream_mixin::can_specify_stream + && !std::is_convertible::value + && !std::is_convertible::value, + can_specify_stream_tag>::type = {}) + : engine(generate_one(seedSeq), + generate_one(seedSeq)) + { + // Nothing else to do. + } + + + template + void seed(Args&&... 
args) + { + new (this) engine(std::forward(args)...); + } + + template + friend bool operator==(const engine&, + const engine&); + + template + friend itype1 operator-(const engine&, + const engine&); + + template + friend std::basic_ostream& + operator<<(std::basic_ostream& out, + const engine&); + + template + friend std::basic_istream& + operator>>(std::basic_istream& in, + engine& rng); +}; + +template +std::basic_ostream& +operator<<(std::basic_ostream& out, + const engine& rng) +{ + auto orig_flags = out.flags(std::ios_base::dec | std::ios_base::left); + auto space = out.widen(' '); + auto orig_fill = out.fill(); + + out << rng.multiplier() << space + << rng.increment() << space + << rng.state_; + + out.flags(orig_flags); + out.fill(orig_fill); + return out; +} + + +template +std::basic_istream& +operator>>(std::basic_istream& in, + engine& rng) +{ + auto orig_flags = in.flags(std::ios_base::dec | std::ios_base::skipws); + + itype multiplier, increment, state; + in >> multiplier >> increment >> state; + + if (!in.fail()) { + bool good = true; + if (multiplier != rng.multiplier()) { + good = false; + } else if (rng.can_specify_stream) { + rng.set_stream(increment >> 1); + } else if (increment != rng.increment()) { + good = false; + } + if (good) { + rng.state_ = state; + } else { + in.clear(std::ios::failbit); + } + } + + in.flags(orig_flags); + return in; +} + + +template +itype engine::advance( + itype state, itype delta, itype cur_mult, itype cur_plus) +{ + // The method used here is based on Brown, "Random Number Generation + // with Arbitrary Stride,", Transactions of the American Nuclear + // Society (Nov. 1994). The algorithm is very similar to fast + // exponentiation. + // + // Even though delta is an unsigned integer, we can pass a + // signed integer to go backwards, it just goes "the long way round". + + constexpr itype ZERO = 0u; // itype may be a non-trivial types, so + constexpr itype ONE = 1u; // we define some ugly constants. + itype acc_mult = 1; + itype acc_plus = 0; + while (delta > ZERO) { + if (delta & ONE) { + acc_mult *= cur_mult; + acc_plus = acc_plus*cur_mult + cur_plus; + } + cur_plus = (cur_mult+ONE)*cur_plus; + cur_mult *= cur_mult; + delta >>= 1; + } + return acc_mult * state + acc_plus; +} + +template +itype engine::distance( + itype cur_state, itype newstate, itype cur_mult, itype cur_plus, itype mask) +{ + constexpr itype ONE = 1u; // itype could be weird, so use constant + itype the_bit = stream_mixin::is_mcg ? itype(4u) : itype(1u); + itype distance = 0u; + while ((cur_state & mask) != (newstate & mask)) { + if ((cur_state & the_bit) != (newstate & the_bit)) { + cur_state = cur_state * cur_mult + cur_plus; + distance |= the_bit; + } + assert((cur_state & the_bit) == (newstate & the_bit)); + the_bit <<= 1; + cur_plus = (cur_mult+ONE)*cur_plus; + cur_mult *= cur_mult; + } + return stream_mixin::is_mcg ? 
distance >> 2 : distance; +} + +template +itype operator-(const engine& lhs, + const engine& rhs) +{ + if (lhs.multiplier() != rhs.multiplier() + || lhs.increment() != rhs.increment()) + throw std::logic_error("incomparable generators"); + return rhs.distance(lhs.state_); +} + + +template +bool operator==(const engine& lhs, + const engine& rhs) +{ + return (lhs.multiplier() == rhs.multiplier()) + && (lhs.increment() == rhs.increment()) + && (lhs.state_ == rhs.state_); +} + +template +inline bool operator!=(const engine& lhs, + const engine& rhs) +{ + return !operator==(lhs,rhs); +} + + +template class output_mixin, + bool output_previous = (sizeof(itype) <= 8)> +using oneseq_base = engine, output_previous, + oneseq_stream >; + +template class output_mixin, + bool output_previous = (sizeof(itype) <= 8)> +using unique_base = engine, output_previous, + unique_stream >; + +template class output_mixin, + bool output_previous = (sizeof(itype) <= 8)> +using setseq_base = engine, output_previous, + specific_stream >; + +template class output_mixin, + bool output_previous = (sizeof(itype) <= 8)> +using mcg_base = engine, output_previous, + no_stream >; + +/* + * OUTPUT FUNCTIONS. + * + * These are the core of the PCG generation scheme. They specify how to + * turn the base LCG's internal state into the output value of the final + * generator. + * + * They're implemented as mixin classes. + * + * All of the classes have code that is written to allow it to be applied + * at *arbitrary* bit sizes, although in practice they'll only be used at + * standard sizes supported by C++. + */ + +/* + * XSH RS -- high xorshift, followed by a random shift + * + * Fast. A good performer. + */ + +template +struct xsh_rs_mixin { + static xtype output(itype internal) + { + constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); + constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); + constexpr bitcount_t sparebits = bits - xtypebits; + constexpr bitcount_t opbits = + sparebits-5 >= 64 ? 5 + : sparebits-4 >= 32 ? 4 + : sparebits-3 >= 16 ? 3 + : sparebits-2 >= 4 ? 2 + : sparebits-1 >= 1 ? 1 + : 0; + constexpr bitcount_t mask = (1 << opbits) - 1; + constexpr bitcount_t maxrandshift = mask; + constexpr bitcount_t topspare = opbits; + constexpr bitcount_t bottomspare = sparebits - topspare; + constexpr bitcount_t xshift = topspare + (xtypebits+maxrandshift)/2; + bitcount_t rshift = + opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0; + internal ^= internal >> xshift; + xtype result = xtype(internal >> (bottomspare - maxrandshift + rshift)); + return result; + } +}; + +/* + * XSH RR -- high xorshift, followed by a random rotate + * + * Fast. A good performer. Slightly better statistically than XSH RS. + */ + +template +struct xsh_rr_mixin { + static xtype output(itype internal) + { + constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); + constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype)*8); + constexpr bitcount_t sparebits = bits - xtypebits; + constexpr bitcount_t wantedopbits = + xtypebits >= 128 ? 7 + : xtypebits >= 64 ? 6 + : xtypebits >= 32 ? 5 + : xtypebits >= 16 ? 4 + : 3; + constexpr bitcount_t opbits = + sparebits >= wantedopbits ? wantedopbits + : sparebits; + constexpr bitcount_t amplifier = wantedopbits - opbits; + constexpr bitcount_t mask = (1 << opbits) - 1; + constexpr bitcount_t topspare = opbits; + constexpr bitcount_t bottomspare = sparebits - topspare; + constexpr bitcount_t xshift = (topspare + xtypebits)/2; + bitcount_t rot = opbits ? 
bitcount_t(internal >> (bits - opbits)) & mask + : 0; + bitcount_t amprot = (rot << amplifier) & mask; + internal ^= internal >> xshift; + xtype result = xtype(internal >> bottomspare); + result = rotr(result, amprot); + return result; + } +}; + +/* + * RXS -- random xorshift + */ + +template +struct rxs_mixin { +static xtype output_rxs(itype internal) + { + constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); + constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype)*8); + constexpr bitcount_t shift = bits - xtypebits; + constexpr bitcount_t extrashift = (xtypebits - shift)/2; + bitcount_t rshift = shift > 64+8 ? (internal >> (bits - 6)) & 63 + : shift > 32+4 ? (internal >> (bits - 5)) & 31 + : shift > 16+2 ? (internal >> (bits - 4)) & 15 + : shift > 8+1 ? (internal >> (bits - 3)) & 7 + : shift > 4+1 ? (internal >> (bits - 2)) & 3 + : shift > 2+1 ? (internal >> (bits - 1)) & 1 + : 0; + internal ^= internal >> (shift + extrashift - rshift); + xtype result = internal >> rshift; + return result; + } +}; + +/* + * RXS M XS -- random xorshift, mcg multiply, fixed xorshift + * + * The most statistically powerful generator, but all those steps + * make it slower than some of the others. We give it the rottenest jobs. + * + * Because it's usually used in contexts where the state type and the + * result type are the same, it is a permutation and is thus invertable. + * We thus provide a function to invert it. This function is used to + * for the "inside out" generator used by the extended generator. + */ + +/* Defined type-based concepts for the multiplication step. They're actually + * all derived by truncating the 128-bit, which was computed to be a good + * "universal" constant. + */ + +template +struct mcg_multiplier { + // Not defined for an arbitrary type +}; + +template +struct mcg_unmultiplier { + // Not defined for an arbitrary type +}; + +PCG_DEFINE_CONSTANT(uint8_t, mcg, multiplier, 217U) +PCG_DEFINE_CONSTANT(uint8_t, mcg, unmultiplier, 105U) + +PCG_DEFINE_CONSTANT(uint16_t, mcg, multiplier, 62169U) +PCG_DEFINE_CONSTANT(uint16_t, mcg, unmultiplier, 28009U) + +PCG_DEFINE_CONSTANT(uint32_t, mcg, multiplier, 277803737U) +PCG_DEFINE_CONSTANT(uint32_t, mcg, unmultiplier, 2897767785U) + +PCG_DEFINE_CONSTANT(uint64_t, mcg, multiplier, 12605985483714917081ULL) +PCG_DEFINE_CONSTANT(uint64_t, mcg, unmultiplier, 15009553638781119849ULL) + +PCG_DEFINE_CONSTANT(pcg128_t, mcg, multiplier, + PCG_128BIT_CONSTANT(17766728186571221404ULL, 12605985483714917081ULL)) +PCG_DEFINE_CONSTANT(pcg128_t, mcg, unmultiplier, + PCG_128BIT_CONSTANT(14422606686972528997ULL, 15009553638781119849ULL)) + + +template +struct rxs_m_xs_mixin { + static xtype output(itype internal) + { + constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); + constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); + constexpr bitcount_t opbits = xtypebits >= 128 ? 6 + : xtypebits >= 64 ? 5 + : xtypebits >= 32 ? 4 + : xtypebits >= 16 ? 3 + : 2; + constexpr bitcount_t shift = bits - xtypebits; + constexpr bitcount_t mask = (1 << opbits) - 1; + bitcount_t rshift = + opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0; + internal ^= internal >> (opbits + rshift); + internal *= mcg_multiplier::multiplier(); + xtype result = internal >> shift; + result ^= result >> ((2U*xtypebits+2U)/3U); + return result; + } + + static itype unoutput(itype internal) + { + constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); + constexpr bitcount_t opbits = bits >= 128 ? 6 + : bits >= 64 ? 5 + : bits >= 32 ? 4 + : bits >= 16 ? 
3 + : 2; + constexpr bitcount_t mask = (1 << opbits) - 1; + + internal = unxorshift(internal, bits, (2U*bits+2U)/3U); + + internal *= mcg_unmultiplier::unmultiplier(); + + bitcount_t rshift = opbits ? (internal >> (bits - opbits)) & mask : 0; + internal = unxorshift(internal, bits, opbits + rshift); + + return internal; + } +}; + + +/* + * RXS M -- random xorshift, mcg multiply + */ + +template +struct rxs_m_mixin { + static xtype output(itype internal) + { + constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); + constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); + constexpr bitcount_t opbits = xtypebits >= 128 ? 6 + : xtypebits >= 64 ? 5 + : xtypebits >= 32 ? 4 + : xtypebits >= 16 ? 3 + : 2; + constexpr bitcount_t shift = bits - xtypebits; + constexpr bitcount_t mask = (1 << opbits) - 1; + bitcount_t rshift = opbits ? (internal >> (bits - opbits)) & mask : 0; + internal ^= internal >> (opbits + rshift); + internal *= mcg_multiplier::multiplier(); + xtype result = internal >> shift; + return result; + } +}; + +/* + * XSL RR -- fixed xorshift (to low bits), random rotate + * + * Useful for 128-bit types that are split across two CPU registers. + */ + +template +struct xsl_rr_mixin { + static xtype output(itype internal) + { + constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); + constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); + constexpr bitcount_t sparebits = bits - xtypebits; + constexpr bitcount_t wantedopbits = xtypebits >= 128 ? 7 + : xtypebits >= 64 ? 6 + : xtypebits >= 32 ? 5 + : xtypebits >= 16 ? 4 + : 3; + constexpr bitcount_t opbits = sparebits >= wantedopbits ? wantedopbits + : sparebits; + constexpr bitcount_t amplifier = wantedopbits - opbits; + constexpr bitcount_t mask = (1 << opbits) - 1; + constexpr bitcount_t topspare = sparebits; + constexpr bitcount_t bottomspare = sparebits - topspare; + constexpr bitcount_t xshift = (topspare + xtypebits) / 2; + + bitcount_t rot = + opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0; + bitcount_t amprot = (rot << amplifier) & mask; + internal ^= internal >> xshift; + xtype result = xtype(internal >> bottomspare); + result = rotr(result, amprot); + return result; + } +}; + + +/* + * XSL RR RR -- fixed xorshift (to low bits), random rotate (both parts) + * + * Useful for 128-bit types that are split across two CPU registers. + * If you really want an invertable 128-bit RNG, I guess this is the one. + */ + +template struct halfsize_trait {}; +template <> struct halfsize_trait { typedef uint64_t type; }; +template <> struct halfsize_trait { typedef uint32_t type; }; +template <> struct halfsize_trait { typedef uint16_t type; }; +template <> struct halfsize_trait { typedef uint8_t type; }; + +template +struct xsl_rr_rr_mixin { + typedef typename halfsize_trait::type htype; + + static itype output(itype internal) + { + constexpr bitcount_t htypebits = bitcount_t(sizeof(htype) * 8); + constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); + constexpr bitcount_t sparebits = bits - htypebits; + constexpr bitcount_t wantedopbits = htypebits >= 128 ? 7 + : htypebits >= 64 ? 6 + : htypebits >= 32 ? 5 + : htypebits >= 16 ? 4 + : 3; + constexpr bitcount_t opbits = sparebits >= wantedopbits ? wantedopbits + : sparebits; + constexpr bitcount_t amplifier = wantedopbits - opbits; + constexpr bitcount_t mask = (1 << opbits) - 1; + constexpr bitcount_t topspare = sparebits; + constexpr bitcount_t xshift = (topspare + htypebits) / 2; + + bitcount_t rot = + opbits ? 
bitcount_t(internal >> (bits - opbits)) & mask : 0; + bitcount_t amprot = (rot << amplifier) & mask; + internal ^= internal >> xshift; + htype lowbits = htype(internal); + lowbits = rotr(lowbits, amprot); + htype highbits = htype(internal >> topspare); + bitcount_t rot2 = lowbits & mask; + bitcount_t amprot2 = (rot2 << amplifier) & mask; + highbits = rotr(highbits, amprot2); + return (itype(highbits) << topspare) ^ itype(lowbits); + } +}; + + +/* + * XSH -- fixed xorshift (to high bits) + * + * You shouldn't use this at 64-bits or less. + */ + +template +struct xsh_mixin { + static xtype output(itype internal) + { + constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); + constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); + constexpr bitcount_t sparebits = bits - xtypebits; + constexpr bitcount_t topspare = 0; + constexpr bitcount_t bottomspare = sparebits - topspare; + constexpr bitcount_t xshift = (topspare + xtypebits) / 2; + + internal ^= internal >> xshift; + xtype result = internal >> bottomspare; + return result; + } +}; + +/* + * XSL -- fixed xorshift (to low bits) + * + * You shouldn't use this at 64-bits or less. + */ + +template +struct xsl_mixin { + inline xtype output(itype internal) + { + constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); + constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); + constexpr bitcount_t sparebits = bits - xtypebits; + constexpr bitcount_t topspare = sparebits; + constexpr bitcount_t bottomspare = sparebits - topspare; + constexpr bitcount_t xshift = (topspare + xtypebits) / 2; + + internal ^= internal >> xshift; + xtype result = internal >> bottomspare; + return result; + } +}; + +/* ---- End of Output Functions ---- */ + + +template +struct inside_out : private baseclass { + inside_out() = delete; + + typedef typename baseclass::result_type result_type; + typedef typename baseclass::state_type state_type; + static_assert(sizeof(result_type) == sizeof(state_type), + "Require a RNG whose output function is a permutation"); + + static bool external_step(result_type& randval, size_t i) + { + state_type state = baseclass::unoutput(randval); + state = state * baseclass::multiplier() + baseclass::increment() + + state_type(i*2); + result_type result = baseclass::output(state); + randval = result; + state_type zero = + baseclass::is_mcg ? state & state_type(3U) : state_type(0U); + return result == zero; + } + + static bool external_advance(result_type& randval, size_t i, + result_type delta, bool forwards = true) + { + state_type state = baseclass::unoutput(randval); + state_type mult = baseclass::multiplier(); + state_type inc = baseclass::increment() + state_type(i*2); + state_type zero = + baseclass::is_mcg ? state & state_type(3U) : state_type(0U); + state_type dist_to_zero = baseclass::distance(state, zero, mult, inc); + bool crosses_zero = + forwards ? 
dist_to_zero <= delta + : (-dist_to_zero) <= delta; + if (!forwards) + delta = -delta; + state = baseclass::advance(state, delta, mult, inc); + randval = baseclass::output(state); + return crosses_zero; + } +}; + + +template +class extended : public baseclass { +public: + typedef typename baseclass::state_type state_type; + typedef typename baseclass::result_type result_type; + typedef inside_out insideout; + +private: + static constexpr bitcount_t rtypebits = sizeof(result_type)*8; + static constexpr bitcount_t stypebits = sizeof(state_type)*8; + + static constexpr bitcount_t tick_limit_pow2 = 64U; + + static constexpr size_t table_size = 1UL << table_pow2; + static constexpr size_t table_shift = stypebits - table_pow2; + static constexpr state_type table_mask = + (state_type(1U) << table_pow2) - state_type(1U); + + static constexpr bool may_tick = + (advance_pow2 < stypebits) && (advance_pow2 < tick_limit_pow2); + static constexpr size_t tick_shift = stypebits - advance_pow2; + static constexpr state_type tick_mask = + may_tick ? state_type( + (uint64_t(1) << (advance_pow2*may_tick)) - 1) + // ^-- stupidity to appease GCC warnings + : ~state_type(0U); + + static constexpr bool may_tock = stypebits < tick_limit_pow2; + + result_type data_[table_size]; + + PCG_NOINLINE void advance_table(); + + PCG_NOINLINE void advance_table(state_type delta, bool isForwards = true); + + result_type& get_extended_value() + { + state_type state = this->state_; + if (kdd && baseclass::is_mcg) { + // The low order bits of an MCG are constant, so drop them. + state >>= 2; + } + size_t index = kdd ? state & table_mask + : state >> table_shift; + + if (may_tick) { + bool tick = kdd ? (state & tick_mask) == state_type(0u) + : (state >> tick_shift) == state_type(0u); + if (tick) + advance_table(); + } + if (may_tock) { + bool tock = state == state_type(0u); + if (tock) + advance_table(); + } + return data_[index]; + } + +public: + static constexpr size_t period_pow2() + { + return baseclass::period_pow2() + table_size*extvalclass::period_pow2(); + } + + __attribute__((always_inline)) result_type operator()() + { + result_type rhs = get_extended_value(); + result_type lhs = this->baseclass::operator()(); + return lhs ^ rhs; + } + + result_type operator()(result_type upper_bound) + { + return bounded_rand(*this, upper_bound); + } + + void set(result_type wanted) + { + result_type& rhs = get_extended_value(); + result_type lhs = this->baseclass::operator()(); + rhs = lhs ^ wanted; + } + + void advance(state_type distance, bool forwards = true); + + void backstep(state_type distance) + { + advance(distance, false); + } + + extended(const result_type* data) + : baseclass() + { + datainit(data); + } + + extended(const result_type* data, state_type seed) + : baseclass(seed) + { + datainit(data); + } + + // This function may or may not exist. It thus has to be a template + // to use SFINAE; users don't have to worry about its template-ness. + + template + extended(const result_type* data, state_type seed, + typename bc::stream_state stream_seed) + : baseclass(seed, stream_seed) + { + datainit(data); + } + + extended() + : baseclass() + { + selfinit(); + } + + extended(state_type seed) + : baseclass(seed) + { + selfinit(); + } + + // This function may or may not exist. It thus has to be a template + // to use SFINAE; users don't have to worry about its template-ness. 
+ + template + extended(state_type seed, typename bc::stream_state stream_seed) + : baseclass(seed, stream_seed) + { + selfinit(); + } + +private: + void selfinit(); + void datainit(const result_type* data); + +public: + + template::value + && !std::is_convertible::value>::type> + extended(SeedSeq&& seedSeq) + : baseclass(seedSeq) + { + generate_to(seedSeq, data_); + } + + template + void seed(Args&&... args) + { + new (this) extended(std::forward(args)...); + } + + template + friend bool operator==(const extended&, + const extended&); + + template + friend std::basic_ostream& + operator<<(std::basic_ostream& out, + const extended&); + + template + friend std::basic_istream& + operator>>(std::basic_istream& in, + extended&); + +}; + + +template +void extended::datainit( + const result_type* data) +{ + for (size_t i = 0; i < table_size; ++i) + data_[i] = data[i]; +} + +template +void extended::selfinit() +{ + // We need to fill the extended table with something, and we have + // very little provided data, so we use the base generator to + // produce values. Although not ideal (use a seed sequence, folks!), + // unexpected correlations are mitigated by + // - using XOR differences rather than the number directly + // - the way the table is accessed, its values *won't* be accessed + // in the same order the were written. + // - any strange correlations would only be apparent if we + // were to backstep the generator so that the base generator + // was generating the same values again + result_type xdiff = baseclass::operator()() - baseclass::operator()(); + for (size_t i = 0; i < table_size; ++i) { + data_[i] = baseclass::operator()() ^ xdiff; + } +} + +template +bool operator==(const extended& lhs, + const extended& rhs) +{ + auto& base_lhs = static_cast(lhs); + auto& base_rhs = static_cast(rhs); + return base_lhs == base_rhs + && !memcmp((void*) lhs.data_, (void*) rhs.data_, sizeof(lhs.data_)); +} + +template +inline bool operator!=(const extended& lhs, + const extended& rhs) +{ + return lhs != rhs; +} + +template +std::basic_ostream& +operator<<(std::basic_ostream& out, + const extended& rng) +{ + auto orig_flags = out.flags(std::ios_base::dec | std::ios_base::left); + auto space = out.widen(' '); + auto orig_fill = out.fill(); + + out << rng.multiplier() << space + << rng.increment() << space + << rng.state_; + + for (const auto& datum : rng.data_) + out << space << datum; + + out.flags(orig_flags); + out.fill(orig_fill); + return out; +} + +template +std::basic_istream& +operator>>(std::basic_istream& in, + extended& rng) +{ + extended new_rng; + auto& base_rng = static_cast(new_rng); + in >> base_rng; + + if (in.fail()) + return in; + + auto orig_flags = in.flags(std::ios_base::dec | std::ios_base::skipws); + + for (auto& datum : new_rng.data_) { + in >> datum; + if (in.fail()) + goto bail; + } + + rng = new_rng; + +bail: + in.flags(orig_flags); + return in; +} + + + +template +void +extended::advance_table() +{ + bool carry = false; + for (size_t i = 0; i < table_size; ++i) { + if (carry) { + carry = insideout::external_step(data_[i],i+1); + } + bool carry2 = insideout::external_step(data_[i],i+1); + carry = carry || carry2; + } +} + +template +void +extended::advance_table( + state_type delta, bool isForwards) +{ + typedef typename baseclass::state_type base_state_t; + typedef typename extvalclass::state_type ext_state_t; + constexpr bitcount_t basebits = sizeof(base_state_t)*8; + constexpr bitcount_t extbits = sizeof(ext_state_t)*8; + static_assert(basebits <= extbits || advance_pow2 
> 0, + "Current implementation might overflow its carry"); + + base_state_t carry = 0; + for (size_t i = 0; i < table_size; ++i) { + base_state_t total_delta = carry + delta; + ext_state_t trunc_delta = ext_state_t(total_delta); + if (basebits > extbits) { + carry = total_delta >> extbits; + } else { + carry = 0; + } + carry += + insideout::external_advance(data_[i],i+1, trunc_delta, isForwards); + } +} + +template +void extended::advance( + state_type distance, bool forwards) +{ + static_assert(kdd, + "Efficient advance is too hard for non-kdd extension. " + "For a weak advance, cast to base class"); + state_type zero = + baseclass::is_mcg ? this->state_ & state_type(3U) : state_type(0U); + if (may_tick) { + state_type ticks = distance >> (advance_pow2*may_tick); + // ^-- stupidity to appease GCC + // warnings + state_type adv_mask = + baseclass::is_mcg ? tick_mask << 2 : tick_mask; + state_type next_advance_distance = this->distance(zero, adv_mask); + if (!forwards) + next_advance_distance = (-next_advance_distance) & tick_mask; + if (next_advance_distance < (distance & tick_mask)) { + ++ticks; + } + if (ticks) + advance_table(ticks, forwards); + } + if (forwards) { + if (may_tock && this->distance(zero) <= distance) + advance_table(); + baseclass::advance(distance); + } else { + if (may_tock && -(this->distance(zero)) <= distance) + advance_table(state_type(1U), false); + baseclass::advance(-distance); + } +} + +} // namespace pcg_detail + +namespace pcg_engines { + +using namespace pcg_detail; + +/* Predefined types for XSH RS */ + +typedef oneseq_base oneseq_xsh_rs_16_8; +typedef oneseq_base oneseq_xsh_rs_32_16; +typedef oneseq_base oneseq_xsh_rs_64_32; +typedef oneseq_base oneseq_xsh_rs_128_64; + +typedef unique_base unique_xsh_rs_16_8; +typedef unique_base unique_xsh_rs_32_16; +typedef unique_base unique_xsh_rs_64_32; +typedef unique_base unique_xsh_rs_128_64; + +typedef setseq_base setseq_xsh_rs_16_8; +typedef setseq_base setseq_xsh_rs_32_16; +typedef setseq_base setseq_xsh_rs_64_32; +typedef setseq_base setseq_xsh_rs_128_64; + +typedef mcg_base mcg_xsh_rs_16_8; +typedef mcg_base mcg_xsh_rs_32_16; +typedef mcg_base mcg_xsh_rs_64_32; +typedef mcg_base mcg_xsh_rs_128_64; + +/* Predefined types for XSH RR */ + +typedef oneseq_base oneseq_xsh_rr_16_8; +typedef oneseq_base oneseq_xsh_rr_32_16; +typedef oneseq_base oneseq_xsh_rr_64_32; +typedef oneseq_base oneseq_xsh_rr_128_64; + +typedef unique_base unique_xsh_rr_16_8; +typedef unique_base unique_xsh_rr_32_16; +typedef unique_base unique_xsh_rr_64_32; +typedef unique_base unique_xsh_rr_128_64; + +typedef setseq_base setseq_xsh_rr_16_8; +typedef setseq_base setseq_xsh_rr_32_16; +typedef setseq_base setseq_xsh_rr_64_32; +typedef setseq_base setseq_xsh_rr_128_64; + +typedef mcg_base mcg_xsh_rr_16_8; +typedef mcg_base mcg_xsh_rr_32_16; +typedef mcg_base mcg_xsh_rr_64_32; +typedef mcg_base mcg_xsh_rr_128_64; + + +/* Predefined types for RXS M XS */ + +typedef oneseq_base oneseq_rxs_m_xs_8_8; +typedef oneseq_base oneseq_rxs_m_xs_16_16; +typedef oneseq_base oneseq_rxs_m_xs_32_32; +typedef oneseq_base oneseq_rxs_m_xs_64_64; +typedef oneseq_base oneseq_rxs_m_xs_128_128; + +typedef unique_base unique_rxs_m_xs_8_8; +typedef unique_base unique_rxs_m_xs_16_16; +typedef unique_base unique_rxs_m_xs_32_32; +typedef unique_base unique_rxs_m_xs_64_64; +typedef unique_base unique_rxs_m_xs_128_128; + +typedef setseq_base setseq_rxs_m_xs_8_8; +typedef setseq_base setseq_rxs_m_xs_16_16; +typedef setseq_base setseq_rxs_m_xs_32_32; +typedef setseq_base 
setseq_rxs_m_xs_64_64; +typedef setseq_base setseq_rxs_m_xs_128_128; + + // MCG versions don't make sense here, so aren't defined. + +/* Predefined types for XSL RR (only defined for "large" types) */ + +typedef oneseq_base oneseq_xsl_rr_64_32; +typedef oneseq_base oneseq_xsl_rr_128_64; + +typedef unique_base unique_xsl_rr_64_32; +typedef unique_base unique_xsl_rr_128_64; + +typedef setseq_base setseq_xsl_rr_64_32; +typedef setseq_base setseq_xsl_rr_128_64; + +typedef mcg_base mcg_xsl_rr_64_32; +typedef mcg_base mcg_xsl_rr_128_64; + + +/* Predefined types for XSL RR RR (only defined for "large" types) */ + +typedef oneseq_base + oneseq_xsl_rr_rr_64_64; +typedef oneseq_base + oneseq_xsl_rr_rr_128_128; + +typedef unique_base + unique_xsl_rr_rr_64_64; +typedef unique_base + unique_xsl_rr_rr_128_128; + +typedef setseq_base + setseq_xsl_rr_rr_64_64; +typedef setseq_base + setseq_xsl_rr_rr_128_128; + + // MCG versions don't make sense here, so aren't defined. + +/* Extended generators */ + +template +using ext_std8 = extended; + +template +using ext_std16 = extended; + +template +using ext_std32 = extended; + +template +using ext_std64 = extended; + + +template +using ext_oneseq_rxs_m_xs_32_32 = + ext_std32; + +template +using ext_mcg_xsh_rs_64_32 = + ext_std32; + +template +using ext_oneseq_xsh_rs_64_32 = + ext_std32; + +template +using ext_setseq_xsh_rr_64_32 = + ext_std32; + +template +using ext_mcg_xsl_rr_128_64 = + ext_std64; + +template +using ext_oneseq_xsl_rr_128_64 = + ext_std64; + +template +using ext_setseq_xsl_rr_128_64 = + ext_std64; + +} // namespace pcg_engines + +typedef pcg_engines::setseq_xsh_rr_64_32 pcg32; +typedef pcg_engines::oneseq_xsh_rr_64_32 pcg32_oneseq; +typedef pcg_engines::unique_xsh_rr_64_32 pcg32_unique; +typedef pcg_engines::mcg_xsh_rs_64_32 pcg32_fast; + +typedef pcg_engines::setseq_xsl_rr_128_64 pcg64; +typedef pcg_engines::oneseq_xsl_rr_128_64 pcg64_oneseq; +typedef pcg_engines::unique_xsl_rr_128_64 pcg64_unique; +typedef pcg_engines::mcg_xsl_rr_128_64 pcg64_fast; + +typedef pcg_engines::setseq_rxs_m_xs_8_8 pcg8_once_insecure; +typedef pcg_engines::setseq_rxs_m_xs_16_16 pcg16_once_insecure; +typedef pcg_engines::setseq_rxs_m_xs_32_32 pcg32_once_insecure; +typedef pcg_engines::setseq_rxs_m_xs_64_64 pcg64_once_insecure; +typedef pcg_engines::setseq_xsl_rr_rr_128_128 pcg128_once_insecure; + +typedef pcg_engines::oneseq_rxs_m_xs_8_8 pcg8_oneseq_once_insecure; +typedef pcg_engines::oneseq_rxs_m_xs_16_16 pcg16_oneseq_once_insecure; +typedef pcg_engines::oneseq_rxs_m_xs_32_32 pcg32_oneseq_once_insecure; +typedef pcg_engines::oneseq_rxs_m_xs_64_64 pcg64_oneseq_once_insecure; +typedef pcg_engines::oneseq_xsl_rr_rr_128_128 pcg128_oneseq_once_insecure; + + +// These two extended RNGs provide two-dimensionally equidistributed +// 32-bit generators. pcg32_k2_fast occupies the same space as pcg64, +// and can be called twice to generate 64 bits, but does not required +// 128-bit math; on 32-bit systems, it's faster than pcg64 as well. 
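Editor's note, before the extended typedefs that follow: since this header exports the user-facing generator typedefs (pcg32 and pcg64 above, the pcg32_k2 family immediately below), a brief usage sketch may help reviewers. This is an illustration only, not part of the diff; the include name assumes the vendored directory added to the include path in this PR, and the seed/stream values are arbitrary.

// Editorial usage sketch (not part of the diff).
#include <cstdint>
#include <iostream>
#include "pcg_random.hpp"

int main() {
    // pcg32 is setseq_xsh_rr_64_32: 64 bits of state, 32-bit output, and a
    // selectable stream chosen by the second constructor argument.
    pcg32 rng(42u, 54u);

    // pcg32_k2 (declared just below) is two-dimensionally equidistributed,
    // so two consecutive 32-bit outputs can serve as one 64-bit value
    // without requiring 128-bit arithmetic.
    pcg32_k2 rng_k2(42u, 54u);
    std::uint64_t hi = rng_k2();
    std::uint64_t combined = (hi << 32) | rng_k2();

    std::cout << rng() << ' ' << combined << '\n';
}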
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<6,16,true> pcg32_k2;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<6,32,true> pcg32_k2_fast;
+
+// These eight extended RNGs have about as much state as arc4random
+//
+//  - the k variants are k-dimensionally equidistributed
+//  - the c variants offer better cryptographic security
+//
+// (just how good the cryptographic security is is an open question)
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<6,16,true> pcg32_k64;
+typedef pcg_engines::ext_mcg_xsh_rs_64_32<6,32,true> pcg32_k64_oneseq;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<6,32,true> pcg32_k64_fast;
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<6,16,false> pcg32_c64;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<6,32,false> pcg32_c64_oneseq;
+typedef pcg_engines::ext_mcg_xsh_rs_64_32<6,32,false> pcg32_c64_fast;
+
+typedef pcg_engines::ext_setseq_xsl_rr_128_64<5,16,true> pcg64_k32;
+typedef pcg_engines::ext_oneseq_xsl_rr_128_64<5,128,true> pcg64_k32_oneseq;
+typedef pcg_engines::ext_mcg_xsl_rr_128_64<5,128,true> pcg64_k32_fast;
+
+typedef pcg_engines::ext_setseq_xsl_rr_128_64<5,16,false> pcg64_c32;
+typedef pcg_engines::ext_oneseq_xsl_rr_128_64<5,128,false> pcg64_c32_oneseq;
+typedef pcg_engines::ext_mcg_xsl_rr_128_64<5,128,false> pcg64_c32_fast;
+
+// These eight extended RNGs have more state than the Mersenne twister
+//
+//  - the k variants are k-dimensionally equidistributed
+//  - the c variants offer better cryptographic security
+//
+// (just how good the cryptographic security is is an open question)
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<10,16,true> pcg32_k1024;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<10,32,true> pcg32_k1024_fast;
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<10,16,false> pcg32_c1024;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<10,32,false> pcg32_c1024_fast;
+
+typedef pcg_engines::ext_setseq_xsl_rr_128_64<10,16,true> pcg64_k1024;
+typedef pcg_engines::ext_oneseq_xsl_rr_128_64<10,128,true> pcg64_k1024_fast;
+
+typedef pcg_engines::ext_setseq_xsl_rr_128_64<10,16,false> pcg64_c1024;
+typedef pcg_engines::ext_oneseq_xsl_rr_128_64<10,128,false> pcg64_c1024_fast;
+
+// These generators have an insanely huge period (2^524352), and are suitable
+// for silly party tricks, such as dumping out 64 KB ZIP files at an arbitrary
+// point in the future. [Actually, over the full period of the generator, it
+// will produce every 64 KB ZIP file 2^64 times!]
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<14,16,true> pcg32_k16384;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<14,32,true> pcg32_k16384_fast;
+
+#endif // PCG_RAND_HPP_INCLUDED
diff --git a/external_tools/pcg/pcg_uint128.hpp b/external_tools/pcg/pcg_uint128.hpp
new file mode 100644
index 000000000..99b20e780
--- /dev/null
+++ b/external_tools/pcg/pcg_uint128.hpp
@@ -0,0 +1,750 @@
+/*
+ * PCG Random Number Generation for C++
+ *
+ * Copyright 2014 Melissa O'Neill <oneill@pcg-random.org>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * including its license and other licensing options, visit
+ *
+ *     http://www.pcg-random.org
+ */
+
+/*
+ * This code provides a C++ class that can provide 128-bit (or higher)
+ * integers.  To produce 2K-bit integers, it uses two K-bit integers,
+ * placed in a union that allows the code to also see them as four K/2-bit
+ * integers (and access them either directly by name, or by index).
+ *
+ * It may seem like we're reinventing the wheel here, because several
+ * libraries already exist that support large integers, but most existing
+ * libraries provide very generic multiprecision code, whereas here we're
+ * operating at a fixed size.  Also, most other libraries are fairly
+ * heavyweight.  So we use a direct implementation.  Sadly, it's much slower
+ * than hand-coded assembly or direct CPU support.
+ */
+
+#ifndef PCG_UINT128_HPP_INCLUDED
+#define PCG_UINT128_HPP_INCLUDED 1
+
+#include <cstdint>
+#include <cstdio>
+#include <cassert>
+#include <climits>
+#include <utility>
+#include <initializer_list>
+#include <type_traits>
+
+/*
+ * We want to lay the type out the same way that a native type would be laid
+ * out, which means we must know the machine's endianness at compile time.
+ * This ugliness attempts to do so.
+ */
+
+#ifndef PCG_LITTLE_ENDIAN
+    #if defined(__BYTE_ORDER__)
+        #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+            #define PCG_LITTLE_ENDIAN 1
+        #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+            #define PCG_LITTLE_ENDIAN 0
+        #else
+            #error __BYTE_ORDER__ does not match a standard endian, pick a side
+        #endif
+    #elif __LITTLE_ENDIAN__ || _LITTLE_ENDIAN
+        #define PCG_LITTLE_ENDIAN 1
+    #elif __BIG_ENDIAN__ || _BIG_ENDIAN
+        #define PCG_LITTLE_ENDIAN 0
+    #elif __x86_64 || __x86_64__ || __i386 || __i386__
+        #define PCG_LITTLE_ENDIAN 1
+    #elif __powerpc__ || __POWERPC__ || __ppc__ || __PPC__ \
+          || __m68k__ || __mc68000__
+        #define PCG_LITTLE_ENDIAN 0
+    #else
+        #error Unable to determine target endianness
+    #endif
+#endif
+
+namespace pcg_extras {
+
+// Recent versions of GCC have intrinsics we can use to quickly calculate
+// the number of leading and trailing zeros in a number.  If possible, we
+// use them, otherwise we fall back to old-fashioned bit twiddling to figure
+// them out.
+
+#ifndef PCG_BITCOUNT_T
+    typedef uint8_t bitcount_t;
+#else
+    typedef PCG_BITCOUNT_T bitcount_t;
+#endif
+
+/*
+ * Provide some useful helper functions
+ *      * flog2                 floor(log2(x))
+ *      * trailingzeros         number of trailing zero bits
+ */
+
+#ifdef __GNUC__         // Any GNU-compatible compiler supporting C++11 has
+                        // some useful intrinsics we can use.
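Editor's note: a worked example of the helper semantics documented above, placed before the two alternative implementations (compiler intrinsics vs. bit twiddling) that follow. Illustrative only; the include name mirrors the path added by this PR, and clog2 is defined after both implementations further down in this file.

// Editorial sanity-check sketch (not part of the diff).
#include <cassert>
#include <cstdint>
#include "pcg_uint128.hpp"  // pcg_extras::flog2 / clog2 / trailingzeros

int main() {
    using namespace pcg_extras;
    // 40 = 0b101000: floor(log2(40)) = 5, ceil(log2(40)) = 6,
    // and the three low-order bits are zero.
    assert(flog2(uint32_t(40)) == 5);
    assert(clog2(uint32_t(40)) == 6);
    assert(trailingzeros(uint32_t(40)) == 3);
}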
+ +inline bitcount_t flog2(uint32_t v) +{ + return 31 - __builtin_clz(v); +} + +inline bitcount_t trailingzeros(uint32_t v) +{ + return __builtin_ctz(v); +} + +inline bitcount_t flog2(uint64_t v) +{ +#if UINT64_MAX == ULONG_MAX + return 63 - __builtin_clzl(v); +#elif UINT64_MAX == ULLONG_MAX + return 63 - __builtin_clzll(v); +#else + #error Cannot find a function for uint64_t +#endif +} + +inline bitcount_t trailingzeros(uint64_t v) +{ +#if UINT64_MAX == ULONG_MAX + return __builtin_ctzl(v); +#elif UINT64_MAX == ULLONG_MAX + return __builtin_ctzll(v); +#else + #error Cannot find a function for uint64_t +#endif +} + +#else // Otherwise, we fall back to bit twiddling + // implementations + +inline bitcount_t flog2(uint32_t v) +{ + // Based on code by Eric Cole and Mark Dickinson, which appears at + // https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn + + static const uint8_t multiplyDeBruijnBitPos[32] = { + 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, + 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 + }; + + v |= v >> 1; // first round down to one less than a power of 2 + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + + return multiplyDeBruijnBitPos[(uint32_t)(v * 0x07C4ACDDU) >> 27]; +} + +inline bitcount_t trailingzeros(uint32_t v) +{ + static const uint8_t multiplyDeBruijnBitPos[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 + }; + + return multiplyDeBruijnBitPos[((uint32_t)((v & -v) * 0x077CB531U)) >> 27]; +} + +inline bitcount_t flog2(uint64_t v) +{ + uint32_t high = v >> 32; + uint32_t low = uint32_t(v); + + return high ? 32+flog2(high) : flog2(low); +} + +inline bitcount_t trailingzeros(uint64_t v) +{ + uint32_t high = v >> 32; + uint32_t low = uint32_t(v); + + return low ? trailingzeros(low) : trailingzeros(high)+32; +} + +#endif + +template +inline bitcount_t clog2(UInt v) +{ + return flog2(v) + ((v & (-v)) != v); +} + +template +inline UInt addwithcarry(UInt x, UInt y, bool carryin, bool* carryout) +{ + UInt half_result = y + carryin; + UInt result = x + half_result; + *carryout = (half_result < y) || (result < x); + return result; +} + +template +inline UInt subwithcarry(UInt x, UInt y, bool carryin, bool* carryout) +{ + UInt half_result = y + carryin; + UInt result = x - half_result; + *carryout = (half_result < y) || (result > x); + return result; +} + + +template +class uint_x4 { +// private: +public: + union { +#if PCG_LITTLE_ENDIAN + struct { + UInt v0, v1, v2, v3; + } w; + struct { + UIntX2 v01, v23; + } d; +#else + struct { + UInt v3, v2, v1, v0; + } w; + struct { + UIntX2 v23, v01; + } d; +#endif + // For the array access versions, the code that uses the array + // must handle endian itself. Yuck. 
+ UInt wa[4]; + UIntX2 da[2]; + }; + +public: + uint_x4() = default; + + constexpr uint_x4(UInt v3, UInt v2, UInt v1, UInt v0) +#if PCG_LITTLE_ENDIAN + : w{v0, v1, v2, v3} +#else + : w{v3, v2, v1, v0} +#endif + { + // Nothing (else) to do + } + + constexpr uint_x4(UIntX2 v23, UIntX2 v01) +#if PCG_LITTLE_ENDIAN + : d{v01,v23} +#else + : d{v23,v01} +#endif + { + // Nothing (else) to do + } + + template::value + && sizeof(Integral) <= sizeof(UIntX2)) + >::type* = nullptr> + constexpr uint_x4(Integral v01) +#if PCG_LITTLE_ENDIAN + : d{UIntX2(v01),0UL} +#else + : d{0UL,UIntX2(v01)} +#endif + { + // Nothing (else) to do + } + + explicit constexpr operator uint64_t() const + { + return d.v01; + } + + explicit constexpr operator uint32_t() const + { + return w.v0; + } + + explicit constexpr operator int() const + { + return w.v0; + } + + explicit constexpr operator uint16_t() const + { + return w.v0; + } + + explicit constexpr operator uint8_t() const + { + return w.v0; + } + + typedef typename std::conditional::value, + unsigned long long, + unsigned long>::type + uint_missing_t; + + explicit constexpr operator uint_missing_t() const + { + return d.v01; + } + + explicit constexpr operator bool() const + { + return d.v01 || d.v23; + } + + template + friend uint_x4 operator*(const uint_x4&, const uint_x4&); + + template + friend std::pair< uint_x4,uint_x4 > + divmod(const uint_x4&, const uint_x4&); + + template + friend uint_x4 operator+(const uint_x4&, const uint_x4&); + + template + friend uint_x4 operator-(const uint_x4&, const uint_x4&); + + template + friend uint_x4 operator<<(const uint_x4&, const uint_x4&); + + template + friend uint_x4 operator>>(const uint_x4&, const uint_x4&); + + template + friend uint_x4 operator&(const uint_x4&, const uint_x4&); + + template + friend uint_x4 operator|(const uint_x4&, const uint_x4&); + + template + friend uint_x4 operator^(const uint_x4&, const uint_x4&); + + template + friend bool operator==(const uint_x4&, const uint_x4&); + + template + friend bool operator!=(const uint_x4&, const uint_x4&); + + template + friend bool operator<(const uint_x4&, const uint_x4&); + + template + friend bool operator<=(const uint_x4&, const uint_x4&); + + template + friend bool operator>(const uint_x4&, const uint_x4&); + + template + friend bool operator>=(const uint_x4&, const uint_x4&); + + template + friend uint_x4 operator~(const uint_x4&); + + template + friend uint_x4 operator-(const uint_x4&); + + template + friend bitcount_t flog2(const uint_x4&); + + template + friend bitcount_t trailingzeros(const uint_x4&); + + uint_x4& operator*=(const uint_x4& rhs) + { + uint_x4 result = *this * rhs; + return *this = result; + } + + uint_x4& operator/=(const uint_x4& rhs) + { + uint_x4 result = *this / rhs; + return *this = result; + } + + uint_x4& operator%=(const uint_x4& rhs) + { + uint_x4 result = *this % rhs; + return *this = result; + } + + uint_x4& operator+=(const uint_x4& rhs) + { + uint_x4 result = *this + rhs; + return *this = result; + } + + uint_x4& operator-=(const uint_x4& rhs) + { + uint_x4 result = *this - rhs; + return *this = result; + } + + uint_x4& operator&=(const uint_x4& rhs) + { + uint_x4 result = *this & rhs; + return *this = result; + } + + uint_x4& operator|=(const uint_x4& rhs) + { + uint_x4 result = *this | rhs; + return *this = result; + } + + uint_x4& operator^=(const uint_x4& rhs) + { + uint_x4 result = *this ^ rhs; + return *this = result; + } + + uint_x4& operator>>=(bitcount_t shift) + { + uint_x4 result = *this >> shift; + return *this = 
result; + } + + uint_x4& operator<<=(bitcount_t shift) + { + uint_x4 result = *this << shift; + return *this = result; + } + +}; + +template +bitcount_t flog2(const uint_x4& v) +{ +#if PCG_LITTLE_ENDIAN + for (uint8_t i = 4; i !=0; /* dec in loop */) { + --i; +#else + for (uint8_t i = 0; i < 4; ++i) { +#endif + if (v.wa[i] == 0) + continue; + return flog2(v.wa[i]) + (sizeof(U)*CHAR_BIT)*i; + } + abort(); +} + +template +bitcount_t trailingzeros(const uint_x4& v) +{ +#if PCG_LITTLE_ENDIAN + for (uint8_t i = 0; i < 4; ++i) { +#else + for (uint8_t i = 4; i !=0; /* dec in loop */) { + --i; +#endif + if (v.wa[i] != 0) + return trailingzeros(v.wa[i]) + (sizeof(U)*CHAR_BIT)*i; + } + return (sizeof(U)*CHAR_BIT)*4; +} + +template +std::pair< uint_x4, uint_x4 > + divmod(const uint_x4& orig_dividend, + const uint_x4& divisor) +{ + // If the dividend is less than the divisor, the answer is always zero. + // This takes care of boundary cases like 0/x (which would otherwise be + // problematic because we can't take the log of zero. (The boundary case + // of division by zero is undefined.) + if (orig_dividend < divisor) + return { uint_x4(0UL), orig_dividend }; + + auto dividend = orig_dividend; + + auto log2_divisor = flog2(divisor); + auto log2_dividend = flog2(dividend); + // assert(log2_dividend >= log2_divisor); + bitcount_t logdiff = log2_dividend - log2_divisor; + + constexpr uint_x4 ONE(1UL); + if (logdiff == 0) + return { ONE, dividend - divisor }; + + // Now we change the log difference to + // floor(log2(divisor)) - ceil(log2(dividend)) + // to ensure that we *underestimate* the result. + logdiff -= 1; + + uint_x4 quotient(0UL); + + auto qfactor = ONE << logdiff; + auto factor = divisor << logdiff; + + do { + dividend -= factor; + quotient += qfactor; + while (dividend < factor) { + factor >>= 1; + qfactor >>= 1; + } + } while (dividend >= divisor); + + return { quotient, dividend }; +} + +template +uint_x4 operator/(const uint_x4& dividend, + const uint_x4& divisor) +{ + return divmod(dividend, divisor).first; +} + +template +uint_x4 operator%(const uint_x4& dividend, + const uint_x4& divisor) +{ + return divmod(dividend, divisor).second; +} + + +template +uint_x4 operator*(const uint_x4& a, + const uint_x4& b) +{ + uint_x4 r = {0U, 0U, 0U, 0U}; + bool carryin = false; + bool carryout; + UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(b.w.v0); + r.w.v0 = UInt(a0b0); + r.w.v1 = UInt(a0b0 >> 32); + + UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(b.w.v0); + r.w.v2 = UInt(a1b0 >> 32); + r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout); + carryin = carryout; + r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout); + carryin = carryout; + r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); + + UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b.w.v1); + carryin = false; + r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> 32), carryin, &carryout); + carryin = carryout; + r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); + + carryin = false; + r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout); + carryin = carryout; + r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout); + carryin = carryout; + r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); + + UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b.w.v1); + carryin = false; + r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout); + carryin = carryout; + r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> 32), carryin, &carryout); + + r.d.v23 += a.d.v01 * b.d.v23 + a.d.v23 * b.d.v01; + + return r; +} + + +template +uint_x4 
operator+(const uint_x4& a, + const uint_x4& b) +{ + uint_x4 r = {0U, 0U, 0U, 0U}; + + bool carryin = false; + bool carryout; + r.w.v0 = addwithcarry(a.w.v0, b.w.v0, carryin, &carryout); + carryin = carryout; + r.w.v1 = addwithcarry(a.w.v1, b.w.v1, carryin, &carryout); + carryin = carryout; + r.w.v2 = addwithcarry(a.w.v2, b.w.v2, carryin, &carryout); + carryin = carryout; + r.w.v3 = addwithcarry(a.w.v3, b.w.v3, carryin, &carryout); + + return r; +} + +template +uint_x4 operator-(const uint_x4& a, + const uint_x4& b) +{ + uint_x4 r = {0U, 0U, 0U, 0U}; + + bool carryin = false; + bool carryout; + r.w.v0 = subwithcarry(a.w.v0, b.w.v0, carryin, &carryout); + carryin = carryout; + r.w.v1 = subwithcarry(a.w.v1, b.w.v1, carryin, &carryout); + carryin = carryout; + r.w.v2 = subwithcarry(a.w.v2, b.w.v2, carryin, &carryout); + carryin = carryout; + r.w.v3 = subwithcarry(a.w.v3, b.w.v3, carryin, &carryout); + + return r; +} + + +template +uint_x4 operator&(const uint_x4& a, + const uint_x4& b) +{ + return uint_x4(a.d.v23 & b.d.v23, a.d.v01 & b.d.v01); +} + +template +uint_x4 operator|(const uint_x4& a, + const uint_x4& b) +{ + return uint_x4(a.d.v23 | b.d.v23, a.d.v01 | b.d.v01); +} + +template +uint_x4 operator^(const uint_x4& a, + const uint_x4& b) +{ + return uint_x4(a.d.v23 ^ b.d.v23, a.d.v01 ^ b.d.v01); +} + +template +uint_x4 operator~(const uint_x4& v) +{ + return uint_x4(~v.d.v23, ~v.d.v01); +} + +template +uint_x4 operator-(const uint_x4& v) +{ + return uint_x4(0UL,0UL) - v; +} + +template +bool operator==(const uint_x4& a, const uint_x4& b) +{ + return (a.d.v01 == b.d.v01) && (a.d.v23 == b.d.v23); +} + +template +bool operator!=(const uint_x4& a, const uint_x4& b) +{ + return !operator==(a,b); +} + + +template +bool operator<(const uint_x4& a, const uint_x4& b) +{ + return (a.d.v23 < b.d.v23) + || ((a.d.v23 == b.d.v23) && (a.d.v01 < b.d.v01)); +} + +template +bool operator>(const uint_x4& a, const uint_x4& b) +{ + return operator<(b,a); +} + +template +bool operator<=(const uint_x4& a, const uint_x4& b) +{ + return !(operator<(b,a)); +} + +template +bool operator>=(const uint_x4& a, const uint_x4& b) +{ + return !(operator<(a,b)); +} + + + +template +uint_x4 operator<<(const uint_x4& v, + const bitcount_t shift) +{ + uint_x4 r = {0U, 0U, 0U, 0U}; + const bitcount_t bits = sizeof(UInt) * CHAR_BIT; + const bitcount_t bitmask = bits - 1; + const bitcount_t shiftdiv = shift / bits; + const bitcount_t shiftmod = shift & bitmask; + + if (shiftmod) { + UInt carryover = 0; +#if PCG_LITTLE_ENDIAN + for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) { +#else + for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) { + --out, --in; +#endif + r.wa[out] = (v.wa[in] << shiftmod) | carryover; + carryover = (v.wa[in] >> (bits - shiftmod)); + } + } else { +#if PCG_LITTLE_ENDIAN + for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) { +#else + for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) { + --out, --in; +#endif + r.wa[out] = v.wa[in]; + } + } + + return r; +} + +template +uint_x4 operator>>(const uint_x4& v, + const bitcount_t shift) +{ + uint_x4 r = {0U, 0U, 0U, 0U}; + const bitcount_t bits = sizeof(UInt) * CHAR_BIT; + const bitcount_t bitmask = bits - 1; + const bitcount_t shiftdiv = shift / bits; + const bitcount_t shiftmod = shift & bitmask; + + if (shiftmod) { + UInt carryover = 0; +#if PCG_LITTLE_ENDIAN + for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) { + --out, --in; +#else + for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, 
++in) { +#endif + r.wa[out] = (v.wa[in] >> shiftmod) | carryover; + carryover = (v.wa[in] << (bits - shiftmod)); + } + } else { +#if PCG_LITTLE_ENDIAN + for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) { + --out, --in; +#else + for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) { +#endif + r.wa[out] = v.wa[in]; + } + } + + return r; +} + +} // namespace pcg_extras + +#endif // PCG_UINT128_HPP_INCLUDED diff --git a/mt-kahypar/datastructures/delta_partitioned_graph.h b/mt-kahypar/datastructures/delta_partitioned_graph.h index d0a8480bc..43e11c484 100644 --- a/mt-kahypar/datastructures/delta_partitioned_graph.h +++ b/mt-kahypar/datastructures/delta_partitioned_graph.h @@ -190,7 +190,7 @@ class DeltaPartitionedGraph { _part_weights_delta[to] += weight; _part_weights_delta[from] -= weight; - SyncronizedEdgeUpdate sync_update; + SynchronizedEdgeUpdate sync_update; sync_update.from = from; sync_update.to = to; sync_update.target_graph = _pg->targetGraph(); @@ -344,4 +344,4 @@ class DeltaPartitionedGraph { }; } // namespace ds -} // namespace mt_kahypar \ No newline at end of file +} // namespace mt_kahypar diff --git a/mt-kahypar/datastructures/delta_partitioned_hypergraph.h b/mt-kahypar/datastructures/delta_partitioned_hypergraph.h index e26b5156c..9eb52ab3d 100644 --- a/mt-kahypar/datastructures/delta_partitioned_hypergraph.h +++ b/mt-kahypar/datastructures/delta_partitioned_hypergraph.h @@ -202,7 +202,7 @@ class DeltaPartitionedHypergraph { _part_weights_delta[to] += wu; _part_weights_delta[from] -= wu; - SyncronizedEdgeUpdate sync_update; + SynchronizedEdgeUpdate sync_update; sync_update.from = from; sync_update.to = to; sync_update.target_graph = _phg->targetGraph(); @@ -340,7 +340,7 @@ class DeltaPartitionedHypergraph { MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE void updateConnectivitySet(const HyperedgeID e, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { if ( sync_update.pin_count_in_from_part_after == 0 ) { _connectivity_set_delta.remove(sync_update.he, sync_update.from); } @@ -373,4 +373,4 @@ class DeltaPartitionedHypergraph { }; } // namespace ds -} // namespace mt_kahypar \ No newline at end of file +} // namespace mt_kahypar diff --git a/mt-kahypar/datastructures/hypergraph_common.h b/mt-kahypar/datastructures/hypergraph_common.h index cdb2d6e98..ee7720433 100644 --- a/mt-kahypar/datastructures/hypergraph_common.h +++ b/mt-kahypar/datastructures/hypergraph_common.h @@ -139,37 +139,23 @@ class ConnectivityInfo; class SparseConnectivityInfo; } -struct SyncronizedEdgeUpdate { - SyncronizedEdgeUpdate() : - he(kInvalidHyperedge), - from(kInvalidPartition), - to(kInvalidPartition), - edge_weight(0), - edge_size(0), - pin_count_in_from_part_after(kInvalidHypernode), - pin_count_in_to_part_after(kInvalidHypernode), - block_of_other_node(kInvalidPartition), - connectivity_set_after(nullptr), - pin_counts_after(nullptr), - target_graph(nullptr), - edge_locks(nullptr) { } - - HyperedgeID he; - PartitionID from; - PartitionID to; - HyperedgeID edge_weight; - HypernodeID edge_size; - HypernodeID pin_count_in_from_part_after; - HypernodeID pin_count_in_to_part_after; - PartitionID block_of_other_node; - mutable ds::Bitset* connectivity_set_after; - mutable ds::PinCountSnapshot* pin_counts_after; - const TargetGraph* target_graph; - ds::Array* edge_locks; +struct SynchronizedEdgeUpdate { + HyperedgeID he = kInvalidHyperedge; + PartitionID from = kInvalidPartition; + PartitionID to = kInvalidPartition; + HyperedgeID edge_weight = 0; + 
HypernodeID edge_size = 0; + HypernodeID pin_count_in_from_part_after = kInvalidHypernode; + HypernodeID pin_count_in_to_part_after = kInvalidHypernode; + PartitionID block_of_other_node = kInvalidPartition; + mutable ds::Bitset* connectivity_set_after = nullptr; + mutable ds::PinCountSnapshot* pin_counts_after = nullptr; + const TargetGraph* target_graph = nullptr; + ds::Array* edge_locks = nullptr; }; struct NoOpDeltaFunc { - void operator() (const SyncronizedEdgeUpdate&) { } + void operator() (const SynchronizedEdgeUpdate&) { } }; template diff --git a/mt-kahypar/datastructures/partitioned_graph.h b/mt-kahypar/datastructures/partitioned_graph.h index 2539b8b98..1bc3acd9d 100644 --- a/mt-kahypar/datastructures/partitioned_graph.h +++ b/mt-kahypar/datastructures/partitioned_graph.h @@ -63,10 +63,10 @@ class PartitionedGraph { static_assert(!Hypergraph::is_partitioned, "Only unpartitioned hypergraphs are allowed"); using Self = PartitionedGraph; - using NotificationFunc = std::function; - using DeltaFunction = std::function; - #define NOOP_NOTIFY_FUNC [] (const SyncronizedEdgeUpdate&) { } - #define NOOP_FUNC [] (const SyncronizedEdgeUpdate&) { } + using NotificationFunc = std::function; + using DeltaFunction = std::function; + #define NOOP_NOTIFY_FUNC [] (const SynchronizedEdgeUpdate&) { } + #define NOOP_FUNC [] (const SynchronizedEdgeUpdate&) { } // Factory using HypergraphFactory = typename Hypergraph::Factory; @@ -584,7 +584,7 @@ class PartitionedGraph { HypernodeWeight max_weight_to, SuccessFunc&& report_success, const DeltaFunction& delta_func) { - auto my_delta_func = [&](const SyncronizedEdgeUpdate& sync_update) { + auto my_delta_func = [&](const SynchronizedEdgeUpdate& sync_update) { delta_func(sync_update); gain_cache.deltaGainUpdate(*this, sync_update); }; @@ -593,7 +593,7 @@ class PartitionedGraph { report_success, my_delta_func, NOOP_NOTIFY_FUNC); } else { return changeNodePartImpl(u, from, to, max_weight_to, - report_success, my_delta_func, [&](SyncronizedEdgeUpdate& sync_update) { + report_success, my_delta_func, [&](SynchronizedEdgeUpdate& sync_update) { gain_cache.notifyBeforeDeltaGainUpdate(*this, sync_update); }); } @@ -1062,7 +1062,7 @@ class PartitionedGraph { _part_weights[from].fetch_sub(weight, std::memory_order_relaxed); report_success(); DBG << "<<< Start changing node part: " << V(u) << " - " << V(from) << " - " << V(to); - SyncronizedEdgeUpdate sync_update; + SynchronizedEdgeUpdate sync_update; sync_update.from = from; sync_update.to = to; sync_update.target_graph = _target_graph; @@ -1112,7 +1112,7 @@ class PartitionedGraph { // node u is moved to the block 'to'. 
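Editor's note, ahead of the synchronizeMoveOnEdge declaration that follows: the SyncronizedEdgeUpdate -> SynchronizedEdgeUpdate rename touches every delta-function callback in these data structures. As a minimal sketch of that callback pattern, condensed from the deterministic label propagation changes later in this diff (phg, m, Gain, and Km1AttributedGains are assumed to be in scope as they are there):

// Hedged sketch of the renamed callback type in use: changeNodePart invokes
// the lambda once per affected hyperedge with pin counts before/after the
// move, letting the caller attribute the km1 delta to this move.
Gain attributed_gain = 0;
auto objective_delta = [&](const SynchronizedEdgeUpdate& sync_update) {
  attributed_gain -= Km1AttributedGains::gain(sync_update);
};
const bool was_moved = phg.changeNodePart(m.node, m.from, m.to, objective_delta);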
template MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE - PartitionID synchronizeMoveOnEdge(SyncronizedEdgeUpdate& sync_update, + PartitionID synchronizeMoveOnEdge(SynchronizedEdgeUpdate& sync_update, const HyperedgeID edge, const HypernodeID u, const PartitionID to, diff --git a/mt-kahypar/datastructures/partitioned_hypergraph.h b/mt-kahypar/datastructures/partitioned_hypergraph.h index 0bb9891d9..c42fa1c90 100644 --- a/mt-kahypar/datastructures/partitioned_hypergraph.h +++ b/mt-kahypar/datastructures/partitioned_hypergraph.h @@ -62,10 +62,10 @@ class PartitionedHypergraph { private: static_assert(!Hypergraph::is_partitioned, "Only unpartitioned hypergraphs are allowed"); - using NotificationFunc = std::function; - using DeltaFunction = std::function; - #define NOOP_NOTIFY_FUNC [] (const SyncronizedEdgeUpdate&) { } - #define NOOP_FUNC [] (const SyncronizedEdgeUpdate&) { } + using NotificationFunc = std::function; + using DeltaFunction = std::function; + #define NOOP_NOTIFY_FUNC [] (const SynchronizedEdgeUpdate&) { } + #define NOOP_FUNC [] (const SynchronizedEdgeUpdate&) { } // Factory using HypergraphFactory = typename Hypergraph::Factory; @@ -590,7 +590,7 @@ class PartitionedHypergraph { _part_ids[u] = to; _part_weights[from].fetch_sub(wu, std::memory_order_relaxed); report_success(); - SyncronizedEdgeUpdate sync_update; + SynchronizedEdgeUpdate sync_update; sync_update.from = from; sync_update.to = to; sync_update.target_graph = _target_graph; @@ -624,7 +624,7 @@ class PartitionedHypergraph { HypernodeWeight max_weight_to, SuccessFunc&& report_success, const DeltaFunction& delta_func) { - auto my_delta_func = [&](const SyncronizedEdgeUpdate& sync_update) { + auto my_delta_func = [&](const SynchronizedEdgeUpdate& sync_update) { delta_func(sync_update); gain_cache.deltaGainUpdate(*this, sync_update); }; @@ -632,7 +632,7 @@ class PartitionedHypergraph { return changeNodePart(u, from, to, max_weight_to, report_success, my_delta_func); } else { return changeNodePart(u, from, to, max_weight_to, report_success, my_delta_func, - [&](SyncronizedEdgeUpdate& sync_update) { + [&](SynchronizedEdgeUpdate& sync_update) { sync_update.pin_count_in_from_part_after = pinCountInPart(sync_update.he, from) - 1; sync_update.pin_count_in_to_part_after = pinCountInPart(sync_update.he, to) + 1; gain_cache.notifyBeforeDeltaGainUpdate(*this, sync_update); @@ -1168,7 +1168,7 @@ class PartitionedHypergraph { MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE void updatePinCountOfHyperedge(const HyperedgeID he, const PartitionID from, const PartitionID to, - SyncronizedEdgeUpdate& sync_update, + SynchronizedEdgeUpdate& sync_update, const DeltaFunction& delta_func, const NotificationFunc& notify_func) { ASSERT(he < _pin_count_update_ownership.size()); diff --git a/mt-kahypar/datastructures/sparse_map.h b/mt-kahypar/datastructures/sparse_map.h index 931abb153..6131f9e12 100644 --- a/mt-kahypar/datastructures/sparse_map.h +++ b/mt-kahypar/datastructures/sparse_map.h @@ -124,6 +124,14 @@ class SparseMapBase { return _dense[_sparse[key]].value; } + Value getOrDefault(const Key key) const { + const size_t index = _sparse[key]; + if (!contains(key)) { + return Value(); + } + return _dense[index].value; + } + void freeInternalData() { _size = 0; _data = nullptr; diff --git a/mt-kahypar/io/command_line_options.cpp b/mt-kahypar/io/command_line_options.cpp index dbf5ac95e..4474c1df2 100644 --- a/mt-kahypar/io/command_line_options.cpp +++ b/mt-kahypar/io/command_line_options.cpp @@ -373,6 +373,11 @@ namespace mt_kahypar { 
&context.initial_partitioning.refinement.label_propagation.rebalancing))->value_name( "")->default_value(true), "If true, then zero gain moves are only performed if they improve the balance of the solution (only in label propagation)") + ((initial_partitioning ? "i-r-lp-unconstrained" : "r-lp-unconstrained"), + po::value((!initial_partitioning ? &context.refinement.label_propagation.unconstrained : + &context.initial_partitioning.refinement.label_propagation.unconstrained))->value_name( + "")->default_value(false), + "If true, then unconstrained label propagation (including rebalancing) is used.") ((initial_partitioning ? "i-r-lp-he-size-activation-threshold" : "r-lp-he-size-activation-threshold"), po::value( (!initial_partitioning ? &context.refinement.label_propagation.hyperedge_size_activation_threshold @@ -380,6 +385,11 @@ namespace mt_kahypar { &context.initial_partitioning.refinement.label_propagation.hyperedge_size_activation_threshold))->value_name( "")->default_value(100), "LP refiner activates only neighbors of moved vertices that are part of hyperedges with a size less than this threshold") + ((initial_partitioning ? "i-r-lp-relative-improvement-threshold" : "r-lp-relative-improvement-threshold"), + po::value((!initial_partitioning ? &context.refinement.label_propagation.relative_improvement_threshold : + &context.initial_partitioning.refinement.label_propagation.relative_improvement_threshold))->value_name( + "")->default_value(-1.0), + "Relative improvement threshold for label propagation.") ((initial_partitioning ? "i-r-fm-type" : "r-fm-type"), po::value()->value_name("")->notifier( [&, initial_partitioning](const std::string& type) { @@ -391,6 +401,7 @@ namespace mt_kahypar { })->default_value("kway_fm"), "FM Algorithm:\n" "- kway_fm\n" + "- unconstrained_fm\n" "- do_nothing") ((initial_partitioning ? "i-r-fm-multitry-rounds" : "r-fm-multitry-rounds"), po::value((initial_partitioning ? &context.initial_partitioning.refinement.fm.multitry_rounds : @@ -434,6 +445,42 @@ namespace mt_kahypar { po::value((initial_partitioning ? &context.initial_partitioning.refinement.fm.release_nodes : &context.refinement.fm.release_nodes))->value_name("")->default_value(true), "FM releases nodes that weren't moved, so they might be found by another search.") + ((initial_partitioning ? "i-r-fm-threshold-border-node-inclusion" : "r-fm-threshold-border-node-inclusion"), + po::value((initial_partitioning ? &context.initial_partitioning.refinement.fm.treshold_border_node_inclusion : + &context.refinement.fm.treshold_border_node_inclusion))->value_name("")->default_value(0.75), + "Threshold for block-internal incident weight when deciding whether to include border nodes for rebalancing estimation.") + ((initial_partitioning ? "i-r-fm-unconstrained-upper-bound" : "r-fm-unconstrained-upper-bound"), + po::value((initial_partitioning ? &context.initial_partitioning.refinement.fm.unconstrained_upper_bound : + &context.refinement.fm.unconstrained_upper_bound))->value_name("")->default_value(0.0), + "Still use upper limit for imbalance with unconstrained FM, expressed as a factor of the max part weight (default = 0 = no limit).") + ((initial_partitioning ? "i-r-fm-unconstrained-rounds" : "r-fm-unconstrained-rounds"), + po::value((initial_partitioning ? &context.initial_partitioning.refinement.fm.unconstrained_rounds : + &context.refinement.fm.unconstrained_rounds))->value_name("")->default_value(8), + "Unconstrained FM: Number of rounds that are unconstrained.") + ((initial_partitioning ? 
"i-r-fm-imbalance-penalty-min" : "r-fm-imbalance-penalty-min"), + po::value((initial_partitioning ? &context.initial_partitioning.refinement.fm.imbalance_penalty_min : + &context.refinement.fm.imbalance_penalty_min))->value_name("")->default_value(0.2), + "Unconstrained FM: Minimum (starting) penalty factor.") + ((initial_partitioning ? "i-r-fm-imbalance-penalty-max" : "r-fm-imbalance-penalty-max"), + po::value((initial_partitioning ? &context.initial_partitioning.refinement.fm.imbalance_penalty_max : + &context.refinement.fm.imbalance_penalty_max))->value_name("")->default_value(1.0), + "Unconstrained FM: Maximum (final) penalty factor.") + ((initial_partitioning ? "i-r-fm-unconstrained-upper-bound-min" : "r-fm-unconstrained-upper-bound-min"), + po::value((initial_partitioning ? &context.initial_partitioning.refinement.fm.unconstrained_upper_bound_min : + &context.refinement.fm.unconstrained_upper_bound_min))->value_name("")->default_value(0.0), + "Unconstrained FM: Minimum (final) upper bound (default = 0 = equal to start).") + ((initial_partitioning ? "i-r-fm-activate-unconstrained-dynamically" : "r-fm-activate-unconstrained-dynamically"), + po::value((initial_partitioning ? &context.initial_partitioning.refinement.fm.activate_unconstrained_dynamically : + &context.refinement.fm.activate_unconstrained_dynamically))->value_name("")->default_value(false), + "Decide dynamically (based on first two rounds) whether to use unconstrained FM.") + ((initial_partitioning ? "i-r-fm-penalty-for-activation-test" : "r-fm-penalty-for-activation-test"), + po::value((initial_partitioning ? &context.initial_partitioning.refinement.fm.penalty_for_activation_test : + &context.refinement.fm.penalty_for_activation_test))->value_name("")->default_value(0.5), + "If unconstrained FM is activated dynamically, determines the penalty factor used for the test round.") + ((initial_partitioning ? "i-r-fm-unconstrained-min-improvement" : "r-fm-unconstrained-min-improvement"), + po::value((initial_partitioning ? &context.initial_partitioning.refinement.fm.unconstrained_min_improvement : + &context.refinement.fm.unconstrained_min_improvement))->value_name("")->default_value(-1.0), + "Switch to constrained FM if relative improvement of unconstrained FM is below this treshold.") ((initial_partitioning ? "i-r-fm-obey-minimal-parallelism" : "r-fm-obey-minimal-parallelism"), po::value( (initial_partitioning ? 
&context.initial_partitioning.refinement.fm.obey_minimal_parallelism : @@ -476,6 +523,7 @@ namespace mt_kahypar { })->default_value("do_nothing"), "Rebalancer Algorithm:\n" "- simple_rebalancer\n" + "- advanced_rebalancer\n" "- do_nothing"); return options; } diff --git a/mt-kahypar/io/sql_plottools_serializer.cpp b/mt-kahypar/io/sql_plottools_serializer.cpp index bf3813d5f..fe1b2f8d1 100644 --- a/mt-kahypar/io/sql_plottools_serializer.cpp +++ b/mt-kahypar/io/sql_plottools_serializer.cpp @@ -113,6 +113,8 @@ std::string serialize(const PartitionedHypergraph& hypergraph, << " lp_algorithm=" << context.refinement.label_propagation.algorithm << " lp_maximum_iterations=" << context.refinement.label_propagation.maximum_iterations << " lp_rebalancing=" << std::boolalpha << context.refinement.label_propagation.rebalancing + << " lp_unconstrained=" << std::boolalpha << context.refinement.label_propagation.unconstrained + << " lp_relative_improvement_threshold=" << context.refinement.label_propagation.relative_improvement_threshold << " lp_hyperedge_size_activation_threshold=" << context.refinement.label_propagation.hyperedge_size_activation_threshold << " sync_lp_num_sub_rounds_sync_lp=" << context.refinement.deterministic_refinement.num_sub_rounds_sync_lp << " sync_lp_use_active_node_set=" << context.refinement.deterministic_refinement.use_active_node_set @@ -130,6 +132,15 @@ std::string serialize(const PartitionedHypergraph& hypergraph, << " fm_time_limit_factor=" << context.refinement.fm.time_limit_factor << " fm_obey_minimal_parallelism=" << std::boolalpha << context.refinement.fm.obey_minimal_parallelism << " fm_shuffle=" << std::boolalpha << context.refinement.fm.shuffle + << " fm_unconstrained_rounds=" << context.refinement.fm.unconstrained_rounds + << " fm_treshold_border_node_inclusion=" << context.refinement.fm.treshold_border_node_inclusion + << " fm_unconstrained_min_improvement=" << context.refinement.fm.unconstrained_min_improvement + << " fm_unconstrained_upper_bound=" << context.refinement.fm.unconstrained_upper_bound + << " fm_unconstrained_upper_bound_min=" << context.refinement.fm.unconstrained_upper_bound_min + << " fm_imbalance_penalty_min=" << context.refinement.fm.imbalance_penalty_min + << " fm_imbalance_penalty_max=" << context.refinement.fm.imbalance_penalty_max + << " fm_activate_unconstrained_dynamically=" << std::boolalpha << context.refinement.fm.activate_unconstrained_dynamically + << " fm_penalty_for_activation_test=" << context.refinement.fm.penalty_for_activation_test << " global_fm_use_global_fm=" << std::boolalpha << context.refinement.global_fm.use_global_fm << " global_fm_refine_until_no_improvement=" << std::boolalpha << context.refinement.global_fm.refine_until_no_improvement << " global_fm_num_seed_nodes=" << context.refinement.global_fm.num_seed_nodes diff --git a/mt-kahypar/partition/coarsening/multilevel_uncoarsener.cpp b/mt-kahypar/partition/coarsening/multilevel_uncoarsener.cpp index 5d85ebe81..c65b6967d 100644 --- a/mt-kahypar/partition/coarsening/multilevel_uncoarsener.cpp +++ b/mt-kahypar/partition/coarsening/multilevel_uncoarsener.cpp @@ -32,8 +32,6 @@ #include "mt-kahypar/io/partitioning_output.h" #include "mt-kahypar/partition/refinement/i_refiner.h" #include "mt-kahypar/partition/metrics.h" -#include "mt-kahypar/partition/refinement/flows/scheduler.h" -#include "mt-kahypar/partition/refinement/rebalancing/rebalancer.h" #include "mt-kahypar/utils/stats.h" #include "mt-kahypar/utils/cast.h" diff --git 
a/mt-kahypar/partition/coarsening/nlevel_uncoarsener.cpp b/mt-kahypar/partition/coarsening/nlevel_uncoarsener.cpp index 2f98b4e1f..8ede126d0 100644 --- a/mt-kahypar/partition/coarsening/nlevel_uncoarsener.cpp +++ b/mt-kahypar/partition/coarsening/nlevel_uncoarsener.cpp @@ -28,11 +28,7 @@ #include "mt-kahypar/partition/coarsening/nlevel_uncoarsener.h" -#include "kahypar-resources/datastructure/fast_reset_flag_array.h" - #include "mt-kahypar/definitions.h" -#include "mt-kahypar/partition/refinement/flows/scheduler.h" -#include "mt-kahypar/partition/refinement/rebalancing/rebalancer.h" #include "mt-kahypar/utils/progress_bar.h" #include "mt-kahypar/io/partitioning_output.h" #include "mt-kahypar/utils/utilities.h" diff --git a/mt-kahypar/partition/coarsening/nlevel_uncoarsener.h b/mt-kahypar/partition/coarsening/nlevel_uncoarsener.h index cc98ef2ec..2f3512d70 100644 --- a/mt-kahypar/partition/coarsening/nlevel_uncoarsener.h +++ b/mt-kahypar/partition/coarsening/nlevel_uncoarsener.h @@ -28,6 +28,8 @@ #pragma once +#include "kahypar-resources/datastructure/fast_reset_flag_array.h" + #include "mt-kahypar/partition/context.h" #include "mt-kahypar/partition/coarsening/i_uncoarsener.h" #include "mt-kahypar/partition/coarsening/uncoarsener_base.h" diff --git a/mt-kahypar/partition/coarsening/uncoarsener_base.h b/mt-kahypar/partition/coarsening/uncoarsener_base.h index 5500283a2..d1b282b35 100644 --- a/mt-kahypar/partition/coarsening/uncoarsener_base.h +++ b/mt-kahypar/partition/coarsening/uncoarsener_base.h @@ -83,7 +83,7 @@ class UncoarsenerBase { std::unique_ptr _label_propagation; std::unique_ptr _fm; std::unique_ptr _flows; - std::unique_ptr _rebalancer; + std::unique_ptr _rebalancer; protected: @@ -119,17 +119,18 @@ class UncoarsenerBase { void initializeRefinementAlgorithms() { _gain_cache = GainCachePtr::constructGainCache(_context); + // refinement algorithms require access to the rebalancer + _rebalancer = RebalancerFactory::getInstance().createObject( + _context.refinement.rebalancer, _hg.initialNumNodes(), _context, _gain_cache); _label_propagation = LabelPropagationFactory::getInstance().createObject( _context.refinement.label_propagation.algorithm, - _hg.initialNumNodes(), _hg.initialNumEdges(), _context, _gain_cache); + _hg.initialNumNodes(), _hg.initialNumEdges(), _context, _gain_cache, *_rebalancer); _fm = FMFactory::getInstance().createObject( _context.refinement.fm.algorithm, - _hg.initialNumNodes(), _hg.initialNumEdges(), _context, _gain_cache); + _hg.initialNumNodes(), _hg.initialNumEdges(), _context, _gain_cache, *_rebalancer); _flows = FlowSchedulerFactory::getInstance().createObject( _context.refinement.flows.algorithm, _hg.initialNumNodes(), _hg.initialNumEdges(), _context, _gain_cache); - _rebalancer = RebalancerFactory::getInstance().createObject( - _context.refinement.rebalancer, _context); } }; } diff --git a/mt-kahypar/partition/context.cpp b/mt-kahypar/partition/context.cpp index 5db1d965e..5af1e05f2 100644 --- a/mt-kahypar/partition/context.cpp +++ b/mt-kahypar/partition/context.cpp @@ -538,26 +538,33 @@ namespace mt_kahypar { initial_partitioning.refinement.flows.algorithm = FlowAlgorithm::do_nothing; // refinement - refinement.rebalancer = RebalancingAlgorithm::simple_rebalancer; + refinement.rebalancer = RebalancingAlgorithm::advanced_rebalancer; refinement.refine_until_no_improvement = false; // refinement -> label propagation refinement.label_propagation.algorithm = LabelPropagationAlgorithm::label_propagation; + refinement.label_propagation.unconstrained = true; 
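Editor's note on the default-preset values being set in this hunk: the new unconstrained knobs are consumed by the unconstrained FM refiner. A hedged sketch of how the penalty parameters might be combined per round follows; only the Context field names come from this diff, and the linear schedule is an illustrative assumption (the real logic lives in the unconstrained FM strategy, not shown here).

// Editorial sketch (not part of the diff).
#include <algorithm>
#include <cstddef>
// Context is mt-kahypar/partition/context.h; the fm.* fields are added there
// in this diff. The interpolation below is an assumption for illustration.
double imbalance_penalty_for_round(const Context& context, std::size_t round) {
  const auto& fm = context.refinement.fm;
  if (round >= fm.unconstrained_rounds) {
    return fm.imbalance_penalty_max;  // later rounds use the final penalty
  }
  const double t = static_cast<double>(round) /
      static_cast<double>(std::max<std::size_t>(fm.unconstrained_rounds - 1, 1));
  return fm.imbalance_penalty_min +
         t * (fm.imbalance_penalty_max - fm.imbalance_penalty_min);
}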
refinement.label_propagation.maximum_iterations = 5; - refinement.label_propagation.rebalancing = true; + refinement.label_propagation.rebalancing = false; refinement.label_propagation.hyperedge_size_activation_threshold = 100; + refinement.label_propagation.relative_improvement_threshold = 0.001; // refinement -> fm - refinement.fm.algorithm = FMAlgorithm::kway_fm; + refinement.fm.algorithm = FMAlgorithm::unconstrained_fm; refinement.fm.multitry_rounds = 10; + refinement.fm.unconstrained_rounds = 8; refinement.fm.perform_moves_global = false; refinement.fm.rollback_parallel = true; - refinement.fm.rollback_balance_violation_factor = 1.25; + refinement.fm.rollback_balance_violation_factor = 1.0; + refinement.fm.treshold_border_node_inclusion = 0.7; + refinement.fm.imbalance_penalty_min = 0.2; + refinement.fm.imbalance_penalty_max = 1.0; refinement.fm.num_seed_nodes = 25; refinement.fm.obey_minimal_parallelism = true; refinement.fm.release_nodes = true; refinement.fm.time_limit_factor = 0.25; refinement.fm.min_improvement = -1; + refinement.fm.unconstrained_min_improvement = 0.002; refinement.fm.iter_moves_on_recalc = true; // refinement -> flows @@ -574,6 +581,9 @@ namespace mt_kahypar { refinement.refine_until_no_improvement = true; refinement.relative_improvement_threshold = 0.0025; + // refinement -> label propagation + refinement.label_propagation.rebalancing = true; + // refinement -> flows; refinement.flows.algorithm = FlowAlgorithm::flow_cutter; refinement.flows.alpha = 16; @@ -660,7 +670,7 @@ namespace mt_kahypar { initial_partitioning.refinement.flows.algorithm = FlowAlgorithm::do_nothing; // refinement - refinement.rebalancer = RebalancingAlgorithm::simple_rebalancer; + refinement.rebalancer = RebalancingAlgorithm::advanced_rebalancer; refinement.refine_until_no_improvement = false; // refinement -> label propagation @@ -762,7 +772,7 @@ namespace mt_kahypar { initial_partitioning.refinement.global_fm.use_global_fm = false; // refinement - refinement.rebalancer = RebalancingAlgorithm::simple_rebalancer; + refinement.rebalancer = RebalancingAlgorithm::advanced_rebalancer; refinement.refine_until_no_improvement = true; refinement.max_batch_size = 1000; refinement.min_border_vertices_per_thread = 50; @@ -891,7 +901,7 @@ namespace mt_kahypar { initial_partitioning.refinement.flows.algorithm = FlowAlgorithm::do_nothing; // refinement - refinement.rebalancer = RebalancingAlgorithm::simple_rebalancer; + refinement.rebalancer = RebalancingAlgorithm::advanced_rebalancer; refinement.refine_until_no_improvement = false; // refinement -> label propagation diff --git a/mt-kahypar/partition/context.h b/mt-kahypar/partition/context.h index 45712bcab..c6b59f98a 100644 --- a/mt-kahypar/partition/context.h +++ b/mt-kahypar/partition/context.h @@ -135,7 +135,9 @@ struct LabelPropagationParameters { size_t maximum_iterations = 1; bool rebalancing = true; bool execute_sequential = false; + bool unconstrained = false; size_t hyperedge_size_activation_threshold = std::numeric_limits::max(); + double relative_improvement_threshold = -1.0; }; std::ostream & operator<< (std::ostream& str, const LabelPropagationParameters& params); @@ -156,6 +158,19 @@ struct FMParameters { bool shuffle = true; mutable bool obey_minimal_parallelism = false; bool release_nodes = true; + + double treshold_border_node_inclusion = 0.75; + double unconstrained_upper_bound = 0.0; + + // unconstrained + size_t unconstrained_rounds = 1; + double imbalance_penalty_min = 0.2; + double imbalance_penalty_max = 1.0; + double 
unconstrained_upper_bound_min = 0.0; + + bool activate_unconstrained_dynamically = false; + double penalty_for_activation_test = 0.5; + double unconstrained_min_improvement = -1.0; }; std::ostream& operator<<(std::ostream& out, const FMParameters& params); diff --git a/mt-kahypar/partition/context_enum_classes.cpp b/mt-kahypar/partition/context_enum_classes.cpp index cdaac1154..913dab8b7 100644 --- a/mt-kahypar/partition/context_enum_classes.cpp +++ b/mt-kahypar/partition/context_enum_classes.cpp @@ -229,6 +229,7 @@ namespace mt_kahypar { std::ostream & operator<< (std::ostream& os, const FMAlgorithm& algo) { switch (algo) { case FMAlgorithm::kway_fm: return os << "kway_fm"; + case FMAlgorithm::unconstrained_fm: return os << "unconstrained_fm"; case FMAlgorithm::do_nothing: return os << "fm_do_nothing"; // omit default case to trigger compiler warning for missing cases } @@ -249,6 +250,7 @@ namespace mt_kahypar { std::ostream & operator<< (std::ostream& os, const RebalancingAlgorithm& algo) { switch (algo) { case RebalancingAlgorithm::simple_rebalancer: return os << "simple_rebalancer"; + case RebalancingAlgorithm::advanced_rebalancer: return os << "advanced_rebalancer"; case RebalancingAlgorithm::do_nothing: return os << "do_nothing"; // omit default case to trigger compiler warning for missing cases } @@ -445,6 +447,8 @@ namespace mt_kahypar { FMAlgorithm fmAlgorithmFromString(const std::string& type) { if (type == "kway_fm") { return FMAlgorithm::kway_fm; + } else if (type == "unconstrained_fm") { + return FMAlgorithm::unconstrained_fm; } else if (type == "do_nothing") { return FMAlgorithm::do_nothing; } @@ -465,6 +469,8 @@ namespace mt_kahypar { RebalancingAlgorithm rebalancingAlgorithmFromString(const std::string& type) { if (type == "simple_rebalancer") { return RebalancingAlgorithm::simple_rebalancer; + } else if (type == "advanced_rebalancer") { + return RebalancingAlgorithm::advanced_rebalancer; } else if (type == "do_nothing") { return RebalancingAlgorithm::do_nothing; } @@ -491,4 +497,4 @@ namespace mt_kahypar { throw InvalidParameterException("Illegal option: " + policy); return SteinerTreeFlowValuePolicy::UNDEFINED; } -} \ No newline at end of file +} diff --git a/mt-kahypar/partition/context_enum_classes.h b/mt-kahypar/partition/context_enum_classes.h index 3d7fa7be7..3d4aefdda 100644 --- a/mt-kahypar/partition/context_enum_classes.h +++ b/mt-kahypar/partition/context_enum_classes.h @@ -29,6 +29,7 @@ #include #include +#include #include "include/libmtkahypartypes.h" #include "mt-kahypar/macros.h" @@ -154,6 +155,7 @@ enum class LabelPropagationAlgorithm : uint8_t { enum class FMAlgorithm : uint8_t { kway_fm, + unconstrained_fm, do_nothing }; @@ -165,6 +167,7 @@ enum class FlowAlgorithm : uint8_t { enum class RebalancingAlgorithm : uint8_t { simple_rebalancer, + advanced_rebalancer, do_nothing }; diff --git a/mt-kahypar/partition/deep_multilevel.cpp b/mt-kahypar/partition/deep_multilevel.cpp index 286d93c3d..d5c684277 100644 --- a/mt-kahypar/partition/deep_multilevel.cpp +++ b/mt-kahypar/partition/deep_multilevel.cpp @@ -592,7 +592,7 @@ void bipartition_each_block(typename TypeTraits::PartitionedHypergraph& partitio return true; }(), "Cut of extracted blocks does not sum up to current objective"); - if ( gain_cache.isInitialized() ) { + if ( GainCache::invalidates_entries && gain_cache.isInitialized() ) { partitioned_hg.doParallelForAllNodes([&](const HypernodeID& hn) { gain_cache.recomputeInvalidTerms(partitioned_hg, hn); }); diff --git a/mt-kahypar/partition/factories.h 
b/mt-kahypar/partition/factories.h index 03ae8ba9a..a39026dad 100644 --- a/mt-kahypar/partition/factories.h +++ b/mt-kahypar/partition/factories.h @@ -42,14 +42,20 @@ #include "mt-kahypar/partition/coarsening/policies/rating_heavy_node_penalty_policy.h" #include "mt-kahypar/partition/context.h" #include "mt-kahypar/partition/refinement/i_refiner.h" +#include "mt-kahypar/partition/refinement/i_rebalancer.h" #include "mt-kahypar/partition/refinement/flows/i_flow_refiner.h" #include "mt-kahypar/partition/refinement/label_propagation/label_propagation_refiner.h" #include "mt-kahypar/partition/refinement/deterministic/deterministic_label_propagation.h" +#include "mt-kahypar/partition/refinement/fm/fm_commons.h" #include "mt-kahypar/partition/refinement/fm/multitry_kway_fm.h" +#include "mt-kahypar/partition/refinement/fm/strategies/i_fm_strategy.h" +#include "mt-kahypar/partition/refinement/fm/strategies/gain_cache_strategy.h" +#include "mt-kahypar/partition/refinement/fm/strategies/unconstrained_strategy.h" #include "mt-kahypar/partition/refinement/gains/gain_definitions.h" #include "mt-kahypar/partition/refinement/flows/scheduler.h" #include "mt-kahypar/partition/refinement/flows/flow_refiner.h" -#include "mt-kahypar/partition/refinement/rebalancing/rebalancer.h" +#include "mt-kahypar/partition/refinement/rebalancing/simple_rebalancer.h" +#include "mt-kahypar/partition/refinement/rebalancing/advanced_rebalancer.h" namespace mt_kahypar { @@ -77,7 +83,7 @@ using NLevelCoarsenerDispatcher = kahypar::meta::StaticMultiDispatchFactory; + IRefiner* (*)(HypernodeID, HyperedgeID, const Context&, gain_cache_t, IRebalancer&)>; using LabelPropagationDispatcher = kahypar::meta::StaticMultiDispatchFactory< LabelPropagationRefiner, @@ -90,12 +96,26 @@ using DeterministicLabelPropagationDispatcher = kahypar::meta::StaticMultiDispat kahypar::meta::Typelist>; using FMFactory = kahypar::meta::Factory; + IRefiner* (*)(HypernodeID, HyperedgeID, const Context&, gain_cache_t, IRebalancer&)>; -using FMDispatcher = kahypar::meta::StaticMultiDispatchFactory< - MultiTryKWayFM, - IRefiner, - kahypar::meta::Typelist>; +using DefaultFMDispatcher = kahypar::meta::StaticMultiDispatchFactory< + MultiTryKWayFM, + IRefiner, + kahypar::meta::Typelist>; + +using UnconstrainedFMDispatcher = DefaultFMDispatcher; + +using FMStrategyFactory = kahypar::meta::Factory; + +using GainCacheFMStrategyDispatcher = kahypar::meta::StaticMultiDispatchFactory< + GainCacheStrategy, + IFMStrategy, + kahypar::meta::Typelist>; + +using UnconstrainedFMStrategyDispatcher = kahypar::meta::StaticMultiDispatchFactory< + UnconstrainedStrategy, + IFMStrategy, + kahypar::meta::Typelist>; using FlowSchedulerFactory = kahypar::meta::Factory; @@ -105,12 +125,17 @@ using FlowSchedulerDispatcher = kahypar::meta::StaticMultiDispatchFactory< IRefiner, kahypar::meta::Typelist>; -using RebalancerFactory = kahypar::meta::Factory; +using RebalancerFactory = kahypar::meta::Factory; + +using SimpleRebalancerDispatcher = kahypar::meta::StaticMultiDispatchFactory< + SimpleRebalancer, + IRebalancer, + kahypar::meta::Typelist>; -using RebalancerDispatcher = kahypar::meta::StaticMultiDispatchFactory< - Rebalancer, - IRefiner, - kahypar::meta::Typelist>; +using AdvancedRebalancerDispatcher = kahypar::meta::StaticMultiDispatchFactory< + AdvancedRebalancer, + IRebalancer, + kahypar::meta::Typelist>; using FlowRefinementFactory = kahypar::meta::Factory; diff --git a/mt-kahypar/partition/initial_partitioning/greedy_initial_partitioner.h 
b/mt-kahypar/partition/initial_partitioning/greedy_initial_partitioner.h
index 32e015a3b..e870026d2 100644
--- a/mt-kahypar/partition/initial_partitioning/greedy_initial_partitioner.h
+++ b/mt-kahypar/partition/initial_partitioning/greedy_initial_partitioner.h
@@ -39,8 +39,8 @@ class GreedyInitialPartitioner : public IInitialPartitioner {
   using PartitionedHypergraph = typename TypeTraits::PartitionedHypergraph;
   using GainComputationPolicy = GainPolicyT;
   using PQSelectionPolicy = PQSelectionPolicyT;
-  using DeltaFunction = std::function<void (const SyncronizedEdgeUpdate&)>;
-  #define NOOP_FUNC [] (const SyncronizedEdgeUpdate&) { }
+  using DeltaFunction = std::function<void (const SynchronizedEdgeUpdate&)>;
+  #define NOOP_FUNC [] (const SynchronizedEdgeUpdate&) { }
   static constexpr bool debug = false;
   static constexpr bool enable_heavy_assert = false;
diff --git a/mt-kahypar/partition/initial_partitioning/initial_partitioning_data_container.h b/mt-kahypar/partition/initial_partitioning/initial_partitioning_data_container.h
index e7ffdeee5..ea76a660c 100644
--- a/mt-kahypar/partition/initial_partitioning/initial_partitioning_data_container.h
+++ b/mt-kahypar/partition/initial_partitioning/initial_partitioning_data_container.h
@@ -204,6 +204,7 @@ class InitialPartitioningDataContainer {
               std::numeric_limits::max(),
               std::numeric_limits::max()),
     _gain_cache(GainCachePtr::constructGainCache(context)),
+    _rebalancer(nullptr),
     _label_propagation(nullptr),
     _twoway_fm(nullptr),
     _stats() {
@@ -217,9 +218,11 @@ class InitialPartitioningDataContainer {
       _twoway_fm = std::make_unique<SequentialTwoWayFmRefiner<TypeTraits>>(_partitioned_hypergraph, _context);
     } else if ( _context.refinement.label_propagation.algorithm != LabelPropagationAlgorithm::do_nothing ) {
       // In case of a direct-kway initial partition we instantiate the LP refiner
+      _rebalancer = RebalancerFactory::getInstance().createObject(
+        _context.refinement.rebalancer, hypergraph.initialNumNodes(), _context, _gain_cache);
       _label_propagation = LabelPropagationFactory::getInstance().createObject(
         _context.refinement.label_propagation.algorithm,
-        hypergraph.initialNumNodes(), hypergraph.initialNumEdges(), _context, _gain_cache);
+        hypergraph.initialNumNodes(), hypergraph.initialNumEdges(), _context, _gain_cache, *_rebalancer);
     }
   }
@@ -344,6 +347,7 @@ class InitialPartitioningDataContainer {
   parallel::scalable_vector<PartitionID> _partition;
   PartitioningResult _result;
   gain_cache_t _gain_cache;
+  std::unique_ptr<IRebalancer> _rebalancer;
   std::unique_ptr<IRefiner> _label_propagation;
   std::unique_ptr<SequentialTwoWayFmRefiner<TypeTraits>> _twoway_fm;
   parallel::scalable_vector _stats;
diff --git a/mt-kahypar/partition/initial_partitioning/label_propagation_initial_partitioner.h b/mt-kahypar/partition/initial_partitioning/label_propagation_initial_partitioner.h
index 7e6ddeb68..3dc05acf1 100644
--- a/mt-kahypar/partition/initial_partitioning/label_propagation_initial_partitioner.h
+++ b/mt-kahypar/partition/initial_partitioning/label_propagation_initial_partitioner.h
@@ -42,8 +42,8 @@ template
 class LabelPropagationInitialPartitioner : public IInitialPartitioner {
   using PartitionedHypergraph = typename TypeTraits::PartitionedHypergraph;
-  using DeltaFunction = std::function<void (const SyncronizedEdgeUpdate&)>;
-  #define NOOP_FUNC [] (const SyncronizedEdgeUpdate&) { }
+  using DeltaFunction = std::function<void (const SynchronizedEdgeUpdate&)>;
+  #define NOOP_FUNC [] (const SynchronizedEdgeUpdate&) { }
   static constexpr bool debug = false;
   static constexpr bool enable_heavy_assert = false;
diff --git a/mt-kahypar/partition/refinement/CMakeLists.txt b/mt-kahypar/partition/refinement/CMakeLists.txt
index 08564bc97..4eb3ef92b 100644
--- a/mt-kahypar/partition/refinement/CMakeLists.txt
+++
b/mt-kahypar/partition/refinement/CMakeLists.txt @@ -1,10 +1,12 @@ set(RefinementSources + fm/fm_commons.cpp fm/multitry_kway_fm.cpp fm/localized_kway_fm_core.cpp fm/global_rollback.cpp fm/sequential_twoway_fm_refiner.cpp label_propagation/label_propagation_refiner.cpp - rebalancing/rebalancer.cpp + rebalancing/simple_rebalancer.cpp + rebalancing/advanced_rebalancer.cpp deterministic/deterministic_label_propagation.cpp flows/refiner_adapter.cpp flows/problem_construction.cpp @@ -69,4 +71,4 @@ foreach(modtarget IN LISTS PARTITIONING_SUITE_TARGETS) target_sources(mtkahypar PRIVATE ${SteinerTreeGraphSources}) target_sources(mtkahypar_python PRIVATE ${SteinerTreeGraphSources}) endif() -endforeach() \ No newline at end of file +endforeach() diff --git a/mt-kahypar/partition/refinement/deterministic/deterministic_label_propagation.cpp b/mt-kahypar/partition/refinement/deterministic/deterministic_label_propagation.cpp index 45d4c058f..336464a3c 100644 --- a/mt-kahypar/partition/refinement/deterministic/deterministic_label_propagation.cpp +++ b/mt-kahypar/partition/refinement/deterministic/deterministic_label_propagation.cpp @@ -125,7 +125,7 @@ namespace mt_kahypar { Gain DeterministicLabelPropagationRefiner::performMoveWithAttributedGain( PartitionedHypergraph& phg, const Move& m, bool activate_neighbors) { Gain attributed_gain = 0; - auto objective_delta = [&](const SyncronizedEdgeUpdate& sync_update) { + auto objective_delta = [&](const SynchronizedEdgeUpdate& sync_update) { attributed_gain -= Km1AttributedGains::gain(sync_update); }; const bool was_moved = phg.changeNodePart(m.node, m.from, m.to, objective_delta); @@ -561,4 +561,4 @@ namespace mt_kahypar { INSTANTIATE_CLASS_WITH_TYPE_TRAITS(DeterministicLabelPropagationRefiner) -} // namespace mt_kahypar \ No newline at end of file +} // namespace mt_kahypar diff --git a/mt-kahypar/partition/refinement/deterministic/deterministic_label_propagation.h b/mt-kahypar/partition/refinement/deterministic/deterministic_label_propagation.h index 9f002c837..32e99ec85 100644 --- a/mt-kahypar/partition/refinement/deterministic/deterministic_label_propagation.h +++ b/mt-kahypar/partition/refinement/deterministic/deterministic_label_propagation.h @@ -30,6 +30,7 @@ #include "mt-kahypar/datastructures/buffered_vector.h" #include "mt-kahypar/partition/context.h" #include "mt-kahypar/partition/refinement/i_refiner.h" +#include "mt-kahypar/partition/refinement/i_rebalancer.h" #include "mt-kahypar/partition/refinement/fm/strategies/km1_gains.h" #include "mt-kahypar/partition/refinement/gains/gain_cache_ptr.h" @@ -46,7 +47,13 @@ class DeterministicLabelPropagationRefiner final : public IRefiner { explicit DeterministicLabelPropagationRefiner(const HypernodeID num_hypernodes, const HyperedgeID num_hyperedges, const Context& context, - gain_cache_t /* only relevant for other refiners */) : + gain_cache_t /* only relevant for other refiners */, + IRebalancer& /* only relevant for other refiners */) : + DeterministicLabelPropagationRefiner(num_hypernodes, num_hyperedges, context) { } + + explicit DeterministicLabelPropagationRefiner(const HypernodeID num_hypernodes, + const HyperedgeID num_hyperedges, + const Context& context) : context(context), compute_gains(context), moves(num_hypernodes), @@ -62,12 +69,6 @@ class DeterministicLabelPropagationRefiner final : public IRefiner { } } - explicit DeterministicLabelPropagationRefiner(const HypernodeID num_hypernodes, - const HyperedgeID num_hyperedges, - const Context& context) : - 
DeterministicLabelPropagationRefiner(num_hypernodes, num_hyperedges, context, - gain_cache_t { nullptr, GainPolicy::none }) { } - private: static constexpr bool debug = false; diff --git a/mt-kahypar/partition/refinement/do_nothing_refiner.h b/mt-kahypar/partition/refinement/do_nothing_refiner.h index d72ab9c71..e86a77cf0 100644 --- a/mt-kahypar/partition/refinement/do_nothing_refiner.h +++ b/mt-kahypar/partition/refinement/do_nothing_refiner.h @@ -34,7 +34,7 @@ #include "mt-kahypar/partition/refinement/i_refiner.h" namespace mt_kahypar { -class DoNothingRefiner final : public IRefiner { +class DoNothingRefiner final : public IRebalancer { public: template explicit DoNothingRefiner(Args&& ...) noexcept { } @@ -52,5 +52,21 @@ class DoNothingRefiner final : public IRefiner { const double) override final { return false; } + + virtual bool refineAndOutputMovesImpl(mt_kahypar_partitioned_hypergraph_t&, + const parallel::scalable_vector&, + parallel::scalable_vector>&, + Metrics&, + const double) override final { + return false; + } + + virtual bool refineAndOutputMovesLinearImpl(mt_kahypar_partitioned_hypergraph_t&, + const parallel::scalable_vector&, + parallel::scalable_vector&, + Metrics&, + const double) override final { + return false; + } }; } // namespace kahypar diff --git a/mt-kahypar/partition/refinement/flows/scheduler.cpp b/mt-kahypar/partition/refinement/flows/scheduler.cpp index d291bb467..57f7006aa 100644 --- a/mt-kahypar/partition/refinement/flows/scheduler.cpp +++ b/mt-kahypar/partition/refinement/flows/scheduler.cpp @@ -292,7 +292,7 @@ HyperedgeWeight FlowRefinementScheduler::applyMoves(const HyperedgeWeight improvement = 0; vec new_cut_hes; - auto delta_func = [&](const SyncronizedEdgeUpdate& sync_update) { + auto delta_func = [&](const SynchronizedEdgeUpdate& sync_update) { improvement -= AttributedGains::gain(sync_update); // Collect hyperedges with new blocks in its connectivity set @@ -396,4 +396,4 @@ namespace { INSTANTIATE_CLASS_WITH_TYPE_TRAITS_AND_GAIN_TYPES(FLOW_REFINEMENT_SCHEDULER) -} \ No newline at end of file +} diff --git a/mt-kahypar/partition/refinement/fm/fm_commons.cpp b/mt-kahypar/partition/refinement/fm/fm_commons.cpp new file mode 100644 index 000000000..ca29e62e5 --- /dev/null +++ b/mt-kahypar/partition/refinement/fm/fm_commons.cpp @@ -0,0 +1,257 @@ +/******************************************************************************* + * MIT License + * + * This file is part of Mt-KaHyPar. + * + * Copyright (C) 2023 Nikolai Maas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ ******************************************************************************/
+
+#include <algorithm>
+#include <cmath>
+
+#include "mt-kahypar/partition/refinement/gains/gain_definitions.h"
+#include "mt-kahypar/datastructures/sparse_map.h"
+#include "mt-kahypar/partition/refinement/fm/fm_commons.h"
+
+
+namespace mt_kahypar {
+  template<typename L, typename R>
+  MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE
+  uint64_t pairToKey(L left, R right) {
+    ASSERT(left >= 0 && static_cast<uint64_t>(left) <= std::numeric_limits<uint32_t>::max());
+    ASSERT(right >= 0 && static_cast<uint64_t>(right) <= std::numeric_limits<uint32_t>::max());
+    return (static_cast<uint64_t>(left) << 32) + static_cast<uint64_t>(right);
+  }
+
+  MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE
+  std::pair<uint32_t, uint32_t> keyToPair(uint64_t key) {
+    return {key >> 32, key & std::numeric_limits<uint32_t>::max()};
+  }
+
+  Gain UnconstrainedFMData::estimatePenaltyForImbalancedMove(PartitionID to,
+                                                             HypernodeWeight initial_imbalance,
+                                                             HypernodeWeight moved_weight) const {
+    ASSERT(initialized && to != kInvalidPartition);
+    // TODO test whether it is faster to save the previous position locally
+    BucketID bucketId = 0;
+    while (bucketId < NUM_BUCKETS
+           && initial_imbalance + moved_weight > bucket_weights[indexForBucket(to, bucketId)]) {
+      ++bucketId;
+    }
+    if (bucketId < NUM_BUCKETS) {
+      return std::ceil(moved_weight * gainPerWeightForBucket(bucketId));
+    }
+
+    // fallback case (it should be very unlikely that fallback_bucket_weights contains elements)
+    while (bucketId < NUM_BUCKETS + fallback_bucket_weights[to].size()
+           && initial_imbalance + moved_weight > fallback_bucket_weights[to][bucketId - NUM_BUCKETS]) {
+      ++bucketId;
+    }
+    return (bucketId == NUM_BUCKETS + fallback_bucket_weights[to].size()) ?
+      std::numeric_limits<Gain>::max() : std::ceil(moved_weight * gainPerWeightForBucket(bucketId));
+  }
+
+
+  template<typename TypeTraits, typename GainTypes>
+  void UnconstrainedFMData::InitializationHelper<TypeTraits, GainTypes>::initialize(
+    UnconstrainedFMData& data, const Context& context,
+    const typename TypeTraits::PartitionedHypergraph& phg,
+    const typename GainTypes::GainCache& gain_cache) {
+    auto get_node_stats = [&](const HypernodeID hypernode) {
+      // TODO(maas): we might want to save the total incident weight in the hypergraph data structure
+      // at some point in the future
+      HyperedgeWeight total_incident_weight = 0;
+      for (const HyperedgeID& he : phg.incidentEdges(hypernode)) {
+        total_incident_weight += phg.edgeWeight(he);
+      }
+      HyperedgeWeight internal_weight = gain_cache.penaltyTerm(hypernode, phg.partID(hypernode));
+      ASSERT(internal_weight == gain_cache.recomputePenaltyTerm(phg, hypernode));
+      return std::make_pair(internal_weight, total_incident_weight);
+    };
+
+    const double bn_treshold = context.refinement.fm.treshold_border_node_inclusion;
+    tbb::enumerable_thread_specific<HypernodeWeight> local_considered_weight(0);
+    tbb::enumerable_thread_specific<HypernodeWeight> local_inserted_weight(0);
+    // collect nodes and fill buckets
+    phg.doParallelForAllNodes([&](const HypernodeID hn) {
+      const HypernodeWeight hn_weight = phg.nodeWeight(hn);
+      if (hn_weight == 0) return;
+
+      auto [internal_weight, total_incident_weight] = get_node_stats(hn);
+      if (static_cast<double>(internal_weight) >= bn_treshold * total_incident_weight) {
+        local_considered_weight.local() += hn_weight;
+        const BucketID bucketId = bucketForGainPerWeight(static_cast<double>(internal_weight) / hn_weight);
+        if (bucketId < NUM_BUCKETS) {
+          local_inserted_weight.local() += hn_weight;
+          auto& local_weights = data.local_bucket_weights.local();
+          local_weights[data.indexForBucket(phg.partID(hn), bucketId)] += hn_weight;
+          data.rebalancing_nodes.set(hn, true);
+        }
+      }
+    });
+
+    auto& bucket_weights = data.bucket_weights;
+    // for each block compute prefix sum of bucket weights, which is later used for estimating penalties
+    auto compute_prefix_sum_for_range = [&](size_t start, size_t end) {
+      for (const auto& local_weights: data.local_bucket_weights) {
+        ASSERT(bucket_weights.size() == local_weights.size());
+        for (size_t i = start; i < end; ++i) {
+          ASSERT(i < local_weights.size());
+          bucket_weights[i] += local_weights[i];
+        }
+      }
+      for (size_t i = start; i + 1 < end; ++i) {
+        bucket_weights[i + 1] += bucket_weights[i];
+      }
+    };
+    tbb::parallel_for(static_cast<PartitionID>(0), context.partition.k, [&](const PartitionID block) {
+      compute_prefix_sum_for_range(block * NUM_BUCKETS, (block + 1) * NUM_BUCKETS);
+    }, tbb::static_partitioner());
+
+    const HypernodeWeight considered_weight = local_considered_weight.combine(std::plus<>());
+    const HypernodeWeight inserted_weight = local_inserted_weight.combine(std::plus<>());
+    if (static_cast<double>(inserted_weight) / considered_weight < FALLBACK_TRESHOLD) {
+      // Use fallback if fixed number of buckets per block is not sufficient:
+      // For unweighted instances or instances with reasonable weight distribution this should almost never
+      // be necessary. We use more expensive precomputations (hash maps instead of arrays) here in order to
+      // keep memory overhead low and still get fast queries for estimating imbalance penalties.
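// ---------------------------------------------------------------------------
// Aside: a minimal, self-contained sketch of the bucket-based penalty estimate
// implemented above. All names in the sketch namespace are simplified
// stand-ins (not identifiers from this patch), and it assumes prefix_weights
// already holds the per-bucket prefix sums for the target block; the real
// implementation additionally handles the hash-map fallback for heavy tails.
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

namespace sketch {
  using Weight = int64_t;
  using Gain = int64_t;
  constexpr int NUM_BUCKETS = 16;
  constexpr double BUCKET_FACTOR = 1.5;

  // upper bound on gain per unit weight for nodes in bucket b
  // (mirrors gainPerWeightForBucket: 0, then 0.5, then powers of BUCKET_FACTOR)
  double gainPerWeight(int b) {
    return b > 1 ? std::pow(BUCKET_FACTOR, b - 2) : (b == 1 ? 0.5 : 0.0);
  }

  // Estimate the penalty for moving `moved_weight` into an already overloaded
  // block: scan for the first bucket whose prefix sum provides enough
  // rebalancing weight to compensate the imbalance, then charge the moved
  // weight with that bucket's gain-per-weight bound (a pessimistic estimate).
  Gain estimatePenalty(const std::vector<Weight>& prefix_weights,  // size >= NUM_BUCKETS
                       Weight initial_imbalance, Weight moved_weight) {
    for (int b = 0; b < NUM_BUCKETS; ++b) {
      if (initial_imbalance + moved_weight <= prefix_weights[b]) {
        return static_cast<Gain>(std::ceil(moved_weight * gainPerWeight(b)));
      }
    }
    return std::numeric_limits<Gain>::max();  // not enough rebalancing weight available
  }
}
// ---------------------------------------------------------------------------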
+      using SparseMap = ds::DynamicSparseMap<uint64_t, HypernodeWeight>;
+
+      // collect nodes into local hashmaps
+      tbb::enumerable_thread_specific<SparseMap> local_accumulator;
+      phg.doParallelForAllNodes([&](const HypernodeID hn) {
+        const HypernodeWeight hn_weight = phg.nodeWeight(hn);
+        if (hn_weight == 0) return;
+
+        auto [internal_weight, total_incident_weight] = get_node_stats(hn);
+        if (static_cast<double>(internal_weight) >= bn_treshold * total_incident_weight) {
+          const BucketID bucketId = bucketForGainPerWeight(static_cast<double>(internal_weight) / hn_weight);
+          if (bucketId >= NUM_BUCKETS) {
+            auto& map = local_accumulator.local();
+            // hash by block id and bucket id
+            map[pairToKey(phg.partID(hn), bucketId - NUM_BUCKETS)] += hn_weight;
+          }
+        }
+      });
+
+      vec<AtomicBucketID> max_rank_per_block(context.partition.k, AtomicBucketID(0));
+      vec<AtomicWeight> weight_per_block(context.partition.k, AtomicWeight(0));
+      // sort resulting values and determine ranks (so that larger values are ignored)
+      local_accumulator.combine_each([&](SparseMap& map) {
+        // we sort the values in the dense part (note that this invalidates the map)
+        std::sort(map.begin(), map.end(), [](const auto& l, const auto& r) {
+          return l.key < r.key;
+        });
+        // compute summed weight and rank/bucketID for each block
+        auto it = map.begin();
+        for (PartitionID p = 0; p < context.partition.k; ++p) {
+          const uint32_t block = static_cast<uint32_t>(p);
+
+          ASSERT(it == map.end() || keyToPair(it->key).first >= block);
+          HypernodeWeight total_weight = 0;
+          while (it < map.end() && keyToPair(it->key).first == block) {
+            total_weight += it->value;
+            ++it;
+          }
+          // scan backwards to find (approximately) an element with rank according to the fallback threshold
+          HypernodeWeight remaining_upper_weight = std::floor((1.0 - FALLBACK_TRESHOLD) * total_weight);
+          auto backwards_it = it;
+          while (total_weight > 0 && remaining_upper_weight >= (--backwards_it)->value) {
+            ASSERT(keyToPair(backwards_it->key).first == block);
+            remaining_upper_weight -= backwards_it->value;
+          }
+          // write result to global arrays
+          weight_per_block[block].fetch_add(total_weight, std::memory_order_relaxed);
+          const auto [curr_block, new_rank] = keyToPair(backwards_it->key);
+          if (curr_block == block) {
+            AtomicBucketID& global_rank = max_rank_per_block[block];
+            BucketID current = global_rank.load(std::memory_order_relaxed);
+            while (current < new_rank
+                   && !global_rank.compare_exchange_strong(current, new_rank, std::memory_order_relaxed)) { /* try again */ }
+          } else {
+            ASSERT(total_weight == 0);
+          }
+        }
+      });
+
+      auto& fallback_bucket_weights = data.fallback_bucket_weights;
+      // resize vectors accordingly, set rank to zero if no fallback is required for this block
+      tbb::parallel_for(static_cast<PartitionID>(0), context.partition.k, [&](const PartitionID block) {
+        const HypernodeWeight handled_weight = bucket_weights[data.indexForBucket(block, NUM_BUCKETS - 1)];
+        const HypernodeWeight fallback_weight = weight_per_block[block];
+        if (static_cast<double>(handled_weight) / (handled_weight + fallback_weight) >= FALLBACK_TRESHOLD) {
+          max_rank_per_block[block].store(0);
+        } else {
+          fallback_bucket_weights[block].resize(max_rank_per_block[block] + 1, 0);
+        }
+      }, tbb::static_partitioner());
+
+      // accumulate results in fallback_bucket_weights
+      local_accumulator.combine_each([&](SparseMap& map) {
+        auto it = map.begin();
+        for (PartitionID p = 0; p < context.partition.k; ++p) {
+          const uint32_t block = static_cast<uint32_t>(p);
+          const size_t upper_limit = fallback_bucket_weights[block].size();
+          ASSERT(upper_limit == 0 || upper_limit == max_rank_per_block[block] + 1);
+
+          ASSERT(it == map.end() || keyToPair(it->key).first >= block);
+          while (it < map.end() && keyToPair(it->key).first == block) {
+            BucketID current_rank = keyToPair(it->key).second;
+            if (current_rank < upper_limit) {
+              __atomic_fetch_add(&fallback_bucket_weights[block][current_rank], it->value, __ATOMIC_RELAXED);
+            }
+            ++it;
+          }
+        }
+      });
+
+      // compute prefix sums
+      tbb::parallel_for(static_cast<PartitionID>(0), context.partition.k, [&](const PartitionID block) {
+        auto& weights = fallback_bucket_weights[block];
+        if (!weights.empty()) {
+          weights[0] += bucket_weights[data.indexForBucket(block, NUM_BUCKETS - 1)];
+          for (size_t i = 0; i + 1 < weights.size(); ++i) {
+            weights[i + 1] += weights[i];
+          }
+        }
+      }, tbb::static_partitioner());
+    }
+
+    data.initialized = true;
+  }
+
+  void UnconstrainedFMData::reset() {
+    rebalancing_nodes.reset();
+    bucket_weights.assign(current_k * NUM_BUCKETS, 0);
+    virtual_weight_delta.assign(current_k, AtomicWeight(0));
+    for (auto& local_weights: local_bucket_weights) {
+      local_weights.assign(current_k * NUM_BUCKETS, 0);
+    }
+    fallback_bucket_weights.assign(current_k, {});
+    initialized = false;
+  }
+
+  namespace {
+  #define UNCONSTRAINED_FM_INITIALIZATION(X, Y) UnconstrainedFMData::InitializationHelper<X, Y>;
+  }
+
+  INSTANTIATE_CLASS_WITH_TYPE_TRAITS_AND_GAIN_TYPES(UNCONSTRAINED_FM_INITIALIZATION)
+}
diff --git a/mt-kahypar/partition/refinement/fm/fm_commons.h b/mt-kahypar/partition/refinement/fm/fm_commons.h
index a0a29274a..8bdd5c4fc 100644
--- a/mt-kahypar/partition/refinement/fm/fm_commons.h
+++ b/mt-kahypar/partition/refinement/fm/fm_commons.h
@@ -26,12 +26,16 @@
 #pragma once

+#include
+
+#include
 #include
 #include

 #include "kahypar-resources/datastructure/fast_reset_flag_array.h"

+#include
 #include

 namespace mt_kahypar {
@@ -65,7 +69,6 @@ struct GlobalMoveTracker {
     const MoveID move_id = runningMoveID.fetch_add(1, std::memory_order_relaxed);
     assert(move_id - firstMoveID < moveOrder.size());
     moveOrder[move_id - firstMoveID] = m;
-    moveOrder[move_id - firstMoveID].gain = 0; // set to zero so the recalculation can safely distribute
     moveOfNode[m.node] = move_id;
     return move_id;
   }
@@ -77,9 +80,11 @@ struct GlobalMoveTracker {
   bool wasNodeMovedInThisRound(HypernodeID u) const {
     const MoveID m_id = moveOfNode[u];
-    return m_id >= firstMoveID
-           && m_id < runningMoveID.load(std::memory_order_relaxed) // active move ID
-           && moveOrder[m_id - firstMoveID].isValid(); // not reverted already
+    if (m_id >= firstMoveID && m_id < runningMoveID.load(std::memory_order_relaxed)) { // active move ID
+      ASSERT(moveOrder[m_id - firstMoveID].node == u);
+      return moveOrder[m_id - firstMoveID].isValid(); // not reverted already
+    }
+    return false;
   }

   MoveID numPerformedMoves() const {
@@ -142,6 +147,107 @@ struct NodeTracker {
 };

+// Contains data required for unconstrained FM: We group non-border nodes in buckets based on their
+// incident weight to node weight ratio. This allows us to give a (pessimistic) estimate of the effective
+// gain for moves that violate the balance constraint
+class UnconstrainedFMData {
+  using AtomicWeight = parallel::IntegralAtomicWrapper<HypernodeWeight>;
+  using BucketID = uint32_t;
+  using AtomicBucketID = parallel::IntegralAtomicWrapper<BucketID>;
+
+  template<typename TypeTraits, typename GainTypes>
+  struct InitializationHelper {
+    static void initialize(UnconstrainedFMData& data, const Context& context,
+                           const typename TypeTraits::PartitionedHypergraph& phg,
+                           const typename GainTypes::GainCache& gain_cache);
+  };
+
+  static constexpr BucketID NUM_BUCKETS = 16;
+  static constexpr double BUCKET_FACTOR = 1.5;
+  static constexpr double FALLBACK_TRESHOLD = 0.75;
+
+ public:
+  explicit UnconstrainedFMData(HypernodeID num_nodes):
+    initialized(false),
+    current_k(0),
+    bucket_weights(),
+    virtual_weight_delta(),
+    local_bucket_weights(),
+    rebalancing_nodes(num_nodes) { }
+
+  template<typename TypeTraits, typename GainTypes>
+  void initialize(const Context& context,
+                  const typename TypeTraits::PartitionedHypergraph& phg,
+                  const typename GainTypes::GainCache& gain_cache) {
+    changeNumberOfBlocks(context.partition.k);
+    reset();
+
+    InitializationHelper<TypeTraits, GainTypes>::initialize(*this, context, phg, gain_cache);
+  }
+
+  Gain estimatePenaltyForImbalancedMove(PartitionID to, HypernodeWeight initial_imbalance, HypernodeWeight moved_weight) const;
+
+  AtomicWeight& virtualWeightDelta(PartitionID block) {
+    ASSERT(block >= 0 && static_cast<size_t>(block) < virtual_weight_delta.size());
+    return virtual_weight_delta[block];
+  }
+
+  bool isRebalancingNode(HypernodeID hn) const {
+    return initialized && rebalancing_nodes[hn];
+  }
+
+  void reset();
+
+  void changeNumberOfBlocks(PartitionID new_k) {
+    if (new_k != current_k) {
+      current_k = new_k;
+      local_bucket_weights = tbb::enumerable_thread_specific<vec<HypernodeWeight>>(new_k * NUM_BUCKETS);
+      initialized = false;
+    }
+  }
+
+ private:
+  template<typename TypeTraits, typename GainTypes>
+  friend class InitializationHelper;
+
+  MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE size_t indexForBucket(PartitionID block, BucketID bucketId) const {
+    ASSERT(bucketId < NUM_BUCKETS && block * NUM_BUCKETS + bucketId < bucket_weights.size());
+    return block * NUM_BUCKETS + bucketId;
+  }
+
+  // upper bound of gain values in bucket
+  static double gainPerWeightForBucket(BucketID bucketId) {
+    if (bucketId > 1) {
+      return std::pow(BUCKET_FACTOR, bucketId - 2);
+    } else if (bucketId == 1) {
+      return 0.5;
+    } else {
+      return 0;
+    }
+  }
+
+  static BucketID bucketForGainPerWeight(double gainPerWeight) {
+    if (gainPerWeight >= 1) {
+      return 2 + std::ceil(std::log(gainPerWeight) / std::log(BUCKET_FACTOR));
+    } else if (gainPerWeight > 0.5) {
+      return 2;
+    } else if (gainPerWeight > 0) {
+      return 1;
+    } else {
+      return 0;
+    }
+  }
+
+  bool initialized = false;
+  PartitionID current_k;
+  parallel::scalable_vector<HypernodeWeight> bucket_weights;
+  parallel::scalable_vector<AtomicWeight> virtual_weight_delta;
+  tbb::enumerable_thread_specific<vec<HypernodeWeight>> local_bucket_weights;
+  parallel::scalable_vector<vec<HypernodeWeight>> fallback_bucket_weights;
+  kahypar::ds::FastResetFlagArray<> rebalancing_nodes;
+};
+
+
 struct FMSharedData {
   // ! Number of Nodes
   size_t numberOfNodes;
@@ -161,6 +267,9 @@ struct FMSharedData {
   // ! Stores the designated target part of a vertex, i.e. the part with the highest gain to which moving is feasible
   vec<PartitionID> targetPart;

+  // ! Additional data for unconstrained FM algorithm
+  UnconstrainedFMData unconstrained;
+
   // !
Stop parallel refinement if finishedTasks > finishedTasksLimit to avoid long-running single searches CAtomic finishedTasks; size_t finishedTasksLimit = std::numeric_limits::max(); @@ -178,7 +287,8 @@ struct FMSharedData { vertexPQHandles(), //numPQHandles, invalid_position), moveTracker(), //numNodes), nodeTracker(), //numNodes), - targetPart() { + targetPart(), + unconstrained(numNodes) { finishedTasks.store(0, std::memory_order_relaxed); // 128 * 3/2 GB --> roughly 1.5 GB per thread on our biggest machine @@ -267,4 +377,4 @@ struct FMStats { }; -} \ No newline at end of file +} diff --git a/mt-kahypar/partition/refinement/fm/global_rollback.cpp b/mt-kahypar/partition/refinement/fm/global_rollback.cpp index 21253249c..ec7265a3d 100644 --- a/mt-kahypar/partition/refinement/fm/global_rollback.cpp +++ b/mt-kahypar/partition/refinement/fm/global_rollback.cpp @@ -180,9 +180,11 @@ namespace mt_kahypar { }); // recompute penalty term values since they are potentially invalid - tbb::parallel_for(MoveID(0), numMoves, [&](const MoveID i) { - gain_cache.recomputeInvalidTerms(phg, move_order[i].node); - }); + if constexpr (GainCache::invalidates_entries) { + tbb::parallel_for(MoveID(0), numMoves, [&](const MoveID i) { + gain_cache.recomputeInvalidTerms(phg, move_order[i].node); + }); + } sharedData.moveTracker.reset(); @@ -258,7 +260,7 @@ namespace mt_kahypar { for ( const PartitionID& block : phg.connectivitySet(e) ) { pin_counts.setPinCountInPart(block, phg.pinCountInPart(e, block)); } - SyncronizedEdgeUpdate sync_update; + SynchronizedEdgeUpdate sync_update; sync_update.he = e; sync_update.edge_weight = phg.edgeWeight(e); sync_update.edge_size = phg.edgeSize(e); @@ -311,7 +313,7 @@ namespace mt_kahypar { const HyperedgeID& e) { if ( !phg.isSinglePin(e) ) { GlobalMoveTracker& tracker = sharedData.moveTracker; - SyncronizedEdgeUpdate sync_update; + SynchronizedEdgeUpdate sync_update; sync_update.he = e; sync_update.edge_weight = phg.edgeWeight(e); sync_update.edge_size = phg.edgeSize(e); @@ -386,6 +388,10 @@ namespace mt_kahypar { } }; + tbb::parallel_for(MoveID(0), tracker.numPerformedMoves(), [&](MoveID m_id) { + tracker.moveOrder[m_id].gain = 0; + }); + if (context.refinement.fm.iter_moves_on_recalc) { tbb::parallel_for(0U, sharedData.moveTracker.numPerformedMoves(), [&](const MoveID local_move_id) { const HypernodeID u = sharedData.moveTracker.moveOrder[local_move_id].node; @@ -442,7 +448,7 @@ namespace mt_kahypar { // roll forward sequentially Gain best_gain = 0, gain_sum = 0; MoveID best_index = 0; - auto attributed_gains = [&](const SyncronizedEdgeUpdate& sync_update) { + auto attributed_gains = [&](const SynchronizedEdgeUpdate& sync_update) { gain_sum -= AttributedGains::gain(sync_update); }; for (MoveID localMoveID = 0; localMoveID < numMoves; ++localMoveID) { @@ -478,9 +484,11 @@ namespace mt_kahypar { } }); - tbb::parallel_for(0U, numMoves, [&](const MoveID i) { - gain_cache.recomputeInvalidTerms(phg, move_order[i].node); - }); + if constexpr (GainCache::invalidates_entries) { + tbb::parallel_for(0U, numMoves, [&](const MoveID i) { + gain_cache.recomputeInvalidTerms(phg, move_order[i].node); + }); + } tracker.reset(); @@ -519,7 +527,7 @@ namespace mt_kahypar { continue; Gain gain = 0; - auto attributed_gains = [&](const SyncronizedEdgeUpdate& sync_update) { + auto attributed_gains = [&](const SynchronizedEdgeUpdate& sync_update) { gain -= AttributedGains::gain(sync_update); }; diff --git a/mt-kahypar/partition/refinement/fm/localized_kway_fm_core.cpp 
b/mt-kahypar/partition/refinement/fm/localized_kway_fm_core.cpp index cfe1aafdc..149bc21f2 100644 --- a/mt-kahypar/partition/refinement/fm/localized_kway_fm_core.cpp +++ b/mt-kahypar/partition/refinement/fm/localized_kway_fm_core.cpp @@ -29,13 +29,15 @@ #include "mt-kahypar/definitions.h" #include "mt-kahypar/partition/refinement/gains/gain_definitions.h" +#include "mt-kahypar/partition/refinement/fm/strategies/gain_cache_strategy.h" +#include "mt-kahypar/partition/refinement/fm/strategies/unconstrained_strategy.h" namespace mt_kahypar { template - bool LocalizedKWayFM::findMoves(PartitionedHypergraph& phg, - size_t taskID, - size_t numSeeds) { + template + bool LocalizedKWayFM::findMoves(DispatchedFMStrategy& fm_strategy, PartitionedHypergraph& phg, + size_t taskID, size_t numSeeds) { localMoves.clear(); thisSearch = ++sharedData.nodeTracker.highestActiveSearchID; @@ -54,21 +56,18 @@ namespace mt_kahypar { if (context.refinement.fm.perform_moves_global || sharedData.deltaExceededMemoryConstraints) { if ( phg.hasFixedVertices() ) { - internalFindMoves(phg); + internalFindMoves(phg, fm_strategy); } else { - internalFindMoves(phg); + internalFindMoves(phg, fm_strategy); } } else { deltaPhg.clear(); delta_gain_cache.clear(); deltaPhg.setPartitionedHypergraph(&phg); if ( phg.hasFixedVertices() ) { - internalFindMoves(phg); + internalFindMoves(phg, fm_strategy); } else { - internalFindMoves(phg); - } - if (deltaPhg.combinedMemoryConsumption() > sharedData.deltaMemoryLimitPerThread) { - sharedData.deltaExceededMemoryConstraints = true; + internalFindMoves(phg, fm_strategy); } } return true; @@ -92,9 +91,10 @@ namespace mt_kahypar { } template - template + template MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE - void LocalizedKWayFM::acquireOrUpdateNeighbors(PHG& phg, CACHE& gain_cache, const Move& move) { + void LocalizedKWayFM::acquireOrUpdateNeighbors(PHG& phg, CACHE& gain_cache, const Move& move, + DispatchedFMStrategy& fm_strategy) { // Note: In theory we should acquire/update all neighbors. It just turned out that this works fine // Actually: only vertices incident to edges with gain changes can become new boundary vertices. // Vertices that already were boundary vertices, can still be considered later since they are in the task queue @@ -126,12 +126,13 @@ namespace mt_kahypar { template - template - void LocalizedKWayFM::internalFindMoves(PartitionedHypergraph& phg) { + template + void LocalizedKWayFM::internalFindMoves(PartitionedHypergraph& phg, + DispatchedFMStrategy& fm_strategy) { StopRule stopRule(phg.initialNumNodes()); Move move; - auto delta_func = [&](const SyncronizedEdgeUpdate& sync_update) { + auto delta_func = [&](const SynchronizedEdgeUpdate& sync_update) { // Gains of the pins of a hyperedge can only change in the following situations. if ( GainCache::triggersDeltaGainUpdate(sync_update) ) { edgesWithGainChanges.push_back(sync_update.he); @@ -185,6 +186,8 @@ namespace mt_kahypar { edgesWithGainChanges.clear(); // clear before move. delta_func feeds nets of moved vertex. MoveID move_id = std::numeric_limits::max(); bool moved = false; + const HypernodeWeight allowed_weight = DispatchedFMStrategy::is_unconstrained ? std::numeric_limits::max() + : context.partition.max_part_weights[move.to]; if constexpr (use_delta) { heaviestPartWeight = heaviestPartAndWeight(deltaPhg, context.partition.k).second; fromWeight = deltaPhg.partWeight(move.from); @@ -194,18 +197,18 @@ namespace mt_kahypar { // this is intended to allow moving high deg nodes (blow up hash tables) if they give an improvement. 
// The nets affected by a gain cache update are collected when we apply this improvement on the // global partition (used to expand the localized search and update the gain values). - moved = toWeight + phg.nodeWeight(move.node) <= context.partition.max_part_weights[move.to]; + moved = toWeight + phg.nodeWeight(move.node) <= allowed_weight; } else { - moved = deltaPhg.changeNodePart(move.node, move.from, move.to, - context.partition.max_part_weights[move.to], delta_func); + moved = deltaPhg.changeNodePart(move.node, move.from, move.to, allowed_weight, delta_func); + fm_strategy.applyMove(deltaPhg, delta_gain_cache, move, false); } } else { heaviestPartWeight = heaviestPartAndWeight(phg, context.partition.k).second; fromWeight = phg.partWeight(move.from); toWeight = phg.partWeight(move.to); - moved = phg.changeNodePart(move.node, move.from, move.to, - context.partition.max_part_weights[move.to], + moved = phg.changeNodePart(move.node, move.from, move.to, allowed_weight, [&] { move_id = sharedData.moveTracker.insertMove(move); }, delta_func); + fm_strategy.applyMove(phg, gain_cache, move, true); } if (moved) { @@ -223,7 +226,7 @@ namespace mt_kahypar { bestImprovementIndex = localMoves.size(); if constexpr (use_delta) { - applyBestLocalPrefixToSharedPartition(phg, bestImprovementIndex, bestImprovement, true /* apply all moves */); + applyBestLocalPrefixToSharedPartition(phg, fm_strategy, bestImprovementIndex); bestImprovementIndex = 0; localMoves.clear(); deltaPhg.clear(); // clear hashtables, save memory :) @@ -238,41 +241,39 @@ namespace mt_kahypar { } if constexpr (use_delta) { - acquireOrUpdateNeighbors(deltaPhg, delta_gain_cache, move); + acquireOrUpdateNeighbors(deltaPhg, delta_gain_cache, move, fm_strategy); } else { - acquireOrUpdateNeighbors(phg, gain_cache, move); + acquireOrUpdateNeighbors(phg, gain_cache, move, fm_strategy); } } } if constexpr (use_delta) { - std::tie(bestImprovement, bestImprovementIndex) = - applyBestLocalPrefixToSharedPartition(phg, bestImprovementIndex, bestImprovement, false); + // in this case there is no improved local prefix to apply (was already applied in the loop) + ASSERT(bestImprovementIndex == 0); } else { - revertToBestLocalPrefix(phg, bestImprovementIndex); + revertToBestLocalPrefix(phg, fm_strategy, bestImprovementIndex); } + fm_strategy.reset(); runStats.estimated_improvement = bestImprovement; - fm_strategy.clearPQs(bestImprovementIndex); runStats.merge(stats); } template - std::pair LocalizedKWayFM::applyBestLocalPrefixToSharedPartition( + template + void LocalizedKWayFM::applyBestLocalPrefixToSharedPartition( PartitionedHypergraph& phg, - const size_t best_index_locally_observed, - const Gain best_improvement_locally_observed, - const bool apply_delta_improvement) { + DispatchedFMStrategy& fm_strategy, + const size_t best_index_locally_observed) { + // Note: if this precondition does not hold, the call to fm_strategy.flushLocalChanges() would be incorrect + ASSERT(best_index_locally_observed == localMoves.size()); - Gain improvement_from_attributed_gains = 0; - Gain attributed_gain = 0; bool is_last_move = false; - auto delta_gain_func = [&](const SyncronizedEdgeUpdate& sync_update) { - attributed_gain += AttributedGains::gain(sync_update); - + auto delta_gain_func = [&](const SynchronizedEdgeUpdate& sync_update) { // Gains of the pins of a hyperedge can only change in the following situations. 
if ( is_last_move && GainCache::triggersDeltaGainUpdate(sync_update) ) { // This vector is used by the acquireOrUpdateNeighbor function to expand to neighbors @@ -281,25 +282,13 @@ namespace mt_kahypar { // and the expansion happens after applyBestLocalPrefixToSharedPartition. edgesWithGainChanges.push_back(sync_update.he); } - - // TODO: We have different strategies to maintain the gain values during an FM search. - // Some use the gain cache, others compute them each time from scratch or use delta gain updates. - // In case the delta gain update strategy is used, we would have to call the deltaGainUpdate function - // of the FM strategy here. However, the current strategy in our presets use the gain cache and calling the deltaGainUpdate - // function would apply the updates on the thread-local partition, which we do not want here. - // Keep in mind that the gain values of the FMGainDeltaStrategy might be incorrect afterwards. - // However, this strategy is only experimental We should remove the - // different strategies since we do not use them. }; - // Apply move sequence to original hypergraph and update gain values - Gain best_improvement_from_attributed_gains = 0; - size_t best_index_from_attributed_gains = 0; + // Apply move sequence to original hypergraph for (size_t i = 0; i < best_index_locally_observed; ++i) { ASSERT(i < localMoves.size()); Move& local_move = localMoves[i].first; MoveID& move_id = localMoves[i].second; - attributed_gain = 0; // In a localized FM search, we apply all moves to a thread-local partition (delta_phg) // using hash tables. Once we find an improvement, we apply the corresponding move // sequence to the global partition. To save memory (in the hash tables), we do not apply @@ -308,50 +297,28 @@ namespace mt_kahypar { // we collect all nets affected by a gain cache update and expand the search to pins // contained in these nets. Since, we do not apply last move on the thread-local partition we collect // these nets here. - is_last_move = apply_delta_improvement && i == best_index_locally_observed - 1; + is_last_move = (i == best_index_locally_observed - 1); phg.changeNodePart( gain_cache, local_move.node, local_move.from, local_move.to, std::numeric_limits::max(), [&] { move_id = sharedData.moveTracker.insertMove(local_move); }, delta_gain_func); - - attributed_gain = -attributed_gain; // delta func yields negative sum of improvements, i.e. 
negative values mean improvements - improvement_from_attributed_gains += attributed_gain; ASSERT(move_id != std::numeric_limits::max()); - if (improvement_from_attributed_gains >= best_improvement_from_attributed_gains) { - best_improvement_from_attributed_gains = improvement_from_attributed_gains; - best_index_from_attributed_gains = i; - } - } - - runStats.local_reverts += localMoves.size() - best_index_locally_observed; - if (!apply_delta_improvement && best_index_from_attributed_gains != best_index_locally_observed) { - runStats.best_prefix_mismatch++; - } - - // kind of double rollback, if attributed gains say we overall made things worse - if (!apply_delta_improvement && improvement_from_attributed_gains < 0) { - // always using the if-branch gave similar results - runStats.local_reverts += best_index_locally_observed - best_index_from_attributed_gains + 1; - for (size_t i = best_index_from_attributed_gains + 1; i < best_index_locally_observed; ++i) { - Move& m = sharedData.moveTracker.getMove(localMoves[i].second); - phg.changeNodePart(gain_cache, m.node, m.to, m.from); - m.invalidate(); - } - return std::make_pair(best_improvement_from_attributed_gains, best_index_from_attributed_gains); - } else { - return std::make_pair(best_improvement_locally_observed, best_index_locally_observed); } + fm_strategy.flushLocalChanges(); } template + template void LocalizedKWayFM::revertToBestLocalPrefix(PartitionedHypergraph& phg, + DispatchedFMStrategy& fm_strategy, size_t bestGainIndex) { runStats.local_reverts += localMoves.size() - bestGainIndex; while (localMoves.size() > bestGainIndex) { Move& m = sharedData.moveTracker.getMove(localMoves.back().second); phg.changeNodePart(gain_cache, m.node, m.to, m.from); + fm_strategy.revertMove(phg, gain_cache, m, true); m.invalidate(); localMoves.pop_back(); } @@ -360,7 +327,13 @@ namespace mt_kahypar { template void LocalizedKWayFM::changeNumberOfBlocks(const PartitionID new_k) { deltaPhg.changeNumberOfBlocks(new_k); - fm_strategy.changeNumberOfBlocks(new_k); + blockPQ.resize(new_k); + for ( VertexPriorityQueue& pq : vertexPQs ) { + pq.setHandle(sharedData.vertexPQHandles.data(), sharedData.numberOfNodes); + } + while ( static_cast(new_k) > vertexPQs.size() ) { + vertexPQs.emplace_back(sharedData.vertexPQHandles.data(), sharedData.numberOfNodes); + } } template @@ -374,16 +347,25 @@ namespace mt_kahypar { utils::MemoryTreeNode *edges_to_activate_node = localized_fm_node->addChild("edgesWithGainChanges"); edges_to_activate_node->updateSize(edgesWithGainChanges.capacity() * sizeof(HyperedgeID)); + size_t vertex_pq_sizes = std::accumulate( + vertexPQs.begin(), vertexPQs.end(), 0, + [](size_t init, const VertexPriorityQueue& pq) { return init + pq.size_in_bytes(); } + ); + localized_fm_node->addChild("PQs", blockPQ.size_in_bytes() + vertex_pq_sizes); + utils::MemoryTreeNode *local_moves_node = parent->addChild("Local FM Moves"); local_moves_node->updateSize(localMoves.capacity() * sizeof(std::pair)); - fm_strategy.memoryConsumption(localized_fm_node); deltaPhg.memoryConsumption(localized_fm_node); delta_gain_cache.memoryConsumption(localized_fm_node); } namespace { - #define LOCALIZED_KWAY_FM(X, Y) LocalizedKWayFM + #define LOCALIZED_KWAY_FM(X, Y) LocalizedKWayFM; \ + template bool LocalizedKWayFM::findMoves(LocalUnconstrainedStrategy&, \ + typename LocalizedKWayFM::PartitionedHypergraph&, size_t, size_t); \ + template bool LocalizedKWayFM::findMoves(LocalGainCacheStrategy&, \ + typename LocalizedKWayFM::PartitionedHypergraph&, size_t, size_t) } 
INSTANTIATE_CLASS_WITH_TYPE_TRAITS_AND_GAIN_TYPES(LOCALIZED_KWAY_FM) diff --git a/mt-kahypar/partition/refinement/fm/localized_kway_fm_core.h b/mt-kahypar/partition/refinement/fm/localized_kway_fm_core.h index b7453aa00..d299e5c04 100644 --- a/mt-kahypar/partition/refinement/fm/localized_kway_fm_core.h +++ b/mt-kahypar/partition/refinement/fm/localized_kway_fm_core.h @@ -34,22 +34,25 @@ #include "mt-kahypar/datastructures/sparse_map.h" #include "mt-kahypar/partition/refinement/fm/fm_commons.h" #include "mt-kahypar/partition/refinement/fm/stop_rule.h" -#include "mt-kahypar/partition/refinement/fm/strategies/gain_cache_strategy.h" namespace mt_kahypar { template class LocalizedKWayFM { +public: + using PartitionedHypergraph = typename TypeTraits::PartitionedHypergraph; + private: static constexpr size_t MAP_SIZE_LARGE = 16384; static constexpr size_t MAP_SIZE_MOVE_DELTA = 8192; - using PartitionedHypergraph = typename TypeTraits::PartitionedHypergraph; using GainCache = typename GainTypes::GainCache; using DeltaGainCache = typename GainTypes::DeltaGainCache; using DeltaPartitionedHypergraph = typename PartitionedHypergraph::template DeltaPartition; using AttributedGains = typename GainTypes::AttributedGains; + using BlockPriorityQueue = ds::ExclusiveHandleHeap< ds::MaxHeap >; + using VertexPriorityQueue = ds::MaxHeap; // these need external handles public: explicit LocalizedKWayFM(const Context& context, @@ -60,16 +63,23 @@ class LocalizedKWayFM { thisSearch(0), deltaPhg(context), neighborDeduplicator(numNodes, 0), - fm_strategy(context, sharedData, runStats), gain_cache(gainCache), delta_gain_cache(gainCache), - sharedData(sharedData) { + sharedData(sharedData), + blockPQ(static_cast(context.partition.k)), + vertexPQs(static_cast(context.partition.k), + VertexPriorityQueue(sharedData.vertexPQHandles.data(), sharedData.numberOfNodes)) { const bool top_level = context.type == ContextType::main; delta_gain_cache.initialize(top_level ? MAP_SIZE_LARGE : MAP_SIZE_MOVE_DELTA); } + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE DispatchedFMStrategy initializeDispatchedStrategy() { + return DispatchedFMStrategy(context, sharedData, blockPQ, vertexPQs, runStats); + } - bool findMoves(PartitionedHypergraph& phg, size_t taskID, size_t numSeeds); + template + bool findMoves(DispatchedFMStrategy& fm_strategy, PartitionedHypergraph& phg, size_t taskID, size_t numSeeds); void memoryConsumption(utils::MemoryTreeNode* parent) const; @@ -78,30 +88,24 @@ class LocalizedKWayFM { FMStats stats; private: + template + void internalFindMoves(PartitionedHypergraph& phg, DispatchedFMStrategy& fm_strategy); - // ! Performs localized FM local search on the delta partitioned hypergraph. - // ! Moves made by this search are not immediately visible to other concurrent local searches. - // ! The best prefix of moves is applied to the global partitioned hypergraph after the search finishes. - //void internalFindMovesOnDeltaHypergraph(PartitionedHypergraph& phg, FMSharedData& sharedData); - - - template - void internalFindMoves(PartitionedHypergraph& phg); - - template + template MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE - void acquireOrUpdateNeighbors(PHG& phg, CACHE& gain_cache, const Move& move); + void acquireOrUpdateNeighbors(PHG& phg, CACHE& gain_cache, const Move& move, DispatchedFMStrategy& fm_strategy); // ! Makes moves applied on delta hypergraph visible on the global partitioned hypergraph. 
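// Note on this hunk: the fixed GainCacheStrategy member of LocalizedKWayFM is
// replaced by a per-search "dispatched" strategy. findMoves() and its helpers
// are now templated on DispatchedFMStrategy, so the same search core runs
// either LocalGainCacheStrategy or LocalUnconstrainedStrategy, and the block
// and vertex priority queues move out of the strategy into LocalizedKWayFM
// itself. A condensed sketch of the call site (assumed usage, simplified from
// the hunks in this file):
//
//   auto& fm = ets_fm.local();   // thread-local search instance
//   auto strategy = fm.template initializeDispatchedStrategy<LocalUnconstrainedStrategy>();
//   fm.findMoves(strategy, phg, task_id, num_seeds);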
- std::pair applyBestLocalPrefixToSharedPartition(PartitionedHypergraph& phg, - const size_t best_index_locally_observed, - const Gain best_improvement_locally_observed, - const bool apply_delta_improvement); + template + void applyBestLocalPrefixToSharedPartition(PartitionedHypergraph& phg, + DispatchedFMStrategy& fm_strategy, + const size_t best_index_locally_observed); // ! Rollback to the best improvement found during local search in case we applied moves // ! directly on the global partitioned hypergraph. - void revertToBestLocalPrefix(PartitionedHypergraph& phg, size_t bestGainIndex); + template + void revertToBestLocalPrefix(PartitionedHypergraph& phg, DispatchedFMStrategy& fm_strategy, size_t bestGainIndex); private: @@ -127,13 +131,20 @@ class LocalizedKWayFM { FMStats runStats; - GainCacheStrategy fm_strategy; - GainCache& gain_cache; DeltaGainCache delta_gain_cache; FMSharedData& sharedData; + + // ! Priority Queue that contains for each block of the partition + // ! the vertex with the best gain value + BlockPriorityQueue blockPQ; + + // ! From PQs -> For each block it contains the vertices (contained + // ! in that block) touched by the current local search associated + // ! with their gain values + vec vertexPQs; }; } \ No newline at end of file diff --git a/mt-kahypar/partition/refinement/fm/multitry_kway_fm.cpp b/mt-kahypar/partition/refinement/fm/multitry_kway_fm.cpp index 26dd2f702..bd5a95d7d 100644 --- a/mt-kahypar/partition/refinement/fm/multitry_kway_fm.cpp +++ b/mt-kahypar/partition/refinement/fm/multitry_kway_fm.cpp @@ -25,16 +25,57 @@ * SOFTWARE. ******************************************************************************/ +#include + #include "mt-kahypar/partition/refinement/fm/multitry_kway_fm.h" #include "mt-kahypar/definitions.h" #include "mt-kahypar/utils/utilities.h" +#include "mt-kahypar/partition/factories.h" // TODO removing this could make compilation a lot faster #include "mt-kahypar/partition/metrics.h" #include "mt-kahypar/partition/refinement/gains/gain_definitions.h" #include "mt-kahypar/utils/memory_tree.h" #include "mt-kahypar/utils/cast.h" namespace mt_kahypar { + using ds::StreamingVector; + + template + MultiTryKWayFM::MultiTryKWayFM(const HypernodeID num_hypernodes, + const HyperedgeID num_hyperedges, + const Context& c, + GainCache& gainCache, + IRebalancer& rb) : + initial_num_nodes(num_hypernodes), + context(c), + gain_cache(gainCache), + current_k(c.partition.k), + sharedData(num_hypernodes), + fm_strategy(FMStrategyFactory::getInstance().createObject(context.refinement.fm.algorithm, context, sharedData)), + globalRollback(num_hyperedges, context, gainCache), + ets_fm([&] { return constructLocalizedKWayFMSearch(); }), + tmp_move_order(num_hypernodes), + rebalancer(rb) { + if (context.refinement.fm.obey_minimal_parallelism) { + sharedData.finishedTasksLimit = std::min(UL(8), context.shared_memory.num_threads); + } + } + + // helper function for rebalancing + std::vector setupMaxPartWeights(const Context& context) { + double max_part_weight_scaling = context.refinement.fm.rollback_balance_violation_factor; + std::vector max_part_weights = context.partition.perfect_balance_part_weights; + if (max_part_weight_scaling == 0.0) { + for (PartitionID i = 0; i < context.partition.k; ++i) { + max_part_weights[i] = std::numeric_limits::max(); + } + } else { + for (PartitionID i = 0; i < context.partition.k; ++i) { + max_part_weights[i] *= ( 1.0 + context.partition.epsilon * max_part_weight_scaling ); + } + } + return max_part_weights; + } template 
bool MultiTryKWayFM::refineImpl( @@ -55,14 +96,26 @@ namespace mt_kahypar { double current_time_limit = time_limit; tbb::task_group tg; vec initialPartWeights(size_t(context.partition.k)); + std::vector max_part_weights; HighResClockTimepoint fm_start = std::chrono::high_resolution_clock::now(); utils::Timer& timer = utils::Utilities::instance().getTimer(context.utility_id); + if (fm_strategy->includesUnconstrained()) { + max_part_weights = setupMaxPartWeights(context); + } + for (size_t round = 0; round < context.refinement.fm.multitry_rounds; ++round) { // global multi try rounds for (PartitionID i = 0; i < context.partition.k; ++i) { initialPartWeights[i] = phg.partWeight(i); } + const bool is_unconstrained = fm_strategy->isUnconstrainedRound(round); + if (is_unconstrained) { + timer.start_timer("initialize_data_unconstrained", "Initialize Data for Unc. FM"); + sharedData.unconstrained.initialize(context, phg, gain_cache); + timer.stop_timer("initialize_data_unconstrained"); + } + timer.start_timer("collect_border_nodes", "Collect Border Nodes"); roundInitialization(phg, refinement_nodes); timer.stop_timer("collect_border_nodes"); @@ -81,24 +134,42 @@ namespace mt_kahypar { } timer.start_timer("find_moves", "Find Moves"); + size_t num_tasks = std::min(num_border_nodes, size_t(TBBInitializer::instance().total_number_of_threads())); sharedData.finishedTasks.store(0, std::memory_order_relaxed); + fm_strategy->findMoves(utils::localized_fm_cast(ets_fm), hypergraph, + num_tasks, num_seeds, round); + timer.stop_timer("find_moves"); - auto task = [&](const size_t task_id) { - auto& fm = ets_fm.local(); - while(sharedData.finishedTasks.load(std::memory_order_relaxed) < sharedData.finishedTasksLimit - && fm.findMoves(phg, task_id, num_seeds)) { /* keep running*/ } - sharedData.finishedTasks.fetch_add(1, std::memory_order_relaxed); - }; - size_t num_tasks = std::min(num_border_nodes, size_t(TBBInitializer::instance().total_number_of_threads())); - for (size_t i = 0; i < num_tasks; ++i) { - tg.run(std::bind(task, i)); + if (is_unconstrained && !isBalanced(phg, max_part_weights)) { + vec> moves_by_part; + + // compute rebalancing moves + timer.start_timer("rebalance_fm", "Rebalance"); + Metrics tmp_metrics; + ASSERT([&]{ // correct quality only required for assertions + tmp_metrics.quality = metrics::quality(phg, context); + return true; + }()); + + if constexpr (GainCache::invalidates_entries) { + tbb::parallel_for(MoveID(0), sharedData.moveTracker.numPerformedMoves(), [&](const MoveID i) { + gain_cache.recomputeInvalidTerms(phg, sharedData.moveTracker.moveOrder[i].node); + }); + } + HEAVY_REFINEMENT_ASSERT(phg.checkTrackedPartitionInformation(gain_cache)); + + tmp_metrics.imbalance = metrics::imbalance(phg, context); + rebalancer.refineAndOutputMoves(hypergraph, {}, moves_by_part, tmp_metrics, current_time_limit); + timer.stop_timer("rebalance_fm"); + + if (!moves_by_part.empty()) { + // compute new move sequence where each imbalanced move is immediately rebalanced + interleaveMoveSequenceWithRebalancingMoves(phg, initialPartWeights, max_part_weights, moves_by_part); + } } - tg.wait(); - timer.stop_timer("find_moves"); timer.start_timer("rollback", "Rollback to Best Solution"); - HyperedgeWeight improvement = globalRollback.revertToBestPrefix( - phg, sharedData, initialPartWeights); + HyperedgeWeight improvement = globalRollback.revertToBestPrefix(phg, sharedData, initialPartWeights); timer.stop_timer("rollback"); const double roundImprovementFraction = improvementFraction(improvement, @@ -109,6 
+180,7 @@ namespace mt_kahypar {
      } else {
        consecutive_rounds_with_too_little_improvement = 0;
      }
+     fm_strategy->reportImprovement(round, improvement, roundImprovementFraction);

      HighResClockTimepoint fm_timestamp = std::chrono::high_resolution_clock::now();
      const double elapsed_time = std::chrono::duration<double>(fm_timestamp - fm_start).count();
@@ -137,7 +209,8 @@
      }
    }

-   if (improvement <= 0 || consecutive_rounds_with_too_little_improvement >= 2) {
+   if ( (improvement <= 0 && (!context.refinement.fm.activate_unconstrained_dynamically || round > 1))
+        || consecutive_rounds_with_too_little_improvement >= 2 ) {
      break;
    }
  }
@@ -156,7 +229,8 @@
    metrics.imbalance = metrics::imbalance(phg, context);
    HEAVY_REFINEMENT_ASSERT(phg.checkTrackedPartitionInformation(gain_cache));
    ASSERT(metrics.quality == metrics::quality(phg, context),
-          V(metrics.quality) << V(metrics::quality(phg, context)));
+          V(metrics.quality) << V(metrics::quality(phg, context)));
+   return overall_improvement > 0;
  }
@@ -209,6 +283,144 @@ namespace mt_kahypar {
    sharedData.nodeTracker.requestNewSearches(static_cast(sharedData.refinementNodes.unsafe_size()));
  }

+ template<typename TypeTraits, typename GainTypes>
+ void MultiTryKWayFM<TypeTraits, GainTypes>::interleaveMoveSequenceWithRebalancingMoves(
+     const PartitionedHypergraph& phg,
+     const vec<HypernodeWeight>& initialPartWeights,
+     const std::vector<HypernodeWeight>& max_part_weights,
+     vec<vec<Move>>& rebalancing_moves_by_part) {
+   ASSERT(rebalancing_moves_by_part.size() == static_cast<size_t>(context.partition.k));
+   HEAVY_REFINEMENT_ASSERT([&] {
+     std::set<HypernodeID> moved_nodes;
+     for (PartitionID part = 0; part < context.partition.k; ++part) {
+       for (const Move& m: rebalancing_moves_by_part[part]) {
+         if (m.from != part || m.to != phg.partID(m.node) || moved_nodes.count(m.node) != 0) {
+           return false;
+         }
+         moved_nodes.insert(m.node);
+       }
+     }
+     return true;
+   }());
+
+   GlobalMoveTracker& move_tracker = sharedData.moveTracker;
+   // Check the rebalancing moves for nodes that are moved twice. Double moves violate the precondition of the global
+   // rollback, which requires that each node is moved at most once. Thus we "merge" the moves of any node
+   // that is moved twice (e.g., 0 -> 2 -> 1 becomes 0 -> 1)
+   for (PartitionID part = 0; part < context.partition.k; ++part) {
+     vec<Move>& moves = rebalancing_moves_by_part[part];
+     tbb::parallel_for(UL(0), moves.size(), [&](const size_t i) {
+       Move& r_move = moves[i];
+       if (r_move.isValid() && move_tracker.wasNodeMovedInThisRound(r_move.node)) {
+         ASSERT(r_move.to == phg.partID(r_move.node));
+         Move& first_move = move_tracker.getMove(move_tracker.moveOfNode[r_move.node]);
+         ASSERT(r_move.node == first_move.node && r_move.from == first_move.to);
+         if (first_move.from == r_move.to) {
+           // if rebalancing undid the move, we simply delete it
+           move_tracker.moveOfNode[r_move.node] = 0;
+           first_move.invalidate();
+           r_move.invalidate();
+         } else {
+           // "merge" the moves
+           r_move.from = first_move.from;
+           first_move.invalidate();
+         }
+       }
+     }, tbb::static_partitioner());
+   }
+
+   // NOTE: We re-insert invalid rebalancing moves to ensure the gain cache is updated correctly by the global rollback.
+   // For now we use a sequential implementation, which is probably fast enough (since this is a single scan through
+   // the move sequence). We might replace it with a parallel implementation later.
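// ---------------------------------------------------------------------------
// Aside: the merge step above, sketched on a simplified move type. SketchMove
// and mergeDoubleMove are illustrative stand-ins, not identifiers from this
// patch. A node moved 0 -> 2 by FM and 2 -> 1 by rebalancing ends up with the
// single combined move 0 -> 1; if rebalancing moved it straight back
// (0 -> 2 -> 0), both entries are dropped.
struct SketchMove { int node; int from; int to; bool valid = true; };

// merges the later rebalancing move with the earlier FM move, preserving the
// invariant that every node appears at most once in the final move sequence
void mergeDoubleMove(SketchMove& fm_move, SketchMove& rebalancing_move) {
  // precondition: both moves refer to the same node, and
  // rebalancing_move.from == fm_move.to
  if (fm_move.from == rebalancing_move.to) {
    fm_move.valid = false;           // rebalancing undid the FM move: drop both
    rebalancing_move.valid = false;
  } else {
    rebalancing_move.from = fm_move.from;  // keep one combined move: fm_move.from -> rebalancing_move.to
    fm_move.valid = false;
  }
}
// ---------------------------------------------------------------------------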
+ vec current_part_weights = initialPartWeights; + vec current_rebalancing_move_index(context.partition.k, 0); + MoveID next_move_index = 0; + + auto insert_moves_to_balance_part = [&](const PartitionID part) { + if (current_part_weights[part] > max_part_weights[part]) { + insertMovesToBalanceBlock(phg, part, max_part_weights, rebalancing_moves_by_part, + next_move_index, current_part_weights, current_rebalancing_move_index); + } + }; + + // it might be possible that the initial weights are already imbalanced + for (PartitionID part = 0; part < context.partition.k; ++part) { + insert_moves_to_balance_part(part); + } + + const vec& move_order = move_tracker.moveOrder; + const MoveID num_moves = move_tracker.numPerformedMoves(); + for (MoveID move_id = 0; move_id < num_moves; ++move_id) { + const Move& m = move_order[move_id]; + if (m.isValid()) { + const HypernodeWeight hn_weight = phg.nodeWeight(m.node); + current_part_weights[m.from] -= hn_weight; + current_part_weights[m.to] += hn_weight; + tmp_move_order[next_move_index] = m; + ++next_move_index; + // insert rebalancing moves if necessary + insert_moves_to_balance_part(m.to); + } else { + // setting moveOfNode to zero is necessary because, after replacing the move sequence, + // wasNodeMovedInThisRound() could falsely return true otherwise + move_tracker.moveOfNode[m.node] = 0; + } + } + + // append any remaining rebalancing moves (rollback will decide whether to keep them) + for (PartitionID part = 0; part < context.partition.k; ++part) { + while (current_rebalancing_move_index[part] < rebalancing_moves_by_part[part].size()) { + const MoveID move_index_for_part = current_rebalancing_move_index[part]; + const Move& m = rebalancing_moves_by_part[part][move_index_for_part]; + ++current_rebalancing_move_index[part]; + tmp_move_order[next_move_index] = m; + ++next_move_index; + } + } + + // update sharedData + const MoveID first_move_id = move_tracker.firstMoveID; + ASSERT(tmp_move_order.size() == move_tracker.moveOrder.size()); + + std::swap(move_tracker.moveOrder, tmp_move_order); + move_tracker.runningMoveID.store(first_move_id + next_move_index); + tbb::parallel_for(ID(0), next_move_index, [&](const MoveID move_id) { + const Move& m = move_tracker.moveOrder[move_id]; + if (m.isValid()) { + move_tracker.moveOfNode[m.node] = first_move_id + move_id; + } + }, tbb::static_partitioner()); + } + + template + void MultiTryKWayFM::insertMovesToBalanceBlock(const PartitionedHypergraph& phg, + const PartitionID block, + const std::vector& max_part_weights, + const vec>& rebalancing_moves_by_part, + MoveID& next_move_index, + vec& current_part_weights, + vec& current_rebalancing_move_index) { + while (current_part_weights[block] > max_part_weights[block] + && current_rebalancing_move_index[block] < rebalancing_moves_by_part[block].size()) { + const MoveID move_index_for_block = current_rebalancing_move_index[block]; + const Move& m = rebalancing_moves_by_part[block][move_index_for_block]; + ++current_rebalancing_move_index[block]; + tmp_move_order[next_move_index] = m; + ++next_move_index; + if (m.isValid()) { + const HypernodeWeight hn_weight = phg.nodeWeight(m.node); + current_part_weights[m.from] -= hn_weight; + current_part_weights[m.to] += hn_weight; + + if (current_part_weights[m.to] > max_part_weights[m.to]) { + // edge case: it is possible that the rebalancing move itself causes new imbalance -> call recursively + insertMovesToBalanceBlock(phg, m.to, max_part_weights, rebalancing_moves_by_part, + next_move_index, current_part_weights, 
current_rebalancing_move_index); + } + } + } + } + template void MultiTryKWayFM::initializeImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph) { @@ -217,6 +429,7 @@ namespace mt_kahypar { if (!gain_cache.isInitialized()) { gain_cache.initializeGainCache(phg); } + rebalancer.initialize(hypergraph); is_initialized = true; } @@ -232,6 +445,7 @@ namespace mt_kahypar { // as we initialize them with the final number of blocks. This is just a fallback // if someone changes this in the future. globalRollback.changeNumberOfBlocks(current_k); + sharedData.unconstrained.changeNumberOfBlocks(current_k); for ( auto& localized_fm : ets_fm ) { localized_fm.changeNumberOfBlocks(current_k); } @@ -258,5 +472,4 @@ namespace mt_kahypar { } INSTANTIATE_CLASS_WITH_TYPE_TRAITS_AND_GAIN_TYPES(MULTITRY_KWAY_FM) - } // namespace mt_kahypar diff --git a/mt-kahypar/partition/refinement/fm/multitry_kway_fm.h b/mt-kahypar/partition/refinement/fm/multitry_kway_fm.h index 172260660..32547674e 100644 --- a/mt-kahypar/partition/refinement/fm/multitry_kway_fm.h +++ b/mt-kahypar/partition/refinement/fm/multitry_kway_fm.h @@ -27,12 +27,15 @@ #pragma once +#include #include "mt-kahypar/partition/context.h" #include "mt-kahypar/partition/refinement/i_refiner.h" +#include "mt-kahypar/partition/refinement/i_rebalancer.h" #include "mt-kahypar/partition/refinement/fm/localized_kway_fm_core.h" #include "mt-kahypar/partition/refinement/fm/global_rollback.h" +#include "mt-kahypar/partition/refinement/fm/strategies/i_fm_strategy.h" #include "mt-kahypar/partition/refinement/gains/gain_cache_ptr.h" namespace mt_kahypar { @@ -55,25 +58,16 @@ class MultiTryKWayFM final : public IRefiner { MultiTryKWayFM(const HypernodeID num_hypernodes, const HyperedgeID num_hyperedges, const Context& c, - GainCache& gainCache) : - initial_num_nodes(num_hypernodes), - context(c), - gain_cache(gainCache), - current_k(c.partition.k), - sharedData(num_hypernodes), - globalRollback(num_hyperedges, context, gainCache), - ets_fm([&] { return constructLocalizedKWayFMSearch(); }) { - if (context.refinement.fm.obey_minimal_parallelism) { - sharedData.finishedTasksLimit = std::min(UL(8), context.shared_memory.num_threads); - } - } + GainCache& gainCache, + IRebalancer& rb); MultiTryKWayFM(const HypernodeID num_hypernodes, const HyperedgeID num_hyperedges, const Context& c, - gain_cache_t gainCache) : + gain_cache_t gainCache, + IRebalancer& rb) : MultiTryKWayFM(num_hypernodes, num_hyperedges, c, - GainCachePtr::cast(gainCache)) { } + GainCachePtr::cast(gainCache), rb) { } void printMemoryConsumption(); @@ -88,6 +82,27 @@ class MultiTryKWayFM final : public IRefiner { void roundInitialization(PartitionedHypergraph& phg, const vec& refinement_nodes); + void interleaveMoveSequenceWithRebalancingMoves(const PartitionedHypergraph& phg, + const vec& initialPartWeights, + const std::vector& max_part_weights, + vec>& rebalancing_moves_by_part); + + void insertMovesToBalanceBlock(const PartitionedHypergraph& phg, + const PartitionID block, + const std::vector& max_part_weights, + const vec>& rebalancing_moves_by_part, + MoveID& next_move_index, + vec& current_part_weights, + vec& current_rebalancing_move_index); + + bool isBalanced(const PartitionedHypergraph& phg, const std::vector& max_part_weights) { + for (PartitionID i = 0; i < context.partition.k; ++i) { + if (phg.partWeight(i) > max_part_weights[i]) { + return false; + } + } + return true; + } LocalizedFMSearch constructLocalizedKWayFMSearch() { return LocalizedFMSearch(context, initial_num_nodes, sharedData, 
gain_cache); @@ -109,8 +124,11 @@ class MultiTryKWayFM final : public IRefiner { GainCache& gain_cache; PartitionID current_k; FMSharedData sharedData; + std::unique_ptr fm_strategy; Rollback globalRollback; tbb::enumerable_thread_specific ets_fm; + vec tmp_move_order; + IRebalancer& rebalancer; }; } // namespace mt_kahypar diff --git a/mt-kahypar/partition/refinement/fm/sequential_twoway_fm_refiner.cpp b/mt-kahypar/partition/refinement/fm/sequential_twoway_fm_refiner.cpp index 4a2bd951c..568cb29f9 100644 --- a/mt-kahypar/partition/refinement/fm/sequential_twoway_fm_refiner.cpp +++ b/mt-kahypar/partition/refinement/fm/sequential_twoway_fm_refiner.cpp @@ -55,7 +55,7 @@ bool SequentialTwoWayFmRefiner::refine(Metrics& best_metrics, std::m _he_state[he] = HEState::FREE; } - auto border_vertex_update = [&](const SyncronizedEdgeUpdate& sync_update) { + auto border_vertex_update = [&](const SynchronizedEdgeUpdate& sync_update) { if ( sync_update.edge_size > 1 ) { if ( sync_update.pin_count_in_from_part_after == 0 ) { _border_vertices.becameNonCutHyperedge(_phg, sync_update.he, _vertex_state); diff --git a/mt-kahypar/partition/refinement/fm/strategies/gain_cache_strategy.h b/mt-kahypar/partition/refinement/fm/strategies/gain_cache_strategy.h index 115dc6d20..c290252fb 100644 --- a/mt-kahypar/partition/refinement/fm/strategies/gain_cache_strategy.h +++ b/mt-kahypar/partition/refinement/fm/strategies/gain_cache_strategy.h @@ -3,7 +3,7 @@ * * This file is part of Mt-KaHyPar. * - * Copyright (C) 2020 Lars Gottesbüren + * Copyright (C) 2023 Nikolai Maas * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,258 +26,43 @@ #pragma once -#include "mt-kahypar/partition/refinement/fm/fm_commons.h" +#include "mt-kahypar/partition/refinement/fm/localized_kway_fm_core.h" +#include "mt-kahypar/partition/refinement/fm/strategies/i_fm_strategy.h" +#include "mt-kahypar/partition/refinement/fm/strategies/local_gain_cache_strategy.h" namespace mt_kahypar { - /* - * FMStrategy interface - * static constexpr bool uses_gain_cache - * Constructor(context, numNodes, sharedData, runStats) - * insertIntoPQ(phg, node) - * updateGain(phg, node, move) - * findNextMove(phg, move) - * clearPQs() - * updatePQs() - * memoryConsumption(utils::MemoryTreeNode* parent) const - * - * - */ +template +class GainCacheStrategy: public IFMStrategy { + using Base = IFMStrategy; -class GainCacheStrategy { -public: + public: + using LocalFM = LocalizedKWayFM; + using PartitionedHypergraph = typename TypeTraits::PartitionedHypergraph; - using BlockPriorityQueue = ds::ExclusiveHandleHeap< ds::MaxHeap >; - using VertexPriorityQueue = ds::MaxHeap; // these need external handles + GainCacheStrategy(const Context& context, FMSharedData& sharedData): + Base(context, sharedData) { } - static constexpr bool uses_gain_cache = true; - static constexpr bool maintain_gain_cache_between_rounds = true; - - GainCacheStrategy(const Context& context, - FMSharedData& sharedData, - FMStats& runStats) : - context(context), - runStats(runStats), - sharedData(sharedData), - blockPQ(static_cast(context.partition.k)), - vertexPQs(static_cast(context.partition.k), - VertexPriorityQueue(sharedData.vertexPQHandles.data(), sharedData.numberOfNodes)) { } - - template - MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE - void insertIntoPQ(const PartitionedHypergraph& phg, - const GainCache& gain_cache, - const HypernodeID v) { - const PartitionID pv = phg.partID(v); - 
ASSERT(pv < context.partition.k); - auto [target, gain] = computeBestTargetBlock(phg, gain_cache, v, pv); - ASSERT(target < context.partition.k, V(target) << V(context.partition.k)); - sharedData.targetPart[v] = target; - vertexPQs[pv].insert(v, gain); // blockPQ updates are done later, collectively. - runStats.pushes++; - } - - template - MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE - void updateGain(const PartitionedHypergraph& phg, - const GainCache& gain_cache, - const HypernodeID v, - const Move& move) { - const PartitionID pv = phg.partID(v); - ASSERT(vertexPQs[pv].contains(v)); - const PartitionID designatedTargetV = sharedData.targetPart[v]; - Gain gain = 0; - PartitionID newTarget = kInvalidPartition; - - if (context.partition.k < 4 || designatedTargetV == move.from || designatedTargetV == move.to) { - // penalty term of designatedTargetV is affected. - // and may now be greater than that of other blocks --> recompute full - std::tie(newTarget, gain) = computeBestTargetBlock(phg, gain_cache, v, pv); - } else { - // penalty term of designatedTargetV is not affected. - // only move.from and move.to may be better - std::tie(newTarget, gain) = bestOfThree(phg, gain_cache, - v, pv, { designatedTargetV, move.from, move.to }); - } - - sharedData.targetPart[v] = newTarget; - vertexPQs[pv].adjustKey(v, gain); - } - - template - MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE - bool findNextMove(const PartitionedHypergraph& phg, - const GainCache& gain_cache, - Move& m) { - updatePQs(); - - if (blockPQ.empty()) { - return false; - } - - while (true) { - const PartitionID from = blockPQ.top(); - const HypernodeID u = vertexPQs[from].top(); - const Gain estimated_gain = vertexPQs[from].topKey(); - ASSERT(estimated_gain == blockPQ.topKey()); - auto [to, gain] = computeBestTargetBlock(phg, gain_cache, u, phg.partID(u)); - - if (gain >= estimated_gain) { // accept any gain that is at least as good - m.node = u; m.to = to; m.from = from; - m.gain = gain; - runStats.extractions++; - vertexPQs[from].deleteTop(); // blockPQ updates are done later, collectively. 
- return true; - } else { - runStats.retries++; - vertexPQs[from].adjustKey(u, gain); - sharedData.targetPart[u] = to; - if (vertexPQs[from].topKey() != blockPQ.keyOf(from)) { - blockPQ.adjustKey(from, vertexPQs[from].topKey()); - } - } - } + bool dispatchedFindMoves(LocalFM& local_fm, PartitionedHypergraph& phg, size_t task_id, size_t num_seeds, size_t) { + LocalGainCacheStrategy local_strategy = local_fm.template initializeDispatchedStrategy(); + return local_fm.findMoves(local_strategy, phg, task_id, num_seeds); } - void clearPQs(const size_t /* bestImprovementIndex */ ) { - // release all nodes that were not moved - const bool release = sharedData.release_nodes - && runStats.moves > 0; - - if (release) { - // Release all nodes contained in PQ - for (PartitionID i = 0; i < context.partition.k; ++i) { - for (PosT j = 0; j < vertexPQs[i].size(); ++j) { - const HypernodeID v = vertexPQs[i].at(j); - sharedData.nodeTracker.releaseNode(v); - } - } - } - - for (PartitionID i = 0; i < context.partition.k; ++i) { - vertexPQs[i].clear(); - } - blockPQ.clear(); + private: + virtual void findMovesImpl(localized_k_way_fm_t local_fm, mt_kahypar_partitioned_hypergraph_t& phg, + size_t num_tasks, size_t num_seeds, size_t round) final { + Base::findMovesWithConcreteStrategy( + local_fm, phg, num_tasks, num_seeds, round); } - - // We're letting the FM details implementation decide what happens here, since some may not want to do gain cache updates, - // but rather update gains in their PQs or something - - template - MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE - void deltaGainUpdates(PartitionedHypergraph& phg, - GainCache& gain_cache, - const SyncronizedEdgeUpdate& sync_update) { - gain_cache.deltaGainUpdate(phg, sync_update); + virtual bool isUnconstrainedRoundImpl(size_t) const final { + return false; } - void changeNumberOfBlocks(const PartitionID new_k) { - blockPQ.resize(new_k); - for ( VertexPriorityQueue& pq : vertexPQs ) { - pq.setHandle(sharedData.vertexPQHandles.data(), sharedData.numberOfNodes); - } - while ( static_cast(new_k) > vertexPQs.size() ) { - vertexPQs.emplace_back(sharedData.vertexPQHandles.data(), sharedData.numberOfNodes); - } + virtual bool includesUnconstrainedImpl() const final { + return false; } - - void memoryConsumption(utils::MemoryTreeNode *parent) const { - size_t vertex_pq_sizes = std::accumulate( - vertexPQs.begin(), vertexPQs.end(), 0, - [](size_t init, const VertexPriorityQueue& pq) { return init + pq.size_in_bytes(); } - ); - parent->addChild("PQs", blockPQ.size_in_bytes() + vertex_pq_sizes); - } - -private: - - - MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE - void updatePQs() { - for (PartitionID i = 0; i < context.partition.k; ++i) { - if (!vertexPQs[i].empty()) { - blockPQ.insertOrAdjustKey(i, vertexPQs[i].topKey()); - } else if (blockPQ.contains(i)) { - blockPQ.remove(i); - } - } - } - - template - MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE - std::pair computeBestTargetBlock(const PartitionedHypergraph& phg, - const GainCache& gain_cache, - const HypernodeID u, - const PartitionID from) { - const HypernodeWeight wu = phg.nodeWeight(u); - const HypernodeWeight from_weight = phg.partWeight(from); - PartitionID to = kInvalidPartition; - HyperedgeWeight to_benefit = std::numeric_limits::min(); - HypernodeWeight best_to_weight = from_weight - wu; - for ( const PartitionID& i : gain_cache.adjacentBlocks(u) ) { - if (i != from) { - const HypernodeWeight to_weight = phg.partWeight(i); - const HyperedgeWeight penalty = gain_cache.benefitTerm(u, i); - if ( ( penalty > to_benefit || ( penalty == 
to_benefit && to_weight < best_to_weight ) ) && - to_weight + wu <= context.partition.max_part_weights[i] ) { - to_benefit = penalty; - to = i; - best_to_weight = to_weight; - } - } - } - const Gain gain = to != kInvalidPartition ? to_benefit - gain_cache.penaltyTerm(u, phg.partID(u)) - : std::numeric_limits::min(); - return std::make_pair(to, gain); - } - - template - MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE - std::pair bestOfThree(const PartitionedHypergraph& phg, - const GainCache& gain_cache, - HypernodeID u, - PartitionID from, - std::array parts) { - - const HypernodeWeight wu = phg.nodeWeight(u); - const HypernodeWeight from_weight = phg.partWeight(from); - PartitionID to = kInvalidPartition; - HyperedgeWeight to_benefit = std::numeric_limits::min(); - HypernodeWeight best_to_weight = from_weight - wu; - for (PartitionID i : parts) { - if (i != from && i != kInvalidPartition) { - const HypernodeWeight to_weight = phg.partWeight(i); - const HyperedgeWeight penalty = gain_cache.benefitTerm(u, i); - if ( ( penalty > to_benefit || ( penalty == to_benefit && to_weight < best_to_weight ) ) && - to_weight + wu <= context.partition.max_part_weights[i] ) { - to_benefit = penalty; - to = i; - best_to_weight = to_weight; - } - } - } - const Gain gain = to != kInvalidPartition ? to_benefit - gain_cache.penaltyTerm(u, phg.partID(u)) - : std::numeric_limits::min(); - return std::make_pair(to, gain); - } - - const Context& context; - - FMStats& runStats; - -protected: - FMSharedData& sharedData; - - // ! Priority Queue that contains for each block of the partition - // ! the vertex with the best gain value - BlockPriorityQueue blockPQ; - - // ! From PQs -> For each block it contains the vertices (contained - // ! in that block) touched by the current local search associated - // ! with their gain values - vec vertexPQs; }; -} \ No newline at end of file +} diff --git a/mt-kahypar/partition/refinement/fm/strategies/i_fm_strategy.h b/mt-kahypar/partition/refinement/fm/strategies/i_fm_strategy.h new file mode 100644 index 000000000..4dbf7a054 --- /dev/null +++ b/mt-kahypar/partition/refinement/fm/strategies/i_fm_strategy.h @@ -0,0 +1,143 @@ +/******************************************************************************* + * MIT License + * + * This file is part of Mt-KaHyPar. + * + * Copyright (C) 2023 Nikolai Maas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ ******************************************************************************/ + +#pragma once + +#include + +#include "mt-kahypar/datastructures/streaming_vector.h" +#include "mt-kahypar/macros.h" +#include "mt-kahypar/definitions.h" +#include "mt-kahypar/utils/cast.h" + +namespace mt_kahypar { + +// TODO: this is still a bit hacky, is there any better way? +struct localized_k_way_fm_s; + +struct localized_k_way_fm_t { + localized_k_way_fm_s* local_fm; + mt_kahypar_partition_type_t type; +}; + +namespace utils { +// compare cast.h +template +localized_k_way_fm_t localized_fm_cast(tbb::enumerable_thread_specific& local_fm) { + return localized_k_way_fm_t { + reinterpret_cast(&local_fm), LocalFM::PartitionedHypergraph::TYPE }; +} + +template +tbb::enumerable_thread_specific& cast(localized_k_way_fm_t fm) { + if ( LocalFM::PartitionedHypergraph::TYPE != fm.type ) { + ERR("Cannot cast local FM [" << typeToString(fm.type) << "to" + << typeToString(LocalFM::PartitionedHypergraph::TYPE) << "]"); + } + return *reinterpret_cast*>(fm.local_fm); +} + +} // namespace utils + + +class IFMStrategy { + public: + // !!! The following declarations should be present in subclasses: + // using LocalFM = ...; + // using PartitionedHypergraph = ...; + + IFMStrategy(const IFMStrategy&) = delete; + IFMStrategy(IFMStrategy&&) = delete; + IFMStrategy & operator= (const IFMStrategy &) = delete; + IFMStrategy & operator= (IFMStrategy &&) = delete; + + virtual ~IFMStrategy() = default; + + void findMoves(localized_k_way_fm_t local_fm, mt_kahypar_partitioned_hypergraph_t& phg, + size_t num_tasks, size_t num_seeds, size_t round) { + findMovesImpl(local_fm, phg, num_tasks, num_seeds, round); + } + + bool isUnconstrainedRound(size_t round) const { + return isUnconstrainedRoundImpl(round); + } + + bool includesUnconstrained() const { + return includesUnconstrainedImpl(); + } + + void reportImprovement(size_t round, Gain absolute_improvement, double relative_improvement) { + reportImprovementImpl(round, absolute_improvement, relative_improvement); + } + + // !!! 
The following method should be present in subclasses: + // bool dispatchedFindMoves(LocalFM& local_fm, PartitionedHypergraph& phg, + // size_t task_id, size_t num_seeds, size_t round); + + protected: + IFMStrategy(const Context& context, FMSharedData& sharedData): + context(context), sharedData(sharedData) { } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void findMovesWithConcreteStrategy(localized_k_way_fm_t local_fm, mt_kahypar_partitioned_hypergraph_t& hypergraph, + size_t num_tasks, size_t num_seeds, size_t round) { + using LocalFM = typename Derived::LocalFM; + using PartitionedHypergraph = typename Derived::PartitionedHypergraph; + Derived& concrete_strategy = *static_cast(this); + tbb::enumerable_thread_specific& ets_fm = utils::cast(local_fm); + PartitionedHypergraph& phg = utils::cast(hypergraph); + tbb::task_group tg; + + auto task = [&](const size_t task_id) { + LocalFM& fm = ets_fm.local(); + while(sharedData.finishedTasks.load(std::memory_order_relaxed) < sharedData.finishedTasksLimit + && concrete_strategy.dispatchedFindMoves(fm, phg, task_id, num_seeds, round)) { /* keep running*/ } + sharedData.finishedTasks.fetch_add(1, std::memory_order_relaxed); + }; + for (size_t i = 0; i < num_tasks; ++i) { + tg.run(std::bind(task, i)); + } + tg.wait(); + } + + const Context& context; + FMSharedData& sharedData; + + private: + virtual void findMovesImpl(localized_k_way_fm_t local_fm, mt_kahypar_partitioned_hypergraph_t& phg, + size_t num_tasks, size_t num_seeds, size_t round) = 0; + + virtual bool isUnconstrainedRoundImpl(size_t round) const = 0; + + virtual bool includesUnconstrainedImpl() const = 0; + + virtual void reportImprovementImpl(size_t, Gain, double) { + // most strategies don't use this + } +}; + +} // namespace mt_kahypar diff --git a/mt-kahypar/partition/refinement/fm/strategies/local_gain_cache_strategy.h b/mt-kahypar/partition/refinement/fm/strategies/local_gain_cache_strategy.h new file mode 100644 index 000000000..6c9508f83 --- /dev/null +++ b/mt-kahypar/partition/refinement/fm/strategies/local_gain_cache_strategy.h @@ -0,0 +1,284 @@ +/******************************************************************************* + * MIT License + * + * This file is part of Mt-KaHyPar. + * + * Copyright (C) 2020 Lars Gottesbüren + * Copyright (C) 2023 Nikolai Maas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ ******************************************************************************/ + +#pragma once + +#include "mt-kahypar/partition/refinement/fm/fm_commons.h" + + +namespace mt_kahypar { + + /* + * LocalFMStrategy interface + * static constexpr bool uses_gain_cache + * static constexpr bool maintain_gain_cache_between_rounds + * static constexpr bool is_unconstrained + * + * Constructor(context, sharedData, blockPQ, vertexPQs, runStats) + * insertIntoPQ(phg, gain_cache, node) + * updateGain(phg, gain_cache, node, move) + * findNextMove(phg, gain_cache, move) + * applyMove(phg, gain_cache, move, global) + * reset() + * deltaGainUpdates(phg, gain_cache, sync_update) + * + */ + +class LocalGainCacheStrategy { +public: + + using BlockPriorityQueue = ds::ExclusiveHandleHeap< ds::MaxHeap >; + using VertexPriorityQueue = ds::MaxHeap; // these need external handles + + static constexpr bool uses_gain_cache = true; + static constexpr bool maintain_gain_cache_between_rounds = true; + static constexpr bool is_unconstrained = false; + + LocalGainCacheStrategy(const Context& context, + FMSharedData& sharedData, + BlockPriorityQueue& blockPQ, + vec& vertexPQs, + FMStats& runStats) : + context(context), + runStats(runStats), + sharedData(sharedData), + blockPQ(blockPQ), + vertexPQs(vertexPQs) { } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void insertIntoPQ(const PartitionedHypergraph& phg, + const GainCache& gain_cache, + const HypernodeID v) { + const PartitionID pv = phg.partID(v); + ASSERT(pv < context.partition.k); + auto [target, gain] = computeBestTargetBlock(phg, gain_cache, v, pv); + ASSERT(target < context.partition.k, V(target) << V(context.partition.k)); + sharedData.targetPart[v] = target; + vertexPQs[pv].insert(v, gain); // blockPQ updates are done later, collectively. + runStats.pushes++; + } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void updateGain(const PartitionedHypergraph& phg, + const GainCache& gain_cache, + const HypernodeID v, + const Move& move) { + const PartitionID pv = phg.partID(v); + ASSERT(vertexPQs[pv].contains(v)); + const PartitionID designatedTargetV = sharedData.targetPart[v]; + Gain gain = 0; + PartitionID newTarget = kInvalidPartition; + + if (context.partition.k < 4 || designatedTargetV == move.from || designatedTargetV == move.to) { + // penalty term of designatedTargetV is affected. + // and may now be greater than that of other blocks --> recompute full + std::tie(newTarget, gain) = computeBestTargetBlock(phg, gain_cache, v, pv); + } else { + // penalty term of designatedTargetV is not affected. 
+ // only move.from and move.to may be better + std::tie(newTarget, gain) = bestOfThree(phg, gain_cache, + v, pv, { designatedTargetV, move.from, move.to }); + } + + sharedData.targetPart[v] = newTarget; + vertexPQs[pv].adjustKey(v, gain); + } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + bool findNextMove(const PartitionedHypergraph& phg, + const GainCache& gain_cache, + Move& m) { + updatePQs(); + + if (blockPQ.empty()) { + return false; + } + + while (true) { + const PartitionID from = blockPQ.top(); + const HypernodeID u = vertexPQs[from].top(); + const Gain estimated_gain = vertexPQs[from].topKey(); + ASSERT(estimated_gain == blockPQ.topKey()); + auto [to, gain] = computeBestTargetBlock(phg, gain_cache, u, phg.partID(u)); + + if (gain >= estimated_gain) { // accept any gain that is at least as good + m.node = u; m.to = to; m.from = from; + m.gain = gain; + runStats.extractions++; + vertexPQs[from].deleteTop(); // blockPQ updates are done later, collectively. + return true; + } else { + runStats.retries++; + vertexPQs[from].adjustKey(u, gain); + sharedData.targetPart[u] = to; + if (vertexPQs[from].topKey() != blockPQ.keyOf(from)) { + blockPQ.adjustKey(from, vertexPQs[from].topKey()); + } + } + } + } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void applyMove(const PartitionedHypergraph&, const GainCache&, Move, bool) { + // nothing to do here + } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void revertMove(const PartitionedHypergraph&, const GainCache&, Move, bool) { + // nothing to do here + } + + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void flushLocalChanges() { + // nothing to do here + } + + void reset() { + // release all nodes that were not moved + const bool release = sharedData.release_nodes + && runStats.moves > 0; + + if (release) { + // Release all nodes contained in PQ + for (PartitionID i = 0; i < context.partition.k; ++i) { + for (PosT j = 0; j < vertexPQs[i].size(); ++j) { + const HypernodeID v = vertexPQs[i].at(j); + sharedData.nodeTracker.releaseNode(v); + } + } + } + + for (PartitionID i = 0; i < context.partition.k; ++i) { + vertexPQs[i].clear(); + } + blockPQ.clear(); + } + + + // We're letting the FM details implementation decide what happens here, since some may not want to do gain cache updates, + // but rather update gains in their PQs or something + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void deltaGainUpdates(PartitionedHypergraph& phg, + GainCache& gain_cache, + const SynchronizedEdgeUpdate& sync_update) { + gain_cache.deltaGainUpdate(phg, sync_update); + } + +private: + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void updatePQs() { + for (PartitionID i = 0; i < context.partition.k; ++i) { + if (!vertexPQs[i].empty()) { + blockPQ.insertOrAdjustKey(i, vertexPQs[i].topKey()); + } else if (blockPQ.contains(i)) { + blockPQ.remove(i); + } + } + } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + std::pair computeBestTargetBlock(const PartitionedHypergraph& phg, + const GainCache& gain_cache, + const HypernodeID u, + const PartitionID from) { + const HypernodeWeight wu = phg.nodeWeight(u); + const HypernodeWeight from_weight = phg.partWeight(from); + PartitionID to = kInvalidPartition; + HyperedgeWeight to_benefit = std::numeric_limits::min(); + HypernodeWeight best_to_weight = from_weight - wu; + for ( const PartitionID& i : gain_cache.adjacentBlocks(u) ) { + if (i != from) { + const HypernodeWeight to_weight = phg.partWeight(i); + const HyperedgeWeight penalty = gain_cache.benefitTerm(u, i); + if ( ( penalty > to_benefit || ( penalty == 
to_benefit && to_weight < best_to_weight ) ) && + to_weight + wu <= context.partition.max_part_weights[i] ) { + to_benefit = penalty; + to = i; + best_to_weight = to_weight; + } + } + } + const Gain gain = to != kInvalidPartition ? to_benefit - gain_cache.penaltyTerm(u, phg.partID(u)) + : std::numeric_limits::min(); + return std::make_pair(to, gain); + } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + std::pair bestOfThree(const PartitionedHypergraph& phg, + const GainCache& gain_cache, + HypernodeID u, + PartitionID from, + std::array parts) { + + const HypernodeWeight wu = phg.nodeWeight(u); + const HypernodeWeight from_weight = phg.partWeight(from); + PartitionID to = kInvalidPartition; + HyperedgeWeight to_benefit = std::numeric_limits::min(); + HypernodeWeight best_to_weight = from_weight - wu; + for (PartitionID i : parts) { + if (i != from && i != kInvalidPartition) { + const HypernodeWeight to_weight = phg.partWeight(i); + const HyperedgeWeight penalty = gain_cache.benefitTerm(u, i); + if ( ( penalty > to_benefit || ( penalty == to_benefit && to_weight < best_to_weight ) ) && + to_weight + wu <= context.partition.max_part_weights[i] ) { + to_benefit = penalty; + to = i; + best_to_weight = to_weight; + } + } + } + const Gain gain = to != kInvalidPartition ? to_benefit - gain_cache.penaltyTerm(u, phg.partID(u)) + : std::numeric_limits::min(); + return std::make_pair(to, gain); + } + + const Context& context; + + FMStats& runStats; + +protected: + FMSharedData& sharedData; + + // ! Priority Queue that contains for each block of the partition + // ! the vertex with the best gain value + BlockPriorityQueue& blockPQ; + + // ! From PQs -> For each block it contains the vertices (contained + // ! in that block) touched by the current local search associated + // ! with their gain values + vec& vertexPQs; +}; + +} diff --git a/mt-kahypar/partition/refinement/fm/strategies/local_unconstrained_strategy.h b/mt-kahypar/partition/refinement/fm/strategies/local_unconstrained_strategy.h new file mode 100644 index 000000000..d3ad8c0a5 --- /dev/null +++ b/mt-kahypar/partition/refinement/fm/strategies/local_unconstrained_strategy.h @@ -0,0 +1,373 @@ +/******************************************************************************* + * MIT License + * + * This file is part of Mt-KaHyPar. + * + * Copyright (C) 2023 Nikolai Maas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ ******************************************************************************/ + +#pragma once + +#include "mt-kahypar/datastructures/sparse_map.h" +#include "mt-kahypar/partition/refinement/fm/fm_commons.h" + + +namespace mt_kahypar { + + /* + * LocalFMStrategy interface + * static constexpr bool uses_gain_cache + * static constexpr bool maintain_gain_cache_between_rounds + * static constexpr bool is_unconstrained + * + * Constructor(context, sharedData, blockPQ, vertexPQs, runStats) + * insertIntoPQ(phg, gain_cache, node) + * updateGain(phg, gain_cache, node, move) + * findNextMove(phg, gain_cache, move) + * applyMove(phg, gain_cache, move, global) + * reset() + * deltaGainUpdates(phg, gain_cache, sync_update) + * + */ + +class LocalUnconstrainedStrategy { + using VirtualWeightMap = ds::SparseMap; + + public: + using BlockPriorityQueue = ds::ExclusiveHandleHeap< ds::MaxHeap >; + using VertexPriorityQueue = ds::MaxHeap; // these need external handles + + static constexpr bool uses_gain_cache = true; + static constexpr bool maintain_gain_cache_between_rounds = true; + static constexpr bool is_unconstrained = true; + + LocalUnconstrainedStrategy(const Context& context, + FMSharedData& sharedData, + BlockPriorityQueue& blockPQ, + vec& vertexPQs, + FMStats& runStats) : + context(context), + runStats(runStats), + sharedData(sharedData), + blockPQ(blockPQ), + vertexPQs(vertexPQs), + localVirtualWeightDelta(context.partition.k), + penaltyFactor(context.refinement.fm.imbalance_penalty_max), + upperBound(context.refinement.fm.unconstrained_upper_bound) { } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void insertIntoPQ(const PartitionedHypergraph& phg, + const GainCache& gain_cache, + const HypernodeID v) { + const PartitionID pv = phg.partID(v); + ASSERT(pv < context.partition.k); + auto [target, gain] = computeBestTargetBlock(phg, gain_cache, v, pv); + ASSERT(target < context.partition.k); + sharedData.targetPart[v] = target; + vertexPQs[pv].insert(v, gain); // blockPQ updates are done later, collectively. + runStats.pushes++; + } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void updateGain(const PartitionedHypergraph& phg, + const GainCache& gain_cache, + const HypernodeID v, + const Move& move) { + const PartitionID pv = phg.partID(v); + ASSERT(vertexPQs[pv].contains(v)); + const PartitionID designatedTargetV = sharedData.targetPart[v]; + Gain gain = 0; + PartitionID newTarget = kInvalidPartition; + + if (context.partition.k < 4 || designatedTargetV == move.from || designatedTargetV == move.to) { + // penalty term of designatedTargetV is affected. + // and may now be greater than that of other blocks --> recompute full + std::tie(newTarget, gain) = computeBestTargetBlock(phg, gain_cache, v, pv); + } else { + // penalty term of designatedTargetV is not affected. 
+ // only move.from and move.to may be better + std::tie(newTarget, gain) = bestOfThree(phg, gain_cache, + v, pv, { designatedTargetV, move.from, move.to }); + } + + sharedData.targetPart[v] = newTarget; + vertexPQs[pv].adjustKey(v, gain); + } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + bool findNextMove(const PartitionedHypergraph& phg, + const GainCache& gain_cache, + Move& m) { + updatePQs(); + + if (blockPQ.empty()) { + return false; + } + + while (true) { + const PartitionID from = blockPQ.top(); + const HypernodeID u = vertexPQs[from].top(); + const Gain estimated_gain = vertexPQs[from].topKey(); + ASSERT(estimated_gain == blockPQ.topKey()); + auto [to, gain] = computeBestTargetBlock(phg, gain_cache, u, phg.partID(u)); + + bool apply_move = (gain >= estimated_gain); // accept any gain that is at least as good + if (apply_move && to != kInvalidPartition && penaltyFactor > 0) { + const HypernodeWeight wu = phg.nodeWeight(u); + const HypernodeWeight to_weight = phg.partWeight(to); + if (upperBound >= 1 && to_weight + wu > upperBound * context.partition.max_part_weights[to]) { + apply_move = false; + } else if (to_weight + wu > context.partition.max_part_weights[to]) { + const Gain imbalance_penalty = estimatePenalty(to, to_weight, wu); + if (imbalance_penalty != std::numeric_limits::max()) { + Gain new_gain = gain_cache.gain(u, from, to) - std::ceil(penaltyFactor * imbalance_penalty); + gain = new_gain; + } else { + apply_move = false; + } + } + } + + if (apply_move) { + m.node = u; m.to = to; m.from = from; + m.gain = gain; + runStats.extractions++; + vertexPQs[from].deleteTop(); // blockPQ updates are done later, collectively. + return true; + } else { + runStats.retries++; + vertexPQs[from].adjustKey(u, gain); + sharedData.targetPart[u] = to; + if (vertexPQs[from].topKey() != blockPQ.keyOf(from)) { + blockPQ.adjustKey(from, vertexPQs[from].topKey()); + } + } + } + } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void applyMove(const PartitionedHypergraph& phg, const GainCache&, Move m, bool global) { + if (sharedData.unconstrained.isRebalancingNode(m.node)) { + // If a node is moved which is already in use for penalty estimation, we need to make + // an adjustment so future estimations are not overly optimistic (since in reality, the + // node is not available anymore). 
This is achieved by increasing the "virtual" weight of + // the origin block, thus pessimizing future estimations + if (global) { + sharedData.unconstrained.virtualWeightDelta(m.from).fetch_add( + phg.nodeWeight(m.node), std::memory_order_relaxed); + } else { + localVirtualWeightDelta[m.from] += phg.nodeWeight(m.node); + } + } + } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void revertMove(const PartitionedHypergraph& phg, const GainCache&, Move m, bool global) { + if (sharedData.unconstrained.isRebalancingNode(m.node)) { + if (global) { + sharedData.unconstrained.virtualWeightDelta(m.from).fetch_sub( + phg.nodeWeight(m.node), std::memory_order_relaxed); + } else { + localVirtualWeightDelta[m.from] -= phg.nodeWeight(m.node); + } + } + } + + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void flushLocalChanges() { + for (auto [block, delta]: localVirtualWeightDelta) { + ASSERT(delta >= 0); + sharedData.unconstrained.virtualWeightDelta(block).fetch_add(delta, std::memory_order_relaxed); + } + localVirtualWeightDelta.clear(); + } + + void reset() { + // release all nodes that were not moved + const bool release = sharedData.release_nodes + && runStats.moves > 0; + + if (release) { + // Release all nodes contained in PQ + for (PartitionID i = 0; i < context.partition.k; ++i) { + for (PosT j = 0; j < vertexPQs[i].size(); ++j) { + const HypernodeID v = vertexPQs[i].at(j); + sharedData.nodeTracker.releaseNode(v); + } + } + } + + for (PartitionID i = 0; i < context.partition.k; ++i) { + vertexPQs[i].clear(); + } + blockPQ.clear(); + localVirtualWeightDelta.clear(); + } + + + // We're letting the FM details implementation decide what happens here, since some may not want to do gain cache updates, + // but rather update gains in their PQs or something + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void deltaGainUpdates(PartitionedHypergraph& phg, + GainCache& gain_cache, + const SynchronizedEdgeUpdate& sync_update) { + gain_cache.deltaGainUpdate(phg, sync_update); + } + + void setPenaltyFactor(double penalty) { + ASSERT(penalty >= 0 && penalty <= 1); + penaltyFactor = penalty; + } + + void setUpperBound(double upper_bound) { + upperBound = upper_bound; + } + +private: + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + void updatePQs() { + for (PartitionID i = 0; i < context.partition.k; ++i) { + if (!vertexPQs[i].empty()) { + blockPQ.insertOrAdjustKey(i, vertexPQs[i].topKey()); + } else if (blockPQ.contains(i)) { + blockPQ.remove(i); + } + } + } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + std::pair computeBestTargetBlock(const PartitionedHypergraph& phg, + const GainCache& gain_cache, + const HypernodeID u, + const PartitionID from) const { + const HypernodeWeight wu = phg.nodeWeight(u); + const HypernodeWeight from_weight = phg.partWeight(from); + PartitionID to = kInvalidPartition; + HyperedgeWeight to_benefit = std::numeric_limits::min(); + HypernodeWeight best_to_weight = from_weight - wu; + for (PartitionID i = 0; i < context.partition.k; ++i) { + if (i != from) { + const HypernodeWeight to_weight = phg.partWeight(i); + const HypernodeWeight max_weight = context.partition.max_part_weights[i]; + HyperedgeWeight benefit = gain_cache.benefitTerm(u, i); + if (upperBound >= 1 && to_weight + wu > upperBound * max_weight) { + continue; + } else if (to_weight + wu > max_weight && benefit <= to_benefit) { + // don't take imbalanced move without improved gain + continue; + } else if (to_weight + wu > max_weight && penaltyFactor > 0) { + const Gain imbalance_penalty = estimatePenalty(i, to_weight, wu); + if 
(imbalance_penalty == std::numeric_limits::max()) { + continue; + } + benefit -= std::ceil(penaltyFactor * imbalance_penalty); + } + if ( benefit > to_benefit || ( benefit == to_benefit && to_weight < best_to_weight ) ) { + to_benefit = benefit; + to = i; + best_to_weight = to_weight; + } + } + } + ASSERT(from == phg.partID(u)); + const Gain gain = to != kInvalidPartition ? to_benefit - gain_cache.penaltyTerm(u, from) + : std::numeric_limits::min(); + return std::make_pair(to, gain); + } + + template + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + std::pair bestOfThree(const PartitionedHypergraph& phg, + const GainCache& gain_cache, + HypernodeID u, + PartitionID from, + std::array parts) const { + + const HypernodeWeight wu = phg.nodeWeight(u); + const HypernodeWeight from_weight = phg.partWeight(from); + PartitionID to = kInvalidPartition; + HyperedgeWeight to_benefit = std::numeric_limits::min(); + HypernodeWeight best_to_weight = from_weight - wu; + for (PartitionID i : parts) { + if (i != from && i != kInvalidPartition) { + const HypernodeWeight to_weight = phg.partWeight(i); + HyperedgeWeight benefit = gain_cache.benefitTerm(u, i); + if (upperBound >= 1 && to_weight + wu > upperBound * context.partition.max_part_weights[i]) { + continue; + } else if (to_weight + wu > context.partition.max_part_weights[i] && penaltyFactor > 0) { + const Gain imbalance_penalty = estimatePenalty(i, to_weight, wu); + if (imbalance_penalty == std::numeric_limits::max()) { + continue; + } + benefit -= std::ceil(penaltyFactor * imbalance_penalty); + } + if ( benefit > to_benefit || ( benefit == to_benefit && to_weight < best_to_weight ) ) { + to_benefit = benefit; + to = i; + best_to_weight = to_weight; + } + } + } + ASSERT(from == phg.partID(u)); + const Gain gain = to != kInvalidPartition ? to_benefit - gain_cache.penaltyTerm(u, from) + : std::numeric_limits::min(); + return std::make_pair(to, gain); + } + + MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE + Gain estimatePenalty(PartitionID to, HypernodeWeight to_weight, HypernodeWeight wu) const { + HypernodeWeight virtual_delta = sharedData.unconstrained.virtualWeightDelta(to).load(std::memory_order_relaxed) + + localVirtualWeightDelta.getOrDefault(to); + HypernodeWeight initial_imbalance = to_weight + virtual_delta - context.partition.max_part_weights[to]; + return sharedData.unconstrained.estimatePenaltyForImbalancedMove(to, initial_imbalance, wu); + } + + const Context& context; + + FMStats& runStats; + + FMSharedData& sharedData; + + // ! Priority Queue that contains for each block of the partition + // ! the vertex with the best gain value + BlockPriorityQueue& blockPQ; + + // ! From PQs -> For each block it contains the vertices (contained + // ! in that block) touched by the current local search associated + // ! with their gain values + vec& vertexPQs; + + // ! Virtual block weights are saved as delta to the actual block weight. They + // ! are necessary to ensure a reasonable penalty estimation in some edge cases. + VirtualWeightMap localVirtualWeightDelta; + + double penaltyFactor; + double upperBound; +}; + +} diff --git a/mt-kahypar/partition/refinement/fm/strategies/unconstrained_strategy.h b/mt-kahypar/partition/refinement/fm/strategies/unconstrained_strategy.h new file mode 100644 index 000000000..c4929734a --- /dev/null +++ b/mt-kahypar/partition/refinement/fm/strategies/unconstrained_strategy.h @@ -0,0 +1,165 @@ +/******************************************************************************* + * MIT License + * + * This file is part of Mt-KaHyPar. 
+ * + * Copyright (C) 2023 Nikolai Maas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + ******************************************************************************/ + +#pragma once + +#include "mt-kahypar/partition/refinement/fm/localized_kway_fm_core.h" +#include "mt-kahypar/partition/refinement/fm/strategies/i_fm_strategy.h" +#include "mt-kahypar/partition/refinement/fm/strategies/local_gain_cache_strategy.h" +#include "mt-kahypar/partition/refinement/fm/strategies/local_unconstrained_strategy.h" + + +namespace mt_kahypar { + +template +class UnconstrainedStrategy: public IFMStrategy { + using Base = IFMStrategy; + static constexpr bool debug = false; + + public: + using LocalFM = LocalizedKWayFM; + using PartitionedHypergraph = typename TypeTraits::PartitionedHypergraph; + + UnconstrainedStrategy(const Context& context, FMSharedData& sharedData): + Base(context, sharedData), + current_penalty(context.refinement.fm.imbalance_penalty_min), + current_upper_bound(context.refinement.fm.unconstrained_upper_bound), + absolute_improvement_first_round(kInvalidGain), + unconstrained_is_enabled(true), + stats(utils::Utilities::instance().getStats(context.utility_id)) { + ASSERT(!context.refinement.fm.activate_unconstrained_dynamically + || context.refinement.fm.multitry_rounds > 2); + } + + bool dispatchedFindMoves(LocalFM& local_fm, PartitionedHypergraph& phg, size_t task_id, size_t num_seeds, size_t round) { + if (isUnconstrainedRound(round)) { + LocalUnconstrainedStrategy local_strategy = local_fm.template initializeDispatchedStrategy(); + local_strategy.setPenaltyFactor(current_penalty); + local_strategy.setUpperBound(current_upper_bound); + return local_fm.findMoves(local_strategy, phg, task_id, num_seeds); + } else { + LocalGainCacheStrategy local_strategy = local_fm.template initializeDispatchedStrategy(); + return local_fm.findMoves(local_strategy, phg, task_id, num_seeds); + } + } + + private: + virtual void findMovesImpl(localized_k_way_fm_t local_fm, mt_kahypar_partitioned_hypergraph_t& phg, + size_t num_tasks, size_t num_seeds, size_t round) final { + initRound(round); + + Base::findMovesWithConcreteStrategy( + local_fm, phg, num_tasks, num_seeds, round); + } + + virtual bool isUnconstrainedRoundImpl(size_t round) const final { + if (round > 0 && !unconstrained_is_enabled) { + return false; + } + if (context.refinement.fm.activate_unconstrained_dynamically) { + return round == 1 || (round > 1 && round - 2 < context.refinement.fm.unconstrained_rounds); + } 
else { + return round < context.refinement.fm.unconstrained_rounds; + } + } + + virtual bool includesUnconstrainedImpl() const final { + return true; + } + + virtual void reportImprovementImpl(size_t round, Gain absolute_improvement, double relative_improvement) final { + if (round == 0) { + absolute_improvement_first_round = absolute_improvement; + } else if (round == 1 + && context.refinement.fm.activate_unconstrained_dynamically + && absolute_improvement < absolute_improvement_first_round) { + // this is the decision point whether unconstrained or constrained FM is used + unconstrained_is_enabled = false; + DBG << "Disabling unconstrained FM after test round: " << V(absolute_improvement) << V(absolute_improvement_first_round); + } else if (relative_improvement < context.refinement.fm.unconstrained_min_improvement) { + unconstrained_is_enabled = false; + DBG << "Disabling unconstrained FM due to too little improvement:" << V(relative_improvement); + } + if (round == 1) { + stats.update_stat("top-level-ufm-active", unconstrained_is_enabled); + if (unconstrained_is_enabled) { + stats.update_stat("ufm-active-levels", 1); + } else { + stats.update_stat("ufm-inactive-levels", 1); + } + } + } + + void initRound(size_t round) { + if (round == 0) { + unconstrained_is_enabled = true; + } + if (context.refinement.fm.activate_unconstrained_dynamically) { + if (round == 1) { + current_penalty = context.refinement.fm.penalty_for_activation_test; + current_upper_bound = context.refinement.fm.unconstrained_upper_bound; + } else if (round > 1 && isUnconstrainedRound(round)) { + size_t n_rounds = std::min(context.refinement.fm.unconstrained_rounds, context.refinement.fm.multitry_rounds - 2); + calculateInterpolation(round - 2, n_rounds); + } + } else if (isUnconstrainedRound(round)) { + calculateInterpolation(round, context.refinement.fm.unconstrained_rounds); + } + } + + void calculateInterpolation(size_t round, size_t n_rounds) { + ASSERT(unconstrained_is_enabled && round < context.refinement.fm.multitry_rounds); + auto interpolate = [&](double start, double end) { + if (round == 0) { + return start; + } + double summed = (n_rounds - round - 1) * start + round * end; + return summed / static_cast(n_rounds - 1); + }; + + if (round < n_rounds) { + // interpolate values for current penalty and upper bound + current_penalty = interpolate(context.refinement.fm.imbalance_penalty_min, + context.refinement.fm.imbalance_penalty_max); + if (context.refinement.fm.unconstrained_upper_bound >= 1) { + if (context.refinement.fm.unconstrained_upper_bound_min >= 1) { + current_upper_bound = interpolate(context.refinement.fm.unconstrained_upper_bound, + context.refinement.fm.unconstrained_upper_bound_min); + } else { + current_upper_bound = context.refinement.fm.unconstrained_upper_bound; + } + } + } + } + + double current_penalty; + double current_upper_bound; + Gain absolute_improvement_first_round; + bool unconstrained_is_enabled; + utils::Stats& stats; +}; + +} diff --git a/mt-kahypar/partition/refinement/gains/README.md b/mt-kahypar/partition/refinement/gains/README.md index 22a4d027e..c19432d87 100644 --- a/mt-kahypar/partition/refinement/gains/README.md +++ b/mt-kahypar/partition/refinement/gains/README.md @@ -36,11 +36,11 @@ Our label propagation algorithm iterates over all nodes in parallel and moves ea The gain of a node move can change between its initial calculation and execution due to concurrent node moves in its neighborhood. 
We therefore double-check the gain of a node move at the time performed on the partition via synchronized data structure updates. This technique is called *attributed gains*. The label propagation algorithm reverts node moves that worsen the solution quality by checking the attributed gain value. The attributed gain function implements the following interface: ```cpp -static HyperedgeWeight gain(const SyncronizedEdgeUpdate& sync_update); +static HyperedgeWeight gain(const SynchronizedEdgeUpdate& sync_update); ``` -The ```SyncronizedEdgeUpdate``` structs contains the following members: +The ```SynchronizedEdgeUpdate``` struct contains the following members: ```cpp -struct SyncronizedEdgeUpdate { +struct SynchronizedEdgeUpdate { HyperedgeID he; PartitionID from; PartitionID to; @@ -56,7 +56,7 @@ struct SyncronizedEdgeUpdate { }; ``` -When we move a node from its *source* (```from```) to a *target* block (```to```), we iterate over all hyperedges, perform syncronized data structure updates and call this function for each incident hyperedge of the moved node. The sum of all calls to this function is the attributed gain of the node move. The most important parameters of the ```SyncronizedEdgeUpdate``` struct are ```pin_count_in_from_part_after``` and ```pin_count_in_to_part_after```, which are the number of pins contained in the source and target block of hyperedge ```he``` after the node move. For example, the node move removes an hyperedge from the cut if ```pin_count_in_to_part_after == edge_size```. If ```pin_count_in_from_part_after == 0```, then the node move reduces the connectivity of the hyperedge by one. Conversely, if ```pin_count_in_to_part_after == 1```, then the node move increases the connectivity of the hyperedge by one. +When we move a node from its *source* (```from```) to a *target* block (```to```), we iterate over all hyperedges, perform synchronized data structure updates and call this function for each incident hyperedge of the moved node. The sum of all calls to this function is the attributed gain of the node move. The most important parameters of the ```SynchronizedEdgeUpdate``` struct are ```pin_count_in_from_part_after``` and ```pin_count_in_to_part_after```, which are the number of pins contained in the source and target block of hyperedge ```he``` after the node move. For example, the node move removes a hyperedge from the cut if ```pin_count_in_to_part_after == edge_size```. If ```pin_count_in_from_part_after == 0```, then the node move reduces the connectivity of the hyperedge by one. Conversely, if ```pin_count_in_to_part_after == 1```, then the node move increases the connectivity of the hyperedge by one. ### Gain Computation @@ -92,7 +92,7 @@ Most notable is the delta gain update function, which has the following interfac ```cpp template<typename PartitionedHypergraph> void deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update); + const SynchronizedEdgeUpdate& sync_update); ``` If we move a node u to another block, we call this function for each incident hyperedge of u (similar to the attributed gains). The function should be used to update gain cache entries affected by the node move. @@ -158,4 +158,3 @@ To test your implementation, you can enable logging in our flow-based refinement ### Python Interface The Python interface is defined in ```python/module.cpp```. You only have to add a mapping between a string representation of your new objective function and our ```Objective``` enum type in the enum type section of the file.
Afterwards, add function to the ```PartitionedGraph```, ```PartitionedHypergraph``` and ```SparsePartitionedHypergraph``` class that computes the value of your objective function. - diff --git a/mt-kahypar/partition/refinement/gains/cut/cut_attributed_gains.h b/mt-kahypar/partition/refinement/gains/cut/cut_attributed_gains.h index 60cd687ae..ea10192f8 100644 --- a/mt-kahypar/partition/refinement/gains/cut/cut_attributed_gains.h +++ b/mt-kahypar/partition/refinement/gains/cut/cut_attributed_gains.h @@ -36,7 +36,7 @@ namespace mt_kahypar { * attributed gain value. */ struct CutAttributedGains { - static HyperedgeWeight gain(const SyncronizedEdgeUpdate& sync_update) { + static HyperedgeWeight gain(const SynchronizedEdgeUpdate& sync_update) { return sync_update.edge_size > 1 ? ( sync_update.pin_count_in_from_part_after == sync_update.edge_size - 1) * sync_update.edge_weight - ( sync_update.pin_count_in_to_part_after == sync_update.edge_size ) * sync_update.edge_weight : 0; diff --git a/mt-kahypar/partition/refinement/gains/cut/cut_gain_cache.cpp b/mt-kahypar/partition/refinement/gains/cut/cut_gain_cache.cpp index 5c9446946..50ac8c25e 100644 --- a/mt-kahypar/partition/refinement/gains/cut/cut_gain_cache.cpp +++ b/mt-kahypar/partition/refinement/gains/cut/cut_gain_cache.cpp @@ -118,7 +118,7 @@ void CutGainCache::initializeGainCache(const PartitionedHypergraph& partitioned_ _is_initialized = true; } -bool CutGainCache::triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& sync_update) { +bool CutGainCache::triggersDeltaGainUpdate(const SynchronizedEdgeUpdate& sync_update) { return sync_update.pin_count_in_from_part_after == sync_update.edge_size - 1 || sync_update.pin_count_in_from_part_after == sync_update.edge_size - 2 || sync_update.pin_count_in_to_part_after == sync_update.edge_size || @@ -128,7 +128,7 @@ bool CutGainCache::triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& sync_upd template void CutGainCache::deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { ASSERT(_is_initialized, "Gain cache is not initialized"); const HypernodeID edge_size = sync_update.edge_size; if ( edge_size > 1 ) { @@ -282,7 +282,7 @@ void CutGainCache::initializeGainCacheEntryForNode(const PartitionedHypergraph& namespace { #define CUT_INITIALIZE_GAIN_CACHE(X) void CutGainCache::initializeGainCache(const X&) #define CUT_DELTA_GAIN_UPDATE(X) void CutGainCache::deltaGainUpdate(const X&, \ - const SyncronizedEdgeUpdate&) + const SynchronizedEdgeUpdate&) #define CUT_RESTORE_UPDATE(X) void CutGainCache::uncontractUpdateAfterRestore(const X&, \ const HypernodeID, \ const HypernodeID, \ diff --git a/mt-kahypar/partition/refinement/gains/cut/cut_gain_cache.h b/mt-kahypar/partition/refinement/gains/cut/cut_gain_cache.h index d67347a6f..4b6bb5e47 100644 --- a/mt-kahypar/partition/refinement/gains/cut/cut_gain_cache.h +++ b/mt-kahypar/partition/refinement/gains/cut/cut_gain_cache.h @@ -64,6 +64,7 @@ class CutGainCache { static constexpr GainPolicy TYPE = GainPolicy::cut; static constexpr bool requires_notification_before_update = false; static constexpr bool initializes_gain_cache_entry_after_batch_uncontractions = false; + static constexpr bool invalidates_entries = true; CutGainCache() : _is_initialized(false), @@ -159,14 +160,14 @@ class CutGainCache { // ! This function returns true if the corresponding syncronized edge update triggers // ! a gain cache update. 
- static bool triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& sync_update); + static bool triggersDeltaGainUpdate(const SynchronizedEdgeUpdate& sync_update); // ! The partitioned (hyper)graph call this function when its updates its internal // ! data structures before calling the delta gain update function. The partitioned // ! (hyper)graph holds a lock for the corresponding (hyper)edge when calling this // ! function. Thus, it is guaranteed that no other thread will modify the hyperedge. template - void notifyBeforeDeltaGainUpdate(const PartitionedHypergraph&, const SyncronizedEdgeUpdate&) { + void notifyBeforeDeltaGainUpdate(const PartitionedHypergraph&, const SynchronizedEdgeUpdate&) { // Do nothing } @@ -177,7 +178,7 @@ class CutGainCache { // ! corresponding hyperedge. template void deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update); + const SynchronizedEdgeUpdate& sync_update); // ####################### Uncontraction ####################### @@ -252,6 +253,7 @@ class CutGainCache { } void changeNumberOfBlocks(const PartitionID new_k) { + ASSERT(new_k <= _k); _dummy_adjacent_blocks = IntegerRangeIterator(new_k); } @@ -400,7 +402,7 @@ class DeltaCutGainCache { template MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE void deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { const HypernodeID edge_size = sync_update.edge_size; if ( edge_size > 1 ) { const HyperedgeID he = sync_update.he; diff --git a/mt-kahypar/partition/refinement/gains/cut_for_graphs/cut_attributed_gains_for_graphs.h b/mt-kahypar/partition/refinement/gains/cut_for_graphs/cut_attributed_gains_for_graphs.h index 8292a8f7c..77a4a0d05 100644 --- a/mt-kahypar/partition/refinement/gains/cut_for_graphs/cut_attributed_gains_for_graphs.h +++ b/mt-kahypar/partition/refinement/gains/cut_for_graphs/cut_attributed_gains_for_graphs.h @@ -36,7 +36,7 @@ namespace mt_kahypar { * attributed gain value. */ struct GraphCutAttributedGains { - static HyperedgeWeight gain(const SyncronizedEdgeUpdate& sync_update) { + static HyperedgeWeight gain(const SynchronizedEdgeUpdate& sync_update) { return (sync_update.pin_count_in_to_part_after == 1 ? sync_update.edge_weight : 0) + (sync_update.pin_count_in_from_part_after == 0 ? 
-sync_update.edge_weight : 0); } diff --git a/mt-kahypar/partition/refinement/gains/cut_for_graphs/cut_gain_cache_for_graphs.cpp b/mt-kahypar/partition/refinement/gains/cut_for_graphs/cut_gain_cache_for_graphs.cpp index b501d2191..da1244a14 100644 --- a/mt-kahypar/partition/refinement/gains/cut_for_graphs/cut_gain_cache_for_graphs.cpp +++ b/mt-kahypar/partition/refinement/gains/cut_for_graphs/cut_gain_cache_for_graphs.cpp @@ -59,13 +59,13 @@ void GraphCutGainCache::initializeGainCache(const PartitionedGraph& partitioned_ _is_initialized = true; } -bool GraphCutGainCache::triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& /* only relevant for hypergraphs */) { +bool GraphCutGainCache::triggersDeltaGainUpdate(const SynchronizedEdgeUpdate& /* only relevant for hypergraphs */) { return true; } template void GraphCutGainCache::deltaGainUpdate(const PartitionedGraph& partitioned_graph, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { ASSERT(_is_initialized, "Gain cache is not initialized"); const HypernodeID target = partitioned_graph.edgeTarget(sync_update.he); const size_t index_in_from_part = incident_weight_index(target, sync_update.from); @@ -108,7 +108,7 @@ void GraphCutGainCache::uncontractUpdateAfterReplacement(const PartitionedGraph& namespace { #define GRAPH_CUT_INITIALIZE_GAIN_CACHE(X) void GraphCutGainCache::initializeGainCache(const X&) #define GRAPH_CUT_DELTA_GAIN_UPDATE(X) void GraphCutGainCache::deltaGainUpdate(const X&, \ - const SyncronizedEdgeUpdate&) + const SynchronizedEdgeUpdate&) #define GRAPH_CUT_RESTORE_UPDATE(X) void GraphCutGainCache::uncontractUpdateAfterRestore(const X&, \ const HypernodeID, \ const HypernodeID, \ diff --git a/mt-kahypar/partition/refinement/gains/cut_for_graphs/cut_gain_cache_for_graphs.h b/mt-kahypar/partition/refinement/gains/cut_for_graphs/cut_gain_cache_for_graphs.h index 07c88d123..d98ee8ff0 100644 --- a/mt-kahypar/partition/refinement/gains/cut_for_graphs/cut_gain_cache_for_graphs.h +++ b/mt-kahypar/partition/refinement/gains/cut_for_graphs/cut_gain_cache_for_graphs.h @@ -61,6 +61,7 @@ class GraphCutGainCache { static constexpr GainPolicy TYPE = GainPolicy::cut_for_graphs; static constexpr bool requires_notification_before_update = false; static constexpr bool initializes_gain_cache_entry_after_batch_uncontractions = false; + static constexpr bool invalidates_entries = false; using AdjacentBlocksIterator = IntegerRangeIterator::const_iterator; @@ -155,14 +156,14 @@ class GraphCutGainCache { // ! This function returns true if the corresponding syncronized edge update triggers // ! a gain cache update. - static bool triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& sync_update); + static bool triggersDeltaGainUpdate(const SynchronizedEdgeUpdate& sync_update); // ! The partitioned (hyper)graph call this function when its updates its internal // ! data structures before calling the delta gain update function. The partitioned // ! (hyper)graph holds a lock for the corresponding (hyper)edge when calling this // ! function. Thus, it is guaranteed that no other thread will modify the hyperedge. template - void notifyBeforeDeltaGainUpdate(const PartitionedHypergraph&, const SyncronizedEdgeUpdate&) { + void notifyBeforeDeltaGainUpdate(const PartitionedHypergraph&, const SynchronizedEdgeUpdate&) { // Do nothing } @@ -173,7 +174,7 @@ class GraphCutGainCache { // ! corresponding edge. 
template void deltaGainUpdate(const PartitionedGraph& partitioned_graph, - const SyncronizedEdgeUpdate& sync_update); + const SynchronizedEdgeUpdate& sync_update); // ####################### Uncontraction ####################### @@ -245,6 +246,7 @@ class GraphCutGainCache { } void changeNumberOfBlocks(const PartitionID new_k) { + ASSERT(new_k <= _k); _dummy_adjacent_blocks = IntegerRangeIterator(new_k); } @@ -365,7 +367,7 @@ class DeltaGraphCutGainCache { template MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE void deltaGainUpdate(const PartitionedGraph& partitioned_graph, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { const HypernodeID target = partitioned_graph.edgeTarget(sync_update.he); const size_t index_in_from_part = _gain_cache.incident_weight_index(target, sync_update.from); _incident_weight_in_part_delta[index_in_from_part] -= sync_update.edge_weight; diff --git a/mt-kahypar/partition/refinement/gains/gain_computation_base.h b/mt-kahypar/partition/refinement/gains/gain_computation_base.h index 87d234e11..6f8856854 100644 --- a/mt-kahypar/partition/refinement/gains/gain_computation_base.h +++ b/mt-kahypar/partition/refinement/gains/gain_computation_base.h @@ -62,7 +62,8 @@ class GainComputationBase { Move computeMaxGainMove(const PartitionedHypergraph& phg, const HypernodeID hn, const bool rebalance = false, - const bool consider_non_adjacent_blocks = false) { + const bool consider_non_adjacent_blocks = false, + const bool allow_imbalance = false) { Derived* derived = static_cast(this); RatingMap& tmp_scores = _tmp_scores.local(); Gain& isolated_block_gain = _isolated_block_gain.local(); @@ -80,8 +81,8 @@ class GainComputationBase { (score == best_move.gain && !_disable_randomization && (no_tie_breaking || rand.flipCoin(cpu_id))); - if (new_best_gain && phg.partWeight(to) + hn_weight <= - _context.partition.max_part_weights[to]) { + if (new_best_gain && (allow_imbalance || phg.partWeight(to) + hn_weight <= + _context.partition.max_part_weights[to])) { best_move.to = to; best_move.gain = score; return true; @@ -124,7 +125,7 @@ class GainComputationBase { return best_move; } - inline void computeDeltaForHyperedge(const SyncronizedEdgeUpdate& sync_update) { + inline void computeDeltaForHyperedge(const SynchronizedEdgeUpdate& sync_update) { _deltas.local() += AttributedGains::gain(sync_update); } diff --git a/mt-kahypar/partition/refinement/gains/km1/km1_attributed_gains.h b/mt-kahypar/partition/refinement/gains/km1/km1_attributed_gains.h index ed6606a59..26928fc68 100644 --- a/mt-kahypar/partition/refinement/gains/km1/km1_attributed_gains.h +++ b/mt-kahypar/partition/refinement/gains/km1/km1_attributed_gains.h @@ -36,7 +36,7 @@ namespace mt_kahypar { * attributed gain value. */ struct Km1AttributedGains { - static HyperedgeWeight gain(const SyncronizedEdgeUpdate& sync_update) { + static HyperedgeWeight gain(const SynchronizedEdgeUpdate& sync_update) { return (sync_update.pin_count_in_to_part_after == 1 ? sync_update.edge_weight : 0) + (sync_update.pin_count_in_from_part_after == 0 ? 
-sync_update.edge_weight : 0); } diff --git a/mt-kahypar/partition/refinement/gains/km1/km1_gain_cache.cpp b/mt-kahypar/partition/refinement/gains/km1/km1_gain_cache.cpp index ae7998818..3f9812d38 100644 --- a/mt-kahypar/partition/refinement/gains/km1/km1_gain_cache.cpp +++ b/mt-kahypar/partition/refinement/gains/km1/km1_gain_cache.cpp @@ -114,7 +114,7 @@ void Km1GainCache::initializeGainCache(const PartitionedHypergraph& partitioned_ _is_initialized = true; } -bool Km1GainCache::triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& sync_update) { +bool Km1GainCache::triggersDeltaGainUpdate(const SynchronizedEdgeUpdate& sync_update) { return sync_update.pin_count_in_from_part_after == 0 || sync_update.pin_count_in_from_part_after == 1 || sync_update.pin_count_in_to_part_after == 1 || @@ -123,7 +123,7 @@ bool Km1GainCache::triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& sync_upd template void Km1GainCache::deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { ASSERT(_is_initialized, "Gain cache is not initialized"); const HyperedgeID he = sync_update.he; const PartitionID from = sync_update.from; @@ -262,7 +262,7 @@ void Km1GainCache::initializeGainCacheEntryForNode(const PartitionedHypergraph& namespace { #define KM1_INITIALIZE_GAIN_CACHE(X) void Km1GainCache::initializeGainCache(const X&) #define KM1_DELTA_GAIN_UPDATE(X) void Km1GainCache::deltaGainUpdate(const X&, \ - const SyncronizedEdgeUpdate&) + const SynchronizedEdgeUpdate&) #define KM1_RESTORE_UPDATE(X) void Km1GainCache::uncontractUpdateAfterRestore(const X&, \ const HypernodeID, \ const HypernodeID, \ diff --git a/mt-kahypar/partition/refinement/gains/km1/km1_gain_cache.h b/mt-kahypar/partition/refinement/gains/km1/km1_gain_cache.h index aa8bca5cb..36d397a77 100644 --- a/mt-kahypar/partition/refinement/gains/km1/km1_gain_cache.h +++ b/mt-kahypar/partition/refinement/gains/km1/km1_gain_cache.h @@ -72,6 +72,7 @@ class Km1GainCache { static constexpr GainPolicy TYPE = GainPolicy::km1; static constexpr bool requires_notification_before_update = false; static constexpr bool initializes_gain_cache_entry_after_batch_uncontractions = false; + static constexpr bool invalidates_entries = true; Km1GainCache() : _is_initialized(false), @@ -166,14 +167,14 @@ class Km1GainCache { // ! This function returns true if the corresponding syncronized edge update triggers // ! a gain cache update. - static bool triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& sync_update); + static bool triggersDeltaGainUpdate(const SynchronizedEdgeUpdate& sync_update); // ! The partitioned (hyper)graph call this function when its updates its internal // ! data structures before calling the delta gain update function. The partitioned // ! (hyper)graph holds a lock for the corresponding (hyper)edge when calling this // ! function. Thus, it is guaranteed that no other thread will modify the hyperedge. template - void notifyBeforeDeltaGainUpdate(const PartitionedHypergraph&, const SyncronizedEdgeUpdate&) { + void notifyBeforeDeltaGainUpdate(const PartitionedHypergraph&, const SynchronizedEdgeUpdate&) { // Do nothing } @@ -184,7 +185,7 @@ class Km1GainCache { // ! corresponding hyperedge. 
template void deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update); + const SynchronizedEdgeUpdate& sync_update); // ####################### Uncontraction ####################### @@ -255,6 +256,7 @@ class Km1GainCache { } void changeNumberOfBlocks(const PartitionID new_k) { + ASSERT(new_k <= _k); _dummy_adjacent_blocks = IntegerRangeIterator(new_k); } @@ -403,7 +405,7 @@ class DeltaKm1GainCache { template MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE void deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { const HyperedgeID he = sync_update.he; const PartitionID from = sync_update.from; const PartitionID to = sync_update.to; diff --git a/mt-kahypar/partition/refinement/gains/soed/soed_attributed_gains.h b/mt-kahypar/partition/refinement/gains/soed/soed_attributed_gains.h index 60b9c7b64..d7534bf99 100644 --- a/mt-kahypar/partition/refinement/gains/soed/soed_attributed_gains.h +++ b/mt-kahypar/partition/refinement/gains/soed/soed_attributed_gains.h @@ -36,7 +36,7 @@ namespace mt_kahypar { * attributed gain value. */ struct SoedAttributedGains { - static HyperedgeWeight gain(const SyncronizedEdgeUpdate& sync_update) { + static HyperedgeWeight gain(const SynchronizedEdgeUpdate& sync_update) { const HypernodeID edge_size = sync_update.edge_size; const HyperedgeWeight edge_weight = sync_update.edge_weight; const HypernodeID pin_count_in_from_part_after = sync_update.pin_count_in_from_part_after; diff --git a/mt-kahypar/partition/refinement/gains/soed/soed_gain_cache.cpp b/mt-kahypar/partition/refinement/gains/soed/soed_gain_cache.cpp index 3af4d3804..5c4d010ac 100644 --- a/mt-kahypar/partition/refinement/gains/soed/soed_gain_cache.cpp +++ b/mt-kahypar/partition/refinement/gains/soed/soed_gain_cache.cpp @@ -120,7 +120,7 @@ void SoedGainCache::initializeGainCache(const PartitionedHypergraph& partitioned _is_initialized = true; } -bool SoedGainCache::triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& sync_update) { +bool SoedGainCache::triggersDeltaGainUpdate(const SynchronizedEdgeUpdate& sync_update) { return sync_update.pin_count_in_from_part_after == 0 || sync_update.pin_count_in_from_part_after == 1 || sync_update.pin_count_in_to_part_after == 1 || @@ -133,7 +133,7 @@ bool SoedGainCache::triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& sync_up template void SoedGainCache::deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { ASSERT(_is_initialized, "Gain cache is not initialized"); const HypernodeID edge_size = sync_update.edge_size; if ( edge_size > 1 ) { @@ -371,7 +371,7 @@ void SoedGainCache::initializeGainCacheEntryForNode(const PartitionedHypergraph& namespace { #define SOED_INITIALIZE_GAIN_CACHE(X) void SoedGainCache::initializeGainCache(const X&) #define SOED_DELTA_GAIN_UPDATE(X) void SoedGainCache::deltaGainUpdate(const X&, \ - const SyncronizedEdgeUpdate&) + const SynchronizedEdgeUpdate&) #define SOED_RESTORE_UPDATE(X) void SoedGainCache::uncontractUpdateAfterRestore(const X&, \ const HypernodeID, \ const HypernodeID, \ diff --git a/mt-kahypar/partition/refinement/gains/soed/soed_gain_cache.h b/mt-kahypar/partition/refinement/gains/soed/soed_gain_cache.h index 24c4a3c74..4fcc30a6e 100644 --- a/mt-kahypar/partition/refinement/gains/soed/soed_gain_cache.h +++ b/mt-kahypar/partition/refinement/gains/soed/soed_gain_cache.h @@ -73,6 +73,7 @@ 
class SoedGainCache { static constexpr GainPolicy TYPE = GainPolicy::soed; static constexpr bool requires_notification_before_update = false; static constexpr bool initializes_gain_cache_entry_after_batch_uncontractions = false; + static constexpr bool invalidates_entries = true; SoedGainCache() : _is_initialized(false), @@ -169,14 +170,14 @@ class SoedGainCache { // ! This function returns true if the corresponding syncronized edge update triggers // ! a gain cache update. - static bool triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& sync_update); + static bool triggersDeltaGainUpdate(const SynchronizedEdgeUpdate& sync_update); // ! The partitioned (hyper)graph call this function when its updates its internal // ! data structures before calling the delta gain update function. The partitioned // ! (hyper)graph holds a lock for the corresponding (hyper)edge when calling this // ! function. Thus, it is guaranteed that no other thread will modify the hyperedge. template - void notifyBeforeDeltaGainUpdate(const PartitionedHypergraph&, const SyncronizedEdgeUpdate&) { + void notifyBeforeDeltaGainUpdate(const PartitionedHypergraph&, const SynchronizedEdgeUpdate&) { // Do nothing } @@ -187,7 +188,7 @@ class SoedGainCache { // ! corresponding hyperedge. template void deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update); + const SynchronizedEdgeUpdate& sync_update); // ####################### Uncontraction ####################### @@ -262,6 +263,7 @@ class SoedGainCache { } void changeNumberOfBlocks(const PartitionID new_k) { + ASSERT(new_k <= _k); _dummy_adjacent_blocks = IntegerRangeIterator(new_k); } @@ -410,7 +412,7 @@ class DeltaSoedGainCache { template MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE void deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { const HypernodeID edge_size = sync_update.edge_size; if ( edge_size > 1 ) { const HyperedgeID he = sync_update.he; diff --git a/mt-kahypar/partition/refinement/gains/steiner_tree/steiner_tree_attributed_gains.h b/mt-kahypar/partition/refinement/gains/steiner_tree/steiner_tree_attributed_gains.h index d8266f8ac..64f9bbeaf 100644 --- a/mt-kahypar/partition/refinement/gains/steiner_tree/steiner_tree_attributed_gains.h +++ b/mt-kahypar/partition/refinement/gains/steiner_tree/steiner_tree_attributed_gains.h @@ -38,7 +38,7 @@ namespace mt_kahypar { * attributed gain value. 
*/ struct SteinerTreeAttributedGains { - static HyperedgeWeight gain(const SyncronizedEdgeUpdate& sync_update) { + static HyperedgeWeight gain(const SynchronizedEdgeUpdate& sync_update) { ASSERT(sync_update.target_graph); ds::Bitset& connectivity_set = *sync_update.connectivity_set_after; // Distance between blocks of the hyperedge after the syncronized edge update diff --git a/mt-kahypar/partition/refinement/gains/steiner_tree/steiner_tree_gain_cache.cpp b/mt-kahypar/partition/refinement/gains/steiner_tree/steiner_tree_gain_cache.cpp index 66bad6e5b..ebd0dd195 100644 --- a/mt-kahypar/partition/refinement/gains/steiner_tree/steiner_tree_gain_cache.cpp +++ b/mt-kahypar/partition/refinement/gains/steiner_tree/steiner_tree_gain_cache.cpp @@ -64,7 +64,7 @@ void SteinerTreeGainCache::initializeGainCacheEntryForNode(const PartitionedHype initializeGainCacheEntryForNode(partitioned_hg, hn, benefit_aggregator); } -bool SteinerTreeGainCache::triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& sync_update) { +bool SteinerTreeGainCache::triggersDeltaGainUpdate(const SynchronizedEdgeUpdate& sync_update) { return sync_update.pin_count_in_from_part_after == 0 || sync_update.pin_count_in_from_part_after == 1 || sync_update.pin_count_in_to_part_after == 1 || @@ -73,7 +73,7 @@ bool SteinerTreeGainCache::triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& template void SteinerTreeGainCache::notifyBeforeDeltaGainUpdate(const PartitionedHypergraph&, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { if ( triggersDeltaGainUpdate(sync_update) ) { ASSERT(UL(sync_update.he) < _version.size()); // The move will induce a gain cache update. In this case, we increment the version ID @@ -105,7 +105,7 @@ HyperedgeWeight gainOfHyperedge(const PartitionID from, } MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE -void reconstructConnectivitySetAndPinCountsBeforeMove(const SyncronizedEdgeUpdate& sync_update, +void reconstructConnectivitySetAndPinCountsBeforeMove(const SynchronizedEdgeUpdate& sync_update, ds::Bitset& connectivity_set, ds::PinCountSnapshot& pin_counts) { if ( sync_update.pin_count_in_from_part_after == 0 ) { @@ -123,7 +123,7 @@ void reconstructConnectivitySetAndPinCountsBeforeMove(const SyncronizedEdgeUpdat template void SteinerTreeGainCache::deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { ASSERT(_is_initialized, "Gain cache is not initialized"); ASSERT(sync_update.connectivity_set_after); ASSERT(sync_update.pin_counts_after); @@ -424,7 +424,7 @@ void SteinerTreeGainCache::initializeAdjacentBlocksOfNode(const PartitionedHyper template void SteinerTreeGainCache::updateAdjacentBlocks(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { if ( partitioned_hg.edgeSize(sync_update.he) <= _large_he_threshold ) { if ( sync_update.pin_count_in_from_part_after == 0 ) { // The node move has removed the source block of the move from the @@ -628,9 +628,9 @@ namespace { #define STEINER_TREE_INITIALIZE_GAIN_CACHE_FOR_NODE(X) void SteinerTreeGainCache::initializeGainCacheEntryForNode(const X&, \ const HypernodeID) #define STEINER_TREE_NOTIFY(X) void SteinerTreeGainCache::notifyBeforeDeltaGainUpdate(const X&, \ - const SyncronizedEdgeUpdate&) + const SynchronizedEdgeUpdate&) #define STEINER_TREE_DELTA_GAIN_UPDATE(X) void SteinerTreeGainCache::deltaGainUpdate(const X&, \ - const SyncronizedEdgeUpdate&) + const 
SynchronizedEdgeUpdate&) #define STEINER_TREE_RESTORE_UPDATE(X) void SteinerTreeGainCache::uncontractUpdateAfterRestore(const X&, \ const HypernodeID, \ const HypernodeID, \ @@ -646,7 +646,7 @@ namespace { #define STEINER_TREE_INIT_ADJACENT_BLOCKS_OF_NODE(X) void SteinerTreeGainCache::initializeAdjacentBlocksOfNode(const X&, \ const HypernodeID) #define STEINER_TREE_UPDATE_ADJACENT_BLOCKS(X) void SteinerTreeGainCache::updateAdjacentBlocks(const X&, \ - const SyncronizedEdgeUpdate&) + const SynchronizedEdgeUpdate&) #define STEINER_TREE_INIT_GAIN_CACHE_ENTRY(X) void SteinerTreeGainCache::initializeGainCacheEntryForNode(const X&, \ const HypernodeID, \ vec&) diff --git a/mt-kahypar/partition/refinement/gains/steiner_tree/steiner_tree_gain_cache.h b/mt-kahypar/partition/refinement/gains/steiner_tree/steiner_tree_gain_cache.h index 88b6f122a..2e19423cf 100644 --- a/mt-kahypar/partition/refinement/gains/steiner_tree/steiner_tree_gain_cache.h +++ b/mt-kahypar/partition/refinement/gains/steiner_tree/steiner_tree_gain_cache.h @@ -86,6 +86,7 @@ class SteinerTreeGainCache { static constexpr GainPolicy TYPE = GainPolicy::steiner_tree; static constexpr bool requires_notification_before_update = true; static constexpr bool initializes_gain_cache_entry_after_batch_uncontractions = true; + static constexpr bool invalidates_entries = true; SteinerTreeGainCache() : _is_initialized(false), @@ -186,7 +187,7 @@ class SteinerTreeGainCache { // ! This function returns true if the corresponding syncronized edge update triggers // ! a gain cache update. - static bool triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& sync_update); + static bool triggersDeltaGainUpdate(const SynchronizedEdgeUpdate& sync_update); // ! The partitioned (hyper)graph call this function when its updates its internal // ! data structures before calling the delta gain update function. The partitioned @@ -194,7 +195,7 @@ class SteinerTreeGainCache { // ! function. Thus, it is guaranteed that no other thread will modify the hyperedge. template void notifyBeforeDeltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update); + const SynchronizedEdgeUpdate& sync_update); // ! This functions implements the delta gain updates for the steiner tree metric. // ! When moving a node from its current block from to a target block to, we iterate @@ -203,7 +204,7 @@ class SteinerTreeGainCache { // ! corresponding hyperedge. template void deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update); + const SynchronizedEdgeUpdate& sync_update); // ####################### Uncontraction ####################### @@ -273,7 +274,9 @@ class SteinerTreeGainCache { return gain; } - void changeNumberOfBlocks(const PartitionID) { + void changeNumberOfBlocks(const PartitionID new_k) { + ASSERT(new_k <= _k); + unused(new_k); // Do nothing } @@ -320,7 +323,7 @@ class SteinerTreeGainCache { // ! Updates the adjacent blocks of a node based on a synronized hyperedge update template void updateAdjacentBlocks(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update); + const SynchronizedEdgeUpdate& sync_update); // ! Increments the number of incident edges of node u that contains pins of block to. // ! 
If the value increases to one, we add the block to the connectivity set of the node @@ -498,7 +501,7 @@ class DeltaSteinerTreeGainCache { template MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE void deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { ASSERT(sync_update.connectivity_set_after); ASSERT(sync_update.target_graph); const HyperedgeID he = sync_update.he; @@ -663,7 +666,7 @@ class DeltaSteinerTreeGainCache { } MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE - void reconstructConnectivitySetBeforeMove(const SyncronizedEdgeUpdate& sync_update, + void reconstructConnectivitySetBeforeMove(const SynchronizedEdgeUpdate& sync_update, ds::Bitset& connectivity_set) { if ( sync_update.pin_count_in_from_part_after == 0 ) { connectivity_set.set(sync_update.from); @@ -676,7 +679,7 @@ class DeltaSteinerTreeGainCache { // ! Updates the adjacent blocks of a node based on a synronized hyperedge update template void updateAdjacentBlocks(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { if ( partitioned_hg.edgeSize(sync_update.he) <= _large_he_threshold ) { if ( sync_update.pin_count_in_from_part_after == 0 ) { for ( const HypernodeID& pin : partitioned_hg.pins(sync_update.he) ) { diff --git a/mt-kahypar/partition/refinement/gains/steiner_tree_for_graphs/steiner_tree_attributed_gains_for_graphs.h b/mt-kahypar/partition/refinement/gains/steiner_tree_for_graphs/steiner_tree_attributed_gains_for_graphs.h index b9de71030..f54b1fdba 100644 --- a/mt-kahypar/partition/refinement/gains/steiner_tree_for_graphs/steiner_tree_attributed_gains_for_graphs.h +++ b/mt-kahypar/partition/refinement/gains/steiner_tree_for_graphs/steiner_tree_attributed_gains_for_graphs.h @@ -37,7 +37,7 @@ namespace mt_kahypar { * attributed gain value. 
*/ struct GraphSteinerTreeAttributedGains { - static HyperedgeWeight gain(const SyncronizedEdgeUpdate& sync_update) { + static HyperedgeWeight gain(const SynchronizedEdgeUpdate& sync_update) { ASSERT(sync_update.block_of_other_node != kInvalidPartition); ASSERT(sync_update.target_graph); const TargetGraph& target_graph = *sync_update.target_graph; diff --git a/mt-kahypar/partition/refinement/gains/steiner_tree_for_graphs/steiner_tree_gain_cache_for_graphs.cpp b/mt-kahypar/partition/refinement/gains/steiner_tree_for_graphs/steiner_tree_gain_cache_for_graphs.cpp index 8a0915dfe..0d809762f 100644 --- a/mt-kahypar/partition/refinement/gains/steiner_tree_for_graphs/steiner_tree_gain_cache_for_graphs.cpp +++ b/mt-kahypar/partition/refinement/gains/steiner_tree_for_graphs/steiner_tree_gain_cache_for_graphs.cpp @@ -72,13 +72,13 @@ void GraphSteinerTreeGainCache::initializeGainCacheEntryForNode(const Partitione initializeGainCacheEntryForNode(partitioned_hg, hn, gain_aggregator); } -bool GraphSteinerTreeGainCache::triggersDeltaGainUpdate(const SyncronizedEdgeUpdate&) { +bool GraphSteinerTreeGainCache::triggersDeltaGainUpdate(const SynchronizedEdgeUpdate&) { return true; } template void GraphSteinerTreeGainCache::notifyBeforeDeltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { if ( !partitioned_hg.isSinglePin(sync_update.he) ) { const HyperedgeID unique_id = partitioned_hg.uniqueEdgeID(sync_update.he); ASSERT(UL(unique_id) < _edge_state.size()); @@ -97,7 +97,7 @@ void GraphSteinerTreeGainCache::notifyBeforeDeltaGainUpdate(const PartitionedHyp template void GraphSteinerTreeGainCache::deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { ASSERT(_is_initialized, "Gain cache is not initialized"); ASSERT(sync_update.target_graph); @@ -237,7 +237,7 @@ void GraphSteinerTreeGainCache::initializeAdjacentBlocksOfNode(const Partitioned template void GraphSteinerTreeGainCache::updateAdjacentBlocks(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { ASSERT(!partitioned_hg.isSinglePin(sync_update.he)); if ( sync_update.pin_count_in_from_part_after == 0 ) { // The node move has removed the source block of the move from the @@ -426,9 +426,9 @@ namespace { #define STEINER_TREE_INITIALIZE_GAIN_CACHE_FOR_NODE(X) void GraphSteinerTreeGainCache::initializeGainCacheEntryForNode(const X&, \ const HypernodeID) #define STEINER_TREE_NOTIFY(X) void GraphSteinerTreeGainCache::notifyBeforeDeltaGainUpdate(const X&, \ - const SyncronizedEdgeUpdate&) + const SynchronizedEdgeUpdate&) #define STEINER_TREE_DELTA_GAIN_UPDATE(X) void GraphSteinerTreeGainCache::deltaGainUpdate(const X&, \ - const SyncronizedEdgeUpdate&) + const SynchronizedEdgeUpdate&) #define STEINER_TREE_RESTORE_UPDATE(X) void GraphSteinerTreeGainCache::uncontractUpdateAfterRestore(const X&, \ const HypernodeID, \ const HypernodeID, \ @@ -444,7 +444,7 @@ namespace { #define STEINER_TREE_INIT_ADJACENT_BLOCKS_OF_NODE(X) void GraphSteinerTreeGainCache::initializeAdjacentBlocksOfNode(const X&, \ const HypernodeID) #define STEINER_TREE_UPDATE_ADJACENT_BLOCKS(X) void GraphSteinerTreeGainCache::updateAdjacentBlocks(const X&, \ - const SyncronizedEdgeUpdate&) + const SynchronizedEdgeUpdate&) #define STEINER_TREE_INIT_GAIN_CACHE_ENTRY(X) void 
GraphSteinerTreeGainCache::initializeGainCacheEntryForNode(const X&, \ const HypernodeID, \ vec&) diff --git a/mt-kahypar/partition/refinement/gains/steiner_tree_for_graphs/steiner_tree_gain_cache_for_graphs.h b/mt-kahypar/partition/refinement/gains/steiner_tree_for_graphs/steiner_tree_gain_cache_for_graphs.h index ee007a21c..8e3e61782 100644 --- a/mt-kahypar/partition/refinement/gains/steiner_tree_for_graphs/steiner_tree_gain_cache_for_graphs.h +++ b/mt-kahypar/partition/refinement/gains/steiner_tree_for_graphs/steiner_tree_gain_cache_for_graphs.h @@ -102,6 +102,7 @@ class GraphSteinerTreeGainCache { static constexpr GainPolicy TYPE = GainPolicy::steiner_tree_for_graphs; static constexpr bool requires_notification_before_update = true; static constexpr bool initializes_gain_cache_entry_after_batch_uncontractions = true; + static constexpr bool invalidates_entries = false; GraphSteinerTreeGainCache() : _is_initialized(false), @@ -197,7 +198,7 @@ class GraphSteinerTreeGainCache { // ! This function returns true if the corresponding syncronized edge update triggers // ! a gain cache update. - static bool triggersDeltaGainUpdate(const SyncronizedEdgeUpdate& sync_update); + static bool triggersDeltaGainUpdate(const SynchronizedEdgeUpdate& sync_update); // ! The partitioned (hyper)graph call this function when its updates its internal // ! data structures before calling the delta gain update function. The partitioned @@ -205,7 +206,7 @@ class GraphSteinerTreeGainCache { // ! function. Thus, it is guaranteed that no other thread will modify the hyperedge. template void notifyBeforeDeltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update); + const SynchronizedEdgeUpdate& sync_update); // ! This functions implements the delta gain updates for the steiner tree metric. @@ -215,7 +216,7 @@ class GraphSteinerTreeGainCache { // ! corresponding hyperedge. template void deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update); + const SynchronizedEdgeUpdate& sync_update); // ####################### Uncontraction ####################### @@ -288,7 +289,9 @@ class GraphSteinerTreeGainCache { return gain; } - void changeNumberOfBlocks(const PartitionID) { + void changeNumberOfBlocks(const PartitionID new_k) { + ASSERT(new_k <= _k); + unused(new_k); // Do nothing } @@ -335,7 +338,7 @@ class GraphSteinerTreeGainCache { // ! Updates the adjacent blocks of a node based on a synronized hyperedge update template void updateAdjacentBlocks(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update); + const SynchronizedEdgeUpdate& sync_update); // ! Increments the number of incident edges of node u that contains pins of block to. // ! If the value increases to one, we add the block to the connectivity set of the node @@ -496,7 +499,7 @@ class GraphDeltaSteinerTreeGainCache { template MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE void deltaGainUpdate(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { ASSERT(sync_update.target_graph); const HyperedgeID he = sync_update.he; @@ -532,7 +535,7 @@ class GraphDeltaSteinerTreeGainCache { // ! 
Updates the adjacent blocks of a node based on a synronized hyperedge update template void updateAdjacentBlocks(const PartitionedHypergraph& partitioned_hg, - const SyncronizedEdgeUpdate& sync_update) { + const SynchronizedEdgeUpdate& sync_update) { if ( sync_update.pin_count_in_from_part_after == 0 ) { for ( const HypernodeID& pin : partitioned_hg.pins(sync_update.he) ) { decrementIncidentEdges(pin, sync_update.from); diff --git a/mt-kahypar/partition/refinement/i_rebalancer.h b/mt-kahypar/partition/refinement/i_rebalancer.h new file mode 100644 index 000000000..e700f5318 --- /dev/null +++ b/mt-kahypar/partition/refinement/i_rebalancer.h @@ -0,0 +1,78 @@ +/******************************************************************************* + * MIT License + * + * This file is part of KaHyPar. + * + * Copyright (C) 2023 Nikolai Maas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
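The new file below introduces the ```IRebalancer``` interface, whose key addition over ```IRefiner``` is that the performed moves are reported back to the caller. As a preview of the calling pattern, here is a condensed sketch mirroring ```applyRebalancing``` further down in this diff; ```recordForRollback``` is a hypothetical bookkeeping hook, everything else follows the diff:

```cpp
// Sketch: drive the rebalancer through the new interface. Passing an empty
// node list (as applyRebalancing below does) lets the rebalancer pick its own
// candidates; the moves it performs are written into `rebalance_moves` so the
// caller can account for them, e.g. to undo them later.
void rebalanceAndRecord(PartitionedHypergraph& hypergraph,
                        IRebalancer& rebalancer,
                        Metrics& current_metrics) {
  vec<Move> rebalance_moves;
  mt_kahypar_partitioned_hypergraph_t phg = utils::partitioned_hg_cast(hypergraph);
  rebalancer.refineAndOutputMovesLinear(phg, {}, rebalance_moves,
                                        current_metrics, /*time_limit=*/0.0);
  for ( const Move& m : rebalance_moves ) {
    recordForRollback(m.node, m.from);  // hypothetical: remember source block
  }
}
```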
+ ******************************************************************************/ + +#pragma once + +#include + +#include "include/libmtkahypartypes.h" + +#include "mt-kahypar/macros.h" +#include "mt-kahypar/parallel/stl/scalable_vector.h" +#include "mt-kahypar/partition/metrics.h" +#include "mt-kahypar/partition/refinement/i_refiner.h" + +namespace mt_kahypar { + +class IRebalancer: public IRefiner { + + public: + virtual ~IRebalancer() = default; + + bool refineAndOutputMoves(mt_kahypar_partitioned_hypergraph_t& hypergraph, + const parallel::scalable_vector& refinement_nodes, + parallel::scalable_vector>& moves_by_part, + Metrics& best_metrics, + const double time_limit) { + return refineAndOutputMovesImpl(hypergraph, refinement_nodes, moves_by_part, best_metrics, time_limit); + } + + bool refineAndOutputMovesLinear(mt_kahypar_partitioned_hypergraph_t& hypergraph, + const parallel::scalable_vector& refinement_nodes, + parallel::scalable_vector& moves, + Metrics& best_metrics, + const double time_limit) { + return refineAndOutputMovesLinearImpl(hypergraph, refinement_nodes, moves, best_metrics, time_limit); + } + + protected: + IRebalancer() = default; + + private: + virtual bool refineAndOutputMovesImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph, + const parallel::scalable_vector& refinement_nodes, + parallel::scalable_vector>& moves_by_part, + Metrics& best_metrics, + const double time_limit) = 0; + + virtual bool refineAndOutputMovesLinearImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph, + const parallel::scalable_vector& refinement_nodes, + parallel::scalable_vector& moves, + Metrics& best_metrics, + const double time_limit) = 0; +}; + +} // namespace mt_kahypar diff --git a/mt-kahypar/partition/refinement/i_refiner.h b/mt-kahypar/partition/refinement/i_refiner.h index 778679532..ba50b082c 100644 --- a/mt-kahypar/partition/refinement/i_refiner.h +++ b/mt-kahypar/partition/refinement/i_refiner.h @@ -26,9 +26,6 @@ #pragma once -#include -#include -#include #include #include "include/libmtkahypartypes.h" diff --git a/mt-kahypar/partition/refinement/label_propagation/label_propagation_refiner.cpp b/mt-kahypar/partition/refinement/label_propagation/label_propagation_refiner.cpp index a90765aa9..bb00e39b3 100644 --- a/mt-kahypar/partition/refinement/label_propagation/label_propagation_refiner.cpp +++ b/mt-kahypar/partition/refinement/label_propagation/label_propagation_refiner.cpp @@ -39,6 +39,65 @@ namespace mt_kahypar { + template + template + bool LabelPropagationRefiner::moveVertex(PartitionedHypergraph& hypergraph, + const HypernodeID hn, + NextActiveNodes& next_active_nodes, + const F& objective_delta) { + bool is_moved = false; + ASSERT(hn != kInvalidHypernode); + if ( hypergraph.isBorderNode(hn) && !hypergraph.isFixed(hn) ) { + ASSERT(hypergraph.nodeIsEnabled(hn)); + + Move best_move = _gain.computeMaxGainMove(hypergraph, hn, false, false, unconstrained); + // We perform a move if it either improves the solution quality or, in case of a + // zero gain move, the balance of the solution. 
+ const bool positive_gain = best_move.gain < 0; + const bool zero_gain_move = (_context.refinement.label_propagation.rebalancing && + best_move.gain == 0 && + hypergraph.partWeight(best_move.from) - 1 > + hypergraph.partWeight(best_move.to) + 1 && + hypergraph.partWeight(best_move.to) < + _context.partition.perfect_balance_part_weights[best_move.to]); + const bool perform_move = positive_gain || zero_gain_move; + if (best_move.from != best_move.to && perform_move) { + PartitionID from = best_move.from; + PartitionID to = best_move.to; + + Gain delta_before = _gain.localDelta(); + bool changed_part = changeNodePart(hypergraph, hn, from, to, objective_delta); + ASSERT(!unconstrained || changed_part); + is_moved = true; + if (unconstrained || changed_part) { + // In case the move to block 'to' was successful, we verify that the "real" gain + // of the move is either equal to our computed gain or if not, still improves + // the solution quality. + Gain move_delta = _gain.localDelta() - delta_before; + bool accept_move = (move_delta == best_move.gain || move_delta <= 0); + if (accept_move) { + DBG << "Move hypernode" << hn << "from block" << from << "to block" << to + << "with gain" << best_move.gain << "( Real Gain: " << move_delta << ")"; + if constexpr (!unconstrained) { + // in the unconstrained case, we don't want to activate neighbors if the move is undone + // by the rebalancing + activateNodeAndNeighbors(hypergraph, next_active_nodes, hn, true); + } + } else { + DBG << "Revert move of hypernode" << hn << "from block" << from << "to block" << to + << "( Expected Gain:" << best_move.gain << ", Real Gain:" << move_delta << ")"; + // In case the real gain does not match the computed gain and + // worsens the solution quality, we revert the move. + ASSERT(hypergraph.partID(hn) == to); + changeNodePart(hypergraph, hn, to, from, objective_delta); + } + } + } + } + + return is_moved; + } + template bool LabelPropagationRefiner::refineImpl( mt_kahypar_partitioned_hypergraph_t& phg, @@ -49,82 +108,135 @@ namespace mt_kahypar { resizeDataStructuresForCurrentK(); _gain.reset(); _next_active.reset(); + Gain old_quality = best_metrics.quality; // Initialize set of active vertices initializeActiveNodes(hypergraph, refinement_nodes); // Perform Label Propagation - labelPropagation(hypergraph); - - // Update global part weight and sizes - best_metrics.imbalance = metrics::imbalance(hypergraph, _context); - - // Update metrics statistics - Gain delta = _gain.delta(); - ASSERT(delta <= 0, "LP refiner worsen solution quality"); HEAVY_REFINEMENT_ASSERT(hypergraph.checkTrackedPartitionInformation(_gain_cache)); - HEAVY_REFINEMENT_ASSERT(best_metrics.quality + delta == + HEAVY_REFINEMENT_ASSERT(best_metrics.quality == metrics::quality(hypergraph, _context, !_context.refinement.label_propagation.execute_sequential), - V(best_metrics.quality) << V(delta) << V((best_metrics.quality + delta)) - << V(metrics::quality(hypergraph, _context, + V(best_metrics.quality) << V(metrics::quality(hypergraph, _context, !_context.refinement.label_propagation.execute_sequential))); - best_metrics.quality += delta; - utils::Utilities::instance().getStats(_context.utility_id).update_stat("lp_improvement", std::abs(delta)); - return delta < 0; + // Update metrics statistics + Gain delta = old_quality - best_metrics.quality; + ASSERT(delta >= 0, "LP refiner worsened solution quality"); + utils::Utilities::instance().getStats(_context.utility_id).update_stat("lp_improvement", delta); +
return delta > 0; } template - void LabelPropagationRefiner::labelPropagation(PartitionedHypergraph& hypergraph) { + void LabelPropagationRefiner::labelPropagation(PartitionedHypergraph& hypergraph, + Metrics& best_metrics) { NextActiveNodes next_active_nodes; - for (size_t i = 0; i < _context.refinement.label_propagation.maximum_iterations; ++i) { - DBG << "Starting Label Propagation Round" << i; - - if ( _active_nodes.size() > 0 ) { - labelPropagationRound(hypergraph, next_active_nodes); - } + vec rebalance_moves; + bool should_stop = false; + for (size_t i = 0; i < _context.refinement.label_propagation.maximum_iterations + && !should_stop && !_active_nodes.empty(); ++i) { + should_stop = labelPropagationRound(hypergraph, next_active_nodes, best_metrics, rebalance_moves, + _context.refinement.label_propagation.unconstrained); if ( _context.refinement.label_propagation.execute_sequential ) { _active_nodes = next_active_nodes.copy_sequential(); - next_active_nodes.clear_sequential(); } else { _active_nodes = next_active_nodes.copy_parallel(); - next_active_nodes.clear_parallel(); - } - - if ( _active_nodes.size() == 0 ) { - break; } + next_active_nodes.clear_sequential(); } } template bool LabelPropagationRefiner::labelPropagationRound( PartitionedHypergraph& hypergraph, - NextActiveNodes& next_active_nodes) { + NextActiveNodes& next_active_nodes, + Metrics& best_metrics, + vec& rebalance_moves, + bool unconstrained_lp) { + Metrics current_metrics = best_metrics; _visited_he.reset(); _next_active.reset(); + _gain.reset(); + + if (unconstrained_lp) { + moveActiveNodes(hypergraph, next_active_nodes); + } else { + moveActiveNodes(hypergraph, next_active_nodes); + } + + current_metrics.imbalance = metrics::imbalance(hypergraph, _context); + current_metrics.quality += _gain.delta(); + + bool should_update_gain_cache = GainCache::invalidates_entries && _gain_cache.isInitialized(); + if ( should_update_gain_cache ) { + forEachMovedNode([&](size_t j) { + _gain_cache.recomputeInvalidTerms(hypergraph, _active_nodes[j]); + if (!unconstrained_lp) { _active_node_was_moved[j] = uint8_t(false); } + }); + } + + bool should_stop = false; + if ( unconstrained_lp ) { + if (!metrics::isBalanced(hypergraph, _context)) { + should_stop = applyRebalancing(hypergraph, best_metrics, current_metrics, rebalance_moves); + // rebalancer might initialize the gain cache + should_update_gain_cache = GainCache::invalidates_entries && _gain_cache.isInitialized(); + } else { + should_update_gain_cache = false; + } + + // store current part of each node (required for rollback) + if ( !should_stop ) { + forEachMovedNode([&](size_t j) { + _old_part[_active_nodes[j]] = hypergraph.partID(_active_nodes[j]); + }); + } + // collect activated nodes, update gain cache and reset flags + forEachMovedNode([&](size_t j) { + if (!should_stop) { + activateNodeAndNeighbors(hypergraph, next_active_nodes, _active_nodes[j], false); + } + if (should_update_gain_cache) { + _gain_cache.recomputeInvalidTerms(hypergraph, _active_nodes[j]); + } + _active_node_was_moved[j] = uint8_t(false); + }); + } + + ASSERT(current_metrics.quality <= best_metrics.quality); + const Gain old_quality = best_metrics.quality; + best_metrics = current_metrics; + + HEAVY_REFINEMENT_ASSERT(hypergraph.checkTrackedPartitionInformation(_gain_cache)); + return should_stop || old_quality - current_metrics.quality < + _context.refinement.label_propagation.relative_improvement_threshold * old_quality; + } + + template + template + void 
LabelPropagationRefiner::moveActiveNodes(PartitionedHypergraph& phg, + NextActiveNodes& next_active_nodes) { // This function is passed as lambda to the changeNodePart function and used // to calculate the "real" delta of a move (in terms of the used objective function). - auto objective_delta = [&](const SyncronizedEdgeUpdate& sync_update) { + auto objective_delta = [&](const SynchronizedEdgeUpdate& sync_update) { _gain.computeDeltaForHyperedge(sync_update); }; + const bool should_update_gain_cache = GainCache::invalidates_entries && _gain_cache.isInitialized(); + const bool should_mark_nodes = unconstrained || should_update_gain_cache; - // Shuffle Vector - bool converged = true; if ( _context.refinement.label_propagation.execute_sequential ) { utils::Randomize::instance().shuffleVector( _active_nodes, UL(0), _active_nodes.size(), THREAD_ID); for ( size_t j = 0; j < _active_nodes.size(); ++j ) { const HypernodeID hn = _active_nodes[j]; - if ( moveVertex(hypergraph, hn, next_active_nodes, objective_delta) ) { - _active_node_was_moved[j] = uint8_t(true); - } else { - converged = false; + if ( moveVertex(phg, hn, next_active_nodes, objective_delta) ) { + if (should_mark_nodes) { _active_node_was_moved[j] = uint8_t(true); } } } } else { @@ -133,93 +245,108 @@ namespace mt_kahypar { tbb::parallel_for(UL(0), _active_nodes.size(), [&](const size_t& j) { const HypernodeID hn = _active_nodes[j]; - if ( moveVertex(hypergraph, hn, next_active_nodes, objective_delta) ) { - _active_node_was_moved[j] = uint8_t(true); - } else { - converged = false; + if ( moveVertex(phg, hn, next_active_nodes, objective_delta) ) { + if (should_mark_nodes) { _active_node_was_moved[j] = uint8_t(true); } } }); } + } - if ( _context.forceGainCacheUpdates() && _gain_cache.isInitialized() ) { - auto recompute = [&](size_t j) { - if ( _active_node_was_moved[j] ) { - _gain_cache.recomputeInvalidTerms(hypergraph, _active_nodes[j]); - _active_node_was_moved[j] = uint8_t(false); - } - }; - if ( _context.refinement.label_propagation.execute_sequential ) { - for (size_t j = 0; j < _active_nodes.size(); ++j) { - recompute(j); + template + bool LabelPropagationRefiner::applyRebalancing(PartitionedHypergraph& hypergraph, + Metrics& best_metrics, + Metrics& current_metrics, + vec& rebalance_moves) { + utils::Timer& timer = utils::Utilities::instance().getTimer(_context.utility_id); + timer.start_timer("rebalance_lp", "Rebalance"); + mt_kahypar_partitioned_hypergraph_t phg = utils::partitioned_hg_cast(hypergraph); + _rebalancer.refineAndOutputMovesLinear(phg, {}, rebalance_moves, current_metrics, 0.0); + + // append to active nodes so they are included for gain cache updates and rollback + _active_nodes.reserve(_active_nodes.size() + rebalance_moves.size()); + for (const Move& m: rebalance_moves) { + bool old_part_unintialized = _might_be_uninitialized && !_old_part_is_initialized[m.node]; + if (old_part_unintialized || m.from == _old_part[m.node]) { + size_t i = _active_nodes.size(); + _active_nodes.push_back(m.node); + _active_node_was_moved[i] = uint8_t(true); + if (old_part_unintialized) { + _old_part[m.node] = m.from; + _old_part_is_initialized.set(m.node, true); } - } else { - tbb::parallel_for(UL(0), _active_nodes.size(), recompute); } } + timer.stop_timer("rebalance_lp"); + DBG << "[LP] Imbalance after rebalancing: " << current_metrics.imbalance << ", quality: " << current_metrics.quality; - HEAVY_REFINEMENT_ASSERT(hypergraph.checkTrackedPartitionInformation(_gain_cache)); - return converged; + if (current_metrics.quality > 
best_metrics.quality) { // rollback and stop LP + auto noop_obj_fn = [](const SynchronizedEdgeUpdate&) { }; + current_metrics = best_metrics; + + forEachMovedNode([&](size_t j) { + const HypernodeID hn = _active_nodes[j]; + ASSERT(!_might_be_uninitialized || _old_part_is_initialized[hn]); + if (hypergraph.partID(hn) != _old_part[hn]) { + changeNodePart(hypergraph, hn, hypergraph.partID(hn), _old_part[hn], noop_obj_fn); + } + }); + return true; + } + return false; } template - void LabelPropagationRefiner::initializeImpl(mt_kahypar_partitioned_hypergraph_t& phg) { - PartitionedHypergraph& hypergraph = utils::cast(phg); - ActiveNodes tmp_active_nodes; - _active_nodes = std::move(tmp_active_nodes); - + template + void LabelPropagationRefiner::forEachMovedNode(F node_fn) { if ( _context.refinement.label_propagation.execute_sequential ) { - // Setup active nodes sequential - for ( const HypernodeID hn : hypergraph.nodes() ) { - if ( _context.refinement.label_propagation.rebalancing || hypergraph.isBorderNode(hn) ) { - _active_nodes.push_back(hn); + for (size_t j = 0; j < _active_nodes.size(); j++) { + if (_active_node_was_moved[j]) { + node_fn(j); } } } else { - // Setup active nodes in parallel - // A node is active, if it is a border vertex. - NextActiveNodes tmp_active_nodes; - - hypergraph.doParallelForAllNodes([&](const HypernodeID& hn) { - if ( _context.refinement.label_propagation.rebalancing || hypergraph.isBorderNode(hn) ) { - tmp_active_nodes.stream(hn); + tbb::parallel_for(UL(0), _active_nodes.size(), [&](const size_t j) { + if (_active_node_was_moved[j]) { + node_fn(j); } }); - - _active_nodes = tmp_active_nodes.copy_parallel(); } } + template + void LabelPropagationRefiner::initializeImpl(mt_kahypar_partitioned_hypergraph_t& phg) { + _rebalancer.initialize(phg); // TODO: probably wrong place for this + } + template void LabelPropagationRefiner::initializeActiveNodes( PartitionedHypergraph& hypergraph, const parallel::scalable_vector& refinement_nodes) { - ActiveNodes tmp_active_nodes; - _active_nodes = std::move(tmp_active_nodes); - + _active_nodes.clear(); if ( refinement_nodes.empty() ) { + _might_be_uninitialized = false; if ( _context.refinement.label_propagation.execute_sequential ) { for ( const HypernodeID hn : hypergraph.nodes() ) { - if ( _context.refinement.label_propagation.rebalancing || - hypergraph.isBorderNode(hn) ) { + if ( _context.refinement.label_propagation.rebalancing || hypergraph.isBorderNode(hn) ) { _active_nodes.push_back(hn); } + if ( _context.refinement.label_propagation.unconstrained ) { + _old_part[hn] = hypergraph.partID(hn); + } } } else { // Setup active nodes in parallel // A node is active, if it is a border vertex. 
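The ```_old_part```/```_old_part_is_initialized``` bookkeeping introduced by the surrounding hunks implements one simple idea: remember each node's block before its first optimistic move of a round, and restore the recorded blocks if the round turns into a net loss after rebalancing. A self-contained toy version of this mechanism (standalone illustration, all names hypothetical):

```cpp
#include <cstdint>
#include <vector>

using PartID = int32_t;

// Toy rollback log: records the previous block of every node touched in a
// round and can restore all of them at once, analogous to _old_part and the
// rollback loop in applyRebalancing.
struct RollbackLog {
  std::vector<PartID> old_part;    // previous block per node, -1 = not touched
  std::vector<uint32_t> touched;   // nodes moved in the current round

  explicit RollbackLog(size_t num_nodes) : old_part(num_nodes, -1) {}

  void recordBeforeMove(uint32_t node, PartID current_block) {
    if ( old_part[node] == -1 ) {  // only the block before the *first* move counts
      old_part[node] = current_block;
      touched.push_back(node);
    }
  }

  // Restore every touched node to its recorded block and clear the log.
  void rollback(std::vector<PartID>& partition) {
    for ( uint32_t node : touched ) {
      partition[node] = old_part[node];
      old_part[node] = -1;
    }
    touched.clear();
  }
};
```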
NextActiveNodes tmp_active_nodes; - - auto add_vertex = [&](const HypernodeID& hn) { - if ( _next_active.compare_and_set_to_true(hn) ) { - tmp_active_nodes.stream(hn); - } - }; - hypergraph.doParallelForAllNodes([&](const HypernodeID& hn) { - if ( _context.refinement.label_propagation.rebalancing || - hypergraph.isBorderNode(hn) ) { - add_vertex(hn); + if ( _context.refinement.label_propagation.rebalancing || hypergraph.isBorderNode(hn) ) { + if ( _next_active.compare_and_set_to_true(hn) ) { + tmp_active_nodes.stream(hn); + } + } + if ( _context.refinement.label_propagation.unconstrained ) { + _old_part[hn] = hypergraph.partID(hn); } }); @@ -227,6 +354,25 @@ namespace mt_kahypar { } } else { _active_nodes = refinement_nodes; + + if ( _context.refinement.label_propagation.unconstrained ) { + auto set_old_part = [&](const size_t& i) { + const HypernodeID hn = refinement_nodes[i]; + _old_part[hn] = hypergraph.partID(hn); + _old_part_is_initialized.set(hn, true); + }; + + // we don't want to scan the whole graph for localized LP + _might_be_uninitialized = true; + _old_part_is_initialized.reset(); + if ( _context.refinement.label_propagation.execute_sequential ) { + for (size_t i = 0; i < refinement_nodes.size(); ++i) { + set_old_part(i); + } + } else { + tbb::parallel_for(UL(0), refinement_nodes.size(), set_old_part); + } + } } _next_active.reset(); @@ -238,4 +384,4 @@ namespace mt_kahypar { // explicitly instantiate so the compiler can generate them when compiling this cpp file INSTANTIATE_CLASS_WITH_TYPE_TRAITS_AND_GAIN_TYPES(LABEL_PROPAGATION_REFINER) -} \ No newline at end of file +} diff --git a/mt-kahypar/partition/refinement/label_propagation/label_propagation_refiner.h b/mt-kahypar/partition/refinement/label_propagation/label_propagation_refiner.h index 78ff61d6c..52e90c9d1 100644 --- a/mt-kahypar/partition/refinement/label_propagation/label_propagation_refiner.h +++ b/mt-kahypar/partition/refinement/label_propagation/label_propagation_refiner.h @@ -34,6 +34,7 @@ #include "mt-kahypar/parallel/stl/scalable_vector.h" #include "mt-kahypar/partition/context.h" #include "mt-kahypar/partition/refinement/i_refiner.h" +#include "mt-kahypar/partition/refinement/i_rebalancer.h" #include "mt-kahypar/partition/refinement/gains/gain_cache_ptr.h" #include "mt-kahypar/utils/cast.h" @@ -56,7 +57,9 @@ class LabelPropagationRefiner final : public IRefiner { explicit LabelPropagationRefiner(const HypernodeID num_hypernodes, const HyperedgeID num_hyperedges, const Context& context, - GainCache& gain_cache) : + GainCache& gain_cache, + IRebalancer& rb) : + _might_be_uninitialized(false), _context(context), _gain_cache(gain_cache), _current_k(context.partition.k), @@ -64,16 +67,20 @@ class LabelPropagationRefiner final : public IRefiner { _current_num_edges(kInvalidHyperedge), _gain(context), _active_nodes(), - _active_node_was_moved(num_hypernodes, uint8_t(false)), + _active_node_was_moved(2 * num_hypernodes, uint8_t(false)), + _old_part(_context.refinement.label_propagation.unconstrained ? num_hypernodes : 0, kInvalidPartition), + _old_part_is_initialized(_context.refinement.label_propagation.unconstrained ? num_hypernodes : 0), _next_active(num_hypernodes), - _visited_he(num_hyperedges) { } + _visited_he(Hypergraph::is_graph ? 
0 : num_hyperedges), + _rebalancer(rb) { } explicit LabelPropagationRefiner(const HypernodeID num_hypernodes, const HyperedgeID num_hyperedges, const Context& context, - gain_cache_t gain_cache) : + gain_cache_t gain_cache, + IRebalancer& rb) : LabelPropagationRefiner(num_hypernodes, num_hyperedges, context, - GainCachePtr::cast(gain_cache)) { } + GainCachePtr::cast(gain_cache), rb) { } LabelPropagationRefiner(const LabelPropagationRefiner&) = delete; LabelPropagationRefiner(LabelPropagationRefiner&&) = delete; @@ -87,100 +94,92 @@ class LabelPropagationRefiner final : public IRefiner { Metrics& best_metrics, double) final ; - void labelPropagation(PartitionedHypergraph& phg); + void labelPropagation(PartitionedHypergraph& phg, Metrics& best_metrics); - bool labelPropagationRound(PartitionedHypergraph& hypergraph, NextActiveNodes& next_active_nodes); + bool labelPropagationRound(PartitionedHypergraph& hypergraph, + NextActiveNodes& next_active_nodes, + Metrics& best_metrics, + vec& rebalance_moves, + bool unconstrained_lp); + + template + void moveActiveNodes(PartitionedHypergraph& hypergraph, NextActiveNodes& next_active_nodes); + + bool applyRebalancing(PartitionedHypergraph& hypergraph, + Metrics& best_metrics, + Metrics& current_metrics, + vec& rebalance_moves); template + void forEachMovedNode(F node_fn); + + template bool moveVertex(PartitionedHypergraph& hypergraph, const HypernodeID hn, NextActiveNodes& next_active_nodes, - const F& objective_delta) { - bool is_moved = false; - ASSERT(hn != kInvalidHypernode); - if ( hypergraph.isBorderNode(hn) && !hypergraph.isFixed(hn) ) { - ASSERT(hypergraph.nodeIsEnabled(hn)); - - Move best_move = _gain.computeMaxGainMove(hypergraph, hn); - // We perform a move if it either improves the solution quality or, in case of a - // zero gain move, the balance of the solution. - const bool positive_gain = best_move.gain < 0; - const bool zero_gain_move = (_context.refinement.label_propagation.rebalancing && - best_move.gain == 0 && - hypergraph.partWeight(best_move.from) - 1 > - hypergraph.partWeight(best_move.to) + 1 && - hypergraph.partWeight(best_move.to) < - _context.partition.perfect_balance_part_weights[best_move.to]); - const bool perform_move = positive_gain || zero_gain_move; - if (best_move.from != best_move.to && perform_move) { - PartitionID from = best_move.from; - PartitionID to = best_move.to; - - Gain delta_before = _gain.localDelta(); - bool changed_part = changeNodePart(hypergraph, hn, from, to, objective_delta); - is_moved = true; - if (changed_part) { - // In case the move to block 'to' was successful, we verify that the "real" gain - // of the move is either equal to our computed gain or if not, still improves - // the solution quality. 
- Gain move_delta = _gain.localDelta() - delta_before; - bool accept_move = (move_delta == best_move.gain || move_delta <= 0); - if (accept_move) { - DBG << "Move hypernode" << hn << "from block" << from << "to block" << to - << "with gain" << best_move.gain << "( Real Gain: " << move_delta << ")"; - - // Set all neighbors of the vertex to active - for (const HyperedgeID& he : hypergraph.incidentEdges(hn)) { - if ( hypergraph.edgeSize(he) <= - ID(_context.refinement.label_propagation.hyperedge_size_activation_threshold) ) { - if ( !_visited_he[he] ) { - for (const HypernodeID& pin : hypergraph.pins(he)) { - if ( _next_active.compare_and_set_to_true(pin) ) { - next_active_nodes.stream(pin); - } - } - _visited_he.set(he, true); - } - } - } - if ( _next_active.compare_and_set_to_true(hn) ) { - next_active_nodes.stream(hn); - } - } else { - DBG << "Revert move of hypernode" << hn << "from block" << from << "to block" << to - << "( Expected Gain:" << best_move.gain << ", Real Gain:" << move_delta << ")"; - // In case, the real gain is not equal with the computed gain and - // worsen the solution quality we revert the move. - ASSERT(hypergraph.partID(hn) == to); - changeNodePart(hypergraph, hn, to, from, objective_delta); - } - } - } - } - - return is_moved; - } + const F& objective_delta); void initializeActiveNodes(PartitionedHypergraph& hypergraph, const parallel::scalable_vector& refinement_nodes); void initializeImpl(mt_kahypar_partitioned_hypergraph_t&) final; - template + template bool changeNodePart(PartitionedHypergraph& phg, const HypernodeID hn, const PartitionID from, const PartitionID to, const F& objective_delta) { - bool success = false; - if ( _context.forceGainCacheUpdates() && _gain_cache.isInitialized() ) { - success = phg.changeNodePart(_gain_cache, hn, from, to, - _context.partition.max_part_weights[to], [] { }, objective_delta); + HypernodeWeight max_weight = unconstrained ? 
std::numeric_limits<HypernodeWeight>::max()
+                                                : _context.partition.max_part_weights[to];
+    if ( _gain_cache.isInitialized() ) {
+      return phg.changeNodePart(_gain_cache, hn, from, to, max_weight, []{}, objective_delta);
+    } else {
+      return phg.changeNodePart(hn, from, to, max_weight, []{}, objective_delta);
+    }
+  }
+
+  MT_KAHYPAR_ATTRIBUTE_ALWAYS_INLINE
+  void activateNodeAndNeighbors(PartitionedHypergraph& hypergraph,
+                                NextActiveNodes& next_active_nodes,
+                                const HypernodeID hn,
+                                bool activate_moved) {
+    auto activate = [&](const HypernodeID hn) {
+      bool old_part_uninitialized = _might_be_uninitialized && !_old_part_is_initialized[hn];
+      if (activate_moved || old_part_uninitialized || hypergraph.partID(hn) == _old_part[hn]) {
+        if ( _next_active.compare_and_set_to_true(hn) ) {
+          next_active_nodes.stream(hn);
+          if ( old_part_uninitialized ) {
+            _old_part[hn] = hypergraph.partID(hn);
+            _old_part_is_initialized.set(hn, true);
+          }
+        }
+      }
+    };
+
+    // Set all neighbors of the vertex to active
+    if constexpr (Hypergraph::is_graph) {
+      for (const HyperedgeID& he : hypergraph.incidentEdges(hn)) {
+        activate(hypergraph.edgeTarget(he));
+      }
     } else {
-      success = phg.changeNodePart(hn, from, to,
-                                   _context.partition.max_part_weights[to], []{}, objective_delta);
+      for (const HyperedgeID& he : hypergraph.incidentEdges(hn)) {
+        if ( hypergraph.edgeSize(he) <=
+              ID(_context.refinement.label_propagation.hyperedge_size_activation_threshold) ) {
+          if ( !_visited_he[he] ) {
+            for (const HypernodeID& pin : hypergraph.pins(he)) {
+              activate(pin);
+            }
+            _visited_he.set(he, true);
+          }
+        }
+      }
+    }
+
+    if ( activate_moved && _next_active.compare_and_set_to_true(hn) ) {
+      ASSERT(!_might_be_uninitialized);
+      next_active_nodes.stream(hn);
     }
-    return success;
   }

   void resizeDataStructuresForCurrentK() {
@@ -189,9 +188,13 @@ class LabelPropagationRefiner final : public IRefiner {
     if ( _current_k != _context.partition.k ) {
       _current_k = _context.partition.k;
       _gain.changeNumberOfBlocks(_current_k);
+      if ( _gain_cache.isInitialized() ) {
+        _gain_cache.changeNumberOfBlocks(_current_k);
+      }
     }
   }

+  bool _might_be_uninitialized;
   const Context& _context;
   GainCache& _gain_cache;
   PartitionID _current_k;
@@ -200,8 +203,11 @@ class LabelPropagationRefiner final : public IRefiner {
   GainCalculator _gain;
   ActiveNodes _active_nodes;
   parallel::scalable_vector<uint8_t> _active_node_was_moved;
+  parallel::scalable_vector<PartitionID> _old_part;
+  kahypar::ds::FastResetFlagArray<> _old_part_is_initialized;
   ds::ThreadSafeFastResetFlagArray<> _next_active;
   kahypar::ds::FastResetFlagArray<> _visited_he;
+  IRebalancer& _rebalancer;
 };

 }  // namespace kahypar
diff --git a/mt-kahypar/partition/refinement/rebalancing/advanced_rebalancer.cpp b/mt-kahypar/partition/refinement/rebalancing/advanced_rebalancer.cpp
new file mode 100644
index 000000000..9366dff86
--- /dev/null
+++ b/mt-kahypar/partition/refinement/rebalancing/advanced_rebalancer.cpp
@@ -0,0 +1,537 @@
+/*******************************************************************************
+ * MIT License
+ *
+ * This file is part of Mt-KaHyPar.
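The graph/hypergraph split in activateNodeAndNeighbors above is worth isolating: for plain graphs every edge has exactly one target, so neither the visited-edge bitmap nor the size threshold is needed. A condensed sketch of just that dispatch, assuming the interfaces used above:

```cpp
// Sketch: neighbor activation specialized on HG::is_graph.
template <typename HG, typename ActivateFn>
void activateNeighborsSketch(const HG& hg, const HypernodeID hn,
                             ActivateFn&& activate,
                             kahypar::ds::FastResetFlagArray<>& visited_he,
                             const HypernodeID size_threshold) {
  if constexpr (HG::is_graph) {
    for (const HyperedgeID he : hg.incidentEdges(hn)) {
      activate(hg.edgeTarget(he));     // graphs: one neighbor per edge
    }
  } else {
    for (const HyperedgeID he : hg.incidentEdges(hn)) {
      // skip huge hyperedges and visit each remaining edge only once
      if (hg.edgeSize(he) <= size_threshold && !visited_he[he]) {
        for (const HypernodeID pin : hg.pins(he)) { activate(pin); }
        visited_he.set(he, true);
      }
    }
  }
}
```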
+ * + * Copyright (C) 2023 Lars Gottesbüren + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + ******************************************************************************/ + +#include "mt-kahypar/partition/refinement/rebalancing/advanced_rebalancer.h" + +#include + +#include "mt-kahypar/partition/refinement/gains/gain_definitions.h" +#include "mt-kahypar/utils/cast.h" +#include "mt-kahypar/partition/context.h" + +#include "pcg_random.hpp" + +namespace mt_kahypar { + +namespace impl { + + float transformGain(Gain gain_, HypernodeWeight wu) { + float gain = gain_; + if (gain > 0) { + gain *= wu; + } else if (gain < 0) { + gain /= wu; + } + return gain; + } + + template + std::pair computeBestTargetBlock( + const PartitionedHypergraph& phg, const Context& context, const GainCache& gain_cache, + HypernodeID u, PartitionID from) { + const HypernodeWeight wu = phg.nodeWeight(u); + const HypernodeWeight from_weight = phg.partWeight(from); + PartitionID to = kInvalidPartition; + HyperedgeWeight to_benefit = std::numeric_limits::min(); + HypernodeWeight best_to_weight = from_weight - wu; + for (PartitionID i = 0; i < context.partition.k; ++i) { + if (i != from) { + const HypernodeWeight to_weight = phg.partWeight(i); + const HyperedgeWeight benefit = gain_cache.benefitTerm(u, i); + if ((benefit > to_benefit || (benefit == to_benefit && to_weight < best_to_weight)) && + to_weight + wu <= context.partition.max_part_weights[i]) { + to_benefit = benefit; + to = i; + best_to_weight = to_weight; + } + } + } + + Gain gain = std::numeric_limits::min(); + if (to != kInvalidPartition) { + gain = to_benefit - gain_cache.penaltyTerm(u, phg.partID(u)); + } + return std::make_pair(to, transformGain(gain, wu)); + } + + template + std::pair bestOfThree( + const PartitionedHypergraph& phg, const Context& context, const GainCache& gain_cache, + HypernodeID u, PartitionID from, std::array parts) { + const HypernodeWeight wu = phg.nodeWeight(u); + const HypernodeWeight from_weight = phg.partWeight(from); + PartitionID to = kInvalidPartition; + HyperedgeWeight to_benefit = std::numeric_limits::min(); + HypernodeWeight best_to_weight = from_weight - wu; + for (PartitionID i : parts) { + if (i != from && i != kInvalidPartition) { + const HypernodeWeight to_weight = phg.partWeight(i); + const HyperedgeWeight benefit = gain_cache.benefitTerm(u, i); + if ((benefit > to_benefit || (benefit == to_benefit && to_weight < best_to_weight)) && + to_weight + wu <= context.partition.max_part_weights[i]) { 
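For intuition, transformGain above turns the raw integer gain into a weight-normalized float priority: positive gains are multiplied by the node weight, negative gains divided by it, so heavy nodes are preferred in both cases. A few concrete values (illustrative only):

```cpp
#include <cassert>

// Worked example for impl::transformGain(gain, weight) from above:
//   +4 with weight 2 -> +8.0f   heavy positive-gain nodes rank highest
//   -4 with weight 2 -> -2.0f   a heavy node costs less per unit of weight
//   -4 with weight 1 -> -4.0f   ... and therefore outranks this light node
void transformGainExamples() {
  assert(impl::transformGain( 4, 2) ==  8.0f);
  assert(impl::transformGain(-4, 2) == -2.0f);
  assert(impl::transformGain(-4, 1) == -4.0f);
}
```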
+ to_benefit = benefit; + to = i; + best_to_weight = to_weight; + } + } + } + + Gain gain = std::numeric_limits::min(); + if (to != kInvalidPartition) { + gain = to_benefit - gain_cache.penaltyTerm(u, phg.partID(u)); + } + return std::make_pair(to, transformGain(gain, wu)); + } + + struct AccessToken { + AccessToken(int seed, size_t num_pqs) : dist(0, num_pqs - 1) { rng.seed(seed); } + size_t getRandomPQ() { return dist(rng); } + + std::array getTwoRandomPQs() { + std::array result({getRandomPQ(), getRandomPQ()}); + while (result[0] == result[1]) { result[1] = getRandomPQ(); } + return result; + } + + pcg32 rng; + std::uniform_int_distribution dist; + }; + + + template + struct NextMoveFinder { + Move next_move; + + PartitionedHypergraph& _phg; + GainCache& _gain_cache; + const Context& _context; + + vec& _pqs; + ds::Array& _target_part; + ds::Array& _node_state; + AccessToken _token; + + NextMoveFinder(int seed, const Context& context, PartitionedHypergraph& phg, GainCache& gain_cache, + vec& pqs, + ds::Array& target_part, ds::Array& node_state) : + _phg(phg), _gain_cache(gain_cache), _context(context), + _pqs(pqs), _target_part(target_part), _node_state(node_state), _token(seed, pqs.size()) { } + + + void recomputeTopGainMove(HypernodeID v, const Move& move /* of the neighbor */) { + float gain = 0; + PartitionID newTarget = kInvalidPartition; + const PartitionID designatedTargetV = _target_part[v]; + if (_context.partition.k < 4 || designatedTargetV == move.from || designatedTargetV == move.to) { + std::tie(newTarget, gain) = computeBestTargetBlock(_phg, _context, _gain_cache, v, _phg.partID(v)); + } else { + std::tie(newTarget, gain) = bestOfThree(_phg, _context, _gain_cache, + v, _phg.partID(v), {designatedTargetV, move.from, move.to}); + } + _target_part[v] = newTarget; + } + + bool checkCandidate(HypernodeID u, float& gain_in_pq) { + if (!_node_state[u].tryLock()) return false; + auto [to, true_gain] = computeBestTargetBlock(_phg, _context, _gain_cache, u, _phg.partID(u)); + if (true_gain >= gain_in_pq) { + next_move.node = u; + next_move.to = to; + next_move.from = _phg.partID(u); + next_move.gain = true_gain; + return true; + } else { + _target_part[u] = to; + gain_in_pq = true_gain; + _node_state[u].unlock(); + return false; + } + } + + bool lockedModifyPQ(size_t best_id) { + auto& gpq = _pqs[best_id]; + auto& pq = gpq.pq; + + HypernodeID node = pq.top(); + float gain_in_pq = pq.topKey(); + const bool success = checkCandidate(node, gain_in_pq); + + if (success) { + pq.deleteTop(); + gpq.top_key = pq.empty() ? std::numeric_limits::min() : pq.topKey(); + } else { + // gain was updated by success_func in this case + if (_target_part[node] != kInvalidPartition) { + pq.adjustKey(node, gain_in_pq); + gpq.top_key = pq.topKey(); + } else { + pq.deleteTop(); + gpq.top_key = pq.empty() ? std::numeric_limits::min() : pq.topKey(); + } + } + gpq.lock.unlock(); + return success; + } + + bool tryPop() { + static constexpr size_t NUM_TRIES = 32; + for (size_t i = 0; i < NUM_TRIES; ++i) { + auto two = _token.getTwoRandomPQs(); + auto& first = _pqs[two[0]]; + auto& second = _pqs[two[1]]; + if (first.pq.empty() && second.pq.empty()) continue; + size_t best_id = two[0]; + if (first.pq.empty() || first.top_key < second.top_key) best_id = two[1]; + if (!_pqs[best_id].lock.tryLock()) continue; + // could also check for top key. 
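AccessToken::getTwoRandomPQs implements the classic "power of two choices" trick: drawing two random queues and working on the better one keeps the per-thread priority queues evenly drained while keeping lock contention low. A self-contained sketch with the standard-library RNG in place of pcg32 (illustrative only):

```cpp
#include <array>
#include <random>

struct TwoChoicePickerSketch {
  std::mt19937 rng;
  std::uniform_int_distribution<size_t> dist;
  TwoChoicePickerSketch(int seed, size_t num_pqs) : rng(seed), dist(0, num_pqs - 1) {}

  std::array<size_t, 2> pick() {
    std::array<size_t, 2> r{dist(rng), dist(rng)};
    while (r[0] == r[1]) { r[1] = dist(rng); }  // ensure two distinct queues
    return r;
  }
};
```

Note that the retry loop would spin forever with a single queue; the real code rules that out by construction, since it always creates 2 * num_threads queues.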
would want to distinguish tries that failed due to high contention
+      // vs approaching the end
+      if (_pqs[best_id].pq.empty()) {
+        _pqs[best_id].lock.unlock();
+        continue;
+      }
+      if (lockedModifyPQ(best_id)) return true;
+      // if you got a PQ but it fails because the node's gain was wrong or the node couldn't be locked
+      // (success_func failed) then we still want to use the standard method
+      i = 0;
+    }
+
+    while (true) {
+      float best_key = std::numeric_limits<float>::min();
+      int best_id = -1;
+      for (size_t i = 0; i < _pqs.size(); ++i) {
+        if (!_pqs[i].pq.empty() && _pqs[i].top_key > best_key) {
+          best_key = _pqs[i].top_key;
+          best_id = i;
+        }
+      }
+      if (best_id == -1) return false;
+      if (!_pqs[best_id].lock.tryLock()) continue;
+      if (_pqs[best_id].pq.empty()) {
+        _pqs[best_id].lock.unlock();
+        continue;
+      }
+      if (lockedModifyPQ(best_id)) return true;
+    }
+  }
+
+  bool findNextMove() {
+    return tryPop();
+  }
+};
+
+  void deactivateOverloadedBlock(uint8_t* is_overloaded, size_t* num_overloaded_blocks) {
+    if (*is_overloaded) {
+      uint8_t expected = 1;
+      if (__atomic_compare_exchange_n(is_overloaded, &expected, 0, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
+        __atomic_fetch_sub(num_overloaded_blocks, 1, __ATOMIC_RELAXED);
+      }
+    }
+  }
+
+} // namespace impl
+
+
+  template <typename TypeTraits, typename GainTypes>
+  void AdvancedRebalancer<TypeTraits, GainTypes>::insertNodesInOverloadedBlocks(mt_kahypar_partitioned_hypergraph_t& hypergraph) {
+    auto& phg = utils::cast<PartitionedHypergraph>(hypergraph);
+
+    // init PQs if not done before
+    const size_t num_pqs = 2 * _context.shared_memory.num_threads;
+    if (_pqs.size() != num_pqs) {
+      _pqs.assign(num_pqs, rebalancer::GuardedPQ(_pq_handles.data(), _node_state.size()));
+    }
+    for (auto& gpq : _pqs) {
+      gpq.reset();
+    }
+
+    // data structures to draw random PQs
+    std::atomic<int> seed { 555 };
+    tbb::enumerable_thread_specific<impl::AccessToken> ets_tokens([&]() {
+      return impl::AccessToken(seed.fetch_add(1, std::memory_order_relaxed), num_pqs);
+    });
+
+    // insert nodes into PQs
+    phg.doParallelForAllNodes([&](HypernodeID u) {
+      const PartitionID b = phg.partID(u);
+      if (!_is_overloaded[b] || phg.isFixed(u)) return;
+
+      auto [target, gain] = impl::computeBestTargetBlock(phg, _context, _gain_cache, u, phg.partID(u));
+      if (target == kInvalidPartition) return;
+
+      _node_state[u].markAsMovable();
+      _target_part[u] = target;
+
+      auto& token = ets_tokens.local();
+      int my_pq_id = -1;
+      while (true) {
+        my_pq_id = token.getRandomPQ();
+        if (_pqs[my_pq_id].lock.tryLock()) {
+          break;
+        }
+      }
+      _pqs[my_pq_id].pq.insert(u, gain);
+      _pqs[my_pq_id].lock.unlock();
+      _pq_id[u] = my_pq_id;
+    });
+
+
+    for (rebalancer::GuardedPQ& gpq : _pqs) {
+      if (!gpq.pq.empty()) {
+        gpq.top_key = gpq.pq.topKey();
+      }
+    }
+  }
+
+  template <typename TypeTraits, typename GainTypes>
+  std::pair<int64_t, size_t> AdvancedRebalancer<TypeTraits, GainTypes>::findMoves(mt_kahypar_partitioned_hypergraph_t& hypergraph) {
+    auto& phg = utils::cast<PartitionedHypergraph>(hypergraph);
+    int64_t attributed_gain = 0;
+    size_t global_move_id = 0;
+    size_t num_overloaded_blocks = _overloaded_blocks.size();
+
+    auto task = [&](size_t task_id) {
+      vec<HyperedgeID> edges_with_gain_changes;
+      Gain local_attributed_gain = 0;
+      vec<vec<HypernodeID>> nodes_to_update(_pqs.size());
+      vec<int> pqs_to_update;
+
+      const int seed = phg.initialNumNodes() + task_id;
+
+      impl::NextMoveFinder next_move_finder(seed, _context, phg, _gain_cache, _pqs, _target_part, _node_state);
+
+      while (num_overloaded_blocks > 0 && next_move_finder.findNextMove()) {
+        const Move& m = next_move_finder.next_move;
+        const PartitionID from = phg.partID(m.node);
+        _node_state[m.node].markAsMovedAndUnlock();
+
+        if (phg.partWeight(from) <= _context.partition.max_part_weights[from]) {
+
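deactivateOverloadedBlock above is a small but important piece: only the one thread whose compare-and-swap flips the flag from 1 to 0 decrements the shared counter, so a block that becomes balanced is subtracted exactly once no matter how many threads notice it concurrently. An equivalent formulation with std::atomic instead of the GCC builtins (sketch, hypothetical names):

```cpp
#include <atomic>
#include <cstddef>
#include <cstdint>

void deactivateBlockSketch(std::atomic<std::uint8_t>& is_overloaded,
                           std::atomic<std::size_t>& num_overloaded_blocks) {
  std::uint8_t expected = 1;
  // Only the thread that wins the 1 -> 0 transition decrements the counter,
  // so each block is counted down exactly once.
  if (is_overloaded.compare_exchange_strong(expected, 0,
                                            std::memory_order_acquire,
                                            std::memory_order_relaxed)) {
    num_overloaded_blocks.fetch_sub(1, std::memory_order_relaxed);
  }
}
```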
impl::deactivateOverloadedBlock(&_is_overloaded[from], &num_overloaded_blocks); + continue; + } + + edges_with_gain_changes.clear(); + size_t move_id = 0; + bool moved = phg.changeNodePart( + _gain_cache, m.node, m.from, m.to, + _context.partition.max_part_weights[m.to], + [&] { move_id = __atomic_fetch_add(&global_move_id, 1, __ATOMIC_RELAXED); }, + [&](const SynchronizedEdgeUpdate& sync_update) { + local_attributed_gain += AttributedGains::gain(sync_update); + if (!PartitionedHypergraph::is_graph && GainCache::triggersDeltaGainUpdate(sync_update)) { + edges_with_gain_changes.push_back(sync_update.he); + } + } + ); + + + + if (!moved) continue; + + auto update_neighbor = [&](HypernodeID v) { + if (v != m.node && _node_state[v].tryLock()) { + int my_pq_id = _pq_id[v]; + assert(my_pq_id != -1); + if (nodes_to_update[my_pq_id].empty()) { + pqs_to_update.push_back(my_pq_id); + } + nodes_to_update[my_pq_id].push_back(v); + next_move_finder.recomputeTopGainMove(v, m); + } + }; + + // update neighbors + if constexpr (PartitionedHypergraph::is_graph) { + for (const auto e : phg.incidentEdges(m.node)) { + HypernodeID v = phg.edgeTarget(e); + update_neighbor(v); + } + } else { + for (HyperedgeID e : edges_with_gain_changes) { + if (phg.edgeSize(e) < _context.partition.ignore_hyperedge_size_threshold) { + for (HypernodeID v : phg.pins(e)) { + update_neighbor(v); + } + } + } + } + + while (!pqs_to_update.empty()) { + for (size_t i = 0; i < pqs_to_update.size(); ++i) { + int my_pq_id = pqs_to_update[i]; + auto& gpq = _pqs[my_pq_id]; + auto& pq = gpq.pq; + if (gpq.lock.tryLock()) { + for (HypernodeID v : nodes_to_update[my_pq_id]) { + if (pq.contains(v)) { + if (_target_part[v] != kInvalidPartition) { + Gain new_gain_int = _gain_cache.gain(v, phg.partID(v), _target_part[v]); + float new_gain = impl::transformGain(new_gain_int, phg.nodeWeight(v)); + pq.adjustKey(v, new_gain); + } else { + pq.remove(v); + } + } + _node_state[v].unlock(); + } + + gpq.lock.unlock(); + pqs_to_update[i] = pqs_to_update.back(); + pqs_to_update.pop_back(); + nodes_to_update[my_pq_id].clear(); + } + } + } + + _moves[move_id] = m; + } + __atomic_fetch_add(&attributed_gain, local_attributed_gain, __ATOMIC_RELAXED); + }; + + tbb::task_group tg; + for (size_t i = 0; i < _context.shared_memory.num_threads; ++i) { tg.run(std::bind(task, i)); } + tg.wait(); + + return std::make_pair(attributed_gain, global_move_id); + } + + template + bool AdvancedRebalancer::refineInternalParallel(mt_kahypar_partitioned_hypergraph_t& hypergraph, + vec>* moves_by_part, + vec* moves_linear, + Metrics& best_metric) { + auto& phg = utils::cast(hypergraph); + + if (!_gain_cache.isInitialized()) { + _gain_cache.initializeGainCache(phg); + } + + _overloaded_blocks.clear(); + _is_overloaded.assign(_context.partition.k, false); + for (PartitionID k = 0; k < _context.partition.k; ++k) { + if (phg.partWeight(k) > _context.partition.max_part_weights[k]) { + _overloaded_blocks.push_back(k); + _is_overloaded[k] = 1; + } + } + + insertNodesInOverloadedBlocks(hypergraph); + + auto [attributed_gain, num_moves_performed] = findMoves(hypergraph); + + if (moves_by_part != nullptr) { + moves_by_part->resize(_context.partition.k); + for (auto& direction : *moves_by_part) direction.clear(); + for (size_t i = 0; i < num_moves_performed; ++i) { + (*moves_by_part)[_moves[i].from].push_back(_moves[i]); + } + } else if (moves_linear != nullptr) { + moves_linear->clear(); + moves_linear->reserve(num_moves_performed); + for (size_t i = 0; i < num_moves_performed; ++i) { + 
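One idiom in the move loop above deserves a note: the global move id is claimed inside changeNodePart's success callback, so ids are only consumed for moves that actually passed the weight check and the claimed ids form a dense prefix of _moves. A decoupled sketch of the pattern, independent of the hypergraph API:

```cpp
#include <atomic>
#include <cstddef>

std::atomic<std::size_t> next_move_id{0};

// TryApplyFn performs the guarded move and returns whether it succeeded;
// it must invoke the callback at most once, and only on success.
template <typename TryApplyFn>
bool applyAndRecordSketch(TryApplyFn&& try_apply, std::size_t& out_id) {
  return try_apply([&] {
    out_id = next_move_id.fetch_add(1, std::memory_order_relaxed);
  });
}
```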
moves_linear->push_back(_moves[i]); + } + } + + best_metric.quality += attributed_gain; + best_metric.imbalance = metrics::imbalance(phg, _context); + + size_t num_overloaded_blocks = 0; + for (PartitionID b = 0; b < _context.partition.k; ++b) { + if (phg.partWeight(b) > _context.partition.max_part_weights[b]) { + num_overloaded_blocks++; + } + } + + phg.doParallelForAllNodes([&](HypernodeID u) { + _node_state[u].reset(); + }); + + for (auto& gpq : _pqs) { + gpq.pq.clear(); + } + + return num_overloaded_blocks == 0; + } + + +template +AdvancedRebalancer::AdvancedRebalancer( + HypernodeID num_nodes, const Context& context, GainCache& gain_cache) : + _context(context), + _gain_cache(gain_cache), + _current_k(_context.partition.k), + _gain(context), + _moves(num_nodes), + _target_part(num_nodes, kInvalidPartition), + _pq_handles(num_nodes, invalid_position), + _pq_id(num_nodes, -1), + _node_state(num_nodes) { } + +template +AdvancedRebalancer::AdvancedRebalancer( + HypernodeID num_nodes, const Context& context, gain_cache_t gain_cache) : + AdvancedRebalancer(num_nodes, context, GainCachePtr::cast(gain_cache)) { } + + +template +bool AdvancedRebalancer::refineImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph, + const vec& , Metrics& best_metrics, double) { + return refineInternalParallel(hypergraph, nullptr, nullptr, best_metrics); +} + +template +void AdvancedRebalancer::initializeImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph) { + auto& phg = utils::cast(hypergraph); + unused(phg); +} + +template +bool AdvancedRebalancer::refineAndOutputMovesImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph, + const vec& , + vec>& moves_by_part, + Metrics& best_metrics, + const double) { + return refineInternalParallel(hypergraph, &moves_by_part, nullptr, best_metrics); +} + +template +bool AdvancedRebalancer::refineAndOutputMovesLinearImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph, + const vec& , + vec& moves, + Metrics& best_metrics, + const double) { + return refineInternalParallel(hypergraph, nullptr, &moves, best_metrics); +} + +// explicitly instantiate so the compiler can generate them when compiling this cpp file +namespace { + #define ADVANCED_REBALANCER(X, Y) AdvancedRebalancer +} + +// explicitly instantiate so the compiler can generate them when compiling this cpp file +INSTANTIATE_CLASS_WITH_TYPE_TRAITS_AND_GAIN_TYPES(ADVANCED_REBALANCER) + +} // namespace mt_kahypar diff --git a/mt-kahypar/partition/refinement/rebalancing/advanced_rebalancer.h b/mt-kahypar/partition/refinement/rebalancing/advanced_rebalancer.h new file mode 100644 index 000000000..39979f641 --- /dev/null +++ b/mt-kahypar/partition/refinement/rebalancing/advanced_rebalancer.h @@ -0,0 +1,142 @@ +/******************************************************************************* + * MIT License + * + * This file is part of Mt-KaHyPar. + * + * Copyright (C) 2023 Lars Gottesbüren + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + ******************************************************************************/ + +#pragma once + +#include "mt-kahypar/datastructures/priority_queue.h" +#include "mt-kahypar/partition/context.h" +#include "mt-kahypar/partition/metrics.h" +#include "mt-kahypar/partition/refinement/i_refiner.h" +#include "mt-kahypar/partition/refinement/i_rebalancer.h" +#include "mt-kahypar/partition/refinement/gains/gain_cache_ptr.h" + +namespace mt_kahypar { + +namespace rebalancer { + struct GuardedPQ { + GuardedPQ(PosT *handles, size_t num_nodes) : pq(handles, num_nodes) { } + SpinLock lock; + ds::MaxHeap pq; + float top_key = std::numeric_limits::min(); + void reset() { + pq.clear(); + top_key = std::numeric_limits::min(); + } + }; + + struct NodeState { + uint8_t state = 0; + + bool canMove() const { return state == 1; } + + bool isLocked() const { return state == 2; } + + bool wasMoved() const { return state == 3; } + + // Returns true if the node is marked as movable, is not locked and taking the lock now succeeds + bool tryLock() { + uint8_t expected = 1; + return state == 1 && __atomic_compare_exchange_n(&state, &expected, 2, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED); + } + + void unlock() { __atomic_store_n(&state, 1, __ATOMIC_RELEASE); } + + void markAsMovedAndUnlock() { __atomic_store_n(&state, 3, __ATOMIC_RELEASE); } + + void markAsMovable() { state = 1; } + + void reset() { state = 0; } + }; + +} // namespace rebalancer + + +template +class AdvancedRebalancer final : public IRebalancer { +private: + using PartitionedHypergraph = typename TypeTraits::PartitionedHypergraph; + using GainCache = typename GainTypes::GainCache; + using GainCalculator = typename GainTypes::GainComputation; + using AttributedGains = typename GainTypes::AttributedGains; + + static constexpr bool debug = false; + static constexpr bool enable_heavy_assert = false; + +public: + + explicit AdvancedRebalancer(HypernodeID num_nodes, + const Context& context, + GainCache& gain_cache); + + explicit AdvancedRebalancer(HypernodeID num_nodes, + const Context& context, + gain_cache_t gain_cache); + +private: + bool refineImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph, + const vec& refinement_nodes, + Metrics& best_metrics, + double); + + void initializeImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph) final; + + bool refineAndOutputMovesImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph, + const vec& refinement_nodes, + vec>& moves_by_part, + Metrics& best_metrics, + const double); + + bool refineAndOutputMovesLinearImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph, + const vec& refinement_nodes, + vec& moves, + Metrics& best_metrics, + const double); + + bool refineInternalParallel(mt_kahypar_partitioned_hypergraph_t& hypergraph, + vec>* moves_by_part, + vec* moves_linear, + Metrics& best_metric); + + const Context& _context; + GainCache& _gain_cache; + PartitionID _current_k; + GainCalculator _gain; + + + void insertNodesInOverloadedBlocks(mt_kahypar_partitioned_hypergraph_t& hypergraph); + std::pair 
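NodeState above packs a tiny per-node state machine into one byte: 0 = untouched, 1 = movable, 2 = locked, 3 = already moved, with tryLock being the only contended transition. The same protocol expressed with std::atomic instead of the GCC builtins (sketch):

```cpp
#include <atomic>
#include <cstdint>

struct NodeStateSketch {
  // 0 = untouched, 1 = movable, 2 = locked, 3 = already moved
  std::atomic<std::uint8_t> state{0};

  bool tryLock() {
    std::uint8_t expected = 1;
    // cheap pre-check, then CAS 1 -> 2; fails for untouched/locked/moved nodes
    return state.load(std::memory_order_relaxed) == 1 &&
           state.compare_exchange_strong(expected, 2, std::memory_order_acquire,
                                         std::memory_order_relaxed);
  }
  void unlock()               { state.store(1, std::memory_order_release); }
  void markAsMovedAndUnlock() { state.store(3, std::memory_order_release); }
};
```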
findMoves(mt_kahypar_partitioned_hypergraph_t& hypergraph); + + ds::Array _moves; + vec _pqs; + vec _overloaded_blocks; + vec _is_overloaded; + ds::Array _target_part; + ds::Array _pq_handles; + ds::Array _pq_id; + ds::Array _node_state; +}; + +} // namespace mt_kahypar diff --git a/mt-kahypar/partition/refinement/rebalancing/rebalancer.cpp b/mt-kahypar/partition/refinement/rebalancing/simple_rebalancer.cpp similarity index 94% rename from mt-kahypar/partition/refinement/rebalancing/rebalancer.cpp rename to mt-kahypar/partition/refinement/rebalancing/simple_rebalancer.cpp index 686c5c220..7bb9b2a09 100644 --- a/mt-kahypar/partition/refinement/rebalancing/rebalancer.cpp +++ b/mt-kahypar/partition/refinement/rebalancing/simple_rebalancer.cpp @@ -25,7 +25,7 @@ * SOFTWARE. ******************************************************************************/ -#include "mt-kahypar/partition/refinement/rebalancing//rebalancer.h" +#include "mt-kahypar/partition/refinement/rebalancing/simple_rebalancer.h" #include @@ -43,11 +43,12 @@ namespace mt_kahypar { template - bool Rebalancer::refineImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph, - const vec&, - Metrics& best_metrics, - double) { + bool SimpleRebalancer::refineImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph, + const vec&, + Metrics& best_metrics, + double) { PartitionedHypergraph& phg = utils::cast(hypergraph); + resizeDataStructuresForCurrentK(); // If partition is imbalanced, rebalancer is activated bool improvement = false; if ( !metrics::isBalanced(phg, _context) ) { @@ -59,7 +60,7 @@ namespace mt_kahypar { // This function is passed as lambda to the changeNodePart function and used // to calculate the "real" delta of a move (in terms of the used objective function). - auto objective_delta = [&](const SyncronizedEdgeUpdate& sync_update) { + auto objective_delta = [&](const SynchronizedEdgeUpdate& sync_update) { _gain.computeDeltaForHyperedge(sync_update); }; @@ -185,7 +186,7 @@ namespace mt_kahypar { } template - vec Rebalancer::repairEmptyBlocks(PartitionedHypergraph& phg) { + vec SimpleRebalancer::repairEmptyBlocks(PartitionedHypergraph& phg) { // First detect if there are any empty blocks. 
const size_t k = size_t(_context.partition.k); boost::dynamic_bitset<> is_empty(k); @@ -290,9 +291,9 @@ namespace mt_kahypar { // explicitly instantiate so the compiler can generate them when compiling this cpp file namespace { - #define REBALANCER(X, Y) Rebalancer + #define SIMPLE_REBALANCER(X, Y) SimpleRebalancer } // explicitly instantiate so the compiler can generate them when compiling this cpp file - INSTANTIATE_CLASS_WITH_TYPE_TRAITS_AND_GAIN_TYPES(REBALANCER) -} \ No newline at end of file + INSTANTIATE_CLASS_WITH_TYPE_TRAITS_AND_GAIN_TYPES(SIMPLE_REBALANCER) +} diff --git a/mt-kahypar/partition/refinement/rebalancing/rebalancer.h b/mt-kahypar/partition/refinement/rebalancing/simple_rebalancer.h similarity index 66% rename from mt-kahypar/partition/refinement/rebalancing/rebalancer.h rename to mt-kahypar/partition/refinement/rebalancing/simple_rebalancer.h index 84fcb5c92..20d3b497a 100644 --- a/mt-kahypar/partition/refinement/rebalancing/rebalancer.h +++ b/mt-kahypar/partition/refinement/rebalancing/simple_rebalancer.h @@ -32,14 +32,18 @@ #include "mt-kahypar/partition/context.h" #include "mt-kahypar/partition/metrics.h" #include "mt-kahypar/partition/refinement/i_refiner.h" +#include "mt-kahypar/partition/refinement/i_rebalancer.h" #include "mt-kahypar/partition/refinement/gains/km1/km1_gain_computation.h" #include "mt-kahypar/partition/refinement/gains/cut/cut_gain_computation.h" +#include "mt-kahypar/partition/refinement/gains/gain_cache_ptr.h" +#include "mt-kahypar/utils/cast.h" namespace mt_kahypar { template -class Rebalancer final : public IRefiner { +class SimpleRebalancer final : public IRebalancer { private: using PartitionedHypergraph = typename TypeTraits::PartitionedHypergraph; + using GainCache = typename GainTypes::GainCache; using GainCalculator = typename GainTypes::GainComputation; using AtomicWeight = parallel::IntegralAtomicWrapper; @@ -67,16 +71,23 @@ class Rebalancer final : public IRefiner { MovePQ pq; }; - explicit Rebalancer(const Context& context) : + explicit SimpleRebalancer(const Context& context) : _context(context), + _current_k(context.partition.k), _gain(context), _part_weights(_context.partition.k) { } - Rebalancer(const Rebalancer&) = delete; - Rebalancer(Rebalancer&&) = delete; + explicit SimpleRebalancer(HypernodeID , const Context& context, GainCache&) : + SimpleRebalancer(context) { } - Rebalancer & operator= (const Rebalancer &) = delete; - Rebalancer & operator= (Rebalancer &&) = delete; + explicit SimpleRebalancer(HypernodeID num_nodes, const Context& context, gain_cache_t gain_cache) : + SimpleRebalancer(num_nodes, context, GainCachePtr::cast(gain_cache)) {} + + SimpleRebalancer(const SimpleRebalancer&) = delete; + SimpleRebalancer(SimpleRebalancer&&) = delete; + + SimpleRebalancer & operator= (const SimpleRebalancer &) = delete; + SimpleRebalancer & operator= (SimpleRebalancer &&) = delete; bool refineImpl(mt_kahypar_partitioned_hypergraph_t& hypergraph, const vec&, @@ -85,6 +96,22 @@ class Rebalancer final : public IRefiner { void initializeImpl(mt_kahypar_partitioned_hypergraph_t&) final { } + bool refineAndOutputMovesImpl(mt_kahypar_partitioned_hypergraph_t&, + const vec&, + vec>&, + Metrics&, + const double) override final { + ERR("simple rebalancer can not be used for unconstrained refinement"); + } + + bool refineAndOutputMovesLinearImpl(mt_kahypar_partitioned_hypergraph_t&, + const vec&, + vec&, + Metrics&, + const double) override final { + ERR("simple rebalancer can not be used for unconstrained refinement"); + } + vec 
repairEmptyBlocks(PartitionedHypergraph& phg); private: @@ -115,7 +142,19 @@ class Rebalancer final : public IRefiner { return false; } + + void resizeDataStructuresForCurrentK() { + // If the number of blocks changes, we resize data structures + // (can happen during deep multilevel partitioning) + if ( _current_k != _context.partition.k ) { + _current_k = _context.partition.k; + _gain.changeNumberOfBlocks(_current_k); + _part_weights = parallel::scalable_vector(_context.partition.k); + } + } + const Context& _context; + PartitionID _current_k; GainCalculator _gain; parallel::scalable_vector _part_weights; }; diff --git a/mt-kahypar/partition/registries/register_refinement_algorithms.cpp b/mt-kahypar/partition/registries/register_refinement_algorithms.cpp index 1675b2da0..9a3684030 100644 --- a/mt-kahypar/partition/registries/register_refinement_algorithms.cpp +++ b/mt-kahypar/partition/registries/register_refinement_algorithms.cpp @@ -36,9 +36,9 @@ static kahypar::meta::Registrar register_ ## dispatcher( \ id, \ [](const HypernodeID num_hypernodes, const HyperedgeID num_hyperedges, \ - const Context& context, gain_cache_t gain_cache) { \ + const Context& context, gain_cache_t gain_cache, IRebalancer& rebalancer) { \ return dispatcher::create( \ - std::forward_as_tuple(num_hypernodes, num_hyperedges, context, gain_cache), \ + std::forward_as_tuple(num_hypernodes, num_hyperedges, context, gain_cache, rebalancer), \ __VA_ARGS__ \ ); \ }) @@ -47,17 +47,17 @@ static kahypar::meta::Registrar JOIN(register_ ## refiner, t)( \ id, \ [](const HypernodeID num_hypernodes, const HyperedgeID num_hyperedges, \ - const Context& context, gain_cache_t gain_cache) -> IRefiner* { \ - return new refiner(num_hypernodes, num_hyperedges, context, gain_cache); \ + const Context& context, gain_cache_t gain_cache, IRebalancer& rebalancer) -> IRefiner* { \ + return new refiner(num_hypernodes, num_hyperedges, context, gain_cache, rebalancer); \ }) #define REGISTER_DISPATCHED_FM_REFINER(id, dispatcher, ...) \ static kahypar::meta::Registrar register_ ## dispatcher( \ id, \ [](const HypernodeID num_hypernodes, const HyperedgeID num_hyperedges, \ - const Context& context, gain_cache_t gain_cache) { \ + const Context& context, gain_cache_t gain_cache, IRebalancer& rebalancer) { \ return dispatcher::create( \ - std::forward_as_tuple(num_hypernodes, num_hyperedges, context, gain_cache), \ + std::forward_as_tuple(num_hypernodes, num_hyperedges, context, gain_cache, rebalancer), \ __VA_ARGS__ \ ); \ }) @@ -66,8 +66,18 @@ static kahypar::meta::Registrar JOIN(register_ ## refiner, t)( \ id, \ [](const HypernodeID num_hypernodes, const HyperedgeID num_hyperedges, \ - const Context& context, gain_cache_t gain_cache) -> IRefiner* { \ - return new refiner(num_hypernodes, num_hyperedges, context, gain_cache); \ + const Context& context, gain_cache_t gain_cache, IRebalancer& rebalancer) -> IRefiner* { \ + return new refiner(num_hypernodes, num_hyperedges, context, gain_cache, rebalancer); \ + }) + +#define REGISTER_DISPATCHED_FM_STRATEGY(id, dispatcher, ...) \ + static kahypar::meta::Registrar register_ ## dispatcher( \ + id, \ + [](const Context& context, FMSharedData& shared_data) { \ + return dispatcher::create( \ + std::forward_as_tuple(context, shared_data), \ + __VA_ARGS__ \ + ); \ }) #define REGISTER_DISPATCHED_FLOW_SCHEDULER(id, dispatcher, ...) \ @@ -92,18 +102,18 @@ #define REGISTER_DISPATCHED_REBALANCER(id, dispatcher, ...) 
\ static kahypar::meta::Registrar register_ ## dispatcher( \ id, \ - [](const Context& context) { \ + [](HypernodeID num_hypernodes, const Context& context, gain_cache_t gain_cache) { \ return dispatcher::create( \ - std::forward_as_tuple(context), \ + std::forward_as_tuple(num_hypernodes, context, gain_cache), \ __VA_ARGS__ \ ); \ }) -#define REGISTER_REBALANCER(id, refiner, t) \ - static kahypar::meta::Registrar JOIN(register_ ## refiner, t)( \ - id, \ - [](const Context& context) -> IRefiner* { \ - return new refiner(context); \ +#define REGISTER_REBALANCER(id, refiner, t) \ + static kahypar::meta::Registrar JOIN(register_ ## refiner, t)( \ + id, \ + [](HypernodeID num_hypernodes, const Context& context, gain_cache_t gain_cache) -> IRebalancer* { \ + return new refiner(num_hypernodes, context, gain_cache); \ }) #define REGISTER_DISPATCHED_FLOW_REFINER(id, dispatcher, ...) \ @@ -137,12 +147,31 @@ REGISTER_DISPATCHED_LP_REFINER(LabelPropagationAlgorithm::deterministic, REGISTER_LP_REFINER(LabelPropagationAlgorithm::do_nothing, DoNothingRefiner, 1); REGISTER_DISPATCHED_FM_REFINER(FMAlgorithm::kway_fm, - FMDispatcher, + DefaultFMDispatcher, + kahypar::meta::PolicyRegistry::getInstance().getPolicy( + context.partition.partition_type), + kahypar::meta::PolicyRegistry::getInstance().getPolicy( + context.partition.gain_policy)); +REGISTER_DISPATCHED_FM_REFINER(FMAlgorithm::unconstrained_fm, + UnconstrainedFMDispatcher, kahypar::meta::PolicyRegistry::getInstance().getPolicy( context.partition.partition_type), kahypar::meta::PolicyRegistry::getInstance().getPolicy( context.partition.gain_policy)); -REGISTER_FM_REFINER(FMAlgorithm::do_nothing, DoNothingRefiner, 2); +REGISTER_FM_REFINER(FMAlgorithm::do_nothing, DoNothingRefiner, 3); + +REGISTER_DISPATCHED_FM_STRATEGY(FMAlgorithm::kway_fm, + GainCacheFMStrategyDispatcher, + kahypar::meta::PolicyRegistry::getInstance().getPolicy( + context.partition.partition_type), + kahypar::meta::PolicyRegistry::getInstance().getPolicy( + context.partition.gain_policy)); +REGISTER_DISPATCHED_FM_STRATEGY(FMAlgorithm::unconstrained_fm, + UnconstrainedFMStrategyDispatcher, + kahypar::meta::PolicyRegistry::getInstance().getPolicy( + context.partition.partition_type), + kahypar::meta::PolicyRegistry::getInstance().getPolicy( + context.partition.gain_policy)); REGISTER_DISPATCHED_FLOW_SCHEDULER(FlowAlgorithm::flow_cutter, FlowSchedulerDispatcher, @@ -150,15 +179,21 @@ REGISTER_DISPATCHED_FLOW_SCHEDULER(FlowAlgorithm::flow_cutter, context.partition.partition_type), kahypar::meta::PolicyRegistry::getInstance().getPolicy( context.partition.gain_policy)); -REGISTER_FLOW_SCHEDULER(FlowAlgorithm::do_nothing, DoNothingRefiner, 3); +REGISTER_FLOW_SCHEDULER(FlowAlgorithm::do_nothing, DoNothingRefiner, 4); REGISTER_DISPATCHED_REBALANCER(RebalancingAlgorithm::simple_rebalancer, - RebalancerDispatcher, + SimpleRebalancerDispatcher, kahypar::meta::PolicyRegistry::getInstance().getPolicy( context.partition.partition_type), kahypar::meta::PolicyRegistry::getInstance().getPolicy( context.partition.gain_policy)); -REGISTER_REBALANCER(RebalancingAlgorithm::do_nothing, DoNothingRefiner, 4); +REGISTER_DISPATCHED_REBALANCER(RebalancingAlgorithm::advanced_rebalancer, + AdvancedRebalancerDispatcher, + kahypar::meta::PolicyRegistry::getInstance().getPolicy( + context.partition.partition_type), + kahypar::meta::PolicyRegistry::getInstance().getPolicy( + context.partition.gain_policy)); +REGISTER_REBALANCER(RebalancingAlgorithm::do_nothing, DoNothingRefiner, 5); 
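The renumbered trailing arguments (3, 4, 5, 6) only keep the generated registrar variable names unique: JOIN(register_ ## refiner, t) expands to, e.g., register_DoNothingRefiner5, so each DoNothingRefiner registration needs its own tag once the new FM strategy registrations are inserted. For orientation, a hypothetical call site matching the new three-argument rebalancer lambda; the factory name and context field are assumptions and not verified against the actual registry code:

```cpp
// Assumed lookup, matching the (HypernodeID, const Context&, gain_cache_t)
// signature registered above -- illustrative only:
// IRebalancer* rebalancer = RebalancerFactory::getInstance().createObject(
//     context.refinement.rebalancer /* assumed field name */,
//     num_hypernodes, context, gain_cache);
```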
REGISTER_DISPATCHED_FLOW_REFINER(FlowAlgorithm::flow_cutter, FlowRefinementDispatcher, @@ -166,5 +201,5 @@ REGISTER_DISPATCHED_FLOW_REFINER(FlowAlgorithm::flow_cutter, context.partition.partition_type), kahypar::meta::PolicyRegistry::getInstance().getPolicy( context.partition.gain_policy)); -REGISTER_FLOW_REFINER(FlowAlgorithm::do_nothing, DoNothingFlowRefiner, 5); +REGISTER_FLOW_REFINER(FlowAlgorithm::do_nothing, DoNothingFlowRefiner, 6); } // namespace mt_kahypar diff --git a/tests/datastructures/delta_partitioned_graph_test.cc b/tests/datastructures/delta_partitioned_graph_test.cc index 797a3ef10..e1933f39b 100644 --- a/tests/datastructures/delta_partitioned_graph_test.cc +++ b/tests/datastructures/delta_partitioned_graph_test.cc @@ -102,7 +102,7 @@ class ADeltaPartitionedGraph : public Test { void changeNodePartWithGainCacheUpdate(const HypernodeID hn, const PartitionID from, const PartitionID to) { - auto delta_gain_update = [&](const SyncronizedEdgeUpdate& sync_update) { + auto delta_gain_update = [&](const SynchronizedEdgeUpdate& sync_update) { delta_gain_cache->deltaGainUpdate(*delta_phg, sync_update); }; delta_phg->changeNodePart(hn, from, to, 1000, delta_gain_update); @@ -183,4 +183,4 @@ TEST_F(ADeltaPartitionedGraph, MovesVertices) { } } // namespace ds -} // namespace mt_kahypar \ No newline at end of file +} // namespace mt_kahypar diff --git a/tests/datastructures/delta_partitioned_hypergraph_test.cc b/tests/datastructures/delta_partitioned_hypergraph_test.cc index 058cf4ec9..58e54c1f4 100644 --- a/tests/datastructures/delta_partitioned_hypergraph_test.cc +++ b/tests/datastructures/delta_partitioned_hypergraph_test.cc @@ -103,7 +103,7 @@ class ADeltaPartitionedHypergraph : public Test { void changeNodePartWithGainCacheUpdate(const HypernodeID hn, const PartitionID from, const PartitionID to) { - auto delta_gain_update = [&](const SyncronizedEdgeUpdate& sync_update) { + auto delta_gain_update = [&](const SynchronizedEdgeUpdate& sync_update) { delta_gain_cache->deltaGainUpdate(*delta_phg, sync_update); }; delta_phg->changeNodePart(hn, from, to, 1000, delta_gain_update); @@ -232,4 +232,4 @@ TEST_F(ADeltaPartitionedHypergraph, MovesSeveralVertices) { } } // namespace ds -} // namespace mt_kahypar \ No newline at end of file +} // namespace mt_kahypar diff --git a/tests/datastructures/partitioned_graph_test.cc b/tests/datastructures/partitioned_graph_test.cc index 0befeba99..1338812c7 100644 --- a/tests/datastructures/partitioned_graph_test.cc +++ b/tests/datastructures/partitioned_graph_test.cc @@ -394,7 +394,7 @@ TYPED_TEST(APartitionedGraph, ComputesDeltaAndGainsCorrectlyIfAllNodesMoveConcur this->gain_cache.initializeGainCache(this->partitioned_hypergraph); CAtomic delta(0); - auto delta_fun = [&](const SyncronizedEdgeUpdate& sync_update) { + auto delta_fun = [&](const SynchronizedEdgeUpdate& sync_update) { delta.fetch_add(CutAttributedGains::gain(sync_update)); }; @@ -423,4 +423,4 @@ TYPED_TEST(APartitionedGraph, ComputesDeltaAndGainsCorrectlyIfAllNodesMoveConcur } } // namespace ds -} // namespace mt_kahypar \ No newline at end of file +} // namespace mt_kahypar diff --git a/tests/datastructures/partitioned_hypergraph_smoke_test.cc b/tests/datastructures/partitioned_hypergraph_smoke_test.cc index c1619345b..085ed134c 100644 --- a/tests/datastructures/partitioned_hypergraph_smoke_test.cc +++ b/tests/datastructures/partitioned_hypergraph_smoke_test.cc @@ -140,7 +140,7 @@ void moveAllNodesOfHypergraphRandom(HyperGraph& hypergraph, tbb::enumerable_thread_specific deltas(0); - 
auto objective_delta = [&](const SyncronizedEdgeUpdate& sync_update) { + auto objective_delta = [&](const SynchronizedEdgeUpdate& sync_update) { if (objective == Objective::km1) { deltas.local() += Km1AttributedGains::gain(sync_update); } else if (objective == Objective::cut) { diff --git a/tests/parallel/work_container_test.cc b/tests/parallel/work_container_test.cc index a5abb536e..d685181d4 100644 --- a/tests/parallel/work_container_test.cc +++ b/tests/parallel/work_container_test.cc @@ -91,7 +91,7 @@ TEST(WorkContainer, WorkStealingWorks) { cdc.safe_push(i, thread_id); } - stage.fetch_add(1, std::memory_order_acq_rel); + stage.fetch_add(1, std::memory_order_acquire); int own_element; while (cdc.try_pop(own_element, thread_id)) { @@ -101,7 +101,7 @@ TEST(WorkContainer, WorkStealingWorks) { std::thread consumer([&] { int thread_id = 1; - while (stage.load(std::memory_order_acq_rel) < 1) { } //spin + while (stage.load(std::memory_order_acquire) < 1) { } //spin int stolen_element; while (cdc.try_pop(stolen_element, thread_id)) { diff --git a/tests/partition/initial_partitioning/pool_initial_partitioner_test.cc b/tests/partition/initial_partitioning/pool_initial_partitioner_test.cc index b04eb2e33..754d1ebf5 100644 --- a/tests/partition/initial_partitioning/pool_initial_partitioner_test.cc +++ b/tests/partition/initial_partitioning/pool_initial_partitioner_test.cc @@ -34,7 +34,6 @@ #include "mt-kahypar/utils/utilities.h" #include "mt-kahypar/io/hypergraph_factory.h" #include "mt-kahypar/partition/registries/register_initial_partitioning_algorithms.h" -#include "mt-kahypar/partition/registries/register_refinement_algorithms.cpp" #include "mt-kahypar/partition/initial_partitioning/pool_initial_partitioner.h" using ::testing::Test; diff --git a/tests/partition/refinement/CMakeLists.txt b/tests/partition/refinement/CMakeLists.txt index 4467f0901..e8377c144 100644 --- a/tests/partition/refinement/CMakeLists.txt +++ b/tests/partition/refinement/CMakeLists.txt @@ -8,6 +8,7 @@ target_sources(mt_kahypar_tests PRIVATE label_propagation_refiner_test.cc rollback_test.cc rebalance_test.cc + advanced_rebalancer_test.cc twoway_fm_refiner_test.cc gain_test.cc gain_cache_test.cc diff --git a/tests/partition/refinement/advanced_rebalancer_test.cc b/tests/partition/refinement/advanced_rebalancer_test.cc new file mode 100644 index 000000000..c211882a6 --- /dev/null +++ b/tests/partition/refinement/advanced_rebalancer_test.cc @@ -0,0 +1,173 @@ +/******************************************************************************* + * MIT License + * + * This file is part of Mt-KaHyPar. + * + * Copyright (C) 2023 Nikolai Maas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
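The work_container_test change above is a correctness fix: std::memory_order_acq_rel is only valid for read-modify-write operations, so passing it to a plain load() is undefined behavior, while fetch_add may legally use any ordering. In miniature:

```cpp
#include <atomic>

std::atomic<int> stage{0};

void producer() {
  // RMW operations accept any memory order; the test now uses acquire here.
  stage.fetch_add(1, std::memory_order_acquire);
}

void consumer() {
  // A pure load must use relaxed/consume/acquire/seq_cst; acq_rel, as the
  // test had before, is undefined for loads.
  while (stage.load(std::memory_order_acquire) < 1) { /* spin */ }
}
```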
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + ******************************************************************************/ + +#include +#include + + +#include "gmock/gmock.h" + +#include "mt-kahypar/definitions.h" +#include "mt-kahypar/io/hypergraph_factory.h" +#include "mt-kahypar/partition/refinement/rebalancing/advanced_rebalancer.h" +#include "mt-kahypar/partition/refinement/gains/gain_definitions.h" +#include "mt-kahypar/utils/randomize.h" + +using ::testing::Test; + +namespace mt_kahypar { + +template +struct TestConfig { + using TypeTraits = TypeTraitsT; + using GainTypes = GainTypesT; + static constexpr PartitionID K = k; +}; + +template +class RebalancerTest : public Test { + + public: + using TypeTraits = typename Config::TypeTraits; + using GainTypes = typename Config::GainTypes; + using Hypergraph = typename TypeTraits::Hypergraph; + using PartitionedHypergraph = typename TypeTraits::PartitionedHypergraph; + using HypergraphFactory = typename Hypergraph::Factory; + using GainCache = typename GainTypes::GainCache; + using Km1Rebalancer = AdvancedRebalancer; + + RebalancerTest() : + hypergraph(), + partitioned_hypergraph(), + context(), + gain_cache(), + rebalancer(nullptr) { + TBBInitializer::instance(std::thread::hardware_concurrency()); + context.partition.mode = Mode::direct; + context.partition.epsilon = 0.05; + context.partition.k = Config::K; + + context.partition.preset_type = PresetType::default_preset; + context.partition.instance_type = InstanceType::hypergraph; + context.partition.partition_type = PartitionedHypergraph::TYPE; + context.partition.verbose_output = false; + + // Shared Memory + context.shared_memory.original_num_threads = std::thread::hardware_concurrency(); + context.shared_memory.num_threads = std::thread::hardware_concurrency(); + + context.partition.objective = Hypergraph::is_graph ? Objective::cut : Objective::km1; + context.partition.gain_policy = Hypergraph::is_graph ? 
GainPolicy::cut_for_graphs : GainPolicy::km1; + } + + void constructFromFile() { + if constexpr ( Hypergraph::is_graph ) { + hypergraph = io::readInputFile( + "../tests/instances/delaunay_n10.graph", FileFormat::Metis, true); + } else { + hypergraph = io::readInputFile( + "../tests/instances/contracted_unweighted_ibm01.hgr", FileFormat::hMetis, true); + } + } + + void constructFromValues(const HypernodeID num_hypernodes, const HyperedgeID num_hyperedges, + const vec>& edge_vector, const vec hypernode_weight) { + hypergraph = HypergraphFactory::construct(num_hypernodes, num_hyperedges, edge_vector, nullptr, hypernode_weight.data()); + } + + void setup() { + partitioned_hypergraph = PartitionedHypergraph(context.partition.k, hypergraph, parallel_tag_t()); + context.setupPartWeights(hypergraph.totalWeight()); + + rebalancer = std::make_unique(hypergraph.initialNumNodes(), context, gain_cache); + mt_kahypar_partitioned_hypergraph_t phg = utils::partitioned_hg_cast(partitioned_hypergraph); + rebalancer->initialize(phg); + } + + Hypergraph hypergraph; + PartitionedHypergraph partitioned_hypergraph; + Context context; + GainCache gain_cache; + std::unique_ptr rebalancer; +}; + + +typedef ::testing::Types, + TestConfig + ENABLE_GRAPHS(COMMA TestConfig) + ENABLE_GRAPHS(COMMA TestConfig) > TestConfigs; + +TYPED_TEST_CASE(RebalancerTest, TestConfigs); + + +TYPED_TEST(RebalancerTest, CanNotBeRebalanced) { + this->constructFromValues(3, 1, { {0, 1} }, {6, 5, 4}); + this->setup(); + + this->partitioned_hypergraph.setOnlyNodePart(0, 0); + this->partitioned_hypergraph.setOnlyNodePart(1, 1); + this->partitioned_hypergraph.setOnlyNodePart(2, 0); + this->partitioned_hypergraph.initializePartition(); + mt_kahypar_partitioned_hypergraph_t phg = utils::partitioned_hg_cast(this->partitioned_hypergraph); + + Metrics metrics; + metrics.quality = metrics::quality(this->partitioned_hypergraph, this->context); + metrics.imbalance = metrics::imbalance(this->partitioned_hypergraph, this->context); + this->rebalancer->refine(phg, {}, metrics, std::numeric_limits::max()); + + ASSERT_DOUBLE_EQ(metrics::imbalance(this->partitioned_hypergraph, this->context), metrics.imbalance); +} + + +TYPED_TEST(RebalancerTest, ProducesBalancedResult) { + this->constructFromFile(); + this->setup(); + + this->partitioned_hypergraph.doParallelForAllNodes([&](const HypernodeID hn) { + PartitionID block = 0; + for (PartitionID p = 1; p < this->context.partition.k; ++p) { + if (utils::Randomize::instance().flipCoin(THREAD_ID)) { + block++; + } + } + this->partitioned_hypergraph.setOnlyNodePart(hn, block); + }); + + this->partitioned_hypergraph.initializePartition(); + mt_kahypar_partitioned_hypergraph_t phg = utils::partitioned_hg_cast(this->partitioned_hypergraph); + + Metrics metrics; + metrics.quality = metrics::quality(this->partitioned_hypergraph, this->context); + metrics.imbalance = metrics::imbalance(this->partitioned_hypergraph, this->context); + this->rebalancer->refine(phg, {}, metrics, std::numeric_limits::max()); + + ASSERT_DOUBLE_EQ(metrics::imbalance(this->partitioned_hypergraph, this->context), metrics.imbalance); + for (PartitionID part = 0; part < this->context.partition.k; ++part) { + ASSERT_LE(this->partitioned_hypergraph.partWeight(part), this->context.partition.max_part_weights[part]); + } +} + +} diff --git a/tests/partition/refinement/fm_strategy_test.cc b/tests/partition/refinement/fm_strategy_test.cc index a85101cf5..70af849b2 100644 --- a/tests/partition/refinement/fm_strategy_test.cc +++ 
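Condensed, the refine sequence shared by the two rebalancer tests above looks as follows (fixture members assumed; same calls as in the test bodies):

```cpp
#include <limits>

// Minimal driver mirroring the tests: compute baseline metrics, cast the
// partitioned hypergraph, and let the rebalancer repair the partition.
Metrics metrics;
metrics.quality   = metrics::quality(partitioned_hypergraph, context);
metrics.imbalance = metrics::imbalance(partitioned_hypergraph, context);
mt_kahypar_partitioned_hypergraph_t phg =
    utils::partitioned_hg_cast(partitioned_hypergraph);
rebalancer->refine(phg, {}, metrics, std::numeric_limits<double>::max());
// On success every block satisfies its bound:
// partitioned_hypergraph.partWeight(p) <= context.partition.max_part_weights[p]
```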
b/tests/partition/refinement/fm_strategy_test.cc @@ -31,6 +31,7 @@ #include "mt-kahypar/io/hypergraph_factory.h" #include "mt-kahypar/partition/refinement/fm/strategies/gain_cache_strategy.h" +#include "mt-kahypar/partition/refinement/fm/strategies/unconstrained_strategy.h" #include "mt-kahypar/partition/refinement/gains/km1/km1_gain_cache.h" using ::testing::Test; @@ -40,27 +41,40 @@ namespace mt_kahypar { namespace { using Hypergraph = typename StaticHypergraphTypeTraits::Hypergraph; using PartitionedHypergraph = typename StaticHypergraphTypeTraits::PartitionedHypergraph; + using BlockPriorityQueue = ds::ExclusiveHandleHeap< ds::MaxHeap >; + using VertexPriorityQueue = ds::MaxHeap; // these need external handles } template -vec insertAndExtractAllMoves(Strategy& strat, - PartitionedHypergraph& phg, - Km1GainCache& gain_cache) { - Move m; - vec gains; - for (HypernodeID u : phg.nodes()) { - strat.insertIntoPQ(phg, gain_cache, u); +struct AFMStrategy : public Test { + vec insertAndExtractAllMoves(PartitionedHypergraph& phg, + const Context& context, + Km1GainCache& gain_cache, + FMSharedData& sd, + BlockPriorityQueue& blockPQ, + vec& vertexPQs, + FMStats& fm_stats) { + Strategy strategy(context, sd, blockPQ, vertexPQs, fm_stats); + + Move m; + vec gains; + for (HypernodeID u : phg.nodes()) { + strategy.insertIntoPQ(phg, gain_cache, u); + } + + while (strategy.findNextMove(phg, gain_cache, m)) { + gains.push_back(m.gain); + } + strategy.reset(); + return gains; } +}; - while (strat.findNextMove(phg, gain_cache, m)) { - gains.push_back(m.gain); - } - strat.clearPQs(0); - return gains; -} +using FMStrategyTestTypes = ::testing::Types; +TYPED_TEST_CASE(AFMStrategy, FMStrategyTestTypes); -TEST(StrategyTests, FindNextMove) { +TYPED_TEST(AFMStrategy, FindNextMove) { PartitionID k = 8; Context context; context.partition.k = k; @@ -85,13 +99,15 @@ TEST(StrategyTests, FindNextMove) { context.refinement.fm.algorithm = FMAlgorithm::kway_fm; - FMSharedData sd(hg.initialNumNodes()); + FMSharedData sd(hg.initialNumNodes(), false); FMStats fm_stats; fm_stats.moves = 1; - GainCacheStrategy gain_caching(context, sd, fm_stats); - vec gains_cached = insertAndExtractAllMoves(gain_caching, phg, gain_cache); + BlockPriorityQueue blockPQ(k); + vec vertexPQs(k, VertexPriorityQueue(sd.vertexPQHandles.data(), sd.numberOfNodes)); + + vec gains_cached = this->insertAndExtractAllMoves(phg, context, gain_cache, sd, blockPQ, vertexPQs, fm_stats); ASSERT_TRUE(std::is_sorted(gains_cached.begin(), gains_cached.end(), std::greater())); } -} \ No newline at end of file +} diff --git a/tests/partition/refinement/gain_cache_test.cc b/tests/partition/refinement/gain_cache_test.cc index 6f30335ab..915e9a205 100644 --- a/tests/partition/refinement/gain_cache_test.cc +++ b/tests/partition/refinement/gain_cache_test.cc @@ -175,7 +175,7 @@ class AGainCache : public Test { } void moveAllNodesAtRandomOnDeltaPartition() { - auto update_delta_gain_cache = [&](const SyncronizedEdgeUpdate& sync_update) { + auto update_delta_gain_cache = [&](const SynchronizedEdgeUpdate& sync_update) { delta_gain_cache->deltaGainUpdate(*delta_phg, sync_update); }; @@ -358,7 +358,7 @@ class AGainCache : public Test { verifyAdjacentBlocksOfDeltaGainCache(); } - Gain attributedGain(const SyncronizedEdgeUpdate& sync_update) { + Gain attributedGain(const SynchronizedEdgeUpdate& sync_update) { return -AttributedGains::gain(sync_update); } @@ -476,7 +476,7 @@ TYPED_TEST(AGainCache, ComparesGainsWithAttributedGains) { utils::Randomize& rand = 
utils::Randomize::instance(); Gain attributed_gain = 0; - auto delta = [&](const SyncronizedEdgeUpdate& sync_update) { + auto delta = [&](const SynchronizedEdgeUpdate& sync_update) { attributed_gain += this->attributedGain(sync_update); }; vec adjacent_blocks; @@ -512,4 +512,4 @@ TYPED_TEST(AGainCache, HasCorrectGainsAfterNLevelUncontractionWithLocalizedRefin #endif -} \ No newline at end of file +} diff --git a/tests/partition/refinement/gain_policy_test.cc b/tests/partition/refinement/gain_policy_test.cc index 224c70006..248214740 100644 --- a/tests/partition/refinement/gain_policy_test.cc +++ b/tests/partition/refinement/gain_policy_test.cc @@ -83,7 +83,7 @@ TEST_F(AKm1PolicyK2, ComputesCorrectMoveGainForVertex1) { TEST_F(AKm1PolicyK2, ComputesCorrectObjectiveDelta1) { assignPartitionIDs({ 1, 0, 0, 0, 0, 1, 1 }); ASSERT_TRUE(hypergraph.changeNodePart(0, 1, 0, - [&](const SyncronizedEdgeUpdate& sync_update) { + [&](const SynchronizedEdgeUpdate& sync_update) { gain->computeDeltaForHyperedge(sync_update); })); ASSERT_EQ(-2, gain->delta()); @@ -100,7 +100,7 @@ TEST_F(AKm1PolicyK2, ComputesCorrectMoveGainForVertex2) { TEST_F(AKm1PolicyK2, ComputesCorrectObjectiveDelta2) { assignPartitionIDs({ 0, 0, 0, 1, 0, 1, 1 }); ASSERT_TRUE(hypergraph.changeNodePart(3, 1, 0, - [&](const SyncronizedEdgeUpdate& sync_update) { + [&](const SynchronizedEdgeUpdate& sync_update) { gain->computeDeltaForHyperedge(sync_update); })); ASSERT_EQ(-1, gain->delta()); @@ -127,7 +127,7 @@ TEST_F(ACutPolicyK2, ComputesCorrectMoveGainForVertex1) { TEST_F(ACutPolicyK2, ComputesCorrectObjectiveDelta1) { assignPartitionIDs({ 1, 0, 0, 0, 0, 1, 1 }); ASSERT_TRUE(hypergraph.changeNodePart(0, 1, 0, - [&](const SyncronizedEdgeUpdate& sync_update) { + [&](const SynchronizedEdgeUpdate& sync_update) { gain->computeDeltaForHyperedge(sync_update); })); ASSERT_EQ(-2, gain->delta()); @@ -144,7 +144,7 @@ TEST_F(ACutPolicyK2, ComputesCorrectMoveGainForVertex2) { TEST_F(ACutPolicyK2, ComputesCorrectObjectiveDelta2) { assignPartitionIDs({ 0, 0, 0, 1, 0, 1, 1 }); ASSERT_TRUE(hypergraph.changeNodePart(3, 1, 0, - [&](const SyncronizedEdgeUpdate& sync_update) { + [&](const SynchronizedEdgeUpdate& sync_update) { gain->computeDeltaForHyperedge(sync_update); })); ASSERT_EQ(-1, gain->delta()); @@ -171,7 +171,7 @@ TEST_F(AKm1PolicyK4, ComputesCorrectMoveGainForVertex1) { TEST_F(AKm1PolicyK4, ComputesCorrectObjectiveDelta1) { assignPartitionIDs({ 0, 1, 2, 3, 3, 1, 2 }); ASSERT_TRUE(hypergraph.changeNodePart(0, 0, 1, - [&](const SyncronizedEdgeUpdate& sync_update) { + [&](const SynchronizedEdgeUpdate& sync_update) { gain->computeDeltaForHyperedge(sync_update); })); ASSERT_EQ(-1, gain->delta()); @@ -188,7 +188,7 @@ TEST_F(AKm1PolicyK4, ComputesCorrectMoveGainForVertex2) { TEST_F(AKm1PolicyK4, ComputesCorrectObjectiveDelta2) { assignPartitionIDs({ 0, 3, 1, 2, 2, 0, 3 }); ASSERT_TRUE(hypergraph.changeNodePart(6, 3, 0, - [&](const SyncronizedEdgeUpdate& sync_update) { + [&](const SynchronizedEdgeUpdate& sync_update) { gain->computeDeltaForHyperedge(sync_update); })); ASSERT_EQ(-1, gain->delta()); @@ -215,7 +215,7 @@ TEST_F(ACutPolicyK4, ComputesCorrectMoveGainForVertex1) { TEST_F(ACutPolicyK4, ComputesCorrectObjectiveDelta1) { assignPartitionIDs({ 0, 1, 2, 3, 3, 1, 2 }); ASSERT_TRUE(hypergraph.changeNodePart(0, 0, 2, - [&](const SyncronizedEdgeUpdate& sync_update) { + [&](const SynchronizedEdgeUpdate& sync_update) { gain->computeDeltaForHyperedge(sync_update); })); ASSERT_EQ(-1, gain->delta()); @@ -232,7 +232,7 @@ TEST_F(ACutPolicyK4, 
diff --git a/tests/partition/refinement/label_propagation_refiner_test.cc b/tests/partition/refinement/label_propagation_refiner_test.cc
index 0d7703a9e..5c04ed648 100644
--- a/tests/partition/refinement/label_propagation_refiner_test.cc
+++ b/tests/partition/refinement/label_propagation_refiner_test.cc
@@ -30,37 +30,38 @@
 #include "mt-kahypar/definitions.h"
 #include "mt-kahypar/io/hypergraph_factory.h"
 #include "mt-kahypar/partition/context.h"
-#include "mt-kahypar/partition/registries/register_refinement_algorithms.cpp"
 #include "mt-kahypar/partition/initial_partitioning/bfs_initial_partitioner.h"
 #include "mt-kahypar/partition/refinement/label_propagation/label_propagation_refiner.h"
 #include "mt-kahypar/partition/refinement/gains/gain_definitions.h"
-#include "mt-kahypar/utils/randomize.h"
+#include "mt-kahypar/partition/refinement/rebalancing/advanced_rebalancer.h"
 #include "mt-kahypar/utils/cast.h"
 
 using ::testing::Test;
 
 namespace mt_kahypar {
 
-template<typename TypeTraitsT, PartitionID k, Objective objective>
+template<typename TypeTraitsT, PartitionID k, Objective objective, bool unconstrained>
 struct TestConfig { };
 
-template<typename TypeTraitsT, PartitionID k>
-struct TestConfig<TypeTraitsT, k, Objective::km1> {
+template<typename TypeTraitsT, PartitionID k, bool unconstrained>
+struct TestConfig<TypeTraitsT, k, Objective::km1, unconstrained> {
   using TypeTraits = TypeTraitsT;
   using GainTypes = Km1GainTypes;
   using Refiner = LabelPropagationRefiner<TypeTraits, GainTypes>;
   static constexpr PartitionID K = k;
   static constexpr Objective OBJECTIVE = Objective::km1;
   static constexpr LabelPropagationAlgorithm LP_ALGO = LabelPropagationAlgorithm::label_propagation;
+  static constexpr bool is_unconstrained = unconstrained;
 };
 
-template<typename TypeTraitsT, PartitionID k>
-struct TestConfig<TypeTraitsT, k, Objective::cut> {
+template<typename TypeTraitsT, PartitionID k, bool unconstrained>
+struct TestConfig<TypeTraitsT, k, Objective::cut, unconstrained> {
   using TypeTraits = TypeTraitsT;
   using GainTypes = CutGainTypes;
   using Refiner = LabelPropagationRefiner<TypeTraits, GainTypes>;
   static constexpr PartitionID K = k;
   static constexpr Objective OBJECTIVE = Objective::cut;
   static constexpr LabelPropagationAlgorithm LP_ALGO = LabelPropagationAlgorithm::label_propagation;
+  static constexpr bool is_unconstrained = unconstrained;
 };
 
 template <typename Config>
@@ -109,7 +110,9 @@ class ALabelPropagationRefiner : public Test {
     // Label Propagation
     context.refinement.label_propagation.algorithm = Config::LP_ALGO;
+    context.refinement.label_propagation.unconstrained = Config::is_unconstrained;
     context.initial_partitioning.refinement.label_propagation.algorithm = Config::LP_ALGO;
+    // Note: unconstrained currently doesn't work for initial partitioning
 
     // Read hypergraph
     hypergraph = io::readInputFile(
@@ -119,9 +122,9 @@ class ALabelPropagationRefiner : public Test {
     context.setupPartWeights(hypergraph.totalWeight());
     initialPartition();
 
+    rebalancer = std::make_unique<AdvancedRebalancer<TypeTraits, GainTypes>>(hypergraph.initialNumNodes(), context, gain_cache);
     refiner = std::make_unique<Refiner>(
-      hypergraph.initialNumNodes(), hypergraph.initialNumEdges(),
-      context, gain_cache);
+      hypergraph.initialNumNodes(), hypergraph.initialNumEdges(), context, gain_cache, *rebalancer);
     mt_kahypar_partitioned_hypergraph_t phg = utils::partitioned_hg_cast(partitioned_hypergraph);
     refiner->initialize(phg);
   }
@@ -144,6 +147,7 @@ class ALabelPropagationRefiner : public Test {
   Context context;
   GainCache gain_cache;
   std::unique_ptr<Refiner> refiner;
+  std::unique_ptr<IRebalancer> rebalancer;
   Metrics metrics;
 };
 
@@ -152,18 +156,31 @@ size_t ALabelPropagationRefiner<Config>::num_threads = HardwareTopology::instance().num_cpus();
 
 static constexpr double EPS = 0.05;
 
-typedef ::testing::Types,
-                         TestConfig,
-                         TestConfig,
-                         TestConfig,
-                         TestConfig,
-                         TestConfig
-                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
-                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
-                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
-                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
-                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
-                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig) > TestConfigs;
+typedef ::testing::Types,
+                         TestConfig,
+                         TestConfig,
+                         TestConfig,
+                         TestConfig,
+                         TestConfig
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig),
+                         // unconstrained
+                         TestConfig,
+                         TestConfig,
+                         TestConfig,
+                         TestConfig,
+                         TestConfig,
+                         TestConfig
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig) > TestConfigs;
 
 TYPED_TEST_CASE(ALabelPropagationRefiner, TestConfigs);
 
@@ -194,43 +211,44 @@ TYPED_TEST(ALabelPropagationRefiner, DoesNotWorsenSolutionQuality) {
 }
 
-TYPED_TEST(ALabelPropagationRefiner, IncreasesTheNumberOfBlocks) {
+TYPED_TEST(ALabelPropagationRefiner, ChangesTheNumberOfBlocks) {
   using PartitionedHypergraph = typename TestFixture::PartitionedHypergraph;
   HyperedgeWeight objective_before = metrics::quality(this->partitioned_hypergraph, this->context.partition.objective);
   mt_kahypar_partitioned_hypergraph_t phg = utils::partitioned_hg_cast(this->partitioned_hypergraph);
   this->refiner->refine(phg, {}, this->metrics, std::numeric_limits<double>::max());
   ASSERT_LE(this->metrics.quality, objective_before);
 
-  // Initialize partition with larger K
+  // Initialize partition with smaller K
   const PartitionID old_k = this->context.partition.k;
-  this->context.partition.k = 2 * old_k;
+  this->context.partition.k = std::max(old_k / 2, 2);
   this->context.setupPartWeights(this->hypergraph.totalWeight());
-  PartitionedHypergraph phg_with_larger_k(
+  PartitionedHypergraph phg_with_new_k(
     this->context.partition.k, this->hypergraph, mt_kahypar::parallel_tag_t());
-  utils::Randomize& rand = utils::Randomize::instance();
   vec<PartitionID> non_optimized_partition(this->hypergraph.initialNumNodes(), kInvalidPartition);
   this->partitioned_hypergraph.doParallelForAllNodes([&](const HypernodeID hn) {
+    // create a semi-random partition
    const PartitionID block = this->partitioned_hypergraph.partID(hn);
-    phg_with_larger_k.setOnlyNodePart(hn, rand.flipCoin(THREAD_ID) ? 2 * block : 2 * block + 1);
-    non_optimized_partition[hn] = phg_with_larger_k.partID(hn);
+    phg_with_new_k.setOnlyNodePart(hn, (block + hn) % this->context.partition.k);
+    non_optimized_partition[hn] = phg_with_new_k.partID(hn);
   });
-  phg_with_larger_k.initializePartition();
-  this->metrics.quality = metrics::quality(phg_with_larger_k, this->context);
-  this->metrics.imbalance = metrics::imbalance(phg_with_larger_k, this->context);
-
-  objective_before = metrics::quality(phg_with_larger_k, this->context.partition.objective);
-  mt_kahypar_partitioned_hypergraph_t phg_larger_k = utils::partitioned_hg_cast(phg_with_larger_k);
-  this->refiner->initialize(phg_larger_k);
-  this->refiner->refine(phg_larger_k, {}, this->metrics, std::numeric_limits<double>::max());
+  phg_with_new_k.initializePartition();
+  this->metrics.quality = metrics::quality(phg_with_new_k, this->context);
+  this->metrics.imbalance = metrics::imbalance(phg_with_new_k, this->context);
+
+  objective_before = metrics::quality(phg_with_new_k, this->context.partition.objective);
+  mt_kahypar_partitioned_hypergraph_t phg_new_k = utils::partitioned_hg_cast(phg_with_new_k);
+  this->gain_cache.reset();
+  this->refiner->initialize(phg_new_k);
+  this->rebalancer->initialize(phg_new_k);
+  this->refiner->refine(phg_new_k, {}, this->metrics, std::numeric_limits<double>::max());
   ASSERT_LE(this->metrics.quality, objective_before);
-  ASSERT_EQ(metrics::quality(phg_with_larger_k, this->context.partition.objective),
+  ASSERT_EQ(metrics::quality(phg_with_new_k, this->context.partition.objective),
             this->metrics.quality);
 
-  // Check if refiner has moved some nodes from new blocks
+  // Check if refiner has moved some nodes
   bool has_moved_nodes = false;
-  for ( const HypernodeID hn : phg_with_larger_k.nodes() ) {
-    if ( non_optimized_partition[hn] >= old_k &&
-         non_optimized_partition[hn] != phg_with_larger_k.partID(hn) ) {
+  for ( const HypernodeID hn : phg_with_new_k.nodes() ) {
+    if ( non_optimized_partition[hn] != phg_with_new_k.partID(hn) ) {
       has_moved_nodes = true;
       break;
    }
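The constructor change above is the structural point of this file: the refiner no longer owns rebalancing but receives a rebalancer through an interface reference, which is what makes unconstrained refinement (moves that temporarily violate the balance constraint) composable. A sketch of that dependency-injection pattern under simplified assumptions; `IRebalancer`, `ToyRebalancer`, and `ToyRefiner` are stand-ins invented for illustration, not the Mt-KaHyPar classes:

#include <algorithm>
#include <vector>

struct IRebalancer {
  virtual ~IRebalancer() = default;
  virtual void rebalance(std::vector<int>& part_weights) = 0;
};

struct ToyRebalancer final : IRebalancer {
  void rebalance(std::vector<int>& part_weights) override {
    // move half of the weight difference from the heaviest to the lightest block
    auto mm = std::minmax_element(part_weights.begin(), part_weights.end());
    const int shift = (*mm.second - *mm.first) / 2;
    *mm.second -= shift;
    *mm.first += shift;
  }
};

class ToyRefiner {
 public:
  explicit ToyRefiner(IRebalancer& rebalancer) : rebalancer_(rebalancer) {}
  void refine(std::vector<int>& part_weights) {
    // ... local search would run here; unconstrained moves may leave the
    // partition imbalanced, so the injected rebalancer restores balance ...
    rebalancer_.rebalance(part_weights);
  }
 private:
  IRebalancer& rebalancer_;
};

int main() {
  std::vector<int> part_weights{10, 2};
  ToyRebalancer rebalancer;
  ToyRefiner refiner(rebalancer);  // mirrors make_unique<Refiner>(..., *rebalancer) above
  refiner.refine(part_weights);    // part_weights becomes {6, 6}
  return 0;
}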
diff --git a/tests/partition/refinement/multitry_fm_test.cc b/tests/partition/refinement/multitry_fm_test.cc
index efb5411c6..411e08ccd 100644
--- a/tests/partition/refinement/multitry_fm_test.cc
+++ b/tests/partition/refinement/multitry_fm_test.cc
@@ -29,18 +29,22 @@
 #include "mt-kahypar/definitions.h"
 #include "mt-kahypar/partition/context.h"
 #include "mt-kahypar/io/hypergraph_factory.h"
+#include "mt-kahypar/partition/refinement/fm/fm_commons.h"
 #include "mt-kahypar/partition/refinement/fm/multitry_kway_fm.h"
 #include "mt-kahypar/partition/refinement/gains/gain_definitions.h"
+#include "mt-kahypar/partition/refinement/fm/strategies/gain_cache_strategy.h"
 #include "mt-kahypar/partition/initial_partitioning/bfs_initial_partitioner.h"
+#include "mt-kahypar/partition/refinement/rebalancing/advanced_rebalancer.h"
 
 using ::testing::Test;
 
 namespace mt_kahypar {
 
-template<typename TypeTraitsT, PartitionID k>
+template<typename TypeTraitsT, PartitionID k, FMAlgorithm alg>
 struct TestConfig {
   using TypeTraits = TypeTraitsT;
   static constexpr PartitionID K = k;
+  static constexpr FMAlgorithm ALG = alg;
 };
 
 template <typename Config>
@@ -64,6 +68,15 @@ class MultiTryFMTest : public Test {
     context.partition.graph_community_filename =
       "../tests/instances/contracted_ibm01.hgr.community";
     context.partition.mode = Mode::direct;
     context.partition.epsilon = 0.25;
+    context.partition.k = Config::K;
+    #ifdef KAHYPAR_ENABLE_HIGHEST_QUALITY_FEATURES
+    context.partition.preset_type = Hypergraph::is_static_hypergraph ?
+      PresetType::default_preset : PresetType::highest_quality;
+    #else
+    context.partition.preset_type = PresetType::default_preset;
+    #endif
+    context.partition.instance_type = InstanceType::hypergraph;
+    context.partition.partition_type = PartitionedHypergraph::TYPE;
     context.partition.verbose_output = false;
 
     // Shared Memory
@@ -74,10 +87,13 @@ class MultiTryFMTest : public Test {
     context.initial_partitioning.mode = Mode::deep_multilevel;
     context.initial_partitioning.runs = 1;
 
-    context.partition.k = Config::K;
-
-    context.refinement.fm.algorithm = FMAlgorithm::kway_fm;
+    context.refinement.fm.algorithm = Config::ALG;
     context.refinement.fm.multitry_rounds = 10;
+    if (context.refinement.fm.algorithm == FMAlgorithm::unconstrained_fm) {
+      context.refinement.fm.unconstrained_rounds = 10;
+      context.refinement.fm.imbalance_penalty_min = 0.5;
+      context.refinement.fm.imbalance_penalty_max = 0.5;
+    }
     context.refinement.fm.num_seed_nodes = 5;
     context.refinement.fm.rollback_balance_violation_factor = 1.0;
 
@@ -92,8 +108,9 @@ class MultiTryFMTest : public Test {
     context.setupPartWeights(hypergraph.totalWeight());
     initialPartition();
 
+    rebalancer = std::make_unique<AdvancedRebalancer<TypeTraits, Km1GainTypes>>(hypergraph.initialNumNodes(), context, gain_cache);
     refiner = std::make_unique<Refiner>(hypergraph.initialNumNodes(),
-                                        hypergraph.initialNumEdges(), context, gain_cache);
+                                        hypergraph.initialNumEdges(), context, gain_cache, *rebalancer);
     mt_kahypar_partitioned_hypergraph_t phg = utils::partitioned_hg_cast(partitioned_hypergraph);
     refiner->initialize(phg);
   }
@@ -116,22 +133,28 @@ class MultiTryFMTest : public Test {
   Context context;
   Km1GainCache gain_cache;
   std::unique_ptr<Refiner> refiner;
+  std::unique_ptr<IRebalancer> rebalancer;
   Metrics metrics;
 };
 
-typedef ::testing::Types,
-                         TestConfig,
-                         TestConfig,
-                         TestConfig,
-                         TestConfig,
-                         TestConfig
-                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
-                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
-                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
-                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
-                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
-                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig) > TestConfigs;
+typedef ::testing::Types,
+                         TestConfig,
+                         TestConfig,
+                         TestConfig
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig),
+                         // unconstrained
+                         TestConfig,
+                         TestConfig,
+                         TestConfig,
+                         TestConfig
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig)
+                         ENABLE_HIGHEST_QUALITY(COMMA TestConfig) > TestConfigs;
 
 TYPED_TEST_CASE(MultiTryFMTest, TestConfigs);
 
@@ -196,37 +219,75 @@ TYPED_TEST(MultiTryFMTest, WorksWithRefinementNodes) {
   std::cout.rdbuf(old); // and reset again
 }
 
-/*TYPED_TEST(MultiTryFMTest, IncreasesTheNumberOfBlocks) {
+TYPED_TEST(MultiTryFMTest, ChangesTheNumberOfBlocks) {
   using PartitionedHypergraph = typename TestFixture::PartitionedHypergraph;
   HyperedgeWeight objective_before = metrics::quality(this->partitioned_hypergraph, this->context.partition.objective);
   mt_kahypar_partitioned_hypergraph_t phg = utils::partitioned_hg_cast(this->partitioned_hypergraph);
   this->refiner->refine(phg, {}, this->metrics, std::numeric_limits<double>::max());
   ASSERT_LE(this->metrics.quality, objective_before);
 
-  // Initialize partition with larger K
+  // Initialize partition with smaller K
   const PartitionID old_k = this->context.partition.k;
-  this->context.partition.k = 2 * old_k;
+  this->context.partition.k = std::max(old_k / 2, 2);
   this->context.setupPartWeights(this->hypergraph.totalWeight());
-  PartitionedHypergraph phg_with_larger_k(
+  PartitionedHypergraph phg_with_new_k(
    this->context.partition.k, this->hypergraph, mt_kahypar::parallel_tag_t());
-  utils::Randomize& rand = utils::Randomize::instance();
-  vec<PartitionID> non_optimized_partition(this->hypergraph.initialNumNodes(), kInvalidPartition);
   this->partitioned_hypergraph.doParallelForAllNodes([&](const HypernodeID hn) {
+    // create a semi-random partition
    const PartitionID block = this->partitioned_hypergraph.partID(hn);
-    phg_with_larger_k.setOnlyNodePart(hn, rand.flipCoin(THREAD_ID) ? 2 * block : 2 * block + 1);
-    non_optimized_partition[hn] = phg_with_larger_k.partID(hn);
+    phg_with_new_k.setOnlyNodePart(hn, (block + hn) % this->context.partition.k);
   });
-  phg_with_larger_k.initializePartition();
-  this->metrics.quality = metrics::quality(phg_with_larger_k, this->context);
-  this->metrics.imbalance = metrics::imbalance(phg_with_larger_k, this->context);
-
-  objective_before = metrics::quality(phg_with_larger_k, this->context.partition.objective);
-  mt_kahypar_partitioned_hypergraph_t phg_larger_k = utils::partitioned_hg_cast(phg_with_larger_k);
-  this->refiner->initialize(phg_larger_k);
-  this->refiner->refine(phg_larger_k, {}, this->metrics, std::numeric_limits<double>::max());
+  phg_with_new_k.initializePartition();
+  this->metrics.quality = metrics::quality(phg_with_new_k, this->context);
+  this->metrics.imbalance = metrics::imbalance(phg_with_new_k, this->context);
+
+  objective_before = metrics::quality(phg_with_new_k, this->context.partition.objective);
+  mt_kahypar_partitioned_hypergraph_t phg_new_k = utils::partitioned_hg_cast(phg_with_new_k);
+  this->gain_cache.reset();
+  this->refiner->initialize(phg_new_k);
+  this->rebalancer->initialize(phg_new_k);
+  this->refiner->refine(phg_new_k, {}, this->metrics, std::numeric_limits<double>::max());
   ASSERT_LE(this->metrics.quality, objective_before);
-  ASSERT_EQ(metrics::quality(phg_with_larger_k, this->context.partition.objective),
+  ASSERT_EQ(metrics::quality(phg_with_new_k, this->context.partition.objective),
             this->metrics.quality);
-}*/
+}
+
+TEST(UnconstrainedFMDataTest, CorrectlyComputesPenalty) {
+  using TypeTraits = StaticHypergraphTypeTraits;
+  using Hypergraph = typename TypeTraits::Hypergraph;
+  using PartitionedHypergraph = typename TypeTraits::PartitionedHypergraph;
+  using HypergraphFactory = typename Hypergraph::Factory;
+
+  Context context;
+  context.partition.k = 2;
+
+  // use a super-heavy edge to trigger the fallback case
+  std::vector<HyperedgeWeight> he_weights{ 1, 10000 };
+  Hypergraph hg = HypergraphFactory::construct(4, 2, { {0, 1}, {2, 3} }, he_weights.data());
+  PartitionedHypergraph phg(2, hg);
+  phg.setOnlyNodePart(0, 0);
+  phg.setOnlyNodePart(1, 0);
+  phg.setOnlyNodePart(2, 1);
+  phg.setOnlyNodePart(3, 1);
+  phg.initializePartition();
+
+  Km1GainCache gain_cache;
+  gain_cache.initializeGainCache(phg);
+
+  UnconstrainedFMData ufm_data(4);
+  ufm_data.initialize(context, phg, gain_cache);
+
+  ASSERT_EQ(0, ufm_data.estimatePenaltyForImbalancedMove(0, -1, -1));
+  ASSERT_LE(1.0, ufm_data.estimatePenaltyForImbalancedMove(0, 0, 1));
+  ASSERT_GE(1.5, ufm_data.estimatePenaltyForImbalancedMove(0, 0, 1));
+  ASSERT_LE(2.0, ufm_data.estimatePenaltyForImbalancedMove(0, 0, 2));
+  ASSERT_GE(3.0, ufm_data.estimatePenaltyForImbalancedMove(0, 0, 2));
+
+  ASSERT_EQ(0, ufm_data.estimatePenaltyForImbalancedMove(1, -1, -1));
+  ASSERT_LE(10000, ufm_data.estimatePenaltyForImbalancedMove(1, 0, 1));
+  ASSERT_GE(15000, ufm_data.estimatePenaltyForImbalancedMove(1, 0, 1));
+  ASSERT_LE(20000, ufm_data.estimatePenaltyForImbalancedMove(1, 0, 2));
+  ASSERT_GE(30000, ufm_data.estimatePenaltyForImbalancedMove(1, 0, 2));
+}
 }  // namespace mt_kahypar
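The assertions above pin the penalty estimate into a band: adding one unit of imbalance to the block incident to the weight-10000 edge must cost at least 10000 and at most 15000, and two units must cost between 20000 and 30000, so the estimate scales roughly linearly with the imbalance. The exact formula is internal to `UnconstrainedFMData`; the toy model below only reproduces the bounds the assertions pin down, and the names and the 1.5x slack factor are assumptions for illustration:

#include <cassert>

// Hypothetical, simplified model: per block, record roughly how much objective
// degradation rebalancing causes per unit of extra weight; the estimate for
// adding extra_weight units then scales linearly within a bounded slack band.
struct PenaltyModel {
  long long penalty_per_unit;  // e.g. 10000 for the block incident to the heavy edge

  long long lowerEstimate(int extra_weight) const {
    return penalty_per_unit * extra_weight;           // at least the cheapest eviction
  }
  long long upperEstimate(int extra_weight) const {
    return (penalty_per_unit * extra_weight * 3) / 2;  // assumed slack of 1.5x
  }
};

int main() {
  PenaltyModel heavy{10000};
  // Mirrors ASSERT_LE(10000, ...) / ASSERT_GE(15000, ...) for one weight unit
  assert(heavy.lowerEstimate(1) == 10000 && heavy.upperEstimate(1) == 15000);
  // and ASSERT_LE(20000, ...) / ASSERT_GE(30000, ...) for two units.
  assert(heavy.lowerEstimate(2) == 20000 && heavy.upperEstimate(2) == 30000);
  return 0;
}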
diff --git a/tests/partition/refinement/rebalance_test.cc b/tests/partition/refinement/rebalance_test.cc
index d168406fc..7d4032671 100644
--- a/tests/partition/refinement/rebalance_test.cc
+++ b/tests/partition/refinement/rebalance_test.cc
@@ -32,7 +32,7 @@
 
 #include "mt-kahypar/definitions.h"
 #include "mt-kahypar/io/hypergraph_factory.h"
-#include "mt-kahypar/partition/refinement/rebalancing/rebalancer.h"
+#include "mt-kahypar/partition/refinement/rebalancing/simple_rebalancer.h"
 #include "mt-kahypar/partition/refinement/gains/gain_definitions.h"
 
 using ::testing::Test;
@@ -43,7 +43,7 @@ namespace {
   using TypeTraits = StaticHypergraphTypeTraits;
   using Hypergraph = typename TypeTraits::Hypergraph;
   using PartitionedHypergraph = typename TypeTraits::PartitionedHypergraph;
-  using Km1Rebalancer = Rebalancer<TypeTraits, Km1GainTypes>;
+  using Km1Rebalancer = SimpleRebalancer<TypeTraits, Km1GainTypes>;
 }
diff --git a/tests/partition/refinement/rollback_test.cc b/tests/partition/refinement/rollback_test.cc
index b3407052c..c7edab07b 100644
--- a/tests/partition/refinement/rollback_test.cc
+++ b/tests/partition/refinement/rollback_test.cc
@@ -77,7 +77,7 @@ TEST(RollbackTests, GainRecalculationAndRollsbackCorrectly) {
   context.refinement.fm.rollback_balance_violation_factor = 0.0;
 
-  FMSharedData sharedData(hg.initialNumNodes());
+  FMSharedData sharedData(hg.initialNumNodes(), false);
 
   GlobalRollback grb(hg.initialNumEdges(), context, gain_cache);
@@ -134,7 +134,7 @@ TEST(RollbackTests, GainRecalculation2) {
   context.partition.max_part_weights = { std::numeric_limits<HypernodeWeight>::max(), std::numeric_limits<HypernodeWeight>::max() };
   context.refinement.fm.rollback_balance_violation_factor = 0.0;
-  FMSharedData sharedData(hg.initialNumNodes());
+  FMSharedData sharedData(hg.initialNumNodes(), false);
   GlobalRollback grb(hg.initialNumEdges(), context, gain_cache);
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 41d8512c1..2acfeda2e 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -10,6 +10,12 @@ target_link_libraries(GraphToHgr TBB::tbb TBB::tbbmalloc_proxy)
 set_property(TARGET GraphToHgr PROPERTY CXX_STANDARD 17)
 set_property(TARGET GraphToHgr PROPERTY CXX_STANDARD_REQUIRED ON)
 
+add_executable(HgrToGraph hgr_to_graph.cc)
+target_link_libraries(HgrToGraph ${Boost_LIBRARIES})
+target_link_libraries(HgrToGraph TBB::tbb TBB::tbbmalloc_proxy)
+set_property(TARGET HgrToGraph PROPERTY CXX_STANDARD 17)
+set_property(TARGET HgrToGraph PROPERTY CXX_STANDARD_REQUIRED ON)
+
 add_executable(HgrToParkway hgr_to_parkway_converter.cc)
 target_link_libraries(HgrToParkway ${Boost_LIBRARIES})
 target_link_libraries(HgrToParkway TBB::tbb TBB::tbbmalloc_proxy)
@@ -92,7 +98,13 @@ target_link_libraries(BenchShuffle TBB::tbb TBB::tbbmalloc_proxy)
 set_property(TARGET BenchShuffle PROPERTY CXX_STANDARD 17)
 set_property(TARGET BenchShuffle PROPERTY CXX_STANDARD_REQUIRED ON)
 
-set(TOOLS_TARGETS ${TOOLS_TARGETS} EvaluateBipart
+add_executable(MtxToGraph mtx_to_graph.cc)
+set_property(TARGET MtxToGraph PROPERTY CXX_STANDARD 17)
+set_property(TARGET MtxToGraph PROPERTY CXX_STANDARD_REQUIRED ON)
+
+set(TOOLS_TARGETS ${TOOLS_TARGETS} GraphToHgr
+                  HgrToGraph
+                  EvaluateBipart
                   VerifyPartition
                   EvaluatePartition
                   HgrToParkway
@@ -104,4 +116,4 @@ set(TOOLS_TARGETS ${TOOLS_TARGETS} EvaluateBipart
                   GridGraphGenerator
                   HierarchicalTargetGraphGenerator
                   FixedVertexFileGenerator
-                  PARENT_SCOPE)
\ No newline at end of file
+                  PARENT_SCOPE)
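The rollback tests above (rollback_test.cc) exercise FM's classic "best prefix" logic: after a round of moves, keep only the prefix of the move sequence with the best cumulative gain and undo the rest. A minimal sketch of that idea over plain gain values, simplified from the parallel gain-recalculating rollback the tests actually cover:

#include <cassert>
#include <vector>

// Returns how many moves of the sequence to keep; moves beyond this index
// are rolled back because they did not improve the best cumulative gain.
size_t bestPrefixLength(const std::vector<int>& gains) {
  int running = 0, best = 0;
  size_t best_len = 0;
  for (size_t i = 0; i < gains.size(); ++i) {
    running += gains[i];
    if (running > best) {  // strictly better, so extend the kept prefix
      best = running;
      best_len = i + 1;
    }
  }
  return best_len;
}

int main() {
  // gains per move: +2, -1, +3, -4  =>  cumulative 2, 1, 4, 0  =>  keep first 3
  assert(bestPrefixLength({2, -1, 3, -4}) == 3);
  return 0;
}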
diff --git a/tools/graph_to_hgr.cc b/tools/graph_to_hgr.cc
index fe506db38..d53274227 100644
--- a/tools/graph_to_hgr.cc
+++ b/tools/graph_to_hgr.cc
@@ -31,7 +31,11 @@
 #include <boost/program_options.hpp>
 
 #include "mt-kahypar/macros.h"
+#include "mt-kahypar/definitions.h"
+#include "mt-kahypar/io/hypergraph_factory.h"
+#include "mt-kahypar/io/hypergraph_io.h"
 
+using namespace mt_kahypar;
 namespace po = boost::program_options;
 
 int main(int argc, char* argv[]) {
@@ -54,42 +58,44 @@ int main(int argc, char* argv[]) {
 
   std::ofstream out_stream(hgr_filename.c_str());
 
-  std::ifstream in_stream(graph_filename);
-  std::string line;
-  std::getline(in_stream, line);
+  // Read Hypergraph
+  HyperedgeID num_edges = 0;
+  HypernodeID num_nodes = 0;
+  io::HyperedgeVector hyperedges;
+  vec<HyperedgeWeight> hyperedges_weight;
+  vec<HypernodeWeight> hypernodes_weight;
 
-  // Read header
-  int num_nodes;
-  int num_edges;
-  {
-    std::stringstream sstream(line);
-    sstream >> num_nodes >> num_edges;
-  }
-
-  std::vector<std::vector<int>> adj_list(num_nodes + 1);
-  int u = 1;
-  while ( std::getline(in_stream, line) ) {
-    std::istringstream sstream(line);
-    int v;
-    while ( sstream >> v ) {
-      adj_list[u].push_back(v);
-    }
-    ++u;
-  }
+  io::readGraphFile(graph_filename, num_edges, num_nodes,
+                    hyperedges, hyperedges_weight, hypernodes_weight);
+  ALWAYS_ASSERT(hyperedges.size() == num_edges);
 
   // Write header
-  out_stream << num_edges << " " << num_nodes << " 0" /* Unweighted */ << std::endl;
+  out_stream << num_edges << " " << num_nodes << " ";
+  if (hyperedges_weight.empty() && hypernodes_weight.empty()) {
+    out_stream << "0" /* Unweighted */ << std::endl;
+  } else {
+    out_stream << (hypernodes_weight.empty() ? "0" : "1");
+    out_stream << (hyperedges_weight.empty() ? "0" : "1") << std::endl;
+  }
 
   // Write hyperedges
-  for ( int u = 1; u <= num_nodes; ++u ) {
-    for ( const int v : adj_list[u] ) {
-      if ( u < v ) {
-        out_stream << u << " " << v << std::endl;
-      }
+  for (size_t i = 0; i < hyperedges.size(); ++i) {
+    const auto& pins = hyperedges[i];
+    ALWAYS_ASSERT(pins.size() == 2);
+    HypernodeID u = pins[0] + 1;
+    HypernodeID v = pins[1] + 1;
+    if (hyperedges_weight.size() > 0) {
+      out_stream << hyperedges_weight[i] << " ";
     }
+    out_stream << u << " " << v;
+    out_stream << std::endl;
+  }
+
+  // Write node weights
+  for (HypernodeWeight weight: hypernodes_weight) {
+    out_stream << weight << std::endl;
   }
 
-  in_stream.close();
   out_stream.close();
 
   return 0;
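Both converters in this patch emit the hMetis/Metis weight-format flag that the header-writing branch above constructs: a "1" in the ones digit means edge weights are present, a "1" in the tens digit means node weights are present, and unweighted inputs write "0". A minimal sketch of exactly that header logic, assuming nothing beyond the standard library:

#include <string>

std::string formatFlag(bool has_node_weights, bool has_edge_weights) {
  if (!has_node_weights && !has_edge_weights) {
    return "0";  // unweighted
  }
  // node-weight digit first, then edge-weight digit, as in the code above
  return std::string(has_node_weights ? "1" : "0") + (has_edge_weights ? "1" : "0");
}

// formatFlag(false, true) == "01", which parsers read as fmt = 1 (edge weights);
// formatFlag(true, false) == "10"; formatFlag(true, true) == "11".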
diff --git a/tools/hgr_to_graph.cc b/tools/hgr_to_graph.cc
new file mode 100644
index 000000000..fd887539e
--- /dev/null
+++ b/tools/hgr_to_graph.cc
@@ -0,0 +1,121 @@
+/*******************************************************************************
+ * MIT License
+ *
+ * This file is part of Mt-KaHyPar.
+ *
+ * Copyright (C) 2023 Nikolai Maas
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ ******************************************************************************/
+
+#include <boost/program_options.hpp>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include "mt-kahypar/macros.h"
+#include "mt-kahypar/definitions.h"
+#include "mt-kahypar/io/hypergraph_factory.h"
+#include "mt-kahypar/io/hypergraph_io.h"
+
+using namespace mt_kahypar;
+namespace po = boost::program_options;
+
+int main(int argc, char* argv[]) {
+  std::string graph_filename;
+  std::string hgr_filename;
+
+  po::options_description options("Options");
+  options.add_options()
+    ("hypergraph,h",
+     po::value<std::string>(&hgr_filename)->value_name("<string>")->required(),
+     "Hypergraph filename")
+    ("graph,g",
+     po::value<std::string>(&graph_filename)->value_name("<string>")->required(),
+     "Graph filename");
+
+  po::variables_map cmd_vm;
+  po::store(po::parse_command_line(argc, argv, options), cmd_vm);
+  po::notify(cmd_vm);
+
+  std::ofstream out_stream(graph_filename.c_str());
+
+  // Read Hypergraph
+  HyperedgeID num_edges = 0;
+  HypernodeID num_nodes = 0;
+  HyperedgeID num_removed_single_pin_hyperedges = 0;
+  io::HyperedgeVector hyperedges;
+  vec<HyperedgeWeight> hyperedges_weight;
+  vec<HypernodeWeight> hypernodes_weight;
+
+  io::readHypergraphFile(hgr_filename, num_edges, num_nodes, num_removed_single_pin_hyperedges,
+                         hyperedges, hyperedges_weight, hypernodes_weight);
+  ALWAYS_ASSERT(hyperedges.size() == num_edges);
+  ALWAYS_ASSERT(num_removed_single_pin_hyperedges == 0);
+
+  // Write header
+  out_stream << num_nodes << " " << num_edges << " ";
+  if (hyperedges_weight.empty() && hypernodes_weight.empty()) {
+    out_stream << "0" /* Unweighted */ << std::endl;
+  } else {
+    out_stream << (hypernodes_weight.empty() ? "0" : "1");
+    out_stream << (hyperedges_weight.empty() ? "0" : "1") << std::endl;
+  }
+
+  // insert backward edges
+  hyperedges.reserve(2 * num_edges);
+  for (size_t he = 0; he < num_edges; ++he) {
+    const auto& pins = hyperedges[he];
+    ALWAYS_ASSERT(pins.size() == 2, "Input hypergraph is not a graph!");
+    hyperedges.push_back({pins[1], pins[0]});
+  }
+
+  std::sort(hyperedges.begin(), hyperedges.end(),
+    [](const auto& l, const auto& r) { return l[0] < r[0] || (l[0] == r[0] && l[1] < r[1]); });
+
+  // Write edges
+  size_t i = 0;
+  bool at_start_of_line = true;
+  for (size_t he = 0; he < hyperedges.size();) {
+    const auto& pins = hyperedges[he];
+    if (!hypernodes_weight.empty() && at_start_of_line) {
+      out_stream << hypernodes_weight[i] << " ";
+    }
+    if (pins[0] == i) {
+      out_stream << (pins[1] + 1) << " ";
+      if (!hyperedges_weight.empty()) {
+        out_stream << hyperedges_weight[he] << " ";
+      }
+      ++he;
+      at_start_of_line = false;
+    } else {
+      out_stream << std::endl;
+      ++i;
+      ALWAYS_ASSERT(i < num_nodes);
+      at_start_of_line = true;
+    }
+  }
+
+  out_stream << std::endl;
+  out_stream.close();
+
+  return 0;
+}
"yes" : "no") << std::endl; + + std::vector> adj_list(num_nodes); + + auto t1 = std::chrono::high_resolution_clock::now(); + int row, col; + for (uint64_t e = 0; e < nnz; ++e) { + do { + std::getline(in, line); + } while (line[0] == '%'); + + size_t pos = 0; + size_t l = 0; + while (pos < line.size() && line[pos] != ' ') { ++pos; } + if (pos == line.size()) { throw std::runtime_error("Line too short"); } + std::from_chars(line.data() + l, line.data() + pos, row); + + ++pos; + l = pos; + while (pos < line.size() && line[pos] != ' ') { ++pos; } + if (pos == line.size() && !binary) { throw std::runtime_error("Line too short"); } + std::from_chars(line.data() + l, line.data() + pos, col); + + if (row == col) continue; + + --row; --col; + if (row >= num_nodes || col >= num_nodes) { + std::cerr << "Row or col index higher than number of nodes " << row << " " << col << " " << num_nodes << std::endl; + std::cerr << line << std::endl; + std::abort(); + } + adj_list[row].push_back(col); + if (symmetric) adj_list[col].push_back(row); + } + + auto t3 = std::chrono::high_resolution_clock::now(); + + std::cout << (t3-t1).count() / 1e6 << " ms reading time. " << std::endl; + + if (!symmetric) { + std::cout << "Not symmetric --> Symmetrize" << std::endl; + num_edges = 0; for (const auto& n : adj_list) num_edges += n.size(); + std::cout << "num directed edges before " << num_edges << std::endl; + + std::vector is_adj(num_nodes, false); + std::vector old_degrees(num_nodes, 0); + for (uint64_t u = 0; u < num_nodes; ++u) { + old_degrees[u] = adj_list[u].size(); + } + + // symmetrize + for (uint64_t u = 0; u < num_nodes; ++u) { + for (int v : adj_list[u]) { + adj_list[v].push_back(u); + } + } + + // remove duplicates + for (uint64_t u = 0; u < num_nodes; ++u) { + auto& n = adj_list[u]; + for (uint64_t j = 0; j < old_degrees[u]; ++j) { is_adj[n[j]] = true; } + uint64_t l = old_degrees[u]; + for (uint64_t j = old_degrees[u]; j < n.size(); ++j) { + if (!is_adj[n[j]]) { // keep_if + n[l++] = n[j]; + } + } + n.resize(l); +#ifdef false + if (l != old_degrees[u]) { + std::cout << "Node " << u << " got " << (l - old_degrees[u]) << " new edges. old deg = " << old_degrees[u] << std::endl; + std::cout << "New neighbors:"; + for (uint64_t j = old_degrees[u]; j < n.size(); ++j) { + std::cout << " " << n[j]; + } + std::cout << std::endl; + } +#endif + for (uint64_t j = 0; j < old_degrees[u]; ++j) { is_adj[n[j]] = false; } + } + + num_edges = 0; for (const auto& n : adj_list) num_edges += n.size(); + std::cout << "num directed edges after " << num_edges << std::endl; + } + + size_t deg_zero = 0; + for (const auto& n : adj_list) { + if (n.empty()) { + deg_zero++; + } + } + if (deg_zero) { + std::cerr << "Has " << deg_zero << " zero degree nodes" << std::endl; +#ifdef false + std::cerr << "Remap node IDs." << std::endl; + std::vector remapped_node_ids(num_nodes, -1); + uint64_t new_node_id = 0; + for (uint64_t u = 0; u < num_nodes; ++u) { + if (!adj_list[u].empty()) { + adj_list[new_node_id] = std::move(adj_list[u]); + remapped_node_ids[u] = new_node_id; + new_node_id++; + } + } + + adj_list.resize(new_node_id); + num_nodes = new_node_id; + + for (auto& n : adj_list) { + for (auto& v : n) { + v = remapped_node_ids[v]; + } + } +#endif + } + +#ifdef false + for (auto neigh : adj_list) { + std::sort(neigh.begin(), neigh.end()); + if (std::unique(neigh.begin(), neigh.end()) != neigh.end()) { + std::cerr << "duplicate edges..." 
<< std::endl; + } + } +#endif + + num_edges = 0; + for (const auto& n : adj_list) num_edges += n.size(); + if (num_edges % 2 != 0) { + std::cerr << "Num edges not even " << num_edges << std::endl; + } + num_edges /= 2; + + std::ofstream out(output); + out << num_nodes << " " << num_edges << "\n"; + for (uint64_t u = 0; u < num_nodes; ++u) { + const auto& n = adj_list[u]; + if (!n.empty()) { + out << n[0] + 1; + for (size_t j = 1; j < n.size(); ++j) { + out << " " << n[j] + 1; + } + } + out << "\n"; + } + + std::cout << "finished writing." << std::endl; +}
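The symmetrization step above avoids sorting or hashing each adjacency list by reusing one shared boolean marker array: mark the original neighbors of u, keep only the newly appended reverse neighbors that are not already marked, then unmark. This costs O(deg(u)) per node. A self-contained sketch of exactly that dedup step, with plain integer IDs in place of the tool's types:

#include <cassert>
#include <cstdint>
#include <vector>

void dedupAppended(std::vector<std::vector<uint32_t>>& adj,
                   std::vector<bool>& is_adj, uint32_t u, size_t old_degree) {
  auto& n = adj[u];
  for (size_t j = 0; j < old_degree; ++j) is_adj[n[j]] = true;   // mark originals
  size_t l = old_degree;
  for (size_t j = old_degree; j < n.size(); ++j) {
    if (!is_adj[n[j]]) n[l++] = n[j];  // keep_if: genuinely new neighbor
  }
  n.resize(l);
  for (size_t j = 0; j < old_degree; ++j) is_adj[n[j]] = false;  // reset marks
}

int main() {
  // node 0 originally had neighbors {1, 2}; symmetrization appended {1, 3}
  std::vector<std::vector<uint32_t>> adj = {{1, 2, 1, 3}};
  std::vector<bool> is_adj(4, false);
  dedupAppended(adj, is_adj, 0, 2);
  assert((adj[0] == std::vector<uint32_t>{1, 2, 3}));  // duplicate 1 removed
  return 0;
}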