From 3c2c5b6ed01223f39ac5da39585261eb64e5ce58 Mon Sep 17 00:00:00 2001 From: Alexey Zatelepin Date: Tue, 18 Jun 2024 13:18:29 +0200 Subject: [PATCH] c/controller: allow decreasing core count Now that shard_balancer will copy partition data from extra kvstore shards, we can relax the check in validate_configuration_invariants. --- src/v/cluster/controller.cc | 45 ++++++++++++++++++-------- tests/rptest/tests/node_resize_test.py | 9 +++--- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/src/v/cluster/controller.cc b/src/v/cluster/controller.cc index 26ded618326e..b8fda5cb6a56 100644 --- a/src/v/cluster/controller.cc +++ b/src/v/cluster/controller.cc @@ -555,6 +555,24 @@ ss::future<> controller::start( &shard_balancer::start, conf_invariants.core_count); + if (conf_invariants.core_count > ss::smp::count) { + // Successfully starting shard_balancer with reduced core count means + // that all partition info from extra kvstores has been copied and we + // can finally update the configuration invariants. + auto new_invariants = configuration_invariants( + *config::node().node_id(), ss::smp::count); + co_await _storage.local().kvs().put( + storage::kvstore::key_space::controller, + invariants_key, + reflection::to_iobuf(configuration_invariants{new_invariants})); + conf_invariants = new_invariants; + vlog( + clusterlog.info, + "successfully decreased core count, " + "updated configuration invariants: {}", + conf_invariants); + } + co_await _backend.invoke_on_all(&controller_backend::start); co_await _api.start( @@ -1110,6 +1128,10 @@ controller::validate_configuration_invariants() { storage::kvstore::key_space::controller, invariants_key, reflection::to_iobuf(configuration_invariants{current})); + vlog( + clusterlog.info, + "persisted initial configuration invariants: {}", + current); co_return current; } auto invariants = reflection::from_iobuf( @@ -1125,26 +1147,23 @@ controller::validate_configuration_invariants() { current.node_id); throw configuration_invariants_changed(invariants, current); } - if (invariants.core_count > current.core_count) { - vlog( - clusterlog.error, - "Detected change in number of cores dedicated to run redpanda." - "Decreasing redpanda core count is not allowed. Expected core " - "count " - "{}, currently have {} cores.", - invariants.core_count, - ss::smp::count); - throw configuration_invariants_changed(invariants, current); - } else if (invariants.core_count != current.core_count) { + if (current.core_count > invariants.core_count) { // Update the persistent invariants to reflect increased core // count -- this tracks the high water mark of core count, to - // reject subsequent decreases. + // track the number of extra kvstore shards that we need to process if + // the core count later decreases. co_await _storage.local().kvs().put( storage::kvstore::key_space::controller, invariants_key, reflection::to_iobuf(configuration_invariants{current})); + invariants = current; + vlog(clusterlog.info, "updated configuration invariants: {}", current); + } else if (current.core_count < invariants.core_count) { + // If core count decreased, do nothing just now. shard_balancer will + // check if decreasing is possible and we will update + // configuration_invariants in kvstore later. } - co_return current; + co_return invariants; } } // namespace cluster diff --git a/tests/rptest/tests/node_resize_test.py b/tests/rptest/tests/node_resize_test.py index 82015f5bb999..0dc59665b57c 100644 --- a/tests/rptest/tests/node_resize_test.py +++ b/tests/rptest/tests/node_resize_test.py @@ -18,17 +18,15 @@ from rptest.tests.redpanda_test import RedpandaTest RESIZE_LOG_ALLOW_LIST = RESTART_LOG_ALLOW_LIST + [ - re.compile("Decreasing redpanda core count is not allowed"), - re.compile( - "Failure during startup: cluster::configuration_invariants_changed") + re.compile("Detected decrease in number of cores"), ] class NodeResizeTest(RedpandaTest): """ Validate redpanda behaviour on node core count changes. At time of writing this simply checks - that redpanda refuses to start if core count has decreased: if we make node resizes more - flexible in future, this test should be updated to exercise that. + that redpanda refuses to start if core count has decreased and core balancing on core count + change is unavailable. """ INITIAL_NUM_CPUS = 2 @@ -37,6 +35,7 @@ def __init__(self, *args, **kwargs): super().__init__( *args, resource_settings=ResourceSettings(num_cpus=self.INITIAL_NUM_CPUS), + extra_rp_conf={"core_balancing_on_core_count_change": False}, **kwargs) def _restart_with_num_cpus(self, node: ClusterNode, num_cpus: int,