c/controller: allow decreasing core count
Now that shard_balancer will copy partition data from extra kvstore
shards, we can relax the check in validate_configuration_invariants.
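For orientation, the change has two halves: validate_configuration_invariants() now tolerates a smaller core count instead of throwing, and controller::start() persists the reduced count once shard_balancer has successfully started with it (meaning the partition info on the extra kvstore shards has been copied). Below is a minimal, self-contained sketch of that flow in plain C++, with no Seastar or Redpanda types; FakeKvStore, drain_extra_shards and finish_core_decrease are hypothetical stand-ins for the real kvstore and shard_balancer machinery, not the actual APIs.

```cpp
// Hypothetical, simplified model of the core-count-decrease flow.
#include <cstdint>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

struct configuration_invariants {
    int32_t node_id;
    uint32_t core_count;
};

// Stand-in for the controller keyspace of the per-node kvstore.
using FakeKvStore = std::map<std::string, configuration_invariants>;

// The relaxed check: increases are persisted immediately; decreases are
// tolerated here and resolved only after the shard balancer has run.
configuration_invariants
validate_invariants(FakeKvStore& kvs, configuration_invariants current) {
    auto it = kvs.find("invariants");
    if (it == kvs.end()) {
        kvs["invariants"] = current; // first start: persist and return
        return current;
    }
    configuration_invariants invariants = it->second;
    if (invariants.node_id != current.node_id) {
        throw std::runtime_error("configuration invariants changed: node_id");
    }
    if (current.core_count > invariants.core_count) {
        kvs["invariants"] = current; // core count grew: record it right away
        invariants = current;
    }
    // If current.core_count < invariants.core_count, do nothing yet: the
    // extra kvstore shards still hold partition info that must be copied.
    return invariants;
}

// Stand-in for shard_balancer copying partition info off the extra kvstore
// shards [new_cores, old_cores) onto the shards that remain.
void drain_extra_shards(uint32_t old_cores, uint32_t new_cores) {
    for (uint32_t shard = new_cores; shard < old_cores; ++shard) {
        std::cout << "copying partition info from extra shard " << shard << "\n";
    }
}

// Once the balancer has succeeded with the reduced core count, persist the
// smaller invariants -- mirroring the block added to controller::start().
void finish_core_decrease(
  FakeKvStore& kvs, configuration_invariants& invariants, uint32_t smp_count) {
    if (invariants.core_count > smp_count) {
        drain_extra_shards(invariants.core_count, smp_count);
        invariants.core_count = smp_count;
        kvs["invariants"] = invariants;
        std::cout << "successfully decreased core count to " << smp_count << "\n";
    }
}

int main() {
    FakeKvStore kvs;
    // First boot with 4 cores, then a restart of the same node with 2 cores.
    auto inv = validate_invariants(kvs, configuration_invariants{1, 4});
    inv = validate_invariants(kvs, configuration_invariants{1, 2});
    finish_core_decrease(kvs, inv, 2);
}
```

In the real diff below, the drain step is shard_balancer itself and the persist step is the block added near the top of controller::start().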
ztlpn committed Jun 28, 2024
1 parent 34e43b9 commit 3c2c5b6
Showing 2 changed files with 36 additions and 18 deletions.
45 changes: 32 additions & 13 deletions src/v/cluster/controller.cc
@@ -555,6 +555,24 @@ ss::future<> controller::start(
&shard_balancer::start,
conf_invariants.core_count);

+ if (conf_invariants.core_count > ss::smp::count) {
+ // Successfully starting shard_balancer with reduced core count means
+ // that all partition info from extra kvstores has been copied and we
+ // can finally update the configuration invariants.
+ auto new_invariants = configuration_invariants(
+ *config::node().node_id(), ss::smp::count);
+ co_await _storage.local().kvs().put(
+ storage::kvstore::key_space::controller,
+ invariants_key,
+ reflection::to_iobuf(configuration_invariants{new_invariants}));
+ conf_invariants = new_invariants;
+ vlog(
+ clusterlog.info,
+ "successfully decreased core count, "
+ "updated configuration invariants: {}",
+ conf_invariants);
+ }
+
co_await _backend.invoke_on_all(&controller_backend::start);

co_await _api.start(
@@ -1110,6 +1128,10 @@ controller::validate_configuration_invariants()
storage::kvstore::key_space::controller,
invariants_key,
reflection::to_iobuf(configuration_invariants{current}));
+ vlog(
+ clusterlog.info,
+ "persisted initial configuration invariants: {}",
+ current);
co_return current;
}
auto invariants = reflection::from_iobuf<configuration_invariants>(
@@ -1125,26 +1147,23 @@ controller::validate_configuration_invariants() {
current.node_id);
throw configuration_invariants_changed(invariants, current);
}
- if (invariants.core_count > current.core_count) {
- vlog(
- clusterlog.error,
- "Detected change in number of cores dedicated to run redpanda."
- "Decreasing redpanda core count is not allowed. Expected core "
- "count "
- "{}, currently have {} cores.",
- invariants.core_count,
- ss::smp::count);
- throw configuration_invariants_changed(invariants, current);
- } else if (invariants.core_count != current.core_count) {
+ if (current.core_count > invariants.core_count) {
// Update the persistent invariants to reflect increased core
// count -- this tracks the high water mark of core count, to
- // reject subsequent decreases.
+ // track the number of extra kvstore shards that we need to process if
+ // the core count later decreases.
co_await _storage.local().kvs().put(
storage::kvstore::key_space::controller,
invariants_key,
reflection::to_iobuf(configuration_invariants{current}));
+ invariants = current;
+ vlog(clusterlog.info, "updated configuration invariants: {}", current);
+ } else if (current.core_count < invariants.core_count) {
+ // If core count decreased, do nothing for now. shard_balancer will
+ // check whether decreasing is possible and we will update
+ // configuration_invariants in the kvstore later.
}
- co_return current;
+ co_return invariants;
}

} // namespace cluster
9 changes: 4 additions & 5 deletions tests/rptest/tests/node_resize_test.py
@@ -18,17 +18,15 @@
from rptest.tests.redpanda_test import RedpandaTest

RESIZE_LOG_ALLOW_LIST = RESTART_LOG_ALLOW_LIST + [
- re.compile("Decreasing redpanda core count is not allowed"),
- re.compile(
- "Failure during startup: cluster::configuration_invariants_changed")
+ re.compile("Detected decrease in number of cores"),
]


class NodeResizeTest(RedpandaTest):
"""
Validate redpanda behaviour on node core count changes. At time of writing this simply checks
- that redpanda refuses to start if core count has decreased: if we make node resizes more
- flexible in future, this test should be updated to exercise that.
+ that redpanda refuses to start if core count has decreased and core balancing on core count
+ change is unavailable.
"""

INITIAL_NUM_CPUS = 2
@@ -37,6 +35,7 @@ def __init__(self, *args, **kwargs):
super().__init__(
*args,
resource_settings=ResourceSettings(num_cpus=self.INITIAL_NUM_CPUS),
+ extra_rp_conf={"core_balancing_on_core_count_change": False},
**kwargs)

def _restart_with_num_cpus(self, node: ClusterNode, num_cpus: int,