Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change information stored in _topic_node_index to avoid oversized alloc #17350

Merged
merged 2 commits into from
May 9, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 5 additions & 79 deletions src/v/cluster/scheduling/leader_balancer_constraints.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,85 +64,11 @@ void even_topic_distributon_constraint::update_index(const reassignment& r) {

std::optional<reassignment>
even_topic_distributon_constraint::recommended_reassignment() {
// Sort topics based on topic error here
std::vector<decltype(_topic_node_index)::const_iterator> sorted_topics;
sorted_topics.reserve(_topic_node_index.size());

for (auto it = _topic_node_index.cbegin(); it != _topic_node_index.cend();
++it) {
sorted_topics.push_back(it);
}

std::sort(
sorted_topics.begin(),
sorted_topics.end(),
[this](const auto& a, const auto& b) {
return _topic_skew[a->first] > _topic_skew[b->first];
});

// Look for a topic with the most skew
for (const auto& topic : sorted_topics) {
const auto& nodes = topic->second;

if (nodes.size() == 0) {
continue;
}

std::vector<decltype(nodes.cbegin())> nodes_sorted;
nodes_sorted.reserve(nodes.size());

for (auto it = nodes.cbegin(); it != nodes.cend(); ++it) {
nodes_sorted.push_back(it);
}

std::sort(
nodes_sorted.begin(),
nodes_sorted.end(),
[](const auto& a, const auto& b) {
return a->second.size() > b->second.size();
});

// Try to move leadership off the node with the most leadership.
for (const auto& node : nodes_sorted) {
// Don't try moving a group from a muted node.
if (mi().muted_nodes().contains(node->first)) {
continue;
}

for (const auto& g_info : node->second) {
const auto& leader = g_info.leader;
const auto& group = g_info.group_id;
const auto& replicas = g_info.replicas;

// Don't try moving any groups that are currently muted.
if (mi().muted_groups().contains(
static_cast<uint64_t>(group))) {
continue;
}

for (const auto& replica : replicas) {
// Don't try a move to a different shard to on the same
// node. As it won't decrease error
if (replica.node_id == node->first || leader == replica) {
continue;
}

// Don't try moving group to a muted node.
if (mi().muted_nodes().contains(replica.node_id)) {
continue;
}

reassignment r{group, leader, replica};

if (evaluate_internal(r) > error_jitter) {
return r;
}
}
}
}
}

return std::nullopt;
// This method is deprecated and is ony used in `leader_balancer_greedy`
// which doesn't use the `even_topic_distributon_constraint`. Hence there is
// no need to implement it here. Once the greedy balancer has been removed
// this should be removed as well.
vassert(false, "not implemented");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you include a clear comment here about why this won't be called? Is it called on other types implementing this interface?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Copy link
Member

@BenPope BenPope Jul 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ERROR 2024-07-19 07:45:03,854 [shard 0:main] assert - Assert failure: (/var/lib/buildkite-agent/builds/buildkite-amd64-builders-i-0269b6a11fdecc85f-1/redpanda/redpanda/src/v/cluster/scheduling/leader_balancer_constraints.cc:59) 'false' not implemented
 
thanks @BenPope looks like something changed, or the assumptions weren't correct initially.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I searched a little but didn't see a ticket for this. is there one @BenPope?

}

void even_topic_distributon_constraint::rebuild_indexes() {
Expand Down
43 changes: 5 additions & 38 deletions src/v/cluster/tests/leader_balancer_constraints_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,6 @@ BOOST_AUTO_TEST_CASE(even_topic_distribution_empty) {
gntp_i, shard_index, muted_index);

BOOST_REQUIRE(even_topic_con.error() == 0);
BOOST_REQUIRE(!even_topic_con.recommended_reassignment());
}

BOOST_AUTO_TEST_CASE(even_topic_distribution_constraint_no_error) {
Expand Down Expand Up @@ -326,41 +325,12 @@ BOOST_AUTO_TEST_CASE(even_topic_distributon_constraint_find_reassignment) {
BOOST_REQUIRE(
topic_constraint.error() == topic_constraint.evaluate(reassignment));

auto rreassignment = topic_constraint.recommended_reassignment();
BOOST_REQUIRE(rreassignment.has_value());

index_cl.update_index(rreassignment.value());
topic_constraint.update_index(rreassignment.value());
index_cl.update_index(reassignment);
topic_constraint.update_index(reassignment);

BOOST_REQUIRE(topic_constraint.error() == 0);
}

BOOST_AUTO_TEST_CASE(even_topic_odd_partition_cnt) {
// In cases where a topic as a partition count that
// is not even divisible by the nodes they are replicated on.
// In these cases its not possible for every node to has_value
// equal leadership for the topic.
//
// This tests that in those cases the even topic constraint
// correctly recommends no further reassignments.

auto g_id_to_t_id = group_to_topic_from_spec({
{0, {1, 2, 3}},
});

auto [shard_index, muted_index] = from_spec(
{
{{1, 2}, {3}},
{{3}, {1, 2}},
},
{});

auto even_topic_con = lbt::even_topic_distributon_constraint(
g_id_to_t_id, shard_index, muted_index);

BOOST_REQUIRE(!even_topic_con.recommended_reassignment());
}

BOOST_AUTO_TEST_CASE(even_shard_no_error_even_topic_error) {
// Here even_shard_load_constraint.error = 0, but
// even_topic_distributon_constraint.error > 0 and any move will increase
Expand Down Expand Up @@ -390,11 +360,10 @@ BOOST_AUTO_TEST_CASE(even_shard_no_error_even_topic_error) {
BOOST_REQUIRE(even_shard_con.error() == 0);
BOOST_REQUIRE(even_topic_con.error() > 0);

auto rea = even_topic_con.recommended_reassignment();
BOOST_REQUIRE(rea.has_value());
auto rea = re(1, 0, 1);

BOOST_REQUIRE(even_shard_con.evaluate(*rea) < 0);
BOOST_REQUIRE(even_topic_con.evaluate(*rea) > 0);
BOOST_REQUIRE(even_shard_con.evaluate(rea) < 0);
BOOST_REQUIRE(even_topic_con.evaluate(rea) > 0);
}

BOOST_AUTO_TEST_CASE(even_topic_no_error_even_shard_error) {
Expand Down Expand Up @@ -426,8 +395,6 @@ BOOST_AUTO_TEST_CASE(even_topic_no_error_even_shard_error) {
g_id_to_t_id, shard_index, muted_index);

BOOST_REQUIRE(even_shard_con.error() > 0);

BOOST_REQUIRE(!even_topic_con.recommended_reassignment());
BOOST_REQUIRE(even_shard_con.recommended_reassignment().has_value());
}

Expand Down