Added rpk cluster health command #4295

Merged: 9 commits, May 3, 2022
26 changes: 26 additions & 0 deletions src/go/rpk/pkg/api/admin/api_cluster.go
@@ -0,0 +1,26 @@
// Copyright 2021 Redpanda Data, Inc.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.md
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0

package admin

import "net/http"

// ClusterHealthOverview is the cluster health overview reported by the admin API.
type ClusterHealthOverview struct {
IsHealthy bool `json:"is_healthy"`
ControllerID int `json:"controller_id"`
AllNodes []int `json:"all_nodes"`
NodesDown []int `json:"nodes_down"`
LeaderlessPartitions []string `json:"leaderless_partitions"`
}

// GetHealthOverview fetches the cluster health overview from the
// /v1/cluster/health_overview admin endpoint.
func (a *AdminAPI) GetHealthOverview() (ClusterHealthOverview, error) {
var response ClusterHealthOverview
return response, a.sendAny(http.MethodGet, "/v1/cluster/health_overview", nil, &response)
}
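For illustration, a minimal sketch of how code in this admin package could poll the new method until the cluster reports healthy. waitUntilHealthy is a hypothetical helper, not part of this diff; it assumes the fmt and time imports and an *AdminAPI constructed elsewhere (e.g. via NewClient, as in health.go below):

// waitUntilHealthy polls the health overview until is_healthy is true.
func waitUntilHealthy(cl *AdminAPI) error {
    for {
        overview, err := cl.GetHealthOverview()
        if err != nil {
            return fmt.Errorf("unable to request cluster health: %w", err)
        }
        if overview.IsHealthy {
            return nil
        }
        // same polling cadence the rpk health command uses below
        time.Sleep(2 * time.Second)
    }
}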
1 change: 1 addition & 0 deletions src/go/rpk/pkg/cli/cmd/cluster.go
@@ -58,6 +58,7 @@ func NewClusterCommand(fs afero.Fs) *cobra.Command {

command.AddCommand(config.NewConfigCommand(fs))
command.AddCommand(maintenance.NewMaintenanceCommand(fs))
command.AddCommand(cluster.NewHealthOverviewCommand(fs))

return command
}
81 changes: 81 additions & 0 deletions src/go/rpk/pkg/cli/cmd/cluster/health.go
@@ -0,0 +1,81 @@
// Copyright 2022 Redpanda Data, Inc.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.md
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0

// Package cluster contains commands that interact with Redpanda's
// cluster-wide admin API endpoints.
package cluster

import (
"fmt"
"reflect"
"time"

"github.com/redpanda-data/redpanda/src/go/rpk/pkg/api/admin"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/config"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/out"
"github.com/spf13/afero"
"github.com/spf13/cobra"
)

func NewHealthOverviewCommand(fs afero.Fs) *cobra.Command {
var (
wait bool
exit bool
)
cmd := cobra.Command{
Use: "health",
Short: "Queries cluster for health overview.",
Long: `Queries health overview.

Health overview is created based on the health reports collected periodically
from all nodes in the cluster. A cluster is considered healthy when the
following conditions are met:

* all cluster nodes are responding
* all partitions have leaders
* cluster controller is present
`,
Args: cobra.ExactArgs(0),
Run: func(cmd *cobra.Command, _ []string) {
p := config.ParamsFromCommand(cmd)
cfg, err := p.Load(fs)
out.MaybeDie(err, "unable to load config: %v", err)

cl, err := admin.NewClient(fs, cfg)
out.MaybeDie(err, "unable to initialize admin client: %v", err)

var lastOverview admin.ClusterHealthOverview
for {
ret, err := cl.GetHealthOverview()
out.MaybeDie(err, "unable to request cluster health: %v", err)
// print only when the overview changed since the previous poll
if !reflect.DeepEqual(ret, lastOverview) {
printHealthOverview(&ret)
}
lastOverview = ret
// && binds tighter than ||: stop when not watching at all, or when
// --exit-when-healthy was requested and the cluster is healthy again
if !wait || (exit && lastOverview.IsHealthy) {
break
}
time.Sleep(2 * time.Second)
}
},
}
cmd.Flags().BoolVarP(&wait, "watch", "w", false, "blocks and writes out all cluster health changes")
cmd.Flags().BoolVarP(&exit, "exit-when-healthy", "e", false, "when used with --watch, exits once the cluster is back in a healthy state")
return &cmd
}

func printHealthOverview(hov *admin.ClusterHealthOverview) {
out.Section("CLUSTER HEALTH OVERVIEW")
overviewFormat := `Healthy: %v
Controller ID: %v
Nodes down: %v
Leaderless partitions: %v
`
fmt.Printf(overviewFormat, hov.IsHealthy, hov.ControllerID, hov.NodesDown, hov.LeaderlessPartitions)
}
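For reference, a hypothetical session. The field layout follows printHealthOverview above; the values, and the exact section header rendering produced by out.Section, are illustrative:

$ rpk cluster health
CLUSTER HEALTH OVERVIEW
Healthy: true
Controller ID: 0
Nodes down: []
Leaderless partitions: []

$ rpk cluster health --watch --exit-when-healthy   # block until the cluster recovers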
46 changes: 46 additions & 0 deletions src/v/cluster/health_monitor_backend.cc
@@ -37,6 +37,10 @@
#include <seastar/core/with_timeout.hh>
#include <seastar/util/log.hh>

#include <absl/container/node_hash_set.h>

#include <iterator>

namespace cluster {

health_monitor_backend::health_monitor_backend(
@@ -713,4 +717,46 @@ health_monitor_backend::get_node_drain_status(
co_return it->second.drain_status;
}

ss::future<cluster_health_overview>
health_monitor_backend::get_cluster_health_overview(
model::timeout_clock::time_point deadline) {
auto ec = co_await maybe_refresh_cluster_health(
force_refresh::no, deadline);
Comment on lines +723 to +724

@dotnwat (Member) commented on Apr 15, 2022:

Is the idea here that any node can serve a health report, and that it will be refreshed if it is stale? I wonder if we should always retrieve health reports from the controller?

I see that ec is used to determine is_healthy. Presumably it says the cluster isn't healthy if we couldn't refresh?

@mmaslankaprv (Member, Author) replied:

Exactly, I assumed that if we cannot refresh, the cluster is in a bad condition.

A Contributor replied:

Hmm, I'm not very familiar with this code, so please tell me if I have some wrong assumptions.

But I think if the cluster was healthy and we couldn't refresh, then we may get a cluster_health_overview with no nodes down and all partitions having leaders, but with is_healthy false, which may be confusing.

Maybe we should add something like an "obsolete cluster state" field to the cluster_health_overview?


cluster_health_overview ret;
const auto brokers = _members.local().all_brokers();
ret.all_nodes.reserve(brokers.size());

// any broker, other than this node, without a fresh is_alive status
// entry is reported as down
for (auto& broker : brokers) {
ret.all_nodes.push_back(broker->id());
if (broker->id() == _raft0->self().id()) {
continue;
}
auto it = _status.find(broker->id());
if (it == _status.end() || !it->second.is_alive) {
ret.nodes_down.push_back(broker->id());
}
}
// collect leaderless partitions from the per-node reports, deduplicated
// with a hash set
absl::node_hash_set<model::ntp> leaderless;
for (const auto& [_, report] : _reports) {
for (const auto& [tp_ns, partitions] : report.topics) {
for (const auto& partition : partitions) {
if (!partition.leader_id.has_value()) {
leaderless.emplace(tp_ns.ns, tp_ns.tp, partition.id);
}
}
}
}
ret.leaderless_partitions.reserve(leaderless.size());
std::move(
leaderless.begin(),
leaderless.end(),
std::back_inserter(ret.leaderless_partitions));
ret.controller_id = _raft0->get_leader_id();

// a refresh failure (non-zero ec) also marks the cluster as unhealthy
ret.is_healthy = ret.nodes_down.empty() && ret.leaderless_partitions.empty()
&& ret.controller_id && !ec;

co_return ret;
}
} // namespace cluster
3 changes: 3 additions & 0 deletions src/v/cluster/health_monitor_backend.h
@@ -76,6 +76,9 @@ class health_monitor_backend {
ss::future<result<std::optional<cluster::drain_manager::drain_status>>>
get_node_drain_status(model::node_id, model::timeout_clock::time_point);

ss::future<cluster_health_overview>
get_cluster_health_overview(model::timeout_clock::time_point);

private:
/**
* Struct used to track pending refresh request, it gives ability
7 changes: 7 additions & 0 deletions src/v/cluster/health_monitor_frontend.cc
@@ -77,4 +77,11 @@ health_monitor_frontend::get_node_drain_status(
});
}

ss::future<cluster_health_overview>
health_monitor_frontend::get_cluster_health_overview(
model::timeout_clock::time_point deadline) {
return dispatch_to_backend([deadline](health_monitor_backend& be) {
return be.get_cluster_health_overview(deadline);
});
}
} // namespace cluster
13 changes: 13 additions & 0 deletions src/v/cluster/health_monitor_frontend.h
@@ -58,6 +58,19 @@ class health_monitor_frontend {
get_node_drain_status(
model::node_id node_id, model::timeout_clock::time_point deadline);

/**
* Returns the cluster health overview.
*
* The health overview is based on the information available in the health
* monitor. The cluster is considered healthy when the following conditions
* are met:
*
* - all cluster nodes are responding
* - all partitions have leaders
* - the cluster controller is present (_raft0 leader)
*/
Comment on lines +64 to +70

A Contributor commented:

Also, this comment doesn't mention that the cluster is not healthy if we couldn't refresh health information (does that mean that some nodes are not responding?)

@mmaslankaprv (Member, Author) replied:

This is implicitly covered by "all cluster nodes are responding"; I've changed the comment to fix the error in the first item.

ss::future<cluster_health_overview>
get_cluster_health_overview(model::timeout_clock::time_point);

private:
template<typename Func>
auto dispatch_to_backend(Func&& f) {
14 changes: 14 additions & 0 deletions src/v/cluster/health_monitor_types.h
@@ -134,6 +134,20 @@ struct cluster_health_report {
operator<<(std::ostream&, const cluster_health_report&);
};

struct cluster_health_overview {
// is_healthy is the main cluster health indicator, intended as a simple
// flag that lets external cluster-orchestrating processes decide whether
// they can proceed with their next steps
bool is_healthy;

// additional human-readable information that makes debugging cluster
// errors easier
std::optional<model::node_id> controller_id;
std::vector<model::node_id> all_nodes;
std::vector<model::node_id> nodes_down;
std::vector<model::ntp> leaderless_partitions;
};

using include_partitions_info = ss::bool_class<struct include_partitions_tag>;

/**
1 change: 1 addition & 0 deletions src/v/redpanda/CMakeLists.txt
@@ -9,6 +9,7 @@ set(swags
partition
hbadger
transaction
cluster
debug)

set(swag_files)
63 changes: 63 additions & 0 deletions src/v/redpanda/admin/api-doc/cluster.json
@@ -0,0 +1,63 @@
{
"apiVersion": "0.0.1",
"swaggerVersion": "1.2",
"basePath": "/v1",
"resourcePath": "/brokers",
"produces": [
"application/json"
],
"apis": [
{
"path": "/v1/cluster/health_overview",
"operations": [
{
"method": "GET",
"summary": "Get cluster health overview",
"type": "get_cluster_health_overview",
"nickname": "get_cluster_health_overview",
"produces": [
"application/json"
],
"parameters": []
}
]
}
],
"models": {
"cluster_health_overview": {
"id": "cluster_health_overview",
"description": "Returns simple overview of cluster status",
"properties": {
"is_healthy": {
"type": "boolean",
"description": "basic cluster health indicator"
},
"controller_id": {
"type": "int",
"description": "node that is currently a leader or `-1` if leader is not elected"
},
"all_nodes": {
"type": "array",
"items": {
"type": "int"
},
"description": "ids of all nodes registered in the cluster"
},
"nodes_down": {
"type": "array",
"items": {
"type": "int"
},
"description": "ids of all nodes being recognized as down"
},
"leaderless_partitions": {
"type": "array",
"items": {
"type": "string"
},
"description": "list of partitions for which no leader is elected"
}
}
}
}
}
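For orientation, an illustrative exchange against this endpoint. The default admin API port 9644 and all values shown are assumptions; the ns/topic/partition rendering of leaderless partitions follows admin_server.cc below:

$ curl -s http://localhost:9644/v1/cluster/health_overview
{
  "is_healthy": false,
  "controller_id": 0,
  "all_nodes": [0, 1, 2],
  "nodes_down": [2],
  "leaderless_partitions": ["kafka/foo/0"]
}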
39 changes: 38 additions & 1 deletion src/v/redpanda/admin_server.cc
@@ -41,6 +41,7 @@
#include "net/dns.h"
#include "raft/types.h"
#include "redpanda/admin/api-doc/broker.json.h"
#include "redpanda/admin/api-doc/cluster.json.h"
#include "redpanda/admin/api-doc/cluster_config.json.h"
#include "redpanda/admin/api-doc/config.json.h"
#include "redpanda/admin/api-doc/debug.json.h"
@@ -144,7 +145,8 @@ void admin_server::configure_admin_routes() {
rb->register_api_file(_server._routes, "transaction");
rb->register_function(_server._routes, insert_comma);
rb->register_api_file(_server._routes, "debug");

rb->register_function(_server._routes, insert_comma);
rb->register_api_file(_server._routes, "cluster");
register_config_routes();
register_cluster_config_routes();
register_raft_routes();
@@ -157,6 +159,7 @@
register_hbadger_routes();
register_transaction_routes();
register_debug_routes();
register_cluster_routes();
}

struct json_validator {
@@ -2620,3 +2623,37 @@ void admin_server::register_debug_routes() {
co_return ss::json::json_return_type(ans);
});
}

void admin_server::register_cluster_routes() {
register_route<publik>(
ss::httpd::cluster_json::get_cluster_health_overview,
[this](std::unique_ptr<ss::httpd::request>)
-> ss::future<ss::json::json_return_type> {
vlog(logger.debug, "Requested cluster status");
auto health_overview = co_await _controller->get_health_monitor()
.local()
.get_cluster_health_overview(
model::time_from_now(
std::chrono::seconds(5)));
ss::httpd::cluster_json::cluster_health_overview ret;
ret.is_healthy = health_overview.is_healthy;
// mark the list fields as set so they serialize even when empty
ret.all_nodes._set = true;
ret.nodes_down._set = true;
ret.leaderless_partitions._set = true;

ret.all_nodes = health_overview.all_nodes;
ret.nodes_down = health_overview.nodes_down;

for (auto& ntp : health_overview.leaderless_partitions) {
ret.leaderless_partitions.push(fmt::format(
"{}/{}/{}", ntp.ns(), ntp.tp.topic(), ntp.tp.partition));
}
if (health_overview.controller_id) {
ret.controller_id = health_overview.controller_id.value();
} else {
ret.controller_id = -1;
}

co_return ss::json::json_return_type(ret);
});
}
1 change: 1 addition & 0 deletions src/v/redpanda/admin_server.h
@@ -184,6 +184,7 @@ class admin_server {
void register_hbadger_routes();
void register_transaction_routes();
void register_debug_routes();
void register_cluster_routes();

ss::future<> throw_on_error(
ss::httpd::request& req,
5 changes: 5 additions & 0 deletions tests/rptest/services/admin.py
@@ -411,6 +411,11 @@ def get_cluster_view(self, node):
"""
return self._request('get', "cluster_view", node=node).json()

def get_cluster_health_overview(self, node=None):
    """
    Cluster health overview
    """
    return self._request('get', "cluster/health_overview",
                         node=node).json()
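In a ducktape test this helper would typically be polled, for example with ducktape's wait_until, until is_healthy flips to true; that usage is a suggestion, not part of this diff.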

def decommission_broker(self, id, node=None):
"""
Decommission broker