Skip to content

Commit

Permalink
etcdctl: cluster-health supports forever flag
Browse files Browse the repository at this point in the history
cluster-health command supports checking the cluster health
forever.
  • Loading branch information
xiang90 committed Aug 1, 2015
1 parent 219ed16 commit f7f00b0
Showing 1 changed file with 70 additions and 39 deletions.
109 changes: 70 additions & 39 deletions etcdctl/command/cluster_health.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"fmt"
"net/http"
"os"
"os/signal"
"sort"
"time"

Expand All @@ -15,74 +16,104 @@ import (

func NewClusterHealthCommand() cli.Command {
return cli.Command{
Name: "cluster-health",
Usage: "check the health of the etcd cluster",
Flags: []cli.Flag{},
Name: "cluster-health",
Usage: "check the health of the etcd cluster",
Flags: []cli.Flag{
cli.BoolFlag{Name: "forever", Usage: "forever check the health every 10 second until CTRL+C"},
},
Action: handleClusterHealth,
}
}

func handleClusterHealth(c *cli.Context) {
forever := c.Bool("forever")
if forever {
sigch := make(chan os.Signal, 1)
signal.Notify(sigch, os.Interrupt)

go func() {
<-sigch
os.Exit(0)
}()
}

tr, err := getTransport(c)
if err != nil {
handleError(ExitServerError, err)
}

// TODO: update members when forever is set.
mi := mustNewMembersAPI(c)
ms, err := mi.List(context.TODO())
if err != nil {
fmt.Println("cluster may be unhealthy: failed to list members")
handleError(ExitServerError, err)
}

cl := make([]string, 0)
for _, m := range ms {
cl = append(cl, m.ClientURLs...)
}

// check the /health endpoint of all members first

ep, rs0, err := getLeaderStatus(tr, cl)
if err != nil {
fmt.Println("cluster may be unhealthy: failed to connect", cl)
os.Exit(1)
}
for {
// check the /health endpoint of all members first

time.Sleep(time.Second)

// are all the members makeing progress?
_, rs1, err := getLeaderStatus(tr, []string{ep})
if err != nil {
fmt.Println("cluster is unhealthy")
os.Exit(1)
}

if rs1.Commit > rs0.Commit {
fmt.Printf("cluster is healthy: raft is making progress [commit index: %v->%v]\n", rs0.Commit, rs1.Commit)
} else {
fmt.Printf("cluster is unhealthy: raft is not making progress [commit index: %v]\n", rs0.Commit)
}
fmt.Printf("leader is %v\n", rs0.Lead)
ep, rs0, err := getLeaderStatus(tr, cl)
if err != nil {
fmt.Println("cluster may be unhealthy: failed to connect", cl)
if forever {
time.Sleep(10 * time.Second)
continue
}
os.Exit(1)
}

var prints []string
time.Sleep(time.Second)

for id, pr0 := range rs0.Progress {
pr1, ok := rs1.Progress[id]
if !ok {
fmt.Println("Cluster configuration changed during health checking. Please retry.")
// are all the members makeing progress?
_, rs1, err := getLeaderStatus(tr, []string{ep})
if err != nil {
fmt.Println("cluster is unhealthy")
if forever {
time.Sleep(10 * time.Second)
continue
}
os.Exit(1)
}
if pr1.Match <= pr0.Match {
prints = append(prints, fmt.Sprintf("member %s is unhealthy: raft is not making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))

if rs1.Commit > rs0.Commit {
fmt.Printf("cluster is healthy: raft is making progress [commit index: %v->%v]\n", rs0.Commit, rs1.Commit)
} else {
prints = append(prints, fmt.Sprintf("member %s is healthy: raft is making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
fmt.Printf("cluster is unhealthy: raft is not making progress [commit index: %v]\n", rs0.Commit)
}
fmt.Printf("leader is %v\n", rs0.Lead)

var prints []string

for id, pr0 := range rs0.Progress {
pr1, ok := rs1.Progress[id]
if !ok {
// TODO: forever should handle configuration change.
fmt.Println("Cluster configuration changed during health checking. Please retry.")
os.Exit(1)
}
if pr1.Match <= pr0.Match {
prints = append(prints, fmt.Sprintf("member %s is unhealthy: raft is not making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
} else {
prints = append(prints, fmt.Sprintf("member %s is healthy: raft is making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
}
}

sort.Strings(prints)
for _, p := range prints {
fmt.Print(p)
}

if !forever {
return
}
}

sort.Strings(prints)
for _, p := range prints {
fmt.Print(p)
time.Sleep(10 * time.Second)
}
os.Exit(0)
}

type raftStatus struct {
Expand Down

0 comments on commit f7f00b0

Please sign in to comment.