Skip to content

Commit

Permalink
server: Split metrics and health code
Browse files Browse the repository at this point in the history
Signed-off-by: Siyuan Zhang <sizhang@google.com>
  • Loading branch information
serathius authored and siyuanfoundation committed Dec 14, 2023
1 parent 5eee30b commit 09cc82d
Show file tree
Hide file tree
Showing 6 changed files with 241 additions and 224 deletions.
6 changes: 4 additions & 2 deletions embed/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -773,7 +773,8 @@ func (e *Etcd) serveClients() (err error) {
} else {
mux := http.NewServeMux()
etcdhttp.HandleBasic(mux, e.Server)
etcdhttp.HandleMetricsHealth(mux, e.Server)
etcdhttp.HandleMetrics(mux)
etcdhttp.HandleHealth(mux, e.Server)
h = mux
}

Expand Down Expand Up @@ -862,7 +863,8 @@ func (e *Etcd) serveMetrics() (err error) {

if len(e.cfg.ListenMetricsUrls) > 0 {
metricsMux := http.NewServeMux()
etcdhttp.HandleMetricsHealth(metricsMux, e.Server)
etcdhttp.HandleMetrics(metricsMux)
etcdhttp.HandleHealth(metricsMux, e.Server)

for _, murl := range e.cfg.ListenMetricsUrls {
tlsInfo := &e.cfg.ClientTLSInfo
Expand Down
2 changes: 1 addition & 1 deletion etcdmain/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -542,7 +542,7 @@ func startProxy(cfg *config) error {
plog.Infof("v2 proxy started listening on client requests on %q", host)
}
mux := http.NewServeMux()
etcdhttp.HandlePrometheus(mux) // v2 proxy just uses the same port
etcdhttp.HandleMetrics(mux) // v2 proxy just uses the same port
mux.Handle("/", ph)
plog.Fatal(http.Serve(l, mux))
}()
Expand Down
223 changes: 223 additions & 0 deletions etcdserver/api/etcdhttp/health.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package etcdhttp

import (
"context"
"encoding/json"
"fmt"
"net/http"
"time"

"go.etcd.io/etcd/auth"
"go.etcd.io/etcd/etcdserver"
pb "go.etcd.io/etcd/etcdserver/etcdserverpb"
"go.etcd.io/etcd/pkg/types"
"go.etcd.io/etcd/raft"

"github.com/prometheus/client_golang/prometheus"
)

// URL paths served by the health handlers in this file.
const (
// PathHealth is the path on which HandleHealth and HandleHealthForV2
// register the health handler.
PathHealth = "/health"
)

// ServerHealth is the view of the server needed by HandleHealth. On top of
// the alarm and leader accessors from serverHealthV2V3, it exposes the v3
// Range call used as the API probe and the server configuration from which
// the probe's request timeout is taken.
type ServerHealth interface {
serverHealthV2V3
// Range is issued by checkAPI as a minimal read against the v3 API.
Range(context.Context, *pb.RangeRequest) (*pb.RangeResponse, error)
// Config supplies ReqTimeout, which bounds the Range probe.
Config() etcdserver.ServerConfig
}

// serverHealthV2V3 is the part of the server shared by the v2 and v3 health
// checks: it is all that checkAlarms and checkLeader require.
type serverHealthV2V3 interface {
// Alarms returns the alarms inspected by checkAlarms.
Alarms() []*pb.AlarmMember
// Leader returns the leader ID; checkLeader compares it against raft.None.
Leader() types.ID
}

// HandleHealthForV2 registers the health handler for a v2 server on mux at
// PathHealth. The server is reported healthy only when no non-excluded alarm
// is active, a leader is known (skipped for serializable checks), and a v2
// QGET request succeeds; the first failing check determines the response.
func HandleHealthForV2(mux *http.ServeMux, srv etcdserver.ServerV2) {
	check := func(excludedAlarms AlarmSet, serializable bool) Health {
		result := checkAlarms(srv, excludedAlarms)
		if result.Health != "true" {
			return result
		}
		result = checkLeader(srv, serializable)
		if result.Health != "true" {
			return result
		}
		return checkV2API(srv)
	}
	mux.Handle(PathHealth, NewHealthHandler(check))
}

// HandleHealth registers the health handler on mux at PathHealth. The server
// is reported healthy only when no non-excluded alarm is active, a leader is
// known (skipped for serializable checks), and a v3 range request completes
// within the server's request timeout; the first failing check determines
// the response.
func HandleHealth(mux *http.ServeMux, srv ServerHealth) {
	check := func(excludedAlarms AlarmSet, serializable bool) Health {
		result := checkAlarms(srv, excludedAlarms)
		if result.Health != "true" {
			return result
		}
		result = checkLeader(srv, serializable)
		if result.Health != "true" {
			return result
		}
		return checkAPI(srv, serializable)
	}
	mux.Handle(PathHealth, NewHealthHandler(check))
}

// NewHealthHandler returns the handler for '/health' requests.
//
// Only GET is accepted; any other method is answered with 405 and an Allow
// header. For GET, hfunc is called with the alarms named in the "exclude"
// query parameters and with the "serializable" flag, and its result is
// written as JSON: 200 when Health is "true", 503 otherwise. Each evaluated
// check increments either the success or the failure counter.
func NewHealthHandler(hfunc func(excludedAlarms AlarmSet, serializable bool) Health) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		if r.Method != http.MethodGet {
			w.Header().Set("Allow", http.MethodGet)
			http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
			plog.Warningf("/health error (status code %d)", http.StatusMethodNotAllowed)
			return
		}

		// Passing the query parameter "serializable=true" checks the health
		// of the local etcd member rather than of the cluster, which lets a
		// probe tell process liveness apart from cluster readiness.
		status := hfunc(getExcludedAlarms(r), getSerializableFlag(r))
		healthy := status.Health == "true"

		// Record the outcome once the response has been written.
		outcome := healthFailed
		if healthy {
			outcome = healthSuccess
		}
		defer outcome.Inc()

		// Health consists of two string fields, so the marshal error is
		// deliberately ignored.
		payload, _ := json.Marshal(status)
		if !healthy {
			http.Error(w, string(payload), http.StatusServiceUnavailable)
			return
		}
		w.WriteHeader(http.StatusOK)
		w.Write(payload)
		plog.Debugf("/health OK (status code %d)", http.StatusOK)
	}
}

// Counters for the outcome of health checks served by NewHealthHandler.
// Both are registered with the default prometheus registry in init.
var (
// healthSuccess is incremented for every check whose Health is "true".
healthSuccess = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "server",
Name: "health_success",
Help: "The total number of successful health checks",
})
// healthFailed is incremented for every check whose Health is not "true".
healthFailed = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "server",
Name: "health_failures",
Help: "The total number of failed health checks",
})
)

// init registers the health check counters with the default prometheus
// registry; registration panics if either collector cannot be registered.
func init() {
	prometheus.MustRegister(healthSuccess, healthFailed)
}

// Health defines etcd server health status.
// It is the JSON body returned by the '/health' handler.
// TODO: remove manual parsing in etcdctl cluster-health
type Health struct {
// Health is "true" when every check passed and "false" otherwise.
Health string `json:"health"`
// Reason names the failed check (for example "RAFT NO LEADER"); the
// checks in this file leave it empty when the server is healthy.
Reason string `json:"reason"`
}

// AlarmSet is a set of alarm names, keyed by the string form of the alarm
// type (for example "NOSPACE").
type AlarmSet map[string]struct{}

// getExcludedAlarms collects the alarm names passed in the "exclude" query
// parameters of r. The parameter may be repeated; empty values such as
// "?exclude=" are skipped so they never end up in the set. The returned set
// is never nil, even when no alarm is excluded.
func getExcludedAlarms(r *http.Request) (alarms AlarmSet) {
	alarms = make(AlarmSet, 2)
	// Ranging over a missing key yields nothing, so no presence check is needed.
	for _, alm := range r.URL.Query()["exclude"] {
		if alm == "" {
			continue
		}
		alarms[alm] = struct{}{}
	}
	return alarms
}

// getSerializableFlag reports whether the first "serializable" query
// parameter of r is exactly "true". A missing parameter or any other value
// yields false.
func getSerializableFlag(r *http.Request) bool {
	values := r.URL.Query()["serializable"]
	return len(values) > 0 && values[0] == "true"
}

// TODO: etcdserver.ErrNoLeader in health API

// checkAlarms reports the server as unhealthy on the first active alarm that
// is not listed in excludedAlarms. Excluded alarms are logged at debug level
// and skipped. The reason names the alarm type (NOSPACE or CORRUPT) and falls
// back to UNKNOWN for any other type. With no remaining alarm the result is
// healthy.
func checkAlarms(srv serverHealthV2V3, excludedAlarms AlarmSet) Health {
	for _, member := range srv.Alarms() {
		if _, skip := excludedAlarms[member.Alarm.String()]; skip {
			plog.Debugf("/health excluded alarm %s", member.String())
			continue
		}

		var reason string
		switch member.Alarm {
		case pb.AlarmType_NOSPACE:
			reason = "ALARM NOSPACE"
		case pb.AlarmType_CORRUPT:
			reason = "ALARM CORRUPT"
		default:
			reason = "ALARM UNKNOWN"
		}
		plog.Warningf("/health error due to %s", member.String())
		return Health{Health: "false", Reason: reason}
	}

	return Health{Health: "true"}
}

// checkLeader reports the server as unhealthy when it knows of no raft
// leader. Serializable checks are about the local member only, so for them
// the leader is not consulted and the result is always healthy.
func checkLeader(srv serverHealthV2V3, serializable bool) Health {
	if serializable || uint64(srv.Leader()) != raft.None {
		return Health{Health: "true"}
	}
	plog.Warningf("/health error; no leader (status code %d)", http.StatusServiceUnavailable)
	return Health{Health: "false", Reason: "RAFT NO LEADER"}
}

// checkV2API probes the v2 API by issuing a QGET request with a one second
// timeout. Any error marks the server as unhealthy, with the error text
// included in the reason.
func checkV2API(srv etcdserver.ServerV2) Health {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	if _, err := srv.Do(ctx, pb.Request{Method: "QGET"}); err != nil {
		reason := fmt.Sprintf("QGET ERROR:%s", err)
		plog.Warningf("/health error; QGET failed %v (status code %d)", err, http.StatusServiceUnavailable)
		return Health{Health: "false", Reason: reason}
	}
	return Health{Health: "true"}
}

// checkAPI probes the v3 API with a keys-only range request limited to one
// key, bounded by the server's configured request timeout. The request is
// serializable when the caller asked for a serializable check.
//
// auth.ErrUserEmpty and auth.ErrPermissionDenied are treated like success:
// they do not mark the server as unhealthy. Any other error does, and its
// text is included in the reason.
func checkAPI(srv ServerHealth, serializable bool) Health {
	cfg := srv.Config()
	ctx, cancel := context.WithTimeout(context.Background(), cfg.ReqTimeout())
	defer cancel()

	probe := &pb.RangeRequest{KeysOnly: true, Limit: 1, Serializable: serializable}
	_, err := srv.Range(ctx, probe)
	switch err {
	case nil, auth.ErrUserEmpty, auth.ErrPermissionDenied:
		return Health{Health: "true"}
	}

	reason := fmt.Sprintf("RANGE ERROR:%s", err)
	plog.Warningf("serving /health false; Range failed %v (status code %d)", err, http.StatusServiceUnavailable)
	return Health{Health: "false", Reason: reason}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,29 +19,20 @@ import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net/http"
"net/http/httptest"
"testing"

"go.etcd.io/etcd/auth"
"go.etcd.io/etcd/etcdserver"
stats "go.etcd.io/etcd/etcdserver/api/v2stats"
pb "go.etcd.io/etcd/etcdserver/etcdserverpb"
"go.etcd.io/etcd/pkg/testutil"
"go.etcd.io/etcd/pkg/types"
"go.etcd.io/etcd/raft"
)

type fakeStats struct{}

func (s *fakeStats) SelfStats() []byte { return nil }
func (s *fakeStats) LeaderStats() []byte { return nil }
func (s *fakeStats) StoreStats() []byte { return nil }

type fakeHealthServer struct {
fakeServer
stats.Stats
health string
apiError error
}
Expand Down Expand Up @@ -130,31 +121,33 @@ func TestHealthHandler(t *testing.T) {
expectHealth: "true",
},
{
name: "Healthy even if authentication failed",
healthCheckURL: "/health",
apiError: auth.ErrUserEmpty,
expectStatusCode: http.StatusOK,
expectHealth: "true",
},
{
name: "Healthy even if authorization failed",
healthCheckURL: "/health",
apiError: auth.ErrPermissionDenied,
expectStatusCode: http.StatusOK,
expectHealth: "true",
},
{
name: "Unhealthy if api is not available",
healthCheckURL: "/health",
apiError: fmt.Errorf("Unexpected error"),
expectStatusCode: http.StatusServiceUnavailable,
expectHealth: "false",
},
}

for i, tt := range tests {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
mux := http.NewServeMux()
HandleMetricsHealth(mux, &fakeHealthServer{
HandleHealth(mux, &fakeHealthServer{
fakeServer: fakeServer{alarms: tt.alarms},
Stats: &fakeStats{},
health: tt.expectHealth,
apiError: tt.apiError,
})
Expand All @@ -163,14 +156,14 @@ func TestHealthHandler(t *testing.T) {

res, err := ts.Client().Do(&http.Request{Method: http.MethodGet, URL: testutil.MustNewURL(t, ts.URL+tt.healthCheckURL)})
if err != nil {
t.Errorf("fail serve http request %s %v in test case #%d", tt.healthCheckURL, err, i+1)
t.Errorf("fail serve http request %s %v", tt.healthCheckURL, err)
}
if res == nil {
t.Errorf("got nil http response with http request %s in test case #%d", tt.healthCheckURL, i+1)
t.Errorf("got nil http response with http request %s", tt.healthCheckURL)
return
}
if res.StatusCode != tt.expectStatusCode {
t.Errorf("want statusCode %d but got %d in test case #%d", tt.expectStatusCode, res.StatusCode, i+1)
t.Errorf("want statusCode %d but got %d", tt.expectStatusCode, res.StatusCode)
}
health, err := parseHealthOutput(res.Body)
if err != nil {
Expand All @@ -185,7 +178,7 @@ func TestHealthHandler(t *testing.T) {

func parseHealthOutput(body io.Reader) (Health, error) {
obj := Health{}
d, derr := ioutil.ReadAll(body)
d, derr := io.ReadAll(body)
if derr != nil {
return obj, derr
}
Expand Down
Loading

0 comments on commit 09cc82d

Please sign in to comment.