Skip to content

Commit

Permalink
tikvclient: add metrics for gRPC connection transient failure (#12084)
Browse files Browse the repository at this point in the history
  • Loading branch information
lonng authored and sre-bot committed Sep 9, 2019
1 parent 2ebecd4 commit eb62dae
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 3 deletions.
27 changes: 27 additions & 0 deletions metrics/gprc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package metrics

import "github.com/prometheus/client_golang/prometheus"

// gRPC connection metrics.
var (
	// GRPCConnTransientFailureCounter is a counter vector for gRPC connection
	// transient failures, partitioned by the LblAddress and LblStore labels.
	GRPCConnTransientFailureCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "tidb",
			Subsystem: "grpc",
			Name:      "connection_transient_failure_count",
			Help:      "Counter of gRPC connection transient failure",
		},
		[]string{LblAddress, LblStore},
	)
)
1 change: 1 addition & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,5 @@ func RegisterMetrics() {
prometheus.MustRegister(TiKVBatchClientUnavailable)
prometheus.MustRegister(TiKVRangeTaskStats)
prometheus.MustRegister(TiKVRangeTaskPushDuration)
prometheus.MustRegister(GRPCConnTransientFailureCounter)
}
2 changes: 2 additions & 0 deletions metrics/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,4 +112,6 @@ const (
LblSQLType = "sql_type"
LblGeneral = "general"
LblInternal = "internal"
LblStore = "store"
LblAddress = "address"
)
2 changes: 1 addition & 1 deletion metrics/tikvclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ var (
Name: "request_seconds",
Help: "Bucketed histogram of sending request duration.",
Buckets: prometheus.ExponentialBuckets(0.0005, 2, 20), // 0.5ms ~ 262s
}, []string{LblType, "store"})
}, []string{LblType, LblStore})

TiKVCoprocessorHistogram = prometheus.NewHistogram(
prometheus.HistogramOpts{
Expand Down
10 changes: 8 additions & 2 deletions store/tikv/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/pingcap/tidb/store/tikv/tikvrpc"
"github.com/pingcap/tidb/util/logutil"
"google.golang.org/grpc"
"google.golang.org/grpc/connectivity"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/keepalive"
)
Expand Down Expand Up @@ -288,14 +289,19 @@ func (c *rpcClient) SendRequest(ctx context.Context, addr string, req *tikvrpc.R
}
}

clientConn := connArray.Get()
if state := clientConn.GetState(); state == connectivity.TransientFailure {
metrics.GRPCConnTransientFailureCounter.WithLabelValues(addr, storeID).Inc()
}

if req.IsDebugReq() {
client := debugpb.NewDebugClient(connArray.Get())
client := debugpb.NewDebugClient(clientConn)
ctx1, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
return tikvrpc.CallDebugRPC(ctx1, client, req)
}

client := tikvpb.NewTikvClient(connArray.Get())
client := tikvpb.NewTikvClient(clientConn)

if req.Type != tikvrpc.CmdCopStream {
ctx1, cancel := context.WithTimeout(ctx, timeout)
Expand Down

0 comments on commit eb62dae

Please sign in to comment.