diff --git a/client/resource_group/controller/controller.go b/client/resource_group/controller/controller.go
index d7211cd22150..d2cf3e887e07 100755
--- a/client/resource_group/controller/controller.go
+++ b/client/resource_group/controller/controller.go
@@ -633,7 +633,7 @@ type groupCostController struct {
 	handleRespFunc func(*rmpb.TokenBucketResponse)
 
 	// metrics
-	metrics *metricsCollection
+	metrics *groupMetricsCollection
 	mu      struct {
 		sync.Mutex
 		consumption *rmpb.Consumption
@@ -681,7 +681,7 @@ type groupCostController struct {
 	tombstone bool
 }
 
-type metricsCollection struct {
+type groupMetricsCollection struct {
 	successfulRequestDuration         prometheus.Observer
 	failedLimitReserveDuration        prometheus.Observer
 	requestRetryCounter               prometheus.Counter
@@ -690,12 +690,12 @@ type metricsCollection struct {
 	tokenRequestCounter               prometheus.Counter
 }
 
-func initMetrics(oldName, name string) *metricsCollection {
+func initMetrics(oldName, name string) *groupMetricsCollection {
 	const (
 		otherType     = "others"
 		throttledType = "throttled"
 	)
-	return &metricsCollection{
+	return &groupMetricsCollection{
 		successfulRequestDuration:      successfulRequestDuration.WithLabelValues(oldName, name),
 		failedLimitReserveDuration:     failedLimitReserveDuration.WithLabelValues(oldName, name),
 		failedRequestCounterWithOthers: failedRequestCounter.WithLabelValues(oldName, name, otherType),
@@ -806,7 +806,7 @@ func (gc *groupCostController) initRunState() {
 	case rmpb.GroupMode_RUMode:
 		gc.run.requestUnitTokens = make(map[rmpb.RequestUnitType]*tokenCounter)
 		for typ := range requestUnitLimitTypeList {
-			limiter := NewLimiterWithCfg(now, cfgFunc(getRUTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan)
+			limiter := NewLimiterWithCfg(now, cfgFunc(getRUTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan).SetName(gc.name).SetupMetrics()
 			counter := &tokenCounter{
 				limiter:     limiter,
 				avgRUPerSec: 0,
@@ -820,7 +820,7 @@ func (gc *groupCostController) initRunState() {
 	case rmpb.GroupMode_RawMode:
 		gc.run.resourceTokens = make(map[rmpb.RawResourceType]*tokenCounter)
 		for typ := range requestResourceLimitTypeList {
-			limiter := NewLimiterWithCfg(now, cfgFunc(getRawResourceTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan)
+			limiter := NewLimiterWithCfg(now, cfgFunc(getRawResourceTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan).SetName(gc.name).SetupMetrics()
 			counter := &tokenCounter{
 				limiter:     limiter,
 				avgRUPerSec: 0,
diff --git a/client/resource_group/controller/limiter.go b/client/resource_group/controller/limiter.go
index a726b0e219a7..a585fd24b505 100644
--- a/client/resource_group/controller/limiter.go
+++ b/client/resource_group/controller/limiter.go
@@ -26,6 +26,7 @@ import (
 	"time"
 
 	"github.com/pingcap/log"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/tikv/pd/client/errs"
 	"go.uber.org/zap"
 )
@@ -81,6 +82,15 @@ type Limiter struct {
 	isLowProcess bool
 	// remainingNotifyTimes is used to limit notify when the speed limit is already set.
 	remainingNotifyTimes int
+	name                 string
+
+	// metrics
+	metrics *limiterMetricsCollection
+}
+
+// limiterMetricsCollection is a collection of metrics for a limiter.
+type limiterMetricsCollection struct {
+	lowTokenNotifyCounter prometheus.Counter
 }
 
 // Limit returns the maximum overall event rate.
@@ -224,6 +234,23 @@ func (lim *Limiter) SetupNotificationThreshold(threshold float64) {
 	lim.notifyThreshold = threshold
 }
 
+// SetName sets the name of the limiter.
+func (lim *Limiter) SetName(name string) *Limiter {
+	lim.mu.Lock()
+	defer lim.mu.Unlock()
+	lim.name = name
+	return lim
+}
+
+func (lim *Limiter) SetupMetrics() *Limiter {
+	lim.mu.Lock()
+	defer lim.mu.Unlock()
+	lim.metrics = &limiterMetricsCollection{
+		lowTokenNotifyCounter: lowTokenRequestNotifyCounter.WithLabelValues(lim.name),
+	}
+	return lim
+}
+
 // notify tries to send a non-blocking notification on notifyCh and disables
 // further notifications (until the next Reconfigure or StartNotification).
 func (lim *Limiter) notify() {
@@ -234,6 +261,9 @@ func (lim *Limiter) notify() {
 	lim.isLowProcess = true
 	select {
 	case lim.lowTokensNotifyChan <- struct{}{}:
+		if lim.metrics != nil {
+			lim.metrics.lowTokenNotifyCounter.Inc()
+		}
 	default:
 	}
 }
diff --git a/client/resource_group/controller/metrics.go b/client/resource_group/controller/metrics.go
index da0d1e371bfb..231013126e00 100644
--- a/client/resource_group/controller/metrics.go
+++ b/client/resource_group/controller/metrics.go
@@ -42,7 +42,7 @@ var (
 			Namespace: namespace,
 			Subsystem: requestSubsystem,
 			Name:      "success",
-			Buckets:   []float64{.005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30}, // 0.005 ~ 30
+			Buckets:   []float64{0.0005, .005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30, 60, 600, 1800, 3600}, // 0.0005 ~ 1h
 			Help:      "Bucketed histogram of wait duration of successful request.",
 		}, []string{resourceGroupNameLabel, newResourceGroupNameLabel})
 
@@ -51,7 +51,7 @@ var (
 			Namespace: namespace,
 			Subsystem: requestSubsystem,
 			Name:      "limit_reserve_time_failed",
-			Buckets:   []float64{.005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30}, // 0.005 ~ 30
+			Buckets:   []float64{0.0005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30, 60, 600, 1800, 3600, 86400}, // 0.0005 ~ 24h
 			Help:      "Bucketed histogram of wait duration of failed request.",
 		}, []string{resourceGroupNameLabel, newResourceGroupNameLabel})
 
@@ -75,6 +75,7 @@ var (
 		prometheus.HistogramOpts{
 			Namespace: namespace,
 			Subsystem: tokenRequestSubsystem,
+			Buckets:   prometheus.ExponentialBuckets(0.001, 2, 13), // 1ms ~ 8s
 			Name:      "duration",
 			Help:      "Bucketed histogram of latency(s) of token request.",
 		}, []string{"type"})
@@ -86,6 +87,14 @@ var (
 			Name:      "resource_group",
 			Help:      "Counter of token request by every resource group.",
 		}, []string{resourceGroupNameLabel, newResourceGroupNameLabel})
+
+	lowTokenRequestNotifyCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: tokenRequestSubsystem,
+			Name:      "low_token_notified",
+			Help:      "Counter of low token request.",
+		}, []string{newResourceGroupNameLabel})
 )
 
 var (
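For reference, here is a minimal standalone sketch of the pattern this diff introduces, assuming only the prometheus client library; the `demo*` identifiers and the namespace/subsystem strings are placeholders, not names from this PR. It shows the same shape as the real change: the per-group counter is bound once via `WithLabelValues` in a `SetupMetrics`-style step after `SetName`, and the hot path only increments when metrics were actually set up, matching the nil check added in `Limiter.notify`.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// demoNotifyCounter mirrors lowTokenRequestNotifyCounter: one counter per
// resource group, keyed by the group name label. Namespace/Subsystem values
// here are placeholders, not the PR's constants.
var demoNotifyCounter = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "demo",
		Subsystem: "token_request",
		Name:      "low_token_notified",
		Help:      "Counter of low token request.",
	}, []string{"name"})

// demoMetrics stands in for limiterMetricsCollection.
type demoMetrics struct {
	lowTokenNotifyCounter prometheus.Counter
}

// demoLimiter stands in for Limiter: metrics stays nil until SetupMetrics
// runs, so callers that skip the chain pay no metrics cost.
type demoLimiter struct {
	name    string
	metrics *demoMetrics
}

// SetName records the resource group name later used as the metric label.
func (l *demoLimiter) SetName(name string) *demoLimiter {
	l.name = name
	return l
}

// SetupMetrics binds the per-group counter once, after the name is known.
func (l *demoLimiter) SetupMetrics() *demoLimiter {
	l.metrics = &demoMetrics{
		lowTokenNotifyCounter: demoNotifyCounter.WithLabelValues(l.name),
	}
	return l
}

// notify shows the guarded increment on the hot path, as in Limiter.notify.
func (l *demoLimiter) notify() {
	if l.metrics != nil {
		l.metrics.lowTokenNotifyCounter.Inc()
	}
}

func main() {
	prometheus.MustRegister(demoNotifyCounter)

	// Same call shape as initRunState: construct, then chain SetName and SetupMetrics.
	lim := (&demoLimiter{}).SetName("default").SetupMetrics()
	lim.notify()
	fmt.Println("recorded low-token notification for group:", lim.name)
}
```

Keeping the `metrics` field optional means callers that construct a limiter without chaining `SetName`/`SetupMetrics` continue to work unchanged; only the notification counter is skipped.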