diff --git a/server/coordinator.go b/server/coordinator.go
index 196acc3ca5e..317e8367930 100644
--- a/server/coordinator.go
+++ b/server/coordinator.go
@@ -15,6 +15,7 @@ package server
 
 import (
     "context"
+    "fmt"
     "sync"
     "time"
 
@@ -304,28 +305,30 @@ func (c *coordinator) collectHotSpotMetrics() {
     status := s.Scheduler.(hasHotStatus).GetHotWriteStatus()
     for _, s := range stores {
         storeAddress := s.GetAddress()
-        stat, ok := status.AsPeer[s.GetID()]
+        storeID := s.GetID()
+        storeLabel := fmt.Sprintf("%d", storeID)
+        stat, ok := status.AsPeer[storeID]
         if ok {
             totalWriteBytes := float64(stat.TotalFlowBytes)
             hotWriteRegionCount := float64(stat.RegionsCount)
-            hotSpotStatusGauge.WithLabelValues(storeAddress, "total_written_bytes_as_peer").Set(totalWriteBytes)
-            hotSpotStatusGauge.WithLabelValues(storeAddress, "hot_write_region_as_peer").Set(hotWriteRegionCount)
+            hotSpotStatusGauge.WithLabelValues(storeAddress, storeLabel, "total_written_bytes_as_peer").Set(totalWriteBytes)
+            hotSpotStatusGauge.WithLabelValues(storeAddress, storeLabel, "hot_write_region_as_peer").Set(hotWriteRegionCount)
         } else {
-            hotSpotStatusGauge.WithLabelValues(storeAddress, "total_written_bytes_as_peer").Set(0)
-            hotSpotStatusGauge.WithLabelValues(storeAddress, "hot_write_region_as_peer").Set(0)
+            hotSpotStatusGauge.WithLabelValues(storeAddress, storeLabel, "total_written_bytes_as_peer").Set(0)
+            hotSpotStatusGauge.WithLabelValues(storeAddress, storeLabel, "hot_write_region_as_peer").Set(0)
         }
 
-        stat, ok = status.AsLeader[s.GetID()]
+        stat, ok = status.AsLeader[storeID]
         if ok {
             totalWriteBytes := float64(stat.TotalFlowBytes)
             hotWriteRegionCount := float64(stat.RegionsCount)
-            hotSpotStatusGauge.WithLabelValues(storeAddress, "total_written_bytes_as_leader").Set(totalWriteBytes)
-            hotSpotStatusGauge.WithLabelValues(storeAddress, "hot_write_region_as_leader").Set(hotWriteRegionCount)
+            hotSpotStatusGauge.WithLabelValues(storeAddress, storeLabel, "total_written_bytes_as_leader").Set(totalWriteBytes)
+            hotSpotStatusGauge.WithLabelValues(storeAddress, storeLabel, "hot_write_region_as_leader").Set(hotWriteRegionCount)
         } else {
-            hotSpotStatusGauge.WithLabelValues(storeAddress, "total_written_bytes_as_leader").Set(0)
-            hotSpotStatusGauge.WithLabelValues(storeAddress, "hot_write_region_as_leader").Set(0)
+            hotSpotStatusGauge.WithLabelValues(storeAddress, storeLabel, "total_written_bytes_as_leader").Set(0)
+            hotSpotStatusGauge.WithLabelValues(storeAddress, storeLabel, "hot_write_region_as_leader").Set(0)
         }
     }
 
@@ -333,16 +336,18 @@ func (c *coordinator) collectHotSpotMetrics() {
     status = s.Scheduler.(hasHotStatus).GetHotReadStatus()
     for _, s := range stores {
         storeAddress := s.GetAddress()
-        stat, ok := status.AsLeader[s.GetID()]
+        storeID := s.GetID()
+        storeLabel := fmt.Sprintf("%d", storeID)
+        stat, ok := status.AsLeader[storeID]
         if ok {
             totalReadBytes := float64(stat.TotalFlowBytes)
             hotReadRegionCount := float64(stat.RegionsCount)
-            hotSpotStatusGauge.WithLabelValues(storeAddress, "total_read_bytes_as_leader").Set(totalReadBytes)
-            hotSpotStatusGauge.WithLabelValues(storeAddress, "hot_read_region_as_leader").Set(hotReadRegionCount)
+            hotSpotStatusGauge.WithLabelValues(storeAddress, storeLabel, "total_read_bytes_as_leader").Set(totalReadBytes)
+            hotSpotStatusGauge.WithLabelValues(storeAddress, storeLabel, "hot_read_region_as_leader").Set(hotReadRegionCount)
         } else {
-            hotSpotStatusGauge.WithLabelValues(storeAddress, "total_read_bytes_as_leader").Set(0)
-            hotSpotStatusGauge.WithLabelValues(storeAddress, "hot_read_region_as_leader").Set(0)
+            hotSpotStatusGauge.WithLabelValues(storeAddress, storeLabel, "total_read_bytes_as_leader").Set(0)
+            hotSpotStatusGauge.WithLabelValues(storeAddress, storeLabel, "hot_read_region_as_leader").Set(0)
         }
     }
 
diff --git a/server/grpc_service.go b/server/grpc_service.go
index cb39ef91159..4e0d0f446d7 100644
--- a/server/grpc_service.go
+++ b/server/grpc_service.go
@@ -17,6 +17,7 @@ import (
     "context"
     "fmt"
     "io"
+    "strconv"
     "sync/atomic"
     "time"
 
@@ -349,21 +350,22 @@ func (s *Server) RegionHeartbeat(stream pdpb.PD_RegionHeartbeatServer) error {
         }
 
         storeID := request.GetLeader().GetStoreId()
+        storeLabel := strconv.FormatUint(storeID, 10)
         store, err := cluster.GetStore(storeID)
         if err != nil {
             return err
         }
         storeAddress := store.GetAddress()
 
-        regionHeartbeatCounter.WithLabelValues(storeAddress, "report", "recv").Inc()
-        regionHeartbeatLatency.WithLabelValues(storeAddress).Observe(float64(time.Now().Unix()) - float64(request.GetInterval().GetEndTimestamp()))
+        regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "report", "recv").Inc()
+        regionHeartbeatLatency.WithLabelValues(storeAddress, storeLabel).Observe(float64(time.Now().Unix()) - float64(request.GetInterval().GetEndTimestamp()))
 
         cluster.RLock()
         hbStreams := cluster.coordinator.hbStreams
         cluster.RUnlock()
 
         if time.Since(lastBind) > s.cfg.heartbeatStreamBindInterval.Duration {
-            regionHeartbeatCounter.WithLabelValues(storeAddress, "report", "bind").Inc()
+            regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "report", "bind").Inc()
             hbStreams.bindStream(storeID, server)
             lastBind = time.Now()
         }
@@ -371,22 +373,22 @@ func (s *Server) RegionHeartbeat(stream pdpb.PD_RegionHeartbeatServer) error {
 
         region := core.RegionFromHeartbeat(request)
         if region.GetID() == 0 {
             msg := fmt.Sprintf("invalid request region, %v", request)
-            hbStreams.sendErr(pdpb.ErrorType_UNKNOWN, msg, storeAddress)
+            hbStreams.sendErr(pdpb.ErrorType_UNKNOWN, msg, storeAddress, storeLabel)
             continue
         }
         if region.GetLeader() == nil {
             msg := fmt.Sprintf("invalid request leader, %v", request)
-            hbStreams.sendErr(pdpb.ErrorType_UNKNOWN, msg, storeAddress)
+            hbStreams.sendErr(pdpb.ErrorType_UNKNOWN, msg, storeAddress, storeLabel)
             continue
         }
 
         err = cluster.HandleRegionHeartbeat(region)
         if err != nil {
             msg := err.Error()
-            hbStreams.sendErr(pdpb.ErrorType_UNKNOWN, msg, storeAddress)
+            hbStreams.sendErr(pdpb.ErrorType_UNKNOWN, msg, storeAddress, storeLabel)
         }
 
-        regionHeartbeatCounter.WithLabelValues(storeAddress, "report", "ok").Inc()
+        regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "report", "ok").Inc()
     }
 }
diff --git a/server/heartbeat_streams.go b/server/heartbeat_streams.go
index 976a16cd4df..9a822053d10 100644
--- a/server/heartbeat_streams.go
+++ b/server/heartbeat_streams.go
@@ -15,6 +15,7 @@ package server
 
 import (
     "context"
+    "strconv"
     "sync"
     "time"
 
@@ -79,6 +80,7 @@ func (s *heartbeatStreams) run() {
             s.streams[update.storeID] = update.stream
         case msg := <-s.msgCh:
             storeID := msg.GetTargetPeer().GetStoreId()
+            storeLabel := strconv.FormatUint(storeID, 10)
             store, err := s.cluster.GetStore(storeID)
             if err != nil {
                 log.Error("fail to get store",
@@ -94,15 +96,15 @@
                     log.Error("send heartbeat message fail",
                         zap.Uint64("region-id", msg.RegionId), zap.Error(err))
                     delete(s.streams, storeID)
-                    regionHeartbeatCounter.WithLabelValues(storeAddress, "push", "err").Inc()
+                    regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "push", "err").Inc()
                 } else {
-                    regionHeartbeatCounter.WithLabelValues(storeAddress, "push", "ok").Inc()
+                    regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "push", "ok").Inc()
                 }
             } else {
                 log.Debug("heartbeat stream not found, skip send message",
                     zap.Uint64("region-id", msg.RegionId),
                     zap.Uint64("store-id", storeID))
-                regionHeartbeatCounter.WithLabelValues(storeAddress, "push", "skip").Inc()
+                regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "push", "skip").Inc()
             }
         case <-keepAliveTicker.C:
             for storeID, stream := range s.streams {
@@ -113,14 +115,15 @@
                     continue
                 }
                 storeAddress := store.GetAddress()
+                storeLabel := strconv.FormatUint(storeID, 10)
                 if err := stream.Send(keepAlive); err != nil {
                     log.Error("send keepalive message fail",
                         zap.Uint64("target-store-id", storeID),
                         zap.Error(err))
                     delete(s.streams, storeID)
-                    regionHeartbeatCounter.WithLabelValues(storeAddress, "keepalive", "err").Inc()
+                    regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "keepalive", "err").Inc()
                 } else {
-                    regionHeartbeatCounter.WithLabelValues(storeAddress, "keepalive", "ok").Inc()
+                    regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "keepalive", "ok").Inc()
                 }
             }
         case <-s.ctx.Done():
@@ -161,8 +164,8 @@ func (s *heartbeatStreams) SendMsg(region *core.RegionInfo, msg *pdpb.RegionHear
     }
 }
 
-func (s *heartbeatStreams) sendErr(errType pdpb.ErrorType, errMsg string, storeAddress string) {
-    regionHeartbeatCounter.WithLabelValues(storeAddress, "report", "err").Inc()
+func (s *heartbeatStreams) sendErr(errType pdpb.ErrorType, errMsg string, storeAddress string, storeLabel string) {
+    regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "report", "err").Inc()
 
     msg := &pdpb.RegionHeartbeatResponse{
         Header: &pdpb.ResponseHeader{
diff --git a/server/metrics.go b/server/metrics.go
index 2354371d9ea..fe050209583 100644
--- a/server/metrics.go
+++ b/server/metrics.go
@@ -95,7 +95,7 @@ var (
             Subsystem: "scheduler",
             Name:      "region_heartbeat",
             Help:      "Counter of region hearbeat.",
-        }, []string{"address", "type", "status"})
+        }, []string{"address", "store", "type", "status"})
 
     regionHeartbeatLatency = prometheus.NewHistogramVec(
         prometheus.HistogramOpts{
@@ -104,7 +104,7 @@
             Name:    "region_heartbeat_latency_seconds",
             Help:    "Bucketed histogram of latency (s) of receiving heartbeat.",
             Buckets: prometheus.ExponentialBuckets(1, 2, 12),
-        }, []string{"address"})
+        }, []string{"address", "store"})
 
     storeStatusGauge = prometheus.NewGaugeVec(
         prometheus.GaugeOpts{
@@ -112,7 +112,7 @@
             Subsystem: "scheduler",
             Name:      "store_status",
             Help:      "Store status for schedule",
-        }, []string{"namespace", "address", "type"})
+        }, []string{"namespace", "address", "store", "type"})
 
     hotSpotStatusGauge = prometheus.NewGaugeVec(
         prometheus.GaugeOpts{
@@ -120,7 +120,7 @@
             Subsystem: "hotspot",
             Name:      "status",
             Help:      "Status of the hotspot.",
-        }, []string{"address", "type"})
+        }, []string{"address", "store", "type"})
 
     tsoCounter = prometheus.NewCounterVec(
         prometheus.CounterOpts{
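
One consequence of widening these label sets is that every call site must change in the same commit: WithLabelValues panics as soon as the number of values differs from the vec's declared labels. The sketch below illustrates this with the error-returning GetMetricWithLabelValues instead; the vec name and values are hypothetical.

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
)

func main() {
    // Hypothetical vec with the new four-label shape from metrics.go.
    c := prometheus.NewCounterVec(prometheus.CounterOpts{
        Name: "example_region_heartbeat",
        Help: "Example counter.",
    }, []string{"address", "store", "type", "status"})

    // A call site missed by the label change would pass three values to a
    // four-label vec. WithLabelValues would panic here;
    // GetMetricWithLabelValues surfaces the same mismatch as an error.
    if _, err := c.GetMetricWithLabelValues("127.0.0.1:20160", "report", "ok"); err != nil {
        fmt.Println("stale call site:", err)
    }

    // The updated shape: address, store, type, status.
    c.WithLabelValues("127.0.0.1:20160", "1", "report", "ok").Inc()
}
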
diff --git a/server/schedule/filters.go b/server/schedule/filters.go
index 5cc273518c3..db62f890ac0 100644
--- a/server/schedule/filters.go
+++ b/server/schedule/filters.go
@@ -14,6 +14,8 @@
 package schedule
 
 import (
+    "fmt"
+
     "github.com/pingcap/pd/server/cache"
     "github.com/pingcap/pd/server/core"
     "github.com/pingcap/pd/server/namespace"
@@ -33,9 +35,10 @@ type Filter interface {
 // FilterSource checks if store can pass all Filters as source store.
 func FilterSource(opt Options, store *core.StoreInfo, filters []Filter) bool {
     storeAddress := store.GetAddress()
+    storeID := fmt.Sprintf("%d", store.GetID())
     for _, filter := range filters {
         if filter.FilterSource(opt, store) {
-            filterCounter.WithLabelValues("filter-source", storeAddress, filter.Type()).Inc()
+            filterCounter.WithLabelValues("filter-source", storeAddress, storeID, filter.Type()).Inc()
             return true
         }
     }
@@ -45,9 +48,10 @@ func FilterSource(opt Options, store *core.StoreInfo, filters []Filter) bool {
 // FilterTarget checks if store can pass all Filters as target store.
 func FilterTarget(opt Options, store *core.StoreInfo, filters []Filter) bool {
     storeAddress := store.GetAddress()
+    storeID := fmt.Sprintf("%d", store.GetID())
     for _, filter := range filters {
         if filter.FilterTarget(opt, store) {
-            filterCounter.WithLabelValues("filter-target", storeAddress, filter.Type()).Inc()
+            filterCounter.WithLabelValues("filter-target", storeAddress, storeID, filter.Type()).Inc()
             return true
         }
     }
diff --git a/server/schedule/metrics.go b/server/schedule/metrics.go
index d3d3cb9a5d6..fbb4f7a735f 100644
--- a/server/schedule/metrics.go
+++ b/server/schedule/metrics.go
@@ -47,7 +47,7 @@ var (
         Subsystem: "schedule",
         Name:      "filter",
         Help:      "Counter of the filter",
-    }, []string{"action", "address", "type"})
+    }, []string{"action", "address", "store", "type"})
 
     operatorCounter = prometheus.NewCounterVec(
         prometheus.CounterOpts{
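
Note that filters.go renders the ID with fmt.Sprintf("%d", ...) while the other files use strconv.FormatUint(..., 10). Both produce the same decimal label value; strconv merely skips fmt's format-string parsing. A quick sketch confirming the equivalence:

package main

import (
    "fmt"
    "strconv"
)

func main() {
    // The patch mixes two ways of rendering a store ID as a label value.
    // Both yield the identical decimal string.
    var id uint64 = 42
    a := fmt.Sprintf("%d", id)
    b := strconv.FormatUint(id, 10)
    fmt.Println(a == b) // true
}
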
- log.Debug("no operator created for selected stores", zap.String("scheduler", l.GetName()), zap.Uint64("source", source.GetID()), zap.Uint64("target", target.GetID())) - balanceLeaderCounter.WithLabelValues("add_taint", sourceAddress).Inc() - l.taintStores.Put(source.GetID()) - balanceLeaderCounter.WithLabelValues("add_taint", targetAddress).Inc() - l.taintStores.Put(target.GetID()) + log.Debug("no operator created for selected stores", zap.String("scheduler", l.GetName()), zap.Uint64("source", sourceID), zap.Uint64("target", targetID)) + balanceLeaderCounter.WithLabelValues("add_taint", sourceAddress, sourceStoreLabel).Inc() + l.taintStores.Put(sourceID) + balanceLeaderCounter.WithLabelValues("add_taint", targetAddress, targetStoreLabel).Inc() + l.taintStores.Put(targetID) return nil } @@ -119,9 +125,10 @@ func (l *balanceLeaderScheduler) Schedule(cluster schedule.Cluster) []*schedule. // It randomly selects a health region from the source store, then picks // the best follower peer and transfers the leader. func (l *balanceLeaderScheduler) transferLeaderOut(source *core.StoreInfo, cluster schedule.Cluster, opInfluence schedule.OpInfluence) []*schedule.Operator { - region := cluster.RandLeaderRegion(source.GetID(), core.HealthRegion()) + sourceID := source.GetID() + region := cluster.RandLeaderRegion(sourceID, core.HealthRegion()) if region == nil { - log.Debug("store has no leader", zap.String("scheduler", l.GetName()), zap.Uint64("store-id", source.GetID())) + log.Debug("store has no leader", zap.String("scheduler", l.GetName()), zap.Uint64("store-id", sourceID)) schedulerCounter.WithLabelValues(l.GetName(), "no_leader_region").Inc() return nil } @@ -138,9 +145,10 @@ func (l *balanceLeaderScheduler) transferLeaderOut(source *core.StoreInfo, clust // It randomly selects a health region from the target store, then picks // the worst follower peer and transfers the leader. func (l *balanceLeaderScheduler) transferLeaderIn(target *core.StoreInfo, cluster schedule.Cluster, opInfluence schedule.OpInfluence) []*schedule.Operator { - region := cluster.RandFollowerRegion(target.GetID(), core.HealthRegion()) + targetID := target.GetID() + region := cluster.RandFollowerRegion(targetID, core.HealthRegion()) if region == nil { - log.Debug("store has no follower", zap.String("scheduler", l.GetName()), zap.Uint64("store-id", target.GetID())) + log.Debug("store has no follower", zap.String("scheduler", l.GetName()), zap.Uint64("store-id", targetID)) schedulerCounter.WithLabelValues(l.GetName(), "no_follower_region").Inc() return nil } @@ -158,28 +166,33 @@ func (l *balanceLeaderScheduler) transferLeaderIn(target *core.StoreInfo, cluste // no new operator need to be created, otherwise create an operator that transfers // the leader from the source store to the target store for the region. 
func (l *balanceLeaderScheduler) createOperator(region *core.RegionInfo, source, target *core.StoreInfo, cluster schedule.Cluster, opInfluence schedule.OpInfluence) []*schedule.Operator { - if cluster.IsRegionHot(region.GetID()) { - log.Debug("region is hot region, ignore it", zap.String("scheduler", l.GetName()), zap.Uint64("region-id", region.GetID())) + regionID := region.GetID() + if cluster.IsRegionHot(regionID) { + log.Debug("region is hot region, ignore it", zap.String("scheduler", l.GetName()), zap.Uint64("region-id", regionID)) schedulerCounter.WithLabelValues(l.GetName(), "region_hot").Inc() return nil } + sourceID := source.GetID() + targetID := target.GetID() if !shouldBalance(cluster, source, target, region, core.LeaderKind, opInfluence) { log.Debug("skip balance region", - zap.String("scheduler", l.GetName()), zap.Uint64("region-id", region.GetID()), zap.Uint64("source-store", source.GetID()), zap.Uint64("target-store", target.GetID()), + zap.String("scheduler", l.GetName()), zap.Uint64("region-id", regionID), zap.Uint64("source-store", sourceID), zap.Uint64("target-store", targetID), zap.Int64("source-size", source.GetLeaderSize()), zap.Float64("source-score", source.LeaderScore(0)), - zap.Int64("source-influence", opInfluence.GetStoreInfluence(source.GetID()).ResourceSize(core.LeaderKind)), + zap.Int64("source-influence", opInfluence.GetStoreInfluence(sourceID).ResourceSize(core.LeaderKind)), zap.Int64("target-size", target.GetLeaderSize()), zap.Float64("target-score", target.LeaderScore(0)), - zap.Int64("target-influence", opInfluence.GetStoreInfluence(target.GetID()).ResourceSize(core.LeaderKind)), + zap.Int64("target-influence", opInfluence.GetStoreInfluence(targetID).ResourceSize(core.LeaderKind)), zap.Int64("average-region-size", cluster.GetAverageRegionSize())) schedulerCounter.WithLabelValues(l.GetName(), "skip").Inc() return nil } schedulerCounter.WithLabelValues(l.GetName(), "new_operator").Inc() - balanceLeaderCounter.WithLabelValues("move_leader", source.GetAddress()+"-out").Inc() - balanceLeaderCounter.WithLabelValues("move_leader", target.GetAddress()+"-in").Inc() - step := schedule.TransferLeader{FromStore: region.GetLeader().GetStoreId(), ToStore: target.GetID()} - op := schedule.NewOperator("balance-leader", region.GetID(), region.GetRegionEpoch(), schedule.OpBalance|schedule.OpLeader, step) + sourceLabel := strconv.FormatUint(sourceID, 10) + targetLabel := strconv.FormatUint(targetID, 10) + balanceLeaderCounter.WithLabelValues("move_leader", source.GetAddress()+"-out", sourceLabel).Inc() + balanceLeaderCounter.WithLabelValues("move_leader", target.GetAddress()+"-in", targetLabel).Inc() + step := schedule.TransferLeader{FromStore: region.GetLeader().GetStoreId(), ToStore: targetID} + op := schedule.NewOperator("balance-leader", regionID, region.GetRegionEpoch(), schedule.OpBalance|schedule.OpLeader, step) return []*schedule.Operator{op} } diff --git a/server/schedulers/balance_region.go b/server/schedulers/balance_region.go index c592d91e088..4950fcb33bf 100644 --- a/server/schedulers/balance_region.go +++ b/server/schedulers/balance_region.go @@ -14,6 +14,8 @@ package schedulers import ( + "strconv" + "github.com/pingcap/kvproto/pkg/metapb" log "github.com/pingcap/log" "github.com/pingcap/pd/server/cache" @@ -83,18 +85,20 @@ func (s *balanceRegionScheduler) Schedule(cluster schedule.Cluster) []*schedule. 
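
A label change like this is straightforward to cover with a test. The sketch below, assuming a hypothetical counter with the new {type, address, store} shape, uses testutil.ToFloat64 from prometheus/client_golang to read back a single child series and confirm the store label is recorded:

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
    // Hypothetical stand-in for balanceLeaderCounter with the new label set.
    c := prometheus.NewCounterVec(prometheus.CounterOpts{
        Name: "example_balance_leader",
        Help: "Example counter.",
    }, []string{"type", "address", "store"})

    // Simulate the scheduler tagging a transfer with address and store ID.
    c.WithLabelValues("transfer_out", "127.0.0.1:20160", "4").Inc()

    // testutil.ToFloat64 reads back one child series, so a lookup keyed by
    // the expected store label proves the series was written with it.
    fmt.Println(testutil.ToFloat64(c.WithLabelValues("transfer_out", "127.0.0.1:20160", "4"))) // 1
}
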
diff --git a/server/schedulers/balance_region.go b/server/schedulers/balance_region.go
index c592d91e088..4950fcb33bf 100644
--- a/server/schedulers/balance_region.go
+++ b/server/schedulers/balance_region.go
@@ -14,6 +14,8 @@
 package schedulers
 
 import (
+    "strconv"
+
     "github.com/pingcap/kvproto/pkg/metapb"
     log "github.com/pingcap/log"
     "github.com/pingcap/pd/server/cache"
@@ -83,18 +85,20 @@ func (s *balanceRegionScheduler) Schedule(cluster schedule.Cluster) []*schedule.
         return nil
     }
 
-    log.Debug("store has the max region score", zap.String("scheduler", s.GetName()), zap.Uint64("store-id", source.GetID()))
+    sourceID := source.GetID()
+    log.Debug("store has the max region score", zap.String("scheduler", s.GetName()), zap.Uint64("store-id", sourceID))
     sourceAddress := source.GetAddress()
-    balanceRegionCounter.WithLabelValues("source_store", sourceAddress).Inc()
+    sourceLabel := strconv.FormatUint(sourceID, 10)
+    balanceRegionCounter.WithLabelValues("source_store", sourceAddress, sourceLabel).Inc()
 
     opInfluence := s.opController.GetOpInfluence(cluster)
     var hasPotentialTarget bool
     for i := 0; i < balanceRegionRetryLimit; i++ {
         // Priority the region that has a follower in the source store.
-        region := cluster.RandFollowerRegion(source.GetID(), core.HealthRegion())
+        region := cluster.RandFollowerRegion(sourceID, core.HealthRegion())
         if region == nil {
             // Then the region has the leader in the source store
-            region = cluster.RandLeaderRegion(source.GetID(), core.HealthRegion())
+            region = cluster.RandLeaderRegion(sourceID, core.HealthRegion())
         }
         if region == nil {
             schedulerCounter.WithLabelValues(s.GetName(), "no_region").Inc()
@@ -121,7 +125,7 @@ func (s *balanceRegionScheduler) Schedule(cluster schedule.Cluster) []*schedule.
         }
 
         hasPotentialTarget = true
-        oldPeer := region.GetStorePeer(source.GetID())
+        oldPeer := region.GetStorePeer(sourceID)
         if op := s.transferPeer(cluster, region, oldPeer, opInfluence); op != nil {
             schedulerCounter.WithLabelValues(s.GetName(), "new_operator").Inc()
             return []*schedule.Operator{op}
@@ -130,9 +134,9 @@ func (s *balanceRegionScheduler) Schedule(cluster schedule.Cluster) []*schedule.
 
     if !hasPotentialTarget {
         // If no potential target store can be found for the selected store, ignore it for a while.
-        log.Debug("no operator created for selected store", zap.String("scheduler", s.GetName()), zap.Uint64("store-id", source.GetID()))
-        balanceRegionCounter.WithLabelValues("add_taint", sourceAddress).Inc()
-        s.taintStores.Put(source.GetID())
+        log.Debug("no operator created for selected store", zap.String("scheduler", s.GetName()), zap.Uint64("store-id", sourceID))
+        balanceRegionCounter.WithLabelValues("add_taint", sourceAddress, sourceLabel).Inc()
+        s.taintStores.Put(sourceID)
     }
 
     return nil
@@ -153,15 +157,18 @@ func (s *balanceRegionScheduler) transferPeer(cluster schedule.Cluster, region *
     }
 
     target := cluster.GetStore(storeID)
-    log.Debug("", zap.Uint64("region-id", region.GetID()), zap.Uint64("source-store", source.GetID()), zap.Uint64("target-store", target.GetID()))
+    regionID := region.GetID()
+    sourceID := source.GetID()
+    targetID := target.GetID()
+    log.Debug("", zap.Uint64("region-id", regionID), zap.Uint64("source-store", sourceID), zap.Uint64("target-store", targetID))
 
     if !shouldBalance(cluster, source, target, region, core.RegionKind, opInfluence) {
         log.Debug("skip balance region",
-            zap.String("scheduler", s.GetName()), zap.Uint64("region-id", region.GetID()), zap.Uint64("source-store", source.GetID()), zap.Uint64("target-store", target.GetID()),
+            zap.String("scheduler", s.GetName()), zap.Uint64("region-id", regionID), zap.Uint64("source-store", sourceID), zap.Uint64("target-store", targetID),
             zap.Int64("source-size", source.GetRegionSize()), zap.Float64("source-score", source.RegionScore(cluster.GetHighSpaceRatio(), cluster.GetLowSpaceRatio(), 0)),
-            zap.Int64("source-influence", opInfluence.GetStoreInfluence(source.GetID()).ResourceSize(core.RegionKind)),
+            zap.Int64("source-influence", opInfluence.GetStoreInfluence(sourceID).ResourceSize(core.RegionKind)),
             zap.Int64("target-size", target.GetRegionSize()), zap.Float64("target-score", target.RegionScore(cluster.GetHighSpaceRatio(), cluster.GetLowSpaceRatio(), 0)),
-            zap.Int64("target-influence", opInfluence.GetStoreInfluence(target.GetID()).ResourceSize(core.RegionKind)),
+            zap.Int64("target-influence", opInfluence.GetStoreInfluence(targetID).ResourceSize(core.RegionKind)),
             zap.Int64("average-region-size", cluster.GetAverageRegionSize()))
         schedulerCounter.WithLabelValues(s.GetName(), "skip").Inc()
         return nil
@@ -172,8 +179,10 @@ func (s *balanceRegionScheduler) transferPeer(cluster schedule.Cluster, region *
         schedulerCounter.WithLabelValues(s.GetName(), "no_peer").Inc()
         return nil
     }
 
-    balanceRegionCounter.WithLabelValues("move_peer", source.GetAddress()+"-out").Inc()
-    balanceRegionCounter.WithLabelValues("move_peer", target.GetAddress()+"-in").Inc()
+    sourceLabel := strconv.FormatUint(sourceID, 10)
+    targetLabel := strconv.FormatUint(targetID, 10)
+    balanceRegionCounter.WithLabelValues("move_peer", source.GetAddress()+"-out", sourceLabel).Inc()
+    balanceRegionCounter.WithLabelValues("move_peer", target.GetAddress()+"-in", targetLabel).Inc()
     op, err := schedule.CreateMovePeerOperator("balance-region", cluster, region, schedule.OpBalance, oldPeer.GetStoreId(), newPeer.GetStoreId(), newPeer.GetId())
     if err != nil {
         schedulerCounter.WithLabelValues(s.GetName(), "create_operator_fail").Inc()
diff --git a/server/schedulers/metrics.go b/server/schedulers/metrics.go
index 4d5370c2d4f..4c5eec20658 100644
--- a/server/schedulers/metrics.go
+++ b/server/schedulers/metrics.go
@@ -37,7 +37,7 @@ var balanceLeaderCounter = prometheus.NewCounterVec(
         Subsystem: "scheduler",
         Name:      "balance_leader",
         Help:      "Counter of balance leader scheduler.",
-    }, []string{"type", "address"})
+    }, []string{"type", "address", "store"})
 
 var balanceRegionCounter = prometheus.NewCounterVec(
     prometheus.CounterOpts{
@@ -45,7 +45,7 @@ var balanceRegionCounter = prometheus.NewCounterVec(
         Subsystem: "scheduler",
         Name:      "balance_region",
         Help:      "Counter of balance region scheduler.",
-    }, []string{"type", "address"})
+    }, []string{"type", "address", "store"})
 
 func init() {
     prometheus.MustRegister(schedulerCounter)
scheduler.", - }, []string{"type", "address"}) + }, []string{"type", "address", "store"}) var balanceRegionCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ @@ -45,7 +45,7 @@ var balanceRegionCounter = prometheus.NewCounterVec( Subsystem: "scheduler", Name: "balance_region", Help: "Counter of balance region scheduler.", - }, []string{"type", "address"}) + }, []string{"type", "address", "store"}) func init() { prometheus.MustRegister(schedulerCounter) diff --git a/server/store_statistics.go b/server/store_statistics.go index 7783f9e9fb9..70339b3f9b9 100644 --- a/server/store_statistics.go +++ b/server/store_statistics.go @@ -15,6 +15,7 @@ package server import ( "fmt" + "strconv" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/pd/server/core" @@ -61,6 +62,7 @@ func (s *storeStatistics) Observe(store *core.StoreInfo) { s.LabelCounter[key]++ } storeAddress := store.GetAddress() + id := strconv.FormatUint(store.GetID(), 10) // Store state. switch store.GetState() { case metapb.StoreState_Up: @@ -77,7 +79,7 @@ func (s *storeStatistics) Observe(store *core.StoreInfo) { s.Offline++ case metapb.StoreState_Tombstone: s.Tombstone++ - s.resetStoreStatistics(storeAddress) + s.resetStoreStatistics(storeAddress, id) return } if store.IsLowSpace(s.opt.GetLowSpaceRatio()) { @@ -90,15 +92,15 @@ func (s *storeStatistics) Observe(store *core.StoreInfo) { s.RegionCount += store.GetRegionCount() s.LeaderCount += store.GetLeaderCount() - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "region_score").Set(store.RegionScore(s.opt.GetHighSpaceRatio(), s.opt.GetLowSpaceRatio(), 0)) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "leader_score").Set(store.LeaderScore(0)) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "region_size").Set(float64(store.GetRegionSize())) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "region_count").Set(float64(store.GetRegionCount())) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "leader_size").Set(float64(store.GetLeaderSize())) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "leader_count").Set(float64(store.GetLeaderCount())) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "store_available").Set(float64(store.GetAvailable())) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "store_used").Set(float64(store.GetUsedSize())) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "store_capacity").Set(float64(store.GetCapacity())) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "region_score").Set(store.RegionScore(s.opt.GetHighSpaceRatio(), s.opt.GetLowSpaceRatio(), 0)) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "leader_score").Set(store.LeaderScore(0)) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "region_size").Set(float64(store.GetRegionSize())) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "region_count").Set(float64(store.GetRegionCount())) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "leader_size").Set(float64(store.GetLeaderSize())) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "leader_count").Set(float64(store.GetLeaderCount())) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "store_available").Set(float64(store.GetAvailable())) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "store_used").Set(float64(store.GetUsedSize())) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, 
id, "store_capacity").Set(float64(store.GetCapacity())) } func (s *storeStatistics) Collect() { @@ -162,16 +164,16 @@ func (s *storeStatistics) Collect() { } } -func (s *storeStatistics) resetStoreStatistics(storeAddress string) { - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "region_score").Set(0) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "leader_score").Set(0) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "region_size").Set(0) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "region_count").Set(0) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "leader_size").Set(0) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "leader_count").Set(0) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "store_available").Set(0) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "store_used").Set(0) - storeStatusGauge.WithLabelValues(s.namespace, storeAddress, "store_capacity").Set(0) +func (s *storeStatistics) resetStoreStatistics(storeAddress string, id string) { + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "region_score").Set(0) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "leader_score").Set(0) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "region_size").Set(0) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "region_count").Set(0) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "leader_size").Set(0) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "leader_count").Set(0) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "store_available").Set(0) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "store_used").Set(0) + storeStatusGauge.WithLabelValues(s.namespace, storeAddress, id, "store_capacity").Set(0) } type storeStatisticsMap struct {