Skip to content

Commit

Permalink
Merge pull request #6396 from guopeng0/feature/node_group_healthy_met…
Browse files Browse the repository at this point in the history
…rics

feat:add node group health and back off metrics
  • Loading branch information
k8s-ci-robot committed Jan 24, 2024
2 parents 779c1ba + 4b9d4b1 commit a2f8902
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 1 deletion.
5 changes: 5 additions & 0 deletions cluster-autoscaler/clusterstate/clusterstate.go
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,11 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned)
}

// BackoffStatusForNodeGroup reports the backoff status of the given node group
// as of now. The query is delegated to the registry's backoff tracker, which
// also receives the node info stored for the group's ID.
func (csr *ClusterStateRegistry) BackoffStatusForNodeGroup(nodeGroup cloudprovider.NodeGroup, now time.Time) backoff.Status {
	nodeInfo := csr.nodeInfosForGroups[nodeGroup.Id()]
	return csr.backoff.BackoffStatus(nodeGroup, nodeInfo, now)
}

// NodeGroupScaleUpSafety returns information about node group safety to be scaled up now.
func (csr *ClusterStateRegistry) NodeGroupScaleUpSafety(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety {
isHealthy := csr.IsNodeGroupHealthy(nodeGroup.Id())
Expand Down
42 changes: 42 additions & 0 deletions cluster-autoscaler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,22 @@ var (
}, []string{"node_group"},
)

// nodesGroupHealthiness is a gauge vector in the caNamespace namespace with a
// single "node_group" label. Per its help text, a value of 1 marks the node
// group as healthy enough for autoscaling and 0 marks it as not.
nodesGroupHealthiness = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_group_healthiness",
Help: "Whether or not node group is healthy enough for autoscaling. 1 if it is, 0 otherwise.",
}, []string{"node_group"},
)

// nodeGroupBackOffStatus is a gauge vector in the caNamespace namespace with
// two labels: "node_group" and "reason". Per its help text, a value of 1 means
// the node group is backed off and 0 means it is not.
nodeGroupBackOffStatus = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_group_backoff_status",
Help: "Whether or not node group is backoff for not autoscaling. 1 if it is, 0 otherwise.",
}, []string{"node_group", "reason"},
)

/**** Metrics related to autoscaler execution ****/
lastActivity = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Expand Down Expand Up @@ -438,6 +454,8 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
legacyregistry.MustRegister(nodesGroupMinNodes)
legacyregistry.MustRegister(nodesGroupMaxNodes)
legacyregistry.MustRegister(nodesGroupTargetSize)
legacyregistry.MustRegister(nodesGroupHealthiness)
legacyregistry.MustRegister(nodeGroupBackOffStatus)
}
}

Expand Down Expand Up @@ -543,6 +561,30 @@ func UpdateNodeGroupTargetSize(targetSizes map[string]int) {
}
}

// UpdateNodeGroupHealthStatus publishes the health of a node group to the
// node_group_healthiness gauge: 1 when healthy is true, 0 otherwise. The
// nodeGroup argument is used as the "node_group" label value.
func UpdateNodeGroupHealthStatus(nodeGroup string, healthy bool) {
	value := 0.0
	if healthy {
		value = 1
	}
	nodesGroupHealthiness.WithLabelValues(nodeGroup).Set(value)
}

// UpdateNodeGroupBackOffStatus publishes the backoff state of a node group to
// the node_group_backoff_status gauge.
//
// backoffReasonStatus maps a reason to whether the group is currently backed
// off for it; each entry becomes one series (labels: nodeGroup, reason) set to
// 1 when true and 0 when false. When the map is empty, a single series with an
// empty reason label is set to 0 instead.
func UpdateNodeGroupBackOffStatus(nodeGroup string, backoffReasonStatus map[string]bool) {
	if len(backoffReasonStatus) == 0 {
		nodeGroupBackOffStatus.WithLabelValues(nodeGroup, "").Set(0)
		return
	}
	for reason, active := range backoffReasonStatus {
		var value float64
		if active {
			value = 1
		}
		nodeGroupBackOffStatus.WithLabelValues(nodeGroup, reason).Set(value)
	}
}

// RegisterError records any errors preventing Cluster Autoscaler from working.
// No more than one error should be recorded per loop.
func RegisterError(err errors.AutoscalerError) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ type AutoscalingStatusProcessor interface {

// NewDefaultAutoscalingStatusProcessor creates a default instance of AutoscalingStatusProcessor.
// NOTE(review): two consecutive return statements appear below, which makes the
// second one unreachable as written. This looks like a rendered diff hunk in
// which the NoOp return is the removed line and the Metrics return is its
// replacement — confirm against the actual file before relying on either.
func NewDefaultAutoscalingStatusProcessor() AutoscalingStatusProcessor {
return &NoOpAutoscalingStatusProcessor{}
return &MetricsAutoscalingStatusProcessor{
backoffReasonStatus: make(map[string]BackoffReasonStatus),
}
}

// NoOpAutoscalingStatusProcessor is an AutoscalingStatusProcessor implementation useful for testing.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package status

import (
"time"

"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
"k8s.io/autoscaler/cluster-autoscaler/context"
"k8s.io/autoscaler/cluster-autoscaler/metrics"
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
)

// unknownErrorCode is the error code used when the cloud provider has not
// provided one.
const unknownErrorCode = "unknown"

// BackoffReasonStatus contains information about backoff status and reason.
// Keys are backoff reasons (the error code reported with a backoff, or
// unknownErrorCode when that code is empty); the value is true while the node
// group is backed off for that reason and false otherwise.
type BackoffReasonStatus map[string]bool

// MetricsAutoscalingStatusProcessor is used to update metrics after each autoscaling iteration.
type MetricsAutoscalingStatusProcessor struct {
// backoffReasonStatus holds, per node group ID, every backoff reason recorded
// so far for that group and whether it is currently active.
backoffReasonStatus map[string]BackoffReasonStatus
}

// Process walks every node group returned by the cloud provider and, for each
// one that exists, publishes its health metric and its backoff metrics as of
// now. Node groups that do not exist are ignored. It always returns nil.
func (p *MetricsAutoscalingStatusProcessor) Process(context *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, now time.Time) error {
	groups := context.CloudProvider.NodeGroups()
	for i := range groups {
		group := groups[i]
		if group.Exist() {
			metrics.UpdateNodeGroupHealthStatus(group.Id(), csr.IsNodeGroupHealthy(group.Id()))
			status := csr.BackoffStatusForNodeGroup(group, now)
			p.updateNodeGroupBackoffStatusMetrics(group.Id(), status)
		}
	}
	return nil
}

// CleanUp cleans up the processor's internal structures.
// The body is currently empty, so this is a no-op: the per-node-group backoff
// state kept in backoffReasonStatus is not released here.
func (p *MetricsAutoscalingStatusProcessor) CleanUp() {
}

// updateNodeGroupBackoffStatusMetrics refreshes the backoff metrics of one
// node group from its latest backoff status.
//
// Every reason previously recorded for the group is first reset to false, so
// reasons that no longer apply are published as 0. If the group is currently
// backed off, the reported error code (or unknownErrorCode when it is empty)
// is then marked true. The resulting reason map is handed to the metrics
// package.
func (p *MetricsAutoscalingStatusProcessor) updateNodeGroupBackoffStatusMetrics(nodeGroup string, backoffStatus backoff.Status) {
	reasons, known := p.backoffReasonStatus[nodeGroup]
	if !known {
		// First time this node group is seen: start with an empty reason set.
		reasons = make(BackoffReasonStatus)
		p.backoffReasonStatus[nodeGroup] = reasons
	} else {
		for code := range reasons {
			reasons[code] = false
		}
	}

	if backoffStatus.IsBackedOff {
		code := backoffStatus.ErrorInfo.ErrorCode
		if code == "" {
			// prevent error code from being empty.
			code = unknownErrorCode
		}
		reasons[code] = true
	}

	metrics.UpdateNodeGroupBackOffStatus(nodeGroup, reasons)
}

0 comments on commit a2f8902

Please sign in to comment.