diff --git a/cluster-autoscaler/clusterstate/api/types.go b/cluster-autoscaler/clusterstate/api/types.go
index 52d79b24012a..282c4a7bb206 100644
--- a/cluster-autoscaler/clusterstate/api/types.go
+++ b/cluster-autoscaler/clusterstate/api/types.go
@@ -22,19 +22,14 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
-// ClusterAutoscalerConditionType is the type of ClusterAutoscalerCondition.
-type ClusterAutoscalerConditionType string
+// ClusterAutoscalerStatusCondition is the status of the cluster autoscaler.
+type ClusterAutoscalerStatusCondition string
 
 const (
-	// ClusterAutoscalerHealth - is a condition that explains what is the current health
-	// of ClusterAutoscaler or its node groups.
-	ClusterAutoscalerHealth ClusterAutoscalerConditionType = "Health"
-	// ClusterAutoscalerScaleDown is a condition that explains what is the current status
-	// of a node group with regard to scale down activities.
-	ClusterAutoscalerScaleDown ClusterAutoscalerConditionType = "ScaleDown"
-	// ClusterAutoscalerScaleUp is a condition that explains what is the current status
-	// of a node group with regard to scale up activities.
-	ClusterAutoscalerScaleUp ClusterAutoscalerConditionType = "ScaleUp"
+	// ClusterAutoscalerRunning status means that the cluster autoscaler has been initialized and is running.
+	ClusterAutoscalerRunning ClusterAutoscalerStatusCondition = "Running"
+	// ClusterAutoscalerInitializing status means that the cluster autoscaler is currently being initialized.
+	ClusterAutoscalerInitializing ClusterAutoscalerStatusCondition = "Initializing"
 )
 
 // ClusterAutoscalerConditionStatus is a status of ClusterAutoscalerCondition.
@@ -69,36 +64,135 @@ const (
 	ClusterAutoscalerBackoff ClusterAutoscalerConditionStatus = "Backoff"
 )
 
-// ClusterAutoscalerCondition describes some aspect of ClusterAutoscaler work.
-type ClusterAutoscalerCondition struct {
-	// Type defines the aspect that the condition describes. For example, it can be Health or ScaleUp/Down activity.
-	Type ClusterAutoscalerConditionType `json:"type,omitempty"`
-	// Status of the condition.
-	Status ClusterAutoscalerConditionStatus `json:"status,omitempty"`
-	// Message is a free text extra information about the condition. It may contain some
-	// extra debugging data, like why the cluster is unhealthy.
-	Message string `json:"message,omitempty"`
-	// Reason is a unique, one-word, CamelCase reason for the condition's last transition.
-	Reason string `json:"reason,omitempty"`
+// RegisteredUnreadyNodeCount contains node counts of registered but unready nodes.
+type RegisteredUnreadyNodeCount struct {
+	// Total number of registered but unready nodes.
+	Total int `json:"total" yaml:"total"`
+	// ResourceUnready is the number of registered but unready nodes due to a missing resource (e.g. GPU).
+	ResourceUnready int `json:"resourceUnready" yaml:"resourceUnready"`
+}
+
+// RegisteredNodeCount contains node counts of registered nodes.
+type RegisteredNodeCount struct {
+	Total      int `json:"total" yaml:"total"`
+	Ready      int `json:"ready" yaml:"ready"`
+	NotStarted int `json:"notStarted" yaml:"notStarted"`
+	// Number of nodes that are currently being deleted. They exist in K8S but are not included in NodeGroup.TargetSize().
+	BeingDeleted int                        `json:"beingDeleted,omitempty" yaml:"beingDeleted,omitempty"`
+	Unready      RegisteredUnreadyNodeCount `json:"unready,omitempty" yaml:"unready,omitempty"`
+}
+
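The nested count types above replace the old flat "ready=... unready=... (resourceUnready=...)" Health message with structured data. As a quick check of the YAML keys those struct tags produce, a minimal standalone sketch (not part of the patch; values are hypothetical, the inner struct mirrors RegisteredUnreadyNodeCount, and gopkg.in/yaml.v2 is the same package this patch adds to status.go):

package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// registeredNodeCount is a stand-in for RegisteredNodeCount with the same yaml tags.
type registeredNodeCount struct {
	Total        int `yaml:"total"`
	Ready        int `yaml:"ready"`
	NotStarted   int `yaml:"notStarted"`
	BeingDeleted int `yaml:"beingDeleted,omitempty"`
	Unready      struct {
		Total           int `yaml:"total"`
		ResourceUnready int `yaml:"resourceUnready"`
	} `yaml:"unready,omitempty"`
}

func main() {
	c := registeredNodeCount{Total: 10, Ready: 7, NotStarted: 1}
	c.Unready.Total = 2
	c.Unready.ResourceUnready = 1
	out, _ := yaml.Marshal(c)
	// Prints the nested keys consumers will see in the status ConfigMap:
	// total, ready, notStarted, then unready with its own total/resourceUnready;
	// beingDeleted is omitted because it is zero and tagged omitempty.
	fmt.Print(string(out))
}

+// NodeCount contains number of nodes that satisfy different criteria.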
+type NodeCount struct {
+	Registered       RegisteredNodeCount `json:"registered,omitempty" yaml:"registered,omitempty"`
+	LongUnregistered int                 `json:"longUnregistered" yaml:"longUnregistered"`
+	Unregistered     int                 `json:"unregistered" yaml:"unregistered"`
+}
+
+// ClusterHealthCondition contains information about health condition for the whole cluster.
+type ClusterHealthCondition struct {
+	// Status of cluster health.
+	Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
+	// NodeCounts contains number of nodes that satisfy different criteria in the cluster.
+	NodeCounts NodeCount `json:"nodeCounts,omitempty" yaml:"nodeCounts,omitempty"`
 	// LastProbeTime is the last time we probed the condition.
-	LastProbeTime metav1.Time `json:"lastProbeTime,omitempty"`
+	LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
 	// LastTransitionTime is the time since when the condition was in the given state.
-	LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
+	LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
 }
 
-// ClusterAutoscalerStatus contains ClusterAutoscaler status.
-type ClusterAutoscalerStatus struct {
-	// NodeGroupStatuses contains status information of individual node groups on which CA works.
-	NodeGroupStatuses []NodeGroupStatus `json:"nodeGroupStatuses,omitempty"`
-	// ClusterwideConditions contains conditions that apply to the whole autoscaler.
-	ClusterwideConditions []ClusterAutoscalerCondition `json:"clusterwideConditions,omitempty"`
+// NodeGroupHealthCondition contains information about health condition for a node group.
+type NodeGroupHealthCondition struct {
+	// Status of node group health.
+	Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
+	// NodeCounts contains number of nodes that satisfy different criteria in the node group.
+	NodeCounts NodeCount `json:"nodeCounts,omitempty" yaml:"nodeCounts,omitempty"`
+	// CloudProviderTarget is the target size set by cloud provider.
+	CloudProviderTarget int `json:"cloudProviderTarget" yaml:"cloudProviderTarget"`
+	// MinSize is the CA min size of a node group.
+	MinSize int `json:"minSize" yaml:"minSize"`
+	// MaxSize is the CA max size of a node group.
+	MaxSize int `json:"maxSize" yaml:"maxSize"`
+	// LastProbeTime is the last time we probed the condition.
+	LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
+	// LastTransitionTime is the time since when the condition was in the given state.
+	LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
+}
+
+// ClusterScaleUpCondition contains information about scale up condition for the whole cluster.
+type ClusterScaleUpCondition struct {
+	// Status of the scale up.
+	Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
+	// LastProbeTime is the last time we probed the condition.
+	LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
+	// LastTransitionTime is the time since when the condition was in the given state.
+	LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
+}
+
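Every condition type above carries the same LastProbeTime/LastTransitionTime pair, and the registry code later in this patch repeats one rule for them: LastTransitionTime only moves when the condition's Status actually changes, otherwise the previous value is carried over. A minimal sketch of that rule (hypothetical helper name; plain strings stand in for ClusterAutoscalerConditionStatus):

package main

import (
	"fmt"
	"time"
)

// transitionTime keeps the old transition timestamp while the status is
// unchanged and resets it to the probe time when the status flips.
func transitionTime(status, lastStatus string, lastTransition, probe time.Time) time.Time {
	if status == lastStatus {
		return lastTransition
	}
	return probe
}

func main() {
	t0 := time.Date(2023, 11, 23, 14, 52, 2, 0, time.UTC)
	t1 := t0.Add(10 * time.Second)
	fmt.Println(transitionTime("Healthy", "Healthy", t0, t1))   // keeps t0
	fmt.Println(transitionTime("Unhealthy", "Healthy", t0, t1)) // moves to t1
}

+// BackoffInfo contains error information that caused the backoff.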
+type BackoffInfo struct {
+	// ErrorCode is a specific error code for the error condition.
+	ErrorCode string `json:"errorCode,omitempty" yaml:"errorCode,omitempty"`
+	// ErrorMessage is a human readable description of the error condition.
+	ErrorMessage string `json:"errorMessage,omitempty" yaml:"errorMessage,omitempty"`
 }
 
-// NodeGroupStatus contains status of a group of nodes controlled by ClusterAutoscaler.
+// NodeGroupScaleUpCondition contains information about scale up condition for a node group.
+type NodeGroupScaleUpCondition struct {
+	// Status of the scale up.
+	Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
+	// BackoffInfo contains error information about the backoff, if any.
+	BackoffInfo BackoffInfo `json:"backoffInfo,omitempty" yaml:"backoffInfo,omitempty"`
+	// LastProbeTime is the last time we probed the condition.
+	LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
+	// LastTransitionTime is the time since when the condition was in the given state.
+	LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
+}
+
+// ScaleDownCondition contains information about scale down condition for a node group or the whole cluster.
+type ScaleDownCondition struct {
+	// Status of the scale down.
+	Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
+	// Candidates is the number of scale down candidates.
+	Candidates int `json:"candidates,omitempty" yaml:"candidates,omitempty"`
+	// LastProbeTime is the last time we probed the condition.
+	LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
+	// LastTransitionTime is the time since when the condition was in the given state.
+	LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
+}
+
+// ClusterWideStatus contains status information that applies to the whole cluster.
+type ClusterWideStatus struct {
+	// Health contains information about health condition of the cluster.
+	Health ClusterHealthCondition `json:"health,omitempty" yaml:"health,omitempty"`
+	// ScaleUp contains information about scale up condition of the cluster.
+	ScaleUp ClusterScaleUpCondition `json:"scaleUp,omitempty" yaml:"scaleUp,omitempty"`
+	// ScaleDown contains information about scale down condition of the cluster.
+	ScaleDown ScaleDownCondition `json:"scaleDown,omitempty" yaml:"scaleDown,omitempty"`
+}
+
+// NodeGroupStatus contains status of an individual node group on which CA works.
 type NodeGroupStatus struct {
-	// ProviderID is the cloud-provider-specific name of the node group. On GCE it will be equal
-	// to MIG url, on AWS it will be ASG name, etc.
-	ProviderID string `json:"providerID,omitempty"`
-	// Conditions is a list of conditions that describe the state of the node group.
-	Conditions []ClusterAutoscalerCondition `json:"conditions,omitempty"`
+	// Name of the node group.
+	Name string `json:"name,omitempty" yaml:"name,omitempty"`
+	// Health contains information about health condition of the node group.
+	Health NodeGroupHealthCondition `json:"health,omitempty" yaml:"health,omitempty"`
+	// ScaleUp contains information about scale up condition of the node group.
+	ScaleUp NodeGroupScaleUpCondition `json:"scaleUp,omitempty" yaml:"scaleUp,omitempty"`
+	// ScaleDown contains information about scale down condition of the node group.
+ ScaleDown ScaleDownCondition `json:"scaleDown,omitempty" yaml:"scaleDown,omitempty"` +} + +// ClusterAutoscalerStatus contains ClusterAutoscaler status. +type ClusterAutoscalerStatus struct { + // Time of the cluster autoscaler status. + Time string `json:"time,omitempty" yaml:"time,omitempty"` + // AutoscalerStatus contains status of ClusterAutoscaler (e.g. 'Initializing' & 'Running'). + AutoscalerStatus ClusterAutoscalerStatusCondition `json:"autoscalerStatus,omitempty" yaml:"autoscalerStatus,omitempty"` + // Message contains extra information about the status. + Message string `json:"message,omitempty" yaml:"message,omitempty"` + // ClusterWide contains conditions that apply to the whole cluster. + ClusterWide ClusterWideStatus `json:"clusterWide,omitempty" yaml:"clusterWide,omitempty"` + // NodeGroups contains status information of individual node groups on which CA works. + NodeGroups []NodeGroupStatus `json:"nodeGroups,omitempty" yaml:"nodeGroups,omitempty"` } diff --git a/cluster-autoscaler/clusterstate/api/utils.go b/cluster-autoscaler/clusterstate/api/utils.go deleted file mode 100644 index 4045820ed0d7..000000000000 --- a/cluster-autoscaler/clusterstate/api/utils.go +++ /dev/null @@ -1,92 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package api - -import ( - "bytes" - "fmt" -) - -// GetConditionByType gets condition by type. -func GetConditionByType(conditionType ClusterAutoscalerConditionType, - conditions []ClusterAutoscalerCondition) *ClusterAutoscalerCondition { - for i := range conditions { - if conditions[i].Type == conditionType { - return &conditions[i] - } - } - return nil -} - -func getConditionsString(autoscalerConditions []ClusterAutoscalerCondition, prefix string) string { - health := fmt.Sprintf("%v%-12v ", prefix, ClusterAutoscalerHealth+":") - var scaleUp, scaleDown string - var buffer, other bytes.Buffer - for _, condition := range autoscalerConditions { - var line bytes.Buffer - line.WriteString(fmt.Sprintf("%v%-12v %v", - prefix, - condition.Type+":", - condition.Status)) - if condition.Message != "" { - line.WriteString(" (") - line.WriteString(condition.Message) - line.WriteString(")") - } - line.WriteString("\n") - line.WriteString(fmt.Sprintf("%v%13sLastProbeTime: %v\n", - prefix, - "", - condition.LastProbeTime)) - line.WriteString(fmt.Sprintf("%v%13sLastTransitionTime: %v\n", - prefix, - "", - condition.LastTransitionTime)) - switch condition.Type { - case ClusterAutoscalerHealth: - health = line.String() - case ClusterAutoscalerScaleUp: - scaleUp = line.String() - case ClusterAutoscalerScaleDown: - scaleDown = line.String() - default: - other.WriteString(line.String()) - } - } - buffer.WriteString(health) - buffer.WriteString(scaleUp) - buffer.WriteString(scaleDown) - buffer.WriteString(other.String()) - return buffer.String() -} - -// GetReadableString produces human-readable description of status. 
-func (status ClusterAutoscalerStatus) GetReadableString() string { - var buffer bytes.Buffer - buffer.WriteString("Cluster-wide:\n") - buffer.WriteString(getConditionsString(status.ClusterwideConditions, " ")) - if len(status.NodeGroupStatuses) == 0 { - return buffer.String() - } - buffer.WriteString("\nNodeGroups:\n") - for _, nodeGroupStatus := range status.NodeGroupStatuses { - buffer.WriteString(fmt.Sprintf(" Name: %v\n", nodeGroupStatus.ProviderID)) - buffer.WriteString(getConditionsString(nodeGroupStatus.Conditions, " ")) - buffer.WriteString("\n") - } - return buffer.String() -} diff --git a/cluster-autoscaler/clusterstate/api/utils_test.go b/cluster-autoscaler/clusterstate/api/utils_test.go deleted file mode 100644 index 3e1fa9d83170..000000000000 --- a/cluster-autoscaler/clusterstate/api/utils_test.go +++ /dev/null @@ -1,91 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package api - -import ( - "fmt" - "regexp" - "testing" - - "github.com/stretchr/testify/assert" -) - -func prepareConditions() (health, scaleUp ClusterAutoscalerCondition) { - healthCondition := ClusterAutoscalerCondition{ - Type: ClusterAutoscalerHealth, - Status: ClusterAutoscalerHealthy, - Message: "HEALTH_MESSAGE"} - scaleUpCondition := ClusterAutoscalerCondition{ - Type: ClusterAutoscalerScaleUp, - Status: ClusterAutoscalerNotNeeded, - Message: "SCALE_UP_MESSAGE"} - return healthCondition, scaleUpCondition -} - -func TestGetStringForEmptyStatus(t *testing.T) { - var empty ClusterAutoscalerStatus - assert.Regexp(t, regexp.MustCompile("\\s*Health:\\s*"), empty.GetReadableString()) -} - -func TestGetStringNothingGoingOn(t *testing.T) { - var status ClusterAutoscalerStatus - healthCondition, scaleUpCondition := prepareConditions() - status.ClusterwideConditions = append(status.ClusterwideConditions, healthCondition) - status.ClusterwideConditions = append(status.ClusterwideConditions, scaleUpCondition) - - // Make sure everything is printed - result := status.GetReadableString() - assert.Regexp(t, regexp.MustCompile(fmt.Sprintf("%v:\\s*%v", ClusterAutoscalerHealth, ClusterAutoscalerHealthy)), result) - assert.Regexp(t, regexp.MustCompile(fmt.Sprintf("%v.*HEALTH_MESSAGE", ClusterAutoscalerHealth)), result) - assert.NotRegexp(t, regexp.MustCompile(fmt.Sprintf("%v.*SCALE_UP_MESSAGE", ClusterAutoscalerHealth)), result) - assert.NotRegexp(t, regexp.MustCompile("NodeGroups"), result) - assert.Regexp(t, regexp.MustCompile(fmt.Sprintf("%v:\\s*%v", ClusterAutoscalerScaleUp, ClusterAutoscalerNotNeeded)), result) - - // Check if reordering fields doesn't change output - var reorderedStatus ClusterAutoscalerStatus - reorderedStatus.ClusterwideConditions = append(status.ClusterwideConditions, scaleUpCondition) - reorderedStatus.ClusterwideConditions = append(status.ClusterwideConditions, healthCondition) - reorderedResult := reorderedStatus.GetReadableString() - assert.Equal(t, result, reorderedResult) -} - -func TestGetStringScalingUp(t *testing.T) { - var status ClusterAutoscalerStatus - 
healthCondition, scaleUpCondition := prepareConditions()
-	scaleUpCondition.Status = ClusterAutoscalerInProgress
-	status.ClusterwideConditions = append(status.ClusterwideConditions, healthCondition)
-	status.ClusterwideConditions = append(status.ClusterwideConditions, scaleUpCondition)
-	result := status.GetReadableString()
-	assert.Regexp(t, regexp.MustCompile(fmt.Sprintf("%v:\\s*%v.*SCALE_UP_MESSAGE", ClusterAutoscalerScaleUp, ClusterAutoscalerInProgress)), result)
-}
-
-func TestGetStringNodeGroups(t *testing.T) {
-	var status ClusterAutoscalerStatus
-	healthCondition, scaleUpCondition := prepareConditions()
-	status.ClusterwideConditions = append(status.ClusterwideConditions, healthCondition)
-	status.ClusterwideConditions = append(status.ClusterwideConditions, scaleUpCondition)
-	var ng1, ng2 NodeGroupStatus
-	ng1.ProviderID = "ng1"
-	ng1.Conditions = status.ClusterwideConditions
-	ng2.ProviderID = "ng2"
-	ng2.Conditions = status.ClusterwideConditions
-	status.NodeGroupStatuses = append(status.NodeGroupStatuses, ng1)
-	status.NodeGroupStatuses = append(status.NodeGroupStatuses, ng2)
-	result := status.GetReadableString()
-	assert.Regexp(t, regexp.MustCompile("(?ms)NodeGroups:.*Name:\\s*ng1"), result)
-	assert.Regexp(t, regexp.MustCompile("(?ms)NodeGroups:.*Name:\\s*ng2"), result)
-}
diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go
index 2e3547dd77bc..67deb6595c4c 100644
--- a/cluster-autoscaler/clusterstate/clusterstate.go
+++ b/cluster-autoscaler/clusterstate/clusterstate.go
@@ -45,6 +45,10 @@ import (
 const (
 	// MaxNodeStartupTime is the maximum time from the moment the node is registered to the time the node is ready.
 	MaxNodeStartupTime = 15 * time.Minute
+	// maxErrorMessageSize is the maximum size of error messages displayed in the config map, as the max size of a configmap is 1MB.
+	maxErrorMessageSize = 500
+	// messageTrancated is displayed at the end of a truncated message.
+	messageTrancated = "<truncated>"
 )
 
 var (
@@ -152,11 +156,6 @@ type NodeGroupScalingSafety struct {
 
 // NewClusterStateRegistry creates new ClusterStateRegistry.
 func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config ClusterStateRegistryConfig, logRecorder *utils.LogEventRecorder, backoff backoff.Backoff, nodeGroupConfigProcessor nodegroupconfig.NodeGroupConfigProcessor) *ClusterStateRegistry {
-	emptyStatus := &api.ClusterAutoscalerStatus{
-		ClusterwideConditions: make([]api.ClusterAutoscalerCondition, 0),
-		NodeGroupStatuses:     make([]api.NodeGroupStatus, 0),
-	}
-
 	return &ClusterStateRegistry{
 		scaleUpRequests:       make(map[string]*ScaleUpRequest),
 		scaleDownRequests:     make([]*ScaleDownRequest, 0),
@@ -170,7 +169,7 @@ func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config C
 		deletedNodes:                    make(map[string]struct{}),
 		candidatesForScaleDown:          make(map[string][]string),
 		backoff:                         backoff,
-		lastStatus:                      emptyStatus,
+		lastStatus:                      utils.EmptyClusterAutoscalerStatus(),
 		logRecorder:                     logRecorder,
 		cloudProviderNodeInstancesCache: utils.NewCloudProviderNodeInstancesCache(cloudProvider),
 		interrupt:                       make(chan struct{}),
@@ -456,8 +455,8 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
 	metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned)
 }
 
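The two constants above bound how much of a backoff error message ends up in the status ConfigMap. A standalone sketch of the truncation rule they imply (the real helper, truncateIfExceedMaxLength, is added near the bottom of this file; the suffix value matches the constant above):

package main

import "fmt"

// truncate keeps messages within maxLength, reserving room for the
// "<truncated>" marker; if even the marker does not fit, the message is
// simply cut at maxLength.
func truncate(s string, maxLength int) string {
	const suffix = "<truncated>"
	if len(s) <= maxLength {
		return s
	}
	keep := maxLength - len(suffix)
	if keep < 0 {
		return s[:maxLength]
	}
	return s[:keep] + suffix
}

func main() {
	fmt.Println(truncate("Quota 'CPUS' exceeded", 500)) // short enough: unchanged
	fmt.Println(truncate("Some long message", 16))      // "Some <truncated>"
}

-// IsNodeGroupSafeToScaleUp returns information about node group safety to be scaled up now.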
-func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety {
+// NodeGroupScaleUpSafety returns information about whether it is currently safe to scale up the node group.
+func (csr *ClusterStateRegistry) NodeGroupScaleUpSafety(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety {
 	isHealthy := csr.IsNodeGroupHealthy(nodeGroup.Id())
 	backoffStatus := csr.backoff.BackoffStatus(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now)
 	return NodeGroupScalingSafety{SafeToScale: isHealthy && !backoffStatus.IsBackedOff, Healthy: isHealthy, BackoffStatus: backoffStatus}
@@ -757,42 +756,45 @@ func (csr *ClusterStateRegistry) UpdateScaleDownCandidates(nodes []*apiv1.Node,
 
 // GetStatus returns ClusterAutoscalerStatus with the current cluster autoscaler status.
 func (csr *ClusterStateRegistry) GetStatus(now time.Time) *api.ClusterAutoscalerStatus {
 	result := &api.ClusterAutoscalerStatus{
-		ClusterwideConditions: make([]api.ClusterAutoscalerCondition, 0),
-		NodeGroupStatuses:     make([]api.NodeGroupStatus, 0),
+		AutoscalerStatus: api.ClusterAutoscalerRunning,
+		NodeGroups:       make([]api.NodeGroupStatus, 0),
+	}
+	nodeGroupsLastStatus := make(map[string]api.NodeGroupStatus)
+	for _, nodeGroup := range csr.lastStatus.NodeGroups {
+		nodeGroupsLastStatus[nodeGroup.Name] = nodeGroup
 	}
 	for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
 		nodeGroupStatus := api.NodeGroupStatus{
-			ProviderID: nodeGroup.Id(),
-			Conditions: make([]api.ClusterAutoscalerCondition, 0),
+			Name: nodeGroup.Id(),
 		}
 		readiness := csr.perNodeGroupReadiness[nodeGroup.Id()]
 		acceptable := csr.acceptableRanges[nodeGroup.Id()]
+		nodeGroupLastStatus := nodeGroupsLastStatus[nodeGroup.Id()]
+
 		// Health.
-		nodeGroupStatus.Conditions = append(nodeGroupStatus.Conditions, buildHealthStatusNodeGroup(
-			csr.IsNodeGroupHealthy(nodeGroup.Id()), readiness, acceptable, nodeGroup.MinSize(), nodeGroup.MaxSize()))
+		nodeGroupStatus.Health = buildHealthStatusNodeGroup(
+			csr.IsNodeGroupHealthy(nodeGroup.Id()), readiness, acceptable, nodeGroup.MinSize(), nodeGroup.MaxSize(), nodeGroupLastStatus.Health)
 
 		// Scale up.
-		nodeGroupStatus.Conditions = append(nodeGroupStatus.Conditions, buildScaleUpStatusNodeGroup(
-			csr.IsNodeGroupScalingUp(nodeGroup.Id()),
-			csr.IsNodeGroupSafeToScaleUp(nodeGroup, now),
+		nodeGroupStatus.ScaleUp = csr.buildScaleUpStatusNodeGroup(
+			nodeGroup,
 			readiness,
-			acceptable))
+			acceptable, now, nodeGroupLastStatus.ScaleUp)
 
 		// Scale down.
- nodeGroupStatus.Conditions = append(nodeGroupStatus.Conditions, buildScaleDownStatusNodeGroup( - csr.candidatesForScaleDown[nodeGroup.Id()], csr.lastScaleDownUpdateTime)) + nodeGroupStatus.ScaleDown = buildScaleDownStatusNodeGroup( + csr.candidatesForScaleDown[nodeGroup.Id()], csr.lastScaleDownUpdateTime, nodeGroupLastStatus.ScaleDown) - result.NodeGroupStatuses = append(result.NodeGroupStatuses, nodeGroupStatus) + result.NodeGroups = append(result.NodeGroups, nodeGroupStatus) } - result.ClusterwideConditions = append(result.ClusterwideConditions, - buildHealthStatusClusterwide(csr.IsClusterHealthy(), csr.totalReadiness)) - result.ClusterwideConditions = append(result.ClusterwideConditions, - buildScaleUpStatusClusterwide(result.NodeGroupStatuses, csr.totalReadiness)) - result.ClusterwideConditions = append(result.ClusterwideConditions, - buildScaleDownStatusClusterwide(csr.candidatesForScaleDown, csr.lastScaleDownUpdateTime)) + result.ClusterWide.Health = + buildHealthStatusClusterwide(csr.IsClusterHealthy(), csr.totalReadiness, csr.lastStatus.ClusterWide.Health) + result.ClusterWide.ScaleUp = + buildScaleUpStatusClusterwide(result.NodeGroups, csr.totalReadiness, csr.lastStatus.ClusterWide.ScaleUp) + result.ClusterWide.ScaleDown = + buildScaleDownStatusClusterwide(csr.candidatesForScaleDown, csr.lastScaleDownUpdateTime, csr.lastStatus.ClusterWide.ScaleDown) - updateLastTransition(csr.lastStatus, result) csr.lastStatus = result return result } @@ -802,35 +804,48 @@ func (csr *ClusterStateRegistry) GetClusterReadiness() Readiness { return csr.totalReadiness } -func buildHealthStatusNodeGroup(isHealthy bool, readiness Readiness, acceptable AcceptableRange, minSize, maxSize int) api.ClusterAutoscalerCondition { - condition := api.ClusterAutoscalerCondition{ - Type: api.ClusterAutoscalerHealth, - Message: fmt.Sprintf("ready=%d unready=%d (resourceUnready=%d) notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)", - len(readiness.Ready), - len(readiness.Unready), - len(readiness.ResourceUnready), - len(readiness.NotStarted), - len(readiness.Registered), - len(readiness.LongUnregistered), - acceptable.CurrentTarget, - minSize, - maxSize), - LastProbeTime: metav1.Time{Time: readiness.Time}, +func buildNodeCount(readiness Readiness) api.NodeCount { + return api.NodeCount{ + Registered: api.RegisteredNodeCount{ + Total: len(readiness.Registered), + Ready: len(readiness.Ready), + NotStarted: len(readiness.NotStarted), + BeingDeleted: len(readiness.Deleted), + Unready: api.RegisteredUnreadyNodeCount{ + Total: len(readiness.Unready), + ResourceUnready: len(readiness.ResourceUnready), + }, + }, + LongUnregistered: len(readiness.LongUnregistered), + Unregistered: len(readiness.Unregistered), + } +} + +func buildHealthStatusNodeGroup(isHealthy bool, readiness Readiness, acceptable AcceptableRange, minSize, maxSize int, lastStatus api.NodeGroupHealthCondition) api.NodeGroupHealthCondition { + condition := api.NodeGroupHealthCondition{ + NodeCounts: buildNodeCount(readiness), + CloudProviderTarget: acceptable.CurrentTarget, + MinSize: minSize, + MaxSize: maxSize, + LastProbeTime: metav1.Time{Time: readiness.Time}, } if isHealthy { condition.Status = api.ClusterAutoscalerHealthy } else { condition.Status = api.ClusterAutoscalerUnhealthy } + if condition.Status == lastStatus.Status { + condition.LastTransitionTime = lastStatus.LastTransitionTime + } else { + condition.LastTransitionTime = condition.LastProbeTime + } return condition } -func 
buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, scaleUpSafety NodeGroupScalingSafety, readiness Readiness, acceptable AcceptableRange) api.ClusterAutoscalerCondition { - condition := api.ClusterAutoscalerCondition{ - Type: api.ClusterAutoscalerScaleUp, - Message: fmt.Sprintf("ready=%d cloudProviderTarget=%d", - len(readiness.Ready), - acceptable.CurrentTarget), +func (csr *ClusterStateRegistry) buildScaleUpStatusNodeGroup(nodeGroup cloudprovider.NodeGroup, readiness Readiness, acceptable AcceptableRange, now time.Time, lastStatus api.NodeGroupScaleUpCondition) api.NodeGroupScaleUpCondition { + isScaleUpInProgress := csr.IsNodeGroupScalingUp(nodeGroup.Id()) + scaleUpSafety := csr.NodeGroupScaleUpSafety(nodeGroup, now) + condition := api.NodeGroupScaleUpCondition{ LastProbeTime: metav1.Time{Time: readiness.Time}, } if isScaleUpInProgress { @@ -839,16 +854,24 @@ func buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, scaleUpSafety NodeGro condition.Status = api.ClusterAutoscalerUnhealthy } else if !scaleUpSafety.SafeToScale { condition.Status = api.ClusterAutoscalerBackoff + condition.BackoffInfo = api.BackoffInfo{ + ErrorCode: scaleUpSafety.BackoffStatus.ErrorInfo.ErrorCode, + ErrorMessage: truncateIfExceedMaxLength(scaleUpSafety.BackoffStatus.ErrorInfo.ErrorMessage, maxErrorMessageSize), + } } else { condition.Status = api.ClusterAutoscalerNoActivity } + if condition.Status == lastStatus.Status { + condition.LastTransitionTime = lastStatus.LastTransitionTime + } else { + condition.LastTransitionTime = condition.LastProbeTime + } return condition } -func buildScaleDownStatusNodeGroup(candidates []string, lastProbed time.Time) api.ClusterAutoscalerCondition { - condition := api.ClusterAutoscalerCondition{ - Type: api.ClusterAutoscalerScaleDown, - Message: fmt.Sprintf("candidates=%d", len(candidates)), +func buildScaleDownStatusNodeGroup(candidates []string, lastProbed time.Time, lastStatus api.ScaleDownCondition) api.ScaleDownCondition { + condition := api.ScaleDownCondition{ + Candidates: len(candidates), LastProbeTime: metav1.Time{Time: lastProbed}, } if len(candidates) > 0 { @@ -856,46 +879,41 @@ func buildScaleDownStatusNodeGroup(candidates []string, lastProbed time.Time) ap } else { condition.Status = api.ClusterAutoscalerNoCandidates } + if condition.Status == lastStatus.Status { + condition.LastTransitionTime = lastStatus.LastTransitionTime + } else { + condition.LastTransitionTime = condition.LastProbeTime + } return condition } -func buildHealthStatusClusterwide(isReady bool, readiness Readiness) api.ClusterAutoscalerCondition { - condition := api.ClusterAutoscalerCondition{ - Type: api.ClusterAutoscalerHealth, - Message: fmt.Sprintf("ready=%d unready=%d (resourceUnready=%d) notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d", - len(readiness.Ready), - len(readiness.Unready), - len(readiness.ResourceUnready), - len(readiness.NotStarted), - len(readiness.Registered), - len(readiness.LongUnregistered), - ), +func buildHealthStatusClusterwide(isHealthy bool, readiness Readiness, lastStatus api.ClusterHealthCondition) api.ClusterHealthCondition { + condition := api.ClusterHealthCondition{ + NodeCounts: buildNodeCount(readiness), LastProbeTime: metav1.Time{Time: readiness.Time}, } - if isReady { + if isHealthy { condition.Status = api.ClusterAutoscalerHealthy } else { condition.Status = api.ClusterAutoscalerUnhealthy } + if condition.Status == lastStatus.Status { + condition.LastTransitionTime = lastStatus.LastTransitionTime + } else { + condition.LastTransitionTime 
= condition.LastProbeTime + } return condition } -func buildScaleUpStatusClusterwide(nodeGroupStatuses []api.NodeGroupStatus, readiness Readiness) api.ClusterAutoscalerCondition { +func buildScaleUpStatusClusterwide(nodeGroupsStatuses []api.NodeGroupStatus, readiness Readiness, lastStatus api.ClusterScaleUpCondition) api.ClusterScaleUpCondition { isScaleUpInProgress := false - for _, nodeGroupStatuses := range nodeGroupStatuses { - for _, condition := range nodeGroupStatuses.Conditions { - if condition.Type == api.ClusterAutoscalerScaleUp && - condition.Status == api.ClusterAutoscalerInProgress { - isScaleUpInProgress = true - } + for _, nodeGroupStatus := range nodeGroupsStatuses { + if nodeGroupStatus.ScaleUp.Status == api.ClusterAutoscalerInProgress { + isScaleUpInProgress = true + break } } - - condition := api.ClusterAutoscalerCondition{ - Type: api.ClusterAutoscalerScaleUp, - Message: fmt.Sprintf("ready=%d registered=%d", - len(readiness.Ready), - len(readiness.Registered)), + condition := api.ClusterScaleUpCondition{ LastProbeTime: metav1.Time{Time: readiness.Time}, } if isScaleUpInProgress { @@ -903,17 +921,21 @@ func buildScaleUpStatusClusterwide(nodeGroupStatuses []api.NodeGroupStatus, read } else { condition.Status = api.ClusterAutoscalerNoActivity } + if condition.Status == lastStatus.Status { + condition.LastTransitionTime = lastStatus.LastTransitionTime + } else { + condition.LastTransitionTime = condition.LastProbeTime + } return condition } -func buildScaleDownStatusClusterwide(candidates map[string][]string, lastProbed time.Time) api.ClusterAutoscalerCondition { +func buildScaleDownStatusClusterwide(candidates map[string][]string, lastProbed time.Time, lastStatus api.ScaleDownCondition) api.ScaleDownCondition { totalCandidates := 0 for _, val := range candidates { totalCandidates += len(val) } - condition := api.ClusterAutoscalerCondition{ - Type: api.ClusterAutoscalerScaleDown, - Message: fmt.Sprintf("candidates=%d", totalCandidates), + condition := api.ScaleDownCondition{ + Candidates: totalCandidates, LastProbeTime: metav1.Time{Time: lastProbed}, } if totalCandidates > 0 { @@ -921,48 +943,12 @@ func buildScaleDownStatusClusterwide(candidates map[string][]string, lastProbed } else { condition.Status = api.ClusterAutoscalerNoCandidates } - return condition -} - -func updateLastTransition(oldStatus, newStatus *api.ClusterAutoscalerStatus) { - newStatus.ClusterwideConditions = updateLastTransitionSingleList( - oldStatus.ClusterwideConditions, newStatus.ClusterwideConditions) - updatedNgStatuses := make([]api.NodeGroupStatus, 0) - for _, ngStatus := range newStatus.NodeGroupStatuses { - oldConds := make([]api.ClusterAutoscalerCondition, 0) - for _, oldNgStatus := range oldStatus.NodeGroupStatuses { - if ngStatus.ProviderID == oldNgStatus.ProviderID { - oldConds = oldNgStatus.Conditions - break - } - } - newConds := updateLastTransitionSingleList(oldConds, ngStatus.Conditions) - updatedNgStatuses = append( - updatedNgStatuses, - api.NodeGroupStatus{ - ProviderID: ngStatus.ProviderID, - Conditions: newConds, - }) - } - newStatus.NodeGroupStatuses = updatedNgStatuses -} - -func updateLastTransitionSingleList(oldConds, newConds []api.ClusterAutoscalerCondition) []api.ClusterAutoscalerCondition { - result := make([]api.ClusterAutoscalerCondition, 0) - // We have ~3 conditions, so O(n**2) is good enough - for _, condition := range newConds { + if condition.Status == lastStatus.Status { + condition.LastTransitionTime = lastStatus.LastTransitionTime + } else { 
condition.LastTransitionTime = condition.LastProbeTime - for _, oldCondition := range oldConds { - if condition.Type == oldCondition.Type { - if condition.Status == oldCondition.Status { - condition.LastTransitionTime = oldCondition.LastTransitionTime - } - break - } - } - result = append(result, condition) } - return result + return condition } // GetIncorrectNodeGroupSize gets IncorrectNodeGroupSizeInformation for the given node group. @@ -1269,3 +1255,14 @@ func (csr *ClusterStateRegistry) GetScaleUpFailures() map[string][]ScaleUpFailur } return result } + +func truncateIfExceedMaxLength(s string, maxLength int) string { + if len(s) <= maxLength { + return s + } + untrancatedLen := maxLength - len(messageTrancated) + if untrancatedLen < 0 { + return s[:maxLength] + } + return s[:untrancatedLen] + messageTrancated +} diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go index 5100c1446f23..a57a6a7aceaa 100644 --- a/cluster-autoscaler/clusterstate/clusterstate_test.go +++ b/cluster-autoscaler/clusterstate/clusterstate_test.go @@ -82,20 +82,17 @@ func TestOKWithScaleUp(t *testing.T) { assert.Empty(t, clusterstate.GetScaleUpFailures()) status := clusterstate.GetStatus(now) - assert.Equal(t, api.ClusterAutoscalerInProgress, - api.GetConditionByType(api.ClusterAutoscalerScaleUp, status.ClusterwideConditions).Status) - assert.Equal(t, 2, len(status.NodeGroupStatuses)) + assert.Equal(t, api.ClusterAutoscalerInProgress, status.ClusterWide.ScaleUp.Status) + assert.Equal(t, 2, len(status.NodeGroups)) ng1Checked := false ng2Checked := true - for _, nodeStatus := range status.NodeGroupStatuses { - if nodeStatus.ProviderID == "ng1" { - assert.Equal(t, api.ClusterAutoscalerInProgress, - api.GetConditionByType(api.ClusterAutoscalerScaleUp, nodeStatus.Conditions).Status) + for _, nodeGroupStatus := range status.NodeGroups { + if nodeGroupStatus.Name == "ng1" { + assert.Equal(t, api.ClusterAutoscalerInProgress, nodeGroupStatus.ScaleUp.Status) ng1Checked = true } - if nodeStatus.ProviderID == "ng2" { - assert.Equal(t, api.ClusterAutoscalerNoActivity, - api.GetConditionByType(api.ClusterAutoscalerScaleUp, nodeStatus.Conditions).Status) + if nodeGroupStatus.Name == "ng2" { + assert.Equal(t, api.ClusterAutoscalerNoActivity, nodeGroupStatus.ScaleUp.Status) ng2Checked = true } } @@ -206,17 +203,14 @@ func TestOKOneUnreadyNode(t *testing.T) { assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) status := clusterstate.GetStatus(now) - assert.Equal(t, api.ClusterAutoscalerHealthy, - api.GetConditionByType(api.ClusterAutoscalerHealth, status.ClusterwideConditions).Status) - assert.Equal(t, api.ClusterAutoscalerNoActivity, - api.GetConditionByType(api.ClusterAutoscalerScaleUp, status.ClusterwideConditions).Status) + assert.Equal(t, api.ClusterAutoscalerHealthy, status.ClusterWide.Health.Status) + assert.Equal(t, api.ClusterAutoscalerNoActivity, status.ClusterWide.ScaleUp.Status) - assert.Equal(t, 2, len(status.NodeGroupStatuses)) + assert.Equal(t, 2, len(status.NodeGroups)) ng1Checked := false - for _, nodeStatus := range status.NodeGroupStatuses { - if nodeStatus.ProviderID == "ng1" { - assert.Equal(t, api.ClusterAutoscalerHealthy, - api.GetConditionByType(api.ClusterAutoscalerHealth, nodeStatus.Conditions).Status) + for _, nodeGroupStatus := range status.NodeGroups { + if nodeGroupStatus.Name == "ng1" { + assert.Equal(t, api.ClusterAutoscalerHealthy, nodeGroupStatus.Health.Status) ng1Checked = true } } @@ -273,33 +267,22 @@ func 
TestOKOneUnreadyNodeWithScaleDownCandidate(t *testing.T) { assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) status := clusterstate.GetStatus(now) - assert.Equal(t, api.ClusterAutoscalerHealthy, - api.GetConditionByType(api.ClusterAutoscalerHealth, status.ClusterwideConditions).Status) - assert.Equal(t, api.ClusterAutoscalerNoActivity, - api.GetConditionByType(api.ClusterAutoscalerScaleUp, status.ClusterwideConditions).Status) - assert.Equal(t, api.ClusterAutoscalerCandidatesPresent, - api.GetConditionByType(api.ClusterAutoscalerScaleDown, status.ClusterwideConditions).Status) - - assert.Equal(t, 2, len(status.NodeGroupStatuses)) + assert.Equal(t, api.ClusterAutoscalerHealthy, status.ClusterWide.Health.Status) + assert.Equal(t, api.ClusterAutoscalerNoActivity, status.ClusterWide.ScaleUp.Status) + assert.Equal(t, api.ClusterAutoscalerCandidatesPresent, status.ClusterWide.ScaleDown.Status) + + assert.Equal(t, 2, len(status.NodeGroups)) ng1Checked := false ng2Checked := false - for _, nodeStatus := range status.NodeGroupStatuses { - if nodeStatus.ProviderID == "ng1" { - assert.Equal(t, api.ClusterAutoscalerHealthy, - api.GetConditionByType(api.ClusterAutoscalerHealth, nodeStatus.Conditions).Status) - - assert.Equal(t, api.ClusterAutoscalerCandidatesPresent, - api.GetConditionByType(api.ClusterAutoscalerScaleDown, nodeStatus.Conditions).Status) - + for _, nodeGroupStatus := range status.NodeGroups { + if nodeGroupStatus.Name == "ng1" { + assert.Equal(t, api.ClusterAutoscalerHealthy, nodeGroupStatus.Health.Status) + assert.Equal(t, api.ClusterAutoscalerCandidatesPresent, nodeGroupStatus.ScaleDown.Status) ng1Checked = true } - if nodeStatus.ProviderID == "ng2" { - assert.Equal(t, api.ClusterAutoscalerHealthy, - api.GetConditionByType(api.ClusterAutoscalerHealth, nodeStatus.Conditions).Status) - - assert.Equal(t, api.ClusterAutoscalerNoCandidates, - api.GetConditionByType(api.ClusterAutoscalerScaleDown, nodeStatus.Conditions).Status) - + if nodeGroupStatus.Name == "ng2" { + assert.Equal(t, api.ClusterAutoscalerHealthy, nodeGroupStatus.Health.Status) + assert.Equal(t, api.ClusterAutoscalerNoCandidates, nodeGroupStatus.ScaleDown.Status) ng2Checked = true } } @@ -336,14 +319,12 @@ func TestMissingNodes(t *testing.T) { assert.False(t, clusterstate.IsNodeGroupHealthy("ng1")) status := clusterstate.GetStatus(now) - assert.Equal(t, api.ClusterAutoscalerHealthy, - api.GetConditionByType(api.ClusterAutoscalerHealth, status.ClusterwideConditions).Status) - assert.Equal(t, 2, len(status.NodeGroupStatuses)) + assert.Equal(t, api.ClusterAutoscalerHealthy, status.ClusterWide.Health.Status) + assert.Equal(t, 2, len(status.NodeGroups)) ng1Checked := false - for _, nodeStatus := range status.NodeGroupStatuses { - if nodeStatus.ProviderID == "ng1" { - assert.Equal(t, api.ClusterAutoscalerUnhealthy, - api.GetConditionByType(api.ClusterAutoscalerHealth, nodeStatus.Conditions).Status) + for _, nodeGroupStatus := range status.NodeGroups { + if nodeGroupStatus.Name == "ng1" { + assert.Equal(t, api.ClusterAutoscalerUnhealthy, nodeGroupStatus.Health.Status) ng1Checked = true } } @@ -787,111 +768,6 @@ func TestCloudProviderDeletedNodes(t *testing.T) { assert.Equal(t, 0, len(GetCloudProviderDeletedNodeNames(clusterstate))) } -func TestUpdateLastTransitionTimes(t *testing.T) { - now := metav1.Time{Time: time.Now()} - later := metav1.Time{Time: now.Time.Add(10 * time.Second)} - oldStatus := &api.ClusterAutoscalerStatus{ - ClusterwideConditions: make([]api.ClusterAutoscalerCondition, 0), - NodeGroupStatuses: 
make([]api.NodeGroupStatus, 0), - } - oldStatus.ClusterwideConditions = append( - oldStatus.ClusterwideConditions, - api.ClusterAutoscalerCondition{ - Type: api.ClusterAutoscalerHealth, - Status: api.ClusterAutoscalerHealthy, - LastProbeTime: now, - LastTransitionTime: now, - }) - oldStatus.ClusterwideConditions = append( - oldStatus.ClusterwideConditions, - api.ClusterAutoscalerCondition{ - Type: api.ClusterAutoscalerScaleUp, - Status: api.ClusterAutoscalerInProgress, - LastProbeTime: now, - LastTransitionTime: now, - }) - oldStatus.NodeGroupStatuses = append( - oldStatus.NodeGroupStatuses, - api.NodeGroupStatus{ - ProviderID: "ng1", - Conditions: oldStatus.ClusterwideConditions, - }) - - newStatus := &api.ClusterAutoscalerStatus{ - ClusterwideConditions: make([]api.ClusterAutoscalerCondition, 0), - NodeGroupStatuses: make([]api.NodeGroupStatus, 0), - } - newStatus.ClusterwideConditions = append( - newStatus.ClusterwideConditions, - api.ClusterAutoscalerCondition{ - Type: api.ClusterAutoscalerHealth, - Status: api.ClusterAutoscalerHealthy, - LastProbeTime: later, - }) - newStatus.ClusterwideConditions = append( - newStatus.ClusterwideConditions, - api.ClusterAutoscalerCondition{ - Type: api.ClusterAutoscalerScaleUp, - Status: api.ClusterAutoscalerNotNeeded, - LastProbeTime: later, - }) - newStatus.ClusterwideConditions = append( - newStatus.ClusterwideConditions, - api.ClusterAutoscalerCondition{ - Type: api.ClusterAutoscalerScaleDown, - Status: api.ClusterAutoscalerNoCandidates, - LastProbeTime: later, - }) - newStatus.NodeGroupStatuses = append( - newStatus.NodeGroupStatuses, - api.NodeGroupStatus{ - ProviderID: "ng2", - Conditions: newStatus.ClusterwideConditions, - }) - newStatus.NodeGroupStatuses = append( - newStatus.NodeGroupStatuses, - api.NodeGroupStatus{ - ProviderID: "ng1", - Conditions: newStatus.ClusterwideConditions, - }) - updateLastTransition(oldStatus, newStatus) - - for _, cwCondition := range newStatus.ClusterwideConditions { - switch cwCondition.Type { - case api.ClusterAutoscalerHealth: - // Status has not changed - assert.Equal(t, now, cwCondition.LastTransitionTime) - case api.ClusterAutoscalerScaleUp: - // Status has changed - assert.Equal(t, later, cwCondition.LastTransitionTime) - case api.ClusterAutoscalerScaleDown: - // No old status information - assert.Equal(t, later, cwCondition.LastTransitionTime) - } - } - - expectedNgTimestamps := make(map[string]map[api.ClusterAutoscalerConditionType]metav1.Time, 0) - // Same as cluster-wide - expectedNgTimestamps["ng1"] = map[api.ClusterAutoscalerConditionType]metav1.Time{ - api.ClusterAutoscalerHealth: now, - api.ClusterAutoscalerScaleUp: later, - api.ClusterAutoscalerScaleDown: later, - } - // New node group - everything should have latest timestamp as last transition time - expectedNgTimestamps["ng2"] = map[api.ClusterAutoscalerConditionType]metav1.Time{ - api.ClusterAutoscalerHealth: later, - api.ClusterAutoscalerScaleUp: later, - api.ClusterAutoscalerScaleDown: later, - } - - for _, ng := range newStatus.NodeGroupStatuses { - expectations := expectedNgTimestamps[ng.ProviderID] - for _, ngCondition := range ng.Conditions { - assert.Equal(t, expectations[ngCondition.Type], ngCondition.LastTransitionTime) - } - } -} - func TestScaleUpBackoff(t *testing.T) { now := time.Now() @@ -934,7 +810,7 @@ func TestScaleUpBackoff(t *testing.T) { ErrorMessage: "Scale-up timed out for node group ng1 after 3m0s", }, }, - }, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) + }, clusterstate.NodeGroupScaleUpSafety(ng1, now)) 
assert.Equal(t, backoff.Status{ IsBackedOff: true, ErrorInfo: cloudprovider.InstanceErrorInfo{ @@ -947,7 +823,7 @@ func TestScaleUpBackoff(t *testing.T) { now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second) assert.True(t, clusterstate.IsClusterHealthy()) assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) - assert.Equal(t, NodeGroupScalingSafety{SafeToScale: true, Healthy: true}, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) + assert.Equal(t, NodeGroupScalingSafety{SafeToScale: true, Healthy: true}, clusterstate.NodeGroupScaleUpSafety(ng1, now)) // Another failed scale up should cause longer backoff clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, now.Add(-121*time.Second)) @@ -967,7 +843,7 @@ func TestScaleUpBackoff(t *testing.T) { ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s", }, }, - }, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) + }, clusterstate.NodeGroupScaleUpSafety(ng1, now)) now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second) assert.Equal(t, NodeGroupScalingSafety{ @@ -981,7 +857,7 @@ func TestScaleUpBackoff(t *testing.T) { ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s", }, }, - }, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) + }, clusterstate.NodeGroupScaleUpSafety(ng1, now)) // The backoff should be cleared after a successful scale-up clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, now) @@ -992,7 +868,7 @@ func TestScaleUpBackoff(t *testing.T) { assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) - assert.Equal(t, NodeGroupScalingSafety{SafeToScale: true, Healthy: true}, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) + assert.Equal(t, NodeGroupScalingSafety{SafeToScale: true, Healthy: true}, clusterstate.NodeGroupScaleUpSafety(ng1, now)) assert.Equal(t, backoff.Status{IsBackedOff: false}, clusterstate.backoff.BackoffStatus(ng1, nil, now)) } @@ -1449,3 +1325,44 @@ func TestUpdateIncorrectNodeGroupSizes(t *testing.T) { }) } } + +func TestTruncateIfExceedMaxSize(t *testing.T) { + testCases := []struct { + name string + message string + maxSize int + wantMessage string + }{ + { + name: "Message doesn't exceed maxSize", + message: "Some message", + maxSize: len("Some message"), + wantMessage: "Some message", + }, + { + name: "Message exceeds maxSize", + message: "Some long message", + maxSize: len("Some long message") - 1, + wantMessage: "Some ", + }, + { + name: "Message doesn't exceed maxSize and maxSize is smaller than truncatedMessageSuffix length", + message: "msg", + maxSize: len("msg"), + wantMessage: "msg", + }, + { + name: "Message exceeds maxSize and maxSize is smaller than truncatedMessageSuffix length", + message: "msg", + maxSize: 2, + wantMessage: "ms", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := truncateIfExceedMaxLength(tc.message, tc.maxSize) + assert.Equal(t, tc.wantMessage, got) + }) + } +} diff --git a/cluster-autoscaler/clusterstate/utils/status.go b/cluster-autoscaler/clusterstate/utils/status.go index c1917b44b1c9..d0ba9be423e3 100644 --- a/cluster-autoscaler/clusterstate/utils/status.go +++ b/cluster-autoscaler/clusterstate/utils/status.go @@ -22,10 +22,12 @@ import ( "fmt" "time" + "gopkg.in/yaml.v2" apiv1 "k8s.io/api/core/v1" kube_errors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + 
"k8s.io/autoscaler/cluster-autoscaler/clusterstate/api" kube_client "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/record" @@ -60,6 +62,13 @@ func (ler *LogEventRecorder) Eventf(eventtype, reason, message string, args ...i } } +// EmptyClusterAutoscalerStatus returns empty status for ClusterAutoscalerStatus when it is being initialized. +func EmptyClusterAutoscalerStatus() *api.ClusterAutoscalerStatus { + return &api.ClusterAutoscalerStatus{ + AutoscalerStatus: api.ClusterAutoscalerInitializing, + } +} + // NewStatusMapRecorder creates a LogEventRecorder creating events on status configmap. // If the configmap doesn't exist it will be created (with 'Initializing' status). // If active == false the map will not be created and no events will be recorded. @@ -67,7 +76,7 @@ func NewStatusMapRecorder(kubeClient kube_client.Interface, namespace string, re var mapObj runtime.Object var err error if active { - mapObj, err = WriteStatusConfigMap(kubeClient, namespace, "Initializing", nil, statusConfigMapName) + mapObj, err = WriteStatusConfigMap(kubeClient, namespace, *EmptyClusterAutoscalerStatus(), nil, statusConfigMapName, time.Now()) if err != nil { return nil, errors.New("Failed to init status ConfigMap") } @@ -82,14 +91,19 @@ func NewStatusMapRecorder(kubeClient kube_client.Interface, namespace string, re // WriteStatusConfigMap writes updates status ConfigMap with a given message or creates a new // ConfigMap if it doesn't exist. If logRecorder is passed and configmap update is successful // logRecorder's internal reference will be updated. -func WriteStatusConfigMap(kubeClient kube_client.Interface, namespace string, msg string, logRecorder *LogEventRecorder, statusConfigMapName string) (*apiv1.ConfigMap, error) { - statusUpdateTime := time.Now().Format(ConfigMapLastUpdateFormat) - statusMsg := fmt.Sprintf("Cluster-autoscaler status at %s:\n%v", statusUpdateTime, msg) +func WriteStatusConfigMap(kubeClient kube_client.Interface, namespace string, status api.ClusterAutoscalerStatus, logRecorder *LogEventRecorder, statusConfigMapName string, currentTime time.Time) (*apiv1.ConfigMap, error) { + statusUpdateTime := currentTime.Format(ConfigMapLastUpdateFormat) + status.Time = statusUpdateTime var configMap *apiv1.ConfigMap var getStatusError, writeStatusError error var errMsg string maps := kubeClient.CoreV1().ConfigMaps(namespace) configMap, getStatusError = maps.Get(context.TODO(), statusConfigMapName, metav1.GetOptions{}) + statusYaml, err := yaml.Marshal(status) + if err != nil { + return nil, fmt.Errorf("Failed to marshal status configmap: %v", err) + } + statusMsg := string(statusYaml) if getStatusError == nil { if configMap.Data == nil { configMap.Data = make(map[string]string) diff --git a/cluster-autoscaler/clusterstate/utils/status_test.go b/cluster-autoscaler/clusterstate/utils/status_test.go index ff96a2f0f00e..60d1823e4e75 100644 --- a/cluster-autoscaler/clusterstate/utils/status_test.go +++ b/cluster-autoscaler/clusterstate/utils/status_test.go @@ -18,12 +18,15 @@ package utils import ( "errors" + "io/ioutil" "testing" + "time" apiv1 "k8s.io/api/core/v1" kube_errors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/autoscaler/cluster-autoscaler/clusterstate/api" "k8s.io/client-go/kubernetes/fake" core "k8s.io/client-go/testing" @@ -87,7 +90,7 @@ func setUpTest(t *testing.T) *testInfo { func TestWriteStatusConfigMapExisting(t *testing.T) { ti := setUpTest(t) - result, err := 
WriteStatusConfigMap(ti.client, ti.namespace, "TEST_MSG", nil, "my-cool-configmap") + result, err := WriteStatusConfigMap(ti.client, ti.namespace, api.ClusterAutoscalerStatus{Message: "TEST_MSG"}, nil, "my-cool-configmap", time.Now()) assert.Equal(t, ti.configMap, result) assert.Contains(t, result.Data["status"], "TEST_MSG") assert.Contains(t, result.ObjectMeta.Annotations, ConfigMapLastUpdatedKey) @@ -98,7 +101,7 @@ func TestWriteStatusConfigMapExisting(t *testing.T) { // to test the case where configmap is empty ti.configMap.Data = nil - result, err = WriteStatusConfigMap(ti.client, ti.namespace, "TEST_MSG", nil, "my-cool-configmap") + result, err = WriteStatusConfigMap(ti.client, ti.namespace, api.ClusterAutoscalerStatus{Message: "TEST_MSG"}, nil, "my-cool-configmap", time.Now()) assert.Equal(t, ti.configMap, result) assert.Contains(t, result.Data["status"], "TEST_MSG") assert.Contains(t, result.ObjectMeta.Annotations, ConfigMapLastUpdatedKey) @@ -111,7 +114,7 @@ func TestWriteStatusConfigMapExisting(t *testing.T) { func TestWriteStatusConfigMapCreate(t *testing.T) { ti := setUpTest(t) ti.getError = kube_errors.NewNotFound(apiv1.Resource("configmap"), "nope, not found") - result, err := WriteStatusConfigMap(ti.client, ti.namespace, "TEST_MSG", nil, "my-cool-configmap") + result, err := WriteStatusConfigMap(ti.client, ti.namespace, api.ClusterAutoscalerStatus{Message: "TEST_MSG"}, nil, "my-cool-configmap", time.Now()) assert.Contains(t, result.Data["status"], "TEST_MSG") assert.Contains(t, result.ObjectMeta.Annotations, ConfigMapLastUpdatedKey) assert.Nil(t, err) @@ -123,7 +126,7 @@ func TestWriteStatusConfigMapCreate(t *testing.T) { func TestWriteStatusConfigMapError(t *testing.T) { ti := setUpTest(t) ti.getError = errors.New("stuff bad") - result, err := WriteStatusConfigMap(ti.client, ti.namespace, "TEST_MSG", nil, "my-cool-configmap") + result, err := WriteStatusConfigMap(ti.client, ti.namespace, api.ClusterAutoscalerStatus{Message: "TEST_MSG"}, nil, "my-cool-configmap", time.Now()) assert.NotNil(t, err) assert.Contains(t, err.Error(), "stuff bad") assert.Nil(t, result) @@ -131,3 +134,94 @@ func TestWriteStatusConfigMapError(t *testing.T) { assert.False(t, ti.updateCalled) assert.False(t, ti.createCalled) } + +var status api.ClusterAutoscalerStatus = api.ClusterAutoscalerStatus{ + Message: "TEST_MSG", + AutoscalerStatus: "Running", + ClusterWide: api.ClusterWideStatus{ + Health: api.ClusterHealthCondition{ + Status: "Healthy", + NodeCounts: api.NodeCount{ + Registered: api.RegisteredNodeCount{ + Total: 10, + Ready: 4, + NotStarted: 3, + BeingDeleted: 1, + Unready: api.RegisteredUnreadyNodeCount{ + Total: 2, + ResourceUnready: 1, + }, + }, + LongUnregistered: 1, + Unregistered: 2, + }, + LastProbeTime: metav1.Date(2023, 11, 24, 04, 28, 19, 48988, time.UTC), + LastTransitionTime: metav1.Date(2023, 11, 23, 14, 52, 02, 11000, time.UTC), + }, + ScaleUp: api.ClusterScaleUpCondition{ + Status: "NoActivity", + LastProbeTime: metav1.Date(2023, 11, 24, 04, 28, 19, 48988, time.UTC), + LastTransitionTime: metav1.Date(2023, 11, 23, 14, 52, 02, 11000, time.UTC), + }, + ScaleDown: api.ScaleDownCondition{ + Status: "NoCandidates", + Candidates: 2, + LastProbeTime: metav1.Date(2023, 11, 24, 04, 28, 19, 48988, time.UTC), + LastTransitionTime: metav1.Date(2023, 11, 23, 14, 52, 02, 11000, time.UTC), + }, + }, + NodeGroups: []api.NodeGroupStatus{{ + Name: "sample-node-group", + Health: api.NodeGroupHealthCondition{ + Status: "Healthy", + NodeCounts: api.NodeCount{ + Registered: api.RegisteredNodeCount{ + 
Total:        10,
+					Ready:        4,
+					NotStarted:   3,
+					BeingDeleted: 1,
+					Unready: api.RegisteredUnreadyNodeCount{
+						Total:           2,
+						ResourceUnready: 1,
+					},
+				},
+				LongUnregistered: 1,
+				Unregistered:     2,
+			},
+			CloudProviderTarget: 8,
+			MinSize:             2,
+			MaxSize:             12,
+			LastProbeTime:       metav1.Date(2023, 11, 24, 04, 28, 19, 48988, time.UTC),
+			LastTransitionTime:  metav1.Date(2023, 11, 23, 14, 52, 02, 11000, time.UTC),
+		},
+		ScaleUp: api.NodeGroupScaleUpCondition{
+			Status: "Backoff",
+			BackoffInfo: api.BackoffInfo{
+				ErrorCode:    "QUOTA_EXCEEDED",
+				ErrorMessage: "Instance 'sample-node-group-40ce0341-t28s' creation failed: Quota 'CPUS' exceeded. Limit: 57.0 in region us-central1.",
+			},
+			LastProbeTime:      metav1.Date(2023, 11, 24, 04, 28, 19, 48988, time.UTC),
+			LastTransitionTime: metav1.Date(2023, 11, 23, 14, 52, 02, 11000, time.UTC),
+		},
+		ScaleDown: api.ScaleDownCondition{
+			Status:             "NoCandidates",
+			Candidates:         2,
+			LastProbeTime:      metav1.Date(2023, 11, 24, 04, 28, 19, 48988, time.UTC),
+			LastTransitionTime: metav1.Date(2023, 11, 23, 14, 52, 02, 11000, time.UTC),
+		},
+	}},
+}
+
+func TestWriteStatusConfigMapMarshal(t *testing.T) {
+	const statusYamlTestFile = "status_test.yaml"
+	ti := setUpTest(t)
+	want, err := ioutil.ReadFile(statusYamlTestFile)
+	if err != nil {
+		t.Fatalf("Failed to read %s: %v", statusYamlTestFile, err)
+	}
+	result, err := WriteStatusConfigMap(ti.client, ti.namespace, status, nil, "my-cool-configmap", time.Date(2023, 11, 24, 4, 28, 19, 546750398, time.UTC))
+	if err != nil {
+		t.Fatalf("Expected WriteStatusConfigMap not to return error, got: %v", err)
+	}
+	assert.YAMLEq(t, string(want), result.Data["status"])
+}
diff --git a/cluster-autoscaler/clusterstate/utils/status_test.yaml b/cluster-autoscaler/clusterstate/utils/status_test.yaml
new file mode 100644
index 000000000000..a1ba0b75845b
--- /dev/null
+++ b/cluster-autoscaler/clusterstate/utils/status_test.yaml
@@ -0,0 +1,61 @@
+time: "2023-11-24 04:28:19.546750398 +0000 UTC"
+message: "TEST_MSG"
+autoscalerStatus: "Running"
+clusterWide:
+  health:
+    status: "Healthy"
+    nodeCounts:
+      registered:
+        total: 10
+        ready: 4
+        notStarted: 3
+        beingDeleted: 1
+        unready:
+          total: 2
+          resourceUnready: 1
+      longUnregistered: 1
+      unregistered: 2
+    lastProbeTime: "2023-11-24T04:28:19.000048988Z"
+    lastTransitionTime: "2023-11-23T14:52:02.000011Z"
+  scaleUp:
+    status: "NoActivity"
+    lastProbeTime: "2023-11-24T04:28:19.000048988Z"
+    lastTransitionTime: "2023-11-23T14:52:02.000011Z"
+  scaleDown:
+    status: "NoCandidates"
+    candidates: 2
+    lastProbeTime: "2023-11-24T04:28:19.000048988Z"
+    lastTransitionTime: "2023-11-23T14:52:02.000011Z"
+nodeGroups:
+  -
+    name: "sample-node-group"
+    health:
+      status: "Healthy"
+      nodeCounts:
+        registered:
+          total: 10
+          ready: 4
+          notStarted: 3
+          beingDeleted: 1
+          unready:
+            total: 2
+            resourceUnready: 1
+        longUnregistered: 1
+        unregistered: 2
+      cloudProviderTarget: 8
+      minSize: 2
+      maxSize: 12
+      lastProbeTime: "2023-11-24T04:28:19.000048988Z"
+      lastTransitionTime: "2023-11-23T14:52:02.000011Z"
+    scaleUp:
+      status: "Backoff"
+      backoffInfo:
+        errorCode: "QUOTA_EXCEEDED"
+        errorMessage: "Instance 'sample-node-group-40ce0341-t28s' creation failed: Quota 'CPUS' exceeded. Limit: 57.0 in region us-central1."
+ lastProbeTime: "2023-11-24T04:28:19.000048988Z" + lastTransitionTime: "2023-11-23T14:52:02.000011Z" + scaleDown: + status: "NoCandidates" + candidates: 2 + lastProbeTime: "2023-11-24T04:28:19.000048988Z" + lastTransitionTime: "2023-11-23T14:52:02.000011Z" diff --git a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go index 392a9d98f68f..656da06f274d 100644 --- a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go +++ b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go @@ -572,7 +572,7 @@ func (o *ScaleUpOrchestrator) IsNodeGroupReadyToScaleUp(nodeGroup cloudprovider. if !nodeGroup.Exist() { return nil } - if scaleUpSafety := o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(nodeGroup, now); !scaleUpSafety.SafeToScale { + if scaleUpSafety := o.clusterStateRegistry.NodeGroupScaleUpSafety(nodeGroup, now); !scaleUpSafety.SafeToScale { if !scaleUpSafety.Healthy { klog.Warningf("Node group %s is not ready for scaleup - unhealthy", nodeGroup.Id()) return NotReadyReason @@ -660,7 +660,7 @@ func (o *ScaleUpOrchestrator) ComputeSimilarNodeGroups( var validSimilarNodeGroups []cloudprovider.NodeGroup for _, ng := range similarNodeGroups { // Non-existing node groups are created later so skip check for them. - if ng.Exist() && !o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(ng, now).SafeToScale { + if ng.Exist() && !o.clusterStateRegistry.NodeGroupScaleUpSafety(ng, now).SafeToScale { klog.V(2).Infof("Ignoring node group %s when balancing: group is not ready for scaleup", ng.Id()) } else if similarSchedulablePods, found := schedulablePods[ng.Id()]; found && matchingSchedulablePods(groupSchedulablePods, similarSchedulablePods) { validSimilarNodeGroups = append(validSimilarNodeGroups, ng) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 4dbe475f5eb4..1afedcfb1bb5 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -388,7 +388,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr if autoscalingContext.WriteStatusConfigMap { status := a.clusterStateRegistry.GetStatus(currentTime) utils.WriteStatusConfigMap(autoscalingContext.ClientSet, autoscalingContext.ConfigNamespace, - status.GetReadableString(), a.AutoscalingContext.LogRecorder, a.AutoscalingContext.StatusConfigMapName) + *status, a.AutoscalingContext.LogRecorder, a.AutoscalingContext.StatusConfigMapName, currentTime) } // This deferred processor execution allows the processors to handle a situation when a scale-(up|down) diff --git a/cluster-autoscaler/processors/actionablecluster/actionable_cluster_processor.go b/cluster-autoscaler/processors/actionablecluster/actionable_cluster_processor.go index c9458ab2d0a2..db430b6a2c79 100644 --- a/cluster-autoscaler/processors/actionablecluster/actionable_cluster_processor.go +++ b/cluster-autoscaler/processors/actionablecluster/actionable_cluster_processor.go @@ -20,6 +20,7 @@ import ( "time" apiv1 "k8s.io/api/core/v1" + "k8s.io/autoscaler/cluster-autoscaler/clusterstate/api" "k8s.io/autoscaler/cluster-autoscaler/clusterstate/utils" "k8s.io/autoscaler/cluster-autoscaler/context" "k8s.io/autoscaler/cluster-autoscaler/metrics" @@ -71,7 +72,7 @@ func OnEmptyCluster(context *context.AutoscalingContext, status string, emitEven metrics.UpdateClusterSafeToAutoscale(false) metrics.UpdateNodesCount(0, 0, 0, 0, 0) if context.WriteStatusConfigMap { - 
utils.WriteStatusConfigMap(context.ClientSet, context.ConfigNamespace, status, context.LogRecorder, context.StatusConfigMapName) + utils.WriteStatusConfigMap(context.ClientSet, context.ConfigNamespace, api.ClusterAutoscalerStatus{AutoscalerStatus: api.ClusterAutoscalerInitializing, Message: status}, context.LogRecorder, context.StatusConfigMapName, time.Now()) } if emitEvent { context.LogRecorder.Eventf(apiv1.EventTypeWarning, "ClusterUnhealthy", status)
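With this change the status ConfigMap payload is machine-readable YAML rather than free text, so external tooling can stop regexp-scraping the old GetReadableString output. A minimal consumer sketch, assuming the default ConfigMap name "cluster-autoscaler-status" in the "kube-system" namespace (both are configurable on the autoscaler) and the "status" data key written above:

package main

import (
	"context"
	"fmt"

	"gopkg.in/yaml.v2"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/autoscaler/cluster-autoscaler/clusterstate/api"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Assumes a kubeconfig at the default location; error handling kept minimal.
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)

	cm, err := client.CoreV1().ConfigMaps("kube-system").Get(
		context.TODO(), "cluster-autoscaler-status", metav1.GetOptions{})
	if err != nil {
		panic(err)
	}

	// The "status" key holds the YAML produced by WriteStatusConfigMap.
	var status api.ClusterAutoscalerStatus
	if err := yaml.Unmarshal([]byte(cm.Data["status"]), &status); err != nil {
		panic(err)
	}
	fmt.Println("autoscaler:", status.AutoscalerStatus, "cluster health:", status.ClusterWide.Health.Status)
	for _, ng := range status.NodeGroups {
		fmt.Printf("%s: scaleUp=%s scaleDown=%s candidates=%d\n",
			ng.Name, ng.ScaleUp.Status, ng.ScaleDown.Status, ng.ScaleDown.Candidates)
	}
}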