Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert status in cluster-autoscaler-status to yaml and add error info for scale-up backoff #6375

Merged
merged 2 commits into from
Dec 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 130 additions & 36 deletions cluster-autoscaler/clusterstate/api/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,14 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// ClusterAutoscalerConditionType is the type of ClusterAutoscalerCondition,
// i.e. the aspect of autoscaler work that a condition describes
// (Health, ScaleDown or ScaleUp).
type ClusterAutoscalerConditionType string
// ClusterAutoscalerStatusCondition is the overall status of the cluster autoscaler,
// e.g. ClusterAutoscalerRunning or ClusterAutoscalerInitializing.
type ClusterAutoscalerStatusCondition string

const (
// ClusterAutoscalerHealth is a condition that describes the current health
// of ClusterAutoscaler or its node groups.
ClusterAutoscalerHealth ClusterAutoscalerConditionType = "Health"
// ClusterAutoscalerScaleDown is a condition that describes the current status
// of a node group with regard to scale down activities.
ClusterAutoscalerScaleDown ClusterAutoscalerConditionType = "ScaleDown"
// ClusterAutoscalerScaleUp is a condition that describes the current status
// of a node group with regard to scale up activities.
ClusterAutoscalerScaleUp ClusterAutoscalerConditionType = "ScaleUp"
// ClusterAutoscalerRunning status means that the cluster autoscaler has been initialized and is running.
ClusterAutoscalerRunning ClusterAutoscalerStatusCondition = "Running"
// ClusterAutoscalerInitializing status means that the cluster autoscaler is currently being initialized.
ClusterAutoscalerInitializing ClusterAutoscalerStatusCondition = "Initializing"
)

// ClusterAutoscalerConditionStatus is a status of ClusterAutoscalerCondition.
Expand Down Expand Up @@ -69,36 +64,135 @@ const (
ClusterAutoscalerBackoff ClusterAutoscalerConditionStatus = "Backoff"
)

// ClusterAutoscalerCondition describes some aspect of ClusterAutoscaler work.
type ClusterAutoscalerCondition struct {
// Type defines the aspect that the condition describes. For example, it can be Health or ScaleUp/Down activity.
Type ClusterAutoscalerConditionType `json:"type,omitempty"`
// Status of the condition.
Status ClusterAutoscalerConditionStatus `json:"status,omitempty"`
// Message is a free text extra information about the condition. It may contain some
// extra debugging data, like why the cluster is unhealthy.
Message string `json:"message,omitempty"`
// Reason is a unique, one-word, CamelCase reason for the condition's last transition.
Reason string `json:"reason,omitempty"`
// RegisteredUnreadyNodeCount contains node counts of registered but unready nodes.
type RegisteredUnreadyNodeCount struct {
// Total is the total number of registered but unready nodes.
Total int `json:"total" yaml:"total"`
// ResourceUnready is the number of registered but unready nodes due to a missing resource (e.g. GPU).
// NOTE(review): presumably a subset of Total rather than an additional count — confirm against the producer.
ResourceUnready int `json:"resourceUnready" yaml:"resourceUnready"`
}

// RegisteredNodeCount contains node counts of registered nodes.
type RegisteredNodeCount struct {
// Total is the total number of registered nodes.
Total int `json:"total" yaml:"total"`
// Ready is the number of registered nodes that are ready.
Ready int `json:"ready" yaml:"ready"`
// NotStarted is the number of registered nodes that have not started.
// NOTE(review): the exact criterion for "not started" is not visible here — confirm against the producer.
NotStarted int `json:"notStarted" yaml:"notStarted"`
// BeingDeleted is the number of nodes that are currently being deleted.
// They exist in K8S but are not included in NodeGroup.TargetSize().
BeingDeleted int `json:"beingDeleted,omitempty" yaml:"beingDeleted,omitempty"`
// Unready contains the counts of registered but unready nodes.
// NOTE(review): with the standard encoding/json encoder, omitempty has no effect on
// struct-typed fields, so this field is always emitted in JSON output.
Unready RegisteredUnreadyNodeCount `json:"unready,omitempty" yaml:"unready,omitempty"`
}

// NodeCount contains number of nodes that satisfy different criteria.
type NodeCount struct {
Registered RegisteredNodeCount `json:"registered,omitempty" yaml:"registered,omitempty"`
LongUnregistered int `json:"longUnregistered" yaml:"longUnregistered"`
Unregistered int `json:"unregistered" yaml:"unregistered"`
walidghallab marked this conversation as resolved.
Show resolved Hide resolved
}

// ClusterHealthCondition contains information about health condition for the whole cluster.
type ClusterHealthCondition struct {
// Status of cluster health.
Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
// NodeCounts contains number of nodes that satisfy different criteria in the cluster.
NodeCounts NodeCount `json:"nodeCounts,omitempty" yaml:"nodeCounts,omitempty"`
// LastProbeTime is the last time we probed the condition.
LastProbeTime metav1.Time `json:"lastProbeTime,omitempty"`
LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
// LastTransitionTime is the time since when the condition was in the given state.
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
walidghallab marked this conversation as resolved.
Show resolved Hide resolved
}

// ClusterAutoscalerStatus contains ClusterAutoscaler status.
type ClusterAutoscalerStatus struct {
// NodeGroupStatuses contains status information of individual node groups on which CA works.
NodeGroupStatuses []NodeGroupStatus `json:"nodeGroupStatuses,omitempty"`
// ClusterwideConditions contains conditions that apply to the whole autoscaler.
ClusterwideConditions []ClusterAutoscalerCondition `json:"clusterwideConditions,omitempty"`
// NodeGroupHealthCondition contains information about health condition for a node group.
type NodeGroupHealthCondition struct {
// Status of node group health.
Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
// NodeCounts contains number of nodes that satisfy different criteria in the node group.
NodeCounts NodeCount `json:"nodeCounts,omitempty" yaml:"nodeCounts,omitempty"`
// CloudProviderTarget is the target size set by cloud provider.
CloudProviderTarget int `json:"cloudProviderTarget" yaml:"cloudProviderTarget"`
// MinSize is the CA min size of a node group.
MinSize int `json:"minSize" yaml:"minSize"`
// MaxSize is the CA max size of a node group.
MaxSize int `json:"maxSize" yaml:"maxSize"`
// LastProbeTime is the last time we probed the condition.
LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
// LastTransitionTime is the time since when the condition was in the given state.
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
}

// ClusterScaleUpCondition contains information about the scale up condition for the whole cluster.
type ClusterScaleUpCondition struct {
// Status is the status of the scale up.
Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
// LastProbeTime is the last time the condition was probed.
LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
// LastTransitionTime is the time since when the condition has been in the given state.
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
}

// BackoffInfo contains error information that caused the backoff.
// Both fields are omitted from the serialized output when empty.
type BackoffInfo struct {
// ErrorCode is a specific error code for the error condition.
ErrorCode string `json:"errorCode,omitempty" yaml:"errorCode,omitempty"`
// ErrorMessage is a human-readable description of the error condition.
ErrorMessage string `json:"errorMessage,omitempty" yaml:"errorMessage,omitempty"`
}

// NodeGroupStatus contains status of a group of nodes controlled by ClusterAutoscaler.
// NodeGroupScaleUpCondition contains information about scale up condition for a node group.
type NodeGroupScaleUpCondition struct {
// Status of the scale up.
Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
// BackoffInfo contains error information that caused the backoff.
BackoffInfo BackoffInfo `json:"backoffInfo,omitempty" yaml:"backoffInfo,omitempty"`
// LastProbeTime is the last time we probed the condition.
LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
walidghallab marked this conversation as resolved.
Show resolved Hide resolved
// LastTransitionTime is the time since when the condition was in the given state.
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
}

// ScaleDownCondition contains information about scale down condition for a node group or the whole cluster.
type ScaleDownCondition struct {
// Status is the status of the scale down.
Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
// Candidates is the number of candidates for the scale down.
// It is omitted from the serialized output when zero.
Candidates int `json:"candidates,omitempty" yaml:"candidates,omitempty"`
// LastProbeTime is the last time we probed the condition.
LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
// LastTransitionTime is the time since when the condition was in the given state.
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
}

// ClusterWideStatus contains status that applies to the whole cluster.
type ClusterWideStatus struct {
// Health contains information about health condition of the cluster.
Health ClusterHealthCondition `json:"health,omitempty" yaml:"health,omitempty"`
// ScaleUp contains information about scale up condition of the cluster.
ScaleUp ClusterScaleUpCondition `json:"scaleUp,omitempty" yaml:"scaleUp,omitempty"`
// ScaleDown contains information about scale down condition of the cluster.
ScaleDown ScaleDownCondition `json:"scaleDown,omitempty" yaml:"scaleDown,omitempty"`
}

// NodeGroupStatus contains status of an individual node group on which CA works.
type NodeGroupStatus struct {
// ProviderID is the cloud-provider-specific name of the node group. On GCE it will be equal
// to MIG url, on AWS it will be ASG name, etc.
ProviderID string `json:"providerID,omitempty"`
// Conditions is a list of conditions that describe the state of the node group.
Conditions []ClusterAutoscalerCondition `json:"conditions,omitempty"`
// Name of the node group.
Name string `json:"name,omitempty" yaml:"name,omitempty"`
// Health contains information about health condition of the node group.
Health NodeGroupHealthCondition `json:"health,omitempty" yaml:"health,omitempty"`
// ScaleUp contains information about scale up condition of the node group.
ScaleUp NodeGroupScaleUpCondition `json:"scaleUp,omitempty" yaml:"scaleUp,omitempty"`
// ScaleDown contains information about scale down condition of the node group.
ScaleDown ScaleDownCondition `json:"scaleDown,omitempty" yaml:"scaleDown,omitempty"`
}

// ClusterAutoscalerStatus contains ClusterAutoscaler status: the overall autoscaler
// state, cluster-wide conditions and per-node-group statuses.
type ClusterAutoscalerStatus struct {
// Time is the time of the cluster autoscaler status, stored as a string.
// NOTE(review): the timestamp format is not specified here — confirm against the writer.
Time string `json:"time,omitempty" yaml:"time,omitempty"`
// AutoscalerStatus contains status of ClusterAutoscaler (e.g. 'Initializing' & 'Running').
AutoscalerStatus ClusterAutoscalerStatusCondition `json:"autoscalerStatus,omitempty" yaml:"autoscalerStatus,omitempty"`
// Message contains extra information about the status.
Message string `json:"message,omitempty" yaml:"message,omitempty"`
// ClusterWide contains conditions that apply to the whole cluster.
ClusterWide ClusterWideStatus `json:"clusterWide,omitempty" yaml:"clusterWide,omitempty"`
// NodeGroups contains status information of individual node groups on which CA works.
NodeGroups []NodeGroupStatus `json:"nodeGroups,omitempty" yaml:"nodeGroups,omitempty"`
}
92 changes: 0 additions & 92 deletions cluster-autoscaler/clusterstate/api/utils.go

This file was deleted.

91 changes: 0 additions & 91 deletions cluster-autoscaler/clusterstate/api/utils_test.go

This file was deleted.

Loading
Loading