# fix: correctly handle lack of capacity of AWS spot ASGs #2008
## Changes from all commits
```diff
@@ -277,7 +277,10 @@ func (m *AwsManager) SetAsgSize(asg *asg, size int) error {
 
 // DeleteInstances deletes the given instances. All instances must be controlled by the same ASG.
 func (m *AwsManager) DeleteInstances(instances []*AwsInstanceRef) error {
-	return m.asgCache.DeleteInstances(instances)
+	if err := m.asgCache.DeleteInstances(instances); err != nil {
+		return err
+	}
+	return m.forceRefresh()
 }
 
 // GetAsgNodes returns Asg nodes.
```

> **Review comment** on `return m.forceRefresh()`: It feels like we should either have a log message here stating that a refresh is being forced, or update the message in …
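A minimal sketch of the reviewer's logging suggestion, assuming `klog` is already imported in this file; the message text and verbosity level are illustrative, not part of the PR:

```go
// DeleteInstances deletes the given instances and then forces a cache
// refresh so the ASG state is re-read from AWS instead of trusted locally.
func (m *AwsManager) DeleteInstances(instances []*AwsInstanceRef) error {
	if err := m.asgCache.DeleteInstances(instances); err != nil {
		return err
	}
	// Hypothetical log line per the review suggestion; wording is illustrative.
	klog.V(2).Infof("Forcing ASG cache refresh after deleting %d instances", len(instances))
	return m.forceRefresh()
}
```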
```diff
@@ -274,6 +274,14 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
 	csr.scaleDownRequests = newScaleDownRequests
 }
 
+// BackoffNodeGroup is used to force the specified nodeGroup to go into backoff mode, which
+// means it won't be used for scaling out temporarily
+func (csr *ClusterStateRegistry) BackoffNodeGroup(nodeGroup cloudprovider.NodeGroup, currentTime time.Time) {
+	csr.Lock()
+	defer csr.Unlock()
+	csr.backoffNodeGroup(nodeGroup, cloudprovider.OtherErrorClass, "cloudProviderError", currentTime)
+}
+
 // To be executed under a lock.
 func (csr *ClusterStateRegistry) backoffNodeGroup(nodeGroup cloudprovider.NodeGroup, errorClass cloudprovider.InstanceErrorClass, errorCode string, currentTime time.Time) {
 	nodeGroupInfo := csr.nodeInfosForGroups[nodeGroup.Id()]
```

> **Review comment** on `BackoffNodeGroup`: please revert
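The exported `BackoffNodeGroup` wraps the existing unexported `backoffNodeGroup` with the registry's lock, so code outside the registry can trigger a backoff safely. A hedged usage sketch; the call site, `delta`, and the surrounding error handling are illustrative assumptions, not code from this PR:

```go
// Illustrative call site: if a scale-up fails at the cloud provider (for
// example, a spot ASG with no available capacity), back the node group off
// so the autoscaler temporarily stops trying to scale it out.
if err := nodeGroup.IncreaseSize(delta); err != nil {
	klog.Warningf("Scale-up of %s failed: %v; backing off node group", nodeGroup.Id(), err)
	clusterStateRegistry.BackoffNodeGroup(nodeGroup, time.Now())
}
```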
```diff
@@ -254,7 +254,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 	unregisteredNodes := a.clusterStateRegistry.GetUnregisteredNodes()
 	if len(unregisteredNodes) > 0 {
 		klog.V(1).Infof("%d unregistered nodes present", len(unregisteredNodes))
-		removedAny, err := removeOldUnregisteredNodes(unregisteredNodes, autoscalingContext, currentTime, autoscalingContext.LogRecorder)
+		removedAny, err := removeOldUnregisteredNodes(unregisteredNodes, autoscalingContext, a.clusterStateRegistry,
+			currentTime, autoscalingContext.LogRecorder)
 		// There was a problem with removing unregistered nodes. Retry in the next loop.
 		if err != nil {
 			klog.Warningf("Failed to remove unregistered nodes: %v", err)
 		}
```

> **Review comment**: Just twigged that this will need reverting as well whilst playing around with backporting this into a 1.3 version.
```diff
@@ -484,7 +484,7 @@ func sanitizeTemplateNode(node *apiv1.Node, nodeGroup string, ignoredTaints tain
 
 // Removes unregistered nodes if needed. Returns true if anything was removed and error if such occurred.
 func removeOldUnregisteredNodes(unregisteredNodes []clusterstate.UnregisteredNode, context *context.AutoscalingContext,
-	currentTime time.Time, logRecorder *utils.LogEventRecorder) (bool, error) {
+	clusterStateRegistry *clusterstate.ClusterStateRegistry, currentTime time.Time, logRecorder *utils.LogEventRecorder) (bool, error) {
 	removedAny := false
 	for _, unregisteredNode := range unregisteredNodes {
 		if unregisteredNode.UnregisteredSince.Add(context.MaxNodeProvisionTime).Before(currentTime) {
```

> **Review comment** on the new `clusterStateRegistry` parameter: Please revert.

```diff
@@ -514,6 +514,7 @@ func removeOldUnregisteredNodes(unregisteredNodes []clusterstate.UnregisteredNod
 				"Failed to remove node %s: %v", unregisteredNode.Node.Name, err)
 			return removedAny, err
 		}
 		logRecorder.Eventf(apiv1.EventTypeNormal, "DeleteUnregistered",
 			"Removed unregistered node %v", unregisteredNode.Node.Name)
 		removedAny = true
```

> **Review comment** on the line added in this hunk: please revert
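The single line added by the second hunk is not preserved in this capture. Given the new `clusterStateRegistry` parameter and the `BackoffNodeGroup` method introduced above, a plausible reconstruction, offered only as an assumption, is a single call reusing the `nodeGroup` already resolved earlier in `removeOldUnregisteredNodes`:

```go
// Assumed reconstruction: after deleting an unregistered node, back off its
// node group so it is not immediately chosen for another scale-up attempt.
// nodeGroup is assumed to come from an earlier
// context.CloudProvider.NodeGroupForNode(unregisteredNode.Node) lookup.
clusterStateRegistry.BackoffNodeGroup(nodeGroup, currentTime)
```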
```diff
@@ -451,12 +451,12 @@ func TestRemoveOldUnregisteredNodes(t *testing.T) {
 	assert.Equal(t, 1, len(unregisteredNodes))
 
 	// Nothing should be removed. The unregistered node is not old enough.
-	removed, err := removeOldUnregisteredNodes(unregisteredNodes, context, now.Add(-50*time.Minute), fakeLogRecorder)
+	removed, err := removeOldUnregisteredNodes(unregisteredNodes, context, clusterState, now.Add(-50*time.Minute), fakeLogRecorder)
 	assert.NoError(t, err)
 	assert.False(t, removed)
 
 	// ng1_2 should be removed.
-	removed, err = removeOldUnregisteredNodes(unregisteredNodes, context, now, fakeLogRecorder)
+	removed, err = removeOldUnregisteredNodes(unregisteredNodes, context, clusterState, now, fakeLogRecorder)
 	assert.NoError(t, err)
 	assert.True(t, removed)
 	deletedNode := getStringFromChan(deletedNodes)
```

> **Review comment** on each updated call: please revert
> **Comment**: We do see several issues that lead to desired > current. Sometimes on-demand nodes can also run out of capacity, for example when hitting a limit, or for nodes with strict constraints such as having to land in a single placement group.
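The desired-above-current condition described in that comment can be observed directly from an ASG's state. A self-contained sketch using aws-sdk-go; the region and ASG name are placeholders:

```go
package main

import (
	"fmt"
	"log"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/autoscaling"
)

func main() {
	sess := session.Must(session.NewSession(aws.NewConfig().WithRegion("us-east-1")))
	svc := autoscaling.New(sess)

	out, err := svc.DescribeAutoScalingGroups(&autoscaling.DescribeAutoScalingGroupsInput{
		AutoScalingGroupNames: aws.StringSlice([]string{"my-spot-asg"}), // placeholder name
	})
	if err != nil {
		log.Fatal(err)
	}

	for _, g := range out.AutoScalingGroups {
		inService := 0
		for _, inst := range g.Instances {
			if aws.StringValue(inst.LifecycleState) == "InService" {
				inService++
			}
		}
		// When the group cannot obtain capacity (spot shortage, account limits,
		// placement-group constraints), desired stays above what is running.
		if int(aws.Int64Value(g.DesiredCapacity)) > inService {
			fmt.Printf("%s: desired=%d, in-service=%d (capacity shortfall)\n",
				aws.StringValue(g.AutoScalingGroupName),
				aws.Int64Value(g.DesiredCapacity), inService)
		}
	}
}
```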