Skip to content

Commit

Permalink
Fix: Do not inject fakeNode for instance which has errors on create
Browse files Browse the repository at this point in the history
  • Loading branch information
azylinski committed Jul 17, 2023
1 parent 1ce7553 commit e5bc070
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 9 deletions.
7 changes: 6 additions & 1 deletion cluster-autoscaler/cloudprovider/test/test_cloud_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -479,7 +479,12 @@ func (tng *TestNodeGroup) Nodes() ([]cloudprovider.Instance, error) {
instances := make([]cloudprovider.Instance, 0)
for node, nodegroup := range tng.cloudProvider.nodes {
if nodegroup == tng.id {
instances = append(instances, cloudprovider.Instance{Id: node})
instances = append(instances, cloudprovider.Instance{
Id: node,
Status: &cloudprovider.InstanceStatus{
State: cloudprovider.InstanceRunning,
},
})
}
}
return instances, nil
Expand Down
10 changes: 8 additions & 2 deletions cluster-autoscaler/clusterstate/clusterstate.go
Original file line number Diff line number Diff line change
Expand Up @@ -988,7 +988,9 @@ func (csr *ClusterStateRegistry) getCloudProviderNodeInstances() (map[string][]c
return csr.cloudProviderNodeInstancesCache.GetCloudProviderNodeInstances()
}

// Calculates which of the existing cloud provider nodes are not registered in Kubernetes.
// Calculates which of the existing cloud provider nodes are not yet registered in Kubernetes.
// As we are expecting for those instances to be Ready soon (O(~minutes)), to speed up the scaling process,
// we are injecting a temporary, fake nodes to continue scaling based on in-memory cluster state.
func getNotRegisteredNodes(allNodes []*apiv1.Node, cloudProviderNodeInstances map[string][]cloudprovider.Instance, time time.Time) []UnregisteredNode {
registered := sets.NewString()
for _, node := range allNodes {
Expand All @@ -997,7 +999,7 @@ func getNotRegisteredNodes(allNodes []*apiv1.Node, cloudProviderNodeInstances ma
notRegistered := make([]UnregisteredNode, 0)
for _, instances := range cloudProviderNodeInstances {
for _, instance := range instances {
if !registered.Has(instance.Id) {
if !registered.Has(instance.Id) && expectedToRegister(instance) {
notRegistered = append(notRegistered, UnregisteredNode{
Node: fakeNode(instance, cloudprovider.FakeNodeUnregistered),
UnregisteredSince: time,
Expand All @@ -1008,6 +1010,10 @@ func getNotRegisteredNodes(allNodes []*apiv1.Node, cloudProviderNodeInstances ma
return notRegistered
}

func expectedToRegister(instance cloudprovider.Instance) bool {
return instance.Status != nil && instance.Status.State != cloudprovider.InstanceDeleting && instance.Status.ErrorInfo == nil
}

// Calculates which of the registered nodes in Kubernetes that do not exist in cloud provider.
func (csr *ClusterStateRegistry) getCloudProviderDeletedNodes(allNodes []*apiv1.Node) []*apiv1.Node {
nodesRemoved := make([]*apiv1.Node, 0)
Expand Down
21 changes: 15 additions & 6 deletions cluster-autoscaler/clusterstate/utils/node_instances_cache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,20 @@ import (
func TestCloudProviderNodeInstancesCache(t *testing.T) {
// Fresh entry for node group in cache.
nodeNg1_1 := BuildTestNode("ng1-1", 1000, 1000)
instanceNg1_1 := cloudprovider.Instance{Id: nodeNg1_1.Name}
instanceNg1_1 := buildRunningInstance(nodeNg1_1.Name)
// Fresh entry for node group in cache - checks Invalidate function.
nodeNg2_1 := BuildTestNode("ng2-1", 1000, 1000)
instanceNg2_1 := cloudprovider.Instance{Id: nodeNg2_1.Name}
instanceNg2_1 := buildRunningInstance(nodeNg2_1.Name)
nodeNg2_2 := BuildTestNode("ng2-2", 1000, 1000)
instanceNg2_2 := cloudprovider.Instance{Id: nodeNg2_2.Name}
instanceNg2_2 := buildRunningInstance(nodeNg2_2.Name)
// Stale entry for node group in cache - check Refresh function.
nodeNg3_1 := BuildTestNode("ng3-1", 1000, 1000)
instanceNg3_1 := cloudprovider.Instance{Id: nodeNg3_1.Name}
instanceNg3_1 := buildRunningInstance(nodeNg3_1.Name)
nodeNg3_2 := BuildTestNode("ng3-2", 1000, 1000)
instanceNg3_2 := cloudprovider.Instance{Id: nodeNg3_2.Name}
instanceNg3_2 := buildRunningInstance(nodeNg3_2.Name)
// Removed node group.
nodeNg4_1 := BuildTestNode("ng4-1", 1000, 1000)
instanceNg4_1 := cloudprovider.Instance{Id: nodeNg4_1.Name}
instanceNg4_1 := buildRunningInstance(nodeNg4_1.Name)

provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup("ng1", 1, 10, 1)
Expand Down Expand Up @@ -90,3 +90,12 @@ func TestCloudProviderNodeInstancesCache(t *testing.T) {
assert.Equal(t, map[string][]cloudprovider.Instance{"ng1": {instanceNg1_1}, "ng2": {instanceNg2_2}, "ng3": {instanceNg3_2}}, results)
assert.Equal(t, 3, len(cache.cloudProviderNodeInstances))
}

func buildRunningInstance(name string) cloudprovider.Instance {
return cloudprovider.Instance{
Id: name,
Status: &cloudprovider.InstanceStatus{
State: cloudprovider.InstanceRunning,
},
}
}

0 comments on commit e5bc070

Please sign in to comment.