Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Terminate orphaned instances from CreateFleet requests #194

Merged
merged 2 commits into from
Oct 9, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion docs/deployment/aws/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,12 @@ Escalator requires the following IAM policy to be able to properly integrate wit
"autoscaling:SetDesiredCapacity",
"autoscaling:TerminateInstanceInAutoScalingGroup",
"ec2:CreateFleet",
"ec2:CreateTags",
"ec2:DescribeInstances",
"ec2:DescribeInstanceStatus",
"ec2:DescribeInstances"
"ec2:RunInstances",
"ec2:TerminateInstances",
"iam:PassRole"
],
"Resource": "*"
}
Expand Down
52 changes: 52 additions & 0 deletions pkg/cloudprovider/aws/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ const (
tagKey = "k8s.io/atlassian-escalator/enabled"
// tagValue is the value for the tag applied to ASGs and Fleet requests
tagValue = "true"
// maxTerminateInstancesTries is the maximum number of times terminateOrphanedInstances can be called consecutively
maxTerminateInstancesTries = 3
// The TerminateInstances API only supports terminating 1000 instances at a time
terminateBatchSize = 1000
)

var (
terminateInstancesTries = 0
Jacobious52 marked this conversation as resolved.
Show resolved Hide resolved
)

func instanceToProviderID(instance *autoscaling.Instance) string {
Expand Down Expand Up @@ -403,6 +411,8 @@ InstanceReadyLoop:
break InstanceReadyLoop
}
case <-deadline.C:
log.Info("Reached instance ready deadline but not all instances are ready")
terminateOrphanedInstances(n, instances)
Jacobious52 marked this conversation as resolved.
Show resolved Hide resolved
return errors.New("Not all instances could be started")
}
}
Expand All @@ -417,6 +427,7 @@ InstanceReadyLoop:
})
if err != nil {
log.Error("Failed AttachInstances call.")
terminateOrphanedInstances(n, instances)
return err
}
}
Expand All @@ -432,9 +443,11 @@ InstanceReadyLoop:
log.WithField("asg", n.id).Debugf("CurrentTargetSize: %v", n.TargetSize())
if err != nil {
log.Error("Failed AttachInstances call.")
terminateOrphanedInstances(n, instances)
return err
}

terminateInstancesTries = 0
return nil
}

Expand Down Expand Up @@ -601,3 +614,42 @@ func addASGTags(config *cloudprovider.NodeGroupConfig, asg *autoscaling.Group, p
log.Errorf("failed to create auto scaling tag for ASG %v", id)
}
}

// terminateOrphanedInstances will attempt to terminate a list of instances
func terminateOrphanedInstances(n *NodeGroup, instances []*string) {
Jacobious52 marked this conversation as resolved.
Show resolved Hide resolved
if len(instances) == 0 {
return
}
log.WithField("asg", n.id).Infof("terminating %v instance(s) that could not be attached to the ASG", len(instances))

var instanceIds []string
for i := 0; i < len(instances); i += terminateBatchSize {
Jacobious52 marked this conversation as resolved.
Show resolved Hide resolved
batch := instances[i:minInt(i+terminateBatchSize, len(instances))]

for _, id := range batch {
instanceIds = append(instanceIds, *id)
}

_, err := n.provider.ec2Service.TerminateInstances(&ec2.TerminateInstancesInput{
InstanceIds: awsapi.StringSlice(instanceIds),
})
if err != nil {
log.Warnf("failed to terminate instances %v", err)
}
}

// prevent an endless cycle of provisioning and terminating instances
terminateInstancesTries++
if terminateInstancesTries == maxTerminateInstancesTries {
Jacobious52 marked this conversation as resolved.
Show resolved Hide resolved
log.Fatalf("reached maximum number of consecutive failures (%v) for provisioning nodes with CreateFleet",
Jacobious52 marked this conversation as resolved.
Show resolved Hide resolved
maxTerminateInstancesTries)
}
}

// minInt returns the minimum of two ints
func minInt(x, y int) int {
if x < y {
return x
}
return y
}
49 changes: 49 additions & 0 deletions pkg/cloudprovider/aws/aws_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/atlassian/escalator/pkg/test"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/stretchr/testify/assert"
)

Expand Down Expand Up @@ -322,3 +323,51 @@ func TestAddASGTags_WithErrorResponse(t *testing.T) {
)
addASGTags(&mockNodeGroupConfig, &mockASG, awsCloudProvider)
}

func TestTerminateInstances_Success(t *testing.T) {
setupAWSMocks()

// Mock service call and error
awsCloudProvider, _ := newMockCloudProviderUsingInjection(
nil,
&test.MockAutoscalingService{},
&test.MockEc2Service{
TerminateInstancesOutput: &ec2.TerminateInstancesOutput{},
TerminateInstancesErr: nil,
},
)

instance1 := "i-123456"
instances := []*string{&instance1}
mockNodeGroup.provider = awsCloudProvider

terminateOrphanedInstances(&mockNodeGroup, instances)
Jacobious52 marked this conversation as resolved.
Show resolved Hide resolved
}

func TestTerminateInstances_WithErrorResponse(t *testing.T) {
setupAWSMocks()

// Mock service call and error
awsCloudProvider, _ := newMockCloudProviderUsingInjection(
nil,
&test.MockAutoscalingService{},
&test.MockEc2Service{
TerminateInstancesOutput: &ec2.TerminateInstancesOutput{},
TerminateInstancesErr: errors.New("unauthorized"),
},
)

instance1 := "i-123456"
instances := []*string{&instance1}
mockNodeGroup.provider = awsCloudProvider

terminateOrphanedInstances(&mockNodeGroup, instances)
}

func TestMinInt(t *testing.T) {
x := 1
y := 2
assert.Equal(t, x, minInt(x, y))
assert.Equal(t, x, minInt(y, x))
assert.Equal(t, x, minInt(x, x))
}
8 changes: 8 additions & 0 deletions pkg/test/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ type MockEc2Service struct {

DescribeInstanceStatusOutput *ec2.DescribeInstanceStatusOutput
DescribeInstanceStatusErr error

TerminateInstancesOutput *ec2.TerminateInstancesOutput
TerminateInstancesErr error
}

// DescribeInstances mock implementation for MockEc2Service
Expand All @@ -85,3 +88,8 @@ func (m MockEc2Service) DescribeInstanceStatusPages(statusInput *ec2.DescribeIns
allInstancesReadyHelper(&ec2.DescribeInstanceStatusOutput{}, true)
return nil
}

// TerminateInstances mock implementation for MockEc2Service
func (m MockEc2Service) TerminateInstances(*ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error) {
return m.TerminateInstancesOutput, m.TerminateInstancesErr
}