Skip to content

Commit

Permalink
prevent instances going into an EC2 Auto Scaling group warm pool from…
Browse files Browse the repository at this point in the history
… being registered with the cluster
  • Loading branch information
Lydia Filipe committed Nov 17, 2021
1 parent 7130677 commit d7b4c3c
Show file tree
Hide file tree
Showing 9 changed files with 202 additions and 4 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,8 @@ additional details on each available environment variable.
| `ECS_FSX_WINDOWS_FILE_SERVER_SUPPORTED` | `true` | Whether FSx for Windows File Server volume type is supported on the container instance. This variable is only supported on agent versions 1.47.0 and later. | `false` | `true` |
| `ECS_ENABLE_RUNTIME_STATS` | `true` | Determines if [pprof](https://pkg.go.dev/net/http/pprof) is enabled for the agent. If enabled, the different profiles can be accessed through the agent's introspection port (e.g. `curl http://localhost:51678/debug/pprof/heap > heap.pprof`). In addition, agent's [runtime stats](https://pkg.go.dev/runtime#ReadMemStats) are logged to `/var/log/ecs/runtime-stats.log` file. | `false` | `false` |
| `ECS_EXCLUDE_IPV6_PORTBINDING` | `true` | Determines if agent should exclude IPv6 port binding using default network mode. If enabled, IPv6 port binding will be filtered out, and the response of DescribeTasks API call will not show tasks' IPv6 port bindings, but it is still included in Task metadata endpoint. | `true` | `true` |

| `ECS_WARM_POOLS_CHECK` | `true` | Whether to ensure instances going into an [EC2 Auto Scaling group warm pool](https://docs.aws.amazon.com/autoscaling/ec2/userguide/ec2-auto-scaling-warm-pools.html) are prevented from being registered with the cluster. Set to true only if using EC2 Autoscaling | `false` | `false` |

### Persistence

When you run the Amazon ECS Container Agent in production, its `datadir` should be persisted between runs of the Docker
Expand Down
66 changes: 66 additions & 0 deletions agent/app/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,15 @@ const (
instanceIdBackoffJitter = 0.2
instanceIdBackoffMultiple = 1.3
instanceIdMaxRetryCount = 3

targetLifecycleBackoffMin = time.Second
targetLifecycleBackoffMax = time.Second * 5
targetLifecycleBackoffJitter = 0.2
targetLifecycleBackoffMultiple = 1.3
targetLifecycleMaxRetryCount = 3
inServiceState = "InService"
asgLifeCyclePollWait = time.Minute
asgLifeCyclePollMax = 120 // given each poll cycle waits for about a minute, this gives 2-3 hours before timing out
)

var (
Expand Down Expand Up @@ -284,6 +293,15 @@ func (agent *ecsAgent) doStart(containerChangeEventStream *eventstream.EventStre
}
}

// If part of ASG, wait until instance is being set up to go in service before registering with cluster
if agent.cfg.WarmPoolsSupport.Enabled() {
err := agent.waitUntilInstanceInService(asgLifeCyclePollWait, asgLifeCyclePollMax)
if err != nil && err.Error() != blackholed {
seelog.Criticalf("Could not determine target lifecycle of instance: %v", err)
return exitcodes.ExitTerminal
}
}

// Create the task engine
taskEngine, currentEC2InstanceID, err := agent.newTaskEngine(containerChangeEventStream,
credentialsManager, state, imageManager, execCmdMgr)
Expand Down Expand Up @@ -387,6 +405,54 @@ func (agent *ecsAgent) doStart(containerChangeEventStream *eventstream.EventStre
deregisterInstanceEventStream, client, state, taskHandler, doctor)
}

// waitUntilInstanceInService Polls IMDS until the target lifecycle state indicates that the instance is going in
// service. This is to avoid instances going to a warm pool being registered as container instances with the cluster
func (agent *ecsAgent) waitUntilInstanceInService(pollWaitDuration time.Duration, pollMaxTimes int) error {
var err error
var targetState string
// Poll while the instance is in a warmed state or while waiting for the data to be populated.
// If the data is not populated after a certain number of polls, then stop polling and return the not found error.
// The polling maximum does not apply to instances in the warmed states
for i := 0; i < pollMaxTimes || targetState != ""; i++ {
targetState, err = agent.getTargetLifeCycle()
// stop polling if the retrieved state is in service or we get an unexpected error
if targetState == inServiceState {
break
}
if err != nil {
var statusCode int
if reqErr, ok := err.(awserr.RequestFailure); ok {
statusCode = reqErr.StatusCode()
}
if statusCode != 404 {
break
}
}
time.Sleep(pollWaitDuration)
}
return err
}

// getTargetLifecycle obtains the target lifecycle state for the instance from IMDS. This is populated for instances
// associated with an ASG
func (agent *ecsAgent) getTargetLifeCycle() (string, error) {
var targetState string
var err error
backoff := retry.NewExponentialBackoff(targetLifecycleBackoffMin, targetLifecycleBackoffMax, targetLifecycleBackoffJitter, targetLifecycleBackoffMultiple)
for i := 0; i < targetLifecycleMaxRetryCount; i++ {
targetState, err = agent.ec2MetadataClient.TargetLifecycleState()
if err == nil {
break
}
seelog.Debugf("Error when getting intended lifecycle state: %v", err)
if i < targetLifecycleMaxRetryCount {
time.Sleep(backoff.Duration())
}
}
seelog.Infof("Target lifecycle state of instance: %v", targetState)
return targetState, err
}

// newTaskEngine creates a new docker task engine object. It tries to load the
// local state if needed, else initializes a new one
func (agent *ecsAgent) newTaskEngine(containerChangeEventStream *eventstream.EventStream,
Expand Down
105 changes: 102 additions & 3 deletions agent/app/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"sort"
"sync"
"testing"
"time"

apierrors "github.com/aws/amazon-ecs-agent/agent/api/errors"
mock_api "github.com/aws/amazon-ecs-agent/agent/api/mocks"
Expand All @@ -50,7 +51,6 @@ import (
mock_statemanager "github.com/aws/amazon-ecs-agent/agent/statemanager/mocks"
mock_mobypkgwrapper "github.com/aws/amazon-ecs-agent/agent/utils/mobypkgwrapper/mocks"
"github.com/aws/amazon-ecs-agent/agent/version"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
aws_credentials "github.com/aws/aws-sdk-go/aws/credentials"
Expand All @@ -66,8 +66,10 @@ const (
hostPrivateIPv4Address = "127.0.0.1"
hostPublicIPv4Address = "127.0.0.1"
instanceID = "i-123"
warmedState = "Warmed:Pending"
)

var notFoundErr = awserr.NewRequestFailure(awserr.Error(awserr.New("NotFound", "", errors.New(""))), 404, "")
var apiVersions = []dockerclient.DockerVersion{
dockerclient.Version_1_21,
dockerclient.Version_1_22,
Expand Down Expand Up @@ -286,7 +288,48 @@ func TestDoStartRegisterContainerInstanceErrorNonTerminal(t *testing.T) {
assert.Equal(t, exitcodes.ExitError, exitCode)
}

func TestDoStartWarmPoolsError(t *testing.T) {
ctrl, credentialsManager, state, imageManager, client,
dockerClient, _, _, execCmdMgr := setup(t)
defer ctrl.Finish()
mockEC2Metadata := mock_ec2.NewMockEC2MetadataClient(ctrl)
gomock.InOrder(
dockerClient.EXPECT().SupportedVersions().Return(apiVersions),
)

cfg := getTestConfig()
cfg.WarmPoolsSupport = config.BooleanDefaultFalse{Value: config.ExplicitlyEnabled}
ctx, cancel := context.WithCancel(context.TODO())
// Cancel the context to cancel async routines
defer cancel()
agent := &ecsAgent{
ctx: ctx,
cfg: &cfg,
dockerClient: dockerClient,
ec2MetadataClient: mockEC2Metadata,
}

err := errors.New("error")
mockEC2Metadata.EXPECT().TargetLifecycleState().Return("", err).Times(3)

exitCode := agent.doStart(eventstream.NewEventStream("events", ctx),
credentialsManager, state, imageManager, client, execCmdMgr)
assert.Equal(t, exitcodes.ExitTerminal, exitCode)
}

func TestDoStartHappyPath(t *testing.T) {
testDoStartHappyPathWithConditions(t, false, false)
}

func TestDoStartWarmPoolsEnabled(t *testing.T) {
testDoStartHappyPathWithConditions(t, false, true)
}

func TestDoStartWarmPoolsBlackholed(t *testing.T) {
testDoStartHappyPathWithConditions(t, true, true)
}

func testDoStartHappyPathWithConditions(t *testing.T, blackholed bool, warmPoolsEnv bool) {
ctrl, credentialsManager, _, imageManager, client,
dockerClient, stateManagerFactory, saveableOptionFactory, execCmdMgr := setup(t)
defer ctrl.Finish()
Expand All @@ -299,7 +342,19 @@ func TestDoStartHappyPath(t *testing.T) {
ec2MetadataClient.EXPECT().PrivateIPv4Address().Return(hostPrivateIPv4Address, nil)
ec2MetadataClient.EXPECT().PublicIPv4Address().Return(hostPublicIPv4Address, nil)
ec2MetadataClient.EXPECT().OutpostARN().Return("", nil)
ec2MetadataClient.EXPECT().InstanceID().Return(instanceID, nil)

if blackholed {
if warmPoolsEnv {
ec2MetadataClient.EXPECT().TargetLifecycleState().Return("", errors.New("blackholed")).Times(3)
}
ec2MetadataClient.EXPECT().InstanceID().Return("", errors.New("blackholed"))
} else {
if warmPoolsEnv {
ec2MetadataClient.EXPECT().TargetLifecycleState().Return("", errors.New("error"))
ec2MetadataClient.EXPECT().TargetLifecycleState().Return(inServiceState, nil)
}
ec2MetadataClient.EXPECT().InstanceID().Return(instanceID, nil)
}

var discoverEndpointsInvoked sync.WaitGroup
discoverEndpointsInvoked.Add(2)
Expand Down Expand Up @@ -347,6 +402,9 @@ func TestDoStartHappyPath(t *testing.T) {
cfg := getTestConfig()
cfg.ContainerMetadataEnabled = config.BooleanDefaultFalse{Value: config.ExplicitlyEnabled}
cfg.Checkpoint = config.BooleanDefaultFalse{Value: config.ExplicitlyEnabled}
if warmPoolsEnv {
cfg.WarmPoolsSupport = config.BooleanDefaultFalse{Value: config.ExplicitlyEnabled}
}
cfg.Cluster = clusterName
ctx, cancel := context.WithCancel(context.TODO())

Expand Down Expand Up @@ -386,7 +444,9 @@ func TestDoStartHappyPath(t *testing.T) {
assertMetadata(t, data.AvailabilityZoneKey, availabilityZone, dataClient)
assertMetadata(t, data.ClusterNameKey, clusterName, dataClient)
assertMetadata(t, data.ContainerInstanceARNKey, containerInstanceARN, dataClient)
assertMetadata(t, data.EC2InstanceIDKey, instanceID, dataClient)
if !blackholed {
assertMetadata(t, data.EC2InstanceIDKey, instanceID, dataClient)
}
}

func assertMetadata(t *testing.T, key, expectedVal string, dataClient data.Client) {
Expand Down Expand Up @@ -1473,3 +1533,42 @@ func newTestDataClient(t *testing.T) (data.Client, func()) {
}
return testClient, cleanup
}

func TestWaitUntilInstanceInServicePolling(t *testing.T) {
testCases := []struct {
name string
states []string
err error
returnsState bool
maxPolls int
}{
{"TestWaitUntilInstanceInServicePollsWarmed", []string{warmedState, inServiceState}, nil, true, asgLifeCyclePollMax},
{"TestWaitUntilInstanceInServicePollsMissing", []string{inServiceState}, notFoundErr, true, asgLifeCyclePollMax},
{"TestWaitUntilInstanceInServicePollingMaxReached", nil, notFoundErr, false, 1},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()
cfg := getTestConfig()
cfg.WarmPoolsSupport = config.BooleanDefaultFalse{Value: config.ExplicitlyEnabled}
ec2MetadataClient := mock_ec2.NewMockEC2MetadataClient(ctrl)
agent := &ecsAgent{ec2MetadataClient: ec2MetadataClient, cfg: &cfg}

if tc.err != nil {
ec2MetadataClient.EXPECT().TargetLifecycleState().Return("", tc.err).Times(3)
}
for _, state := range tc.states {
ec2MetadataClient.EXPECT().TargetLifecycleState().Return(state, nil)
}
var expectedResult error
if tc.returnsState {
expectedResult = nil
} else {
expectedResult = tc.err
}
assert.Equal(t, expectedResult, agent.waitUntilInstanceInService(1*time.Millisecond, tc.maxPolls))
})
}
}
1 change: 1 addition & 0 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,7 @@ func environmentConfig() (Config, error) {
External: parseBooleanDefaultFalseConfig("ECS_EXTERNAL"),
EnableRuntimeStats: parseBooleanDefaultFalseConfig("ECS_ENABLE_RUNTIME_STATS"),
ShouldExcludeIPv6PortBinding: parseBooleanDefaultTrueConfig("ECS_EXCLUDE_IPV6_PORTBINDING"),
WarmPoolsSupport: parseBooleanDefaultFalseConfig("ECS_WARM_POOLS_CHECK"),
}, err
}

Expand Down
2 changes: 2 additions & 0 deletions agent/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ func TestEnvironmentConfig(t *testing.T) {
defer setTestEnv("ECS_PULL_DEPENDENT_CONTAINERS_UPFRONT", "true")()
defer setTestEnv("ECS_ENABLE_RUNTIME_STATS", "true")()
defer setTestEnv("ECS_EXCLUDE_IPV6_PORTBINDING", "true")()
defer setTestEnv("ECS_WARM_POOLS_CHECK", "false")()
additionalLocalRoutesJSON := `["1.2.3.4/22","5.6.7.8/32"]`
setTestEnv("ECS_AWSVPC_ADDITIONAL_LOCAL_ROUTES", additionalLocalRoutesJSON)
setTestEnv("ECS_ENABLE_CONTAINER_METADATA", "true")
Expand Down Expand Up @@ -216,6 +217,7 @@ func TestEnvironmentConfig(t *testing.T) {
assert.True(t, conf.DependentContainersPullUpfront.Enabled(), "Wrong value for DependentContainersPullUpfront")
assert.True(t, conf.EnableRuntimeStats.Enabled(), "Wrong value for EnableRuntimeStats")
assert.True(t, conf.ShouldExcludeIPv6PortBinding.Enabled(), "Wrong value for ShouldExcludeIPv6PortBinding")
assert.False(t, conf.WarmPoolsSupport.Enabled(), "Wrong value for WarmPoolsSupport")
}

func TestTrimWhitespaceWhenCreating(t *testing.T) {
Expand Down
4 changes: 4 additions & 0 deletions agent/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,4 +354,8 @@ type Config struct {
// is set to true by default, and can be overridden by the ECS_EXCLUDE_IPV6_PORTBINDING environment variable. This is a workaround
// for docker's bug as detailed in https://github.com/aws/amazon-ecs-agent/issues/2870.
ShouldExcludeIPv6PortBinding BooleanDefaultTrue

// WarmPoolsSupport specifies whether the agent should poll IMDS to check the target lifecycle state for a starting
// instance
WarmPoolsSupport BooleanDefaultFalse
}
4 changes: 4 additions & 0 deletions agent/ec2/blackhole_ec2_metadata_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,7 @@ func (blackholeMetadataClient) SpotInstanceAction() (string, error) {
func (blackholeMetadataClient) OutpostARN() (string, error) {
return "", errors.New("blackholed")
}

func (blackholeMetadataClient) TargetLifecycleState() (string, error) {
return "", errors.New("blackholed")
}
6 changes: 6 additions & 0 deletions agent/ec2/ec2_metadata_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ const (
PublicIPv4Resource = "public-ipv4"
OutpostARN = "outpost-arn"
PrimaryIPV4VPCCIDRResourceFormat = "network/interfaces/macs/%s/vpc-ipv4-cidr-block"
TargetLifecycleState = "autoscaling/target-lifecycle-state"
)

const (
Expand Down Expand Up @@ -82,6 +83,7 @@ type EC2MetadataClient interface {
PublicIPv4Address() (string, error)
SpotInstanceAction() (string, error)
OutpostARN() (string, error)
TargetLifecycleState() (string, error)
}

type ec2MetadataClientImpl struct {
Expand Down Expand Up @@ -203,3 +205,7 @@ func (c *ec2MetadataClientImpl) SpotInstanceAction() (string, error) {
func (c *ec2MetadataClientImpl) OutpostARN() (string, error) {
return c.client.GetMetadata(OutpostARN)
}

func (c *ec2MetadataClientImpl) TargetLifecycleState() (string, error) {
return c.client.GetMetadata(TargetLifecycleState)
}
15 changes: 15 additions & 0 deletions agent/ec2/mocks/ec2_mocks.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit d7b4c3c

Please sign in to comment.