Skip to content

Commit

Permalink
Introduce jitter for task cleanup wait duration.
Browse files Browse the repository at this point in the history
Configurable via ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION_JITTER environment variable. The purpose of introducing this is to address the use case where a large number of tasks are stopped at the same time. Without the jitter, the cleanup for all those tasks also happen at roughly the same time, which can generate a lot of work that could impact the tasks that are running at the time of such cleanup. With the jitter, each task will be cleaned up at different time, avoiding the aforementioned impact.
  • Loading branch information
fenxiong committed Jul 30, 2021
1 parent 0df0593 commit 08a7053
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 4 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,8 @@ additional details on each available environment variable.
| `ECS_DISABLE_PRIVILEGED` | `true` | Whether launching privileged containers is disabled on the container instance. | `false` | `false` |
| `ECS_SELINUX_CAPABLE` | `true` | Whether SELinux is available on the container instance. | `false` | `false` |
| `ECS_APPARMOR_CAPABLE` | `true` | Whether AppArmor is available on the container instance. | `false` | `false` |
| `ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION` | 10m | Time to wait to delete containers for a stopped task. If set to less than 1 minute, the value is ignored. | 3h | 3h |
| `ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION` | 10m | Default time to wait to delete containers for a stopped task (see also `ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION_JITTER`). If set to less than 1 minute, the value is ignored. | 3h | 3h |
| `ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION_JITTER` | 1h | Jitter value for the task engine cleanup wait duration. When specified, the actual cleanup wait duration time for each task will be the duration specified in `ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION` plus a random duration between 0 and the jitter duration. | <blank> | <blank> |
| `ECS_CONTAINER_STOP_TIMEOUT` | 10m | Instance scoped configuration for time to wait for the container to exit normally before being forcibly killed. | 30s | 30s |
| `ECS_CONTAINER_START_TIMEOUT` | 10m | Timeout before giving up on starting a container. | 3m | 8m |
| `ECS_CONTAINER_CREATE_TIMEOUT` | 10m | Timeout before giving up on creating a container. Minimum value is 1m. If user sets a value below minimum it will be set to min. | 4m | 4m |
Expand Down
1 change: 1 addition & 0 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,7 @@ func environmentConfig() (Config, error) {
SELinuxCapable: parseBooleanDefaultFalseConfig("ECS_SELINUX_CAPABLE"),
AppArmorCapable: parseBooleanDefaultFalseConfig("ECS_APPARMOR_CAPABLE"),
TaskCleanupWaitDuration: parseEnvVariableDuration("ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION"),
TaskCleanupWaitDurationJitter: parseEnvVariableDuration("ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION_JITTER"),
TaskENIEnabled: parseBooleanDefaultFalseConfig("ECS_ENABLE_TASK_ENI"),
TaskIAMRoleEnabled: parseBooleanDefaultFalseConfig("ECS_ENABLE_TASK_IAM_ROLE"),
DeleteNonECSImagesEnabled: parseBooleanDefaultFalseConfig("ECS_ENABLE_UNTRACKED_IMAGE_CLEANUP"),
Expand Down
13 changes: 11 additions & 2 deletions agent/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,13 @@ func TestGetRegionWithNoIID(t *testing.T) {
}

func TestEnvironmentConfig(t *testing.T) {
const (
testTaskCleanupWaitDurationStr = "90s"
testTaskCleanupWaitDurationJitterStr = "1m"
testTaskCleanupWaitDuration = 90 * time.Second
testTaskCleanupWaitDurationJitter = time.Minute
)

defer setTestRegion()()
defer setTestEnv("ECS_CLUSTER", "myCluster")()
defer setTestEnv("ECS_RESERVED_PORTS_UDP", "[42,99]")()
Expand All @@ -126,7 +133,8 @@ func TestEnvironmentConfig(t *testing.T) {
defer setTestEnv("ECS_SELINUX_CAPABLE", "true")()
defer setTestEnv("ECS_APPARMOR_CAPABLE", "true")()
defer setTestEnv("ECS_DISABLE_PRIVILEGED", "true")()
defer setTestEnv("ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION", "90s")()
defer setTestEnv("ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION", testTaskCleanupWaitDurationStr)()
defer setTestEnv("ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION_JITTER", testTaskCleanupWaitDurationJitterStr)()
defer setTestEnv("ECS_ENABLE_TASK_IAM_ROLE", "true")()
defer setTestEnv("ECS_ENABLE_UNTRACKED_IMAGE_CLEANUP", "true")()
defer setTestEnv("ECS_ENABLE_TASK_IAM_ROLE_NETWORK_HOST", "true")()
Expand Down Expand Up @@ -187,7 +195,8 @@ func TestEnvironmentConfig(t *testing.T) {
assert.Equal(t, ImagePullAlwaysBehavior, conf.ImagePullBehavior)
assert.Equal(t, "testing", conf.InstanceAttributes["my_attribute"])
assert.Equal(t, "testing", conf.ContainerInstanceTags["my_tag"])
assert.Equal(t, (90 * time.Second), conf.TaskCleanupWaitDuration)
assert.Equal(t, testTaskCleanupWaitDuration, conf.TaskCleanupWaitDuration)
assert.Equal(t, testTaskCleanupWaitDurationJitter, conf.TaskCleanupWaitDurationJitter)
serializedAdditionalLocalRoutesJSON, err := json.Marshal(conf.AWSVPCAdditionalLocalRoutes)
assert.NoError(t, err, "should marshal additional local routes")
assert.Equal(t, additionalLocalRoutesJSON, string(serializedAdditionalLocalRoutesJSON))
Expand Down
6 changes: 6 additions & 0 deletions agent/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,12 @@ type Config struct {
// until cleanup of task resources is started.
TaskCleanupWaitDuration time.Duration

// TaskCleanupWaitDurationJitter specifies a jitter for task cleanup wait duration.
// When specified to a non-zero duration (default is zero), the task cleanup wait duration for each task
// will be a random duration between [TaskCleanupWaitDuration, TaskCleanupWaitDuration +
// TaskCleanupWaitDurationJitter].
TaskCleanupWaitDurationJitter time.Duration

// TaskIAMRoleEnabled specifies if the Agent is capable of launching
// tasks with IAM Roles.
TaskIAMRoleEnabled BooleanDefaultFalse
Expand Down
2 changes: 1 addition & 1 deletion agent/engine/task_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ func (mtask *managedTask) overseeTask() {
}
// TODO: make this idempotent on agent restart
go mtask.releaseIPInIPAM()
mtask.cleanupTask(mtask.cfg.TaskCleanupWaitDuration)
mtask.cleanupTask(retry.AddJitter(mtask.cfg.TaskCleanupWaitDuration, mtask.cfg.TaskCleanupWaitDurationJitter))
}

// shouldExit checks if the task manager should exit, as the agent is exiting.
Expand Down

0 comments on commit 08a7053

Please sign in to comment.