Skip to content

Commit

Permalink
Filter managed non-ASG nodes by tag (#669)
Browse files Browse the repository at this point in the history
* Remove unused ASG calls since tags should propagate to instance

* Replace and deprecate ASG-specific tags

* Fix unit tests

* Clean up comments and document deprecated chart values

* Fix tests

* Add managed tag to e2e tests

* Fix managed tag in e2e tests

* Update test/e2e/asg-lifecycle-sqs-test

* Update test/e2e/ec2-state-change-sqs-test

* Update test/e2e/rebalance-recommendation-sqs-test

* Update test/e2e/scheduled-change-event-sqs-test

* Update test/e2e/spot-interruption-sqs-test

* Remove extraneous comments

Co-authored-by: Brandon Wagner <bmwagner10@gmail.com>
Co-authored-by: Steve Nay <265958+snay2@users.noreply.github.com>
  • Loading branch information
3 people authored Aug 18, 2022
1 parent 267fca0 commit 5fa4dc4
Show file tree
Hide file tree
Showing 14 changed files with 85 additions and 293 deletions.
15 changes: 11 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -269,19 +269,26 @@ $ aws autoscaling put-lifecycle-hook \
--role-arn <your SQS access role ARN here>
```

#### 3. Tag the ASGs:
#### 3. Tag the Instances:

By default the aws-node-termination-handler will only manage terminations for ASGs tagged w/ `key=aws-node-termination-handler/managed`
By default the aws-node-termination-handler will only manage terminations for instances tagged with `key=aws-node-termination-handler/managed`.
The value of the key does not matter.

To tag ASGs and propagate the tags to your instances (recommended):
```
$ aws autoscaling create-or-update-tags \
--tags ResourceId=my-auto-scaling-group,ResourceType=auto-scaling-group,Key=aws-node-termination-handler/managed,Value=,PropagateAtLaunch=true
```

The value of the key does not matter.
To tag an EC2 instance:
```
aws ec2 create-tags \
--resources i-1234567890abcdef0 \
--tags 'Key="aws-node-termination-handler/managed",Value='
```

This functionality is helpful in accounts where there are ASGs that do not run Kubernetes nodes, or where you do not want aws-node-termination-handler to manage their termination lifecycle.
However, if your account is dedicated to ASGs for your kubernetes cluster, then you can turn off the ASG tag check by setting the flag `--check-asg-tag-before-draining=false` or environment variable `CHECK_ASG_TAG_BEFORE_DRAINING=false`.
However, if your account is dedicated to ASGs for your Kubernetes cluster, then you can turn off the tag check by setting the flag `--check-tag-before-draining=false` or environment variable `CHECK_TAG_BEFORE_DRAINING=false`.

You can also control what resources NTH manages by adding the resource ARNs to your Amazon EventBridge rules.

Expand Down
4 changes: 2 additions & 2 deletions cmd/node-termination-handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ func main() {
log.Debug().Msgf("AWS Credentials retrieved from provider: %s", creds.ProviderName)

sqsMonitor := sqsevent.SQSMonitor{
CheckIfManaged: nthConfig.CheckASGTagBeforeDraining,
ManagedAsgTag: nthConfig.ManagedAsgTag,
CheckIfManaged: nthConfig.CheckTagBeforeDraining,
ManagedTag: nthConfig.ManagedTag,
QueueURL: nthConfig.QueueURL,
InterruptionChan: interruptionChan,
CancelChan: cancelChan,
Expand Down
8 changes: 5 additions & 3 deletions config/helm/aws-node-termination-handler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,11 @@ The configuration in this table applies to AWS Node Termination Handler in queue
| `awsRegion` | If specified, use the AWS region for AWS API calls, else NTH will try to find the region through the `AWS_REGION` environment variable, IMDS, or the specified queue URL. | `""` |
| `queueURL` | Listens for messages on the specified SQS queue URL. | `""` |
| `workers` | The maximum amount of parallel event processors to handle concurrent events. | `10` |
| `checkASGTagBeforeDraining` | If `true`, check that the instance is tagged with the `managedAsgTag` before draining the node. If `false`, disables calls ASG API. | `true` |
| `managedAsgTag` | The node tag to check if `checkASGTagBeforeDraining` is `true`. | `aws-node-termination-handler/managed` |
| `useProviderId` | If `true`, fetch node name through Kubernetes node spec ProviderID instead of AWS event PrivateDnsHostname. | `false` |
| `checkTagBeforeDraining` | If `true`, check that the instance is tagged with the `managedTag` before draining the node. | `true` |
| `managedTag` | The node tag to check if `checkTagBeforeDraining` is `true`. | `aws-node-termination-handler/managed` |
| `checkASGTagBeforeDraining` | [DEPRECATED] (Use `checkTagBeforeDraining` instead.) If `true`, check that the instance is tagged with the `managedAsgTag` before draining the node. If `false`, disables calls to the ASG API. | `true` |
| `managedAsgTag` | [DEPRECATED] (Use `managedTag` instead.) The node tag to check if `checkASGTagBeforeDraining` is `true`. | `aws-node-termination-handler/managed` |
| `useProviderId` | If `true`, fetch node name through Kubernetes node spec ProviderID instead of AWS event PrivateDnsHostname. | `false` |

### IMDS Mode Configuration

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,10 @@ spec:
value: {{ .Values.enablePrometheusServer | quote }}
- name: PROMETHEUS_SERVER_PORT
value: {{ .Values.prometheusServerPort | quote }}
- name: CHECK_ASG_TAG_BEFORE_DRAINING
value: {{ .Values.checkASGTagBeforeDraining | quote }}
- name: MANAGED_ASG_TAG
value: {{ .Values.managedAsgTag | quote }}
- name: CHECK_TAG_BEFORE_DRAINING
value: {{ .Values.checkTagBeforeDraining | quote }}
- name: MANAGED_TAG
value: {{ .Values.managedTag | quote }}
- name: USE_PROVIDER_ID
value: {{ .Values.useProviderId | quote }}
- name: DRY_RUN
Expand Down
7 changes: 3 additions & 4 deletions config/helm/aws-node-termination-handler/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -171,11 +171,10 @@ queueURL: ""
workers: 10

# If true, check that the instance is tagged with "aws-node-termination-handler/managed" as the key before draining the node
# If false, disables calls to ASG API.
checkASGTagBeforeDraining: true
checkTagBeforeDraining: true

# The tag to ensure is on a node if checkASGTagBeforeDraining is true
managedAsgTag: "aws-node-termination-handler/managed"
# The tag to ensure is on a node if checkTagBeforeDraining is true
managedTag: "aws-node-termination-handler/managed"

# If true, fetch node name through Kubernetes node spec ProviderID instead of AWS event PrivateDnsHostname.
useProviderId: false
Expand Down
40 changes: 31 additions & 9 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,12 @@ const (
enableRebalanceDrainingDefault = false
checkASGTagBeforeDrainingConfigKey = "CHECK_ASG_TAG_BEFORE_DRAINING"
checkASGTagBeforeDrainingDefault = true
checkTagBeforeDrainingConfigKey = "CHECK_TAG_BEFORE_DRAINING"
checkTagBeforeDrainingDefault = true
managedAsgTagConfigKey = "MANAGED_ASG_TAG"
managedTagConfigKey = "MANAGED_TAG"
managedAsgTagDefault = "aws-node-termination-handler/managed"
managedTagDefault = "aws-node-termination-handler/managed"
useProviderIdConfigKey = "USE_PROVIDER_ID"
useProviderIdDefault = false
metadataTriesConfigKey = "METADATA_TRIES"
Expand Down Expand Up @@ -123,7 +127,9 @@ type Config struct {
EnableRebalanceMonitoring bool
EnableRebalanceDraining bool
CheckASGTagBeforeDraining bool
CheckTagBeforeDraining bool
ManagedAsgTag string
ManagedTag string
MetadataTries int
CordonOnly bool
TaintNode bool
Expand Down Expand Up @@ -178,8 +184,10 @@ func ParseCliArgs() (config Config, err error) {
flag.BoolVar(&config.EnableSQSTerminationDraining, "enable-sqs-termination-draining", getBoolEnv(enableSQSTerminationDrainingConfigKey, enableSQSTerminationDrainingDefault), "If true, drain nodes when an SQS termination event is received")
flag.BoolVar(&config.EnableRebalanceMonitoring, "enable-rebalance-monitoring", getBoolEnv(enableRebalanceMonitoringConfigKey, enableRebalanceMonitoringDefault), "If true, cordon nodes when the rebalance recommendation notice is received. If you'd like to drain the node in addition to cordoning, then also set \"enableRebalanceDraining\".")
flag.BoolVar(&config.EnableRebalanceDraining, "enable-rebalance-draining", getBoolEnv(enableRebalanceDrainingConfigKey, enableRebalanceDrainingDefault), "If true, drain nodes when the rebalance recommendation notice is received")
flag.BoolVar(&config.CheckASGTagBeforeDraining, "check-asg-tag-before-draining", getBoolEnv(checkASGTagBeforeDrainingConfigKey, checkASGTagBeforeDrainingDefault), "If true, check that the instance is tagged with \"aws-node-termination-handler/managed\" as the key before draining the node. If false, disables calls to ASG API.")
flag.StringVar(&config.ManagedAsgTag, "managed-asg-tag", getEnv(managedAsgTagConfigKey, managedAsgTagDefault), "Sets the tag to check for on instances that is propogated from the ASG before taking action, default to aws-node-termination-handler/managed")
flag.BoolVar(&config.CheckASGTagBeforeDraining, "check-asg-tag-before-draining", getBoolEnv(checkASGTagBeforeDrainingConfigKey, checkASGTagBeforeDrainingDefault), "[DEPRECATED] * Use check-tag-before-draining instead * If true, check that the instance is tagged with \"aws-node-termination-handler/managed\" as the key before draining the node. If false, disables calls to ASG API.")
flag.BoolVar(&config.CheckTagBeforeDraining, "check-tag-before-draining", getBoolEnv(checkTagBeforeDrainingConfigKey, checkTagBeforeDrainingDefault), "If true, check that the instance is tagged with \"aws-node-termination-handler/managed\" as the key before draining the node.")
flag.StringVar(&config.ManagedAsgTag, "managed-asg-tag", getEnv(managedAsgTagConfigKey, managedAsgTagDefault), "[DEPRECATED] * Use managed-tag instead * Sets the tag to check instances for that is propogated from the ASG before taking action, default to aws-node-termination-handler/managed")
flag.StringVar(&config.ManagedTag, "managed-tag", getEnv(managedTagConfigKey, managedTagDefault), "Sets the tag to check instances for before taking action, default to aws-node-termination-handler/managed")
flag.IntVar(&config.MetadataTries, "metadata-tries", getIntEnv(metadataTriesConfigKey, metadataTriesDefault), "The number of times to try requesting metadata. If you would like 2 retries, set metadata-tries to 3.")
flag.BoolVar(&config.CordonOnly, "cordon-only", getBoolEnv(cordonOnly, false), "If true, nodes will be cordoned but not drained when an interruption event occurs.")
flag.BoolVar(&config.TaintNode, "taint-node", getBoolEnv(taintNode, false), "If true, nodes will be tainted when an interruption event occurs.")
Expand Down Expand Up @@ -209,12 +217,26 @@ func ParseCliArgs() (config Config, err error) {
config.PodTerminationGracePeriod = gracePeriod
}

if isConfigProvided("managed-asg-tag", managedAsgTagConfigKey) && isConfigProvided("managed-tag", managedTagConfigKey) {
log.Warn().Msg("Deprecated argument \"managed-asg-tag\" and the replacement argument \"managed-tag\" was provided. Using the newer argument \"managed-tag\"")
} else if isConfigProvided("managed-asg-tag", managedAsgTagConfigKey) {
log.Warn().Msg("Deprecated argument \"managed-asg-tag\" was provided. This argument will eventually be removed. Please switch to \"managed-tag\" instead.")
config.ManagedTag = config.ManagedAsgTag
}

if isConfigProvided("check-asg-tag-before-draining", checkASGTagBeforeDrainingConfigKey) && isConfigProvided("check-tag-before-draining", checkTagBeforeDrainingConfigKey) {
log.Warn().Msg("Deprecated argument \"check-asg-tag-before-draining\" and the replacement argument \"check-tag-before-draining\" was provided. Using the newer argument \"check-tag-before-draining\"")
} else if isConfigProvided("check-asg-tag-before-draining", checkASGTagBeforeDrainingConfigKey) {
log.Warn().Msg("Deprecated argument \"check-asg-tag-before-draining\" was provided. This argument will eventually be removed. Please switch to \"check-tag-before-draining\" instead.")
config.CheckTagBeforeDraining = config.CheckASGTagBeforeDraining
}

switch strings.ToLower(config.LogLevel) {
case "info":
case "debug":
case "error":
default:
return config, fmt.Errorf("Invalid log-level passed: %s Should be one of: info, debug, error", config.LogLevel)
return config, fmt.Errorf("invalid log-level passed: %s Should be one of: info, debug, error", config.LogLevel)
}

if config.NodeName == "" {
Expand Down Expand Up @@ -273,8 +295,8 @@ func (c Config) PrintJsonConfigArgs() {
Str("aws_region", c.AWSRegion).
Str("aws_endpoint", c.AWSEndpoint).
Str("queue_url", c.QueueURL).
Bool("check_asg_tag_before_draining", c.CheckASGTagBeforeDraining).
Str("ManagedAsgTag", c.ManagedAsgTag).
Bool("check_tag_before_draining", c.CheckTagBeforeDraining).
Str("ManagedTag", c.ManagedTag).
Bool("use_provider_id", c.UseProviderId).
Msg("aws-node-termination-handler arguments")
}
Expand Down Expand Up @@ -321,8 +343,8 @@ func (c Config) PrintHumanConfigArgs() {
"\tkubernetes-events-extra-annotations: %s,\n"+
"\taws-region: %s,\n"+
"\tqueue-url: %s,\n"+
"\tcheck-asg-tag-before-draining: %t,\n"+
"\tmanaged-asg-tag: %s,\n"+
"\tcheck-tag-before-draining: %t,\n"+
"\tmanaged-tag: %s,\n"+
"\tuse-provider-id: %t,\n"+
"\taws-endpoint: %s,\n",
c.DryRun,
Expand Down Expand Up @@ -358,8 +380,8 @@ func (c Config) PrintHumanConfigArgs() {
c.KubernetesEventsExtraAnnotations,
c.AWSRegion,
c.QueueURL,
c.CheckASGTagBeforeDraining,
c.ManagedAsgTag,
c.CheckTagBeforeDraining,
c.ManagedTag,
c.UseProviderId,
c.AWSEndpoint,
)
Expand Down
73 changes: 4 additions & 69 deletions pkg/monitor/sqsevent/sqs-monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
"github.com/aws/aws-node-termination-handler/pkg/monitor"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/aws/aws-sdk-go/service/ec2/ec2iface"
Expand Down Expand Up @@ -49,7 +48,7 @@ type SQSMonitor struct {
ASG autoscalingiface.AutoScalingAPI
EC2 ec2iface.EC2API
CheckIfManaged bool
ManagedAsgTag string
ManagedTag string
}

// InterruptionEventWrapper is a convenience wrapper for associating an interruption event with its error, if any
Expand Down Expand Up @@ -214,7 +213,7 @@ func (m SQSMonitor) processInterruptionEvents(interruptionEventWrappers []Interr
dropMessageSuggestionCount++

case m.CheckIfManaged && !eventWrapper.InterruptionEvent.IsManaged:
// This event isn't for an instance that is managed by this process
// This event is for an instance that is not managed by this process
log.Debug().Str("instance-id", eventWrapper.InterruptionEvent.InstanceID).Msg("dropping interruption event for unmanaged node")
dropMessageSuggestionCount++

Expand Down Expand Up @@ -354,20 +353,8 @@ func (m SQSMonitor) getNodeInfo(instanceID string) (*NodeInfo, error) {
}

if m.CheckIfManaged {
if nodeInfo.AsgName == "" {
// If ASG tags are not propagated we might need to use the API
// to retrieve the ASG name
nodeInfo.AsgName, err = m.retrieveAutoScalingGroupName(nodeInfo.InstanceID)
if err != nil {
return nil, fmt.Errorf("unable to retrieve AutoScaling group: %w", err)
}
}
if nodeInfo.Tags[m.ManagedAsgTag] == "" {
// if ASG tags are not propagated we might have to check the ASG directly
nodeInfo.IsManaged, err = m.isASGManaged(nodeInfo.AsgName, nodeInfo.InstanceID)
if err != nil {
return nil, err
}
if _, ok := nodeInfo.Tags[m.ManagedTag]; !ok {
nodeInfo.IsManaged = false
}
}

Expand All @@ -376,55 +363,3 @@ func (m SQSMonitor) getNodeInfo(instanceID string) (*NodeInfo, error) {

return nodeInfo, nil
}

// isASGManaged returns whether the autoscaling group should be managed by node termination handler.
//
// It pages through the tags of the Auto Scaling group named asgName via
// m.ASG.DescribeTagsPages and reports true as soon as a tag whose key equals
// m.ManagedAsgTag is seen. An empty asgName short-circuits to (false, nil)
// without calling the API. instanceID is only used to annotate the debug log.
// Any error returned by DescribeTagsPages is passed back unchanged, together
// with whatever value isManaged held when paging stopped.
func (m SQSMonitor) isASGManaged(asgName string, instanceID string) (bool, error) {
	if asgName == "" {
		return false, nil
	}
	asgFilter := autoscaling.Filter{Name: aws.String("auto-scaling-group"), Values: []*string{aws.String(asgName)}}
	asgDescribeTagsInput := autoscaling.DescribeTagsInput{
		Filters: []*autoscaling.Filter{&asgFilter},
	}
	isManaged := false
	err := m.ASG.DescribeTagsPages(&asgDescribeTagsInput, func(resp *autoscaling.DescribeTagsOutput, next bool) bool {
		for _, tag := range resp.Tags {
			// Tag entries and their keys are pointers; skip nil ones instead
			// of panicking on a dereference inside the paging callback.
			if tag == nil || tag.Key == nil {
				continue
			}
			if *tag.Key == m.ManagedAsgTag {
				isManaged = true
				// breaks paging loop
				return false
			}
		}
		// continue paging loop
		return true
	})

	log.Debug().
		Str("instance_id", instanceID).
		Str("tag_key", m.ManagedAsgTag).
		Bool("is_managed", isManaged).
		Msg("directly checked if instance's Auto Scaling Group is managed")
	return isManaged, err
}

// retrieveAutoScalingGroupName returns the autoscaling group name for a given instanceID.
//
// It calls m.ASG.DescribeAutoScalingInstances for the single instance ID and
// returns the AutoScalingGroupName of the first result. When the API reports
// no matching instance, or the first result carries no group name, it returns
// ("", nil) so callers can treat the instance as not belonging to an ASG.
// API errors are returned unchanged.
func (m SQSMonitor) retrieveAutoScalingGroupName(instanceID string) (string, error) {
	asgDescribeInstanceInput := autoscaling.DescribeAutoScalingInstancesInput{
		InstanceIds: []*string{&instanceID},
		MaxRecords:  aws.Int64(50),
	}
	asgs, err := m.ASG.DescribeAutoScalingInstances(&asgDescribeInstanceInput)
	if err != nil {
		return "", err
	}
	// Guard every pointer on the path to the name: a nil output, a nil first
	// entry, or a nil name would otherwise panic on dereference below.
	if asgs == nil ||
		len(asgs.AutoScalingInstances) == 0 ||
		asgs.AutoScalingInstances[0] == nil ||
		asgs.AutoScalingInstances[0].AutoScalingGroupName == nil {
		log.Debug().Str("instance_id", instanceID).Msg("Did not find an Auto Scaling Group for the given instance id")
		return "", nil
	}
	asgName := asgs.AutoScalingInstances[0].AutoScalingGroupName
	log.Debug().
		Str("instance_id", instanceID).
		Str("asg_name", *asgName).
		Msg("performed API lookup of instance ASG")
	return *asgName, nil
}
Loading

0 comments on commit 5fa4dc4

Please sign in to comment.