Skip to content

Commit

Permalink
🐛 awsmachine: only register machine to LB when it's running
Browse files Browse the repository at this point in the history
The AWS docs [1] state that to register an instance with the Load
Balancer target groups, the instance must be running. Currently, CAPA
tries to register it while it's still pending, causing the following
error:

```
0314 22:24:18.512176  701419 awsmachine_controller.go:605] "failed to reconcile LB attachment" err=<
    [could not register machine to load balancer: could not register control plane instance "i-039bb22b1e8df7c99" with load balancer: failed to register instance with target group 'mrb-capa-67-d2pfx-int-22623': InvalidTarget: The following targets are not in a running state and cannot be registered: 'i-039bb22b1e8df7c99'
        status code: 400, request id: 17514354-77ac-42b1-a882-489760563bbd, could not register machine to load balancer: could not register control plane instance "i-039bb22b1e8df7c99" with load balancer: failed to register instance with target group 'mrb-capa-67-d2pfx-ext-6443': InvalidTarget: The following targets are not in a running state and cannot be registered: 'i-039bb22b1e8df7c99'
        status code: 400, request id: 84e1849c-abb7-4af9-9220-8791fdc1a3fb]
 >
I0314 22:24:18.512325  701419 recorder.go:104] "events: Failed to register control plane instance \"i-039bb22b1e8df7c99\" with load balancer: failed to register instance with target group 'mrb-capa-67-d2pfx-ext-6443': InvalidTarget: The following targets are not in a running state and cannot be registered: 'i-039bb22b1e8df7c99'\n\tstatus code: 400, request id: 84e1849c-abb7-4af9-9220-8791fdc1a3fb" type="Warning" object={"kind":"AWSMachine","namespace":"openshift-cluster-api-guests","name":"mrb-capa-67-d2pfx-bootstrap","uid":"58af1162-380b-4f3e-93fe-c0e81401070e","apiVersion":"infrastructure.cluster.x-k8s.io/v1beta2","resourceVersion":"562"} reason="FailedAttachControlPlaneELB"
```

Even though this doesn't stop the install from succeeding, let's wait
for the instance state to be "running" and with that avoid unnecessary
AWS API calls.

[1] https://docs.aws.amazon.com/elasticloadbalancing/latest/application/target-group-register-targets.html#register-instances
  • Loading branch information
r4f4 committed Jul 4, 2024
1 parent 1313226 commit 172038a
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 2 deletions.
18 changes: 16 additions & 2 deletions controllers/awsmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -619,8 +619,14 @@ func (r *AWSMachineReconciler) reconcileNormal(_ context.Context, machineScope *
}

if err := r.reconcileLBAttachment(machineScope, elbScope, instance); err != nil {
machineScope.Error(err, "failed to reconcile LB attachment")
return ctrl.Result{}, err
// We are tolerating InstanceNotRunning error, so we don't report it as an error condition.
// Because we are reconciling all load balancers, attempt to treat the error as a list of errors.
if err := kerrors.FilterOut(err, elb.IsInstanceNotRunning); err != nil {
machineScope.Error(err, "failed to reconcile LB attachment")
return ctrl.Result{}, err
}
// Cannot attach non-running instances to LB
shouldRequeue = true
}
}

Expand Down Expand Up @@ -1002,6 +1008,14 @@ func (r *AWSMachineReconciler) registerInstanceToV2LB(machineScope *scope.Machin
return nil
}

// See https://docs.aws.amazon.com/elasticloadbalancing/latest/application/target-group-register-targets.html#register-instances
if ptr.Deref(machineScope.GetInstanceState(), infrav1.InstanceStatePending) != infrav1.InstanceStateRunning {
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "FailedAttachControlPlaneELB",
"Cannot register control plane instance %q with load balancer: instance is not running", instance.ID)
conditions.MarkFalse(machineScope.AWSMachine, infrav1.ELBAttachedCondition, infrav1.ELBAttachFailedReason, clusterv1.ConditionSeverityInfo, "instance not running")
return elb.NewInstanceNotRunning("instance is not running")
}

if err := elbsvc.RegisterInstanceWithAPIServerLB(instance, lb); err != nil {
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "FailedAttachControlPlaneELB",
"Failed to register control plane instance %q with load balancer: %v", instance.ID, err)
Expand Down
14 changes: 14 additions & 0 deletions pkg/cloud/services/elb/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,15 @@ func NewConflict(msg string) error {
}
}

// NewInstanceNotRunning returns an error which indicates that the request cannot be processed due to the instance not
// being in a running state.
func NewInstanceNotRunning(msg string) error {
return &ELBError{
msg: msg,
Code: http.StatusTooEarly,
}
}

// IsNotFound returns true if the error was created by NewNotFound.
func IsNotFound(err error) bool {
if ReasonForError(err) == http.StatusNotFound {
Expand Down Expand Up @@ -90,6 +99,11 @@ func IsSDKError(err error) (ok bool) {
return
}

// IsInstanceNotRunning returns true if the error was created by NewInstanceNotRunning.
func IsInstanceNotRunning(err error) (ok bool) {
return ReasonForError(err) == http.StatusTooEarly
}

// ReasonForError returns the HTTP status for a particular error.
func ReasonForError(err error) int {
if t, ok := errors.Cause(err).(*ELBError); ok {
Expand Down

0 comments on commit 172038a

Please sign in to comment.