Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛: ec2/byoip: fix EIP leak when creating machine #5039

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion controllers/awsmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@ func (r *AWSMachineReconciler) reconcileNormal(_ context.Context, machineScope *
// a BYOIP without duplication.
if pool := machineScope.GetElasticIPPool(); pool != nil {
if err := ec2svc.ReconcileElasticIPFromPublicPool(pool, instance); err != nil {
machineScope.Error(err, "failed to associate elastic IP address")
machineScope.Error(err, "failed to reconcile BYO Public IPv4")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think about what I did with registering the instance with the LB: https://github.com/kubernetes-sigs/cluster-api-provider-aws/pull/5040/files#diff-0b559bbd149f0e6d54d789235423f66fa8906fdc6ee9c99b9e85db912912011eR622-R629. Just returning an error because the instance is not yet running might confuse users looking at the logs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@r4f4 that's a great idea, specially now it is generating more warnings with this approach, instead of leak and failures. But non error/warn messages for expected states would be much better.

return ctrl.Result{}, err
}
}
Expand Down
35 changes: 31 additions & 4 deletions pkg/cloud/services/ec2/eip.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/ec2"
"k8s.io/utils/ptr"

infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2"
"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/record"
Expand All @@ -17,16 +18,42 @@ func getElasticIPRoleName(instanceID string) string {

// ReconcileElasticIPFromPublicPool reconciles the elastic IP from a custom Public IPv4 Pool.
func (s *Service) ReconcileElasticIPFromPublicPool(pool *infrav1.ElasticIPPool, instance *infrav1.Instance) error {
// TODO: check if the instance is in the state allowing EIP association.
// Expected instance states: pending or running
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-lifecycle.html
iip := ptr.Deref(instance.PublicIP, "")
s.scope.Debug("Reconciling machine with custom Public IPv4 Pool", "instance-id", instance.ID, "instance-state", instance.State, "instance-public-ip", iip, "publicIpv4PoolID", pool.PublicIpv4Pool)

// Requeue when the instance is not ready to be associated.
if instance.State != infrav1.InstanceStateRunning {
err := fmt.Errorf("unable to reconcile Elastic IP Pool to instance %q with state: %v", instance.ID, instance.State)
s.scope.Error(err, "failed to set Elastic IP Pool to machine", "publicIpv4PoolID", pool.PublicIpv4Pool)
return err
}

// Prevent running association every reconciliation when it is already done.
addrs, err := s.netService.GetAddresses(getElasticIPRoleName(instance.ID))
if err != nil {
s.scope.Error(err, "error checking if addresses exists for Elastic IP Pool to machine", "eip-role", getElasticIPRoleName(instance.ID))
return err
}
if len(addrs.Addresses) > 0 {
if len(addrs.Addresses) != 1 {
return fmt.Errorf("unexpected number of EIPs allocated to the role. expected 1, got %d", len(addrs.Addresses))
}
addr := addrs.Addresses[0]
// address is already associated.
if addr.AssociationId != nil && addr.InstanceId != nil && *addr.InstanceId == instance.ID {
s.scope.Debug("machine is already associated with an Elastic IP with custom Public IPv4 pool", "eip", addr.AllocationId, "eip-address", addr.PublicIp, "eip-associationID", addr.AssociationId, "eip-instance", addr.InstanceId)
return nil
}
}

// Associate EIP.
if err := s.getAndAssociateAddressesToInstance(pool, getElasticIPRoleName(instance.ID), instance.ID); err != nil {
return fmt.Errorf("failed to reconcile Elastic IP: %w", err)
}
return nil
}

// ReleaseElasticIP releases a specific elastic IP based on the instance role.
// ReleaseElasticIP releases a specific Elastic IP based on the instance role.
func (s *Service) ReleaseElasticIP(instanceID string) error {
return s.netService.ReleaseAddressByRole(getElasticIPRoleName(instanceID))
}
Expand Down
22 changes: 7 additions & 15 deletions pkg/cloud/services/ec2/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -563,21 +563,13 @@ func (s *Service) runInstance(role string, i *infrav1.Instance) (*infrav1.Instan

input.NetworkInterfaces = netInterfaces
} else {
if ptr.Deref(i.PublicIPOnLaunch, false) {
input.NetworkInterfaces = []*ec2.InstanceNetworkInterfaceSpecification{
{
DeviceIndex: aws.Int64(0),
SubnetId: aws.String(i.SubnetID),
Groups: aws.StringSlice(i.SecurityGroupIDs),
AssociatePublicIpAddress: i.PublicIPOnLaunch,
},
}
} else {
input.SubnetId = aws.String(i.SubnetID)

if len(i.SecurityGroupIDs) > 0 {
input.SecurityGroupIds = aws.StringSlice(i.SecurityGroupIDs)
}
input.NetworkInterfaces = []*ec2.InstanceNetworkInterfaceSpecification{
{
DeviceIndex: aws.Int64(0),
SubnetId: aws.String(i.SubnetID),
Groups: aws.StringSlice(i.SecurityGroupIDs),
AssociatePublicIpAddress: i.PublicIPOnLaunch,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I see you already set PublicIPOnLaunch to false if EIPPool is set.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, preserving the value from machine spec, and forcing to false when EIPPool

// Preserve user-defined PublicIp option.
input.PublicIPOnLaunch = scope.AWSMachine.Spec.PublicIP
// Public address from BYO Public IPv4 Pools need to be associated after launch (main machine
// reconciliate loop) preventing duplicated public IP. The map on launch is explicitly
// disabled in instances with PublicIP defined to true.
if scope.AWSMachine.Spec.ElasticIPPool != nil && scope.AWSMachine.Spec.ElasticIPPool.PublicIpv4Pool != nil {
input.PublicIPOnLaunch = ptr.To(false)
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main problem in this point is leaving the flag unset (original code), the AssociatePublicIpAddress seems to assume True in some point making the EIP being created before BYO IP reconciliation loop is reached.

I may have some scenarios triggered by those tests to review:

--- FAIL: TestCreateInstance (0.01s)
    --- FAIL: TestCreateInstance/when_multiple_subnets_match_filters,_subnets_in_the_cluster_vpc_are_preferred (0.00s)
    --- FAIL: TestCreateInstance/with_a_subnet_outside_the_cluster_vpc (0.00s)
    --- FAIL: TestCreateInstance/with_dedicated_tenancy_cloud-config (0.00s)
    --- FAIL: TestCreateInstance/with_custom_placement_group_cloud-config (0.00s)
    --- FAIL: TestCreateInstance/with_dedicated_tenancy_and_placement_group_ignition (0.00s)
    --- FAIL: TestCreateInstance/with_custom_placement_group_and_partition_number (0.00s)

Need to review those tests if it is expected or also a bug in the expected return values from API.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

changes in pkg/cloud/services/ec2/instances_test.go fixes the RunInstanceWithContext api call to adapt to this change, it seems not impact in the use case described in the tests as it will result in the same.

},
}
}

Expand Down
66 changes: 48 additions & 18 deletions pkg/cloud/services/ec2/instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -672,11 +672,16 @@ func TestCreateInstance(t *testing.T) {
}, nil)
m.
RunInstancesWithContext(context.TODO(), &ec2.RunInstancesInput{
ImageId: aws.String("abc"),
InstanceType: aws.String("m5.2xlarge"),
KeyName: aws.String("default"),
SecurityGroupIds: aws.StringSlice([]string{"2", "3"}),
SubnetId: aws.String("subnet-3"),
ImageId: aws.String("abc"),
InstanceType: aws.String("m5.2xlarge"),
KeyName: aws.String("default"),
NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
{
DeviceIndex: aws.Int64(0),
SubnetId: aws.String("subnet-3"),
Groups: aws.StringSlice([]string{"2", "3"}),
},
},
TagSpecifications: []*ec2.TagSpecification{
{
ResourceType: aws.String("instance"),
Expand Down Expand Up @@ -870,11 +875,16 @@ func TestCreateInstance(t *testing.T) {
}, nil)
m.
RunInstancesWithContext(context.TODO(), &ec2.RunInstancesInput{
ImageId: aws.String("abc"),
InstanceType: aws.String("m5.2xlarge"),
KeyName: aws.String("default"),
SecurityGroupIds: aws.StringSlice([]string{"4", "3"}),
SubnetId: aws.String("subnet-5"),
ImageId: aws.String("abc"),
InstanceType: aws.String("m5.2xlarge"),
KeyName: aws.String("default"),
NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
{
DeviceIndex: aws.Int64(0),
SubnetId: aws.String("subnet-5"),
Groups: aws.StringSlice([]string{"4", "3"}),
},
},
TagSpecifications: []*ec2.TagSpecification{
{
ResourceType: aws.String("instance"),
Expand Down Expand Up @@ -3246,8 +3256,13 @@ func TestCreateInstance(t *testing.T) {
Placement: &ec2.Placement{
Tenancy: &tenancy,
},
SecurityGroupIds: []*string{aws.String("2"), aws.String("3")},
SubnetId: aws.String("subnet-1"),
NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
{
DeviceIndex: aws.Int64(0),
SubnetId: aws.String("subnet-1"),
Groups: []*string{aws.String("2"), aws.String("3")},
},
},
TagSpecifications: []*ec2.TagSpecification{
{
ResourceType: aws.String("instance"),
Expand Down Expand Up @@ -3405,8 +3420,13 @@ func TestCreateInstance(t *testing.T) {
Placement: &ec2.Placement{
GroupName: aws.String("placement-group1"),
},
SecurityGroupIds: []*string{aws.String("2"), aws.String("3")},
SubnetId: aws.String("subnet-1"),
NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
{
DeviceIndex: aws.Int64(0),
SubnetId: aws.String("subnet-1"),
Groups: []*string{aws.String("2"), aws.String("3")},
},
},
TagSpecifications: []*ec2.TagSpecification{
{
ResourceType: aws.String("instance"),
Expand Down Expand Up @@ -3585,8 +3605,13 @@ func TestCreateInstance(t *testing.T) {
Tenancy: &tenancy,
GroupName: aws.String("placement-group1"),
},
SecurityGroupIds: []*string{aws.String("2"), aws.String("3")},
SubnetId: aws.String("subnet-1"),
NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
{
DeviceIndex: aws.Int64(0),
SubnetId: aws.String("subnet-1"),
Groups: []*string{aws.String("2"), aws.String("3")},
},
},
TagSpecifications: []*ec2.TagSpecification{
{
ResourceType: aws.String("instance"),
Expand Down Expand Up @@ -3726,8 +3751,13 @@ func TestCreateInstance(t *testing.T) {
GroupName: aws.String("placement-group1"),
PartitionNumber: aws.Int64(2),
},
SecurityGroupIds: []*string{aws.String("2"), aws.String("3")},
SubnetId: aws.String("subnet-1"),
NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
{
DeviceIndex: aws.Int64(0),
SubnetId: aws.String("subnet-1"),
Groups: aws.StringSlice([]string{"2", "3"}),
},
},
TagSpecifications: []*ec2.TagSpecification{
{
ResourceType: aws.String("instance"),
Expand Down
13 changes: 9 additions & 4 deletions pkg/cloud/services/network/eips.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,17 @@ func (s *Service) GetOrAllocateAddresses(pool *infrav1.ElasticIPPool, num int, r
return s.getOrAllocateAddresses(num, role, pool)
}

// GetAddresses returns the address associated to a given role.
func (s *Service) GetAddresses(role string) (*ec2.DescribeAddressesOutput, error) {
return s.describeAddresses(role)
}

// ReleaseAddressByRole releases EIP addresses filtering by tag CAPA provider role.
func (s *Service) ReleaseAddressByRole(role string) error {
clusterFilter := []*ec2.Filter{filter.EC2.Cluster(s.scope.Name())}
clusterFilter = append(clusterFilter, filter.EC2.ProviderRole(role))

return s.releaseAddressesWithFilter(clusterFilter)
return s.releaseAddressesWithFilter([]*ec2.Filter{
filter.EC2.ClusterOwned(s.scope.Name()),
filter.EC2.ProviderRole(role),
})
}

// setByoPublicIpv4 check if the config has Public IPv4 Pool defined, then
Expand Down