From ace1beea091161f6c206e2c4456939612865ccef Mon Sep 17 00:00:00 2001 From: Marco Braga Date: Thu, 27 Jun 2024 17:12:39 -0300 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B:=20ec2/byoip:=20fix=20EIP=20leak?= =?UTF-8?q?=20when=20creating=20machine?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The instance creation flow is creating by default EIP to instances even if the BYO IP flow is set. BYO IPv4 creates and associates the EIP to instance after it is created, preventing paying for additional EIP (amazon-provided) when creating the instance when the BYO IPv4 Pool is defined to be used by the machine. Furthermore, the fix provides additional checks to prevent duplicated EIP in the BYO IP reconciliation loop. The extra checks include running the EIP association many times, while the EIP is already associated, and failures in the log when running the EIP association prematurely - when the instance isn't ready, Eg ec2 in pending state. --- controllers/awsmachine_controller.go | 2 +- pkg/cloud/services/ec2/eip.go | 35 +++++++++++-- pkg/cloud/services/ec2/instances.go | 22 +++----- pkg/cloud/services/ec2/instances_test.go | 66 +++++++++++++++++------- pkg/cloud/services/network/eips.go | 6 +++ 5 files changed, 93 insertions(+), 38 deletions(-) diff --git a/controllers/awsmachine_controller.go b/controllers/awsmachine_controller.go index e468e4c2d7..47d4d171c4 100644 --- a/controllers/awsmachine_controller.go +++ b/controllers/awsmachine_controller.go @@ -541,7 +541,7 @@ func (r *AWSMachineReconciler) reconcileNormal(_ context.Context, machineScope * // a BYOIP without duplication. if pool := machineScope.GetElasticIPPool(); pool != nil { if err := ec2svc.ReconcileElasticIPFromPublicPool(pool, instance); err != nil { - machineScope.Error(err, "failed to associate elastic IP address") + machineScope.Error(err, "failed to reconcile BYO Public IPv4") return ctrl.Result{}, err } } diff --git a/pkg/cloud/services/ec2/eip.go b/pkg/cloud/services/ec2/eip.go index 77484e201c..c6d50f5003 100644 --- a/pkg/cloud/services/ec2/eip.go +++ b/pkg/cloud/services/ec2/eip.go @@ -6,6 +6,7 @@ import ( "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/ec2" + "k8s.io/utils/ptr" infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/record" @@ -17,16 +18,42 @@ func getElasticIPRoleName(instanceID string) string { // ReconcileElasticIPFromPublicPool reconciles the elastic IP from a custom Public IPv4 Pool. func (s *Service) ReconcileElasticIPFromPublicPool(pool *infrav1.ElasticIPPool, instance *infrav1.Instance) error { - // TODO: check if the instance is in the state allowing EIP association. - // Expected instance states: pending or running - // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-lifecycle.html + iip := ptr.Deref(instance.PublicIP, "") + s.scope.Debug("Reconciling machine with custom Public IPv4 Pool", "instance-id", instance.ID, "instance-state", instance.State, "instance-public-ip", iip, "publicIpv4PoolID", pool.PublicIpv4Pool) + + // Requeue when the instance is not ready to be associated. + if instance.State != infrav1.InstanceStateRunning { + err := fmt.Errorf("unable to reconcile Elastic IP Pool to instance %q with state: %v", instance.ID, instance.State) + s.scope.Error(err, "failed to set Elastic IP Pool to machine", "publicIpv4PoolID", pool.PublicIpv4Pool) + return err + } + + // Prevent running association every reconciliation when it is already done. + addrs, err := s.netService.GetAddresses(getElasticIPRoleName(instance.ID)) + if err != nil { + s.scope.Error(err, "error checking if addresses exists for Elastic IP Pool to machine", "eip-role", getElasticIPRoleName(instance.ID)) + return err + } + if len(addrs.Addresses) > 0 { + if len(addrs.Addresses) != 1 { + return fmt.Errorf("unexpected number of EIPs allocated to the role. expected 1, got %d", len(addrs.Addresses)) + } + addr := addrs.Addresses[0] + // address is already associated. + if addr.AssociationId != nil && addr.InstanceId != nil && *addr.InstanceId == instance.ID { + s.scope.Debug("machine is already associated with an Elastic IP with custom Public IPv4 pool", "eip", addr.AllocationId, "eip-address", addr.PublicIp, "eip-associationID", addr.AssociationId, "eip-instance", addr.InstanceId) + return nil + } + } + + // Associate EIP. if err := s.getAndAssociateAddressesToInstance(pool, getElasticIPRoleName(instance.ID), instance.ID); err != nil { return fmt.Errorf("failed to reconcile Elastic IP: %w", err) } return nil } -// ReleaseElasticIP releases a specific elastic IP based on the instance role. +// ReleaseElasticIP releases a specific Elastic IP based on the instance role. func (s *Service) ReleaseElasticIP(instanceID string) error { return s.netService.ReleaseAddressByRole(getElasticIPRoleName(instanceID)) } diff --git a/pkg/cloud/services/ec2/instances.go b/pkg/cloud/services/ec2/instances.go index b6975c5309..4c3d5017b6 100644 --- a/pkg/cloud/services/ec2/instances.go +++ b/pkg/cloud/services/ec2/instances.go @@ -563,21 +563,13 @@ func (s *Service) runInstance(role string, i *infrav1.Instance) (*infrav1.Instan input.NetworkInterfaces = netInterfaces } else { - if ptr.Deref(i.PublicIPOnLaunch, false) { - input.NetworkInterfaces = []*ec2.InstanceNetworkInterfaceSpecification{ - { - DeviceIndex: aws.Int64(0), - SubnetId: aws.String(i.SubnetID), - Groups: aws.StringSlice(i.SecurityGroupIDs), - AssociatePublicIpAddress: i.PublicIPOnLaunch, - }, - } - } else { - input.SubnetId = aws.String(i.SubnetID) - - if len(i.SecurityGroupIDs) > 0 { - input.SecurityGroupIds = aws.StringSlice(i.SecurityGroupIDs) - } + input.NetworkInterfaces = []*ec2.InstanceNetworkInterfaceSpecification{ + { + DeviceIndex: aws.Int64(0), + SubnetId: aws.String(i.SubnetID), + Groups: aws.StringSlice(i.SecurityGroupIDs), + AssociatePublicIpAddress: i.PublicIPOnLaunch, + }, } } diff --git a/pkg/cloud/services/ec2/instances_test.go b/pkg/cloud/services/ec2/instances_test.go index 5fc267036b..134b582135 100644 --- a/pkg/cloud/services/ec2/instances_test.go +++ b/pkg/cloud/services/ec2/instances_test.go @@ -672,11 +672,16 @@ func TestCreateInstance(t *testing.T) { }, nil) m. RunInstancesWithContext(context.TODO(), &ec2.RunInstancesInput{ - ImageId: aws.String("abc"), - InstanceType: aws.String("m5.2xlarge"), - KeyName: aws.String("default"), - SecurityGroupIds: aws.StringSlice([]string{"2", "3"}), - SubnetId: aws.String("subnet-3"), + ImageId: aws.String("abc"), + InstanceType: aws.String("m5.2xlarge"), + KeyName: aws.String("default"), + NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{ + { + DeviceIndex: aws.Int64(0), + SubnetId: aws.String("subnet-3"), + Groups: aws.StringSlice([]string{"2", "3"}), + }, + }, TagSpecifications: []*ec2.TagSpecification{ { ResourceType: aws.String("instance"), @@ -870,11 +875,16 @@ func TestCreateInstance(t *testing.T) { }, nil) m. RunInstancesWithContext(context.TODO(), &ec2.RunInstancesInput{ - ImageId: aws.String("abc"), - InstanceType: aws.String("m5.2xlarge"), - KeyName: aws.String("default"), - SecurityGroupIds: aws.StringSlice([]string{"4", "3"}), - SubnetId: aws.String("subnet-5"), + ImageId: aws.String("abc"), + InstanceType: aws.String("m5.2xlarge"), + KeyName: aws.String("default"), + NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{ + { + DeviceIndex: aws.Int64(0), + SubnetId: aws.String("subnet-5"), + Groups: aws.StringSlice([]string{"4", "3"}), + }, + }, TagSpecifications: []*ec2.TagSpecification{ { ResourceType: aws.String("instance"), @@ -3246,8 +3256,13 @@ func TestCreateInstance(t *testing.T) { Placement: &ec2.Placement{ Tenancy: &tenancy, }, - SecurityGroupIds: []*string{aws.String("2"), aws.String("3")}, - SubnetId: aws.String("subnet-1"), + NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{ + { + DeviceIndex: aws.Int64(0), + SubnetId: aws.String("subnet-1"), + Groups: []*string{aws.String("2"), aws.String("3")}, + }, + }, TagSpecifications: []*ec2.TagSpecification{ { ResourceType: aws.String("instance"), @@ -3405,8 +3420,13 @@ func TestCreateInstance(t *testing.T) { Placement: &ec2.Placement{ GroupName: aws.String("placement-group1"), }, - SecurityGroupIds: []*string{aws.String("2"), aws.String("3")}, - SubnetId: aws.String("subnet-1"), + NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{ + { + DeviceIndex: aws.Int64(0), + SubnetId: aws.String("subnet-1"), + Groups: []*string{aws.String("2"), aws.String("3")}, + }, + }, TagSpecifications: []*ec2.TagSpecification{ { ResourceType: aws.String("instance"), @@ -3585,8 +3605,13 @@ func TestCreateInstance(t *testing.T) { Tenancy: &tenancy, GroupName: aws.String("placement-group1"), }, - SecurityGroupIds: []*string{aws.String("2"), aws.String("3")}, - SubnetId: aws.String("subnet-1"), + NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{ + { + DeviceIndex: aws.Int64(0), + SubnetId: aws.String("subnet-1"), + Groups: []*string{aws.String("2"), aws.String("3")}, + }, + }, TagSpecifications: []*ec2.TagSpecification{ { ResourceType: aws.String("instance"), @@ -3726,8 +3751,13 @@ func TestCreateInstance(t *testing.T) { GroupName: aws.String("placement-group1"), PartitionNumber: aws.Int64(2), }, - SecurityGroupIds: []*string{aws.String("2"), aws.String("3")}, - SubnetId: aws.String("subnet-1"), + NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{ + { + DeviceIndex: aws.Int64(0), + SubnetId: aws.String("subnet-1"), + Groups: aws.StringSlice([]string{"2", "3"}), + }, + }, TagSpecifications: []*ec2.TagSpecification{ { ResourceType: aws.String("instance"), diff --git a/pkg/cloud/services/network/eips.go b/pkg/cloud/services/network/eips.go index f301650797..de2e8d7d1b 100644 --- a/pkg/cloud/services/network/eips.go +++ b/pkg/cloud/services/network/eips.go @@ -191,10 +191,16 @@ func (s *Service) GetOrAllocateAddresses(pool *infrav1.ElasticIPPool, num int, r return s.getOrAllocateAddresses(num, role, pool) } +// GetAddresses returns the address associated to a given role. +func (s *Service) GetAddresses(role string) (*ec2.DescribeAddressesOutput, error) { + return s.describeAddresses(role) +} + // ReleaseAddressByRole releases EIP addresses filtering by tag CAPA provider role. func (s *Service) ReleaseAddressByRole(role string) error { clusterFilter := []*ec2.Filter{filter.EC2.Cluster(s.scope.Name())} clusterFilter = append(clusterFilter, filter.EC2.ProviderRole(role)) + clusterFilter = append(clusterFilter, filter.EC2.ClusterOwned(s.scope.Name())) return s.releaseAddressesWithFilter(clusterFilter) }