Skip to content

Commit

Permalink
feat: Automated AMI management and Upgrade locking (#327)
Browse files Browse the repository at this point in the history
Automated AMI management

InstanceGroups can now set the image configuration to "latest".
This will result the ami value being retrieved from a ssm parameter
(https://docs.aws.amazon.com/eks/latest/userguide/retrieve-ami-id.html).
This will ensure that nodes within an InstanceGroup are kept up-to-date
and is especially useful in development clusters.

Automated AMI management supports retrieving amazon amis for amazon
linux 2, bottlerocket and windows nodes. This can be configured using
the annotation `instancemgr.keikoproj.io/os-family`.

Upgrade locking

InstanceGroups can now set an annotation`instancemgr.keikoproj.io/lock-upgrades="true"`
which will prevent the InstanceGroup from entering the InitUpgrade state.
This is useful for controlling when the nodes of an InstanceGroup can be
upgraded, pairing well with the automated AMI management feature.

Signed-off-by: Sebastian Cole <sebastian.paris@gmail.com>
  • Loading branch information
preflightsiren authored Oct 7, 2021
1 parent 19de2db commit 165095e
Show file tree
Hide file tree
Showing 34 changed files with 789 additions and 74 deletions.
9 changes: 9 additions & 0 deletions .github/DEVELOPER.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,13 @@ You can also run `make coverage` to generate a coverage report.

## Running BDD tests

### Dependencies

1. You will need an existing EKS cluster running with the connection details exported into a kube config file.
2. [Keikoproj Minion-Manager](https://github.com/keikoproj/minion-manager) must also be running in the cluster
3. Instance Manager needs to be started outside of the bdd test suite


Export some variables and run `make bdd` to run a functional e2e test.

### Example
Expand Down Expand Up @@ -96,3 +103,5 @@ testing: warning: no tests to run
PASS
ok github.com/keikoproj/instance-manager/test-bdd 1362.336s [no tests to run]
```

Note: If your test cluster uses `InstanceGroups` to run core components, annotating the namespace with `instancemgr.keikoproj.io/config-excluded="true"` can help prevent unexpected disruption.
19 changes: 16 additions & 3 deletions api/v1alpha1/instancegroup_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ const (
ReconcileModified ReconcileState = "ReconcileModified"

// End States
ReconcileReady ReconcileState = "Ready"
ReconcileErr ReconcileState = "Error"
ReconcileLocked ReconcileState = "Locked"
ReconcileReady ReconcileState = "Ready"
ReconcileErr ReconcileState = "Error"

// Userdata bootstrap stages
PreBootstrapStage = "PreBootstrap"
Expand Down Expand Up @@ -76,6 +77,8 @@ const (
HostPlacementTenancyType = "host"
DefaultPlacementTenancyType = "default"
DedicatedPlacementTenancyType = "dedicated"

ImageLatestValue = "latest"
)

type ContainerRuntime string
Expand All @@ -87,6 +90,8 @@ const (

DockerRuntime ContainerRuntime = "dockerd"
ContainerDRuntime ContainerRuntime = "containerd"

UpgradeLockedAnnotationKey = "instancemgr.keikoproj.io/lock-upgrades"
)

var (
Expand Down Expand Up @@ -392,6 +397,15 @@ func (ig *InstanceGroup) GetUpgradeStrategy() *AwsUpgradeStrategy {
func (ig *InstanceGroup) SetUpgradeStrategy(strategy AwsUpgradeStrategy) {
ig.Spec.AwsUpgradeStrategy = strategy
}
func (ig *InstanceGroup) Locked() bool {
annotations := ig.GetAnnotations()
if val, ok := annotations[UpgradeLockedAnnotationKey]; ok {
if strings.EqualFold(val, "true") {
return true
}
}
return false
}

func (s *EKSSpec) Validate() error {
var (
Expand Down Expand Up @@ -521,7 +535,6 @@ func (c *EKSConfiguration) Validate() error {
c.SuspendedProcesses = processes
}


if c.BootstrapOptions != nil {
if c.BootstrapOptions.ContainerRuntime != "" && !contains(AllowedContainerRuntimes, c.BootstrapOptions.ContainerRuntime) {
return errors.Errorf("validation failed, 'bootstrapOptions.containerRuntime' must be one of %+v", AllowedContainerRuntimes)
Expand Down
50 changes: 43 additions & 7 deletions api/v1alpha1/instancegroup_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"testing"

"github.com/aws/aws-sdk-go/aws"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

type EksUnitTest struct {
Expand Down Expand Up @@ -117,13 +118,13 @@ func TestInstanceGroupSpecValidate(t *testing.T) {
MinSize: 1,
Type: "LaunchTemplate",
EKSConfiguration: &EKSConfiguration{
BootstrapOptions: &BootstrapOptions{ContainerRuntime: "foo"},
EksClusterName: "my-eks-cluster",
NodeSecurityGroups: []string{"sg-123456789"},
Image: "ami-12345",
InstanceType: "m5.large",
KeyPairName: "thisShouldBeOptional",
Subnets: []string{"subnet-1111111", "subnet-222222"},
BootstrapOptions: &BootstrapOptions{ContainerRuntime: "foo"},
EksClusterName: "my-eks-cluster",
NodeSecurityGroups: []string{"sg-123456789"},
Image: "ami-12345",
InstanceType: "m5.large",
KeyPairName: "thisShouldBeOptional",
Subnets: []string{"subnet-1111111", "subnet-222222"},
},
}, nil, nil),
},
Expand Down Expand Up @@ -353,6 +354,41 @@ func TestInstanceGroupSpecValidate(t *testing.T) {
}
}

func TestLockedAnnotation(t *testing.T) {
tests := []struct {
name string
annotation string
expected bool
}{
{
name: "Locked",
annotation: "true",
expected: true,
},
{
name: "Unlocked",
annotation: "false",
expected: false,
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
testIg := &InstanceGroup{
ObjectMeta: v1.ObjectMeta{
Annotations: map[string]string{
UpgradeLockedAnnotationKey: test.annotation,
},
},
}
res := testIg.Locked()
if res != test.expected {
t.Errorf("%v: got %v, expected %v", test.name, res, test.expected)
}
})
}
}

func basicFargateSpec() *EKSFargateSpec {
return &EKSFargateSpec{
ClusterName: "",
Expand Down
2 changes: 1 addition & 1 deletion controllers/instancegroup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ func (r *InstanceGroupReconciler) IsNamespaceAnnotated(namespace, key, value str
}

annotations := unstructuredNamespace.GetAnnotations()
if kubeprovider.HasAnnotation(annotations, key, value) {
if kubeprovider.HasAnnotationWithValue(annotations, key, value) {
return true
}
}
Expand Down
12 changes: 12 additions & 0 deletions controllers/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ type CloudDeployer interface {
GetState() v1alpha.ReconcileState // Gets the current state type of the instance group
SetState(v1alpha.ReconcileState) // Sets the current state of the instance group
IsReady() bool // Returns true if state is Ready
Locked() bool // Returns true if instanceGroup is locked
}

func HandleReconcileRequest(d CloudDeployer) error {
Expand Down Expand Up @@ -54,6 +55,11 @@ func HandleReconcileRequest(d CloudDeployer) error {

// CRUD Nodes Upgrade Strategy
if d.GetState() == v1alpha.ReconcileInitUpgrade {
// Locked
if d.Locked() {
d.SetState(v1alpha.ReconcileLocked)
return nil
}
err = d.UpgradeNodes()
if err != nil {
return err
Expand All @@ -67,12 +73,18 @@ func HandleReconcileRequest(d CloudDeployer) error {

// Bootstrap Nodes
if d.IsReady() {

err = d.BootstrapNodes()
if err != nil {
return err
}

if d.GetState() == v1alpha.ReconcileInitUpgrade {
// Locked
if d.Locked() {
d.SetState(v1alpha.ReconcileLocked)
return nil
}
err = d.UpgradeNodes()
if err != nil {
return err
Expand Down
18 changes: 14 additions & 4 deletions controllers/providers/aws/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"github.com/aws/aws-sdk-go/service/ec2/ec2iface"
"github.com/aws/aws-sdk-go/service/eks/eksiface"
"github.com/aws/aws-sdk-go/service/iam/iamiface"
"github.com/aws/aws-sdk-go/service/ssm/ssmiface"
"github.com/pkg/errors"
ctrl "sigs.k8s.io/controller-runtime"
)
Expand All @@ -55,6 +56,7 @@ const (
DescribeLaunchTemplateVersionsTTL time.Duration = 60 * time.Second
DescribeInstanceTypesTTL time.Duration = 24 * time.Hour
DescribeInstanceTypeOfferingTTL time.Duration = 1 * time.Hour
GetParameterTTL time.Duration = 1 * time.Hour

CacheBackgroundPruningInterval time.Duration = 1 * time.Hour
CacheMaxItems int64 = 250
Expand Down Expand Up @@ -117,6 +119,7 @@ type AwsWorker struct {
EksClient eksiface.EKSAPI
IamClient iamiface.IAMAPI
Ec2Client ec2iface.EC2API
SsmClient ssmiface.SSMAPI
Ec2Metadata *ec2metadata.EC2Metadata
Parameters map[string]interface{}
}
Expand Down Expand Up @@ -246,10 +249,9 @@ func GetScalingConfigName(group *autoscaling.Group) string {
}

func GetInstanceTypeNetworkInfo(instanceTypes []*ec2.InstanceTypeInfo, instanceType string) *ec2.NetworkInfo {
for _, instanceTypeInfo := range instanceTypes {
if aws.StringValue(instanceTypeInfo.InstanceType) == instanceType {
return instanceTypeInfo.NetworkInfo
}
i := GetInstanceTypeInfo(instanceTypes, instanceType)
if i != nil {
return i.NetworkInfo
}
return nil
}
Expand All @@ -262,3 +264,11 @@ func GetInstanceTypeInfo(instanceTypes []*ec2.InstanceTypeInfo, instanceType str
}
return nil
}

func GetInstanceTypeArchitectures(instanceTypes []*ec2.InstanceTypeInfo, instanceType string) []string {
i := GetInstanceTypeInfo(instanceTypes, instanceType)
if i != nil {
return aws.StringValueSlice((*i).ProcessorInfo.SupportedArchitectures)
}
return nil
}
72 changes: 72 additions & 0 deletions controllers/providers/aws/ssm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package aws

import (
"fmt"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/request"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/ssm"
"github.com/aws/aws-sdk-go/service/ssm/ssmiface"
"github.com/keikoproj/aws-sdk-go-cache/cache"
"github.com/keikoproj/instance-manager/controllers/common"
)

type architectureMap map[string]string

const (
EksOptimisedAmiPath = "/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id"
EksOptimisedAmazonLinux2Arm64 = "/aws/service/eks/optimized-ami/%s/amazon-linux-2-arm64/recommended/image_id"
EksOptimisedBottlerocket = "/aws/service/bottlerocket/aws-k8s-%s/x86_64/latest/image_id"
EksOptimisedBottlerocketArm64 = "/aws/service/bottlerocket/aws-k8s-%s/arm64/latest/image_id"
EksOptimisedWindowsCore = "/aws/service/ami-windows-latest/Windows_Server-2019-English-Core-EKS_Optimized-%s/image_id"
EksOptimisedWindowsFull = "/aws/service/ami-windows-latest/Windows_Server-2019-English-Full-EKS_Optimized-%s/image_id"
)

var (
EksAmis = map[string]architectureMap{
"amazonlinux2": architectureMap{
"x86_64": EksOptimisedAmiPath,
"arm64": EksOptimisedAmazonLinux2Arm64,
},
"bottlerocket": architectureMap{
"x86_64": EksOptimisedBottlerocket,
"arm64": EksOptimisedBottlerocketArm64,
},
"windows": architectureMap{
"x86_64": EksOptimisedWindowsCore,
},
}
)

func GetAwsSsmClient(region string, cacheCfg *cache.Config, maxRetries int, collector *common.MetricsCollector) ssmiface.SSMAPI {
config := aws.NewConfig().WithRegion(region).WithCredentialsChainVerboseErrors(true)
config = request.WithRetryer(config, NewRetryLogger(maxRetries, collector))
sess, err := session.NewSession(config)
if err != nil {
panic(err)
}
cache.AddCaching(sess, cacheCfg)
cacheCfg.SetCacheTTL("ssm", "GetParameter", GetParameterTTL)
sess.Handlers.Complete.PushFront(func(r *request.Request) {
ctx := r.HTTPRequest.Context()
log.V(1).Info("AWS API call",
"cacheHit", cache.IsCacheHit(ctx),
"service", r.ClientInfo.ServiceName,
"operation", r.Operation.Name,
)
})
return ssm.New(sess)
}

func (w *AwsWorker) GetEksLatestAmi(OSFamily string, arch string, kubernetesVersion string) (string, error) {
input := &ssm.GetParameterInput{
Name: aws.String(fmt.Sprintf(EksAmis[OSFamily][arch], kubernetesVersion)),
}

output, err := w.SsmClient.GetParameter(input)
if err != nil {
return "", err
}
return aws.StringValue(output.Parameter.Value), nil
}
2 changes: 1 addition & 1 deletion controllers/providers/kubernetes/crd.go
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ func GetResources(kube dynamic.Interface, instanceGroup *v1alpha1.InstanceGroup,

annotations := ru.GetAnnotations()

if HasAnnotation(annotations, OwnershipAnnotationKey, OwnershipAnnotationValue) && HasAnnotation(annotations, ScopeAnnotationKey, status.GetActiveScalingGroupName()) {
if HasAnnotationWithValue(annotations, OwnershipAnnotationKey, OwnershipAnnotationValue) && HasAnnotationWithValue(annotations, ScopeAnnotationKey, status.GetActiveScalingGroupName()) {
if IsPathValue(ru, statusJSONPath, completedStatus) || IsPathValue(ru, statusJSONPath, errorStatus) {
// if resource is not completed and not failed, it must be still active
inactiveResources = append(inactiveResources, ru)
Expand Down
9 changes: 8 additions & 1 deletion controllers/providers/kubernetes/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,14 @@ func AddAnnotation(u *unstructured.Unstructured, key, value string) {
u.SetAnnotations(annotations)
}

func HasAnnotation(annotations map[string]string, key, value string) bool {
func HasAnnotation(annotations map[string]string, key string) bool {
if _, ok := annotations[key]; ok {
return true
}
return false
}

func HasAnnotationWithValue(annotations map[string]string, key, value string) bool {
if val, ok := annotations[key]; ok {
if strings.EqualFold(val, value) {
return true
Expand Down
Loading

0 comments on commit 165095e

Please sign in to comment.