Skip to content

Commit

Permalink
Backport cluster critical fargate profiles + karpenter module (#352)
Browse files Browse the repository at this point in the history
* Backport fargate profile for cluster-critical addons

This feature is backported to 1.24 to ease the upgrade process
to 1.25

By upgrading to the latest 1.24 module version we can add the
fargate profile before the cluster critical autoscaling group is
removed.

* Backport karpenter module to 1.24

Allows karpenter to be installed or updated before
upgrading a cluster to 1.25
  • Loading branch information
errm committed Oct 30, 2023
1 parent 11c3125 commit a2bc4fe
Show file tree
Hide file tree
Showing 12 changed files with 532 additions and 8 deletions.
44 changes: 44 additions & 0 deletions modules/cluster/fargate.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Fargate profile for the cluster-critical namespaces listed in
# var.fargate_namespaces. It is placed in the private subnets taken from
# var.vpc_config and uses the dedicated Fargate pod execution role.
resource "aws_eks_fargate_profile" "critical_pods" {
  fargate_profile_name   = "${var.name}-critical-pods"
  cluster_name           = aws_eks_cluster.control_plane.name
  pod_execution_role_arn = aws_iam_role.fargate.arn
  subnet_ids             = values(var.vpc_config.private_subnet_ids)

  # One selector per namespace; the label map is left empty, so no label
  # filter is configured on the selector.
  dynamic "selector" {
    for_each = var.fargate_namespaces
    iterator = ns

    content {
      namespace = ns.value
      labels    = {}
    }
  }
}

# IAM role handed to the Fargate profile as its pod execution role.
# The name is "<iam_role_name_prefix>Fargate-<cluster name>".
resource "aws_iam_role" "fargate" {
  name               = "${var.iam_role_name_prefix}Fargate-${var.name}"
  description        = "Fargate execution role for pods on ${var.name} eks cluster"
  assume_role_policy = data.aws_iam_policy_document.fargate_assume_role_policy.json
}

# Trust policy for the Fargate pod execution role.
#
# Only the EKS Fargate service principal may assume the role, and only on
# behalf of Fargate profiles that belong to this cluster. The aws:SourceArn
# condition is the confused-deputy guard AWS recommends for pod execution
# roles: without it, the service principal could be made to assume this role
# for a Fargate profile of any cluster.
data "aws_iam_policy_document" "fargate_assume_role_policy" {
  statement {
    effect  = "Allow"
    actions = ["sts:AssumeRole"]

    principals {
      type        = "Service"
      identifiers = ["eks-fargate-pods.amazonaws.com"]
    }

    # The cluster ARN ends in ":cluster/<name>". Swapping that segment for
    # ":fargateprofile/" yields the ARN prefix shared by every Fargate profile
    # of this cluster, so no separate partition / region / account lookups are
    # needed.
    condition {
      test     = "ArnLike"
      variable = "aws:SourceArn"
      values   = ["${replace(aws_eks_cluster.control_plane.arn, ":cluster/", ":fargateprofile/")}/*"]
    }
  }
}

# Attaches the AWS-managed policies listed below to the Fargate pod execution
# role, one attachment per policy.
resource "aws_iam_role_policy_attachment" "fargate_managed_policies" {
  # A set is used so each attachment is keyed by its policy ARN.
  for_each = toset([
    "arn:aws:iam::aws:policy/AmazonEKSFargatePodExecutionRolePolicy",
    "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy",
  ])

  # For a set, each.key and each.value are the same string.
  policy_arn = each.key
  role       = aws_iam_role.fargate.id
}

19 changes: 11 additions & 8 deletions modules/cluster/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
# Collects the cluster attributes that other modules need into one object,
# local.config.
#
# The karpenter module in this change reads `name`, `arn` and
# `iam_role_name_prefix` from its `cluster_config` input — presumably this
# object, passed as `module.cluster.config` as in the karpenter README example.
# The output definition itself is outside this view; confirm there.
#
# NOTE(review): the keys below appear twice because this hunk shows both the
# removed lines (old alignment) and the added lines of the diff. The second,
# longer group is the post-change state and adds `arn`,
# `iam_role_name_prefix` and `fargate_execution_role_arn`.
locals {
config = {
name = aws_eks_cluster.control_plane.name
endpoint = aws_eks_cluster.control_plane.endpoint
ca_data = aws_eks_cluster.control_plane.certificate_authority[0].data
vpc_id = var.vpc_config.vpc_id
private_subnet_ids = var.vpc_config.private_subnet_ids
node_security_group = aws_eks_cluster.control_plane.vpc_config.0.cluster_security_group_id
node_instance_profile = var.iam_config.node_role
tags = var.tags
name = aws_eks_cluster.control_plane.name
endpoint = aws_eks_cluster.control_plane.endpoint
arn = aws_eks_cluster.control_plane.arn
ca_data = aws_eks_cluster.control_plane.certificate_authority[0].data
vpc_id = var.vpc_config.vpc_id
private_subnet_ids = var.vpc_config.private_subnet_ids
node_security_group = aws_eks_cluster.control_plane.vpc_config.0.cluster_security_group_id
node_instance_profile = var.iam_config.node_role
tags = var.tags
iam_role_name_prefix = var.iam_role_name_prefix
fargate_execution_role_arn = aws_iam_role.fargate.arn
}
}

Expand Down
6 changes: 6 additions & 0 deletions modules/cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,9 @@ variable "security_group_ids" {
default = []
description = "A list of security group IDs for the cross-account elastic network interfaces that Amazon EKS creates to use to allow communication with the Kubernetes control plane. *WARNING* changes to this list will cause the cluster to be recreated."
}

# Namespaces that get a selector in the critical-pods Fargate profile.
# Declared as a set of strings; defaults to kube-system and flux-system.
variable "fargate_namespaces" {
  description = "A list of namespaces to create fargate profiles for, should be set to a list of namespaces critical for flux / cluster bootstrapping"
  type        = set(string)
  default     = ["kube-system", "flux-system"]
}
51 changes: 51 additions & 0 deletions modules/karpenter/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Karpenter

This module configures the resources required to run the
karpenter node-provisioning tool in an eks cluster.

* Fargate Profile - to run karpenter
* IAM roles for the fargate controller and nodes to be provisioned by karpenter
* SQS queue to provide events (spot interruption etc) to karpenter

It does not install karpenter itself to the cluster - and we recommend
that you use helm as per the [karpenter documentation](https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/#4-install-karpenter)

It is provided as a submodule so the core module is less opinionated.

However we test the core module and the karpenter module
in our test suite to ensure that the different components we use in our
clusters at cookpad integrate correctly.


## Example

You should pass cluster and oidc config from the cluster to the karpenter module.

You will also need to add the IAM role of nodes created by karpenter to the aws_auth_role_map
so they can connect to the cluster.

```hcl
module "cluster" {
source = "cookpad/eks/aws"
name = "hal-9000"
vpc_config = module.vpc.config
aws_auth_role_map = [
{
username = "system:node:{{EC2PrivateDNSName}}"
rolearn = module.karpenter.node_role_arn
groups = [
"system:bootstrappers",
"system:nodes",
]
},
]
}
module "karpenter" {
source = "cookpad/eks/aws//modules/karpenter"
cluster_config = module.cluster.config
oidc_config = module.cluster.oidc_config
}
```
263 changes: 263 additions & 0 deletions modules/karpenter/controller_iam.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
# IAM role for the karpenter controller.
# The name is "<iam_role_name_prefix>Karpenter-<cluster name>", both taken
# from var.cluster_config.
resource "aws_iam_role" "karpenter_controller" {
  name               = "${var.cluster_config.iam_role_name_prefix}Karpenter-${var.cluster_config.name}"
  description        = "Karpenter controller role for ${var.cluster_config.name} cluster"
  assume_role_policy = data.aws_iam_policy_document.karpenter_controller_assume_role_policy.json
}

# Trust policy for the karpenter controller role.
#
# Allows sts:AssumeRoleWithWebIdentity through the OIDC provider in
# var.oidc_config, restricted to the `karpenter` service account in the
# `karpenter` namespace and to tokens whose audience is sts.amazonaws.com.
data "aws_iam_policy_document" "karpenter_controller_assume_role_policy" {
  statement {
    effect  = "Allow"
    actions = ["sts:AssumeRoleWithWebIdentity"]

    principals {
      type        = "Federated"
      identifiers = [var.oidc_config.arn]
    }

    # Condition keys are the OIDC issuer URL with the "https://" scheme
    # removed, followed by ":sub" / ":aud".
    condition {
      test     = "StringEquals"
      variable = "${replace(var.oidc_config.url, "https://", "")}:sub"
      values   = ["system:serviceaccount:karpenter:karpenter"]
    }

    condition {
      test     = "StringEquals"
      variable = "${replace(var.oidc_config.url, "https://", "")}:aud"
      values   = ["sts.amazonaws.com"]
    }
  }
}

# Inline policy "KarpenterController" on the karpenter controller role; the
# document is rendered by data.aws_iam_policy_document.karpenter_controller.
resource "aws_iam_role_policy" "karpenter_controller" {
  role   = aws_iam_role.karpenter_controller.id
  name   = "KarpenterController"
  policy = data.aws_iam_policy_document.karpenter_controller.json
}

# Permissions policy for the karpenter controller role.
#
# Every ARN is built from the current partition and region
# (data.aws_partition.current / data.aws_region.current), so the EC2 and SSM
# statements are limited to the region the module is applied in.
# NOTE(review): the statement set appears to mirror the controller policy
# published with the karpenter getting-started guide — confirm against the
# karpenter version being deployed before changing any statement.
data "aws_iam_policy_document" "karpenter_controller" {
# RunInstances / CreateFleet against pre-existing inputs of a launch: images,
# snapshots, spot requests, security groups, subnets and launch templates.
# No tag conditions are applied to these resource types.
statement {
sid = "AllowScopedEC2InstanceActions"
effect = "Allow"

# tfsec:ignore:aws-iam-no-policy-wildcards
resources = [
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}::image/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}::snapshot/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:spot-instances-request/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:security-group/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:subnet/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:launch-template/*",
]

actions = [
"ec2:RunInstances",
"ec2:CreateFleet",
]
}

# Creation of fleets, instances, volumes, network interfaces and launch
# templates. Allowed only when the request tags the resource with
# kubernetes.io/cluster/<cluster name> = "owned" and supplies some value for
# karpenter.sh/provisioner-name.
statement {
sid = "AllowScopedEC2InstanceActionsWithTags"
effect = "Allow"

# tfsec:ignore:aws-iam-no-policy-wildcards
resources = [
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:fleet/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:instance/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:volume/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:network-interface/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:launch-template/*",
]

actions = [
"ec2:RunInstances",
"ec2:CreateFleet",
"ec2:CreateLaunchTemplate",
]

condition {
test = "StringEquals"
variable = "aws:RequestTag/kubernetes.io/cluster/${var.cluster_config.name}"
values = ["owned"]
}

condition {
test = "StringLike"
variable = "aws:RequestTag/karpenter.sh/provisioner-name"
values = ["*"]
}
}

# ec2:CreateTags on the same resource types, but only as part of one of the
# creating calls listed under ec2:CreateAction and with the same cluster /
# provisioner-name request tags as the statement above.
statement {
sid = "AllowScopedResourceCreationTagging"
effect = "Allow"

# tfsec:ignore:aws-iam-no-policy-wildcards
resources = [
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:fleet/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:instance/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:volume/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:network-interface/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:launch-template/*",
]

actions = ["ec2:CreateTags"]

condition {
test = "StringEquals"
variable = "aws:RequestTag/kubernetes.io/cluster/${var.cluster_config.name}"
values = ["owned"]
}

condition {
test = "StringEquals"
variable = "ec2:CreateAction"

values = [
"RunInstances",
"CreateFleet",
"CreateLaunchTemplate",
]
}

condition {
test = "StringLike"
variable = "aws:RequestTag/karpenter.sh/provisioner-name"
values = ["*"]
}
}

# ec2:CreateTags on existing instances that already carry the cluster "owned"
# tag. The request must set karpenter.sh/managed-by to the cluster name and
# may only contain the two tag keys listed under aws:TagKeys.
statement {
sid = "AllowMachineMigrationTagging"
effect = "Allow"
# tfsec:ignore:aws-iam-no-policy-wildcards
resources = ["arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:instance/*"]
actions = ["ec2:CreateTags"]

condition {
test = "StringEquals"
variable = "aws:ResourceTag/kubernetes.io/cluster/${var.cluster_config.name}"
values = ["owned"]
}

condition {
test = "StringEquals"
variable = "aws:RequestTag/karpenter.sh/managed-by"
values = [var.cluster_config.name]
}

condition {
test = "StringLike"
variable = "aws:RequestTag/karpenter.sh/provisioner-name"
values = ["*"]
}

condition {
test = "ForAllValues:StringEquals"
variable = "aws:TagKeys"

values = [
"karpenter.sh/provisioner-name",
"karpenter.sh/managed-by",
]
}
}

# Termination of instances and deletion of launch templates, limited to
# resources tagged as owned by this cluster and carrying a
# karpenter.sh/provisioner-name tag.
statement {
sid = "AllowScopedDeletion"
effect = "Allow"

# tfsec:ignore:aws-iam-no-policy-wildcards
resources = [
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:instance/*",
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.current.name}:*:launch-template/*",
]

actions = [
"ec2:TerminateInstances",
"ec2:DeleteLaunchTemplate",
]

condition {
test = "StringEquals"
variable = "aws:ResourceTag/kubernetes.io/cluster/${var.cluster_config.name}"
values = ["owned"]
}

condition {
test = "StringLike"
variable = "aws:ResourceTag/karpenter.sh/provisioner-name"
values = ["*"]
}
}

# Read-only EC2 Describe* calls on all resources, restricted to requests made
# against the current region via aws:RequestedRegion.
statement {
sid = "AllowRegionalReadActions"
effect = "Allow"
resources = ["*"]

actions = [
"ec2:DescribeAvailabilityZones",
"ec2:DescribeImages",
"ec2:DescribeInstances",
"ec2:DescribeInstanceTypeOfferings",
"ec2:DescribeInstanceTypes",
"ec2:DescribeLaunchTemplates",
"ec2:DescribeSecurityGroups",
"ec2:DescribeSpotPriceHistory",
"ec2:DescribeSubnets",
]

condition {
test = "StringEquals"
variable = "aws:RequestedRegion"
values = [data.aws_region.current.name]
}
}

# Read access to SSM parameters under /aws/service/ in the current region.
statement {
sid = "AllowSSMReadActions"
effect = "Allow"
resources = ["arn:${data.aws_partition.current.partition}:ssm:${data.aws_region.current.name}::parameter/aws/service/*"]
actions = ["ssm:GetParameter"]
}

# pricing:GetProducts on all resources, with no conditions.
statement {
sid = "AllowPricingReadActions"
effect = "Allow"
resources = ["*"]
actions = ["pricing:GetProducts"]
}

# Receive / delete messages and read attributes on the interruption queue
# (aws_sqs_queue.karpenter_interruption, defined outside this file).
statement {
sid = "AllowInterruptionQueueActions"
effect = "Allow"
resources = [aws_sqs_queue.karpenter_interruption.arn]

actions = [
"sqs:DeleteMessage",
"sqs:GetQueueAttributes",
"sqs:GetQueueUrl",
"sqs:ReceiveMessage",
]
}

# iam:PassRole for the karpenter node role only (aws_iam_role.karpenter_node,
# defined outside this file), and only when it is passed to EC2.
statement {
sid = "AllowPassingInstanceRole"
effect = "Allow"
resources = [aws_iam_role.karpenter_node.arn]
actions = ["iam:PassRole"]

condition {
test = "StringEquals"
variable = "iam:PassedToService"
values = ["ec2.amazonaws.com"]
}
}

# eks:DescribeCluster on this cluster's ARN only.
statement {
sid = "AllowAPIServerEndpointDiscovery"
effect = "Allow"
resources = [var.cluster_config.arn]
actions = ["eks:DescribeCluster"]
}
}
3 changes: 3 additions & 0 deletions modules/karpenter/data.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Caller identity of the configured AWS provider.
# NOTE(review): not referenced in the files visible here — presumably used
# elsewhere in the module; confirm.
data "aws_caller_identity" "current" {
}

# Partition of the configured provider; used to build the ARNs in the
# karpenter controller policy.
data "aws_partition" "current" {
}

# Region of the configured provider; used to scope the ARNs and the
# aws:RequestedRegion condition in the karpenter controller policy.
data "aws_region" "current" {
}
Loading

0 comments on commit a2bc4fe

Please sign in to comment.