Merge pull request #7 from edenlabllc/release/v1.22.0
Release/v1.22.0
anovikov-el authored Sep 11, 2024
2 parents c3b4611 + 867f96e commit 3e03e86
Showing 15 changed files with 358 additions and 239 deletions.
25 changes: 18 additions & 7 deletions README.md
@@ -112,17 +112,17 @@ and can be considered as some kind of one-time "migrations".

> It is recommended to investigate the scripts' logic before applying them to a K8S cluster.
#### Upgrading to EKS 1.27
#### Requirements

The scripts support upgrading K8S from a minimum version of `1.23` to `1.27`.

**Requirements:**

* [RMK](https://github.com/edenlabllc/rmk) >= v0.41.0
* [RMK](https://github.com/edenlabllc/rmk) >= v0.44.2
* [AWS CLI](https://aws.amazon.com/cli/) >= 2.9
* [eksctl](https://eksctl.io/) >= v0.160.0
* [eksctl](https://eksctl.io/) >= v0.190.0
* [yq](https://mikefarah.gitbook.io/yq) >= v4.35.2
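
A quick pre-flight check of the versions listed above; this is a sketch, and the `--version` flag for RMK is assumed rather than taken from its documentation:

```bash
rmk --version    # assumed flag; adjust to however your RMK build reports its version
aws --version
eksctl version
yq --version
```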

#### Upgrading EKS from 1.23 to 1.27

The scripts support upgrading K8S from a minimum version of `1.23` to `1.27`.

> The current upgrade covers 4 minor versions, therefore the logic is complex. For the next versions,
> it can be simplified greatly when upgrading to the closest version only, e.g. from `1.27` to `1.28`.
@@ -176,3 +176,14 @@ configs:
inject: disabled
# ...
```

#### Upgrading EKS from 1.27 to 1.29

The scripts support upgrading K8S from a minimum version of `1.27` to `1.29`.

The list of scripts (an invocation sketch follows this list):
- [upgrade-all.sh](bin/k8s-upgrade/1.29/upgrade-all.sh) - Initialize the [RMK](https://github.com/edenlabllc/rmk) configuration, then call the rest of the scripts one by one (the main upgrade script).
- [upgrade-releases.sh](bin/k8s-upgrade/1.29/upgrade-releases.sh) - Upgrade all releases. The following subscripts are executed:
- [upgrade-ebs-csi-snapshot-scheduler.sh](bin/k8s-upgrade/1.29/upgrade-ebs-csi-snapshot-scheduler.sh) - Upgrade [EBS CSI snapshot scheduler](https://backube.github.io/snapscheduler/) to the latest version.
- [upgrade-cluster.sh](bin/k8s-upgrade/1.29/upgrade-cluster.sh) - Upgrade the K8S control plane and system worker node components (1 K8S version per iteration).
- [upgrade-nodes.sh](bin/k8s-upgrade/1.29/upgrade-nodes.sh) - Rolling-update all the K8S worker nodes.
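
A typical end-to-end invocation of the main script (referenced above); the working directory (the repository root) and the log file name are assumptions for illustration:

```bash
# Run the full 1.27 -> 1.29 upgrade from the repository root,
# keeping a log of the whole run for later inspection.
./bin/k8s-upgrade/1.29/upgrade-all.sh 2>&1 | tee k8s-upgrade-1.29.log
```
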
2 changes: 2 additions & 0 deletions bin/dagster-presync-hook.sh
@@ -1,5 +1,7 @@
#!/usr/bin/env bash

# DEPRECATED: removed in favour of secrets-sync-operator

set -e

NAMESPACE=${1:-dagster}
42 changes: 1 addition & 41 deletions bin/k8s-upgrade/1.27/run-tests.sh
@@ -2,44 +2,4 @@

set -e

export PATH="${HOME}/.local/bin:${PATH}"

# Note: In future, fhir-postgres, elt-postgres might be added.

readonly POSTGRES_NAMESPACE="postgres"
readonly POSTGRES_RELEASE_NAME="postgres"

# Example output:
#- Cluster: postgres-cluster
# Host: 10.1.2.38
# Member: postgres-cluster-0
# Role: Leader
# State: running
# TL: 7
#- Cluster: postgres-cluster
# Host: 10.1.6.248
# Lag in MB: 0
# Member: postgres-cluster-1
# Role: Sync Standby
# State: running
# TL: 7
echo "Showing information about Patroni cluster and its members of ${POSTGRES_RELEASE_NAME}..."
readonly POSTGRES_CLUSTER_LIST="$(kubectl -n "${POSTGRES_NAMESPACE}" exec -it -c postgres "${POSTGRES_RELEASE_NAME}-cluster-0" -- patronictl list -f yaml)"
echo "${POSTGRES_CLUSTER_LIST}"

echo "Checking all the members are running..."
if [[ "$(echo "${POSTGRES_CLUSTER_LIST}" | yq '([.[] | select(.State == "running")] | length) == (. | length)')" == "true" ]]; then
echo "OK."
else
>&2 echo "ERROR: Not all the members are running."
exit 1
fi

echo "Checking all the members have correct roles..."
if [[ "$(echo "${POSTGRES_CLUSTER_LIST}" | yq '([.[] | select(.Role == "Leader")] | length) == 1')" == "true" ]] \
&& [[ "$(echo "${POSTGRES_CLUSTER_LIST}" | yq '([.[] | select(.Role == "Sync Standby")] | length) == 1')" == "true" ]]; then
echo "OK."
else
>&2 echo "ERROR: The roles are not \"Leader\" and \"Sync Standby\"."
exit 1
fi
"$(dirname "${BASH_SOURCE}")/../run-tests.sh"
49 changes: 1 addition & 48 deletions bin/k8s-upgrade/1.27/upgrade-cluster.sh
@@ -2,54 +2,7 @@

set -e

export PATH="${HOME}/.local/bin:${PATH}"

readonly NAME="$(rmk -ll error config view | yq '.name')"
CLUSTER_NAME="$(rmk -ll error config view | yq '.exported-vars.env.CLUSTER_NAME')"
if [[ "${CLUSTER_NAME}" == "null" ]]; then
CLUSTER_NAME="${NAME}-eks"
fi
CURRENT_CLUSTER_VERSION="$(eksctl get cluster --name "${CLUSTER_NAME}" -o yaml | yq '.[0].Version')"

export AWS_PROFILE="$(rmk -ll error config view | yq '.aws.profile')"
export AWS_CONFIG_FILE="${HOME}/.aws/config_${AWS_PROFILE}"
export AWS_SHARED_CREDENTIALS_FILE="${HOME}/.aws/credentials_${AWS_PROFILE}"

readonly NAMESPACE="kube-system"
readonly KUBE_PROXY_RELEASE_NAME="kube-proxy"
readonly COREDNS_RELEASE_NAME="coredns"

# https://docs.aws.amazon.com/eks/latest/userguide/managing-kube-proxy.html
KUBE_PROXY_IMAGE_PREFIX="$(kubectl -n "${NAMESPACE}" get daemonset "${KUBE_PROXY_RELEASE_NAME}" -o yaml | yq '.spec.template.spec.containers[0].image')"
KUBE_PROXY_IMAGE_PREFIX="${KUBE_PROXY_IMAGE_PREFIX%:*}"
# https://docs.aws.amazon.com/eks/latest/userguide/managing-coredns.html
COREDNS_IMAGE_PREFIX="$(kubectl -n "${NAMESPACE}" get deployment "${COREDNS_RELEASE_NAME}" -o yaml | yq '.spec.template.spec.containers[0].image')"
COREDNS_IMAGE_PREFIX="${COREDNS_IMAGE_PREFIX%:*}"

# https://docs.aws.amazon.com/eks/latest/userguide/update-cluster.html
# https://docs.aws.amazon.com/eks/latest/userguide/kubernetes-versions.html
function upgrade_cluster() {
local DESIRED_CLUSTER_VERSION="${1}"
local KUBE_PROXY_VERSION="${2}"
local COREDNS_VERSION="${3}"

echo
echo "Current cluster version: ${CURRENT_CLUSTER_VERSION}"
echo "Desired cluster version: ${DESIRED_CLUSTER_VERSION}"
if [[ "${CURRENT_CLUSTER_VERSION//./,}" -ge "${DESIRED_CLUSTER_VERSION//./,}" ]]; then
echo "No control plane upgrade needed."
else
eksctl upgrade cluster --name "${CLUSTER_NAME}" --version "${DESIRED_CLUSTER_VERSION}" --approve
CURRENT_CLUSTER_VERSION="${DESIRED_CLUSTER_VERSION}"
fi

if [[ "${CURRENT_CLUSTER_VERSION//./,}" -eq "${DESIRED_CLUSTER_VERSION//./,}" ]]; then
kubectl -n "${NAMESPACE}" set image daemonset "${KUBE_PROXY_RELEASE_NAME}" kube-proxy="${KUBE_PROXY_IMAGE_PREFIX}:${KUBE_PROXY_VERSION}"
kubectl -n "${NAMESPACE}" rollout status daemonset "${KUBE_PROXY_RELEASE_NAME}"
kubectl -n "${NAMESPACE}" set image deployment "${COREDNS_RELEASE_NAME}" coredns="${COREDNS_IMAGE_PREFIX}:${COREDNS_VERSION}"
kubectl -n "${NAMESPACE}" rollout status deployment "${COREDNS_RELEASE_NAME}"
fi
}
source "$(dirname "${BASH_SOURCE}")/../upgrade-cluster.sh"

echo "Upgrading K8S cluster iteratively..."
upgrade_cluster "1.24" "v1.24.17-minimal-eksbuild.2" "v1.9.3-eksbuild.7"
144 changes: 1 addition & 143 deletions bin/k8s-upgrade/1.27/upgrade-nodes.sh
@@ -2,146 +2,4 @@

set -e

# optional argument
# e.g. postgres|minio
# find all possible node group names in etc/**/worker-groups.auto.tfvars of a tenant repository
NODE_GROUP_NAME="${1}"

export PATH="${HOME}/.local/bin:${PATH}"

# disable client-side pager
export AWS_PAGER=
export AWS_PROFILE="$(rmk --log-level error config view | yq '.aws.profile')"
export AWS_CONFIG_FILE="${HOME}/.aws/config_${AWS_PROFILE}"
export AWS_SHARED_CREDENTIALS_FILE="${HOME}/.aws/credentials_${AWS_PROFILE}"

readonly NAME="$(rmk --log-level error config view | yq '.name')"
CLUSTER_NAME="$(rmk --log-level error config view | yq '.exported-vars.env.CLUSTER_NAME')"
if [[ "${CLUSTER_NAME}" == "null" ]]; then
CLUSTER_NAME="${NAME}-eks"
fi

NODE_GROUP_FILTER=""
if [[ -n "${NODE_GROUP_NAME}" ]]; then
NODE_GROUP_FILTER="Name=tag-value,Values=${CLUSTER_NAME}-${NODE_GROUP_NAME}-eks_asg"
fi

ASG_TAGS=($(aws autoscaling describe-auto-scaling-groups \
--filters "Name=tag-key,Values=kubernetes.io/cluster/${CLUSTER_NAME}" ${NODE_GROUP_FILTER} \
--output yaml | yq '.AutoScalingGroups[].Tags[] | select(.Key == "Name") | .Value'))
ASG_NAMES=()

if [[ ${#ASG_TAGS[@]} -eq 0 ]]; then
>&2 echo "ERROR: No autoscaling group found."
exit 1
fi

echo "Rolling-updating nodes..."

for ASG_TAG in ${ASG_TAGS[@]}; do
ASG_NAME="$(aws autoscaling describe-auto-scaling-groups \
--filters "Name=tag-value,Values=${ASG_TAG}" \
--query 'AutoScalingGroups[0].AutoScalingGroupName' \
--output text
)"
ASG_NAMES+=("${ASG_NAME}")
# nodes with STS/PVC/PV need up to 10 minutes or more to warm up/check health and mount devices
ASG_UPDATE_TIMEOUT_SECONDS=600

# remove prefix and suffix from ASG tag to get node group name
NODE_GROUP_NAME="${ASG_TAG#${CLUSTER_NAME}-}"
NODE_GROUP_NAME="${NODE_GROUP_NAME%-eks_asg}"
IS_NODE_GROUP_STATEFUL="true"
PVC_LABELS="";
case "${NODE_GROUP_NAME}" in
"clickhouse") PVC_LABELS="clickhouse.altinity.com/chi=clickhouse" ;;
"elt-postgres") PVC_LABELS="cluster-name=elt-postgres-cluster" ;;
"es") PVC_LABELS="elasticsearch.k8s.elastic.co/cluster-name=elastic" ;;
"es-jaeger") PVC_LABELS="elasticsearch.k8s.elastic.co/cluster-name=elastic-jaeger" ;;
"fhir-postgres") PVC_LABELS="cluster-name=fhir-postgres-cluster" ;;
"kafka") PVC_LABELS="app.kubernetes.io/instance=kafka" ;;
"loki-stack") PVC_LABELS="release=loki-stack" ;;
"minio") PVC_LABELS="release=minio" ;;
"mongodb") PVC_LABELS="app.kubernetes.io/instance=mongodb" ;;
"postgres") PVC_LABELS="cluster-name=postgres-cluster" ;;
"redis") PVC_LABELS="app.kubernetes.io/instance=redis" ;;
*) IS_NODE_GROUP_STATEFUL="false"; ASG_UPDATE_TIMEOUT_SECONDS=60 ;;
esac

echo
echo "Node group name: ${NODE_GROUP_NAME}"
echo "Stateful: ${IS_NODE_GROUP_STATEFUL}"
echo "ASG tag: ${ASG_TAG}"
echo "ASG name: ${ASG_NAME}"
echo "ASG update timeout: ${ASG_UPDATE_TIMEOUT_SECONDS}s"

if [[ "${IS_NODE_GROUP_STATEFUL}" == "true" && "${PVC_LABELS}" != "" ]]; then
echo "PVC labels: ${PVC_LABELS}"

PV_NAMES="$(kubectl get pvc --all-namespaces -l "${PVC_LABELS}" -o yaml | yq '.items[].spec.volumeName')"
echo "PV names: ${PV_NAMES}"

# adding pv-dummy to return list of items even for cases when we have only 1 PV found
ASG_AZS="$(kubectl get pv pv-dummy ${PV_NAMES} --ignore-not-found -o yaml | yq '.items[].spec.nodeAffinity.required.nodeSelectorTerms[0].matchExpressions[0].values[0]' | sort | uniq)"
echo "ASG availability zones: ${ASG_AZS}"

ASG_SUBNETS=""
for ASG_AZ in ${ASG_AZS}; do
echo "Getting private subnet for ${ASG_AZ}..."
ASG_SUBNET="$(aws ec2 describe-subnets --filters "Name=tag-value,Values=${NAME}-vpc-private-${ASG_AZ}" --output yaml | yq '.Subnets[0].SubnetId')"
echo "Subnet ID: ${ASG_SUBNET}"
ASG_SUBNETS="${ASG_SUBNETS} ${ASG_SUBNET}"
done
echo "ASG subnets: ${ASG_SUBNETS}"

aws autoscaling update-auto-scaling-group --auto-scaling-group-name "${ASG_NAME}" \
--availability-zones ${ASG_AZS} \
--vpc-zone-identifier "${ASG_SUBNETS// /,}" \
--default-cooldown ${ASG_UPDATE_TIMEOUT_SECONDS} \
--default-instance-warmup ${ASG_UPDATE_TIMEOUT_SECONDS} \
--health-check-grace-period ${ASG_UPDATE_TIMEOUT_SECONDS} || true
else
echo "No ASG AZ update needed for stateless node group."
fi

# rolling-update node group OR skip in case it is being updated already
echo "Starting instance refresh..."
aws autoscaling start-instance-refresh --auto-scaling-group-name "${ASG_NAME}" || true
done

echo
echo "Checking instance refresh status.."
while true; do
IN_PROGRESS_ASG_COUNT="${#ASG_NAMES[@]}"
for ASG_NAME in ${ASG_NAMES[@]}; do
ASG_INSTANCE_REFRESH="$(aws autoscaling describe-instance-refreshes \
--auto-scaling-group-name "${ASG_NAME}" \
--max-records 1 \
--output yaml | yq '.InstanceRefreshes[0] | select(.Status != "Successful") | .AutoScalingGroupName')"
if [[ -n "${ASG_INSTANCE_REFRESH}" && "${ASG_INSTANCE_REFRESH}" != "null" ]]; then
echo "ASG ${ASG_NAME} in progress..."
else
((IN_PROGRESS_ASG_COUNT--))
fi
done

if [[ "${IN_PROGRESS_ASG_COUNT}" -gt 0 ]]; then
sleep 10
else
break
fi
done
echo "Done."

echo
echo "Fixing pods with a missing linkerd sidecar after the instance refresh..."
PODS_WITH_MISSING_LINKERD_SIDECAR="$(kubectl get pods --all-namespaces -l "!linkerd.io/control-plane-ns" -o yaml | yq '.items[].metadata | select(.annotations["linkerd.io/inject"] == "enabled") | (.namespace + " " + .name)')"
# iterate over lines ignoring spaces
while IFS= read -r NAMESPACE_WITH_POD; do
if [[ -z "${NAMESPACE_WITH_POD}" ]]; then
# no pods found
break
fi
kubectl delete pod --wait=true -n ${NAMESPACE_WITH_POD}
done <<< "${PODS_WITH_MISSING_LINKERD_SIDECAR}"
echo "Done."
"$(dirname "${BASH_SOURCE}")/../upgrade-nodes.sh"
5 changes: 5 additions & 0 deletions bin/k8s-upgrade/1.29/run-tests.sh
@@ -0,0 +1,5 @@
#!/usr/bin/env bash

set -e

"$(dirname "${BASH_SOURCE}")/../run-tests.sh"
22 changes: 22 additions & 0 deletions bin/k8s-upgrade/1.29/upgrade-all.sh
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

set -e

export PATH="${HOME}/.local/bin:${PATH}"

echo "Initializing cluster configuration..."
rmk update
rmk config init
rmk cluster switch -f

echo
"$(dirname "${BASH_SOURCE}")/upgrade-releases.sh"

echo
"$(dirname "${BASH_SOURCE}")/upgrade-cluster.sh"

echo
"$(dirname "${BASH_SOURCE}")/upgrade-nodes.sh"

echo
"$(dirname "${BASH_SOURCE}")/run-tests.sh"
13 changes: 13 additions & 0 deletions bin/k8s-upgrade/1.29/upgrade-cluster.sh
@@ -0,0 +1,13 @@
#!/usr/bin/env bash

set -e

source "$(dirname "${BASH_SOURCE}")/../upgrade-cluster.sh"

echo "Upgrading K8S cluster iteratively..."
upgrade_cluster "1.28" "v1.28.12-eksbuild.2" "v1.10.1-eksbuild.13"
upgrade_cluster "1.29" "v1.29.0-minimal-eksbuild.1" "v1.11.1-eksbuild.4"

echo
echo "Provisioning latest AMI IDs and K8S version..."
rmk cluster provision
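
The kube-proxy and CoreDNS tags passed to `upgrade_cluster` can be cross-checked against the add-on versions AWS publishes for the target K8S version; a sketch using the AWS CLI:

```bash
# List the kube-proxy and coredns add-on versions published for K8S 1.29;
# the image tags passed to upgrade_cluster should be among these.
for ADDON in kube-proxy coredns; do
  aws eks describe-addon-versions \
    --kubernetes-version 1.29 \
    --addon-name "${ADDON}" \
    --query 'addons[0].addonVersions[].addonVersion' \
    --output yaml
done
```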
22 changes: 22 additions & 0 deletions bin/k8s-upgrade/1.29/upgrade-ebs-csi-snapshot-scheduler.sh
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

set -e

export PATH="${HOME}/.local/bin:${PATH}"

readonly NAMESPACE="kube-system"
readonly RELEASE_NAME="ebs-csi-snapshot-scheduler"

readonly CRD_NAME="snapshotschedules.snapscheduler.backube"
readonly CRD_ANNOTATIONS="meta.helm.sh/release-namespace=${NAMESPACE} meta.helm.sh/release-name=${RELEASE_NAME}"
readonly CRD_LABELS="app.kubernetes.io/managed-by=Helm"

echo "Checking whether ${RELEASE_NAME} release installed..."
if [[ "$(rmk --log-level error release list -l "app=${RELEASE_NAME}" --output json | yq '.[0].installed')" != "true" ]]; then
echo "Skipped."
exit
fi

echo "Fixing annotations and labels of ${CRD_NAME} CRD of ${RELEASE_NAME} release..."
kubectl -n "${NAMESPACE}" annotate --overwrite customresourcedefinition "${CRD_NAME}" ${CRD_ANNOTATIONS}
kubectl -n "${NAMESPACE}" label --overwrite customresourcedefinition "${CRD_NAME}" ${CRD_LABELS}
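
A quick way to confirm the CRD adoption took effect after the script runs, sketched with kubectl and yq (not part of the script itself):

```bash
# Show the Helm ownership annotations and labels the script sets on the CRD.
kubectl get customresourcedefinition snapshotschedules.snapscheduler.backube -o yaml \
  | yq '.metadata | {"annotations": .annotations, "labels": .labels}'
```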
5 changes: 5 additions & 0 deletions bin/k8s-upgrade/1.29/upgrade-nodes.sh
@@ -0,0 +1,5 @@
#!/usr/bin/env bash

set -e

"$(dirname "${BASH_SOURCE}")/../upgrade-nodes.sh"
11 changes: 11 additions & 0 deletions bin/k8s-upgrade/1.29/upgrade-releases.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash

set -e

export PATH="${HOME}/.local/bin:${PATH}"

"$(dirname "${BASH_SOURCE}")/upgrade-ebs-csi-snapshot-scheduler.sh"

echo
echo "Synchronizing all releases..."
rmk release sync
File renamed without changes.