Merge pull request #7 from edenlabllc/release/v1.22.0
Release/v1.22.0
anovikov-el authored Sep 11, 2024
2 parents c3b4611 + 867f96e commit 3e03e86
Showing 15 changed files with 358 additions and 239 deletions.
25 changes: 18 additions & 7 deletions README.md
@@ -112,17 +112,17 @@ and can be considered as some kind of one-time "migrations".

> It is recommended to investigate the scripts' logic before applying them to a K8S cluster.
#### Upgrading to EKS 1.27
#### Requirements

The scripts support upgrading K8S from a minimum version of `1.23` to `1.27`.

**Requirements:**

* [RMK](https://github.com/edenlabllc/rmk) >= v0.41.0
* [RMK](https://github.com/edenlabllc/rmk) >= v0.44.2
* [AWS CLI](https://aws.amazon.com/cli/) >= 2.9
* [eksctl](https://eksctl.io/) >= v0.160.0
* [eksctl](https://eksctl.io/) >= v0.190.0
* [yq](https://mikefarah.gitbook.io/yq) >= v4.35.2
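
A quick pre-flight check of the versions listed above; this is a sketch, and the `--version` flag for RMK is assumed rather than taken from its documentation:

```bash
rmk --version    # assumed flag; adjust to however your RMK build reports its version
aws --version
eksctl version
yq --version
```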

#### Upgrading EKS from 1.23 to 1.27

The scripts support upgrading K8S from a minimum version of `1.23` to `1.27`.

> The current upgrade covers 4 minor versions, therefore the logic is complex. For the next versions,
> it can be simplified greatly when upgrading to the closest version only, e.g. from `1.27` to `1.28`.
@@ -176,3 +176,14 @@ configs:
inject: disabled
# ...
```

#### Upgrading EKS from 1.27 to 1.29

The scripts support upgrading K8S from a minimum version of `1.27` to `1.29`.

The list of scripts (an invocation sketch follows this list):
- [upgrade-all.sh](bin/k8s-upgrade/1.29/upgrade-all.sh) - Initialize the [RMK](https://github.com/edenlabllc/rmk) configuration, then call the rest of the scripts one by one (the main upgrade script).
- [upgrade-releases.sh](bin/k8s-upgrade/1.29/upgrade-releases.sh) - Upgrade all releases. The following subscripts are executed:
- [upgrade-ebs-csi-snapshot-scheduler.sh](bin/k8s-upgrade/1.29/upgrade-ebs-csi-snapshot-scheduler.sh) - Upgrade [EBS CSI snapshot scheduler](https://backube.github.io/snapscheduler/) to the latest version.
- [upgrade-cluster.sh](bin/k8s-upgrade/1.29/upgrade-cluster.sh) - Upgrade the K8S control plane and system worker node components (1 K8S version per iteration).
- [upgrade-nodes.sh](bin/k8s-upgrade/1.29/upgrade-nodes.sh) - Rolling-update all the K8S worker nodes.
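
A typical end-to-end invocation of the main script (referenced above); the working directory (the repository root) and the log file name are assumptions for illustration:

```bash
# Run the full 1.27 -> 1.29 upgrade from the repository root,
# keeping a log of the whole run for later inspection.
./bin/k8s-upgrade/1.29/upgrade-all.sh 2>&1 | tee k8s-upgrade-1.29.log
```
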
2 changes: 2 additions & 0 deletions bin/dagster-presync-hook.sh
@@ -1,5 +1,7 @@
#!/usr/bin/env bash

# DEPRECATED: removed in favour of secrets-sync-operator

set -e

NAMESPACE=${1:-dagster}
42 changes: 1 addition & 41 deletions bin/k8s-upgrade/1.27/run-tests.sh
@@ -2,44 +2,4 @@

set -e

export PATH="${HOME}/.local/bin:${PATH}"

# Note: In future, fhir-postgres, elt-postgres might be added.

readonly POSTGRES_NAMESPACE="postgres"
readonly POSTGRES_RELEASE_NAME="postgres"

# Example output:
#- Cluster: postgres-cluster
# Host: 10.1.2.38
# Member: postgres-cluster-0
# Role: Leader
# State: running
# TL: 7
#- Cluster: postgres-cluster
# Host: 10.1.6.248
# Lag in MB: 0
# Member: postgres-cluster-1
# Role: Sync Standby
# State: running
# TL: 7
echo "Showing information about Patroni cluster and its members of ${POSTGRES_RELEASE_NAME}..."
readonly POSTGRES_CLUSTER_LIST="$(kubectl -n "${POSTGRES_NAMESPACE}" exec -it -c postgres "${POSTGRES_RELEASE_NAME}-cluster-0" -- patronictl list -f yaml)"
echo "${POSTGRES_CLUSTER_LIST}"

echo "Checking all the members are running..."
if [[ "$(echo "${POSTGRES_CLUSTER_LIST}" | yq '([.[] | select(.State == "running")] | length) == (. | length)')" == "true" ]]; then
echo "OK."
else
>&2 echo "ERROR: Not all the members are running."
exit 1
fi

echo "Checking all the members have correct roles..."
if [[ "$(echo "${POSTGRES_CLUSTER_LIST}" | yq '([.[] | select(.Role == "Leader")] | length) == 1')" == "true" ]] \
&& [[ "$(echo "${POSTGRES_CLUSTER_LIST}" | yq '([.[] | select(.Role == "Sync Standby")] | length) == 1')" == "true" ]]; then
echo "OK."
else
>&2 echo "ERROR: The roles are not \"Leader\" and \"Sync Standby\"."
exit 1
fi
"$(dirname "${BASH_SOURCE}")/../run-tests.sh"
49 changes: 1 addition & 48 deletions bin/k8s-upgrade/1.27/upgrade-cluster.sh
@@ -2,54 +2,7 @@

set -e

export PATH="${HOME}/.local/bin:${PATH}"

readonly NAME="$(rmk -ll error config view | yq '.name')"
CLUSTER_NAME="$(rmk -ll error config view | yq '.exported-vars.env.CLUSTER_NAME')"
if [[ "${CLUSTER_NAME}" == "null" ]]; then
CLUSTER_NAME="${NAME}-eks"
fi
CURRENT_CLUSTER_VERSION="$(eksctl get cluster --name "${CLUSTER_NAME}" -o yaml | yq '.[0].Version')"

export AWS_PROFILE="$(rmk -ll error config view | yq '.aws.profile')"
export AWS_CONFIG_FILE="${HOME}/.aws/config_${AWS_PROFILE}"
export AWS_SHARED_CREDENTIALS_FILE="${HOME}/.aws/credentials_${AWS_PROFILE}"

readonly NAMESPACE="kube-system"
readonly KUBE_PROXY_RELEASE_NAME="kube-proxy"
readonly COREDNS_RELEASE_NAME="coredns"

# https://docs.aws.amazon.com/eks/latest/userguide/managing-kube-proxy.html
KUBE_PROXY_IMAGE_PREFIX="$(kubectl -n "${NAMESPACE}" get daemonset "${KUBE_PROXY_RELEASE_NAME}" -o yaml | yq '.spec.template.spec.containers[0].image')"
KUBE_PROXY_IMAGE_PREFIX="${KUBE_PROXY_IMAGE_PREFIX%:*}"
# https://docs.aws.amazon.com/eks/latest/userguide/managing-coredns.html
COREDNS_IMAGE_PREFIX="$(kubectl -n "${NAMESPACE}" get deployment "${COREDNS_RELEASE_NAME}" -o yaml | yq '.spec.template.spec.containers[0].image')"
COREDNS_IMAGE_PREFIX="${COREDNS_IMAGE_PREFIX%:*}"

# https://docs.aws.amazon.com/eks/latest/userguide/update-cluster.html
# https://docs.aws.amazon.com/eks/latest/userguide/kubernetes-versions.html
function upgrade_cluster() {
local DESIRED_CLUSTER_VERSION="${1}"
local KUBE_PROXY_VERSION="${2}"
local COREDNS_VERSION="${3}"

echo
echo "Current cluster version: ${CURRENT_CLUSTER_VERSION}"
echo "Desired cluster version: ${DESIRED_CLUSTER_VERSION}"
if [[ "${CURRENT_CLUSTER_VERSION//./,}" -ge "${DESIRED_CLUSTER_VERSION//./,}" ]]; then
echo "No control plane upgrade needed."
else
eksctl upgrade cluster --name "${CLUSTER_NAME}" --version "${DESIRED_CLUSTER_VERSION}" --approve
CURRENT_CLUSTER_VERSION="${DESIRED_CLUSTER_VERSION}"
fi

if [[ "${CURRENT_CLUSTER_VERSION//./,}" -eq "${DESIRED_CLUSTER_VERSION//./,}" ]]; then
kubectl -n "${NAMESPACE}" set image daemonset "${KUBE_PROXY_RELEASE_NAME}" kube-proxy="${KUBE_PROXY_IMAGE_PREFIX}:${KUBE_PROXY_VERSION}"
kubectl -n "${NAMESPACE}" rollout status daemonset "${KUBE_PROXY_RELEASE_NAME}"
kubectl -n "${NAMESPACE}" set image deployment "${COREDNS_RELEASE_NAME}" coredns="${COREDNS_IMAGE_PREFIX}:${COREDNS_VERSION}"
kubectl -n "${NAMESPACE}" rollout status deployment "${COREDNS_RELEASE_NAME}"
fi
}
source "$(dirname "${BASH_SOURCE}")/../upgrade-cluster.sh"

echo "Upgrading K8S cluster iteratively..."
upgrade_cluster "1.24" "v1.24.17-minimal-eksbuild.2" "v1.9.3-eksbuild.7"
144 changes: 1 addition & 143 deletions bin/k8s-upgrade/1.27/upgrade-nodes.sh
@@ -2,146 +2,4 @@

set -e

# optional argument
# e.g. postgres|minio
# find all possible node group names in etc/**/worker-groups.auto.tfvars of a tenant repository
NODE_GROUP_NAME="${1}"

export PATH="${HOME}/.local/bin:${PATH}"

# disable client-side pager
export AWS_PAGER=
export AWS_PROFILE="$(rmk --log-level error config view | yq '.aws.profile')"
export AWS_CONFIG_FILE="${HOME}/.aws/config_${AWS_PROFILE}"
export AWS_SHARED_CREDENTIALS_FILE="${HOME}/.aws/credentials_${AWS_PROFILE}"

readonly NAME="$(rmk --log-level error config view | yq '.name')"
CLUSTER_NAME="$(rmk --log-level error config view | yq '.exported-vars.env.CLUSTER_NAME')"
if [[ "${CLUSTER_NAME}" == "null" ]]; then
CLUSTER_NAME="${NAME}-eks"
fi

NODE_GROUP_FILTER=""
if [[ -n "${NODE_GROUP_NAME}" ]]; then
NODE_GROUP_FILTER="Name=tag-value,Values=${CLUSTER_NAME}-${NODE_GROUP_NAME}-eks_asg"
fi

ASG_TAGS=($(aws autoscaling describe-auto-scaling-groups \
--filters "Name=tag-key,Values=kubernetes.io/cluster/${CLUSTER_NAME}" ${NODE_GROUP_FILTER} \
--output yaml | yq '.AutoScalingGroups[].Tags[] | select(.Key == "Name") | .Value'))
ASG_NAMES=()

if [[ ${#ASG_TAGS[@]} -eq 0 ]]; then
>&2 echo "ERROR: No autoscaling group found."
exit 1
fi

echo "Rolling-updating nodes..."

for ASG_TAG in ${ASG_TAGS[@]}; do
ASG_NAME="$(aws autoscaling describe-auto-scaling-groups \
--filters "Name=tag-value,Values=${ASG_TAG}" \
--query 'AutoScalingGroups[0].AutoScalingGroupName' \
--output text
)"
ASG_NAMES+=("${ASG_NAME}")
# nodes with STS/PVC/PV need up to 10 minutes or more to warm up/check health and mount devices
ASG_UPDATE_TIMEOUT_SECONDS=600

# remove prefix and suffix from ASG tag to get node group name
NODE_GROUP_NAME="${ASG_TAG#${CLUSTER_NAME}-}"
NODE_GROUP_NAME="${NODE_GROUP_NAME%-eks_asg}"
IS_NODE_GROUP_STATEFUL="true"
PVC_LABELS="";
case "${NODE_GROUP_NAME}" in
"clickhouse") PVC_LABELS="clickhouse.altinity.com/chi=clickhouse" ;;
"elt-postgres") PVC_LABELS="cluster-name=elt-postgres-cluster" ;;
"es") PVC_LABELS="elasticsearch.k8s.elastic.co/cluster-name=elastic" ;;
"es-jaeger") PVC_LABELS="elasticsearch.k8s.elastic.co/cluster-name=elastic-jaeger" ;;
"fhir-postgres") PVC_LABELS="cluster-name=fhir-postgres-cluster" ;;
"kafka") PVC_LABELS="app.kubernetes.io/instance=kafka" ;;
"loki-stack") PVC_LABELS="release=loki-stack" ;;
"minio") PVC_LABELS="release=minio" ;;
"mongodb") PVC_LABELS="app.kubernetes.io/instance=mongodb" ;;
"postgres") PVC_LABELS="cluster-name=postgres-cluster" ;;
"redis") PVC_LABELS="app.kubernetes.io/instance=redis" ;;
*) IS_NODE_GROUP_STATEFUL="false"; ASG_UPDATE_TIMEOUT_SECONDS=60 ;;
esac

echo
echo "Node group name: ${NODE_GROUP_NAME}"
echo "Stateful: ${IS_NODE_GROUP_STATEFUL}"
echo "ASG tag: ${ASG_TAG}"
echo "ASG name: ${ASG_NAME}"
echo "ASG update timeout: ${ASG_UPDATE_TIMEOUT_SECONDS}s"

if [[ "${IS_NODE_GROUP_STATEFUL}" == "true" && "${PVC_LABELS}" != "" ]]; then
echo "PVC labels: ${PVC_LABELS}"

PV_NAMES="$(kubectl get pvc --all-namespaces -l "${PVC_LABELS}" -o yaml | yq '.items[].spec.volumeName')"
echo "PV names: ${PV_NAMES}"

# adding pv-dummy to return list of items even for cases when we have only 1 PV found
ASG_AZS="$(kubectl get pv pv-dummy ${PV_NAMES} --ignore-not-found -o yaml | yq '.items[].spec.nodeAffinity.required.nodeSelectorTerms[0].matchExpressions[0].values[0]' | sort | uniq)"
echo "ASG availability zones: ${ASG_AZS}"

ASG_SUBNETS=""
for ASG_AZ in ${ASG_AZS}; do
echo "Getting private subnet for ${ASG_AZ}..."
ASG_SUBNET="$(aws ec2 describe-subnets --filters "Name=tag-value,Values=${NAME}-vpc-private-${ASG_AZ}" --output yaml | yq '.Subnets[0].SubnetId')"
echo "Subnet ID: ${ASG_SUBNET}"
ASG_SUBNETS="${ASG_SUBNETS} ${ASG_SUBNET}"
done
echo "ASG subnets: ${ASG_SUBNETS}"

aws autoscaling update-auto-scaling-group --auto-scaling-group-name "${ASG_NAME}" \
--availability-zones ${ASG_AZS} \
--vpc-zone-identifier "${ASG_SUBNETS// /,}" \
--default-cooldown ${ASG_UPDATE_TIMEOUT_SECONDS} \
--default-instance-warmup ${ASG_UPDATE_TIMEOUT_SECONDS} \
--health-check-grace-period ${ASG_UPDATE_TIMEOUT_SECONDS} || true
else
echo "No ASG AZ update needed for stateless node group."
fi

# rolling-update node group OR skip in case it is being updated already
echo "Starting instance refresh..."
aws autoscaling start-instance-refresh --auto-scaling-group-name "${ASG_NAME}" || true
done

echo
echo "Checking instance refresh status.."
while true; do
IN_PROGRESS_ASG_COUNT="${#ASG_NAMES[@]}"
for ASG_NAME in ${ASG_NAMES[@]}; do
ASG_INSTANCE_REFRESH="$(aws autoscaling describe-instance-refreshes \
--auto-scaling-group-name "${ASG_NAME}" \
--max-records 1 \
--output yaml | yq '.InstanceRefreshes[0] | select(.Status != "Successful") | .AutoScalingGroupName')"
if [[ -n "${ASG_INSTANCE_REFRESH}" && "${ASG_INSTANCE_REFRESH}" != "null" ]]; then
echo "ASG ${ASG_NAME} in progress..."
else
((IN_PROGRESS_ASG_COUNT--))
fi
done

if [[ "${IN_PROGRESS_ASG_COUNT}" -gt 0 ]]; then
sleep 10
else
break
fi
done
echo "Done."

echo
echo "Fixing pods with a missing linkerd sidecar after the instance refresh..."
PODS_WITH_MISSING_LINKERD_SIDECAR="$(kubectl get pods --all-namespaces -l "!linkerd.io/control-plane-ns" -o yaml | yq '.items[].metadata | select(.annotations["linkerd.io/inject"] == "enabled") | (.namespace + " " + .name)')"
# iterate over lines ignoring spaces
while IFS= read -r NAMESPACE_WITH_POD; do
if [[ -z "${NAMESPACE_WITH_POD}" ]]; then
# no pods found
break
fi
kubectl delete pod --wait=true -n ${NAMESPACE_WITH_POD}
done <<< "${PODS_WITH_MISSING_LINKERD_SIDECAR}"
echo "Done."
"$(dirname "${BASH_SOURCE}")/../upgrade-nodes.sh"
5 changes: 5 additions & 0 deletions bin/k8s-upgrade/1.29/run-tests.sh
@@ -0,0 +1,5 @@
#!/usr/bin/env bash

set -e

"$(dirname "${BASH_SOURCE}")/../run-tests.sh"
22 changes: 22 additions & 0 deletions bin/k8s-upgrade/1.29/upgrade-all.sh
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

set -e

export PATH="${HOME}/.local/bin:${PATH}"

echo "Initializing cluster configuration..."
rmk update
rmk config init
rmk cluster switch -f

echo
"$(dirname "${BASH_SOURCE}")/upgrade-releases.sh"

echo
"$(dirname "${BASH_SOURCE}")/upgrade-cluster.sh"

echo
"$(dirname "${BASH_SOURCE}")/upgrade-nodes.sh"

echo
"$(dirname "${BASH_SOURCE}")/run-tests.sh"
13 changes: 13 additions & 0 deletions bin/k8s-upgrade/1.29/upgrade-cluster.sh
@@ -0,0 +1,13 @@
#!/usr/bin/env bash

set -e

source "$(dirname "${BASH_SOURCE}")/../upgrade-cluster.sh"

echo "Upgrading K8S cluster iteratively..."
upgrade_cluster "1.28" "v1.28.12-eksbuild.2" "v1.10.1-eksbuild.13"
upgrade_cluster "1.29" "v1.29.0-minimal-eksbuild.1" "v1.11.1-eksbuild.4"

echo
echo "Provisioning latest AMI IDs and K8S version..."
rmk cluster provision
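
The kube-proxy and CoreDNS tags passed to `upgrade_cluster` can be cross-checked against the add-on versions AWS publishes for the target K8S version; a sketch using the AWS CLI:

```bash
# List the kube-proxy and coredns add-on versions published for K8S 1.29;
# the image tags passed to upgrade_cluster should be among these.
for ADDON in kube-proxy coredns; do
  aws eks describe-addon-versions \
    --kubernetes-version 1.29 \
    --addon-name "${ADDON}" \
    --query 'addons[0].addonVersions[].addonVersion' \
    --output yaml
done
```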
22 changes: 22 additions & 0 deletions bin/k8s-upgrade/1.29/upgrade-ebs-csi-snapshot-scheduler.sh
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

set -e

export PATH="${HOME}/.local/bin:${PATH}"

readonly NAMESPACE="kube-system"
readonly RELEASE_NAME="ebs-csi-snapshot-scheduler"

readonly CRD_NAME="snapshotschedules.snapscheduler.backube"
readonly CRD_ANNOTATIONS="meta.helm.sh/release-namespace=${NAMESPACE} meta.helm.sh/release-name=${RELEASE_NAME}"
readonly CRD_LABELS="app.kubernetes.io/managed-by=Helm"

echo "Checking whether ${RELEASE_NAME} release installed..."
if [[ "$(rmk --log-level error release list -l "app=${RELEASE_NAME}" --output json | yq '.[0].installed')" != "true" ]]; then
echo "Skipped."
exit
fi

echo "Fixing annotations and labels of ${CRD_NAME} CRD of ${RELEASE_NAME} release..."
kubectl -n "${NAMESPACE}" annotate --overwrite customresourcedefinition "${CRD_NAME}" ${CRD_ANNOTATIONS}
kubectl -n "${NAMESPACE}" label --overwrite customresourcedefinition "${CRD_NAME}" ${CRD_LABELS}
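
A quick way to confirm the CRD adoption took effect after the script runs, sketched with kubectl and yq (not part of the script itself):

```bash
# Show the Helm ownership annotations and labels the script sets on the CRD.
kubectl get customresourcedefinition snapshotschedules.snapscheduler.backube -o yaml \
  | yq '.metadata | {"annotations": .annotations, "labels": .labels}'
```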
5 changes: 5 additions & 0 deletions bin/k8s-upgrade/1.29/upgrade-nodes.sh
@@ -0,0 +1,5 @@
#!/usr/bin/env bash

set -e

"$(dirname "${BASH_SOURCE}")/../upgrade-nodes.sh"
11 changes: 11 additions & 0 deletions bin/k8s-upgrade/1.29/upgrade-releases.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash

set -e

export PATH="${HOME}/.local/bin:${PATH}"

"$(dirname "${BASH_SOURCE}")/upgrade-ebs-csi-snapshot-scheduler.sh"

echo
echo "Synchronizing all releases..."
rmk release sync
File renamed without changes.