From 4414759f47a9105b7b06ba2858e53f0dc3a1c216 Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 17:12:56 +0200 Subject: [PATCH 01/15] add nuke flag --- kubelift.sh | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/kubelift.sh b/kubelift.sh index b2e86b6..b0c81d4 100755 --- a/kubelift.sh +++ b/kubelift.sh @@ -10,6 +10,7 @@ KUBERNETES_VERSION="" CONTROL_PLANE_IP="" WORKER_IPS="" ENABLE_CONTROL_PLANE_WORKLOADS=false +NUKE=false function print_usage() { cat << EOF @@ -28,6 +29,7 @@ Options: --worker-ips Worker node IP addresses (create only) --enable-control-plane-workloads Enable control plane scheduling (create only) --skip-reqs Skip minimum requirements validation + --nuke Perform deep cleanup (cleanup only) EOF exit 0 } @@ -76,6 +78,7 @@ function parse_args() { ;; --enable-control-plane-workloads) ENABLE_CONTROL_PLANE_WORKLOADS="$2"; shift 2 ;; --skip-reqs) SKIP_REQS="$2"; shift 2 ;; + --nuke) NUKE="$2"; shift 2 ;; *) error "Unknown parameter $1" ;; esac done @@ -95,6 +98,10 @@ function validate_input() { [[ -z $WORKER_IPS || $WORKER_IPS =~ ^([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+,)*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]] || error "Invalid worker nodes IPs" [[ $ENABLE_CONTROL_PLANE_WORKLOADS =~ ^(true|false)$ ]] || error "Invalid control plane scheduling value" fi + + if [[ $OPERATION == "cleanup" ]]; then + [[ $NUKE =~ ^(true|false)$ ]] || error "Invalid nuke value" + fi } function get_node_resources() { @@ -511,14 +518,110 @@ function remove_control_plane() { cleanup_node "$CONTROL_PLANE_IP" } -function cleanup_node() { +function deep_clean_node() { local node_ip=$1 ssh -o StrictHostKeyChecking=no "$SSH_USER@$node_ip" bash << 'EOF' +set -euo pipefail + +systemctl stop kubelet containerd docker || true + +kubeadm reset -f || true + +for mount in $(mount | grep tmpfs | grep '/var/lib/kubelet' | awk '{ print $3 }'); do + umount "$mount" || true +done + +rm -rf \ + /etc/kubernetes \ + /var/lib/kubelet \ + /var/lib/etcd \ + /var/lib/dockershim \ + /var/run/kubernetes \ + /var/lib/cni \ + /etc/cni \ + /opt/cni \ + /var/lib/containerd \ + /var/lib/docker \ + /etc/containerd \ + /etc/docker \ + $HOME/.kube \ + /root/.kube + +ip link set docker0 down || true +ip link delete docker0 || true +ip link set cni0 down || true +ip link delete cni0 || true +ip link set flannel.1 down || true +ip link delete flannel.1 || true +ip link set weave down || true +ip link delete weave || true + +iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X +ip6tables -F && ip6tables -t nat -F && ip6tables -t mangle -F && ip6tables -X + +for pkg in kubectl kubeadm kubelet kubernetes-cni containerd docker-ce docker-ce-cli; do + apt-mark unhold "$pkg" || true + DEBIAN_FRONTEND=noninteractive apt-get remove --purge -y "$pkg" || true +done + +apt-get autoremove -y +apt-get clean + +rm -f /etc/apt/sources.list.d/kubernetes.list +rm -f /etc/apt/sources.list.d/docker.list +rm -f /etc/apt/keyrings/kubernetes*.gpg +rm -f /etc/apt/keyrings/docker*.gpg + +if [[ -f /etc/fstab.bak.* ]]; then + cp "$(ls -t /etc/fstab.bak.* | head -1)" /etc/fstab +fi + +systemctl daemon-reload +EOF +} + +function verify_deep_clean() { + local node_ip=$1 + local failed=false + + local checks=( + "systemctl is-active kubelet" + "systemctl is-active containerd" + "systemctl is-active docker" + "which kubectl" + "which kubeadm" + "which kubelet" + "test -d /etc/kubernetes" + "test -d /var/lib/kubelet" + "test -d /var/lib/etcd" + ) + + for check in "${checks[@]}"; do + if ssh -o StrictHostKeyChecking=no "$SSH_USER@$node_ip" "$check" &>/dev/null; then + log "Warning: $check still exists on $node_ip" + failed=true + fi + done + + $failed && error "Deep clean verification failed on $node_ip" +} + +function cleanup_node() { + local node_ip=$1 + + if [[ $NUKE == "true" ]]; then + log "Performing deep clean on $node_ip" + deep_clean_node "$node_ip" + verify_deep_clean "$node_ip" + else + log "Performing standard cleanup on $node_ip" + ssh -o StrictHostKeyChecking=no "$SSH_USER@$node_ip" bash << 'EOF' kubeadm reset -f rm -rf $HOME/.kube ip link delete cni0 || true ip link delete flannel.1 || true EOF + fi } function cleanup_cluster() { From 62ac34423b5ac0e34c420fda439be10a2eb1d51b Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 17:37:17 +0200 Subject: [PATCH 02/15] add more robust checks & cleanup --- kubelift.sh | 335 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 225 insertions(+), 110 deletions(-) diff --git a/kubelift.sh b/kubelift.sh index b0c81d4..352c0bc 100755 --- a/kubelift.sh +++ b/kubelift.sh @@ -106,85 +106,67 @@ function validate_input() { function get_node_resources() { local node_ip=$1 - local resources + local cpu_cores mem_gb disk_gb - resources=$(ssh -o StrictHostKeyChecking=no "$SSH_USER@$node_ip" bash << 'EOF' - { + if ! read -r cpu_cores mem_gb disk_gb < <(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "$SSH_USER@$node_ip" bash << 'ENDSSH' cpu_cores=$(nproc) - mem_gb=$(($(grep MemTotal /proc/meminfo | awk '{print $2}') / 1024 / 1024)) - disk_gb=$(df -BG / | awk 'NR==2 {sub(/G/,"",$4); print $4}') - + mem_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo) + mem_gb=$((mem_kb / 1024 / 1024)) + disk_gb=$(df -B1G / | awk 'NR==2 {print $4}') echo "$cpu_cores $mem_gb $disk_gb" - } -EOF - ) +ENDSSH + ); then + error "Failed to retrieve resources from node $node_ip" + fi - echo "$resources" + echo "$cpu_cores $mem_gb $disk_gb" } -function validate_control_plane_resources() { +function validate_node_resources() { local node_ip=$1 - local resources cpu_cores mem_gb disk_gb - - resources=$(get_node_resources "$node_ip") - read -r cpu_cores mem_gb disk_gb <<< "$resources" - - local errors=() - - ((cpu_cores >= 2)) || errors+=("Insufficient CPU cores: $cpu_cores (minimum 2)") - ((mem_gb >= 2)) || errors+=("Insufficient memory: ${mem_gb}GB (minimum 2GB)") - ((disk_gb >= 50)) || errors+=("Insufficient disk space: ${disk_gb}GB (minimum 50GB)") - - if ((${#errors[@]} > 0)); then - printf "Control plane node (%s) validation failed:\n" "$node_ip" >&2 - printf " - %s\n" "${errors[@]}" >&2 - return 1 - fi -} + local min_cpu=$2 + local min_mem=$3 + local min_disk=$4 + local node_type=$5 -function validate_worker_node_resources() { - local node_ip=$1 local resources cpu_cores mem_gb disk_gb - - resources=$(get_node_resources "$node_ip") + resources=$(get_node_resources "$node_ip") || return 1 read -r cpu_cores mem_gb disk_gb <<< "$resources" local errors=() - - ((cpu_cores >= 1)) || errors+=("Insufficient CPU cores: $cpu_cores (minimum 1)") - ((mem_gb >= 1)) || errors+=("Insufficient memory: ${mem_gb}GB (minimum 1GB)") - ((disk_gb >= 20)) || errors+=("Insufficient disk space: ${disk_gb}GB (minimum 20GB)") + ((cpu_cores >= min_cpu)) || errors+=("CPU cores: $cpu_cores (minimum $min_cpu)") + ((mem_gb >= min_mem)) || errors+=("Memory: ${mem_gb}GB (minimum ${min_mem}GB)") + ((disk_gb >= min_disk)) || errors+=("Disk: ${disk_gb}GB (minimum ${min_disk}GB)") if ((${#errors[@]} > 0)); then - printf "Worker node (%s) validation failed:\n" "$node_ip" >&2 + log "$node_type ($node_ip) validation failed:" printf " - %s\n" "${errors[@]}" >&2 return 1 fi + + return 0 } function validate_cluster_resources() { $SKIP_REQS && return 0 - local validation_failed=false + local failed=0 + log "Validating cluster node resources" - log "Validating control plane resources" - validate_control_plane_resources "$CONTROL_PLANE_IP" || validation_failed=true + if ! validate_node_resources "$CONTROL_PLANE_IP" 2 2 50 "Control plane"; then + failed=1 + fi if [[ -n $WORKER_IPS ]]; then - log "Validating worker nodes resources" - local pids=() - for ip in ${WORKER_IPS//,/ }; do - validate_worker_node_resources "$ip" & - pids+=($!) - done - - for pid in "${pids[@]}"; do - wait "$pid" || validation_failed=true + if ! validate_node_resources "$ip" 1 1 20 "Worker node"; then + failed=1 + fi done fi - $validation_failed && error "Resource validation failed" + ((failed)) && error "Resource validation failed" + return 0 } function prompt_confirmation() { @@ -219,10 +201,15 @@ function verify_node_connectivity() { } function verify_cluster_connectivity() { - local all_nodes + local all_nodes timeout=5 all_nodes=$(get_cluster_nodes) + log "Verifying cluster connectivity" for ip in ${all_nodes//,/ }; do + if ! timeout "$timeout" bash -c "/dev/null; then + error "Node $ip is not reachable on port 22" + fi + verify_ssh_access "$ip" done @@ -262,31 +249,38 @@ function setup_container_runtime() { set -euo pipefail apt-get update -apt-get install -y ca-certificates curl +apt-get install -y ca-certificates curl gnupg if ! command -v docker &> /dev/null || ! docker info &> /dev/null; then install -m 0755 -d /etc/apt/keyrings - curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /tmp/docker.gpg - gpg --dearmor -o /etc/apt/keyrings/docker.asc /tmp/docker.gpg - chmod a+r /etc/apt/keyrings/docker.asc - echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ - tee /etc/apt/sources.list.d/docker.list > /dev/null + if [[ ! -f /etc/apt/keyrings/docker.asc ]]; then + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor --yes -o /etc/apt/keyrings/docker.asc + chmod a+r /etc/apt/keyrings/docker.asc + fi + + if [[ ! -f /etc/apt/sources.list.d/docker.list ]]; then + echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null + fi + apt-get update apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin systemctl enable --now docker fi -cat > /etc/containerd/config.toml << EOC +mkdir -p /etc/containerd +if [[ ! -f /etc/containerd/config.toml ]] || ! grep -q "SystemdCgroup = true" /etc/containerd/config.toml; then + cat > /etc/containerd/config.toml << EOC version = 2 [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] runtime_type = "io.containerd.runc.v2" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] SystemdCgroup = true EOC - -systemctl restart containerd + systemctl restart containerd +fi EOF } @@ -300,9 +294,13 @@ set -euo pipefail KUBERNETES_VERSION="$version" KUBERNETES_VERSION_REPOSITORY="v\${KUBERNETES_VERSION%.*}" +install -m 0755 -d /etc/apt/keyrings +if [[ ! -f "/etc/apt/keyrings/kubernetes-apt-keyring-\$KUBERNETES_VERSION_REPOSITORY.gpg" ]]; then + curl -fsSL "https://pkgs.k8s.io/core:/stable:/\$KUBERNETES_VERSION_REPOSITORY/deb/Release.key" | \ + gpg --dearmor --yes -o "/etc/apt/keyrings/kubernetes-apt-keyring-\$KUBERNETES_VERSION_REPOSITORY.gpg" +fi + if ! grep -q "\$KUBERNETES_VERSION_REPOSITORY" /etc/apt/sources.list.d/kubernetes.list 2>/dev/null; then - curl -fsSL "https://pkgs.k8s.io/core:/stable:/\$KUBERNETES_VERSION_REPOSITORY/deb/Release.key" -o /tmp/kubernetes.gpg - gpg --dearmor -o "/etc/apt/keyrings/kubernetes-apt-keyring-\$KUBERNETES_VERSION_REPOSITORY.gpg" /tmp/kubernetes.gpg echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring-\$KUBERNETES_VERSION_REPOSITORY.gpg] https://pkgs.k8s.io/core:/stable:/\$KUBERNETES_VERSION_REPOSITORY/deb/ /" | \ tee /etc/apt/sources.list.d/kubernetes.list fi @@ -523,14 +521,28 @@ function deep_clean_node() { ssh -o StrictHostKeyChecking=no "$SSH_USER@$node_ip" bash << 'EOF' set -euo pipefail +# Kill all kubernetes-related processes +for proc in kubelet kube-apiserver kube-controller-manager kube-scheduler kube-proxy containerd docker flannel; do + pkill -9 "$proc" || true +done + +# Stop and disable services systemctl stop kubelet containerd docker || true +systemctl disable kubelet containerd docker || true +# Force kubeadm reset kubeadm reset -f || true +# Clean up mounts for mount in $(mount | grep tmpfs | grep '/var/lib/kubelet' | awk '{ print $3 }'); do - umount "$mount" || true + umount -f "$mount" || true done +for mount in $(mount | grep kubernetes); do + umount -f "$(echo "$mount" | awk '{print $3}')" || true +done + +# Remove all kubernetes-related directories rm -rf \ /etc/kubernetes \ /var/lib/kubelet \ @@ -547,36 +559,69 @@ rm -rf \ $HOME/.kube \ /root/.kube -ip link set docker0 down || true -ip link delete docker0 || true -ip link set cni0 down || true -ip link delete cni0 || true -ip link set flannel.1 down || true -ip link delete flannel.1 || true -ip link set weave down || true -ip link delete weave || true - -iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X -ip6tables -F && ip6tables -t nat -F && ip6tables -t mangle -F && ip6tables -X - -for pkg in kubectl kubeadm kubelet kubernetes-cni containerd docker-ce docker-ce-cli; do - apt-mark unhold "$pkg" || true - DEBIAN_FRONTEND=noninteractive apt-get remove --purge -y "$pkg" || true +# Clean up network interfaces +ip link set docker0 down 2>/dev/null || true +ip link delete docker0 2>/dev/null || true +ip link set cni0 down 2>/dev/null || true +ip link delete cni0 2>/dev/null || true +ip link set flannel.1 down 2>/dev/null || true +ip link delete flannel.1 2>/dev/null || true +ip link set weave down 2>/dev/null || true +ip link delete weave 2>/dev/null || true + +# Clean up iptables +iptables-save | grep -v KUBE | grep -v CNI | grep -v FLANNEL | iptables-restore +ip6tables-save | grep -v KUBE | grep -v CNI | grep -v FLANNEL | ip6tables-restore + +# Remove all container images +crictl rmi --all 2>/dev/null || true +docker system prune -af 2>/dev/null || true + +# Remove all K8s and container packages +for pkg in kubectl kubeadm kubelet kubernetes-cni containerd.io containerd docker-ce docker-ce-cli docker-buildx-plugin docker-compose-plugin docker-ce-rootless-extras; do + apt-mark unhold "$pkg" 2>/dev/null || true done -apt-get autoremove -y +# Force remove packages and their configurations +apt-get remove --purge -y \ + kubectl kubeadm kubelet \ + kubernetes-cni containerd.io containerd \ + docker-ce docker-ce-cli \ + docker-buildx-plugin docker-compose-plugin \ + docker-ce-rootless-extras || true + +apt-get autoremove --purge -y || true apt-get clean +# Clean up package repositories rm -f /etc/apt/sources.list.d/kubernetes.list rm -f /etc/apt/sources.list.d/docker.list rm -f /etc/apt/keyrings/kubernetes*.gpg rm -f /etc/apt/keyrings/docker*.gpg +# Remove binaries +rm -f /usr/bin/kubectl /usr/bin/kubeadm /usr/bin/kubelet + +# Restore original fstab if backup exists if [[ -f /etc/fstab.bak.* ]]; then cp "$(ls -t /etc/fstab.bak.* | head -1)" /etc/fstab fi +# Clean up systemd +rm -f /etc/systemd/system/kubelet.service +rm -f /etc/systemd/system/docker.service +rm -f /etc/systemd/system/containerd.service +rm -rf /etc/systemd/system/kubelet.service.d +rm -rf /etc/systemd/system/docker.service.d +rm -rf /etc/systemd/system/containerd.service.d + systemctl daemon-reload + +# Remove any leftover process +for proc in kubelet kube-apiserver kube-controller-manager kube-scheduler kube-proxy containerd dockerd docker-containerd flannel flanneld; do + killall -9 "$proc" || true +done + EOF } @@ -584,34 +629,107 @@ function verify_deep_clean() { local node_ip=$1 local failed=false - local checks=( - "systemctl is-active kubelet" - "systemctl is-active containerd" - "systemctl is-active docker" - "which kubectl" - "which kubeadm" - "which kubelet" - "test -d /etc/kubernetes" - "test -d /var/lib/kubelet" - "test -d /var/lib/etcd" - ) - - for check in "${checks[@]}"; do - if ssh -o StrictHostKeyChecking=no "$SSH_USER@$node_ip" "$check" &>/dev/null; then - log "Warning: $check still exists on $node_ip" - failed=true - fi - done + log "Verifying services and processes on $node_ip" + ssh -o StrictHostKeyChecking=no "$SSH_USER@$node_ip" bash << 'EOF' || failed=true +set -euo pipefail + +# Check for running services +services_running=false +for svc in kubelet containerd docker; do + if systemctl is-active --quiet "$svc" 2>/dev/null; then + echo "Service still running: $svc" + services_running=true + fi +done +$services_running && exit 1 + +# Check for K8s processes +processes_running=false +for proc in kubelet kube-apiserver kube-controller-manager kube-scheduler kube-proxy containerd dockerd flanneld; do + if pgrep -f "$proc" > /dev/null; then + echo "Process still running: $proc" + processes_running=true + fi +done +$processes_running && exit 1 + +# Check for open ports +ports_in_use=false +for port in 6443 2379 2380 10250 10251 10252 10255 8472 51820 51821; do + if netstat -tuln | grep -q ":$port "; then + echo "Port still in use: $port" + ports_in_use=true + fi +done +$ports_in_use && exit 1 - $failed && error "Deep clean verification failed on $node_ip" +# Check for remaining files/directories +files_exist=false +for path in \ + /etc/kubernetes \ + /var/lib/kubelet \ + /var/lib/etcd \ + /var/run/kubernetes \ + /var/lib/dockershim \ + /etc/cni \ + /opt/cni \ + /var/lib/cni \ + /var/lib/containerd \ + /var/lib/docker \ + /etc/containerd \ + /etc/docker \ + $HOME/.kube \ + /root/.kube; do + if [ -e "$path" ]; then + echo "Path still exists: $path" + files_exist=true + fi +done +$files_exist && exit 1 + +# Check for network interfaces +interfaces_exist=false +for iface in docker0 cni0 flannel.1 weave; do + if ip link show "$iface" &>/dev/null; then + echo "Interface still exists: $iface" + interfaces_exist=true + fi +done +$interfaces_exist && exit 1 + +# Check for kubernetes iptables rules +if iptables-save | grep -qE 'KUBE|CNI|FLANNEL'; then + echo "Kubernetes iptables rules still exist" + exit 1 +fi + +# Check for installed packages +packages_exist=false +for pkg in kubectl kubeadm kubelet kubernetes-cni containerd.io docker-ce docker-ce-cli; do + if dpkg -l | grep -q "^ii.*$pkg"; then + echo "Package still installed: $pkg" + packages_exist=true + fi +done +$packages_exist && exit 1 + +exit 0 +EOF + + if $failed; then + error "Deep clean verification failed on $node_ip - some components still present" + else + log "Verification passed for $node_ip" + fi } function cleanup_node() { local node_ip=$1 if [[ $NUKE == "true" ]]; then - log "Performing deep clean on $node_ip" + log "Starting deep clean on $node_ip" deep_clean_node "$node_ip" + log "Deep clean completed, starting verification" verify_deep_clean "$node_ip" else log "Performing standard cleanup on $node_ip" @@ -625,27 +743,24 @@ EOF } function cleanup_cluster() { - if ! ssh -o StrictHostKeyChecking=no "$SSH_USER@$CONTROL_PLANE_IP" kubectl get nodes &>/dev/null; then - error "Cannot access the Kubernetes cluster" + # First, try to get worker nodes from kubectl if possible + local worker_nodes="" + if ssh -o StrictHostKeyChecking=no "$SSH_USER@$CONTROL_PLANE_IP" kubectl get nodes &>/dev/null; then + worker_nodes=$(ssh -o StrictHostKeyChecking=no "$SSH_USER@$CONTROL_PLANE_IP" \ + kubectl get nodes -o custom-columns=IP:.status.addresses[0].address --no-headers | \ + grep -v "$CONTROL_PLANE_IP") || true fi - local worker_nodes - worker_nodes=$(ssh -o StrictHostKeyChecking=no "$SSH_USER@$CONTROL_PLANE_IP" \ - kubectl get nodes -o custom-columns=IP:.status.addresses[0].address --no-headers | \ - grep -v "$CONTROL_PLANE_IP") || true - + # Clean up worker nodes if we found any for ip in $worker_nodes; do [[ -z $ip ]] && continue log "Cleaning up worker: $ip" cleanup_node "$ip" done + # Always clean up control plane log "Cleaning up control plane" cleanup_node "$CONTROL_PLANE_IP" - - if ssh -o StrictHostKeyChecking=no "$SSH_USER@$CONTROL_PLANE_IP" kubectl get nodes &>/dev/null; then - error "Cluster is still running after cleanup" - fi } function main() { From 504c75589c283eb3f126baa5aa6483f0b81ba62c Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 17:41:46 +0200 Subject: [PATCH 03/15] cleanup coredns --- kubelift.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kubelift.sh b/kubelift.sh index 352c0bc..ca119e6 100755 --- a/kubelift.sh +++ b/kubelift.sh @@ -522,7 +522,7 @@ function deep_clean_node() { set -euo pipefail # Kill all kubernetes-related processes -for proc in kubelet kube-apiserver kube-controller-manager kube-scheduler kube-proxy containerd docker flannel; do +for proc in kubelet kube-apiserver kube-controller-manager kube-scheduler kube-proxy containerd docker flannel coredns; do pkill -9 "$proc" || true done @@ -559,6 +559,9 @@ rm -rf \ $HOME/.kube \ /root/.kube +# Clean up network namespaces that might be used by CoreDNS/pods +ip netns list | grep -E 'cni-|coredns' | xargs -r ip netns delete + # Clean up network interfaces ip link set docker0 down 2>/dev/null || true ip link delete docker0 2>/dev/null || true @@ -578,7 +581,7 @@ crictl rmi --all 2>/dev/null || true docker system prune -af 2>/dev/null || true # Remove all K8s and container packages -for pkg in kubectl kubeadm kubelet kubernetes-cni containerd.io containerd docker-ce docker-ce-cli docker-buildx-plugin docker-compose-plugin docker-ce-rootless-extras; do +for pkg in kubectl kubeadm kubelet kubernetes-cni containerd.io containerd docker-ce docker-ce-cli docker-buildx-plugin docker-compose-plugin docker-ce-rootless-extras coredns; do apt-mark unhold "$pkg" 2>/dev/null || true done From fbfb91cb709ddb1e7f55225446d6cb8eab459fe5 Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 17:43:52 +0200 Subject: [PATCH 04/15] rm nuke todo --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index a0e9e8c..7516c32 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,6 @@ Some of the alternatives you could consider are: ## High Priority -- Add support for nuking the whole cluster - not just kubeadm reset, remove all traces - Improve error handling and reporting - Add more input validation - Allow custom CIDR ranges From 246ef503a290a258f8711415094b61967121ece1 Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 17:53:11 +0200 Subject: [PATCH 05/15] add sysctl handling and cleanup --- kubelift.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kubelift.sh b/kubelift.sh index ca119e6..1ce6303 100755 --- a/kubelift.sh +++ b/kubelift.sh @@ -237,7 +237,10 @@ if grep -q "^/[^ ]* *none *swap" /proc/mounts; then fi modprobe br_netfilter -echo "net.bridge.bridge-nf-call-iptables = 1" | tee -a /etc/sysctl.conf + +grep -q "^net.bridge.bridge-nf-call-iptables = 1" /etc/sysctl.conf || \ + echo "net.bridge.bridge-nf-call-iptables = 1" >> /etc/sysctl.conf + sysctl -p EOF } @@ -605,6 +608,9 @@ rm -f /etc/apt/keyrings/docker*.gpg # Remove binaries rm -f /usr/bin/kubectl /usr/bin/kubeadm /usr/bin/kubelet +sed -i '/^net.bridge.bridge-nf-call-iptables = 1$/d' /etc/sysctl.conf +sysctl -p + # Restore original fstab if backup exists if [[ -f /etc/fstab.bak.* ]]; then cp "$(ls -t /etc/fstab.bak.* | head -1)" /etc/fstab From 64d8c3a3ae33285c35e5dd89a67ee6e19a38e32d Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 17:53:56 +0200 Subject: [PATCH 06/15] rm useless comments --- kubelift.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/kubelift.sh b/kubelift.sh index 1ce6303..35b6409 100755 --- a/kubelift.sh +++ b/kubelift.sh @@ -752,7 +752,6 @@ EOF } function cleanup_cluster() { - # First, try to get worker nodes from kubectl if possible local worker_nodes="" if ssh -o StrictHostKeyChecking=no "$SSH_USER@$CONTROL_PLANE_IP" kubectl get nodes &>/dev/null; then worker_nodes=$(ssh -o StrictHostKeyChecking=no "$SSH_USER@$CONTROL_PLANE_IP" \ @@ -760,14 +759,12 @@ function cleanup_cluster() { grep -v "$CONTROL_PLANE_IP") || true fi - # Clean up worker nodes if we found any for ip in $worker_nodes; do [[ -z $ip ]] && continue log "Cleaning up worker: $ip" cleanup_node "$ip" done - # Always clean up control plane log "Cleaning up control plane" cleanup_node "$CONTROL_PLANE_IP" } From bc8e210452b82f007735028dc9f73b0c7997248b Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 18:04:35 +0200 Subject: [PATCH 07/15] rm parser_operation --- kubelift.sh | 61 +++++++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/kubelift.sh b/kubelift.sh index 35b6409..a9b464c 100755 --- a/kubelift.sh +++ b/kubelift.sh @@ -43,17 +43,6 @@ function error() { exit 1 } -function parse_operation() { - [[ $# -lt 1 ]] && print_usage - - OPERATION=$1; shift - case $OPERATION in - create|upgrade) ;; - *) error "Invalid operation: $OPERATION" ;; - esac - return $# -} - function parse_args() { [[ $# -eq 0 || $1 == "-h" || $1 == "--help" ]] && print_usage @@ -71,9 +60,7 @@ function parse_args() { --kubernetes-version) KUBERNETES_VERSION="$2"; shift 2 ;; --control-plane-ip) CONTROL_PLANE_IP="$2"; shift 2 ;; --worker-ips) - if [[ $OPERATION == "create" ]]; then - WORKER_IPS="$2" - fi + [[ $OPERATION == "create" ]] && WORKER_IPS="$2" shift 2 ;; --enable-control-plane-workloads) ENABLE_CONTROL_PLANE_WORKLOADS="$2"; shift 2 ;; @@ -129,20 +116,20 @@ function validate_node_resources() { local min_disk=$4 local node_type=$5 - local resources cpu_cores mem_gb disk_gb - resources=$(get_node_resources "$node_ip") || return 1 - read -r cpu_cores mem_gb disk_gb <<< "$resources" + local resources cpu_cores mem_gb disk_gb + resources=$(get_node_resources "$node_ip") || return 1 + read -r cpu_cores mem_gb disk_gb <<< "$resources" - local errors=() - ((cpu_cores >= min_cpu)) || errors+=("CPU cores: $cpu_cores (minimum $min_cpu)") - ((mem_gb >= min_mem)) || errors+=("Memory: ${mem_gb}GB (minimum ${min_mem}GB)") - ((disk_gb >= min_disk)) || errors+=("Disk: ${disk_gb}GB (minimum ${min_disk}GB)") + local errors=() + ((cpu_cores >= min_cpu)) || errors+=("CPU cores: $cpu_cores (minimum $min_cpu)") + ((mem_gb >= min_mem)) || errors+=("Memory: ${mem_gb}GB (minimum ${min_mem}GB)") + ((disk_gb >= min_disk)) || errors+=("Disk: ${disk_gb}GB (minimum ${min_disk}GB)") - if ((${#errors[@]} > 0)); then - log "$node_type ($node_ip) validation failed:" - printf " - %s\n" "${errors[@]}" >&2 - return 1 - fi + if ((${#errors[@]} > 0)); then + log "$node_type ($node_ip) validation failed:" + printf " - %s\n" "${errors[@]}" >&2 + return 1 + fi return 0 } @@ -258,13 +245,13 @@ if ! command -v docker &> /dev/null || ! docker info &> /dev/null; then install -m 0755 -d /etc/apt/keyrings if [[ ! -f /etc/apt/keyrings/docker.asc ]]; then - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor --yes -o /etc/apt/keyrings/docker.asc - chmod a+r /etc/apt/keyrings/docker.asc + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor --yes -o /etc/apt/keyrings/docker.asc + chmod a+r /etc/apt/keyrings/docker.asc fi if [[ ! -f /etc/apt/sources.list.d/docker.list ]]; then - echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ - tee /etc/apt/sources.list.d/docker.list > /dev/null + echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null fi apt-get update @@ -282,7 +269,7 @@ version = 2 [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] SystemdCgroup = true EOC - systemctl restart containerd +systemctl restart containerd fi EOF } @@ -348,14 +335,14 @@ EOF function install_cni() { ssh -o StrictHostKeyChecking=no "$SSH_USER@$CONTROL_PLANE_IP" bash << 'EOF' -curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash -kubectl create ns kube-flannel -kubectl label --overwrite ns kube-flannel pod-security.kubernetes.io/enforce=privileged + kubectl create ns kube-flannel + kubectl label --overwrite ns kube-flannel pod-security.kubernetes.io/enforce=privileged -helm repo add flannel https://flannel-io.github.io/flannel -helm repo update -helm install flannel --set podCidr=10.244.0.0/16 --namespace kube-flannel flannel/flannel + helm repo add flannel https://flannel-io.github.io/flannel + helm repo update + helm install flannel --set podCidr=10.244.0.0/16 --namespace kube-flannel flannel/flannel EOF } From 4f80f98400a13130914f8c441201ea091ae3dfcd Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 18:13:57 +0200 Subject: [PATCH 08/15] add helm cleanup --- kubelift.sh | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/kubelift.sh b/kubelift.sh index a9b464c..3a47d48 100755 --- a/kubelift.sh +++ b/kubelift.sh @@ -335,14 +335,22 @@ EOF function install_cni() { ssh -o StrictHostKeyChecking=no "$SSH_USER@$CONTROL_PLANE_IP" bash << 'EOF' - curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash +if ! command -v helm &> /dev/null; then + curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash +fi + +kubectl create ns kube-flannel +kubectl label --overwrite ns kube-flannel pod-security.kubernetes.io/enforce=privileged - kubectl create ns kube-flannel - kubectl label --overwrite ns kube-flannel pod-security.kubernetes.io/enforce=privileged +if ! helm repo list | grep -q '^flannel\s'; then + helm repo add flannel https://flannel-io.github.io/flannel + helm repo update +fi - helm repo add flannel https://flannel-io.github.io/flannel - helm repo update - helm install flannel --set podCidr=10.244.0.0/16 --namespace kube-flannel flannel/flannel +helm install flannel \ + --set podCidr=10.244.0.0/16 \ + --namespace kube-flannel \ + flannel/flannel EOF } @@ -709,6 +717,12 @@ for pkg in kubectl kubeadm kubelet kubernetes-cni containerd.io docker-ce docker done $packages_exist && exit 1 +# Remove helm and all repos +if command -v helm &> /dev/null; then + helm repo list | tail -n +2 | awk '{print $1}' | xargs -r helm repo remove + rm $(command -v helm) +fi + exit 0 EOF @@ -730,6 +744,10 @@ function cleanup_node() { else log "Performing standard cleanup on $node_ip" ssh -o StrictHostKeyChecking=no "$SSH_USER@$node_ip" bash << 'EOF' +if [[ -f $HOME/.kube/config ]] && command -v helm &> /dev/null; then + helm repo list | tail -n +2 | awk '{print $1}' | xargs -r helm repo remove +fi + kubeadm reset -f rm -rf $HOME/.kube ip link delete cni0 || true From 2a02045d35c458027054521d18b72989fb0cc73c Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 18:16:19 +0200 Subject: [PATCH 09/15] rm more checks from todo --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 7516c32..9554686 100644 --- a/README.md +++ b/README.md @@ -153,7 +153,6 @@ Some of the alternatives you could consider are: - Add automated etcd backup and restor - Add support for more CNI plugins: Calico, Cilium - Add support for customizing the kubelet configuration -- More post-installation/upgrade checks - Cluster configuration templating - Add support for MetalLB - Add support for HA control plane From b3ea7bc4a4fbaf9e9b12b08f29345304c6e8f76b Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 18:17:09 +0200 Subject: [PATCH 10/15] rm input validation - already done --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 9554686..0fe4fe0 100644 --- a/README.md +++ b/README.md @@ -143,7 +143,6 @@ Some of the alternatives you could consider are: ## High Priority - Improve error handling and reporting -- Add more input validation - Allow custom CIDR ranges - Add support for additional logging output to a file - Add dry-run mode for operations From eb72d1b2fed3dd50a50937a66a552c1340b1a98a Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 18:17:27 +0200 Subject: [PATCH 11/15] rm kubelet customization - out of scope --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 0fe4fe0..00f5eb5 100644 --- a/README.md +++ b/README.md @@ -151,7 +151,6 @@ Some of the alternatives you could consider are: - Add automated etcd backup and restor - Add support for more CNI plugins: Calico, Cilium -- Add support for customizing the kubelet configuration - Cluster configuration templating - Add support for MetalLB - Add support for HA control plane From cfb5e216ebf6de1476a457da5bccfdd6e1b2e0b6 Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 18:17:49 +0200 Subject: [PATCH 12/15] rm etcd backup - out of scope --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 00f5eb5..3bc7043 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,6 @@ Some of the alternatives you could consider are: ## Medium Priority -- Add automated etcd backup and restor - Add support for more CNI plugins: Calico, Cilium - Cluster configuration templating - Add support for MetalLB From 5727e75864bcb468a3fd80e25c14c9b4c6330922 Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 18:18:06 +0200 Subject: [PATCH 13/15] rm metallb - out of scope --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 3bc7043..dc0914c 100644 --- a/README.md +++ b/README.md @@ -151,7 +151,6 @@ Some of the alternatives you could consider are: - Add support for more CNI plugins: Calico, Cilium - Cluster configuration templating -- Add support for MetalLB - Add support for HA control plane - Assume presence of the flag equals true (e.g. --noninteractive) if the flag is present - Add k3s support From 45e464aaf8ac04e523ea86675f8ab0cc4a46933a Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 18:18:27 +0200 Subject: [PATCH 14/15] rm performance tuning - out of scope --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index dc0914c..dec78c1 100644 --- a/README.md +++ b/README.md @@ -159,4 +159,3 @@ Some of the alternatives you could consider are: ## Low Priority - Air-gapped environments support -- Add performance tuning options From def059b3d521ce9dbfc4db76fe9b7ad628651328 Mon Sep 17 00:00:00 2001 From: Cezary Stanislawski Date: Fri, 25 Oct 2024 18:19:28 +0200 Subject: [PATCH 15/15] rm error handling and reporting - continuous goal --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index dec78c1..98a1ae5 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,6 @@ Some of the alternatives you could consider are: ## High Priority -- Improve error handling and reporting - Allow custom CIDR ranges - Add support for additional logging output to a file - Add dry-run mode for operations