From 56fda02564a128a9521a752635d6e099e44921cb Mon Sep 17 00:00:00 2001
From: Travis Nielsen
Date: Mon, 10 Oct 2022 17:37:21 -0600
Subject: [PATCH] core: add command to reset mon quorum

When quorum is lost, restoring quorum to a single mon is currently a
complex manual process. With this new krew command, the admin can reset
the mon quorum with less risk and restore the cluster in disaster
scenarios.

Signed-off-by: Travis Nielsen
---
 README.md            |   1 +
 docs/mons.md         |  27 +++++-
 kubectl-rook-ceph.sh | 202 ++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 215 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index d2a35d52..1cc03e0c 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,7 @@ These are args currently supported:
 - `rbd <args>` : Call a 'rbd' CLI command with arbitrary args
 - `mons` : Print mon endpoints
+- `reset-quorum <good_mon>` : Reset the mon quorum to a single mon when quorum has been lost with the other mons
 - `health` : check health of the cluster and common configuration issues
diff --git a/docs/mons.md b/docs/mons.md
index 6b39610a..bf3bdc9b 100644
--- a/docs/mons.md
+++ b/docs/mons.md
@@ -1,4 +1,6 @@
-# Mons
+# Mon Commands
+
+## Print Mons
 
 This is used to print mon endpoints.
 
@@ -7,3 +9,26 @@ kubectl rook-ceph mons
 
 # 10.98.95.196:6789,10.106.118.240:6789,10.111.18.121:6789
 ```
+
+## Reset Quorum
+
+Mon quorum is critical to the Ceph cluster. If the majority of mons are not in
+quorum, the cluster will be down. If the majority of mons are also lost
+permanently, the quorum will need to be reset to a remaining good mon in order
+to bring the Ceph cluster up again.
+
+To reset the quorum in this disaster scenario:
+1. Identify that mon quorum is lost (see the example at the end of this section). Some indications include:
+   - The Rook operator log shows timeout errors and continuously fails to reconcile
+   - All commands in the toolbox are unresponsive
+   - Multiple mon pods are likely down
+2. Identify which mon has good state (TBD)
+3. Run the command to restore quorum to that good mon
+4. Follow the prompts to confirm that you want to continue with each critical step of the reset
+5. The final prompt will be to restart the operator, which will add new mons to restore the full quorum size
+
+In this example, quorum is reset to mon **a**.
+
+```bash
+kubectl rook-ceph mons reset-quorum a
+```
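+
+If it is not obvious which mons are down (step 1), listing the mon pods is a
+quick check. This example assumes the cluster is running in the default
+`rook-ceph` namespace; mons whose pods are not `Running` are the likely
+candidates to be discarded.
+
+```bash
+kubectl -n rook-ceph get pod -l app=rook-ceph-mon
+```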
echo " mons : output mon endpoints" + echo " reset-quorum : When quorum is lost, reset quorum to the remaining healthy mon" echo " rook" echo " version : print the version of Rook" echo " status : print the phase and conditions of the CephCluster CR" @@ -226,14 +227,176 @@ function path_cm_rook_ceph_operator_config() { } #################################################################################################### -# 'kubectl rook-ceph mon-endpoints' commands +# 'kubectl rook-ceph mons' commands #################################################################################################### +function run_mons_command () { + if [ "$#" -ge 1 ] && [ "$1" = "reset-quorum" ]; then + shift # remove the subcommand from the front of the arg list + run_reset_quorum "$@" + else + fetch_mon_endpoints "$@" + fi +} + function fetch_mon_endpoints() { end_of_command_parsing "$@" # end of command tree KUBECTL_NS_CLUSTER get cm rook-ceph-mon-endpoints -o json | jq --monochrome-output '.data.data' | tr -d '"' | tr -d '=' | sed 's/[A-Za-z]*//g' } + +wait_for_pod_of_deployment_to_be_running() { + echo -e "\nTODO: waiting for the pod from deployment \"rook-ceph-mon-$good_mon-debug\" to be running" + sleep 5 +} + +function run_reset_quorum() { + parse_flags parse_image_flag "$@" # parse flags before the good mon name + [[ -z "${REMAINING_ARGS[0]:-""}" ]] && fail_error "Missing healthy mon name" + good_mon="${REMAINING_ARGS[0]}" # get the good mon being used to reset quorum + shift # remove the healthy mon from the front of the arg list + REMAINING_ARGS=("${REMAINING_ARGS[@]:1}") # remove mon name from remaining args + end_of_command_parsing "$@" # end of command tree + + # Parse the endpoints configmap for the mon endpoints + bad_mons=() + mon_endpoints=$(KUBECTL_NS_CLUSTER get cm rook-ceph-mon-endpoints -o jsonpath='{.data.data}') + for single_mon in ${mon_endpoints//,/ } ; do + mon_name=$(echo "${single_mon/=/ }" | awk '{print $1}') + mon_endpoint=$(echo "${single_mon/=/ }" | awk '{print $2}') + echo "mon=$mon_name, endpoint=$mon_endpoint" + if [ "$mon_name" = "$good_mon" ]; then + good_mon_public_ip=$(echo "${mon_endpoint/:/ }" | awk '{print $1}') + good_mon_port=$(echo "${mon_endpoint/:/ }" | awk '{print $2}') + else + bad_mons+=($mon_name) + fi + done + + # Parse the cluster FSID + ceph_fsid=$(KUBECTL_NS_CLUSTER get secret rook-ceph-mon -o jsonpath='{.data.fsid}' | base64 -d) + if [ -z ${good_mon_public_ip+x} ]; then + echo "error: good mon $good_mon not found" + exit 1 + fi + if [ "$ceph_fsid" = "" ]; then + echo "error: ceph cluster fsid not found" + exit 1 + fi + + export monmap_path=/tmp/monmap + + echo "" + echo "WARNING: Resetting mon quorum to mon $good_mon ($good_mon_public_ip)" + echo "The mons to discard are: ${bad_mons[*]}" + echo "The cluster fsid is $ceph_fsid" + echo "If you want to continue resetting the mon quorum to $good_mon, type \"yes\"" + read CONTINUE_SCRIPT + if [ "$CONTINUE_SCRIPT" != "yes" ]; then + exit 1 + fi + + # scale the operator deployment down + KUBECTL_NS_CLUSTER scale deployment rook-ceph-operator --replicas=0 + + # scale down all the mon pods + KUBECTL_NS_CLUSTER scale deployment -l app=rook-ceph-mon --replicas=0 + + # TODO: wait for the operator and mons to all stop + echo -e "\nTODO: Waiting for operator and mons to stop..." 
+  sleep 5
+
+  # start the mon debug pod
+  run_start_debug rook-ceph-mon-$good_mon
+
+  wait_for_pod_of_deployment_to_be_running "rook-ceph-mon-$good_mon-debug"
+
+  echo -e "\nStarted debug pod, resetting the mon quorum in the debug pod"
+
+  # run some ceph commands in the mon debug pod to reset quorum
+  set +eu
+  echo -e "\nExtracting the monmap"
+  KUBECTL_NS_OPERATOR exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- ceph-mon \
+    --fsid=$ceph_fsid \
+    --keyring=/etc/ceph/keyring-store/keyring \
+    --log-to-stderr=true \
+    --err-to-stderr=true \
+    --mon-cluster-log-to-stderr=true \
+    --log-stderr-prefix=debug \
+    --default-log-to-file=false \
+    --default-mon-cluster-log-to-file=false \
+    --mon-host=$ROOK_CEPH_MON_HOST \
+    --mon-initial-members=$ROOK_CEPH_MON_INITIAL_MEMBERS \
+    --id=$good_mon \
+    --foreground \
+    --public-addr=$good_mon_public_ip \
+    --setuser-match-path=/var/lib/ceph/mon/ceph-$good_mon/store.db \
+    --public-bind-addr=$ROOK_POD_IP \
+    --extract-monmap=$monmap_path
+
+  echo -e "\nPrinting monmap"
+  KUBECTL_NS_OPERATOR exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- monmaptool --print $monmap_path
+
+  # remove all the mons except the good one
+  for bad_mon in "${bad_mons[@]}"
+  do
+    echo -e "\nRemoving mon $bad_mon"
+    KUBECTL_NS_OPERATOR exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- monmaptool $monmap_path --rm $bad_mon
+  done
+
+  echo -e "\nInjecting the monmap"
+  KUBECTL_NS_OPERATOR exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- ceph-mon \
+    --fsid=$ceph_fsid \
+    --keyring=/etc/ceph/keyring-store/keyring \
+    --log-to-stderr=true \
+    --err-to-stderr=true \
+    --mon-cluster-log-to-stderr=true \
+    --log-stderr-prefix=debug \
+    --default-log-to-file=false \
+    --default-mon-cluster-log-to-file=false \
+    --mon-host=$ROOK_CEPH_MON_HOST \
+    --mon-initial-members=$ROOK_CEPH_MON_INITIAL_MEMBERS \
+    --id=$good_mon \
+    --foreground \
+    --public-addr=$good_mon_public_ip \
+    --setuser-match-path=/var/lib/ceph/mon/ceph-$good_mon/store.db \
+    --public-bind-addr=$ROOK_POD_IP \
+    --inject-monmap=$monmap_path
+  echo -e "\nFinished updating the monmap!"
+  set -eu
+
+  echo -e "\nPrinting final monmap"
+  KUBECTL_NS_OPERATOR exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- monmaptool --print $monmap_path
+
+  echo -e "\nResetting the mons in the rook-ceph-mon-endpoints configmap to the good mon"
+  KUBECTL_NS_OPERATOR patch configmaps rook-ceph-mon-endpoints --type json --patch "[{ op: replace, path: /data/data, value: $good_mon=$good_mon_public_ip:$good_mon_port }]"
+
+  echo -e "\nStopping the debug pod for mon $good_mon"
+  run_stop_debug rook-ceph-mon-$good_mon
+
+  echo -e "\nTODO: Add a check that the good mon is healthy"
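+  # A possible shape for that health check, left as a sketch rather than wired
+  # in. It assumes the rook-ceph-tools toolbox deployment exists in the cluster
+  # namespace and that the restored mon is reachable:
+  #   KUBECTL_NS_CLUSTER exec deploy/rook-ceph-tools -- ceph quorum_status --format json-pretty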
+
+  echo -e "\nMon quorum was successfully reset to mon $good_mon"
+
+  echo -e "\nProceed to purge the bad mons (${bad_mons[*]})? If so, type \"yes\""
+  read PURGE_MONS
+  if [ "$PURGE_MONS" = "yes" ]; then
+    for bad_mon in "${bad_mons[@]}"
+    do
+      echo "purging old mon: $bad_mon"
+      KUBECTL_NS_OPERATOR delete deploy rook-ceph-mon-$bad_mon
+      KUBECTL_NS_OPERATOR delete svc rook-ceph-mon-$bad_mon
+    done
+  fi
+
+  echo -e "\nStart up the operator and expand to full mon quorum again? If so, type \"yes\""
+  read START_OPERATOR
+  if [ "$START_OPERATOR" = "yes" ]; then
+    # scale up the operator
+    KUBECTL_NS_CLUSTER scale deployment rook-ceph-operator --replicas=1
+  fi
+}
+
 ####################################################################################################
 # 'kubectl rook-ceph rook ...' commands
 ####################################################################################################
@@ -422,16 +585,20 @@ function run_start_debug() {
   parse_flags parse_image_flag "$@" # parse flags before the deployment name
   [[ -z "${REMAINING_ARGS[0]:-""}" ]] && fail_error "Missing mon or osd deployment name"
   deployment_name="${REMAINING_ARGS[0]}" # get deployment name
   REMAINING_ARGS=("${REMAINING_ARGS[@]:1}") # remove deploy name from remaining args
-  parse_flags parse_image_flag "${REMAINING_ARGS[@]}" # parse flags after the deployment name
-  end_of_command_parsing "${REMAINING_ARGS[@]}"
+  if [[ -n "${REMAINING_ARGS[*]:-}" ]]; then
+    parse_flags parse_image_flag "${REMAINING_ARGS[@]}" # parse flags after the deployment name
+    end_of_command_parsing "${REMAINING_ARGS[@]}"
+  fi
 
   verify_debug_deployment "$deployment_name"
 
   # copy the deployment spec before scaling it down
-  deployment_spec=$(KUBECTL_NS_CLUSTER get deployments "$deployment_name" -o json | jq -r ".spec")
+  deployment_spec=$(KUBECTL_NS_CLUSTER get deployment "$deployment_name" -o json | jq -r ".spec")
 
   # copy the deployment labels before scaling it down
-  labels=$(KUBECTL_NS_CLUSTER get deployments "$deployment_name" -o json | jq -r ".metadata.labels")
+  labels=$(KUBECTL_NS_CLUSTER get deployment "$deployment_name" -o json | jq -r ".metadata.labels")
   # add debug label to the list
   labels=$(echo "$labels" | jq '. + {"ceph.rook.io/do-not-reconcile": "true"}')
   # remove probes from the deployment
@@ -445,13 +612,18 @@ function run_start_debug() {
   echo "setting debug command to main container"
   deployment_spec=$(update_deployment_spec_command "$deployment_spec")
 
-  deployment_pod=$(KUBECTL_NS_CLUSTER get pod | grep "$deployment_name" | awk '{ print $1 }')
-  # scale the deployment to 0
-  KUBECTL_NS_CLUSTER scale deployments "$deployment_name" --replicas=0
-
-  # wait for the deployment pod to be deleted
-  echo "waiting for the deployment pod \"$deployment_pod\" to be deleted"
-  KUBECTL_NS_CLUSTER wait --for=delete pod/"$deployment_pod" --timeout=60s
+  # scale down the daemon pod if it's still running (it may already be down,
+  # for example when called from reset-quorum after the mons were scaled down)
+  deployment_pod=$(KUBECTL_NS_CLUSTER get pod | grep "$deployment_name" | awk '{ print $1 }')
+  if [[ -n "$deployment_pod" ]]; then
+    # scale the deployment to 0
+    echo "scaling down the deployment $deployment_name"
+    KUBECTL_NS_CLUSTER scale deployments "$deployment_name" --replicas=0
+
+    # wait for the deployment pod to be deleted
+    echo "waiting for the deployment pod \"$deployment_pod\" to be deleted"
+    KUBECTL_NS_CLUSTER wait --for=delete pod/"$deployment_pod" --timeout=60s
+  fi
 
   # create debug deployment
   cat <