From 56fda02564a128a9521a752635d6e099e44921cb Mon Sep 17 00:00:00 2001
From: Travis Nielsen
Date: Mon, 10 Oct 2022 17:37:21 -0600
Subject: [PATCH] core: add command to reset mon quorum

When quorum is lost, restoring quorum to a single mon is currently a
complex manual process. With this new krew command, the admin can reset
the mon quorum with less risk and restore the cluster in disaster
scenarios.

Signed-off-by: Travis Nielsen
---
 README.md            |   1 +
 docs/mons.md         |  27 +++++-
 kubectl-rook-ceph.sh | 202 ++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 215 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index d2a35d52..1cc03e0c 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,7 @@ These are args currently supported:
 - `rbd <args>` : Call a 'rbd' CLI command with arbitrary args
 - `mons` : Print mon endpoints
+- `reset-quorum <good_mon>` : Reset the mon quorum to a single mon when quorum has been lost with the other mons
 - `health` : check health of the cluster and common configuration issues
diff --git a/docs/mons.md b/docs/mons.md
index 6b39610a..bf3bdc9b 100644
--- a/docs/mons.md
+++ b/docs/mons.md
@@ -1,4 +1,6 @@
-# Mons
+# Mon Commands
+
+## Print Mons
 
 This is used to print mon endpoints.
 
@@ -7,3 +9,26 @@ kubectl rook-ceph mons
 
 # 10.98.95.196:6789,10.106.118.240:6789,10.111.18.121:6789
 ```
+
+## Reset Quorum
+
+Mon quorum is critical to the Ceph cluster. If the majority of mons are not in
+quorum, the cluster will be down. If the majority of mons are also lost
+permanently, the quorum will need to be reset to a remaining good mon in order
+to bring the Ceph cluster up again.
+
+To reset the quorum in this disaster scenario:
+1. Identify that mon quorum is lost (see the example at the end of this section). Some indications include:
+   - The Rook operator log shows timeout errors and continuously fails to reconcile
+   - All commands in the toolbox are unresponsive
+   - Multiple mon pods are likely down
+2. Identify which mon has good state (TBD)
+3. Run the command to restore quorum to that good mon
+4. Follow the prompts to confirm that you want to continue with each critical step of the reset
+5. The final prompt will be to restart the operator, which will add new mons to restore the full quorum size
+
+In this example, quorum is reset to mon **a**.
+
+```bash
+kubectl rook-ceph mons reset-quorum a
+```
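+
+If it is not obvious which mons are down (step 1), listing the mon pods is a
+quick check. This example assumes the cluster is running in the default
+`rook-ceph` namespace; mons whose pods are not `Running` are the likely
+candidates to be discarded.
+
+```bash
+kubectl -n rook-ceph get pod -l app=rook-ceph-mon
+```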
echo " mons : output mon endpoints" + echo " reset-quorum : When quorum is lost, reset quorum to the remaining healthy mon" echo " rook" echo " version : print the version of Rook" echo " status : print the phase and conditions of the CephCluster CR" @@ -226,14 +227,176 @@ function path_cm_rook_ceph_operator_config() { } #################################################################################################### -# 'kubectl rook-ceph mon-endpoints' commands +# 'kubectl rook-ceph mons' commands #################################################################################################### +function run_mons_command () { + if [ "$#" -ge 1 ] && [ "$1" = "reset-quorum" ]; then + shift # remove the subcommand from the front of the arg list + run_reset_quorum "$@" + else + fetch_mon_endpoints "$@" + fi +} + function fetch_mon_endpoints() { end_of_command_parsing "$@" # end of command tree KUBECTL_NS_CLUSTER get cm rook-ceph-mon-endpoints -o json | jq --monochrome-output '.data.data' | tr -d '"' | tr -d '=' | sed 's/[A-Za-z]*//g' } + +wait_for_pod_of_deployment_to_be_running() { + echo -e "\nTODO: waiting for the pod from deployment \"rook-ceph-mon-$good_mon-debug\" to be running" + sleep 5 +} + +function run_reset_quorum() { + parse_flags parse_image_flag "$@" # parse flags before the good mon name + [[ -z "${REMAINING_ARGS[0]:-""}" ]] && fail_error "Missing healthy mon name" + good_mon="${REMAINING_ARGS[0]}" # get the good mon being used to reset quorum + shift # remove the healthy mon from the front of the arg list + REMAINING_ARGS=("${REMAINING_ARGS[@]:1}") # remove mon name from remaining args + end_of_command_parsing "$@" # end of command tree + + # Parse the endpoints configmap for the mon endpoints + bad_mons=() + mon_endpoints=$(KUBECTL_NS_CLUSTER get cm rook-ceph-mon-endpoints -o jsonpath='{.data.data}') + for single_mon in ${mon_endpoints//,/ } ; do + mon_name=$(echo "${single_mon/=/ }" | awk '{print $1}') + mon_endpoint=$(echo "${single_mon/=/ }" | awk '{print $2}') + echo "mon=$mon_name, endpoint=$mon_endpoint" + if [ "$mon_name" = "$good_mon" ]; then + good_mon_public_ip=$(echo "${mon_endpoint/:/ }" | awk '{print $1}') + good_mon_port=$(echo "${mon_endpoint/:/ }" | awk '{print $2}') + else + bad_mons+=($mon_name) + fi + done + + # Parse the cluster FSID + ceph_fsid=$(KUBECTL_NS_CLUSTER get secret rook-ceph-mon -o jsonpath='{.data.fsid}' | base64 -d) + if [ -z ${good_mon_public_ip+x} ]; then + echo "error: good mon $good_mon not found" + exit 1 + fi + if [ "$ceph_fsid" = "" ]; then + echo "error: ceph cluster fsid not found" + exit 1 + fi + + export monmap_path=/tmp/monmap + + echo "" + echo "WARNING: Resetting mon quorum to mon $good_mon ($good_mon_public_ip)" + echo "The mons to discard are: ${bad_mons[*]}" + echo "The cluster fsid is $ceph_fsid" + echo "If you want to continue resetting the mon quorum to $good_mon, type \"yes\"" + read CONTINUE_SCRIPT + if [ "$CONTINUE_SCRIPT" != "yes" ]; then + exit 1 + fi + + # scale the operator deployment down + KUBECTL_NS_CLUSTER scale deployment rook-ceph-operator --replicas=0 + + # scale down all the mon pods + KUBECTL_NS_CLUSTER scale deployment -l app=rook-ceph-mon --replicas=0 + + # TODO: wait for the operator and mons to all stop + echo -e "\nTODO: Waiting for operator and mons to stop..." 
+  sleep 5
+
+  # start the mon debug pod
+  run_start_debug rook-ceph-mon-$good_mon
+
+  wait_for_pod_of_deployment_to_be_running "rook-ceph-mon-$good_mon-debug"
+
+  echo -e "\nStarted debug pod, resetting the mon quorum in the debug pod"
+
+  # run some ceph commands in the mon debug pod to reset quorum
+  set +eu
+  echo -e "\nExtracting the monmap"
+  KUBECTL_NS_OPERATOR exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- ceph-mon \
+    --fsid=$ceph_fsid \
+    --keyring=/etc/ceph/keyring-store/keyring \
+    --log-to-stderr=true \
+    --err-to-stderr=true \
+    --mon-cluster-log-to-stderr=true \
+    --log-stderr-prefix=debug \
+    --default-log-to-file=false \
+    --default-mon-cluster-log-to-file=false \
+    --mon-host=$ROOK_CEPH_MON_HOST \
+    --mon-initial-members=$ROOK_CEPH_MON_INITIAL_MEMBERS \
+    --id=$good_mon \
+    --foreground \
+    --public-addr=$good_mon_public_ip \
+    --setuser-match-path=/var/lib/ceph/mon/ceph-$good_mon/store.db \
+    --public-bind-addr=$ROOK_POD_IP \
+    --extract-monmap=$monmap_path
+
+  echo -e "\nPrinting monmap"
+  KUBECTL_NS_OPERATOR exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- monmaptool --print $monmap_path
+
+  # remove all the mons except the good one
+  for bad_mon in "${bad_mons[@]}"
+  do
+    echo -e "\nRemoving mon $bad_mon"
+    KUBECTL_NS_OPERATOR exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- monmaptool $monmap_path --rm $bad_mon
+  done
+
+  echo -e "\nInjecting the monmap"
+  KUBECTL_NS_OPERATOR exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- ceph-mon \
+    --fsid=$ceph_fsid \
+    --keyring=/etc/ceph/keyring-store/keyring \
+    --log-to-stderr=true \
+    --err-to-stderr=true \
+    --mon-cluster-log-to-stderr=true \
+    --log-stderr-prefix=debug \
+    --default-log-to-file=false \
+    --default-mon-cluster-log-to-file=false \
+    --mon-host=$ROOK_CEPH_MON_HOST \
+    --mon-initial-members=$ROOK_CEPH_MON_INITIAL_MEMBERS \
+    --id=$good_mon \
+    --foreground \
+    --public-addr=$good_mon_public_ip \
+    --setuser-match-path=/var/lib/ceph/mon/ceph-$good_mon/store.db \
+    --public-bind-addr=$ROOK_POD_IP \
+    --inject-monmap=$monmap_path
+  echo -e "\nFinished updating the monmap!"
+  set -eu
+
+  echo -e "\nPrinting final monmap"
+  KUBECTL_NS_OPERATOR exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- monmaptool --print $monmap_path
+
+  echo -e "\nResetting the mons in the rook-ceph-mon-endpoints configmap to the good mon"
+  KUBECTL_NS_OPERATOR patch configmaps rook-ceph-mon-endpoints --type json --patch "[{ op: replace, path: /data/data, value: $good_mon=$good_mon_public_ip:$good_mon_port }]"
+
+  echo -e "\nStopping the debug pod for mon $good_mon"
+  run_stop_debug rook-ceph-mon-$good_mon
+
+  echo -e "\nTODO: Add a check that the good mon is healthy"
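+  # A possible shape for that health check, left as a sketch rather than wired
+  # in. It assumes the rook-ceph-tools toolbox deployment exists in the cluster
+  # namespace and that the restored mon is reachable:
+  #   KUBECTL_NS_CLUSTER exec deploy/rook-ceph-tools -- ceph quorum_status --format json-pretty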
+
+  echo -e "\nMon quorum was successfully reset to mon $good_mon"
+
+  echo -e "\nProceed to purge the bad mons (${bad_mons[*]})? If so, type \"yes\""
+  read PURGE_MONS
+  if [ "$PURGE_MONS" = "yes" ]; then
+    for bad_mon in "${bad_mons[@]}"
+    do
+      echo "purging old mon: $bad_mon"
+      KUBECTL_NS_OPERATOR delete deploy rook-ceph-mon-$bad_mon
+      KUBECTL_NS_OPERATOR delete svc rook-ceph-mon-$bad_mon
+    done
+  fi
+
+  echo -e "\nStart up the operator and expand to full mon quorum again? If so, type \"yes\""
+  read START_OPERATOR
+  if [ "$START_OPERATOR" = "yes" ]; then
+    # scale up the operator
+    KUBECTL_NS_CLUSTER scale deployment rook-ceph-operator --replicas=1
+  fi
+}
+
 ####################################################################################################
 # 'kubectl rook-ceph rook ...' commands
 ####################################################################################################
@@ -422,16 +585,20 @@ function run_start_debug() {
   parse_flags parse_image_flag "$@" # parse flags before the deployment name
   [[ -z "${REMAINING_ARGS[0]:-""}" ]] && fail_error "Missing mon or osd deployment name"
   deployment_name="${REMAINING_ARGS[0]}" # get deployment name
   REMAINING_ARGS=("${REMAINING_ARGS[@]:1}") # remove deploy name from remaining args
-  parse_flags parse_image_flag "${REMAINING_ARGS[@]}" # parse flags after the deployment name
-  end_of_command_parsing "${REMAINING_ARGS[@]}"
+  if [[ -n "${REMAINING_ARGS[*]:-}" ]]; then
+    parse_flags parse_image_flag "${REMAINING_ARGS[@]}" # parse flags after the deployment name
+    end_of_command_parsing "${REMAINING_ARGS[@]}"
+  fi
 
   verify_debug_deployment "$deployment_name"
 
   # copy the deployment spec before scaling it down
-  deployment_spec=$(KUBECTL_NS_CLUSTER get deployments "$deployment_name" -o json | jq -r ".spec")
+  deployment_spec=$(KUBECTL_NS_CLUSTER get deployment "$deployment_name" -o json | jq -r ".spec")
 
   # copy the deployment labels before scaling it down
-  labels=$(KUBECTL_NS_CLUSTER get deployments "$deployment_name" -o json | jq -r ".metadata.labels")
+  labels=$(KUBECTL_NS_CLUSTER get deployment "$deployment_name" -o json | jq -r ".metadata.labels")
   # add debug label to the list
   labels=$(echo "$labels" | jq '. + {"ceph.rook.io/do-not-reconcile": "true"}')
   # remove probes from the deployment
@@ -445,13 +612,18 @@ function run_start_debug() {
   echo "setting debug command to main container"
   deployment_spec=$(update_deployment_spec_command "$deployment_spec")
 
-  deployment_pod=$(KUBECTL_NS_CLUSTER get pod | grep "$deployment_name" | awk '{ print $1 }')
-  # scale the deployment to 0
-  KUBECTL_NS_CLUSTER scale deployments "$deployment_name" --replicas=0
-
-  # wait for the deployment pod to be deleted
-  echo "waiting for the deployment pod \"$deployment_pod\" to be deleted"
-  KUBECTL_NS_CLUSTER wait --for=delete pod/"$deployment_pod" --timeout=60s
+  # scale down the daemon pod if it's still running (it may already be down,
+  # for example when called from reset-quorum after the mons were scaled down)
+  deployment_pod=$(KUBECTL_NS_CLUSTER get pod | grep "$deployment_name" | awk '{ print $1 }')
+  if [[ -n "$deployment_pod" ]]; then
+    # scale the deployment to 0
+    echo "scaling down the deployment $deployment_name"
+    KUBECTL_NS_CLUSTER scale deployments "$deployment_name" --replicas=0
+
+    # wait for the deployment pod to be deleted
+    echo "waiting for the deployment pod \"$deployment_pod\" to be deleted"
+    KUBECTL_NS_CLUSTER wait --for=delete pod/"$deployment_pod" --timeout=60s
+  fi
 
   # create debug deployment
   cat <