Skip to content

Commit

Permalink
Startup and readiness improvements and fix for monitor verbose readin…
Browse files Browse the repository at this point in the history
…ess logging (#133)
  • Loading branch information
bczoma authored Aug 14, 2023
1 parent c989700 commit 3f460d2
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 82 deletions.
4 changes: 2 additions & 2 deletions pubsubplus/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
apiVersion: v2
description: Deploy Solace PubSub+ Event Broker Singleton or HA redundancy group onto a Kubernetes Cluster
name: pubsubplus
version: 3.3.1
icon: https://solaceproducts.github.io/pubsubplus-kubernetes-quickstart/images/PubSubPlus.png
version: 3.3.2
icon: https://solaceproducts.github.io/pubsubplus-kubernetes-helm-quickstart/images/PubSubPlus.png
kubeVersion: '>= 1.10.0-0'
maintainers:
- name: Solace Community Forum
Expand Down
139 changes: 59 additions & 80 deletions pubsubplus/templates/solaceConfigMap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,9 @@ data:
cat /mnt/disks/certs/server/{{.Values.tls.certFilename | default "tls.key"}} /mnt/disks/certs/server/{{.Values.tls.certKeyFilename | default "tls.crt"}} > /dev/shm/server.cert
export tls_servercertificate_filepath="/dev/shm/server.cert"
{{- end }}
# Deal with the fact we cannot accept "-" in router names
export routername=$(echo $(hostname) | sed 's/-//g')
{{- if .Values.solace.redundancy }}
# [TODO] KBARR not using correct method of finding ordinal until we bump min Kubernetes release above 1.8.1
# https://github.com/kubernetes/kubernetes/issues/40651
# node_ordinal=$(STATEFULSET_ORDINAL)
IFS='-' read -ra host_array <<< $(hostname)
node_ordinal=${host_array[-1]}
if [[ ! -z `echo $STATEFULSET_NAMESPACE` ]]; then
Expand All @@ -49,9 +48,7 @@ data:
namespace=default
fi
service={{ template "solace.fullname" . }}
# Deal with the fact we cannot accept "-" in routre names
service_name=$(echo ${service} | sed 's/-//g')
export routername=$(echo $(hostname) | sed 's/-//g')
export redundancy_enable=yes
export configsync_enable=yes
export redundancy_authentication_presharedkey_key=`cat /mnt/disks/secrets/username_admin_password | awk '{x=$0;for(i=length;i<51;i++)x=x "0";}END{print x}' | base64` # Right-pad with 0s to 50 length
Expand Down Expand Up @@ -92,6 +89,7 @@ data:
loop_guard=60
pause=10
count=0
# Wait for Solace Management API
while [ ${count} -lt ${loop_guard} ]; do
if /mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 -t ; then
break
Expand Down Expand Up @@ -131,6 +129,7 @@ data:
resync_step_required=""
role=""
count=0
# Determine node's primary or backup role
while [ ${count} -lt ${loop_guard} ]; do
role_results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><redundancy><detail/></redundancy></show></rpc>" \
Expand All @@ -147,16 +146,16 @@ data:
;;
esac
((count++))
echo "`date` INFO: ${APP}-Waited ${run_time} seconds, got ${role_results} for this node's active-standby role"
echo "`date` INFO: ${APP}-Waited ${run_time} seconds, got ${role_results} for this node's primary or backup role"
sleep ${pause}
done
if [ ${count} -eq ${loop_guard} ]; then
echo "`date` ERROR: ${APP}-Could not determine this node's active-standby role" >&2
echo "`date` ERROR: ${APP}-Could not determine this node's primary or backup role" >&2
exit 1
fi
# Determine local activity
echo "`date` INFO: ${APP}-Management API is up, determined that this node's role is: ${role}"
# Determine activity (local or mate active)
count=0
echo "`date` INFO: ${APP}-Management API is up, determined that this node's active-standby role is: ${role}"
while [ ${count} -lt ${loop_guard} ]; do
online_results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><redundancy><detail/></redundancy></show></rpc>" \
Expand All @@ -172,7 +171,7 @@ data:
echo "`date` INFO: ${APP}-Broker initial startup detected. This node will assert config-sync configuration over its mate"
resync_step_required="true"
else
echo "`date` WARN: ${APP}-Unexpected state: this is not an initial startup of the broker and this node reports Local Active. Normally expected nodes are Mate Active after restart"
echo "`date` WARN: ${APP}-Unexpected state: this is not an initial startup of the broker and this node reports Local Active. Possibly a redeploy?"
fi
break
;;
Expand All @@ -182,15 +181,16 @@ data:
;;
esac
((count++))
echo "`date` INFO: ${APP}-Waited ${run_time} seconds, Local activity state is: ${local_activity}"
echo "`date` INFO: ${APP}-Waited ${run_time} seconds, node activity state is: ${local_activity}"
sleep ${pause}
done
if [ ${count} -eq ${loop_guard} ]; then
echo "`date` ERROR: ${APP}-Local activity state never become Local Active or Mate Active" >&2
echo "`date` ERROR: ${APP}-Node activity state never become Local Active or Mate Active" >&2
exit 1
fi
# If we need to assert leader, then we need to wait for mate to reconcile
# If we need to assert leader, then first wait for mate to report Standby state
if [ "${resync_step_required}" = "true" ]; then
# This branch is AD-active only
count=0
echo "`date` INFO: ${APP}-Waiting for mate activity state to be 'Standby'"
while [ ${count} -lt ${loop_guard} ]; do
Expand All @@ -214,7 +214,7 @@ data:
exit 1
fi
fi # if assert-leader
# Ensure Config-sync connection state is Connected before proceeding
# Ensure Config-sync connection state is Connected for both primary and backup before proceeding
count=0
echo "`date` INFO: ${APP}-Waiting for config-sync connected"
while [ ${count} -lt ${loop_guard} ]; do
Expand All @@ -239,11 +239,12 @@ data:
fi
# Now can issue assert-leader command
if [ "${resync_step_required}" = "true" ]; then
echo "`date` INFO: ${APP}-Initiating assert-leader"
/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><admin><config-sync><assert-leader><router/></assert-leader></config-sync></admin></rpc>"
/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><admin><config-sync><assert-leader><vpn-name>*</vpn-name></assert-leader></config-sync></admin></rpc>"
# This branch is AD-active only
echo "`date` INFO: ${APP}-Initiating assert-leader"
/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><admin><config-sync><assert-leader><router/></assert-leader></config-sync></admin></rpc>"
/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><admin><config-sync><assert-leader><vpn-name>*</vpn-name></assert-leader></config-sync></admin></rpc>"
fi
# Wait for config-sync results
count=0
Expand All @@ -263,7 +264,7 @@ data:
((count++))
echo "`date` INFO: ${APP}-Waited ${run_time} seconds, Config-sync is: ${confsyncstatus_results}, not yet Up"

# Additional check to confirm config-sync
# Additional checks to confirm config-sync (even if reported globally as not Up, it may still be up between local primary and backup in a DR setup)
echo "`date` INFO: ${APP}-Checking Config-sync Setup. Starting additional checks to confirm config-sync locally..."
messagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><config-sync><database/><detail/></config-sync></show></rpc>" \
Expand Down Expand Up @@ -378,36 +379,15 @@ data:
IFS='-' read -ra host_array <<< $(hostname)
node_ordinal=${host_array[-1]}
password=`cat /mnt/disks/secrets/username_admin_password`

# For update (includes SolOS upgrade) purposes, additional checks are required for readiness state when the pod has been started
# This is an update if the LASTVERSION_FILE with K8s controller-revision-hash exists and contents differ from current value
LASTVERSION_FILE=/var/lib/solace/var/lastConfigRevisionBeforeReboot
if [ ! -f ${LASTVERSION_FILE} ] || [[ $(cat ${LASTVERSION_FILE}) != $(get_label "controller-revision-hash") ]] ; then
echo "`date` INFO: ${APP}-Initial startup or Upgrade detected, running additional checks..."
# Check redundancy
echo "`date` INFO: ${APP}-Running checks. Redundancy state check started..."
results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><redundancy/></show></rpc>" \
-v "/rpc-reply/rpc/show/redundancy/redundancy-status"`
redundancystatus_results=`echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
if [ "${redundancystatus_results}" != "Up" ]; then
echo "`date` INFO: ${APP}-Redundancy state is not yet up."
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
fi

fi
# Record current version in LASTVERSION_FILE
echo $(get_label "controller-revision-hash") > ${LASTVERSION_FILE}
# For monitor node just check for redundancy; active label will never be set
if [ "${node_ordinal}" = "2" ]; then
# Check redundancy
echo "`date` INFO: ${APP}-Running checks. Redundancy state check started..."
results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><redundancy/></show></rpc>" \
-v "/rpc-reply/rpc/show/redundancy/redundancy-status"`
redundancystatus_results=`echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
if [ "${redundancystatus_results}" != "Up" ]; then
echo "`date` INFO: ${APP}-Redundancy state is not yet up."
echo "`date` INFO: ${APP}-Waiting for redundancy up, redundancy state is not yet up."
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
fi
if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then
Expand All @@ -418,6 +398,7 @@ data:
fi
exit 0
fi # End Monitor Node
# From here only message routing nodes.
# For Primary or Backup nodes set both service readiness (active label) and k8s readiness (exit return value)
health_result=`curl -s -o /dev/null -w "%{http_code}" http://localhost:5550/health-check/guaranteed-active`
case "${health_result}" in
Expand Down Expand Up @@ -467,54 +448,52 @@ data:
echo "`date` INFO: ${APP}-Running checks.Redundancy state is not yet up."
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
fi
# Additionally check config-sync status for non-monitoring nodes
if [ "${node_ordinal}" != "2" ]; then
results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><config-sync></config-sync></show></rpc>" \
-v "/rpc-reply/rpc/show/config-sync/status/oper-status"`
confsyncstatus_results=`echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
if [ "${confsyncstatus_results}" != "Up" ]; then
# Check config-sync status
results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><config-sync></config-sync></show></rpc>" \
-v "/rpc-reply/rpc/show/config-sync/status/oper-status"`
confsyncstatus_results=`echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
if [ "${confsyncstatus_results}" != "Up" ]; then

# Additional check to confirm config-sync
echo "`date` INFO: ${APP}-Checking Config-sync Setup. Starting additional checks to confirm config-sync locally..."
# Additional check to confirm config-sync
echo "`date` INFO: ${APP}-Checking Config-sync Setup. Starting additional checks to confirm config-sync locally..."

messagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><config-sync><database/><detail/></config-sync></show></rpc>" \
-v "count(/rpc-reply/rpc/show/config-sync/database/local/tables/table)"`
messagevpn_total=`echo ${messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
messagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><config-sync><database/><detail/></config-sync></show></rpc>" \
-v "count(/rpc-reply/rpc/show/config-sync/database/local/tables/table)"`
messagevpn_total=`echo ${messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`

# Count message_vpns in-sync and compare with total
localmessagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><config-sync><database/></config-sync></show></rpc>" \
-v "count(//table[sync-state='In-Sync'])"`
local_messagevpn_total_insync=`echo ${localmessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
if [ "$messagevpn_total" -ne "$local_messagevpn_total_insync" ]; then
echo "`date` INFO: ${APP}-Config-sync state is not in-sync locally."
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
fi
# Count message_vpns in-sync and compare with total
localmessagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><config-sync><database/></config-sync></show></rpc>" \
-v "count(//table[sync-state='In-Sync'])"`
local_messagevpn_total_insync=`echo ${localmessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
if [ "$messagevpn_total" -ne "$local_messagevpn_total_insync" ]; then
echo "`date` INFO: ${APP}-Config-sync state is not in-sync locally."
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
fi

echo "`date` INFO: ${APP}-Checking Config-sync Setup. Remote config-sync state check starting..."
vpnremotehamate_result=$(get_router_remote_config_state "name")
echo "`date` INFO: ${APP}-Checking Config-sync Setup. Remote config-sync state check starting..."
vpnremotehamate_result=$(get_router_remote_config_state "name")

remote_messagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><config-sync><database/><remote/></config-sync></show></rpc>" \
-v "count(//table/source-router[name='$vpnremotehamate_result'])"`
remote_messagevpn_total=`echo ${remote_messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
remote_messagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><config-sync><database/><remote/></config-sync></show></rpc>" \
-v "count(//table/source-router[name='$vpnremotehamate_result'])"`
remote_messagevpn_total=`echo ${remote_messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`

#Count message_vpns in-sync, not stale and compare with total
remotemessagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><config-sync><database/><remote/></config-sync></show></rpc>" \
-v "count(//table/source-router[name='$vpnremotehamate_result' and sync-state='In-Sync' and stale='No'])"`
remote_messagevpn_total_insync=`echo ${remotemessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
if [ "$remote_messagevpn_total" -ne "$remote_messagevpn_total_insync" ]; then
echo "`date` INFO: ${APP}-Config-sync state is not in-sync for remote."
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
fi
#Count message_vpns in-sync, not stale and compare with total
remotemessagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
-q "<rpc><show><config-sync><database/><remote/></config-sync></show></rpc>" \
-v "count(//table/source-router[name='$vpnremotehamate_result' and sync-state='In-Sync' and stale='No'])"`
remote_messagevpn_total_insync=`echo ${remotemessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
if [ "$remote_messagevpn_total" -ne "$remote_messagevpn_total_insync" ]; then
echo "`date` INFO: ${APP}-Config-sync state is not in-sync for remote."
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
fi
fi
# Pass readiness check
if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then
echo "`date` INFO: ${APP}-Redundancy is up and node is mate Active"
echo "`date` INFO: ${APP}-Redundancy is up and node is Mate Active"
touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}
echo "`date` INFO: ${APP}-Server status check complete for this broker node"
exit 1
Expand Down

0 comments on commit 3f460d2

Please sign in to comment.