Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-4.15] USHIFT-3276: Retry virt-install command in case of installation error #3484

Merged
merged 2 commits into from
Jun 13, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 43 additions & 32 deletions test/bin/scenario.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ WEB_SERVER_URL="http://${VM_BRIDGE_IP}:${WEB_SERVER_PORT}"
PULL_SECRET="${PULL_SECRET:-${HOME}/.pull-secret.json}"
PULL_SECRET_CONTENT="$(jq -c . "${PULL_SECRET}")"
PUBLIC_IP=${PUBLIC_IP:-""} # may be overridden in global settings file
VM_BOOT_TIMEOUT=900
VM_BOOT_TIMEOUT=1200 # Overall total boot times are around 15m
VM_GREENBOOT_TIMEOUT=1800 # Greenboot readiness may take up to 15-30m depending on the load
ENABLE_REGISTRY_MIRROR=${ENABLE_REGISTRY_MIRROR:-false}
SKIP_SOS=${SKIP_SOS:-false} # may be overridden in global settings file
SKIP_GREENBOOT=${SKIP_GREENBOOT:-false} # may be overridden in scenario file
Expand Down Expand Up @@ -272,11 +273,11 @@ wait_for_greenboot() {
return 0
fi

echo "Waiting ${VM_BOOT_TIMEOUT} for greenboot on ${vmname} to complete"
echo "Waiting ${VM_GREENBOOT_TIMEOUT} for greenboot on ${vmname} to complete"

local -r start_time=$(date +%s)
local -r ssh_cmd="ssh -oConnectTimeout=10 -oBatchMode=yes -oStrictHostKeyChecking=accept-new redhat@${ip}"
while [ $(( $(date +%s) - start_time )) -lt "${VM_BOOT_TIMEOUT}" ] ; do
while [ $(( $(date +%s) - start_time )) -lt "${VM_GREENBOOT_TIMEOUT}" ] ; do
local svc_state
svc_state="$(${ssh_cmd} systemctl show --property=SubState --value greenboot-healthcheck || true)"
if [ "${svc_state}" = "exited" ] ; then
Expand Down Expand Up @@ -437,13 +438,12 @@ launch_vm() {
vm_extra_args+=" inst.ks=${kickstart_url}"
fi

# Implement retries on VM creation until the problem is fixed
# See https://github.com/virt-manager/virt-manager/issues/498
# Implement retries on VM creation, which can time out when pulling
# ostree commits or fail with any other installation error
local vm_created=false
for attempt in $(seq 5) ; do
local vm_create_start
vm_create_start=$(date +%s)

local attempt=1
local max_attempts=2
while true ; do
local graphics_args
graphics_args="none"
if "${VNC_CONSOLE}"; then
Expand All @@ -461,7 +461,7 @@ launch_vm() {
# If the TTY is not provided, virt-install refuses
# to attach to the console. `unbuffer` provides the TTY.
# shellcheck disable=SC2086
if ! ${timeout_install} unbuffer sudo virt-install \
if ${timeout_install} unbuffer sudo virt-install \
--autoconsole text \
--graphics "${graphics_args}" \
--name "${full_vmname}" \
Expand All @@ -476,21 +476,26 @@ launch_vm() {
${vm_initrd_inject} \
--wait ; then

# Check if the command exited within 15s due to a failure
local vm_create_end
vm_create_end=$(date +%s)
if [ $(( vm_create_end - vm_create_start )) -lt 15 ] ; then
local backoff=$(( attempt * 5 ))
echo "Error running virt-install on attempt ${attempt}: retrying in ${backoff}s"
sleep "${backoff}"
continue
fi
# Stop retrying on timeout error
# Stop retrying when VM is created successfully
vm_created=true
break
fi
# Stop retrying when VM is created successfully
vm_created=true
break

# Check if VM creation should be retried
((attempt++))
if [ ${attempt} -gt ${max_attempts} ] ; then
echo "Error running virt-install: giving up on attempt ${attempt}"
break
fi

# Retry the operation on error
local backoff=$(( attempt * 5 ))
echo "Error running virt-install: retrying in ${backoff}s on attempt ${attempt}"
sleep "${backoff}"

# Cleanup the failed VM before trying to recreate it
# Keep the storage pool for the subsequent VM creation
remove_vm "${vmname}" true
done

if ${vm_created} ; then
Expand Down Expand Up @@ -561,9 +566,10 @@ launch_vm() {
echo "${full_vmname} is up and ready"
}

# Clean up the resources for one VM.
# Clean up the resources for one VM, optionally skipping storage pool removal
remove_vm() {
local -r vmname="${1}"
local -r keep_pool="${2:-false}"
local -r full_vmname="$(full_vm_name "${vmname}")"

# Remove the actual VM
Expand All @@ -575,15 +581,20 @@ remove_vm() {
fi

# Remove the VM storage pool
local -r vm_pool_name="${VM_POOL_BASENAME}-${SCENARIO}"
if sudo virsh pool-info "${vm_pool_name}" &>/dev/null; then
sudo virsh pool-destroy "${vm_pool_name}"
sudo virsh pool-undefine "${vm_pool_name}"
fi
if ! ${keep_pool} ; then
local -r vm_pool_name="${VM_POOL_BASENAME}-${SCENARIO}"
if sudo virsh pool-info "${vm_pool_name}" &>/dev/null; then
sudo virsh pool-destroy "${vm_pool_name}"
sudo virsh pool-undefine "${vm_pool_name}"
fi

# Remove the pool directory
# ShellCheck: Using "${var:?}" to ensure this never expands to '/*'
rm -rf "${VM_DISK_BASEDIR:?}/${vm_pool_name}"
# Remove the pool directory
# ShellCheck: Using "${var:?}" to ensure this never expands to '/*'
rm -rf "${VM_DISK_BASEDIR:?}/${vm_pool_name}"
else
# Remove VM disk files
rm -f "${VM_DISK_BASEDIR}/${vm_pool_name}/*"
fi

# Remove the info file so something processing the VMs does not
# assume the file exists. This is most useful in a local setting.
Expand Down