Skip to content

Commit

Permalink
check uptime
Browse files Browse the repository at this point in the history
  • Loading branch information
narrieta committed Jun 14, 2023
1 parent a395955 commit 74d3e32
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 33 deletions.
12 changes: 3 additions & 9 deletions tests_e2e/tests/fips/fips.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
#

import uuid
import time
from assertpy import fail
from typing import Any, Dict, List

Expand Down Expand Up @@ -51,18 +50,13 @@ def run(self):

log.info("Restarting test VM")
vm: VirtualMachineClient = VirtualMachineClient(self._context.vm)
vm.restart()
# Trying to connect via SSH to the VM immediately after restart can fail with "connection refused"; the
# retry logic in SshClient.run_command() takes care of the error, but a short sleep reduces the amount
# of warnings in the test log.
log.info("Sleeping for 2 minutes to allow the system to initialize before connecting via SSH")
time.sleep(2 * 60)
vm.restart(wait_for_boot=True, ssh_client=ssh_client)

try:
command = "fips-mode-setup --check"
log.info("Verifying that FIPS is enabled [%s]", command)
output = ssh_client.run_command(command)
if output != "FIPS mode is enabled.\n":
output = ssh_client.run_command(command).rstrip()
if output != "FIPS mode is enabled.":
fail(f"FIPS i not enabled - '{command}' returned '{output}'")
log.info(output)
except CommandError as e:
Expand Down
13 changes: 6 additions & 7 deletions tests_e2e/tests/lib/retry.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,20 +40,19 @@ def execute_with_retry(operation: Callable[[], Any]) -> Any:
time.sleep(30)


def retry_ssh_run(operation: Callable[[], Any], attempts: int = 3, attempt_delay: int = 30) -> Any:
    """
    Executes the given SSH operation, retrying it when it fails with a connection error.

    The operation is invoked up to 'attempts' times (always at least once), sleeping 'attempt_delay' seconds
    between consecutive attempts. The value returned by the first successful invocation is returned to the caller.

    * A CommandError with an exit code other than 255 is raised immediately: the remote command itself failed
      and retrying would not help (255 is the exit code ssh uses to report its own errors, e.g. connection failures).
    * Any other failure is retried; if the last attempt also fails, its exception is raised to the caller
      instead of being silently discarded.

    The defaults (3 attempts, 30 seconds) match the values that used to be hard-coded in this function.
    """
    # Guard against attempts <= 0, which would otherwise skip the operation altogether and return None
    attempts = max(attempts, 1)

    for i in range(1, attempts + 1):
        try:
            return operation()
        except Exception as e:
            # No attempts left: propagate the failure of the last attempt
            if i >= attempts:
                raise
            # We raise CommandError on !=0 exit codes in the called method; only 255 (an error in ssh itself) is retried
            if isinstance(e, CommandError) and e.exit_code != 255:  # pylint: disable=no-member
                raise
            # The arguments must follow the order of the placeholders: delay, current attempt, total attempts, error
            log.warning("The SSH operation failed, retrying in %s secs [Attempt %s/%s].\n%s", attempt_delay, i, attempts, e)
            time.sleep(attempt_delay)
26 changes: 16 additions & 10 deletions tests_e2e/tests/lib/ssh_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
from tests_e2e.tests.lib import shell
from tests_e2e.tests.lib.retry import retry_ssh_run

ATTEMPTS: int = 3
ATTEMPT_DELAY: int = 30


class SshClient(object):
def __init__(self, ip_address: str, username: str, private_key_file: Path, port: int = 22):
Expand All @@ -31,7 +34,7 @@ def __init__(self, ip_address: str, username: str, private_key_file: Path, port:
self._private_key_file: Path = private_key_file
self._port: int = port

def run_command(self, command: str, use_sudo: bool = False) -> str:
def run_command(self, command: str, use_sudo: bool = False, attempts: int = ATTEMPTS, attempt_delay: int = ATTEMPT_DELAY) -> str:
"""
Executes the given command over SSH and returns its stdout. If the command returns a non-zero exit code,
the function raises a CommandError.
Expand All @@ -44,9 +47,12 @@ def run_command(self, command: str, use_sudo: bool = False) -> str:
# Note that we add ~/bin to the remote PATH, since Python (Pypy) and other test tools are installed there.
# Note, too, that when using sudo we need to carry over the value of PATH to the sudo session
sudo = "sudo env PATH=$PATH PYTHONPATH=$PYTHONPATH" if use_sudo else ''
return retry_ssh_run(lambda: shell.run_command([
"ssh", "-o", "StrictHostKeyChecking=no", "-i", self._private_key_file, destination,
f"if [[ -e ~/bin/set-agent-env ]]; then source ~/bin/set-agent-env; fi; {sudo} {command}"]))
command = [
"ssh", "-o", "StrictHostKeyChecking=no", "-i", self._private_key_file,
destination,
f"if [[ -e ~/bin/set-agent-env ]]; then source ~/bin/set-agent-env; fi; {sudo} {command}"
]
return retry_ssh_run(lambda: shell.run_command(command), attempts, attempt_delay)

@staticmethod
def generate_ssh_key(private_key_file: Path):
Expand All @@ -59,19 +65,19 @@ def generate_ssh_key(private_key_file: Path):
def get_architecture(self) -> str:
    """
    Returns the output of 'uname -m' executed on the remote node over SSH, with trailing whitespace removed.
    """
    return self.run_command("uname -m").rstrip()

def copy_to_node(self, local_path: Path, remote_path: Path, recursive: bool = False) -> None:
def copy_to_node(self, local_path: Path, remote_path: Path, recursive: bool = False, attempts: int = ATTEMPTS, attempt_delay: int = ATTEMPT_DELAY) -> None:
"""
File copy to a remote node
"""
self._copy(local_path, remote_path, remote_source=False, remote_target=True, recursive=recursive)
self._copy(local_path, remote_path, remote_source=False, remote_target=True, recursive=recursive, attempts=attempts, attempt_delay=attempt_delay)

def copy_from_node(self, remote_path: Path, local_path: Path, recursive: bool = False) -> None:
def copy_from_node(self, remote_path: Path, local_path: Path, recursive: bool = False, attempts: int = ATTEMPTS, attempt_delay: int = ATTEMPT_DELAY) -> None:
"""
File copy from a remote node
"""
self._copy(remote_path, local_path, remote_source=True, remote_target=False, recursive=recursive)
self._copy(remote_path, local_path, remote_source=True, remote_target=False, recursive=recursive, attempts=attempts, attempt_delay=attempt_delay)

def _copy(self, source: Path, target: Path, remote_source: bool, remote_target: bool, recursive: bool) -> None:
def _copy(self, source: Path, target: Path, remote_source: bool, remote_target: bool, recursive: bool, attempts: int, attempt_delay: int) -> None:
if remote_source:
source = f"{self._username}@{self._ip_address}:{source}"
if remote_target:
Expand All @@ -82,4 +88,4 @@ def _copy(self, source: Path, target: Path, remote_source: bool, remote_target:
command.append("-r")
command.extend([str(source), str(target)])

shell.run_command(command)
return retry_ssh_run(lambda: shell.run_command(command), attempts, attempt_delay)
45 changes: 38 additions & 7 deletions tests_e2e/tests/lib/virtual_machine_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
from tests_e2e.tests.lib.identifiers import VmIdentifier
from tests_e2e.tests.lib.logging import log
from tests_e2e.tests.lib.retry import execute_with_retry
from tests_e2e.tests.lib.shell import CommandError
from tests_e2e.tests.lib.ssh_client import SshClient


class VirtualMachineClient(AzureClient):
Expand All @@ -61,7 +63,7 @@ def get_model(self) -> VirtualMachine:
"""
Retrieves the model of the virtual machine.
"""
log.info("Retrieving description for %s", self._identifier)
log.info("Retrieving VM model for %s", self._identifier)
return execute_with_retry(
lambda: self._compute_client.virtual_machines.get(
resource_group_name=self._identifier.resource_group,
Expand Down Expand Up @@ -106,10 +108,25 @@ def update(self, properties: Dict[str, Any], timeout: int = AzureClient._DEFAULT
operation_name=f"Update {self._identifier}",
timeout=timeout)

def restart(self, timeout: int = AzureClient._DEFAULT_TIMEOUT, wait_for_boot: bool = True, boot_timeout: datetime.timedelta = datetime.timedelta(minutes=5)) -> None:
def restart(
self,
wait_for_boot,
ssh_client: SshClient = None,
boot_timeout: datetime.timedelta = datetime.timedelta(minutes=5),
timeout: int = AzureClient._DEFAULT_TIMEOUT) -> None:
"""
Restarts the virtual machine or scale set
Restarts (reboots) the virtual machine.
NOTES:
* If wait_for_boot is True, an SshClient must be provided in order to verify that the restart was successful.
* 'timeout' is the timeout for the restart operation itself, while 'boot_timeout' is the timeout for waiting
for the boot to complete.
"""
if wait_for_boot and ssh_client is None:
raise ValueError("An SshClient must be provided if wait_for_boot is True")

before_restart = datetime.datetime.utcnow()

self._execute_async_operation(
lambda: self._compute_client.virtual_machines.begin_restart(
resource_group_name=self._identifier.resource_group,
Expand All @@ -120,8 +137,8 @@ def restart(self, timeout: int = AzureClient._DEFAULT_TIMEOUT, wait_for_boot: bo
if not wait_for_boot:
return

start = datetime.datetime.now()
while datetime.datetime.now() < start + boot_timeout:
start = datetime.datetime.utcnow()
while datetime.datetime.utcnow() < start + boot_timeout:
log.info("Waiting for VM %s to boot", self._identifier)
time.sleep(15) # Note that we always sleep at least 1 time, to give the reboot time to start
instance_view = self.get_instance_view()
Expand All @@ -130,8 +147,22 @@ def restart(self, timeout: int = AzureClient._DEFAULT_TIMEOUT, wait_for_boot: bo
raise Exception(f"Could not find PowerState in the instance view statuses:\n{json.dumps(instance_view.statuses)}")
log.info("VM's Power State: %s", power_state[0])
if power_state[0] == "PowerState/running":
log.info("VM %s completed boot and is running", self._identifier)
return
# We may get an instance view captured before the reboot actually happened; verify
# that the reboot actually happened by checking the system's uptime.
log.info("Verifying VM's uptime to ensure the reboot has completed...")
try:
uptime = ssh_client.run_command("cat /proc/uptime | sed 's/ .*//'", attempts=1).rstrip() # The uptime is the first field in the file
log.info("Uptime: %s", uptime)
boot_time = datetime.datetime.utcnow() - datetime.timedelta(seconds=float(uptime))
if boot_time > before_restart:
log.info("VM %s completed boot and is running. Boot time: %s", self._identifier, boot_time)
return
log.info("The VM has not rebooted yet. Restart time: %s. Boot time: %s", before_restart, boot_time)
except CommandError as e:
if e.exit_code == 255 and "Connection refused" in str(e):
log.info("VM %s is not yet accepting SSH connections", self._identifier)
else:
raise
raise Exception(f"VM {self._identifier} did not boot after {boot_timeout}")

def __str__(self):
Expand Down

0 comments on commit 74d3e32

Please sign in to comment.