From 3166dd8c14cd0647b6776b30c02f2e09b247754f Mon Sep 17 00:00:00 2001 From: Gonza Rafuls Date: Tue, 23 Aug 2022 11:49:11 +0200 Subject: [PATCH] fix: race condition when failing switch config letting the cloud be set as provisioned and rerunning switch config on validate_env. closes: https://github.com/redhat-performance/quads/issues/414 Change-Id: I324254583c939815b0b6cee85ffe3ea3593e68e2 --- quads/cli/cli.py | 2 -- quads/tools/move_and_rebuild_hosts.py | 8 +++++++- quads/tools/validate_env.py | 25 +++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/quads/cli/cli.py b/quads/cli/cli.py index d66d07a0f..d686d4319 100644 --- a/quads/cli/cli.py +++ b/quads/cli/cli.py @@ -1504,14 +1504,12 @@ def action_movehosts(self): self.logger.exception( "There was something wrong configuring the switch." ) - provisioned = False except Exception as exc: self.logger.exception( "There was something wrong configuring the switch.", exc_info=exc, ) - provisioned = False if done: for future in done: diff --git a/quads/tools/move_and_rebuild_hosts.py b/quads/tools/move_and_rebuild_hosts.py index 1674378bb..e2162132b 100755 --- a/quads/tools/move_and_rebuild_hosts.py +++ b/quads/tools/move_and_rebuild_hosts.py @@ -84,6 +84,8 @@ def switch_config(host, old_cloud, new_cloud): "There was something wrong updating switch for %s:%s" % (host, interface.name) ) + if ssh_helper: + ssh_helper.disconnect() return False else: if int(old_vlan) != int(new_vlan): @@ -99,6 +101,8 @@ def switch_config(host, old_cloud, new_cloud): "There was something wrong updating switch for %s:%s" % (host, interface.name) ) + if ssh_helper: + ssh_helper.disconnect() return False if ssh_helper: @@ -200,7 +204,9 @@ async def move_and_rebuild(host, new_cloud, semaphore, rebuild=False, loop=None) if is_supported(host): try: - interfaces_path = os.path.join(os.path.dirname(__file__), "../../conf/idrac_interfaces.yml") + interfaces_path = os.path.join( + os.path.dirname(__file__), "../../conf/idrac_interfaces.yml" + ) await badfish.change_boot("director", interfaces_path) # wait 10 minutes for the boot order job to complete diff --git a/quads/tools/validate_env.py b/quads/tools/validate_env.py index 79c26dd19..c79a80c93 100755 --- a/quads/tools/validate_env.py +++ b/quads/tools/validate_env.py @@ -17,6 +17,7 @@ from quads.tools.badfish import BadfishException, badfish_factory from quads.tools.foreman import Foreman from quads.tools.helpers import get_running_loop +from quads.tools.move_and_rebuild_hosts import switch_config from quads.tools.netcat import Netcat from quads.tools.postman import Postman from quads.tools.ssh_helper import SSHHelper @@ -188,7 +189,23 @@ async def post_system_test(self): async def post_network_test(self): test_host = self.hosts[0] hosts_down = [] + switch_config_missing = [] for host in self.hosts: + if not host.switch_config_applied: + current_schedule = Schedule.current_schedule( + host=host, cloud=host.cloud.name + ).first() + previous_cloud = host.default_cloud.name + previous_schedule = Schedule.objects( + host=host.name, end=current_schedule.start + ).first() + if previous_schedule: + previous_cloud = previous_schedule.cloud.name + result = switch_config(host, previous_cloud, host.cloud.name) + if result: + host.update(switch_config_applied=True) + else: + switch_config_missing.append(host.name) try: nc = Netcat(host.name) healthy = await nc.health_check() @@ -199,12 +216,20 @@ async def post_network_test(self): if len(host.interfaces) > len(test_host.interfaces): test_host = host + error = False if hosts_down: logger.error( "The following hosts appear to be down or with no ssh connection:" ) for i in hosts_down: logger.error(i) + error = True + if switch_config_missing: + logger.error("The following hosts are missing switch configuration:") + for i in switch_config_missing: + logger.error(i) + error = True + if error: return False try: