Skip to content

Commit

Permalink
fix: race condition when failing switch config
Browse files (browse the repository at this point in the history)
Fixed by letting the cloud be set as provisioned and rerunning the switch
config on validate_env.

closes: #414

Change-Id: I324254583c939815b0b6cee85ffe3ea3593e68e2
  • Loading branch information
grafuls committed Aug 23, 2022
1 parent 57dab39 commit 3166dd8
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 3 deletions.
2 changes: 0 additions & 2 deletions quads/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1504,14 +1504,12 @@ def action_movehosts(self):
self.logger.exception(
"There was something wrong configuring the switch."
)
provisioned = False

except Exception as exc:
self.logger.exception(
"There was something wrong configuring the switch.",
exc_info=exc,
)
provisioned = False

if done:
for future in done:
Expand Down
8 changes: 7 additions & 1 deletion quads/tools/move_and_rebuild_hosts.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ def switch_config(host, old_cloud, new_cloud):
"There was something wrong updating switch for %s:%s"
% (host, interface.name)
)
if ssh_helper:
ssh_helper.disconnect()
return False
else:
if int(old_vlan) != int(new_vlan):
Expand All @@ -99,6 +101,8 @@ def switch_config(host, old_cloud, new_cloud):
"There was something wrong updating switch for %s:%s"
% (host, interface.name)
)
if ssh_helper:
ssh_helper.disconnect()
return False

if ssh_helper:
Expand Down Expand Up @@ -200,7 +204,9 @@ async def move_and_rebuild(host, new_cloud, semaphore, rebuild=False, loop=None)

if is_supported(host):
try:
interfaces_path = os.path.join(os.path.dirname(__file__), "../../conf/idrac_interfaces.yml")
interfaces_path = os.path.join(
os.path.dirname(__file__), "../../conf/idrac_interfaces.yml"
)
await badfish.change_boot("director", interfaces_path)

# wait 10 minutes for the boot order job to complete
Expand Down
25 changes: 25 additions & 0 deletions quads/tools/validate_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from quads.tools.badfish import BadfishException, badfish_factory
from quads.tools.foreman import Foreman
from quads.tools.helpers import get_running_loop
from quads.tools.move_and_rebuild_hosts import switch_config
from quads.tools.netcat import Netcat
from quads.tools.postman import Postman
from quads.tools.ssh_helper import SSHHelper
Expand Down Expand Up @@ -188,7 +189,23 @@ async def post_system_test(self):
async def post_network_test(self):
test_host = self.hosts[0]
hosts_down = []
switch_config_missing = []
for host in self.hosts:
if not host.switch_config_applied:
current_schedule = Schedule.current_schedule(
host=host, cloud=host.cloud.name
).first()
previous_cloud = host.default_cloud.name
previous_schedule = Schedule.objects(
host=host.name, end=current_schedule.start
).first()
if previous_schedule:
previous_cloud = previous_schedule.cloud.name
result = switch_config(host, previous_cloud, host.cloud.name)
if result:
host.update(switch_config_applied=True)
else:
switch_config_missing.append(host.name)
try:
nc = Netcat(host.name)
healthy = await nc.health_check()
Expand All @@ -199,12 +216,20 @@ async def post_network_test(self):
if len(host.interfaces) > len(test_host.interfaces):
test_host = host

error = False
if hosts_down:
logger.error(
"The following hosts appear to be down or with no ssh connection:"
)
for i in hosts_down:
logger.error(i)
error = True
if switch_config_missing:
logger.error("The following hosts are missing switch configuration:")
for i in switch_config_missing:
logger.error(i)
error = True
if error:
return False

try:
Expand Down

0 comments on commit 3166dd8

Please sign in to comment.