[New testcase] Add chassis reboot for multiple linecard parallel reboot scenario (sonic-net#15094)

What is the motivation for this PR? In production we have seen a scenario where two linecards were upgrading their image and were rebooted automatically by the hwproxy flow at nearly the same time (time difference ~15 sec), and one of the linecards hit an OA crash.

How did you do it? Add a testcase covering the scenario of multiple linecards rebooting nearly in parallel, within 30 sec. Verify that no new core dump is generated during the test, and fail the testcase otherwise. Make sure devices and links are up. A condensed sketch of the approach follows.
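In outline, the test staggers cold reboots across the linecards by sleeping a random 0-30 sec after issuing each reboot, then diffs the /var/core listings taken before and after. A minimal sketch of that pattern, using hypothetical reboot() and list_core_files() helpers rather than the real sonic-mgmt API:

import random
import time


def staggered_reboot(linecards, max_stagger_sec=30):
    """Issue a cold reboot on each linecard with a random 0..max_stagger_sec
    pause between issues, so the reboots overlap rather than run serially.
    Returns the per-linecard set of core files seen before its reboot."""
    pre_dumps = {}
    for lc in linecards:
        pre_dumps[lc.name] = set(lc.list_core_files())  # hypothetical helper
        lc.reboot()                                     # hypothetical helper
        time.sleep(random.randint(0, max_stagger_sec))
    return pre_dumps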
1 parent a2b43aa · commit 850ca9c
Showing 1 changed file with 100 additions and 0 deletions.
@@ -0,0 +1,100 @@
""" | ||
This test file is created for T2 chassis specific reboot test, need to skip for all T0/T1 | ||
""" | ||
import pytest | ||
import random | ||
import logging | ||
import time | ||
from tests.common.helpers.assertions import pytest_assert | ||
from tests.common.utilities import wait_until | ||
from tests.common.reboot import wait_for_startup,\ | ||
sync_reboot_history_queue_with_dut,\ | ||
REBOOT_TYPE_HISTOYR_QUEUE | ||
from tests.platform_tests.test_reboot import check_interfaces_and_services | ||
|
||
|
||
pytestmark = [ | ||
pytest.mark.disable_loganalyzer, | ||
pytest.mark.topology('t2') | ||
] | ||
|
||
|
||
def chassis_cold_reboot(dut, localhost): | ||
logging.info( | ||
"Sync reboot cause history queue with T2 reboot cause history queue") | ||
sync_reboot_history_queue_with_dut(dut) | ||
|
||
logging.info("Run cold reboot on {}".format(dut)) | ||
dut.command("reboot") | ||
|
||
# Append the last reboot type to the queue | ||
logging.info("Append the latest reboot type to the queue") | ||
REBOOT_TYPE_HISTOYR_QUEUE.append("cold") | ||
|
||
|
||
def get_core_dump(duthost): | ||
""" | ||
This function get core dump on any of the linecards. | ||
Note that even we have core dump check pre/post testing, that check will not fail a test | ||
This check specifically fail the test if new core dump is found | ||
""" | ||
if "20191130" in duthost.os_version: | ||
return duthost.shell('ls /var/core/ | grep -v python || true')['stdout'].split() | ||
else: | ||
return duthost.shell('ls /var/core/')['stdout'].split() | ||
|
||
|
||
def test_parallel_reboot(duthosts, localhost, conn_graph_facts, xcvr_skip_list): | ||
""" | ||
@summary: This test case is to perform cold reboot on different linecards within 30 seconds, | ||
we consider it as parallel reboot. | ||
First, perform "parallel reboot" on all LCs, record initial dump files | ||
Then, make sure LCs are up and healthy | ||
Lastly, check if new core dumps are generated. | ||
We put the check in the end to make sure no core dump generated either | ||
during device down/up, or config initializing | ||
""" | ||
|
||
core_dumps = {} | ||
# Perform reboot on multiple LCs within 30sec | ||
for dut in duthosts: | ||
if dut.is_supervisor_node(): | ||
continue | ||
|
||
# collect core dump before reboot | ||
core_dumps[dut.hostname] = get_core_dump(dut) | ||
|
||
# Perform cold reboot on all linecards, with an internal within 30sec to mimic a parallel reboot scenario | ||
chassis_cold_reboot(dut, localhost) | ||
|
||
# Wait for 0 ~ 30sec | ||
rand_interval = random.randint(0, 30) | ||
time.sleep(rand_interval) | ||
|
||
# Make sure duts/critical/links/bgps are up | ||
for dut in duthosts: | ||
# 1. Make sure all LCs are up and links are up | ||
wait_for_startup(dut, localhost, delay=10, timeout=600) | ||
|
||
interfaces = conn_graph_facts.get("device_conn", {}).get(dut.hostname, {}) | ||
check_interfaces_and_services(dut, interfaces, xcvr_skip_list) | ||
|
||
# 2. Verify sessions are established | ||
config_facts = dut.config_facts(host=dut.hostname, source="running")['ansible_facts'] | ||
bgp_neighbors = config_facts.get('BGP_NEIGHBOR', {}) | ||
pytest_assert(wait_until(30, 5, 0, dut.check_bgp_session_state, list(bgp_neighbors.keys())), | ||
"Not all BGP sessions are established on DUT") | ||
|
||
# Check if new core dumps are generated | ||
for dut in duthosts: | ||
if dut.is_supervisor_node(): | ||
continue | ||
post_core_dump = get_core_dump(dut) | ||
new_core_dumps = (set(post_core_dump) - set(core_dumps[dut.hostname])) | ||
|
||
if new_core_dumps: | ||
pytest_assert(False, "New core dump found on {} during reboot! {}".format(dut.hostname, new_core_dumps)) | ||
else: | ||
logging.info("No new core dump found on {} during reboot".format(dut.hostname)) |