From e4fe46f212382d4fca25bfeffe7506da182dc136 Mon Sep 17 00:00:00 2001 From: Ashwin Srinivasan <93744978+assrinivasan@users.noreply.github.com> Date: Tue, 1 Feb 2022 09:34:43 -0800 Subject: [PATCH] Fixed bug with reboot history queue that was causing false failures (#5056) Fixed bug with reboot history queue that was causing false failures Components touched: * common/reboot.py * platform_tests/test_reboot.py List of changes: * Added DUT/Internal queue sync function to common/reboot.py * Increased the wait_until delay argument (third positional) from 0 to 30s to account for the possibility of show_and_parse failures Signed-off-by: Ashwin Srinivasan <ashwin.srinivasan@microsoft.com> --- tests/common/reboot.py | 49 +++++++++++++++++++++++++++++ tests/platform_tests/test_reboot.py | 7 +++-- 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/tests/common/reboot.py b/tests/common/reboot.py index 3ada3b904eb..82a98a9008e 100644 --- a/tests/common/reboot.py +++ b/tests/common/reboot.py @@ -81,6 +81,9 @@ REBOOT_TYPE_HISTOYR_QUEUE = deque([], MAX_NUM_REBOOT_CAUSE_HISTORY) REBOOT_CAUSE_HISTORY_TITLE = ["name", "cause", "time", "user", "comment"] +# Retry logic config +MAX_RETRIES = 3 +RETRY_BACKOFF_TIME = 15 def get_warmboot_finalizer_state(duthost): try: @@ -236,6 +239,51 @@ def check_reboot_cause(dut, reboot_cause_expected): logging.debug("dut {} last reboot-cause {}".format(dut.hostname, reboot_cause_got)) return reboot_cause_got == reboot_cause_expected +def sync_reboot_history_queue_with_dut(dut): + """ + @summary: Sync DUT and internal history queues + @param dut: The AnsibleHost object of DUT. 
+ """ + + # Retry logic for increased robustness + dut_reboot_history_received = False + for retry_count in range(MAX_RETRIES): + try: + # Try and get the current reboot history from DUT + # If received, set flag and break out of for loop + + dut_reboot_history_queue = dut.show_and_parse("show reboot-cause history") + dut_reboot_history_received = True + break + except Exception as e: + e_type, e_value, e_traceback = sys.exc_info() + logging.info("Exception type: %s" % e_type.__name__) + logging.info("Exception message: %s" % e_value) + logging.info("Backing off for %d seconds before retrying", ((retry_count+1) * RETRY_BACKOFF_TIME)) + + time.sleep(((retry_count+1) * RETRY_BACKOFF_TIME)) + continue + + # If retry logic did not yield reboot cause history from DUT, + # return without clearing the existing reboot history queue. + if not dut_reboot_history_received: + return + + # Clear the current reboot history queue + REBOOT_TYPE_HISTOYR_QUEUE.clear() + + # For each item in the DUT reboot queue, + # iterate through every item in the reboot dict until + # a "cause" match is found. Then add that key to the + # reboot history queue REBOOT_TYPE_HISTOYR_QUEUE + # NB: appendleft used because queue received from DUT + # NB: is in reverse-chronological order. 
+ + for reboot_type in (dut_reboot_history_queue): + for dict_iter in (reboot_ctrl_dict): + if re.search(reboot_ctrl_dict[dict_iter]["cause"], reboot_type["cause"]): + REBOOT_TYPE_HISTOYR_QUEUE.appendleft(dict_iter) + break def check_reboot_cause_history(dut, reboot_type_history_queue): """ @@ -270,6 +318,7 @@ def check_reboot_cause_history(dut, reboot_type_history_queue): reboot_type_history_len = len(reboot_type_history_queue) if reboot_type_history_len <= len(reboot_cause_history_got): for index, reboot_type in enumerate(reboot_type_history_queue): + logging.info("index: %d, reboot cause: %s, reboot cause from DUT: %s" % (index, reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index-1]["cause"])) if not re.search(reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index-1]["cause"]): logging.error("The {} reboot-cause not match. expected_reboot type={}, actual_reboot_cause={}".format( index, reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index]["cause"])) diff --git a/tests/platform_tests/test_reboot.py b/tests/platform_tests/test_reboot.py index 810b7e2d8fc..ac3dc6aa7bd 100644 --- a/tests/platform_tests/test_reboot.py +++ b/tests/platform_tests/test_reboot.py @@ -55,8 +55,11 @@ def reboot_and_check(localhost, dut, interfaces, xcvr_skip_list, reboot_type=REB @param reboot_helper: The helper function used only by power off reboot @param reboot_kwargs: The argument used by reboot_helper """ - logging.info("Run %s reboot on DUT" % reboot_type) + + logging.info("Sync reboot cause history queue with DUT reboot cause history queue") + sync_reboot_history_queue_with_dut(dut) + logging.info("Run %s reboot on DUT" % reboot_type) reboot(dut, localhost, reboot_type=reboot_type, reboot_helper=reboot_helper, reboot_kwargs=reboot_kwargs) REBOOT_TYPE_HISTOYR_QUEUE.append(reboot_type) @@ -75,7 +78,7 @@ def check_interfaces_and_services(dut, interfaces, 
xcvr_skip_list, reboot_type = if reboot_type is not None: logging.info("Check reboot cause") - assert wait_until(MAX_WAIT_TIME_FOR_REBOOT_CAUSE, 20, 0, check_reboot_cause, dut, reboot_type), \ + assert wait_until(MAX_WAIT_TIME_FOR_REBOOT_CAUSE, 20, 30, check_reboot_cause, dut, reboot_type), \ "got reboot-cause failed after rebooted by %s" % reboot_type if "201811" in dut.os_version or "201911" in dut.os_version: