From efb38dee865f9bf39179f07ee1a1e3abff876e2c Mon Sep 17 00:00:00 2001
From: Kavitha Ramalingam
Date: Wed, 6 Nov 2024 14:48:01 +0530
Subject: [PATCH] sonic-host-services changes for gNOI Cold Reboot

---
 host_modules/gnoi_reboot.py | 176 ++++++++++++++++++++++++++
 scripts/sonic-host-server   |   3 +-
 setup.py                    |   3 +-
 tests/gnoi_reboot_test.py   | 243 ++++++++++++++++++++++++++++++++++++
 utils/run_cmd.py            |  34 +++++
 5 files changed, 457 insertions(+), 2 deletions(-)
 create mode 100644 host_modules/gnoi_reboot.py
 create mode 100644 tests/gnoi_reboot_test.py
 create mode 100644 utils/run_cmd.py

diff --git a/host_modules/gnoi_reboot.py b/host_modules/gnoi_reboot.py
new file mode 100644
index 00000000..0eae52d8
--- /dev/null
+++ b/host_modules/gnoi_reboot.py
@@ -0,0 +1,176 @@
+"""gNOI reboot module which performs reboot"""
+
+import json
+import logging
+import threading
+import time
+from host_modules import host_service
+from utils.run_cmd import _run_command
+
+MOD_NAME = 'gnoi_reboot'
+# Reboot method in reboot request
+# Both enum and string representations are supported
+REBOOTMETHOD_COLD_BOOT_VALUES = {1, "COLD"}
+REBOOTMETHOD_WARM_BOOT_VALUES = {4, "WARM"}
+REBOOTMETHOD_NSF_VALUES = {5, "NSF"}
+
+# Timeout for SONiC Host Service to be killed during reboot
+REBOOT_TIMEOUT = 260
+
+EXECUTE_COLD_REBOOT_COMMAND = "sudo reboot"
+EXECUTE_NSF_REBOOT_COMMAND = "/etc/init.d/gpins-nsf-boot nsf-reboot"
+
+logger = logging.getLogger(__name__)
+
+
+class GnoiReboot(host_service.HostModule):
+    """DBus endpoint that executes the reboot and returns the reboot status
+    """
+
+    def __init__(self, mod_name):
+        """Create the status lock and the default (inactive) reboot status.
+
+        reboot_status_flag is read and written by the DBus handlers and by the
+        reboot worker thread, so every access to it is guarded by self.lock.
+        """
+        self.lock = threading.Lock()
+        # reboot_status_flag is used to keep track of reboot status on host
+        self.reboot_status_flag = {}
+        # Populating with default value i.e., no active reboot
+        self.populate_reboot_status_flag()
+        super(GnoiReboot, self).__init__(mod_name)
+
+    def populate_reboot_status_flag(self, active = False, when = 0, reason = ""):
+        """Populates the reboot_status_flag with given input params"""
+        # "with" releases the lock even if one of the assignments raises
+        with self.lock:
+            self.reboot_status_flag["active"] = active
+            self.reboot_status_flag["when"] = when
+            self.reboot_status_flag["reason"] = reason
+
+    def validate_reboot_request(self, reboot_request):
+        """Validates the decoded reboot request.
+
+        Returns (0, "") if the request can be executed and (1, error string) otherwise.
+        """
+        # json.loads can return any json value, only a json object is usable
+        if not isinstance(reboot_request, dict):
+            return 1, "Reboot request must be a json object"
+
+        # Check whether reboot method is present.
+        if "method" not in reboot_request:
+            return 1, "Reboot request must contain a reboot method"
+
+        # Check whether reboot method is valid.
+        rebootmethod = reboot_request["method"]
+        try:
+            valid_method = any(rebootmethod in values for values in
+                               (REBOOTMETHOD_COLD_BOOT_VALUES, REBOOTMETHOD_NSF_VALUES))
+        except TypeError:
+            # An unhashable method (json list or object) can never be valid
+            valid_method = False
+        if not valid_method:
+            return 1, "Invalid reboot method: " + str(rebootmethod)
+
+        # Check whether delay is non-zero. delay key will not exist in reboot_request if it is zero
+        if "delay" in reboot_request and reboot_request["delay"] != 0:
+            return 1, "Delayed reboot is not supported"
+        return 0, ""
+
+    def execute_reboot(self, rebootmethod):
+        """Execute reboot and reset reboot_status_flag when reboot fails"""
+
+        if rebootmethod in REBOOTMETHOD_COLD_BOOT_VALUES:
+            command = EXECUTE_COLD_REBOOT_COMMAND
+            try:
+                with open("/tmp/hostlog.txt", "w") as f:
+                    f.write("Received reboot command ! ")
+            except OSError as error:
+                # The marker file is only a debugging aid, never block the reboot on it
+                logger.warning("%s: Failed to write /tmp/hostlog.txt: %s", MOD_NAME, error)
+            logger.warning("%s: Issuing cold reboot", MOD_NAME)
+        elif rebootmethod in REBOOTMETHOD_NSF_VALUES:
+            command = EXECUTE_NSF_REBOOT_COMMAND
+            logger.warning("%s: Issuing NSF reboot", MOD_NAME)
+        else:
+            # %s and not %d, the method can be a string such as "WARM"
+            logger.error("%s: Invalid reboot method: %s", MOD_NAME, rebootmethod)
+            # No reboot was issued, do not leave the status stuck in active state
+            self.populate_reboot_status_flag()
+            return
+
+        rc, stdout, stderr = _run_command(command)
+        if rc:
+            self.populate_reboot_status_flag()
+            logger.error("%s: Reboot failed execution with stdout: %s, "
+                         "stderr: %s", MOD_NAME, stdout, stderr)
+            return
+
+        # Wait for REBOOT_TIMEOUT seconds for the reboot to complete. Here, we expect that SONiC
+        # Host Service will be killed during this waiting period if the reboot is successful. If
+        # this module is still alive after the below waiting period, we can conclude that the
+        # reboot has failed. Each container can take up to 20 seconds to get killed. In total,
+        # there are 10 containers, and adding a buffer of 1 minute brings up the delay value to
+        # be 260 seconds.
+        time.sleep(REBOOT_TIMEOUT)
+        # Conclude that the reboot has failed if we reach this point
+        self.populate_reboot_status_flag()
+        return
+
+    @host_service.method(host_service.bus_name(MOD_NAME), in_signature='as', out_signature='is')
+    def issue_reboot(self, options):
+        """Issues reboot after performing the following steps sequentially:
+          1. Checks that reboot_status_flag is not set
+          2. Validates the reboot request
+          3. Sets the reboot_status_flag
+          4. Issues the reboot in a separate thread
+
+        options[0] is the json formatted reboot request with the following keys:
+            method - specifies the method of reboot
+            delay - delay to issue reboot, key exists only if it is non-zero
+            message - reason for reboot
+            force - either true/false, key exists only if it is true
+        Returns (0, info string) on success and (1, error string) otherwise.
+        """
+        logger.warning("%s: issue_reboot rpc called", MOD_NAME)
+        with self.lock:
+            is_reboot_ongoing = self.reboot_status_flag["active"]
+        # Return without issuing the reboot if the previous reboot is ongoing
+        if is_reboot_ongoing:
+            return 1, "Previous reboot is ongoing"
+
+        # Convert input json formatted reboot request into python dict
+        try:
+            reboot_request = json.loads(options[0])
+        except (ValueError, IndexError, TypeError):
+            return 1, "Failed to parse json formatted reboot request into python dict"
+
+        # Validate reboot request
+        err, errstr = self.validate_reboot_request(reboot_request)
+        if err:
+            return err, errstr
+
+        # Sets reboot_status_flag to be in active state. A request may carry no message,
+        # so a missing reason must not fail the reboot with a KeyError.
+        self.populate_reboot_status_flag(True, int(time.time()), reboot_request.get("message", ""))
+
+        # Issue reboot in a new thread and reset the reboot_status_flag if the reboot fails
+        try:
+            t = threading.Thread(target=self.execute_reboot, args=(reboot_request["method"],))
+            t.start()
+        except RuntimeError as error:
+            # Nothing is running, clear the active state so that a retry is possible
+            self.populate_reboot_status_flag()
+            return 1, "Failed to start thread to execute reboot with error: " + str(error)
+        return 0, "Successfully issued reboot"
+
+    @host_service.method(host_service.bus_name(MOD_NAME), in_signature='', out_signature='is')
+    def get_reboot_status(self):
+        """Returns current reboot status on host in json format"""
+        with self.lock:
+            response_data = json.dumps(self.reboot_status_flag)
+        return 0, response_data
+
+def register():
+    """Return the class name"""
+    return GnoiReboot, MOD_NAME
diff --git a/scripts/sonic-host-server b/scripts/sonic-host-server
index 7028f28a..ee33d57d 100755
--- a/scripts/sonic-host-server
+++ b/scripts/sonic-host-server
@@ -12,7 +12,7 @@ import dbus.service
 import dbus.mainloop.glib
 from gi.repository import GObject
 
-from host_modules import config_engine, gcu, host_service, showtech, systemd_service, file_service
+from host_modules import config_engine, gcu, host_service, showtech, systemd_service, file_service, gnoi_reboot
 
 
 def register_dbus():
@@ -21,6 +21,7 @@ def register_dbus():
         'config': config_engine.Config('config'),
         'gcu': gcu.GCU('gcu'),
         'host_service': host_service.HostService('host_service'),
+        'gnoi_reboot': gnoi_reboot.GnoiReboot('gnoi_reboot'),
         'showtech': showtech.Showtech('showtech'),
         'systemd': systemd_service.SystemdService('systemd'),
         'file_stat': file_service.FileService('file')
diff --git a/setup.py b/setup.py
index 9cb0ff39..64031e7e 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,8 @@
     maintainer = 'Joe LeVeque',
     maintainer_email = 'jolevequ@microsoft.com',
     packages = [
-        'host_modules'
+        'host_modules',
+        'utils',
     ],
     scripts = [
         'scripts/caclmgrd',
diff --git a/tests/gnoi_reboot_test.py b/tests/gnoi_reboot_test.py
new file mode 100644
index 00000000..76ef0347
--- /dev/null
+++ b/tests/gnoi_reboot_test.py
@@ -0,0 +1,243 @@
+"""Tests for gnoi_reboot."""
+
+import imp
+import logging
+import sys
+import os
+import pytest
+import datetime
+
+if sys.version_info >= (3, 3):
+    from unittest import mock
+else:
+    # Expect the 'mock' package for python 2
+    # https://pypi.python.org/pypi/mock
+    import mock
+
+test_path = os.path.dirname(os.path.abspath(__file__))
+sonic_host_service_path = os.path.dirname(test_path)
+host_modules_path = os.path.join(sonic_host_service_path, "host_modules")
+sys.path.insert(0, sonic_host_service_path)
+
+TEST_ACTIVE_RESPONSE_DATA = "{\"active\": true, \"when\": 1617811205, \"reason\": \"testing reboot response\"}"
+TEST_INACTIVE_RESPONSE_DATA = "{\"active\": false, \"when\": 0, \"reason\": \"\"}"
+
+REBOOTMETHOD_UNKNOWN_ENUM = 0
+REBOOTMETHOD_COLD_BOOT_ENUM = 1
+REBOOTMETHOD_NSF_ENUM = 5
+
+TEST_TIMESTAMP = 1618942253.831912040
+REPORT_CRITICAL_STATE_FULL_COMMAND = "redis-cli -n 6 HSET COMPONENT_STATE_TABLE|host state ERROR reason \"cold reboot has failed\" essential true timestamp \"2021-04-20 18:10:53\" timestamp-seconds 1618942253 timestamp-nanoseconds 831912040"
+
+VALID_REBOOT_REQUEST_COLD = "{\"method\": 1, \"message\": \"test reboot request reason\"}"
+VALID_REBOOT_REQUEST_NSF = "{\"method\": \"NSF\", \"message\": \"test reboot request reason\"}"
+INVALID_REBOOT_REQUEST = "\"method\": 1, \"message\": \"test reboot request reason\""
+
+imp.load_source("host_service", host_modules_path + "/host_service.py")
+imp.load_source("gnoi_reboot", host_modules_path + "/gnoi_reboot.py")
+from gnoi_reboot import *
+
+
+class TestGnoiReboot(object):
+    @classmethod
+    def setup_class(cls):
+        with mock.patch("gnoi_reboot.super") as mock_host_module:
+            cls.gnoi_reboot_module = GnoiReboot(MOD_NAME)
+
+    def test_populate_reboot_status_flag(self):
+        with mock.patch("time.time", return_value=1617811205.25):
+            self.gnoi_reboot_module.populate_reboot_status_flag()
+            assert self.gnoi_reboot_module.reboot_status_flag["active"] == False
+            assert self.gnoi_reboot_module.reboot_status_flag["when"] == 0
+            assert self.gnoi_reboot_module.reboot_status_flag["reason"] == ""
+
+    def test_validate_reboot_request_success_cold_boot_enum_method(self):
+        reboot_request = {"method": REBOOTMETHOD_COLD_BOOT_ENUM, "reason": "test reboot request reason"}
+        result = self.gnoi_reboot_module.validate_reboot_request(reboot_request)
+        assert result[0] == 0
+        assert result[1] == ""
+
+    def test_validate_reboot_request_success_cold_boot_string_method(self):
+        reboot_request = {"method": "COLD", "reason": "test reboot request reason"}
+        result = self.gnoi_reboot_module.validate_reboot_request(reboot_request)
+        assert result[0] == 0
+        assert result[1] == ""
+
+    def test_validate_reboot_request_success_nsf_enum_method(self):
+        reboot_request = {"method": REBOOTMETHOD_NSF_ENUM, "reason": "test reboot request reason"}
+        result = self.gnoi_reboot_module.validate_reboot_request(reboot_request)
+        assert result[0] == 0
+        assert result[1] == ""
+
+    def test_validate_reboot_request_success_nsf_string_method(self):
+        reboot_request = {"method": "NSF", "reason": "test reboot request reason"}
+        result = self.gnoi_reboot_module.validate_reboot_request(reboot_request)
+        assert result[0] == 0
+        assert result[1] == ""
+
+    def test_validate_reboot_request_fail_unknown_method(self):
+        reboot_request = {"method": 0, "reason": "test reboot request reason"}
+        result = self.gnoi_reboot_module.validate_reboot_request(reboot_request)
+        assert result[0] == 1
+        assert result[1] == "Invalid reboot method: 0"
+
+    def test_validate_reboot_request_fail_no_method(self):
+        reboot_request = {"reason": "test reboot request reason"}
+        result = self.gnoi_reboot_module.validate_reboot_request(reboot_request)
+        assert result[0] == 1
+        assert result[1] == "Reboot request must contain a reboot method"
+
+    def test_validate_reboot_request_fail_not_a_dict(self):
+        result = self.gnoi_reboot_module.validate_reboot_request(["COLD"])
+        assert result[0] == 1
+        assert result[1] == "Reboot request must be a json object"
+
+    def test_validate_reboot_request_fail_delayed_reboot(self):
+        reboot_request = {"method": REBOOTMETHOD_COLD_BOOT_ENUM, "delay": 10, "reason": "test reboot request reason"}
+        result = self.gnoi_reboot_module.validate_reboot_request(reboot_request)
+        assert result[0] == 1
+        assert result[1] == "Delayed reboot is not supported"
+
+    def test_execute_reboot_success(self):
+        with (
+            mock.patch("gnoi_reboot._run_command") as mock_run_command,
+            mock.patch("time.sleep") as mock_sleep,
+            mock.patch("gnoi_reboot.GnoiReboot.populate_reboot_status_flag") as mock_populate_reboot_status_flag,
+        ):
+            mock_run_command.return_value = (0, ["stdout: execute NSF reboot"], ["stderror: execute NSF reboot"])
+            self.gnoi_reboot_module.execute_reboot("NSF")
+            mock_run_command.assert_called_once_with("/etc/init.d/gpins-nsf-boot nsf-reboot")
+            mock_sleep.assert_called_once_with(260)
+            mock_populate_reboot_status_flag.assert_called_once_with()
+
+    def test_execute_reboot_fail_unknown_reboot(self, caplog):
+        with caplog.at_level(logging.ERROR):
+            self.gnoi_reboot_module.execute_reboot(-1)
+            msg = "gnoi_reboot: Invalid reboot method: -1"
+            assert caplog.records[0].message == msg
+
+    def test_execute_reboot_fail_unknown_string_reboot(self, caplog):
+        with caplog.at_level(logging.ERROR):
+            self.gnoi_reboot_module.execute_reboot("WARM")
+            msg = "gnoi_reboot: Invalid reboot method: WARM"
+            assert caplog.records[0].message == msg
+
+    def test_execute_reboot_fail_issue_reboot_command_cold_boot(self, caplog):
+        with (
+            mock.patch("gnoi_reboot._run_command") as mock_run_command,
+            mock.patch("gnoi_reboot.GnoiReboot.populate_reboot_status_flag") as mock_populate_reboot_status_flag,
+            caplog.at_level(logging.ERROR),
+        ):
+            mock_run_command.return_value = (1, ["stdout: execute cold reboot"], ["stderror: execute cold reboot"])
+            self.gnoi_reboot_module.execute_reboot(REBOOTMETHOD_COLD_BOOT_ENUM)
+            msg = ("gnoi_reboot: Reboot failed execution with "
+                   "stdout: ['stdout: execute cold reboot'], stderr: "
+                   "['stderror: execute cold reboot']")
+            assert caplog.records[0].message == msg
+            mock_populate_reboot_status_flag.assert_called_once_with()
+
+    def test_execute_reboot_fail_issue_reboot_command_nsf(self, caplog):
+        with (
+            mock.patch("gnoi_reboot._run_command") as mock_run_command,
+            mock.patch("gnoi_reboot.GnoiReboot.populate_reboot_status_flag") as mock_populate_reboot_status_flag,
+            caplog.at_level(logging.ERROR),
+        ):
+            mock_run_command.return_value = (1, ["stdout: execute NSF reboot"], ["stderror: execute NSF reboot"])
+            self.gnoi_reboot_module.execute_reboot("NSF")
+            msg = ("gnoi_reboot: Reboot failed execution with "
+                   "stdout: ['stdout: execute NSF reboot'], stderr: "
+                   "['stderror: execute NSF reboot']")
+            assert caplog.records[0].message == msg
+            mock_populate_reboot_status_flag.assert_called_once_with()
+
+    def test_issue_reboot_success_cold_boot(self):
+        with (
+            mock.patch("threading.Thread") as mock_thread,
+            mock.patch("gnoi_reboot.GnoiReboot.validate_reboot_request", return_value=(0, "")),
+        ):
+            self.gnoi_reboot_module.populate_reboot_status_flag()
+            result = self.gnoi_reboot_module.issue_reboot([VALID_REBOOT_REQUEST_COLD])
+            assert result[0] == 0
+            assert result[1] == "Successfully issued reboot"
+            mock_thread.assert_called_once_with(
+                target=self.gnoi_reboot_module.execute_reboot,
+                args=(REBOOTMETHOD_COLD_BOOT_ENUM,),
+            )
+            mock_thread.return_value.start.assert_called_once_with()
+
+    def test_issue_reboot_success_nsf(self):
+        with (
+            mock.patch("threading.Thread") as mock_thread,
+            mock.patch("gnoi_reboot.GnoiReboot.validate_reboot_request", return_value=(0, "")),
+        ):
+            self.gnoi_reboot_module.populate_reboot_status_flag()
+            result = self.gnoi_reboot_module.issue_reboot([VALID_REBOOT_REQUEST_NSF])
+            assert result[0] == 0
+            assert result[1] == "Successfully issued reboot"
+            mock_thread.assert_called_once_with(
+                target=self.gnoi_reboot_module.execute_reboot,
+                args=("NSF",),
+            )
+            mock_thread.return_value.start.assert_called_once_with()
+
+    def test_issue_reboot_success_without_message(self):
+        with mock.patch("threading.Thread") as mock_thread:
+            self.gnoi_reboot_module.populate_reboot_status_flag()
+            result = self.gnoi_reboot_module.issue_reboot(["{\"method\": 1}"])
+            assert result[0] == 0
+            assert result[1] == "Successfully issued reboot"
+            assert self.gnoi_reboot_module.reboot_status_flag["reason"] == ""
+            mock_thread.return_value.start.assert_called_once_with()
+
+    def test_issue_reboot_previous_reboot_ongoing(self):
+        self.gnoi_reboot_module.populate_reboot_status_flag()
+        self.gnoi_reboot_module.reboot_status_flag["active"] = True
+        result = self.gnoi_reboot_module.issue_reboot([VALID_REBOOT_REQUEST_COLD])
+        assert result[0] == 1
+        assert result[1] == "Previous reboot is ongoing"
+
+    def test_issue_reboot_bad_format_reboot_request(self):
+        self.gnoi_reboot_module.populate_reboot_status_flag()
+        result = self.gnoi_reboot_module.issue_reboot([INVALID_REBOOT_REQUEST])
+        assert result[0] == 1
+        assert result[1] == "Failed to parse json formatted reboot request into python dict"
+
+    def test_issue_reboot_invalid_reboot_request(self):
+        with mock.patch("gnoi_reboot.GnoiReboot.validate_reboot_request", return_value=(1, "failed to validate reboot request")):
+            self.gnoi_reboot_module.populate_reboot_status_flag()
+            result = self.gnoi_reboot_module.issue_reboot([VALID_REBOOT_REQUEST_COLD])
+            assert result[0] == 1
+            assert result[1] == "failed to validate reboot request"
+
+    def raise_runtime_exception_test(self):
+        raise RuntimeError('test raise RuntimeError exception')
+
+    def test_issue_reboot_fail_issue_reboot_thread(self):
+        with mock.patch("threading.Thread") as mock_thread:
+            mock_thread.return_value.start = self.raise_runtime_exception_test
+            self.gnoi_reboot_module.populate_reboot_status_flag()
+            result = self.gnoi_reboot_module.issue_reboot([VALID_REBOOT_REQUEST_COLD])
+            assert result[0] == 1
+            assert result[1] == "Failed to start thread to execute reboot with error: test raise RuntimeError exception"
+            assert self.gnoi_reboot_module.reboot_status_flag["active"] == False
+
+    def test_get_reboot_status_active(self):
+        self.gnoi_reboot_module.populate_reboot_status_flag(True, 1617811205, "testing reboot response")
+        result = self.gnoi_reboot_module.get_reboot_status()
+        assert result[0] == 0
+        assert result[1] == TEST_ACTIVE_RESPONSE_DATA
+
+    def test_get_reboot_status_inactive(self):
+        self.gnoi_reboot_module.populate_reboot_status_flag(False, 0, "")
+        result = self.gnoi_reboot_module.get_reboot_status()
+        assert result[0] == 0
+        assert result[1] == TEST_INACTIVE_RESPONSE_DATA
+
+    def test_register(self):
+        result = register()
+        assert result[0] == GnoiReboot
+        assert result[1] == MOD_NAME
+
+    @classmethod
+    def teardown_class(cls):
+        print("TEARDOWN")
diff --git a/utils/run_cmd.py b/utils/run_cmd.py
new file mode 100644
index 00000000..d19f9b52
--- /dev/null
+++ b/utils/run_cmd.py
@@ -0,0 +1,34 @@
+import logging
+import shlex
+import subprocess
+
+logger = logging.getLogger(__name__)
+
+
+def _run_command(cmd):
+    '''!
+    Execute a given command
+
+    @param cmd (str or list) Command to execute. The command is executed directly, and not
+                      within the context of the shell, so shell syntax is not interpreted.
+                      A string is split into parameters with shlex.split, a list is used
+                      as the argument vector as is.
+    @return Tuple (return code, list of stdout lines, list of stderr lines). An empty command
+            returns (0, None, None), a command that cannot be started returns (1, None, None).
+    '''
+    try:
+        if not cmd:
+            return (0, None, None)
+        shcmd = shlex.split(cmd) if isinstance(cmd, str) else list(cmd)
+        # No bufsize=1: line buffering is not supported for binary pipes
+        proc = subprocess.Popen(shcmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
+        output_stdout, output_stderr = proc.communicate()
+        # errors="replace": undecodable output must not be reported as a command failure
+        list_stdout = [line.decode(errors="replace") for line in output_stdout.splitlines()]
+        list_stderr = [line.decode(errors="replace") for line in output_stderr.splitlines()]
+        return (proc.returncode, list_stdout, list_stderr)
+    except (OSError, ValueError) as e:
+        logger.error(
+            "!Exception [%s] encountered while processing the command : %s",
+            str(e), str(cmd))
+        return (1, None, None)