diff --git a/files/build_templates/docker_image_ctl.j2 b/files/build_templates/docker_image_ctl.j2
index f1757f69ba42..6d8fb59a7d65 100644
--- a/files/build_templates/docker_image_ctl.j2
+++ b/files/build_templates/docker_image_ctl.j2
@@ -109,6 +109,37 @@ function preStartAction()
     fi
 {%- elif docker_container_name == "snmp" %}
     $SONIC_DB_CLI STATE_DB HSET 'DEVICE_METADATA|localhost' chassis_serial_number $(decode-syseeprom -s)
+
+    # Wait for system ready. Continue after the timeout if we don't hear
+    # anything from the DB. Read the timeout from the config file and add one
+    # extra minute on top of it.
+    CONFFILE=system_health_monitoring_config.json
+    PLATFORM=${PLATFORM:-`sonic-cfggen -d -v DEVICE_METADATA.localhost.platform`}
+    TIMEOUT=$(cat /usr/share/sonic/device/$PLATFORM/$CONFFILE | jq ".timeout")
+    SYSTEM_READY_EVENT="SYSTEM_READY|SYSTEM_STATE"
+    CHECK_SR_EVENT="sonic-db-dump -n STATE_DB -k $SYSTEM_READY_EVENT"
+
+    # Set default if not found
+    if [[ -z $TIMEOUT ]]; then
+        MESSAGE="Failed to read timeout from config file, assuming default is \
+                 10 minutes"
+        echo $MESSAGE
+        TIMEOUT=10
+    fi
+
+    # Add one extra minute and convert to seconds
+    TIMEOUT=$(((TIMEOUT + 1) * 60))
+
+    # Waiting for event
+    echo "Waiting for $SYSTEM_READY_EVENT event"
+    while [[ -z "$($CHECK_SR_EVENT | jq '.[]["value"]["Status"]')" ]]; do
+        UPTIME=$(awk -F. '{print $1}' /proc/uptime)
+        if [[ $UPTIME -gt $TIMEOUT ]]; then
+            echo "Got uptime timeout - starting snmp..."
+            break
+        fi
+        sleep 15
+    done
 {%- else %}
     : # nothing
 {%- endif %}
diff --git a/files/build_templates/init_cfg.json.j2 b/files/build_templates/init_cfg.json.j2
index f8083f040541..e2feda5b1cfc 100644
--- a/files/build_templates/init_cfg.json.j2
+++ b/files/build_templates/init_cfg.json.j2
@@ -82,11 +82,17 @@
             "auto_restart": "{{autorestart}}",
             "support_syslog_rate_limit" : "true",
 {# Set check_up_status to true here when app readiness will be marked in state db #}
+{%- if feature in ["swss", "syncd", "pmon"] %}
+            "check_up_status" : "true",
+{%- endif %}
 {# For now, to support the infrastrucure, setting the check_up_status to false for bgp,swss,pmon #}
 {# Once apps like bgp,synd supports app readiness, then bgp,syncd can set check_up_status to true #}
-{%- if feature in ["bgp", "swss", "pmon"] %}
+{%- if feature in ["bgp"] %}
             "check_up_status" : "false",
 {%- endif %}
+{%- if feature in ["snmp"] %}
+            "irrel_for_sysready" : "true",
+{%- endif %}
 {%- if include_kubernetes == "y" %}
 {%- if feature in ["lldp", "pmon", "radv", "eventd", "snmp", "telemetry", "gnmi"] %}
             "set_owner": "kube",{% else %}
diff --git a/files/scripts/syncd.sh b/files/scripts/syncd.sh
index 562af4fd2c3f..6213629b9f0d 100755
--- a/files/scripts/syncd.sh
+++ b/files/scripts/syncd.sh
@@ -27,6 +27,8 @@ function startplatform() {
         /usr/bin/mlnx-fw-upgrade.sh -c -v
         if [[ "$?" -ne "${EXIT_SUCCESS}" ]]; then
             debug "Failed to upgrade fw. " "$?" "Restart syncd"
+            sonic-db-cli STATE_DB HSET "FEATURE|$DEV_SRV" fail_reason \
+                "ASIC FW update failed" up_status false
             exit 1
         fi
         /etc/init.d/sxdkernel restart
@@ -77,6 +79,9 @@ function startplatform() {
             exit 1
         fi
     fi
+
+    sonic-db-cli STATE_DB HDEL "FEATURE|$DEV_SRV" fail_reason
+    sonic-db-cli STATE_DB HSET "FEATURE|$DEV_SRV" up_status true
 }
 
 function waitplatform() {
@@ -159,9 +164,11 @@ LOCKFILE="/tmp/swss-syncd-lock$DEV"
 NAMESPACE_PREFIX="asic"
 if [ "$DEV" ]; then
     NET_NS="$NAMESPACE_PREFIX$DEV" #name of the network namespace
+    DEV_SRV="$SERVICE@$DEV"
     SONIC_DB_CLI="sonic-db-cli -n $NET_NS"
 else
     NET_NS=""
+    DEV_SRV="$SERVICE"
     SONIC_DB_CLI="sonic-db-cli"
 fi
 
diff --git a/src/sonic-yang-models/doc/Configuration.md b/src/sonic-yang-models/doc/Configuration.md
index 4ebae55c3ea2..3afe69deb445 100644
--- a/src/sonic-yang-models/doc/Configuration.md
+++ b/src/sonic-yang-models/doc/Configuration.md
@@ -989,34 +989,34 @@ deployment envionment id and deployment type are specified. BGP local AS number
 is also specified in this table as current only single BGP instance is
 supported in SONiC.
 
-```
+```json
 {
-"DEVICE_METADATA": {
+    "DEVICE_METADATA": {
     "localhost": {
-    "asic_id": "06:00.0",
-    "asic_name": "asic0",
-    "hwsku": "Force10-S6100",
-    "default_bgp_status": "up",
-    "docker_routing_config_mode": "unified",
-    "hostname": "sonic-s6100-01",
-    "platform": "x86_64-dell_s6100_c2538-r0",
-    "mac": "4c:76:25:f4:70:82",
-    "default_pfcwd_status": "disable",
-    "bgp_asn": "65100",
-    "deployment_id": "1",
-    "type": "ToRRouter",
-    "bgp_adv_lo_prefix_as_128" : "true",
-    "buffer_model": "traditional",
-    "yang_config_validation": "disable",
-    "rack_mgmt_map": "dummy_value",
-    "timezome": "Europe/Kiev",
-    "bgp_router_id": "8.8.8.8"
+            "asic_id": "06:00.0",
+            "asic_name": "asic0",
+            "hwsku": "Force10-S6100",
+            "default_bgp_status": "up",
+            "docker_routing_config_mode": "unified",
+            "hostname": "sonic-s6100-01",
+            "platform": "x86_64-dell_s6100_c2538-r0",
+            "mac": "4c:76:25:f4:70:82",
+            "default_pfcwd_status": "disable",
+            "bgp_asn": "65100",
+            "deployment_id": "1",
+            "type": "ToRRouter",
+            "bgp_adv_lo_prefix_as_128" : "true",
+            "buffer_model": "traditional",
+            "yang_config_validation": "disable",
+            "rack_mgmt_map": "dummy_value",
+            "timezome": "Europe/Kiev",
+            "bgp_router_id": "8.8.8.8",
+            "sysready_state": "enabled"
+        }
     }
-  }
 }
-
 ```
-
+* `sysready_state` - System-ready feature configuration: `{enabled,disabled}`
 
 ### Device neighbor metada
diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests/device_metadata.json b/src/sonic-yang-models/tests/yang_model_tests/tests/device_metadata.json
index 99310fecc6ab..fcf60e9ae2dc 100644
--- a/src/sonic-yang-models/tests/yang_model_tests/tests/device_metadata.json
+++ b/src/sonic-yang-models/tests/yang_model_tests/tests/device_metadata.json
@@ -190,5 +190,12 @@
     },
     "DEVICE_METADATA_VALID_NEXTHOP_GROUP": {
         "desc": "Verifying nexthop_group configuration."
-    }
+    },
+    "DEVICE_METADATA_VALID_SYSREADY": {
+        "desc": "Verifying valid system-ready state"
+    },
+    "DEVICE_METADATA_INVALID_SYSREADY": {
+        "desc": "Verifying invalid system-ready state",
+        "eStrKey": "InvalidValue"
+    }
 }
diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests/feature.json b/src/sonic-yang-models/tests/yang_model_tests/tests/feature.json
index 52dce58a2cf6..2e8400a2319f 100644
--- a/src/sonic-yang-models/tests/yang_model_tests/tests/feature.json
+++ b/src/sonic-yang-models/tests/yang_model_tests/tests/feature.json
@@ -14,5 +14,10 @@
     },
     "FEATURE_WITH_NO_OWNER" : {
         "desc": "Config feature table without set_owner"
+    },
+    "FEATURE_INVALID_IRRELEVANT_FOR_SYSTEM_READY" : {
+        "desc": "Invalid value for 'irrelevant for system-ready' parameter",
+        "eStrKey": "Pattern",
+        "eStr": ["false|true|False|True"]
     }
 }
diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests_config/device_metadata.json b/src/sonic-yang-models/tests/yang_model_tests/tests_config/device_metadata.json
index 44f0966c8b18..bea45e818cc0 100644
--- a/src/sonic-yang-models/tests/yang_model_tests/tests_config/device_metadata.json
+++ b/src/sonic-yang-models/tests/yang_model_tests/tests_config/device_metadata.json
@@ -526,5 +526,23 @@
                 }
             }
         }
-    }
+    },
+    "DEVICE_METADATA_VALID_SYSREADY": {
+        "sonic-device_metadata:sonic-device_metadata": {
+            "sonic-device_metadata:DEVICE_METADATA": {
+                "sonic-device_metadata:localhost": {
+                    "sysready_state": "disabled"
+                }
+            }
+        }
+    },
+    "DEVICE_METADATA_INVALID_SYSREADY": {
+        "sonic-device_metadata:sonic-device_metadata": {
+            "sonic-device_metadata:DEVICE_METADATA": {
+                "sonic-device_metadata:localhost": {
+                    "sysready_state": "enable"
+                }
+            }
+        }
+    }
 }
diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests_config/feature.json b/src/sonic-yang-models/tests/yang_model_tests/tests_config/feature.json
index 12e1e07173c9..a50f071b3d0d 100644
--- a/src/sonic-yang-models/tests/yang_model_tests/tests_config/feature.json
+++ b/src/sonic-yang-models/tests/yang_model_tests/tests_config/feature.json
@@ -57,6 +57,7 @@
                         "has_per_asic_scope": "{% if not DEVICE_RUNTIME_METADATA['ETHERNET_PORTS_PRESENT'] or ('CHASSIS_METADATA' in DEVICE_RUNTIME_METADATA and DEVICE_RUNTIME_METADATA['CHASSIS_METADATA']['module_type'] in ['supervisor']) %}False{% else %}True{% endif %}",
                         "set_owner": "kube",
                         "check_up_status": "false",
+                        "irrel_for_sysready": "true",
                         "support_syslog_rate_limit": "true"
                     },
                     {
@@ -68,6 +69,7 @@
                         "has_per_asic_scope": "true",
                         "set_owner": "kube",
                         "check_up_status": "false",
+                        "irrel_for_sysready": "false",
                         "support_syslog_rate_limit": "true"
                     }
                 ]
@@ -129,5 +131,17 @@
                 ]
             }
         }
+    },
+    "FEATURE_INVALID_IRRELEVANT_FOR_SYSTEM_READY": {
+        "sonic-feature:sonic-feature": {
+            "sonic-feature:FEATURE": {
+                "FEATURE_LIST": [
+                    {
+                        "name": "swss",
+                        "irrel_for_sysready": "tr"
+                    }
+                ]
+            }
+        }
     }
 }
diff --git a/src/sonic-yang-models/yang-models/sonic-device_metadata.yang b/src/sonic-yang-models/yang-models/sonic-device_metadata.yang
index 0d0607fd3cab..68bb1d602647 100644
--- a/src/sonic-yang-models/yang-models/sonic-device_metadata.yang
+++ b/src/sonic-yang-models/yang-models/sonic-device_metadata.yang
@@ -19,6 +19,10 @@
 
     description "DEVICE_METADATA YANG Module for SONiC OS";
 
+    revision 2023-05-10 {
+        description "Add system-ready feature configuration";
+    }
+
     revision 2021-02-27 {
         description "Added frr_mgmt_framework_config field to handle BGP config
             DB schema events to configure FRR protocols.";
@@ -274,6 +278,12 @@
                 default disabled;
             }
 
+            leaf sysready_state {
+                description "System-ready feature state";
+                type stypes:admin_mode;
+                default enabled;
+            }
+
         }
         /* end of container localhost */
     }
diff --git a/src/sonic-yang-models/yang-models/sonic-feature.yang b/src/sonic-yang-models/yang-models/sonic-feature.yang
index 6939f0044920..0d2026f12dbc 100644
--- a/src/sonic-yang-models/yang-models/sonic-feature.yang
+++ b/src/sonic-yang-models/yang-models/sonic-feature.yang
@@ -11,6 +11,10 @@
 
     description "Feature Table yang Module for SONiC";
 
+    revision 2023-05-10 {
+        description "Add system-ready feature configuration";
+    }
+
     typedef feature-state {
         description "configuration to set the feature running state";
         type string;
@@ -117,6 +121,12 @@ module sonic-feature{
                 type stypes:boolean_type;
                 default "false";
             }
+
+            leaf irrel_for_sysready {
+                description "Readiness of this feature is irrelevant for system-ready state";
+                type stypes:boolean_type;
+                default "false";
+            }
         }
     }
 }
diff --git a/src/system-health/health_checker/config.py b/src/system-health/health_checker/config.py
index 63af22db4ad0..ffc676d1a265 100644
--- a/src/system-health/health_checker/config.py
+++ b/src/system-health/health_checker/config.py
@@ -32,6 +32,9 @@ class Config(object):
     # Monit service start delay configuration entry
     MONIT_START_DELAY_CONFIG = 'with start delay'
 
+    # Default system ready timeout
+    DEFAULT_SYSREADY_TIMEOUT = 10
+
     def __init__(self):
         """
         Constructor. Initialize all configuration entry to default value in case there is no configuration file.
@@ -44,6 +47,9 @@ def __init__(self):
         self.ignore_services = None
         self.ignore_devices = None
         self.user_defined_checkers = None
+        self.wait_services = None
+        self.services_to_report_app_status = None
+        self.timeout = Config.DEFAULT_SYSREADY_TIMEOUT
 
     def config_file_exists(self):
         return os.path.exists(self._config_file)
@@ -70,9 +76,13 @@ def load_config(self):
 
                 self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL)
                 self.ignore_services = self._get_list_data('services_to_ignore')
+                self.wait_services = self._get_list_data('services_to_wait')
+                self.services_to_report_app_status = self._get_list_data('services_to_report_app_status')
+                self.timeout = self.config_data.get('timeout', Config.DEFAULT_SYSREADY_TIMEOUT)
                 self.ignore_devices = self._get_list_data('devices_to_ignore')
                 self.user_defined_checkers = self._get_list_data('user_defined_checkers')
             except Exception as e:
+                # TODO: Add log here. Unexpected fail is not visible
                 self._reset()
 
     def _reset(self):
diff --git a/src/system-health/health_checker/sysmonitor.py b/src/system-health/health_checker/sysmonitor.py
index c07b58b5489b..3969f4b0a0ff 100755
--- a/src/system-health/health_checker/sysmonitor.py
+++ b/src/system-health/health_checker/sysmonitor.py
@@ -1,30 +1,36 @@
 #!/usr/bin/python3
 
+import glob
+import multiprocessing
 import os
 import sys
 import time
-import glob
-import multiprocessing
+from copy import copy
 from datetime import datetime
-from swsscommon import swsscommon
+
 from sonic_py_common.logger import Logger
-from . import utils
 from sonic_py_common.task_base import ProcessTaskBase
 from sonic_py_common import device_info
+from swsscommon import swsscommon
+
+from . import utils
 from .config import Config
 import signal
 import jinja2
 
 SYSLOG_IDENTIFIER = "system#monitor"
 REDIS_TIMEOUT_MS = 0
-system_allsrv_state = "DOWN"
-spl_srv_list = ['database-chassis', 'gbsyncd']
+spl_srv_list = ['database-chassis', 'gbsyncd', 'e2scrub_reap']
 SELECT_TIMEOUT_MSECS = 1000
 QUEUE_TIMEOUT = 15
 TASK_STOP_TIMEOUT = 10
 logger = Logger(log_identifier=SYSLOG_IDENTIFIER)
 exclude_srv_list = ['ztp.service']
 
+def get_timestamp():
+    return '{}'.format(datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"))
+
+
 #Subprocess which subscribes to STATE_DB FEATURE table for any update
 #and push service events to main process via queue
 class MonitorStateDbTask(ProcessTaskBase):
@@ -36,8 +42,10 @@ def __init__(self,myQ):
     def subscribe_statedb(self):
         state_db = swsscommon.DBConnector("STATE_DB", REDIS_TIMEOUT_MS, False)
         sel = swsscommon.Select()
-        cst = swsscommon.SubscriberStateTable(state_db, "FEATURE")
-        sel.addSelectable(cst)
+        feature_sub = swsscommon.SubscriberStateTable(state_db, "FEATURE")
+        service_sub = swsscommon.SubscriberStateTable(state_db, "SERVICE_APP")
+        sel.addSelectable(feature_sub)
+        sel.addSelectable(service_sub)
 
         while not self.task_stopping_event.is_set():
             (state, c) = sel.select(SELECT_TIMEOUT_MSECS)
@@ -46,11 +54,22 @@ def subscribe_statedb(self):
             if state != swsscommon.Select.OBJECT:
                 logger.log_warning("sel.select() did not return swsscommon.Select.OBJECT")
                 continue
-            (key, op, cfvs) = cst.pop()
+
+            fd = c.getFd()
+            if fd == feature_sub.getFd():
+                (key, op, cfvs) = feature_sub.pop()
+                source = 'feature'
+            elif fd == service_sub.getFd():
+                (key, op, cfvs) = service_sub.pop()
+                source = 'service_app'
+            else:
+                logger.log_warning("sel.select() returned unexpected selectable OBJECT")
+                continue
+
             key_ext = key + ".service"
-            timestamp = "{}".format(datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"))
-            msg={"unit": key_ext, "evt_src":"feature", "time":timestamp}
-            self.task_notify(msg)
+            self.task_notify({"unit": key_ext,
+                              "evt_src": source,
+                              "time": get_timestamp()})
 
 
     def task_worker(self):
@@ -85,8 +104,8 @@ def on_job_removed(self, id, job, unit, result):
     #Function for listening the systemd event on dbus
     def subscribe_sysbus(self):
         import dbus
-        from gi.repository import GLib
         from dbus.mainloop.glib import DBusGMainLoop
+        from gi.repository import GLib
 
         DBusGMainLoop(set_as_default=True)
         bus = dbus.SystemBus()
@@ -109,17 +128,72 @@ def task_notify(self, msg):
             return
         self.task_queue.put(msg)
 
+
+# Subprocess to monitor the system ready timeout. If the timeout is exceeded,
+# send a message to the queue and exit
+class MonitorTimeout(ProcessTaskBase):
+    def __init__(self, myQ, config):
+        ProcessTaskBase.__init__(self)
+        self._stop_timeout_secs = TASK_STOP_TIMEOUT
+        self.task_queue = myQ
+
+        if config and config.timeout:
+            self.timeout = config.timeout
+        else:
+            self.timeout = 10
+            logger.log_warning("Can't get timeout configuration, assuming 10 "
+                               "minutes")
+
+    def task_worker(self):
+        if self.task_stopping_event.is_set():
+            return
+
+        logger.log_info(f'Set system ready timeout task for {self.timeout} '
+                        'minutes')
+
+        # Repeat until timeout or canceling
+        while not self.task_stopping_event.is_set():
+            if utils.get_uptime() > self.timeout * 60:
+                logger.log_warning('Got system ready timeout. Notifying')
+                self.task_notify({
+                    'unit': 'timeout',
+                    'evt_src': 'timeout',
+                    'time': get_timestamp()
+                })
+                break
+
+            time.sleep(15)
+
+        logger.log_info('System ready timeout task finished')
+
+    def task_notify(self, msg):
+        if self.task_stopping_event.is_set():
+            return
+        self.task_queue.put(msg)
+
+
 #Mainprocess which launches 2 subtasks - systembus task and statedb task
 #and on receiving events, checks and updates the system ready status to state db
 class Sysmonitor(ProcessTaskBase):
     def __init__(self):
         ProcessTaskBase.__init__(self)
+        # Initialize the timeout used to stop this task once the "task_stop"
+        # event is received
         self._stop_timeout_secs = TASK_STOP_TIMEOUT
-        self.dnsrvs_name = set()
+
         self.state_db = None
         self.config_db = None
+        self.monitor_timeout = None
+
+        # List of services in "DOWN" state
+        self.dnsrvs_name = set()
+        # Last reported system state
+        self.last_reported_state = 'INIT'
+
+        # Load system health monitoring configuration
         self.config = Config()
+        self.config.load_config()
         self.mpmgr = multiprocessing.Manager()
         self.myQ = self.mpmgr.Queue()
 
@@ -143,28 +217,87 @@ def get_all_service_list(self):
             self.config_db = swsscommon.ConfigDBConnector(use_unix_socket_path=True)
             self.config_db.connect()
 
-        dir_list = []
+        dir_set = set()
+        srvs_files = set()
+
         #add the services from the below targets
-        targets= ["/etc/systemd/system/multi-user.target.wants", "/etc/systemd/system/sonic.target.wants"]
+        targets= [
+            "/etc/systemd/system/multi-user.target.wants",
+            "/etc/systemd/system/sonic.target.wants",
+        ]
         for path in targets:
-            dir_list += [os.path.basename(i) for i in glob.glob('{}/*.service'.format(path))]
-
-        #add the enabled docker services from config db feature table
-        self.get_service_from_feature_table(dir_list)
-
-        self.config.load_config()
+            srvs_files_list = glob.glob('{}/*.service'.format(path))
+            srvs_files |= {os.path.basename(i) for i in srvs_files_list}
+
+        # By default, wait for all
+        dir_set = copy(srvs_files)
+
+        # Wait only for selected services: all multi-user and sonic target
+        # services, except services defined in the FEATURE table. The FEATURE
+        # table is expandable, so it can and should control by itself which
+        # services affect the system-ready state.
+        # Leave only the services we want to wait for
+        if self.config and self.config.wait_services:
+            # srv_file example - swss@1.service, syncd@0.service
+            # full_srv_name example - swss@1, syncd@0
+            # srv_name example - swss, syncd
+            dir_set = set()
+            for srv_name in self.config.wait_services:
+                for srv_file in srvs_files:
+                    full_srv_name = srv_file.removesuffix('.service')
+                    if (full_srv_name == srv_name or
+                            full_srv_name.startswith(srv_name + '@')):
+                        dir_set.add(srv_file)
+
+        # add the enabled docker services from config db feature table.
+        # It will respect irrel_for_sysready flag.
+        feature_table_srvs = self.get_service_from_feature_table()
+        # srv_file example - swss@1.service, syncd@0.service
+        # full_srv_name example - swss@1, syncd@0
+        # srv_name example - swss, syncd
+        for srv_name in feature_table_srvs:
+            for srv_file in srvs_files:
+                full_srv_name = srv_file.removesuffix('.service')
+                if (full_srv_name == srv_name or
+                        full_srv_name.startswith(srv_name + '@')):
+                    dir_set.add(srv_file)
+
+        # Ignore what we don't need, reusing the list of services that are
+        # ignored for system health: if a service doesn't affect system
+        # health, it shouldn't affect system readiness either.
+        # This last filter drops, once more, all services we don't want to
+        # track.
         if self.config and self.config.ignore_services:
-            for srv in self.config.ignore_services:
-                if srv in dir_list:
-                    dir_list.remove(srv)
-
-        #Remove services from exclude list Eg.ztp.service
+            # srv_file example - swss@1.service, syncd@0.service
+            # srv_name example - swss, syncd
+            for srv_file in copy(dir_set):
+                srv_name = srv_file.removesuffix('.service').split('@')[0]
+                if srv_name in self.config.ignore_services:
+                    dir_set.remove(srv_file)
+
+        # Remove services from exclude list Eg.ztp.service
         for srv in exclude_srv_list:
-            if srv in dir_list:
-                dir_list.remove(srv)
-
-        dir_list.sort()
-        return dir_list
+            if srv in dir_set:
+                dir_set.remove(srv)
+
+        # The feature is controllable by the sysready_state parameter of the
+        # DEVICE_METADATA table. Setting it to "disabled" will just simulate
+        # the "PortInitDone" behavior.
+        device_metadata = self.config_db.get_table(
+            swsscommon.CFG_DEVICE_METADATA_TABLE_NAME).get('localhost', {})
+        feature_state = device_metadata.get('sysready_state', 'enabled')
+
+        # Handle feature state.
+        # Use only swss.service to rely on ready state
+        if feature_state == 'disabled':
+            # Order matters
+            desired_srvs = ['swss.service']
+            for srv in desired_srvs:
+                if srv in dir_set:
+                    dir_set = {srv}
+                    break
+
+        return sorted(dir_set)
 
     def get_render_value_for_field(self, configuration, device_config, expected_values):
         """ Returns the target value by rendering the configuration as J2 template.
@@ -186,7 +319,7 @@ def get_render_value_for_field(self, configuration, device_config, expected_valu
             raise ValueError('Invalid value rendered for configuration {}: {}'.format(configuration, target_value))
         return target_value
 
-    def get_service_from_feature_table(self, dir_list):
+    def get_service_from_feature_table(self):
         """Get service from CONFIG DB FEATURE table. During "config reload" command, filling FEATURE table
         is not an atomic operation, sonic-cfggen do it with two steps:
             1. Add an empty table entry to CONFIG DB
@@ -195,13 +328,15 @@ def get_service_from_feature_table(self):
         So, if system health read db on middle of step 1 and step 2, it might read invalid data. A retry
         mechanism is here to avoid such issue.
 
-        Args:
-            dir_list (list): service list
+        Return:
+            srvs_list: list: tracked services list
         """
         max_retry = 3
         retry_delay = 1
         success = True
 
+        srvs_list = []
+
         while max_retry > 0:
             success = True
             try:
@@ -216,9 +351,9 @@ def get_service_from_feature_table(self):
                         break
                     state = self.get_render_value_for_field(fields["state"], device_config, ['enabled', 'disabled', 'always_enabled', 'always_disabled'])
                     if state not in ["disabled", "always_disabled"]:
-                        srvext = srv + ".service"
-                        if srvext not in dir_list:
-                            dir_list.append(srvext)
+                        if fields.get('irrel_for_sysready', '').lower() != 'true':
+                            if srv not in srvs_list:
+                                srvs_list.append(srv)
             except:
                 success = False
             if not success:
@@ -229,9 +364,11 @@ def get_service_from_feature_table(self):
         if not success:
             logger.log_error("FEATURE table is not fully ready: {}, max retry reached".format(feature_table))
 
-    #Checks FEATURE table from config db for the service' check_up_status flag
-    #if marked to true, then read the service up_status from FEATURE table of state db.
-    #else, just return Up
+        return srvs_list
+
+    # Checks the service's check_up_status flag in the CONFIG DB FEATURE table
+    # or in the json config file. If it is marked true, then read the service
+    # up_status from the FEATURE table of STATE DB. Else, just return Up
     def get_app_ready_status(self, service):
         if not self.state_db:
             self.state_db = swsscommon.SonicV2Connector(use_unix_socket_path=True)
@@ -246,26 +383,59 @@ def get_app_ready_status(self, service):
         configdb_feature_table = self.config_db.get_table('FEATURE')
         update_time = "-"
 
-        if service not in configdb_feature_table.keys():
+        app_up_status_cfg = set()
+        if self.config and self.config.services_to_report_app_status:
+            app_up_status_cfg = self.config.services_to_report_app_status
+
+        # CONFIG DB has a configuration only for the non-parametrized service
+        # name. If there is a parameter, the service itself should handle the
+        # correct reporting.
+        service_noparam = service.split('@')[0]
+        if service_noparam not in (app_up_status_cfg |
+                                   configdb_feature_table.keys()):
             pstate = "Up"
+            return pstate,fail_reason,update_time
+
+        if service_noparam in configdb_feature_table.keys():
+            table = 'FEATURE'
+            check_app_up_status = \
+                configdb_feature_table[service_noparam].get('check_up_status')
         else:
-            check_app_up_status = configdb_feature_table[service].get('check_up_status')
-            if check_app_up_status is not None and (check_app_up_status.lower()) == "true":
-                up_status_flag = self.state_db.get(self.state_db.STATE_DB, 'FEATURE|{}'.format(service), 'up_status')
-                if up_status_flag is not None and (up_status_flag.lower()) == "true":
-                    pstate = "Up"
+            table = 'SERVICE_APP'
+
+        srvc_table = '{}|{}'.format(table, service)
+
+        # Handle feature's application up status
+        if table == 'FEATURE' and (not check_app_up_status or
+                                   check_app_up_status.lower() == 'false'):
+            # Don't check application up status if check_app_up_status flag is
+            # not present or set to false
+            pstate = "Up"
+        else:
+            up_status_flag = self.state_db.get(self.state_db.STATE_DB,
+                                               srvc_table, 'up_status')
+            if up_status_flag is not None and up_status_flag.lower() == "true":
+                pstate = "Up"
+            else:
+                if (up_status_flag is not None and
+                        up_status_flag.lower() == "false"):
+                    # The application is in failed state if status flag set to
+                    # false
+                    pstate = "Failed"
                 else:
-                    fail_reason = self.state_db.get(self.state_db.STATE_DB, 'FEATURE|{}'.format(service), 'fail_reason')
-                    if fail_reason is None:
-                        fail_reason = "NA"
+                    # In that case it is just in init state
                     pstate = "Down"
 
-                update_time = self.state_db.get(self.state_db.STATE_DB, 'FEATURE|{}'.format(service), 'update_time')
-                if update_time is None:
-                    update_time = "-"
-            else:
-                #Either check_up_status marked False or entry does not exist
-                pstate = "Up"
+                # Try to get fail reason anyway
+                fail_reason = self.state_db.get(self.state_db.STATE_DB,
+                                                srvc_table, 'fail_reason')
+                if fail_reason is None:
+                    fail_reason = "NA"
+
+            update_time = self.state_db.get(self.state_db.STATE_DB, srvc_table,
+                                            'update_time')
+            if update_time is None:
+                update_time = "-"
 
         return pstate,fail_reason,update_time
 
@@ -288,6 +458,27 @@ def post_unit_status(self, srv_name, srv_status, app_status, fail_reason, update
             self.state_db = swsscommon.SonicV2Connector(use_unix_socket_path=True)
             self.state_db.connect(self.state_db.STATE_DB)
 
+        # Log message with proper level
+        logf = logger.log_notice
+        msg = f'{srv_name} service status: {srv_status}, '
+        msg += f'app status: {app_status}'
+
+        if srv_status != 'OK':
+            msg += f', fail reason: {fail_reason}'
+
+        # Known good fail reasons for each service status
+        good_dr = {
+            'Down': ['Inactive'],
+            'Starting': ['-'],
+            'Stopping': ['-']
+        }
+
+        # Any other combination is logged as an error
+        if fail_reason not in good_dr.get(srv_status, []):
+            logf = logger.log_error
+
+        logf(msg)
+
         key = 'ALL_SERVICE_STATUS|{}'.format(srv_name)
         statusvalue = {}
         statusvalue['service_status'] = srv_status
@@ -334,6 +525,9 @@ def get_unit_status(self, event):
             if init_state == "Up":
                 service_up_status = "OK"
                 unit_status = "OK"
+            elif init_state == "Failed":
+                fail_reason = app_fail_reason
+                unit_status = "FAILED"
             else:
                 fail_reason = app_fail_reason
                 unit_status = "NOT OK"
@@ -347,10 +541,20 @@ def get_unit_status(self, event):
             service_status = "Stopping"
             service_up_status = "Stopping"
         elif active_state == "inactive":
-            if srv_type == "oneshot" or service_name in spl_srv_list:
+            if service_name in spl_srv_list:
                 service_status = "OK"
                 service_up_status = "OK"
                 unit_status = "OK"
+            elif srv_type == "oneshot":
+                # An exited oneshot service is considered OK
+                if sub_state == "exited":
+                    service_status = "OK"
+                    service_up_status = "OK"
+                    unit_status = "OK"
+                else:
+                    unit_status = "NOT OK"
+                    if fail_reason == "-":
+                        fail_reason = "Inactive"
             else:
                 unit_status = "NOT OK"
                 if fail_reason == "-":
@@ -378,11 +582,13 @@ def get_all_system_status(self):
             if ustate == "NOT OK":
                 if service not in self.dnsrvs_name:
                     self.dnsrvs_name.add(service)
+            elif ustate == "FAILED":
+                return "FAILED"
 
         if len(self.dnsrvs_name) == 0:
             return "UP"
         else:
-            return "DOWN"
+            return "INIT"
 
     #Displays the system ready status message on console
     def print_console_message(self, message):
@@ -390,51 +596,65 @@ def get_all_system_status(self):
         console.write("\n{} {}\n".format(datetime.now().strftime("%b %d %H:%M:%S.%f"), message))
 
     #Publish the system ready status message on logger,console and state db
-    def publish_system_status(self, astate):
-        global system_allsrv_state
-        if system_allsrv_state != astate:
-            system_allsrv_state = astate
-            if astate == "DOWN":
-                msg = "System is not ready - one or more services are not up"
-            elif astate == "UP":
-                msg = "System is ready"
-            logger.log_notice(msg)
-            self.print_console_message(msg)
-            self.post_system_status(astate)
-
-    #Checks all the services and updates the current system status
-    def update_system_status(self):
-        try:
-            astate = self.get_all_system_status()
-            self.publish_system_status(astate)
+    def publish_system_status(self, new_state):
+        # Allowed transitions
+        # 'from': ['to']
+        transitions = {
+            'INIT': ['UP', 'DOWN', 'FAILED'],
+            'DOWN': ['FAILED']
+        }
+
+        # Check if the state transition is allowed. If the status update is
+        # not allowed, don't perform it
+        if new_state not in transitions.get(self.last_reported_state, []):
+            return
 
-        except Exception as e:
-            logger.log_error("update system status exception:{}".format(str(e)))
+        if new_state == "DOWN":
+            msg = "System is not ready - one or more services are not up"
+        elif new_state == "FAILED":
+            msg = "System is not ready - one or more services failed"
+        else:
+            msg = "System is ready"
+
+        # Broadcast new system status
+        logger.log_notice(msg)
+        self.print_console_message(msg)
+        self.post_system_status(new_state)
+        self.last_reported_state = new_state
+
+        # We don't need timeout anymore
+        self.monitor_timeout.task_stop()
 
     #Checks a service status and updates the system status
     def check_unit_status(self, event):
-        #global dnsrvs_name
         if not self.state_db:
             self.state_db = swsscommon.SonicV2Connector(use_unix_socket_path=True)
             self.state_db.connect(self.state_db.STATE_DB)
 
-        astate = "DOWN"
+        astate = 'INIT'
 
         full_srv_list = self.get_all_service_list()
         if event in full_srv_list:
             ustate = self.get_unit_status(event)
-            if ustate == "OK" and system_allsrv_state == "UP":
-                astate = "UP"
-            elif ustate == "OK" and system_allsrv_state == "DOWN":
+            if ustate == 'OK':
+                # If that unit was down/inactive, and now it is up, remove it
+                # from the set of down units
                 if event in self.dnsrvs_name:
                     self.dnsrvs_name.remove(event)
-                    if len(self.dnsrvs_name) == 0:
-                        astate = "UP"
-                    else:
-                        astate = "DOWN"
-            else:
+
+                # If now the set of down units is empty - we are good to report
+                # "OK" system status
+                if len(self.dnsrvs_name) == 0:
+                    astate = 'UP'
+            elif ustate == 'FAILED':
+                # If unit reported failed state, the system should report
+                # "NOT OK" status
+                astate = 'FAILED'
+            elif ustate == 'NOT OK':
                 if event not in self.dnsrvs_name:
                     self.dnsrvs_name.add(event)
-                astate = "DOWN"
+            else:
+                logger.log_warning(f'Got wrong {event} unit state: {ustate}. '
                                    'Nothing to do')
+                return
 
             self.publish_system_status(astate)
         else:
@@ -445,9 +665,7 @@ def check_unit_status(self, event):
             if len(self.dnsrvs_name) == 0:
                 astate = "UP"
-            else:
-                astate = "DOWN"
-            self.publish_system_status(astate)
+            self.publish_system_status(astate)
 
             srv_name,last = event.split('.')
             # stop on service maybe propagated to timers and in that case,
@@ -458,7 +676,7 @@ def check_unit_status(self, event):
             if key_exists == 1:
                 self.state_db.delete(self.state_db.STATE_DB, key)
 
-        return 0
+        return
 
     def system_service(self):
         if not self.state_db:
@@ -472,12 +690,18 @@ def system_service(self):
             monitor_statedb_table = MonitorStateDbTask(self.myQ)
             monitor_statedb_table.task_run()
+            self.monitor_timeout = MonitorTimeout(self.myQ, self.config)
+            self.monitor_timeout.task_run()
         except Exception as e:
             logger.log_error("SubProcess-{}".format(str(e)))
             sys.exit(1)
 
-
-        self.update_system_status()
+        try:
+            # Maybe everything is ready now, so let's try
+            astate = self.get_all_system_status()
+            self.publish_system_status(astate)
+        except Exception as e:
+            logger.log_error(f"Failed to update initial system status: {e}")
 
         from queue import Empty
         # Queue to receive the STATEDB and Systemd state change event
@@ -489,18 +713,34 @@ def system_service(self):
                 event_time = msg["time"]
                 logger.log_debug("Main process- received event:{} from source:{} time:{}".format(event,event_src,event_time))
                 logger.log_info("check_unit_status for [ "+event+" ] ")
-                self.check_unit_status(event)
+                if event_src == 'timeout':
+                    # If we got a timeout, not all daemons/apps reported their
+                    # ready status. It means the system is in "NOT OK" state.
+                    # But we can't assume it has failed, so it is just down so
+                    # far.
+                    self.publish_system_status('DOWN')
+                else:
+                    self.check_unit_status(event)
+
+                    # Recheck the down services, since they may not have updated dbus
+                    for service in self.dnsrvs_name.copy():
+                        self.check_unit_status(service)
             except (Empty, EOFError):
                 pass
             except Exception as e:
                 logger.log_error("system_service"+str(e))
 
         #cleanup tables "'ALL_SERVICE_STATUS*', 'SYSTEM_READY*'" from statedb
-        self.state_db.delete_all_by_pattern(self.state_db.STATE_DB, "ALL_SERVICE_STATUS|*")
-        self.state_db.delete_all_by_pattern(self.state_db.STATE_DB, "SYSTEM_READY|*")
+        self.state_db.delete_all_by_pattern(self.state_db.STATE_DB,
+                                            "ALL_SERVICE_STATUS|*")
+        self.state_db.delete_all_by_pattern(self.state_db.STATE_DB,
+                                            "SYSTEM_READY|*")
+        self.state_db.delete_all_by_pattern(self.state_db.STATE_DB,
+                                            "SERVICE_APP|*")
         monitor_system_bus.task_stop()
         monitor_statedb_table.task_stop()
+        self.monitor_timeout.task_stop()
 
     def task_worker(self):
         if self.task_stopping_event.is_set():
@@ -526,5 +766,3 @@ def task_stop(self):
             return False
 
         return True
-
-
diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py
index 4aeb3b18f041..e785ca5d60a4 100644
--- a/src/system-health/tests/test_system_health.py
+++ b/src/system-health/tests/test_system_health.py
@@ -39,6 +39,7 @@
 from health_checker.sysmonitor import Sysmonitor
 from health_checker.sysmonitor import MonitorStateDbTask
 from health_checker.sysmonitor import MonitorSystemBusTask
+from health_checker.sysmonitor import MonitorTimeout
 load_source('healthd', os.path.join(scripts_path, 'healthd'))
 from healthd import HealthDaemon
 
@@ -48,7 +49,16 @@ snmp-subagent EXITED Oct 19 01:53 AM
 """
 
 device_info.get_platform = MagicMock(return_value='unittest')
-
+
+stub_glob = [
+    '/some/systemd/path/radv.service',
+    '/some/systemd/path/bgp.service',
+    '/some/systemd/path/pmon.service',
+    '/some/systemd/path/swss.service',
+    '/some/systemd/path/swss@1.service',
+    '/some/systemd/path/ntp-config.service',
+]
+
 device_runtime_metadata = {"DEVICE_RUNTIME_METADATA": {"ETHERNET_PORTS_PRESENT":True}}
 
 def no_op(*args, **kwargs):
@@ -632,11 +642,13 @@ def test_utils():
 
 @patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
-@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False))
+@patch('sonic_py_common.multi_asic.is_multi_asic',
+       MagicMock(return_value=False))
 @patch('docker.DockerClient')
 @patch('health_checker.utils.run_command')
 @patch('swsscommon.swsscommon.ConfigDBConnector')
 @patch('sonic_py_common.device_info.get_device_runtime_metadata', MagicMock(return_value=device_runtime_metadata))
+@patch('glob.glob', MagicMock(return_value=stub_glob))
 def test_get_all_service_list(mock_config_db, mock_run, mock_docker_client):
     mock_db_data = MagicMock()
     mock_get_table = MagicMock()
@@ -652,6 +664,7 @@ def test_get_all_service_list(mock_config_db, mock_run, mock_docker_client):
             'state': 'enabled',
             'has_global_scope': 'True',
             'has_per_asic_scope': 'False',
+            'irrel_for_sysready': 'True'
         },
         'pmon': {
             'state': 'disabled',
@@ -659,16 +672,31 @@ def test_get_all_service_list(mock_config_db, mock_run, mock_docker_client):
             'has_per_asic_scope': 'False',
         }
     }
+
     sysmon = Sysmonitor()
-    print("mock get table:{}".format(mock_get_table.return_value))
+    sysmon.config.wait_services = ['ntp-config']
+    sysmon.config.ignore_services = ['swss']
     result = sysmon.get_all_service_list()
-    print("result get all service list:{}".format(result))
+
+    assert 'radv.service' in result
+    assert 'bgp.service' not in result
     assert 'pmon.service' not in result
 
+    # Test with the feature disabled. Only swss is expected
+    mock_get_table.return_value = {
+        'localhost': {
+            'sysready_state': 'disabled'
+        }
+    }
+    sysmon = Sysmonitor()
+    result = sysmon.get_all_service_list()
+    assert len(result) == 1
+    assert 'swss.service' in result
+
 
 @patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
-@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False))
+@patch('sonic_py_common.multi_asic.is_multi_asic',
+       MagicMock(return_value=False))
 @patch('docker.DockerClient')
 @patch('health_checker.utils.run_command')
 @patch('swsscommon.swsscommon.ConfigDBConnector')
@@ -711,14 +739,12 @@ def test_get_app_ready_status(mock_config_db, mock_run, mock_docker_client):
         }})
 
     sysmon = Sysmonitor()
+    sysmon.config.services_to_report_app_status = ['rsyslog']
     result = sysmon.get_app_ready_status('radv')
-    print(result)
     assert 'Up' in result
     result = sysmon.get_app_ready_status('bgp')
-    print(result)
-    assert 'Down' in result
+    assert 'Failed' in result
     result = sysmon.get_app_ready_status('snmp')
-    print(result)
     assert 'Up' in result
 
 
@@ -748,13 +774,14 @@ def test_check_unit_status():
 @patch('health_checker.sysmonitor.Sysmonitor.print_console_message', MagicMock())
 def test_system_status_up_after_service_removed():
     sysmon = Sysmonitor()
-    sysmon.publish_system_status('UP')
+    sysmon.monitor_timeout = MagicMock()
 
+    # By default it is in the init state, and should remain in init
     sysmon.check_unit_status('mock_bgp.service')
     assert 'mock_bgp.service' in sysmon.dnsrvs_name
-    result = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status')
+    result = swsscommon.SonicV2Connector.exists(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE")
     print("system status result before service was removed from system: {}".format(result))
-    assert result == "DOWN"
+    assert result == False
 
     sysmon.check_unit_status('mock_bgp.service')
     assert 'mock_bgp.service' not in sysmon.dnsrvs_name
@@ -764,8 +791,11 @@ def test_system_status_up_after_service_removed():
 
 @patch('health_checker.sysmonitor.Sysmonitor.get_all_service_list', MagicMock(return_value=['mock_snmp.service']))
+@patch('health_checker.sysmonitor.Sysmonitor.print_console_message',
+       MagicMock())
 def test_check_unit_status_timer():
     sysmon = Sysmonitor()
+    sysmon.monitor_timeout = MagicMock()
     sysmon.state_db = MagicMock()
     sysmon.state_db.exists = MagicMock(return_value=1)
     sysmon.state_db.delete = MagicMock()
@@ -814,7 +844,24 @@ def test_get_all_system_status_not_ok():
     sysmon = Sysmonitor()
     result = sysmon.get_all_system_status()
     print("result:{}".format(result))
-    assert result == 'DOWN'
+    assert result == 'INIT'
+
+
+@patch('health_checker.sysmonitor.Sysmonitor.get_all_service_list',
+       MagicMock(return_value=['mock_snmp.service', 'mock_ns.service']))
+@patch('health_checker.sysmonitor.Sysmonitor.get_unit_status',
+       MagicMock(return_value= 'FAILED'))
+@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status',
+       MagicMock())
+@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock())
+@patch('health_checker.sysmonitor.Sysmonitor.get_app_ready_status',
+       MagicMock(return_value='Up'))
+def test_get_all_system_status_failed():
+    sysmon = Sysmonitor()
+    result = sysmon.get_all_system_status()
+    print("result:{}".format(result))
+    assert result == 'FAILED'
+
 
 def test_post_unit_status():
     sysmon = Sysmonitor()
@@ -825,10 +872,20 @@ def test_post_unit_status():
     assert result['app_ready_status'] == 'Down'
     assert result['fail_reason'] == 'mock reason'
 
+    # Test reporting failed
+    sysmon.post_unit_status("mock_ntp", 'Failed', 'Down', 'mock reason', '-')
+    result = swsscommon.SonicV2Connector.get_all(MockConnector, 0,
+                                                 'ALL_SERVICE_STATUS|mock_ntp')
+    assert result['service_status'] == 'Failed'
+    assert result['app_ready_status'] == 'Down'
+    assert result['fail_reason'] == 'mock reason'
+
+
 def test_post_system_status():
     sysmon = Sysmonitor()
     sysmon.post_system_status("UP")
-    result = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status')
+    result = swsscommon.SonicV2Connector.get(
+        MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status')
     print("post system status result:{}".format(result))
     assert result == "UP"
@@ -837,10 +894,15 @@ def test_post_system_status():
     print("post system status result:{}".format(result))
     assert result == "DOWN"
 
+    # Simulate fail. An unexpected exception is a failure
+    sysmon.state_db.set = MagicMock(side_effect=Exception)
+    sysmon.post_system_status("UP")
+
+
 @patch('health_checker.sysmonitor.Sysmonitor.print_console_message', MagicMock())
 @patch('health_checker.sysmonitor.Sysmonitor.post_system_status', MagicMock())
 def test_publish_system_status_allowed_status():
     sysmon = Sysmonitor()
+    sysmon.monitor_timeout = MagicMock()
     sysmon.publish_system_status('UP')
     sysmon.publish_system_status('DOWN')
@@ -851,20 +913,15 @@ def test_publish_system_status_allowed_status():
     for call_args in sysmon.post_system_status.call_args_list:
         assert call_args in expected_calls
 
-@patch('health_checker.sysmonitor.Sysmonitor.print_console_message', MagicMock())
+@patch('health_checker.sysmonitor.Sysmonitor.print_console_message',
+       MagicMock())
 def test_publish_system_status():
     sysmon = Sysmonitor()
+    sysmon.monitor_timeout = MagicMock()
     sysmon.publish_system_status('UP')
     result = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status')
     assert result == "UP"
 
-@patch('health_checker.sysmonitor.Sysmonitor.get_all_system_status', test_get_all_system_status_ok())
-@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status', test_publish_system_status())
-def test_update_system_status():
-    sysmon = Sysmonitor()
-    sysmon.update_system_status()
-    result = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status')
-    assert result == "UP"
 
 from sonic_py_common.task_base import ProcessTaskBase
 import multiprocessing
@@ -878,6 +935,7 @@ def test_monitor_statedb_task():
     assert sysmon._task_process is not None
     sysmon.task_stop()
 
+
 @patch('health_checker.sysmonitor.MonitorSystemBusTask.subscribe_sysbus', MagicMock())
 def test_monitor_sysbus_task():
     sysmon = MonitorSystemBusTask(myQ)
@@ -886,6 +944,24 @@ def test_monitor_sysbus_task():
     assert sysmon._task_process is not None
     sysmon.task_stop()
 
+
+def test_monitor_timeout_task():
+    config = lambda: None
+    config.timeout = 10
+    monitor = MonitorTimeout(myQ, config)
+    monitor.task_run()
+    monitor.task_notify('Test')
+    assert monitor._task_process is not None
+    monitor.task_stop()
+
+    # Make sure it does not fail, and increase coverage
+    monitor = MonitorTimeout(myQ, {})
+    monitor.task_run()
+    assert monitor._task_process is not None
+    monitor.task_stop()
+    monitor.task_notify('Test')
+
+
 @patch('health_checker.sysmonitor.MonitorSystemBusTask.subscribe_sysbus', MagicMock())
 @patch('health_checker.sysmonitor.MonitorStateDbTask.subscribe_statedb', MagicMock())
 def test_system_service():
@@ -920,11 +996,9 @@ def test_get_service_from_feature_table():
             }
         }
     ]
-    dir_list = []
-    sysmon.get_service_from_feature_table(dir_list)
-    assert 'bgp.service' in dir_list
-    assert 'swss.service' not in dir_list
-
+    dir_list = sysmon.get_service_from_feature_table()
+    assert 'bgp' in dir_list
+    assert 'swss' not in dir_list
 
 @patch('healthd.time.time')
 @patch('healthd.HealthDaemon.log_notice', side_effect=lambda *args, **kwargs: None)