Skip to content

Commit

Permalink
Xcvrd crash and restart should not cause link flap on platforms needi…
Browse files Browse the repository at this point in the history
…ng custom NPU SI settings (#541)

* Xcvrd crash and restart should not cause link flap on platforms needing custom SI settings

Signed-off-by: Mihir Patel <patelmi@microsoft.com>

* Improved code coverage

---------

Signed-off-by: Mihir Patel <patelmi@microsoft.com>
  • Loading branch information
mihirpat1 authored and mssonicbld committed Oct 8, 2024
1 parent 9598f6b commit 0dc97ce
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 12 deletions.
51 changes: 50 additions & 1 deletion sonic-xcvrd/tests/test_xcvrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,33 @@ def test_is_cmis_api(self, mock_class, expected_return_value):
mock_xcvr_api.__class__ = mock_class
assert is_cmis_api(mock_xcvr_api) == expected_return_value

def test_get_state_db_port_table_val_by_key(self):
xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE)
port_mapping = PortMapping()

assert xcvr_table_helper.get_state_db_port_table_val_by_key("Ethernet0", None, NPU_SI_SETTINGS_SYNC_STATUS_KEY) == None

port_mapping.get_asic_id_for_logical_port = MagicMock(return_value=0)
xcvr_table_helper.get_state_port_tbl = MagicMock(return_value=None)
assert xcvr_table_helper.get_state_db_port_table_val_by_key("Ethernet0", port_mapping, NPU_SI_SETTINGS_SYNC_STATUS_KEY) == None

mock_state_port_table = MagicMock()
xcvr_table_helper.get_state_port_tbl = MagicMock(return_value=mock_state_port_table)
mock_state_port_table.get = MagicMock(return_value=(None, None))
assert xcvr_table_helper.get_state_db_port_table_val_by_key("Ethernet0", port_mapping, NPU_SI_SETTINGS_SYNC_STATUS_KEY) == None

mock_state_port_table.get = MagicMock(return_value=(True, {'A' : 'B'}))
assert xcvr_table_helper.get_state_db_port_table_val_by_key("Ethernet0", port_mapping, NPU_SI_SETTINGS_SYNC_STATUS_KEY) == None
mock_state_port_table.get = MagicMock(return_value=(True, {NPU_SI_SETTINGS_SYNC_STATUS_KEY : NPU_SI_SETTINGS_DEFAULT_VALUE}))
assert xcvr_table_helper.get_state_db_port_table_val_by_key("Ethernet0", port_mapping, NPU_SI_SETTINGS_SYNC_STATUS_KEY) == NPU_SI_SETTINGS_DEFAULT_VALUE

def test_is_npu_si_settings_update_required(self):
xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE)
port_mapping = PortMapping()
xcvr_table_helper.get_state_db_port_table_val_by_key = MagicMock(side_effect=[None, NPU_SI_SETTINGS_NOTIFIED_VALUE])
assert xcvr_table_helper.is_npu_si_settings_update_required("Ethernet0", port_mapping)
assert not xcvr_table_helper.is_npu_si_settings_update_required("Ethernet0", port_mapping)

@patch('xcvrd.xcvrd._wrapper_get_sfp_type')
@patch('xcvrd.xcvrd_utilities.port_event_helper.PortMapping.logical_port_name_to_physical_port_list', MagicMock(return_value=[0]))
@patch('xcvrd.xcvrd._wrapper_get_presence', MagicMock(return_value=True))
Expand Down Expand Up @@ -986,10 +1013,12 @@ def _check_notify_media_setting(self, index, expected_found=False, expected_valu
}
} if xcvr_info_dict is None else xcvr_info_dict
app_port_tbl = Table("APPL_DB", 'PORT_TABLE')
xcvr_table_helper.get_app_port_tbl = MagicMock(return_value=app_port_tbl)
xcvr_table_helper.is_npu_si_settings_update_required = MagicMock(return_value=True)
port_mapping = PortMapping()
port_change_event = PortChangeEvent('Ethernet0', index, 0, PortChangeEvent.PORT_ADD)
port_mapping.handle_port_change_event(port_change_event)
media_settings_parser.notify_media_setting(logical_port_name, xcvr_info_dict, app_port_tbl, mock_cfg_table, port_mapping)
media_settings_parser.notify_media_setting(logical_port_name, xcvr_info_dict, xcvr_table_helper, port_mapping)
found, result = app_port_tbl.get(logical_port_name)
result_dict = dict(result) if result else None
assert found == expected_found
Expand Down Expand Up @@ -1335,6 +1364,24 @@ def test_DaemonXcvrd_wait_for_port_config_done(self, mock_select, mock_sub_table
xcvrd.wait_for_port_config_done('')
assert swsscommon.Select.select.call_count == 2

def test_DaemonXcvrd_initialize_port_init_control_fields_in_port_table(self):
port_mapping = PortMapping()
xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER)

port_mapping.logical_port_list = ['Ethernet0']
port_mapping.get_asic_id_for_logical_port = MagicMock(return_value=0)
mock_xcvrd_table_helper = MagicMock()
mock_xcvrd_table_helper.get_state_port_tbl = MagicMock(return_value=None)
xcvrd.xcvr_table_helper = mock_xcvrd_table_helper
xcvrd.initialize_port_init_control_fields_in_port_table(port_mapping)

mock_state_db = MagicMock()
mock_xcvrd_table_helper.get_state_port_tbl = MagicMock(return_value=mock_state_db)
mock_state_db.get = MagicMock(return_value=(False, {}))

xcvrd.initialize_port_init_control_fields_in_port_table(port_mapping)
mock_state_db.set.call_count = 2

@patch('xcvrd.xcvrd.DaemonXcvrd.init')
@patch('xcvrd.xcvrd.DaemonXcvrd.deinit')
@patch('xcvrd.xcvrd.DomInfoUpdateTask.start')
Expand Down Expand Up @@ -3200,6 +3247,7 @@ def test_DaemonXcvrd_init_deinit_fastboot_enabled(self):
xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER)
with patch("subprocess.check_output") as mock_run:
mock_run.return_value = "true"
xcvrd.initialize_port_init_control_fields_in_port_table = MagicMock()

xcvrd.init()

Expand All @@ -3226,6 +3274,7 @@ def test_DaemonXcvrd_init_deinit_cold(self):
xcvrdaemon = DaemonXcvrd(SYSLOG_IDENTIFIER)
with patch("subprocess.check_output") as mock_run:
mock_run.return_value = "false"
xcvrdaemon.initialize_port_init_control_fields_in_port_table = MagicMock()

xcvrdaemon.init()

Expand Down
46 changes: 39 additions & 7 deletions sonic-xcvrd/xcvrd/xcvrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

from .xcvrd_utilities import sfp_status_helper
from .sff_mgr import SffManagerTask
from .xcvrd_utilities.xcvr_table_helper import XcvrTableHelper
from .xcvrd_utilities.xcvr_table_helper import *
from .xcvrd_utilities import port_event_helper
from .xcvrd_utilities.port_event_helper import PortChangeObserver
from .xcvrd_utilities import media_settings_parser
Expand Down Expand Up @@ -1934,7 +1934,7 @@ def _post_port_sfp_info_and_dom_thr_to_db_once(self, port_mapping, xcvr_table_he

# Do not notify media settings during warm reboot to avoid dataplane traffic impact
if is_warm_start == False:
media_settings_parser.notify_media_setting(logical_port_name, transceiver_dict, xcvr_table_helper.get_app_port_tbl(asic_index), xcvr_table_helper.get_cfg_port_tbl(asic_index), port_mapping)
media_settings_parser.notify_media_setting(logical_port_name, transceiver_dict, xcvr_table_helper, port_mapping)
transceiver_dict.clear()
else:
retry_eeprom_set.add(logical_port_name)
Expand Down Expand Up @@ -2156,10 +2156,12 @@ def task_worker(self, stopping_event, sfp_error_event):

if rc != SFP_EEPROM_NOT_READY:
post_port_dom_threshold_info_to_db(logical_port, self.port_mapping, self.xcvr_table_helper.get_dom_threshold_tbl(asic_index))
media_settings_parser.notify_media_setting(logical_port, transceiver_dict, self.xcvr_table_helper.get_app_port_tbl(asic_index), self.xcvr_table_helper.get_cfg_port_tbl(asic_index), self.port_mapping)
media_settings_parser.notify_media_setting(logical_port, transceiver_dict, self.xcvr_table_helper, self.port_mapping)
transceiver_dict.clear()
elif value == sfp_status_helper.SFP_STATUS_REMOVED:
helper_logger.log_notice("{}: Got SFP removed event".format(logical_port))
state_port_table = self.xcvr_table_helper.get_state_port_tbl(asic_index)
state_port_table.set(logical_port, [(NPU_SI_SETTINGS_SYNC_STATUS_KEY, NPU_SI_SETTINGS_DEFAULT_VALUE)])
update_port_transceiver_status_table_sw(
logical_port, self.xcvr_table_helper.get_status_tbl(asic_index), sfp_status_helper.SFP_STATUS_REMOVED)
helper_logger.log_notice("{}: received plug out and update port sfp status table.".format(logical_port))
Expand Down Expand Up @@ -2354,7 +2356,7 @@ def on_add_logical_port(self, port_change_event):
self.retry_eeprom_set.add(port_change_event.port_name)
else:
post_port_dom_threshold_info_to_db(port_change_event.port_name, self.port_mapping, dom_threshold_tbl)
media_settings_parser.notify_media_setting(port_change_event.port_name, transceiver_dict, self.xcvr_table_helper.get_app_port_tbl(port_change_event.asic_id), self.xcvr_table_helper.get_cfg_port_tbl(port_change_event.asic_id), self.port_mapping)
media_settings_parser.notify_media_setting(port_change_event.port_name, transceiver_dict, self.xcvr_table_helper, self.port_mapping)
else:
status = sfp_status_helper.SFP_STATUS_REMOVED if not status else status
update_port_transceiver_status_table_sw(port_change_event.port_name, status_tbl, status, error_description)
Expand All @@ -2380,7 +2382,7 @@ def retry_eeprom_reading(self):
rc = post_port_sfp_info_to_db(logical_port, self.port_mapping, self.xcvr_table_helper.get_intf_tbl(asic_index), transceiver_dict)
if rc != SFP_EEPROM_NOT_READY:
post_port_dom_threshold_info_to_db(logical_port, self.port_mapping, self.xcvr_table_helper.get_dom_threshold_tbl(asic_index))
media_settings_parser.notify_media_setting(logical_port, transceiver_dict, self.xcvr_table_helper.get_app_port_tbl(asic_index), self.xcvr_table_helper.get_cfg_port_tbl(asic_index), self.port_mapping)
media_settings_parser.notify_media_setting(logical_port, transceiver_dict, self.xcvr_table_helper, self.port_mapping)
transceiver_dict.clear()
retry_success_set.add(logical_port)
# Update retry EEPROM set
Expand Down Expand Up @@ -2437,6 +2439,30 @@ def wait_for_port_config_done(self, namespace):
if key in ["PortConfigDone", "PortInitDone"]:
break

"""
Initialize NPU_SI_SETTINGS_SYNC_STATUS_KEY field in STATE_DB PORT_TABLE|<lport>
if not already present for a port.
"""
def initialize_port_init_control_fields_in_port_table(self, port_mapping_data):
logical_port_list = port_mapping_data.logical_port_list
for lport in logical_port_list:
asic_index = port_mapping_data.get_asic_id_for_logical_port(lport)
state_port_table = self.xcvr_table_helper.get_state_port_tbl(asic_index)
if state_port_table is None:
helper_logger.log_error("Port init control: state_port_tbl is None for lport {}".format(lport))
continue

found, state_port_table_fvs = state_port_table.get(lport)
if not found:
self.log_notice("Port init control: Creating STATE_DB PORT_TABLE as unable to find for lport {}".format(lport))
state_port_table_fvs = []
state_port_table_fvs_dict = dict(state_port_table_fvs)
if NPU_SI_SETTINGS_SYNC_STATUS_KEY not in state_port_table_fvs_dict:
state_port_table.set(lport, [(NPU_SI_SETTINGS_SYNC_STATUS_KEY,
NPU_SI_SETTINGS_DEFAULT_VALUE)])
self.log_notice("Port init control: Initialized NPU_SI_SETTINGS_SYNC_STATUS for lport {}".format(lport))

self.log_notice("XCVRD INIT: Port init control fields initialized in STATE_DB PORT_TABLE")

# Initialize daemon
def init(self):
Expand Down Expand Up @@ -2490,7 +2516,11 @@ def init(self):
self.wait_for_port_config_done(namespace)

self.log_notice("XCVRD INIT: After port config is done")
return port_event_helper.get_port_mapping(self.namespaces)
port_mapping_data = port_event_helper.get_port_mapping(self.namespaces)

self.initialize_port_init_control_fields_in_port_table(port_mapping_data)

return port_mapping_data

# Deinitialize daemon
def deinit(self):
Expand All @@ -2508,7 +2538,9 @@ def deinit(self):
helper_logger.log_warning("Got invalid asic index for {}, ignored".format(logical_port_name))
continue

intf_tbl = self.xcvr_table_helper.get_intf_tbl(asic_index) if not is_warm_fast_reboot else None
# Skip deleting intf_tbl for avoiding OA to trigger Tx disable signal
# due to TRANSCEIVER_INFO table deletion during xcvrd shutdown/crash
intf_tbl = None

del_port_sfp_dom_info_from_db(logical_port_name, port_mapping_data,
intf_tbl,
Expand Down
23 changes: 20 additions & 3 deletions sonic-xcvrd/xcvrd/xcvrd_utilities/media_settings_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sonic_py_common import device_info, logger
from swsscommon import swsscommon
from xcvrd import xcvrd
from .xcvr_table_helper import *

g_dict = {}

Expand All @@ -24,6 +25,7 @@
SYSLOG_IDENTIFIER = "xcvrd"
helper_logger = logger.Logger(SYSLOG_IDENTIFIER)

PHYSICAL_PORT_NOT_EXIST = -1

def load_media_settings():
global g_dict
Expand Down Expand Up @@ -303,12 +305,24 @@ def get_speed_lane_count_and_subport(port, cfg_port_tbl):


def notify_media_setting(logical_port_name, transceiver_dict,
app_port_tbl, cfg_port_tbl, port_mapping):
xcvr_table_helper, port_mapping):

if not media_settings_present():
return

port_speed, lane_count, subport_num = get_speed_lane_count_and_subport(logical_port_name, cfg_port_tbl)
if not xcvr_table_helper:
helper_logger.log_error("Notify media setting: xcvr_table_helper "
"not initialized for lport {}".format(logical_port_name))
return

if not xcvr_table_helper.is_npu_si_settings_update_required(logical_port_name, port_mapping):
helper_logger.log_notice("Notify media setting: Media settings already "
"notified for lport {}".format(logical_port_name))
return

asic_index = port_mapping.get_asic_id_for_logical_port(logical_port_name)

port_speed, lane_count, subport_num = get_speed_lane_count_and_subport(logical_port_name, xcvr_table_helper.get_cfg_port_tbl(asic_index))

ganged_port = False
ganged_member_num = 1
Expand Down Expand Up @@ -354,4 +368,7 @@ def notify_media_setting(logical_port_name, transceiver_dict,
fvs[index] = (str(media_key), str(val_str))
index += 1

app_port_tbl.set(port_name, fvs)
xcvr_table_helper.get_app_port_tbl(asic_index).set(port_name, fvs)
xcvr_table_helper.get_state_port_tbl(asic_index).set(logical_port_name, [(NPU_SI_SETTINGS_SYNC_STATUS_KEY, NPU_SI_SETTINGS_NOTIFIED_VALUE)])
helper_logger.log_notice("Notify media setting: Published ASIC-side SI setting "
"for lport {} in APP_DB".format(logical_port_name))
69 changes: 68 additions & 1 deletion sonic-xcvrd/xcvrd/xcvrd_utilities/xcvr_table_helper.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
try:
from sonic_py_common import daemon_base
from sonic_py_common import daemon_base, logger
from sonic_py_common import multi_asic
from swsscommon import swsscommon
except ImportError as e:
raise ImportError(str(e) + " - required module not found")

SYSLOG_IDENTIFIER = "xcvrd"
helper_logger = logger.Logger(SYSLOG_IDENTIFIER)

TRANSCEIVER_INFO_TABLE = 'TRANSCEIVER_INFO'
TRANSCEIVER_FIRMWARE_INFO_TABLE = 'TRANSCEIVER_FIRMWARE_INFO'
TRANSCEIVER_DOM_SENSOR_TABLE = 'TRANSCEIVER_DOM_SENSOR'
TRANSCEIVER_DOM_THRESHOLD_TABLE = 'TRANSCEIVER_DOM_THRESHOLD'
TRANSCEIVER_STATUS_TABLE = 'TRANSCEIVER_STATUS'
TRANSCEIVER_PM_TABLE = 'TRANSCEIVER_PM'

NPU_SI_SETTINGS_SYNC_STATUS_KEY = 'NPU_SI_SETTINGS_SYNC_STATUS'
NPU_SI_SETTINGS_DEFAULT_VALUE = 'NPU_SI_SETTINGS_DEFAULT'
NPU_SI_SETTINGS_NOTIFIED_VALUE = 'NPU_SI_SETTINGS_NOTIFIED'

class XcvrTableHelper:
def __init__(self, namespaces):
self.int_tbl, self.dom_tbl, self.dom_threshold_tbl, self.status_tbl, self.app_port_tbl, \
Expand Down Expand Up @@ -62,3 +69,63 @@ def get_cfg_port_tbl(self, asic_id):

def get_state_port_tbl(self, asic_id):
return self.state_port_tbl[asic_id]

def get_state_db_port_table_val_by_key(self, lport, port_mapping, key):
"""
Retrieves the value of a key from STATE_DB PORT_TABLE|<lport> for the given logical port
Args:
lport:
logical port name
port_mapping:
A PortMapping object
key:
key for the corresponding value to be retrieved
Returns:
The value of the key if the key is found in STATE_DB PORT_TABLE|<lport>
None otherwise
"""

if port_mapping is None:
helper_logger.log_error("Get value by key from STATE_DB: port_mapping is None "
"for lport {}".format(lport))
return None

asic_index = port_mapping.get_asic_id_for_logical_port(lport)
state_port_table = self.get_state_port_tbl(asic_index)
if state_port_table is None:
helper_logger.log_error("Get value by key from STATE_DB: state_db is None with asic_index {} "
"for lport {}".format(asic_index, lport))
return None

found, state_port_table_fvs = state_port_table.get(lport)
if not found:
helper_logger.log_error("Get value by key from STATE_DB: Unable to find lport {}".format(lport))
return None

state_port_table_fvs_dict = dict(state_port_table_fvs)
if key not in state_port_table_fvs_dict:
helper_logger.log_error("Get value by key from STATE_DB: Unable to find key {} "
"state_port_table_fvs_dict {} for lport {}".format(key, state_port_table_fvs_dict, lport))
return None

return state_port_table_fvs_dict[key]

def is_npu_si_settings_update_required(self, lport, port_mapping):
"""
Checks if NPU SI settings update is required for a module
Args:
lport:
logical port name
port_mapping:
A PortMapping object
Returns:
True if NPU_SI_SETTINGS_SYNC_STATUS_KEY is
- not present/accessible from STATE_DB or
- set to NPU_SI_SETTINGS_DEFAULT_VALUE
False otherwise
"""
npu_si_settings_sync_val = self.get_state_db_port_table_val_by_key(lport,
port_mapping, NPU_SI_SETTINGS_SYNC_STATUS_KEY)

# If npu_si_settings_sync_val is None, it can also mean that the key is not present in the table
return npu_si_settings_sync_val is None or npu_si_settings_sync_val == NPU_SI_SETTINGS_DEFAULT_VALUE

0 comments on commit 0dc97ce

Please sign in to comment.