Skip to content

Commit

Permalink
[thermalctld] Enable stopping thermal manager (sonic-net#180)
Browse files Browse the repository at this point in the history
Make thermalctld exit as soon as possible. Depends on sonic-net/sonic-platform-common#187

During config reload flow, supervisord sends SIGTERM to thermalctld. However, if thermalctld cannot exit in 10 seconds, supervisord sends SITKILL to thermalctld. In this case, sub process of thermalctld might still stay in memory as a zombie process. This PR is aimed to fix this.
  • Loading branch information
Junchao-Mellanox authored May 10, 2021
1 parent 665fcd9 commit cc3803f
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 25 deletions.
28 changes: 24 additions & 4 deletions sonic-thermalctld/scripts/thermalctld
Original file line number Diff line number Diff line change
Expand Up @@ -191,14 +191,15 @@ class FanUpdater(logger.Logger):
FAN_INFO_TABLE_NAME = 'FAN_INFO'
FAN_DRAWER_INFO_TABLE_NAME = 'FAN_DRAWER_INFO'

def __init__(self, chassis):
def __init__(self, chassis, task_stopping_event):
"""
Initializer for FanUpdater
:param chassis: Object representing a platform chassis
"""
super(FanUpdater, self).__init__(SYSLOG_IDENTIFIER)

self.chassis = chassis
self.task_stopping_event = task_stopping_event
self.fan_status_dict = {}
state_db = daemon_base.db_connect("STATE_DB")
self.table = swsscommon.Table(state_db, FanUpdater.FAN_INFO_TABLE_NAME)
Expand Down Expand Up @@ -236,15 +237,21 @@ class FanUpdater(logger.Logger):
FanStatus.reset_fan_counter()

for drawer_index, drawer in enumerate(self.chassis.get_all_fan_drawers()):
if self.task_stopping_event.is_set():
return
self._refresh_fan_drawer_status(drawer, drawer_index)
for fan_index, fan in enumerate(drawer.get_all_fans()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_fan_status(drawer, drawer_index, fan, fan_index)
except Exception as e:
self.log_warning('Failed to update fan status - {}'.format(repr(e)))

for psu_index, psu in enumerate(self.chassis.get_all_psus()):
for fan_index, fan in enumerate(psu.get_all_fans()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_fan_status(psu, psu_index, fan, fan_index, True)
except Exception as e:
Expand Down Expand Up @@ -396,6 +403,8 @@ class FanUpdater(logger.Logger):

def _update_led_color(self):
for fan_name, fan_status in self.fan_status_dict.items():
if self.task_stopping_event.is_set():
return
try:
fvs = swsscommon.FieldValuePairs([
('led_status', str(try_get(fan_status.fan.get_status_led)))
Expand All @@ -408,6 +417,8 @@ class FanUpdater(logger.Logger):
self.table.set(fan_name, fvs)

for drawer in self.chassis.get_all_fan_drawers():
if self.task_stopping_event.is_set():
return
drawer_name = try_get(drawer.get_name)
if drawer_name == NOT_AVAILABLE:
continue
Expand Down Expand Up @@ -510,14 +521,15 @@ class TemperatureUpdater(logger.Logger):
# Temperature information table name in database
TEMPER_INFO_TABLE_NAME = 'TEMPERATURE_INFO'

def __init__(self, chassis):
def __init__(self, chassis, task_stopping_event):
"""
Initializer of TemperatureUpdater
:param chassis: Object representing a platform chassis
"""
super(TemperatureUpdater, self).__init__(SYSLOG_IDENTIFIER)

self.chassis = chassis
self.task_stopping_event = task_stopping_event
self.temperature_status_dict = {}
state_db = daemon_base.db_connect("STATE_DB")
self.table = swsscommon.Table(state_db, TemperatureUpdater.TEMPER_INFO_TABLE_NAME)
Expand Down Expand Up @@ -562,6 +574,8 @@ class TemperatureUpdater(logger.Logger):
"""
self.log_debug("Start temperature updating")
for index, thermal in enumerate(self.chassis.get_all_thermals()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index)
except Exception as e:
Expand All @@ -570,6 +584,8 @@ class TemperatureUpdater(logger.Logger):
for psu_index, psu in enumerate(self.chassis.get_all_psus()):
parent_name = 'PSU {}'.format(psu_index + 1)
for thermal_index, thermal in enumerate(psu.get_all_thermals()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_temperature_status(parent_name, thermal, thermal_index)
except Exception as e:
Expand All @@ -578,6 +594,8 @@ class TemperatureUpdater(logger.Logger):
for sfp_index, sfp in enumerate(self.chassis.get_all_sfps()):
parent_name = 'SFP {}'.format(sfp_index + 1)
for thermal_index, thermal in enumerate(sfp.get_all_thermals()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_temperature_status(parent_name, thermal, thermal_index)
except Exception as e:
Expand Down Expand Up @@ -686,8 +704,8 @@ class ThermalMonitor(ProcessTaskBase):
# Set minimum logging level to INFO
self.logger.set_min_log_priority_info()

self.fan_updater = FanUpdater(chassis)
self.temperature_updater = TemperatureUpdater(chassis)
self.fan_updater = FanUpdater(chassis, self.task_stopping_event)
self.temperature_updater = TemperatureUpdater(chassis, self.task_stopping_event)

def main(self):
begin = time.time()
Expand Down Expand Up @@ -789,6 +807,8 @@ class ThermalControlDaemon(daemon_base.DaemonBase):
if sig in FATAL_SIGNALS:
self.log_info("Caught signal '{}' - exiting...".format(SIGNALS_TO_NAMES_DICT[sig]))
exit_code = 128 + sig # Make sure we exit with a non-zero code so that supervisor will try to restart us
self.thermal_monitor.task_stop()
self.thermal_manager.stop()
self.stop_event.set()
elif sig in NONFATAL_SIGNALS:
self.log_info("Caught signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig]))
Expand Down
52 changes: 31 additions & 21 deletions sonic-thermalctld/tests/test_thermalctld.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import sys
import multiprocessing
from imp import load_source # TODO: Replace with importlib once we no longer need to support Python 2

# TODO: Clean this up once we no longer need to support Python 2
Expand Down Expand Up @@ -148,7 +149,7 @@ class TestFanUpdater(object):
Test cases to cover functionality in FanUpdater class
"""
def test_deinit(self):
fan_updater = thermalctld.FanUpdater(MockChassis())
fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event())
fan_updater.fan_status_dict = {'key1': 'value1', 'key2': 'value2'}
fan_updater.table._del = mock.MagicMock()

Expand All @@ -161,7 +162,7 @@ def test_deinit(self):
@mock.patch('thermalctld.update_entity_info', mock.MagicMock())
def test_refresh_fan_drawer_status_fan_drawer_get_name_not_impl(self):
# Test case where fan_drawer.get_name is not implemented
fan_updater = thermalctld.FanUpdater(MockChassis())
fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event())
mock_fan_drawer = mock.MagicMock()
fan_updater._refresh_fan_drawer_status(mock_fan_drawer, 1)
assert thermalctld.update_entity_info.call_count == 0
Expand All @@ -175,7 +176,7 @@ def test_update_fan_with_exception(self):
fan.make_over_speed()
chassis.get_all_fans().append(fan)

fan_updater = thermalctld.FanUpdater(chassis)
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
fan_updater.update()
assert fan.get_status_led() == MockFan.STATUS_LED_COLOR_RED
assert fan_updater.log_warning.call_count == 1
Expand All @@ -192,15 +193,15 @@ def test_set_fan_led_exception(self):
mock_fan = MockFan()
mock_fan.set_status_led = mock.MagicMock(side_effect=NotImplementedError)

fan_updater = thermalctld.FanUpdater(MockChassis())
fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event())
fan_updater._set_fan_led(mock_fan_drawer, mock_fan, 'Test Fan', fan_status)
assert fan_updater.log_warning.call_count == 1
fan_updater.log_warning.assert_called_with('Failed to set status LED for fan Test Fan, set_status_led not implemented')

def test_fan_absent(self):
chassis = MockChassis()
chassis.make_absent_fan()
fan_updater = thermalctld.FanUpdater(chassis)
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
fan_updater.update()
fan_list = chassis.get_all_fans()
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
Expand All @@ -224,7 +225,7 @@ def test_fan_absent(self):
def test_fan_faulty(self):
chassis = MockChassis()
chassis.make_faulty_fan()
fan_updater = thermalctld.FanUpdater(chassis)
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
fan_updater.update()
fan_list = chassis.get_all_fans()
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
Expand All @@ -248,7 +249,7 @@ def test_fan_faulty(self):
def test_fan_under_speed(self):
chassis = MockChassis()
chassis.make_under_speed_fan()
fan_updater = thermalctld.FanUpdater(chassis)
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
fan_updater.update()
fan_list = chassis.get_all_fans()
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
Expand All @@ -264,7 +265,7 @@ def test_fan_under_speed(self):
def test_fan_over_speed(self):
chassis = MockChassis()
chassis.make_over_speed_fan()
fan_updater = thermalctld.FanUpdater(chassis)
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
fan_updater.update()
fan_list = chassis.get_all_fans()
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
Expand All @@ -283,7 +284,7 @@ def test_update_psu_fans(self):
mock_fan = MockFan()
psu._fan_list.append(mock_fan)
chassis._psu_list.append(psu)
fan_updater = thermalctld.FanUpdater(chassis)
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
fan_updater.update()
assert fan_updater.log_warning.call_count == 0

Expand Down Expand Up @@ -331,7 +332,7 @@ def test_insufficient_fan_number():
chassis = MockChassis()
chassis.make_absent_fan()
chassis.make_faulty_fan()
fan_updater = thermalctld.FanUpdater(chassis)
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
fan_updater.update()
assert fan_updater.log_warning.call_count == 3
expected_calls = [
Expand Down Expand Up @@ -415,19 +416,20 @@ class TestTemperatureUpdater(object):
"""
def test_deinit(self):
chassis = MockChassis()
temp_updater = thermalctld.TemperatureUpdater(chassis)
temp_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
temp_updater.temperature_status_dict = {'key1': 'value1', 'key2': 'value2'}
temp_updater.table._del = mock.MagicMock()

temp_updater.deinit()
assert temp_updater.table._del.call_count == 2
expected_calls = [mock.call('key1'), mock.call('key2')]
temp_updater.table._del.assert_has_calls(expected_calls, any_order=True)


def test_over_temper(self):
chassis = MockChassis()
chassis.make_over_temper_thermal()
temperature_updater = thermalctld.TemperatureUpdater(chassis)
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
temperature_updater.update()
thermal_list = chassis.get_all_thermals()
assert temperature_updater.log_warning.call_count == 1
Expand All @@ -441,7 +443,7 @@ def test_over_temper(self):
def test_under_temper(self):
chassis = MockChassis()
chassis.make_under_temper_thermal()
temperature_updater = thermalctld.TemperatureUpdater(chassis)
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
temperature_updater.update()
thermal_list = chassis.get_all_thermals()
assert temperature_updater.log_warning.call_count == 1
Expand All @@ -458,7 +460,7 @@ def test_update_psu_thermals(self):
mock_thermal = MockThermal()
psu._thermal_list.append(mock_thermal)
chassis._psu_list.append(psu)
temperature_updater = thermalctld.TemperatureUpdater(chassis)
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 0

Expand All @@ -478,7 +480,7 @@ def test_update_sfp_thermals(self):
mock_thermal = MockThermal()
sfp._thermal_list.append(mock_thermal)
chassis._sfp_list.append(sfp)
temperature_updater = thermalctld.TemperatureUpdater(chassis)
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 0

Expand All @@ -499,7 +501,7 @@ def test_update_thermal_with_exception(self):
thermal.make_over_temper()
chassis.get_all_thermals().append(thermal)

temperature_updater = thermalctld.TemperatureUpdater(chassis)
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 2

Expand All @@ -524,17 +526,17 @@ def test_updater_thermal_check_modular_chassis():
chassis = MockChassis()
assert chassis.is_modular_chassis() == False

temperature_updater = thermalctld.TemperatureUpdater(chassis)
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
assert temperature_updater.chassis_table == None

chassis.set_modular_chassis(True)
chassis.set_my_slot(-1)
temperature_updater = thermalctld.TemperatureUpdater(chassis)
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
assert temperature_updater.chassis_table == None

my_slot = 1
chassis.set_my_slot(my_slot)
temperature_updater = thermalctld.TemperatureUpdater(chassis)
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
assert temperature_updater.chassis_table != None
assert temperature_updater.chassis_table.table_name == '{}_{}'.format(TEMPER_INFO_TABLE_NAME, str(my_slot))

Expand All @@ -547,7 +549,7 @@ def test_updater_thermal_check_chassis_table():

chassis.set_modular_chassis(True)
chassis.set_my_slot(1)
temperature_updater = thermalctld.TemperatureUpdater(chassis)
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())

temperature_updater.update()
assert temperature_updater.chassis_table.get_size() == chassis.get_num_thermals()
Expand All @@ -566,7 +568,7 @@ def test_updater_thermal_check_min_max():

chassis.set_modular_chassis(True)
chassis.set_my_slot(1)
temperature_updater = thermalctld.TemperatureUpdater(chassis)
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())

temperature_updater.update()
slot_dict = temperature_updater.chassis_table.get(thermal.get_name())
Expand All @@ -580,26 +582,30 @@ def test_signal_handler():
daemon_thermalctld.stop_event.set = mock.MagicMock()
daemon_thermalctld.log_info = mock.MagicMock()
daemon_thermalctld.log_warning = mock.MagicMock()
daemon_thermalctld.thermal_manager.stop = mock.MagicMock()
daemon_thermalctld.signal_handler(thermalctld.signal.SIGHUP, None)
daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert
assert daemon_thermalctld.log_info.call_count == 1
daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGHUP' - ignoring...")
assert daemon_thermalctld.log_warning.call_count == 0
assert daemon_thermalctld.stop_event.set.call_count == 0
assert daemon_thermalctld.thermal_manager.stop.call_count == 0
assert thermalctld.exit_code == thermalctld.ERR_UNKNOWN

# Test SIGINT
daemon_thermalctld = thermalctld.ThermalControlDaemon()
daemon_thermalctld.stop_event.set = mock.MagicMock()
daemon_thermalctld.log_info = mock.MagicMock()
daemon_thermalctld.log_warning = mock.MagicMock()
daemon_thermalctld.thermal_manager.stop = mock.MagicMock()
test_signal = thermalctld.signal.SIGINT
daemon_thermalctld.signal_handler(test_signal, None)
daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert
assert daemon_thermalctld.log_info.call_count == 1
daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGINT' - exiting...")
assert daemon_thermalctld.log_warning.call_count == 0
assert daemon_thermalctld.stop_event.set.call_count == 1
assert daemon_thermalctld.thermal_manager.stop.call_count == 1
assert thermalctld.exit_code == (128 + test_signal)

# Test SIGTERM
Expand All @@ -608,13 +614,15 @@ def test_signal_handler():
daemon_thermalctld.stop_event.set = mock.MagicMock()
daemon_thermalctld.log_info = mock.MagicMock()
daemon_thermalctld.log_warning = mock.MagicMock()
daemon_thermalctld.thermal_manager.stop = mock.MagicMock()
test_signal = thermalctld.signal.SIGTERM
daemon_thermalctld.signal_handler(test_signal, None)
daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert
assert daemon_thermalctld.log_info.call_count == 1
daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGTERM' - exiting...")
assert daemon_thermalctld.log_warning.call_count == 0
assert daemon_thermalctld.stop_event.set.call_count == 1
assert daemon_thermalctld.thermal_manager.stop.call_count == 1
assert thermalctld.exit_code == (128 + test_signal)

# Test an unhandled signal
Expand All @@ -623,12 +631,14 @@ def test_signal_handler():
daemon_thermalctld.stop_event.set = mock.MagicMock()
daemon_thermalctld.log_info = mock.MagicMock()
daemon_thermalctld.log_warning = mock.MagicMock()
daemon_thermalctld.thermal_manager.stop = mock.MagicMock()
daemon_thermalctld.signal_handler(thermalctld.signal.SIGUSR1, None)
daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert
assert daemon_thermalctld.log_warning.call_count == 1
daemon_thermalctld.log_warning.assert_called_with("Caught unhandled signal 'SIGUSR1' - ignoring...")
assert daemon_thermalctld.log_info.call_count == 0
assert daemon_thermalctld.stop_event.set.call_count == 0
assert daemon_thermalctld.thermal_manager.stop.call_count == 0
assert thermalctld.exit_code == thermalctld.ERR_UNKNOWN


Expand Down

0 comments on commit cc3803f

Please sign in to comment.