-
Notifications
You must be signed in to change notification settings - Fork 86
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding support for persistent storage and retrieval of DPU reboot-cause #169
base: master
Are you sure you want to change the base?
Changes from 46 commits
be55f8e
3cd7b67
0e47f97
1feeb9f
210dd14
4c0fa72
807a267
766a677
00496b5
b0b89c4
667ec45
97ff55c
0cca074
d97d228
1d0650f
17345aa
093bf00
c28e29c
887897d
ede90c0
6b6b6b9
e44cd1f
a98c06d
6d21142
bcba133
dffedc8
fefff1a
3293749
b61c2d9
0e4d23a
decce6c
499d551
e9187a0
82fc7fd
15a70fa
7a0d3d8
1a55a59
74c38ae
2dfbc33
0702dee
ddf4541
3e2114f
0be0072
93a1dff
79ed240
6d41638
d6c4f94
3bae343
c48efd9
31aba69
a19e2ff
6ad5432
95dff75
5f9859a
67a1f07
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -14,10 +14,12 @@ try: | |||||
|
||||||
from swsscommon import swsscommon | ||||||
from sonic_py_common import syslogger | ||||||
from sonic_py_common import device_info | ||||||
except ImportError as err: | ||||||
raise ImportError("%s - required module not found" % str(err)) | ||||||
|
||||||
VERSION = "1.0" | ||||||
CHASSIS_SERVER_PORT = 6380 | ||||||
|
||||||
SYSLOG_IDENTIFIER = "process-reboot-cause" | ||||||
|
||||||
|
@@ -28,6 +30,7 @@ USER_ISSUED_REBOOT_CAUSE_REGEX ="User issued \'{}\' command [User: {}, Time: {}] | |||||
|
||||||
REBOOT_CAUSE_UNKNOWN = "Unknown" | ||||||
REBOOT_CAUSE_TABLE_NAME = "REBOOT_CAUSE" | ||||||
MAX_HISTORY_FILES = 10 | ||||||
|
||||||
REDIS_HOSTIP = "127.0.0.1" | ||||||
state_db = None | ||||||
|
@@ -69,6 +72,55 @@ def read_reboot_cause_files_and_save_state_db(): | |||||
x = TIME_SORTED_FULL_REBOOT_FILE_LIST[i] | ||||||
os.remove(x) | ||||||
|
||||||
def get_sorted_reboot_cause_files(dpu_history_path): | ||||||
"""Retrieve and sort the reboot cause files for a specific DPU.""" | ||||||
try: | ||||||
files = os.listdir(dpu_history_path) | ||||||
sorted_files = sorted( | ||||||
[os.path.join(dpu_history_path, f) for f in files if f.endswith('.txt')], | ||||||
key=os.path.getmtime, | ||||||
reverse=True # Most recent first | ||||||
) | ||||||
return sorted_files | ||||||
except Exception as e: | ||||||
sonic_logger.log_error(f"Error retrieving reboot cause files for {dpu_history_path}: {e}") | ||||||
return [] | ||||||
|
||||||
|
||||||
def update_dpu_reboot_cause_to_chassis_state_db(): | ||||||
"""Retrieve reboot cause from history files and save them to chassisStateDB.""" | ||||||
chassis_state_db = swsscommon.SonicV2Connector(host="redis_chassis.server", port=CHASSIS_SERVER_PORT) | ||||||
chassis_state_db.connect(chassis_state_db.CHASSIS_STATE_DB) | ||||||
|
||||||
try: | ||||||
dpus = device_info.get_dpu_list() | ||||||
|
||||||
for dpu in dpus: | ||||||
# Get sorted reboot cause files for the DPU | ||||||
dpu_history_dir = os.path.join('/host/reboot-cause/module', dpu , 'history') | ||||||
reboot_files = get_sorted_reboot_cause_files(dpu_history_dir) | ||||||
|
||||||
for reboot_file in reboot_files: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @rameshraghupathy How do we handle a case where NPU comes late so that DPU to NPU mid plane is not UP by the time this process starts? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @prgeor As shown in the HLD the NPU-chassisd will fetch the reboot-cause from the DPU and persist it. |
||||||
if os.path.isfile(reboot_file): | ||||||
with open(reboot_file, "r") as cause_file: | ||||||
try: | ||||||
data = json.load(cause_file) | ||||||
# Ensure keys exist | ||||||
if 'name' not in data: | ||||||
sonic_logger.log_warning(f"Missing 'time' in data from {reboot_file}") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @gpunathilell Done |
||||||
continue # Skip this file | ||||||
|
||||||
_hash = f"{REBOOT_CAUSE_TABLE_NAME}|{dpu.upper()}|{data['name']}" | ||||||
chassis_state_db.set(chassis_state_db.CHASSIS_STATE_DB, _hash, 'cause', data.get('cause', '')) | ||||||
chassis_state_db.set(chassis_state_db.CHASSIS_STATE_DB, _hash, 'time', data.get('time', '')) | ||||||
chassis_state_db.set(chassis_state_db.CHASSIS_STATE_DB, _hash, 'user', data.get('user', '')) | ||||||
chassis_state_db.set(chassis_state_db.CHASSIS_STATE_DB, _hash, 'comment', data.get('comment', '')) | ||||||
|
||||||
except json.decoder.JSONDecodeError as je: | ||||||
sonic_logger.log_info(f"Unable to process reboot-cause file {reboot_file}: {je}") | ||||||
continue # Skip this file | ||||||
except Exception as e: | ||||||
sonic_logger.log_err(f"Error reading DPU reboot causes: {e}") | ||||||
|
||||||
def main(): | ||||||
# Configure logger to log all messages INFO level and higher | ||||||
|
@@ -99,6 +151,10 @@ def main(): | |||||
# Read the previous reboot cause from saved reboot-cause files and save the previous reboot cause upto 10 entry to the state db | ||||||
read_reboot_cause_files_and_save_state_db() | ||||||
|
||||||
# For smartswitch platform store the DPU reboot-cause to CHASSIS_STATE_DB | ||||||
if device_info.is_smartswitch(): | ||||||
update_dpu_reboot_cause_to_chassis_state_db() | ||||||
|
||||||
|
||||||
if __name__ == "__main__": | ||||||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
import os | ||
import shutil | ||
import pytest | ||
import json | ||
|
||
from swsscommon import swsscommon | ||
from sonic_py_common.general import load_module_from_source | ||
|
@@ -33,6 +34,8 @@ | |
determine_reboot_cause_path = os.path.join(scripts_path, 'determine-reboot-cause') | ||
determine_reboot_cause = load_module_from_source('determine_reboot_cause', determine_reboot_cause_path) | ||
|
||
# Gte the function to create dpu dir | ||
vvolam marked this conversation as resolved.
Show resolved
Hide resolved
|
||
check_and_create_dpu_dirs = determine_reboot_cause.check_and_create_dpu_dirs | ||
|
||
PROC_CMDLINE_CONTENTS = """\ | ||
BOOT_IMAGE=/image-20191130.52/boot/vmlinuz-4.9.0-11-2-amd64 root=/dev/sda4 rw console=tty0 console=ttyS1,9600n8 quiet net.ifnames=0 biosdevname=0 loop=image-20191130.52/fs.squashfs loopfstype=squashfs apparmor=1 security=apparmor varlog_size=4096 usbcore.autosuspend=-1 module_blacklist=gpio_ich SONIC_BOOT_TYPE=warm""" | ||
|
@@ -71,6 +74,8 @@ | |
EXPECTED_KERNEL_PANIC_REBOOT_CAUSE_DICT = {'comment': '', 'gen_time': '2021_3_28_13_48_49', 'cause': 'Kernel Panic', 'user': 'N/A', 'time': 'Sun Mar 28 13:45:12 UTC 2021'} | ||
|
||
REBOOT_CAUSE_DIR="host/reboot-cause/" | ||
PLATFORM_JSON_PATH = "/usr/share/sonic/device/test_platform/platform.json" | ||
REBOOT_CAUSE_MODULE_DIR = "host/reboot-cause" | ||
|
||
class TestDetermineRebootCause(object): | ||
def test_parse_warmfast_reboot_from_proc_cmdline(self): | ||
|
@@ -199,3 +204,36 @@ def test_determine_reboot_cause_main_with_reboot_cause_dir(self): | |
determine_reboot_cause.main() | ||
assert os.path.exists("host/reboot-cause/reboot-cause.txt") == True | ||
assert os.path.exists("host/reboot-cause/previous-reboot-cause.json") == True | ||
|
||
def create_mock_platform_json(self, dpus): | ||
"""Helper function to create a mock platform.json file.""" | ||
os.makedirs(os.path.dirname(PLATFORM_JSON_PATH), exist_ok=True) | ||
with open(PLATFORM_JSON_PATH, "w") as f: | ||
json.dump({"DPUS": dpus}, f) | ||
|
||
@mock.patch('sonic_py_common.device_info.is_smartswitch', return_value=True) | ||
@mock.patch('sonic_py_common.device_info.get_platform', return_value='some_platform') | ||
def test_check_and_create_dpu_dirs(self, mock_get_platform, mock_is_smartswitch): | ||
# Call the function under test | ||
result = check_and_create_dpu_dirs() | ||
|
||
@mock.patch('sonic_py_common.device_info.get_platform_info', return_value={'platform': 'some_platform'}) | ||
@mock.patch('sonic_py_common.device_info.is_smartswitch', return_value=True) | ||
@mock.patch('os.path.exists') | ||
@mock.patch('builtins.open', new_callable=mock.mock_open, read_data='{"DPUS": ["dpu0", "dpu1"]}') | ||
@mock.patch('os.makedirs') | ||
def test_check_and_create_dpu_dirs_with_platform_json(self, mock_makedirs, mock_open, mock_exists, mock_is_smartswitch, mock_get_platform_info): | ||
# Mock the platform.json existence | ||
mock_exists.side_effect = lambda path: path == "/usr/share/sonic/device/some_platform/platform.json" | ||
|
||
# Call the function under test | ||
check_and_create_dpu_dirs() | ||
|
||
# Assert that open was called correctly | ||
mock_open.assert_any_call("/usr/share/sonic/device/some_platform/platform.json", 'r') | ||
mock_open.assert_any_call('/host/reboot-cause/module/dpu0/reboot-cause.txt', 'w') | ||
mock_open.assert_any_call('/host/reboot-cause/module/dpu1/reboot-cause.txt', 'w') | ||
|
||
# Assert that makedirs was called for the DPU directories | ||
mock_makedirs.assert_any_call(os.path.join('/host/reboot-cause/module', 'dpu0')) | ||
mock_makedirs.assert_any_call(os.path.join('/host/reboot-cause/module', 'dpu1')) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @oleksandrivantsiv will do but can we defer it as the entire files doe not have a test yet. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @rameshraghupathy I think comment is addressed and test is added, right? please confirm. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @oleksandrivantsiv Done. Added "process-reboot-cause_test.py" and test_update_dpu_reboot_cause_to_chassis_state_db_update to cover this case. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
import sys | ||
import os | ||
from unittest import TestCase | ||
from unittest.mock import patch, MagicMock, mock_open | ||
from io import StringIO | ||
from sonic_py_common.general import load_module_from_source | ||
|
||
# Mock the connector | ||
from .mock_connector import MockConnector | ||
import swsscommon | ||
|
||
# Mock the SonicV2Connector | ||
swsscommon.SonicV2Connector = MockConnector | ||
|
||
# Define the path to the script and load it using the helper function | ||
test_path = os.path.dirname(os.path.abspath(__file__)) | ||
modules_path = os.path.dirname(test_path) | ||
scripts_path = os.path.join(modules_path, "scripts") | ||
sys.path.insert(0, modules_path) | ||
|
||
# Load the process-reboot-cause module using the helper function | ||
process_reboot_cause_path = os.path.join(scripts_path, "process-reboot-cause") | ||
process_reboot_cause = load_module_from_source('process_reboot_cause', process_reboot_cause_path) | ||
|
||
# Now proceed with your test class and mocks | ||
class TestProcessRebootCause(TestCase): | ||
@patch("builtins.open", new_callable=mock_open, read_data='{"cause": "PowerLoss", "user": "admin", "time": "2024-12-10", "comment": "test"}') | ||
@patch("os.listdir", return_value=["file1.json", "file2.json"]) | ||
@patch("os.path.isfile", return_value=True) | ||
@patch("os.path.exists", side_effect=lambda path: path.endswith('file1.json') or path.endswith('file2.json')) | ||
@patch("os.remove") | ||
@patch("process_reboot_cause.swsscommon.SonicV2Connector") | ||
@patch("process_reboot_cause.device_info.is_smartswitch", return_value=True) | ||
@patch("sys.stdout", new_callable=StringIO) | ||
@patch("os.geteuid", return_value=0) | ||
def test_process_reboot_cause(self, mock_geteuid, mock_stdout, mock_is_smartswitch, mock_connector, mock_remove, mock_exists, mock_isfile, mock_listdir, mock_open): | ||
# Mock DB | ||
mock_db = MagicMock() | ||
mock_connector.return_value = mock_db | ||
|
||
# Simulate running the script | ||
with patch.object(sys, "argv", ["process-reboot-cause"]): | ||
process_reboot_cause.main() | ||
|
||
# Validate syslog and stdout logging | ||
output = mock_stdout.getvalue() | ||
|
||
# Verify DB interactions | ||
mock_db.connect.assert_called() | ||
|
||
@patch("builtins.open", new_callable=mock_open, read_data='{"invalid_json}') | ||
@patch("os.listdir", return_value=["file1.json"]) | ||
@patch("os.path.isfile", return_value=True) | ||
@patch("os.path.exists", side_effect=lambda path: path.endswith('file1.json')) | ||
@patch("process_reboot_cause.swsscommon.SonicV2Connector") | ||
@patch("process_reboot_cause.device_info.is_smartswitch", return_value=True) | ||
@patch("sys.stdout", new_callable=StringIO) | ||
@patch("os.geteuid", return_value=0) | ||
def test_invalid_json(self, mock_geteuid, mock_stdout, mock_is_smartswitch, mock_connector, mock_exists, mock_isfile, mock_listdir, mock_open): | ||
# Mock DB | ||
mock_db = MagicMock() | ||
mock_connector.return_value = mock_db | ||
|
||
# Simulate running the script | ||
with patch.object(sys, "argv", ["process-reboot-cause"]): | ||
process_reboot_cause.main() | ||
|
||
# Check invalid JSON handling | ||
output = mock_stdout.getvalue() | ||
self.assertTrue(mock_connector.called) | ||
|
||
# Test get_sorted_reboot_cause_files | ||
@patch("process_reboot_cause.os.listdir") | ||
@patch("process_reboot_cause.os.path.getmtime") | ||
def test_get_sorted_reboot_cause_files_success(self, mock_getmtime, mock_listdir): | ||
# Setup mock data | ||
mock_listdir.return_value = ["file1.txt", "file2.txt", "file3.txt"] | ||
mock_getmtime.side_effect = [100, 200, 50] # Mock modification times | ||
|
||
# Call the function | ||
result = process_reboot_cause.get_sorted_reboot_cause_files("/mock/dpu_history") | ||
|
||
# Assert the files are sorted by modification time in descending order | ||
self.assertEqual(result, [ | ||
"/mock/dpu_history/file2.txt", | ||
"/mock/dpu_history/file1.txt", | ||
"/mock/dpu_history/file3.txt" | ||
]) | ||
|
||
@patch("process_reboot_cause.os.listdir") | ||
def test_get_sorted_reboot_cause_files_error(self, mock_listdir): | ||
# Simulate an exception during file listing | ||
mock_listdir.side_effect = Exception("Mocked error") | ||
|
||
# Call the function and check the result | ||
result = process_reboot_cause.get_sorted_reboot_cause_files("/mock/dpu_history") | ||
self.assertEqual(result, []) | ||
|
||
# Test update_dpu_reboot_cause_to_chassis_state_db | ||
@patch("builtins.open", new_callable=mock_open, read_data='{"cause": "Non-Hardware", "comment": "Switch rebooted DPU", "device": "DPU0", "time": "Fri Dec 13 01:12:36 AM UTC 2024", "name": "2024_12_13_01_12_36"}') | ||
@patch("process_reboot_cause.device_info.get_dpu_list", return_value=["dpu1", "dpu2"]) | ||
@patch("os.path.isfile", return_value=True) | ||
@patch("process_reboot_cause.get_sorted_reboot_cause_files") | ||
@patch("process_reboot_cause.os.listdir", return_value=["2024_12_13_01_12_36_reboot_cause.txt", "2024_12_14_01_11_46_reboot_cause.txt"]) | ||
@patch("process_reboot_cause.swsscommon.SonicV2Connector") | ||
def test_update_dpu_reboot_cause_to_chassis_state_db_update(self, mock_connector, mock_listdir, mock_get_sorted_files, mock_isfile, mock_get_dpu_list, mock_open): | ||
# Setup mocks | ||
mock_get_sorted_files.return_value = ["/mock/dpu_history/2024_12_13_01_12_36_reboot_cause.txt"] | ||
|
||
# Mock the database connection | ||
mock_db = MagicMock() | ||
mock_connector.return_value = mock_db | ||
|
||
# Call the function that reads the file and updates the DB | ||
process_reboot_cause.update_dpu_reboot_cause_to_chassis_state_db() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unused variable
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@gpunathilell Removed