From 44ea6a08659a11723173c5272b0400882cf11347 Mon Sep 17 00:00:00 2001 From: Sudharsan Dhamal Gopalarathnam Date: Wed, 8 Feb 2023 13:24:23 -0800 Subject: [PATCH] [sai_failure_dump]Invoking dump during SAI failure (#2644) * [sai_failure_dump]Invoking dump during SAI failure --- orchagent/main.cpp | 10 +-- orchagent/orchdaemon.cpp | 2 +- orchagent/saihelper.cpp | 44 ++++++++++-- orchagent/saihelper.h | 2 +- tests/mock_tests/Makefile.am | 1 + tests/mock_tests/portsorch_ut.cpp | 81 +++++++++++++++++++++ tests/mock_tests/test_failure_handling.cpp | 82 ++++++++++++++++++++++ 7 files changed, 208 insertions(+), 14 deletions(-) create mode 100644 tests/mock_tests/test_failure_handling.cpp diff --git a/orchagent/main.cpp b/orchagent/main.cpp index 03bb5da55031..5ae5aea34241 100644 --- a/orchagent/main.cpp +++ b/orchagent/main.cpp @@ -126,7 +126,7 @@ void syncd_apply_view() if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to notify syncd APPLY_VIEW %d", status); - exit(EXIT_FAILURE); + handleSaiFailure(true); } } @@ -619,7 +619,7 @@ int main(int argc, char **argv) if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to create a switch, rv:%d", status); - exit(EXIT_FAILURE); + handleSaiFailure(true); } SWSS_LOG_NOTICE("Create a switch, id:%" PRIu64, gSwitchId); @@ -650,7 +650,7 @@ int main(int argc, char **argv) if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to get MAC address from switch, rv:%d", status); - exit(EXIT_FAILURE); + handleSaiFailure(true); } else { @@ -665,7 +665,7 @@ int main(int argc, char **argv) if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Fail to get switch virtual router ID %d", status); - exit(EXIT_FAILURE); + handleSaiFailure(true); } gVirtualRouterId = attr.value.oid; @@ -707,7 +707,7 @@ int main(int argc, char **argv) if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to create underlay router interface %d", status); - exit(EXIT_FAILURE); + handleSaiFailure(true); } SWSS_LOG_NOTICE("Created underlay router interface ID %" PRIx64, gUnderlayIfId); diff --git a/orchagent/orchdaemon.cpp b/orchagent/orchdaemon.cpp index d1b418882cfc..18295a9f59a8 100644 --- a/orchagent/orchdaemon.cpp +++ b/orchagent/orchdaemon.cpp @@ -675,7 +675,7 @@ void OrchDaemon::flush() if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to flush redis pipeline %d", status); - abort(); + handleSaiFailure(true); } } diff --git a/orchagent/saihelper.cpp b/orchagent/saihelper.cpp index 6abb011b697f..6f7ad9b250d9 100644 --- a/orchagent/saihelper.cpp +++ b/orchagent/saihelper.cpp @@ -496,7 +496,8 @@ task_process_status handleSaiCreateStatus(sai_api_t api, sai_status_t status, vo default: SWSS_LOG_ERROR("Encountered failure in create operation, exiting orchagent, SAI API: %s, status: %s", sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str()); - abort(); + handleSaiFailure(true); + break; } break; case SAI_API_HOSTIF: @@ -514,8 +515,10 @@ task_process_status handleSaiCreateStatus(sai_api_t api, sai_status_t status, vo default: SWSS_LOG_ERROR("Encountered failure in create operation, exiting orchagent, SAI API: %s, status: %s", sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str()); - abort(); + handleSaiFailure(true); + break; } + break; default: switch (status) { @@ -525,7 +528,8 @@ task_process_status handleSaiCreateStatus(sai_api_t api, sai_status_t status, vo default: SWSS_LOG_ERROR("Encountered failure in create operation, exiting orchagent, SAI API: %s, status: %s", sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str()); - abort(); + handleSaiFailure(true); + break; } } return task_need_retry; @@ -566,8 +570,10 @@ task_process_status handleSaiSetStatus(sai_api_t api, sai_status_t status, void default: SWSS_LOG_ERROR("Encountered failure in set operation, exiting orchagent, SAI API: %s, status: %s", sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str()); - abort(); + handleSaiFailure(true); + break; } + break; case SAI_API_TUNNEL: switch (status) { @@ -578,12 +584,15 @@ task_process_status handleSaiSetStatus(sai_api_t api, sai_status_t status, void default: SWSS_LOG_ERROR("Encountered failure in set operation, exiting orchagent, SAI API: %s, status: %s", sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str()); - abort(); + handleSaiFailure(true); + break; } + break; default: SWSS_LOG_ERROR("Encountered failure in set operation, exiting orchagent, SAI API: %s, status: %s", sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str()); - abort(); + handleSaiFailure(true); + break; } return task_need_retry; @@ -611,7 +620,8 @@ task_process_status handleSaiRemoveStatus(sai_api_t api, sai_status_t status, vo default: SWSS_LOG_ERROR("Encountered failure in remove operation, exiting orchagent, SAI API: %s, status: %s", sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str()); - abort(); + handleSaiFailure(true); + break; } return task_need_retry; } @@ -663,3 +673,23 @@ bool parseHandleSaiStatusFailure(task_process_status status) } return true; } + +/* Handling SAI failure. Request redis to invoke SAI failure dump and abort if set*/ +void handleSaiFailure(bool abort_on_failure) +{ + SWSS_LOG_ENTER(); + + sai_attribute_t attr; + + attr.id = SAI_REDIS_SWITCH_ATTR_NOTIFY_SYNCD; + attr.value.s32 = SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP; + sai_status_t status = sai_switch_api->set_switch_attribute(gSwitchId, &attr); + if (status != SAI_STATUS_SUCCESS) + { + SWSS_LOG_ERROR("Failed to take sai failure dump %d", status); + } + if (abort_on_failure) + { + abort(); + } +} diff --git a/orchagent/saihelper.h b/orchagent/saihelper.h index 335bb5fa9f63..c7ff8d23eaef 100644 --- a/orchagent/saihelper.h +++ b/orchagent/saihelper.h @@ -18,4 +18,4 @@ task_process_status handleSaiSetStatus(sai_api_t api, sai_status_t status, void task_process_status handleSaiRemoveStatus(sai_api_t api, sai_status_t status, void *context = nullptr); task_process_status handleSaiGetStatus(sai_api_t api, sai_status_t status, void *context = nullptr); bool parseHandleSaiStatusFailure(task_process_status status); - +void handleSaiFailure(bool abort_on_failure); diff --git a/tests/mock_tests/Makefile.am b/tests/mock_tests/Makefile.am index 685a30b53a4e..02bb54dd25ba 100644 --- a/tests/mock_tests/Makefile.am +++ b/tests/mock_tests/Makefile.am @@ -50,6 +50,7 @@ tests_SOURCES = aclorch_ut.cpp \ flowcounterrouteorch_ut.cpp \ orchdaemon_ut.cpp \ warmrestartassist_ut.cpp \ + test_failure_handling.cpp \ $(top_srcdir)/lib/gearboxutils.cpp \ $(top_srcdir)/lib/subintf.cpp \ $(top_srcdir)/orchagent/orchdaemon.cpp \ diff --git a/tests/mock_tests/portsorch_ut.cpp b/tests/mock_tests/portsorch_ut.cpp index 836e862d55bd..740744df51f0 100644 --- a/tests/mock_tests/portsorch_ut.cpp +++ b/tests/mock_tests/portsorch_ut.cpp @@ -9,6 +9,7 @@ #include "notifier.h" #define private public #include "pfcactionhandler.h" +#include #undef private #include @@ -21,6 +22,8 @@ namespace portsorch_test sai_port_api_t ut_sai_port_api; sai_port_api_t *pold_sai_port_api; + sai_switch_api_t ut_sai_switch_api; + sai_switch_api_t *pold_sai_switch_api; bool not_support_fetching_fec; vector mock_port_fec_modes = {SAI_PORT_FEC_MODE_RS, SAI_PORT_FEC_MODE_FC}; @@ -66,9 +69,28 @@ namespace portsorch_test _sai_set_port_fec_count++; _sai_port_fec_mode = attr[0].value.s32; } + else if (attr[0].id == SAI_PORT_ATTR_AUTO_NEG_MODE) + { + /* Simulating failure case */ + return SAI_STATUS_FAILURE; + } return pold_sai_port_api->set_port_attribute(port_id, attr); } + uint32_t *_sai_syncd_notifications_count; + int32_t *_sai_syncd_notification_event; + sai_status_t _ut_stub_sai_set_switch_attribute( + _In_ sai_object_id_t switch_id, + _In_ const sai_attribute_t *attr) + { + if (attr[0].id == SAI_REDIS_SWITCH_ATTR_NOTIFY_SYNCD) + { + *_sai_syncd_notifications_count =+ 1; + *_sai_syncd_notification_event = attr[0].value.s32; + } + return pold_sai_switch_api->set_switch_attribute(switch_id, attr); + } + void _hook_sai_port_api() { ut_sai_port_api = *sai_port_api; @@ -83,6 +105,19 @@ namespace portsorch_test sai_port_api = pold_sai_port_api; } + void _hook_sai_switch_api() + { + ut_sai_switch_api = *sai_switch_api; + pold_sai_switch_api = sai_switch_api; + ut_sai_switch_api.set_switch_attribute = _ut_stub_sai_set_switch_attribute; + sai_switch_api = &ut_sai_switch_api; + } + + void _unhook_sai_switch_api() + { + sai_switch_api = pold_sai_switch_api; + } + sai_queue_api_t ut_sai_queue_api; sai_queue_api_t *pold_sai_queue_api; int _sai_set_queue_attr_count = 0; @@ -473,6 +508,52 @@ namespace portsorch_test _unhook_sai_port_api(); } + TEST_F(PortsOrchTest, PortTestSAIFailureHandling) + { + _hook_sai_port_api(); + _hook_sai_switch_api(); + Table portTable = Table(m_app_db.get(), APP_PORT_TABLE_NAME); + std::deque entries; + + not_support_fetching_fec = false; + // Get SAI default ports to populate DB + auto ports = ut_helper::getInitialSaiPorts(); + + for (const auto &it : ports) + { + portTable.set(it.first, it.second); + } + + // Set PortConfigDone + portTable.set("PortConfigDone", { { "count", to_string(ports.size()) } }); + + // refill consumer + gPortsOrch->addExistingData(&portTable); + + // Apply configuration : + // create ports + static_cast(gPortsOrch)->doTask(); + + _sai_syncd_notifications_count = (uint32_t*)mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + _sai_syncd_notification_event = (int32_t*)mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + *_sai_syncd_notifications_count = 0; + + entries.push_back({"Ethernet0", "SET", + { + {"autoneg", "on"} + }}); + auto consumer = dynamic_cast(gPortsOrch->getExecutor(APP_PORT_TABLE_NAME)); + consumer->addToSync(entries); + ASSERT_DEATH({static_cast(gPortsOrch)->doTask();}, ""); + + ASSERT_EQ(*_sai_syncd_notifications_count, 1); + ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP); + _unhook_sai_port_api(); + _unhook_sai_switch_api(); + } + TEST_F(PortsOrchTest, PortReadinessColdBoot) { Table portTable = Table(m_app_db.get(), APP_PORT_TABLE_NAME); diff --git a/tests/mock_tests/test_failure_handling.cpp b/tests/mock_tests/test_failure_handling.cpp new file mode 100644 index 000000000000..be53e0fd5f68 --- /dev/null +++ b/tests/mock_tests/test_failure_handling.cpp @@ -0,0 +1,82 @@ +#include "saihelper.h" +#include "ut_helper.h" +#include + +extern sai_switch_api_t *sai_switch_api; + +namespace saifailure_test +{ + struct SaiFailureTest : public ::testing::Test + { + }; + uint32_t *_sai_syncd_notifications_count; + int32_t *_sai_syncd_notification_event; + sai_switch_api_t *pold_sai_switch_api; + sai_switch_api_t ut_sai_switch_api; + + sai_status_t _ut_stub_sai_set_switch_attribute( + _In_ sai_object_id_t switch_id, + _In_ const sai_attribute_t *attr) + { + if (attr[0].id == SAI_REDIS_SWITCH_ATTR_NOTIFY_SYNCD) + { + *_sai_syncd_notifications_count = *_sai_syncd_notifications_count + 1; + *_sai_syncd_notification_event = attr[0].value.s32; + } + return pold_sai_switch_api->set_switch_attribute(switch_id, attr); + } + + void _hook_sai_switch_api() + { + ut_sai_switch_api = *sai_switch_api; + pold_sai_switch_api = sai_switch_api; + ut_sai_switch_api.set_switch_attribute = _ut_stub_sai_set_switch_attribute; + sai_switch_api = &ut_sai_switch_api; + } + + void _unhook_sai_switch_api() + { + sai_switch_api = pold_sai_switch_api; + } + + TEST_F(SaiFailureTest, handleSaiFailure) + { + _hook_sai_switch_api(); + _sai_syncd_notifications_count = (uint32_t*)mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + _sai_syncd_notification_event = (int32_t*)mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + *_sai_syncd_notifications_count = 0; + uint32_t notif_count = *_sai_syncd_notifications_count; + + ASSERT_DEATH({handleSaiCreateStatus(SAI_API_FDB, SAI_STATUS_FAILURE);}, ""); + ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count); + ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP); + + ASSERT_DEATH({handleSaiCreateStatus(SAI_API_HOSTIF, SAI_STATUS_INVALID_PARAMETER);}, ""); + ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count); + ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP); + + ASSERT_DEATH({handleSaiCreateStatus(SAI_API_PORT, SAI_STATUS_FAILURE);}, ""); + ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count); + ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP); + + ASSERT_DEATH({handleSaiSetStatus(SAI_API_HOSTIF, SAI_STATUS_FAILURE);}, ""); + ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count); + ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP); + + ASSERT_DEATH({handleSaiSetStatus(SAI_API_PORT, SAI_STATUS_FAILURE);}, ""); + ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count); + ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP); + + ASSERT_DEATH({handleSaiSetStatus(SAI_API_TUNNEL, SAI_STATUS_FAILURE);}, ""); + ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count); + ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP); + + ASSERT_DEATH({handleSaiRemoveStatus(SAI_API_LAG, SAI_STATUS_FAILURE);}, ""); + ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count); + ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP); + + _unhook_sai_switch_api(); + } +}