diff --git a/source/server/BUILD b/source/server/BUILD index 82a35f729d57..852cc2bda604 100644 --- a/source/server/BUILD +++ b/source/server/BUILD @@ -55,7 +55,6 @@ envoy_cc_library( "@envoy_api//envoy/config/bootstrap/v3:pkg_cc_proto", "@envoy_api//envoy/config/metrics/v3:pkg_cc_proto", "@envoy_api//envoy/config/trace/v3:pkg_cc_proto", - "@envoy_api//envoy/watchdog/v3alpha:pkg_cc_proto", ], ) @@ -130,6 +129,7 @@ envoy_cc_library( "//source/common/stats:symbol_table_lib", "//source/common/watchdog:abort_action_config", "@envoy_api//envoy/config/bootstrap/v3:pkg_cc_proto", + "@envoy_api//envoy/watchdog/v3alpha:pkg_cc_proto", ], ) diff --git a/source/server/configuration_impl.cc b/source/server/configuration_impl.cc index 1cdcee64825a..c0b3524adcff 100644 --- a/source/server/configuration_impl.cc +++ b/source/server/configuration_impl.cc @@ -15,7 +15,6 @@ #include "envoy/server/instance.h" #include "envoy/server/tracer_config.h" #include "envoy/ssl/context_manager.h" -#include "envoy/watchdog/v3alpha/abort_action.pb.h" #include "common/common/assert.h" #include "common/common/utility.h" @@ -172,25 +171,7 @@ WatchdogImpl::WatchdogImpl(const envoy::config::bootstrap::v3::Watchdog& watchdo multikill_timeout_ = std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(watchdog, multikill_timeout, 0)); multikill_threshold_ = PROTOBUF_PERCENT_TO_DOUBLE_OR_DEFAULT(watchdog, multikill_threshold, 0.0); - auto actions = watchdog.actions(); - - // Add abort_action if killing is enabled. - envoy::watchdog::v3alpha::AbortActionConfig abort_config; - - if (kill_timeout > 0) { - envoy::config::bootstrap::v3::Watchdog::WatchdogAction* abort_action_config = actions.Add(); - abort_action_config->set_event(envoy::config::bootstrap::v3::Watchdog::WatchdogAction::KILL); - abort_action_config->mutable_config()->mutable_typed_config()->PackFrom(abort_config); - } - - if (multikill_timeout_.count() > 0) { - envoy::config::bootstrap::v3::Watchdog::WatchdogAction* abort_action_config = actions.Add(); - abort_action_config->set_event( - envoy::config::bootstrap::v3::Watchdog::WatchdogAction::MULTIKILL); - abort_action_config->mutable_config()->mutable_typed_config()->PackFrom(abort_config); - } - - actions_ = actions; + actions_ = watchdog.actions(); } InitialImpl::InitialImpl(const envoy::config::bootstrap::v3::Bootstrap& bootstrap) { diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc index 7c512d20310e..faa3d73e9e4d 100644 --- a/source/server/guarddog_impl.cc +++ b/source/server/guarddog_impl.cc @@ -13,6 +13,7 @@ #include "envoy/server/guarddog.h" #include "envoy/server/guarddog_config.h" #include "envoy/stats/scope.h" +#include "envoy/watchdog/v3alpha/abort_action.pb.h" #include "common/common/assert.h" #include "common/common/fmt.h" @@ -64,7 +65,23 @@ GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuratio Configuration::GuardDogActionFactoryContext context = {api, *dispatcher_, stats_scope, name}; - const auto& actions = config.actions(); + auto actions = config.actions(); + + // Add default abort_action if kill and/or multi-kill is enabled. + envoy::watchdog::v3alpha::AbortActionConfig abort_config; + + if (config.killTimeout().count() > 0) { + WatchDogAction* abort_action_config = actions.Add(); + abort_action_config->set_event(WatchDogAction::KILL); + abort_action_config->mutable_config()->mutable_typed_config()->PackFrom(abort_config); + } + + if (config.multiKillTimeout().count() > 0) { + WatchDogAction* abort_action_config = actions.Add(); + abort_action_config->set_event(WatchDogAction::MULTIKILL); + abort_action_config->mutable_config()->mutable_typed_config()->PackFrom(abort_config); + } + for (const auto& action : actions) { // Get factory and add the created cb auto& factory = Config::Utility::getAndCheckFactory( @@ -133,19 +150,12 @@ void GuardDogImpl::step() { } if (killEnabled() && delta > kill_timeout_) { invokeGuardDogActions(WatchDogAction::KILL, {{tid, last_checkin}}, now); - - PANIC(fmt::format("GuardDog: one thread ({}) stuck for more than watchdog_kill_timeout", - watched_dog->dog_->threadId().debugString())); } if (multikillEnabled() && delta > multi_kill_timeout_) { multi_kill_threads.emplace_back(tid, last_checkin); if (multi_kill_threads.size() >= required_for_multi_kill) { invokeGuardDogActions(WatchDogAction::MULTIKILL, multi_kill_threads, now); - - PANIC(fmt::format("GuardDog: At least {} threads ({},...) stuck for more than " - "watchdog_multikill_timeout", - multi_kill_threads.size(), tid.debugString())); } } } diff --git a/test/server/configuration_impl_test.cc b/test/server/configuration_impl_test.cc index fa9e71c608a4..ea6da93626b9 100644 --- a/test/server/configuration_impl_test.cc +++ b/test/server/configuration_impl_test.cc @@ -797,48 +797,6 @@ TEST_F(ConfigurationImplTest, DoesNotSkewIfKillTimeoutDisabled) { EXPECT_EQ(config.workerWatchdogConfig().killTimeout(), std::chrono::milliseconds(0)); } -TEST_F(ConfigurationImplTest, ShouldAddAbortActionIfKillingIsEnabled) { - envoy::config::bootstrap::v3::Bootstrap bootstrap; - MainImpl config; - const std::string kill_json = R"EOF( - { - "watchdog": { - "kill_timeout": "1.0s", - "multikill_timeout" : "1.0s" - }, - })EOF"; - - TestUtility::loadFromJson(kill_json, bootstrap); - config.initialize(bootstrap, server_, cluster_manager_factory_); - - // We should have the abort action added to both KILL and MULTIKILL events. - EXPECT_EQ(config.workerWatchdogConfig().actions().size(), 2); - EXPECT_EQ(config.mainThreadWatchdogConfig().actions().size(), 2); - - for (const auto& action : config.mainThreadWatchdogConfig().actions()) { - EXPECT_EQ(action.config().typed_config().type_url(), - "type.googleapis.com/envoy.watchdog.v3alpha.AbortActionConfig"); - } - - for (const auto& action : config.workerWatchdogConfig().actions()) { - EXPECT_EQ(action.config().typed_config().type_url(), - "type.googleapis.com/envoy.watchdog.v3alpha.AbortActionConfig"); - } -} - -TEST_F(ConfigurationImplTest, ShouldNotAddAbortActionIfKillingIsDisabled) { - envoy::config::bootstrap::v3::Bootstrap bootstrap; - MainImpl config; - const std::string killing_disabled_json = R"EOF( { "watchdog": { "miss_timeout": "1.0s" }})EOF"; - - TestUtility::loadFromJson(killing_disabled_json, bootstrap); - config.initialize(bootstrap, server_, cluster_manager_factory_); - - // We should have the abort action added - EXPECT_EQ(config.workerWatchdogConfig().actions().size(), 0); - EXPECT_EQ(config.mainThreadWatchdogConfig().actions().size(), 0); -} - TEST_F(ConfigurationImplTest, ShouldErrorIfBothWatchdogsAndWatchdogSet) { const std::string json = R"EOF( { "watchdogs": {}, "watchdog": {}})EOF"; diff --git a/test/server/server_test.cc b/test/server/server_test.cc index f226b33ef060..a26da0f646e6 100644 --- a/test/server/server_test.cc +++ b/test/server/server_test.cc @@ -1311,25 +1311,6 @@ TEST_P(ServerInstanceImplTest, DisabledExtension) { ASSERT_TRUE(disabled_filter_found); } -// Validates that server uses the abort action exit. -TEST_P(ServerInstanceImplTest, UsesAbortActionOnWatchdogKill) { - options_.concurrency_ = 2; - - EXPECT_DEATH( - { - startTestServer("test/server/test_data/server/watchdogs_use_abort_action.yaml", false); - absl::Notification main_thread_slept; - server_->dispatcher().post([&] { - absl::SleepFor(absl::Seconds(5)); - main_thread_slept.Notify(); - }); - - // Wait for the sleep to run, we should die by the watchdog by then. - main_thread_slept.WaitForNotification(); - }, - "(ABRT|Caught Abort)"); -} - } // namespace } // namespace Server } // namespace Envoy diff --git a/test/server/test_data/server/watchdogs_bootstrap_with_deprecated_field.yaml b/test/server/test_data/server/watchdogs_bootstrap_with_deprecated_field.yaml deleted file mode 100644 index f6e28cfcde91..000000000000 --- a/test/server/test_data/server/watchdogs_bootstrap_with_deprecated_field.yaml +++ /dev/null @@ -1,7 +0,0 @@ -watchdogs: - main_thread_watchdog: - miss_timeout: 1s - worker_watchdog: - miss_timeout: 0.5s -watchdog: - miss_timeout: 1s diff --git a/test/server/test_data/server/watchdogs_use_abort_action.yaml b/test/server/test_data/server/watchdogs_use_abort_action.yaml deleted file mode 100644 index f4347c0e9b7a..000000000000 --- a/test/server/test_data/server/watchdogs_use_abort_action.yaml +++ /dev/null @@ -1,7 +0,0 @@ -watchdogs: - main_thread_watchdog: - miss_timeout: 0.2s - kill_timeout: 2s - multikill_timeout: 0.5s - worker_watchdog: - miss_timeout: 0.5s