-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Watchdog action that will signal a particular thread to abort. #12860
Changes from 8 commits
9f43939
0154034
e2dcb42
6955b10
33a4f5e
71f099f
847aed0
0b3c91c
371866f
80f2dc0
11d6066
56b3f82
5fb84a9
947bd06
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# DO NOT EDIT. This file is generated by tools/proto_format/proto_sync.py. | ||
|
||
load("@envoy_api//bazel:api_build_system.bzl", "api_proto_package") | ||
|
||
licenses(["notice"]) # Apache 2 | ||
|
||
api_proto_package( | ||
deps = ["@com_github_cncf_udpa//udpa/annotations:pkg"], | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
syntax = "proto3"; | ||
|
||
package envoy.extensions.watchdog.abort_action.v3alpha; | ||
|
||
import "google/protobuf/duration.proto"; | ||
|
||
import "udpa/annotations/status.proto"; | ||
import "udpa/annotations/versioning.proto"; | ||
import "validate/validate.proto"; | ||
|
||
option java_package = "io.envoyproxy.envoy.extensions.watchdog.abort_action.v3alpha"; | ||
option java_outer_classname = "AbortActionProto"; | ||
option java_multiple_files = true; | ||
option (udpa.annotations.file_status).work_in_progress = true; | ||
option (udpa.annotations.file_status).package_version_status = ACTIVE; | ||
|
||
// [#protodoc-title: Watchdog Action that sends a SIGABRT to kill the process.] | ||
// [#extension: envoy.watchdog.abort_action] | ||
|
||
// A GuardDogAction that will terminate the process by sending SIGABRT to the | ||
// stuck thread. This would allow easier access to the call stack of the stuck | ||
// thread since we would run signal handlers on that thread. This would be | ||
// more useful than the default watchdog kill behaviors since those PANIC | ||
// from the watchdog's thread. | ||
|
||
// This is currently only implemented for systems that support kill to send | ||
// signals. | ||
message AbortActionConfig { | ||
// How long to wait for the thread to respond to the SIGABRT before killing the | ||
// process from this action. This is a blocking action. | ||
google.protobuf.Duration wait_duration = 1; | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
load( | ||
"//bazel:envoy_build_system.bzl", | ||
"envoy_cc_extension", | ||
"envoy_cc_library", | ||
"envoy_extension_package", | ||
) | ||
|
||
licenses(["notice"]) # Apache 2 | ||
|
||
envoy_extension_package() | ||
|
||
envoy_cc_library( | ||
name = "abort_action_lib", | ||
srcs = ["abort_action.cc"], | ||
hdrs = ["abort_action.h"], | ||
deps = [ | ||
"//include/envoy/common:time_interface", | ||
"//include/envoy/server:guarddog_config_interface", | ||
"//include/envoy/thread:thread_interface", | ||
"//source/common/common:assert_lib", | ||
"//source/common/protobuf:utility_lib", | ||
"@envoy_api//envoy/extensions/watchdog/abort_action/v3alpha:pkg_cc_proto", | ||
], | ||
) | ||
|
||
envoy_cc_extension( | ||
name = "config", | ||
srcs = ["config.cc"], | ||
hdrs = ["config.h"], | ||
security_posture = "data_plane_agnostic", | ||
status = "alpha", | ||
deps = [ | ||
":abort_action_lib", | ||
"//include/envoy/registry", | ||
"//source/common/config:utility_lib", | ||
"//source/common/protobuf", | ||
"//source/common/protobuf:message_validator_lib", | ||
"@envoy_api//envoy/extensions/watchdog/abort_action/v3alpha:pkg_cc_proto", | ||
], | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
#include "extensions/watchdog/abort_action/abort_action.h" | ||
|
||
#include "common/common/assert.h" | ||
#include "common/common/fmt.h" | ||
|
||
#ifndef WIN32 | ||
#include <sys/types.h> | ||
#include <csignal> | ||
#endif | ||
|
||
#include "common/common/logger.h" | ||
#include "common/protobuf/utility.h" | ||
#include "envoy/thread/thread.h" | ||
|
||
namespace Envoy { | ||
namespace Extensions { | ||
namespace Watchdog { | ||
namespace AbortAction { | ||
namespace { | ||
#ifdef __linux__ | ||
pid_t toPlatformTid(int64_t tid) { return static_cast<pid_t>(tid); } | ||
#elif defined(__APPLE__) | ||
uint64_t toPlatformTid(int64_t tid) { return static_cast<uint64_t>(tid); } | ||
#endif | ||
} // namespace | ||
|
||
AbortAction::AbortAction( | ||
envoy::extensions::watchdog::abort_action::v3alpha::AbortActionConfig& config, | ||
Server::Configuration::GuardDogActionFactoryContext& /*context*/) | ||
: config_(config){}; | ||
|
||
void AbortAction::run( | ||
envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent /*event*/, | ||
const std::vector<std::pair<Thread::ThreadId, MonotonicTime>>& thread_last_checkin_pairs, | ||
MonotonicTime /*now*/) { | ||
|
||
if (thread_last_checkin_pairs.empty()) { | ||
ENVOY_LOG_MISC(warn, "Watchdog AbortAction called without any thread."); | ||
return; | ||
} | ||
|
||
int64_t raw_tid = thread_last_checkin_pairs[0].first.getId(); | ||
|
||
#ifdef WIN32 | ||
// TODO(kbaichoo): add support for this with windows. | ||
ENVOY_LOG_MISC(error, "Watchdog AbortAction is unimplemented for Windows."); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this actually needed? I think Windows has its own extension bzl file so I'm not sure this is even compiled? Could we actually avoid the ifdef in here now or just replace with an proprocessor error if this extension gets compiled on windows? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good to know this is a possibility, thanks for pointing it out. I've added the extension to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. turns out you also need to add |
||
#else | ||
// Assume POSIX-compatible system and signal to the thread. | ||
ENVOY_LOG_MISC(error, "Watchdog AbortAction sending abort signal to thread with tid {}.", | ||
raw_tid); | ||
|
||
if (kill(toPlatformTid(raw_tid), SIGABRT) == 0) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could in theory make the signal configurable, but I expect SIGABRT to always be the right signal to use. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, I think I'll keep it simple for now given that the default envoy handlers views SIGABRT as a fatal signal. |
||
// Successfully sent signal, sleep for wait_duration. | ||
absl::SleepFor(absl::Milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(config_, wait_duration, 0))); | ||
} else { | ||
// Failed to send the signal, abort? | ||
ENVOY_LOG_MISC(error, "Failed to send signal to tid {}", raw_tid); | ||
} | ||
|
||
// Abort from the action since the signaled thread hasn't yet crashed the process. | ||
PANIC(fmt::format("Failed to kill thread with id {}, aborting from Watchdog AbortAction instead.", | ||
raw_tid)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I haven't kept fully up to date on this, but isn't the default action at the end of the action chain to terminate the process? Do we need this or would the action chain just end up aborting anyway? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, you're right. I discussed this with antonio up above. I've copied the reasoning below (copied from #12860 (comment)) There are 3 reasons I think this [having a panic in the action] would be handy (vs deferring to the underlying watchdog default actions[its panic there]):
Given that we're planning to make this also a default action on platform where it's supported I'd likely make the following change in the next PR:
I could possibly see when this has full support across platforms that we'd remove the watchdog's default panic, since it'd end up being dead code. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK that make sense. Can you summarize this comment in the code? Thank you. /wait There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
#endif | ||
} | ||
|
||
} // namespace AbortAction | ||
} // namespace Watchdog | ||
} // namespace Extensions | ||
} // namespace Envoy |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#pragma once | ||
|
||
#include <chrono> | ||
|
||
#include "envoy/extensions/watchdog/abort_action/v3alpha/abort_action.pb.h" | ||
#include "envoy/server/guarddog_config.h" | ||
#include "envoy/thread/thread.h" | ||
|
||
namespace Envoy { | ||
namespace Extensions { | ||
namespace Watchdog { | ||
namespace AbortAction { | ||
|
||
/** | ||
* A GuardDogAction that will terminate the process by sending SIGABRT to the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps some/all of this comment block should be included with the proto config, so it gets included in generated docs. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
* stuck thread. This is currently only implemented for systems that | ||
* support kill to send signals. | ||
*/ | ||
class AbortAction : public Server::Configuration::GuardDogAction { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems like the earlier extension was added under the watchdog directory instead of guarddog. I don't know which is more correct. It makes sense to me to continue using the watchdog name for this and other future extensions. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The terminology of these two get a bit messy; in the docs and at a high level we talk about I went with |
||
public: | ||
AbortAction(envoy::extensions::watchdog::abort_action::v3alpha::AbortActionConfig& config, | ||
Server::Configuration::GuardDogActionFactoryContext& context); | ||
|
||
void run(envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent event, | ||
const std::vector<std::pair<Thread::ThreadId, MonotonicTime>>& thread_last_checkin_pairs, | ||
MonotonicTime now) override; | ||
|
||
private: | ||
const envoy::extensions::watchdog::abort_action::v3alpha::AbortActionConfig config_; | ||
}; | ||
|
||
using AbortActionPtr = std::unique_ptr<AbortAction>; | ||
|
||
} // namespace AbortAction | ||
} // namespace Watchdog | ||
} // namespace Extensions | ||
} // namespace Envoy |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#include "extensions/watchdog/abort_action/config.h" | ||
|
||
#include "envoy/registry/registry.h" | ||
|
||
#include "common/config/utility.h" | ||
#include "common/protobuf/message_validator_impl.h" | ||
|
||
#include "extensions/watchdog/abort_action/abort_action.h" | ||
|
||
namespace Envoy { | ||
namespace Extensions { | ||
namespace Watchdog { | ||
namespace AbortAction { | ||
|
||
Server::Configuration::GuardDogActionPtr AbortActionFactory::createGuardDogActionFromProto( | ||
const envoy::config::bootstrap::v3::Watchdog::WatchdogAction& config, | ||
Server::Configuration::GuardDogActionFactoryContext& context) { | ||
auto message = createEmptyConfigProto(); | ||
Config::Utility::translateOpaqueConfig(config.config().typed_config(), ProtobufWkt::Struct(), | ||
ProtobufMessage::getStrictValidationVisitor(), *message); | ||
return std::make_unique<AbortAction>(dynamic_cast<AbortActionConfig&>(*message), context); | ||
} | ||
|
||
/** | ||
* Static registration for the fixed heap resource monitor factory. @see RegistryFactory. | ||
*/ | ||
REGISTER_FACTORY(AbortActionFactory, Server::Configuration::GuardDogActionFactory); | ||
|
||
} // namespace AbortAction | ||
} // namespace Watchdog | ||
} // namespace Extensions | ||
} // namespace Envoy |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#pragma once | ||
|
||
#include "envoy/extensions/watchdog/abort_action/v3alpha/abort_action.pb.h" | ||
#include "envoy/server/guarddog_config.h" | ||
|
||
#include "common/protobuf/protobuf.h" | ||
|
||
namespace Envoy { | ||
namespace Extensions { | ||
namespace Watchdog { | ||
namespace AbortAction { | ||
|
||
class AbortActionFactory : public Server::Configuration::GuardDogActionFactory { | ||
public: | ||
AbortActionFactory() = default; | ||
|
||
Server::Configuration::GuardDogActionPtr createGuardDogActionFromProto( | ||
const envoy::config::bootstrap::v3::Watchdog::WatchdogAction& config, | ||
Server::Configuration::GuardDogActionFactoryContext& context) override; | ||
|
||
ProtobufTypes::MessagePtr createEmptyConfigProto() override { | ||
return std::make_unique<AbortActionConfig>(); | ||
} | ||
|
||
std::string name() const override { return "envoy.watchdog.abort_action"; } | ||
|
||
using AbortActionConfig = envoy::extensions::watchdog::abort_action::v3alpha::AbortActionConfig; | ||
KBaichoo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
}; | ||
|
||
} // namespace AbortAction | ||
} // namespace Watchdog | ||
} // namespace Extensions | ||
} // namespace Envoy |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this be the default, on supported platforms? Is there any downside to this? It seems to me that it gives better information (or a chance of it), and the same net behavior (process terminated). @envoyproxy/maintainers
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm fine with that being default in a future PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But also fine here if we want this, seems generally useful.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good, I can make it be a default action in a future PR.
The main downside I see of making it a default now would be different behaviors across platforms due to a lack of support on Windows. Should we wait for parity before we make it a default?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's ok to make it the default, because the only platform difference should be diagnostic output. The process is terminated on all platforms.
I'm also fine with making it the default in a future PR. It's your choice, @KBaichoo .
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good. I'll submit a follow up to this adding this action to the watchdog actions by default.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1 to make this the default.