Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Watchdog: use abort action as a default if killing is enabled. #13523

Merged
merged 8 commits into from
Oct 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,6 @@ extensions/filters/common/original_src @snowp @klarose
/*/extensions/filters/http/decompressor @rojkov @dio
# Watchdog Extensions
/*/extensions/watchdog/profile_action @kbaichoo @antoniovicente
/*/extensions/watchdog/abort_action @kbaichoo @antoniovicente
# Core upstream code
extensions/upstreams/http @alyssawilk @snowp @mattklein123
extensions/upstreams/http/http @alyssawilk @snowp @mattklein123
Expand Down
2 changes: 1 addition & 1 deletion api/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,6 @@ proto_library(
"//envoy/extensions/upstreams/http/http/v3:pkg",
"//envoy/extensions/upstreams/http/tcp/v3:pkg",
"//envoy/extensions/wasm/v3:pkg",
"//envoy/extensions/watchdog/abort_action/v3alpha:pkg",
"//envoy/extensions/watchdog/profile_action/v3alpha:pkg",
"//envoy/service/accesslog/v3:pkg",
"//envoy/service/auth/v3:pkg",
Expand All @@ -272,6 +271,7 @@ proto_library(
"//envoy/type/metadata/v3:pkg",
"//envoy/type/tracing/v3:pkg",
"//envoy/type/v3:pkg",
"//envoy/watchdog/v3alpha:pkg",
],
)

Expand Down

This file was deleted.

2 changes: 2 additions & 0 deletions api/envoy/watchdog/v3alpha/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
This contains watchdog actions that are part of core Envoy, and therefore cannot
be in the extensions directory.
29 changes: 29 additions & 0 deletions api/envoy/watchdog/v3alpha/abort_action.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
syntax = "proto3";

package envoy.watchdog.v3alpha;

import "google/protobuf/duration.proto";

import "udpa/annotations/status.proto";
import "udpa/annotations/versioning.proto";
import "validate/validate.proto";

option java_package = "io.envoyproxy.envoy.watchdog.v3alpha";
option java_outer_classname = "AbortActionProto";
option java_multiple_files = true;
option (udpa.annotations.file_status).work_in_progress = true;
option (udpa.annotations.file_status).package_version_status = ACTIVE;

// [#protodoc-title: Watchdog Action that kills a stuck thread to kill the process.]
htuch marked this conversation as resolved.
Show resolved Hide resolved

// A GuardDogAction that will terminate the process by killing the
// stuck thread. This would allow easier access to the call stack of the stuck
// thread since we would run signal handlers on that thread. By default
// this will be registered to run as the last watchdog action on KILL and
// MULTIKILL events if those are enabled.
message AbortActionConfig {
// How long to wait for the thread to respond to the thread kill function
// before killing the process from this action. This is a blocking action.
// By default this is 5 seconds.
google.protobuf.Duration wait_duration = 1;
}
2 changes: 1 addition & 1 deletion api/versioning/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,6 @@ proto_library(
"//envoy/extensions/upstreams/http/http/v3:pkg",
"//envoy/extensions/upstreams/http/tcp/v3:pkg",
"//envoy/extensions/wasm/v3:pkg",
"//envoy/extensions/watchdog/abort_action/v3alpha:pkg",
"//envoy/extensions/watchdog/profile_action/v3alpha:pkg",
"//envoy/service/accesslog/v3:pkg",
"//envoy/service/auth/v3:pkg",
Expand All @@ -155,6 +154,7 @@ proto_library(
"//envoy/type/metadata/v3:pkg",
"//envoy/type/tracing/v3:pkg",
"//envoy/type/v3:pkg",
"//envoy/watchdog/v3alpha:pkg",
],
)

Expand Down
1 change: 0 additions & 1 deletion bazel/repositories.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ WINDOWS_SKIP_TARGETS = [
"envoy.tracers.lightstep",
"envoy.tracers.datadog",
"envoy.tracers.opencensus",
"envoy.watchdog.abort_action",
]

# Make all contents of an external repository accessible under a filegroup. Used for external HTTP
Expand Down
2 changes: 1 addition & 1 deletion docs/root/api-v3/config/watchdog/watchdog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ Watchdog
:maxdepth: 2

../../extensions/watchdog/profile_action/v3alpha/*
../../extensions/watchdog/abort_action/v3alpha/*
../../watchdog/v3alpha/*
1 change: 1 addition & 0 deletions docs/root/version_history/current.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Minor Behavior Changes

* build: the Alpine based debug images are no longer built in CI, use Ubuntu based images instead.
* ext_authz filter: the deprecated field :ref:`use_alpha <envoy_api_field_config.filter.http.ext_authz.v2.ExtAuthz.use_alpha>` is no longer supported and cannot be set anymore.
* watchdog: the watchdog action :ref:`abort_action <envoy_v3_api_msg_watchdog.v3alpha.AbortActionConfig>` is now the default action to terminate the process if watchdog kill / multikill is enabled.

Bug Fixes
---------
Expand Down
2 changes: 1 addition & 1 deletion docs/root/version_history/v1.16.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ New Features
* watchdog: support randomizing the watchdog's kill timeout to prevent synchronized kills via a maximium jitter parameter :ref:`max_kill_timeout_jitter<envoy_v3_api_field_config.bootstrap.v3.Watchdog.max_kill_timeout_jitter>`.
* watchdog: supports an extension point where actions can be registered to fire on watchdog events such as miss, megamiss, kill and multikill. See :ref:`watchdog actions<envoy_v3_api_field_config.bootstrap.v3.Watchdog.actions>`.
* watchdog: watchdog action extension that does cpu profiling. See :ref:`Profile Action <envoy_v3_api_file_envoy/extensions/watchdog/profile_action/v3alpha/profile_action.proto>`.
* watchdog: watchdog action extension that sends SIGABRT to the stuck thread to terminate the process. See :ref:`Abort Action <envoy_v3_api_file_envoy/extensions/watchdog/abort_action/v3alpha/abort_action.proto>`.
* watchdog: watchdog action extension that sends SIGABRT to the stuck thread to terminate the process. See :ref:`Abort Action <envoy_v3_api_msg_watchdog.v3alpha.AbortActionConfig>`.
antoniovicente marked this conversation as resolved.
Show resolved Hide resolved
* xds: added :ref:`extension config discovery<envoy_v3_api_msg_config.core.v3.ExtensionConfigSource>` support for HTTP filters.
* xds: added support for mixed v2/v3 discovery response, which enable type url downgrade and upgrade. This feature is disabled by default and is controlled by runtime guard `envoy.reloadable_features.enable_type_url_downgrade_and_upgrade`.
* zlib: added option to use `zlib-ng <https://github.com/zlib-ng/zlib-ng>`_ as zlib library.
Expand Down

This file was deleted.

29 changes: 29 additions & 0 deletions generated_api_shadow/envoy/watchdog/v3alpha/abort_action.proto

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions source/common/thread/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
load(
"//bazel:envoy_build_system.bzl",
"envoy_cc_library",
"envoy_package",
)

licenses(["notice"]) # Apache 2

envoy_package()

envoy_cc_library(
name = "terminate_thread_lib",
srcs = ["terminate_thread.cc"],
hdrs = ["terminate_thread.h"],
deps = [
"//include/envoy/thread:thread_interface",
"//source/common/common:minimal_logger_lib",
],
)
31 changes: 31 additions & 0 deletions source/common/thread/terminate_thread.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#include "common/thread/terminate_thread.h"

#include <sys/types.h>

#include <csignal>

#include "common/common/logger.h"

namespace Envoy {
namespace Thread {
namespace {
#ifdef __linux__
pid_t toPlatformTid(int64_t tid) { return static_cast<pid_t>(tid); }
#elif defined(__APPLE__)
uint64_t toPlatformTid(int64_t tid) { return static_cast<uint64_t>(tid); }
#endif
} // namespace

bool terminateThread(const ThreadId& tid) {
#ifndef WIN32
// Assume POSIX-compatible system and signal to the thread.
return kill(toPlatformTid(tid.getId()), SIGABRT) == 0;
#else
// Windows, currently unsupported termination of thread.
ENVOY_LOG_MISC(error, "Windows is currently unsupported for terminateThread.");
return false;
#endif
}

} // namespace Thread
} // namespace Envoy
19 changes: 19 additions & 0 deletions source/common/thread/terminate_thread.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#pragma once

#include "envoy/thread/thread.h"

namespace Envoy {
namespace Thread {
/**
* Tries to terminates the process by killing the thread specified by
* the ThreadId. The implementation is platform dependent and currently
* only works on platforms that support SIGABRT.
*
* Returns true if the platform specific function to terminate the thread
* succeeded (i.e. kill() == 0). If the platform is currently unsupported, this
* will return false.
*/
bool terminateThread(const ThreadId& tid);
mattklein123 marked this conversation as resolved.
Show resolved Hide resolved

} // namespace Thread
} // namespace Envoy
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
load(
"//bazel:envoy_build_system.bzl",
"envoy_cc_extension",
"envoy_cc_library",
"envoy_extension_package",
"envoy_package",
)

licenses(["notice"]) # Apache 2

envoy_extension_package()
envoy_package()

envoy_cc_library(
name = "abort_action_lib",
Expand All @@ -19,22 +18,21 @@ envoy_cc_library(
"//include/envoy/thread:thread_interface",
"//source/common/common:assert_lib",
"//source/common/protobuf:utility_lib",
"@envoy_api//envoy/extensions/watchdog/abort_action/v3alpha:pkg_cc_proto",
"//source/common/thread:terminate_thread_lib",
"@envoy_api//envoy/watchdog/v3alpha:pkg_cc_proto",
],
)

envoy_cc_extension(
name = "config",
srcs = ["config.cc"],
hdrs = ["config.h"],
security_posture = "data_plane_agnostic",
status = "alpha",
envoy_cc_library(
name = "abort_action_config",
srcs = ["abort_action_config.cc"],
hdrs = ["abort_action_config.h"],
deps = [
":abort_action_lib",
"//include/envoy/registry",
"//source/common/config:utility_lib",
"//source/common/protobuf",
"//source/common/protobuf:message_validator_lib",
"@envoy_api//envoy/extensions/watchdog/abort_action/v3alpha:pkg_cc_proto",
"@envoy_api//envoy/watchdog/v3alpha:pkg_cc_proto",
],
)
2 changes: 2 additions & 0 deletions source/common/watchdog/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
This contains watchdog actions that are part of core Envoy, and therefore cannot
be in the extensions directory.
54 changes: 54 additions & 0 deletions source/common/watchdog/abort_action.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#include "common/watchdog/abort_action.h"

#include "envoy/thread/thread.h"

#include "common/common/assert.h"
#include "common/common/fmt.h"
#include "common/common/logger.h"
#include "common/protobuf/utility.h"
#include "common/thread/terminate_thread.h"

namespace Envoy {
namespace Watchdog {
namespace {
constexpr uint64_t DefaultWaitDurationMs = 5000;
} // end namespace

AbortAction::AbortAction(envoy::watchdog::v3alpha::AbortActionConfig& config,
Server::Configuration::GuardDogActionFactoryContext& /*context*/)
: wait_duration_(absl::Milliseconds(
PROTOBUF_GET_MS_OR_DEFAULT(config, wait_duration, DefaultWaitDurationMs))) {}

void AbortAction::run(
envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent /*event*/,
const std::vector<std::pair<Thread::ThreadId, MonotonicTime>>& thread_last_checkin_pairs,
MonotonicTime /*now*/) {

if (thread_last_checkin_pairs.empty()) {
ENVOY_LOG_MISC(warn, "Watchdog AbortAction called without any thread.");
return;
}

// The following lines of code won't be considered covered by code coverage
// tools since they would run in DEATH tests.
const auto& thread_id = thread_last_checkin_pairs[0].first;
const std::string tid_string = thread_id.debugString();
ENVOY_LOG_MISC(error, "Watchdog AbortAction terminating thread with tid {}.", tid_string);

if (Thread::terminateThread(thread_id)) {
// Successfully signaled to thread to terminate, sleep for wait_duration.
absl::SleepFor(wait_duration_);
} else {
ENVOY_LOG_MISC(error, "Failed to terminate tid {}", tid_string);
}

// Abort from the action since the signaled thread hasn't yet crashed the process.
// Panicing in the action gives flexibility since it doesn't depend on
// external code to kill the process if the signal fails.
PANIC(fmt::format(
"Failed to terminate thread with id {}, aborting from Watchdog AbortAction instead.",
tid_string));
}

} // namespace Watchdog
} // namespace Envoy
Loading