Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add systemd watchdog and adjust reload behaviour #5996

Merged
merged 1 commit into from
Jan 21, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ option(ICINGA2_WITH_PERFDATA "Build the perfdata module" ON)
option(ICINGA2_WITH_STUDIO "Build the Icinga Studio application" OFF)
option(ICINGA2_WITH_TESTS "Run unit tests" ON)

option (USE_SYSTEMD
"Configure icinga as native systemd service instead of a SysV initscript" OFF)

set(HAVE_SYSTEMD ${USE_SYSTEMD})

file(STRINGS icinga2.spec VERSION_LINE REGEX "^Version: ")
string(REPLACE "Version: " "" ICINGA2_VERSION ${VERSION_LINE})

Expand Down Expand Up @@ -155,6 +160,11 @@ if(UNIX OR CYGWIN)
list(APPEND base_OBJS $<TARGET_OBJECTS:execvpe>)
endif()

if(HAVE_SYSTEMD)
list(APPEND base_DEPS systemd)
endif()


if(EDITLINE_FOUND)
list(APPEND base_DEPS ${EDITLINE_LIBRARIES})
include_directories(${EDITLINE_INCLUDE_DIR})
Expand Down
1 change: 1 addition & 0 deletions config.h.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#cmakedefine HAVE_CXXABI_H
#cmakedefine HAVE_NICE
#cmakedefine HAVE_EDITLINE
#cmakedefine HAVE_SYSTEMD

#cmakedefine ICINGA2_UNITY_BUILD

Expand Down
8 changes: 8 additions & 0 deletions doc/02-getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,14 @@ content:
StartLimitInterval=10
StartLimitBurst=3

Using the watchdog can also help with monitoring Icinga 2. To activate and use it, add the following to the override:

WatchdogSec=30s

This way Systemd will kill Icinga 2 if it does not notify for over 30 seconds. A timeout of less than 10 seconds is not
recommended. When the watchdog is activated, `Restart=` can be set to `on-watchdog` to restart Icinga 2 in the case of a
watchdog timeout.

Run `systemctl daemon-reload && systemctl restart icinga2` to apply the changes.
Now Systemd will always try to restart Icinga 2 (except if you run
`systemctl stop icinga2`). After three failures in ten seconds it will stop
Expand Down
3 changes: 0 additions & 3 deletions etc/initsystem/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,6 @@ if(NOT WIN32)
PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
)

option (USE_SYSTEMD
"Configure icinga as native systemd service instead of a SysV initscript" OFF)

# required for packaging on Gentoo, see Bug #6498
option (INSTALL_SYSTEMD_SERVICE_AND_INITSCRIPT
"Force install both the systemd service definition file and the SysV initscript in parallel, regardless of how USE_SYSTEMD is set. Only use this for special packaging purposes and if you know what you are doing" OFF)
Expand Down
4 changes: 2 additions & 2 deletions etc/initsystem/icinga2.service.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ Description=Icinga host/service/network monitoring system
After=syslog.target network-online.target postgresql.service mariadb.service carbon-cache.service carbon-relay.service

[Service]
Type=forking
Type=notify
EnvironmentFile=@ICINGA2_SYSCONFIGFILE@
ExecStartPre=@CMAKE_INSTALL_PREFIX@/lib/icinga2/prepare-dirs @ICINGA2_SYSCONFIGFILE@
ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/icinga2 daemon -d -e ${ICINGA2_ERROR_LOG}
ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/icinga2 daemon -e ${ICINGA2_ERROR_LOG}
PIDFile=@ICINGA2_RUNDIR@/icinga2/icinga2.pid
ExecReload=@CMAKE_INSTALL_PREFIX@/lib/icinga2/safe-reload @ICINGA2_SYSCONFIGFILE@
TimeoutStartSec=30m
Expand Down
41 changes: 39 additions & 2 deletions lib/base/application.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,12 @@
#ifdef __linux__
#include <sys/prctl.h>
#endif /* __linux__ */

#ifdef _WIN32
#include <windows.h>
#endif /* _win32 */
#endif /* _WIN32 */
#ifdef HAVE_SYSTEMD
#include <systemd/sd-daemon.h>
#endif /* HAVE_SYSTEMD */

using namespace icinga;

Expand Down Expand Up @@ -315,6 +317,11 @@ void Application::SetArgV(char **argv)
*/
void Application::RunEventLoop()
{

#ifdef HAVE_SYSTEMD
sd_notify(0, "READY=1");
#endif /* HAVE_SYSTEMD */

double lastLoop = Utility::GetTime();

mainloop:
Expand All @@ -331,6 +338,10 @@ void Application::RunEventLoop()
double now = Utility::GetTime();
double timeDiff = lastLoop - now;

#ifdef HAVE_SYSTEMD
sd_notify(0, "WATCHDOG=1");
#endif /* HAVE_SYSTEMD */

if (std::fabs(timeDiff) > 15) {
/* We made a significant jump in time. */
Log(LogInformation, "Application")
Expand All @@ -347,6 +358,10 @@ void Application::RunEventLoop()
if (m_RequestRestart) {
m_RequestRestart = false; // we are now handling the request, once is enough

#ifdef HAVE_SYSTEMD
sd_notify(0, "RELOADING=1");
#endif /* HAVE_SYSTEMD */

// are we already restarting? ignore request if we already are
if (l_Restarting)
goto mainloop;
Expand All @@ -357,6 +372,10 @@ void Application::RunEventLoop()
goto mainloop;
}

#ifdef HAVE_SYSTEMD
sd_notify(0, "STOPPING=1");
#endif /* HAVE_SYSTEMD */

Log(LogInformation, "Application", "Shutting down...");

ConfigObject::StopObjects();
Expand Down Expand Up @@ -712,6 +731,21 @@ void Application::SigUsr1Handler(int)
RequestReopenLogs();
}

/**
* Signal handler for SIGUSR2. Hands over PID to child and commits suicide.
*
* The daemon command's `reload-internal` path sends SIGUSR2 to the PID it was
* given once it is ready to take over. On receipt this handler logs the
* request, tells systemd (when built with HAVE_SYSTEMD) which PID to track as
* the main process from now on, and then terminates the current process with
* exit status 0.
*
* NOTE(review): logging and sd_notifyf() are presumably not async-signal-safe;
* confirm that running them from a signal handler is acceptable here.
*
* @param - The signal number.
*/
void Application::SigUsr2Handler(int)
{
Log(LogInformation, "Application", "Reload requested, letting new process take over.");
#ifdef HAVE_SYSTEMD
/* Announce the new main PID to systemd before this process goes away.
 * m_ReloadProcess is presumably the PID of the spawned reload process -- verify where it is set. */
sd_notifyf(0, "MAINPID=%lu", (unsigned long) m_ReloadProcess);
#endif /* HAVE_SYSTEMD */

/* Leave with a success status. */
Exit(0);
}

/**
* Signal handler for SIGABRT. Helps with debugging ASSERT()s.
*
Expand Down Expand Up @@ -964,6 +998,9 @@ int Application::Run()

sa.sa_handler = &Application::SigUsr1Handler;
sigaction(SIGUSR1, &sa, nullptr);

sa.sa_handler = &Application::SigUsr2Handler;
sigaction(SIGUSR2, &sa, nullptr);
#else /* _WIN32 */
SetConsoleCtrlHandler(&Application::CtrlHandler, TRUE);
#endif /* _WIN32 */
Expand Down
1 change: 1 addition & 0 deletions lib/base/application.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ class Application : public ObjectImpl<Application> {

static void SigAbrtHandler(int signum);
static void SigUsr1Handler(int signum);
static void SigUsr2Handler(int signum);
static void ExceptionHandler();

static String GetCrashReportFilename();
Expand Down
46 changes: 8 additions & 38 deletions lib/cli/daemoncommand.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,39 +136,6 @@ static bool SetDaemonIO(const String& stderrFile)
return true;
}

/**
* Terminate another process and wait till it has ended.
*
* Sends SIGTERM to the target and then polls it every 0.1 seconds for up to
* 30 seconds. If the target still appears to exist after that, the PID file
* is overwritten with the PID of the current process and the target is sent
* SIGKILL. On Windows this function currently does nothing.
*
* @param target PID of the process to end
*/
static void TerminateAndWaitForEnd(pid_t target)
{
#ifndef _WIN32
// allow 30 seconds timeout
double timeout = Utility::GetTime() + 30;

// ask the target to shut down gracefully
int ret = kill(target, SIGTERM);

// Poll with signal 0 (existence check only, no signal is delivered) until the
// deadline passes or kill() fails with ESRCH, i.e. the process is gone.
// Any other outcome (success, or a failure other than ESRCH) counts as "still alive".
while (Utility::GetTime() < timeout && (ret == 0 || errno != ESRCH)) {
Utility::Sleep(0.1);
ret = kill(target, 0);
}

// timeout and the process still seems to live: update pid and kill it
// NOTE(review): errno is read here after intervening calls (GetTime/Sleep);
// confirm those cannot clobber it between kill() and this check.
if (ret == 0 || errno != ESRCH) {
// claim the PID file for the current process before force-killing the old one
String pidFile = Application::GetPidPath();
std::ofstream fp(pidFile.CStr());
fp << Utility::GetPid();
fp.close();

kill(target, SIGKILL);
}

#else
// TODO: implement this for Win32
#endif /* _WIN32 */
}

String DaemonCommand::GetDescription() const
{
return "Starts Icinga 2.";
Expand Down Expand Up @@ -251,11 +218,14 @@ int DaemonCommand::Run(const po::variables_map& vm, const std::vector<std::strin
}

if (vm.count("reload-internal")) {
int parentpid = vm["reload-internal"].as<int>();
Log(LogInformation, "cli")
<< "Terminating previous instance of Icinga (PID " << parentpid << ")";
TerminateAndWaitForEnd(parentpid);
Log(LogInformation, "cli", "Previous instance has ended, taking over now.");
/* We went through validation and now ask the old process kindly to die */
Log(LogInformation, "cli", "Requesting to take over.");
int rc = kill(vm["reload-internal"].as<int>(), SIGUSR2);
if (rc) {
Log(LogCritical, "Application")
<< "Failed to send signal to \"" << vm["reload-internal"].as<int>() << "\" with " << strerror(errno);
return EXIT_FAILURE;
}
}

if (vm.count("daemonize")) {
Expand Down