Skip to content

Commit

Permalink
Add systemd watchdog and adjust reload behaviour
Browse files Browse the repository at this point in the history
  • Loading branch information
Crunsher committed Jan 19, 2018
1 parent 627fddf commit c418a96
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 45 deletions.
10 changes: 10 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ option(ICINGA2_WITH_PERFDATA "Build the perfdata module" ON)
option(ICINGA2_WITH_STUDIO "Build the Icinga Studio application" OFF)
option(ICINGA2_WITH_TESTS "Run unit tests" ON)

option (USE_SYSTEMD
"Configure icinga as native systemd service instead of a SysV initscript" OFF)

set(HAVE_SYSTEMD ${USE_SYSTEMD})

file(STRINGS icinga2.spec VERSION_LINE REGEX "^Version: ")
string(REPLACE "Version: " "" ICINGA2_VERSION ${VERSION_LINE})

Expand Down Expand Up @@ -155,6 +160,11 @@ if(UNIX OR CYGWIN)
list(APPEND base_OBJS $<TARGET_OBJECTS:execvpe>)
endif()

if(HAVE_SYSTEMD)
list(APPEND base_DEPS systemd)
endif()


if(EDITLINE_FOUND)
list(APPEND base_DEPS ${EDITLINE_LIBRARIES})
include_directories(${EDITLINE_INCLUDE_DIR})
Expand Down
1 change: 1 addition & 0 deletions config.h.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#cmakedefine HAVE_CXXABI_H
#cmakedefine HAVE_NICE
#cmakedefine HAVE_EDITLINE
#cmakedefine HAVE_SYSTEMD

#cmakedefine ICINGA2_UNITY_BUILD

Expand Down
8 changes: 8 additions & 0 deletions doc/02-getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,14 @@ content:
StartLimitInterval=10
StartLimitBurst=3

Using the watchdog can also help with monitoring Icinga 2. To activate and use it, add the following to the override:

WatchdogSec=30s

This way Systemd will kill Icinga 2 if it does not notify for over 30 seconds. A timeout of less than 10 seconds is not
recommended. When the watchdog is activated, `Restart=` can be set to `watchdog` to restart Icinga 2 in the case of a
watchdog timeout.

Run `systemctl daemon-reload && systemctl restart icinga2` to apply the changes.
Now Systemd will always try to restart Icinga 2 (except if you run
`systemctl stop icinga2`). After three failures in ten seconds it will stop
Expand Down
3 changes: 0 additions & 3 deletions etc/initsystem/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,6 @@ if(NOT WIN32)
PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
)

option (USE_SYSTEMD
"Configure icinga as native systemd service instead of a SysV initscript" OFF)

# required for packaging on Gentoo, see Bug #6498
option (INSTALL_SYSTEMD_SERVICE_AND_INITSCRIPT
"Force install both the systemd service definition file and the SysV initscript in parallel, regardless of how USE_SYSTEMD is set. Only use this for special packaging purposes and if you know what you are doing" OFF)
Expand Down
4 changes: 2 additions & 2 deletions etc/initsystem/icinga2.service.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ Description=Icinga host/service/network monitoring system
After=syslog.target network-online.target postgresql.service mariadb.service carbon-cache.service carbon-relay.service

[Service]
Type=forking
Type=notify
EnvironmentFile=@ICINGA2_SYSCONFIGFILE@
ExecStartPre=@CMAKE_INSTALL_PREFIX@/lib/icinga2/prepare-dirs @ICINGA2_SYSCONFIGFILE@
ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/icinga2 daemon -d -e ${ICINGA2_ERROR_LOG}
ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/icinga2 daemon -e ${ICINGA2_ERROR_LOG}
PIDFile=@ICINGA2_RUNDIR@/icinga2/icinga2.pid
ExecReload=@CMAKE_INSTALL_PREFIX@/lib/icinga2/safe-reload @ICINGA2_SYSCONFIGFILE@
TimeoutStartSec=30m
Expand Down
41 changes: 39 additions & 2 deletions lib/base/application.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,12 @@
#ifdef __linux__
#include <sys/prctl.h>
#endif /* __linux__ */

#ifdef _WIN32
#include <windows.h>
#endif /* _win32 */
#endif /* _WIN32 */
#ifdef HAVE_SYSTEMD
#include <systemd/sd-daemon.h>
#endif /* HAVE_SYSTEMD */

using namespace icinga;

Expand Down Expand Up @@ -315,6 +317,11 @@ void Application::SetArgV(char **argv)
*/
void Application::RunEventLoop()
{

#ifdef HAVE_SYSTEMD
sd_notify(0, "READY=1");
#endif /* HAVE_SYSTEMD */

double lastLoop = Utility::GetTime();

mainloop:
Expand All @@ -331,6 +338,10 @@ void Application::RunEventLoop()
double now = Utility::GetTime();
double timeDiff = lastLoop - now;

#ifdef HAVE_SYSTEMD
sd_notify(0, "WATCHDOG=1");
#endif /* HAVE_SYSTEMD */

if (std::fabs(timeDiff) > 15) {
/* We made a significant jump in time. */
Log(LogInformation, "Application")
Expand All @@ -347,6 +358,10 @@ void Application::RunEventLoop()
if (m_RequestRestart) {
m_RequestRestart = false; // we are now handling the request, once is enough

#ifdef HAVE_SYSTEMD
sd_notify(0, "RELOADING=1");
#endif /* HAVE_SYSTEMD */

// are we already restarting? ignore request if we already are
if (l_Restarting)
goto mainloop;
Expand All @@ -357,6 +372,10 @@ void Application::RunEventLoop()
goto mainloop;
}

#ifdef HAVE_SYSTEMD
sd_notify(0, "STOPPING=1");
#endif /* HAVE_SYSTEMD */

Log(LogInformation, "Application", "Shutting down...");

ConfigObject::StopObjects();
Expand Down Expand Up @@ -712,6 +731,21 @@ void Application::SigUsr1Handler(int)
RequestReopenLogs();
}

/**
 * Signal handler for SIGUSR2.
 *
 * Logs that a reload was requested, then (only when built with HAVE_SYSTEMD)
 * reports the value of m_ReloadProcess to systemd as the new MAINPID, and
 * finally ends this process via Exit(0).
 *
 * @param - The signal number (unused).
 */
void Application::SigUsr2Handler(int)
{
	Log(LogInformation, "Application", "Reload requested, letting new process take over.");

#ifdef HAVE_SYSTEMD
	/* "%lu" expects an unsigned long, so convert explicitly before formatting. */
	const unsigned long successorPid = static_cast<unsigned long>(m_ReloadProcess);
	sd_notifyf(0, "MAINPID=%lu", successorPid);
#endif /* HAVE_SYSTEMD */

	Exit(0);
}

/**
* Signal handler for SIGABRT. Helps with debugging ASSERT()s.
*
Expand Down Expand Up @@ -964,6 +998,9 @@ int Application::Run()

sa.sa_handler = &Application::SigUsr1Handler;
sigaction(SIGUSR1, &sa, nullptr);

sa.sa_handler = &Application::SigUsr2Handler;
sigaction(SIGUSR2, &sa, nullptr);
#else /* _WIN32 */
SetConsoleCtrlHandler(&Application::CtrlHandler, TRUE);
#endif /* _WIN32 */
Expand Down
1 change: 1 addition & 0 deletions lib/base/application.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ class Application : public ObjectImpl<Application> {

static void SigAbrtHandler(int signum);
static void SigUsr1Handler(int signum);
static void SigUsr2Handler(int signum);
static void ExceptionHandler();

static String GetCrashReportFilename();
Expand Down
46 changes: 8 additions & 38 deletions lib/cli/daemoncommand.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,39 +136,6 @@ static bool SetDaemonIO(const String& stderrFile)
return true;
}

/**
* Terminate another process and wait till it has ended
*
* @params target PID of the process to end
*/
static void TerminateAndWaitForEnd(pid_t target)
{
#ifndef _WIN32
// allow 30 seconds timeout
double timeout = Utility::GetTime() + 30;

int ret = kill(target, SIGTERM);

while (Utility::GetTime() < timeout && (ret == 0 || errno != ESRCH)) {
Utility::Sleep(0.1);
ret = kill(target, 0);
}

// timeout and the process still seems to live: update pid and kill it
if (ret == 0 || errno != ESRCH) {
String pidFile = Application::GetPidPath();
std::ofstream fp(pidFile.CStr());
fp << Utility::GetPid();
fp.close();

kill(target, SIGKILL);
}

#else
// TODO: implement this for Win32
#endif /* _WIN32 */
}

String DaemonCommand::GetDescription() const
{
return "Starts Icinga 2.";
Expand Down Expand Up @@ -251,11 +218,14 @@ int DaemonCommand::Run(const po::variables_map& vm, const std::vector<std::strin
}

if (vm.count("reload-internal")) {
int parentpid = vm["reload-internal"].as<int>();
Log(LogInformation, "cli")
<< "Terminating previous instance of Icinga (PID " << parentpid << ")";
TerminateAndWaitForEnd(parentpid);
Log(LogInformation, "cli", "Previous instance has ended, taking over now.");
/* We went through validation and now ask the old process kindly to die */
Log(LogInformation, "cli", "Requesting to take over.");
int rc = kill(vm["reload-internal"].as<int>(), SIGUSR2);
if (rc) {
Log(LogCritical, "Application")
<< "Failed to send signal to \"" << vm["reload-internal"].as<int>() << "\" with " << strerror(errno);
return EXIT_FAILURE;
}
}

if (vm.count("daemonize")) {
Expand Down

0 comments on commit c418a96

Please sign in to comment.