From f6149dd0758fc345c54bec26c07093b2bbceab2f Mon Sep 17 00:00:00 2001 From: Joshua Levy Date: Wed, 22 Jul 2015 16:23:33 -0700 Subject: [PATCH] 0.1.13: Various corner case fixes. Handle stale lock files in non-strict mode. Better tracebacks in debug mode. Timeout on sending data as well in case of bugs etc on monitor. Minor test fixups. --- procdog | 99 +++++++++++++++++++++++++++++-------------- tests/run.sh | 2 + tests/tests-clean.log | 24 +++++++++++ 3 files changed, 94 insertions(+), 31 deletions(-) diff --git a/procdog b/procdog index 4bf5347..b6f7fcd 100755 --- a/procdog +++ b/procdog @@ -22,7 +22,7 @@ For further documentation, see: https://github.com/jlevy/procdog # Created: 2015-03-14 from __future__ import print_function -import socket, sys, os, errno, time, shlex, ConfigParser, argparse, re +import socket, sys, os, errno, time, shlex, ConfigParser, argparse, re, traceback from collections import namedtuple from string import Template from datetime import datetime @@ -35,7 +35,7 @@ except ImportError: import subprocess NAME = "procdog" -VERSION = "0.1.12" +VERSION = "0.1.13" DESCRIPTION = "procdog: Lightweight command-line process control" LONG_DESCRIPTION = __doc__ @@ -77,8 +77,9 @@ HEALTHY_CODE = 0 EOM = "\n" -# Connect timeout to local Unix domain socket. +# Timeouts to connect and send to local Unix domain socket. SOCKET_TIMEOUT = 3 +SEND_TIMEOUT = 10 DEBUG = False @@ -104,6 +105,10 @@ class NotRunningException(StateException): pass +class MonitorNotRunning(NotRunningException): + pass + + class AlreadyRunningException(StateException): pass @@ -244,13 +249,17 @@ def _debug(msg): print("%s (%s): debug: %s" % (NAME, os.getpid(), msg), file=sys.stderr) -def _info(msg): +def _info(msg, exc_info=None): with _LOG_LOCK: - print("%s (%s): %s" % (NAME, os.getpid(), msg), file=sys.stderr) + exc_str = "\n%s" % "".join(traceback.format_exception(*exc_info)) if exc_info else "" + print("%s (%s): %s%s" % (NAME, os.getpid(), msg, exc_str), file=sys.stderr) -def _die(msg, code=1): +def _die(msg, code=1, exceptional=False): print("%s: error: %s" % (NAME, msg), file=sys.stderr) + global DEBUG + if DEBUG: + traceback.print_exc(file=sys.stderr) sys.exit(code) @@ -297,6 +306,7 @@ class _HandleThread(Thread): self.daemon = True # Do not prevent process termination on shutdown. def run(self): + failure_count = 0 while True: connection = self.queue.get(True) try: @@ -304,6 +314,13 @@ class _HandleThread(Thread): _debug("monitor: received command '%s'" % cmd) response = self.handler(cmd) connection.sendall(response + EOM) + except: + # Catch anything not already caught by the handler (e.g. a broken pipe on send). + failure_count += 1 + if failure_count < 5: + _info("monitor: handler exception", exc_info=sys.exc_info()) + else: + raise finally: connection.close() @@ -435,6 +452,7 @@ def daemonize_and_monitor(proc_name, shell_cmd, options): unlink_path(socket_path, silent=False) # Deamonize. If we're debugging, keep all debug output on the console. + global DEBUG is_daemon = daemonize(home_dir=options.dir, do_redirect=not DEBUG, stdout=LOG_PATH_PAT % proc_name) if is_daemon: @@ -469,7 +487,12 @@ def daemonize_and_monitor(proc_name, shell_cmd, options): # Start process, tracking any immediate errors. try: args = shell_cmd if options.shell else shlex.split(shell_cmd) - popen = subprocess.Popen(args, stdin=fin, stdout=fout, stderr=ferr, shell=options.shell, preexec_fn=os.setsid) + popen = subprocess.Popen(args, + stdin=fin, + stdout=fout, + stderr=ferr, + shell=options.shell, + preexec_fn=os.setsid) except OSError as e: error = "Failed to start: %s: %s" % (e, shell_cmd) _info("monitor: %s" % error) @@ -514,8 +537,8 @@ def send_command(proc_name, cmd): # Set a short timeout on connect, in case process not running. Once we connect, disable the timeout. sock.settimeout(SOCKET_TIMEOUT) sock.connect(SOCKET_PATH_PAT % proc_name) - sock.settimeout(None) _debug("client: connected") + sock.settimeout(SEND_TIMEOUT) sock.sendall(cmd + EOM) data = _recv_msg(sock) if not data: @@ -523,8 +546,14 @@ def send_command(proc_name, cmd): raise NotRunningException("no data from monitor") _debug("client: %s: %s" % (cmd, data)) return Response(raw=data) - except socket.error as msg: - raise NotRunningException("socket error: %s" % msg) + except socket.error as e: + # Detect problematic corner cases where monitor has been killed, crashed, tampered with, etc. + # If we get connection refused (61) process can't be running. + # If we get file not found (2), the socket file is missing. + if e.errno == 61 or e.errno == 2: + raise MonitorNotRunning("couldn't connect: %s" % e) + else: + raise NotRunningException("socket error: %s" % e) finally: sock.close() @@ -544,6 +573,23 @@ def _wait_for_connect(proc_name): raise OtherException("timeout connecting to monitor: %s" % exc) +def _start_monitor(proc_name, shell_cmd, options): + daemonize_and_monitor(proc_name, shell_cmd, options) + if options.ensure_healthy: + _debug("monitor: ensure_healthy is set, waiting for health") + resp = send_command(proc_name, CMD_WAIT) + if resp.is_running() and not resp.is_healthy(): + _debug("monitor: ensure_healthy is set and process is still not healthy (%s); stopping" % resp) + resp = send_command(proc_name, CMD_STOP) + elif not resp.is_running(): + _debug("monitor: ensure_healthy is set but process is not running (%s)" % resp) + else: + _debug("monitor: ensure_healthy is set and process is still healthy (%s)" % resp) + else: + resp = send_command(proc_name, CMD_STATUS) + return resp + + def start(proc_name, shell_cmd, options, strict=False): '''Start a process, assigning it the given id. If strict is true, return existing process status; otherwise raise exception if process already is running.''' @@ -553,24 +599,16 @@ def start(proc_name, shell_cmd, options, strict=False): _debug("client: acquired lock: %s: %s" % (lock_acquired, lock_path)) if lock_acquired: - daemonize_and_monitor(proc_name, shell_cmd, options) - if options.ensure_healthy: - _debug("monitor: ensure_healthy is set, waiting for health") - resp = send_command(proc_name, CMD_WAIT) - if resp.is_running() and not resp.is_healthy(): - _debug("monitor: ensure_healthy is set and process is still not healthy (%s); stopping" % resp) - resp = send_command(proc_name, CMD_STOP) - elif not resp.is_running(): - _debug("monitor: ensure_healthy is set but process is not running (%s)" % resp) - else: - _debug("monitor: ensure_healthy is set and process is still healthy (%s)" % resp) - else: - resp = send_command(proc_name, CMD_STATUS) + resp = _start_monitor(proc_name, shell_cmd, options) else: if strict: raise AlreadyRunningException("process already running (to override, remove lock at %s)" % lock_path) else: - resp = send_command(proc_name, CMD_STATUS) + try: + resp = send_command(proc_name, CMD_STATUS) + except MonitorNotRunning as e: + _debug("client: stale lock file present but monitor not listening; restarting new monitor") + resp = _start_monitor(proc_name, shell_cmd, options) return resp @@ -703,9 +741,7 @@ def main(): parser.add_argument("--strict", help="abort with error if asked to start a started process or stop a stopped one", action="store_true") - parser.add_argument("--shell", - help="use shell to fully parse/execute the command string", - action="store_true") + parser.add_argument("--shell", help="use shell to fully parse/execute the command string", action="store_true") parser.add_argument("--config", help="non-default location on config file to read") parser.add_argument("--debug", help="enable debugging output", action="store_true") args = parser.parse_args() @@ -724,7 +760,7 @@ def main(): def was_explicitly_set(action, val): return (type(action) != argparse._StoreTrueAction and type(action) != argparse._StoreFalseAction - ) or val != action.default + ) or val != action.default # These are only the options that were explicitly set. command_line_options = { @@ -774,17 +810,18 @@ def main(): sys.exit(exit_code) except AlreadyRunningException as e: - _die("process '%s' is already running" % args.proc_name, code=EXIT_ALREADY_RUNNING) + _die("process '%s' is already running" % args.proc_name, code=EXIT_ALREADY_RUNNING, exceptional=True) except NotRunningException as e: - _die("process '%s' is not running" % args.proc_name, code=EXIT_NOT_RUNNING) + _die("process '%s' is not running" % args.proc_name, code=EXIT_NOT_RUNNING, exceptional=True) except (ValueError, OSError) as e: - _die(str(e), code=EXIT_UNKNOWN) + _die(str(e), code=EXIT_UNKNOWN, exceptional=True) if __name__ == '__main__': main() # TODO: +# - better logic around where procdog.cfg can be when procdog is installed globally via pip # - handle corner case scenario: # process already started externally without procdog, so process started with procdog fails, # but health check passes immediately since process is listening diff --git a/tests/run.sh b/tests/run.sh index 3052163..4d6048d 100755 --- a/tests/run.sh +++ b/tests/run.sh @@ -18,6 +18,8 @@ cd /tmp/procdog-tests # Update these patterns as appropriate. # Note we use perl not sed, so it works on Mac and Linux. The $|=1; is just for the impatient and ensures line buffering. $dir/tests.sh 2>&1 | tee $full_log \ + | perl -pe '$|=1; s/File ".*\/([a-zA-Z0-9._]+)", line [0-9]*,/File "...\/\1", line __X,/g' \ + | perl -pe '$|=1; s/, line [0-9]*,/, line __X,/g' \ | perl -pe '$|=1; s/pid=[0-9]*/pid=_PID_/g' \ | perl -pe '$|=1; s/[0-9.:T-]*Z/_TIMESTAMP_/g' \ | perl -pe '$|=1; s/[a-zA-Z0-9\/]*procdog.cfg/_PATH_\/procdog.cfg/g' \ diff --git a/tests/tests-clean.log b/tests/tests-clean.log index 7fe5dfd..d828ad6 100644 --- a/tests/tests-clean.log +++ b/tests/tests-clean.log @@ -132,6 +132,14 @@ procdog (_PID_): debug: command-line options: {'strict': True, 'command': 'sleep procdog (_PID_): debug: combined options: Options(command='sleep 5', health_command='true', health_count=5, health_delay=1.0, ensure_healthy=False, dir=None, stdin=None, stdout=None, stderr=None, append=False, linger=3.0, strict=True, shell=False) procdog (_PID_): debug: client: acquired lock: False: /var/tmp/procdog.long.lock procdog: error: process 'long' is already running +Traceback (most recent call last): + File ".../procdog", line __X, in main + response = _execute(args.proc_name, args.control_cmd, options) + File ".../procdog", line __X, in _execute + response = start(proc_name, options.command, options=options, strict=options.strict) + File ".../procdog", line __X, in start + raise AlreadyRunningException("process already running (to override, remove lock at %s)" % lock_path) +AlreadyRunningException: process already running (to override, remove lock at /var/tmp/procdog.long.lock) (got expected error: status 4) procdog stop long @@ -646,6 +654,12 @@ procdog start out1 --command 'echo hello $BADENVVAR' --stdout tmp.stdout.out1 -- procdog (_PID_): debug: command-line options: {'command': 'echo hello $BADENVVAR', 'stderr': 'tmp.stderr.out1', 'stdout': 'tmp.stdout.out1'} procdog (_PID_): debug: combined options: Options(command='echo hello $BADENVVAR', health_command=None, health_count=5, health_delay=1.0, ensure_healthy=False, dir=None, stdin=None, stdout='tmp.stdout.out1', stderr='tmp.stderr.out1', append=False, linger=3.0, strict=False, shell=False) procdog: error: could not expand environment variable names in command 'echo hello $BADENVVAR': 'BADENVVAR' +Traceback (most recent call last): + File ".../procdog", line __X, in main + options = options._replace(command=_expand_variables(options.command), + File ".../procdog", line __X, in _expand_variables + raise ValueError("could not expand environment variable names in command '%s': %s" % (template_str, e)) +ValueError: could not expand environment variable names in command 'echo hello $BADENVVAR': 'BADENVVAR' (got expected error: status 3) # Read from input and write stderr and stdout to same output. @@ -722,6 +736,16 @@ killed, signal=15 procdog start conftest_bad --config $config_file || expect_error procdog: error: Not a boolean: not-valid +Traceback (most recent call last): + File ".../procdog", line __X, in main + file_options = _read_config_file(args.proc_name, config_file, target_types) + File ".../procdog", line __X, in _read_config_file + out[key] = _parse_config(config, proc_name, key, target_types[key]) + File ".../procdog", line __X, in _parse_config + return config.getboolean(section, option) + File ".../ConfigParser.py", line __X, in getboolean + raise ValueError, 'Not a boolean: %s' % v +ValueError: Not a boolean: not-valid (got expected error: status 3) # --- End of tests ---