0.1.13: Various corner case fixes.

Handle stale lock files in non-strict mode. Better tracebacks in debug mode. Timeout on sending data as well in case of bugs etc on monitor. Minor test fixups.
jlevy · Jul 23, 2015 · f6149dd · f6149dd
1 parent e087998
commit f6149dd
Show file tree

Hide file tree

Showing 3 changed files with 94 additions and 31 deletions.
diff --git a/procdog b/procdog
@@ -22,7 +22,7 @@ For further documentation, see: https://github.com/jlevy/procdog
 # Created: 2015-03-14
 
 from __future__ import print_function
-import socket, sys, os, errno, time, shlex, ConfigParser, argparse, re
+import socket, sys, os, errno, time, shlex, ConfigParser, argparse, re, traceback
 from collections import namedtuple
 from string import Template
 from datetime import datetime
@@ -35,7 +35,7 @@ except ImportError:
     import subprocess
 
 NAME = "procdog"
-VERSION = "0.1.12"
+VERSION = "0.1.13"
 DESCRIPTION = "procdog: Lightweight command-line process control"
 LONG_DESCRIPTION = __doc__
 
@@ -77,8 +77,9 @@ HEALTHY_CODE = 0
 
 EOM = "\n"
 
-# Connect timeout to local Unix domain socket.
+# Timeouts to connect and send to local Unix domain socket.
 SOCKET_TIMEOUT = 3
+SEND_TIMEOUT = 10
 
 DEBUG = False
 
@@ -104,6 +105,10 @@ class NotRunningException(StateException):
     pass
 
 
+class MonitorNotRunning(NotRunningException):
+    pass
+
+
 class AlreadyRunningException(StateException):
     pass
 
@@ -244,13 +249,17 @@ def _debug(msg):
             print("%s (%s): debug: %s" % (NAME, os.getpid(), msg), file=sys.stderr)
 
 
-def _info(msg):
+def _info(msg, exc_info=None):
     with _LOG_LOCK:
-        print("%s (%s): %s" % (NAME, os.getpid(), msg), file=sys.stderr)
+        exc_str = "\n%s" % "".join(traceback.format_exception(*exc_info)) if exc_info else ""
+        print("%s (%s): %s%s" % (NAME, os.getpid(), msg, exc_str), file=sys.stderr)
 
 
-def _die(msg, code=1):
+def _die(msg, code=1, exceptional=False):
     print("%s: error: %s" % (NAME, msg), file=sys.stderr)
+    global DEBUG
+    if DEBUG:
+        traceback.print_exc(file=sys.stderr)
     sys.exit(code)
 
 
@@ -297,13 +306,21 @@ class _HandleThread(Thread):
         self.daemon = True  # Do not prevent process termination on shutdown.
 
     def run(self):
+        failure_count = 0
         while True:
             connection = self.queue.get(True)
             try:
                 cmd = _recv_msg(connection)
                 _debug("monitor: received command '%s'" % cmd)
                 response = self.handler(cmd)
                 connection.sendall(response + EOM)
+            except:
+                # Catch anything not already caught by the handler (e.g. a broken pipe on send).
+                failure_count += 1
+                if failure_count < 5:
+                    _info("monitor: handler exception", exc_info=sys.exc_info())
+                else:
+                    raise
             finally:
                 connection.close()
 
@@ -435,6 +452,7 @@ def daemonize_and_monitor(proc_name, shell_cmd, options):
     unlink_path(socket_path, silent=False)
 
     # Deamonize. If we're debugging, keep all debug output on the console.
+    global DEBUG
     is_daemon = daemonize(home_dir=options.dir, do_redirect=not DEBUG, stdout=LOG_PATH_PAT % proc_name)
 
     if is_daemon:
@@ -469,7 +487,12 @@ def daemonize_and_monitor(proc_name, shell_cmd, options):
             # Start process, tracking any immediate errors.
             try:
                 args = shell_cmd if options.shell else shlex.split(shell_cmd)
-                popen = subprocess.Popen(args, stdin=fin, stdout=fout, stderr=ferr, shell=options.shell, preexec_fn=os.setsid)
+                popen = subprocess.Popen(args,
+                                         stdin=fin,
+                                         stdout=fout,
+                                         stderr=ferr,
+                                         shell=options.shell,
+                                         preexec_fn=os.setsid)
             except OSError as e:
                 error = "Failed to start: %s: %s" % (e, shell_cmd)
                 _info("monitor: %s" % error)
@@ -514,17 +537,23 @@ def send_command(proc_name, cmd):
         # Set a short timeout on connect, in case process not running. Once we connect, disable the timeout.
         sock.settimeout(SOCKET_TIMEOUT)
         sock.connect(SOCKET_PATH_PAT % proc_name)
-        sock.settimeout(None)
         _debug("client: connected")
+        sock.settimeout(SEND_TIMEOUT)
         sock.sendall(cmd + EOM)
         data = _recv_msg(sock)
         if not data:
             # A rare race or a bug.
             raise NotRunningException("no data from monitor")
         _debug("client: %s: %s" % (cmd, data))
         return Response(raw=data)
-    except socket.error as msg:
-        raise NotRunningException("socket error: %s" % msg)
+    except socket.error as e:
+        # Detect problematic corner cases where monitor has been killed, crashed, tampered with, etc.
+        # If we get connection refused (61) process can't be running.
+        # If we get file not found (2), the socket file is missing.
+        if e.errno == 61 or e.errno == 2:
+            raise MonitorNotRunning("couldn't connect: %s" % e)
+        else:
+            raise NotRunningException("socket error: %s" % e)
     finally:
         sock.close()
 
@@ -544,6 +573,23 @@ def _wait_for_connect(proc_name):
     raise OtherException("timeout connecting to monitor: %s" % exc)
 
 
+def _start_monitor(proc_name, shell_cmd, options):
+    daemonize_and_monitor(proc_name, shell_cmd, options)
+    if options.ensure_healthy:
+        _debug("monitor: ensure_healthy is set, waiting for health")
+        resp = send_command(proc_name, CMD_WAIT)
+        if resp.is_running() and not resp.is_healthy():
+            _debug("monitor: ensure_healthy is set and process is still not healthy (%s); stopping" % resp)
+            resp = send_command(proc_name, CMD_STOP)
+        elif not resp.is_running():
+            _debug("monitor: ensure_healthy is set but process is not running (%s)" % resp)
+        else:
+            _debug("monitor: ensure_healthy is set and process is still healthy (%s)" % resp)
+    else:
+        resp = send_command(proc_name, CMD_STATUS)
+    return resp
+
+
 def start(proc_name, shell_cmd, options, strict=False):
     '''Start a process, assigning it the given id. If strict is true, return existing process status;
      otherwise raise exception if process already is running.'''
@@ -553,24 +599,16 @@ def start(proc_name, shell_cmd, options, strict=False):
     _debug("client: acquired lock: %s: %s" % (lock_acquired, lock_path))
 
     if lock_acquired:
-        daemonize_and_monitor(proc_name, shell_cmd, options)
-        if options.ensure_healthy:
-            _debug("monitor: ensure_healthy is set, waiting for health")
-            resp = send_command(proc_name, CMD_WAIT)
-            if resp.is_running() and not resp.is_healthy():
-                _debug("monitor: ensure_healthy is set and process is still not healthy (%s); stopping" % resp)
-                resp = send_command(proc_name, CMD_STOP)
-            elif not resp.is_running():
-                _debug("monitor: ensure_healthy is set but process is not running (%s)" % resp)
-            else:
-                _debug("monitor: ensure_healthy is set and process is still healthy (%s)" % resp)
-        else:
-            resp = send_command(proc_name, CMD_STATUS)
+        resp = _start_monitor(proc_name, shell_cmd, options)
     else:
         if strict:
             raise AlreadyRunningException("process already running (to override, remove lock at %s)" % lock_path)
         else:
-            resp = send_command(proc_name, CMD_STATUS)
+            try:
+                resp = send_command(proc_name, CMD_STATUS)
+            except MonitorNotRunning as e:
+                _debug("client: stale lock file present but monitor not listening; restarting new monitor")
+                resp = _start_monitor(proc_name, shell_cmd, options)
     return resp
 
 
@@ -703,9 +741,7 @@ def main():
     parser.add_argument("--strict",
                         help="abort with error if asked to start a started process or stop a stopped one",
                         action="store_true")
-    parser.add_argument("--shell",
-                        help="use shell to fully parse/execute the command string",
-                        action="store_true")
+    parser.add_argument("--shell", help="use shell to fully parse/execute the command string", action="store_true")
     parser.add_argument("--config", help="non-default location on config file to read")
     parser.add_argument("--debug", help="enable debugging output", action="store_true")
     args = parser.parse_args()
@@ -724,7 +760,7 @@ def main():
 
         def was_explicitly_set(action, val):
             return (type(action) != argparse._StoreTrueAction and type(action) != argparse._StoreFalseAction
-              ) or val != action.default
+                    ) or val != action.default
 
         # These are only the options that were explicitly set.
         command_line_options = {
@@ -774,17 +810,18 @@ def main():
         sys.exit(exit_code)
 
     except AlreadyRunningException as e:
-        _die("process '%s' is already running" % args.proc_name, code=EXIT_ALREADY_RUNNING)
+        _die("process '%s' is already running" % args.proc_name, code=EXIT_ALREADY_RUNNING, exceptional=True)
     except NotRunningException as e:
-        _die("process '%s' is not running" % args.proc_name, code=EXIT_NOT_RUNNING)
+        _die("process '%s' is not running" % args.proc_name, code=EXIT_NOT_RUNNING, exceptional=True)
     except (ValueError, OSError) as e:
-        _die(str(e), code=EXIT_UNKNOWN)
+        _die(str(e), code=EXIT_UNKNOWN, exceptional=True)
 
 
 if __name__ == '__main__':
     main()
 
 # TODO:
+# - better logic around where procdog.cfg can be when procdog is installed globally via pip
 # - handle corner case scenario:
 #     process already started externally without procdog, so process started with procdog fails,
 #     but health check passes immediately since process is listening

diff --git a/tests/run.sh b/tests/run.sh
@@ -18,6 +18,8 @@ cd /tmp/procdog-tests
 # Update these patterns as appropriate.
 # Note we use perl not sed, so it works on Mac and Linux. The $|=1; is just for the impatient and ensures line buffering.
 $dir/tests.sh 2>&1 | tee $full_log \
+  | perl -pe '$|=1; s/File ".*\/([a-zA-Z0-9._]+)", line [0-9]*,/File "...\/\1", line __X,/g' \
+  | perl -pe '$|=1; s/, line [0-9]*,/, line __X,/g' \
   | perl -pe '$|=1; s/pid=[0-9]*/pid=_PID_/g' \
   | perl -pe '$|=1; s/[0-9.:T-]*Z/_TIMESTAMP_/g' \
   | perl -pe '$|=1; s/[a-zA-Z0-9\/]*procdog.cfg/_PATH_\/procdog.cfg/g' \

diff --git a/tests/tests-clean.log b/tests/tests-clean.log
@@ -132,6 +132,14 @@ procdog (_PID_): debug: command-line options: {'strict': True, 'command': 'sleep
 procdog (_PID_): debug: combined options: Options(command='sleep 5', health_command='true', health_count=5, health_delay=1.0, ensure_healthy=False, dir=None, stdin=None, stdout=None, stderr=None, append=False, linger=3.0, strict=True, shell=False)
 procdog (_PID_): debug: client: acquired lock: False: /var/tmp/procdog.long.lock
 procdog: error: process 'long' is already running
+Traceback (most recent call last):
+  File ".../procdog", line __X, in main
+    response = _execute(args.proc_name, args.control_cmd, options)
+  File ".../procdog", line __X, in _execute
+    response = start(proc_name, options.command, options=options, strict=options.strict)
+  File ".../procdog", line __X, in start
+    raise AlreadyRunningException("process already running (to override, remove lock at %s)" % lock_path)
+AlreadyRunningException: process already running (to override, remove lock at /var/tmp/procdog.long.lock)
 (got expected error: status 4)
 
 procdog stop long
@@ -646,6 +654,12 @@ procdog start out1 --command 'echo hello $BADENVVAR' --stdout tmp.stdout.out1 --
 procdog (_PID_): debug: command-line options: {'command': 'echo hello $BADENVVAR', 'stderr': 'tmp.stderr.out1', 'stdout': 'tmp.stdout.out1'}
 procdog (_PID_): debug: combined options: Options(command='echo hello $BADENVVAR', health_command=None, health_count=5, health_delay=1.0, ensure_healthy=False, dir=None, stdin=None, stdout='tmp.stdout.out1', stderr='tmp.stderr.out1', append=False, linger=3.0, strict=False, shell=False)
 procdog: error: could not expand environment variable names in command 'echo hello $BADENVVAR': 'BADENVVAR'
+Traceback (most recent call last):
+  File ".../procdog", line __X, in main
+    options = options._replace(command=_expand_variables(options.command),
+  File ".../procdog", line __X, in _expand_variables
+    raise ValueError("could not expand environment variable names in command '%s': %s" % (template_str, e))
+ValueError: could not expand environment variable names in command 'echo hello $BADENVVAR': 'BADENVVAR'
 (got expected error: status 3)
 
 # Read from input and write stderr and stdout to same output.
@@ -722,6 +736,16 @@ killed, signal=15
 
 procdog start conftest_bad --config $config_file || expect_error
 procdog: error: Not a boolean: not-valid
+Traceback (most recent call last):
+  File ".../procdog", line __X, in main
+    file_options = _read_config_file(args.proc_name, config_file, target_types)
+  File ".../procdog", line __X, in _read_config_file
+    out[key] = _parse_config(config, proc_name, key, target_types[key])
+  File ".../procdog", line __X, in _parse_config
+    return config.getboolean(section, option)
+  File ".../ConfigParser.py", line __X, in getboolean
+    raise ValueError, 'Not a boolean: %s' % v
+ValueError: Not a boolean: not-valid
 (got expected error: status 3)
 
 # --- End of tests ---