Skip to content

Commit

Permalink
0.1.13: Various corner case fixes.
Browse files Browse the repository at this point in the history
Handle stale lock files in non-strict mode.
Better tracebacks in debug mode.
Timeout on sending data as well in case of bugs etc on monitor.
Minor test fixups.
  • Loading branch information
jlevy committed Jul 23, 2015
1 parent e087998 commit f6149dd
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 31 deletions.
99 changes: 68 additions & 31 deletions procdog
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ For further documentation, see: https://github.com/jlevy/procdog
# Created: 2015-03-14

from __future__ import print_function
import socket, sys, os, errno, time, shlex, ConfigParser, argparse, re
import socket, sys, os, errno, time, shlex, ConfigParser, argparse, re, traceback
from collections import namedtuple
from string import Template
from datetime import datetime
Expand All @@ -35,7 +35,7 @@ except ImportError:
import subprocess

NAME = "procdog"
VERSION = "0.1.12"
VERSION = "0.1.13"
DESCRIPTION = "procdog: Lightweight command-line process control"
LONG_DESCRIPTION = __doc__

Expand Down Expand Up @@ -77,8 +77,9 @@ HEALTHY_CODE = 0

EOM = "\n"

# Connect timeout to local Unix domain socket.
# Timeouts to connect and send to local Unix domain socket.
SOCKET_TIMEOUT = 3
SEND_TIMEOUT = 10

DEBUG = False

Expand All @@ -104,6 +105,10 @@ class NotRunningException(StateException):
pass


class MonitorNotRunning(NotRunningException):
pass


class AlreadyRunningException(StateException):
pass

Expand Down Expand Up @@ -244,13 +249,17 @@ def _debug(msg):
print("%s (%s): debug: %s" % (NAME, os.getpid(), msg), file=sys.stderr)


def _info(msg):
def _info(msg, exc_info=None):
with _LOG_LOCK:
print("%s (%s): %s" % (NAME, os.getpid(), msg), file=sys.stderr)
exc_str = "\n%s" % "".join(traceback.format_exception(*exc_info)) if exc_info else ""
print("%s (%s): %s%s" % (NAME, os.getpid(), msg, exc_str), file=sys.stderr)


def _die(msg, code=1):
def _die(msg, code=1, exceptional=False):
print("%s: error: %s" % (NAME, msg), file=sys.stderr)
global DEBUG
if DEBUG:
traceback.print_exc(file=sys.stderr)
sys.exit(code)


Expand Down Expand Up @@ -297,13 +306,21 @@ class _HandleThread(Thread):
self.daemon = True # Do not prevent process termination on shutdown.

def run(self):
failure_count = 0
while True:
connection = self.queue.get(True)
try:
cmd = _recv_msg(connection)
_debug("monitor: received command '%s'" % cmd)
response = self.handler(cmd)
connection.sendall(response + EOM)
except:
# Catch anything not already caught by the handler (e.g. a broken pipe on send).
failure_count += 1
if failure_count < 5:
_info("monitor: handler exception", exc_info=sys.exc_info())
else:
raise
finally:
connection.close()

Expand Down Expand Up @@ -435,6 +452,7 @@ def daemonize_and_monitor(proc_name, shell_cmd, options):
unlink_path(socket_path, silent=False)

# Deamonize. If we're debugging, keep all debug output on the console.
global DEBUG
is_daemon = daemonize(home_dir=options.dir, do_redirect=not DEBUG, stdout=LOG_PATH_PAT % proc_name)

if is_daemon:
Expand Down Expand Up @@ -469,7 +487,12 @@ def daemonize_and_monitor(proc_name, shell_cmd, options):
# Start process, tracking any immediate errors.
try:
args = shell_cmd if options.shell else shlex.split(shell_cmd)
popen = subprocess.Popen(args, stdin=fin, stdout=fout, stderr=ferr, shell=options.shell, preexec_fn=os.setsid)
popen = subprocess.Popen(args,
stdin=fin,
stdout=fout,
stderr=ferr,
shell=options.shell,
preexec_fn=os.setsid)
except OSError as e:
error = "Failed to start: %s: %s" % (e, shell_cmd)
_info("monitor: %s" % error)
Expand Down Expand Up @@ -514,17 +537,23 @@ def send_command(proc_name, cmd):
# Set a short timeout on connect, in case process not running. Once we connect, disable the timeout.
sock.settimeout(SOCKET_TIMEOUT)
sock.connect(SOCKET_PATH_PAT % proc_name)
sock.settimeout(None)
_debug("client: connected")
sock.settimeout(SEND_TIMEOUT)
sock.sendall(cmd + EOM)
data = _recv_msg(sock)
if not data:
# A rare race or a bug.
raise NotRunningException("no data from monitor")
_debug("client: %s: %s" % (cmd, data))
return Response(raw=data)
except socket.error as msg:
raise NotRunningException("socket error: %s" % msg)
except socket.error as e:
# Detect problematic corner cases where monitor has been killed, crashed, tampered with, etc.
# If we get connection refused (61) process can't be running.
# If we get file not found (2), the socket file is missing.
if e.errno == 61 or e.errno == 2:
raise MonitorNotRunning("couldn't connect: %s" % e)
else:
raise NotRunningException("socket error: %s" % e)
finally:
sock.close()

Expand All @@ -544,6 +573,23 @@ def _wait_for_connect(proc_name):
raise OtherException("timeout connecting to monitor: %s" % exc)


def _start_monitor(proc_name, shell_cmd, options):
daemonize_and_monitor(proc_name, shell_cmd, options)
if options.ensure_healthy:
_debug("monitor: ensure_healthy is set, waiting for health")
resp = send_command(proc_name, CMD_WAIT)
if resp.is_running() and not resp.is_healthy():
_debug("monitor: ensure_healthy is set and process is still not healthy (%s); stopping" % resp)
resp = send_command(proc_name, CMD_STOP)
elif not resp.is_running():
_debug("monitor: ensure_healthy is set but process is not running (%s)" % resp)
else:
_debug("monitor: ensure_healthy is set and process is still healthy (%s)" % resp)
else:
resp = send_command(proc_name, CMD_STATUS)
return resp


def start(proc_name, shell_cmd, options, strict=False):
'''Start a process, assigning it the given id. If strict is true, return existing process status;
otherwise raise exception if process already is running.'''
Expand All @@ -553,24 +599,16 @@ def start(proc_name, shell_cmd, options, strict=False):
_debug("client: acquired lock: %s: %s" % (lock_acquired, lock_path))

if lock_acquired:
daemonize_and_monitor(proc_name, shell_cmd, options)
if options.ensure_healthy:
_debug("monitor: ensure_healthy is set, waiting for health")
resp = send_command(proc_name, CMD_WAIT)
if resp.is_running() and not resp.is_healthy():
_debug("monitor: ensure_healthy is set and process is still not healthy (%s); stopping" % resp)
resp = send_command(proc_name, CMD_STOP)
elif not resp.is_running():
_debug("monitor: ensure_healthy is set but process is not running (%s)" % resp)
else:
_debug("monitor: ensure_healthy is set and process is still healthy (%s)" % resp)
else:
resp = send_command(proc_name, CMD_STATUS)
resp = _start_monitor(proc_name, shell_cmd, options)
else:
if strict:
raise AlreadyRunningException("process already running (to override, remove lock at %s)" % lock_path)
else:
resp = send_command(proc_name, CMD_STATUS)
try:
resp = send_command(proc_name, CMD_STATUS)
except MonitorNotRunning as e:
_debug("client: stale lock file present but monitor not listening; restarting new monitor")
resp = _start_monitor(proc_name, shell_cmd, options)
return resp


Expand Down Expand Up @@ -703,9 +741,7 @@ def main():
parser.add_argument("--strict",
help="abort with error if asked to start a started process or stop a stopped one",
action="store_true")
parser.add_argument("--shell",
help="use shell to fully parse/execute the command string",
action="store_true")
parser.add_argument("--shell", help="use shell to fully parse/execute the command string", action="store_true")
parser.add_argument("--config", help="non-default location on config file to read")
parser.add_argument("--debug", help="enable debugging output", action="store_true")
args = parser.parse_args()
Expand All @@ -724,7 +760,7 @@ def main():

def was_explicitly_set(action, val):
return (type(action) != argparse._StoreTrueAction and type(action) != argparse._StoreFalseAction
) or val != action.default
) or val != action.default

# These are only the options that were explicitly set.
command_line_options = {
Expand Down Expand Up @@ -774,17 +810,18 @@ def main():
sys.exit(exit_code)

except AlreadyRunningException as e:
_die("process '%s' is already running" % args.proc_name, code=EXIT_ALREADY_RUNNING)
_die("process '%s' is already running" % args.proc_name, code=EXIT_ALREADY_RUNNING, exceptional=True)
except NotRunningException as e:
_die("process '%s' is not running" % args.proc_name, code=EXIT_NOT_RUNNING)
_die("process '%s' is not running" % args.proc_name, code=EXIT_NOT_RUNNING, exceptional=True)
except (ValueError, OSError) as e:
_die(str(e), code=EXIT_UNKNOWN)
_die(str(e), code=EXIT_UNKNOWN, exceptional=True)


if __name__ == '__main__':
main()

# TODO:
# - better logic around where procdog.cfg can be when procdog is installed globally via pip
# - handle corner case scenario:
# process already started externally without procdog, so process started with procdog fails,
# but health check passes immediately since process is listening
Expand Down
2 changes: 2 additions & 0 deletions tests/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ cd /tmp/procdog-tests
# Update these patterns as appropriate.
# Note we use perl not sed, so it works on Mac and Linux. The $|=1; is just for the impatient and ensures line buffering.
$dir/tests.sh 2>&1 | tee $full_log \
| perl -pe '$|=1; s/File ".*\/([a-zA-Z0-9._]+)", line [0-9]*,/File "...\/\1", line __X,/g' \
| perl -pe '$|=1; s/, line [0-9]*,/, line __X,/g' \
| perl -pe '$|=1; s/pid=[0-9]*/pid=_PID_/g' \
| perl -pe '$|=1; s/[0-9.:T-]*Z/_TIMESTAMP_/g' \
| perl -pe '$|=1; s/[a-zA-Z0-9\/]*procdog.cfg/_PATH_\/procdog.cfg/g' \
Expand Down
24 changes: 24 additions & 0 deletions tests/tests-clean.log
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,14 @@ procdog (_PID_): debug: command-line options: {'strict': True, 'command': 'sleep
procdog (_PID_): debug: combined options: Options(command='sleep 5', health_command='true', health_count=5, health_delay=1.0, ensure_healthy=False, dir=None, stdin=None, stdout=None, stderr=None, append=False, linger=3.0, strict=True, shell=False)
procdog (_PID_): debug: client: acquired lock: False: /var/tmp/procdog.long.lock
procdog: error: process 'long' is already running
Traceback (most recent call last):
File ".../procdog", line __X, in main
response = _execute(args.proc_name, args.control_cmd, options)
File ".../procdog", line __X, in _execute
response = start(proc_name, options.command, options=options, strict=options.strict)
File ".../procdog", line __X, in start
raise AlreadyRunningException("process already running (to override, remove lock at %s)" % lock_path)
AlreadyRunningException: process already running (to override, remove lock at /var/tmp/procdog.long.lock)
(got expected error: status 4)

procdog stop long
Expand Down Expand Up @@ -646,6 +654,12 @@ procdog start out1 --command 'echo hello $BADENVVAR' --stdout tmp.stdout.out1 --
procdog (_PID_): debug: command-line options: {'command': 'echo hello $BADENVVAR', 'stderr': 'tmp.stderr.out1', 'stdout': 'tmp.stdout.out1'}
procdog (_PID_): debug: combined options: Options(command='echo hello $BADENVVAR', health_command=None, health_count=5, health_delay=1.0, ensure_healthy=False, dir=None, stdin=None, stdout='tmp.stdout.out1', stderr='tmp.stderr.out1', append=False, linger=3.0, strict=False, shell=False)
procdog: error: could not expand environment variable names in command 'echo hello $BADENVVAR': 'BADENVVAR'
Traceback (most recent call last):
File ".../procdog", line __X, in main
options = options._replace(command=_expand_variables(options.command),
File ".../procdog", line __X, in _expand_variables
raise ValueError("could not expand environment variable names in command '%s': %s" % (template_str, e))
ValueError: could not expand environment variable names in command 'echo hello $BADENVVAR': 'BADENVVAR'
(got expected error: status 3)

# Read from input and write stderr and stdout to same output.
Expand Down Expand Up @@ -722,6 +736,16 @@ killed, signal=15

procdog start conftest_bad --config $config_file || expect_error
procdog: error: Not a boolean: not-valid
Traceback (most recent call last):
File ".../procdog", line __X, in main
file_options = _read_config_file(args.proc_name, config_file, target_types)
File ".../procdog", line __X, in _read_config_file
out[key] = _parse_config(config, proc_name, key, target_types[key])
File ".../procdog", line __X, in _parse_config
return config.getboolean(section, option)
File ".../ConfigParser.py", line __X, in getboolean
raise ValueError, 'Not a boolean: %s' % v
ValueError: Not a boolean: not-valid
(got expected error: status 3)

# --- End of tests ---

0 comments on commit f6149dd

Please sign in to comment.