Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[201803] [services] Restart SwSS service upon unexpected critical process exit #2546

Merged
merged 6 commits into from
Feb 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 18 additions & 20 deletions dockers/docker-dhcp-relay/wait_for_intf.sh.j2
Original file line number Diff line number Diff line change
@@ -1,42 +1,40 @@
#!/usr/bin/env bash

function wait_until_iface_ready
{
IFACE=$1
STATE_DB_IDX="6"

echo "Waiting until interface $IFACE is up..."

# Wait for the interface to come up (i.e., 'ip link show' returns 0)
until ip link show dev $IFACE up > /dev/null 2>&1; do
sleep 1
done
PORT_TABLE_PREFIX="PORT_TABLE"
VLAN_TABLE_PREFIX="VLAN_TABLE"
LAG_TABLE_PREFIX="LAG_TABLE"

echo "Interface $IFACE is up"
function wait_until_iface_ready
{
TABLE_PREFIX=$1
IFACE=$2

echo "Waiting until interface $IFACE has an IPv4 address..."
echo "Waiting until interface $IFACE is ready..."

# Wait until the interface gets assigned an IPv4 address
# Wait for the interface to come up
# (i.e., interface is present in STATE_DB and state is "ok")
while true; do
IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1)

if [ -n "$IP" ]; then
RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null)
if [ x"$RESULT" == x"ok" ]; then
break
fi

sleep 1
done

echo "Interface $IFACE is configured with IP $IP"
echo "Interface ${IFACE} is ready!"
}


# Wait for all interfaces to come up and have IPv4 addresses assigned
# Wait for all interfaces to be up and ready
{% for (name, prefix) in INTERFACE %}
wait_until_iface_ready {{ name }}
wait_until_iface_ready ${PORT_TABLE_PREFIX} {{ name }}
{% endfor %}
{% for (name, prefix) in VLAN_INTERFACE %}
wait_until_iface_ready {{ name }}
wait_until_iface_ready ${VLAN_TABLE_PREFIX} {{ name }}
{% endfor %}
{% for (name, prefix) in PORTCHANNEL_INTERFACE %}
wait_until_iface_ready {{ name }}
wait_until_iface_ready ${LAG_TABLE_PREFIX} {{ name }}
{% endfor %}
2 changes: 2 additions & 0 deletions dockers/docker-orchagent/Dockerfile.j2
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ COPY ["files/arp_update", "/usr/bin"]
COPY ["enable_counters.py", "/usr/bin"]
COPY ["start.sh", "orchagent.sh", "swssconfig.sh", "/usr/bin/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor/"]

## Copy all Jinja2 template files into the templates folder
COPY ["*.j2", "/usr/share/sonic/templates/"]
Expand Down
7 changes: 7 additions & 0 deletions dockers/docker-orchagent/critical_processes
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
orchagent
portsyncd
intfsyncd
neighsyncd
vlanmgrd
intfmgrd
buffermgrd
8 changes: 7 additions & 1 deletion dockers/docker-orchagent/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener
events=PROCESS_STATE_EXITED
autostart=true
autorestart=unexpected

jleveque marked this conversation as resolved.
Show resolved Hide resolved
[program:start.sh]
command=/usr/bin/start.sh
priority=1
Expand All @@ -15,7 +21,7 @@ stderr_logfile=syslog
command=/usr/sbin/rsyslogd -n
priority=2
autostart=false
autorestart=false
autorestart=unexpected
stdout_logfile=syslog
stderr_logfile=syslog

Expand Down
2 changes: 1 addition & 1 deletion files/build_templates/dhcp_relay.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh attach
ExecStop=/usr/bin/{{ docker_container_name }}.sh stop

[Install]
WantedBy=multi-user.target teamd.service
WantedBy=multi-user.target swss.service teamd.service
2 changes: 1 addition & 1 deletion files/build_templates/radv.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh attach
ExecStop=/usr/bin/{{ docker_container_name }}.sh stop

[Install]
WantedBy=multi-user.target
WantedBy=multi-user.target swss.service
3 changes: 3 additions & 0 deletions files/build_templates/snmp.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,6 @@ Before=ntp-config.service
ExecStartPre=/usr/bin/{{docker_container_name}}.sh start
ExecStart=/usr/bin/{{docker_container_name}}.sh attach
ExecStop=/usr/bin/{{docker_container_name}}.sh stop

[Install]
WantedBy=multi-user.target swss.service
4 changes: 4 additions & 0 deletions files/build_templates/swss.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ After=opennsl-modules-3.16.0-6-amd64.service
After=nps-modules-3.16.0-6-amd64.service
{% endif %}
Before=ntp-config.service
StartLimitInterval=1200
StartLimitBurst=3

[Service]
User=root
Expand Down Expand Up @@ -52,6 +54,8 @@ ExecStopPost=/usr/bin/mst stop
ExecStopPost=/etc/init.d/xpnet.sh stop
ExecStopPost=/etc/init.d/xpnet.sh start
{% endif %}
Restart=always
RestartSec=30

[Install]
WantedBy=multi-user.target
6 changes: 3 additions & 3 deletions files/build_templates/teamd.service.j2
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[Unit]
Description=TEAMD container
Requires=updategraph.service
After=updategraph.service
Requires=updategraph.service swss.service
After=updategraph.service swss.service
Before=ntp-config.service

[Service]
Expand All @@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{docker_container_name}}.sh attach
ExecStop=/usr/bin/{{docker_container_name}}.sh stop

[Install]
WantedBy=multi-user.target
WantedBy=multi-user.target swss.service
45 changes: 45 additions & 0 deletions files/scripts/supervisor-proc-exit-listener
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env python

import os
import signal
import sys
import syslog

from supervisor import childutils

# Contents of file should be the names of critical processes (as defined in
# the supervisord.conf file), one per line
CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'

def main():
    """Listen for supervisor events and terminate the parent on a critical exit.

    Loads the critical process names from CRITICAL_PROCESSES_FILE, then loops
    forever reading event notifications from stdin. When a
    PROCESS_STATE_EXITED event reports an unexpected exit (expected == 0) of a
    process named in that file, a message is written to syslog and SIGTERM is
    sent to the parent process (presumably supervisord, which launches this
    listener -- confirm against the deployment).

    This function does not return under normal operation.
    """
    # Read the list of critical processes from a file. Surrounding whitespace
    # is stripped and blank lines are skipped, so an empty line cannot add a
    # bogus '' entry and a trailing space or CR cannot prevent a name from
    # matching.
    with open(CRITICAL_PROCESSES_FILE, 'r') as f:
        critical_processes = [line.strip() for line in f if line.strip()]

    while True:
        # Transition from ACKNOWLEDGED to READY
        childutils.listener.ready()

        # Each notification is a header line followed by a payload whose
        # size is given by the header's 'len' field.
        line = sys.stdin.readline()
        headers = childutils.get_headers(line)
        payload = sys.stdin.read(int(headers['len']))

        # Transition from READY to ACKNOWLEDGED
        childutils.listener.ok()

        # We only care about PROCESS_STATE_EXITED events
        if headers['eventname'] == 'PROCESS_STATE_EXITED':
            payload_headers, _ = childutils.eventdata(payload + '\n')

            expected = int(payload_headers['expected'])
            processname = payload_headers['processname']

            # If a critical process exited unexpectedly, terminate supervisor
            if expected == 0 and processname in critical_processes:
                MSG_FORMAT_STR = "Process {} exited unexpectedly. Terminating supervisor..."
                msg = MSG_FORMAT_STR.format(processname)
                syslog.syslog(syslog.LOG_INFO, msg)
                os.kill(os.getppid(), signal.SIGTERM)

if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion platform/broadcom/docker-orchagent-brcm.mk
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_BRCM)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_BRCM)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw

$(DOCKER_ORCHAGENT_BRCM)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_BRCM)_FILES += $(ARP_UPDATE_SCRIPT)
$(DOCKER_ORCHAGENT_BRCM)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
2 changes: 1 addition & 1 deletion platform/cavium/docker-orchagent-cavm.mk
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_CAVM)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_CAVM)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw

$(DOCKER_ORCHAGENT_CAVM)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_CAVM)_FILES += $(ARP_UPDATE_SCRIPT)
$(DOCKER_ORCHAGENT_CAVM)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
2 changes: 1 addition & 1 deletion platform/centec/docker-orchagent-centec.mk
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_CENTEC)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_CENTEC)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw

$(DOCKER_ORCHAGENT_CENTEC)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_CENTEC)_FILES += $(ARP_UPDATE_SCRIPT)
$(DOCKER_ORCHAGENT_CENTEC)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
2 changes: 1 addition & 1 deletion platform/marvell/docker-orchagent-mrvl.mk
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ $(DOCKER_ORCHAGENT_MRVL)_RUN_OPT += -v /host/machine.conf:/host/machine.conf
$(DOCKER_ORCHAGENT_MRVL)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro

$(DOCKER_ORCHAGENT_MRVL)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_MRVL)_FILES += $(ARP_UPDATE_SCRIPT)
$(DOCKER_ORCHAGENT_MRVL)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
2 changes: 1 addition & 1 deletion platform/mellanox/docker-orchagent-mlnx.mk
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_MLNX)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_MLNX)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw

$(DOCKER_ORCHAGENT_MLNX)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_MLNX)_FILES += $(ARP_UPDATE_SCRIPT)
$(DOCKER_ORCHAGENT_MLNX)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
2 changes: 1 addition & 1 deletion platform/nephos/docker-orchagent-nephos.mk
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_NEPHOS)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_NEPHOS)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw

$(DOCKER_ORCHAGENT_NEPHOS)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_NEPHOS)_FILES += $(ARP_UPDATE_SCRIPT)
$(DOCKER_ORCHAGENT_NEPHOS)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
2 changes: 1 addition & 1 deletion rules/docker-dhcp-relay.mk
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

DOCKER_DHCP_RELAY = docker-dhcp-relay.gz
$(DOCKER_DHCP_RELAY)_PATH = $(DOCKERS_PATH)/docker-dhcp-relay
$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) $(ISC_DHCP_CLIENT)
$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) $(ISC_DHCP_CLIENT) $(REDIS_TOOLS)
$(DOCKER_DHCP_RELAY)_LOAD_DOCKERS = $(DOCKER_CONFIG_ENGINE)
SONIC_DOCKER_IMAGES += $(DOCKER_DHCP_RELAY)
SONIC_INSTALL_DOCKER_IMAGES += $(DOCKER_DHCP_RELAY)
Expand Down
6 changes: 5 additions & 1 deletion rules/scripts.mk
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ $(ARP_UPDATE_SCRIPT)_PATH = files/scripts
CONFIGDB_LOAD_SCRIPT = configdb-load.sh
$(CONFIGDB_LOAD_SCRIPT)_PATH = files/scripts

SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT = supervisor-proc-exit-listener
$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)_PATH = files/scripts

SONIC_COPY_FILES += $(CONFIGDB_LOAD_SCRIPT) \
$(ARP_UPDATE_SCRIPT)
$(ARP_UPDATE_SCRIPT) \
$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)


50 changes: 24 additions & 26 deletions src/sonic-config-engine/tests/sample_output/wait_for_intf.sh
Original file line number Diff line number Diff line change
@@ -1,43 +1,41 @@
#!/usr/bin/env bash

function wait_until_iface_ready
{
IFACE=$1
STATE_DB_IDX="6"

echo "Waiting until interface $IFACE is up..."

# Wait for the interface to come up (i.e., 'ip link show' returns 0)
until ip link show dev $IFACE up > /dev/null 2>&1; do
sleep 1
done
PORT_TABLE_PREFIX="PORT_TABLE"
VLAN_TABLE_PREFIX="VLAN_TABLE"
LAG_TABLE_PREFIX="LAG_TABLE"

echo "Interface $IFACE is up"
function wait_until_iface_ready
{
TABLE_PREFIX=$1
IFACE=$2

echo "Waiting until interface $IFACE has an IPv4 address..."
echo "Waiting until interface $IFACE is ready..."

# Wait until the interface gets assigned an IPv4 address
# Wait for the interface to come up
# (i.e., interface is present in STATE_DB and state is "ok")
while true; do
IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1)

if [ -n "$IP" ]; then
RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null)
if [ x"$RESULT" == x"ok" ]; then
break
fi

sleep 1
done

echo "Interface $IFACE is configured with IP $IP"
echo "Interface ${IFACE} is ready!"
}


# Wait for all interfaces to come up and have IPv4 addresses assigned
wait_until_iface_ready Vlan1000
wait_until_iface_ready PortChannel04
wait_until_iface_ready PortChannel02
wait_until_iface_ready PortChannel03
wait_until_iface_ready PortChannel03
wait_until_iface_ready PortChannel01
wait_until_iface_ready PortChannel02
wait_until_iface_ready PortChannel04
wait_until_iface_ready PortChannel01
# Wait for all interfaces to be up and ready
wait_until_iface_ready ${VLAN_TABLE_PREFIX} Vlan1000
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01