Skip to content

Commit

Permalink
[sonic-package-manager] support warm/fast reboot for extension packag…
Browse files Browse the repository at this point in the history
…es (sonic-net#1554)

- What I did

Implemented functionality in the SONiC package manager to support packages which require special handling for fast and warm reboots. For more details refer to HLD - https://github.com/stepanblyschak/SONiC/blob/sonic-app-ext-3/doc/sonic-application-extention/sonic-application-extention-hld.md#warmboot-and-fastboot-design-impact.

- How I did it

I extended the manifest with warm/fast shutdown fields and added logic that accounts for a package's special requirements on fast/warm reboot. Fast/warm reboot scripts are enhanced to read the ordered list of services from a file on the filesystem instead of having the list of services hardcoded in the script. This file is regenerated when a package is installed/uninstalled/upgraded, and it is also generated once during build time. Similarly, the warmboot-finalizer service is enhanced to read a file on the filesystem listing the processes that perform reconciliation.

- How to verify it

There is an open example extension I pushed to Docker Hub stepanblischak/cpu-report:warm.
It can be installed on the switch:

admin@sonic:~$ sudo sonic-package-manager show package manifest --from-repository stepanblischak/cpu-report:warm | grep warm -A 6
        "warm-shutdown": {
            "after": [
                "swss"
            ],
            "before": [
                "syncd"
            ]
admin@sonic:~$ sudo sonic-package-manager install --from-repository stepanblischak/cpu-report:warm -y -v DEBUG
Then perform warm-reboot and observe that cpu-report is stopped at the right place in shutdown sequence:

admin@sonic:~$ sudo warm-reboot -v
sudo warm-reboot -v
Wed 31 Mar 2021 12:54:10 PM UTC Saving counters folder before warmboot...
Wed 31 Mar 2021 12:54:13 PM UTC Prepare MLNX ASIC to fastfast-reboot: install new FW if required
Wed 31 Mar 2021 12:54:15 PM UTC Pausing orchagent ...
Wed 31 Mar 2021 12:54:15 PM UTC Collecting logs to check ssd health before fastfast-reboot...
Wed 31 Mar 2021 12:54:15 PM UTC Stopping lldp ...
Wed 31 Mar 2021 12:54:17 PM UTC Stopped lldp
Wed 31 Mar 2021 12:54:17 PM UTC Stopping nat ...
Dumping conntrack entries failed
Wed 31 Mar 2021 12:54:18 PM UTC Stopped nat
Wed 31 Mar 2021 12:54:18 PM UTC Stopping radv ...
Wed 31 Mar 2021 12:54:18 PM UTC Stopped radv
Wed 31 Mar 2021 12:54:18 PM UTC Stopping sflow ...
Wed 31 Mar 2021 12:54:18 PM UTC Stopped sflow
Wed 31 Mar 2021 12:54:18 PM UTC Stopping bgp ...
Wed 31 Mar 2021 12:54:22 PM UTC Stopped bgp
Wed 31 Mar 2021 12:54:22 PM UTC Stopping swss ...
Wed 31 Mar 2021 12:54:31 PM UTC Stopped swss
Wed 31 Mar 2021 12:54:31 PM UTC Initialize pre-shutdown ...
Wed 31 Mar 2021 12:54:31 PM UTC Requesting pre-shutdown ...
Wed 31 Mar 2021 12:54:32 PM UTC Waiting for pre-shutdown ...
Wed 31 Mar 2021 12:54:41 PM UTC Pre-shutdown succeeded, state: pre-shutdown-succeeded ...
Wed 31 Mar 2021 12:54:41 PM UTC Backing up database ...
Wed 31 Mar 2021 12:54:41 PM UTC Stopping cpu-report...
Wed 31 Mar 2021 12:54:41 PM UTC Stopped cpu-report
Wed 31 Mar 2021 12:54:41 PM UTC Stopping teamd ...
Wed 31 Mar 2021 12:54:48 PM UTC Stopped teamd
Wed 31 Mar 2021 12:54:48 PM UTC Stopping syncd ...
Wed 31 Mar 2021 12:54:51 PM UTC Stopped syncd
Wed 31 Mar 2021 12:54:51 PM UTC Stopping all remaining containers ...
Wed 31 Mar 2021 12:54:53 PM UTC Stopped all remaining containers ...
Wed 31 Mar 2021 12:54:55 PM UTC Enabling Watchdog before fastfast-reboot
Watchdog armed for 180 seconds
Wed 31 Mar 2021 12:54:56 PM UTC Rebooting with /sbin/kexec -e to SONiC-OS-master.0-ae9ccf39 ...
  • Loading branch information
stepanblyschak authored Jul 2, 2021
1 parent 793b847 commit 4818360
Show file tree
Hide file tree
Showing 11 changed files with 465 additions and 116 deletions.
12 changes: 10 additions & 2 deletions config/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2087,20 +2087,28 @@ def warm_restart(ctx, redis_unix_socket_path):
ctx.obj = {'db': config_db, 'state_db': state_db, 'prefix': prefix}

@warm_restart.command('enable')
@click.argument('module', metavar='<module>', default='system', required=False, type=click.Choice(["system", "swss", "bgp", "teamd"]))
@click.argument('module', metavar='<module>', default='system', required=False)
@click.pass_context
def warm_restart_enable(ctx, module):
state_db = ctx.obj['state_db']
config_db = ctx.obj['db']
feature_table = config_db.get_table('FEATURE')
if module != 'system' and module not in feature_table:
exit('Feature {} is unknown'.format(module))
prefix = ctx.obj['prefix']
_hash = '{}{}'.format(prefix, module)
state_db.set(state_db.STATE_DB, _hash, 'enable', 'true')
state_db.close(state_db.STATE_DB)

@warm_restart.command('disable')
@click.argument('module', metavar='<module>', default='system', required=False, type=click.Choice(["system", "swss", "bgp", "teamd"]))
@click.argument('module', metavar='<module>', default='system', required=False)
@click.pass_context
def warm_restart_enable(ctx, module):
state_db = ctx.obj['state_db']
config_db = ctx.obj['db']
feature_table = config_db.get_table('FEATURE')
if module != 'system' and module not in feature_table:
exit('Feature {} is unknown'.format(module))
prefix = ctx.obj['prefix']
_hash = '{}{}'.format(prefix, module)
state_db.set(state_db.STATE_DB, _hash, 'enable', 'false')
Expand Down
119 changes: 55 additions & 64 deletions scripts/fast-reboot
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ WARM_DIR=/host/warmboot
REDIS_FILE=dump.rdb
REBOOT_SCRIPT_NAME=$(basename $0)
REBOOT_TYPE="${REBOOT_SCRIPT_NAME}"
SHUTDOWN_ORDER_FILE="/etc/sonic/${REBOOT_TYPE}_order"
VERBOSE=no
FORCE=no
IGNORE_ASIC=no
Expand Down Expand Up @@ -567,82 +568,72 @@ if [ -x ${LOG_SSD_HEALTH} ]; then
fi
# Kill nat docker after saving the conntrack table
debug "Stopping nat ..."
/usr/local/bin/dump_nat_entries.py
docker kill nat > /dev/null || true
systemctl stop nat
debug "Stopped nat ..."
# Kill radv before stopping BGP service to prevent announcing our departure.
debug "Stopping radv service..."
systemctl stop radv
debug "Stopped radv service..."
# Kill bgpd to start the bgp graceful restart procedure
debug "Stopping bgp ..."
systemctl stop bgp
debug "Stopped bgp ..."
# Kill sflow docker
debug "Stopping sflow ..."
container kill sflow &> /dev/null || debug "Docker sflow is not running ($?) ..."
systemctl stop sflow
debug "Stopped sflow ..."
# Kill lldp, otherwise it sends information about reboot.
# We call `docker kill lldp` to ensure the container stops as quickly as possible,
# then immediately call `systemctl stop lldp` to prevent the service from
# restarting the container automatically.
container kill lldp &> /dev/null || debug "Docker lldp is not running ($?) ..."
systemctl stop lldp
if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then
debug "Stopping teamd ..."
systemctl stop teamd
debug "Stopped teamd ..."
if [[ -f ${SHUTDOWN_ORDER_FILE} ]]; then
SERVICES_TO_STOP="$(cat ${SHUTDOWN_ORDER_FILE})"
else
# TODO: to be removed once sonic-buildimage change is in
if [[ "${REBOOT_TYPE}" == "fast-reboot" ]]; then
SERVICES_TO_STOP="nat radv bgp sflow lldp swss teamd syncd"
elif [[ "${REBOOT_TYPE}" == "fastfast-reboot" || "${REBOOT_TYPE}" == "warm-reboot" ]]; then
SERVICES_TO_STOP="nat radv bgp sflow lldp teamd swss syncd"
else
error "Unexpected reboot type ${REBOOT_TYPE}"
exit $EXIT_FAILURE
fi
fi
debug "Stopping swss service ..."
systemctl stop swss
debug "Stopped swss service ..."
for service in ${SERVICES_TO_STOP}; do
debug "Stopping ${service} ..."
if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
# Pre-shutdown syncd
initialize_pre_shutdown
# TODO: These exceptions for nat, sflow, lldp
# have to be coded in corresponding service scripts
if [[ "x$sonic_asic_type" == x"mellanox" ]]; then
check_issu_bank_file
if [[ "${service}" = "nat" ]]; then
/usr/local/bin/dump_nat_entries.py
fi
request_pre_shutdown
wait_for_pre_shutdown_complete_or_fail
if [[ "x$sonic_asic_type" == x"mellanox" ]]; then
check_issu_bank_file
if [[ "${service}" = "nat" || "${service}" = "sflow" || "${service}" = "lldp" ]]; then
container kill "${service}" &> /dev/null || debug "Docker ${service} is not running ($?) ..."
fi
# Warm reboot: dump state to host disk
if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
sonic-db-cli ASIC_DB FLUSHDB > /dev/null
sonic-db-cli COUNTERS_DB FLUSHDB > /dev/null
sonic-db-cli FLEX_COUNTER_DB FLUSHDB > /dev/null
if [[ "${service}" = "syncd" ]]; then
systemctl stop ${service} || debug "Ignore stopping ${service} service error $?"
else
systemctl stop ${service}
fi
# TODO: backup_database preserves FDB_TABLE
# need to cleanup as well for fastfast boot case
backup_database
debug "Stopped ${service}"
# Stop teamd gracefully
debug "Stopping teamd ..."
systemctl stop teamd
debug "Stopped teamd ..."
fi
if [[ "${service}" = "swss" ]]; then
if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
# Pre-shutdown syncd
initialize_pre_shutdown
if [[ "x$sonic_asic_type" == x"mellanox" ]]; then
check_issu_bank_file
fi
debug "Stopping syncd ..."
systemctl stop syncd || debug "Ignore stopping syncd service error $?"
debug "Stopped syncd ..."
request_pre_shutdown
wait_for_pre_shutdown_complete_or_fail
if [[ "x$sonic_asic_type" == x"mellanox" ]]; then
check_issu_bank_file
fi
# Warm reboot: dump state to host disk
if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
sonic-db-cli ASIC_DB FLUSHDB > /dev/null
sonic-db-cli COUNTERS_DB FLUSHDB > /dev/null
sonic-db-cli FLEX_COUNTER_DB FLUSHDB > /dev/null
fi
# TODO: backup_database preserves FDB_TABLE
# need to cleanup as well for fastfast boot case
backup_database
fi
fi
done
# Kill other containers to make the reboot faster
# We call `docker kill ...` to ensure the container stops as quickly as possible,
Expand Down
15 changes: 15 additions & 0 deletions scripts/generate_shutdown_order.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/python3

''' This script is used to generate initial warm/fast shutdown order file '''

from sonic_package_manager import PackageManager

def main():
    ''' Generate the warm/fast shutdown order files for the installed packages. '''

    package_manager = PackageManager.get_manager()
    packages = package_manager.get_installed_packages()
    print('installed packages {}'.format(packages))

    # The service creator owned by the manager writes the shutdown
    # sequence files from the set of installed packages.
    creator = package_manager.service_creator
    creator.generate_shutdown_sequence_files(packages)
    print('Done.')

if __name__ == '__main__':
main()
8 changes: 5 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@
'scripts/fdbshow',
'scripts/gearboxutil',
'scripts/generate_dump',
'scripts/generate_shutdown_order.py',
'scripts/intfutil',
'scripts/intfstat',
'scripts/ipintutil',
Expand Down Expand Up @@ -187,9 +188,10 @@
'sonic-py-common',
'sonic-yang-mgmt',
'swsssdk>=2.0.1',
'tabulate>=0.8.2',
'www-authenticate>=0.9.2',
'xmltodict>=0.12.0',
'tabulate==0.8.2',
'toposort==1.6',
'www-authenticate==0.9.2',
'xmltodict==0.12.0',
],
setup_requires= [
'pytest-runner',
Expand Down
38 changes: 33 additions & 5 deletions sonic_package_manager/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pkgutil
import tempfile
from inspect import signature
from typing import Any, Iterable, Callable, Dict, Optional
from typing import Any, Iterable, List, Callable, Dict, Optional

import docker
import filelock
Expand Down Expand Up @@ -375,6 +375,14 @@ def install_from_source(self,
self.service_creator.create(package, state=feature_state, owner=default_owner)
exits.callback(rollback(self.service_creator.remove, package))

self.service_creator.generate_shutdown_sequence_files(
self._get_installed_packages_and(package)
)
exits.callback(rollback(
self.service_creator.generate_shutdown_sequence_files,
self.get_installed_packages())
)

if not skip_host_plugins:
self._install_cli_plugins(package)
exits.callback(rollback(self._uninstall_cli_plugins, package))
Expand Down Expand Up @@ -429,6 +437,9 @@ def uninstall(self, name: str, force=False):
try:
self._uninstall_cli_plugins(package)
self.service_creator.remove(package)
self.service_creator.generate_shutdown_sequence_files(
self._get_installed_packages_except(package)
)

# Clean containers based on this image
containers = self.docker.ps(filters={'ancestor': package.image_id},
Expand Down Expand Up @@ -525,8 +536,8 @@ def upgrade_from_source(self,
old_package, 'start'))

self.service_creator.remove(old_package, deregister_feature=False)
exits.callback(rollback(self.service_creator.create,
old_package, register_feature=False))
exits.callback(rollback(self.service_creator.create, old_package,
register_feature=False))

# Clean containers based on the old image
containers = self.docker.ps(filters={'ancestor': old_package.image_id},
Expand All @@ -538,6 +549,14 @@ def upgrade_from_source(self,
exits.callback(rollback(self.service_creator.remove, new_package,
register_feature=False))

self.service_creator.generate_shutdown_sequence_files(
self._get_installed_packages_and(new_package)
)
exits.callback(rollback(
self.service_creator.generate_shutdown_sequence_files,
self._get_installed_packages_and(old_package))
)

if self.feature_registry.is_feature_enabled(new_feature):
self._systemctl_action(new_package, 'start')
exits.callback(rollback(self._systemctl_action,
Expand Down Expand Up @@ -818,10 +837,19 @@ def get_installed_packages(self) -> Dict[str, Package]:
"""

return {
entry.name: self.get_installed_package(entry.name)
for entry in self.database if entry.installed
entry.name: entry for entry in self.get_installed_packages_list()
}

def get_installed_packages_list(self) -> List[Package]:
""" Returns a list of installed packages.
Returns:
Installed packages list.
"""

return [self.get_installed_package(entry.name)
for entry in self.database if entry.installed]

def _migrate_package_database(self, old_package_database: PackageDatabase):
""" Performs part of package migration process.
For every package in old_package_database that is not listed in current
Expand Down
25 changes: 19 additions & 6 deletions sonic_package_manager/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,10 @@ class ManifestRoot(ManifestNode):

def marshal(self, value: Optional[dict]):
result = {}
if value is None:
value = {}
value = value or {}

if not isinstance(value, dict):
raise ManifestError(f'"{self.key}" field has to be a dictionary')

for item in self.items:
next_value = value.get(item.key)
Expand All @@ -115,7 +117,7 @@ def marshal(self, value):
if value is None:
if self.default is not None:
return self.default
raise ManifestError(f'{self.key} is a required field but it is missing')
raise ManifestError(f'"{self.key}" is a required field but it is missing')
try:
return_value = self.type.marshal(value)
except Exception as err:
Expand All @@ -130,10 +132,12 @@ class ManifestArray(ManifestNode):
type: Any

def marshal(self, value):
if value is None:
return []

return_value = []
value = value or []

if not isinstance(value, list):
raise ManifestError(f'"{self.key}" has to be of type list')

try:
for item in value:
return_value.append(self.type.marshal(item))
Expand Down Expand Up @@ -173,6 +177,14 @@ def unmarshal(self, value):
ManifestField('asic-service', DefaultMarshaller(bool), False),
ManifestField('host-service', DefaultMarshaller(bool), True),
ManifestField('delayed', DefaultMarshaller(bool), False),
ManifestRoot('warm-shutdown', [
ManifestArray('after', DefaultMarshaller(str)),
ManifestArray('before', DefaultMarshaller(str)),
]),
ManifestRoot('fast-shutdown', [
ManifestArray('after', DefaultMarshaller(str)),
ManifestArray('before', DefaultMarshaller(str)),
]),
]),
ManifestRoot('container', [
ManifestField('privileged', DefaultMarshaller(bool), False),
Expand All @@ -187,6 +199,7 @@ def unmarshal(self, value):
]),
ManifestArray('processes', ManifestRoot('processes', [
ManifestField('name', DefaultMarshaller(str)),
ManifestField('reconciles', DefaultMarshaller(bool), False),
])),
ManifestRoot('cli', [
ManifestField('mandatory', DefaultMarshaller(bool), False),
Expand Down
Loading

0 comments on commit 4818360

Please sign in to comment.