Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[telemetry] Rotate streaming telemetry secrets. #9600

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
a921b8e
[telemetry] Roll over streaming telemetry secrets by ACMS.
yozhao101 Dec 21, 2021
7da4005
[telemetry] Fix the indent in the script `telemetry.sh`.
yozhao101 Dec 21, 2021
ede36b9
[telemetry] Fix the indent issue in the script `telemetry.sh`.
yozhao101 Dec 21, 2021
7c3c7d3
[telemetry] Add the field of `buffer_size=1024` in supervisord config…
yozhao101 Dec 21, 2021
90d9d03
[telemetry] Fix the indent issue.
yozhao101 Dec 21, 2021
0bfcec9
[telemetry] Fix a syntax error.
yozhao101 Dec 21, 2021
9db5b04
[telemetry] Fix a syntax error.
yozhao101 Dec 21, 2021
b1844b7
[telemetry] Install `inotify` module.
yozhao101 Jan 6, 2022
e28f64d
[telemetry] Install the inotify.
yozhao101 Jan 7, 2022
536fc4c
[telemetry] Install the `inotify` in telemetry container.
yozhao101 Jan 10, 2022
17c4d60
[telemetry] Use `inotify` module to monitor the rotation of certifica…
yozhao101 Jan 10, 2022
59dfe9b
[telemetry] Remove the unused import from script.
yozhao101 Jan 10, 2022
ce9f713
[telemetry] Use the rotation of certificate file as an indicator.
yozhao101 Jan 12, 2022
68bd0b6
[telemetry] Change the script to be executable.
yozhao101 Jan 12, 2022
6ad0ef5
[telemetry] Remove the unused import.
yozhao101 Jan 12, 2022
6ce4e4d
[telemetry] Change the script name.
yozhao101 Jan 12, 2022
6512a6f
[telemetry] Checks whether both certificate and private key files exi…
yozhao101 Jan 18, 2022
c6e2fdd
[telemetry] Reload telemetry server configuration by sending a `SIGHU…
yozhao101 Jan 25, 2022
3dbb79d
[telemetry] Fix the undefinied fields issue in telemetry server start…
yozhao101 Jan 26, 2022
0881704
[telemetry] Fix the indentation.
yozhao101 Jan 26, 2022
41377a1
[telemetry] Fix the indentation issue.
yozhao101 Jan 26, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion dockers/docker-sonic-telemetry/Dockerfile.j2
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ RUN apt-get clean -y && \
apt-get autoremove -y && \
rm -rf /debs

COPY ["start.sh", "telemetry.sh", "dialout.sh", "/usr/bin/"]
# Install the python `inotify` package
RUN pip3 install inotify

COPY ["start.sh", "telemetry.sh", "dialout.sh", "certificate_rollover_checker", "/usr/bin/"]
COPY ["telemetry_vars.j2", "/usr/share/sonic/templates/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
Expand Down
225 changes: 225 additions & 0 deletions dockers/docker-sonic-telemetry/certificate_rotation_checker
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
#!/usr/bin/env python3

"""
certificate_rotation_checker

This script will be leveraged to periodically check whether the certificate and private key
files of streaming telemetry were rotated by dSMS service or not. The streaming telemetry
server process will be restarted if the certificate and private key are rotated by dSMS service
and then updated by the acms agent running in ACMS container.
"""

import os
import subprocess
import sys
import syslog
import time

import inotify.adapters

from swsscommon import swsscommon

MAX_RETRY_TIMES = 10
CERTIFICATE_CHECKING_INTERVAL_SECS = 3600

CREDENTIALS_DIR_PATH = "/etc/sonic/credentials/"


def get_command_result(command):
"""Executes the command and returns the exiting code and resulting output.

Args:
command: A string contains the command to be executed.

Returns:
An integer indicates the exiting code.
A string which contains the output of command.
"""
command_stdout = ""
command_stderr = ""

try:
proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
shell=True, universal_newlines=True)
command_stdout, command_stderr = proc_instance.communicate()
except (OSError, ValueError) as err:
syslog.syslog(syslog.LOG_ERR, "Failed to execute the command '{}'. Error: '{}'"
.format(command, err))
return 2, command_stderr

return proc_instance.returncode, command_stdout.strip()


def check_telemetry_server_running():
"""Checkes whether telemetry server process is running.

Args:
None.

Returns:
None.
"""
processes_status_cmd = "supervisorctl status"
retry_times = 0
is_running = False

while retry_times <= MAX_RETRY_TIMES:
retry_times += 1
exit_code, command_stdout = get_command_result(processes_status_cmd)
if exit_code != 3:
syslog.syslog(syslog.LOG_INFO,
"Failed to get the processes running status in telemetry container and retry after 60 seconds ...")
time.sleep(60)
else:
for line in command_stdout.splitlines():
if "telemetry" in line and "RUNNING" in line:
is_running = True
break
if is_running:
syslog.syslog(syslog.LOG_INFO,
"Telemetry server process is running after certificate and private key were rotated!")
break

if not is_running:
syslog.syslog(syslog.LOG_ERR,
"Telemetry server process is not running after certificate and private key were rotated and exiting ...")
sys.exit(1)


def restart_telemetry_server():
"""Restarts the telemetry server process by Supervisord and then checks
it is actually running.

Args:
None

Returns:
None
"""
restart_telemetry_server_cmd = "supervisorctl restart telemetry"
retry_times = 0

while retry_times <= MAX_RETRY_TIMES:
retry_times += 1
exit_code, command_stdout = get_command_result(restart_telemetry_server_cmd)
if exit_code != 0:
syslog.syslog(syslog.LOG_INFO,
"Failed to restart telemetry server process and retry after 60 seconds ...")
time.sleep(60)
else:
break

if retry_times > MAX_RETRY_TIMES:
syslog.syslog(syslog.LOG_ERR,
"Failed to restart telemetry server process after trying '{}' times and exiting ..."
.format(MAX_RETRY_TIMES))
sys.exit(2)

check_telemetry_server_running()


def check_certificate_rotated(certificate_file_name, private_key_file_name):
"""Leverages the 'inotify' module to monitor the file system events under the
directory which stores the SONiC credentials and restarts telemetry server
process if certificate and private key were rotated.


Args:
certificate_file_name: A string indicates the telemetry certificate file name.
private_key_file_name: A string indicates the telemetry private key file name.

Returns:
None.
"""
certificate_file_rotated = False
private_key_file_rotated = False

inotify_instance = inotify.adapters.Inotify()
inotify_instance.add_watch(CREDENTIALS_DIR_PATH)
for event in inotify_instance.event_gen(yield_nones=False):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

inotify_instance

In extreme case, the file is deleted by a malicious user, will the inotify_instance still working? I think its link to inode, and deleting file will destroy the inode.

If this is true, a crash is better than a dead loop.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The inotify is inode based and it will monitor the credentials directory /etc/sonic/credentials/ to see whether the telemetry certificate file was rotated or not. If certificate file was deleted by accidentally, the inotify_instance will not be impacted.

I updated the PR to log an error message if the certificate was deleted. What I am thinking is if the certificate was restored later, then it can be treated as a kind of rotation operation and the telemetry server will be restarted by this script.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If certificate file was deleted by accidentally, what is the expected behavior?

I am considering in this case, we can kill telemetry daemon.

header, event_type, monitoring_path, file_name = event
if (file_name == certificate_file_name
Copy link
Collaborator

@qiluo-msft qiluo-msft Jan 11, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if

The FSM logic is complex and may be messed up by some input sequence. Could you use one file as the main indicator, and always rotate if that file changed.

ref: https://github.com/Azure/sonic-restapi/blob/94805a39ac0712219f7dc08faa2cfdbf371dd177/go-server-server/main.go#L103 #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated and use the rotation of certificate file as the main indicator.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please make sure describe the main file in document? This is very critical design assumption and the cert rotator should treat it as a contract.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the design document.

and ("IN_CREATE" in event_type or "IN_MOVED_TO" in event_type)):
certificate_file_rotated = True
if (file_name == private_key_file_name
and ("IN_CREATE" in event_type or "IN_MOVED_TO" in event_type)):
private_key_file_rotated = True

if certificate_file_rotated and private_key_file_rotated:
certificate_file_rotated = False
private_key_file_rotated = False
syslog.syslog(syslog.LOG_INFO,
"Certificate and private key were rotated and restarting telemetry server process ...")
restart_telemetry_server()

# Wait for specified seconds and then do the next round checking
Copy link
Collaborator

@qiluo-msft qiluo-msft Jan 11, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait for specified second

You don't need sleep in an event loop. #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially what I am thinking is since the directory /etc/sonic/credentials/ will store all the certificates and keys for different applications in the future and if one or several certificates under this directory are rotated too frequently (by accident?), this event loop will be executed continuously and then burn too much CPU cycles potentially. Another thinking the polling frequency of our internal tool to get the update from certificate storage is 8 ~ 24 hours. That's why I put a sleep function at here.

I will let inotify to monitor the certificate file directly instead of monitoring the /etc/sonic/credentials/ and then we can remove this sleep function in event loop.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got your motivation, and the considering may be helpful. However, event you sleep, the event will be queued and processed after each sleep, you don't save too much?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree and this will not save too much!

syslog.syslog(syslog.LOG_INFO,
"Sleeping '{}' seconds before doing the next round certifcate rotation checking ..."
.format(CERTIFICATE_CHECKING_INTERVAL_SECS))
time.sleep(CERTIFICATE_CHECKING_INTERVAL_SECS)


def certificate_rotated_checker():
"""Checks rotation of certificate and key files and restart streaming telemetry server if necessary.

Leverages 'inotify' module to check whether the certificate and private key files of
streaming telemetry were already rotated by dSMS service and updated by acms agent running
in ACMS container. The streaming telemetry server process will be restarted if they were rotated.

Args:
None

Returns:
None
"""
certificate_file_path = ""
private_key_file_path = ""
certificate_file_name = ""
private_key_file_name = ""

config_db = swsscommon.DBConnector("CONFIG_DB", 0)
telemetry_table = swsscommon.Table(config_db, "TELEMETRY")
telemetry_table_keys = telemetry_table.getKeys()
if "certs" in telemetry_table_keys:
certs_info = dict(telemetry_table.get("certs")[1])
if "server_crt" in certs_info and "server_key" in certs_info:
certificate_file_path = certs_info["server_crt"]
private_key_file_path = certs_info["server_key"]
syslog.syslog(syslog.LOG_INFO, "Path of certificate file is '{}'".format(certificate_file_path))
syslog.syslog(syslog.LOG_INFO, "Path of key file is '{}'".format(private_key_file_path))
else:
syslog.syslog(syslog.LOG_ERR,
"Failed to retrieve the path of certificate and key file from 'TELEMETRY' table!")
sys.exit(3)
else:
syslog.syslog(syslog.LOG_ERR,
"Failed to retrieve the certificate and key information from 'TELEMETRY' table!")
sys.exit(4)

while True:
Copy link
Collaborator

@qiluo-msft qiluo-msft Jan 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

while True:

The intention of this loop is to wait until two files exist. why only check one file? #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated and checks the existence of both two files.

if not os.path.exists(certificate_file_path) or not os.path.exists(private_key_file_path):
syslog.syslog(syslog.LOG_ERR,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LOG_ERR

Instead of error message and keep loop, it is better to just simply kill telemetry daemon.

"Certificate or key file did not exist on device and sleep '{}' seconds to check again ..."
.format(CERTIFICATE_CHECKING_INTERVAL_SECS))
time.sleep(CERTIFICATE_CHECKING_INTERVAL_SECS)
else:
break

certificate_file_name = certificate_file_path.strip().split("/")[-1]
private_key_file_name = private_key_file_path.strip().split("/")[-1]
syslog.syslog(syslog.LOG_INFO, "cer_file_name: {}, key_file_name: {}".format(certificate_file_name, private_key_file_name))
if not certificate_file_name or not private_key_file_name:
syslog.syslog(syslog.LOG_ERR,
"Failed to retrieve the file name of certificate or private key!")
sys.exit(5)

check_certificate_rotated(certificate_file_name, private_key_file_name)


def main():
certificate_rotated_checker()


if __name__ == "__main__":
main()
sys.exit(0)
1 change: 1 addition & 0 deletions dockers/docker-sonic-telemetry/critical_processes
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
program:telemetry
program:dialout
program:certificate_rollover_checker
10 changes: 10 additions & 0 deletions dockers/docker-sonic-telemetry/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,13 @@ stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=telemetry:running

[program:certificate_rollover_checker]
command=/usr/bin/certificate_rollover_checker
priority=5
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=start:exited
62 changes: 37 additions & 25 deletions dockers/docker-sonic-telemetry/telemetry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,35 +19,47 @@ CERTS=$(echo $TELEMETRY_VARS | jq -r '.certs')
TELEMETRY_ARGS=" -logtostderr"
export CVL_SCHEMA_PATH=/usr/sbin/schema

if [ -n "$CERTS" ]; then
SERVER_CRT=$(echo $CERTS | jq -r '.server_crt')
SERVER_KEY=$(echo $CERTS | jq -r '.server_key')
if [ -z $SERVER_CRT ] || [ -z $SERVER_KEY ]; then
TELEMETRY_ARGS+=" --insecure"
else
TELEMETRY_ARGS+=" --server_crt $SERVER_CRT --server_key $SERVER_KEY "
fi
while true
do
if [ -n "$CERTS" ]; then
SERVER_CRT=$(echo $CERTS | jq -r '.server_crt')
SERVER_KEY=$(echo $CERTS | jq -r '.server_key')
CA_CRT=$(echo $CERTS | jq -r '.ca_crt')

CA_CRT=$(echo $CERTS | jq -r '.ca_crt')
if [ ! -z $CA_CRT ]; then
TELEMETRY_ARGS+=" --ca_crt $CA_CRT"
fi
elif [ -n "$X509" ]; then
SERVER_CRT=$(echo $X509 | jq -r '.server_crt')
SERVER_KEY=$(echo $X509 | jq -r '.server_key')
if [ -z $SERVER_CRT ] || [ -z $SERVER_KEY ]; then
TELEMETRY_ARGS+=" --insecure"
logger "Trying to retrieve server certificate, key and Root CA certificate ..."
logger "The file path of server certificate in CONFIG_DB is: $SERVER_CRT"
logger "The file path of server provate key in CONFIG_DB is: $SERVER_KEY"
logger "The file path of Root CA certificate in CONFIG_DB is: $CA_CRT"

if [[ -f $SERVER_CRT && -f $SERVER_KEY && -f $CA_CRT ]]; then
logger "Succeeded in retrieving server certificate, key and Root CA certificate."
TELEMETRY_ARGS+=" --server_crt $SERVER_CRT --server_key $SERVER_KEY --ca_crt $CA_CRT"
break
else
logger "Failed to retrieve server certificate, key or Root CA certificate!"
fi
elif [ -n "$X509" ]; then
SERVER_CRT=$(echo $X509 | jq -r '.server_crt')
SERVER_KEY=$(echo $X509 | jq -r '.server_key')
if [ -z $SERVER_CRT ] || [ -z $SERVER_KEY ]; then
TELEMETRY_ARGS+=" --insecure"
else
TELEMETRY_ARGS+=" --server_crt $SERVER_CRT --server_key $SERVER_KEY "
fi

CA_CRT=$(echo $X509 | jq -r '.ca_crt')
if [ ! -z $CA_CRT ]; then
TELEMETRY_ARGS+=" --ca_crt $CA_CRT"
fi
break
else
TELEMETRY_ARGS+=" --server_crt $SERVER_CRT --server_key $SERVER_KEY "
TELEMETRY_ARGS+=" --noTLS"
break
fi

CA_CRT=$(echo $X509 | jq -r '.ca_crt')
if [ ! -z $CA_CRT ]; then
TELEMETRY_ARGS+=" --ca_crt $CA_CRT"
fi
else
TELEMETRY_ARGS+=" --noTLS"
fi
logger "Sleeping 3600 seconds and checks the existence of secrets again ..."
sleep 3600
done

# If no configuration entry exists for TELEMETRY, create one default port
if [ -z "$GNMI" ]; then
Expand Down
6 changes: 3 additions & 3 deletions src/sonic-config-engine/minigraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -1633,9 +1633,9 @@ def parse_xml(filename, platform=None, port_config_file=None, asic_name=None, hw
'log_level': '2'
},
'certs': {
'server_crt': '/etc/sonic/telemetry/streamingtelemetryserver.cer',
'server_key': '/etc/sonic/telemetry/streamingtelemetryserver.key',
'ca_crt': '/etc/sonic/telemetry/dsmsroot.cer'
'server_crt': '/etc/sonic/credentials/streamingtelemetryserver.cer',
'server_key': '/etc/sonic/credentials/streamingtelemetryserver.key',
'ca_crt': '/etc/sonic/credentials/dsmsroot.cer'
}
}
results['RESTAPI'] = {
Expand Down