Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[telemetry] Rotate streaming telemetry secrets. #9600

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
a921b8e
[telemetry] Roll over streaming telemetry secrets by ACMS.
yozhao101 Dec 21, 2021
7da4005
[telemetry] Fix the indent in the script `telemetry.sh`.
yozhao101 Dec 21, 2021
ede36b9
[telemetry] Fix the indent issue in the script `telemetry.sh`.
yozhao101 Dec 21, 2021
7c3c7d3
[telemetry] Add the field of `buffer_size=1024` in supervisord config…
yozhao101 Dec 21, 2021
90d9d03
[telemetry] Fix the indent issue.
yozhao101 Dec 21, 2021
0bfcec9
[telemetry] Fix a syntax error.
yozhao101 Dec 21, 2021
9db5b04
[telemetry] Fix a syntax error.
yozhao101 Dec 21, 2021
b1844b7
[telemetry] Install `inotify` module.
yozhao101 Jan 6, 2022
e28f64d
[telemetry] Install the inotify.
yozhao101 Jan 7, 2022
536fc4c
[telemetry] Install the `inotify` in telemetry container.
yozhao101 Jan 10, 2022
17c4d60
[telemetry] Use `inotify` module to monitor the rotation of certifica…
yozhao101 Jan 10, 2022
59dfe9b
[telemetry] Remove the unused import from script.
yozhao101 Jan 10, 2022
ce9f713
[telemetry] Use the rotation of certificate file as an indicator.
yozhao101 Jan 12, 2022
68bd0b6
[telemetry] Change the script to be executable.
yozhao101 Jan 12, 2022
6ad0ef5
[telemetry] Remove the unused import.
yozhao101 Jan 12, 2022
6ce4e4d
[telemetry] Change the script name.
yozhao101 Jan 12, 2022
6512a6f
[telemetry] Checks whether both certificate and private key files exi…
yozhao101 Jan 18, 2022
c6e2fdd
[telemetry] Reload telemetry server configuration by sending a `SIGHU…
yozhao101 Jan 25, 2022
3dbb79d
[telemetry] Fix the undefinied fields issue in telemetry server start…
yozhao101 Jan 26, 2022
0881704
[telemetry] Fix the indentation.
yozhao101 Jan 26, 2022
41377a1
[telemetry] Fix the indentation issue.
yozhao101 Jan 26, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions dockers/docker-sonic-telemetry/Dockerfile.j2
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@ RUN apt-get clean -y && \
apt-get autoremove -y && \
rm -rf /debs

# Install the python `inotify` package
RUN pip3 install inotify

COPY ["start.sh", "telemetry.sh", "dialout.sh", "/usr/bin/"]
COPY ["certificate_rotation_checker", "/usr/bin/"]
COPY ["telemetry_vars.j2", "/usr/share/sonic/templates/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
Expand Down
214 changes: 214 additions & 0 deletions dockers/docker-sonic-telemetry/certificate_rotation_checker
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
#!/usr/bin/env python3

"""
certificate_rotation_checker

This script will be leveraged to periodically check whether the certificate file
of streaming telemetry was rotated or not. The configuration of streaming telemetry
server process will be reloaded if the certificate file was rotated.
"""

import os
import signal
import subprocess
import sys
import syslog
import time

import inotify.adapters

from swsscommon import swsscommon

MAX_RETRY_TIMES = 10
CERTIFICATE_CHECKING_INTERVAL_SECS = 3600

CREDENTIALS_DIR_PATH = "/etc/sonic/credentials/"


def get_command_result(command):
"""Executes the command and returns the exiting code and resulting output.

Args:
command: A string contains the command to be executed.

Returns:
An integer indicates the exiting code.
A string which contains the output of command.
"""
command_stdout = ""
command_stderr = ""

try:
proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
shell=True, universal_newlines=True)
command_stdout, command_stderr = proc_instance.communicate()
except (OSError, ValueError) as err:
syslog.syslog(syslog.LOG_ERR, "Failed to execute the command '{}'. Error: '{}'"
.format(command, err))
return 2, command_stderr

return proc_instance.returncode, command_stdout.strip()


def get_telemetry_server_info():
"""Gets telemetry server process information.

Args:
None.

Returns:
If telemetry server process is running, returns True and process id;
Otherwise returns False and -1.
"""
processes_status_cmd = "supervisorctl status"
retry_times = 0

while retry_times <= MAX_RETRY_TIMES:
retry_times += 1
exit_code, command_stdout = get_command_result(processes_status_cmd)
if exit_code != 3:
syslog.syslog(syslog.LOG_INFO,
"Failed to get the telemetry server process information and retry after 60 seconds ...")
time.sleep(60)
else:
for line in command_stdout.splitlines():
if "telemetry" in line and "RUNNING" in line:
return True, line.split()[3].strip(",")

return False, -1


def reload_telemetry_server_configuration():
"""Reloads the telemetry server configuration by sending signal 'SIGHUP'
to telemetry server process and checks it is actually running after doing the reload.

Args:
None

Returns:
Returns True if the configuration was reloaded successfully; Otherwise, return False.
"""
telemetry_server_pid = -1
is_running = False

is_running, telemetry_server_pid = get_telemetry_server_info()
if not is_running:
syslog.syslog(syslog.LOG_ERR,
"Telemetry server process is not running before reloading configuration!")
return False

syslog.syslog(syslog.LOG_INFO,
"Telemetry server process is running with PID: {}".format(telemetry_server_pid))
syslog.syslog(syslog.LOG_INFO, "Sending 'SIGHUP' signal to telemetry server process ...")

os.kill(int(telemetry_server_pid), signal.SIGHUP)

syslog.syslog(syslog.LOG_INFO, "'SIGHUP' signal was sent out.")

# Wait for 120 seconds to check whether telemetry server process comes back
time.sleep(120)

is_running, telemetry_server_pid = get_telemetry_server_info()
if not is_running:
syslog.syslog(syslog.LOG_ERR,
"Telemetry server process is not running after reloading configuration!")
return False

syslog.syslog(syslog.LOG_INFO, "Telemetry server process is running after reloading configuration!")
return True


def check_certificate_rotated(certificate_file_name):
"""Leverages the 'inotify' module to monitor the file system events under the
directory which stores the SONiC credentials and reloads telemetry server
configuration if its certificate was rotated.


Args:
certificate_file_name: A string indicates the telemetry certificate file name.

Returns:
None.
"""
certificate_file_rotated = False

inotify_instance = inotify.adapters.Inotify()
inotify_instance.add_watch(CREDENTIALS_DIR_PATH)
for event in inotify_instance.event_gen(yield_nones=False):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

inotify_instance

In extreme case, the file is deleted by a malicious user, will the inotify_instance still working? I think its link to inode, and deleting file will destroy the inode.

If this is true, a crash is better than a dead loop.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The inotify is inode based and it will monitor the credentials directory /etc/sonic/credentials/ to see whether the telemetry certificate file was rotated or not. If certificate file was deleted by accidentally, the inotify_instance will not be impacted.

I updated the PR to log an error message if the certificate was deleted. What I am thinking is if the certificate was restored later, then it can be treated as a kind of rotation operation and the telemetry server will be restarted by this script.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If certificate file was deleted by accidentally, what is the expected behavior?

I am considering in this case, we can kill telemetry daemon.

header, event_type, monitoring_path, file_name = event
if (file_name == certificate_file_name
Copy link
Collaborator

@qiluo-msft qiluo-msft Jan 11, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if

The FSM logic is complex and may be messed up by some input sequence. Could you use one file as the main indicator, and always rotate if that file changed.

ref: https://github.com/Azure/sonic-restapi/blob/94805a39ac0712219f7dc08faa2cfdbf371dd177/go-server-server/main.go#L103 #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated and use the rotation of certificate file as the main indicator.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please make sure describe the main file in document? This is very critical design assumption and the cert rotator should treat it as a contract.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the design document.

and ("IN_CREATE" in event_type or "IN_MOVED_TO" in event_type)):
certificate_file_rotated = True

if certificate_file_rotated:
certificate_file_rotated = False
syslog.syslog(syslog.LOG_INFO,
"Certificate was rotated and reloading telemetry server configuration ...")

if not reload_telemetry_server_configuration():
syslog.syslog(syslog.LOG_ERR,
"Failed to reload the telemetry server configuration!")

syslog.syslog(syslog.LOG_INFO, "Telemetry server configuration was reloaded successfully!")


def certificate_rotated_checker():
"""Checks rotation of certificate file and then reloads streaming telemetry server configuration.

Leverages 'inotify' module to check whether the certificate file of streaming telemetry was
rotated or not. The configuration of telemetry server process will be reloaded if it was rotated.

Args:
None

Returns:
None
"""
certificate_file_path = ""
private_key_file_path = ""
certificate_file_name = ""

config_db = swsscommon.DBConnector("CONFIG_DB", 0)
telemetry_table = swsscommon.Table(config_db, "TELEMETRY")
telemetry_table_keys = telemetry_table.getKeys()
if "certs" in telemetry_table_keys:
certs_info = dict(telemetry_table.get("certs")[1])
if "server_crt" in certs_info and "server_key" in certs_info:
certificate_file_path = certs_info["server_crt"]
private_key_file_path = certs_info["server_key"]
syslog.syslog(syslog.LOG_INFO, "Path of certificate file is '{}'".format(certificate_file_path))
syslog.syslog(syslog.LOG_INFO, "Path of private key file is '{}'".format(private_key_file_path))
else:
syslog.syslog(syslog.LOG_ERR,
"Failed to retrieve the path of certificate or private key file from 'TELEMETRY' table!")
sys.exit(1)
else:
syslog.syslog(syslog.LOG_ERR,
"Failed to retrieve the certificate information from 'TELEMETRY' table!")
sys.exit(2)

while True:
Copy link
Collaborator

@qiluo-msft qiluo-msft Jan 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

while True:

The intention of this loop is to wait until two files exist. why only check one file? #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated and checks the existence of both two files.

if not os.path.exists(certificate_file_path) or not os.path.exists(private_key_file_path):
syslog.syslog(syslog.LOG_ERR,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LOG_ERR

Instead of error message and keep loop, it is better to just simply kill telemetry daemon.

"Certificate or private key file did not exist on device and checks again after '{}' seconds ..."
.format(CERTIFICATE_CHECKING_INTERVAL_SECS))
time.sleep(CERTIFICATE_CHECKING_INTERVAL_SECS)
else:
break

certificate_file_name = certificate_file_path.strip().split("/")[-1]
if not certificate_file_name:
syslog.syslog(syslog.LOG_ERR,
"Failed to retrieve the file name of certificate!")
sys.exit(3)

check_certificate_rotated(certificate_file_name)


def main():
certificate_rotated_checker()


if __name__ == "__main__":
main()
sys.exit(0)
1 change: 1 addition & 0 deletions dockers/docker-sonic-telemetry/critical_processes
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
program:telemetry
program:dialout
program:certificate_rollover_checker
10 changes: 10 additions & 0 deletions dockers/docker-sonic-telemetry/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,13 @@ stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=telemetry:running

[program:certificate_rollover_checker]
command=/usr/bin/certificate_rollover_checker
priority=5
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=start:exited
96 changes: 56 additions & 40 deletions dockers/docker-sonic-telemetry/telemetry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,54 +19,70 @@ CERTS=$(echo $TELEMETRY_VARS | jq -r '.certs')
TELEMETRY_ARGS=" -logtostderr"
export CVL_SCHEMA_PATH=/usr/sbin/schema

if [ -n "$CERTS" ]; then
SERVER_CRT=$(echo $CERTS | jq -r '.server_crt')
SERVER_KEY=$(echo $CERTS | jq -r '.server_key')
if [ -z $SERVER_CRT ] || [ -z $SERVER_KEY ]; then
TELEMETRY_ARGS+=" --insecure"
while true
do
if [ -n "$CERTS" ]; then
SERVER_CRT=$(echo $CERTS | jq -r '.server_crt')
SERVER_KEY=$(echo $CERTS | jq -r '.server_key')
CA_CRT=$(echo $CERTS | jq -r '.ca_crt')

logger "Trying to retrieve server certificate, key and Root CA certificate ..."
logger "The file path of server certificate in CONFIG_DB is: $SERVER_CRT"
logger "The file path of server provate key in CONFIG_DB is: $SERVER_KEY"
logger "The file path of Root CA certificate in CONFIG_DB is: $CA_CRT"

if [[ -f $SERVER_CRT && -f $SERVER_KEY && -f $CA_CRT ]]; then
logger "Succeeded in retrieving server certificate, key and Root CA certificate."
TELEMETRY_ARGS+=" --server_crt $SERVER_CRT --server_key $SERVER_KEY --ca_crt $CA_CRT"
break
else
logger "Failed to retrieve server certificate, key or Root CA certificate!"
fi
elif [ -n "$X509" ]; then
SERVER_CRT=$(echo $X509 | jq -r '.server_crt')
SERVER_KEY=$(echo $X509 | jq -r '.server_key')
CA_CRT=$(echo $X509 | jq -r '.ca_crt')

logger "Trying to retrieve server certificate, key and Root CA certificate ..."
logger "The file path of server certificate in CONFIG_DB is: $SERVER_CRT"
logger "The file path of server provate key in CONFIG_DB is: $SERVER_KEY"
logger "The file path of Root CA certificate in CONFIG_DB is: $CA_CRT"

if [[ -f $SERVER_CRT && -f $SERVER_KEY && -f $CA_CRT ]]; then
logger "Succeeded in retrieving server certificate, key and Root CA certificate."
TELEMETRY_ARGS+=" --server_crt $SERVER_CRT --server_key $SERVER_KEY --ca_crt $CA_CRT"
break
else
logger "Failed to retrieve server certificate, key or Root CA certificate!"
fi
else
TELEMETRY_ARGS+=" --server_crt $SERVER_CRT --server_key $SERVER_KEY "
TELEMETRY_ARGS+=" --noTLS"
break
fi

CA_CRT=$(echo $CERTS | jq -r '.ca_crt')
if [ ! -z $CA_CRT ]; then
TELEMETRY_ARGS+=" --ca_crt $CA_CRT"
fi
elif [ -n "$X509" ]; then
SERVER_CRT=$(echo $X509 | jq -r '.server_crt')
SERVER_KEY=$(echo $X509 | jq -r '.server_key')
if [ -z $SERVER_CRT ] || [ -z $SERVER_KEY ]; then
TELEMETRY_ARGS+=" --insecure"
logger "Sleeping 3600 seconds and checks the existence of secrets again ..."
sleep 3600
done

if [ -n "$GNMI" ]; then
PORT=$(echo $GNMI | jq -r '.port')
if [ ! -z $PORT ] || [ $PORT != "null" ]; then
TELEMETRY_ARGS+=" --port $PORT"
else
TELEMETRY_ARGS+=" --server_crt $SERVER_CRT --server_key $SERVER_KEY "
TELEMETRY_ARGS+=" --port 8080"
fi

CA_CRT=$(echo $X509 | jq -r '.ca_crt')
if [ ! -z $CA_CRT ]; then
TELEMETRY_ARGS+=" --ca_crt $CA_CRT"
LOG_LEVEL=$(echo $GNMI | jq -r '.log_level')
if [ ! -z $LOG_LEVEL ] || [ $LOG_LEVEL != "null" ]; then
TELEMETRY_ARGS+=" -v=$LOG_LEVEL"
else
TELEMETRY_ARGS+=" -v=2"
fi
else
TELEMETRY_ARGS+=" --noTLS"
fi

# If no configuration entry exists for TELEMETRY, create one default port
if [ -z "$GNMI" ]; then
PORT=8080
else
PORT=$(echo $GNMI | jq -r '.port')
fi
TELEMETRY_ARGS+=" --port $PORT"

CLIENT_AUTH=$(echo $GNMI | jq -r '.client_auth')
if [ -z $CLIENT_AUTH ] || [ $CLIENT_AUTH == "false" ]; then
TELEMETRY_ARGS+=" --allow_no_client_auth"
fi

LOG_LEVEL=$(echo $GNMI | jq -r '.log_level')
if [ ! -z $LOG_LEVEL ]; then
TELEMETRY_ARGS+=" -v=$LOG_LEVEL"
else
TELEMETRY_ARGS+=" -v=2"
CLIENT_AUTH=$(echo $GNMI | jq -r '.client_auth')
if [ -z $CLIENT_AUTH ] || [ $CLIENT_AUTH == "null" ] || [ $CLIENT_AUTH == "false" ]; then
TELEMETRY_ARGS+=" --allow_no_client_auth"
fi
fi

exec /usr/sbin/telemetry ${TELEMETRY_ARGS}
6 changes: 3 additions & 3 deletions src/sonic-config-engine/minigraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -1633,9 +1633,9 @@ def parse_xml(filename, platform=None, port_config_file=None, asic_name=None, hw
'log_level': '2'
},
'certs': {
'server_crt': '/etc/sonic/telemetry/streamingtelemetryserver.cer',
'server_key': '/etc/sonic/telemetry/streamingtelemetryserver.key',
'ca_crt': '/etc/sonic/telemetry/dsmsroot.cer'
'server_crt': '/etc/sonic/credentials/streamingtelemetryserver.cer',
'server_key': '/etc/sonic/credentials/streamingtelemetryserver.key',
'ca_crt': '/etc/sonic/credentials/dsmsroot.cer'
}
}
results['RESTAPI'] = {
Expand Down