Skip to content

Commit

Permalink
[Platform] Health Check should provide some alert if it isn't running #…
Browse files Browse the repository at this point in the history
…6581

Summary:
- Code for email sending moved from cluster_health.py to Java code;
- Added some alerts when health checker fails by own reasons;

The HTML preparation is left the same as it was in Python code. The proposed way to use templates is not implemented here and will not be implemented here as it requires a separate issue.

Test Plan:
Common scenarios:
1. Check that health emails are correctly formed;
2. Check that emails with alerts are sent;
3. Check that occasional errors in cluster_health.py lead to alert emails;

Repeat scenarios 1-3 for different SMTP settings in Customer's Profile, for different states of flags "Only include errors in alert emails", "Send backup failure notification".
Some useful information can be found here:
[[ https://phabricator.dev.yugabyte.com/D7816 | D7816 ]]
[[ https://phabricator.dev.yugabyte.com/D8309 | D8309 ]]

Also test scenarios from [[ https://phabricator.dev.yugabyte.com/D9660 | D9660 ]].

Reviewers: sanketh, daniel

Reviewed By: daniel

Subscribers: jenkins-bot, yugaware

Differential Revision: https://phabricator.dev.yugabyte.com/D10177
  • Loading branch information
SergeyPotachev committed Jan 15, 2021
1 parent 01aa22c commit a3744a8
Show file tree
Hide file tree
Showing 24 changed files with 1,253 additions and 770 deletions.
4 changes: 3 additions & 1 deletion managed/build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ libraryDependencies ++= Seq(
"commons-validator" % "commons-validator" % "1.7",
"com.h2database" % "h2" % "1.4.200" % Test,
"org.hamcrest" % "hamcrest-core" % "2.2" % Test,
"pl.pragmatists" % "JUnitParams" % "1.1.1" % Test
"pl.pragmatists" % "JUnitParams" % "1.1.1" % Test,
"com.icegreen" % "greenmail" % "1.6.1" % Test,
"com.icegreen" % "greenmail-junit4" % "1.6.1" % Test
)
// Default to true if nothing passed on the env, so we can pick up YB jars from local java itest.
lazy val mavenLocal = Option(System.getenv("USE_MAVEN_LOCAL")).getOrElse("false")
Expand Down
286 changes: 7 additions & 279 deletions managed/devops/bin/cluster_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,10 @@
import json
import logging
import os
import smtplib
import subprocess
import sys
import time

from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
try:
from builtins import RuntimeError
except Exception as e:
Expand Down Expand Up @@ -46,17 +43,6 @@
MAX_CONCURRENT_PROCESSES = 10
MAX_TRIES = 2

# This hardcoded as the ses:FromAddress in the IAM policy.
EMAIL_SERVER = os.environ.get("SMTP_SERVER", "email-smtp.us-west-2.amazonaws.com")
EMAIL_PORT = os.environ.get("SMTP_PORT", None)
# These need to be setup as env vars from YW or in the env if running manually. If not specified
# then sending the email will fail, but reporting the status will still work just fine.
EMAIL_FROM = os.environ.get("YB_ALERTS_EMAIL")
EMAIL_USERNAME = os.environ.get("YB_ALERTS_USERNAME", None)
EMAIL_PASSWORD = os.environ.get("YB_ALERTS_PASSWORD", None)
SMTP_USE_SSL = os.environ.get("SMTP_USE_SSL", "true")
SMTP_USE_TLS = os.environ.get("SMTP_USE_TLS", "false")

###################################################################################################
# Reporting
###################################################################################################
Expand Down Expand Up @@ -108,7 +94,6 @@ def as_json(self):

class Report:
def __init__(self, yb_version):
self.mail_error = None
self.entries = []
self.start_ts = generate_ts()
self.yb_version = yb_version
Expand Down Expand Up @@ -140,8 +125,6 @@ def as_json(self, only_errors=False):
"data": [e.as_json() for e in self.entries if not only_errors or e.has_error],
"has_error": True in [e.has_error for e in self.entries]
}
if self.mail_error is not None:
j["mail_error"] = self.mail_error
return json.dumps(j, indent=2)

def __str__(self):
Expand All @@ -164,9 +147,10 @@ def check_output(cmd, env):
return 'Error executing command {}: timeout occurred'.format(cmd)

output, stderr = command.communicate()
return output.decode('utf-8')
return output.decode('utf-8').encode("ascii", "ignore")
except subprocess.CalledProcessError as e:
return 'Error executing command {}: {}'.format(cmd, e.output.decode('utf-8'))
return 'Error executing command {}: {}'.format(
cmd, e.output.decode("utf-8").encode("ascii", "ignore"))


def safe_pipe(command_str):
Expand Down Expand Up @@ -445,229 +429,6 @@ def seconds_to_human_readable_time(seconds):
return ['{} days {:d}:{:02d}:{:02d}'.format(d, h, m, s)]


def assemble_mail_row(data, is_first_check, timestamp):
ts_span = '''<div style="
color:#ffffff;
font-size:5px;">{}</div>'''.format(timestamp) if is_first_check else ''
process = ''
if 'process' in data:
process = data['process']

border_color = 'transparent' if is_first_check else '#e5e5e9'

def td_element(padding, weight, content): return '''<td style="
border-top: 1px solid {};
padding:{};
{}
vertical-align:top;">{}</td>'''.format(border_color, padding, weight, content)
badge = ''
if data['has_error'] is True:
badge = '''<span style="
font-size:0.8em;
background-color:#E8473F;
color:#ffffff;
border-radius:2px;
margin-right:8px;
padding:2px 4px;
font-weight:400;">Failed</span>'''

def pre_element(content): return '''<pre style="
background-color:#f7f7f7;
border: 1px solid #e5e5e9;
padding:10px;
white-space: pre-wrap;
border-radius:5px;">{}</pre>'''.format(content)

details_content_ok = '<div style="padding:11px 0;">Ok</div>'
dat_det = data['details']
details_content = details_content_ok if dat_det == [] else pre_element(
'\n'.join(dat_det) if data['message'] == 'Disk utilization' else ' '.join(dat_det))
return '<tr>{}{}{}</tr>'.format(
td_element(
'10px 20px 10px 0',
'font-weight:700;font-size:1.2em;',
badge+data['message']+ts_span),
td_element('10px 20px 10px 0', '', process),
td_element('0', '', details_content))


def send_email(subject, msg, sender, destination):
logging.info("Sending email: '{}' to '{}'".format(subject, destination))

if SMTP_USE_SSL.lower() == 'true':
if EMAIL_PORT is not None:
s = smtplib.SMTP_SSL(EMAIL_SERVER, int(EMAIL_PORT))
else:
# Port defaults to 465
s = smtplib.SMTP_SSL(EMAIL_SERVER)
else:
if EMAIL_PORT is not None:
s = smtplib.SMTP(EMAIL_SERVER, int(EMAIL_PORT))
else:
s = smtplib.SMTP(EMAIL_SERVER)
if SMTP_USE_TLS.lower() == 'true':
s.starttls()
s.ehlo()
if EMAIL_USERNAME is not None and EMAIL_PASSWORD is not None:
s.login(EMAIL_USERNAME, EMAIL_PASSWORD)
dest_list = destination.split(',')
s.sendmail(sender, dest_list, msg.as_string())
s.quit()


def send_alert_email(customer_tag, alert_info_json, destination):
is_valid = True
subject = "Yugabyte Platform Alert - <{}>".format(customer_tag)
if 'alert_name' in alert_info_json and alert_info_json['alert_name'] == 'Backup failure':
task_type = alert_info_json['task_type']
target_type = alert_info_json['target_type']
target_name = alert_info_json['target_name']
task_info = alert_info_json['task_info']
msg_content = "{} {} failed for {}.\n\nTask Info: {}" \
.format(task_type, target_type, target_name, task_info)
elif 'alert_name' in alert_info_json:
alert_name = alert_info_json['alert_name']
alert_state = alert_info_json['state']
universe_name = alert_info_json['universe_name']
msg_content = "{} for {} is {}.".format(alert_name, universe_name, alert_state)
else:
logging.error("Invalid alert_info_json")
is_valid = False
if is_valid:
sender = EMAIL_FROM
msg = MIMEMultipart('alternative')
msg['Subject'] = subject
msg['From'] = sender
msg['To'] = destination
msg.attach(MIMEText(msg_content, 'plain'))
send_email(subject, msg, sender, destination)


def send_health_check_mail(report, subject, destination, nodes, universe_name, report_only_errors):
style_font = "font-family: SF Pro Display, SF Pro, Helvetica Neue, Helvetica, sans-serif;"
json_object = json.loads(str(report))

def make_subtitle(content):
return '<span style="color:#8D8F9D;font-weight:400;">{}:</span>'.format(content)

email_content_data = []

for node in nodes:
node_has_error = False
is_first_check = True
node_content_data = []
node_name = ''
for node_check_data_row in json_object["data"]:
if node == node_check_data_row["node"]:
if node_check_data_row["has_error"]:
node_has_error = True
elif report_only_errors:
continue
node_name = node_check_data_row["node_name"]
node_content_data.append(assemble_mail_row(
node_check_data_row,
is_first_check,
json_object["timestamp"])
)
is_first_check = False

if report_only_errors and not node_has_error:
continue

node_header_style_colors_color = '#ffffff;' if node_has_error else '#289b42;'
node_header_style_colors_back = '#E8473F;' if node_has_error else '#ffffff'
node_header_style_colors = 'background-color:{};color:{}'.format(
node_header_style_colors_back, node_header_style_colors_color)
badge_caption = 'Error' if node_has_error else 'Running fine'

def th_element(style, content): return '<th style="{}">{}</th>'.format(style, content)

def tr_element(content): return '''<tr style="
text-transform:uppercase;
font-weight:500;">{}</tr>'''.format(content)

def table_element(content): return '''<table
cellspacing="0"
style="width:auto;text-align:left;font-size:12px;">{}</table>'''.format(content)

badge_style = '''style="font-weight:400;
margin-left:10px;
font-size:0.6em;
vertical-align:middle;
{}
border-radius:4px;
padding:2px 6px;"'''.format(node_header_style_colors)
node_header_title = str(node_name) + '<br>' + str(node) + \
'<span {}>{}</span>'.format(badge_style, badge_caption)
h2_style = '''font-weight:700;
line-height:1em;
color:#202951;
font-size:2.5em;
margin:0;{}'''.format(style_font)
node_header = '{}<h2 style="{}">{}</h2>'.format(
make_subtitle('Cluster'), h2_style, node_header_title)
table_container_style = '''background-color:#ffffff;
padding:15px 20px 7px;
border-radius:10px;
margin-top:30px;
margin-bottom:50px;'''

def table_container(content): return table_element(
tr_element(
th_element('width:15%;padding:0 30px 10px 0;', 'Check type') +
th_element('width:15%;white-space:nowrap;padding:0 30px 10px 0;', 'Details') +
th_element('', '')) +
content)
email_content_data.append('{}<div style="{}">{}</div>'.format(
node_header,
table_container_style,
table_container(''.join(node_content_data))))

if not email_content_data:
email_content_data = "<b>No errors to report.</b>"
sender = EMAIL_FROM
msg = MIMEMultipart('alternative')
msg['Subject'] = subject
msg['From'] = sender
# TODO(bogdan): AWS recommends doing one sendmail per address:
# https://docs.aws.amazon.com/ses/latest/DeveloperGuide/manage-sending-limits.html
msg['To'] = destination
style = "{}font-size: 14px;background-color:#f7f7f7;padding:25px 30px 5px;".format(style_font)
# add timestamp to avoid gmail collapsing
timestamp = '<span style="color:black;font-size:10px;">{}</span>'.format(
json_object["timestamp"])

def make_header_left(title, content):
return '''{}<h1 style="
{}
line-height:1em;
color:#202951;
font-weight:700;
font-size:1.85em;
padding-bottom:15px;
margin:0;">{}</h1>'''.format(make_subtitle(title), style_font, content)
header = '''<table width="100%">
<tr>
<td style="text-align:left">{name}</td>
<td style="text-align:right">{ts}</td>
</tr>
<tr>
<td style="text-align:left">{version}</td>
</tr>
</table>'''.format(
name=make_header_left("Universe name", universe_name),
ts=timestamp,
version=make_header_left("Universe version", report.yb_version))
body = '<html><body><pre style="{}">{}\n{} {}</pre></body></html>\n'.format(
style, header, ''.join(email_content_data), timestamp)

msg.attach(MIMEText(report.as_json(report_only_errors), 'plain'))
# the last part of a multipart MIME message is the preferred part
msg.attach(MIMEText(body, 'html'))

send_email(subject, msg, sender, destination)


def local_time():
return datetime.utcnow().replace(tzinfo=tz.tzutc()).astimezone(tz.gettz('America/Los_Angeles'))

Expand Down Expand Up @@ -774,32 +535,15 @@ def main():
parser = argparse.ArgumentParser(prog=sys.argv[0])
parser.add_argument('--cluster_payload', type=str, default=None, required=False,
help='JSON serialized payload of cluster data: IPs, pem files, etc.')
parser.add_argument('--universe_name', type=str, default=None, required=False,
help='Universe name to use in the email report')
parser.add_argument('--customer_tag', type=str, default=None,
help='Customer name/env info to use in the email report')
parser.add_argument('--destination', type=str, default=None, required=False,
help='CSV of email addresses to send the report to')
# TODO (Sergey P.): extract the next functionality to Java layer
parser.add_argument('--log_file', type=str, default=None, required=False,
help='Log file to which the report will be written to')
parser.add_argument('--send_status', action="store_true",
help='Send email even if no errors')
help='Log file to which the report will be written to.')
parser.add_argument('--start_time_ms', type=int, required=False, default=None,
help='Potential start time of the universe, to prevent uptime confusion.')
parser.add_argument('--retry_interval_secs', type=int, required=False, default=30,
help='Time to wait between retries of failed checks')
parser.add_argument('--report_only_errors', action="store_true",
help='Only report nodes with errors')
parser.add_argument('--send_notification', action="store_true",
help='Whether this is to alert to notify on or not')
parser.add_argument('--alert_info', type=str, default=None, required=False,
help='JSON serialized payload of backups that have failed')
help='Time to wait between retries of failed checks.')
args = parser.parse_args()
if args.send_notification and args.alert_info is not None:
alert_info_json = json.loads(args.alert_info)
send_alert_email(args.customer_tag, alert_info_json, args.destination)
print(alert_info_json)
elif args.cluster_payload is not None and args.universe_name is not None:
if args.cluster_payload is not None:
universe = UniverseDefinition(args.cluster_payload)
coordinator = CheckCoordinator(args.retry_interval_secs)
summary_nodes = {}
Expand Down Expand Up @@ -843,22 +587,6 @@ def main():
for e in entries:
report.add_entry(e)

state = "ERROR" if report.has_errors() else "OK"

subject = "{} - <{}> {}".format(state, args.customer_tag, args.universe_name)
# If we have errors or were asked to send email with status update, then send the email.
#
# NOTE: We only send emails if we are provided a destination,
# which can be a CSV of addresses.
if args.destination and (args.send_status or report.has_errors()):
try:
send_health_check_mail(
report, subject, args.destination, summary_nodes.keys(),
args.universe_name, args.report_only_errors
)
except Exception as e:
logging.error("Sending email failed with: {}".format(str(e)))
report.mail_error = str(e)
report.write_to_log(args.log_file)

# Write to stdout to be caught by YW subprocess.
Expand Down
Loading

0 comments on commit a3744a8

Please sign in to comment.