Skip to content

Commit

Permalink
feat(examples): Demonstrate how to send an email alarm when EFS burst…
Browse files Browse the repository at this point in the history
… credits below a threshold (#373)


When EFS is deployed in bursting throughput mode it will consume credits whenever throughput is
greater than a baseline throughput dictated by the amount of data stored in EFS. If the average
throughput over time is greater than the baseline, then burst credits will continuously decrease
until they eventually run out.

This adds to the Basic example some CDK code that demonstrates the good practice of setting up
email alarms that fire when the available burst credits drop below certain thresholds, giving the
operator time to increase the amount of data stored on the EFS to raise its baseline throughput.

Bonus: Fixes some math errors in the PadEfsStorage lambda function.
  • Loading branch information
ddneilson authored Apr 1, 2021
1 parent e9ab2bf commit cc5d372
Show file tree
Hide file tree
Showing 8 changed files with 386 additions and 82 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def main():
storage_props = storage_tier.StorageTierMongoDBProps(
vpc=network.vpc,
database_instance_type=InstanceType.of(InstanceClass.MEMORY5, InstanceSize.LARGE),
alarm_email=config.alarm_email_address,
root_ca=security.root_ca,
dns_zone=network.dns_zone,
accept_sspl_license=config.accept_sspl_license,
Expand All @@ -93,6 +94,7 @@ def main():
storage_props = storage_tier.StorageTierDocDBProps(
vpc=network.vpc,
database_instance_type=InstanceType.of(InstanceClass.MEMORY5, InstanceSize.LARGE),
alarm_email=config.alarm_email_address
)
storage = storage_tier.StorageTierDocDB(app, 'StorageTier', props=storage_props, env=env)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ def __init__(self):
# See https://www.awsthinkbox.com/end-user-license-agreement for the terms of the agreement.
self.accept_aws_thinkbox_eula: AwsThinkboxEulaAcceptance = AwsThinkboxEulaAcceptance.USER_REJECTS_AWS_THINKBOX_EULA

# Fill this in if you want to receive alarm emails when:
# 1) The burst credits on the Amazon EFS that is set up in the StorageTier for the
#    Deadline Repository drop below any of the configured thresholds.
#
# Note: When deploying, you will be sent an email asking to authorize these emails. If you do not authorize,
# then you will receive no alarm emails.
self.alarm_email_address: Optional[str] = None

# The version of Deadline to use on the render farm. Leave as None for the latest release or specify a version
# to pin to. Some examples of pinned version values are "10", "10.1", or "10.1.12"
self.deadline_version: Optional[str] = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,18 @@
Construct,
Duration,
RemovalPolicy,
Size,
Stack,
StackProps
)
from aws_cdk.aws_cloudwatch import (
ComparisonOperator,
Metric,
TreatMissingData
)
from aws_cdk.aws_cloudwatch_actions import (
SnsAction
)
from aws_cdk.aws_docdb import (
BackupProps,
DatabaseCluster,
Expand All @@ -30,9 +39,21 @@
FileSystem,
PosixUser
)
from aws_cdk.aws_iam import (
ServicePrincipal
)
from aws_cdk.aws_kms import (
Key
)
from aws_cdk.aws_route53 import (
IPrivateHostedZone
)
from aws_cdk.aws_sns import (
Topic
)
from aws_cdk.aws_sns_subscriptions import (
EmailSubscription
)

from aws_rfdk import (
MongoDbUsers,
Expand Down Expand Up @@ -62,6 +83,9 @@ class StorageTierProps(StackProps):
# The VPC to deploy resources into.
vpc: IVpc

# Email address to send alerts to when CloudWatch Alarms breach.
alarm_email: Optional[str]


class StorageTier(Stack):
"""
Expand Down Expand Up @@ -92,34 +116,6 @@ def __init__(self, scope: Construct, stack_id: str, *, props: StorageTierProps,
removal_policy=RemovalPolicy.DESTROY
)

# Add padding files to the filesystem to increase baseline throughput. Deadline's Repository filesystem
# is small (initial size of about 1GB), which results in a very low baseline throughput for the Amazon
# EFS filesystem. We add files to the filesystem to increase this baseline throughput, while retaining the
# ability to burst throughput. See RFDK's PadEfsStorage documentation for additional details.
pad_access_point = AccessPoint(
self,
'PaddingAccessPoint',
file_system=file_system,
path='/PaddingFiles',
# TODO - We set the padding files to be owned by root (uid/gid = 0) by default. You may wish to change this.
create_acl=Acl(
owner_gid='0',
owner_uid='0',
permissions='700',
),
posix_user=PosixUser(
uid='0',
gid='0',
),
)
PadEfsStorage(
self,
'PadEfsStorage',
vpc=props.vpc,
access_point=pad_access_point,
desired_padding_gb=40, # Provides 2 MB/s of baseline throughput. Costs $12/month.
)

# Create an EFS access point that is used to grant the Repository and RenderQueue with write access to the
# Deadline Repository directory in the EFS file-system.
access_point = AccessPoint(
Expand Down Expand Up @@ -162,6 +158,150 @@ def __init__(self, scope: Construct, stack_id: str, *, props: StorageTierProps,
# The database to connect Deadline to.
self.database: Optional[DatabaseConnection] = None

# The Amazon EFS filesystem deployed above has been deployed in bursting throughput
# mode. This means that it can burst throughput up to 100 MiB/s (with reads counting as
# 1/3 of their actual throughput for this purpose). However, the baseline throughput of the EFS
# is 50 KiB/s per 1 GiB stored in the filesystem and exceeding this throughput consumes burst credits.
# An EFS starts with a large amount of burst credits, and regains credits when throughput is below
# the baseline throughput threshold.
#
# The Deadline Repository is approximately 1 GiB in size; resulting in 50 KiB/s baseline throughput, which is
# not sufficient for the operation of Deadline.
#
# The following:
# 1) Sets up a series of AWS CloudWatch Alarms that will send you an email to alert you to take action
# to increase the data stored in the filesystem when the burst credits have decreased below certain thresholds.
# If you run out of burst credits on the filesystem, then Deadline will start timing-out on requests and your
# render farm may become unstable.
# 2) Uses RFDK's PadEfsStorage construct to add data to the EFS for the purpose of increasing the amount
# of stored data to increase the baseline throughput.
#
# See: https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html
# for more information on AWS CloudWatch Alarms.
# See: https://docs.aws.amazon.com/efs/latest/ug/performance.html#throughput-modes
# for more information on Amazon EFS throughput modes.

if props.alarm_email:
self.add_low_efs_burst_credit_alarms(file_system, props.alarm_email)

# Add padding files to the filesystem to increase baseline throughput. We add files to the filesystem to
# increase this baseline throughput, while retaining the ability to burst throughput. See RFDK's PadEfsStorage
# documentation for additional details.
pad_access_point = AccessPoint(
self,
'PaddingAccessPoint',
file_system=file_system,
path='/RFDK_PaddingFiles',
# TODO - We set the padding files to be owned by root (uid/gid = 0) by default. You may wish to change this.
create_acl=Acl(
owner_gid='0',
owner_uid='0',
permissions='700',
),
posix_user=PosixUser(
uid='0',
gid='0',
),
)
PadEfsStorage(
self,
'PadEfsStorage',
vpc=props.vpc,
access_point=pad_access_point,
desired_padding=Size.gibibytes(40), # Provides 2 MiB/s of baseline throughput. Costs $12/month.
)

def add_low_efs_burst_credit_alarms(self, filesystem: FileSystem, email_address: str) -> None:
    '''
    Create CloudWatch Alarms that watch the given filesystem's burst credit balance and
    send an email to the given address whenever the balance falls below one of four
    successively more severe thresholds.
    '''
    # --- Email delivery plumbing (SNS) ---
    # Optional KMS key so that messages on the SNS Topic are encrypted.
    encryption_key = Key(
        self,
        'SNSEncryptionKey',
        description='Used to encrypt the SNS Topic for sending EFS Burst Credit alerts',
        enable_key_rotation=True,
        removal_policy=RemovalPolicy.DESTROY,
        trust_account_identities=True
    )
    # CloudWatch needs to be able to use the key when publishing to the encrypted Topic.
    encryption_key.grant(ServicePrincipal('cloudwatch.amazonaws.com'), 'kms:Decrypt', 'kms:GenerateDataKey')

    # The Topic that CloudWatch will notify; an email subscription delivers the alert.
    alert_topic = Topic(
        self,
        'BurstAlertEmailTopic',
        master_key=encryption_key
    )
    alert_topic.grant_publish(ServicePrincipal('cloudwatch.amazonaws.com'))
    alert_topic.add_subscription(EmailSubscription(email_address))
    email_action = SnsAction(alert_topic)

    # --- The metric being watched ---
    # Sample the 99th-percentile burst credit balance of the filesystem once per hour.
    credit_balance = Metric(
        metric_name='BurstCreditBalance',
        namespace='AWS/EFS',
        dimensions={
            "FileSystemId": filesystem.file_system_id
        },
        period=Duration.hours(1),
        statistic='p99'
    )

    fs_id = filesystem.file_system_id
    # Each entry: (construct id, alarm name, threshold in bytes, alarm description,
    #              number of consecutive hourly datapoints below threshold before alarming).
    alarm_specs = [
        (
            'CAUTION-EfsBurstCredits',
            f"CAUTION Burst Credits - {fs_id}",
            int(2.00 * 2**40),
            f"CAUTION. 2 TiB Threshold Breached: EFS {fs_id} is depleting burst credits. Add data to the EFS to increase baseline throughput.",
            6
        ),
        (
            'WARNING-EfsBurstCredits',
            f"WARNING Burst Credits - {fs_id}",
            int(1.25 * 2**40),
            f"WARNING. 1.25 TiB Threshold Breached: EFS {fs_id} is depleting burst credits. Add data to the EFS to increase baseline throughput.",
            6
        ),
        (
            'ALERT-EfsBurstCredits',
            f"ALERT Burst Credits - {fs_id}",
            int(0.50 * 2**40),
            f"ALERT! 500 GiB Threshold Breached: EFS {fs_id} is running out of burst credits. Add data to the EFS to increase baseline throughput or else the Render Farm may cease operation.",
            6
        ),
        (
            'EMERGENCY-EfsBurstCredits',
            f"EMERGENCY Burst Credits - {fs_id}",
            int(0.10 * 2**40),
            f"EMERGENCY! 100 GiB Threshold Breached: EFS {fs_id} is running out of burst credits. Add data to the EFS to increase baseline throughput or else the Render Farm will cease operation.",
            2
        ),
    ]
    # We get one datapoint per hour, so evaluation_periods equals hours spent below the threshold.
    for construct_id, alarm_name, threshold_bytes, description, hours_below in alarm_specs:
        new_alarm = credit_balance.create_alarm(
            self,
            construct_id,
            alarm_name=alarm_name,
            actions_enabled=True,
            alarm_description=description,
            treat_missing_data=TreatMissingData.NOT_BREACHING,
            threshold=threshold_bytes,
            comparison_operator=ComparisonOperator.LESS_THAN_THRESHOLD,
            evaluation_periods=hours_below
        )
        new_alarm.add_alarm_action(email_action)


@dataclass
class StorageTierDocDBProps(StorageTierProps):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ if (config.deployMongoDB) {
env,
vpc: network.vpc,
databaseInstanceType: InstanceType.of(InstanceClass.R5, InstanceSize.LARGE),
alarmEmail: config.alarmEmailAddress,
rootCa: security.rootCa,
dnsZone: network.dnsZone,
acceptSsplLicense: config.acceptSsplLicense,
Expand All @@ -88,6 +89,7 @@ if (config.deployMongoDB) {
env,
vpc: network.vpc,
databaseInstanceType: InstanceType.of(InstanceClass.R5, InstanceSize.LARGE),
alarmEmail: config.alarmEmailAddress,
});
}

Expand Down
10 changes: 10 additions & 0 deletions examples/deadline/All-In-AWS-Infrastructure-Basic/ts/bin/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,16 @@ class AppConfig {
*/
public readonly acceptAwsThinkboxEula: AwsThinkboxEulaAcceptance = AwsThinkboxEulaAcceptance.USER_REJECTS_AWS_THINKBOX_EULA;

/**
* Fill this in if you want to receive alarm emails when:
 * 1) The burst credits on the Amazon EFS that is set up in the StorageTier for the
 *    Deadline Repository drop below any of the configured thresholds.
*
* Note: When deploying, you will be sent an email asking to authorize these emails. If you do not authorize,
* then you will receive no alarm emails.
*/
public readonly alarmEmailAddress?: string;

/**
* The version of Deadline to use on the render farm. Some examples of pinned version values are "10", "10.1", or
* "10.1.12"
Expand Down
Loading

0 comments on commit cc5d372

Please sign in to comment.