From fcff57f9f1b6e82735fc02010a30cf9807d3f28c Mon Sep 17 00:00:00 2001 From: Daniel Neilson Date: Mon, 29 Mar 2021 22:17:46 +0000 Subject: [PATCH 1/5] feat(examples): Demonstrate how to send an email alarm when EFS burst credits below a threshold When EFS is deployed in bursting throughput mode it will consume credits whenever throughput is greater than a baseline throughput dictated by the amount of data stored in EFS. If the average throughput over time is greater than the baseline, then burst credits will continuously decrease until they eventually run out. This adds to the Basic example some CDK code that demonstrates the good practice of setting up an email alarm when the available burst credits drops below some thresholds; giving the operator time to increase the amount of data stored on the EFS to increase baseline throughput. Bonus: Fixes some math errors in the PadEfsStorage lambda function. --- .../python/package/app.py | 2 + .../python/package/config.py | 8 + .../python/package/lib/storage_tier.py | 189 +++++++++++++++--- .../ts/bin/app.ts | 2 + .../ts/bin/config.ts | 10 + .../ts/lib/storage-tier.ts | 180 ++++++++++++++--- .../nodejs/pad-efs-storage/filesystem-ops.ts | 3 +- .../pad-efs-storage/test/handlers.test.ts | 54 ++--- 8 files changed, 369 insertions(+), 79 deletions(-) diff --git a/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/app.py b/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/app.py index d37a21cc8..2a270165b 100644 --- a/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/app.py +++ b/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/app.py @@ -83,6 +83,7 @@ def main(): storage_props = storage_tier.StorageTierMongoDBProps( vpc=network.vpc, database_instance_type=InstanceType.of(InstanceClass.MEMORY5, InstanceSize.LARGE), + alarm_email=config.alarm_email_address, root_ca=security.root_ca, dns_zone=network.dns_zone, accept_sspl_license=config.accept_sspl_license, @@ -93,6 +94,7 
@@ def main(): storage_props = storage_tier.StorageTierDocDBProps( vpc=network.vpc, database_instance_type=InstanceType.of(InstanceClass.MEMORY5, InstanceSize.LARGE), + alarm_email=config.alarm_email_address ) storage = storage_tier.StorageTierDocDB(app, 'StorageTier', props=storage_props, env=env) diff --git a/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/config.py b/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/config.py index 393d761eb..8b855a958 100644 --- a/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/config.py +++ b/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/config.py @@ -28,6 +28,14 @@ def __init__(self): # See https://www.awsthinkbox.com/end-user-license-agreement for the terms of the agreement. self.accept_aws_thinkbox_eula: AwsThinkboxEulaAcceptance = AwsThinkboxEulaAcceptance.USER_REJECTS_AWS_THINKBOX_EULA + # Fill this in if you want to receive alarm emails when: + # 1) You are crossing thresholds on decreasing burst Credits on the Amazon EFS that is + # set up in the StorageTier, for the Deadline Repository. + # + # Note: When deploying, you will be sent an email asking to authorize these emails. If you do not authorize, + # then you will receive no alarm emails. + self.alarm_email_address: Optional[str] = None + # The version of Deadline to use on the render farm. Leave as None for the latest release or specify a version # to pin to. 
Some examples of pinned version values are "10", "10.1", or "10.1.12" self.deadline_version: Optional[str] = None diff --git a/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/lib/storage_tier.py b/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/lib/storage_tier.py index b64abb8ea..50b787146 100644 --- a/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/lib/storage_tier.py +++ b/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/lib/storage_tier.py @@ -9,9 +9,18 @@ Construct, Duration, RemovalPolicy, + Size, Stack, StackProps ) +from aws_cdk.aws_cloudwatch import ( + ComparisonOperator, + Metric, + TreatMissingData +) +from aws_cdk.aws_cloudwatch_actions import ( + SnsAction +) from aws_cdk.aws_docdb import ( BackupProps, DatabaseCluster, @@ -30,9 +39,21 @@ FileSystem, PosixUser ) +from aws_cdk.aws_iam import ( + ServicePrincipal +) +from aws_cdk.aws_kms import ( + Key +) from aws_cdk.aws_route53 import ( IPrivateHostedZone ) +from aws_cdk.aws_sns import ( + Topic +) +from aws_cdk.aws_sns_subscriptions import ( + EmailSubscription +) from aws_rfdk import ( MongoDbUsers, @@ -62,6 +83,9 @@ class StorageTierProps(StackProps): # The VPC to deploy resources into. vpc: IVpc + # Email address to send alerts to when CloudWatch Alarms breach. + alarm_email: Optional[str] + class StorageTier(Stack): """ @@ -92,34 +116,6 @@ def __init__(self, scope: Construct, stack_id: str, *, props: StorageTierProps, removal_policy=RemovalPolicy.DESTROY ) - # Add padding files to the filesystem to increase baseline throughput. Deadline's Repository filesystem - # is small (initial size of about 1GB), which results in a very low baseline throughput for the Amazon - # EFS filesystem. We add files to the filesystem to increase this baseline throughput, while retaining the - # ability to burst throughput. See RFDK's PadEfsStorage documentation for additional details. 
- pad_access_point = AccessPoint( - self, - 'PaddingAccessPoint', - file_system=file_system, - path='/PaddingFiles', - # TODO - We set the padding files to be owned by root (uid/gid = 0) by default. You may wish to change this. - create_acl=Acl( - owner_gid='0', - owner_uid='0', - permissions='700', - ), - posix_user=PosixUser( - uid='0', - gid='0', - ), - ) - PadEfsStorage( - self, - 'PadEfsStorage', - vpc=props.vpc, - access_point=pad_access_point, - desired_padding_gb=40, # Provides 2 MB/s of baseline throughput. Costs $12/month. - ) - # Create an EFS access point that is used to grant the Repository and RenderQueue with write access to the # Deadline Repository directory in the EFS file-system. access_point = AccessPoint( @@ -162,6 +158,143 @@ def __init__(self, scope: Construct, stack_id: str, *, props: StorageTierProps, # The database to connect Deadline to. self.database: Optional[DatabaseConnection] = None + # The Amazon EFS filesystem deployed above has been deployed in bursting throughput + # mode. This means that it can burst throughput up to 100 MiB/s (with reads counting as + # 1/3 of their actual throughput for this purpose). However, the baseline throughput of the EFS + # is 50 KiB/s per 1 GiB stored in the filesystem and exceeding this throughput consumes burst credits; + # the EFS regains burst credits when throughput is below the baseline throughput threshold. + # + # The Deadline Repository is approximately 1 GiB in size; resulting in 50 KiB/s baseline throughput, which is + # not sufficient for the operation of Deadline. + # + # The following: + # 1) Sets up a series of AWS CloudWatch Alarms that will send you an email to alert you to take action + # to increase the data stored in the filesystem when the burst credits have decreased below certain thresholds. + # If you run out of burst credits on the filesystem, then Deadline will start timing-out on requests and your + # render farm may become unstable. 
+ # 2) Uses RFDK's PadEfsStorage construct to add data to the EFS for the purpose of increasing the amount + # of stored data to increase the baseline throughput. + # + # See: https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html + # for more information on AWS CloudWatch Alarms. + # See: https://docs.aws.amazon.com/efs/latest/ug/performance.html#throughput-modes + # for more information on Amazon EFS throughput modes. + + if props.alarm_email: + self.add_low_efs_burst_credit_alarms(file_system, props.alarm_email) + + # Add padding files to the filesystem to increase baseline throughput. We add files to the filesystem to + # increase this baseline throughput, while retaining the ability to burst throughput. See RFDK's PadEfsStorage + # documentation for additional details. + pad_access_point = AccessPoint( + self, + 'PaddingAccessPoint', + file_system=file_system, + path='/RFDK_PaddingFiles', + # TODO - We set the padding files to be owned by root (uid/gid = 0) by default. You may wish to change this. + create_acl=Acl( + owner_gid='0', + owner_uid='0', + permissions='700', + ), + posix_user=PosixUser( + uid='0', + gid='0', + ), + ) + PadEfsStorage( + self, + 'PadEfsStorage', + vpc=props.vpc, + access_point=pad_access_point, + desired_padding=Size.gibibytes(40), # Provides 2 MiB/s of baseline throughput. Costs $12/month. + ) + + def add_low_efs_burst_credit_alarms(self, filesystem: FileSystem, email_address: str) -> None: + ''' + Set up CloudWatch Alarms that will warn when the given filesystem's burst credits are below + four different thresholds. We send an email to the given address when an Alarm breaches. + ''' + # Set up the SNS Topic that will send the emails. + # ==================== + # 1) KMS key to use to encrypt events within the SNS Topic. 
The Key is optional + key = Key( + self, + 'SNSEncryptionKey', + description='Used to encrypt the SNS Topic for sending EFS Burst Credit alerts', + enable_key_rotation=True, + removal_policy=RemovalPolicy.DESTROY, + trust_account_identities=True + ) + key.grant(ServicePrincipal('cloudwatch.amazonaws.com'), 'kms:Decrypt', 'kms:GenerateDataKey') + + # 2) SNS Topic that will be alerted by CloudWatch and will send the email in response. + sns_topic = Topic( + self, + 'BurstAlertEmailTopic', + master_key=key + ) + sns_topic.grant_publish(ServicePrincipal('cloudwatch.amazonaws.com')) + sns_topic.add_subscription(EmailSubscription(email_address)) + alarm_action = SnsAction(sns_topic) + + # Set up the CloudWatch Alarm(s) and have them trigger SNS events when breached. + # ====================== + # 1) CDK helper to define the CloudWatch Metric that we're interested in. + burst_credits_metric = Metric( + metric_name='BurstCreditBalance', + namespace='AWS/EFS', + dimensions={ + "FileSystemId": filesystem.file_system_id + }, + # One 99-th percentile data point every 6 hours + period=Duration.hours(6), + statistic='p99' + ) + + # 2) Create the alarms + thresholds = [ + { + "id": 'CAUTION-EfsBurstCredits', + "name": f"CAUTION Burst Credits - {filesystem.file_system_id}", + "threshold": int(2.00 * 2**40), + "message": f"CAUTION. 2 TiB Threshold Breached: EFS {filesystem.file_system_id} is depleting burst credits. Add data to the EFS to increase baseline throughput." + }, + { + "id": 'WARNING-EfsBurstCredits', + "name": f"WARNING Burst Credits - {filesystem.file_system_id}", + "threshold": int(1.25 * 2**40), + "message": f"WARNING. 1.25 TiB Threshold Breached: EFS {filesystem.file_system_id} is depleting burst credits. Add data to the EFS to increase baseline throughput." + }, + { + "id": 'ALERT-EfsBurstCredits', + "name": f"ALERT Burst Credits - {filesystem.file_system_id}", + "threshold": int(0.50 * 2**40), + "message": f"ALERT! 
500 GiB Threshold Breached: EFS {filesystem.file_system_id} is running out of burst credits. Add data to the EFS to increase baseline throughput or else the Render Farm may cease operation." + }, + { + "id": 'EMERGENCY-EfsBurstCredits', + "name": f"EMERGENCY Burst Credits - {filesystem.file_system_id}", + "threshold": int(0.10 * 2**40), + "message": f"EMERGENCY! 100 GiB Threshold Breached: EFS {filesystem.file_system_id} is running out of burst credits. Add data to the EFS to increase baseline throughput or else the Render Farm will cease operation." + }, + ] + for config in thresholds: + alarm = burst_credits_metric.create_alarm( + self, + config['id'], + alarm_name=config['name'], + actions_enabled=True, + alarm_description=config['message'], + treat_missing_data=TreatMissingData.NOT_BREACHING, + threshold=config['threshold'], + comparison_operator=ComparisonOperator.LESS_THAN_THRESHOLD, + # We have 1 datapoint every 6 hours. CloudWatch can check a period of time + # of at most 1 day. So, we alarm if we've gone a full day below the threshold. 
+ evaluation_periods=4 + ) + alarm.add_alarm_action(alarm_action) + @dataclass class StorageTierDocDBProps(StorageTierProps): diff --git a/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/bin/app.ts b/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/bin/app.ts index 0f3dce8d9..885167ad2 100644 --- a/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/bin/app.ts +++ b/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/bin/app.ts @@ -78,6 +78,7 @@ if (config.deployMongoDB) { env, vpc: network.vpc, databaseInstanceType: InstanceType.of(InstanceClass.R5, InstanceSize.LARGE), + alarmEmail: config.alarmEmailAddress, rootCa: security.rootCa, dnsZone: network.dnsZone, acceptSsplLicense: config.acceptSsplLicense, @@ -88,6 +89,7 @@ if (config.deployMongoDB) { env, vpc: network.vpc, databaseInstanceType: InstanceType.of(InstanceClass.R5, InstanceSize.LARGE), + alarmEmail: config.alarmEmailAddress, }); } diff --git a/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/bin/config.ts b/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/bin/config.ts index 066e3e129..226259811 100644 --- a/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/bin/config.ts +++ b/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/bin/config.ts @@ -23,6 +23,16 @@ class AppConfig { */ public readonly acceptAwsThinkboxEula: AwsThinkboxEulaAcceptance = AwsThinkboxEulaAcceptance.USER_REJECTS_AWS_THINKBOX_EULA; + /** + * Fill this in if you want to receive alarm emails when: + * 1) You are crossing thresholds on decreasing burst Credits on the Amazon EFS that is + * set up in the StorageTier, for the Deadline Repository. + * + * Note: When deploying, you will be sent an email asking to authorize these emails. If you do not authorize, + * then you will receive no alarm emails. + */ + public readonly alarmEmailAddress?: string; + /** * The version of Deadline to use on the render farm. 
Some examples of pinned version values are "10", "10.1", or * "10.1.12" diff --git a/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/lib/storage-tier.ts b/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/lib/storage-tier.ts index 4b2b53510..2933f6a92 100644 --- a/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/lib/storage-tier.ts +++ b/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/lib/storage-tier.ts @@ -9,12 +9,32 @@ import { SubnetType, } from '@aws-cdk/aws-ec2'; import * as cdk from '@aws-cdk/core'; +import { + ComparisonOperator, + Metric, + TreatMissingData, +} from '@aws-cdk/aws-cloudwatch'; +import { + SnsAction, +} from '@aws-cdk/aws-cloudwatch-actions'; import { DatabaseCluster } from '@aws-cdk/aws-docdb'; import { AccessPoint, FileSystem, } from '@aws-cdk/aws-efs'; +import { + ServicePrincipal, +} from '@aws-cdk/aws-iam'; +import { + Key, +} from '@aws-cdk/aws-kms'; import { IPrivateHostedZone } from '@aws-cdk/aws-route53'; +import { + Topic, +} from '@aws-cdk/aws-sns'; +import { + EmailSubscription, +} from '@aws-cdk/aws-sns-subscriptions'; import { RemovalPolicy, Duration } from '@aws-cdk/core'; import { MongoDbInstance, @@ -39,6 +59,11 @@ export interface StorageTierProps extends cdk.StackProps { * The VPC to deploy resources into. */ readonly vpc: IVpc; + + /** + * Email address to send alerts to when CloudWatch Alarms breach. + */ + readonly alarmEmail: string; } /** @@ -75,30 +100,6 @@ export abstract class StorageTier extends cdk.Stack { removalPolicy: RemovalPolicy.DESTROY, }); - // Add padding files to the filesystem to increase baseline throughput. Deadline's Repository filesystem - // is small (initial size of about 1GB), which results in a very low baseline throughput for the Amazon - // EFS filesystem. We add files to the filesystem to increase this baseline throughput, while retaining the - // ability to burst throughput. See RFDK's PadEfsStorage documentation for additional details. 
- const padAccessPoint = new AccessPoint(this, 'PaddingAccessPoint', { - fileSystem, - path: '/PaddingFiles', - // TODO - We set the padding files to be owned by root (uid/gid = 0) by default. You may wish to change this. - createAcl: { - ownerGid: '0', - ownerUid: '0', - permissions: '700', - }, - posixUser: { - uid: '0', - gid: '0', - }, - }); - new PadEfsStorage(this, 'PadEfsStorage', { - vpc: props.vpc, - accessPoint: padAccessPoint, - desiredPaddingGB: 40, // Provides 2 MB/s of baseline throughput. Costs $12/month. - }); - // Create an EFS access point that is used to grant the Repository and RenderQueue with write access to the Deadline // Repository directory in the EFS file-system. const accessPoint = new AccessPoint(this, 'AccessPoint', { @@ -134,6 +135,137 @@ export abstract class StorageTier extends cdk.Stack { // that. extraMountOptions: [ 'fsc' ] }); + + // The Amazon EFS filesystem deployed above has been deployed in bursting throughput + // mode. This means that it can burst throughput up to 100 MiB/s (with reads counting as + // 1/3 of their actual throughput for this purpose). However, the baseline throughput of the EFS + // is 50 KiB/s per 1 GiB stored in the filesystem and exceeding this throughput consumes burst credits; + // the EFS regains burst credits when throughput is below the baseline throughput threshold. + // + // The Deadline Repository is approximately 1 GiB in size; resulting in 50 KiB/s baseline throughput, which is + // not sufficient for the operation of Deadline. + // + // The following: + // 1) Sets up a series of AWS CloudWatch Alarms that will send you an email to alert you to take action + // to increase the data stored in the filesystem when the burst credits have decreased below certain thresholds. + // If you run out of burst credits on the filesystem, then Deadline will start timing-out on requests and your + // render farm may become unstable. 
+ // 2) Uses RFDK's PadEfsStorage construct to add data to the EFS for the purpose of increasing the amount + // of stored data to increase the baseline throughput. + // + // See: https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html + // for more information on AWS CloudWatch Alarms. + // See: https://docs.aws.amazon.com/efs/latest/ug/performance.html#throughput-modes + // for more information on Amazon EFS throughput modes. + + if (props.alarmEmail) { + this.addLowEfsBurstCreditAlarms(fileSystem, props.alarmEmail); + } + + // Add padding files to the filesystem to increase baseline throughput. We add files to the filesystem to + // increase this baseline throughput, while retaining the ability to burst throughput. See RFDK's PadEfsStorage + // documentation for additional details. + const padAccessPoint = new AccessPoint(this, 'PaddingAccessPoint', { + fileSystem, + path: '/RFDK_PaddingFiles', + // TODO - We set the padding files to be owned by root (uid/gid = 0) by default. You may wish to change this. + createAcl: { + ownerGid: '0', + ownerUid: '0', + permissions: '700', + }, + posixUser: { + uid: '0', + gid: '0', + }, + }); + new PadEfsStorage(this, 'PadEfsStorage', { + vpc: props.vpc, + accessPoint: padAccessPoint, + desiredPadding: cdk.Size.gibibytes(40), // Provides 2 MiB/s of baseline throughput. Costs $12/month. + }); + + } + + /** + * Set up CloudWatch Alarms that will warn when the given filesystem's burst credits are below + * four different thresholds. We send an email to the given address when an Alarm breaches. + */ + protected addLowEfsBurstCreditAlarms(filesystem: FileSystem, emailAddress: string): void { + // Set up the SNS Topic that will send the emails. + // ==================== + // 1) KMS key to use to encrypt events within the SNS Topic. 
The Key is optional + const key = new Key(this, 'SNSEncryptionKey', { + description: 'Used to encrypt the SNS Topic for sending EFS Burst Credit alerts', + enableKeyRotation: true, + removalPolicy: RemovalPolicy.DESTROY, + trustAccountIdentities: true, + }); + key.grant(new ServicePrincipal('cloudwatch.amazonaws.com'), 'kms:Decrypt', 'kms:GenerateDataKey'); + + // 2) SNS Topic that will be alerted by CloudWatch and will send the email in response. + const snsTopic = new Topic(this, 'BurstAlertEmailTopic', { + masterKey: key, + }); + snsTopic.grantPublish(new ServicePrincipal('cloudwatch.amazonaws.com')); + snsTopic.addSubscription(new EmailSubscription(emailAddress)); + const alarmAction = new SnsAction(snsTopic); + + // Set up the CloudWatch Alarm(s) and have them trigger SNS events when breached. + // ====================== + // 1) CDK helper to define the CloudWatch Metric that we're interested in. + const burstCreditsMetric = new Metric({ + metricName: 'BurstCreditBalance', + namespace: 'AWS/EFS', + dimensions: { + FileSystemId: filesystem.fileSystemId, + }, + // One 99-th percentile data point every 6 hours + period: Duration.hours(6), + statistic: 'p99', + }); + + // 2) Create the alarms + const thresholds = [ + { + id: 'CAUTION-EfsBurstCredits', + name: `CAUTION Burst Credits - ${filesystem.fileSystemId}`, + threshold: 2.00 * 2**40, + message: `CAUTION. 2 TiB Threshold Breached: EFS ${filesystem.fileSystemId} is depleting burst credits. Add data to the EFS to increase baseline throughput.` + }, + { + id: 'WARNING-EfsBurstCredits', + name: `WARNING Burst Credits - ${filesystem.fileSystemId}`, + threshold: 1.25 * 2**40, + message: `WARNING. 1.25 TiB Threshold Breached: EFS ${filesystem.fileSystemId} is depleting burst credits. Add data to the EFS to increase baseline throughput.` + }, + { + id: 'ALERT-EfsBurstCredits', + name: `ALERT Burst Credits - ${filesystem.fileSystemId}`, + threshold: 0.50 * 2**40, + message: `ALERT! 
500 GiB Threshold Breached: EFS ${filesystem.fileSystemId} is running out of burst credits. Add data to the EFS to increase baseline throughput or else the Render Farm may cease operation.` + }, + { + id: 'EMERGENCY-EfsBurstCredits', + name: `EMERGENCY Burst Credits - ${filesystem.fileSystemId}`, + threshold: 0.10 * 2**40, + message: `EMERGENCY! 100 GiB Threshold Breached: EFS ${filesystem.fileSystemId} is running out of burst credits. Add data to the EFS to increase baseline throughput or else the Render Farm will cease operation.` + }, + ] + for (var config of thresholds) { + const alarm = burstCreditsMetric.createAlarm(this, config.id, { + alarmName: config.name, + actionsEnabled: true, + alarmDescription: config.message, + treatMissingData: TreatMissingData.NOT_BREACHING, + threshold: config.threshold, + comparisonOperator: ComparisonOperator.LESS_THAN_THRESHOLD, + // We have 1 datapoint every 6 hours. CloudWatch can check a period of time + // of at most 1 day. So, we alarm if we've gone a full day below the threshold. 
+ evaluationPeriods: 4, + }); + alarm.addAlarmAction(alarmAction); + } } } diff --git a/packages/aws-rfdk/lib/lambdas/nodejs/pad-efs-storage/filesystem-ops.ts b/packages/aws-rfdk/lib/lambdas/nodejs/pad-efs-storage/filesystem-ops.ts index cb2b1806f..c8b61699c 100644 --- a/packages/aws-rfdk/lib/lambdas/nodejs/pad-efs-storage/filesystem-ops.ts +++ b/packages/aws-rfdk/lib/lambdas/nodejs/pad-efs-storage/filesystem-ops.ts @@ -99,7 +99,8 @@ export async function determineNextSequentialFilename(location: string): Promise */ export async function writePaddingFile(filename: string, filesize: number): Promise { const execPromise = promisify(exec); - const command = `/usr/bin/dd if=/dev/zero of=${filename} bs=10M count=${filesize/10}`; + const numberOfBlocks = filesize / 32; + const command = `/usr/bin/dd if=/dev/zero of=${filename} bs=32M count=${numberOfBlocks}`; console.log(`Writing ${filesize}MiB to ${filename}: ${command}`); const { stderr } = await execPromise(command); console.log(stderr); diff --git a/packages/aws-rfdk/lib/lambdas/nodejs/pad-efs-storage/test/handlers.test.ts b/packages/aws-rfdk/lib/lambdas/nodejs/pad-efs-storage/test/handlers.test.ts index 1c5263313..09ff1f055 100644 --- a/packages/aws-rfdk/lib/lambdas/nodejs/pad-efs-storage/test/handlers.test.ts +++ b/packages/aws-rfdk/lib/lambdas/nodejs/pad-efs-storage/test/handlers.test.ts @@ -63,14 +63,14 @@ describe('Testing filesystem modifications', () => { test('Add to empty directory', async () => { // WHEN // Add 5 10MB files to the temp directory. 
- await growFilesystem(5, 10, tempDirectory); + await growFilesystem(5, 64, tempDirectory); // THEN const dirContents = (await fsp.readdir(tempDirectory)).sort(); expect(dirContents).toEqual(['00000', '00001', '00002', '00003', '00004']); for (var file of dirContents) { const stat = await fsp.stat(join(tempDirectory, file)); - expect(stat.size).toBe(10485760); + expect(stat.size).toBe(67108864); } }); @@ -83,7 +83,7 @@ describe('Testing filesystem modifications', () => { // WHEN // Add 2 10MB files to the temp directory. - await growFilesystem(2, 10, tempDirectory); + await growFilesystem(2, 64, tempDirectory); // THEN // Make sure that the files that we added started numbering at 8 @@ -159,15 +159,15 @@ describe('Testing getDiskUsage behavior', () => { test('Correctly calculates disk usage', async () => { // GIVEN - // This overrides the default padding file size to 10MB from 1000MB. Keep this in mind when interpreting the test. - // All of the interface points are phrased in terms of 1GB files, but this little hack changes the semantics of those - // to be phrased in terms of 10MB files. - setDefaultFilesize(10); + // This overrides the default padding file size to 64 MiB from 1024 MiB. Keep this in mind when interpreting the test. + // All of the interface points are phrased in terms of 1 GiB files, but this little hack changes the semantics of those + // to be phrased in terms of 64 MiB files. 
+ setDefaultFilesize(64); const execPromise = promisify(exec); - await execPromise(`/usr/bin/dd if=/dev/zero of=${join(tempDirectory, 'file1.tmp')} bs=10MB count=1`); + await execPromise(`/usr/bin/dd if=/dev/zero of=${join(tempDirectory, 'file1.tmp')} bs=32M count=2`); await fsp.mkdir(join(tempDirectory, 'subdir')); - await execPromise(`/usr/bin/dd if=/dev/zero of=${join(tempDirectory, 'subdir', 'file2.tmp')} bs=10MB count=1`); + await execPromise(`/usr/bin/dd if=/dev/zero of=${join(tempDirectory, 'subdir', 'file2.tmp')} bs=32M count=2`); // WHEN const usage = await getDiskUsage({ @@ -254,10 +254,10 @@ describe('Testing padFilesystem macro behavior', () => { // GIVEN // Empty directory: tempDirectory - // This overrides the default padding file size to 10MB from 1000MB. Keep this in mind when interpreting the test. - // All of the interface points are phrased in terms of 1GB files, but this little hack changes the semantics of those - // to be phrased in terms of 10MB files. - setDefaultFilesize(10); + // This overrides the default padding file size to 64 MiB from 1024 MiB. Keep this in mind when interpreting the test. + // All of the interface points are phrased in terms of 1 GiB files, but this little hack changes the semantics of those + // to be phrased in terms of 64 MiB files. + setDefaultFilesize(64); // WHEN await padFilesystem({ @@ -273,17 +273,18 @@ describe('Testing padFilesystem macro behavior', () => { expect(dirContents).toEqual(['00000']); for (var file of dirContents) { const stat = await fsp.stat(join(tempDirectory, file)); - expect(stat.size).toBe(10485760); + expect(stat.size).toBe(67108864); } }); test('Removes file if needed', async () => { // GIVEN - // This overrides the default padding file size to 10MB from 1000MB. Keep this in mind when interpreting the test. - // All of the interface points are phrased in terms of 1GB files, but this little hack changes the semantics of those - // to be phrased in terms of 10MB files. 
- setDefaultFilesize(10); - // tempDirectory with 2 10MB files in it + // This overrides the default padding file size to 64 MiB from 1024 MiB. Keep this in mind when interpreting the test. + // All of the interface points are phrased in terms of 1 GiB files, but this little hack changes the semantics of those + // to be phrased in terms of 64 MiB files. + setDefaultFilesize(64); + + // tempDirectory with 2 64 MiB files in it await padFilesystem({ desiredPadding: '2', mountPoint: tempDirectory, @@ -309,17 +310,18 @@ describe('Testing padFilesystem macro behavior', () => { expect(dirContents).toEqual(['00000']); for (var file of dirContents) { const stat = await fsp.stat(join(tempDirectory, file)); - expect(stat.size).toBe(10485760); + expect(stat.size).toBe(67108864); } }); test('No change to filesystem', async () => { // GIVEN - // This overrides the default padding file size to 10MB from 1000MB. Keep this in mind when interpreting the test. - // All of the interface points are phrased in terms of 1GB files, but this little hack changes the semantics of those - // to be phrased in terms of 10MB files. - setDefaultFilesize(10); - // tempDirectory with a 10MB file in it + // This overrides the default padding file size to 64 MiB from 1024 MiB. Keep this in mind when interpreting the test. + // All of the interface points are phrased in terms of 1 GiB files, but this little hack changes the semantics of those + // to be phrased in terms of 64 MiB files. 
+ setDefaultFilesize(64); + + // tempDirectory with a 64 MiB file in it await padFilesystem({ desiredPadding: '1', mountPoint: tempDirectory, @@ -345,7 +347,7 @@ describe('Testing padFilesystem macro behavior', () => { expect(dirContents).toEqual(preDirContents); for (var file of dirContents) { const stat = await fsp.stat(join(tempDirectory, file)); - expect(stat.size).toBe(10485760); + expect(stat.size).toBe(67108864); } }); }); From b8a7cad7ef6aa31114414998e36752d5b2d67978 Mon Sep 17 00:00:00 2001 From: Daniel Neilson Date: Wed, 31 Mar 2021 20:21:11 +0000 Subject: [PATCH 2/5] Convert var to let --- .../All-In-AWS-Infrastructure-Basic/ts/lib/storage-tier.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/lib/storage-tier.ts b/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/lib/storage-tier.ts index 2933f6a92..77ddd837b 100644 --- a/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/lib/storage-tier.ts +++ b/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/lib/storage-tier.ts @@ -252,7 +252,7 @@ export abstract class StorageTier extends cdk.Stack { message: `EMERGENCY! 100 GiB Threshold Breached: EFS ${filesystem.fileSystemId} is running out of burst credits. Add data to the EFS to increase baseline throughput or else the Render Farm will cease operation.` }, ] - for (var config of thresholds) { + for (let config of thresholds) { const alarm = burstCreditsMetric.createAlarm(this, config.id, { alarmName: config.name, actionsEnabled: true, From 9b99cc2c9cac884eb4476a2b986577ea8b35229c Mon Sep 17 00:00:00 2001 From: Daniel Neilson Date: Wed, 31 Mar 2021 20:22:50 +0000 Subject: [PATCH 3/5] Fix sloppy comments... 
10M->64M --- .../lambdas/nodejs/pad-efs-storage/test/handlers.test.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/aws-rfdk/lib/lambdas/nodejs/pad-efs-storage/test/handlers.test.ts b/packages/aws-rfdk/lib/lambdas/nodejs/pad-efs-storage/test/handlers.test.ts index 09ff1f055..ccafde78b 100644 --- a/packages/aws-rfdk/lib/lambdas/nodejs/pad-efs-storage/test/handlers.test.ts +++ b/packages/aws-rfdk/lib/lambdas/nodejs/pad-efs-storage/test/handlers.test.ts @@ -62,7 +62,7 @@ describe('Testing filesystem modifications', () => { test('Add to empty directory', async () => { // WHEN - // Add 5 10MB files to the temp directory. + // Add 5 64 MiB files to the temp directory. await growFilesystem(5, 64, tempDirectory); // THEN @@ -82,7 +82,7 @@ describe('Testing filesystem modifications', () => { } // WHEN - // Add 2 10MB files to the temp directory. + // Add 2 64 MiB files to the temp directory. await growFilesystem(2, 64, tempDirectory); // THEN @@ -332,7 +332,7 @@ describe('Testing padFilesystem macro behavior', () => { // WHEN const preDirContents = (await fsp.readdir(tempDirectory)).sort(); - // Desire for 10MB of files + // Desire for 64 MiB of files await padFilesystem({ desiredPadding: '1', mountPoint: tempDirectory, From 93e345a74ef29ab90f41c4aaca3dd86e9144f829 Mon Sep 17 00:00:00 2001 From: Daniel Neilson Date: Wed, 31 Mar 2021 21:55:47 +0000 Subject: [PATCH 4/5] Refine alarm periods --- .../python/package/lib/storage_tier.py | 29 ++++++++++++------- .../ts/lib/storage-tier.ts | 29 ++++++++++++------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/lib/storage_tier.py b/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/lib/storage_tier.py index 50b787146..aebc7f47d 100644 --- a/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/lib/storage_tier.py +++ 
b/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/lib/storage_tier.py @@ -161,8 +161,9 @@ def __init__(self, scope: Construct, stack_id: str, *, props: StorageTierProps, # The Amazon EFS filesystem deployed above has been deployed in bursting throughput # mode. This means that it can burst throughput up to 100 MiB/s (with reads counting as # 1/3 of their actual throughput for this purpose). However, the baseline throughput of the EFS - # is 50 KiB/s per 1 GiB stored in the filesystem and exceeding this throughput consumes burst credits; - # the EFS regains burst credits when throughput is below the baseline throughput threshold. + # is 50 KiB/s per 1 GiB stored in the filesystem and exceeding this throughput consumes burst credits. + # An EFS starts with a large amount of burst credits, and regains credits when throughput is below + # the baseline throughput threshold. # # The Deadline Repository is approximately 1 GiB in size; resulting in 50 KiB/s baseline throughput, which is # not sufficient for the operation of Deadline. @@ -247,8 +248,8 @@ def add_low_efs_burst_credit_alarms(self, filesystem: FileSystem, email_address: dimensions={ "FileSystemId": filesystem.file_system_id }, - # One 99-th percentile data point every 6 hours - period=Duration.hours(6), + # One 99-th percentile data point sample every hour + period=Duration.hours(1), statistic='p99' ) @@ -258,25 +259,33 @@ def add_low_efs_burst_credit_alarms(self, filesystem: FileSystem, email_address: "id": 'CAUTION-EfsBurstCredits', "name": f"CAUTION Burst Credits - {filesystem.file_system_id}", "threshold": int(2.00 * 2**40), - "message": f"CAUTION. 2 TiB Threshold Breached: EFS {filesystem.file_system_id} is depleting burst credits. Add data to the EFS to increase baseline throughput." + "message": f"CAUTION. 2 TiB Threshold Breached: EFS {filesystem.file_system_id} is depleting burst credits. 
Add data to the EFS to increase baseline throughput.", + # Alarm after 6 datapoints below threshold. We have 1 datapoint every hour. So, we alarm if below threshold for 6hrs + "datapoints": 6 }, { "id": 'WARNING-EfsBurstCredits', "name": f"WARNING Burst Credits - {filesystem.file_system_id}", "threshold": int(1.25 * 2**40), - "message": f"WARNING. 1.25 TiB Threshold Breached: EFS {filesystem.file_system_id} is depleting burst credits. Add data to the EFS to increase baseline throughput." + "message": f"WARNING. 1.25 TiB Threshold Breached: EFS {filesystem.file_system_id} is depleting burst credits. Add data to the EFS to increase baseline throughput.", + # Alarm after 6 datapoints below threshold. We have 1 datapoint every hour. So, we alarm if below threshold for 6hrs + "datapoints": 6 }, { "id": 'ALERT-EfsBurstCredits', "name": f"ALERT Burst Credits - {filesystem.file_system_id}", "threshold": int(0.50 * 2**40), - "message": f"ALERT! 500 GiB Threshold Breached: EFS {filesystem.file_system_id} is running out of burst credits. Add data to the EFS to increase baseline throughput or else the Render Farm may cease operation." + "message": f"ALERT! 500 GiB Threshold Breached: EFS {filesystem.file_system_id} is running out of burst credits. Add data to the EFS to increase baseline throughput or else the Render Farm may cease operation.", + # Alarm after 6 datapoints below threshold. We have 1 datapoint every hour. So, we alarm if below threshold for 6hrs + "datapoints": 6 }, { "id": 'EMERGENCY-EfsBurstCredits', "name": f"EMERGENCY Burst Credits - {filesystem.file_system_id}", "threshold": int(0.10 * 2**40), - "message": f"EMERGENCY! 100 GiB Threshold Breached: EFS {filesystem.file_system_id} is running out of burst credits. Add data to the EFS to increase baseline throughput or else the Render Farm will cease operation." + "message": f"EMERGENCY! 100 GiB Threshold Breached: EFS {filesystem.file_system_id} is running out of burst credits. 
Add data to the EFS to increase baseline throughput or else the Render Farm will cease operation.", + # Alarm after 2 datapoints below threshold. We have 1 datapoint every hour. So, we alarm if below threshold for 2hrs + "datapoints": 2 }, ] for config in thresholds: @@ -289,9 +298,7 @@ def add_low_efs_burst_credit_alarms(self, filesystem: FileSystem, email_address: treat_missing_data=TreatMissingData.NOT_BREACHING, threshold=config['threshold'], comparison_operator=ComparisonOperator.LESS_THAN_THRESHOLD, - # We have 1 datapoint every 6 hours. CloudWatch can check a period of time - # of at most 1 day. So, we alarm if we've gone a full day below the threshold. - evaluation_periods=4 + evaluation_periods=config['datapoints'] ) alarm.add_alarm_action(alarm_action) diff --git a/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/lib/storage-tier.ts b/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/lib/storage-tier.ts index 77ddd837b..3ea5eda71 100644 --- a/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/lib/storage-tier.ts +++ b/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/lib/storage-tier.ts @@ -139,8 +139,9 @@ export abstract class StorageTier extends cdk.Stack { // The Amazon EFS filesystem deployed above has been deployed in bursting throughput // mode. This means that it can burst throughput up to 100 MiB/s (with reads counting as // 1/3 of their actual throughput for this purpose). However, the baseline throughput of the EFS - // is 50 KiB/s per 1 GiB stored in the filesystem and exceeding this throughput consumes burst credits; - // the EFS regains burst credits when throughput is below the baseline throughput threshold. + // is 50 KiB/s per 1 GiB stored in the filesystem and exceeding this throughput consumes burst credits. + // An EFS starts with a large amount of burst credits, and regains credits when throughput is below + // the baseline throughput threshold. 
// // The Deadline Repository is approximately 1 GiB in size; resulting in 50 KiB/s baseline throughput, which is // not sufficient for the operation of Deadline. @@ -220,8 +221,8 @@ export abstract class StorageTier extends cdk.Stack { dimensions: { FileSystemId: filesystem.fileSystemId, }, - // One 99-th percentile data point every 6 hours - period: Duration.hours(6), + // One 99-th percentile data point sample every hour + period: Duration.hours(1), statistic: 'p99', }); @@ -231,25 +232,33 @@ export abstract class StorageTier extends cdk.Stack { id: 'CAUTION-EfsBurstCredits', name: `CAUTION Burst Credits - ${filesystem.fileSystemId}`, threshold: 2.00 * 2**40, - message: `CAUTION. 2 TiB Threshold Breached: EFS ${filesystem.fileSystemId} is depleting burst credits. Add data to the EFS to increase baseline throughput.` + message: `CAUTION. 2 TiB Threshold Breached: EFS ${filesystem.fileSystemId} is depleting burst credits. Add data to the EFS to increase baseline throughput.`, + // Alarm after 6 datapoints below threshold. We have 1 datapoint every hour. So, we alarm if below threshold for 6hrs + datapoints: 6 }, { id: 'WARNING-EfsBurstCredits', name: `WARNING Burst Credits - ${filesystem.fileSystemId}`, threshold: 1.25 * 2**40, - message: `WARNING. 1.25 TiB Threshold Breached: EFS ${filesystem.fileSystemId} is depleting burst credits. Add data to the EFS to increase baseline throughput.` + message: `WARNING. 1.25 TiB Threshold Breached: EFS ${filesystem.fileSystemId} is depleting burst credits. Add data to the EFS to increase baseline throughput.`, + // Alarm after 6 datapoints below threshold. We have 1 datapoint every hour. So, we alarm if below threshold for 6hrs + datapoints: 6 }, { id: 'ALERT-EfsBurstCredits', name: `ALERT Burst Credits - ${filesystem.fileSystemId}`, threshold: 0.50 * 2**40, - message: `ALERT! 500 GiB Threshold Breached: EFS ${filesystem.fileSystemId} is running out of burst credits. 
Add data to the EFS to increase baseline throughput or else the Render Farm may cease operation.` + message: `ALERT! 500 GiB Threshold Breached: EFS ${filesystem.fileSystemId} is running out of burst credits. Add data to the EFS to increase baseline throughput or else the Render Farm may cease operation.`, + // Alarm after 6 datapoints below threshold. We have 1 datapoint every hour. So, we alarm if below threshold for 6hrs + datapoints: 6 }, { id: 'EMERGENCY-EfsBurstCredits', name: `EMERGENCY Burst Credits - ${filesystem.fileSystemId}`, threshold: 0.10 * 2**40, - message: `EMERGENCY! 100 GiB Threshold Breached: EFS ${filesystem.fileSystemId} is running out of burst credits. Add data to the EFS to increase baseline throughput or else the Render Farm will cease operation.` + message: `EMERGENCY! 100 GiB Threshold Breached: EFS ${filesystem.fileSystemId} is running out of burst credits. Add data to the EFS to increase baseline throughput or else the Render Farm will cease operation.`, + // Alarm after 2 datapoints below threshold. We have 1 datapoint every hour. So, we alarm if below threshold for 2hrs + datapoints: 2 }, ] for (let config of thresholds) { @@ -260,9 +269,7 @@ export abstract class StorageTier extends cdk.Stack { treatMissingData: TreatMissingData.NOT_BREACHING, threshold: config.threshold, comparisonOperator: ComparisonOperator.LESS_THAN_THRESHOLD, - // We have 1 datapoint every 6 hours. CloudWatch can check a period of time - // of at most 1 day. So, we alarm if we've gone a full day below the threshold. 
- evaluationPeriods: 4, + evaluationPeriods: config.datapoints, }); alarm.addAlarmAction(alarmAction); } From 93617bb0c4b9d55f16f2cb8df293f5929969d7f0 Mon Sep 17 00:00:00 2001 From: Daniel Neilson Date: Wed, 31 Mar 2021 21:58:05 +0000 Subject: [PATCH 5/5] Fix spelling error in comments --- .../All-In-AWS-Infrastructure-Basic/python/package/config.py | 2 +- .../deadline/All-In-AWS-Infrastructure-Basic/ts/bin/config.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/config.py b/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/config.py index 8b855a958..30e366eff 100644 --- a/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/config.py +++ b/examples/deadline/All-In-AWS-Infrastructure-Basic/python/package/config.py @@ -28,7 +28,7 @@ def __init__(self): # See https://www.awsthinkbox.com/end-user-license-agreement for the terms of the agreement. self.accept_aws_thinkbox_eula: AwsThinkboxEulaAcceptance = AwsThinkboxEulaAcceptance.USER_REJECTS_AWS_THINKBOX_EULA - # Fill this in if you want to recieve alarm emails when: + # Fill this in if you want to receive alarm emails when: # 1) You are crossing thresholds on decreasing burst Credits on the Amazon EFS that is # set up in the StorageTier, for the Deadline Repository. 
# diff --git a/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/bin/config.ts b/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/bin/config.ts index 226259811..d6482bab4 100644 --- a/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/bin/config.ts +++ b/examples/deadline/All-In-AWS-Infrastructure-Basic/ts/bin/config.ts @@ -24,7 +24,7 @@ class AppConfig { public readonly acceptAwsThinkboxEula: AwsThinkboxEulaAcceptance = AwsThinkboxEulaAcceptance.USER_REJECTS_AWS_THINKBOX_EULA; /** - * Fill this in if you want to recieve alarm emails when: + * Fill this in if you want to receive alarm emails when: * 1) You are crossing thresholds on decreasing burst Credits on the Amazon EFS that is * set up in the StorageTier, for the Deadline Repository. *