Skip to content

Commit

Permalink
Revert "feat: Encrypt s3 buckets for EMR log bucket and CICD Artifact…
Browse files Browse the repository at this point in the history
… bucket (awslabs#508)"

This reverts commit d276d1c.
  • Loading branch information
manikandan-thangavelu-rl authored Jun 14, 2021
1 parent a87ffcb commit bdac8a4
Show file tree
Hide file tree
Showing 6 changed files with 652 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import ec2RStudioInstance from '../templates/ec2-rstudio-instance.cfn.yml';
import ec2LinuxInstance from '../templates/ec2-linux-instance.cfn.yml';
import ec2WindowsInstance from '../templates/ec2-windows-instance.cfn.yml';
import sagemakerInstance from '../templates/sagemaker-notebook-instance.cfn.yml';
import emrCluster from '../templates/emr-cluster.cfn.yml';
import onboardAccount from '../templates/onboard-account.cfn.yml';
import storageGatewayNetworkInfra from '../templates/storage-gateway/network-infrastructure.cfn.yml';
import applicationLoadBalancer from '../templates/application-load-balancer.cfn.yml';
Expand All @@ -31,6 +32,7 @@ const templates = [
add('ec2-linux-instance', ec2LinuxInstance),
add('ec2-windows-instance', ec2WindowsInstance),
add('sagemaker-notebook-instance', sagemakerInstance),
add('emr-cluster', emrCluster),
add('onboard-account', onboardAccount),
add('storage-gateway-network-infra', storageGatewayNetworkInfra),
add('application-load-balancer', applicationLoadBalancer),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,314 @@
AWSTemplateFormatVersion: 2010-09-09

Description: Service-Workbench-on-AWS EMR-Hail-Jupyter

Metadata:
AWS::CloudFormation::Interface:
ParameterGroups:
- Label:
default: EMR Options
Parameters:
- Namespace
- KeyName
- VPC
- Subnet
- CoreNodeCount
- DiskSizeGB
- MasterInstanceType
- WorkerInstanceType
- WorkerBidPrice
- AccessFromCIDRBlock
- AmiId
- Label:
default: Tags
Parameters:
- NameTag
- OwnerTag
- PurposeTag

Parameters:
Namespace:
Type: String
Description: An environment name that will be prefixed to resource names
KeyName:
Description: SSH key pair to use for EMR node login
Type: AWS::EC2::KeyPair::KeyName
VPC:
Description: VPC for EMR nodes.
Type: AWS::EC2::VPC::Id
Subnet:
Description: Subnet for EMR nodes, from the VPC selected above
Type: AWS::EC2::Subnet::Id
CoreNodeCount:
Description: Number of core nodes to provision (1-80)
Type: Number
MinValue: '1'
MaxValue: '80'
Default: '5'
DiskSizeGB:
Description: EBS Volume size (GB) for each node
Type: Number
MinValue: '10'
MaxValue: '1000'
Default: '20'
MasterInstanceType:
Type: String
Default: m5.xlarge
Description: EMR node ec2 instance type.
WorkerInstanceType:
Type: String
Default: m5.xlarge
Description: EMR node ec2 instance type.
Market:
Type: String
Default: ON_DEMAND
Description: Which market to purchase workers on - ON_DEMAND or SPOT.
WorkerBidPrice:
Type: String
Description: Bid price for the worker spot nodes. This is only applicable when Market = SPOT. Specify 0 for Market = ON_DEMAND.
AccessFromCIDRBlock:
Type: String
MinLength: 9
Description: Restrict WebUI access to specified address or range
AmiId:
Type: String
Description: Ami Id to use for the cluster
EnvironmentInstanceFiles:
Type: String
Description: >-
An S3 URI (starting with "s3://") that specifies the location of files to be copied to
the environment instance, including any bootstrap scripts
S3Mounts:
Type: String
Description: A JSON array of objects with name, bucket and prefix properties used to mount data
IamPolicyDocument:
Type: String
Description: The IAM policy to be associated with the launched workstation
EncryptionKeyArn:
Type: String
Description: The ARN of the KMS encryption Key used to encrypt data in the cluster

Conditions:
IamPolicyEmpty: !Equals [!Ref IamPolicyDocument, '{}']
IsOnDemandCondition: !Equals [!Ref Market, ON_DEMAND]

Resources:
# TODO: Use one bucket for EMR logs per account, so shift deployment to account on-boarding and pass here as param
LogBucket:
Type: AWS::S3::Bucket
DeletionPolicy: Retain
Properties:
PublicAccessBlockConfiguration: # Block all public access configuration for the S3 bucket
BlockPublicAcls: true
BlockPublicPolicy: true
IgnorePublicAcls: true
RestrictPublicBuckets: true

MasterSecurityGroup:
Type: AWS::EC2::SecurityGroup
Properties:
GroupDescription: Jupyter
VpcId:
Ref: VPC
SecurityGroupIngress:
- IpProtocol: tcp
FromPort: 8192
ToPort: 8192
CidrIp:
Ref: AccessFromCIDRBlock

InstanceProfile:
Properties:
Path: '/'
Roles:
- Ref: Ec2Role
Type: AWS::IAM::InstanceProfile

Ec2Role:
Type: 'AWS::IAM::Role'
Properties:
RoleName: !Join ['-', [Ref: Namespace, 'ec2-role']]
Path: '/'
AssumeRolePolicyDocument:
Version: '2012-10-17'
Statement:
- Effect: 'Allow'
Principal:
Service:
- 'ec2.amazonaws.com'
Action:
- 'sts:AssumeRole'
Policies:
- !If
- IamPolicyEmpty
- !Ref 'AWS::NoValue'
- PolicyName: !Join ['-', [Ref: Namespace, 's3-studydata-policy']]
PolicyDocument: !Ref IamPolicyDocument
- PolicyName: !Join ['-', [Ref: Namespace, 's3-bootstrap-script-policy']]
PolicyDocument:
Version: '2012-10-17'
Statement:
- Effect: 'Allow'
Action: 's3:GetObject'
Resource:
- 'arn:aws:s3:::us-east-1.elasticmapreduce/bootstrap-actions/run-if'
- !Sub
- 'arn:aws:s3:::${S3Location}/*'
# Remove "s3://" prefix from EnvironmentInstanceFiles
- S3Location: !Select [1, !Split ['s3://', !Ref EnvironmentInstanceFiles]]
- Effect: 'Allow'
Action: 's3:ListBucket'
Resource: !Sub
- 'arn:aws:s3:::${S3Bucket}'
- S3Bucket: !Select [2, !Split ['/', !Ref EnvironmentInstanceFiles]]
Condition:
StringLike:
s3:prefix: !Sub
- '${S3Prefix}/*'
- S3Prefix: !Select [3, !Split ['/', !Ref EnvironmentInstanceFiles]]

ServiceRole:
Type: AWS::IAM::Role
Properties:
Path: '/'
AssumeRolePolicyDocument:
Statement:
- Action:
- sts:AssumeRole
Effect: Allow
Principal:
Service:
- elasticmapreduce.amazonaws.com
Version: '2012-10-17'
ManagedPolicyArns:
- arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceRole

EmrSecurityConfiguration:
Type: AWS::EMR::SecurityConfiguration
Properties:
SecurityConfiguration:
{
'EncryptionConfiguration':
{
'AtRestEncryptionConfiguration':
{
'LocalDiskEncryptionConfiguration':
{
'EncryptionKeyProviderType': 'AwsKms',
'AwsKmsKey': { 'Ref': 'EncryptionKeyArn' },
'EnableEbsEncryption': true,
},
},
'EnableInTransitEncryption': false,
'EnableAtRestEncryption': true,
},
}

# TODO: customise jupyter password from launch ui
# TODO: Add security configuration to cluster
# TODO: Can we make the jupyter use https?
# TODO: Also change notebook owner to hadoop on launch
EmrCluster:
Type: AWS::EMR::Cluster
Properties:
Applications:
- Name: Hadoop
- Name: Hive
- Name: Spark
BootstrapActions:
- Name: Run-Python-Jupyter
ScriptBootstrapAction:
Path: s3://us-east-1.elasticmapreduce/bootstrap-actions/run-if
Args:
- 'instance.isMaster=true'
- '/opt/hail-on-AWS-spot-instances/src/jupyter_run.sh'
- Name: Mount-S3-Resources
ScriptBootstrapAction:
Path: !Sub '${EnvironmentInstanceFiles}/get_bootstrap.sh'
Args:
- !Ref EnvironmentInstanceFiles
- !Ref S3Mounts
CustomAmiId:
Ref: AmiId
Configurations:
- Classification: spark
ConfigurationProperties:
maximizeResourceAllocation: true
- Classification: yarn-site
ConfigurationProperties:
yarn.nodemanager.vmem-check-enabled: false
- Classification: spark-defaults
ConfigurationProperties:
spark.hadoop.io.compression.codecs: 'org.apache.hadoop.io.compress.DefaultCodec,is.hail.io.compress.BGzipCodec,org.apache.hadoop.io.compress.GzipCodec'
spark.serializer: 'org.apache.spark.serializer.KryoSerializer'
spark.hadoop.parquet.block.size: '1099511627776'
spark.sql.files.maxPartitionBytes: '1099511627776'
spark.sql.files.openCostInBytes: '1099511627776'
Configurations: []
Instances:
AdditionalMasterSecurityGroups:
- Fn::GetAtt:
- MasterSecurityGroup
- GroupId
Ec2KeyName:
Ref: KeyName
Ec2SubnetId:
Ref: Subnet
MasterInstanceGroup:
InstanceCount: 1
InstanceType:
Ref: MasterInstanceType
CoreInstanceGroup: !If
- IsOnDemandCondition
- InstanceCount:
Ref: CoreNodeCount
InstanceType:
Ref: WorkerInstanceType
Market:
Ref: Market
EbsConfiguration:
EbsOptimized: true
EbsBlockDeviceConfigs:
- VolumeSpecification:
SizeInGB:
Ref: DiskSizeGB
VolumeType: gp2
- InstanceCount:
Ref: CoreNodeCount
InstanceType:
Ref: WorkerInstanceType
Market:
Ref: Market
BidPrice:
Ref: WorkerBidPrice
EbsConfiguration:
EbsOptimized: true
EbsBlockDeviceConfigs:
- VolumeSpecification:
SizeInGB:
Ref: DiskSizeGB
VolumeType: gp2
JobFlowRole:
Ref: InstanceProfile
Name: !Sub '${Namespace}-emr'
Tags: # Add Name tag so EC2 instances are easily identifiable
- Key: Name
Value: !Sub '${Namespace}-emr'
ServiceRole:
Ref: ServiceRole
ReleaseLabel: emr-5.27.0
# This has to be true because we assume a new user each time.
VisibleToAllUsers: true
SecurityConfiguration: !Ref EmrSecurityConfiguration
LogUri: !Sub 's3://${LogBucket}'

Outputs:
JupyterUrl:
Description: Open Jupyter on your new EMR cluster
Value: !Sub 'http://${EmrCluster.MasterPublicDNS}:8192'
LogBucket:
Description: EMR Scratch data and Logs bucket
Value: !Ref LogBucket
WorkspaceInstanceRoleArn:
Description: IAM role assumed by the EMR workspace instances
Value: !GetAtt Ec2Role.Arn
Loading

0 comments on commit bdac8a4

Please sign in to comment.