Skip to content

Commit

Permalink
Review fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
cartalla committed Jul 8, 2022
1 parent 13601d3 commit 8b3b89c
Show file tree
Hide file tree
Showing 28 changed files with 128 additions and 673 deletions.
7 changes: 0 additions & 7 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,11 +1,4 @@

.mkdocs_venv/
site/
.vscode/

# Jekyll
Gemfile.lock
.jekyll-cache
.mkdocs_venv/
_site
site/
Expand Down
4 changes: 0 additions & 4 deletions source/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,6 @@

app = App()

# TODO: Create a stack for each additional region to create resources needed to create instances in those regions.
# * Instance profile
# * Security group

cdk_env = Environment(
account = app.node.try_get_context('account_id'),
region = app.node.try_get_context('region')
Expand Down
21 changes: 12 additions & 9 deletions source/cdk/cdk_slurm_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,9 @@ def check_config(self):
exit(1)

if not self.config['slurm']['InstanceConfig']['Regions']:
default_region = {
self.config['slurm']['InstanceConfig']['Regions'] = {}
self.config['slurm']['InstanceConfig']['Regions'][self.config['Region']] = {
'VpcId': self.config['VpcId'],
'CIDR': self.config['CIDR'],
'SshKeyPair': self.config['SshKeyPair'],
'AZs': [
Expand All @@ -392,7 +394,6 @@ def check_config(self):
}
]
}
self.config['slurm']['InstanceConfig']['Regions'][self.config['Region']] = default_region

self.compute_regions = {}
self.remote_compute_regions = {}
Expand Down Expand Up @@ -648,7 +649,7 @@ def create_security_groups(self):
Tags.of(self.zfs_sg).add("Name", f"{self.stack_name}-ZfsSG")
self.suppress_cfn_nag(self.zfs_sg, 'W29', 'Egress port range used to block all egress')

# Compute nodes may use lustre file systems to create a security group with the required ports.
# Compute nodes may use lustre file systems so create a security group with the required ports.
self.lustre_sg = ec2.SecurityGroup(self, "LustreSG", vpc=self.vpc, allow_all_outbound=False, description="Lustre Security Group")
Tags.of(self.lustre_sg).add("Name", f"{self.stack_name}-LustreSG")
self.suppress_cfn_nag(self.lustre_sg, 'W29', 'Egress port range used to block all egress')
Expand Down Expand Up @@ -735,6 +736,7 @@ def create_security_groups(self):
fs_client_sg.connections.allow_to(self.nfs_sg, ec2.Port.tcp(2049), f"{fs_client_sg_name} to Nfs")
if self.onprem_cidr:
self.nfs_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp(2049), 'OnPremNodes to Nfs')
# Allow compute nodes in remote regions access to NFS
for compute_region, compute_region_cidr in self.remote_compute_regions.items():
self.nfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(2049), f"{compute_region} to Nfs")

Expand All @@ -759,6 +761,7 @@ def create_security_groups(self):
self.zfs_sg.connections.allow_from(self.onprem_cidr, ec2.Port.udp_range(20001, 20003), 'OnPremNodes to Zfs')
self.suppress_cfn_nag(self.zfs_sg, 'W27', 'Correct, restricted range for zfs: 20001-20003')
self.suppress_cfn_nag(self.zfs_sg, 'W29', 'Correct, restricted range for zfs: 20001-20003')
# Allow compute nodes in remote regions access to ZFS
for compute_region, compute_region_cidr in self.remote_compute_regions.items():
self.zfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(111), f"{compute_region} to Zfs")
self.zfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.udp(111), f"{compute_region} to Zfs")
Expand All @@ -785,6 +788,7 @@ def create_security_groups(self):
self.lustre_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp_range(1021, 1023), 'OnPremNodes to Lustre')
self.lustre_sg.connections.allow_to(self.onprem_cidr, ec2.Port.tcp(988), f"Lustre to OnPremNodes")
self.lustre_sg.connections.allow_to(self.onprem_cidr, ec2.Port.tcp_range(1021, 1023), f"Lustre to OnPremNodes")
# Allow compute nodes in remote regions access to Lustre
for compute_region, compute_region_cidr in self.remote_compute_regions.items():
self.lustre_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(988), f"{compute_region} to Lustre")
self.lustre_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp_range(1021, 1023), f"{compute_region} to Lustre")
Expand Down Expand Up @@ -988,6 +992,8 @@ def create_elasticsearch(self):
self.config['slurm']['JobCompLoc'] = f"http://{domain_endpoint}/slurm/_doc"

def create_file_system(self):
self.slurmfs_fqdn = f"slurmfs.{self.config['Domain']}"

if 'kms_key_arn' in self.config['slurm']['storage']:
kms_key = kms.Key.from_key_arn(self.config['slurm']['storage']['kms_key_arn'])
else:
Expand Down Expand Up @@ -1057,7 +1063,7 @@ def create_file_system(self):

self.file_system_mount_name = ""

self.file_system_mount_source = f"{self.file_system_ip_address}:/"
self.file_system_mount_source = f"{self.slurmfs_fqdn}:/"

if self.config['slurm']['storage']['efs']['use_efs_helper']:
self.file_system_type = 'efs'
Expand Down Expand Up @@ -1155,7 +1161,7 @@ def create_file_system(self):

self.file_system_mount_name = ""

self.file_system_mount_source = f"{self.file_system_ip_address}:/slurm"
self.file_system_mount_source = f"{self.slurmfs_fqdn}:/slurm"

self.file_system_options = 'nfsvers=4.1'

Expand Down Expand Up @@ -1237,7 +1243,7 @@ def create_file_system(self):

self.file_system_mount_name = ""

self.file_system_mount_source = f"{self.file_system_ip_address}:/fsx/slurm"
self.file_system_mount_source = f"{self.slurmfs_fqdn}:/fsx/slurm"

self.file_system_options = 'nfsvers=4.1'

Expand All @@ -1255,7 +1261,6 @@ def create_file_system(self):
record_name = 'slurmfs',
target = route53.RecordTarget.from_ip_addresses(self.file_system_ip_address)
)

CfnOutput(self, "FileSystemProvider",
value = self.config['slurm']['storage']['provider']
)
Expand Down Expand Up @@ -1725,7 +1730,6 @@ def get_instance_template_vars(self, instance_role):
"ERROR_SNS_TOPIC_ARN": self.config['ErrorSnsTopicArn'],
"ExtraMounts": self.config['slurm']['storage']['ExtraMounts'],
"FileSystemDns": self.file_system_dns,
"FileSystemIpAddress": self.file_system_ip_address,
"FileSystemMountPath": self.config['slurm']['storage']['mount_path'],
"FileSystemMountSrc": self.file_system_mount_source,
"FileSystemOptions": self.file_system_options,
Expand All @@ -1749,7 +1753,6 @@ def get_instance_template_vars(self, instance_role):
else:
instance_template_vars["AccountingStorageHost"] = ''
instance_template_vars["CloudWatchPeriod"] = self.config['slurm']['SlurmCtl']['CloudWatchPeriod']
instance_template_vars["CloudWatchPeriod"] = self.config['slurm']['SlurmCtl']['CloudWatchPeriod']
instance_template_vars["DefaultPartition"] = self.default_partition
if 'Federation' in self.config['slurm']:
instance_template_vars["Federation"] = self.config['slurm']['Federation']['Name']
Expand Down
53 changes: 39 additions & 14 deletions source/cdk/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,18 @@
'AFTER_90_DAYS'
]

eda_instance_families = [
default_eda_instance_families = [
#'c5', # Mixed depending on size
'c5a', # AMD EPYC 7R32 3.3 GHz
#'c5a', # AMD EPYC 7R32 3.3 GHz
#'c5ad', # AMD EPYC 7R32 3.3 GHz
'c6a',
'c6i', # Intel Xeon 8375C (Ice Lake) 3.5 GHz
'c6g', # AWS Graviton2 Processor 2.5 GHz
#'c6gd', # AWS Graviton2 Processor 2.5 GHz
#'f1', # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz
'm5', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
#'m5d', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
'm5a', # AMD EPYC 7571 2.5 GHz
#'m5a', # AMD EPYC 7571 2.5 GHz
#'m5ad', # AMD EPYC 7571 2.5 GHz
'm5zn', # Intel Xeon Platinum 8252 4.5 GHz
'm6a', # AMD EPYC 7R13 Processor 3.6 GHz
Expand All @@ -82,13 +83,37 @@
#'u-12tb1', # Intel Xeon Scalable (Skylake) 12 TB
]

eda_instance_types = [
default_eda_instance_types = [
#'c5\.(l|x|2|4|9|18).*', # Intel Xeon Platinum 8124M 3.4 GHz
'c5\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz
#'c5\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz
#'c5d\.(l|x|2|4|9|18).*', # Intel Xeon Platinum 8124M 3.4 GHz
#'c5d\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz
]

# Instance families that are excluded by default when the configuration does
# not supply its own Exclude/InstanceFamilies list. Each entry is treated as a
# regular expression by the instance selection logic.
default_excluded_instance_families = [
    'a1',    # Graviton 1
    'c4',    # Replaced by c5
    'd2',    # SSD optimized
    'g3',    # Replaced by g4
    'g3s',   # Replaced by g4
    'h1',    # SSD optimized
    'i3',    # SSD optimized
    'i3en',  # SSD optimized
    'm4',    # Replaced by m5
    'p2',    # Replaced by p3
    'p3',
    'p3dn',
    'r4',    # Replaced by r5
    't2',    # Replaced by t3
    'x1',
    'x1e',
]

# Instance types that are excluded by default when the configuration does not
# supply its own Exclude/InstanceTypes list. Each entry is a regular
# expression; raw strings are used so that the '\.' escapes reach the regex
# engine unchanged instead of relying on Python's deprecated handling of
# unrecognized string escape sequences.
default_excluded_instance_types = [
    r'.+\.(micro|nano)',  # Not enough memory
    r'.*\.metal',
]

# The config file is used in the installer and the CDK app.
# Some configuration values are required in the CDK app but are optional so that they can be set by the installer.
config_schema = Schema(
Expand Down Expand Up @@ -252,22 +277,22 @@
},
# Include*/Exclude*:
# Instance families and types are regular expressions with implicit '^' and '$' at the beginning and end.
# Exclude patterns are processed first and take precedence over any includes.
# Exclude patterns are processed first and take precedence over any includes.
# An empty list is the same as '.*'.
'Include': {
Optional('Exclude', default={'InstanceFamilies': default_excluded_instance_families, 'InstanceTypes': default_excluded_instance_types}): {
Optional('InstanceFamilies', default=default_excluded_instance_families): [str],
Optional('InstanceTypes', default=default_excluded_instance_types): [str]
},
Optional('Include', default={'MaxSizeOnly': False, 'InstanceFamilies': default_eda_instance_families, 'InstanceTypes': default_eda_instance_types}): {
# MaxSizeOnly:
# If MaxSizeOnly is True then only the largest instance type in
# a family will be included unless specific instance types are included.
# Default: false
Optional('MaxSizeOnly', default=False): bool,
'InstanceFamilies': [str],
'InstanceTypes': [str]
},
Optional('Exclude', default={'InstanceFamilies': [], 'InstanceTypes': []}): {
'InstanceFamilies': [str],
'InstanceTypes': [str]
Optional('InstanceFamilies', default=default_eda_instance_families): [str],
Optional('InstanceTypes', default=default_eda_instance_types): [str]
},
Optional('Regions', default=[]): {
Optional('Regions', default={}): {
str: {
'VpcId': And(str, lambda s: re.match('vpc-', s)),
'CIDR': str,
Expand Down
7 changes: 0 additions & 7 deletions source/resources/config/default_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,16 @@ slurm:
# A partition will be created for each combination of Base OS, Architecture, and Spot
InstanceConfig:
UseSpot: true
DefaultPartition: AlmaLinux_8_arm64_spot
NodesPerInstanceType: 10
BaseOsArchitecture:
AlmaLinux: {8: [x86_64, arm64]}
CentOS:
7: [x86_64]
Include:
MaxSizeOnly: false
InstanceFamilies:
- t3
- t4g
InstanceTypes: []
Exclude:
InstanceFamilies: []
InstanceTypes:
- '.+\.(micro|nano)' # Not enough memory
- '.*\.metal'

storage:
provider: zfs
Expand Down
23 changes: 0 additions & 23 deletions source/resources/config/slurm_all_instance_types.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,35 +14,12 @@ slurm:
# A partition will be created for each combination of Base OS, Architecture, and Spot
InstanceConfig:
UseSpot: true
DefaultPartition: CentOS_7_x86_64_spot
NodesPerInstanceType: 10
BaseOsArchitecture:
AlmaLinux: {8: [x86_64, arm64]}
Include:
MaxSizeOnly: false
InstanceFamilies: []
InstanceTypes: []
Exclude:
InstanceFamilies:
- a1 # Graviton 1
- c4 # Replaced by c5
- d2 # SSD optimized
- g3 # Replaced by g4
- g3s # Replaced by g4
- h1 # SSD optimized
- i3 # SSD optimized
- i3en # SSD optimized
- m4 # Replaced by m5
- p2 # Replaced by p3
- p3
- p3dn
- r4 # Replaced by r5
- t2 # Replaced by t3
- u
- x1
- x1e
InstanceTypes:
- '.*\.metal'

# Use defaults from schema
SlurmCtl: {}
Expand Down
6 changes: 0 additions & 6 deletions source/resources/config/slurm_all_os.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ slurm:
# A partition will be created for each combination of Base OS, Architecture, and Spot
InstanceConfig:
UseSpot: true
DefaultPartition: CentOS_7_x86_64_spot
NodesPerInstanceType: 10
BaseOsArchitecture:
AlmaLinux: {8: [x86_64, arm64]}
Expand All @@ -28,15 +27,10 @@ slurm:
8: [x86_64, arm64]
Rocky: {8: [x86_64, arm64]}
Include:
MaxSizeOnly: false
InstanceFamilies:
- t3
- t4g
InstanceTypes: []
Exclude:
InstanceFamilies: []
InstanceTypes:
- '.*\.metal'

# Use defaults from schema
SlurmCtl: {}
Expand Down
6 changes: 0 additions & 6 deletions source/resources/config/slurm_alma_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,14 @@ slurm:
# A partition will be created for each combination of Base OS, Architecture, and Spot
InstanceConfig:
UseSpot: true
DefaultPartition: AlmaLinux_8_x86_64_spot
NodesPerInstanceType: 10
BaseOsArchitecture:
AlmaLinux: {8: [x86_64, arm64]}
Include:
MaxSizeOnly: false
InstanceFamilies:
- t3
- t4g
InstanceTypes: []
Exclude:
InstanceFamilies: []
InstanceTypes:
- '.*\.metal'

# Use defaults from schema
SlurmCtl: {}
Expand Down
48 changes: 0 additions & 48 deletions source/resources/config/slurm_eda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,59 +24,11 @@ slurm:
# A partition will be created for each combination of Base OS, Architecture, and Spot
InstanceConfig:
UseSpot: true
DefaultPartition: CentOS_7_x86_64_spot
NodesPerInstanceType: 5
BaseOsArchitecture:
AlmaLinux: {8: [x86_64, arm64]}
CentOS:
7: [x86_64]
Include:
MaxSizeOnly: false
InstanceFamilies:
#- 'c5' # Mixed depending on size
#- 'c5a' # AMD EPYC 7R32 3.3 GHz
#- 'c5ad' # AMD EPYC 7R32 3.3 GHz
- 'c6a' # AMD EPYC 7R13 Processor 3.6 GHz
- 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz
- 'c6g' # AWS Graviton2 Processor 2.5 GHz
#- 'c6gd' # AWS Graviton2 Processor 2.5 GHz
#- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz
#- 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
#- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
#- 'm5a' # AMD EPYC 7571 2.5 GHz
#- 'm5ad' # AMD EPYC 7571 2.5 GHz
- 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz
- 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz
- 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz
- 'm6g' # AWS Graviton2 Processor 2.5 GHz
#- 'm6gd' # AWS Graviton2 Processor 2.5 GHz
#- 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
#- 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
#- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz
#- 'r5a' # AMD EPYC 7571 2.5 GHz
#- 'r5ad' # AMD EPYC 7571 2.5 GHz
- 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB
- 'r6g' # AWS Graviton2 Processor 2.5 GHz
#- 'r6gd' # AWS Graviton2 Processor 2.5 GHz
#- 'x1' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB
#- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB
- 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB
- 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB
- 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB
- 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB
- 'z1d' # Intel Xeon Platinum 8151 4.0 GHz
#- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB
#- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB
#- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB
InstanceTypes: []
#- 'c5\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz
#- 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz
#- 'c5d\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz
#- 'c5d\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz
Exclude:
InstanceFamilies: []
InstanceTypes:
- '.*\.metal'

# Use defaults from schema
storage: {'zfs': {}}
Loading

0 comments on commit 8b3b89c

Please sign in to comment.