From 13601d30f4c5358f03fcf9f0d329dcfff8681e58 Mon Sep 17 00:00:00 2001
From: Allan Carter
Date: Sat, 23 Apr 2022 02:41:38 +0000
Subject: [PATCH 1/2] Add multi-AZ and multi-region support

Add Regions and AZs to the InstanceConfig

The code has been updated to support multiple regions. The instance types that
are available and their pricing vary by region, so all instance type info must
be maintained by region. Spot pricing additionally varies by instance type and
by AZ, and this commit adds an updated EC2InstanceTypeInfoPkg package that
looks up the spot pricing for each instance type in each AZ and region.

The Region/AZ configuration is added to the InstanceConfig section of the
config file. The region requires the VpcId, CIDR, and SshKeyPair. The AZ
requires the subnet ID and priority.

The slurm node configuration has been updated to add the AZ id to all compute
nodes and add the AZ name to all partitions. Users can specify multiple
partitions with sbatch if they want jobs to be spread across multiple AZs.

The modulefile has been updated to set the partition to the list of all
regional/AZ partitions so that all nodes are available to jobs in the priority
order configured in the config file.

Create compute node security groups for other regions using a custom resource.
Save regional security group ids in SSM Parameter Store.

Update multi-region route53 hosted zone

Fix IAM permissions to handle multiple regions

Decode IAM permissions messages

Update security groups with remote region cidrs

Create slurmfs ARecord for use in other regions. This required adding a lambda
to do DNS lookups.

Add custom resource to add regional VPCs to the Route53 hosted zone.
This is required for now because of a CDK bug:
https://github.com/aws/aws-cdk/issues/20496
The PR for the above bug is:
https://github.com/aws/aws-cdk/pull/20530

Update github-pages to use mkdocs

Add github-docs target to Makefile

Update to cdk@2.28.1

Create AZ and interactive partitions, set default partitions

Resolves [FEATURE #22: Support multiple availability zones and regions](https://github.com/aws-samples/aws-eda-slurm-cluster/issues/2)
---
 .gitignore | 3 +
 Makefile | 21 +-
 README.md | 22 +-
 _config.yml | 4 -
 docs/_config.yml | 1 -
 docs/deploy.md | 40 +-
 docs/federation.md | 6 +-
 docs/mkdocs.md | 17 -
 docs/multi-region.md | 291 ++
 docs/source/cdk/config_schema.py | 1 -
 docs/source/config/default_config.yml | 1 -
 docs/source/config/slurm_eda_az1.yml | 1 -
 docs/source/config/slurm_eda_az2.yml | 1 -
 docs/source/config/slurm_eda_az3.yml | 1 -
 docs/todo.md | 16 -
 install.sh | 33 +-
 mkdocs.yml | 15 +-
 source/app.py | 4 +
 source/cdk/cdk_slurm_stack.py | 494 ++++++---
 source/cdk/config_schema.py | 26 +-
 source/requirements.txt | 2 +-
 source/resources/config/slurm_multi_az.yml | 257 +++++
 .../CreateComputeNodeSG.py | 258 +++++
 .../CreateComputeNodeSG/cfnresponse.py | 1 +
 .../DeconfigureCluster/DeconfigureCluster.py | 1 +
 .../resources/lambdas/DnsLookup/DnsLookup.py | 61 ++
 .../lambdas/DnsLookup/cfnresponse.py | 1 +
 .../GetOntapSvmDNSName/GetOntapSvmDNSName.py | 1 +
 .../Route53HostedZoneAddVpc.py | 75 ++
 .../Route53HostedZoneAddVpc/cfnresponse.py | 1 +
 .../resources/lambdas/UpdateDns/UpdateDns.py | 1 +
 .../slurm/cluster/bin/EC2InstanceTypeInfo.py | 110 --
 .../EC2InstanceTypeInfo.py | 343 +++++++
 .../bin/EC2InstanceTypeInfoPkg/__init__.py | 0
 .../get_ec2_instance_info.py | 24 +
 .../retry_boto3_throttling.py | 85 ++
 .../slurm/cluster/bin/SlurmNodeUserData.sh | 4 +-
 .../opt/slurm/cluster/bin/SlurmPlugin.py | 956 ++++++++++--------
.../SlurmCtl/tasks/slurm_configuration.yml | 2 +- .../roles/SlurmCtl/tasks/slurm_scripts.yml | 54 +- .../slurm/cluster/config/slurm_config.json | 6 +- .../opt/slurm/cluster/config/slurm_config.sh | 3 - .../modules/modulefiles/slurm/.template | 18 +- .../opt/slurm/cluster/test/job_stress.sh | 2 +- .../roles/SlurmNodeAmi/tasks/main.yml | 1 - .../roles/mount_slurm_fs/tasks/main.yml | 1 + .../roles/unmount_slurm_fs/tasks/main.yml | 1 + source/resources/user_data/WaitForAmi.py | 51 +- .../user_data/slurm_node_ami_config.sh | 2 +- source/slurm_installer/installer.py | 8 + tests/test_slurm_minimal.py | 19 + 51 files changed, 2560 insertions(+), 787 deletions(-) delete mode 100644 _config.yml delete mode 100644 docs/_config.yml delete mode 100644 docs/mkdocs.md create mode 100644 docs/multi-region.md delete mode 120000 docs/source/cdk/config_schema.py delete mode 120000 docs/source/config/default_config.yml delete mode 120000 docs/source/config/slurm_eda_az1.yml delete mode 120000 docs/source/config/slurm_eda_az2.yml delete mode 120000 docs/source/config/slurm_eda_az3.yml create mode 100644 source/resources/config/slurm_multi_az.yml create mode 100644 source/resources/lambdas/CreateComputeNodeSG/CreateComputeNodeSG.py create mode 120000 source/resources/lambdas/CreateComputeNodeSG/cfnresponse.py create mode 100644 source/resources/lambdas/DnsLookup/DnsLookup.py create mode 120000 source/resources/lambdas/DnsLookup/cfnresponse.py create mode 100644 source/resources/lambdas/Route53HostedZoneAddVpc/Route53HostedZoneAddVpc.py create mode 120000 source/resources/lambdas/Route53HostedZoneAddVpc/cfnresponse.py delete mode 100755 source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfo.py create mode 100755 source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py create mode 100755 source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/__init__.py create mode 100755 source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/get_ec2_instance_info.py create mode 100755 source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/retry_boto3_throttling.py create mode 100644 tests/test_slurm_minimal.py diff --git a/.gitignore b/.gitignore index cd444850..deb1375f 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,7 @@ site/ # Jekyll Gemfile.lock .jekyll-cache +.mkdocs_venv/ _site +site/ +.vscode/ diff --git a/Makefile b/Makefile index 88868faa..6efc056b 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,24 @@ +.PHONY: help local-docs test clean + help: - @echo "Usage: make [ help | clean ]" + @echo "Usage: make [ help | local-docs | github-docs | clean ]" + +.mkdocs_venv/bin/activate: + rm -rf .mkdocs_venv + python3 -m venv .mkdocs_venv + source .mkdocs_venv/bin/activate; pip install mkdocs + +local-docs: .mkdocs_venv/bin/activate + source .mkdocs_venv/bin/activate; mkdocs serve& + firefox http://127.0.0.1:8000/ + +github-docs: .mkdocs_venv/bin/activate + source .mkdocs_venv/bin/activate; mkdocs gh-deploy --strict test: pytest -x -v tests -jekyll: - gem install jekyll bundler - bundler install - bundle exec jekyll serve - clean: git clean -d -f -x # -d: Recurse into directories diff --git a/README.md b/README.md index 249e9210..6bf2e11c 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,6 @@ # AWS EDA Slurm Cluster -[View on GitHub Pages](https://aws-samples.github.io/aws-eda-slurm-cluster/) - -This repository contains 
an AWS Cloud Development Kit (CDK) application that creates a SLURM cluster that is suitable for running production EDA workloads on AWS. +This repository contains an AWS Cloud Development Kit (CDK) application that creates a Slurm cluster that is suitable for running production EDA workloads on AWS. Key features are: * Automatic scaling of AWS EC2 instances based on demand @@ -11,7 +9,7 @@ Key features are: * Batch and interactive partitions (queues) * Managed tool licenses as a consumable resource * User and group fair share scheduling -* SLURM accounting database +* Slurm accounting database * CloudWatch dashboard * Job preemption * Multi-cluster federation @@ -21,7 +19,7 @@ Key features are: ## Operating System and Processor Architecture Support -This SLURM cluster supports the following OSes: +This Slurm cluster supports the following OSes: * Alma Linux 8 * Amazon Linux 2 @@ -32,7 +30,7 @@ This SLURM cluster supports the following OSes: RedHat stopped supporting CentOS 8, so for a similar RedHat 8 binary compatible distribution we support Alma Linux and Rocky Linux as replacements for CentOS. -This SLURM cluster supports both Intel/AMD (x86_64) based instances and ARM Graviton2 (arm64/aarch64) based instances. +This Slurm cluster supports both Intel/AMD (x86_64) based instances and ARM Graviton2 (arm64/aarch64) based instances. [Graviton 2 instances require](https://github.com/aws/aws-graviton-getting-started/blob/main/os.md) Amazon Linux 2, RedHat 8, AlmaLinux 8, or RockyLinux 8 operating systems. RedHat 7 and CentOS 7 do not support Graviton 2. @@ -52,7 +50,9 @@ This provides the following different combinations of OS and processor architect ## Documentation -To view the docs, clone the repository and run mkdocs: +[View on GitHub Pages](https://aws-samples.github.io/aws-eda-slurm-cluster/) + +To view the docs locally, clone the repository and run mkdocs: The docs are in the docs directory. You can view them in an editor or using the mkdocs tool. @@ -74,10 +74,16 @@ firefox http://127.0.0.1:8000/ & Open a browser to: http://127.0.0.1:8000/ +Or you can simply let make do this for you. + +``` +make local-docs +``` + ## Security See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. ## License -This library is licensed under the MIT-0 License. See the LICENSE file. +This library is licensed under the MIT-0 License. See the [LICENSE](https://github.com/aws-samples/aws-eda-slurm-cluster/blob/main/LICENSE) file. diff --git a/_config.yml b/_config.yml deleted file mode 100644 index c1352260..00000000 --- a/_config.yml +++ /dev/null @@ -1,4 +0,0 @@ -theme: jekyll-theme-slate -exclude: - - source/cdk.out - - source/resources/playbooks diff --git a/docs/_config.yml b/docs/_config.yml deleted file mode 100644 index c7418817..00000000 --- a/docs/_config.yml +++ /dev/null @@ -1 +0,0 @@ -theme: jekyll-theme-slate \ No newline at end of file diff --git a/docs/deploy.md b/docs/deploy.md index 2c1ebeb7..c783885a 100644 --- a/docs/deploy.md +++ b/docs/deploy.md @@ -75,17 +75,15 @@ Add the nodjs bin directory to your path. Note that the version of aws-cdk changes frequently. The version that has been tested is in the CDK_VERSION variable in the install script. -``` The install script will try to install the prerequisites if they aren't already installed. -``` ## Configuration File The first step in deploying your cluster is to create a configuration file. 
-A default configuration file is found in [source/resources/config/default_config.yml](source/config/default_config.yml). +A default configuration file is found in [source/resources/config/default_config.yml](https://github.com/aws-samples/aws-eda-slurm-cluster/blob/main/source/resources/config/default_config.yml). You should create a new config file and update the parameters for your cluster. -The schema for the config file along with its default values can be found in [source/cdk/config_schema.py](source/cdk/config_schema.py). +The schema for the config file along with its default values can be found in [source/cdk/config_schema.py](https://github.com/aws-samples/aws-eda-slurm-cluster/blob/main/source/cdk/config_schema.py). The schema is defined in python, but the actual config file should be in yaml format. The following are key parameters that you will need to update. @@ -115,7 +113,7 @@ The defaults for the following parameters are generally acceptable, but may be m ## Configure the Compute Instances The InstanceConfig configuration parameter configures the base operating systems, CPU architectures, instance families, -and instance types that the SLURM cluster should support. +and instance types that the Slurm cluster should support. The supported OSes and CPU architectures are: | Base OS | CPU Architectures @@ -204,7 +202,7 @@ If you want to use the latest base OS AMIs, then configure your AWS cli credenti the tested version. ``` -source/create-ami-map.py > source/resources/config/ami_map.yml +./source/create-ami-map.py > source/resources/config/ami_map.yml ``` ## Use Your Own AMIs (Optional) @@ -240,13 +238,13 @@ This is useful if the root volume needs additional space to install additional p ## Configure Fair Share Scheduling (Optional) -SLURM supports [fair share scheduling](https://slurm.schedmd.com/fair_tree.html), but it requires the fair share policy to be configured. +Slurm supports [fair share scheduling](https://slurm.schedmd.com/fair_tree.html), but it requires the fair share policy to be configured. By default, all users will be put into a default group that has a low fair share. -The configuration file is at **source/resources/playbooks/roles/SlurmCtl/templates/tools/slurm/etc/accounts.yml.example** +The configuration file is at [source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/etc/accounts.yml.example](https://github.com/aws-samples/aws-eda-slurm-cluster/blob/main/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/etc/accounts.yml.example) in the repository and is deployed to **/opt/slurm/{{ClusterName}}/conf/accounts.yml**. The file is a simple yaml file that allows you to configure groups, the users that belong to the group, and a fair share weight for the group. -Refer to the SLURM documentation for details on how the fair share weight is calculated. +Refer to the Slurm documentation for details on how the fair share weight is calculated. The scheduler can be configured so that users who aren't getting their fair share of resources get higher priority. The following shows 3 top level groups. @@ -322,13 +320,13 @@ These weights can be adjusted based on your needs to control job priorities. ## Configure Licenses -SLURM supports [configuring licenses as a consumable resource](https://slurm.schedmd.com/licenses.html). +Slurm supports [configuring licenses as a consumable resource](https://slurm.schedmd.com/licenses.html). 
It will keep track of how many running jobs are using a license and when no more licenses are available then jobs will stay pending in the queue until a job completes and frees up a license. Combined with the fairshare algorithm, this can prevent users from monopolizing licenses and preventing others from being able to run their jobs. -The configuration file is at **source/resources/playbooks/roles/SlurmCtl/templates/tools/slurm/etc/accounts.yml.example** +The configuration file is at [source/resources/playbooks/roles/SlurmCtl/templates/tools/slurm/etc/slurm_licenses.conf.example](https://github.com/aws-samples/aws-eda-slurm-cluster/blob/main/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/etc/slurm_licenses.conf.example) in the repository and is deployed to **/opt/slurm/{{ClusterName}}/conf/accounts.yml**. The example configuration shows how the number of licenses can be configured as just a comma separated list. @@ -351,11 +349,11 @@ with command line arguments, however it is better to specify all of the paramete ## Use the Cluster Configuring your environment for users requires root privileges. -The configuration commands are found in the outputs of the SLURM cloudformation stack. +The configuration commands are found in the outputs of the Slurm cloudformation stack. -### Configure SLURM Users and Groups +### Configure Slurm Users and Groups -The SLURM cluster needs to configure the users and groups of your environment. +The Slurm cluster needs to configure the users and groups of your environment. For efficiency, it does this by capturing the users and groups from your environment and saves them in a json file. When the compute nodes start they create local unix users and groups using this json file. @@ -364,18 +362,18 @@ Choose a single instance in your VPC that will always be running and that is joi so that it can list all users and groups. For SOCA this would be the Scheduler instance. Connect to that instance and run the commands in the **MountCommand** and **ConfigureSyncSlurmUsersGroups** outputs -of the SLURM stack. -These commands will mount the SLURM file system at **/opt/slurm/{{ClusterName}}** and then create +of the Slurm stack. +These commands will mount the Slurm file system at **/opt/slurm/{{ClusterName}}** and then create a cron job that runs every 5 minutes and updates **/opt/slurm/{{ClusterName}}/config/users_groups.json**. -### Configure SLURM Submitter Instances +### Configure Slurm Submitter Instances -Instances that need to submit to SLURM need to have their security group IDs in the **SubmitterSecurityGroupIds** configuration parameter -so that the security groups allow communication between the submitter instances and the SLURM cluster. -They also need to be configured by mounting the file system with the SLURM tools and +Instances that need to submit to Slurm need to have their security group IDs in the **SubmitterSecurityGroupIds** configuration parameter +so that the security groups allow communication between the submitter instances and the Slurm cluster. +They also need to be configured by mounting the file system with the Slurm tools and configuring their environment. Connect to the submitter instance and run the commands in the **MountCommand** and **ConfigureSubmitterCommand** outputs -of the SLURM stack. +of the Slurm stack. If all users need to use the cluster then it is probably best to create a custom AMI that is configured with the configuration commands. 
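To make the comma-separated license format described in the licenses section above concrete, here is a minimal sketch. The tool names, the counts, and the assumption that the file boils down to a single `Licenses=` line are illustrative only and are not taken from the repository's example file:

```
# Illustrative only: hypothetical EDA tool licenses tracked as consumable resources
Licenses=vcs:800,xcelium:200
```

A job then requests licenses at submission time, for example `sbatch -L vcs:2 run_sim.sh`, and Slurm keeps the job pending until two vcs licenses are free.
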
diff --git a/docs/federation.md b/docs/federation.md
index 59f9409e..9e72a40a 100644
--- a/docs/federation.md
+++ b/docs/federation.md
@@ -5,9 +5,9 @@ If you need to run jobs in more than one AZ then you can use the [federation fea
 
 The config directory has example configuration files that demonstrate how to deploy a federated cluster into 3 AZs.
 
-* [source/config/slurm_eda_az1.yml](source/config/slurm_eda_az1.yml)
-* [source/config/slurm_eda_az2.yml](source/config/slurm_eda_az2.yml)
-* [source/config/slurm_eda_az3.yml](source/config/slurm_eda_az3.yml)
+* [source/config/slurm_eda_az1.yml](https://github.com/aws-samples/aws-eda-slurm-cluster/blob/main/source/config/slurm_eda_az1.yml)
+* [source/config/slurm_eda_az2.yml](https://github.com/aws-samples/aws-eda-slurm-cluster/blob/main/source/config/slurm_eda_az2.yml)
+* [source/config/slurm_eda_az3.yml](https://github.com/aws-samples/aws-eda-slurm-cluster/blob/main/source/config/slurm_eda_az3.yml)
 
 These clusters should be deployed sequentially.
 The first cluster creates a cluster and a slurmdbd instance.
diff --git a/docs/mkdocs.md b/docs/mkdocs.md
deleted file mode 100644
index ebb7ab4b..00000000
--- a/docs/mkdocs.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# mkdocs
-
-For full documentation visit [mkdocs.org](https://www.mkdocs.org).
-
-## Commands
-
-* `mkdocs new [dir-name]` - Create a new project.
-* `mkdocs serve` - Start the live-reloading docs server.
-* `mkdocs build` - Build the documentation site.
-* `mkdocs -h` - Print help message and exit.
-
-## Project layout
-
-    mkdocs.yml    # The configuration file.
-    docs/
-        index.md  # The documentation homepage.
-        ...       # Other markdown pages, images and other files.
diff --git a/docs/multi-region.md b/docs/multi-region.md
new file mode 100644
index 00000000..1f26ad5b
--- /dev/null
+++ b/docs/multi-region.md
@@ -0,0 +1,291 @@
+# Multi-AZ and Multi-Region Support
+
+By default, the EDA Slurm Cluster deploys all resources in a single availability zone (AZ).
+This is done for performance and cost reasons, to minimize network latency and cross-AZ network charges.
+Very large clusters may hit capacity constraints in a single AZ and can benefit from being able to launch compute nodes in multiple AZs and even multiple
+regions to get the required capacity.
+For this reason, the cluster can support compute nodes in multiple AZs and regions.
+
+All compute nodes are managed by one Slurm controller and the compute nodes encode the region and AZ in their node names.
+When a job gets scheduled on a compute node, the Slurm plugin runs an instance in the region and AZ encoded in the node name.
+Compute nodes in each AZ are grouped in partitions that can be given priorities.
+This allows a job submission to specify multiple partitions, and the scheduler will choose available compute nodes from the highest priority partition.
+
+**NOTE**: This is an advanced topic with significant configuration complexity, so it is recommended that you get guidance
+from an AWS specialist to help you set up your configuration.
+This page documents a simple setup which is unlikely to meet your file system performance goals without modification.
+
+## Requirements
+
+There are three primary requirements for multi-AZ/region support: 1) networking, 2) file systems, and 3) DNS.
+
+### Networking
+
+The main networking requirement is that each region must have a VPC with a subnet in each AZ.
+The CIDR ranges of the VPCs must be non-overlapping.
+The VPCs must be connected using VPC Peering Connections or Transit Gateways, and the routes and ACLs must be configured to allow communication between all of the VPCs.
+The compute nodes use all of the ephemeral ports, so those ports must be routed between the VPCs.
+
+### File Systems
+
+The compute nodes must have the same logical view of the file systems.
+All paths used by the compute nodes must be available in each AZ and region.
+One way to accomplish this is to simply mount the exact same file systems on all compute nodes.
+This has the advantage of simplicity, however, it will incur performance penalties because of increased network latency
+and cross-AZ and cross-Region network charges.
+
+The Slurm file system is not performance critical and can be cross-mounted.
+
+Performance-critical file systems can be replicated across AZs and regions and automatically synchronized using FSx for NetApp ONTAP (FSxN) and FlexCache or SnapMirror.
+FlexCache is particularly efficient because it is a sparse cache that only synchronizes data when it is accessed.
+This means that not all of the data has to be replicated to every location.
+If you replicate file systems, then your machine images will need to be configured to mount the closest file system.
+This could be done at boot time by using SSM parameters or by using location-specific automount maps.
+Currently Route53 doesn't support a policy that allows you to choose AZ-dependent domain resolution.
+This is an advanced topic and we recommend that you consult with an AWS storage specialist to help you architect a storage solution
+that will meet your performance needs.
+
+### DNS
+
+The cluster creates a Route53 private hosted zone, or can use an existing one, to get the IP addresses for the Slurm controllers and slurmdbd instances.
+It uses the AWS provided DNS in the VPC to get the IP addresses of AWS managed file systems.
+All of the VPCs need access to all of the DNS entries used by the Slurm instances.
+
+## Configuration
+
+This example is going to demonstrate how to configure a cluster that spans 3 AZs in 3 regions for a total of 9 AZs.
+It is going to use a very simple file system topology with all of the file systems located in the primary AZ.
+
+### Create VPCs
+
+In this example I deployed 3 Scale Out Computing on AWS (SOCA) clusters in eu-west-1, us-east-1, and us-west-2 with non-overlapping CIDRs.
+This created 3 VPCs, each with 3 private subnets.
+
+| Region | SOCA Cluster | CIDR |
+|-----------|--------------|------|
+| eu-west-1 | oktank-dub | 10.1.0.0/16 |
+| us-east-1 | oktank-iad | 10.2.0.0/16 |
+| us-west-2 | oktank-pdx | 10.3.0.0/16 |
+
+I am going to create a multi-region Slurm cluster in eu-west-1 that can run compute nodes in all 3 regions, with the priority of the regions being:
+
+1. eu-west-1 (dub)
+1. us-east-1 (iad)
+1. us-west-2 (pdx)
+
+If you have a globally distributed team, you could modify the instructions to use regions close to your teams and deploy a cluster in each team's
+local region that can run jobs in all of the regions.
+
+### Connect the VPCs using VPC Peering Connections
+
+1. Go to the VPC console in eu-west-1 and select **Peering connections** on the left.
+1. Click on **Create peering connection** on the upper right
+1. Name the connection dub-to-iad
+1. For the local VPC select the SOCA VPC
+1. For the other VPC's region select **Another Region**
+1. Select us-east-1
+1. Open the us-east-1 VPC console in another tab and copy the SOCA VPC ID
+1. Go back to the eu-west-1 VPC console and paste the VPC ID into the **VPC ID (Accepter)** field
+1. Click **Create peering connection** to create the connection.
+1. Go back to the us-east-1 console and select **Peering connections** on the left.
+1. Select the connection you just created. It should be in **Pending acceptance** state.
+1. Select **Actions** and **Accept request**
+
+Repeat the same steps to create VPC peering connections between eu-west-1 (dub) and us-west-2 (pdx) and between
+us-east-1 (iad) and us-west-2 (pdx).
+When you are finished, all of the VPCs will have a connection to the others.
+
+The next step is to set up routing table entries to route the traffic between the VPCs over the peering connections.
+Do the following steps in each region.
+
+1. Open the VPC console
+1. Select **Route tables** on the left
+1. Select the route table for each of the 3 private subnets
+1. Click **Edit routes**
+1. Click **Add route**
+1. Enter the CIDR range for another VPC in the destination and the peering connection to that VPC (start typing pcx- and then select from the list)
+1. Click **Save changes**
+
+When this is done, packets from each VPC to any other will be routed across the appropriate VPC peering connection.
+
+## DNS: Route53 Private Hosted Zone
+
+The oktank-dub SOCA cluster has two EFS file systems mounted at /apps and /data that contain the home and tools directories for the users' desktops.
+We are enabling the SOCA users to submit jobs to Slurm, so those volumes will need to be available on all compute nodes.
+However, the EFS DNS name will only be able to be resolved by the AWS provided DNS server in the oktank-dub VPC.
+We could just use the IP address, but it is more maintainable to create a Route53 private hosted zone that is shared
+by all of the clusters so that we can refer to the EFS file systems with a friendly DNS name.
+
+**Note** that Route53 is a global, not regional, service.
+
+1. Open the Route53 console
+1. Select Hosted zones on the left
+1. Click **Create hosted zone** in the upper right
+1. Enter a domain name like slurmdub.local
+1. For Type select **Private hosted zone**
+1. Associate the oktank-dub VPC with the hosted zone.
+    1. For Region select eu-west-1
+    1. Click the VPC ID and select the SOCA VPC
+1. Associate the oktank-iad VPC with the hosted zone.
+    1. Click **Add VPC**
+    1. For Region select us-east-1
+    1. Click the VPC ID and select the SOCA VPC
+1. Associate the oktank-pdx VPC with the hosted zone.
+    1. Click **Add VPC**
+    1. For Region select us-west-2
+    1. Click the VPC ID and select the SOCA VPC
+1. Click **Create hosted zone**
+1. Expand **Hosted zone details** and save the **Hosted zone ID**, which will be used in the config file.
+
+Create DNS entries for the SOCA EFS file systems.
+
+1. Open the EFS console in eu-west-1
+1. Select the Apps file system
+1. Select the Network tab and note the IP addresses for all 3 availability zones.
+1. Repeat to get the IP addresses for the Data file system.
+1. Open the Route53 console and select the slurmdub.local hosted zone.
+1. Click **Create record**
+1. Name it fs-apps
+1. For record type select **A**
+1. For **Value** put the 3 IP addresses for the EFS file system on separate lines
+1. For **Routing policy** select **latency**
+1. For **Region** select eu-west-1
+1. For **Record ID** enter **fs-apps-dub**
+1. 
Create idential A records for fs-apps in the us-east-1 and us-west-2 regions with **Record ID** of fs-apps-iad and fs-apps-pdx. +1. Repeat for the Data file system and create fs-data A records in all 3 regions. + +## File System Access + +Make sure that file system security groups allow access from all slurm VPCs. + +You may need to allow inbound access from 10.2.0.0/16 and 10.3.0.0/16. + +## Slurm Configuration + +The following configuration file configures all three regions. +Note that there are values for key pairs, VPC IDs, subnet IDs, etc. that you will have to update with the information from +your SOCA clusters. + +Regional resources that must be provided: +* VPC IDs +* VPC CIDRs +* Subnet IDs +* EC2 Keypairs +* Security Group IDs + +Regional resources that will be created for you: +* ComputeNodeSecurityGroup + +Global resources that will be created for you: +* IAM instance roles + +slurm_dub.yml: + +``` +--- +# Multi-region Slurm cluster with Netapp Ontap +# +# Origin of the cluster is in eu-west-1 and extends into us-east-1 and us-west-2 + +StackName: slurmdub + +Region: eu-west-1 + +SshKeyPair: admin-eu-west-1 # Or whatever Key Pair you've created + +VpcId: vpc-xxxxxxxxxxxxxxxxx # oktank-dub + +SubnetId: subnet-xxxxxxxxxxxxxxxxx # oktank-dub, PrivateSubnet1 + +HostedZoneId: XXXXXXXXXXXXXXXXXXX # The hosted zone ID for the hosted zone you created above. + +ErrorSnsTopicArn: arn:aws:sns:eu-west-1:${AccountId}:SlurmError # ARN of your SNS topic. + +TimeZone: 'US/Central' # Or whatever you prefer + +slurm: + MungeKeySsmParameter: "/slurm/munge_key" + + SlurmCtl: + NumberOfControllers: 2 + + SlurmDbd: {} + + # External security groups that should be able to use the cluster + SubmitterSecurityGroupIds: + soca-oktank-dub-ComputeNodeSG: sg-xxxxxxxxxxxxxxxxx + + SubmitterInstanceTags: + 'soca:ClusterId': ['soca-oktank-dub'] + + # InstanceConfig: + # Configure the instances used by the cluster + # A partition will be created for each combination of Base OS, Architecture, and Spot + InstanceConfig: + UseSpot: true + NodesPerInstanceType: 10 + BaseOsArchitecture: + AlmaLinux: {8: [x86_64, arm64]} + CentOS: + 7: [x86_64] + Include: + MaxSizeOnly: false + InstanceFamilies: + - t3 + - t4g + InstanceTypes: [] + Exclude: + InstanceFamilies: [] + InstanceTypes: + - '.+\.(micro|nano)' # Not enough memory + - '.*\.metal' + Regions: + eu-west-1: + VpcId: vpc-xxxxxxxxxxxxxxxxx # oktank-dub + CIDR: 10.1.0.0/16 + SshKeyPair: admin-eu-west-1 + AZs: + - Priority: 10 + Subnet: subnet-xxxxxxxxxxxxxxxxx # oktank-dub - PrivateSubnet1 + - Priority: 9 + Subnet: subnet-xxxxxxxxxxxxxxxxx # oktank-dub - PrivateSubnet2 + - Priority: 8 + Subnet: subnet-xxxxxxxxxxxxxxxxx # oktank-dub - PrivateSubnet3 + us-east-1: + VpcId: vpc-xxxxxxxxxxxxxxxxx + CIDR: 10.2.0.0/16 + SshKeyPair: admin-us-east-1 + AZs: + - Priority: 7 + Subnet: subnet-xxxxxxxxxxxxxxxxx # oktank-iad - PrivateSubnet1 + - Priority: 6 + Subnet: subnet-xxxxxxxxxxxxxxxxx # oktank-iad - PrivateSubnet2 + - Priority: 5 + Subnet: subnet-xxxxxxxxxxxxxxxxx # oktank-iad - PrivateSubnet3 + us-west-2: + VpcId: vpc-xxxxxxxxxxxxxxxxx + CIDR: 10.3.0.0/16 + SshKeyPair: admin-us-west-2 + AZs: + - Priority: 4 + Subnet: subnet-xxxxxxxxxxxxxxxxx # oktank-pdx - PrivateSubnet1 + - Priority: 3 + Subnet: subnet-xxxxxxxxxxxxxxxxx # oktank-pdx - PrivateSubnet2 + - Priority: 2 + Subnet: subnet-xxxxxxxxxxxxxxxxx # oktank-pdx - PrivateSubnet3 + + storage: + provider: ontap + removal_policy: DESTROY + ontap: {} + + ExtraMounts: + - dest: /apps + src: fs-apps.slurmdub.local:/ + type: 
nfs4 + options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport + - dest: /data + src: fs-data.slurmdub.local:/ + type: nfs4 + options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport +``` diff --git a/docs/source/cdk/config_schema.py b/docs/source/cdk/config_schema.py deleted file mode 120000 index 5582d3ae..00000000 --- a/docs/source/cdk/config_schema.py +++ /dev/null @@ -1 +0,0 @@ -../../../source/cdk/config_schema.py \ No newline at end of file diff --git a/docs/source/config/default_config.yml b/docs/source/config/default_config.yml deleted file mode 120000 index f95de6a8..00000000 --- a/docs/source/config/default_config.yml +++ /dev/null @@ -1 +0,0 @@ -../../../source/resources/config/default_config.yml \ No newline at end of file diff --git a/docs/source/config/slurm_eda_az1.yml b/docs/source/config/slurm_eda_az1.yml deleted file mode 120000 index 103f4834..00000000 --- a/docs/source/config/slurm_eda_az1.yml +++ /dev/null @@ -1 +0,0 @@ -../../../source/resources/config/slurm_eda_az1.yml \ No newline at end of file diff --git a/docs/source/config/slurm_eda_az2.yml b/docs/source/config/slurm_eda_az2.yml deleted file mode 120000 index 0ffeb455..00000000 --- a/docs/source/config/slurm_eda_az2.yml +++ /dev/null @@ -1 +0,0 @@ -../../../source/resources/config/slurm_eda_az2.yml \ No newline at end of file diff --git a/docs/source/config/slurm_eda_az3.yml b/docs/source/config/slurm_eda_az3.yml deleted file mode 120000 index 43c813a9..00000000 --- a/docs/source/config/slurm_eda_az3.yml +++ /dev/null @@ -1 +0,0 @@ -../../../source/resources/config/slurm_eda_az3.yml \ No newline at end of file diff --git a/docs/todo.md b/docs/todo.md index ad6e2464..4db8a65c 100644 --- a/docs/todo.md +++ b/docs/todo.md @@ -2,22 +2,8 @@ List of tasks to be completed. -* Create a configuration to emulate an on-prem cluster - * Use it to test burting from a static on-prem compute cluster to an AWS federation an multi-AZ/region. - * Configure always on instances for RIs or savings plans. -* Support multi-AZ/region for a single cluster. - * Instead of federating clusters, add support for compute nodes in multiple availability zones and regions - * Assumes that networking is configured between VPCs if multiple regions are used - * Assumes that the storage architecture supports a consistent file system view for all compute nodes. - This could be shared file systems that are mounted across the AZs and regions or AZ/region specific storage with some kind of data synchronization strategy. - -* Support multiple clusters (federation) - * Implementation is complete and is mostly working but has issues. - * I see jobs running in other clusters when there are still resources in the primary cluster. - * Job stuck in pending state with FedJobLock. Can't cancel or requeue, lower priority jobs stuck behind it in PENDING state. This completely hung cluster 1 and 3. - * Configure Preemption * https://slurm.schedmd.com/preempt.html * Configure preemption @@ -33,8 +19,6 @@ List of tasks to be completed. * I think that the ordering can be done by making the CustomResource dependent on the file systems. * This has been added but not tested. -* Put slurm logs on the file system so that they persist and can be accessed from other instances. - * Turn deletion_protection on for database. Have it turned off during testing to ease deletion of test stacks. * Configure remote licenses that are stored on slurmdbd. 
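The AZ partitions described in docs/multi-region.md above are used by listing several partitions at submission time; the scheduler then places the job on nodes from the highest-priority partition with available capacity. The sketch below is illustrative only: the partition names assume the `<az>_all` naming used when the stack builds its default partition list, and the actual names in a deployed cluster may differ.

```
# Prefer the eu-west-1 AZs and fall back to us-east-1 if they are out of capacity
sbatch -p eu-west-1a_all,eu-west-1b_all,us-east-1a_all run_sim.sh
```
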
diff --git a/install.sh b/install.sh index b07be668..626429cc 100755 --- a/install.sh +++ b/install.sh @@ -36,42 +36,44 @@ fi python_version=$(python3 --version 2>&1 | awk '{print $2}') python_major_version=$(echo $python_version | cut -d '.' -f 1) python_minor_version=$(echo $python_version | cut -d '.' -f 2) -if [[ $python_minor_version -lt 6 ]]; then - echo "error: CDK requires python 3.6 or later. You have $python_version. Update your python3 version." +if [[ $python_minor_version -lt 7 ]]; then + echo "error: CDK requires python 3.7 or later. You have $python_version. Update your python3 version." exit 1 fi +# Check nodejs version +required_nodejs_version=16.15.0 if ! node -v &> /dev/null; then echo -e "\nnode not found in your path." echo "Installing nodejs in your home dir. Hit ctrl-c to abort" pushd $HOME - wget https://nodejs.org/dist/v16.13.1/node-v16.13.1-linux-x64.tar.xz - tar -xf node-v16.13.1-linux-x64.tar.xz - rm node-v16.13.1-linux-x64.tar.xz + wget https://nodejs.org/dist/v${required_nodejs_version}/node-v${required_nodejs_version}-linux-x64.tar.xz + tar -xf node-v${required_nodejs_version}-linux-x64.tar.xz + rm node-v${required_nodejs_version}-linux-x64.tar.xz cat >> ~/.bashrc << EOF # Nodejs -export PATH=$HOME/node-v16.13.1-linux-x64/bin:\$PATH +export PATH=$HOME/node-v${required_nodejs_version}-linux-x64/bin:\$PATH EOF source ~/.bashrc popd fi -# Check node version -node_version=$(node -v 2>&1 | awk '{print $1}') -node_version=${node_version:1} -node_major_version=$(echo $node_version | cut -d '.' -f 1) -node_minor_version=$(echo $node_version | cut -d '.' -f 2) + +nodejs_version=$(node -v 2>&1 | awk '{print $1}') +nodejs_version=${nodejs_version:1} +node_major_version=$(echo $nodejs_version | cut -d '.' -f 1) +node_minor_version=$(echo $nodejs_version | cut -d '.' -f 2) if [[ $node_major_version -lt 14 ]]; then - echo "error: CDK requires node 14.15.0 or later. You have $node_version. Update your node version." + echo "error: CDK requires node 14.15.0 or later. You have $nodejs_version. Update your node version." exit 1 fi if [[ $node_major_version -eq 14 ]] && [[ $node_minor_version -lt 6 ]]; then - echo "error: CDK requires node 14.15.0 or later. You have $node_version. Update your node version." + echo "error: CDK requires node 14.15.0 or later. You have $nodejs_version. Update your node version." exit 1 fi # Create a local installation of cdk -CDK_VERSION=2.21.1 # If you change the CDK version here, make sure to also change it in source/requirements.txt +CDK_VERSION=2.28.1 # If you change the CDK version here, make sure to also change it in source/requirements.txt if ! cdk --version &> /dev/null; then echo "CDK not installed. Installing global version of cdk@$CDK_VERSION." if ! npm install -g aws-cdk@$CDK_VERSION; then @@ -81,8 +83,9 @@ fi version=$(cdk --version | awk '{print $1}') if [[ $version != $CDK_VERSION ]]; then echo "Updating the global version of aws-cdk from version $version to $CDK_VERSION" + npm uninstall -g aws-cdk if ! 
npm install -g aws-cdk@$CDK_VERSION; then - npm install -g aws-cdk@$CDK_VERSION + sudo npm install -g aws-cdk@$CDK_VERSION fi fi diff --git a/mkdocs.yml b/mkdocs.yml index 7870823b..72489610 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,4 +1,6 @@ site_name: EDA SLURM Cluster on AWS +repo_url: https://github.com/aws-samples/aws-eda-slurm-cluster +docs_dir: docs nav: - 'index.md' - 'deploy.md' @@ -7,7 +9,18 @@ nav: - 'soca_integration.md' - 'f1-ami.md' - 'federation.md' + - 'multi-region.md' - 'implementation.md' - 'debug.md' - 'todo.md' - - 'mkdocs.md' +strict: true +theme: + name: mkdocs + #name: readthedocs + hljs_languages: + - python + - yaml + navigation_depth: 4 + nav_style: dark + features: + - navigation.tabs diff --git a/source/app.py b/source/app.py index ba73f869..3878400c 100644 --- a/source/app.py +++ b/source/app.py @@ -23,6 +23,10 @@ app = App() +# TODO: Create a stack for each additional region to create resources needed to create instances in those regions. +# * Instance profile +# * Security group + cdk_env = Environment( account = app.node.try_get_context('account_id'), region = app.node.try_get_context('region') diff --git a/source/cdk/cdk_slurm_stack.py b/source/cdk/cdk_slurm_stack.py index 0aa6f0ba..d6141594 100644 --- a/source/cdk/cdk_slurm_stack.py +++ b/source/cdk/cdk_slurm_stack.py @@ -67,6 +67,7 @@ from yaml.scanner import ScannerError sys.path.append(f"{dirname(__file__)}/../resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin") +from EC2InstanceTypeInfoPkg.EC2InstanceTypeInfo import EC2InstanceTypeInfo from SlurmPlugin import SlurmPlugin pp = PrettyPrinter() @@ -102,22 +103,16 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: self.config.update(self.ami_map) # Get context variables to override the config - self.get_context() + self.override_config_with_context() self.check_config() - plugin = SlurmPlugin(slurm_config_file=None, slurm_version_file=None, region=self.config['Region']) - self.config['slurm']['InstanceTypes'] = plugin.get_instance_types_from_instance_config(self.config['slurm']['InstanceConfig']) - if len(self.config['slurm']['InstanceTypes']) == 0: - logger.error(f"No instance types found. Update slurm/InstanceConfig. Current value:\n{pp.pformat(self.config['slurm']['InstanceConfig'])}\n{self.config['slurm']['InstanceTypes']}") - exit(1) - logger.info(f"{len(self.config['slurm']['InstanceTypes'])} instance types configured:\n{pp.pformat(self.config['slurm']['InstanceTypes'])}") - # Assets must be created before setting instance_template_vars so the playbooks URL exists self.create_assets() - self.create_lambdas() + # Create VPC before lambdas so that lambdas can access the VPC. 
self.create_vpc() + self.create_lambdas() self.create_security_groups() if 'ElasticSearch' not in self.config['slurm']: self.create_elasticsearch() @@ -178,9 +173,10 @@ def get_config(self, context_var, default_path): else: exit("No parameters were specified.") - def get_context(self): - # Get context variables to override the config - + def override_config_with_context(self): + ''' + Override the config using context variables + ''' region = self.node.try_get_context('region') config_key = 'Region' if region: @@ -217,6 +213,18 @@ def get_context(self): logger.error("You must provide --vpc-id on the command line or {config_key} in the config file.") exit(1) + config_key = 'CIDR' + cidr = self.node.try_get_context(config_key) + if cidr: + if config_key not in self.config: + logger.info(f"{config_key:20} set from command line: {vpc_id}") + elif cidr != self.config[config_key]: + logger.info(f"{config_key:20} in config file overridden on command line from {self.config[config_key]} to {cidr}") + self.config[config_key] = cidr + if config_key not in self.config: + logger.error("You must provide --cidr on the command line or {config_key} in the config file.") + exit(1) + config_key = 'SubnetId' subnet_id = self.node.try_get_context(config_key) if subnet_id: @@ -246,8 +254,9 @@ def get_context(self): self.config['slurm'][config_key] = submitterSecurityGroupIds def check_config(self): - # Check config, set defaults, and sanity check the configuration - + ''' + Check config, set defaults, and sanity check the configuration + ''' if self.stack_name: if 'StackName' not in self.config: logger.info(f"config/StackName set from command line: {self.stack_name}") @@ -266,11 +275,9 @@ def check_config(self): logger.warning(f"ErrorSnsTopicArn not set. Provide error-sns-topic-arn on the command line or ErrorSnsTopicArn in the config file to get error notifications.") self.config['ErrorSnsTopicArn'] = '' - if 'Domain' not in self.config and 'HostedZoneId' not in self.config: + if 'Domain' not in self.config: self.config['Domain'] = f"{self.stack_name}.local" - if 'Domain' in self.config and 'HostedZoneId' in self.config: - logger.error(f"Cannot specify both Domain({self.config['Domain']}) and HostedZoneId{self.config['HostedZoneId']}") - sys.exist(1) + logger.info(f"Domain defaulted to {self.config['Domain']}") if 'ClusterName' not in self.config['slurm']: self.config['slurm']['ClusterName'] = self.stack_name @@ -374,6 +381,57 @@ def check_config(self): logger.error(f"Must specify existing ElasticSearch domain in slurm/JobCompLoc when slurm/JobCompType == jobcomp/elasticsearch and slurm/ElasticSearch is not set.") exit(1) + if not self.config['slurm']['InstanceConfig']['Regions']: + default_region = { + 'CIDR': self.config['CIDR'], + 'SshKeyPair': self.config['SshKeyPair'], + 'AZs': [ + { + 'Priority': 1, + 'Subnet': self.config['SubnetId'] + } + ] + } + self.config['slurm']['InstanceConfig']['Regions'][self.config['Region']] = default_region + + self.compute_regions = {} + self.remote_compute_regions = {} + self.compute_region_cidrs_dict = {} + local_region = self.config['Region'] + for compute_region, region_dict in self.config['slurm']['InstanceConfig']['Regions'].items(): + compute_region_cidr = region_dict['CIDR'] + if compute_region not in self.compute_regions: + self.compute_regions[compute_region] = compute_region_cidr + if compute_region != local_region: + self.remote_compute_regions[compute_region] = compute_region_cidr + if compute_region_cidr not in self.compute_region_cidrs_dict: + 
self.compute_region_cidrs_dict[compute_region] = compute_region_cidr + logger.info(f"{len(self.compute_regions.keys())} regions configured: {sorted(self.compute_regions.keys())}") + + eC2InstanceTypeInfo = EC2InstanceTypeInfo(self.compute_regions.keys(), json_filename='/tmp/instance_type_info.json', debug=False) + + plugin = SlurmPlugin(slurm_config_file=None, region=self.region) + plugin.instance_type_info = eC2InstanceTypeInfo.instance_type_info + plugin.create_instance_family_info() + self.az_info = plugin.get_az_info_from_instance_config(self.config['slurm']['InstanceConfig']) + logger.info(f"{len(self.az_info.keys())} AZs configured: {sorted(self.az_info.keys())}") + + az_partitions = [] + for az, az_info in self.az_info.items(): + az_partitions.append(f"{az}_all") + self.default_partition = ','.join(az_partitions) + + self.instance_types = plugin.get_instance_types_from_instance_config(self.config['slurm']['InstanceConfig'], self.compute_regions, eC2InstanceTypeInfo) + for compute_region in self.compute_regions: + region_instance_types = self.instance_types[compute_region] + if len(region_instance_types) == 0: + logger.error(f"No instance types found in region {compute_region}. Update slurm/InstanceConfig. Current value:\n{pp.pformat(self.config['slurm']['InstanceConfig'])}\n{region_instance_types}") + sys.exit(1) + logger.info(f"{len(region_instance_types)} instance types configured in {compute_region}:\n{pp.pformat(region_instance_types)}") + for instance_type in region_instance_types: + self.instance_types[instance_type] = 1 + self.instance_types = sorted(self.instance_types.keys()) + # Validate updated config against schema from config_schema import check_schema from schema import SchemaError @@ -413,20 +471,127 @@ def create_assets(self): self.on_prem_compute_nodes_config_file_asset = None self.onprem_cidr = None + def create_vpc(self): + self.vpc = ec2.Vpc.from_lookup(self, "Vpc", vpc_id = self.config['VpcId']) + + self.subnets = self.vpc.private_subnets + valid_subnet_ids = [] + if 'SubnetId' in self.config: + self.subnet = None + for subnet in self.subnets: + valid_subnet_ids.append(subnet.subnet_id) + if subnet.subnet_id == self.config['SubnetId']: + self.subnet = subnet + break + if not self.subnet: + # If this is a new VPC then the cdk.context.json will not have the VPC and will be refreshed after the bootstrap phase. Until then the subnet ids will be placeholders so just pick the first subnet. After the bootstrap finishes the vpc lookup will be done and then the info will be correct. + if valid_subnet_ids[0] == 'p-12345': + logger.warning(f"VPC {self.config['VpcId']} not in cdk.context.json and will be refresshed before synth.") + self.subnet = self.vpc.private_subnets[0] + else: + logger.error(f"SubnetId {self.config['SubnetId']} not found in VPC {self.config['VpcId']}\nValid subnet ids:\n{pp.pformat(valid_subnet_ids)}") + exit(1) + else: + self.subnet = self.vpc.private_subnets[0] + self.config['SubnetId'] = self.subnet.subnet_id + logger.info(f"Subnet set to {self.config['SubnetId']}") + logger.info(f"availability zone: {self.subnet.availability_zone}") + + remote_vpcs = {} + for region, region_dict in self.config['slurm']['InstanceConfig']['Regions'].items(): + if region == self.config['Region']: + continue + remote_vpcs[region] = ec2.Vpc.from_lookup( + self, f"Vpc{region}", + region = region, + vpc_id = region_dict['VpcId']) + + # Can't create query logging for private hosted zone. 
+ if 'HostedZoneId' in self.config: + self.hosted_zone = route53.HostedZone.from_hosted_zone_attributes( + self, "PrivateDns", + hosted_zone_id = self.config['HostedZoneId'], + zone_name = self.config['Domain'] + ) + else: + self.hosted_zone = route53.HostedZone(self, "PrivateDns", + vpcs = [self.vpc], + zone_name = self.config['Domain'] + ) + # BUG: CDK isn't creating the correct region for the vpcs even though cdk_context.json has it right. + # for remote_region, remote_vpc in remote_vpcs.items(): + # self.hosted_zone.add_vpc(remote_vpc) + def create_lambdas(self): - updateDnsLambdaAsset = s3_assets.Asset(self, "UpdateDnsLambdaAsset", path="resources/lambdas/UpdateDns") - self.update_dns_lambda = aws_lambda.Function( - self, "UpdateDnsLambda", - function_name=f"{self.stack_name}-UpdateDns", - description="Update DNS record", + dnsLookupLambdaAsset = s3_assets.Asset(self, "DnsLookupLambdaAsset", path="resources/lambdas/DnsLookup") + self.dns_lookup_lambda = aws_lambda.Function( + self, "DnsLookupLambda", + function_name=f"{self.stack_name}-DnsLookup", + description="Lookup up FQDN in DNS", + memory_size=128, + runtime=aws_lambda.Runtime.PYTHON_3_7, + timeout=Duration.minutes(3), + log_retention=logs.RetentionDays.INFINITE, + handler="DnsLookup.lambda_handler", + code=aws_lambda.Code.from_bucket(dnsLookupLambdaAsset.bucket, dnsLookupLambdaAsset.s3_object_key), + vpc = self.vpc, + allow_all_outbound = True + ) + + createComputeNodeSGLambdaAsset = s3_assets.Asset(self, "CreateComputeNodeSGLambdaAsset", path="resources/lambdas/CreateComputeNodeSG") + self.create_compute_node_sg_lambda = aws_lambda.Function( + self, "CreateComputeNodeSGLambda", + function_name=f"{self.stack_name}-CreateComputeNodeSG", + description="Create ComputeNodeSG in other region", + memory_size=128, + runtime=aws_lambda.Runtime.PYTHON_3_7, + timeout=Duration.minutes(3), + log_retention=logs.RetentionDays.INFINITE, + handler="CreateComputeNodeSG.lambda_handler", + code=aws_lambda.Code.from_bucket(createComputeNodeSGLambdaAsset.bucket, createComputeNodeSGLambdaAsset.s3_object_key) + ) + self.create_compute_node_sg_lambda.add_to_role_policy( + statement=iam.PolicyStatement( + effect=iam.Effect.ALLOW, + actions=[ + 'ec2:AuthorizeSecurityGroupEgress', + 'ec2:AuthorizeSecurityGroupIngress', + 'ec2:CreateSecurityGroup', + 'ec2:CreateTags', + 'ec2:DeleteSecurityGroup', + 'ec2:DescribeSecurityGroupRules', + 'ec2:DescribeSecurityGroups', + 'ec2:RevokeSecurityGroupEgress', + 'ec2:RevokeSecurityGroupIngress', + ], + resources=['*'] + ) + ) + + routeRoute53ZoneAddVpcLambdaAsset = s3_assets.Asset(self, "Route53HostedZoneAddVpcLambdaAsset", path="resources/lambdas/Route53HostedZoneAddVpc") + self.route53_hosted_zone_add_vpc_lambda = aws_lambda.Function( + self, "Route53HostedZoneAddVpcLambda", + function_name=f"{self.stack_name}-Route53HostedZoneAddVpc", + description="Associated VPC with Route53 hosted zone", memory_size=128, runtime=aws_lambda.Runtime.PYTHON_3_7, timeout=Duration.minutes(3), log_retention=logs.RetentionDays.INFINITE, - handler="UpdateDns.lambda_handler", - code=aws_lambda.Code.from_bucket(updateDnsLambdaAsset.bucket, updateDnsLambdaAsset.s3_object_key) + handler="Route53HostedZoneAddVpc.lambda_handler", + code=aws_lambda.Code.from_bucket(routeRoute53ZoneAddVpcLambdaAsset.bucket, routeRoute53ZoneAddVpcLambdaAsset.s3_object_key) ) + self.route53_hosted_zone_add_vpc_lambda.add_to_role_policy( + statement=iam.PolicyStatement( + effect=iam.Effect.ALLOW, + actions=[ + "route53:AssociateVpcWithHostedZone", + 
"route53:DissociateVpcFromHostedZone", + ], + resources=['*'] + ) + ) + getOntapSvmDNSNameLambdaAsset = s3_assets.Asset(self, "GetOntapSvmDNSNameLambdaAsset", path="resources/lambdas/GetOntapSvmDNSName") self.get_ontap_svm_dnsname_lambda = aws_lambda.Function( self, "GetOntapSvmDNSNameLambda", @@ -473,52 +638,6 @@ def create_lambdas(self): ) ) - def create_vpc(self): - self.vpc = ec2.Vpc.from_lookup(self, "Vpc", vpc_id = self.config['VpcId']) - - self.subnets = self.vpc.private_subnets - valid_subnet_ids = [] - if 'SubnetId' in self.config: - self.subnet = None - for subnet in self.subnets: - valid_subnet_ids.append(subnet.subnet_id) - if subnet.subnet_id == self.config['SubnetId']: - self.subnet = subnet - break - if not self.subnet: - # If this is a new VPC then the cdk.context.json will not have the VPC and will be refreshed after the bootstrap phase. Until then the subnet ids will be placeholders so just pick the first subnet. After the bootstrap finishes the vpc lookup will be done and then the info will be correct. - if valid_subnet_ids[0] == 'p-12345': - logger.warning(f"VPC {self.config['VpcId']} not in cdk.context.json and will be refresshed before synth.") - self.subnet = self.vpc.private_subnets[0] - else: - logger.error(f"SubnetId {self.config['SubnetId']} not found in VPC {self.config['VpcId']}\nValid subnet ids:\n{pp.pformat(valid_subnet_ids)}") - exit(1) - else: - self.subnet = self.vpc.private_subnets[0] - self.config['SubnetId'] = self.subnet.subnet_id - logger.info(f"Subnet set to {self.config['SubnetId']}") - logger.info(f"availability zone: {self.subnet.availability_zone}") - - # Can't create query logging for private hosted zone. - if 'HostedZoneId' in self.config: - self.hosted_zone = route53.HostedZone.from_hosted_zone_id(self, "PrivateDns", hosted_zone_id=self.config['HostedZoneId']) - self.config['Domain'] = self.hosted_zone.zone_name - else: - self.hosted_zone = route53.HostedZone(self, "PrivateDns", - vpcs = [self.vpc], - zone_name = self.config['Domain'] - ) - - self.update_dns_lambda.add_to_role_policy( - statement=iam.PolicyStatement( - effect=iam.Effect.ALLOW, - actions=[ - "route53:ChangeResourceRecordSets" - ], - resources=[self.hosted_zone.hosted_zone_arn] - ) - ) - def create_security_groups(self): self.nfs_sg = ec2.SecurityGroup(self, "NfsSG", vpc=self.vpc, allow_all_outbound=False, description="Nfs Security Group") Tags.of(self.nfs_sg).add("Name", f"{self.stack_name}-NfsSG") @@ -529,7 +648,7 @@ def create_security_groups(self): Tags.of(self.zfs_sg).add("Name", f"{self.stack_name}-ZfsSG") self.suppress_cfn_nag(self.zfs_sg, 'W29', 'Egress port range used to block all egress') - # Compute nodes may use lustre file systems to create a security group with the required ports. + # Compute nodes may use lustre file systems to create a security group with the required ports. 
self.lustre_sg = ec2.SecurityGroup(self, "LustreSG", vpc=self.vpc, allow_all_outbound=False, description="Lustre Security Group") Tags.of(self.lustre_sg).add("Name", f"{self.stack_name}-LustreSG") self.suppress_cfn_nag(self.lustre_sg, 'W29', 'Egress port range used to block all egress') @@ -616,6 +735,8 @@ def create_security_groups(self): fs_client_sg.connections.allow_to(self.nfs_sg, ec2.Port.tcp(2049), f"{fs_client_sg_name} to Nfs") if self.onprem_cidr: self.nfs_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp(2049), 'OnPremNodes to Nfs') + for compute_region, compute_region_cidr in self.remote_compute_regions.items(): + self.nfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(2049), f"{compute_region} to Nfs") # ZFS Connections # https://docs.aws.amazon.com/fsx/latest/OpenZFSGuide/limit-access-security-groups.html @@ -638,6 +759,13 @@ def create_security_groups(self): self.zfs_sg.connections.allow_from(self.onprem_cidr, ec2.Port.udp_range(20001, 20003), 'OnPremNodes to Zfs') self.suppress_cfn_nag(self.zfs_sg, 'W27', 'Correct, restricted range for zfs: 20001-20003') self.suppress_cfn_nag(self.zfs_sg, 'W29', 'Correct, restricted range for zfs: 20001-20003') + for compute_region, compute_region_cidr in self.remote_compute_regions.items(): + self.zfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(111), f"{compute_region} to Zfs") + self.zfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.udp(111), f"{compute_region} to Zfs") + self.zfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(2049), f"{compute_region} to Zfs") + self.zfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.udp(2049), f"{compute_region} to Zfs") + self.zfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp_range(20001, 20003), f"{compute_region} to Zfs") + self.zfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.udp_range(20001, 20003), f"{compute_region} to Zfs") # Lustre Connections lustre_fs_client_sgs = copy(fs_client_sgs) @@ -657,6 +785,11 @@ def create_security_groups(self): self.lustre_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp_range(1021, 1023), 'OnPremNodes to Lustre') self.lustre_sg.connections.allow_to(self.onprem_cidr, ec2.Port.tcp(988), f"Lustre to OnPremNodes") self.lustre_sg.connections.allow_to(self.onprem_cidr, ec2.Port.tcp_range(1021, 1023), f"Lustre to OnPremNodes") + for compute_region, compute_region_cidr in self.remote_compute_regions.items(): + self.lustre_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(988), f"{compute_region} to Lustre") + self.lustre_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp_range(1021, 1023), f"{compute_region} to Lustre") + self.lustre_sg.connections.allow_to(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(988), f"Lustre to {compute_region}") + self.lustre_sg.connections.allow_to(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp_range(1021, 1023), f"Lustre to {compute_region}") # slurmctl connections # egress @@ -682,6 +815,8 @@ def create_security_groups(self): if self.onprem_cidr: self.slurmctl_sg.connections.allow_to(self.onprem_cidr, ec2.Port.tcp(6818), f'{self.slurmctl_sg_name} to OnPremNodes') self.slurmctl_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp(6817), f'OnPremNodes to {self.slurmctl_sg_name}') + for compute_region, compute_region_cidr in self.remote_compute_regions.items(): + 
self.slurmctl_sg.connections.allow_to(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(6818), f"{self.slurmctl_sg_name} to {compute_region}") # slurmdbd connections # egress @@ -706,9 +841,13 @@ def create_security_groups(self): self.slurmnode_sg.connections.allow_to(slurm_submitter_sg, ec2.Port.tcp_range(1024, 65535), f"{self.slurmnode_sg_name} to {slurm_submitter_sg_name} - ephemeral") self.suppress_cfn_nag(slurm_submitter_sg, 'W27', 'Port range ok. slurmnode requires requires ephemeral ports to slurm submitters: 1024-65535') if self.onprem_cidr: - self.slurmnode_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp_range(6000, 7024), f"OnPremNodes to {slurm_submitter_sg_name} - x11") + slurm_submitter_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp_range(6000, 7024), f"OnPremNodes to {slurm_submitter_sg_name} - x11") # @todo Not sure if this is really initiated from the slurm node self.slurmnode_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp_range(1024, 65535), f"OnPremNodes to {slurm_submitter_sg_name} - ephemeral") + for compute_region, compute_region_cidr in self.remote_compute_regions.items(): + slurm_submitter_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp_range(6000, 7024), f"{compute_region} to {slurm_submitter_sg_name} - x11") + # @todo Not sure if this is really initiated from the slurm node + slurm_submitter_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp_range(1024, 65535), f"{compute_region} to {slurm_submitter_sg_name} - ephemeral") self.suppress_cfn_nag(self.slurmnode_sg, 'W27', 'Port range ok. slurmnode requires requires ephemeral ports to slurm submitters: 1024-65535') self.slurmnode_sg.add_egress_rule(ec2.Peer.ipv4("0.0.0.0/0"), ec2.Port.tcp(80), description="Internet") self.slurmnode_sg.add_egress_rule(ec2.Peer.ipv4("0.0.0.0/0"), ec2.Port.tcp(443), description="Internet") @@ -723,6 +862,12 @@ def create_security_groups(self): if self.onprem_cidr: self.slurmnode_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp(6818), f"OnPremNodes to {self.slurmnode_sg_name}") self.slurmnode_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp_range(1024, 65535), f"OnPremNodes to {self.slurmnode_sg_name}") + for compute_region, compute_region_cidr in self.remote_compute_regions.items(): + self.slurmctl_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(6817), f"{compute_region} to {self.slurmctl_sg_name}") + self.slurmnode_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(6818), f"{compute_region} to {self.slurmnode_sg_name}") + self.slurmnode_sg.connections.allow_to(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(6818), f"{self.slurmnode_sg_name} to {compute_region}") + self.slurmnode_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp_range(1024, 65535), f"{compute_region} to {self.slurmnode_sg_name}") + self.slurmnode_sg.connections.allow_to(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp_range(1024, 65535), f"{self.slurmnode_sg_name} to {compute_region}") # slurm submitter connections # egress @@ -733,11 +878,39 @@ def create_security_groups(self): slurm_submitter_sg.connections.allow_to(self.slurmdbd_sg, ec2.Port.tcp(6819), f"{slurm_submitter_sg_name} to {self.slurmdbd_sg_name} - sacct") if self.onprem_cidr: slurm_submitter_sg.connections.allow_to(self.onprem_cidr, ec2.Port.tcp(6818), f"{slurm_submitter_sg_name} to OnPremNodes - srun") + for compute_region, compute_region_cidr in self.remote_compute_regions.items(): + 
slurm_submitter_sg.connections.allow_to(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(6818), f"{slurm_submitter_sg_name} to {compute_region} - srun") # Try to suppress cfn_nag warnings on ingress/egress rules for slurm_submitter_sg_name, slurm_submitter_sg in self.submitter_security_groups.items(): self.suppress_cfn_nag(self.slurmnode_sg, 'W27', 'Port range ok. slurmsubmitter requires ephemeral ports for several reasons: 1024-65535') + self.slurmnode_security_group_ssm_parameters = {} + for compute_region, region_dict in self.config['slurm']['InstanceConfig']['Regions'].items(): + if compute_region == self.config['Region']: + slurmnode_security_group_id = self.slurmnode_sg.security_group_id + else: + slurmnode_security_group_id = CustomResource( + self, f"ComputeNodeSecurityGroup{compute_region}", + service_token = self.create_compute_node_sg_lambda.function_arn, + properties = { + 'Region': compute_region, + 'VpcId': region_dict['VpcId'], + 'SecurityGroupName': f"{self.config['slurm']['ClusterName']}-SlurmNodeSG", + 'Description': f"{self.config['slurm']['ClusterName']}-SlurmNodeSG", + 'ControllerCIDR': self.config['CIDR'], + 'CIDRs': self.compute_region_cidrs_dict, + 'StackName': self.config['StackName'] + } + ).get_att_string('GroupId') + # SSM Parameters to store the security group ids + # The SlurmPlugin reads these parameters when running an instance. + self.slurmnode_security_group_ssm_parameters[compute_region] = ssm.StringParameter( + self, f"SlurmNodeSecurityGroupSsmParameter{compute_region}", + parameter_name = f"/{self.stack_name}/SlurmNodeSecurityGroups/{compute_region}", + string_value = slurmnode_security_group_id + ) + def create_elasticsearch(self): if 'ElasticSearch' not in self.config['slurm']: return @@ -870,14 +1043,21 @@ def create_file_system(self): self.file_system_dependency = self.file_system self.file_system_dns = f"{self.file_system.file_system_id}.efs.{self.region}.amazonaws.com" - self.file_system_dns = self.file_system_dns + + # Get IpAddress of file system + self.file_system_ip_address = CustomResource( + self, f"ZfsIpAddress", + service_token = self.dns_lookup_lambda.function_arn, + properties={ + "FQDN": self.file_system_dns + } + ).get_att_string('IpAddress') self.file_system_port = 2049 self.file_system_mount_name = "" - self.file_system_mount_src = f"{self.file_system_dns}:/" - self.file_system_mount_source = self.file_system_mount_src + self.file_system_mount_source = f"{self.file_system_ip_address}:/" if self.config['slurm']['storage']['efs']['use_efs_helper']: self.file_system_type = 'efs' @@ -886,7 +1066,7 @@ def create_file_system(self): self.file_system_type = 'nfs4' self.file_system_options = 'nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport' - self.file_system_mount_command = f"sudo mkdir -p {self.config['slurm']['storage']['mount_path']} && sudo yum -y install nfs-utils && sudo mount -t {self.file_system_type} -o {self.file_system_options} {self.file_system_mount_src} {self.config['slurm']['storage']['mount_path']}" + self.file_system_mount_command = f"sudo mkdir -p {self.config['slurm']['storage']['mount_path']} && sudo yum -y install nfs-utils && sudo mount -t {self.file_system_type} -o {self.file_system_options} {self.file_system_mount_source} {self.config['slurm']['storage']['mount_path']}" elif self.config['slurm']['storage']['provider'] == "ontap": if 'iops' in self.config['slurm']['storage']['ontap']: @@ -940,6 +1120,15 @@ def create_file_system(self): } ).get_att_string('DNSName') + # Get IpAddress of SVM + 
self.file_system_ip_address = CustomResource( + self, f"OntapSvmIpAddress", + service_token = self.dns_lookup_lambda.function_arn, + properties={ + "FQDN": self.file_system_dns + } + ).get_att_string('IpAddress') + # Add a volume self.volume = fsx.CfnVolume( self, 'OntapVolume', @@ -966,7 +1155,7 @@ def create_file_system(self): self.file_system_mount_name = "" - self.file_system_mount_source = f"{self.file_system_dns}:/slurm" + self.file_system_mount_source = f"{self.file_system_ip_address}:/slurm" self.file_system_options = 'nfsvers=4.1' @@ -1037,9 +1226,18 @@ def create_file_system(self): self.file_system_type = 'nfs' self.file_system_dns = self.file_system.attr_dns_name + # Get IpAddress of file system + self.file_system_ip_address = CustomResource( + self, f"ZfsIpAddress", + service_token = self.dns_lookup_lambda.function_arn, + properties={ + "FQDN": self.file_system_dns + } + ).get_att_string('IpAddress') + self.file_system_mount_name = "" - self.file_system_mount_source = f"{self.file_system_dns}:/fsx/slurm" + self.file_system_mount_source = f"{self.file_system_ip_address}:/fsx/slurm" self.file_system_options = 'nfsvers=4.1' @@ -1050,6 +1248,14 @@ def create_file_system(self): Tags.of(self.file_system).add("Name", f"{self.stack_name}-Slurm") + # Create DNS entry for file system that can be used in remote VPCs + route53.ARecord( + self, f"SlurmFileSystemDnsRecord", + zone = self.hosted_zone, + record_name = 'slurmfs', + target = route53.RecordTarget.from_ip_addresses(self.file_system_ip_address) + ) + CfnOutput(self, "FileSystemProvider", value = self.config['slurm']['storage']['provider'] ) @@ -1062,6 +1268,9 @@ def create_file_system(self): CfnOutput(self, "FileSystemDnsName", value = self.file_system_dns ) + CfnOutput(self, "FileSystemIpAddress", + value = self.file_system_ip_address + ) CfnOutput(self, "MountCommand", value = self.file_system_mount_command ) @@ -1321,7 +1530,7 @@ def create_cw(self): dimensions_map = {'Reason': 'InsufficientInstanceCapacity'}, ), ) - for instance_type in self.config['slurm']['InstanceTypes']: + for instance_type in self.instance_types: self.insufficient_capacity_exceptions_widget.add_left_metric( cloudwatch.Metric( namespace = self.slurm_namespace, @@ -1337,7 +1546,7 @@ def create_cw(self): stacked = True, statistic = 'Maximum', ) - for instance_type in self.config['slurm']['InstanceTypes']: + for instance_type in self.instance_types: self.running_instances_by_type_stacked_widget.add_left_metric( cloudwatch.Metric( namespace = self.slurm_namespace, @@ -1355,7 +1564,7 @@ def create_cw(self): stacked = False, statistic = 'Maximum', ) - for instance_type in self.config['slurm']['InstanceTypes']: + for instance_type in self.instance_types: self.running_instances_by_type_unstacked_widget.add_left_metric( cloudwatch.Metric( namespace = self.slurm_namespace, @@ -1373,7 +1582,7 @@ def create_cw(self): stacked = False, statistic = 'Maximum', ) - for instance_type in self.config['slurm']['InstanceTypes']: + for instance_type in self.instance_types: self.job_count_by_instance_type_widget.add_left_metric( cloudwatch.Metric( namespace = self.slurm_namespace, @@ -1389,7 +1598,7 @@ def create_cw(self): stacked = False, statistic = 'Maximum', ) - for instance_type in self.config['slurm']['InstanceTypes']: + for instance_type in self.instance_types: self.running_jobs_by_instance_type_widget.add_left_metric( cloudwatch.Metric( namespace = self.slurm_namespace, @@ -1405,7 +1614,7 @@ def create_cw(self): stacked = False, statistic = 'Maximum', ) - for instance_type 
in self.config['slurm']['InstanceTypes']: + for instance_type in self.instance_types: self.static_node_count_by_instance_type_widget.add_left_metric( cloudwatch.Metric( namespace = self.slurm_namespace, @@ -1421,7 +1630,7 @@ def create_cw(self): stacked = False, statistic = 'Average', ) - for instance_type in self.config['slurm']['InstanceTypes']: + for instance_type in self.instance_types: self.memory_used_percent_by_instance_type_widget.add_left_metric( cloudwatch.Metric( namespace = self.slurm_namespace, @@ -1436,7 +1645,7 @@ def create_cw(self): stacked = False, statistic = 'Average', ) - for instance_type in self.config['slurm']['InstanceTypes']: + for instance_type in self.instance_types: self.memory_stats_by_instance_type_widget.add_left_metric( cloudwatch.Metric( namespace = self.slurm_namespace, @@ -1444,7 +1653,7 @@ def create_cw(self): dimensions_map = {'InstanceType': instance_type}, ), ) - for instance_type in self.config['slurm']['InstanceTypes']: + for instance_type in self.instance_types: self.memory_stats_by_instance_type_widget.add_right_metric( cloudwatch.Metric( namespace = self.slurm_namespace, @@ -1513,10 +1722,10 @@ def get_instance_template_vars(self, instance_role): "AWS_DEFAULT_REGION": Aws.REGION, "ClusterName": self.config['slurm']['ClusterName'], "Domain": self.config['Domain'], - "EC2_KEYPAIR": self.config['SshKeyPair'], "ERROR_SNS_TOPIC_ARN": self.config['ErrorSnsTopicArn'], "ExtraMounts": self.config['slurm']['storage']['ExtraMounts'], "FileSystemDns": self.file_system_dns, + "FileSystemIpAddress": self.file_system_ip_address, "FileSystemMountPath": self.config['slurm']['storage']['mount_path'], "FileSystemMountSrc": self.file_system_mount_source, "FileSystemOptions": self.file_system_options, @@ -1540,9 +1749,10 @@ def get_instance_template_vars(self, instance_role): else: instance_template_vars["AccountingStorageHost"] = '' instance_template_vars["CloudWatchPeriod"] = self.config['slurm']['SlurmCtl']['CloudWatchPeriod'] + instance_template_vars["CloudWatchPeriod"] = self.config['slurm']['SlurmCtl']['CloudWatchPeriod'] + instance_template_vars["DefaultPartition"] = self.default_partition if 'Federation' in self.config['slurm']: instance_template_vars["Federation"] = self.config['slurm']['Federation']['Name'] - instance_template_vars["GridSubnet1"] = self.subnet.subnet_id instance_template_vars["JobCompLoc"] = self.config['slurm']['JobCompLoc'] instance_template_vars["JobCompType"] = self.config['slurm']['JobCompType'] instance_template_vars["MaxStoppedDuration"] = self.config['slurm']['SlurmCtl']['MaxStoppedDuration'] @@ -1554,7 +1764,6 @@ def get_instance_template_vars(self, instance_role): instance_template_vars["SlurmCtlBaseHostname"] = self.config['slurm']['SlurmCtl']['BaseHostname'] instance_template_vars['SlurmNodeProfileArn'] = self.slurm_node_instance_profile.attr_arn instance_template_vars['SlurmNodeRoleName'] = self.slurm_node_role.role_name - instance_template_vars["SlurmNodeSecurityGroup"] = self.slurmnode_sg.security_group_id instance_template_vars["SuspendAction"] = self.config['slurm']['SlurmCtl']['SuspendAction'] instance_template_vars["UseAccountingDatabase"] = self.useSlurmDbd elif 'SlurmNodeAmi': @@ -1594,6 +1803,15 @@ def create_slurmctl(self): string_value = f"{munge_key}" ) + # Create SSM parameters to store the EC2 Keypairs + self.slurmnode_ec2_key_pair_ssm_parameters = {} + for compute_region, region_dict in self.config['slurm']['InstanceConfig']['Regions'].items(): + self.slurmnode_ec2_key_pair_ssm_parameters[compute_region] = 
ssm.StringParameter( + self, f"SlurmNodeEc2KeyPairParameter{compute_region}", + parameter_name = f"/{self.stack_name}/SlurmNodeEc2KeyPairs/{compute_region}", + string_value = region_dict['SshKeyPair'] + ) + self.slurmctl_role = iam.Role(self, "SlurmCtlRole", assumed_by=iam.CompositePrincipal( iam.ServicePrincipal(self.principals_suffix["ssm"]), @@ -1614,8 +1832,9 @@ def create_slurmctl(self): 'ec2:CreateTags', ], resources = [ - f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:volume/*", + f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:instance/*", f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:network-interface/*", + f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:volume/*", ] ), iam.PolicyStatement( @@ -1641,14 +1860,14 @@ def create_slurmctl(self): 'ec2:RunInstances' ], resources = [ - f"arn:{Aws.PARTITION}:ec2:{Aws.REGION}:{Aws.ACCOUNT_ID}:instance/*", - f"arn:{Aws.PARTITION}:ec2:{Aws.REGION}:{Aws.ACCOUNT_ID}:key-pair/{self.config['SshKeyPair']}", - f"arn:{Aws.PARTITION}:ec2:{Aws.REGION}:{Aws.ACCOUNT_ID}:network-interface/*", - f"arn:{Aws.PARTITION}:ec2:{Aws.REGION}:{Aws.ACCOUNT_ID}:security-group/{self.slurmnode_sg.security_group_id}", - f"arn:{Aws.PARTITION}:ec2:{Aws.REGION}:{Aws.ACCOUNT_ID}:subnet/{self.subnet.subnet_id}", - f"arn:{Aws.PARTITION}:ec2:{Aws.REGION}:{Aws.ACCOUNT_ID}:volume/*", - f"arn:{Aws.PARTITION}:ec2:{Aws.REGION}:{Aws.ACCOUNT_ID}:image/*", - f"arn:{Aws.PARTITION}:ec2:{Aws.REGION}::image/*", + f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:instance/*", + f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:key-pair/*", + f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:network-interface/*", + f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:security-group/*", + f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:subnet/*", + f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:volume/*", + f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:image/*", + f"arn:{Aws.PARTITION}:ec2:*::image/*", ] ), iam.PolicyStatement( @@ -1677,18 +1896,19 @@ def create_slurmctl(self): iam.PolicyStatement( effect = iam.Effect.ALLOW, actions = [ - 'ec2:CreateTags', 'ec2:StartInstances', 'ec2:StopInstances', 'ec2:TerminateInstances' ], - resources = [f"arn:{Aws.PARTITION}:ec2:{Aws.REGION}:{Aws.ACCOUNT_ID}:instance/*"] + resources = [f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:instance/*"] ), iam.PolicyStatement( effect = iam.Effect.ALLOW, actions = [ 'ec2:DescribeInstances', 'ec2:DescribeInstanceTypes', + 'ec2:DescribeSpotPriceHistory', + 'ec2:DescribeSubnets', ], # Does not support resource-level permissions and require you to choose All resources resources = ["*"] @@ -1757,6 +1977,10 @@ def create_slurmctl(self): ) self.slurmctl_instances.append(slurmctl_instance) + for compute_region in self.compute_regions: + self.slurmnode_security_group_ssm_parameters[compute_region].grant_read(slurmctl_instance) + self.slurmnode_ec2_key_pair_ssm_parameters[compute_region].grant_read(slurmctl_instance) + name = f"{self.stack_name}-SlurmSlurmCtl{instance_index}" Tags.of(slurmctl_instance).add("Name", name) Tags.of(slurmctl_instance).add("hostname", hostname) @@ -1829,17 +2053,12 @@ def create_slurmctl(self): slurmctl_instance.user_data.add_commands(user_data) # Create DNS entry - self.slurmctl_dns_record = CustomResource( + route53.ARecord( self, f"SlurmCtl{instance_index}DnsRecord", - service_token = self.update_dns_lambda.function_arn, - properties={ - "Hostname": hostname, - "Domain": self.config['Domain'], - "HostedZoneId": self.hosted_zone.hosted_zone_id, - "Type": 'A', - "Value": slurmctl_instance.instance_private_ip - } - 
).get_att_string('CIDR') + zone = self.hosted_zone, + record_name = hostname, + target = route53.RecordTarget.from_ip_addresses(slurmctl_instance.instance_private_ip) + ) def create_slurmdbd(self): if 'SlurmDbd' not in self.config['slurm']: @@ -1975,17 +2194,12 @@ def create_slurmdbd(self): self.slurmdbd_instance.user_data.add_commands(user_data) # Create DNS entry - self.slurmdbd_dns_record = CustomResource( - self, "SlurmDbdDnsRecord", - service_token = self.update_dns_lambda.function_arn, - properties={ - "Hostname": self.config['slurm']['SlurmDbd']['Hostname'], - "Domain": self.config['Domain'], - "HostedZoneId": self.hosted_zone.hosted_zone_id, - "Type": 'A', - "Value": self.slurmdbd_instance.instance_private_ip - } - ).get_att_string('CIDR') + route53.ARecord( + self, f"SlurmDbdDnsRecord", + zone = self.hosted_zone, + record_name = self.config['slurm']['SlurmDbd']['Hostname'], + target = route53.RecordTarget.from_ip_addresses(self.slurmdbd_instance.instance_private_ip) + ) if self.slurmDbdFQDN: CfnOutput(self, "SlurmDbdFQDN", @@ -2087,7 +2301,7 @@ def create_slurm_node_ami(self): 'ec2:StopInstances' ], resources = [ - f"arn:{Aws.PARTITION}:ec2:{Aws.REGION}:{Aws.ACCOUNT_ID}:instance/*", + f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:instance/*", ] ), # Permissions to create and tag AMI @@ -2109,6 +2323,13 @@ def create_slurm_node_ami(self): f'arn:{Aws.PARTITION}:ec2:{Aws.REGION}::snapshot/*' ] ), + iam.PolicyStatement( + effect = iam.Effect.ALLOW, + actions = [ + 'ec2:CopyImage', + ], + resources=['*'] + ), iam.PolicyStatement( effect = iam.Effect.ALLOW, actions = [ @@ -2178,6 +2399,7 @@ def create_slurm_node_ami(self): self.slurm_node_ami_instances[distribution][distribution_major_version] = {} self.ami_ssm_parameters[distribution][distribution_major_version] = {} for architecture in version_dict: + self.ami_ssm_parameters[distribution][distribution_major_version][architecture] = {} os_tag = f"{distribution}-{distribution_major_version}-{architecture}" try: ami_id = self.config['slurm']['SlurmNodeAmis']['BaseAmis'][self.region][distribution][distribution_major_version][architecture]['ImageId'] @@ -2239,19 +2461,25 @@ def create_slurm_node_ami(self): self.slurm_node_ami_instance.node.add_dependency(self.file_system_dependency) - self.ami_ssm_parameters[distribution][distribution_major_version][architecture] = ssm.StringParameter( - self, f"SlurmNodeAmiSsmParameter{distribution}{distribution_major_version}{architecture}", - parameter_name = f"/{self.stack_name}/SlurmNodeAmis/{distribution}/{distribution_major_version}/{architecture}", - string_value = "UNDEFINED", - ) - self.ami_ssm_parameters[distribution][distribution_major_version][architecture].grant_write(self.slurm_node_ami_instance) - ami_ssm_parameter = self.ami_ssm_parameters[distribution][distribution_major_version][architecture] + ami_ssm_parameter_base_name = f"/{self.stack_name}/SlurmNodeAmis/{distribution}/{distribution_major_version}/{architecture}" + ami_ssm_parameter_arns = [] + for compute_region in self.compute_regions: + self.ami_ssm_parameters[distribution][distribution_major_version][architecture][compute_region] = ssm.StringParameter( + self, f"SlurmNodeAmiSsmParameter{distribution}{distribution_major_version}{architecture}{compute_region}", + parameter_name = f"{ami_ssm_parameter_base_name}/{compute_region}", + string_value = "UNDEFINED", + ) + self.ami_ssm_parameters[distribution][distribution_major_version][architecture][compute_region].grant_write(self.slurm_node_ami_instance) + 
ami_ssm_parameter_arns.append(self.ami_ssm_parameters[distribution][distribution_major_version][architecture][compute_region].parameter_arn) instance_template_vars = self.get_instance_template_vars('SlurmNodeAmi') instance_template_vars['CONFIG_SCRIPT_PATH'] = '/root/slurm_node_ami_config.sh' instance_template_vars['WAIT_FOR_AMI_SCRIPT_PATH'] = '/root/WaitForAmi.py' instance_template_vars['PLAYBOOKS_ZIP_PATH'] = '/root/playbooks.zip' - instance_template_vars['SlurmNodeAmiSsmParameter'] = ami_ssm_parameter.parameter_name + instance_template_vars['SlurmNodeAmiSsmParameter'] = f"{ami_ssm_parameter_base_name}/{self.config['Region']}" + instance_template_vars['SlurmNodeAmiSsmParameterBaseName'] = ami_ssm_parameter_base_name + instance_template_vars['ComputeRegions'] = ','.join(self.compute_regions.keys()) + instance_template_vars['RemoteComputeRegions'] = ','.join(self.remote_compute_regions.keys()) instance_template_vars['SLURM_ROOT'] = f"{instance_template_vars['FileSystemMountPath']}/slurm-{self.config['slurm']['SlurmVersion']}/{distribution}/{distribution_major_version}/{architecture}" # Add on_exit commands at top of user_data @@ -2383,7 +2611,7 @@ def create_fault_injection_templates(self): 'ec2:StopInstances', 'ec2:TerminateInstances' ], - resources = [f"arn:{Aws.PARTITION}:ec2:{Aws.REGION}:{Aws.ACCOUNT_ID}:instance/*"] + resources = [f"arn:{Aws.PARTITION}:ec2:*:{Aws.ACCOUNT_ID}:instance/*"] ) ] ) diff --git a/source/cdk/config_schema.py b/source/cdk/config_schema.py index 8b18c98f..f92b6ae2 100644 --- a/source/cdk/config_schema.py +++ b/source/cdk/config_schema.py @@ -104,6 +104,7 @@ Optional('SshKeyPair'): str, # Optional so can be specified on the command-line Optional('VpcId'): And(str, lambda s: re.match('vpc-', s)), + Optional('CIDR'): And(str, lambda s: re.match(r'\d+\.\d+\.\d+\.\d+/\d+', s)), # # SubnetId # Optional. If not specified then the first private subnet is chosen. @@ -114,15 +115,14 @@ # Domain: # Domain name for the Route 53 private hosted zone that will be used # by the slurm cluster for DNS. + # Alternately, provide HostedZoneId of an existing Route53 hosted zone to use and + # the zone name of the HostedZoneId. # By default will be {StackName}.local - # Alternately, provide HostedZoneId of an existing Route53 hosted zone to use. - # Cannot specify both Domain and HostedZoneId. Optional('Domain'): str, # # HostedZoneId: # ID of an existing hosted zone that will be used by the slurm cluster for DNS. - # Alternately, provide Domain name to use for a new Route53 hosted zone to use. - # Cannot specify both Domain and HostedZoneId. + # You must provide the Domain name of the HostedZone if it is different than the default. Optional('HostedZoneId'): str, Optional('TimeZone', default='US/Central'): str, 'slurm': { @@ -240,11 +240,6 @@ # Configure spot instances Optional('UseSpot', default=True): bool, # - # DefaultPartition: - # By default this will be the first OS/Architecture listed in BaseOsArchitecture. - # Add '_spot' to the end to make spot the default purchase option. - 'DefaultPartition': str, - # # NodesPerInstanceType: # The number of nodes that will be defined for each instance type. 
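For reference, here is a hedged example of a config fragment that satisfies the `Regions` block added to the `InstanceConfig` schema just below. All region names, VPC/subnet IDs, CIDRs, and key pair names are placeholders, and one of the entries would normally be the cluster's own region.

```python
# Placeholder values only; the shape follows the Regions schema shown below.
instance_config_regions = {
    'us-east-1': {
        'VpcId': 'vpc-0123456789abcdef0',
        'CIDR': '10.1.0.0/16',
        'SshKeyPair': 'admin-us-east-1',
        'AZs': [
            {'Priority': 10, 'Subnet': 'subnet-0123456789abcdef0'},
            {'Priority': 9,  'Subnet': 'subnet-0123456789abcdef1'},
        ],
    },
    'us-west-2': {
        'VpcId': 'vpc-0fedcba9876543210',
        'CIDR': '10.2.0.0/16',
        'SshKeyPair': 'admin-us-west-2',
        'AZs': [
            {'Priority': 8, 'Subnet': 'subnet-0fedcba9876543210'},
        ],
    },
}
```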
'NodesPerInstanceType': int, @@ -272,6 +267,19 @@ 'InstanceFamilies': [str], 'InstanceTypes': [str] }, + Optional('Regions', default=[]): { + str: { + 'VpcId': And(str, lambda s: re.match('vpc-', s)), + 'CIDR': str, + 'SshKeyPair': str, + 'AZs': [ + { + 'Priority': int, + 'Subnet': And(str, lambda s: re.match('subnet-', s)) + } + ], + } + }, Optional('AlwaysOnNodes', default=[]): [ str # Nodelist ], diff --git a/source/requirements.txt b/source/requirements.txt index 1b624a9e..db14b752 100644 --- a/source/requirements.txt +++ b/source/requirements.txt @@ -1,5 +1,5 @@ -e . -aws-cdk-lib==2.21.1 +aws-cdk-lib==2.28.1 boto3 colored constructs>=10.0.0 diff --git a/source/resources/config/slurm_multi_az.yml b/source/resources/config/slurm_multi_az.yml new file mode 100644 index 00000000..dc42d493 --- /dev/null +++ b/source/resources/config/slurm_multi_az.yml @@ -0,0 +1,257 @@ +--- +# Sample configuraton that creates a minimal Slurm cluster +# Shows all available configuration options +# Note that CentOS 8 has been discontinued and support has been removed. +# Uses arm64 architecture for SlurmCtl and SlurmDbd by default. +# No SlurmDbd in this configuration. + +termination_protection: True # Enable (recommended) or Disable Cloudformation Stack termination protection + +#==================================================================== +# Parameters that must be in the config file or on the command line. +# Command line values override values in the config file. +#==================================================================== +StackName: slurmminimal +#Region: us-east-1 +#SshKeyPair: name of your ec2 keypair +#VpcId: vpc-xxxxxxxxxxxxxxxxx + +# SubnetId: +# Optional. If not specified then the first private subnet is chosen. +#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 +#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 +#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 + +# This is optional, but highly recommended +#ErrorSnsTopicArn: arn:aws:sns:{{region}}:{AccountId}:{TopicName} + +#==================================================================== +# Required Parameters +#==================================================================== + +# Domain: Optional +# Domain name for the Route 53 private hosted zone that will be used +# by the slurm cluster for DNS. +# By default will be {StackName}.local +# Alternately, provide HostedZoneId of an existing Route53 hosted zone to use. +# Cannot specify both Domain and HostedZoneId. +# Domain: "{{StackName}}.local" + +# HostedZoneId: Optional +# ID of an existing hosted zone that will be used by the slurm cluster for DNS. +# Alternately, provide Domain name to use for a new Route53 hosted zone to use. +# Cannot specify both Domain and HostedZoneId. +# HostedZoneId: + +TimeZone: 'US/Central' + +slurm: + # High level configuration + + SlurmVersion: "21.08.5" + + # ClusterName: + # Optional + # Must be unique if multiple clusters deployed in the same VPC. + # Default: StackName + # ClusterName: slurm + + # MungeKeySsmParameter + # SSM String Parameter with a base64 encoded munge key to use for the cluster. + # Use this if your submitters need to use more than 1 cluster. + #MungeKeySsmParameter: "/slurm/munge_key" + + SlurmCtl: + # For high availability configure multiple controllers + NumberOfControllers: 1 + # The index will be appended to BaseHostname starting with 1. 
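The `MungeKeySsmParameter` option above expects an SSM String parameter holding a base64-encoded munge key. A minimal boto3 sketch of creating one follows; the parameter name matches the commented example above, and the 1024-byte key size is a common munge convention rather than a requirement of this project.

```python
# Hedged sketch: create the SSM parameter referenced by MungeKeySsmParameter.
import base64
import os

import boto3

munge_key_b64 = base64.b64encode(os.urandom(1024)).decode()
boto3.client('ssm').put_parameter(
    Name='/slurm/munge_key',   # example name from the commented option above
    Type='String',
    Value=munge_key_b64,
    Overwrite=True,
)
```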
+ BaseHostname: slurmctl + + # architecture: x86_64 or arm64 + #architecture: x86_64 + #instance_type: "c5.large" + architecture: arm64 + instance_type: "c6g.large" + volume_size: 200 # Size of the EBS root disk + + # SuspendAction + # Set to stop or terminate. + # Stopped nodes will restart quicker, but you will continue to be charged for the EBS volumes + # attached to the instance. + SuspendAction: stop + # + # MaxStoppedDuration + # In ISO 8601 duration format: https://en.wikipedia.org/wiki/ISO_8601#Durations + # Default: 1 hour = P0Y0M0DT1H0M0S + # Evaluated at least hourly + MaxStoppedDuration: P0Y0M0DT1H0M0S + + CloudWatchPeriod: 5 # Cloudwatch metric collection period in minutes. Default value is 5. Set to 1 for finer resolution. + # Also used in the dashboard widgets. + + # The accounting database is required to enable fairshare scheduling + # It is managed by the Slurm Database Daemon (slurmdbd) instance + # This instance can be created as part of the cluster or can use an existing instance. + # SlurmDbd: + # # It is recommended to get the basic cluster configured and working before enabling the accounting database + # UseSlurmDbd: False + + # # Hostname: + # # Hostname of the slurmdbd instance if CreateSlurmdbd is true. + # Hostname: slurmdbd + + # # architecture: x86_64 or arm64 + # #architecture: x86_64 + # #instance_type: "m5.large" + # architecture: arm64 + # instance_type: "m6g.large" + # volume_size: 200 # Size of the EBS root disk + + # database: + # port: 3306 + + # Federation: + # Name: slurmeda + # SlurmCtlSecurityGroups: + # SecurityGroupName: sg-xxxxxxxxxxxxxxxxx + + SlurmNodeAmis: + instance_type: + x86_64: m5.large + arm64: m6g.large + + # Customized AMIs with file system mounts, packages, etc. configured. + # If these aren't defined then the generic base AMIs are used. + # Example in the comment below is the AWS FPGA Developer AMI + #BaseAmis: + # us-east-1: + # Amazon: {2: {x86_64: {ImageId: ami-0efdec76678df9a64, RootDeviceSize: '+5'}}} + # CentOS: {7: {x86_64: {ImageId: ami-02155c6289e76719a, RootDeviceSize: 90}}} + + # External security groups that should be able to use the cluster + # SubmitterSecurityGroupIds: + # soca-ComputeNodeSG: sg-xxxxxxxxxxxxxxxxx + + # SubmitterInstanceTags: + # 'soca:ClusterId': ['soca-xyz'] + + # InstanceConfig: + # Configure the instances used by the cluster + # A partition will be created for each combination of Base OS, Architecture, and Spot + # + # UseSpot: + # Create both on-demand and spot nodes + # Default: true + # DefaultPartition: + # By default this will be the first OS/Architecture listed in BaseOsArchitecture. + # Add '_spot' to the end to make spot the default purchase option. + # NodesPerInstanceType: + # The number of nodes that will be defined for each instance type. + # Include*/Exclude*: + # Instance families and types are regular expressions with implicit '^' and '$' at the begining and end. + # Exclude patterns are processed first and take precedence over any includes. + # A empty list is the same as '.*'. + # MaxSizeOnly: If MaxSizeOnly is True then only the largest instance type in + # a family will be included unless specific instance types are included. 
+ # Default: false + InstanceConfig: + UseSpot: true + DefaultPartition: AlmaLinux_8_arm64_spot + NodesPerInstanceType: 10 + BaseOsArchitecture: + AlmaLinux: {8: [x86_64, arm64]} + # Amazon: {2: [x86_64, arm64]} + CentOS: + 7: [x86_64] + # Amazon: {2: [x86_64, arm64]} + # RedHat: + # 7: [x86_64] + # 8: [x86_64, arm64] + # Rocky: {8: [x86_64, arm64]} + Include: + MaxSizeOnly: false + InstanceFamilies: + - t3 + - t4g + InstanceTypes: [] + Exclude: + InstanceFamilies: [] + InstanceTypes: + - '.+\.(micro|nano)' # Not enough memory + - '.*\.metal' + AZs: + - Priority: 1 + #Region: us-east-1 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 + - Priority: 2 + #Region: us-east-1 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 + - Priority: 3 + #Region: us-east-1 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 + + # ElasticSearch: + # Configure the ElasticSearch/OpenSearch domain used by the slurm cluster + # If not specified then won't be created or used by the cluster. + # master_nodes: Defaults to 0 + # data_nodes: Must be a multiple of number_of_azs + # ElasticSearch: + # ebs_volume_size: 20 + # ebs_volume_type: GP2 + # enable_version_upgrade: False + # number_of_azs: 2 + # master_nodes: 3 + # master_node_instance_type: m5.large.search + # data_nodes: 2 + # data_node_instance_type: m5.large.search + # warm_nodes: 0 + # warm_instance_type: ultrawarm.medium.search + + # JobCompType: + # Values: + # jobcomp/none + # jobcomp/elasticsearch + # jobcomp/filetxt + JobCompType: jobcomp/filetxt + # + # JobCompLoc: + # Used with jobcomp/elasticsearch + # A complete URL endpoint with format ://_doc + #JobCompLoc: http://{{EsDomain}}.{{Region}}.es.amazonaws.com/slurm/_doc + + # Configure your Storage options below + # @todo support fsxn, test if efs will gate scaling of the cluster + storage: + # mount_path: + # Default is /opt/slurm/{{cluster_name}} + #mount_path: "" + provider: "efs" # efs or lustre + #kms_key_arn: + removal_policy : "DESTROY" # DESTROY, RETAIN, SNAPSHOT. Choices: RETAIN will preserve the EFS even if you delete the stack. Any other value will delete EFS if you delete the CFN stack + efs: + use_efs_helper: false + throughput_mode: "BURSTING" # Choices: BURSTING, PROVISIONED + # provisioned_throughput_per_second: 1 # In MiB/s. Minimum value of 1 + performance_mode: "GENERAL_PURPOSE" # Choices: GENERAL_PURPOSE, MAX_IO + encrypted: True # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-efs-filesystem.html#cfn-efs-filesystem-encrypted + lifecycle_policy: "AFTER_30_DAYS" # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-efs-filesystem-lifecyclepolicy.html + lustre: + deployment_type: "SCRATCH_2" # Allowed values: PERSISTENT_1 | SCRATCH_1 | SCRATCH_2. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-deploymenttype + drive_cache_type: "NONE" # Allowed values: NONE | READ. Required when storage_type is HDD. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-drivecachetype + per_unit_storage_throughput: 50 # Allowed values: 12, 40 for HDD, 50, 100, 200 for SSD. Required for the PERSISTENT_1 deployment_type. 
https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-perunitstoragethroughput + storage_capacity: 1200 # For SCRATCH_2 and PERSISTENT_1 types, valid values are 1,200, 2,400, then continuing in increments of 2,400 GiB. For SCRATCH_1 deployment types, valid values are 1,200, 2,400, 3,600, then continuing in increments of 3,600 GiB. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-fsx-filesystem.html#cfn-fsx-filesystem-storagecapacity + storage_type: "SSD" # Allowed values: SSD or HDD. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-fsx-filesystem.html#cfn-fsx-filesystem-storagetype + + # ExtraMounts + # Additional mounts for compute nodes + # This examle shows SOCA EFS file systems. + # This is required so the compute node as the same file structure as the remote desktops. + #ExtraMounts: + # - dest: /apps + # src: fs-xxxxxxxx.efs.us-east-1.amazonaws.com:/ + # type: nfs4 + # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport + # - dest: /data + # src: fs-xxxxxxxx.efs.us-east-1.amazonaws.com:/ + # type: nfs4 + # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport diff --git a/source/resources/lambdas/CreateComputeNodeSG/CreateComputeNodeSG.py b/source/resources/lambdas/CreateComputeNodeSG/CreateComputeNodeSG.py new file mode 100644 index 00000000..8f2ae9f6 --- /dev/null +++ b/source/resources/lambdas/CreateComputeNodeSG/CreateComputeNodeSG.py @@ -0,0 +1,258 @@ +""" +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +SPDX-License-Identifier: MIT-0 + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +""" + +''' +Create/delete compute node security group in another region. +''' +import cfnresponse +import boto3 +import json +import logging +logging.getLogger().setLevel(logging.INFO) + +def get_security_group_id(ec2_client, security_group_name: str) -> str: + security_group_id = ec2_client.describe_security_groups( + Filters = [ + { + 'Name': 'group-name', + 'Values': [security_group_name] + } + ] + )['SecurityGroups'][0]['GroupId'] + return security_group_id + +def lambda_handler(event, context): + try: + logging.info(f"event:\n{json.dumps(event, indent=4)}") + properties = event['ResourceProperties'] + required_properties = [ + 'Region', + 'VpcId', + 'SecurityGroupName', + 'Description', + 'ControllerCIDR', + 'CIDRs', + 'StackName' + ] + error_message = "" + for property in required_properties: + try: + value = properties[property] + except: + error_message += f"Missing {property} property. 
" + if error_message: + raise KeyError(error_message) + + ec2_client = boto3.client('ec2', region_name=properties['Region']) + requestType = event['RequestType'] + + cidrs = properties['CIDRs'] + + if requestType in ['Update', 'Delete']: + security_group_id = get_security_group_id(ec2_client, properties['SecurityGroupName']) + + if requestType == 'Update': + logging.info(f"Updating {security_group_id}") + # Delete existing rules and recreate them + logging.info(f"Deleting rules") + security_group_rules = ec2_client.describe_security_group_rules( + Filters = [{'Name': 'group-id', 'Values': [security_group_id]}] + )['SecurityGroupRules'] + logging.info(f"{len(security_group_rules)} security_group_rules:\n{json.dumps(security_group_rules, indent=4)}") + ingress_rule_ids = [] + egress_rule_ids = [] + for security_group_rule in security_group_rules: + security_group_rule_id = security_group_rule['SecurityGroupRuleId'] + if security_group_rule['IsEgress']: + logging.info(f"Deleting ingress rule: {security_group_rule_id}") + egress_rule_ids.append(security_group_rule_id) + else: + logging.info(f"Deleting egress rule: {security_group_rule_id}") + ingress_rule_ids.append(security_group_rule_id) + if ingress_rule_ids: + ec2_client.revoke_security_group_ingress( + GroupId = security_group_id, + SecurityGroupRuleIds=ingress_rule_ids + ) + if egress_rule_ids: + ec2_client.revoke_security_group_egress( + GroupId = security_group_id, + SecurityGroupRuleIds=egress_rule_ids + ) + + if requestType == 'Create': + security_group_id = ec2_client.create_security_group( + GroupName = properties['SecurityGroupName'], + Description = properties['Description'], + VpcId = properties['VpcId'], + TagSpecifications = [ + { + 'ResourceType': 'security-group', + 'Tags': [ + {'Key': 'Name', 'Value': f"{properties['SecurityGroupName']}"}, + {'Key': 'cloudformation:stack-name', 'Value': f"{properties['StackName']}"} + ] + } + ] + )['GroupId'] + logging.info(f"Created {security_group_id}") + + if requestType in ['Create', 'Update']: + logging.info(f"Adding security group rules") + ec2_client.authorize_security_group_ingress( + GroupId = security_group_id, + IpPermissions = [ + { + 'IpProtocol': 'tcp', + 'FromPort': 6818, + 'ToPort': 6818, + 'IpRanges': [{'CidrIp': properties['ControllerCIDR'], 'Description': f"{properties['StackName']}-SlurmCtl to {properties['Region']}-SlurmNode"}] + }, + ] + ) + ec2_client.authorize_security_group_egress( + GroupId = security_group_id, + IpPermissions = [ + { + 'IpProtocol': 'tcp', + 'FromPort': 80, + 'ToPort': 80, + 'IpRanges': [{'CidrIp': '0.0.0.0/0', 'Description': f"{properties['Region']}-SlurmNode to Internet"}] + }, + { + 'IpProtocol': 'tcp', + 'FromPort': 111, + 'ToPort': 111, + 'IpRanges': [{'CidrIp': properties['ControllerCIDR'], 'Description': f"{properties['Region']}-SlurmNode to {properties['StackName']}-ZFS"}] + }, + { + 'IpProtocol': 'udp', + 'FromPort': 111, + 'ToPort': 111, + 'IpRanges': [{'CidrIp': properties['ControllerCIDR'], 'Description': f"{properties['Region']}-SlurmNode to {properties['StackName']}-ZFS"}] + }, + { + 'IpProtocol': 'tcp', + 'FromPort': 443, + 'ToPort': 443, + 'IpRanges': [{'CidrIp': '0.0.0.0/0', 'Description': f"{properties['Region']}-SlurmNode to Internet"}] + }, + { + 'IpProtocol': 'tcp', + 'FromPort': 2049, + 'ToPort': 2049, + 'IpRanges': [{'CidrIp': properties['ControllerCIDR'], 'Description': f"{properties['Region']}-SlurmNode to {properties['StackName']}-ZFS"}] + }, + { + 'IpProtocol': 'udp', + 'FromPort': 2049, + 'ToPort': 2049, + 'IpRanges': 
[{'CidrIp': properties['ControllerCIDR'], 'Description': f"{properties['Region']}-SlurmNode to {properties['StackName']}-ZFS"}] + }, + { + 'IpProtocol': 'tcp', + 'FromPort': 6817, + 'ToPort': 6817, + 'IpRanges': [{'CidrIp': properties['ControllerCIDR'], 'Description': f"{properties['Region']}-SlurmNode to {properties['StackName']}-SlurmCtl"}] + }, + { + 'IpProtocol': 'tcp', + 'FromPort': 20001, + 'ToPort': 20003, + 'IpRanges': [{'CidrIp': properties['ControllerCIDR'], 'Description': f"{properties['Region']}-SlurmNode to {properties['StackName']}-ZFS"}] + }, + { + 'IpProtocol': 'udp', + 'FromPort': 20001, + 'ToPort': 20003, + 'IpRanges': [{'CidrIp': properties['ControllerCIDR'], 'Description': f"{properties['Region']}-SlurmNode to {properties['StackName']}-ZFS"}] + }, + ] + ) + for compute_region, cidr in cidrs.items(): + if compute_region == properties['Region']: + ec2_client.authorize_security_group_ingress( + GroupId = security_group_id, + IpPermissions = [ + { + 'IpProtocol': 'tcp', + 'FromPort': 1024, + 'ToPort': 65535, + 'UserIdGroupPairs': [{'GroupId': security_group_id, 'Description': f"{compute_region}-SlurmNode to {compute_region}-SlurmNode"}] + }, + ] + ) + ec2_client.authorize_security_group_egress( + GroupId = security_group_id, + IpPermissions = [ + { + 'IpProtocol': 'tcp', + 'FromPort': 1024, + 'ToPort': 65535, + 'UserIdGroupPairs': [{'GroupId': security_group_id, 'Description': f"{properties['Region']}-SlurmNode to {compute_region}-SlurmNode"}] + }, + { + 'IpProtocol': 'tcp', + 'FromPort': 6818, + 'ToPort': 6818, + 'UserIdGroupPairs': [{'GroupId': security_group_id, 'Description': f"{properties['Region']}-SlurmNode to {compute_region}-SlurmNode"}] + }, + ] + ) + else: + ec2_client.authorize_security_group_ingress( + GroupId = security_group_id, + IpPermissions = [ + { + 'IpProtocol': 'tcp', + 'FromPort': 1024, + 'ToPort': 65535, + 'IpRanges': [{'CidrIp': cidr, 'Description': f"{compute_region}-SlurmNode to {properties['Region']}-SlurmNode"}] + }, + ] + ) + ec2_client.authorize_security_group_egress( + GroupId = security_group_id, + IpPermissions = [ + { + 'IpProtocol': 'tcp', + 'FromPort': 1024, + 'ToPort': 65535, + 'IpRanges': [{'CidrIp': cidr, 'Description': f"{properties['Region']}-SlurmNode to {compute_region}-SlurmNode"}] + }, + { + 'IpProtocol': 'tcp', + 'FromPort': 6818, + 'ToPort': 6818, + 'IpRanges': [{'CidrIp': cidr, 'Description': f"{properties['Region']}-SlurmNode to {compute_region}-SlurmNode"}] + }, + ] + ) + + if requestType == 'Delete': + logging.info(f"Deleting {security_group_id}") + ec2_client.delete_security_group( + GroupId = security_group_id + ) + + except Exception as e: + logging.exception(str(e)) + cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, str(e)) + raise + + cfnresponse.send(event, context, cfnresponse.SUCCESS, {'GroupId': security_group_id}, f"{security_group_id}") diff --git a/source/resources/lambdas/CreateComputeNodeSG/cfnresponse.py b/source/resources/lambdas/CreateComputeNodeSG/cfnresponse.py new file mode 120000 index 00000000..09400dfc --- /dev/null +++ b/source/resources/lambdas/CreateComputeNodeSG/cfnresponse.py @@ -0,0 +1 @@ +../cfnresponse.py \ No newline at end of file diff --git a/source/resources/lambdas/DeconfigureCluster/DeconfigureCluster.py b/source/resources/lambdas/DeconfigureCluster/DeconfigureCluster.py index 7cfe6df5..bd06e959 100644 --- a/source/resources/lambdas/DeconfigureCluster/DeconfigureCluster.py +++ b/source/resources/lambdas/DeconfigureCluster/DeconfigureCluster.py @@ -86,5 +86,6 @@ 
def lambda_handler(event, context): except Exception as e: logging.exception(str(e)) cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, str(e)) + raise cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, "") diff --git a/source/resources/lambdas/DnsLookup/DnsLookup.py b/source/resources/lambdas/DnsLookup/DnsLookup.py new file mode 100644 index 00000000..a9605972 --- /dev/null +++ b/source/resources/lambdas/DnsLookup/DnsLookup.py @@ -0,0 +1,61 @@ +""" +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +SPDX-License-Identifier: MIT-0 + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +""" + +''' +Do DNS lookup and return the IP address. +''' +import cfnresponse +import json +import logging +from socket import getaddrinfo, SOCK_STREAM + +logging.getLogger().setLevel(logging.INFO) + +def lambda_handler(event, context): + try: + logging.info(f"event:\n{json.dumps(event, indent=4)}") + properties = event['ResourceProperties'] + required_properties = ['FQDN'] + error_message = "" + for property in required_properties: + try: + value = properties[property] + except: + error_message += f"Missing {property} property. 
" + if error_message: + raise KeyError(error_message) + + requestType = event['RequestType'] + if requestType == 'Delete': + cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, "") + return + + fqdn = properties['FQDN'] + ip_address_tuples = getaddrinfo(host=fqdn, port=None, type=SOCK_STREAM) + logging.info(f"Found {len(ip_address_tuples)} ip addresses") + for ip_address_tuple in ip_address_tuples: + logging.info(f"ip_address_tuple: {ip_address_tuple}") + ip_address = ip_address_tuples[0][4][0] + logging.info(f"ip_address: {ip_address}") + + except Exception as e: + logging.exception(str(e)) + cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, str(e)) + raise + + cfnresponse.send(event, context, cfnresponse.SUCCESS, {'IpAddress': ip_address}, f"{ip_address}") diff --git a/source/resources/lambdas/DnsLookup/cfnresponse.py b/source/resources/lambdas/DnsLookup/cfnresponse.py new file mode 120000 index 00000000..09400dfc --- /dev/null +++ b/source/resources/lambdas/DnsLookup/cfnresponse.py @@ -0,0 +1 @@ +../cfnresponse.py \ No newline at end of file diff --git a/source/resources/lambdas/GetOntapSvmDNSName/GetOntapSvmDNSName.py b/source/resources/lambdas/GetOntapSvmDNSName/GetOntapSvmDNSName.py index 9547d988..af0c298e 100644 --- a/source/resources/lambdas/GetOntapSvmDNSName/GetOntapSvmDNSName.py +++ b/source/resources/lambdas/GetOntapSvmDNSName/GetOntapSvmDNSName.py @@ -55,5 +55,6 @@ def lambda_handler(event, context): except Exception as e: logging.exception(str(e)) cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, str(e)) + raise cfnresponse.send(event, context, cfnresponse.SUCCESS, {'DNSName': dns_name}, f"{dns_name}") diff --git a/source/resources/lambdas/Route53HostedZoneAddVpc/Route53HostedZoneAddVpc.py b/source/resources/lambdas/Route53HostedZoneAddVpc/Route53HostedZoneAddVpc.py new file mode 100644 index 00000000..66bee4bf --- /dev/null +++ b/source/resources/lambdas/Route53HostedZoneAddVpc/Route53HostedZoneAddVpc.py @@ -0,0 +1,75 @@ +""" +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +SPDX-License-Identifier: MIT-0 + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +""" + +''' +Create/delete route53 zone in another region. +''' + +import boto3 +import cfnresponse +import logging + +logging.getLogger().setLevel(logging.INFO) + +def lambda_handler(event, context): + try: + logging.info(f"event:\n{json.dumps(event, indent=4)}") + properties = event['ResourceProperties'] + required_properties = ['HostedZoneId', 'VpcId', 'VpcRegion'] + error_message = "" + for property in required_properties: + try: + value = properties[property] + except: + error_message += "Missing {} property. 
".format(property) + if error_message: + raise KeyError(error_message) + + requestType = event['RequestType'] + + route53_client = boto3.client('route53') + + if requestType in ['Update', 'Delete']: + try: + route53_client.disassociate_vpc_from_hosted_zone( + HostedZoneId = properties['HostedZoneId'], + VPC = { + 'VPCRegion': properties['VpcRegion'], + 'VPCId': properties['VpcId'], + }, + HostedZoneConfig = {'PrivateZone': True} + ) + except: + pass + + if requestType in ['Create', 'Update']: + route53_client.associate_vpc_with_hosted_zone( + HostedZoneId = properties['HostedZoneId'], + VPC = { + 'VPCRegion': properties['VpcRegion'], + 'VPCId': properties['VpcId'], + }, + HostedZoneConfig = {'PrivateZone': True} + ) + + except Exception as e: + logging.exception(str(e)) + cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, str(e)) + raise + + cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, "") diff --git a/source/resources/lambdas/Route53HostedZoneAddVpc/cfnresponse.py b/source/resources/lambdas/Route53HostedZoneAddVpc/cfnresponse.py new file mode 120000 index 00000000..09400dfc --- /dev/null +++ b/source/resources/lambdas/Route53HostedZoneAddVpc/cfnresponse.py @@ -0,0 +1 @@ +../cfnresponse.py \ No newline at end of file diff --git a/source/resources/lambdas/UpdateDns/UpdateDns.py b/source/resources/lambdas/UpdateDns/UpdateDns.py index ca05c53b..cbd48f39 100644 --- a/source/resources/lambdas/UpdateDns/UpdateDns.py +++ b/source/resources/lambdas/UpdateDns/UpdateDns.py @@ -70,5 +70,6 @@ def lambda_handler(event, context): except Exception as e: logging.exception(str(e)) cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, str(e)) + raise cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, "{} {}.{} {}".format(properties['Type'], properties['Hostname'], properties['Domain'], properties['Value'])) diff --git a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfo.py b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfo.py deleted file mode 100755 index 8b486a0e..00000000 --- a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfo.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python3 -""" -Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -SPDX-License-Identifier: MIT-0 - -Permission is hereby granted, free of charge, to any person obtaining a copy of this -software and associated documentation files (the "Software"), to deal in the Software -without restriction, including without limitation the rights to use, copy, modify, -merge, publish, distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A -PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-""" - -import boto3 -from botocore.exceptions import ClientError - -class EC2InstanceTypeInfo: - - def __init__(self): - self.ec2 = boto3.client('ec2') - self.get_instance_type_info() - return - - def get_instance_type_info(self): - self.instance_type_info = {} - self.instance_family_info = {} - describe_instance_types_paginator = self.ec2.get_paginator('describe_instance_types') - for result in describe_instance_types_paginator.paginate(**{'Filters': [{'Name': 'current-generation', 'Values': ['true']}]}): - for instance_type_info in result['InstanceTypes']: - instanceType = instance_type_info['InstanceType'] - self.instance_type_info[instanceType] = {} - self.instance_type_info[instanceType]['full'] = instance_type_info - architecture = instance_type_info['ProcessorInfo']['SupportedArchitectures'][0] - self.instance_type_info[instanceType]['architecture'] = architecture - self.instance_type_info[instanceType]['SustainedClockSpeedInGhz'] = instance_type_info['ProcessorInfo']['SustainedClockSpeedInGhz'] - if 'ValidThreadsPerCore' in instance_type_info['VCpuInfo']: - self.instance_type_info[instanceType]['ThreadsPerCore'] = max(instance_type_info['VCpuInfo']['ValidThreadsPerCore']) - else: - if architecture == 'x86_64': - self.instance_type_info[instanceType]['ThreadsPerCore'] = 2 - else: - self.instance_type_info[instanceType]['ThreadsPerCore'] = 1 - if 'ValidCores' in instance_type_info['VCpuInfo']: - self.instance_type_info[instanceType]['CoreCount'] = max(instance_type_info['VCpuInfo']['ValidCores']) - else: - self.instance_type_info[instanceType]['CoreCount'] = instance_type_info['VCpuInfo']['DefaultVCpus']/self.instance_type_info[instanceType]['ThreadsPerCore'] - self.instance_type_info[instanceType]['MemoryInMiB'] = instance_type_info['MemoryInfo']['SizeInMiB'] - self.instance_type_info[instanceType]['SSDCount'] = instance_type_info.get('InstanceStorageInfo', {'Disks': [{'Count': 0}]})['Disks'][0]['Count'] - self.instance_type_info[instanceType]['SSDTotalSizeGB'] = instance_type_info.get('InstanceStorageInfo', {'TotalSizeInGB': 0})['TotalSizeInGB'] - - (instance_family, instance_size) = instanceType.split(r'\.') - if instance_family not in self.instance_family_info: - self.instance_family_info[instance_family] = {} - self.instance_family_info[instance_family]['instance_types'] = [instanceType,] - self.instance_family_info[instance_family]['MaxInstanceType'] = instanceType - self.instance_family_info[instance_family]['MaxInstanceSize'] = instance_size - self.instance_family_info[instance_family]['MaxCoreCount'] = self.instance_type_info[instanceType]['CoreCount'] - else: - self.instance_family_info[instance_family]['instance_types'].append(instanceType) - if self.instance_type_info[instanceType]['CoreCount'] > self.instance_family_info[instance_family]['MaxCoreCount']: - self.instance_family_info[instance_family]['MaxInstanceType'] = instanceType - self.instance_family_info[instance_family]['MaxInstanceSize'] = instance_size - self.instance_family_info[instance_family]['MaxCoreCount'] = self.instance_type_info[instanceType]['CoreCount'] - - def get_instance_family(self, instanceType): - instance_family = instanceType.split(r'\.')[0] - return instance_family - - def get_instance_size(self, instanceType): - instance_size = instanceType.split(r'\.')[1] - return instance_size - - def get_instance_families(self): - return sorted(self.instance_type_info.keys()) - - def get_max_instance_type(self, instance_family): - return self.instance_family_info[instance_family]['MaxInstanceType'] - - 
def get_instance_types(self): - return sorted(self.instance_type_info.keys()) - - def get_architecture(self, instance_type): - return self.instance_type_info[instance_type]['architecture'] - - def get_SustainedClockSpeedInGhz(self, instance_type): - return self.instance_type_info[instance_type]['SustainedClockSpeedInGhz'] - - def get_CoreCount(self, instance_type): - return self.instance_type_info[instance_type]['CoreCount'] - - def get_ThreadsPerCore(self, instance_type): - return self.instance_type_info[instance_type]['ThreadsPerCore'] - - def get_MemoryInMiB(self, instance_type): - return self.instance_type_info[instance_type]['MemoryInMiB'] - - def get_SSDCount(self, instance_type): - return self.instance_type_info[instance_type]['SSDCount'] - - def get_SSDTotalSizeGB(self, instance_type): - return self.instance_type_info[instance_type]['SSDTotalSizeGB'] - - def get_full_info(self, instance_type): - return self.instance_type_info[instance_type]['full'] diff --git a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py new file mode 100755 index 00000000..e87bd965 --- /dev/null +++ b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python3 +""" +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +SPDX-License-Identifier: MIT-0 + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
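+
+Example usage (a minimal sketch; the region list and file paths below are
+illustrative, and valid AWS credentials are assumed):
+
+    from EC2InstanceTypeInfoPkg.EC2InstanceTypeInfo import EC2InstanceTypeInfo
+
+    # Gather instance type and pricing info for two regions, caching the
+    # results in a JSON file so that reruns can skip regions already fetched.
+    ec2_info = EC2InstanceTypeInfo(
+        ['us-east-1', 'us-west-2'],
+        json_filename='/tmp/instance_type_info.json')
+
+    # ec2_info.instance_type_info[region][instance_type] holds CoreCount,
+    # MemoryInMiB, pricing['OnDemand'], pricing['spot'][az], etc.
+    ec2_info.print_csv('/tmp/instance_type_info.csv')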
+""" + +import boto3 +from botocore.exceptions import ClientError +import csv +from datetime import datetime +import json +import logging +from logging import error, info, warning, handlers +import os +from os import environ, path +from pkg_resources import resource_filename +import pprint +from EC2InstanceTypeInfoPkg.retry_boto3_throttling import retry_boto3_throttling +import sys + +logger = logging.getLogger(__file__) +logger_formatter = logging.Formatter('%(levelname)s:%(asctime)s: %(message)s') +logger_streamHandler = logging.StreamHandler() +logger_streamHandler.setFormatter(logger_formatter) +logger.addHandler(logger_streamHandler) +logger.setLevel(logging.INFO) +logger.propagate = False + +pp = pprint.PrettyPrinter(indent=4) + +class EC2InstanceTypeInfo: + + def __init__(self, regions, json_filename=None, debug=False): + if debug: + logger.setLevel(logging.DEBUG) + + if not regions: + # Get a list of all AWS regions + ec2_client = boto3.client('ec2') + try: + regions = sorted([region["RegionName"] for region in ec2_client.describe_regions()["Regions"]]) + except ClientError as err: + logger.error(f"Unable to list all AWS regions. Make sure you have set your IAM credentials. {err}") + sys.exit(1) + self.regions = regions + + logger.info(f"Getting EC2 pricing info for following regions:\n{pp.pformat(self.regions)}") + + if json_filename: + if path.exists(json_filename): + logger.info(f"Reading cached info from {json_filename}") + self.instance_type_info = json.loads(open(json_filename, 'r').read()) + else: + logger.info(f"{json_filename} doesn't exist so cannot be read") + if not json_filename or not path.exists(json_filename): + self.instance_type_info = {} + + # Endpoints only supported in 2 regions: https://docs.aws.amazon.com/cli/latest/reference/pricing/index.html + self.pricing_client = boto3.client('pricing', region_name='us-east-1') + + for region in self.regions: + if region in self.instance_type_info and json_filename: + logger.info(f'Using EC2 instance info from {json_filename} for {region}') + continue + region_name = self.get_region_name(region) + logger.info(f'Getting EC2 instance info for {region} ({region_name})') + self.ec2 = boto3.client('ec2', region_name=region) + self.get_instance_type_info(region) + + # Save json after each successful region to speed up reruns + if json_filename: + logger.info(f"Saving instance type info for {region} in {json_filename}") + fh = open(json_filename, 'w') + print(json.dumps(self.instance_type_info, indent=4, sort_keys=True), file=fh) + fh.close() + + return + + def get_instance_type_info(self, region): + region_name = self.get_region_name(region) + logger.debug(f"region_name={region_name}") + instance_type_info = {} + self.instance_type_info[region] = instance_type_info + describe_instance_types_paginator = self.ec2.get_paginator('describe_instance_types') + for result in describe_instance_types_paginator.paginate(**{'Filters': [{'Name': 'current-generation', 'Values': ['true']}]}): + for instanceTypeDict in result['InstanceTypes']: + #logger.debug(pp.pformat("instanceTypeDict:\n%s" % (pp.pformat(instanceTypeDict)))) + instanceType = instanceTypeDict['InstanceType'] + logger.debug(pp.pformat("instanceType: %s" % (instanceType))) + instance_type_info[instanceType] = {} + #instance_type_info[instanceType]['full'] = instanceTypeDict + architecture = instanceTypeDict['ProcessorInfo']['SupportedArchitectures'][0] + instance_type_info[instanceType]['architecture'] = architecture + 
instance_type_info[instanceType]['SustainedClockSpeedInGhz'] = instanceTypeDict['ProcessorInfo']['SustainedClockSpeedInGhz'] + if 'ValidThreadsPerCore' in instanceTypeDict['VCpuInfo']: + instance_type_info[instanceType]['ThreadsPerCore'] = max(instanceTypeDict['VCpuInfo']['ValidThreadsPerCore']) + else: + if architecture == 'x86_64': + instance_type_info[instanceType]['ThreadsPerCore'] = 2 + else: + instance_type_info[instanceType]['ThreadsPerCore'] = 1 + if 'ValidCores' in instanceTypeDict['VCpuInfo']: + instance_type_info[instanceType]['CoreCount'] = max(instanceTypeDict['VCpuInfo']['ValidCores']) + else: + instance_type_info[instanceType]['CoreCount'] = instanceTypeDict['VCpuInfo']['DefaultVCpus']/instance_type_info[instanceType]['ThreadsPerCore'] + instance_type_info[instanceType]['MemoryInMiB'] = instanceTypeDict['MemoryInfo']['SizeInMiB'] + instance_type_info[instanceType]['SSDCount'] = instanceTypeDict.get('InstanceStorageInfo', {'Disks': [{'Count': 0}]})['Disks'][0]['Count'] + instance_type_info[instanceType]['SSDTotalSizeGB'] = instanceTypeDict.get('InstanceStorageInfo', {'TotalSizeInGB': 0})['TotalSizeInGB'] + instance_type_info[instanceType]['Hypervisor'] = instanceTypeDict.get('Hypervisor', '') + instance_type_info[instanceType]['NetworkPerformance'] = instanceTypeDict['NetworkInfo']['NetworkPerformance'] + if 'GpuInfo' in instanceTypeDict and 'Gpus' in instanceTypeDict['GpuInfo']: + instance_type_info[instanceType]['GpuCount'] = int(instanceTypeDict['GpuInfo']['Gpus'][0].get('Count', 0)) + instance_type_info[instanceType]['GpuManufacturer'] = instanceTypeDict['GpuInfo']['Gpus'][0].get('Manufacturer', "") + instance_type_info[instanceType]['GpuName'] = instanceTypeDict['GpuInfo']['Gpus'][0].get('Name', "") + instance_type_info[instanceType]['GpuMemoryMiB'] = instanceTypeDict['GpuInfo']['Gpus'][0].get('MemoryInfo', {}).get('SizeInMiB', 0) + instance_type_info[instanceType]['GpuTotalMemoryMiB'] = instanceTypeDict['GpuInfo'].get('TotalGpuMemoryInMiB', 0) + + # (instance_family, instance_size) = instanceType.split('.') + # if instance_family not in instance_family_info: + # instance_family_info[instance_family] = {} + # instance_family_info[instance_family]['instance_types'] = [instanceType,] + # instance_family_info[instance_family]['MaxInstanceType'] = instanceType + # instance_family_info[instance_family]['MaxInstanceSize'] = instance_size + # instance_family_info[instance_family]['MaxCoreCount'] = instance_type_info[instanceType]['CoreCount'] + # else: + # instance_family_info[instance_family]['instance_types'].append(instanceType) + # if instance_type_info[instanceType]['CoreCount'] > instance_family_info[instance_family]['MaxCoreCount']: + # instance_family_info[instance_family]['MaxInstanceType'] = instanceType + # instance_family_info[instance_family]['MaxInstanceSize'] = instance_size + # instance_family_info[instance_family]['MaxCoreCount'] = instance_type_info[instanceType]['CoreCount'] + + logger.debug("Getting pricing info for instances") + instance_types = instance_type_info.keys() + logger.debug("{} instance types in {}".format(len(instance_types), region)) + + for instanceType in sorted(instance_types): + logger.debug("instanceType: {}".format(instanceType)) + os = 'Linux' + pricing_filter = [ + {'Field': 'ServiceCode', 'Value': 'AmazonEC2', 'Type': 'TERM_MATCH'}, + {'Field': 'instanceType', 'Value': instanceType, 'Type': 'TERM_MATCH'}, + {'Field': 'tenancy', 'Value': 'shared', 'Type': 'TERM_MATCH'}, + {'Field': 'preInstalledSw', 'Value': 'NA', 'Type': 
'TERM_MATCH'}, + {'Field': 'location', 'Value': region_name, 'Type': 'TERM_MATCH'}, + {'Field': 'operatingSystem', 'Value': os, 'Type': 'TERM_MATCH'}, + {'Field': 'capacitystatus', 'Value': 'Used', 'Type': 'TERM_MATCH'}, + ] + priceLists = self.get_products(pricing_filter) + if len(priceLists) == 0: + logger.warning(f"No pricelist for {instanceType} {region} ({region_name}). Instance type may not be available in this region.") + continue + if len(priceLists) > 1: + raise RuntimeError("Number of PriceLists > 1 for {}".format(instanceType)) + + instance_type_info[instanceType]['pricing'] = {} + instance_type_info[instanceType]['pricing']['Reserved'] = {} + instance_type_info[instanceType]['pricing']['spot'] = {} + on_demand_price = 0 + ri_min_price = 0 + ri_min_price_terms = '' + ri_max_price = 0 + ri_max_price_terms = '' + + # instance_type_info[instanceType]['priceLists'] = [] + for priceListJson in priceLists: + priceList = json.loads(priceListJson) + logger.debug("pricelist:\n{}".format(pp.pformat(priceList))) + #instance_type_info[instanceType]['priceLists'].append(priceList) + if 'physicalProcessor' in priceList['product']['attributes']: + physicalProcessor = priceList['product']['attributes']['physicalProcessor'] + for term, termInfo in priceList['terms'].items(): + if term == 'OnDemand': + for rateCodeKey, rateCode in termInfo.items(): + for dimensionKey, priceDimension in rateCode['priceDimensions'].items(): + unit = priceDimension['unit'] + if unit != 'Hrs': + raise RuntimeError("Unknown pricing unit: {}".format(unit)) + currency = list(priceDimension['pricePerUnit'])[0] + if currency != 'USD': + raise RuntimeError("Unknown currency: {}".format(currency)) + on_demand_price = float(priceDimension['pricePerUnit']['USD']) + elif term == 'Reserved': + for ri_info_key, ri_info in termInfo.items(): + attributes = ri_info['termAttributes'] + ri_length = attributes['LeaseContractLength'] + ri_class = attributes['OfferingClass'] + ri_PurchaseOption = attributes['PurchaseOption'] + ri_terms = "{} {} {}".format(ri_length, ri_class, ri_PurchaseOption) + ri_length_hours = float(ri_length.split('yr')[0]) * 365 * 24 + ri_price = float(0) + for priceDimensionKey, priceDimension in ri_info['priceDimensions'].items(): + unit = priceDimension['unit'] + pricePerUnit = float(priceDimension['pricePerUnit']['USD']) + if unit == 'Quantity': + ri_price += pricePerUnit / ri_length_hours + elif unit == 'Hrs': + ri_price += pricePerUnit + else: + raise RuntimeError("Invalid reserved instance unit {}".format(unit)) + instance_type_info[instanceType]['pricing']['Reserved'][ri_terms] = ri_price + if ri_price > ri_max_price: + ri_max_price = max(ri_max_price, ri_price) + ri_max_price_terms = ri_terms + if ri_min_price == 0 or ri_price < ri_min_price: + ri_min_price = ri_price + ri_min_price_terms = ri_terms + else: + raise RuntimeError("Invalid term {}".format(term)) + instance_type_info[instanceType]['ri_min_price'] = ri_min_price + instance_type_info[instanceType]['ri_min_price_terms'] = ri_min_price_terms + instance_type_info[instanceType]['ri_max_price'] = ri_max_price + instance_type_info[instanceType]['ri_max_price_terms'] = ri_max_price_terms + instance_type_info[instanceType]['pricing']['OnDemand'] = on_demand_price + instance_type_info[instanceType]['physicalProcessor'] = physicalProcessor + + # Get spot price for each AZ + result = self.ec2.describe_spot_price_history( + InstanceTypes = [instanceType], + Filters = [ + {'Name': 'product-description', 'Values': ['Linux/UNIX']} + ], + StartTime = 
datetime.now() + ) + for spotPriceHistory in result['SpotPriceHistory']: + az = spotPriceHistory['AvailabilityZone'] + spot_price = float(spotPriceHistory['SpotPrice']) + instance_type_info[instanceType]['pricing']['spot'][az] = spot_price + instance_type_info[instanceType]['pricing']['spot']['min'] = min(spot_price, instance_type_info[instanceType]['pricing']['spot'].get('min', 999999999)) + instance_type_info[instanceType]['pricing']['spot']['max'] = max(spot_price, instance_type_info[instanceType]['pricing']['spot'].get('max', 0)) + + + def print_csv(self, filename=""): + if filename: + fh = open(filename, 'w') + else: + fh = sys.stdout + csv_writer = csv.writer(fh, dialect='excel') + + # Get all ri_terms + ri_terms_dict = {} + for region, instance_type_info in self.instance_type_info.items(): + for instanceType in instance_type_info.keys(): + if 'pricing' in instance_type_info[instanceType]: + for ri_term in instance_type_info[instanceType]['pricing']['Reserved'].keys(): + ri_terms_dict[ri_term] = 1 + ri_terms = sorted(ri_terms_dict.keys()) + + column_names = ["Region", "InstanceType", "Architecture", "CoreCount", "RealMemory(MiB)", "ClockSpeed(GHz)", "NetworkPerformance", "SSDCount", "SSDTotalSizeGB", "physicalProcessor", "GPU Count", "GPU", "GPU Memory (MiB)", "ODPrice", "MinSpotPrice", "MaxSpotDiscount", "MaxSpotPrice", "MinSpotDiscount"] + for ri_term in ri_terms: + column_names.append(ri_term) + column_names.append(f"{ri_term} Discount") + csv_writer.writerow(column_names) + for region, instance_type_info in self.instance_type_info.items(): + instance_types = sorted(instance_type_info.keys()) + for instanceType in instance_types: + if 'pricing' not in instance_type_info[instanceType]: + continue + architecture = instance_type_info[instanceType]['architecture'] + coreCount = instance_type_info[instanceType]['CoreCount'] + realMemory = int(int(instance_type_info[instanceType]['MemoryInMiB'])) + clockSpeedInGHz = instance_type_info[instanceType]['SustainedClockSpeedInGhz'] + networkPerformance = instance_type_info[instanceType]['NetworkPerformance'] + ssdCount = instance_type_info[instanceType]['SSDCount'] + SSDTotalSizeGB = instance_type_info[instanceType]['SSDTotalSizeGB'] + gpuCount = instance_type_info[instanceType].get('GpuCount', 0) + gpuManufacturer = instance_type_info[instanceType].get('GpuManufacturer', '') + gpuName = instance_type_info[instanceType].get('GpuName', '') + gpu = gpuManufacturer + " " + gpuName + #gpuMemoryMiB = instance_type_info[instanceType].get('GpuMemoryMiB', 0) + gpuTotalMemoryMiB = instance_type_info[instanceType].get('GpuTotalMemoryMiB', 0) + physicalProcessor = instance_type_info[instanceType].get('physicalProcessor', 'UNKNOWN') + on_demand_price = instance_type_info[instanceType]['pricing']['OnDemand'] + if 'min' in instance_type_info[instanceType]['pricing']['spot']: + min_spot_price = instance_type_info[instanceType]['pricing']['spot']['min'] + max_spot_discount = (on_demand_price - min_spot_price) / on_demand_price + max_spot_price = instance_type_info[instanceType]['pricing']['spot']['max'] + min_spot_discount = (on_demand_price - max_spot_price) / on_demand_price + else: + logger.debug(f"{instanceType} doesn't have spot discounts") + min_spot_price = max_spot_discount = max_spot_price = min_spot_discount = '' + + columns = [region, instanceType, architecture, str(coreCount), str(realMemory), str(clockSpeedInGHz), networkPerformance, str(ssdCount), str(SSDTotalSizeGB), physicalProcessor, str(gpuCount), gpu, str(gpuTotalMemoryMiB), 
str(on_demand_price), str(min_spot_price), str(max_spot_discount), str(max_spot_price), str(min_spot_discount)] + + for ri_term in ri_terms: + if ri_term in instance_type_info[instanceType]['pricing']['Reserved']: + ri_price = instance_type_info[instanceType]['pricing']['Reserved'][ri_term] + columns.append(str(ri_price)) + ri_discount = (on_demand_price - ri_price) / on_demand_price + columns.append(ri_discount) + else: + logger.debug(f"{instanceType} doesn't have {ri_term} RIs") + columns.append('') + columns.append('') + + csv_writer.writerow(columns) + + @staticmethod + def get_instance_family(instanceType): + instance_family = instanceType.split('.')[0] + return instance_family + + @staticmethod + def get_instance_size(instanceType): + instance_size = instanceType.split('.')[1] + return instance_size + + # Translate region code to region name + def get_region_name(self, region_code): + missing_regions = { + 'ap-northeast-3': 'Asia Pacific (Osaka)' + } + endpoint_file = resource_filename('botocore', 'data/endpoints.json') + with open(endpoint_file, 'r') as f: + data = json.load(f) + try: + region_name = data['partitions'][0]['regions'][region_code]['description'] + except KeyError: + if region_code in missing_regions: + return missing_regions[region_code] + logger.exception(f"Couldn't get region name for {region_code}\nendpoint_file: {endpoint_file}\ndata:\n{pp.pformat(data['partitions'][0]['regions'])}") + raise + region_name = region_name.replace('Europe', 'EU') + return region_name + + @retry_boto3_throttling() + def get_products(self, pricing_filter): + priceLists = self.pricing_client.get_products( + ServiceCode='AmazonEC2', Filters=pricing_filter + )['PriceList'] + return priceLists diff --git a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/__init__.py b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/get_ec2_instance_info.py b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/get_ec2_instance_info.py new file mode 100755 index 00000000..c4e6d7ba --- /dev/null +++ b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/get_ec2_instance_info.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 + +import argparse +from botocore.exceptions import NoCredentialsError +from EC2InstanceTypeInfoPkg.EC2InstanceTypeInfo import EC2InstanceTypeInfo +import sys + +if __name__ == '__main__': + try: + parser = argparse.ArgumentParser(description="Get EC2 instance pricing info.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--region", "-r", type=str, default=[], action='append', help="AWS region(s) to get info for.") + parser.add_argument("--input", '-i', type=str, default=None, help="JSON input file. Reads existing info from previous runs. Can speed up rerun if a region failed.") + parser.add_argument("--output-csv", '-o', type=str, default=None, help="CSV output file. 
Default: instance_type_info.csv") + parser.add_argument("--debug", "-d", action='store_const', const=True, default=False, help="Enable debug messages") + args = parser.parse_args() + + if args.input: + print(f"Reading existing instance info from {args.input}") + ec2InstanceTypeInfo = EC2InstanceTypeInfo(args.region, json_filename=args.input, debug=args.debug) + if args.output_csv: + ec2InstanceTypeInfo.print_csv(args.output_csv) + except NoCredentialsError as e: + print('No AWS credentials found') + sys.exit(1) diff --git a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/retry_boto3_throttling.py b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/retry_boto3_throttling.py new file mode 100755 index 00000000..2a9fb028 --- /dev/null +++ b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/retry_boto3_throttling.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 + +from botocore.exceptions import ClientError +from functools import wraps +import logging +from logging import error, info, warning, handlers +import random +import time +import traceback + +logger = logging.getLogger(__file__) + +logger_formatter = logging.Formatter('%(levelname)s:%(asctime)s: %(message)s') +logger_streamHandler = logging.StreamHandler() +logger_streamHandler.setFormatter(logger_formatter) +logger.addHandler(logger_streamHandler) +logger.setLevel(logging.INFO) +#logger.setLevel(logging.DEBUG) + +def retry_boto3_throttling(min_delay = 1, max_delay = 10 * 60, max_cumulative_delay = 12 * 3600, base = 1, logger = logger): + """ + Retry calling the decorated function using a linear or exponential backoff. + + This is to handle EC2 API and resource throttling which uses a token bucket + with a fixed refill rate. Once the bucket is emptied then throttling occurs + until tokens are added. Tokens are added every second so the minimum retry + interval is 1 second up to the specified maximum delay. + + I think I like this one better since it randomly spreads the backoff while + still allowing some short backoffs. + + https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ + + http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/ + original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry + + Decorators described here: + https://docs.python.org/2/whatsnew/2.4.html?highlight=decorator#pep-318-decorators-for-functions-and-methods + + :param min_delay: Minimum delay before retry + :type min_delay: int + + :param max_delay: Maximum delay before retry + :type max_delay: int + + :param max_cumulative_delay: Maximum total time to wait in seconds + :type max_cumulative_delay: int + + :param base: Base for exponential backoff + :type base: int + + :param logger: logger to use. 
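+        Defaults to this module's logger; pass a caller's logger to route the
+        retry/backoff debug messages through the application's logging setup.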
+ :type logger: logging.Logger instance + """ + def deco_retry(f): + + @wraps(f) + def f_retry(*args, **kwargs): + attempt = 0 + cumulative_delay = 0.0 + while (cumulative_delay < max_cumulative_delay): + try: + attempt += 1 + return f(*args, **kwargs) + except ClientError as e: + logging.exception("Caught exception") + if e.response['Error']['Code'] in ['RequestLimitExceeded', 'InternalError', 'ThrottlingException']: + pass + else: + logging.exception("Rethrew exception") + raise e + logger.debug("%s" % (traceback.format_exc())) + logger.debug("attempt=%d" % attempt) + current_max_delay = min(max_delay, base * 2 ** attempt) + logger.debug("delay_range=(%f %f)" % (min_delay, current_max_delay)) + delay = random.uniform(min_delay, current_max_delay) # nosec + logger.debug("cumulative delay=%f max=%d" % (cumulative_delay, max_cumulative_delay)) + logger.debug("Retrying in %f seconds..." % (delay)) + time.sleep(delay) + cumulative_delay += delay + return f(*args, **kwargs) + + return f_retry # true decorator + + return deco_retry diff --git a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmNodeUserData.sh b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmNodeUserData.sh index af524857..f05d4d3f 100644 --- a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmNodeUserData.sh +++ b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmNodeUserData.sh @@ -101,7 +101,9 @@ fi ln -s $logs_dir /var/log/slurm # Selinux must be disable for slurmd to run -setenforce Permissive +if sestatus | grep -i enforcing &> /dev/null; then + setenforce Permissive +fi sed -i 's/^SELINUX=.*/SELINUX=disabled/' /etc/sysconfig/selinux systemctl enable slurmd diff --git a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmPlugin.py b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmPlugin.py index 514fb470..6eb7d150 100755 --- a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmPlugin.py +++ b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmPlugin.py @@ -23,6 +23,7 @@ from botocore.exceptions import ClientError from collections import Counter, defaultdict from datetime import datetime, timedelta, timezone +from EC2InstanceTypeInfoPkg.EC2InstanceTypeInfo import EC2InstanceTypeInfo from functools import wraps import hostlist from isodate import parse_duration @@ -31,6 +32,7 @@ import logging from logging import error, info, warning, handlers import os +from os import environ, path from os.path import dirname, realpath from pkg_resources import resource_filename import pprint @@ -141,9 +143,10 @@ class LaunchInstanceThread(threading.Thread): This is required so that instances can be launched as quickly as possible so that slurm doesn't time out waiting for them to enter service. ''' - def __init__(self, plugin, kwargs): + def __init__(self, plugin, region, kwargs): super(LaunchInstanceThread, self).__init__() self.plugin = plugin + self.region = region self.kwargs = kwargs self.result = None self.e = None @@ -155,6 +158,15 @@ def run(self): self.e = e self.traceback = traceback.format_exc() self.exception_reason = e.response['Error']['Code'] + if self.exception_reason == 'UnauthorizedOperation': + message = e.response['Error']['Message'] + matches = re.match(r'You are not authorized to perform this operation. 
Encoded authorization failure message: (\S+)', message) + if matches: + encoded_message = matches.group(1) + logger.error(f"Encoded message:\n{encoded_message}") + sts_client = boto3.client('sts', region_name=self.region) + decoded_message = json.loads(sts_client.decode_authorization_message(EncodedMessage=encoded_message)['DecodedMessage']) + logger.error(f"decoded_message:\n{json.dumps(decoded_message, indent=4)}") except Exception as e: self.e = e self.traceback = traceback.format_exc() @@ -163,7 +175,7 @@ def run(self): @retry_ec2_throttling() def launch_instance(self): - self.result = self.plugin.ec2.run_instances(**self.kwargs) + self.result = self.plugin.ec2[self.region].run_instances(**self.kwargs) return class SlurmPlugin: @@ -217,28 +229,53 @@ class SlurmPlugin: CW_UNHANDLED_SUSPEND_RESUME_EXCEPTION = 'UnhandledPluginSuspendResumeException' CW_UNHANDLED_TERMINATE_OLD_INSTANCES_EXCEPTION = 'UnhandledPluginTerminateOldInstancesException' - def __init__(self, slurm_config_file=f"/opt/slurm/config/slurm_config.json", slurm_version_file=f"/opt/slurm/config/SlurmVersion.json", region=None): + def __init__(self, slurm_config_file=f"/opt/slurm/config/slurm_config.json", region=None): if slurm_config_file: with open(slurm_config_file, 'r') as fh: self.config = json.load(fh) - os.environ['AWS_DEFAULT_REGION'] = self.config['region'] + environ['AWS_DEFAULT_REGION'] = self.config['region'] else: self.config = {} + + slurm_version_file = self.config.get('SlurmVersionFile', '') if slurm_version_file: with open(slurm_version_file, 'r') as fh: self.config.update(json.load(fh)) + + az_info_file = self.config.get('AZInfoFile', '') + if az_info_file and path.exists(az_info_file): + with open(az_info_file, 'r') as fh: + self.az_info = json.load(fh) + logger.debug(f"self.az_info: {self.az_info}") + self.az_ids = {} + for az in self.az_info.keys(): + self.az_ids[self.az_info[az]['id']] = az + logger.debug(f"self.az_ids: {self.az_ids}") + else: + self.az_info = {} + self.az_ids = {} + if region: self.config['region'] = region - os.environ['AWS_DEFAULT_REGION'] = self.config['region'] + environ['AWS_DEFAULT_REGION'] = self.config['region'] + + self.compute_regions = [self.config['region']] + for az_dict in self.az_info.values(): + region = az_dict['region'] + if region not in self.compute_regions: + self.compute_regions.append(region) + self.compute_regions.sort() # Create first so that can publish metrics for unhandled exceptions self.cw = boto3.client('cloudwatch') self.ssm = boto3.client('ssm') try: - self.ec2 = boto3.client('ec2') - self.ec2_describe_instances_paginator = self.ec2.get_paginator('describe_instances') - self.describe_instance_status_paginator = self.ec2.get_paginator('describe_instance_status') + self.ec2 = {} + self.ec2_describe_instances_paginator = {} + for region in self.compute_regions: + self.ec2[region] = boto3.client('ec2', region_name=region) + self.ec2_describe_instances_paginator[region] = self.ec2[region].get_paginator('describe_instances') except: logger.exception('Unhandled exception in SlurmPlugin constructor') self.publish_cw_metrics(self.CW_UNHANDLED_PLUGIN_CONSTRUCTOR_EXCEPTION, 1, []) @@ -293,49 +330,31 @@ def suspend_resume_setup(self): raise def get_instance_type_info(self): - logger.debug("get_instance_type_info") - self.instance_type_info = {} + logger.debug(f"get_instance_type_info") + eC2InstanceTypeInfo = EC2InstanceTypeInfo(self.compute_regions, json_filename=self.config['InstanceTypeInfoFile']) + self.instance_type_info = 
eC2InstanceTypeInfo.instance_type_info + self.create_instance_family_info() + + def create_instance_family_info(self): self.instance_family_info = {} - describe_instance_types_paginator = self.ec2.get_paginator('describe_instance_types') - for result in self.paginate(describe_instance_types_paginator, {'Filters': [{'Name': 'current-generation', 'Values': ['true']}]}): - for instance_type_info in result['InstanceTypes']: - instanceType = instance_type_info['InstanceType'] - #logger.debug("Found instance info for {}".format(instanceType)) - self.instance_type_info[instanceType] = {} - self.instance_type_info[instanceType]['full'] = instance_type_info - architecture = instance_type_info['ProcessorInfo']['SupportedArchitectures'][0] - self.instance_type_info[instanceType]['architecture'] = architecture - self.instance_type_info[instanceType]['SustainedClockSpeedInGhz'] = instance_type_info['ProcessorInfo']['SustainedClockSpeedInGhz'] - if 'ValidThreadsPerCore' in instance_type_info['VCpuInfo']: - self.instance_type_info[instanceType]['ThreadsPerCore'] = max(instance_type_info['VCpuInfo']['ValidThreadsPerCore']) - else: - if architecture == 'x86_64': - self.instance_type_info[instanceType]['ThreadsPerCore'] = 2 - else: - self.instance_type_info[instanceType]['ThreadsPerCore'] = 1 - if 'ValidCores' in instance_type_info['VCpuInfo']: - self.instance_type_info[instanceType]['CoreCount'] = max(instance_type_info['VCpuInfo']['ValidCores']) + for region in self.instance_type_info.keys(): + instance_type_info = self.instance_type_info[region] + self.instance_family_info[region] = {} + for instance_type in instance_type_info: + (instance_family, instance_size) = instance_type.split(r'.') + if instance_family not in self.instance_family_info[region]: + self.instance_family_info[region][instance_family] = {} + self.instance_family_info[region][instance_family]['instance_types'] = [instance_type,] + self.instance_family_info[region][instance_family]['MaxInstanceType'] = instance_type + self.instance_family_info[region][instance_family]['MaxInstanceSize'] = instance_size + self.instance_family_info[region][instance_family]['MaxCoreCount'] = instance_type_info[instance_type]['CoreCount'] + self.instance_family_info[region][instance_family]['architecture'] = instance_type_info[instance_type]['architecture'] else: - self.instance_type_info[instanceType]['CoreCount'] = int(instance_type_info['VCpuInfo']['DefaultVCpus']/self.instance_type_info[instanceType]['ThreadsPerCore']) - self.instance_type_info[instanceType]['MemoryInMiB'] = instance_type_info['MemoryInfo']['SizeInMiB'] - self.instance_type_info[instanceType]['SSDCount'] = instance_type_info.get('InstanceStorageInfo', {'Disks': [{'Count': 0}]})['Disks'][0]['Count'] - self.instance_type_info[instanceType]['SSDTotalSizeGB'] = instance_type_info.get('InstanceStorageInfo', {'TotalSizeInGB': 0})['TotalSizeInGB'] - #logger.debug(pp.pformat(self.instance_type_info[instanceType])) - - (instance_family, instance_size) = instanceType.split(r'.') - if instance_family not in self.instance_family_info: - self.instance_family_info[instance_family] = {} - self.instance_family_info[instance_family]['instance_types'] = [instanceType,] - self.instance_family_info[instance_family]['MaxInstanceType'] = instanceType - self.instance_family_info[instance_family]['MaxInstanceSize'] = instance_size - self.instance_family_info[instance_family]['MaxCoreCount'] = self.instance_type_info[instanceType]['CoreCount'] - self.instance_family_info[instance_family]['architecture'] = 
architecture - else: - self.instance_family_info[instance_family]['instance_types'].append(instanceType) - if self.instance_type_info[instanceType]['CoreCount'] > self.instance_family_info[instance_family]['MaxCoreCount']: - self.instance_family_info[instance_family]['MaxInstanceType'] = instanceType - self.instance_family_info[instance_family]['MaxInstanceSize'] = instance_size - self.instance_family_info[instance_family]['MaxCoreCount'] = self.instance_type_info[instanceType]['CoreCount'] + self.instance_family_info[region][instance_family]['instance_types'].append(instance_type) + if instance_type_info[instance_type]['CoreCount'] > self.instance_family_info[region][instance_family]['MaxCoreCount']: + self.instance_family_info[region][instance_family]['MaxInstanceType'] = instance_type + self.instance_family_info[region][instance_family]['MaxInstanceSize'] = instance_size + self.instance_family_info[region][instance_family]['MaxCoreCount'] = instance_type_info[instance_type]['CoreCount'] def get_instance_family(self, instanceType): instance_family = instanceType.split(r'.')[0] @@ -369,42 +388,41 @@ def decode_short_instance_size(self, short_instance_size): instance_size = short_instance_size return instance_size - def get_instance_families(self): - return sorted(self.instance_type_info.keys()) - - def get_max_instance_type(self, instance_family): - return self.instance_family_info[instance_family]['MaxInstanceType'] + def get_instance_families(self, region): + return sorted(self.instance_family_info[region].keys()) - def get_instance_types(self): - return sorted(self.instance_type_info.keys()) + def get_max_instance_type(self, region, instance_family): + return self.instance_family_info[region][instance_family]['MaxInstanceType'] - def get_architecture(self, instance_type): - return self.instance_type_info[instance_type]['architecture'] + def get_instance_types(self, region): + return sorted(self.instance_type_info[region].keys()) - def get_SustainedClockSpeedInGhz(self, instance_type): - return self.instance_type_info[instance_type]['SustainedClockSpeedInGhz'] + def get_architecture(self, region, instance_type): + return self.instance_type_info[region][instance_type]['architecture'] - def get_CoreCount(self, instance_type): - return self.instance_type_info[instance_type]['CoreCount'] + def get_SustainedClockSpeedInGhz(self, region, instance_type): + return self.instance_type_info[region][instance_type]['SustainedClockSpeedInGhz'] - def get_ThreadsPerCore(self, instance_type): - return self.instance_type_info[instance_type]['ThreadsPerCore'] + def get_CoreCount(self, region, instance_type): + return self.instance_type_info[region][instance_type]['CoreCount'] - def get_MemoryInMiB(self, instance_type): - return self.instance_type_info[instance_type]['MemoryInMiB'] + def get_ThreadsPerCore(self, region, instance_type): + return self.instance_type_info[region][instance_type]['ThreadsPerCore'] - def get_SSDCount(self, instance_type): - return self.instance_type_info[instance_type]['SSDCount'] + def get_MemoryInMiB(self, region, instance_type): + return self.instance_type_info[region][instance_type]['MemoryInMiB'] - def get_SSDTotalSizeGB(self, instance_type): - return self.instance_type_info[instance_type]['SSDTotalSizeGB'] + def get_SSDCount(self, region, instance_type): + return self.instance_type_info[region][instance_type]['SSDCount'] - def get_full_info(self, instance_type): - return self.instance_type_info[instance_type]['full'] + def get_SSDTotalSizeGB(self, region, instance_type): + 
return self.instance_type_info[region][instance_type]['SSDTotalSizeGB'] def get_hostinfo(self, hostnames): ''' - Store information about all existing compute nodes and those hostnames to self.hostinfo + Get information about all existing compute nodes and hostnames + + The hostnames can span multiple AZs and regions. Args: hostnames ([str]): List of hostnames that may or may not have instances. @@ -422,59 +440,61 @@ def get_hostinfo(self, hostnames): # Find existing unterminated instances # Collect the number of SlurmNodes in each state overall and by instance type slurmNodeStats = {} - for result in self.paginate(self.ec2_describe_instances_paginator, {}): - for reservation in result['Reservations']: - for instance in reservation['Instances']: - # Ignore instances that aren't SlurmNodes - role = self.getTag('role', instance) - if not role or role != 'SlurmNode': - continue - - # Ignore instances that aren't in this cluster - cluster = self.getTag('ClusterName', instance) - if not cluster or cluster != self.config['ClusterName']: - continue + for region in self.compute_regions: + for result in self.paginate(self.ec2_describe_instances_paginator[region], {}): + for reservation in result['Reservations']: + for instance in reservation['Instances']: + # Ignore instances that aren't SlurmNodes + role = self.getTag('role', instance) + if not role or role != 'SlurmNode': + continue - # Ignore terminated or terminating instances - state = instance['State']['Name'] - if state in ['shutting-down', 'terminated']: - continue + # Ignore instances that aren't in this cluster + cluster = self.getTag('ClusterName', instance) + if not cluster or cluster != self.config['ClusterName']: + continue - instanceType = instance['InstanceType'] - if state not in slurmNodeStats: - slurmNodeStats[state] = {} - slurmNodeStats[state]['all'] = 0 - if instanceType not in slurmNodeStats[state]: - slurmNodeStats[state][instanceType] = 0 - slurmNodeStats[state][instanceType] += 1 - slurmNodeStats[state]['all'] += 1 - - hostname = self.getTag('hostname', instance) - if not hostname: - continue - if hostname not in self.hostinfo: - # Ignore invalid hostnames - try: - self.add_hostname_to_hostinfo(hostname) - except ValueError: - logger.warning(f"Ignoring invalid hostname: {hostname}") + # Ignore terminated or terminating instances + state = instance['State']['Name'] + if state in ['shutting-down', 'terminated']: continue - hostinfo = self.hostinfo[hostname] + instanceType = instance['InstanceType'] + if state not in slurmNodeStats: + slurmNodeStats[state] = {} + slurmNodeStats[state]['all'] = 0 + if instanceType not in slurmNodeStats[state]: + slurmNodeStats[state][instanceType] = 0 + slurmNodeStats[state][instanceType] += 1 + slurmNodeStats[state]['all'] += 1 + + hostname = self.getTag('hostname', instance) + if not hostname: + continue + if hostname not in self.hostinfo: + # Ignore invalid hostnames + try: + self.add_hostname_to_hostinfo(hostname) + except ValueError: + logger.warning(f"Ignoring invalid hostname: {hostname}") + continue - # Check for duplicate instances with the same hostname - instanceId = instance['InstanceId'] - if hostinfo['instanceId']: - reason = "Multiple instances of {}. Marking node as down. 
Instances: {} {}".format(hostname, hostinfo['instanceId'], instanceId) - logger.error(reason) - self.mark_node_down(hostname, reason) - continue + hostinfo = self.hostinfo[hostname] + hostinfo['region'] = region + + # Check for duplicate instances with the same hostname + instanceId = instance['InstanceId'] + if hostinfo['instanceId']: + reason = "Multiple instances of {}. Marking node as down. Instances: {} {}".format(hostname, hostinfo['instanceId'], instanceId) + logger.error(reason) + self.mark_node_down(hostname, reason) + continue - hostinfo['instanceId'] = instanceId - hostinfo['State'] = state - hostinfo['ImageId'] = instance['ImageId'] - hostinfo['LaunchTime'] = instance.get('LaunchTime', None) - logger.debug("Found %s(%s) state=%s" % (hostname, instanceId, state)) + hostinfo['instanceId'] = instanceId + hostinfo['State'] = state + hostinfo['ImageId'] = instance['ImageId'] + hostinfo['LaunchTime'] = instance.get('LaunchTime', None) + logger.debug("Found %s(%s) state=%s" % (hostname, instanceId, state)) # Save SlurmNode counts to CloudWatch for state in slurmNodeStats.keys(): @@ -495,7 +515,7 @@ def add_hostname_to_hostinfo(self, hostname): return try: - distribution, distribution_major_version, architecture, instance_family, instance_size, spot = self.parse_hostname(hostname)[0:6] + az_id, distribution, distribution_major_version, architecture, instance_family, instance_size, spot = self.parse_hostname(hostname)[0:7] except ValueError: raise except Exception as e: @@ -503,10 +523,14 @@ def add_hostname_to_hostinfo(self, hostname): hostinfo = {} + hostinfo['az_id'] = az_id + az = self.az_ids[az_id] + hostinfo['region'] = self.az_info[az]['region'] + hostinfo['distribution'] = distribution hostinfo['distribution_major_version'] = distribution_major_version - ssm_parameter_name = f"/{self.config['STACK_NAME']}/SlurmNodeAmis/{distribution}/{distribution_major_version}/{architecture}" + ssm_parameter_name = f"/{self.config['STACK_NAME']}/SlurmNodeAmis/{distribution}/{distribution_major_version}/{architecture}/{hostinfo['region']}" try: hostinfo['ami'] = self.ssm.get_parameter(Name=ssm_parameter_name)["Parameter"]["Value"] except Exception as e: @@ -522,7 +546,7 @@ def add_hostname_to_hostinfo(self, hostname): hostinfo['spot'] = spot - hostinfo['coreCount'] = self.instance_type_info[instance_type]['CoreCount'] + hostinfo['coreCount'] = self.instance_type_info[hostinfo['region']][instance_type]['CoreCount'] hostinfo['instanceId'] = None @@ -530,39 +554,41 @@ def add_hostname_to_hostinfo(self, hostname): def update_hostinfo(self, instanceIds=[]): logger.debug("Updating hostinfo") - for result in self.paginate(self.ec2_describe_instances_paginator, {'InstanceIds': instanceIds}): - for reservation in result['Reservations']: - for instance in reservation['Instances']: - # Ignore instances that aren't SlurmNodes - role = self.getTag('role', instance) - if not role or role != 'SlurmNode': - continue - - # Ignore instances that aren't in this cluster - cluster = self.getTag('SlurmCluster', instance) - if not cluster or cluster != self.config['ClusterName']: - continue + for region in self.compute_regions: + for result in self.paginate(self.ec2_describe_instances_paginator[region], {'InstanceIds': instanceIds}): + for reservation in result['Reservations']: + for instance in reservation['Instances']: + # Ignore instances that aren't SlurmNodes + role = self.getTag('role', instance) + if not role or role != 'SlurmNode': + continue - # Ignore terminated or terminating instances - state = 
instance['State']['Name'] - if state in ['shutting-down', 'terminated']: - continue + # Ignore instances that aren't in this cluster + cluster = self.getTag('SlurmCluster', instance) + if not cluster or cluster != self.config['ClusterName']: + continue - hostname = self.getTag('hostname', instance) - if not hostname: - continue - instanceId = instance['InstanceId'] - logger.debug("Found %s(%s) state=%s" % (hostname, instanceId, state)) + # Ignore terminated or terminating instances + state = instance['State']['Name'] + if state in ['shutting-down', 'terminated']: + continue - if hostname not in self.hostinfo: - # Ignore invalid hostnames - try: - self.add_hostname_to_hostinfo(hostname) - except ValueError: - logger.warning(f"Ignoring invalid hostname: {hostname}") - self.hostinfo[hostname]['instanceId'] = instanceId - self.hostinfo[hostname]['ImageId'] = instance['ImageId'] - self.hostinfo[hostname]['State'] = state + hostname = self.getTag('hostname', instance) + if not hostname: + continue + instanceId = instance['InstanceId'] + logger.debug("Found %s(%s) state=%s" % (hostname, instanceId, state)) + + if hostname not in self.hostinfo: + # Ignore invalid hostnames + try: + self.add_hostname_to_hostinfo(hostname) + except ValueError: + logger.warning(f"Ignoring invalid hostname: {hostname}") + continue + self.hostinfo[hostname]['instanceId'] = instanceId + self.hostinfo[hostname]['ImageId'] = instance['ImageId'] + self.hostinfo[hostname]['State'] = state def getTag(self, key, instance): value = None @@ -583,6 +609,14 @@ def resume(self): logger.info("Resuming {} hosts: {}".format(len(self.hostnames), self.hostlist)) self.publish_cw_metrics(self.CW_SLURM_RESUME, len(self.hostnames), []) + for region in self.compute_regions: + self.resume_region(region) + except: + logger.exception('Unhandled exception in SlurmPlugin.resume') + self.publish_cw_metrics(self.CW_UNHANDLED_RESUME_EXCEPTION, 1, []) + raise + + def resume_region(self, region): # Decide what to do for each hostname # Possible states: # * none - create @@ -601,6 +635,8 @@ def resume(self): stopped_instanceIds = [] for hostname in self.hostnames: hostinfo = self.hostinfo[hostname] + if hostinfo['region'] != region: + continue # Create new instance if one doesn't exist instanceId = hostinfo['instanceId'] @@ -641,6 +677,7 @@ def resume(self): if instanceIds_to_terminate: terminated_hostnames = self.terminate_instanceIds( + region, hostnames_to_terminate, instanceIds_to_terminate, 'terminating instances before resume', self.CW_SLURM_TERMINATE_BEFORE_RESUME, @@ -653,17 +690,17 @@ def resume(self): start_instances_exception = None start_instances_exception_reason = None try: - self.start_instances({'InstanceIds': stopped_instanceIds}) + self.start_instances(region, {'InstanceIds': stopped_instanceIds}) except ClientError as e: # botocore.exceptions.ClientError: An error occurred (ResourceCountExceeded) when calling the StartInstances operation: # You have exceeded the number of resources allowed in a single call of this type start_instances_exception = e start_instances_exception_reason = e.response['Error']['Code'] - logger.error("start_instances failed because {}".format(start_instances_exception_reason)) + logger.error(f"start_instances({region}) failed because {start_instances_exception_reason}") except Exception as e: start_instances_exception = e start_instances_exception_reason = "Unknown" - logger.exception("start_instances failed with unknown exception") + logger.exception(f"start_instances({region}) failed with unknown 
exception") if start_instances_exception: # If there is more than one instance then some may have started so need to iterate through each instance # to see which ones started and which ones didn't so we can mark the failed @@ -708,6 +745,12 @@ def resume(self): userDataTemplate = Template(open(userDataFilename, 'r').read()) for hostname in hostnames_to_create: hostinfo = self.hostinfo[hostname] + az_id = hostinfo['az_id'] + az = self.az_ids[az_id] + region = self.az_info[az]['region'] + subnet = self.az_info[az]['subnet'] + security_group_id = self.ssm.get_parameter(Name=f"/{self.config['STACK_NAME']}/SlurmNodeSecurityGroups/{region}")['Parameter']['Value'] + key_name = self.ssm.get_parameter(Name=f"/{self.config['STACK_NAME']}/SlurmNodeEc2KeyPairs/{region}")['Parameter']['Value'] ami = hostinfo['ami'] userData = userDataTemplate.render({ 'DOMAIN': self.config['DOMAIN'], @@ -722,9 +765,9 @@ def resume(self): 'InstanceType': hostinfo['instance_type'], 'MaxCount': 1, 'MinCount': 1, - 'KeyName': self.config['EC2_KEY_PAIR'], - 'SecurityGroupIds': [self.config['SLURMNODE_SECURITY_GROUP']], - 'SubnetId': self.config['SLURMNODE_SUBNET'], + 'KeyName': key_name, + 'SecurityGroupIds': [security_group_id], + 'SubnetId': subnet, 'IamInstanceProfile': {'Arn': self.config['SLURMNODE_PROFILE_ARN']}, 'UserData': userData, 'TagSpecifications': [ @@ -743,7 +786,7 @@ def resume(self): ], 'BlockDeviceMappings': [], } - if self.get_ThreadsPerCore(hostinfo['instance_type']) > 1: + if self.get_ThreadsPerCore(region, hostinfo['instance_type']) > 1: kwargs['CpuOptions'] = {'CoreCount': hostinfo['coreCount'], 'ThreadsPerCore': 1} if hostinfo['spot']: kwargs['InstanceMarketOptions'] = { @@ -754,11 +797,11 @@ def resume(self): } } drive_letter = 'c' - for ephemeral_index in range(0, self.instance_type_info[hostinfo['instance_type']]['SSDCount']): + for ephemeral_index in range(0, self.instance_type_info[region][hostinfo['instance_type']]['SSDCount']): kwargs['BlockDeviceMappings'].append({'DeviceName': '/dev/sd'+drive_letter, 'VirtualName': 'ephemeral'+str(ephemeral_index)}) drive_letter = chr(ord(drive_letter) + 1) logger.debug(f"run_instances kwargs:\n{pp.pformat(kwargs)}") - hostinfo['launch_thread'] = LaunchInstanceThread(self, kwargs) + hostinfo['launch_thread'] = LaunchInstanceThread(self, region, kwargs) hostinfo['launch_thread'].start() # Wait for instances to be launched launch_failures = 0 @@ -813,11 +856,11 @@ def resume(self): except ClientError as e: start_instances_exception = e start_instances_exception_reason = e.response['Error']['Code'] - logger.exception("start_instances failed because {}".format(start_instances_exception_reason)) + logger.exception(f"start_instances({region}) failed because {start_instances_exception_reason}") except Exception as e: start_instances_exception = e start_instances_exception_reason = "Unknown" - logger.exception("start_instances failed with unknown exception") + logger.exception(f"start_instances({region}) failed with unknown exception") if start_instances_exception: # If there is more than one instance then some may have started so need to iterate through each instance # to see which ones started and which ones didn't so we can mark the failed @@ -852,23 +895,28 @@ def resume(self): stopping_instanceIds.remove(instanceId) self.update_hostinfo() - self.terminate_old_instances() - except: - logger.exception('Unhandled exception in SlurmPlugin.resume') - self.publish_cw_metrics(self.CW_UNHANDLED_RESUME_EXCEPTION, 1, []) - raise + 
self.terminate_old_instances_region(region) - def resume_fail(self): + def resume_fail(self, region): try: self.suspend_resume_setup() if not self.hostnames: return - logger.error("Resume failed on {} hosts: {}".format(len(self.hostnames), self.hostlist)) + logger.error(f"Resume failed on {len(self.hostnames)} hosts: {self.hostlist}") # They will already have been marked down my slurmctld # Just log it to CloudWatch self.publish_cw_metrics(self.CW_SLURM_RESUME_TIMEOUT, len(self.hostnames), []) + for region in self.compute_regions: + self.resume_fail_region(region) + + except: + logger.exception('Unhandled exception in SlurmPlugin.resume_fail') + self.publish_cw_metrics(self.CW_UNHANDLED_RESUME_FAIL_EXCEPTION, 1, []) + raise + + def resume_fail_region(self, region): # Now stop them so that they stop consuming resources until they can be debugged. hostnames_to_terminate = [] instanceIds_to_terminate = [] @@ -876,6 +924,8 @@ def resume_fail(self): instanceIds_to_stop = [] for hostname in self.hostnames: hostinfo = self.hostinfo[hostname] + if hostinfo['region'] != region: + continue instanceId = hostinfo['instanceId'] if not instanceId: logger.info("Not stopping {}({}) because no instance found".format(hostname, instanceId)) @@ -898,21 +948,18 @@ def resume_fail(self): if instanceIds_to_terminate: self.terminate_instanceIds( + region, hostnames_to_terminate, instanceIds_to_terminate, 'terminating instances during resume_fail', self.CW_SLURM_RESUME_FAIL_TERMINATE, self.CW_SLURM_RESUME_FAIL_TERMINATE_ERROR) if instanceIds_to_stop: self.stop_instanceIds( + region, hostnames_to_stop, instanceIds_to_stop, 'stopping instances during resume_fail', self.CW_SLURM_RESUME_FAIL_STOP, self.CW_SLURM_RESUME_FAIL_STOP_ERROR) - except: - logger.exception('Unhandled exception in SlurmPlugin.resume_fail') - self.publish_cw_metrics(self.CW_UNHANDLED_RESUME_FAIL_EXCEPTION, 1, []) - raise - def stop(self): try: self.suspend_resume_setup() @@ -922,6 +969,14 @@ def stop(self): logger.info("Stopping {} hosts: {}".format(len(self.hostnames), self.hostlist)) self.publish_cw_metrics(self.CW_SLURM_STOP, len(self.hostnames), []) + for region in self.compute_regions: + self.stop_region(region) + except: + logger.exception('Unhandled exception in SlurmPlugin.stop') + self.publish_cw_metrics(self.CW_UNHANDLED_STOP_EXCEPTION, 1, []) + raise + + def stop_region(self, region): # Decide what to do for each hostname # Possible states: # * none - no action @@ -937,6 +992,8 @@ def stop(self): instanceIds_to_stop = [] for hostname in self.hostnames: hostinfo = self.hostinfo[hostname] + if hostinfo['region'] != region: + continue instanceId = hostinfo['instanceId'] if not instanceId: logger.info("Not stopping {}({}) because no instance found".format(hostname, instanceId)) @@ -978,6 +1035,7 @@ def stop(self): if instanceIds_to_terminate: self.terminate_instanceIds( + region, hostnames_to_terminate, instanceIds_to_terminate, 'terminating instance during stop', self.CW_SLURM_STOP_TERMINATE, @@ -985,14 +1043,10 @@ def stop(self): ) if instanceIds_to_stop: - self.stop_instanceIds(hostnames_to_stop, instanceIds_to_stop, 'stopping instance during stop', self.CW_SLURM_STOP_STOP, self.CW_SLURM_STOP_STOP_ERROR) + self.stop_instanceIds(region, hostnames_to_stop, instanceIds_to_stop, 'stopping instance during stop', self.CW_SLURM_STOP_STOP, self.CW_SLURM_STOP_STOP_ERROR) self.update_hostinfo() - self.terminate_old_instances() - except: - logger.exception('Unhandled exception in SlurmPlugin.stop') - 
self.publish_cw_metrics(self.CW_UNHANDLED_STOP_EXCEPTION, 1, []) - raise + self.terminate_old_instances_region(region) def terminate(self): try: @@ -1003,6 +1057,14 @@ def terminate(self): logger.info("Terminating {} hosts: {}".format(len(self.hostnames), self.hostlist)) self.publish_cw_metrics(self.CW_SLURM_TERMINATE, len(self.hostnames), []) + for region in self.compute_regions: + self.terminate_region(region) + except: + logger.exception('Unhandled exception in SlurmPlugin.terminate') + self.publish_cw_metrics(self.CW_UNHANDLED_TERMINATE_EXCEPTION, 1, []) + raise + + def terminate_region(self, region): # Find instances that need to be terminated # Decide what to do for each hostname # Possible states: @@ -1017,6 +1079,9 @@ def terminate(self): instanceIds_to_terminate = [] for hostname in self.hostnames: hostinfo = self.hostinfo[hostname] + if hostinfo['region'] != region: + continue + instanceId = hostinfo['instanceId'] if not instanceId: continue @@ -1026,6 +1091,7 @@ def terminate(self): logger.info("Terminating {}({})".format(hostname, instanceId)) if instanceIds_to_terminate: self.terminate_instanceIds( + region, hostnames_to_terminate, instanceIds_to_terminate, 'terminating instances', self.CW_SLURM_TERMINATE, @@ -1033,11 +1099,7 @@ def terminate(self): ) self.update_hostinfo() - self.terminate_old_instances() - except: - logger.exception('Unhandled exception in SlurmPlugin.terminate') - self.publish_cw_metrics(self.CW_UNHANDLED_TERMINATE_EXCEPTION, 1, []) - raise + self.terminate_old_instances_region(region) def terminate_old_instances_main(self): global logger @@ -1090,6 +1152,10 @@ def check_slurmctld(self): return False def terminate_old_instances(self): + for region in self.compute_regions: + self.terminate_old_instances_region(region) + + def terminate_old_instances_region(self, region): # Find stopped instances that have an old AMI logger.debug("Checking for stopped instances with old AMIs to terminate") hostnames_to_terminate = [] @@ -1114,6 +1180,7 @@ def terminate_old_instances(self): if instanceIds_to_terminate: self.terminate_instanceIds( + region, hostnames_to_terminate, instanceIds_to_terminate, 'terminating because of old ami', self.CW_SLURM_TERMINATE_OLD_AMI, @@ -1144,24 +1211,36 @@ def terminate_old_instances(self): if instanceIds_to_terminate: self.terminate_instanceIds( + region, hostnames_to_terminate, instanceIds_to_terminate, 'terminating because older than {deadline}', self.CW_SLURM_TERMINATE_OLD_AMI, self.CW_SLURM_TERMINATE_ERROR ) - def stop_instanceIds(self, hostnames_to_stop, instanceIds_to_stop, + def stop_instanceIds(self, region, hostnames_to_stop, instanceIds_to_stop, action, metric, error_metric): if not instanceIds_to_stop: return self.publish_cw_metrics(self.CW_SLURM_STOP_STOP, len(instanceIds_to_stop), []) retry = False try: - self.stop_instances({'InstanceIds': instanceIds_to_stop}) + self.stop_instances(region, {'InstanceIds': instanceIds_to_stop}) except ClientError as e: + exception_reason = e.response['Error']['Code'] retry = True - if e.response['Error']['Code'] == 'ResourceCountExceeded': + if exception_reason == 'ResourceCountExceeded': logger.info("Caught {} while stopping {} instances".format(e.response['Error']['Code'], len(instanceIds_to_stop))) + elif exception_reason == 'UnauthorizedOperation': + retry = False + message = e.response['Error']['Message'] + matches = re.match(r'You are not authorized to perform this operation. 
Encoded authorization failure message: (\S+)', message) + if matches: + encoded_message = matches.group(1) + logger.error(f"Encoded message:\n{encoded_message}") + sts_client = boto3.client('sts', region_name=region) + decoded_message = json.loads(sts_client.decode_authorization_message(EncodedMessage=encoded_message)['DecodedMessage']) + logger.error(f"decoded_message:\n{json.dumps(decoded_message, indent=4)}") else: logger.exception("Error {}".format(action)) self.publish_cw_metrics(error_metric, 1, []) @@ -1177,27 +1256,38 @@ def stop_instanceIds(self, hostnames_to_stop, instanceIds_to_stop, hostinfo = self.hostinfo[hostname] instanceId = hostinfo['instanceId'] try: - self.stop_instances({'InstanceIds': [instanceId]}) + self.stop_instances(region, {'InstanceIds': [instanceId]}) except: logger.exception("Error while stopping {}({})".format(hostname, instanceId)) self.publish_cw_metrics(self.CW_SLURM_STOP_STOP_ERROR, 1) - def terminate_instanceIds(self, hostnames_to_terminate, instanceIds_to_terminate, + def terminate_instanceIds(self, region, hostnames_to_terminate, instanceIds_to_terminate, action, metric, error_metric): if not instanceIds_to_terminate: return self.publish_cw_metrics(metric, len(instanceIds_to_terminate), []) retry = False try: - self.terminate_instances({'InstanceIds': instanceIds_to_terminate}) + self.terminate_instances(region, {'InstanceIds': instanceIds_to_terminate}) terminated_hostnames = hostnames_to_terminate terminated_instanceIds = instanceIds_to_terminate for hostname in hostnames_to_terminate: self.hostinfo[hostname]['instanceId'] = None except ClientError as e: + exception_reason = e.response['Error']['Code'] retry = True - if e.response['Error']['Code'] == 'ResourceCountExceeded': + if exception_reason == 'ResourceCountExceeded': logger.info("Caught {} while terminating {} instances".format(e.response['Error']['Code'], len(instanceIds_to_terminate))) + elif exception_reason == 'UnauthorizedOperation': + retry = False + message = e.response['Error']['Message'] + matches = re.match(r'You are not authorized to perform this operation. 
Encoded authorization failure message: (\S+)', message) + if matches: + encoded_message = matches.group(1) + logger.error(f"Encoded message:\n{encoded_message}") + sts_client = boto3.client('sts', region_name=region) + decoded_message = json.loads(sts_client.decode_authorization_message(EncodedMessage=encoded_message)['DecodedMessage']) + logger.error(f"decoded_message:\n{json.dumps(decoded_message, indent=4)}") else: logger.exception("Error {}".format(action)) self.publish_cw_metrics(error_metric, 1, []) @@ -1214,7 +1304,7 @@ def terminate_instanceIds(self, hostnames_to_terminate, instanceIds_to_terminate for hostname in hostnames_to_terminate: instanceId = self.hostinfo[hostname]['instanceId'] try: - self.terminate_instances({'InstanceIds': [instanceId]}) + self.terminate_instances(region, {'InstanceIds': [instanceId]}) terminated_hostnames.append(hostname) terminated_instanceIds.append(instanceId) except: @@ -1244,6 +1334,10 @@ def publish_cw_metrics(self, metric_name, value, dimensions): def parse_hostname(self, hostname): ''' + Parse hostname to get instance attributes + + The format is {az_id}-{distribution-code}-{architecture}-{instance-family}-{instance-size}[-sp]-index + Args: hostname (str): Hostname of compute node Raises: @@ -1254,12 +1348,15 @@ def parse_hostname(self, hostname): logger.debug(f"hostname={hostname}") fields = hostname.split('-') logger.debug(f"fields: {fields}") - if len(fields) < 5: - raise ValueError(f"{hostname} has less than 5 fields: {fields}") - elif len(fields) > 6: - raise ValueError(f"{hostname} has more than 6 fields: {fields}") - (os, short_architecture, instance_family, short_instance_size) = fields[0:4] - spot = fields[4] == 'sp' + if len(fields) < 7: + raise ValueError(f"{hostname} has less than 7 fields: {fields}") + elif len(fields) > 8: + raise ValueError(f"{hostname} has more than 8 fields: {fields}") + (az_id1, az_id2, os, short_architecture, instance_family, short_instance_size) = fields[0:6] + az_id = f"{az_id1}-{az_id2}" + logger.debug(f"az_id={az_id}") + spot = fields[6] == 'sp' + logger.debug(f"spot={spot}") index = fields[-1] if len(os) != 2: raise ValueError(f"{hostname} has invalid os: {os}. 
Must be 2 characters.") @@ -1283,7 +1380,7 @@ def parse_hostname(self, hostname): logger.debug(f"instance_size={instance_size}") logger.debug(f"spot={spot}") logger.debug(f"index={index}") - return (distribution, distribution_version, architecture, instance_family, instance_size, spot, index) + return (az_id, distribution, distribution_version, architecture, instance_family, instance_size, spot, index) def drain(self, hostname, reason): logger.info(f"Setting {hostname} to drain so new jobs do not run on it.") @@ -1398,10 +1495,13 @@ def create_node_conf(self): logger_streamHandler.setFormatter(logger_formatter) logger.addHandler(logger_streamHandler) logger.setLevel(logging.INFO) + logger.propagate = False self.parser = argparse.ArgumentParser("Create SLURM node config from EC2 instance metadata") self.parser.add_argument('--config-file', default=False, help="YAML file with instance families and types to include/exclude") self.parser.add_argument('--output-file', '-o', required=True, help="Output file") + self.parser.add_argument('--az-info-file', required=True, help="JSON file where AZ info will be saved") + self.parser.add_argument('--instance-type-info-json', default=False, help="JSON file with cached instance type info.") self.parser.add_argument('--debug', '-d', action='count', default=False, help="Enable debug messages") self.args = self.parser.parse_args() @@ -1410,11 +1510,11 @@ def create_node_conf(self): logger.debug(f"Debugging level {self.args.debug}") if self.args.config_file: + logger.info(f"Loading config from {self.args.config_file}") instance_config = yaml.load(open(self.args.config_file, 'r').read(), Loader=yaml.SafeLoader) else: instance_config = { 'UseSpot': True, - 'DefaultPartition': 'CentOS_7_x86_64', 'NodesPerInstanceType': 10, 'BaseOsArchitecture': { 'AlmaLinux': {8: ['x86_64', 'arm64']}, @@ -1460,13 +1560,23 @@ def create_node_conf(self): ], 'InstanceTypes': [] }, + 'Regions': [ + { + 'Region': environ['AWS_DEFAULT_REGION'], + 'AZs': [ + { + 'Priority': 1, + 'Region': environ['AWS_DEFAULT_REGION'], + 'Subnet': environ['GridSubnet1'] + } + ], + }, + ], 'AlwaysOnNodes': [], 'AlwaysOnPartitions': [] } # Check for required fields - if 'DefaultPartition' not in instance_config: - raise ValueError(f"InstanceConfig missing DefaultPartition") if 'BaseOsArchitecture' not in instance_config: raise ValueError(f"InstanceConfig missing BaseOsArchitecture") @@ -1480,114 +1590,95 @@ def create_node_conf(self): if 'MaxSizeOnly' not in instance_config['Include']: instance_config['Include']['MaxSizeOnly'] = 10 - instance_types = self.get_instance_types_from_instance_config(instance_config) - logger.debug(f"instance_types:\n{pp.pformat(instance_types)}") + compute_regions = sorted(instance_config['Regions'].keys()) + az_info = self.get_az_info_from_instance_config(instance_config) + logger.info(f"{len(az_info.keys())} AZs configured: {sorted(az_info.keys())}") + logger.debug(f"{pp.pformat(az_info)}") + with open(self.args.az_info_file, 'w') as fh: + fh.write(json.dumps(az_info, indent=4)) - region_name = self.get_region_name(self.config['region']) - - self.pricing_client = boto3.client('pricing') - for instanceType in sorted(instance_types): - logger.debug("instanceType: {}".format(instanceType)) - os = 'Linux' - pricing_filter = [ - {'Field': 'ServiceCode', 'Value': 'AmazonEC2', 'Type': 'TERM_MATCH'}, - {'Field': 'instanceType', 'Value': instanceType, 'Type': 'TERM_MATCH'}, - {'Field': 'tenancy', 'Value': 'shared', 'Type': 'TERM_MATCH'}, - {'Field': 'preInstalledSw', 'Value': 'NA', 
'Type': 'TERM_MATCH'}, - {'Field': 'location', 'Value': region_name, 'Type': 'TERM_MATCH'}, - {'Field': 'operatingSystem', 'Value': os, 'Type': 'TERM_MATCH'}, - {'Field': 'capacitystatus', 'Value': 'Used', 'Type': 'TERM_MATCH'}, - ] - kwargs = { - 'ServiceCode': 'AmazonEC2', - 'Filters': pricing_filter - } - priceLists = self.pricing_get_products(ServiceCode='AmazonEC2', Filters=pricing_filter)['PriceList'] - if self.args.debug > 2: - logger.debug("{} priceLists".format(len(priceLists))) - if len(priceLists) != 1: - raise RuntimeError("Number of PriceLists != 1 for {}".format(instanceType)) - priceList = json.loads(priceLists[0]) - if self.args.debug > 2: - logger.debug("pricelist:\n{}".format(pp.pformat(priceList))) - onDemandTerms = priceList['terms']['OnDemand'] - if self.args.debug > 2: - logger.debug("onDemandTerms:\n{}".format(pp.pformat(onDemandTerms))) - id1 = list(onDemandTerms)[0] - if self.args.debug > 2: - logger.debug("id1:{}".format(pp.pformat(id1))) - id2 = list(onDemandTerms[id1]['priceDimensions'])[0] - if self.args.debug > 2: - logger.debug("id2:{}".format(pp.pformat(id2))) - unit = onDemandTerms[id1]['priceDimensions'][id2]['unit'] - if unit != 'Hrs': - raise RuntimeError("Unknown pricing unit: {}".format(unit)) - if self.args.debug > 2: - logger.debug("unit: {}".format(unit)) - currency = list(onDemandTerms[id1]['priceDimensions'][id2]['pricePerUnit'])[0] - if currency != 'USD': - raise RuntimeError("Unknown currency: {}".format(currency)) - price = onDemandTerms[id1]['priceDimensions'][id2]['pricePerUnit']['USD'] - if self.args.debug > 2: - logger.debug("price: {}".format(price)) - - self.instance_type_info[instanceType]['price'] = price - if self.args.debug > 2: - logger.debug(f"{instanceType} info:\n{pp.pformat(self.instance_type_info[instanceType])}") + eC2InstanceTypeInfo = EC2InstanceTypeInfo(compute_regions, json_filename=self.args.instance_type_info_json, debug=self.args.debug > 1) + self.instance_type_info = eC2InstanceTypeInfo.instance_type_info + self.create_instance_family_info() + + instance_types = self.get_instance_types_from_instance_config(instance_config, compute_regions, eC2InstanceTypeInfo) + logger.debug(f"instance_types:\n{pp.pformat(instance_types)}") architecture_prefix_map = { 'x86_64': 'x86', 'arm64': 'arm', } - node_sets = {} - for distribution, distribution_dict in instance_config['BaseOsArchitecture'].items(): - logger.debug(distribution) - logger.debug(f"distribution_dict:\n{pp.pformat(distribution_dict)}") - os_prefix = distribution_to_prefix_map[distribution] - for distribution_major_version, architectures in distribution_dict.items(): - for architecture in architectures: - node_set = f"{distribution}_{distribution_major_version}_{architecture}" - node_sets[node_set] = {'nodes': [], 'node_names': []} - if instance_config['UseSpot']: - spot_node_set = f"{node_set}_spot" - node_sets[spot_node_set] = {'nodes': [], 'node_names': []} - architecture_prefix = architecture_prefix_map[architecture] - partitionName = f"{distribution}_{distribution_major_version}_{architecture}" - for instanceType in sorted(instance_types): - if self.instance_type_info[instanceType]['architecture'] != architecture: - continue - logger.debug(f"{pp.pformat(self.instance_type_info[instanceType])}") - instance_family = self.get_instance_family(instanceType) - short_instance_size = self.get_short_instance_size(instanceType) - max_node_index = instance_config['NodesPerInstanceType'] - 1 - node = 
f"{os_prefix}{distribution_major_version}-{architecture_prefix}-{instance_family}-{short_instance_size}-[0-{max_node_index}]" - node_sets[node_set]['nodes'].append(node) - if instance_config['UseSpot']: - spot_node = f"{os_prefix}{distribution_major_version}-{architecture_prefix}-{instance_family}-{short_instance_size}-sp-[0-{max_node_index}]" - node_sets[spot_node_set]['nodes'].append(spot_node) - - coreCount = self.instance_type_info[instanceType]['CoreCount'] - realMemory = self.instance_type_info[instanceType]['MemoryInMiB'] - if realMemory > 650: - realMemory -= 650 - realMemory = int(realMemory * 0.95) - clockSpeedInGHz = self.instance_type_info[instanceType]['SustainedClockSpeedInGhz'] - featureList = f"{os_prefix}{distribution_major_version},{partitionName},{instance_family},{instanceType},{architecture},GHz:{clockSpeedInGHz}" - if self.instance_type_info[instanceType]['SSDCount']: - featureList += ",ssd" - price = self.instance_type_info[instanceType]['price'] - weight = int(float(price) * 10000) - node_name = "NodeName={:30s} CPUs={:2s} RealMemory={:7s} Feature={:65s} Weight={}".format( - node, str(coreCount), str(realMemory), featureList, weight) - node_sets[node_set]['node_names'].append(node_name) + node_sets = {} + max_priority = 0 + max_priority_az = None + for az in sorted(az_info.keys()): + region = az_info[az]['region'] + priority = az_info[az]['priority'] + if priority > max_priority: + max_priority = priority + max_priority_az = az + az_id = az_info[az]['id'] + instance_type_info = eC2InstanceTypeInfo.instance_type_info[region] + + for distribution, distribution_dict in instance_config['BaseOsArchitecture'].items(): + logger.debug(distribution) + logger.debug(f"distribution_dict:\n{pp.pformat(distribution_dict)}") + os_prefix = distribution_to_prefix_map[distribution] + for distribution_major_version, architectures in distribution_dict.items(): + for architecture in architectures: + node_set = f"{az}_{distribution}_{distribution_major_version}_{architecture}" + node_sets[node_set] = { + 'nodes': [], + 'node_names': [], + 'priority': priority + } if instance_config['UseSpot']: - spot_feature_list = f"{featureList},spot" - weight = int(weight / 10) - spot_node_name = "NodeName={:30s} CPUs={:2s} RealMemory={:7s} Feature={:65s} Weight={}".format( - spot_node, str(coreCount), str(realMemory), spot_feature_list, weight) - node_sets[spot_node_set]['node_names'].append(spot_node_name) + spot_node_set = f"{node_set}_spot" + node_sets[spot_node_set] = { + 'nodes': [], + 'node_names': [], + 'priority': priority + } + architecture_prefix = architecture_prefix_map[architecture] + partitionName = f"{az}_{distribution}_{distribution_major_version}_{architecture}" + for instanceType in sorted(instance_types[region]): + if instance_type_info[instanceType]['architecture'] != architecture: + continue + logger.debug(f"{pp.pformat(instance_type_info[instanceType])}") + instance_family = self.get_instance_family(instanceType) + short_instance_size = self.get_short_instance_size(instanceType) + max_node_index = instance_config['NodesPerInstanceType'] - 1 + + node = f"{az_id}-{os_prefix}{distribution_major_version}-{architecture_prefix}-{instance_family}-{short_instance_size}-[0-{max_node_index}]" + node_sets[node_set]['nodes'].append(node) + + coreCount = instance_type_info[instanceType]['CoreCount'] + realMemory = instance_type_info[instanceType]['MemoryInMiB'] + if realMemory > 650: + realMemory -= 650 + realMemory = int(realMemory * 0.95) + clockSpeedInGHz = 
instance_type_info[instanceType]['SustainedClockSpeedInGhz'] + base_featureList = f"{az},{az_id},{os_prefix}{distribution_major_version},{partitionName},{instance_family},{instanceType},{architecture},GHz:{clockSpeedInGHz}" + if instance_type_info[instanceType]['SSDCount']: + base_featureList += ",ssd" + ondemand_featureList = base_featureList + ',ondemand' + price = instance_type_info[instanceType]['pricing']['OnDemand'] + weight = int(float(price) * 10000) + node_name = "NodeName={:39s} CPUs={:2s} RealMemory={:7s} Feature={:89s} Weight={}".format( + node, str(coreCount), str(realMemory), ondemand_featureList, weight) + node_sets[node_set]['node_names'].append(node_name) + + if instance_config['UseSpot']: + spot_node = f"{az_id}-{os_prefix}{distribution_major_version}-{architecture_prefix}-{instance_family}-{short_instance_size}-sp-[0-{max_node_index}]" + node_sets[spot_node_set]['nodes'].append(spot_node) + spot_feature_list = f"{base_featureList},spot" + spot_price = instance_type_info[instanceType]['pricing']['spot'][az] + spot_weight = int(float(spot_price) * 10000) + spot_node_name = "NodeName={:39s} CPUs={:2s} RealMemory={:7s} Feature={:89s} Weight={}".format( + spot_node, str(coreCount), str(realMemory), spot_feature_list, spot_weight) + node_sets[spot_node_set]['node_names'].append(spot_node_name) fh = open(self.args.output_file, 'w') print(dedent('''\ @@ -1618,6 +1709,7 @@ def create_node_conf(self): print(node_name, file=fh) print(dedent('''\ + # # NodeSets: # Used to group nodes to simplify partition definition. @@ -1632,49 +1724,77 @@ def create_node_conf(self): print(',\\\n'.join(node_sets[node_set]['nodes']), file=fh) print(dedent('''\ + # # Partitions: Slurm's version of queues # Selected by -p option # + # Create a partition for every AZ in the cluster + # # Set defaults for partitions # PartitionName=Default MaxTime=INFINITE State=UP Default=NO PriorityTier=1 '''), file=fh) - node_set_name = f"{instance_config['DefaultPartition']}_nodes" - print(dedent(f"""\ - # - # Batch Partition - # - # The is the default partition and includes all nodes from the 1st OS. - # - PartitionName=batch Default=YES Nodes=\\"""), file=fh) - print(f"{node_set_name}", file=fh) + for node_set in node_sets: + node_set_name = f"{node_set}_nodes" + partitionName = node_set + print(dedent(f"""\ + # + # {partitionName} Partition + # + PartitionName={partitionName} Default=NO PriorityTier={node_sets[node_set]['priority']} Nodes={node_set_name}"""), file=fh) + + # Create partitions for each AZ print(dedent(f"""\ + # - # Interactive Partition - # - # The interative partition has a high weight so that jobs in its queue will - # have the highest scheduling priority so that they should start before - # jobs in lower priority partitions. - # - # This is to allow interactive users to run small numbers of jobs that - # require immediate results. 
+ # AZ Partitions # - PartitionName=interactive Default=NO PriorityTier=10000 Nodes=\\"""), file=fh) - print(f"{node_set_name}", file=fh) + # Group all of the node sets by AZ + #"""), file=fh) + for az in sorted(az_info.keys()): + if az == max_priority_az: + default_partition = 'YES' + else: + default_partition = 'NO' + priority = az_info[az]['priority'] + az_nodesets = [] + for distribution, distribution_dict in instance_config['BaseOsArchitecture'].items(): + for distribution_major_version, architectures in distribution_dict.items(): + for architecture in architectures: + node_set = f"{az}_{distribution}_{distribution_major_version}_{architecture}" + az_nodesets.append(f"{node_set}_nodes") + if instance_config['UseSpot']: + az_nodesets.append(f"{node_set}_spot_nodes") + for node_set in az_nodesets: + node_set_name = f"{node_set}_nodes" + node_list = ',\\\n'.join(az_nodesets) + print(dedent(f"""\ - for node_set in node_sets: - node_set_name = f"{node_set}_nodes" - partitionName = node_set + # + # {az}_all Partition + # + PartitionName={az}_all Default={default_partition} PriorityTier={priority} Nodes=\\"""), file=fh) + print(f"{node_list}", file=fh) print(dedent(f"""\ + # - # {partitionName} Partition + # {az}_all Interactive Partition + # + # The interative partition has a high weight so that jobs in its queue will + # have the highest scheduling priority so that they should start before + # jobs in lower priority partitions. # - PartitionName={partitionName} Default=NO Nodes={node_set_name}"""), file=fh) + # This is to allow interactive users to run small numbers of jobs that + # require immediate results. + # + PartitionName={az}_all_interactive Default=NO PriorityTier={priority+10000} Nodes=\\"""), file=fh) + print(f"{node_list}", file=fh) print(dedent(f"""\ + # # All Partition # @@ -1707,82 +1827,107 @@ def create_node_conf(self): self.publish_cw_metrics(self.CW_UNHANDLED_CREATE_NODE_CONF_EXCEPTION, 1, []) raise - def get_instance_types_from_instance_config(self, instance_config): - # Compile strings into regular expressions - instance_config_re = {} - for include_exclude in ['Include', 'Exclude']: - instance_config_re[include_exclude] = {} - for filter_type in ['InstanceFamilies', 'InstanceTypes']: - instance_config_re[include_exclude][filter_type] = [] - for index, re_string in enumerate(instance_config.get(include_exclude, {}).get(filter_type, {})): - try: - instance_config_re[include_exclude][filter_type].append(re.compile(f"^{re_string}$")) - except: - logging.exception(f"Invalid regular expression for instance_config['{include_exclude}']['{filter_type}'] {re_string}") - exit(1) - - self.get_instance_type_info() - - instance_types = [] - - for instance_family in sorted(self.instance_family_info.keys()): - logger.debug(f"Considering {instance_family} family exclusions") - exclude = False - for instance_family_re in instance_config_re.get('Exclude', {}).get('InstanceFamilies', {}): - if instance_family_re.match(instance_family): - logger.debug(f"Excluding {instance_family} family") - exclude = True - break - if exclude: - # Exclusions have precedence over inclusions so don't check instance type inclusions. - continue - logger.debug(f"{instance_family} family not excluded") + def get_az_info_from_instance_config(self, instance_config: dict) -> dict: + ''' + Get AZ info selected by the config file. 
+ ''' + az_info = {} + for region, region_dict in instance_config['Regions'].items(): + logger.debug(f"region: {region}") + ec2_client = boto3.client('ec2', region_name=region) + for az_dict in region_dict['AZs']: + subnet = az_dict['Subnet'] + subnet_info = ec2_client.describe_subnets(SubnetIds=[subnet])['Subnets'][0] + az = subnet_info['AvailabilityZone'] + az_id = subnet_info['AvailabilityZoneId'] + az_info[az] = { + 'id': az_id, + 'priority': az_dict['Priority'], + 'region': region, + 'subnet': subnet + } + return az_info - # Check to see if instance family is explicitly included - include_family = False - if instance_config_re['Include']['InstanceFamilies']: - logger.debug(f"Considering {instance_family} family inclusions") - for instance_family_re in instance_config_re['Include']['InstanceFamilies']: - if instance_family_re.match(instance_family): - logger.debug(f"Including {instance_family} family") - include_family = True - break - if not include_family: - logger.debug(f"{instance_family} family not included. Will check for instance type inclusions.") - - # Check the family's instance types for exclusion and inclusion. MaxSizeOnly is a type of exclusion. - instance_family_info = self.instance_family_info[instance_family] - for instance_type in instance_family_info['instance_types']: - logger.debug(f"Checking {instance_type} for instance type exclusions") - if instance_config.get('Include', {}).get('MaxSizeOnly', False) and instance_type != instance_family_info['MaxInstanceType']: - logger.debug(f"Excluding {instance_type} because not MaxInstanceType.") - continue + def get_instance_types_from_instance_config(self, instance_config: dict, regions: [str], instance_type_info: EC2InstanceTypeInfo) -> dict: + ''' + Get instance types selected by the config file. + ''' + instance_types = {} + for region in regions: + # Compile strings into regular expressions + instance_config_re = {} + for include_exclude in ['Include', 'Exclude']: + instance_config_re[include_exclude] = {} + for filter_type in ['InstanceFamilies', 'InstanceTypes']: + instance_config_re[include_exclude][filter_type] = [] + for index, re_string in enumerate(instance_config.get(include_exclude, {}).get(filter_type, {})): + try: + instance_config_re[include_exclude][filter_type].append(re.compile(f"^{re_string}$")) + except: + logging.exception(f"Invalid regular expression for instance_config['{include_exclude}']['{filter_type}'] {re_string}") + exit(1) + + region_instance_types = [] + + for instance_family in sorted(self.instance_family_info[region].keys()): + logger.debug(f"Considering {instance_family} family exclusions") exclude = False - for instance_type_re in instance_config_re['Exclude']['InstanceTypes']: - if instance_type_re.match(instance_type): - logger.debug(f"Excluding {instance_type} because excluded") + for instance_family_re in instance_config_re.get('Exclude', {}).get('InstanceFamilies', {}): + if instance_family_re.match(instance_family): + logger.debug(f"Excluding {instance_family} family") exclude = True break if exclude: + # Exclusions have precedence over inclusions so don't check instance type inclusions. 
continue - logger.debug(f"{instance_type} not excluded by instance type exclusions") + logger.debug(f"{instance_family} family not excluded") + + # Check to see if instance family is explicitly included + include_family = False + if instance_config_re['Include']['InstanceFamilies']: + logger.debug(f"Considering {instance_family} family inclusions") + for instance_family_re in instance_config_re['Include']['InstanceFamilies']: + if instance_family_re.match(instance_family): + logger.debug(f"Including {instance_family} family") + include_family = True + break + if not include_family: + logger.debug(f"{instance_family} family not included. Will check for instance type inclusions.") + + # Check the family's instance types for exclusion and inclusion. MaxSizeOnly is a type of exclusion. + instance_family_info = self.instance_family_info[region][instance_family] + for instance_type in instance_family_info['instance_types']: + logger.debug(f"Checking {instance_type} for instance type exclusions") + if instance_config.get('Include', {}).get('MaxSizeOnly', False) and instance_type != instance_family_info['MaxInstanceType']: + logger.debug(f"Excluding {instance_type} because not MaxInstanceType.") + continue + exclude = False + for instance_type_re in instance_config_re['Exclude']['InstanceTypes']: + if instance_type_re.match(instance_type): + logger.debug(f"Excluding {instance_type} because excluded") + exclude = True + break + if exclude: + continue + logger.debug(f"{instance_type} not excluded by instance type exclusions") - # The instance type isn't explicitly excluded so check if it is included - if include_family: - logger.debug(f"Including {instance_type} because {instance_family} family is included.") - instance_types.append(instance_type) - continue - include = False - for instance_type_re in instance_config_re['Include']['InstanceTypes']: - if instance_type_re.match(instance_type): - logger.debug(f"Including {instance_type}") - include = True - instance_types.append(instance_type) - break - if not include: - logger.debug(f"Excluding {instance_type} because not included") - continue - return sorted(instance_types) + # The instance type isn't explicitly excluded so check if it is included + if include_family: + logger.debug(f"Including {instance_type} because {instance_family} family is included.") + region_instance_types.append(instance_type) + continue + include = False + for instance_type_re in instance_config_re['Include']['InstanceTypes']: + if instance_type_re.match(instance_type): + logger.debug(f"Including {instance_type}") + include = True + region_instance_types.append(instance_type) + break + if not include: + logger.debug(f"Excluding {instance_type} because not included") + continue + instance_types[region] = sorted(region_instance_types) + return instance_types # Translate region code to region name def get_region_name(self, region_code): @@ -1898,21 +2043,16 @@ def paginate(self, paginator, kwargs): return result @retry_ec2_throttling() - def start_instances(self, kwargs): - result = self.ec2.start_instances(**kwargs) - return result - - @retry_ec2_throttling() - def stop_instances(self, kwargs): - result = self.ec2.stop_instances(**kwargs) + def start_instances(self, region, kwargs): + result = self.ec2[region].start_instances(**kwargs) return result @retry_ec2_throttling() - def terminate_instances(self, kwargs): - result = self.ec2.terminate_instances(**kwargs) + def stop_instances(self, region, kwargs): + result = self.ec2[region].stop_instances(**kwargs) return result 
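# NOTE (illustrative sketch, not part of this patch): the new region argument on
# these wrappers assumes that self.ec2 is a dict holding one boto3 EC2 client per
# configured compute region, created along these lines:
#
#     self.ec2 = {}
#     for compute_region in self.compute_regions:
#         self.ec2[compute_region] = boto3.client('ec2', region_name=compute_region)
#
# boto3.client('ec2', region_name=...) is the standard boto3 API; where the plugin
# actually builds these per-region clients is outside the lines shown in this hunk.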
@retry_ec2_throttling() - def pricing_get_products(self, ServiceCode, Filters): - result = self.pricing_client.get_products(ServiceCode=ServiceCode, Filters=Filters) + def terminate_instances(self, region, kwargs): + result = self.ec2[region].terminate_instances(**kwargs) return result diff --git a/source/resources/playbooks/roles/SlurmCtl/tasks/slurm_configuration.yml b/source/resources/playbooks/roles/SlurmCtl/tasks/slurm_configuration.yml index 7672d295..2d12d7f3 100644 --- a/source/resources/playbooks/roles/SlurmCtl/tasks/slurm_configuration.yml +++ b/source/resources/playbooks/roles/SlurmCtl/tasks/slurm_configuration.yml @@ -72,7 +72,7 @@ set -ex cp {{INSTANCE_CONFIG_LOCAL_PATH}} {{INSTANCE_CONFIG_PATH}} cd {{SlurmScriptsDir}} - if ! ./slurm_ec2_create_node_conf.py --config-file {{INSTANCE_CONFIG_LOCAL_PATH}} -o {{SlurmEtcDir}}/slurm_nodes.conf.new; then + if ! ./slurm_ec2_create_node_conf.py --config-file {{INSTANCE_CONFIG_LOCAL_PATH}} --az-info-file {{SlurmConfigDir}}/AZInfo.json -o {{SlurmEtcDir}}/slurm_nodes.conf.new --instance-type-info-json {{SlurmConfigDir}}/instance-type-info.json; then rm -f {{SlurmEtcDir}}/slurm_nodes.conf.new exit 1 fi diff --git a/source/resources/playbooks/roles/SlurmCtl/tasks/slurm_scripts.yml b/source/resources/playbooks/roles/SlurmCtl/tasks/slurm_scripts.yml index 125f51a8..ecfbe128 100644 --- a/source/resources/playbooks/roles/SlurmCtl/tasks/slurm_scripts.yml +++ b/source/resources/playbooks/roles/SlurmCtl/tasks/slurm_scripts.yml @@ -1,4 +1,49 @@ --- +- name: Create {{SlurmScriptsDir}}/EC2InstanceTypeInfoPkg + when: PrimaryController|bool + file: + path: "{{SlurmScriptsDir}}/EC2InstanceTypeInfoPkg" + state: directory + owner: root + group: root + mode: 0775 + +- name: Create {{SlurmScriptsDir}}/EC2InstanceTypeInfoPkg/__init__.py + when: PrimaryController|bool + copy: + dest: "{{SlurmScriptsDir}}/EC2InstanceTypeInfoPkg/__init__.py" + src: opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/__init__.py + owner: root + group: root + mode: 0755 + +- name: Create {{SlurmScriptsDir}}/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py + when: PrimaryController|bool + copy: + dest: "{{SlurmScriptsDir}}/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py" + src: opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py + owner: root + group: root + mode: 0755 + +- name: Create {{SlurmScriptsDir}}/EC2InstanceTypeInfoPkg/get_ec2_instance_info.py + when: PrimaryController|bool + copy: + dest: "{{SlurmScriptsDir}}/EC2InstanceTypeInfoPkg/get_ec2_instance_info.py" + src: opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/get_ec2_instance_info.py + owner: root + group: root + mode: 0755 + +- name: Create {{SlurmScriptsDir}}/EC2InstanceTypeInfoPkg/retry_boto3_throttling.py + when: PrimaryController|bool + copy: + dest: "{{SlurmScriptsDir}}/EC2InstanceTypeInfoPkg/retry_boto3_throttling.py" + src: opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/retry_boto3_throttling.py + owner: root + group: root + mode: 0755 + - name: Create {{SlurmScriptsDir}}/create_users_groups_json.py when: PrimaryController|bool template: @@ -175,15 +220,6 @@ group: root mode: 0755 -- name: Create {{SlurmScriptsDir}}/EC2InstanceTypeInfo.py - when: PrimaryController|bool - copy: - dest: "{{SlurmScriptsDir}}/EC2InstanceTypeInfo.py" - src: opt/slurm/cluster/bin/EC2InstanceTypeInfo.py - owner: root - group: root - mode: 0755 - - name: Create {{SlurmScriptsDir}}/requeue_node_jobs.py when: PrimaryController|bool copy: diff --git 
a/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/config/slurm_config.json b/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/config/slurm_config.json index d9d89ccf..5fb6ffea 100644 --- a/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/config/slurm_config.json +++ b/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/config/slurm_config.json @@ -1,10 +1,11 @@ { "AWS_DEFAULT_REGION": "{{Region}}", + "AZInfoFile": "{{SlurmConfigDir}}/AZInfo.json", "CloudWatchPeriod": "{{CloudWatchPeriod}}", "ClusterName": "{{ClusterName}}", "DOMAIN": "{{Domain}}", - "EC2_KEY_PAIR": "{{EC2_KEYPAIR}}", + "InstanceTypeInfoFile": "{{SlurmConfigDir}}/instance-type-info.json", "MaxStoppedDuration": "{{MaxStoppedDuration}}", "region": "{{Region}}", "SLURM_POWER_LOG": "/var/log/slurm/power_save.log", @@ -12,9 +13,8 @@ "SlurmLogsDir": "{{SlurmLogsDir}}", "SLURMNODE_PROFILE_ARN": "{{SlurmNodeProfileArn}}", "SLURMNODE_ROLE_NAME": "{{SlurmNodeRoleName}}", - "SLURMNODE_SECURITY_GROUP": "{{SlurmNodeSecurityGroup}}", - "SLURMNODE_SUBNET": "{{GridSubnet1}}", "SlurmScriptsDir": "{{SlurmScriptsDir}}", + "SlurmVersionFile": "{{SlurmConfigDir}}/SlurmVersion.json", "STACK_NAME": "{{STACK_NAME}}", "SuspendAction": "{{SuspendAction}}" } diff --git a/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/config/slurm_config.sh b/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/config/slurm_config.sh index a714fac4..f0f4ca9c 100644 --- a/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/config/slurm_config.sh +++ b/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/config/slurm_config.sh @@ -4,10 +4,7 @@ # Configuration variables used by slurm plugin export AWS_DEFAULT_REGION={{Region}} export DOMAIN={{Domain}} -export EC2_KEY_PAIR={{EC2_KEYPAIR}} export SLURM_POWER_LOG=/var/log/slurm/power_save.log export SLURMNODE_PROFILE_ARN="{{SlurmNodeProfileArn}}" export SLURMNODE_ROLE_NAME={{SlurmNodeRoleName}} -export SLURMNODE_SECURITY_GROUP={{SlurmNodeSecurityGroup}} -export SLURMNODE_SUBNET={{GridSubnet1}} export STACK_NAME={{STACK_NAME}} diff --git a/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/modules/modulefiles/slurm/.template b/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/modules/modulefiles/slurm/.template index bf290a3f..5435c427 100644 --- a/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/modules/modulefiles/slurm/.template +++ b/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/modules/modulefiles/slurm/.template @@ -40,6 +40,10 @@ if { [ module-info mode load ] || [ module-info mode display ] } { setenv SBATCH_TIMELIMIT "1:0:0" setenv SBATCH_TIMELIMIT_SET "" } + if { ! [ info exists ::env(SBATCH_PARTITION) ] } { + setenv SBATCH_PARTITION "{{DefaultPartition}}" + setenv SBATCH_PARTITION_SET "" + } } elseif { [ module-info mode remove ] } { if { [ info exists ::env(SBATCH_MEM_PER_NODE_SET) ] } { unsetenv SBATCH_MEM_PER_NODE @@ -53,6 +57,10 @@ if { [ module-info mode load ] || [ module-info mode display ] } { unsetenv SBATCH_TIMELIMIT unsetenv SBATCH_TIMELIMIT_SET } + if { [ info exists ::env(SBATCH_TIMELIMIT_SET) ] } { + unsetenv SBATCH_PARTITION + unsetenv SBATCH_PARTITION_SET + } } # srun defaults @@ -65,6 +73,10 @@ if { [ module-info mode load ] || [ module-info mode display ] } { setenv SLURM_MEM_PER_NODE 100M setenv SLURM_MEM_PER_NODE_SET "" } + if { ! 
[ info exists ::env(SLURM_PARTITION) ] } {
+ setenv SLURM_PARTITION "{{DefaultPartition}}"
+ setenv SLURM_PARTITION_SET ""
+ }
 if { ! [ info exists ::env(SLURM_TIMELIMIT) ] } {
 setenv SLURM_TIMELIMIT "1:0:0"
 setenv SLURM_TIMELIMIT_SET ""
@@ -78,6 +90,10 @@ if { [ module-info mode load ] || [ module-info mode display ] } {
 unsetenv SLURM_MEM_PER_NODE
 unsetenv SLURM_MEM_PER_NODE_SET
 }
+ if { [ info exists ::env(SLURM_PARTITION_SET) ] } {
+ unsetenv SLURM_PARTITION
+ unsetenv SLURM_PARTITION_SET
+ }
 if { [ info exists ::env(SLURM_TIMELIMIT_SET) ] } {
 unsetenv SLURM_TIMELIMIT
 unsetenv SLURM_TIMELIMIT_SET
@@ -114,7 +130,7 @@ if { [ module-info mode load ] || [ module-info mode display ] } {
 setenv SQUEUE_SORT_SET ""
 }
 if { ! ( [ info exists ::env(SQUEUE_FORMAT) ] || [ info exists ::env(SQUEUE_FORMAT2) ] ) } {
- setenv SQUEUE_FORMAT2 "Cluster:16 ,Partition:9 ,JobArrayId:16 ,Priority:12 ,State:11 ,UserName:8 ,Name:16 ,NumNodes:.5 ,NumCPUs:.4 ,MinMemory:.10 ,Feature:15 ,Dependency:10 ,Licenses:8 ,ReasonList:25"
+ setenv SQUEUE_FORMAT2 "Cluster:16 ,Partition:15 ,JobArrayId:16 ,Priority:12 ,State:11 ,UserName:8 ,Name:16 ,NumNodes:.5 ,NumCPUs:.4 ,MinMemory:.10 ,Feature:15 ,Dependency:10 ,Licenses:8 ,ReasonList:30"
 #
 # Time and priority information
 #setenv SQUEUE_FORMAT2 "JobId:.6 ,Partition:9 ,State:7 ,UserName:8 ,Name:16 ,SubmitTime:16 ,PendingTime:12 ,TimeLimit:18 ,EndTime:18 ,ReasonList"
diff --git a/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/test/job_stress.sh b/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/test/job_stress.sh
index 5c63e92b..2a2f4ed1 100755
--- a/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/test/job_stress.sh
+++ b/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/test/job_stress.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -x
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0 diff --git a/source/resources/playbooks/roles/SlurmNodeAmi/tasks/main.yml b/source/resources/playbooks/roles/SlurmNodeAmi/tasks/main.yml index 22baa7d4..22e6d2ab 100644 --- a/source/resources/playbooks/roles/SlurmNodeAmi/tasks/main.yml +++ b/source/resources/playbooks/roles/SlurmNodeAmi/tasks/main.yml @@ -4,7 +4,6 @@ - name: Install slurm_node packages yum: state: present - disablerepo: "{{yum_disablerepo|default(omit)}}" name: - emacs - hwloc-libs diff --git a/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml b/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml index 249ba183..03607661 100644 --- a/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml +++ b/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml @@ -5,6 +5,7 @@ debug: msg: | FileSystemDns: {{FileSystemDns}} + FileSystemIpAddress: {{FileSystemIpAddress}} FileSystemMountPath: {{FileSystemMountPath}} FileSystemMountSrc: {{FileSystemMountSrc}} FileSystemOptions: {{FileSystemOptions}} diff --git a/source/resources/playbooks/roles/unmount_slurm_fs/tasks/main.yml b/source/resources/playbooks/roles/unmount_slurm_fs/tasks/main.yml index 75743249..2e47fb00 100644 --- a/source/resources/playbooks/roles/unmount_slurm_fs/tasks/main.yml +++ b/source/resources/playbooks/roles/unmount_slurm_fs/tasks/main.yml @@ -5,6 +5,7 @@ debug: msg: | FileSystemDns: {{FileSystemDns}} + FileSystemIpAddress: {{FileSystemIpAddress}} FileSystemMountPath: {{FileSystemMountPath}} FileSystemMountSrc: {{FileSystemMountSrc}} FileSystemOptions: {{FileSystemOptions}} diff --git a/source/resources/user_data/WaitForAmi.py b/source/resources/user_data/WaitForAmi.py index b53997e0..5c649209 100644 --- a/source/resources/user_data/WaitForAmi.py +++ b/source/resources/user_data/WaitForAmi.py @@ -22,9 +22,9 @@ ''' import argparse import boto3 -import json import logging from logging import handlers +from os import environ from time import sleep logger = logging.getLogger(__file__) @@ -38,9 +38,10 @@ def main(): try: parser = argparse.ArgumentParser("Wait for AMI to be available.") - parser.add_argument('--ami-id', required=True, default=False, help="AMI Id to wait for") - parser.add_argument('--ssm-parameter', required=True, default=False, help="SSM Parameter to store the ami id in.") - parser.add_argument('--instance-id', required=True, default=False, help="Instance ID that created the AMI.") + parser.add_argument('--ami-id', required=True, help="AMI Id to wait for") + parser.add_argument('--base-ssm-parameter', required=True, help="SSM Parameter to store the ami id in.") + parser.add_argument('--instance-id', required=True, help="Instance ID that created the AMI.") + parser.add_argument('--compute-regions', required=True, help="Comma separated list of compute regions") parser.add_argument('--debug', '-d', action='count', default=False, help="Enable debug messages") args = parser.parse_args() @@ -52,22 +53,56 @@ def main(): logger.debug(f"Debugging level {args.debug}") logger.info(f"ami-id: {args.ami_id}") - logger.info(f"ssm-parameter: {args.ssm_parameter}") + logger.info(f"base-ssm-parameter: {args.base_ssm_parameter}") logger.info(f"instance-id: {args.instance_id}") ec2_client = boto3.client('ec2') logger.info(f"Waiting for {args.ami_id} to be available.") while True: - state = ec2_client.describe_images(ImageIds=[args.ami_id])['Images'][0]['State'] + ami_info = ec2_client.describe_images(ImageIds=[args.ami_id])['Images'][0] + state = ami_info['State'] + ami_name = ami_info['Name'] 
logger.info(f"state={state}") if state == 'available': break sleep(60) - logger.info(f"Writing {args.ami_id} to {args.ssm_parameter}") + ssm_parameter = f"{args.base_ssm_parameter}/{environ['AWS_DEFAULT_REGION']}" + logger.info(f"Writing {args.ami_id} to {ssm_parameter}") ssm_client = boto3.client('ssm') - ssm_client.put_parameter(Name=args.ssm_parameter, Type='String', Value=args.ami_id, Overwrite=True) + ssm_client.put_parameter(Name=ssm_parameter, Type='String', Value=args.ami_id, Overwrite=True) + + # Copy AMI to remote regions + main_region = environ['AWS_DEFAULT_REGION'] + compute_regions = args.compute_regions.split(',') + remote_ami_ids = {} + for region in compute_regions: + if region == main_region: + continue + logger.info(f"Copying {args.ami_id} to {region}") + ec2_client = boto3.client('ec2', region_name=region) + remote_ami_ids[region] = ec2_client.copy_image( + Name = f"{ami_name}", + Encrypted = True, + SourceImageId = args.ami_id, + SourceRegion = main_region + )['ImageId'] + logger.info(f"Created {remote_ami_ids[region]} in {region}") + for region, remote_ami_id in remote_ami_ids.items(): + logger.info(f"Waiting for {remote_ami_id} to be available in {region}.") + ec2_client = boto3.client('ec2', region_name=region) + while True: + state = ec2_client.describe_images(ImageIds=[remote_ami_id])['Images'][0]['State'] + logger.info(f"state={state}") + if state == 'available': + break + sleep(60) + ssm_parameter = f"{args.base_ssm_parameter}/{region}" + logger.info(f"Writing {remote_ami_id} to {ssm_parameter}") + ssm_client.put_parameter(Name=ssm_parameter, Type='String', Value=remote_ami_id, Overwrite=True) + logger.info(f"Stopping {args.instance_id}") + ec2_client = boto3.client('ec2') ec2_client.stop_instances(InstanceIds=[args.instance_id]) except Exception as e: logger.exception(str(e)) diff --git a/source/resources/user_data/slurm_node_ami_config.sh b/source/resources/user_data/slurm_node_ami_config.sh index 4ffa6ff2..d2a1ee55 100644 --- a/source/resources/user_data/slurm_node_ami_config.sh +++ b/source/resources/user_data/slurm_node_ami_config.sh @@ -27,7 +27,7 @@ if [ -e /var/lib/cloud/instance/sem/ami.txt ]; then ami=$(cat /var/lib/cloud/instance/sem/ami.txt) echo "First reboot after ami ($ami) created." chmod +x /root/WaitForAmi.py - /root/WaitForAmi.py --ami-id $ami --ssm-parameter $SlurmNodeAmiSsmParameter --instance-id $instance_id + /root/WaitForAmi.py --ami-id $ami --base-ssm-parameter $SlurmNodeAmiSsmParameterBaseName --instance-id $instance_id --compute-regions $ComputeRegions # Delete the semaphore so that if the instance reboots because of template changes then a new AMI will be created mv /var/lib/cloud/instance/sem/ami.txt /var/lib/cloud/instance/sem/$ami.txt exit 0 diff --git a/source/slurm_installer/installer.py b/source/slurm_installer/installer.py index 2a0048de..0e81fae5 100755 --- a/source/slurm_installer/installer.py +++ b/source/slurm_installer/installer.py @@ -221,6 +221,14 @@ def main(self): self.install_parameters[config_key] = self.config[config_key] logger.info(f"{config_key:30}: {self.install_parameters[config_key]}") + # Get the CIDR block for the VPC. 
Used in multi-region deployments + config_key = 'CIDR' + if config_key not in self.config: + cidr = ec2.describe_vpcs(VpcIds=[self.config['VpcId']])['Vpcs'][0]['CidrBlock'] + self.config[config_key] = cidr + self.install_parameters[config_key] = cidr + logger.info(f"{config_key:30}: {self.install_parameters[config_key]}") + # Optional config_key = 'SubnetId' if config_key in self.config or args.SubnetId or args.prompt: diff --git a/tests/test_slurm_minimal.py b/tests/test_slurm_minimal.py new file mode 100644 index 00000000..d017215d --- /dev/null +++ b/tests/test_slurm_minimal.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 + +#import filecmp +from os import path, system +from os.path import abspath, dirname +import pytest +import subprocess +from subprocess import CalledProcessError, check_output + + +REPO_DIR = abspath(f"{dirname(__file__)}/..") + +def test_slurm_minimal(): + try: + output = check_output([f"{REPO_DIR}/install.sh", '--cdk-cmd', 'create', '--region', 'us-east-1'], stderr=subprocess.STDOUT, encoding='utf8') + except CalledProcessError as e: + print(f"returncode: {e.returncode}") + print(f"output:\n{e.stdout}") + raise From eb5df97db91344b332570b7aa85ae64cc24e67d6 Mon Sep 17 00:00:00 2001 From: Allan Carter Date: Fri, 8 Jul 2022 18:25:11 +0000 Subject: [PATCH 2/2] Review fixes --- .gitignore | 7 - docs/multi-region.md | 4 + source/app.py | 4 - source/cdk/cdk_slurm_stack.py | 21 +- source/cdk/config_schema.py | 53 +++- source/resources/config/default_config.yml | 7 - .../config/slurm_all_instance_types.yml | 23 -- source/resources/config/slurm_all_os.yml | 6 - source/resources/config/slurm_alma_linux.yml | 6 - source/resources/config/slurm_eda.yml | 48 ---- source/resources/config/slurm_eda_az1.yml | 48 ---- source/resources/config/slurm_eda_az2.yml | 47 ---- source/resources/config/slurm_eda_az3.yml | 47 ---- .../resources/config/slurm_elasticsearch.yml | 7 - source/resources/config/slurm_fpga_dev.yml | 6 - source/resources/config/slurm_lustre.yml | 6 - source/resources/config/slurm_multi_az.yml | 256 ++++-------------- source/resources/config/slurm_ontap.yml | 7 - source/resources/config/slurm_rocky_linux.yml | 6 - source/resources/config/slurm_zfs.yml | 7 - .../resources/lambdas/UpdateDns/UpdateDns.py | 75 ----- .../lambdas/UpdateDns/cfnresponse.py | 1 - .../EC2InstanceTypeInfo.py | 4 +- .../opt/slurm/cluster/bin/SlurmPlugin.py | 84 +----- .../modules/modulefiles/slurm/.template | 4 +- .../roles/mount_slurm_fs/tasks/main.yml | 1 - .../roles/unmount_slurm_fs/tasks/main.yml | 1 - source/resources/user_data/WaitForAmi.py | 7 +- .../user_data/slurm_node_ami_config.sh | 12 +- 29 files changed, 132 insertions(+), 673 deletions(-) delete mode 100644 source/resources/lambdas/UpdateDns/UpdateDns.py delete mode 120000 source/resources/lambdas/UpdateDns/cfnresponse.py diff --git a/.gitignore b/.gitignore index deb1375f..e474b001 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,4 @@ -.mkdocs_venv/ -site/ -.vscode/ - -# Jekyll -Gemfile.lock -.jekyll-cache .mkdocs_venv/ _site site/ diff --git a/docs/multi-region.md b/docs/multi-region.md index 1f26ad5b..294394d0 100644 --- a/docs/multi-region.md +++ b/docs/multi-region.md @@ -289,3 +289,7 @@ slurm: type: nfs4 options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport ``` + +## Deployment + +After the configuration is complete then deployment is the same as document on the [Deploy the Cluster](deploy.md) page. 
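The CIDR auto-discovery added to the installer above comes down to a single `describe_vpcs` call against the configured VPC. A minimal standalone sketch of that lookup (the helper name and example IDs are illustrative, not taken from the patch):

```python
import boto3

def get_vpc_cidr(vpc_id: str, region: str) -> str:
    """Return the primary CIDR block of a VPC, mirroring the installer's lookup."""
    ec2 = boto3.client('ec2', region_name=region)
    vpc = ec2.describe_vpcs(VpcIds=[vpc_id])['Vpcs'][0]
    return vpc['CidrBlock']

# Example with hypothetical IDs:
# get_vpc_cidr('vpc-0123456789abcdef0', 'us-east-1')  # e.g. '10.0.0.0/16'
```

Each region in `InstanceConfig/Regions` needs its CIDR so the stack can open the NFS, ZFS, and Lustre security-group rules to remote compute nodes, which is why the installer records it when it is not set explicitly in the config.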
diff --git a/source/app.py b/source/app.py index 3878400c..ba73f869 100644 --- a/source/app.py +++ b/source/app.py @@ -23,10 +23,6 @@ app = App() -# TODO: Create a stack for each additional region to create resources needed to create instances in those regions. -# * Instance profile -# * Security group - cdk_env = Environment( account = app.node.try_get_context('account_id'), region = app.node.try_get_context('region') diff --git a/source/cdk/cdk_slurm_stack.py b/source/cdk/cdk_slurm_stack.py index d6141594..453feacb 100644 --- a/source/cdk/cdk_slurm_stack.py +++ b/source/cdk/cdk_slurm_stack.py @@ -382,7 +382,9 @@ def check_config(self): exit(1) if not self.config['slurm']['InstanceConfig']['Regions']: - default_region = { + self.config['slurm']['InstanceConfig']['Regions'] = {} + self.config['slurm']['InstanceConfig']['Regions'][self.config['Region']] = { + 'VpcId': self.config['VpcId'], 'CIDR': self.config['CIDR'], 'SshKeyPair': self.config['SshKeyPair'], 'AZs': [ @@ -392,7 +394,6 @@ def check_config(self): } ] } - self.config['slurm']['InstanceConfig']['Regions'][self.config['Region']] = default_region self.compute_regions = {} self.remote_compute_regions = {} @@ -648,7 +649,7 @@ def create_security_groups(self): Tags.of(self.zfs_sg).add("Name", f"{self.stack_name}-ZfsSG") self.suppress_cfn_nag(self.zfs_sg, 'W29', 'Egress port range used to block all egress') - # Compute nodes may use lustre file systems to create a security group with the required ports. + # Compute nodes may use lustre file systems so create a security group with the required ports. self.lustre_sg = ec2.SecurityGroup(self, "LustreSG", vpc=self.vpc, allow_all_outbound=False, description="Lustre Security Group") Tags.of(self.lustre_sg).add("Name", f"{self.stack_name}-LustreSG") self.suppress_cfn_nag(self.lustre_sg, 'W29', 'Egress port range used to block all egress') @@ -735,6 +736,7 @@ def create_security_groups(self): fs_client_sg.connections.allow_to(self.nfs_sg, ec2.Port.tcp(2049), f"{fs_client_sg_name} to Nfs") if self.onprem_cidr: self.nfs_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp(2049), 'OnPremNodes to Nfs') + # Allow compute nodes in remote regions access to NFS for compute_region, compute_region_cidr in self.remote_compute_regions.items(): self.nfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(2049), f"{compute_region} to Nfs") @@ -759,6 +761,7 @@ def create_security_groups(self): self.zfs_sg.connections.allow_from(self.onprem_cidr, ec2.Port.udp_range(20001, 20003), 'OnPremNodes to Zfs') self.suppress_cfn_nag(self.zfs_sg, 'W27', 'Correct, restricted range for zfs: 20001-20003') self.suppress_cfn_nag(self.zfs_sg, 'W29', 'Correct, restricted range for zfs: 20001-20003') + # Allow compute nodes in remote regions access to ZFS for compute_region, compute_region_cidr in self.remote_compute_regions.items(): self.zfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(111), f"{compute_region} to Zfs") self.zfs_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.udp(111), f"{compute_region} to Zfs") @@ -785,6 +788,7 @@ def create_security_groups(self): self.lustre_sg.connections.allow_from(self.onprem_cidr, ec2.Port.tcp_range(1021, 1023), 'OnPremNodes to Lustre') self.lustre_sg.connections.allow_to(self.onprem_cidr, ec2.Port.tcp(988), f"Lustre to OnPremNodes") self.lustre_sg.connections.allow_to(self.onprem_cidr, ec2.Port.tcp_range(1021, 1023), f"Lustre to OnPremNodes") + # Allow compute nodes in remote regions access to Lustre for 
compute_region, compute_region_cidr in self.remote_compute_regions.items(): self.lustre_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp(988), f"{compute_region} to Lustre") self.lustre_sg.connections.allow_from(ec2.Peer.ipv4(compute_region_cidr), ec2.Port.tcp_range(1021, 1023), f"{compute_region} to Lustre") @@ -988,6 +992,8 @@ def create_elasticsearch(self): self.config['slurm']['JobCompLoc'] = f"http://{domain_endpoint}/slurm/_doc" def create_file_system(self): + self.slurmfs_fqdn = f"slurmfs.{self.config['Domain']}" + if 'kms_key_arn' in self.config['slurm']['storage']: kms_key = kms.Key.from_key_arn(self.config['slurm']['storage']['kms_key_arn']) else: @@ -1057,7 +1063,7 @@ def create_file_system(self): self.file_system_mount_name = "" - self.file_system_mount_source = f"{self.file_system_ip_address}:/" + self.file_system_mount_source = f"{self.slurmfs_fqdn}:/" if self.config['slurm']['storage']['efs']['use_efs_helper']: self.file_system_type = 'efs' @@ -1155,7 +1161,7 @@ def create_file_system(self): self.file_system_mount_name = "" - self.file_system_mount_source = f"{self.file_system_ip_address}:/slurm" + self.file_system_mount_source = f"{self.slurmfs_fqdn}:/slurm" self.file_system_options = 'nfsvers=4.1' @@ -1237,7 +1243,7 @@ def create_file_system(self): self.file_system_mount_name = "" - self.file_system_mount_source = f"{self.file_system_ip_address}:/fsx/slurm" + self.file_system_mount_source = f"{self.slurmfs_fqdn}:/fsx/slurm" self.file_system_options = 'nfsvers=4.1' @@ -1255,7 +1261,6 @@ def create_file_system(self): record_name = 'slurmfs', target = route53.RecordTarget.from_ip_addresses(self.file_system_ip_address) ) - CfnOutput(self, "FileSystemProvider", value = self.config['slurm']['storage']['provider'] ) @@ -1725,7 +1730,6 @@ def get_instance_template_vars(self, instance_role): "ERROR_SNS_TOPIC_ARN": self.config['ErrorSnsTopicArn'], "ExtraMounts": self.config['slurm']['storage']['ExtraMounts'], "FileSystemDns": self.file_system_dns, - "FileSystemIpAddress": self.file_system_ip_address, "FileSystemMountPath": self.config['slurm']['storage']['mount_path'], "FileSystemMountSrc": self.file_system_mount_source, "FileSystemOptions": self.file_system_options, @@ -1749,7 +1753,6 @@ def get_instance_template_vars(self, instance_role): else: instance_template_vars["AccountingStorageHost"] = '' instance_template_vars["CloudWatchPeriod"] = self.config['slurm']['SlurmCtl']['CloudWatchPeriod'] - instance_template_vars["CloudWatchPeriod"] = self.config['slurm']['SlurmCtl']['CloudWatchPeriod'] instance_template_vars["DefaultPartition"] = self.default_partition if 'Federation' in self.config['slurm']: instance_template_vars["Federation"] = self.config['slurm']['Federation']['Name'] diff --git a/source/cdk/config_schema.py b/source/cdk/config_schema.py index f92b6ae2..0963ae6d 100644 --- a/source/cdk/config_schema.py +++ b/source/cdk/config_schema.py @@ -45,17 +45,18 @@ 'AFTER_90_DAYS' ] -eda_instance_families = [ +default_eda_instance_families = [ #'c5', # Mixed depending on size - 'c5a', # AMD EPYC 7R32 3.3 GHz + #'c5a', # AMD EPYC 7R32 3.3 GHz #'c5ad', # AMD EPYC 7R32 3.3 GHz + 'c6a', 'c6i', # Intel Xeon 8375C (Ice Lake) 3.5 GHz 'c6g', # AWS Graviton2 Processor 2.5 GHz #'c6gd', # AWS Graviton2 Processor 2.5 GHz #'f1', # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz 'm5', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz #'m5d', # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - 'm5a', # AMD EPYC 7571 2.5 GHz + #'m5a', # AMD EPYC 7571 2.5 GHz #'m5ad', # AMD EPYC 7571 2.5 
GHz 'm5zn', # Intel Xeon Platinum 8252 4.5 GHz 'm6a', # AMD EPYC 7R13 Processor 3.6 GHz @@ -82,13 +83,37 @@ #'u-12tb1', # Intel Xeon Scalable (Skylake) 12 TB ] -eda_instance_types = [ +default_eda_instance_types = [ #'c5\.(l|x|2|4|9|18).*', # Intel Xeon Platinum 8124M 3.4 GHz - 'c5\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz + #'c5\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz #'c5d\.(l|x|2|4|9|18).*', # Intel Xeon Platinum 8124M 3.4 GHz #'c5d\.(12|24).*', # Intel Xeon Platinum 8275L 3.6 GHz ] +default_excluded_instance_families = [ + 'a1', # Graviton 1 + 'c4', # Replaced by c5 + 'd2', # SSD optimized + 'g3', # Replaced by g4 + 'g3s', # Replaced by g4 + 'h1', # SSD optimized + 'i3', # SSD optimized + 'i3en', # SSD optimized + 'm4', # Replaced by m5 + 'p2', # Replaced by p3 + 'p3', + 'p3dn', + 'r4', # Replaced by r5 + 't2', # Replaced by t3 + 'x1', + 'x1e', +] + +default_excluded_instance_types = [ + '.+\.(micro|nano)', # Not enough memory + '.*\.metal' +] + # The config file is used in the installer and the CDK app. # Some configuration values are required in the CDK app but are optional so that they can be set by the installer. config_schema = Schema( @@ -252,22 +277,22 @@ }, # Include*/Exclude*: # Instance families and types are regular expressions with implicit '^' and '$' at the begining and end. - # Exclude patterns are processed first and take precedence over any includes. + # Exclude patterns are processed first and take precesdence over any includes. # An empty list is the same as '.*'. - 'Include': { + Optional('Exclude', default={'InstanceFamilies': default_excluded_instance_families, 'InstanceTypes': default_excluded_instance_types}): { + Optional('InstanceFamilies', default=default_excluded_instance_families): [str], + Optional('InstanceTypes', default=default_excluded_instance_types): [str] + }, + Optional('Include', default={'MaxSizeOnly': False, 'InstanceFamilies': default_eda_instance_families, 'InstanceTypes': default_eda_instance_types}): { # MaxSizeOnly: # If MaxSizeOnly is True then only the largest instance type in # a family will be included unless specific instance types are included. 
# Default: false Optional('MaxSizeOnly', default=False): bool, - 'InstanceFamilies': [str], - 'InstanceTypes': [str] - }, - Optional('Exclude', default={'InstanceFamilies': [], 'InstanceTypes': []}): { - 'InstanceFamilies': [str], - 'InstanceTypes': [str] + Optional('InstanceFamilies', default=default_eda_instance_families): [str], + Optional('InstanceTypes', default=default_eda_instance_types): [str] }, - Optional('Regions', default=[]): { + Optional('Regions', default={}): { str: { 'VpcId': And(str, lambda s: re.match('vpc-', s)), 'CIDR': str, diff --git a/source/resources/config/default_config.yml b/source/resources/config/default_config.yml index ade26c30..1a2f8018 100644 --- a/source/resources/config/default_config.yml +++ b/source/resources/config/default_config.yml @@ -18,23 +18,16 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: AlmaLinux_8_arm64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.+\.(micro|nano)' # Not enough memory - - '.*\.metal' storage: provider: zfs diff --git a/source/resources/config/slurm_all_instance_types.yml b/source/resources/config/slurm_all_instance_types.yml index 58c52f35..5e88f29c 100644 --- a/source/resources/config/slurm_all_instance_types.yml +++ b/source/resources/config/slurm_all_instance_types.yml @@ -14,35 +14,12 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} Include: - MaxSizeOnly: false InstanceFamilies: [] InstanceTypes: [] - Exclude: - InstanceFamilies: - - a1 # Graviton 1 - - c4 # Replaced by c5 - - d2 # SSD optimized - - g3 # Replaced by g4 - - g3s # Replaced by g4 - - h1 # SSD optimized - - i3 # SSD optimized - - i3en # SSD optimized - - m4 # Replaced by m5 - - p2 # Replaced by p3 - - p3 - - p3dn - - r4 # Replaced by r5 - - t2 # Replaced by t3 - - u - - x1 - - x1e - InstanceTypes: - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_all_os.yml b/source/resources/config/slurm_all_os.yml index 1856d158..6a08d427 100644 --- a/source/resources/config/slurm_all_os.yml +++ b/source/resources/config/slurm_all_os.yml @@ -16,7 +16,6 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} @@ -28,15 +27,10 @@ slurm: 8: [x86_64, arm64] Rocky: {8: [x86_64, arm64]} Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_alma_linux.yml b/source/resources/config/slurm_alma_linux.yml index 9d6e2210..d2e1ac5f 100644 --- a/source/resources/config/slurm_alma_linux.yml +++ b/source/resources/config/slurm_alma_linux.yml @@ -15,20 +15,14 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: AlmaLinux_8_x86_64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} Include: - 
MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_eda.yml b/source/resources/config/slurm_eda.yml index 5340da18..d45f9c52 100644 --- a/source/resources/config/slurm_eda.yml +++ b/source/resources/config/slurm_eda.yml @@ -24,59 +24,11 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 5 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] - Include: - MaxSizeOnly: false - InstanceFamilies: - #- 'c5' # Mixed depending on size - #- 'c5a' # AMD EPYC 7R32 3.3 GHz - #- 'c5ad' # AMD EPYC 7R32 3.3 GHz - - 'c6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'c6g' # AWS Graviton2 Processor 2.5 GHz - #- 'c6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - #- 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5a' # AMD EPYC 7571 2.5 GHz - #- 'm5ad' # AMD EPYC 7571 2.5 GHz - - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - - 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'm6g' # AWS Graviton2 Processor 2.5 GHz - #- 'm6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz - #- 'r5a' # AMD EPYC 7571 2.5 GHz - #- 'r5ad' # AMD EPYC 7571 2.5 GHz - - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - - 'r6g' # AWS Graviton2 Processor 2.5 GHz - #- 'r6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'x1' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB - #- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - - 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB - - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB - - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB - - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz - #- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB - #- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB - #- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB - InstanceTypes: [] - #- 'c5\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - #- 'c5d\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5d\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda_az1.yml b/source/resources/config/slurm_eda_az1.yml index 33b1f4fe..5ec9f19d 100644 --- a/source/resources/config/slurm_eda_az1.yml +++ b/source/resources/config/slurm_eda_az1.yml @@ -36,59 +36,11 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 5 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] - Include: - MaxSizeOnly: false - InstanceFamilies: - #- 'c5' # Mixed depending on size - #- 'c5a' # AMD EPYC 7R32 3.3 GHz - #- 'c5ad' # AMD EPYC 7R32 3.3 GHz - - 'c6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - 
- 'c6g' # AWS Graviton2 Processor 2.5 GHz - #- 'c6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - #- 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5a' # AMD EPYC 7571 2.5 GHz - #- 'm5ad' # AMD EPYC 7571 2.5 GHz - - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - - 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'm6g' # AWS Graviton2 Processor 2.5 GHz - #- 'm6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz - #- 'r5a' # AMD EPYC 7571 2.5 GHz - #- 'r5ad' # AMD EPYC 7571 2.5 GHz - - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - - 'r6g' # AWS Graviton2 Processor 2.5 GHz - #- 'r6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'x1' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB - #- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - - 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB - - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB - - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB - - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz - #- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB - #- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB - #- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB - InstanceTypes: [] - #- 'c5\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - #- 'c5d\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5d\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda_az2.yml b/source/resources/config/slurm_eda_az2.yml index 4f888085..09fd2fb4 100644 --- a/source/resources/config/slurm_eda_az2.yml +++ b/source/resources/config/slurm_eda_az2.yml @@ -37,58 +37,11 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 5 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] - Include: - MaxSizeOnly: false - InstanceFamilies: - #- 'c5' # Mixed depending on size - - 'c5a' # AMD EPYC 7R32 3.3 GHz - #- 'c5ad' # AMD EPYC 7R32 3.3 GHz - - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'c6g' # AWS Graviton2 Processor 2.5 GHz - #- 'c6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - - 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - - 'm5a' # AMD EPYC 7571 2.5 GHz - #- 'm5ad' # AMD EPYC 7571 2.5 GHz - - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - - 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'm6g' # AWS Graviton2 Processor 2.5 GHz - #- 'm6gd' # AWS Graviton2 Processor 2.5 GHz - - 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - - 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz - - 'r5a' # AMD EPYC 7571 2.5 GHz - - 'r5ad' # AMD EPYC 7571 2.5 GHz - - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - - 'r6g' # AWS Graviton2 Processor 2.5 GHz - #- 'r6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'x1' # High Frequency 
Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB - #- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - - 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB - - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB - - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB - - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz - #- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB - #- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB - #- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB - InstanceTypes: - #- 'c5\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - - 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - #- 'c5d\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5d\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema storage: {'zfs': {}} diff --git a/source/resources/config/slurm_eda_az3.yml b/source/resources/config/slurm_eda_az3.yml index ca0d2a3b..b66421d6 100644 --- a/source/resources/config/slurm_eda_az3.yml +++ b/source/resources/config/slurm_eda_az3.yml @@ -38,58 +38,11 @@ slurm: # A partition will be created for each combination of Base OS, Architecture, and Spot InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 5 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] - Include: - MaxSizeOnly: false - InstanceFamilies: - #- 'c5' # Mixed depending on size - - 'c5a' # AMD EPYC 7R32 3.3 GHz - #- 'c5ad' # AMD EPYC 7R32 3.3 GHz - - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'c6g' # AWS Graviton2 Processor 2.5 GHz - #- 'c6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'f1' # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz - - 'm5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'm5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - - 'm5a' # AMD EPYC 7571 2.5 GHz - #- 'm5ad' # AMD EPYC 7571 2.5 GHz - - 'm5zn' # Intel Xeon Platinum 8252 4.5 GHz - - 'm6a' # AMD EPYC 7R13 Processor 3.6 GHz - - 'm6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz - - 'm6g' # AWS Graviton2 Processor 2.5 GHz - #- 'm6gd' # AWS Graviton2 Processor 2.5 GHz - - 'r5' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - - 'r5d' # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz - #- 'r5b' # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz - - 'r5a' # AMD EPYC 7571 2.5 GHz - - 'r5ad' # AMD EPYC 7571 2.5 GHz - - 'r6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB - - 'r6g' # AWS Graviton2 Processor 2.5 GHz - #- 'r6gd' # AWS Graviton2 Processor 2.5 GHz - #- 'x1' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB - #- 'x1e' # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB - - 'x2gd' # AWS Graviton2 Processor 2.5 GHz 1TB - - 'x2idn' # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB - - 'x2iedn' # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB - - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz - #- 'u-6tb1' # Intel Xeon Scalable (Skylake) 6 TB - #- 'u-9tb1' # Intel Xeon Scalable (Skylake) 9 TB - #- 'u-12tb1' # Intel Xeon Scalable (Skylake) 12 TB - InstanceTypes: - #- 'c5\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - - 'c5\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - #- 'c5d\.(l|x|2|4|9|18).*' # Intel Xeon Platinum 8124M 3.4 GHz - #- 'c5d\.(12|24).*' # Intel Xeon Platinum 8275L 3.6 GHz - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema storage: {'zfs': {}} diff --git a/source/resources/config/slurm_elasticsearch.yml 
b/source/resources/config/slurm_elasticsearch.yml index 2d8a0746..95fd7c8e 100644 --- a/source/resources/config/slurm_elasticsearch.yml +++ b/source/resources/config/slurm_elasticsearch.yml @@ -34,23 +34,16 @@ slurm: InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.+\.(micro|nano)' # Not enough memory - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_fpga_dev.yml b/source/resources/config/slurm_fpga_dev.yml index a0407440..05922465 100644 --- a/source/resources/config/slurm_fpga_dev.yml +++ b/source/resources/config/slurm_fpga_dev.yml @@ -20,7 +20,6 @@ slurm: InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 5 BaseOsArchitecture: Amazon: @@ -28,7 +27,6 @@ slurm: CentOS: 7: [x86_64] Include: - MaxSizeOnly: false InstanceFamilies: - 'c6a' # AMD EPYC 7R13 Processor 3.6 GHz - 'c6i' # Intel Xeon 8375C (Ice Lake) 3.5 GHz @@ -42,10 +40,6 @@ slurm: - 'x2iezn' # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB - 'z1d' # Intel Xeon Platinum 8151 4.0 GHz InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_lustre.yml b/source/resources/config/slurm_lustre.yml index 6ea6fd41..fa5a0d6e 100644 --- a/source/resources/config/slurm_lustre.yml +++ b/source/resources/config/slurm_lustre.yml @@ -19,20 +19,14 @@ slurm: InstanceConfig: UseSpot: true - DefaultPartition: CentOS_7_x86_64_spot NodesPerInstanceType: 5 BaseOsArchitecture: CentOS: 7: [x86_64] Include: - MaxSizeOnly: false InstanceFamilies: - t3 InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_multi_az.yml b/source/resources/config/slurm_multi_az.yml index dc42d493..72a3e62d 100644 --- a/source/resources/config/slurm_multi_az.yml +++ b/source/resources/config/slurm_multi_az.yml @@ -1,132 +1,30 @@ --- -# Sample configuraton that creates a minimal Slurm cluster -# Shows all available configuration options -# Note that CentOS 8 has been discontinued and support has been removed. -# Uses arm64 architecture for SlurmCtl and SlurmDbd by default. -# No SlurmDbd in this configuration. +# Multi-region Slurm cluster with Netapp Ontap -termination_protection: True # Enable (recommended) or Disable Cloudformation Stack termination protection +StackName: slurmmultiaz -#==================================================================== -# Parameters that must be in the config file or on the command line. -# Command line values override values in the config file. -#==================================================================== -StackName: slurmminimal #Region: us-east-1 + #SshKeyPair: name of your ec2 keypair + #VpcId: vpc-xxxxxxxxxxxxxxxxx -# SubnetId: -# Optional. If not specified then the first private subnet is chosen. 
#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 -#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 -#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 + +#HostedZoneId: XXXXXXXXXXXXXXXXXXX # This is optional, but highly recommended #ErrorSnsTopicArn: arn:aws:sns:{{region}}:{AccountId}:{TopicName} -#==================================================================== -# Required Parameters -#==================================================================== - -# Domain: Optional -# Domain name for the Route 53 private hosted zone that will be used -# by the slurm cluster for DNS. -# By default will be {StackName}.local -# Alternately, provide HostedZoneId of an existing Route53 hosted zone to use. -# Cannot specify both Domain and HostedZoneId. -# Domain: "{{StackName}}.local" - -# HostedZoneId: Optional -# ID of an existing hosted zone that will be used by the slurm cluster for DNS. -# Alternately, provide Domain name to use for a new Route53 hosted zone to use. -# Cannot specify both Domain and HostedZoneId. -# HostedZoneId: - -TimeZone: 'US/Central' +#TimeZone: 'US/Central' slurm: - # High level configuration - - SlurmVersion: "21.08.5" - - # ClusterName: - # Optional - # Must be unique if multiple clusters deployed in the same VPC. - # Default: StackName - # ClusterName: slurm - - # MungeKeySsmParameter - # SSM String Parameter with a base64 encoded munge key to use for the cluster. - # Use this if your submitters need to use more than 1 cluster. - #MungeKeySsmParameter: "/slurm/munge_key" + MungeKeySsmParameter: "/slurm/munge_key" SlurmCtl: - # For high availability configure multiple controllers - NumberOfControllers: 1 - # The index will be appended to BaseHostname starting with 1. - BaseHostname: slurmctl - - # architecture: x86_64 or arm64 - #architecture: x86_64 - #instance_type: "c5.large" - architecture: arm64 - instance_type: "c6g.large" - volume_size: 200 # Size of the EBS root disk + NumberOfControllers: 2 - # SuspendAction - # Set to stop or terminate. - # Stopped nodes will restart quicker, but you will continue to be charged for the EBS volumes - # attached to the instance. - SuspendAction: stop - # - # MaxStoppedDuration - # In ISO 8601 duration format: https://en.wikipedia.org/wiki/ISO_8601#Durations - # Default: 1 hour = P0Y0M0DT1H0M0S - # Evaluated at least hourly - MaxStoppedDuration: P0Y0M0DT1H0M0S - - CloudWatchPeriod: 5 # Cloudwatch metric collection period in minutes. Default value is 5. Set to 1 for finer resolution. - # Also used in the dashboard widgets. - - # The accounting database is required to enable fairshare scheduling - # It is managed by the Slurm Database Daemon (slurmdbd) instance - # This instance can be created as part of the cluster or can use an existing instance. - # SlurmDbd: - # # It is recommended to get the basic cluster configured and working before enabling the accounting database - # UseSlurmDbd: False - - # # Hostname: - # # Hostname of the slurmdbd instance if CreateSlurmdbd is true. - # Hostname: slurmdbd - - # # architecture: x86_64 or arm64 - # #architecture: x86_64 - # #instance_type: "m5.large" - # architecture: arm64 - # instance_type: "m6g.large" - # volume_size: 200 # Size of the EBS root disk - - # database: - # port: 3306 - - # Federation: - # Name: slurmeda - # SlurmCtlSecurityGroups: - # SecurityGroupName: sg-xxxxxxxxxxxxxxxxx - - SlurmNodeAmis: - instance_type: - x86_64: m5.large - arm64: m6g.large - - # Customized AMIs with file system mounts, packages, etc. configured. 
- # If these aren't defined then the generic base AMIs are used. - # Example in the comment below is the AWS FPGA Developer AMI - #BaseAmis: - # us-east-1: - # Amazon: {2: {x86_64: {ImageId: ami-0efdec76678df9a64, RootDeviceSize: '+5'}}} - # CentOS: {7: {x86_64: {ImageId: ami-02155c6289e76719a, RootDeviceSize: 90}}} + SlurmDbd: {} # External security groups that should be able to use the cluster # SubmitterSecurityGroupIds: @@ -135,117 +33,59 @@ slurm: # SubmitterInstanceTags: # 'soca:ClusterId': ['soca-xyz'] - # InstanceConfig: - # Configure the instances used by the cluster - # A partition will be created for each combination of Base OS, Architecture, and Spot - # - # UseSpot: - # Create both on-demand and spot nodes - # Default: true - # DefaultPartition: - # By default this will be the first OS/Architecture listed in BaseOsArchitecture. - # Add '_spot' to the end to make spot the default purchase option. - # NodesPerInstanceType: - # The number of nodes that will be defined for each instance type. - # Include*/Exclude*: - # Instance families and types are regular expressions with implicit '^' and '$' at the begining and end. - # Exclude patterns are processed first and take precedence over any includes. - # A empty list is the same as '.*'. - # MaxSizeOnly: If MaxSizeOnly is True then only the largest instance type in - # a family will be included unless specific instance types are included. - # Default: false InstanceConfig: UseSpot: true - DefaultPartition: AlmaLinux_8_arm64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} - # Amazon: {2: [x86_64, arm64]} CentOS: 7: [x86_64] - # Amazon: {2: [x86_64, arm64]} - # RedHat: - # 7: [x86_64] - # 8: [x86_64, arm64] - # Rocky: {8: [x86_64, arm64]} Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.+\.(micro|nano)' # Not enough memory - - '.*\.metal' - AZs: - - Priority: 1 - #Region: us-east-1 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 - - Priority: 2 - #Region: us-east-1 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 - - Priority: 3 - #Region: us-east-1 - Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 - - # ElasticSearch: - # Configure the ElasticSearch/OpenSearch domain used by the slurm cluster - # If not specified then won't be created or used by the cluster. 
- # master_nodes: Defaults to 0 - # data_nodes: Must be a multiple of number_of_azs - # ElasticSearch: - # ebs_volume_size: 20 - # ebs_volume_type: GP2 - # enable_version_upgrade: False - # number_of_azs: 2 - # master_nodes: 3 - # master_node_instance_type: m5.large.search - # data_nodes: 2 - # data_node_instance_type: m5.large.search - # warm_nodes: 0 - # warm_instance_type: ultrawarm.medium.search - - # JobCompType: - # Values: - # jobcomp/none - # jobcomp/elasticsearch - # jobcomp/filetxt - JobCompType: jobcomp/filetxt - # - # JobCompLoc: - # Used with jobcomp/elasticsearch - # A complete URL endpoint with format ://_doc - #JobCompLoc: http://{{EsDomain}}.{{Region}}.es.amazonaws.com/slurm/_doc + Regions: + eu-west-1: + VpcId: vpc-xxxxxxxxxxxxxxxxx + CIDR: 10.1.0.0/16 + SshKeyPair: admin-eu-west-1 + AZs: + - Priority: 10 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 + - Priority: 9 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 + - Priority: 8 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 + us-east-1: + VpcId: vpc-xxxxxxxxxxxxxxxxx + CIDR: 10.2.0.0/16 + SshKeyPair: admin-us-east-1 + AZs: + - Priority: 7 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 + - Priority: 6 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 + - Priority: 5 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 + us-west-2: + VpcId: vpc-xxxxxxxxxxxxxxxxx + CIDR: 10.3.0.0/16 + SshKeyPair: admin-us-west-2 + #SecurityGroupId: sg-0addccc8388e008fd + AZs: + - Priority: 4 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1 + - Priority: 3 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2 + - Priority: 2 + Subnet: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3 - # Configure your Storage options below - # @todo support fsxn, test if efs will gate scaling of the cluster storage: - # mount_path: - # Default is /opt/slurm/{{cluster_name}} - #mount_path: "" - provider: "efs" # efs or lustre - #kms_key_arn: - removal_policy : "DESTROY" # DESTROY, RETAIN, SNAPSHOT. Choices: RETAIN will preserve the EFS even if you delete the stack. Any other value will delete EFS if you delete the CFN stack - efs: - use_efs_helper: false - throughput_mode: "BURSTING" # Choices: BURSTING, PROVISIONED - # provisioned_throughput_per_second: 1 # In MiB/s. Minimum value of 1 - performance_mode: "GENERAL_PURPOSE" # Choices: GENERAL_PURPOSE, MAX_IO - encrypted: True # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-efs-filesystem.html#cfn-efs-filesystem-encrypted - lifecycle_policy: "AFTER_30_DAYS" # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-efs-filesystem-lifecyclepolicy.html - lustre: - deployment_type: "SCRATCH_2" # Allowed values: PERSISTENT_1 | SCRATCH_1 | SCRATCH_2. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-deploymenttype - drive_cache_type: "NONE" # Allowed values: NONE | READ. Required when storage_type is HDD. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-drivecachetype - per_unit_storage_throughput: 50 # Allowed values: 12, 40 for HDD, 50, 100, 200 for SSD. Required for the PERSISTENT_1 deployment_type. 
https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-perunitstoragethroughput - storage_capacity: 1200 # For SCRATCH_2 and PERSISTENT_1 types, valid values are 1,200, 2,400, then continuing in increments of 2,400 GiB. For SCRATCH_1 deployment types, valid values are 1,200, 2,400, 3,600, then continuing in increments of 3,600 GiB. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-fsx-filesystem.html#cfn-fsx-filesystem-storagecapacity - storage_type: "SSD" # Allowed values: SSD or HDD. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-fsx-filesystem.html#cfn-fsx-filesystem-storagetype + provider: ontap + removal_policy: DESTROY + ontap: {} - # ExtraMounts - # Additional mounts for compute nodes - # This examle shows SOCA EFS file systems. - # This is required so the compute node as the same file structure as the remote desktops. #ExtraMounts: # - dest: /apps # src: fs-xxxxxxxx.efs.us-east-1.amazonaws.com:/ diff --git a/source/resources/config/slurm_ontap.yml b/source/resources/config/slurm_ontap.yml index 25e4b9ad..ccf02007 100644 --- a/source/resources/config/slurm_ontap.yml +++ b/source/resources/config/slurm_ontap.yml @@ -14,23 +14,16 @@ slurm: InstanceConfig: UseSpot: true - DefaultPartition: AlmaLinux_8_arm64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.+\.(micro|nano)' # Not enough memory - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_rocky_linux.yml b/source/resources/config/slurm_rocky_linux.yml index 8eac0d08..7d4ba1ee 100644 --- a/source/resources/config/slurm_rocky_linux.yml +++ b/source/resources/config/slurm_rocky_linux.yml @@ -11,20 +11,14 @@ StackName: slurmrocky slurm: InstanceConfig: UseSpot: true - DefaultPartition: Rocky_8_x86_64_spot NodesPerInstanceType: 5 BaseOsArchitecture: Rocky: {8: [x86_64, arm64]} Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/config/slurm_zfs.yml b/source/resources/config/slurm_zfs.yml index 2460fe02..7f013467 100644 --- a/source/resources/config/slurm_zfs.yml +++ b/source/resources/config/slurm_zfs.yml @@ -15,23 +15,16 @@ slurm: InstanceConfig: UseSpot: true - DefaultPartition: AlmaLinux_8_arm64_spot NodesPerInstanceType: 10 BaseOsArchitecture: AlmaLinux: {8: [x86_64, arm64]} CentOS: 7: [x86_64] Include: - MaxSizeOnly: false InstanceFamilies: - t3 - t4g InstanceTypes: [] - Exclude: - InstanceFamilies: [] - InstanceTypes: - - '.+\.(micro|nano)' # Not enough memory - - '.*\.metal' # Use defaults from schema SlurmCtl: {} diff --git a/source/resources/lambdas/UpdateDns/UpdateDns.py b/source/resources/lambdas/UpdateDns/UpdateDns.py deleted file mode 100644 index cbd48f39..00000000 --- a/source/resources/lambdas/UpdateDns/UpdateDns.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-SPDX-License-Identifier: MIT-0 - -Permission is hereby granted, free of charge, to any person obtaining a copy of this -software and associated documentation files (the "Software"), to deal in the Software -without restriction, including without limitation the rights to use, copy, modify, -merge, publish, distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A -PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -""" - -''' -Create/delete DNS entry -''' -import cfnresponse -import boto3 -import logging -logging.getLogger().setLevel(logging.INFO) - -def lambda_handler(event, context): - try: - logging.info("event: {}".format(event)) - properties = event['ResourceProperties'] - required_properties = ['Hostname', 'Domain', 'HostedZoneId', 'Type', 'Value'] - error_message = "" - for property in required_properties: - try: - value = properties[property] - except: - error_message += "Missing {} property. ".format(property) - if error_message: - raise KeyError(error_message) - route53_client = boto3.client('route53') - requestType = event['RequestType'] - if requestType in ['Create', 'Update']: - action = 'UPSERT' - elif requestType == 'Delete': - action = 'DELETE' - else: - raise ValueError('Invalid RequestType: {}'.format(event['RequestType'])) - hostname = properties['Hostname'] - domain = properties['Domain'] - type = properties['Type'] - value = properties['Value'] - logging.info("{} {}.{} {} record, value=".format(action, hostname, type, value)) - route53_client.change_resource_record_sets( - HostedZoneId=properties['HostedZoneId'], - ChangeBatch={ - 'Comment': '{} {} DNS record'.format(action, hostname), - 'Changes': [ - { - 'Action': action, - 'ResourceRecordSet': { - 'Name': "{}.{}".format(hostname, domain), - 'Type': type, - 'TTL': 60, - 'ResourceRecords': [{'Value': value}] - } - } - ] - } - ) - except Exception as e: - logging.exception(str(e)) - cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, str(e)) - raise - - cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, "{} {}.{} {}".format(properties['Type'], properties['Hostname'], properties['Domain'], properties['Value'])) diff --git a/source/resources/lambdas/UpdateDns/cfnresponse.py b/source/resources/lambdas/UpdateDns/cfnresponse.py deleted file mode 120000 index 09400dfc..00000000 --- a/source/resources/lambdas/UpdateDns/cfnresponse.py +++ /dev/null @@ -1 +0,0 @@ -../cfnresponse.py \ No newline at end of file diff --git a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py index e87bd965..7fbb1c73 100755 --- a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py +++ b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py @@ -113,9 +113,9 @@ def get_instance_type_info(self, region): else: 
instance_type_info[instanceType]['ThreadsPerCore'] = 1 if 'ValidCores' in instanceTypeDict['VCpuInfo']: - instance_type_info[instanceType]['CoreCount'] = max(instanceTypeDict['VCpuInfo']['ValidCores']) + instance_type_info[instanceType]['CoreCount'] = int(max(instanceTypeDict['VCpuInfo']['ValidCores'])) else: - instance_type_info[instanceType]['CoreCount'] = instanceTypeDict['VCpuInfo']['DefaultVCpus']/instance_type_info[instanceType]['ThreadsPerCore'] + instance_type_info[instanceType]['CoreCount'] = int(instanceTypeDict['VCpuInfo']['DefaultVCpus']/instance_type_info[instanceType]['ThreadsPerCore']) instance_type_info[instanceType]['MemoryInMiB'] = instanceTypeDict['MemoryInfo']['SizeInMiB'] instance_type_info[instanceType]['SSDCount'] = instanceTypeDict.get('InstanceStorageInfo', {'Disks': [{'Count': 0}]})['Disks'][0]['Count'] instance_type_info[instanceType]['SSDTotalSizeGB'] = instanceTypeDict.get('InstanceStorageInfo', {'TotalSizeInGB': 0})['TotalSizeInGB'] diff --git a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmPlugin.py b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmPlugin.py index 6eb7d150..d2332313 100755 --- a/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmPlugin.py +++ b/source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmPlugin.py @@ -1498,7 +1498,7 @@ def create_node_conf(self): logger.propagate = False self.parser = argparse.ArgumentParser("Create SLURM node config from EC2 instance metadata") - self.parser.add_argument('--config-file', default=False, help="YAML file with instance families and types to include/exclude") + self.parser.add_argument('--config-file', required=True, help="YAML file with instance families and types to include/exclude") self.parser.add_argument('--output-file', '-o', required=True, help="Output file") self.parser.add_argument('--az-info-file', required=True, help="JSON file where AZ info will be saved") self.parser.add_argument('--instance-type-info-json', default=False, help="JSON file with cached instance type info.") @@ -1509,72 +1509,8 @@ def create_node_conf(self): logger.setLevel(logging.DEBUG) logger.debug(f"Debugging level {self.args.debug}") - if self.args.config_file: - logger.info(f"Loading config from {self.args.config_file}") - instance_config = yaml.load(open(self.args.config_file, 'r').read(), Loader=yaml.SafeLoader) - else: - instance_config = { - 'UseSpot': True, - 'NodesPerInstanceType': 10, - 'BaseOsArchitecture': { - 'AlmaLinux': {8: ['x86_64', 'arm64']}, - 'CentOS': { - '7': ['x86_64'], - '8': ['x86_64', 'arm64'] - }, - 'Amazon': {'2': ['x86_64', 'arm64']}, - 'RedHat': { - '7': ['x86_64'], - '8': ['x86_64', 'arm64'] - }, - 'Rocky': {8: ['x86_64', 'arm64']}, - }, - 'Include': { - 'MaxSizeOnly': False, - 'InstanceFamilies': [ - 't3', - 't3a', - 't4g', - ], - 'InstanceTypes': [] - }, - 'Exclude': { - 'InstanceFamilies': [ - 'a1', # Graviton 1 - 'c4', # Replaced by c5 - 'd2', # SSD optimized - 'g3', # Replaced by g4 - 'g3s', # Replaced by g4 - 'h1', # SSD optimized - 'i3', # SSD optimized - 'i3en', # SSD optimized - 'm4', # Replaced by m5 - 'p2', # Replaced by p3 - 'p3', - 'p3dn', - 'r4', # Replaced by r5 - 't2', # Replaced by t3 - 'u', - 'x1', - 'x1e' - ], - 'InstanceTypes': [] - }, - 'Regions': [ - { - 'Region': environ['AWS_DEFAULT_REGION'], - 'AZs': [ - { - 'Priority': 1, - 'Region': environ['AWS_DEFAULT_REGION'], - 'Subnet': environ['GridSubnet1'] - } - ], - }, - ], - 'AlwaysOnNodes': [], - 'AlwaysOnPartitions': [] - 
} + logger.info(f"Loading config from {self.args.config_file}") + instance_config = yaml.load(open(self.args.config_file, 'r').read(), Loader=yaml.SafeLoader) # Check for required fields if 'BaseOsArchitecture' not in instance_config: @@ -1582,13 +1518,15 @@ def create_node_conf(self): # Set defaults for missing fields if 'UseSpot' not in instance_config: - instance_config['UseSpot'] = True + raise ValueError(f"InstanceConfig missing UseSpot") if 'NodesPerInstanceType' not in instance_config: - instance_config['NodesPerInstanceType'] = 10 + raise ValueError(f"InstanceConfig missing NodesPerInstanceType") + if 'Exclude' not in instance_config: + raise ValueError(f"InstanceConfig missing Exclude") if 'Include' not in instance_config: - instance_config['Include'] = {} + raise ValueError(f"InstanceConfig missing Include") if 'MaxSizeOnly' not in instance_config['Include']: - instance_config['Include']['MaxSizeOnly'] = 10 + raise ValueError(f"InstanceConfig missing Include.MaxSizeOnly") compute_regions = sorted(instance_config['Regions'].keys()) az_info = self.get_az_info_from_instance_config(instance_config) @@ -1666,7 +1604,7 @@ def create_node_conf(self): ondemand_featureList = base_featureList + ',ondemand' price = instance_type_info[instanceType]['pricing']['OnDemand'] weight = int(float(price) * 10000) - node_name = "NodeName={:39s} CPUs={:2s} RealMemory={:7s} Feature={:89s} Weight={}".format( + node_name = "NodeName={:39s} CPUs={:2s} RealMemory={:7s} Feature={:103s} Weight={}".format( node, str(coreCount), str(realMemory), ondemand_featureList, weight) node_sets[node_set]['node_names'].append(node_name) @@ -1676,7 +1614,7 @@ def create_node_conf(self): spot_feature_list = f"{base_featureList},spot" spot_price = instance_type_info[instanceType]['pricing']['spot'][az] spot_weight = int(float(spot_price) * 10000) - spot_node_name = "NodeName={:39s} CPUs={:2s} RealMemory={:7s} Feature={:89s} Weight={}".format( + spot_node_name = "NodeName={:39s} CPUs={:2s} RealMemory={:7s} Feature={:103s} Weight={}".format( spot_node, str(coreCount), str(realMemory), spot_feature_list, spot_weight) node_sets[spot_node_set]['node_names'].append(spot_node_name) diff --git a/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/modules/modulefiles/slurm/.template b/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/modules/modulefiles/slurm/.template index 5435c427..a8a005ef 100644 --- a/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/modules/modulefiles/slurm/.template +++ b/source/resources/playbooks/roles/SlurmCtl/templates/opt/slurm/cluster/modules/modulefiles/slurm/.template @@ -57,7 +57,7 @@ if { [ module-info mode load ] || [ module-info mode display ] } { unsetenv SBATCH_TIMELIMIT unsetenv SBATCH_TIMELIMIT_SET } - if { [ info exists ::env(SBATCH_TIMELIMIT_SET) ] } { + if { [ info exists ::env(SBATCH_PARTITION_SET) ] } { unsetenv SBATCH_PARTITION unsetenv SBATCH_PARTITION_SET } @@ -90,7 +90,7 @@ if { [ module-info mode load ] || [ module-info mode display ] } { unsetenv SLURM_MEM_PER_NODE unsetenv SLURM_MEM_PER_NODE_SET } - if { ! [ info exists ::env(SLURM_PARTITION) ] } { + if { ! 
[ info exists ::env(SLURM_PARTITION_SET) ] } { unsetenv SLURM_PARTITION unsetenv SLURM_PARTITION_SET } diff --git a/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml b/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml index 03607661..249ba183 100644 --- a/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml +++ b/source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml @@ -5,7 +5,6 @@ debug: msg: | FileSystemDns: {{FileSystemDns}} - FileSystemIpAddress: {{FileSystemIpAddress}} FileSystemMountPath: {{FileSystemMountPath}} FileSystemMountSrc: {{FileSystemMountSrc}} FileSystemOptions: {{FileSystemOptions}} diff --git a/source/resources/playbooks/roles/unmount_slurm_fs/tasks/main.yml b/source/resources/playbooks/roles/unmount_slurm_fs/tasks/main.yml index 2e47fb00..75743249 100644 --- a/source/resources/playbooks/roles/unmount_slurm_fs/tasks/main.yml +++ b/source/resources/playbooks/roles/unmount_slurm_fs/tasks/main.yml @@ -5,7 +5,6 @@ debug: msg: | FileSystemDns: {{FileSystemDns}} - FileSystemIpAddress: {{FileSystemIpAddress}} FileSystemMountPath: {{FileSystemMountPath}} FileSystemMountSrc: {{FileSystemMountSrc}} FileSystemOptions: {{FileSystemOptions}} diff --git a/source/resources/user_data/WaitForAmi.py b/source/resources/user_data/WaitForAmi.py index 5c649209..dba9b221 100644 --- a/source/resources/user_data/WaitForAmi.py +++ b/source/resources/user_data/WaitForAmi.py @@ -25,6 +25,7 @@ import logging from logging import handlers from os import environ +from sys import exit from time import sleep logger = logging.getLogger(__file__) @@ -59,7 +60,11 @@ def main(): ec2_client = boto3.client('ec2') logger.info(f"Waiting for {args.ami_id} to be available.") while True: - ami_info = ec2_client.describe_images(ImageIds=[args.ami_id])['Images'][0] + try: + ami_info = ec2_client.describe_images(ImageIds=[args.ami_id])['Images'][0] + except IndexError: + logger.error(f"{args.ami_id} not found") + exit(2) state = ami_info['State'] ami_name = ami_info['Name'] logger.info(f"state={state}") diff --git a/source/resources/user_data/slurm_node_ami_config.sh b/source/resources/user_data/slurm_node_ami_config.sh index d2a1ee55..cd2c1bdd 100644 --- a/source/resources/user_data/slurm_node_ami_config.sh +++ b/source/resources/user_data/slurm_node_ami_config.sh @@ -27,10 +27,14 @@ if [ -e /var/lib/cloud/instance/sem/ami.txt ]; then ami=$(cat /var/lib/cloud/instance/sem/ami.txt) echo "First reboot after ami ($ami) created." chmod +x /root/WaitForAmi.py - /root/WaitForAmi.py --ami-id $ami --base-ssm-parameter $SlurmNodeAmiSsmParameterBaseName --instance-id $instance_id --compute-regions $ComputeRegions - # Delete the semaphore so that if the instance reboots because of template changes then a new AMI will be created - mv /var/lib/cloud/instance/sem/ami.txt /var/lib/cloud/instance/sem/$ami.txt - exit 0 + if ! /root/WaitForAmi.py --ami-id $ami --base-ssm-parameter $SlurmNodeAmiSsmParameterBaseName --instance-id $instance_id --compute-regions $ComputeRegions; then + echo "Could not wait for AMI. Assume it is bad and create a new one." + rm -f /var/lib/cloud/instance/sem/ami.txt + else + # Delete the semaphore so that if the instance reboots because of template changes then a new AMI will be created + mv /var/lib/cloud/instance/sem/ami.txt /var/lib/cloud/instance/sem/$ami.txt + exit 0 + fi fi # Install security updates first.
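
A minimal standalone sketch of validating the new InstanceConfig Regions mapping with the schema package that config_schema.py uses. The VpcId and CIDR checks mirror the schema fragment added above; the SshKeyPair and AZs keys are inferred from the slurm_multi_az.yml sample, so the full schema may constrain them differently.

import re
from schema import And, Optional, Schema, SchemaError

# Mirrors the Regions fragment added to source/cdk/config_schema.py; the AZs
# sub-schema is inferred from the slurm_multi_az.yml sample (an assumption).
regions_schema = Schema({
    Optional('Regions', default={}): {
        str: {
            'VpcId': And(str, lambda s: re.match('vpc-', s)),
            'CIDR': str,
            'SshKeyPair': str,
            'AZs': [{
                'Priority': int,
                'Subnet': And(str, lambda s: re.match('subnet-', s)),
            }],
        }
    }
}, ignore_extra_keys=True)  # the real config carries many more keys

config = {
    'Regions': {
        'us-east-1': {
            'VpcId': 'vpc-0123456789abcdef0',
            'CIDR': '10.2.0.0/16',
            'SshKeyPair': 'admin-us-east-1',
            'AZs': [{'Priority': 7, 'Subnet': 'subnet-0123456789abcdef0'}],
        },
    },
}

try:
    validated = regions_schema.validate(config)
    compute_regions = sorted(validated['Regions'].keys())  # as in create_node_conf above
    print(compute_regions)
except SchemaError as e:
    print(f"Invalid Regions config: {e}")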
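
The plugin's get_az_info_from_instance_config and the sorted(instance_config['Regions'].keys()) call consume this same Regions block. The helper below is a hypothetical stand-in, not the plugin's implementation: it flattens the per-region AZ entries into a single priority-ordered list, resolving each subnet's AZ via EC2, and assumes higher Priority values are preferred, as the sample config's 10-through-2 numbering suggests.

import boto3

def flatten_regions_to_azs(instance_config: dict) -> list:
    """Hypothetical illustration only; not SlurmPlugin.get_az_info_from_instance_config."""
    az_entries = []
    for region, region_config in instance_config.get('Regions', {}).items():
        ec2 = boto3.client('ec2', region_name=region)
        for az_config in region_config['AZs']:
            subnet_id = az_config['Subnet']
            # Resolve the subnet to its AZ so nodes and partitions can be grouped by AZ.
            subnet = ec2.describe_subnets(SubnetIds=[subnet_id])['Subnets'][0]
            az_entries.append({
                'Region': region,
                'AZ': subnet['AvailabilityZone'],
                'Subnet': subnet_id,
                'Priority': az_config['Priority'],
            })
    # Assumption: higher Priority is preferred, so sort descending.
    return sorted(az_entries, key=lambda entry: entry['Priority'], reverse=True)

With the slurm_multi_az.yml sample above, this ordering would list the three eu-west-1 AZs first, then us-east-1, then us-west-2.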