Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HCLS Integration Test #1810

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,6 @@ deployment_groups:

- id: network1
source: modules/network/vpc
settings:
network_name: hcls-cluster-net
subnetwork_name: primary-subnet
nick-stroud marked this conversation as resolved.
Show resolved Hide resolved

### Resource Monitoring ###

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@
delegate_to: localhost
- name: Cleanup firewall and infrastructure
ansible.builtin.include_tasks:
file: tasks/rescue_terraform_failure.yml
file: tasks/rescue_ghpc_failure.yml
apply:
delegate_to: localhost
vars:
Expand Down Expand Up @@ -174,7 +174,7 @@
always:
- name: Cleanup firewall and infrastructure
ansible.builtin.include_tasks:
file: tasks/rescue_terraform_failure.yml
file: tasks/rescue_ghpc_failure.yml
apply:
delegate_to: localhost
vars:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,19 +46,14 @@

- name: Create Infrastructure and test
block:
- name: Create Cluster with Terraform
ansible.builtin.command:
cmd: "{{ item }}"
chdir: "{{ workspace }}/{{ deployment_name }}/primary"
- name: Create Cluster with GHPC
register: deployment
changed_when: deployment.changed
ansible.builtin.command: ./ghpc deploy {{ deployment_name }} --auto-approve
args:
creates: "{{ workspace }}/{{ deployment_name }}/.terraform"
chdir: "{{ workspace }}"
environment:
TF_IN_AUTOMATION: "TRUE"
register: terraform_output
with_items:
- "terraform init"
- "terraform apply -auto-approve -no-color"

- name: Print instance IDs of VMs
ansible.builtin.include_tasks:
file: tasks/get_instance_ids.yml
Expand Down Expand Up @@ -90,6 +85,12 @@
login_ip: "{{ get_login_ip.stdout }}"
when: '"*" in login_node'

- name: Check that login IP is set
ansible.builtin.assert:
that:
- login_ip is defined
- login_ip != ""
nick-stroud marked this conversation as resolved.
Show resolved Hide resolved

- name: Print login public IP
ansible.builtin.debug:
var: login_ip
Expand Down Expand Up @@ -146,19 +147,18 @@

## Cleanup and fail gracefully
rescue:
- name: Capture terraform stderr
- name: Capture ghpc stderr
ansible.builtin.set_fact:
terraform_apply_stderr_one_line: "{{ terraform_output.results.1.stderr | replace('\n',' ') }}"

terraform_apply_stderr_one_line: "{{ deployment.stderr }}"
- name: Gather logs
ansible.builtin.include_tasks:
file: tasks/gather_startup_script_logs.yml
apply:
delegate_to: localhost

- name: Include rescue from terraform failure
- name: Include rescue from ghpc failure
cdunbar13 marked this conversation as resolved.
Show resolved Hide resolved
ansible.builtin.include_tasks:
file: tasks/rescue_terraform_failure.yml
file: tasks/rescue_ghpc_failure.yml
apply:
delegate_to: localhost
vars:
Expand Down Expand Up @@ -267,7 +267,7 @@

- name: Cleanup firewall and infrastructure
ansible.builtin.include_tasks:
file: tasks/rescue_terraform_failure.yml
file: tasks/rescue_ghpc_failure.yml
apply:
delegate_to: localhost
vars:
Expand Down
nick-stroud marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2022 Google LLC
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -30,11 +30,12 @@
- delete
- "{{ deployment_name }}"

- name: Tear Down Cluster
changed_when: true # assume something destroyed
- name: Destroy deployment
register: ghpc_destroy
changed_when: ghpc_destroy.changed
run_once: true
nick-stroud marked this conversation as resolved.
Show resolved Hide resolved
ansible.builtin.command: ./ghpc destroy {{ deployment_name }} --auto-approve
args:
chdir: "{{ workspace }}"
environment:
TF_IN_AUTOMATION: "TRUE"
ansible.builtin.command:
cmd: terraform destroy -auto-approve
chdir: "{{ workspace }}/{{ deployment_name }}/primary"
54 changes: 54 additions & 0 deletions tools/cloud-build/daily-tests/builds/hcls.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
# Cloud Build configuration for the daily HCLS blueprint integration test.
# Two independent steps (building ghpc, fetching the builder image) start
# immediately; the test step runs once both have finished.
timeout: 14400s  # 4hr
steps:
  ## Test simple golang build
  # Runs `make` at the repository root inside the golang image.
  - id: build_ghpc
    waitFor: ["-"]  # "-" = do not wait for any earlier step
    name: "golang:bullseye"
    entrypoint: /bin/bash
    args:
      - -c
      - |
        cd /workspace
        make

  # Only echoes a message; presumably exists so the builder image is pulled
  # in parallel with the ghpc build - TODO confirm.
  - id: fetch_builder
    waitFor: ["-"]
    name: >-
      us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder
    entrypoint: /bin/bash
    args:
      - -c
      - echo "done fetching builder"

  # Test hcls
  # Runs the shared Slurm integration playbook with the HCLS test variables.
  - id: hcls
    waitFor: ["fetch_builder", "build_ghpc"]
    name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder
    entrypoint: /bin/bash
    env:
      - "ANSIBLE_HOST_KEY_CHECKING=false"
      - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg"
    args:
      - -c
      # `$$` is the Cloud Build escape for a literal `$`, so the `${...:0:6}`
      # substring expansion is evaluated by bash rather than by Cloud Build.
      # The first six characters of the build ID are passed to the playbook as
      # `build`.
      - |
        set -x -e
        BUILD_ID_FULL=$BUILD_ID
        BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6}

        ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \
        --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \
        --extra-vars="@tools/cloud-build/daily-tests/tests/hcls.yml"
44 changes: 44 additions & 0 deletions tools/cloud-build/daily-tests/tests/hcls.yml
nick-stroud marked this conversation as resolved.
Show resolved Hide resolved
cdunbar13 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

# Variables for the HCLS blueprint integration test. This file is passed to
# ansible-playbook via --extra-vars="@..."; the `build` variable it references
# is supplied on the same command line.
test_name: hcls-cluster
deployment_name: "hcls-{{ build }}"
# No non-alphanumerical characters in the slurm cluster name - they will be
# removed by HPC Toolkit slurm wrappers, which will break the playbook
slurm_cluster_name: "hcls{{ build[0:6] }}"
zone: europe-west1-d
workspace: /workspace
blueprint_yaml: "{{ workspace }}/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml"
max_nodes: 5
network: "{{ deployment_name }}-net"
# The trailing "*" makes this a name pattern rather than an exact node name.
login_node: "{{ slurm_cluster_name }}-login-*"
controller_node: "{{ slurm_cluster_name }}-controller"
# Presumably forwarded to ghpc as deployment-variable overrides - TODO confirm.
# These keys must stay nested under cli_deployment_vars: at top level `zone`
# would duplicate the key above and reference itself.
cli_deployment_vars:
  region: europe-west1
  zone: "{{ zone }}"
  # Quoted so the values stay strings instead of YAML booleans.
  disable_login_public_ips: "false"
  disable_controller_public_ips: "false"
# Task files to run after deployment; presumably resolved by the playbook.
post_deploy_tests:
  - test-mounts.yml
  - test-partitions.yml
# Expected values, presumably read by the post-deploy tests listed above.
custom_vars:
  partitions:
    - compute
  mounts:
    - /home
    - /apps
    - /data_input
    - /data_output