Skip to content

Commit

Permalink
Ensure local SSD solution works upon reboot of Slurm nodes
Browse files Browse the repository at this point in the history
When the local SSD mountpoint has not been mounted, use systemd to create
the RAID array and format it. This addresses the known behavior of the
Slurm-GCP solution, in which it does not re-run startup-scripts upon
a power off/on (or reboot) cycle. During a typical power off/on cycle,
the local SSD contents are discarded and the disks must be re-assembled
and formatted.
  • Loading branch information
tpdownes committed Oct 17, 2024
1 parent e329312 commit fa3f3a6
Showing 1 changed file with 26 additions and 15 deletions.
41 changes: 26 additions & 15 deletions modules/scripts/startup-script/files/setup-raid.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,23 +41,34 @@
name: mdadm
state: present

# Single-disk case: build a one-member RAID 0 array from
# /dev/disk/by-id/google-local-nvme-ssd-0, passing --force to mdadm.
# The `when` guard limits this to hosts where exactly one local SSD device was
# found, and `creates` skips the command when the array device already exists.
- name: Force RAID array if only 1 local SSD
ansible.builtin.shell: mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices=1 /dev/disk/by-id/google-local-nvme-ssd-0 --force
args:
creates: "{{ array_dev }}"
when: local_ssd_devices.files | length == 1
# this service will act during the play and upon reboots to ensure that local
# SSD volumes are always assembled into a RAID and re-formatted if necessary;
# there are many scenarios where a VM can be stopped or migrated during
# maintenance and the contents of local SSD will be discarded
#
# The unit is ordered after local-fs.target and before slurmd.service, and its
# ConditionPathIsMountPoint=! line makes it run only while the mountpoint is
# not already a mount point.
# The file mode is quoted so Ansible receives the string "0644" instead of an
# integer produced by YAML's octal parsing.
- name: Install service to create local SSD RAID and format it
ansible.builtin.copy:
dest: /etc/systemd/system/create-localssd-raid.service
mode: "0644"
content: |
[Unit]
After=local-fs.target
Before=slurmd.service
ConditionPathIsMountPoint=!{{ mountpoint }}
# Multi-disk case: combine every device matching
# /dev/disk/by-id/google-local-nvme-ssd-* into a single RAID 0 array, with
# --raid-devices set to the number of local SSD devices found.
# Runs only when two or more devices were found; `creates` skips the command
# when the array device already exists.
- name: Create RAID array
ansible.builtin.shell: mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices={{ local_ssd_devices.files | length }} /dev/disk/by-id/google-local-nvme-ssd-*
args:
creates: "{{ array_dev }}"
when: local_ssd_devices.files | length >= 2
[Service]
Type=oneshot
ExecStart=/usr/bin/bash -c "/usr/sbin/mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices={{ local_ssd_devices.files | length }} /dev/disk/by-id/google-local-nvme-ssd-*{{ " --force" if local_ssd_devices.files | length == 1 else "" }}"
ExecStartPost=/usr/sbin/mkfs -t {{ fstype }}{{ " -m 0" if fstype == "ext4" else "" }} {{ array_dev }}
# Create a filesystem of type `fstype` on the array device.
# For ext4 only, the extra option "-m 0" is passed (per mke2fs(8), -m sets the
# reserved-blocks percentage); other filesystem types get no extra options.
- name: Format filesystem
community.general.filesystem:
fstype: "{{ fstype }}"
device: "{{ array_dev }}"
opts: '{{ "-m 0" if fstype == "ext4" else "" }}'
[Install]
WantedBy=slurmd.service
# Reload systemd's unit definitions (daemon_reload), enable
# create-localssd-raid.service, and start it now so the unit also acts during
# this play rather than only on later boots.
- name: Create RAID array and format
ansible.builtin.systemd:
name: create-localssd-raid.service
state: started
enabled: true
daemon_reload: true

- name: Mount RAID array
ansible.posix.mount:
Expand Down

0 comments on commit fa3f3a6

Please sign in to comment.