Skip to content

Commit

Permalink
Add preflight OS and NIC and other checks
Browse files Browse the repository at this point in the history
- Implemented OS preflight checks to validate system requirements before Ceph cluster creation.
- Checks include:
  - OS version (RHEL 9+ required)
  - SELinux enforcing mode
  - Firewalld installation and status
  - Required package availability (rpcbind, podman, firewalld)
  - Podman version check (>= 3.3)
  - RHEL software profile validation
  - Tuned profile check
  - CPU, RAM, Swap, and Filesystem (part of other checks)
  - Check whether jumbo frames are enabled
  - Check whether the primary NIC is configured with DHCP or a static IP
  - Check whether the NIC bandwidth is sufficient
  - Collect and output current NIC options set (e.g. Bonding, not bridged or virtual)
  - Check and report network latency (ping) with all hosts provided in the inventory file
  - Separate NICs for front-end and back-end networks
  • Loading branch information
Kushal-deb committed Feb 11, 2025
1 parent 1d3efbc commit 39a250e
Show file tree
Hide file tree
Showing 4 changed files with 340 additions and 0 deletions.
2 changes: 2 additions & 0 deletions ceph_defaults/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,6 @@ infra_pkgs:
- podman
- lvm2
- sos
- rpcbind
- firewalld
client_group: clients
4 changes: 4 additions & 0 deletions cephadm-preflight.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
become: true
gather_facts: true
vars:
preflight_results: []
repos_4_to_disable:
- rhceph-4-tools-for-rhel-{{ ansible_facts['distribution_major_version'] }}-{{ ansible_facts['architecture'] }}-rpms
- rhceph-4-mon-for-rhel-{{ ansible_facts['distribution_major_version'] }}-{{ ansible_facts['architecture'] }}-rpms
Expand All @@ -45,6 +46,9 @@
import_role:
name: ceph_defaults

# Statically imports every task from preflight_checks.yml at this point in the play.
# NOTE(review): this import carries no `when`, unlike the RedHat-only block that
# follows it — confirm the checks are meant to run on non-RedHat hosts too.
- name: Run checks
import_tasks: preflight_checks.yml

- name: redhat family of OS related tasks
when: ansible_facts['os_family'] == 'RedHat'
block:
Expand Down
314 changes: 314 additions & 0 deletions preflight_checks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,314 @@
---
# Reset both accumulators to empty lists; every later check appends to them.
- name: Initialize preflight results list
  ansible.builtin.set_fact:
    preflight_failures: []
    preflight_results: []

# Collect the full fact set that the checks below read through ansible_facts.
- name: Gather all Ansible facts
  ansible.builtin.setup: {}

# OS gate: passes only on the RedHat distribution with major version 9 or newer.
# The reason is derived from the same condition as the result, so a FAIL caused
# by a non-RedHat distribution (whatever its version) is explained instead of
# being reported with 'N/A'.
- name: Check if OS is RHEL 9+
  vars:
    os_is_supported: >-
      {{ ansible_facts['distribution'] == 'RedHat'
      and ansible_facts['distribution_major_version'] | int >= 9 }}
  ansible.builtin.set_fact:
    os_check: "{{ 'PASS' if os_is_supported | bool else 'FAIL' }}"
    os_reason: >-
      {{ 'N/A' if os_is_supported | bool
      else 'Ceph requires RHEL 9+. Detected: ' ~ ansible_facts['distribution'] ~ ' ' ~ ansible_facts['distribution_version'] }}

# Append the outcome to the report and, on FAIL, to the list of blocking failures.
- name: Store OS check result
  ansible.builtin.set_fact:
    preflight_results: "{{ preflight_results + [{'Check': 'OS Version', 'Result': os_check, 'Reason': os_reason}] }}"
    preflight_failures: "{{ preflight_failures + ['OS Version'] if os_check == 'FAIL' else preflight_failures }}"

# Try to switch SELinux to enforcing with the targeted policy.
# ignore_errors replaces the former no-op `failed_when: selinux_status.failed`:
# a module failure no longer aborts the play, so the FAIL row below (whose
# reason says enforcing "could not be enforced automatically") can be recorded.
# NOTE(review): changed_when: false hides a real configuration change from the
# play recap — kept to preserve existing reporting; confirm this is intended.
- name: Ensure SELinux is set to Enforcing mode
  ansible.posix.selinux:
    policy: targeted
    state: enforcing
  register: selinux_status
  changed_when: false
  ignore_errors: true

# Re-read the SELinux facts so the evaluation reflects the state after the attempt.
- name: Retrieve SELinux status from ansible_facts
  ansible.builtin.setup:
    gather_subset:
      - selinux

# 'mode' is only consulted when status is 'enabled' (short-circuit `and`).
- name: Determine SELinux Check Result
  ansible.builtin.set_fact:
    selinux_check: >-
      {{ 'PASS' if ansible_facts['selinux']['status'] == 'enabled'
      and ansible_facts['selinux']['mode'] == 'enforcing' else 'FAIL' }}

- name: Determine SELinux Failure Reason
  ansible.builtin.set_fact:
    selinux_reason: "{{ 'SELinux was not in enforcing mode and could not be enforced automatically' if selinux_check == 'FAIL' else 'N/A' }}"

# selinux_check and selinux_reason are already set by the two tasks above, so
# they are not recomputed here.
- name: Store SELinux check result
  ansible.builtin.set_fact:
    preflight_results: "{{ preflight_results + [{'Check': 'SELinux', 'Result': selinux_check, 'Reason': selinux_reason}] }}"
    preflight_failures: "{{ preflight_failures + ['SELinux'] if selinux_check == 'FAIL' else preflight_failures }}"

# Install every package listed in infra_pkgs.
# ignore_errors (instead of `failed_when: false`) lets the play continue while
# keeping the real failure status in the registered result; `failed_when: false`
# forced `package_install.failed` to false, so the check could never FAIL.
- name: Ensure required packages are installed
  ansible.builtin.package:
    name: "{{ infra_pkgs }}"
    state: present
  register: package_install
  ignore_errors: true

- name: Determine Package Installation Check Result
  ansible.builtin.set_fact:
    package_check: "{{ 'FAIL' if package_install is failed else 'PASS' }}"

- name: Determine Package Installation Failure Reason
  ansible.builtin.set_fact:
    package_reason: "{{ 'Some required packages failed to install' if package_check == 'FAIL' else 'N/A' }}"

# Append the outcome to the report and, on FAIL, to the list of blocking failures.
- name: Store Package Installation Result
  ansible.builtin.set_fact:
    preflight_results: "{{ preflight_results + [{'Check': 'Required Packages Installed', 'Result': package_check, 'Reason': package_reason}] }}"
    preflight_failures: "{{ preflight_failures + ['Required Packages'] if package_check == 'FAIL' else preflight_failures }}"

# Enable and start firewalld.
# ignore_errors (instead of `failed_when: false`) keeps the real failure status
# in firewall_status while letting the play continue.
- name: Ensure firewalld is enabled and running
  ansible.builtin.systemd:
    name: firewalld
    state: started
    enabled: true
  register: firewall_status
  ignore_errors: true

# The verdict is taken from the task outcome: with state=started, a successful
# module run means the unit is running. The previous test on
# firewall_status.status.ActiveState raised an undefined-variable error when
# the module failed (no 'status' key); NOTE(review): 'status' also appears to
# describe the unit before the start action — confirm against the module docs.
- name: Determine Firewalld Check Status
  ansible.builtin.set_fact:
    firewalld_check: "{{ 'FAIL' if firewall_status is failed else 'PASS' }}"
    firewalld_reason: "{{ 'Firewalld was not running and could not be started' if firewall_status is failed else 'N/A' }}"

# firewalld_check and firewalld_reason come from the task above and are not
# recomputed here.
- name: Store Firewalld check result
  ansible.builtin.set_fact:
    preflight_results: "{{ preflight_results + [{'Check': 'Firewalld Running', 'Result': firewalld_check, 'Reason': firewalld_reason}] }}"
    preflight_failures: "{{ preflight_failures + ['Firewalld Running'] if firewalld_check == 'FAIL' else preflight_failures }}"

# Populate ansible_facts.packages so the Podman presence and version can be read.
- name: Collect installed package facts
  ansible.builtin.package_facts:
    manager: auto

- name: Check if Podman is installed
  ansible.builtin.set_fact:
    podman_installed: "{{ 'podman' in ansible_facts.packages }}"

# 'NOT_INSTALLED' is a sentinel; it is never passed to the version comparison.
- name: Extract Podman version
  ansible.builtin.set_fact:
    podman_version: "{{ ansible_facts.packages['podman'][0].version if podman_installed | bool else 'NOT_INSTALLED' }}"

# The check now also enforces the minimum Podman version (3.3 unless
# podman_min_version is defined); previously only presence was tested.
# The `and` short-circuits, so the version test runs only when Podman exists.
- name: Define Podman Check Variables
  vars:
    podman_required_version: "{{ podman_min_version | default('3.3') }}"
    podman_ok: >-
      {{ podman_installed | bool
      and podman_version is version(podman_min_version | default('3.3'), '>=') }}
  ansible.builtin.set_fact:
    podman_check: "{{ 'PASS' if podman_ok | bool else 'FAIL' }}"
    podman_reason: >-
      {{ 'Podman is not installed, required for Ceph' if not podman_installed | bool
      else ('Podman version is ' ~ podman_version if podman_ok | bool
      else 'Podman version ' ~ podman_version ~ ' is older than the required ' ~ podman_required_version) }}
    preflight_failures: "{{ preflight_failures if podman_ok | bool else preflight_failures + ['Podman Installed'] }}"

- name: Store Podman Installation Check
  ansible.builtin.set_fact:
    preflight_results: "{{ preflight_results + [{'Check': 'Podman Installed', 'Result': podman_check, 'Reason': podman_reason}] }}"

# Remediation step: install Podman when the package facts did not list it.
- name: Ensure Podman is installed if missing (Fixable)
  ansible.builtin.package:
    name: podman
    state: present
  when: not podman_installed | bool

# Capture the consumed-subscription listing. The command is read-only and a
# non-zero exit is tolerated so the outcome is recorded instead of aborting.
# NOTE(review): this inspects subscription data, not installed package groups —
# confirm it can actually reveal the selected software profile.
- name: Validate RHEL software profile
  ansible.builtin.command:
    cmd: subscription-manager list --consumed
  register: rhel_profile
  failed_when: false
  changed_when: false

# PASS requires both marker strings to appear in the command output.
- name: Define RHEL Profile Check Result
  ansible.builtin.set_fact:
    rhel_profile_check: >-
      {{ 'FAIL' if ('Server' not in rhel_profile.stdout
      or 'File and Storage Server' not in rhel_profile.stdout) else 'PASS' }}

- name: Define RHEL Profile Check Reason
  ansible.builtin.set_fact:
    rhel_profile_reason: >-
      {{ 'N/A' if rhel_profile_check != 'FAIL'
      else 'Incorrect RHEL software profile. Expected: Server with File and Storage Server.' }}

# Append the outcome to the report and, on FAIL, to the list of blocking failures.
- name: Store RHEL Profile check
  ansible.builtin.set_fact:
    preflight_failures: "{{ preflight_failures + ['RHEL Profile'] if rhel_profile_check == 'FAIL' else preflight_failures }}"
    preflight_results: "{{ preflight_results + [{'Check': 'RHEL Profile', 'Result': rhel_profile_check, 'Reason': rhel_profile_reason}] }}"

# Ask tuned for the active profile. The command is read-only and a non-zero
# exit is tolerated so the outcome is recorded instead of aborting.
- name: Get current tuned profile
  ansible.builtin.command:
    cmd: tuned-adm active
  register: tuned_profile
  failed_when: false
  changed_when: false

# PASS when the output mentions the throughput-performance profile.
- name: Define Tuned Profile Check Result
  ansible.builtin.set_fact:
    tuned_profile_check: "{{ 'FAIL' if 'throughput-performance' not in tuned_profile.stdout else 'PASS' }}"

- name: Define Tuned Profile Check Reason
  ansible.builtin.set_fact:
    tuned_profile_reason: >-
      {{ 'N/A' if tuned_profile_check != 'FAIL'
      else 'Incorrect tuned profile. Expected: throughput-performance' }}

# Append the outcome to the report and, on FAIL, to the list of blocking failures.
- name: Store Tuned Profile Check
  ansible.builtin.set_fact:
    preflight_failures: "{{ preflight_failures + ['Tuned Profile'] if tuned_profile_check == 'FAIL' else preflight_failures }}"
    preflight_results: "{{ preflight_results + [{'Check': 'Tuned Profile', 'Result': tuned_profile_check, 'Reason': tuned_profile_reason}] }}"

# Verify the x86-64-v2 feature set by reading the CPU flags from /proc/cpuinfo.
# The previous probe grepped for 'avx2', which belongs to x86-64-v3, so valid
# x86-64-v2 CPUs without AVX2 were rejected. The probe prints 'yes' only when
# every listed flag is present (pni is the kernel's name for SSE3) and is
# skipped on non-x86_64 hosts, where these flags do not exist.
- name: Check CPU requirements
  ansible.builtin.shell: |
    flags=" $(grep -m1 '^flags' /proc/cpuinfo | cut -d: -f2) "
    for flag in cx16 lahf_lm popcnt pni sse4_1 sse4_2 ssse3; do
      case "${flags}" in
        *" ${flag} "*) ;;
        *) echo 'no'; exit 0 ;;
      esac
    done
    echo 'yes'
  register: cpu_supports_x86_64_v2
  changed_when: false
  failed_when: false
  when: ansible_facts['architecture'] == 'x86_64'

# On non-x86_64 hosts the microarchitecture-level check passes trivially; the
# `or` short-circuits before the skipped probe's missing stdout is touched.
- name: Define CPU Check Variables
  vars:
    cpu_level_ok: >-
      {{ ansible_facts['architecture'] != 'x86_64'
      or (cpu_supports_x86_64_v2.stdout | default('') | trim) == 'yes' }}
  ansible.builtin.set_fact:
    cpu_checks:
      x86_64_v2:
        result: "{{ 'PASS' if cpu_level_ok | bool else 'FAIL' }}"
        reason: "{{ 'N/A' if cpu_level_ok | bool else 'CPU lacks one or more x86-64-v2 features (cx16, lahf_lm, popcnt, sse3, sse4_1, sse4_2, ssse3) required by RHEL 9.' }}"
      cores:
        result: "{{ 'PASS' if ansible_facts['processor_vcpus'] | int >= 4 else 'FAIL' }}"
        reason: "{{ 'System has only ' ~ ansible_facts['processor_vcpus'] ~ ' cores, required: 4' if ansible_facts['processor_vcpus'] | int < 4 else 'N/A' }}"

# Append both CPU rows to the report and each FAIL to the blocking failures.
- name: Store CPU Checks
  ansible.builtin.set_fact:
    preflight_results: >-
      {{ preflight_results + [
      {'Check': 'CPU x86-64-v2', 'Result': cpu_checks['x86_64_v2']['result'], 'Reason': cpu_checks['x86_64_v2']['reason']},
      {'Check': 'CPU Cores >= 4', 'Result': cpu_checks['cores']['result'], 'Reason': cpu_checks['cores']['reason']}
      ] }}
    preflight_failures: >-
      {{ preflight_failures +
      (['CPU x86-64-v2'] if cpu_checks['x86_64_v2']['result'] == 'FAIL' else []) +
      (['CPU Cores'] if cpu_checks['cores']['result'] == 'FAIL' else []) }}

# Memory gates: at least 8192 MB of RAM, and swap of at least 1.5x the RAM
# size (rounded). The task-level vars hold the numbers each expression reuses.
- name: Define RAM and Swap Check Variables
  vars:
    ram_mb: "{{ ansible_facts['memtotal_mb'] | int }}"
    swap_mb: "{{ ansible_facts['swaptotal_mb'] | int }}"
    swap_needed_mb: "{{ ((ansible_facts['memtotal_mb'] | int * 1.5) | round) | int }}"
  ansible.builtin.set_fact:
    memory_checks:
      ram:
        result: "{{ 'FAIL' if ram_mb | int < 8192 else 'PASS' }}"
        reason: >-
          {{ 'N/A' if ram_mb | int >= 8192
          else 'System has only ' ~ ansible_facts['memtotal_mb'] ~ ' MB RAM, required: 8192MB' }}
      swap:
        required: "{{ swap_needed_mb | int }}"
        actual: "{{ swap_mb | int }}"
        result: "{{ 'FAIL' if (swap_mb | int) < (swap_needed_mb | int) else 'PASS' }}"
        reason: >-
          {{ 'N/A' if (swap_mb | int) >= (swap_needed_mb | int)
          else 'System has only ' ~ ansible_facts['swaptotal_mb'] ~ ' MB Swap, required: ' ~ (swap_needed_mb | int) ~ ' MB' }}

# Append both memory rows to the report and each FAIL to the blocking failures.
- name: Store RAM and Swap Space Check Results
  ansible.builtin.set_fact:
    preflight_failures: >-
      {{ preflight_failures
      + (['Minimum RAM'] if memory_checks['ram']['result'] == 'FAIL' else [])
      + (['Swap Space'] if memory_checks['swap']['result'] == 'FAIL' else []) }}
    preflight_results: >-
      {{ preflight_results + [
      {'Check': 'Minimum RAM (8GB)', 'Result': memory_checks['ram']['result'], 'Reason': memory_checks['ram']['reason']},
      {'Check': 'Swap Space (1.5x RAM)', 'Result': memory_checks['swap']['result'], 'Reason': memory_checks['swap']['reason']}
      ] }}

# Filesystem gates: /var must appear as its own mount point, and the root
# filesystem must be at least 100 GB (size_total floor-divided by 1024**3).
# The two task-level vars compute each quantity once for reuse below.
- name: Define /var Partition and Root Filesystem Check Variables
  vars:
    var_mount_count: "{{ ansible_facts['mounts'] | selectattr('mount', 'equalto', '/var') | list | length }}"
    root_size_gb: "{{ ansible_facts['mounts'] | selectattr('mount', 'equalto', '/') | map(attribute='size_total') | first | default(0) | int // 1024**3 }}"
  ansible.builtin.set_fact:
    filesystem_checks:
      var_partition:
        exists: "{{ var_mount_count | int > 0 }}"
        result: "{{ 'FAIL' if var_mount_count | int == 0 else 'PASS' }}"
        reason: "{{ '/var is not a separate partition' if var_mount_count | int == 0 else 'N/A' }}"
      root_fs:
        size_gb: "{{ root_size_gb | int }}"
        result: "{{ 'FAIL' if (root_size_gb | int) < 100 else 'PASS' }}"
        reason: >-
          {{ 'N/A' if (root_size_gb | int) >= 100
          else 'Root FS is only ' ~ (root_size_gb | int) ~ 'GB, required: 100GB' }}

# Append both filesystem rows to the report and each FAIL to the blocking failures.
- name: Store Filesystem Checks
  ansible.builtin.set_fact:
    preflight_failures: >-
      {{ preflight_failures
      + (['/var Partition'] if filesystem_checks['var_partition']['result'] == 'FAIL' else [])
      + (['Root Filesystem'] if filesystem_checks['root_fs']['result'] == 'FAIL' else []) }}
    preflight_results: >-
      {{ preflight_results + [
      {'Check': '/var is a separate partition', 'Result': filesystem_checks['var_partition']['result'], 'Reason': filesystem_checks['var_partition']['reason']},
      {'Check': 'Root Filesystem >= 100GB', 'Result': filesystem_checks['root_fs']['result'], 'Reason': filesystem_checks['root_fs']['reason']}
      ] }}

# List the IPv4 addresses of the default-route interface. iproute2 marks
# addresses with a finite lifetime (as handed out by DHCP) with the 'dynamic'
# flag; statically configured addresses do not carry it.
- name: Detect DHCP-assigned address on the primary NIC
  ansible.builtin.command:
    argv:
      - ip
      - "-4"
      - "-o"
      - addr
      - show
      - dev
      - "{{ ansible_facts['default_ipv4']['interface'] }}"
  register: primary_nic_addr
  changed_when: false
  failed_when: false

# Summarise the default-route interface.
# - Per-interface facts are keyed with '-' replaced by '_', so the lookup
#   applies the same substitution to the interface name.
# - primary_dhcp is 'dhcp' only when the address carries the 'dynamic' flag.
#   The previous test ("a default gateway exists") was true on nearly every
#   host, including statically configured ones.
- name: Extract networking facts
  vars:
    primary_nic_facts: "{{ ansible_facts[ansible_facts['default_ipv4']['interface'] | replace('-', '_')] | default({}) }}"
  ansible.builtin.set_fact:
    primary_nic: "{{ ansible_facts['default_ipv4']['interface'] }}"
    primary_ip: "{{ ansible_facts['default_ipv4']['address'] }}"
    primary_mac: "{{ ansible_facts['default_ipv4']['macaddress'] }}"
    primary_mtu: "{{ primary_nic_facts['mtu'] | default('0') | int }}"
    primary_speed: "{{ primary_nic_facts['speed'] | default('-1') | int }}"
    primary_dhcp: "{{ 'dhcp' if ' dynamic ' in (primary_nic_addr.stdout | default('')) else 'manual' }}"

# Jumbo frames: the primary NIC's MTU must exceed 1500.
- name: Define Jumbo Frames Check
  ansible.builtin.set_fact:
    jumbo_frames_check: "{{ 'FAIL' if (primary_mtu | int) <= 1500 else 'PASS' }}"
    jumbo_frames_reason: >-
      {{ 'N/A' if (primary_mtu | int) > 1500
      else 'MTU is ' ~ (primary_mtu | int) ~ ', recommended > 1500' }}

# This row goes into the report only; it is not added to preflight_failures.
- name: Store Jumbo Frames Check
  ansible.builtin.set_fact:
    preflight_results: "{{ preflight_results + [{'Check': 'Jumbo Frames Enabled', 'Result': jumbo_frames_check, 'Reason': jumbo_frames_reason}] }}"

# Addressing mode: passes only when primary_dhcp is 'manual'.
- name: Define NIC Configuration Check
  ansible.builtin.set_fact:
    nic_config_check: "{{ 'FAIL' if primary_dhcp != 'manual' else 'PASS' }}"
    nic_config_reason: "{{ 'N/A' if primary_dhcp == 'manual' else 'NIC is using DHCP, static IP is recommended' }}"

# This row goes into the report only; it is not added to preflight_failures.
- name: Store NIC Configuration Check
  ansible.builtin.set_fact:
    preflight_results: "{{ preflight_results + [{'Check': 'NIC Static IP Configuration', 'Result': nic_config_check, 'Reason': nic_config_reason}] }}"

# Link speed: passes at 10000 or above (the reason text labels the unit Mbps).
- name: Define NIC Bandwidth Check
  ansible.builtin.set_fact:
    nic_speed_check: "{{ 'FAIL' if (primary_speed | int) < 10000 else 'PASS' }}"
    nic_speed_reason: >-
      {{ 'N/A' if (primary_speed | int) >= 10000
      else 'NIC speed is ' ~ primary_speed ~ ' Mbps, recommended is 10GbE' }}

# This row goes into the report only; it is not added to preflight_failures.
- name: Store NIC Bandwidth Check
  ansible.builtin.set_fact:
    preflight_results: "{{ preflight_results + [{'Check': 'NIC Bandwidth (10GbE Recommended)', 'Result': nic_speed_check, 'Reason': nic_speed_reason}] }}"

# Keep the list of interface names reported by the facts.
- name: Extract NIC Details
  ansible.builtin.set_fact:
    nic_config_details: "{{ ansible_facts['interfaces'] }}"

# Informational row: the interface names joined into one comma-separated string.
- name: Store NIC Configuration Info
  vars:
    nic_info_reason: "{{ 'NIC options: ' ~ nic_config_details | join(', ') }}"
  ansible.builtin.set_fact:
    preflight_results: "{{ preflight_results + [{'Check': 'NIC Configuration', 'Result': 'INFO', 'Reason': nic_info_reason}] }}"

# The front-end NIC is the default-route interface. The back-end NIC is the
# first OTHER interface that is active, has an IPv4 address and is of type
# 'ether' or 'bonding'. Previously any additional interface name qualified —
# including links that are down, unaddressed, or bridge/container devices —
# which produced a spurious PASS. When no candidate exists, the back-end NIC
# falls back to the front-end one and the check fails.
# Per-interface facts are keyed with '-' replaced by '_', hence the replace().
- name: Identify Front-End and Back-End NICs
  vars:
    backend_candidates: >-
      {%- set found = [] -%}
      {%- for nic in ansible_facts['interfaces'] | difference(['lo', ansible_facts['default_ipv4']['interface']]) -%}
      {%- set nic_facts = ansible_facts[nic | replace('-', '_')] | default({}) -%}
      {%- if nic_facts.get('active') and nic_facts.get('ipv4') and nic_facts.get('type') in ['ether', 'bonding'] -%}
      {%- set _ = found.append(nic) -%}
      {%- endif -%}
      {%- endfor -%}
      {{ found }}
  ansible.builtin.set_fact:
    frontend_nic: "{{ ansible_facts['default_ipv4']['interface'] | default('Unknown') }}"
    backend_nic: "{{ backend_candidates | first | default(ansible_facts['default_ipv4']['interface']) }}"

- name: Define NIC Separation Check
  ansible.builtin.set_fact:
    nic_separation_check: "{{ 'PASS' if frontend_nic != backend_nic else 'FAIL' }}"
    nic_separation_reason: "{{ 'Using same NIC for both front-end and back-end networks. Customers with large deployments should separate traffic for performance optimization.' if frontend_nic == backend_nic else 'N/A' }}"
    preflight_failures: "{{ preflight_failures + ['NIC Separation'] if frontend_nic == backend_nic else preflight_failures }}"

- name: Store NIC Separation Check
  ansible.builtin.set_fact:
    preflight_results: "{{ preflight_results + [{'Check': 'Separate NICs for Frontend & Backend Networks', 'Result': nic_separation_check, 'Reason': nic_separation_reason}] }}"

# Send three ICMP echo requests from this host to every inventory host.
# `ansible.builtin.ping` only verifies that Ansible can connect and run Python
# on the target (it returns 'pong'); it measures no latency. The ping(8)
# summary line carries the round-trip statistics instead.
# The target is ansible_host when the inventory defines it, else the inventory name.
- name: Ping all hosts in inventory
  ansible.builtin.command:
    argv:
      - ping
      - "-c"
      - "3"
      - "-q"
      - "-W"
      - "2"
      - "{{ hostvars[item]['ansible_host'] | default(item) }}"
  register: ping_results
  changed_when: false
  failed_when: false
  loop: "{{ groups['all'] }}"

# Build "host: <last line of ping output>" for every target. With `ping -q` the
# last line is the rtt min/avg/max/mdev summary when replies arrived, and the
# packet-loss summary otherwise; 'no reply' covers probes with no output at all.
- name: Store Network Latency Check
  vars:
    latency_summary: >-
      {%- for probe in ping_results.results -%}
      {{ probe.item }}: {{ probe.stdout_lines | default([]) | last | default('no reply', true) }}{{ '' if loop.last else '; ' }}
      {%- endfor -%}
  ansible.builtin.set_fact:
    preflight_results: "{{ preflight_results + [{'Check': 'Network Latency', 'Result': 'INFO', 'Reason': 'Latency results: ' ~ latency_summary}] }}"

# Render the report once, on the controller, into the current working directory.
# NOTE(review): because of run_once, the file reflects the results of the first
# host in the batch only — confirm whether a per-host report is wanted.
- name: Generate Preflight Check Report
  delegate_to: localhost
  run_once: true
  become: false
  ansible.builtin.template:
    src: templates/preflight_report.j2
    dest: ./ceph_preflight_report.txt

# Read the report back from the controller, where it was written. Without the
# delegation, slurp looked for the file on each managed host and did not find it.
- name: Read Preflight Check Report
  delegate_to: localhost
  run_once: true
  become: false
  ansible.builtin.slurp:
    src: ./ceph_preflight_report.txt
  register: report_content

# Decode the base64 payload, drop carriage returns and print one line per list item.
- name: Show Report Summary
  ansible.builtin.debug:
    msg: "{{ report_content['content'] | b64decode | regex_replace('\\r', '') | split('\n') }}"

# Abort for this host when at least one check was recorded in
# preflight_failures; the message lists every failed check by name.
- name: Final Check - Fail if any critical checks failed
  when: preflight_failures | length > 0
  ansible.builtin.fail:
    msg: "Preflight checks failed for the following: {{ preflight_failures | join(', ') }}. Please resolve these issues before proceeding."
20 changes: 20 additions & 0 deletions templates/preflight_report.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{# Renders preflight_results (one entry per check with Check/Result/Reason keys)
   followed by a summary driven by preflight_failures.
   The status uses an inline expression instead of if/endif block tags: with
   trim_blocks enabled, a block tag at the end of a line swallows the newline
   and glues the Reason line onto the Check line. INFO rows get their own
   label instead of being shown as failures. -#}
Preflight Check Report

==================================================
System Checks
--------------------------------------------------
{% for item in preflight_results %}
- **{{ item['Check'] }}**: {{ '✅ Passed' if item['Result'] == 'PASS' else ('ℹ️ Info' if item['Result'] == 'INFO' else '❌ Failed') }}
- **Reason:** {{ item['Reason'] }}
{% endfor %}

==================================================
Summary
--------------------------------------------------
{% if preflight_failures | length > 0 %}
❌ **Critical Failures Detected**:
- {{ preflight_failures | join(', ') }}
- **Action Required**: Please fix the above issues before proceeding.
{% else %}
✅ **All Critical Checks Passed! You are good to go.**
{% endif %}

0 comments on commit 39a250e

Please sign in to comment.