From 39a250eb0ecc34facc10c624d61eff5ee30fb737 Mon Sep 17 00:00:00 2001
From: Kushal Deb
Date: Mon, 10 Feb 2025 18:58:43 +0530
Subject: [PATCH] Add preflight OS and NIC and other checks

- Implemented OS preflight checks to validate system requirements before Ceph cluster creation.
- Checks include:
  - OS version (RHEL 9+ required)
  - SELinux enforcing mode
  - Firewalld installation and status
  - Required package availability (rpcbind, podman, firewalld)
  - Podman version check (>= 3.3)
  - RHEL software profile validation
  - Tuned profile check
  - CPU, RAM, Swap, and Filesystem (part of other checks)
  - Check whether jumbo frames are enabled
  - Is it configured with DHCP or static IP
  - Is the bandwidth sufficient
  - Collect and output current NIC options set (e.g. Bonding, not bridged or virtual)
  - Check and report network latency (ping) with all hosts provided in the inventory file
  - Separate NICs for front-end and back-end networks
---
 ceph_defaults/defaults/main.yml |   2 +
 cephadm-preflight.yml           |   4 +
 preflight_checks.yml            | 363 ++++++++++++++++++++++++++++++++
 templates/preflight_report.j2   |  24 ++
 4 files changed, 393 insertions(+)
 create mode 100644 preflight_checks.yml
 create mode 100644 templates/preflight_report.j2

diff --git a/ceph_defaults/defaults/main.yml b/ceph_defaults/defaults/main.yml
index a3b5c31..f85f259 100644
--- a/ceph_defaults/defaults/main.yml
+++ b/ceph_defaults/defaults/main.yml
@@ -22,4 +22,6 @@ infra_pkgs:
   - podman
   - lvm2
   - sos
+  - rpcbind
+  - firewalld
 client_group: clients
diff --git a/cephadm-preflight.yml b/cephadm-preflight.yml
index 88c58e7..7295fa2 100644
--- a/cephadm-preflight.yml
+++ b/cephadm-preflight.yml
@@ -25,6 +25,7 @@
   become: true
   gather_facts: true
   vars:
+    preflight_results: []
     repos_4_to_disable:
       - rhceph-4-tools-for-rhel-{{ ansible_facts['distribution_major_version'] }}-{{ ansible_facts['architecture'] }}-rpms
       - rhceph-4-mon-for-rhel-{{ ansible_facts['distribution_major_version'] }}-{{ ansible_facts['architecture'] }}-rpms
@@ -45,6 +46,9 @@
       import_role:
         name: ceph_defaults
 
+    - name: Run checks
+      import_tasks: preflight_checks.yml
+
     - name: redhat family of OS related tasks
       when: ansible_facts['os_family'] == 'RedHat'
       block:
diff --git a/preflight_checks.yml b/preflight_checks.yml
new file mode 100644
index 0000000..faf4591
--- /dev/null
+++ b/preflight_checks.yml
@@ -0,0 +1,363 @@
+---
+# Preflight checks run on every host before Ceph cluster creation.
+#
+# Each check appends a {'Check', 'Result', 'Reason'} entry to
+# `preflight_results`. Checks treated as critical also append their name to
+# `preflight_failures`, which makes the last task of this file fail the host
+# once the report has been generated.
+
+- name: Initialize preflight results list
+  set_fact:
+    preflight_results: []
+    preflight_failures: []
+    latency_summary: []
+
+- name: Gather all Ansible facts
+  setup:
+
+- name: Check if OS is RHEL 9+
+  set_fact:
+    os_check: "{{ 'PASS' if ansible_facts['distribution'] == 'RedHat' and ansible_facts['distribution_major_version'] | int >= 9 else 'FAIL' }}"
+
+# Derived from os_check so a non-RHEL distribution with a major version >= 9
+# no longer reports FAIL together with the reason 'N/A'.
+- name: Determine OS check failure reason
+  set_fact:
+    os_reason: "{{ 'Ceph requires RHEL 9+. Detected: ' ~ ansible_facts['distribution'] ~ ' ' ~ ansible_facts['distribution_version'] if os_check == 'FAIL' else 'N/A' }}"
+
+- name: Store OS check result
+  set_fact:
+    preflight_results: "{{ preflight_results + [{'Check': 'OS Version', 'Result': os_check, 'Reason': os_reason}] }}"
+    preflight_failures: "{{ preflight_failures + ['OS Version'] if os_check == 'FAIL' else preflight_failures }}"
+
+# ignore_errors (rather than aborting the host) lets a module failure be
+# reported through the fact-based check below.
+- name: Ensure SELinux is set to Enforcing mode
+  ansible.posix.selinux:
+    policy: targeted
+    state: enforcing
+  register: selinux_status
+  changed_when: false
+  ignore_errors: true
+
+- name: Retrieve SELinux status from ansible_facts
+  setup:
+    gather_subset:
+      - selinux
+
+- name: Determine SELinux Check Result
+  set_fact:
+    selinux_check: "{{ 'PASS' if ansible_facts['selinux']['status'] == 'enabled' and ansible_facts['selinux']['mode'] == 'enforcing' else 'FAIL' }}"
+
+- name: Determine SELinux Failure Reason
+  set_fact:
+    selinux_reason: "{{ 'SELinux was not in enforcing mode and could not be enforced automatically' if selinux_check == 'FAIL' else 'N/A' }}"
+
+- name: Store SELinux check result
+  set_fact:
+    preflight_results: "{{ preflight_results + [{'Check': 'SELinux', 'Result': selinux_check, 'Reason': selinux_reason}] }}"
+    preflight_failures: "{{ preflight_failures + ['SELinux'] if selinux_check == 'FAIL' else preflight_failures }}"
+
+# ignore_errors keeps the registered result truthful; with
+# `failed_when: false` it is always "not failed" and the check always passed.
+- name: Ensure required packages are installed
+  package:
+    name: "{{ infra_pkgs }}"
+    state: present
+  register: package_install
+  ignore_errors: true
+
+- name: Determine Package Installation Check Result
+  set_fact:
+    package_check: "{{ 'FAIL' if package_install is failed else 'PASS' }}"
+
+- name: Determine Package Installation Failure Reason
+  set_fact:
+    package_reason: "{{ 'Some required packages failed to install' if package_check == 'FAIL' else 'N/A' }}"
+
+- name: Store Package Installation Result
+  set_fact:
+    preflight_results: "{{ preflight_results + [{'Check': 'Required Packages Installed', 'Result': package_check, 'Reason': package_reason}] }}"
+    preflight_failures: "{{ preflight_failures + ['Required Packages'] if package_check == 'FAIL' else preflight_failures }}"
+
+- name: Ensure firewalld is enabled and running
+  systemd:
+    name: firewalld
+    state: started
+    enabled: true
+  register: firewall_status
+  ignore_errors: true
+
+# The module outcome is used instead of `firewall_status.status`, which is
+# absent when the unit does not exist.
+- name: Determine Firewalld Check Status
+  set_fact:
+    firewalld_check: "{{ 'FAIL' if firewall_status is failed else 'PASS' }}"
+
+- name: Determine Firewalld Failure Reason
+  set_fact:
+    firewalld_reason: "{{ 'Firewalld was not running and could not be started' if firewalld_check == 'FAIL' else 'N/A' }}"
+
+- name: Store Firewalld check result
+  set_fact:
+    preflight_results: "{{ preflight_results + [{'Check': 'Firewalld Running', 'Result': firewalld_check, 'Reason': firewalld_reason}] }}"
+    preflight_failures: "{{ preflight_failures + ['Firewalld Running'] if firewalld_check == 'FAIL' else preflight_failures }}"
+
+- name: Collect installed package facts
+  package_facts:
+    manager: auto
+
+- name: Check if Podman is installed
+  set_fact:
+    podman_installed: "{{ 'podman' in ansible_facts.packages }}"
+
+- name: Extract Podman version
+  set_fact:
+    podman_version: "{{ ansible_facts.packages['podman'][0].version if podman_installed else 'NOT_INSTALLED' }}"
+
+- name: Check Podman version (>= 3.3)
+  set_fact:
+    podman_version_ok: "{{ (podman_installed | bool) and podman_version is version('3.3', '>=') }}"
+
+- name: Define Podman Check Variables
+  set_fact:
+    podman_check: "{{ 'PASS' if podman_version_ok | bool else 'FAIL' }}"
+    podman_reason: "{{ 'Podman is not installed, required for Ceph' if not podman_installed else 'Podman version is ' ~ podman_version ~ ('' if podman_version_ok | bool else ', required: >= 3.3') }}"
+    preflight_failures: "{{ preflight_failures + ['Podman Installed'] if not podman_version_ok | bool else preflight_failures }}"
+
+- name: Store Podman Installation Check
+  set_fact:
+    preflight_results: "{{ preflight_results + [{'Check': 'Podman Installed', 'Result': podman_check, 'Reason': podman_reason}] }}"
+
+- name: Ensure Podman is installed if missing (Fixable)
+  package:
+    name: podman
+    state: present
+  when: not podman_installed
+
+# NOTE(review): `subscription-manager list --consumed` reports attached
+# subscriptions; confirm that its output really contains the installed
+# software profile before relying on this check.
+- name: Validate RHEL software profile
+  command: subscription-manager list --consumed
+  register: rhel_profile
+  changed_when: false
+  failed_when: false
+
+- name: Define RHEL Profile Check Result
+  set_fact:
+    rhel_profile_check: "{{ 'PASS' if ('Server' in rhel_profile.stdout | default('') and 'File and Storage Server' in rhel_profile.stdout | default('')) else 'FAIL' }}"
+
+- name: Define RHEL Profile Check Reason
+  set_fact:
+    rhel_profile_reason: "{{ 'Incorrect RHEL software profile. Expected: Server with File and Storage Server.' if rhel_profile_check == 'FAIL' else 'N/A' }}"
+
+- name: Store RHEL Profile check
+  set_fact:
+    preflight_results: "{{ preflight_results + [{'Check': 'RHEL Profile', 'Result': rhel_profile_check, 'Reason': rhel_profile_reason}] }}"
+    preflight_failures: "{{ preflight_failures + ['RHEL Profile'] if rhel_profile_check == 'FAIL' else preflight_failures }}"
+
+- name: Get current tuned profile
+  command: tuned-adm active
+  register: tuned_profile
+  changed_when: false
+  failed_when: false
+
+- name: Define Tuned Profile Check Result
+  set_fact:
+    tuned_profile_check: "{{ 'PASS' if 'throughput-performance' in tuned_profile.stdout | default('') else 'FAIL' }}"
+
+- name: Define Tuned Profile Check Reason
+  set_fact:
+    tuned_profile_reason: "{{ 'Incorrect tuned profile. Expected: throughput-performance' if tuned_profile_check == 'FAIL' else 'N/A' }}"
+
+- name: Store Tuned Profile Check
+  set_fact:
+    preflight_results: "{{ preflight_results + [{'Check': 'Tuned Profile', 'Result': tuned_profile_check, 'Reason': tuned_profile_reason}] }}"
+    preflight_failures: "{{ preflight_failures + ['Tuned Profile'] if tuned_profile_check == 'FAIL' else preflight_failures }}"
+
+# RHEL 9 requires the x86-64-v2 microarchitecture level. AVX2 belongs to
+# x86-64-v3, so the v2 feature flags are checked instead of 'avx2'.
+- name: Check CPU requirements
+  command: grep -m1 '^flags' /proc/cpuinfo
+  register: cpu_supports_x86_64_v2
+  changed_when: false
+  failed_when: false
+
+- name: Determine missing x86-64-v2 CPU flags
+  set_fact:
+    cpu_missing_v2_flags: "{{ ['cx16', 'lahf_lm', 'popcnt', 'pni', 'sse4_1', 'sse4_2', 'ssse3'] | difference((cpu_supports_x86_64_v2.stdout | default('')).split()) }}"
+
+- name: Define CPU Check Variables
+  set_fact:
+    cpu_checks:
+      x86_64_v2:
+        result: "{{ 'PASS' if cpu_missing_v2_flags | length == 0 else 'FAIL' }}"
+        reason: "{{ 'CPU does not meet x86-64-v2 (required by RHEL 9). Missing flags: ' ~ cpu_missing_v2_flags | join(', ') if cpu_missing_v2_flags | length > 0 else 'N/A' }}"
+      cores:
+        result: "{{ 'PASS' if ansible_facts['processor_vcpus'] | int >= 4 else 'FAIL' }}"
+        reason: "{{ 'System has only ' ~ ansible_facts['processor_vcpus'] ~ ' cores, required: 4' if ansible_facts['processor_vcpus'] | int < 4 else 'N/A' }}"
+
+- name: Store CPU Checks
+  set_fact:
+    preflight_results: "{{ preflight_results + [
+      {'Check': 'CPU x86-64-v2', 'Result': cpu_checks['x86_64_v2']['result'], 'Reason': cpu_checks['x86_64_v2']['reason']},
+      {'Check': 'CPU Cores >= 4', 'Result': cpu_checks['cores']['result'], 'Reason': cpu_checks['cores']['reason']}
+    ] }}"
+    preflight_failures: "{{ preflight_failures +
+      (['CPU x86-64-v2'] if cpu_checks['x86_64_v2']['result'] == 'FAIL' else []) +
+      (['CPU Cores'] if cpu_checks['cores']['result'] == 'FAIL' else []) }}"
+
+- name: Define RAM and Swap Check Variables
+  set_fact:
+    memory_checks:
+      ram:
+        result: "{{ 'PASS' if ansible_facts['memtotal_mb'] | int >= 8192 else 'FAIL' }}"
+        reason: "{{ 'System has only ' ~ ansible_facts['memtotal_mb'] ~ ' MB RAM, required: 8192MB' if ansible_facts['memtotal_mb'] | int < 8192 else 'N/A' }}"
+      swap:
+        required: "{{ ((ansible_facts['memtotal_mb'] | int * 1.5) | round) | int }}"
+        actual: "{{ ansible_facts['swaptotal_mb'] | int }}"
+        result: "{{ 'PASS' if (ansible_facts['swaptotal_mb'] | int) >= ((ansible_facts['memtotal_mb'] | int * 1.5) | round) | int else 'FAIL' }}"
+        reason: "{{ 'System has only ' ~ ansible_facts['swaptotal_mb'] ~ ' MB Swap, required: ' ~ ((ansible_facts['memtotal_mb'] | int * 1.5) | round) | int ~ ' MB' if ansible_facts['swaptotal_mb'] | int < ((ansible_facts['memtotal_mb'] | int * 1.5) | round) | int else 'N/A' }}"
+
+- name: Store RAM and Swap Space Check Results
+  set_fact:
+    preflight_results: "{{ preflight_results + [
+      {'Check': 'Minimum RAM (8GB)', 'Result': memory_checks['ram']['result'], 'Reason': memory_checks['ram']['reason']},
+      {'Check': 'Swap Space (1.5x RAM)', 'Result': memory_checks['swap']['result'], 'Reason': memory_checks['swap']['reason']}
+    ] }}"
+    preflight_failures: "{{ preflight_failures +
+      (['Minimum RAM'] if memory_checks['ram']['result'] == 'FAIL' else []) +
+      (['Swap Space'] if memory_checks['swap']['result'] == 'FAIL' else []) }}"
+
+- name: Define /var Partition and Root Filesystem Check Variables
+  set_fact:
+    filesystem_checks:
+      var_partition:
+        exists: "{{ ansible_facts['mounts'] | selectattr('mount', 'equalto', '/var') | list | length > 0 }}"
+        result: "{{ 'PASS' if (ansible_facts['mounts'] | selectattr('mount', 'equalto', '/var') | list | length > 0) else 'FAIL' }}"
+        reason: "{{ 'N/A' if (ansible_facts['mounts'] | selectattr('mount', 'equalto', '/var') | list | length > 0) else '/var is not a separate partition' }}"
+      root_fs:
+        size_gb: "{{ (ansible_facts['mounts'] | selectattr('mount', 'equalto', '/') | map(attribute='size_total') | first | default(0) | int // 1024**3) }}"
+        result: "{{ 'PASS' if ((ansible_facts['mounts'] | selectattr('mount', 'equalto', '/') | map(attribute='size_total') | first | default(0) | int // 1024**3) >= 100) else 'FAIL' }}"
+        reason: "{{ 'Root FS is only ' ~ (ansible_facts['mounts'] | selectattr('mount', 'equalto', '/') | map(attribute='size_total') | first | default(0) | int // 1024**3) ~ 'GB, required: 100GB' if ((ansible_facts['mounts'] | selectattr('mount', 'equalto', '/') | map(attribute='size_total') | first | default(0) | int // 1024**3) < 100) else 'N/A' }}"
+
+- name: Store Filesystem Checks
+  set_fact:
+    preflight_results: "{{ preflight_results + [
+      {'Check': '/var is a separate partition', 'Result': filesystem_checks['var_partition']['result'], 'Reason': filesystem_checks['var_partition']['reason']},
+      {'Check': 'Root Filesystem >= 100GB', 'Result': filesystem_checks['root_fs']['result'], 'Reason': filesystem_checks['root_fs']['reason']}
+    ] }}"
+    preflight_failures: "{{ preflight_failures +
+      (['/var Partition'] if filesystem_checks['var_partition']['result'] == 'FAIL' else []) +
+      (['Root Filesystem'] if filesystem_checks['root_fs']['result'] == 'FAIL' else []) }}"
+
+# Fact keys have '-' replaced by '_', hence the replace() on the NIC name.
+# NOTE(review): a default gateway does not prove DHCP; primary_dhcp is only a
+# heuristic and should be confirmed against the NIC's real configuration.
+- name: Extract networking facts
+  set_fact:
+    primary_nic: "{{ ansible_facts['default_ipv4']['interface'] }}"
+    primary_ip: "{{ ansible_facts['default_ipv4']['address'] }}"
+    primary_mac: "{{ ansible_facts['default_ipv4']['macaddress'] }}"
+    primary_mtu: "{{ ansible_facts[ansible_facts['default_ipv4']['interface'] | replace('-', '_')]['mtu'] | default('0') | int }}"
+    primary_speed: "{{ ansible_facts[ansible_facts['default_ipv4']['interface'] | replace('-', '_')]['speed'] | default('-1') | int }}"
+    primary_dhcp: "{{ 'dhcp' if ansible_facts['default_ipv4'].get('gateway') else 'manual' }}"
+
+- name: Define Jumbo Frames Check
+  set_fact:
+    jumbo_frames_check: "{{ 'PASS' if (primary_mtu | int) > 1500 else 'FAIL' }}"
+    jumbo_frames_reason: "{{ 'MTU is ' ~ (primary_mtu | int) ~ ', recommended > 1500' if (primary_mtu | int) <= 1500 else 'N/A' }}"
+
+- name: Store Jumbo Frames Check
+  set_fact:
+    preflight_results: "{{ preflight_results + [{'Check': 'Jumbo Frames Enabled', 'Result': jumbo_frames_check, 'Reason': jumbo_frames_reason}] }}"
+
+- name: Define NIC Configuration Check
+  set_fact:
+    nic_config_check: "{{ 'PASS' if primary_dhcp == 'manual' else 'FAIL' }}"
+    nic_config_reason: "{{ 'NIC is using DHCP, static IP is recommended' if primary_dhcp != 'manual' else 'N/A' }}"
+
+- name: Store NIC Configuration Check
+  set_fact:
+    preflight_results: "{{ preflight_results + [{'Check': 'NIC Static IP Configuration', 'Result': nic_config_check, 'Reason': nic_config_reason}] }}"
+
+- name: Define NIC Bandwidth Check
+  set_fact:
+    nic_speed_check: "{{ 'PASS' if (primary_speed | int) >= 10000 else 'FAIL' }}"
+    nic_speed_reason: "{{ 'NIC speed is ' ~ primary_speed ~ ' Mbps, recommended is 10GbE' if (primary_speed | int) < 10000 else 'N/A' }}"
+
+- name: Store NIC Bandwidth Check
+  set_fact:
+    preflight_results: "{{ preflight_results + [{'Check': 'NIC Bandwidth (10GbE Recommended)', 'Result': nic_speed_check, 'Reason': nic_speed_reason}] }}"
+
+- name: Extract NIC Details
+  set_fact:
+    nic_config_details: "{{ ansible_facts['interfaces'] }}"
+
+- name: Store NIC Configuration Info
+  set_fact:
+    preflight_results: "{{ preflight_results + [{'Check': 'NIC Configuration', 'Result': 'INFO', 'Reason': 'NIC options: ' ~ nic_config_details | join(', ')}] }}"
+
+- name: Identify Front-End and Back-End NICs
+  set_fact:
+    frontend_nic: "{{ ansible_facts['default_ipv4']['interface'] | default('Unknown') }}"
+    backend_nic: "{{ ansible_facts['interfaces'] | difference(['lo', ansible_facts['default_ipv4']['interface']]) | first | default(ansible_facts['default_ipv4']['interface']) }}"
+
+- name: Define NIC Separation Check
+  set_fact:
+    nic_separation_check: "{{ 'PASS' if frontend_nic != backend_nic else 'FAIL' }}"
+    nic_separation_reason: "{{ 'Using same NIC for both front-end and back-end networks. Customers with large deployments should separate traffic for performance optimization.' if frontend_nic == backend_nic else 'N/A' }}"
+    preflight_failures: "{{ preflight_failures + ['NIC Separation'] if frontend_nic == backend_nic else preflight_failures }}"
+
+- name: Store NIC Separation Check
+  set_fact:
+    preflight_results: "{{ preflight_results + [{'Check': 'Separate NICs for Frontend & Backend Networks', 'Result': nic_separation_check, 'Reason': nic_separation_reason}] }}"
+
+# ansible.builtin.ping only proves that Ansible can connect and returns
+# 'pong'; ICMP ping is used so that round-trip times are reported.
+- name: Ping all hosts in inventory
+  command: "ping -c 3 -W 2 -q {{ hostvars[item]['ansible_host'] | default(item) }}"
+  register: ping_results
+  changed_when: false
+  failed_when: false
+  loop: "{{ groups['all'] }}"
+
+- name: Summarize network latency per host
+  set_fact:
+    latency_summary: "{{ latency_summary + [item.item ~ ': ' ~ (item.stdout_lines | default([]) | select('match', '^(rtt|round-trip)') | first | default('no reply'))] }}"
+  loop: "{{ ping_results.results }}"
+  loop_control:
+    label: "{{ item.item }}"
+
+- name: Store Network Latency Check
+  set_fact:
+    preflight_results: "{{ preflight_results + [{'Check': 'Network Latency', 'Result': 'INFO', 'Reason': 'Latency results: ' ~ latency_summary | join('; ')}] }}"
+
+# Written once on the controller; the template iterates over all play hosts.
+- name: Generate Preflight Check Report
+  delegate_to: localhost
+  run_once: true
+  become: false
+  template:
+    src: templates/preflight_report.j2
+    dest: ./ceph_preflight_report.txt
+
+# The report lives on the controller, so it has to be read there as well.
+- name: Read Preflight Check Report
+  delegate_to: localhost
+  run_once: true
+  become: false
+  slurp:
+    src: ./ceph_preflight_report.txt
+  register: report_content
+
+- name: Show Report Summary
+  run_once: true
+  debug:
+    msg: "{{ report_content['content'] | b64decode | regex_replace('\\r', '') | split('\n') }}"
+
+- name: Final Check - Fail if any critical checks failed
+  fail:
+    msg: "Preflight checks failed for the following: {{ preflight_failures | join(', ') }}. Please resolve these issues before proceeding."
+  when: preflight_failures | length > 0
diff --git a/templates/preflight_report.j2 b/templates/preflight_report.j2
new file mode 100644
index 0000000..8998f7b
--- /dev/null
+++ b/templates/preflight_report.j2
@@ -0,0 +1,24 @@
+{# One section per play host, built from that host's preflight_results and preflight_failures facts. -#}
+Preflight Check Report
+
+{% for host in ansible_play_hosts %}
+==================================================
+System Checks: {{ host }}
+--------------------------------------------------
+{% for item in hostvars[host]['preflight_results'] | default([]) %}
+- **{{ item['Check'] }}**: {{ '✅ Passed' if item['Result'] == 'PASS' else ('ℹ️ Info' if item['Result'] == 'INFO' else '❌ Failed') }}
+  - **Reason:** {{ item['Reason'] }}
+{% endfor %}
+
+==================================================
+Summary: {{ host }}
+--------------------------------------------------
+{% if hostvars[host]['preflight_failures'] | default([]) | length > 0 %}
+❌ **Critical Failures Detected**:
+  - {{ hostvars[host]['preflight_failures'] | join(', ') }}
+  - **Action Required**: Please fix the above issues before proceeding.
+{% else %}
+✅ **All Critical Checks Passed! You are good to go.**
+{% endif %}
+
+{% endfor %}