Version 1.0 (December 12, 2024)
- System Overview
- Hardware Architecture
- Operating System Setup
- Network Configuration
- SSH Configuration
- NFS Setup
- OpenMPI Installation
- Cluster Configuration
- Testing and Validation
- Troubleshooting
- Appendix
This document provides detailed instructions for setting up an OpenMPI cluster across five nodes (one master and four compute nodes) for high-performance computing applications.
- Initial OS installation
- Network configuration
- Security setup
- Parallel computing environment configuration
- Testing and validation procedures
- 1 Master Node (PDC-Node-1)
- 4 Compute Nodes (PDC-Node-2 to PDC-Node-5)
- High-speed network interconnect
- Shared storage system
Master Node (PDC-Node-1):
- CPU: Intel Xeon E5-2680 v4
- RAM: 128GB DDR4
- Storage: 2TB NVMe SSD
- Network: 10GbE
Compute Nodes (PDC-Node-2 to PDC-Node-5):
- CPU: Intel Xeon E5-2680 v4
- RAM: 64GB DDR4
- Storage: 1TB SSD
- Network: 10GbE
                      [10GbE Switch]
                             |
     +-----------+-----------+-----------+-----------+
     |           |           |           |           |
[PDC-Node-1][PDC-Node-2][PDC-Node-3][PDC-Node-4][PDC-Node-5]
 (Master)    (Compute)   (Compute)   (Compute)   (Compute)
# Install Ubuntu Server 24.04 LTS on all nodes
# Minimal installation with SSH server
# On all nodes
sudo apt update
sudo apt upgrade -y
sudo apt install -y build-essential
Edit /etc/netplan/00-installer-config.yaml on each node (gateway4 is deprecated in current netplan; use a default route instead):
network:
  version: 2
  ethernets:
    eno1:
      addresses:
        - <node_ip>/24
      routes:
        - to: default
          via: <gateway_ip>
      nameservers:
        addresses: [8.8.8.8, 8.8.4.4]
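After editing, apply the configuration and confirm the address and route were set (a minimal check; eno1 is the interface name assumed above):
# Apply the new network configuration
sudo netplan apply
# Verify the assigned address and default route
ip addr show eno1
ip route show default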
Edit /etc/hosts on all nodes:
127.0.0.1 localhost
192.168.1.101 PDC-Node-1
192.168.1.102 PDC-Node-2
192.168.1.103 PDC-Node-3
192.168.1.104 PDC-Node-4
192.168.1.105 PDC-Node-5
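A quick reachability check from the master node confirms the /etc/hosts entries resolve (assumes all nodes are already powered on and configured):
# Ping each node once by hostname
for i in {1..5}; do
    ping -c 1 PDC-Node-$i
done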
# On master node (PDC-Node-1)
ssh-keygen -t ed25519 -f ~/.ssh/cluster-key
# On master node
for i in {2..5}; do
    ssh-copy-id -i ~/.ssh/cluster-key.pub username@PDC-Node-$i
done
Create ~/.ssh/config on the master node (StrictHostKeyChecking no skips host-key verification, which is acceptable only on a trusted private network):
Host PDC-Node-*
    User username
    IdentityFile ~/.ssh/cluster-key
    StrictHostKeyChecking no
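OpenSSH refuses to use a config file that other users can write to, so tighten its permissions, then confirm that key-based login works without a password prompt:
chmod 600 ~/.ssh/config
ssh PDC-Node-2 hostname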
# Disable the firewall during initial setup; re-enable it with
# cluster-specific rules once everything works (see Security Best Practices)
sudo ufw disable
# On master node (PDC-Node-1): install NFS server
sudo apt install -y nfs-kernel-server
# Create shared directory
sudo mkdir -p /shared
sudo chown -R username:username /shared
sudo chmod 755 /shared
# Configure exports (restrict to the cluster subnet rather than *)
echo "/shared 192.168.1.0/24(rw,sync,no_subtree_check)" | sudo tee -a /etc/exports
sudo exportfs -a
sudo systemctl restart nfs-kernel-server
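Before configuring the clients, confirm the export is active (standard NFS utilities installed alongside the server package):
# List active exports on the master
sudo exportfs -v
showmount -e localhost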
# On compute nodes (PDC-Node-2 to PDC-Node-5): install NFS client
sudo apt install -y nfs-common
# Create mount point
sudo mkdir -p /shared
# Add to /etc/fstab
echo "PDC-Node-1:/shared /shared nfs defaults 0 0" | sudo tee -a /etc/fstab
sudo mount -a
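A simple write test verifies the mount end to end; the test filename is illustrative:
# Confirm the share is mounted and writable
df -h /shared
touch /shared/nfs_write_test && ls -l /shared/nfs_write_test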
# On all nodes
sudo apt install -y \
build-essential \
gfortran \
openmpi-bin \
openmpi-common \
libopenmpi-dev \
libopenblas-dev
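Verify the toolchain and MPI runtime on each node before continuing:
# Compiler and runtime versions
mpicc --version
mpirun --version
# Summary of the OpenMPI build configuration
ompi_info | head -n 20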
Create /shared/machinefile (slots per node should match the available hardware threads; a single Xeon E5-2680 v4 exposes 28):
PDC-Node-1 slots=28
PDC-Node-2 slots=28
PDC-Node-3 slots=28
PDC-Node-4 slots=28
PDC-Node-5 slots=28
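Before compiling anything, a quick smoke test (one process per node, spread with --map-by node) confirms the hostfile is readable and every node responds:
# Expect each of the five hostnames to appear once
mpirun -np 5 --map-by node --hostfile /shared/machinefile hostname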
Add the following lines to ~/.bashrc on all nodes:
export PATH="/shared:$PATH"
export LD_LIBRARY_PATH="/shared/lib:$LD_LIBRARY_PATH"
# Test passwordless SSH to each compute node
for i in {2..5}; do
    ssh PDC-Node-$i hostname
done
Create /shared/mpi_test.c:
#include <mpi.h>
#include <stdio.h>

int main(int argc, char** argv) {
    int world_size, world_rank;
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Get_processor_name(processor_name, &name_len);

    /* Each rank reports its rank, the total rank count, and its host */
    printf("Process %d of %d on %s\n",
           world_rank, world_size, processor_name);

    MPI_Finalize();
    return 0;
}
# Compile
mpicc -o /shared/mpi_test /shared/mpi_test.c
# Run across all nodes (140 processes = 5 nodes x 28 slots)
mpirun -np 140 --hostfile /shared/machinefile /shared/mpi_test
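Each rank prints exactly one line, and ordering across ranks is nondeterministic, so the output should resemble the following (rank numbers and hosts illustrative):
Process 0 of 140 on PDC-Node-1
Process 57 of 140 on PDC-Node-3
...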
- SSH Connection Issues
# Check SSH service
sudo systemctl status ssh
# Test SSH connection
ssh -vv username@PDC-Node-2
- NFS Issues
# Check NFS mounts
showmount -e PDC-Node-1
# Check NFS service
sudo systemctl status nfs-kernel-server
- MPI Issues
# Check OpenMPI version
mpirun --version
# Check process limits
ulimit -a
Example program: Monte Carlo estimation of Pi with MPI (saved, for example, as /shared/pi_monte_carlo.c):
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define NUM_POINTS 1000000

int main(int argc, char** argv) {
    int rank, size;
    double pi_local, pi_global;
    double start_time, end_time;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    start_time = MPI_Wtime();

    /* Each rank samples NUM_POINTS random points in the unit square
     * and counts how many fall inside the quarter circle. */
    unsigned int seed = rank;
    int count = 0;
    for (int i = 0; i < NUM_POINTS; i++) {
        double x = (double)rand_r(&seed) / RAND_MAX;
        double y = (double)rand_r(&seed) / RAND_MAX;
        if (x * x + y * y <= 1.0) count++;
    }
    pi_local = 4.0 * count / NUM_POINTS;

    /* Sum the per-rank estimates on rank 0, then average them */
    MPI_Reduce(&pi_local, &pi_global, 1, MPI_DOUBLE,
               MPI_SUM, 0, MPI_COMM_WORLD);

    end_time = MPI_Wtime();

    if (rank == 0) {
        pi_global = pi_global / size;
        printf("Pi: %.10f\n", pi_global);
        printf("Time: %f seconds\n", end_time - start_time);
    }

    MPI_Finalize();
    return 0;
}
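Assuming the source was saved as /shared/pi_monte_carlo.c (the illustrative name used above), compile and run it the same way as the test program:
# Compile with optimizations
mpicc -O2 -o /shared/pi_monte_carlo /shared/pi_monte_carlo.c
# Run across the full cluster
mpirun -np 140 --hostfile /shared/machinefile /shared/pi_monte_carlo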
- Operating System: Ubuntu 24.04 LTS
- OpenMPI Version: 4.1.6
- GCC Version: 13.2.0
- Configure UFW firewall on all nodes (see the sketch after this list)
- Implement regular security updates
- Monitor system logs
- Use strong SSH key encryption
- Regularly audit user access
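A minimal UFW policy for this cluster allows traffic only from the cluster subnet used earlier (192.168.1.0/24); adapt the rules to your environment before enabling:
# Deny inbound traffic by default, trust the cluster subnet
sudo ufw default deny incoming
sudo ufw allow from 192.168.1.0/24
sudo ufw enable
OpenMPI opens ephemeral TCP ports between nodes, so allowing the whole cluster subnet is simpler than enumerating individual ports.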
- Enable processor-specific optimizations
- Tune network parameters (see the sysctl sketch after this list)
- Optimize MPI parameters
- Monitor system performance
- Regular maintenance schedule
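As one example of network tuning, TCP buffer limits are often raised for 10GbE links; the values below are illustrative starting points, not benchmarked recommendations, and the file name is hypothetical:
# /etc/sysctl.d/90-cluster.conf
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
net.ipv4.tcp_rmem = 4096 87380 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216
Apply with sudo sysctl --system, and measure network throughput before and after to confirm any benefit.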
This documentation follows IEEE documentation conventions and covers the full process of setting up and maintaining an OpenMPI cluster. Follow the update and maintenance procedures described above to keep the cluster secure and performing well.