From c66db4cd5cf0302559a1f46b81600f0d7ba48181 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 19 Mar 2024 11:42:42 -0700 Subject: [PATCH 01/76] add json file --- requirements.json | 314 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 requirements.json diff --git a/requirements.json b/requirements.json new file mode 100644 index 00000000..27e887c1 --- /dev/null +++ b/requirements.json @@ -0,0 +1,314 @@ +{ + "spack": { + "ubuntu20.04": { + "branch": "develop" + }, + "ubuntu22.04": { + "branch": "develop" + }, + "almalinux8.7": { + "branch": "develop" + } + }, + "azcopy": { + "ubuntu20.04": { + "version": "10.17.0", + "release": "release20230123", + "sha256": "7da94b560f4de8265ae834a94b22b1ea94f1dbccc4551782eba56aa370244042" + }, + "ubuntu22.04": { + "version": "10.17.0", + "release": "release20230123", + "sha256": "7da94b560f4de8265ae834a94b22b1ea94f1dbccc4551782eba56aa370244042" + }, + "almalinux8.7": { + "version": "10.19.0", + "release": "release20230530", + "sha256": "71f583f80a31d54bd307b4fe068678e5cdde0dd4c8c121ee384e336340cb8017" + } + }, + "gcc": { + "almalinux8.7": { + "version": "12.3.0" + }, + "ubuntu22.04": { + "version": "12.3.0" + }, + "ubuntu20.04": { + "version": "12.3.0" + } + }, + "mofed": { + "ubuntu20.04": { + "version": "23.07-0.5.1.2", + "sha256": "923ddbd48d250b25ba50098ad8852ad6a591df3e975f3e0b9922b752181bdd12" + }, + "ubuntu22.04": { + "version": "23.07-0.5.1.2", + "sha256": "77e032a48de4c040b2f2dd3bf2edd11921de7caff59c773ac35208514f72eff5" + }, + "almalinux8.7": { + "version": "23.07-0.5.1.2", + "sha256": "59d318ea9814797f9196d16af06d7d1324114ea63015762b527478d8aec7d25e" + } + }, + "hpcx": { + "ubuntu20.04": { + "version": "2.16", + "sha256": "addda11a710c52268b7a725c13d9cc54c93deddf259c87d8547ad3c0422b87e1", + "url": "https://content.mellanox.com/hpc/hpc-x/v2.16/hpcx-v2.16-gcc-mlnx_ofed-ubuntu20.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz" + }, + "ubuntu22.04": { + "version": "2.16", + 
"sha256": "97eac5555d54f5fd8da1c354222a1aff2e85eb017682441e06287971a5b95772", + "url": "https://content.mellanox.com/hpc/hpc-x/v2.16/hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz" + }, + "almalinux8.7": { + "version": "2.16", + "sha256": "78dc6bc152489decc8a4191121c7f070adadf657b0c90d8713dd8feb7e5e968e", + "url": "https://content.mellanox.com/hpc/hpc-x/v2.16/hpcx-v2.16-gcc-mlnx_ofed-redhat8-cuda12-gdrcopy2-nccl2.18-x86_64.tbz" + } + }, + "mvapich2": { + "ubuntu20.04": { + "version": "2.3.7-1" + }, + "ubuntu22.04": { + "version": "2.3.7-1" + }, + "almalinux8.7": { + "version": "2.3.7-1" + } + }, + "ompi": { + "ubuntu20.04": { + "version": "4.1.5" + }, + "ubuntu22.04": { + "version": "4.1.5" + }, + "almalinux8.7": { + "version": "4.1.5" + } + }, + "impi_2021": { + "ubuntu20.04": { + "version": "2021.9.0" + }, + "ubuntu22.04": { + "version": "2021.9.0" + }, + "almalinux8.7": { + "version": "2021.9.0" + } + }, + "nvidia": { + "ubuntu20.04": { + "driver": { + "version": "535.86.10", + "sha256": "cec37fd9317091a01fd0380309400a14284769790651fe021e1d5aaa2327a370" + }, + "fabricmanager": { + "prefix": "535", + "distribution": "ubuntu2004", + "version": "535_535.86.10-1", + "sha256": "d0c4662279301187614646650da07f34a6fe267d789d48bc9ed63181af06ac29" + } + }, + "ubuntu22.04": { + "driver": { + "version": "535.86.10", + "sha256": "cec37fd9317091a01fd0380309400a14284769790651fe021e1d5aaa2327a370" + }, + "fabricmanager": { + "prefix": "535", + "distribution": "ubuntu2204", + "version": "535_535.86.10-1", + "sha256": "d0c4662279301187614646650da07f34a6fe267d789d48bc9ed63181af06ac29" + } + }, + "almalinux8.7": { + "driver": { + "version": "535.86.10", + "sha256": "cec37fd9317091a01fd0380309400a14284769790651fe021e1d5aaa2327a370" + }, + "fabricmanager": { + "prefix": "535", + "distribution": "rhel8", + "version": "535.86.10-1", + "sha256": "4c3cfc9f410c5c3e8dd2c50f9cdfc0c7e807094020bce3555bf8f1e09c053045" + } + } + }, + "cuda": { + "ubuntu20.04": { + 
"driver": { + "version": "12.2.1" + }, + "samples": { + "version": "12.2", + "sha256": "1823cfe28e97a9230107aa72b231f78952c0f178b71a920f036d360518480bdc" + } + }, + "ubuntu22.04": { + "driver": { + "version": "12.2.1" + }, + "samples": { + "version": "12.2", + "sha256": "1823cfe28e97a9230107aa72b231f78952c0f178b71a920f036d360518480bdc" + } + }, + "almalinux8.7": { + "driver": { + "version": "12.2.0" + }, + "samples": { + "version": "12.2", + "sha256": "1823cfe28e97a9230107aa72b231f78952c0f178b71a920f036d360518480bdc" + } + } + }, + "gdrcopy": { + "ubuntu20.04": { + "version": "2.3" + }, + "ubuntu22.04": { + "version": "2.3" + }, + "almalinux8.7": { + "version": "2.3" + } + }, + "nccl": { + "ubuntu20.04": { + "version": "2.19.3-1", + "rdmasharpplugins": { + "commit": "575c1e0" + } + }, + "ubuntu22.04": { + "version": "2.19.3-1", + "rdmasharpplugins": { + "commit": "575c1e0" + } + }, + "almalinux8.7": { + "version": "2.19.3-1", + "rdmasharpplugins": { + "commit": "575c1e0" + } + } + }, + "dcgm": { + "ubuntu20.04": { + "version": "3.1.8", + "distribution": "ubuntu2004" + }, + "ubuntu22.04": { + "version": "3.1.8", + "distribution": "ubuntu2204" + }, + "almalinux8.7": { + "version": "3.1.8", + "distribution": "rhel8" + } + }, + "intel_one_mkl": { + "ubuntu20.04": { + "version": "2023.2.0" + }, + "ubuntu22.04": { + "version": "2023.2.0" + }, + "almalinux8.7": { + "version": "2023.2.0" + } + }, + "waagent": { + "ubuntu20.04": { + "version": "2.9.0.4", + "sha256": "040969f507f73f3a2c95d5b0568225ad68f7f91bfec99bd92154c3fa9e28034b" + }, + "ubuntu22.04": { + "version": "2.9.0.4", + "sha256": "040969f507f73f3a2c95d5b0568225ad68f7f91bfec99bd92154c3fa9e28034b" + } + }, + "moneo": { + "ubuntu20.04": { + "version": "0.3.4" + }, + "ubuntu22.04": { + "version": "0.3.4" + }, + "almalinux8.7": { + "version": "0.3.4" + } + }, + "pssh": { + "almalinux8.7": { + "version": "2.3.1-29", + "sha256": "a9fee8148837b1cad15359028f85bc9ce688fd727e0ff13441972fdacf6be282" + } + }, + "amd": { + 
"almalinux8.7": { + "aocl": { + "version": "4.1" + }, + "aocc": { + "version": "4.1.0" + } + }, + "ubuntu20.04": { + "aocl": { + "version": "4.1" + }, + "aocc": { + "version": "4.1.0" + } + }, + "ubuntu22.04": { + "aocl": { + "version": "4.1" + }, + "aocc": { + "version": "4.1.0" + } + } + }, + "lustre": { + "almalinux8.7": { + "version": "2.15.1_24_gbaa21ca" + }, + "ubuntu20.04": { + "version": "2.15.1-29-gbae0abe" + }, + "ubuntu22.04": { + "version": "2.15.1-29-gbae0abe" + } + }, + "azhc": { + "almalinux8.7": { + "version": "0.2.9" + }, + "ubuntu20.04": { + "version": "0.2.9" + }, + "ubuntu22.04": { + "version": "0.2.9" + } + }, + "rdma_core": { + "almalinux8.7": { + "branch": "stable-v48" + }, + "ubuntu20.04": { + "branch": "stable-v48" + }, + "ubuntu22.04": { + "branch": "stable-v48" + } + } +} From 3155c8fa04c13bb60e5b04174d83e9cfc1990959 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 19 Mar 2024 20:04:59 -0700 Subject: [PATCH 02/76] first pass of adding config calls --- .../alma-8.7-hpc/install_lustre_client.sh | 4 +- .../alma-8.7-hpc/install_mellanoxofed.sh | 12 +- alma/alma-8.x/alma-8.7-hpc/install_mpis.sh | 16 +-- alma/alma-8.x/alma-8.7-hpc/set_properties.sh | 3 + alma/alma-8.x/common/install_amd_libs.sh | 8 +- alma/alma-8.x/common/install_dcgm.sh | 5 +- alma/alma-8.x/common/install_mpis.sh | 19 +-- .../common/install_nvidiagpudriver.sh | 60 +++++----- alma/alma-8.x/common/install_utils.sh | 15 +-- alma/common/install_amd_libs.sh | 4 +- alma/common/install_intel_libs.sh | 26 ++--- alma/common/install_lustre_client.sh | 3 +- alma/common/install_monitoring_tools.sh | 9 +- alma/common/install_mpis.sh | 59 ++++++---- alma/common/install_nccl.sh | 19 ++- .../install_azure_persistent_rdma_naming.sh | 2 +- common/install_health_checks.sh | 11 +- requirements.json | 15 +-- ubuntu/common/hpc-tuning.sh | 5 +- ubuntu/common/install_amd_libs.sh | 4 +- ubuntu/common/install_dcgm.sh | 5 +- ubuntu/common/install_intel_libs.sh | 27 ++--- 
ubuntu/common/install_lustre_client.sh | 3 +- ubuntu/common/install_monitoring_tools.sh | 9 +- ubuntu/common/install_mpis.sh | 108 ++++++++++-------- ubuntu/common/install_nccl.sh | 31 ++--- .../common/install_nvidia_fabric_manager.sh | 33 ++---- ubuntu/common/install_nvidiagpudriver.sh | 43 +++---- ubuntu/common/install_utils.sh | 14 +-- .../ubuntu-20.04-hpc/install_mellanoxofed.sh | 12 +- .../ubuntu-20.04-hpc/install_mpis.sh | 2 +- .../install_nvidiagpudriver.sh | 6 +- .../ubuntu-20.04-hpc/install_utils.sh | 5 +- .../ubuntu-20.04-hpc/set_properties.sh | 3 + .../ubuntu-22.04-hpc/install_mellanoxofed.sh | 12 +- .../ubuntu-22.04-hpc/install_mpis.sh | 2 +- .../install_nvidiagpudriver.sh | 6 +- .../ubuntu-22.04-hpc/install_utils.sh | 5 +- .../ubuntu-22.04-hpc/set_properties.sh | 3 + 39 files changed, 307 insertions(+), 321 deletions(-) diff --git a/alma/alma-8.x/alma-8.7-hpc/install_lustre_client.sh b/alma/alma-8.x/alma-8.7-hpc/install_lustre_client.sh index ad563027..a90d6c5d 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install_lustre_client.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install_lustre_client.sh @@ -1,8 +1,10 @@ #!/bin/bash set -ex +# Set Lustre driver version +LUSTRE_VERSION=$(jq -r '.lustre."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) + DISTRIB_CODENAME=el8 -LUSTRE_VERSION=2.15.1_24_g98d1cac REPO_PATH=/etc/yum.repos.d/amlfs.repo rpm --import https://packages.microsoft.com/keys/microsoft.asc diff --git a/alma/alma-8.x/alma-8.7-hpc/install_mellanoxofed.sh b/alma/alma-8.x/alma-8.7-hpc/install_mellanoxofed.sh index 1cc72422..940c45f1 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install_mellanoxofed.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install_mellanoxofed.sh @@ -1,12 +1,14 @@ #!/bin/bash set -ex -VERSION="23.07-0.5.1.2" -TARBALL="MLNX_OFED_LINUX-$VERSION-rhel8.7-x86_64.tgz" -MLNX_OFED_DOWNLOAD_URL=https://content.mellanox.com/ofed/MLNX_OFED-${VERSION}/$TARBALL +mofed_metadata=$(jq -r '.mofed."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +MOFED_VERSION=$(jq -r 
'.version' <<< $mofed_metadata) +MOFED_SHA256=$(jq -r '.sha256' <<< $mofed_metadata) +TARBALL="MLNX_OFED_LINUX-$MOFED_VERSION-rhel8.7-x86_64.tgz" +MLNX_OFED_DOWNLOAD_URL=https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/$TARBALL MOFED_FOLDER=$(basename ${MLNX_OFED_DOWNLOAD_URL} .tgz) -$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL "59d318ea9814797f9196d16af06d7d1324114ea63015762b527478d8aec7d25e" +$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL $MOFED_SHA256 tar zxvf ${TARBALL} KERNEL=( $(rpm -q kernel | sed 's/kernel\-//g') ) @@ -18,7 +20,7 @@ KERNEL=${KERNEL[-1]} # This causes openibd to ignore the kernel difference but relies on weak-updates # Restarting openibd /etc/init.d/openibd force-restart -$COMMON_DIR/write_component_version.sh "MOFED" $VERSION +$COMMON_DIR/write_component_version.sh "MOFED" $MOFED_VERSION # exclude opensm from updates sed -i "$ s/$/ opensm*/" /etc/dnf/dnf.conf diff --git a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh index f04bdd5d..7bc0cf97 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh @@ -11,13 +11,15 @@ set GCC=/opt/${GCC_VERSION}/bin/gcc INSTALL_PREFIX=/opt -# HPC-X v2.15 -HPCX_VERSION="v2.16" -TARBALL="hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-redhat8-cuda12-gdrcopy2-nccl2.18-x86_64.tbz" -HPCX_DOWNLOAD_URL=https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/${TARBALL} -HPCX_FOLDER=$(basename ${HPCX_DOWNLOAD_URL} .tbz) - -$COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL "78dc6bc152489decc8a4191121c7f070adadf657b0c90d8713dd8feb7e5e968e" +# Install HPC-x +hpcx_metadata=$(jq -r '.hpcx."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +HPCX_VERSION=$(jq -r '.version' <<< $hpcx_metadata) +HPCX_SHA256=$(jq -r '.sha256' <<< $hpcx_metadata) +HPCX_DOWNLOAD_URL=$(jq -r '.url' <<< $hpcx_metadata) +TARBALL=$(basename $HPCX_DOWNLOAD_URL) +HPCX_FOLDER=$(basename $HPCX_DOWNLOAD_URL .tbz) + 
+$COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL $HPCX_SHA256 tar -xvf ${TARBALL} mv ${HPCX_FOLDER} ${INSTALL_PREFIX} HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} diff --git a/alma/alma-8.x/alma-8.7-hpc/set_properties.sh b/alma/alma-8.x/alma-8.7-hpc/set_properties.sh index ea73c23a..b12e4789 100755 --- a/alma/alma-8.x/alma-8.7-hpc/set_properties.sh +++ b/alma/alma-8.x/alma-8.7-hpc/set_properties.sh @@ -5,3 +5,6 @@ export COMMON_DIR=../../../common export ALMA_COMMON_DIR=../../common export TEST_DIR=../../../tests export DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) + +# Component Versions +export COMPONENT_VERSIONS=$(jq -r . $TOP_DIR/requirements.json) \ No newline at end of file diff --git a/alma/alma-8.x/common/install_amd_libs.sh b/alma/alma-8.x/common/install_amd_libs.sh index f71edd10..9964a37e 100755 --- a/alma/alma-8.x/common/install_amd_libs.sh +++ b/alma/alma-8.x/common/install_amd_libs.sh @@ -4,10 +4,14 @@ set -ex INSTALL_PREFIX=/opt/amd mkdir -p ${INSTALL_PREFIX} -AOCL_VERSION="4.0" +# Set AOCL version and checksum +amd_metadata=$(jq -r '.amd."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +AOCL_VERSION=$(jq -r '.aocl.version' <<< $amd_metadata) +AOCL_SHA256=$(jq -r '.aocl.sha256' <<< $amd_metadata) + TARBALL="aocl-linux-aocc-${AOCL_VERSION}.tar.gz" AOCL_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/${TARBALL} -$COMMON_DIR/download_and_verify.sh $AOCL_DOWNLOAD_URL "c8000a66aaa2a257252cbb307732b4e66758b72b08f43b3723f4eb5404ba28c8" +$COMMON_DIR/download_and_verify.sh $AOCL_DOWNLOAD_URL $AOCL_SHA256 tar -xvf ${TARBALL} pushd aocl-linux-aocc-${AOCL_VERSION} diff --git a/alma/alma-8.x/common/install_dcgm.sh b/alma/alma-8.x/common/install_dcgm.sh index fa3f5b97..e8cb36c1 100755 --- a/alma/alma-8.x/common/install_dcgm.sh +++ b/alma/alma-8.x/common/install_dcgm.sh @@ -1,10 +1,13 @@ #!/bin/bash set -ex +# Set DCGM version info +dcgm_metadata=$(jq -r '.dcgm."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +DCGM_VERSION=$(jq -r '.version' <<< 
$dcgm_metadata) + # Install DCGM # Reference: https://developer.nvidia.com/dcgm#Downloads # the repo is already added during nvidia/ cuda installations -DCGM_VERSION=3.1.8 dnf clean expire-cache dnf install -y datacenter-gpu-manager-1:${DCGM_VERSION} $COMMON_DIR/write_component_version.sh "DCGM" ${DCGM_VERSION} diff --git a/alma/alma-8.x/common/install_mpis.sh b/alma/alma-8.x/common/install_mpis.sh index d5e3f19c..3f712579 100755 --- a/alma/alma-8.x/common/install_mpis.sh +++ b/alma/alma-8.x/common/install_mpis.sh @@ -14,13 +14,18 @@ export LD_LIBRARY_PATH=/opt/${GCC_VERSION}/lib64:$LD_LIBRARY_PATH set CC=/opt/${GCC_VERSION}/bin/gcc set GCC=/opt/${GCC_VERSION}/bin/gcc -# Intel MPI 2018 (update 4) -IMPI_VERSION="2018.4.274" -$COMMON_DIR/write_component_version.sh "IMPI_2018" ${IMPI_VERSION} -IMPI_2018_DOWNLOAD_URL=http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/13651/l_mpi_${IMPI_VERSION}.tgz -$COMMON_DIR/download_and_verify.sh $IMPI_2018_DOWNLOAD_URL "a1114b3eb4149c2f108964b83cad02150d619e50032059d119ac4ffc9d5dd8e0" -tar -xvf l_mpi_${IMPI_VERSION}.tgz -cd l_mpi_${IMPI_VERSION} +# Install Intel MPI (archive is a .tgz) +impi_metadata=$(jq -r '.impi."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +IMPI_VERSION=$(jq -r '.version' <<< $impi_metadata) +IMPI_SHA256=$(jq -r '.sha256' <<< $impi_metadata) +IMPI_DOWNLOAD_URL=$(jq -r '.url' <<< $impi_metadata) +TARBALL=$(basename $IMPI_DOWNLOAD_URL) +IMPI_FOLDER=$(basename $IMPI_DOWNLOAD_URL .tgz) + +$COMMON_DIR/write_component_version.sh "IMPI" ${IMPI_VERSION} +$COMMON_DIR/download_and_verify.sh $IMPI_DOWNLOAD_URL $IMPI_SHA256 +tar -xvf ${TARBALL} +cd ${IMPI_FOLDER} # Update the silent.cfg file to proceed with installation sed -i -e 's/ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' silent.cfg ./install.sh --silent ./silent.cfg diff --git a/alma/alma-8.x/common/install_nvidiagpudriver.sh b/alma/alma-8.x/common/install_nvidiagpudriver.sh index 785d9510..190f7456 100755 --- a/alma/alma-8.x/common/install_nvidiagpudriver.sh +++ 
b/alma/alma-8.x/common/install_nvidiagpudriver.sh @@ -1,31 +1,19 @@ #!/bin/bash set -ex -case ${DISTRIBUTION} in - "almalinux8.6") NVIDIA_VERSION="510.85.02"; - CUDA_VERSION="11-6"; - CUDA_SAMPLES_VERSION="11.6"; - NVIDIA_DRIVER_CHECKSUM="372427e633f32cff6dd76020e8ed471ef825d38878bd9655308b6efea1051090"; - NVIDIA_FABRIC_MANAGER_VERSION="510.85.02-1"; - NVIDIA_FABRIC_MANAGER_CHECKSUM="7f8468e92deb78e427df8b4947c4b0fd7a7b5eedf1e3961e60436b4620b2fa1d"; - ;; - "almalinux8.7") NVIDIA_VERSION="535.86.10"; - CUDA_VERSION="12-2"; - CUDA_SAMPLES_VERSION="12.2"; - NVIDIA_DRIVER_CHECKSUM="cec37fd9317091a01fd0380309400a14284769790651fe021e1d5aaa2327a370"; - NVIDIA_FABRIC_MANAGER_VERSION="535.86.10-1"; - NVIDIA_FABRIC_MANAGER_CHECKSUM="4c3cfc9f410c5c3e8dd2c50f9cdfc0c7e807094020bce3555bf8f1e09c053045"; - ;; - *) ;; -esac +# Set the driver versions +cuda_metadata=$(jq -r '.cuda."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +CUDA_DRIVER_VERSION=$(jq -r '.driver.version' <<< $cuda_metadata) +CUDA_DRIVER_DISTRIBUTION=$(jq -r '.driver.distribution' <<< $cuda_metadata) +CUDA_SAMPLES_VERSION=$(jq -r '.samples.version' <<< $cuda_metadata) # Install Cuda -dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo +dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DRIVER_DISTRIBUTION}/x86_64/cuda-${CUDA_DRIVER_DISTRIBUTION}.repo dnf clean expire-cache -dnf install cuda-toolkit-${CUDA_VERSION} -y +dnf install cuda-toolkit-${CUDA_DRIVER_VERSION} -y echo 'export PATH=$PATH:/usr/local/cuda/bin' | tee -a /etc/bash.bashrc echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64' | tee -a /etc/bash.bashrc -$COMMON_DIR/write_component_version.sh "CUDA" ${CUDA_VERSION} +$COMMON_DIR/write_component_version.sh "CUDA" ${CUDA_DRIVER_VERSION} # Download CUDA samples TARBALL="v${CUDA_SAMPLES_VERSION}.tar.gz" @@ -37,12 +25,16 @@ make -j $(nproc) mv -vT ./Samples 
/usr/local/cuda-${CUDA_SAMPLES_VERSION}/samples popd -# Nvidia driver -NVIDIA_DRIVER_URL=https://us.download.nvidia.com/tesla/${NVIDIA_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run -$COMMON_DIR/download_and_verify.sh $NVIDIA_DRIVER_URL ${NVIDIA_DRIVER_CHECKSUM} -bash NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run --silent --dkms -dkms install --no-depmod -m nvidia -v ${NVIDIA_VERSION} -k `uname -r` --force -$COMMON_DIR/write_component_version.sh "NVIDIA" ${NVIDIA_VERSION} +# Install NVIDIA driver +nvidia_driver_metadata=$(jq -r '.nvidia."'"$DISTRIBUTION"'".driver' <<< $COMPONENT_VERSIONS) +NVIDIA_DRIVER_VERSION=$(jq -r '.version' <<< $nvidia_driver_metadata) +NVIDIA_DRIVER_SHA256=$(jq -r '.sha256' <<< $nvidia_driver_metadata) +NVIDIA_DRIVER_URL=https://us.download.nvidia.com/tesla/${NVIDIA_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run + +$COMMON_DIR/download_and_verify.sh $NVIDIA_DRIVER_URL ${NVIDIA_DRIVER_SHA256} +bash NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run --silent --dkms +dkms install --no-depmod -m nvidia -v ${NVIDIA_DRIVER_VERSION} -k `uname -r` --force +$COMMON_DIR/write_component_version.sh "NVIDIA" ${NVIDIA_DRIVER_VERSION} # load the nvidia-peermem coming as a part of NVIDIA GPU driver # Reference - https://download.nvidia.com/XFree86/Linux-x86_64/510.85.02/README/nvidia-peermem.html @@ -51,7 +43,7 @@ modprobe nvidia-peermem lsmod | grep nvidia_peermem # Install GDRCopy -GDRCOPY_VERSION="2.3" +GDRCOPY_VERSION=$(jq -r '.gdrcopy."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) TARBALL="v${GDRCOPY_VERSION}.tar.gz" GDRCOPY_DOWNLOAD_URL=https://github.com/NVIDIA/gdrcopy/archive/refs/tags/${TARBALL} wget $GDRCOPY_DOWNLOAD_URL @@ -67,12 +59,18 @@ popd $COMMON_DIR/write_component_version.sh "GDRCOPY" ${GDRCOPY_VERSION} +# Set NVIDIA fabricmanager version +nvidia_fabricmanager_metadata=$(jq -r '.nvidia."'"$DISTRIBUTION"'".fabricmanager' <<< $COMPONENT_VERSIONS) +NVIDIA_FABRICMANAGER_DISTRIBUTION=$(jq -r '.distribution' <<< 
$nvidia_fabricmanager_metadata) +NVIDIA_FABRICMANAGER_VERSION=$(jq -r '.version' <<< $nvidia_fabricmanager_metadata) +NVIDIA_FABRICMANAGER_SHA256=$(jq -r '.sha256' <<< $nvidia_fabricmanager_metadata) + # Install Fabric Manager -NVIDIA_FABRIC_MNGR_URL=http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/nvidia-fabric-manager-${NVIDIA_FABRIC_MANAGER_VERSION}.x86_64.rpm -$COMMON_DIR/download_and_verify.sh ${NVIDIA_FABRIC_MNGR_URL} ${NVIDIA_FABRIC_MANAGER_CHECKSUM} -yum install -y ./nvidia-fabric-manager-${NVIDIA_FABRIC_MANAGER_VERSION}.x86_64.rpm +NVIDIA_FABRIC_MNGR_URL=http://developer.download.nvidia.com/compute/cuda/repos/${NVIDIA_FABRICMANAGER_DISTRIBUTION}/x86_64/nvidia-fabric-manager-${NVIDIA_FABRICMANAGER_VERSION}.x86_64.rpm +$COMMON_DIR/download_and_verify.sh ${NVIDIA_FABRIC_MNGR_URL} ${NVIDIA_FABRICMANAGER_SHA256} +yum install -y ./nvidia-fabric-manager-${NVIDIA_FABRICMANAGER_VERSION}.x86_64.rpm sed -i "$ s/$/ nvidia-fabric-manager/" /etc/dnf/dnf.conf -$COMMON_DIR/write_component_version.sh "NVIDIA_FABRIC_MANAGER" ${NVIDIA_FABRIC_MANAGER_VERSION} +$COMMON_DIR/write_component_version.sh "NVIDIA_FABRIC_MANAGER" ${NVIDIA_FABRICMANAGER_VERSION} # cleanup downloaded files rm -rf *.run *tar.gz *.rpm diff --git a/alma/alma-8.x/common/install_utils.sh b/alma/alma-8.x/common/install_utils.sh index bbf57dd9..01028094 100755 --- a/alma/alma-8.x/common/install_utils.sh +++ b/alma/alma-8.x/common/install_utils.sh @@ -77,13 +77,14 @@ yum localinstall ./repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/j rm -rf ./dl.fedoraproject.org/ rm -rf ./repo.almalinux.org/ -# Install azcopy tool -# To copy blobs or files to or from a storage account. 
-VERSION="10.17.0" -RELEASE_TAG="release20230123" -TARBALL="azcopy_linux_amd64_${VERSION}.tar.gz" -AZCOPY_DOWNLOAD_URL="https://azcopyvnext.azureedge.net/${RELEASE_TAG}/${TARBALL}" -AZCOPY_FOLDER=$(basename ${AZCOPY_DOWNLOAD_URL} .tgz) +# Install azcopy tool +# To copy blobs or files to or from a storage account +azcopy_metadata=$(jq -r '.azcopy."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +azcopy_version=$(jq -r '.version' <<< $azcopy_metadata) +azcopy_release=$(jq -r '.release' <<< $azcopy_metadata) +azcopy_sha256=$(jq -r '.sha256' <<< $azcopy_metadata) +TARBALL="azcopy_linux_amd64_$azcopy_version.tar.gz" +AZCOPY_DOWNLOAD_URL="https://azcopyvnext.azureedge.net/$azcopy_release/${TARBALL}" wget ${AZCOPY_DOWNLOAD_URL} tar -xvf ${TARBALL} diff --git a/alma/common/install_amd_libs.sh b/alma/common/install_amd_libs.sh index c6132990..dda0a8c4 100755 --- a/alma/common/install_amd_libs.sh +++ b/alma/common/install_amd_libs.sh @@ -1,6 +1,8 @@ #!/bin/bash -AOCC_VERSION=4.0.0-1 +# Set AOCC and AOCL versions +amd_metadata=$(jq -r '.amd."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +AOCC_VERSION=$(jq -r '.aocc.version' <<< $amd_metadata) # install dependency wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${AOCC_VERSION}.x86_64.rpm diff --git a/alma/common/install_intel_libs.sh b/alma/common/install_intel_libs.sh index a731199d..3d1ddc23 100755 --- a/alma/common/install_intel_libs.sh +++ b/alma/common/install_intel_libs.sh @@ -1,22 +1,14 @@ #!/bin/bash set -ex -case ${DISTRIBUTION} in - "almalinux8.6") INTEL_MKL_VERSION="2022.1.0.223"; - RELEASE_VERSION="18721"; - CHECKSUM="4b325a3c4c56e52f4ce6c8fbb55d7684adc16425000afc860464c0f29ea4563e"; - IDENTIFIER="irc_nas"; - ;; - "almalinux8.7") INTEL_MKL_VERSION="2023.2.0.49497"; - RELEASE_VERSION="adb8a02c-4ee7-4882-97d6-a524150da358"; - CHECKSUM="4a0d93da85a94d92e0ad35dc0fc3b3ab7f040bd55ad374c4d5ec81a57a2b872b"; - IDENTIFIER="IRC_NAS"; - ;; - *) ;; -esac +# Set Intel® oneAPI Math Kernel Library info 
+intel_one_mkl_metadata=$(jq -r '.intel_one_mkl."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +INTEL_ONE_MKL_VERSION=$(jq -r '.version' <<< $intel_one_mkl_metadata) +INTEL_ONE_MKL_SHA256=$(jq -r '.sha256' <<< $intel_one_mkl_metadata) +INTEL_ONE_MKL_DOWNLOAD_URL=$(jq -r '.url' <<< $intel_one_mkl_metadata) +INTEL_ONE_MKL_OFFLINE_INSTALLER=$(basename $INTEL_ONE_MKL_DOWNLOAD_URL) # Intel® oneAPI Math Kernel Library -ONE_MKL_DOWNLOAD_URL=https://registrationcenter-download.intel.com/akdlm/${IDENTIFIER}/${RELEASE_VERSION}/l_onemkl_p_${INTEL_MKL_VERSION}_offline.sh -$COMMON_DIR/write_component_version.sh "INTEL_ONE_MKL" ${INTEL_MKL_VERSION} -$COMMON_DIR/download_and_verify.sh ${ONE_MKL_DOWNLOAD_URL} ${CHECKSUM} -sh ./l_onemkl_p_${INTEL_MKL_VERSION}_offline.sh -s -a -s --eula accept +$COMMON_DIR/write_component_version.sh "INTEL_ONE_MKL" ${INTEL_ONE_MKL_VERSION} +$COMMON_DIR/download_and_verify.sh ${INTEL_ONE_MKL_DOWNLOAD_URL} ${INTEL_ONE_MKL_SHA256} +sh ./${INTEL_ONE_MKL_OFFLINE_INSTALLER} -s -a -s --eula accept diff --git a/alma/common/install_lustre_client.sh b/alma/common/install_lustre_client.sh index de87ee80..a951327f 100755 --- a/alma/common/install_lustre_client.sh +++ b/alma/common/install_lustre_client.sh @@ -1,7 +1,8 @@ #!/bin/bash set -ex -LUSTRE_VERSION=2.15.1_24_gbaa21ca +# Set Lustre version +LUSTRE_VERSION=$(jq -r '.lustre."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) # Expected params: # $1 = the major version of the distro. "8" for RHEL/Alma8, "9" for RHEL/Alma9. 
diff --git a/alma/common/install_monitoring_tools.sh b/alma/common/install_monitoring_tools.sh index 79568288..028b4326 100755 --- a/alma/common/install_monitoring_tools.sh +++ b/alma/common/install_monitoring_tools.sh @@ -2,11 +2,8 @@ set -e -# grab latest release version -repo=Azure/Moneo -release_version=$(curl -s "https://api.github.com/repos/$repo/releases/latest" | jq -r '.tag_name') - -MONEO_VERSION=$release_version +# Set moneo metadata +MONEO_VERSION=$(jq -r '.moneo."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) # Dependencies python3 -m pip install --upgrade pip @@ -19,7 +16,7 @@ MONITOR_DIR=/opt/azurehpc/tools mkdir -p $MONITOR_DIR pushd $MONITOR_DIR - git clone https://github.com/Azure/Moneo --branch $MONEO_VERSION + git clone https://github.com/Azure/Moneo --branch v$MONEO_VERSION chmod 777 Moneo diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 97049aba..9068c1c5 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -14,23 +14,32 @@ export LD_LIBRARY_PATH=/opt/${GCC_VERSION}/lib64:$LD_LIBRARY_PATH set CC=/opt/${GCC_VERSION}/bin/gcc set GCC=/opt/${GCC_VERSION}/bin/gcc -# MVAPICH2 2.3.7-1 -MV2_VERSION="2.3.7-1" -MV2_DOWNLOAD_URL=http://mvapich.cse.ohio-state.edu/download/mvapich/mv2/mvapich2-${MV2_VERSION}.tar.gz -$COMMON_DIR/download_and_verify.sh $MV2_DOWNLOAD_URL "fdd971cf36d6476d007b5d63d19414546ca8a2937b66886f24a1d9ca154634e4" -tar -xvf mvapich2-${MV2_VERSION}.tar.gz -cd mvapich2-${MV2_VERSION} -./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MV2_VERSION} --enable-g=none --enable-fast=yes && make -j$(nproc) && make install +# MVAPICH2 +mvapich2_metadata=$(jq -r '.mvapich2."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +MVAPICH2_VERSION=$(jq -r '.version' <<< $mvapich2_metadata) +MVAPICH2_SHA256=$(jq -r '.sha256' <<< $mvapich2_metadata) +MVAPICH2_DOWNLOAD_URL=$(jq -r '.url' <<< $mvapich2_metadata) +TARBALL=$(basename $MVAPICH2_DOWNLOAD_URL) +MVAPICH2_FOLDER=$(basename $MVAPICH2_DOWNLOAD_URL 
.tar.gz) + +tar -xvf ${TARBALL} +cd ${MVAPICH2_FOLDER} +./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes && make -j$(nproc) && make install cd .. -$COMMON_DIR/write_component_version.sh "MVAPICH2" ${MV2_VERSION} +$COMMON_DIR/write_component_version.sh "MVAPICH2" ${MVAPICH2_VERSION} + +# Install Open MPI +ompi_metadata=$(jq -r '.ompi."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +OMPI_VERSION=$(jq -r '.version' <<< $ompi_metadata) +OMPI_SHA256=$(jq -r '.sha256' <<< $ompi_metadata) +OMPI_DOWNLOAD_URL=$(jq -r '.url' <<< $ompi_metadata) +TARBALL=$(basename $OMPI_DOWNLOAD_URL) +OMPI_FOLDER=$(basename $OMPI_DOWNLOAD_URL .tar.gz) -# OpenMPI 4.1.5 -OMPI_VERSION="4.1.5" -OMPI_DOWNLOAD_URL=https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz -$COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL "c018b127619d2a2a30c1931f316fc8a245926d0f5b4ebed4711f9695e7f70925" -tar -xvf openmpi-${OMPI_VERSION}.tar.gz -cd openmpi-${OMPI_VERSION} +$COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL $OMPI_SHA256 +tar -xvf $TARBALL +cd $OMPI_FOLDER ./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install cd .. 
$COMMON_DIR/write_component_version.sh "OMPI" ${OMPI_VERSION} @@ -50,21 +59,21 @@ $COMMON_DIR/write_component_version.sh "IMPI_2021" ${IMPI_2021_VERSION} mkdir -p /usr/share/Modules/modulefiles/mpi/ # MVAPICH2 -cat << EOF >> /usr/share/Modules/modulefiles/mpi/mvapich2-${MV2_VERSION} +cat << EOF >> /usr/share/Modules/modulefiles/mpi/mvapich2-${MVAPICH2_VERSION} #%Module 1.0 # -# MVAPICH2 ${MV2_VERSION} +# MVAPICH2 ${MVAPICH2_VERSION} # conflict mpi module load ${GCC_VERSION} -prepend-path PATH /opt/mvapich2-${MV2_VERSION}/bin -prepend-path LD_LIBRARY_PATH /opt/mvapich2-${MV2_VERSION}/lib -prepend-path MANPATH /opt/mvapich2-${MV2_VERSION}/share/man -setenv MPI_BIN /opt/mvapich2-${MV2_VERSION}/bin -setenv MPI_INCLUDE /opt/mvapich2-${MV2_VERSION}/include -setenv MPI_LIB /opt/mvapich2-${MV2_VERSION}/lib -setenv MPI_MAN /opt/mvapich2-${MV2_VERSION}/share/man -setenv MPI_HOME /opt/mvapich2-${MV2_VERSION} +prepend-path PATH /opt/mvapich2-${MVAPICH2_VERSION}/bin +prepend-path LD_LIBRARY_PATH /opt/mvapich2-${MVAPICH2_VERSION}/lib +prepend-path MANPATH /opt/mvapich2-${MVAPICH2_VERSION}/share/man +setenv MPI_BIN /opt/mvapich2-${MVAPICH2_VERSION}/bin +setenv MPI_INCLUDE /opt/mvapich2-${MVAPICH2_VERSION}/include +setenv MPI_LIB /opt/mvapich2-${MVAPICH2_VERSION}/lib +setenv MPI_MAN /opt/mvapich2-${MVAPICH2_VERSION}/share/man +setenv MPI_HOME /opt/mvapich2-${MVAPICH2_VERSION} EOF # OpenMPI @@ -101,7 +110,7 @@ setenv MPI_HOME /opt/intel/oneapi/mpi/${IMPI_2021_VERSION} EOF # Create symlinks for modulefiles -ln -s /usr/share/Modules/modulefiles/mpi/mvapich2-${MV2_VERSION} /usr/share/Modules/modulefiles/mpi/mvapich2 +ln -s /usr/share/Modules/modulefiles/mpi/mvapich2-${MVAPICH2_VERSION} /usr/share/Modules/modulefiles/mpi/mvapich2 ln -s /usr/share/Modules/modulefiles/mpi/openmpi-${OMPI_VERSION} /usr/share/Modules/modulefiles/mpi/openmpi ln -s /usr/share/Modules/modulefiles/mpi/impi_${IMPI_2021_VERSION} /usr/share/Modules/modulefiles/mpi/impi-2021 diff --git a/alma/common/install_nccl.sh 
b/alma/common/install_nccl.sh index 8a0806dd..5e5b42f3 100755 --- a/alma/common/install_nccl.sh +++ b/alma/common/install_nccl.sh @@ -1,15 +1,10 @@ #!/bin/bash set -ex -case ${DISTRIBUTION} in - "almalinux8.6") NCCL_VERSION="2.14.3-1"; - CUDA_VERSION="11.6"; - NCCL_RDMA_SHARP_COMMIT="575c1e0";; - "almalinux8.7") NCCL_VERSION="2.19.3-1"; - CUDA_VERSION="12.2"; - NCCL_RDMA_SHARP_COMMIT="575c1e0";; - *) ;; -esac +# Set NCCL versions +NCCL_VERSION=$(jq -r '.nccl."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) +NCCL_RDMA_SHARP_COMMIT=$(jq -r '.nccl."'"$DISTRIBUTION"'".rdmasharpplugins.commit' <<< $COMPONENT_VERSIONS) +CUDA_DRIVER_VERSION=$(jq -r '.cuda."'"$DISTRIBUTION"'".driver.version' <<< $COMPONENT_VERSIONS) # Install NCCL yum install -y rpm-build rpmdevtools @@ -22,9 +17,9 @@ tar -xvf ${TARBALL} pushd nccl-${NCCL_VERSION} make -j src.build make pkg.redhat.build -rpm -i ./build/pkg/rpm/x86_64/libnccl-${NCCL_VERSION}+cuda${CUDA_VERSION}.x86_64.rpm -rpm -i ./build/pkg/rpm/x86_64/libnccl-devel-${NCCL_VERSION}+cuda${CUDA_VERSION}.x86_64.rpm -rpm -i ./build/pkg/rpm/x86_64/libnccl-static-${NCCL_VERSION}+cuda${CUDA_VERSION}.x86_64.rpm +rpm -i ./build/pkg/rpm/x86_64/libnccl-${NCCL_VERSION}+cuda${CUDA_DRIVER_VERSION}.x86_64.rpm +rpm -i ./build/pkg/rpm/x86_64/libnccl-devel-${NCCL_VERSION}+cuda${CUDA_DRIVER_VERSION}.x86_64.rpm +rpm -i ./build/pkg/rpm/x86_64/libnccl-static-${NCCL_VERSION}+cuda${CUDA_DRIVER_VERSION}.x86_64.rpm sed -i "$ s/$/ libnccl*/" /etc/dnf/dnf.conf popd diff --git a/common/install_azure_persistent_rdma_naming.sh b/common/install_azure_persistent_rdma_naming.sh index 369ca637..0f19c356 100755 --- a/common/install_azure_persistent_rdma_naming.sh +++ b/common/install_azure_persistent_rdma_naming.sh @@ -7,7 +7,7 @@ set -ex # pushd /tmp -rdma_core_branch=stable-v34 +rdma_core_branch=$(jq -r '.rdma_core."'"$DISTRIBUTION"'".branch' <<< $COMPONENT_VERSIONS) git clone -b $rdma_core_branch https://github.com/linux-rdma/rdma-core.git pushd rdma-core bash 
build.sh diff --git a/common/install_health_checks.sh b/common/install_health_checks.sh index c9b2c51f..a0fc7482 100755 --- a/common/install_health_checks.sh +++ b/common/install_health_checks.sh @@ -1,13 +1,8 @@ #!/bin/bash - - set -e -# grab latest release version -repo=Azure/azurehpc-health-checks -release_version=$(curl -s "https://api.github.com/repos/$repo/releases/latest" | jq -r '.tag_name') - -AZHC_VERSION=$release_version +# Set the azhc version +AZHC_VERSION=$(jq -r '.azhc."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) DEST_TEST_DIR=/opt/azurehpc/test AZHC_DIR=/opt/azurehpc/test/azurehpc-health-checks @@ -16,7 +11,7 @@ mkdir -p $DEST_TEST_DIR pushd $DEST_TEST_DIR -git clone https://github.com/Azure/azurehpc-health-checks.git --branch $AZHC_VERSION +git clone https://github.com/Azure/azurehpc-health-checks.git --branch v$AZHC_VERSION pushd azurehpc-health-checks diff --git a/requirements.json b/requirements.json index 27e887c1..e3165650 100644 --- a/requirements.json +++ b/requirements.json @@ -27,17 +27,6 @@ "sha256": "71f583f80a31d54bd307b4fe068678e5cdde0dd4c8c121ee384e336340cb8017" } }, - "gcc": { - "almalinux8.7": { - "version": "12.3.0" - }, - "ubuntu22.04": { - "version": "12.3.0" - }, - "ubuntu20.04": { - "version": "12.3.0" - } - }, "mofed": { "ubuntu20.04": { "version": "23.07-0.5.1.2", @@ -91,7 +80,7 @@ "version": "4.1.5" } }, - "impi_2021": { + "impi": { "ubuntu20.04": { "version": "2021.9.0" }, @@ -99,7 +88,7 @@ "version": "2021.9.0" }, "almalinux8.7": { - "version": "2021.9.0" + "version": "2018.4.274" } }, "nvidia": { diff --git a/ubuntu/common/hpc-tuning.sh b/ubuntu/common/hpc-tuning.sh index 5db6b386..29c25ea0 100755 --- a/ubuntu/common/hpc-tuning.sh +++ b/ubuntu/common/hpc-tuning.sh @@ -70,10 +70,13 @@ then exit ${error_code} fi +# Set waagent version and sha256 +waagent_metadata=$(jq -r '.waagent."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +WAAGENT_VERSION=$(jq -r '.version' <<< $waagent_metadata) + # Install WALinuxAgent 
apt-get install -y python3-setuptools pip3 install distro -WAAGENT_VERSION=2.9.0.4 $COMMON_DIR/write_component_version.sh "WAAGENT" ${WAAGENT_VERSION} DOWNLOAD_URL=https://github.com/Azure/WALinuxAgent/archive/refs/tags/v${WAAGENT_VERSION}.tar.gz wget ${DOWNLOAD_URL} diff --git a/ubuntu/common/install_amd_libs.sh b/ubuntu/common/install_amd_libs.sh index bebc1d76..03b46977 100755 --- a/ubuntu/common/install_amd_libs.sh +++ b/ubuntu/common/install_amd_libs.sh @@ -1,6 +1,8 @@ #!/bin/bash -AOCC_VERSION=4.0.0_1 +# Set AOCC and AOCL versions +amd_metadata=$(jq -r '.amd."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +AOCC_VERSION=$(jq -r '.aocc.version' <<< $amd_metadata) # install dependency wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${AOCC_VERSION}_amd64.deb diff --git a/ubuntu/common/install_dcgm.sh b/ubuntu/common/install_dcgm.sh index d5003ee6..3c96acd3 100755 --- a/ubuntu/common/install_dcgm.sh +++ b/ubuntu/common/install_dcgm.sh @@ -1,10 +1,13 @@ #!/bin/bash set -ex +# Set DCGM version info +dcgm_metadata=$(jq -r '.dcgm."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +DCGM_VERSION=$(jq -r '.version' <<< $dcgm_metadata) + # Install DCGM # Reference: https://developer.nvidia.com/dcgm#Downloads # the repo is already added during nvidia/ cuda installations -DCGM_VERSION=3.1.8 apt-get install -y datacenter-gpu-manager=1:${DCGM_VERSION} $COMMON_DIR/write_component_version.sh "DCGM" ${DCGM_VERSION} diff --git a/ubuntu/common/install_intel_libs.sh b/ubuntu/common/install_intel_libs.sh index 01a087d2..8dd83115 100755 --- a/ubuntu/common/install_intel_libs.sh +++ b/ubuntu/common/install_intel_libs.sh @@ -1,21 +1,14 @@ #!/bin/bash set -ex -# Intel® oneAPI Math Kernel Library -case ${DISTRIBUTION} in - "ubuntu18.04") INTEL_MKL_VERSION="2023.1.0.46342"; - RELEASE_VERSION="cd17b7fe-500e-4305-a89b-bd5b42bfd9f8"; - CHECKSUM="cc28c94cab23c185520b93c5a04f3979d8da6b4c90cee8c0681dd89819d76167";; - "ubuntu20.04") INTEL_MKL_VERSION="2023.2.0.49497"; - 
RELEASE_VERSION="adb8a02c-4ee7-4882-97d6-a524150da358"; - CHECKSUM="4a0d93da85a94d92e0ad35dc0fc3b3ab7f040bd55ad374c4d5ec81a57a2b872b";; - "ubuntu22.04") INTEL_MKL_VERSION="2023.2.0.49497"; - RELEASE_VERSION="adb8a02c-4ee7-4882-97d6-a524150da358"; - CHECKSUM="4a0d93da85a94d92e0ad35dc0fc3b3ab7f040bd55ad374c4d5ec81a57a2b872b";; - *) ;; -esac +# Set Intel® oneAPI Math Kernel Library info +intel_one_mkl_metadata=$(jq -r '.intel_one_mkl."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +INTEL_ONE_MKL_VERSION=$(jq -r '.version' <<< $intel_one_mkl_metadata) +INTEL_ONE_MKL_SHA256=$(jq -r '.sha256' <<< $intel_one_mkl_metadata) +INTEL_ONE_MKL_DOWNLOAD_URL=$(jq -r '.url' <<< $intel_one_mkl_metadata) +INTEL_ONE_MKL_OFFLINE_INSTALLER=$(basename $INTEL_ONE_MKL_DOWNLOAD_URL) -ONE_MKL_DOWNLOAD_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/${RELEASE_VERSION}/l_onemkl_p_${INTEL_MKL_VERSION}_offline.sh -$COMMON_DIR/write_component_version.sh "INTEL_ONE_MKL" ${INTEL_MKL_VERSION} -$COMMON_DIR/download_and_verify.sh ${ONE_MKL_DOWNLOAD_URL} ${CHECKSUM} -sh ./l_onemkl_p_${INTEL_MKL_VERSION}_offline.sh -s -a -s --eula accept +# Install Intel® oneAPI Math Kernel Library +$COMMON_DIR/write_component_version.sh "INTEL_ONE_MKL" ${INTEL_ONE_MKL_VERSION} +$COMMON_DIR/download_and_verify.sh ${INTEL_ONE_MKL_DOWNLOAD_URL} ${INTEL_ONE_MKL_SHA256} +sh ./${INTEL_ONE_MKL_OFFLINE_INSTALLER} -s -a -s --eula accept diff --git a/ubuntu/common/install_lustre_client.sh b/ubuntu/common/install_lustre_client.sh index 8aefeba9..7b097af2 100755 --- a/ubuntu/common/install_lustre_client.sh +++ b/ubuntu/common/install_lustre_client.sh @@ -1,7 +1,8 @@ #!/bin/bash set -ex -LUSTRE_VERSION=2.15.1-29-gbae0abe +# Set Lustre version +LUSTRE_VERSION=$(jq -r '.lustre."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) source $UBUNTU_COMMON_DIR/setup_lustre_repo.sh diff --git a/ubuntu/common/install_monitoring_tools.sh b/ubuntu/common/install_monitoring_tools.sh index 1be7b277..adf7c78f 100755 ---
a/ubuntu/common/install_monitoring_tools.sh +++ b/ubuntu/common/install_monitoring_tools.sh @@ -2,11 +2,8 @@ set -e -# grab latest release version -repo=Azure/Moneo -release_version=$(curl -s "https://api.github.com/repos/$repo/releases/latest" | jq -r '.tag_name') - -MONEO_VERSION=$release_version +# Set the Moneo version +MONEO_VERSION=$(jq -r '.moneo."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) # Dependencies python3 -m pip install --upgrade pip @@ -17,7 +14,7 @@ mkdir -p $MONITOR_DIR pushd $MONITOR_DIR - git clone https://github.com/Azure/Moneo --branch $MONEO_VERSION + git clone https://github.com/Azure/Moneo --branch v$MONEO_VERSION chmod 777 Moneo diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index f80556cd..a8dcd6ff 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -1,9 +1,6 @@ #!/bin/bash set -ex -# Parameters -HPCX_CHECKSUM=$1 - # Load gcc set CC=/usr/bin/gcc set GCC=/usr/bin/gcc @@ -11,47 +8,60 @@ set GCC=/usr/bin/gcc INSTALL_PREFIX=/opt # HPC-X v2.16 -HPCX_VERSION="v2.16" -TARBALL="hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-$DISTRIBUTION-cuda12-gdrcopy2-nccl2.18-x86_64.tbz" -HPCX_DOWNLOAD_URL=https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/${TARBALL} -HPCX_FOLDER=$(basename ${HPCX_DOWNLOAD_URL} .tbz) +hpcx_metadata=$(jq -r '.hpcx."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +HPCX_VERSION=$(jq -r '.version' <<< $hpcx_metadata) +HPCX_SHA256=$(jq -r '.sha256' <<< $hpcx_metadata) +HPCX_DOWNLOAD_URL=$(jq -r '.url' <<< $hpcx_metadata) +TARBALL=$(basename $HPCX_DOWNLOAD_URL) +HPCX_FOLDER=$(basename $HPCX_DOWNLOAD_URL .tbz) -$COMMON_DIR/download_and_verify.sh ${HPCX_DOWNLOAD_URL} ${HPCX_CHECKSUM} +$COMMON_DIR/download_and_verify.sh ${HPCX_DOWNLOAD_URL} ${HPCX_SHA256} tar -xvf ${TARBALL} mv ${HPCX_FOLDER} ${INSTALL_PREFIX} HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} $COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION -# MVAPICH2 2.3.7-1 -MV2_VERSION="2.3.7-1" 
-MV2_DOWNLOAD_URL=http://mvapich.cse.ohio-state.edu/download/mvapich/mv2/mvapich2-${MV2_VERSION}.tar.gz -$COMMON_DIR/download_and_verify.sh $MV2_DOWNLOAD_URL "fdd971cf36d6476d007b5d63d19414546ca8a2937b66886f24a1d9ca154634e4" -tar -xvf mvapich2-${MV2_VERSION}.tar.gz -cd mvapich2-${MV2_VERSION} +# Install MVAPICH2 +mvapich2_metadata=$(jq -r '.mvapich2."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +MVAPICH2_VERSION=$(jq -r '.version' <<< $mvapich2_metadata) +MVAPICH2_SHA256=$(jq -r '.sha256' <<< $mvapich2_metadata) +MV2_DOWNLOAD_URL=http://mvapich.cse.ohio-state.edu/download/mvapich/mv2/mvapich2-${MVAPICH2_VERSION}.tar.gz +$COMMON_DIR/download_and_verify.sh $MV2_DOWNLOAD_URL $MVAPICH2_SHA256 +tar -xvf mvapich2-${MVAPICH2_VERSION}.tar.gz +cd mvapich2-${MVAPICH2_VERSION} # Error exclusive to Ubuntu 22.04 # configure: error: The Fortran compiler gfortran will not compile files that call # the same routine with arguments of different types. -./configure $(if [[ ${DISTRIBUTION} == "ubuntu22.04" ]]; then echo "FFLAGS=-fallow-argument-mismatch"; fi) --prefix=${INSTALL_PREFIX}/mvapich2-${MV2_VERSION} --enable-g=none --enable-fast=yes && make -j$(nproc) && make install +./configure $(if [[ ${DISTRIBUTION} == "ubuntu22.04" ]]; then echo "FFLAGS=-fallow-argument-mismatch"; fi) --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes && make -j$(nproc) && make install cd .. 
-$COMMON_DIR/write_component_version.sh "MVAPICH2" ${MV2_VERSION} +$COMMON_DIR/write_component_version.sh "MVAPICH2" ${MVAPICH2_VERSION} + +# Install Open MPI +ompi_metadata=$(jq -r '.ompi."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +OMPI_VERSION=$(jq -r '.version' <<< $ompi_metadata) +OMPI_SHA256=$(jq -r '.sha256' <<< $ompi_metadata) +OMPI_DOWNLOAD_URL=$(jq -r '.url' <<< $ompi_metadata) +TARBALL=$(basename $OMPI_DOWNLOAD_URL) +OMPI_FOLDER=$(basename $OMPI_DOWNLOAD_URL .tar.gz) -# OpenMPI 4.1.5 -OMPI_VERSION="4.1.5" -OMPI_DOWNLOAD_URL=https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz -$COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL "c018b127619d2a2a30c1931f316fc8a245926d0f5b4ebed4711f9695e7f70925" -tar -xvf openmpi-${OMPI_VERSION}.tar.gz -cd openmpi-${OMPI_VERSION} +$COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL $OMPI_SHA256 +tar -xvf $TARBALL +cd $OMPI_FOLDER ./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install cd .. 
$COMMON_DIR/write_component_version.sh "OMPI" ${OMPI_VERSION} -# Intel MPI 2021 (Update 9) -IMPI_2021_VERSION="2021.9.0" -IMPI_2021_DOWNLOAD_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/718d6f8f-2546-4b36-b97b-bc58d5482ebf/l_mpi_oneapi_p_${IMPI_2021_VERSION}.43482_offline.sh -$COMMON_DIR/download_and_verify.sh $IMPI_2021_DOWNLOAD_URL "5c170cdf26901311408809ced28498b630a494428703685203ceef6e62735ef8" -bash l_mpi_oneapi_p_${IMPI_2021_VERSION}.43482_offline.sh -s -a -s --eula accept -mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/impi -$COMMON_DIR/write_component_version.sh "IMPI_2021" ${IMPI_2021_VERSION} +# Install Intel MPI +impi_metadata=$(jq -r '.impi."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +IMPI_VERSION=$(jq -r '.version' <<< $impi_metadata) +IMPI_SHA256=$(jq -r '.sha256' <<< $impi_metadata) +IMPI_DOWNLOAD_URL=$(jq -r '.url' <<< $impi_metadata) +IMPI_OFFLINE_INSTALLER=$(basename $IMPI_DOWNLOAD_URL) + +$COMMON_DIR/download_and_verify.sh $IMPI_DOWNLOAD_URL $IMPI_SHA256 +bash ${IMPI_OFFLINE_INSTALLER} -s -a -s --eula accept +mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/impi +$COMMON_DIR/write_component_version.sh "IMPI" ${IMPI_VERSION} # Module Files MODULE_FILES_DIRECTORY=/usr/share/modules/modulefiles/mpi @@ -68,20 +78,20 @@ module load ${HPCX_PATH}/modulefiles/hpcx EOF # MVAPICH2 -cat << EOF >> ${MODULE_FILES_DIRECTORY}/mvapich2-${MV2_VERSION} +cat << EOF >> ${MODULE_FILES_DIRECTORY}/mvapich2-${MVAPICH2_VERSION} #%Module 1.0 # -# MVAPICH2 ${MV2_VERSION} +# MVAPICH2 ${MVAPICH2_VERSION} # conflict mpi -prepend-path PATH /opt/mvapich2-${MV2_VERSION}/bin -prepend-path LD_LIBRARY_PATH /opt/mvapich2-${MV2_VERSION}/lib -prepend-path MANPATH /opt/mvapich2-${MV2_VERSION}/share/man -setenv MPI_BIN /opt/mvapich2-${MV2_VERSION}/bin -setenv MPI_INCLUDE
/opt/mvapich2-${MV2_VERSION}/include -setenv MPI_LIB /opt/mvapich2-${MV2_VERSION}/lib -setenv MPI_MAN /opt/mvapich2-${MV2_VERSION}/share/man -setenv MPI_HOME /opt/mvapich2-${MV2_VERSION} +prepend-path PATH /opt/mvapich2-${MVAPICH2_VERSION}/bin +prepend-path LD_LIBRARY_PATH /opt/mvapich2-${MVAPICH2_VERSION}/lib +prepend-path MANPATH /opt/mvapich2-${MVAPICH2_VERSION}/share/man +setenv MPI_BIN /opt/mvapich2-${MVAPICH2_VERSION}/bin +setenv MPI_INCLUDE /opt/mvapich2-${MVAPICH2_VERSION}/include +setenv MPI_LIB /opt/mvapich2-${MVAPICH2_VERSION}/lib +setenv MPI_MAN /opt/mvapich2-${MVAPICH2_VERSION}/share/man +setenv MPI_HOME /opt/mvapich2-${MVAPICH2_VERSION} EOF # OpenMPI @@ -102,22 +112,22 @@ setenv MPI_HOME /opt/openmpi-${OMPI_VERSION} EOF # Intel 2021 -cat << EOF >> ${MODULE_FILES_DIRECTORY}/impi_${IMPI_2021_VERSION} +cat << EOF >> ${MODULE_FILES_DIRECTORY}/impi_${IMPI_VERSION} #%Module 1.0 # -# Intel MPI ${IMPI_2021_VERSION} +# Intel MPI ${IMPI_VERSION} # conflict mpi -module load /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/impi -setenv MPI_BIN /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/bin -setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/include -setenv MPI_LIB /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/lib -setenv MPI_MAN /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/man -setenv MPI_HOME /opt/intel/oneapi/mpi/${IMPI_2021_VERSION} +module load /opt/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/impi +setenv MPI_BIN /opt/intel/oneapi/mpi/${IMPI_VERSION}/bin +setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${IMPI_VERSION}/include +setenv MPI_LIB /opt/intel/oneapi/mpi/${IMPI_VERSION}/lib +setenv MPI_MAN /opt/intel/oneapi/mpi/${IMPI_VERSION}/man +setenv MPI_HOME /opt/intel/oneapi/mpi/${IMPI_VERSION} EOF # Softlinks ln -s ${MODULE_FILES_DIRECTORY}/hpcx-${HPCX_VERSION} ${MODULE_FILES_DIRECTORY}/hpcx -ln -s ${MODULE_FILES_DIRECTORY}/mvapich2-${MV2_VERSION} ${MODULE_FILES_DIRECTORY}/mvapich2 +ln -s ${MODULE_FILES_DIRECTORY}/mvapich2-${MVAPICH2_VERSION} 
${MODULE_FILES_DIRECTORY}/mvapich2 ln -s ${MODULE_FILES_DIRECTORY}/openmpi-${OMPI_VERSION} ${MODULE_FILES_DIRECTORY}/openmpi -ln -s ${MODULE_FILES_DIRECTORY}/impi_${IMPI_2021_VERSION} ${MODULE_FILES_DIRECTORY}/impi-2021 +ln -s ${MODULE_FILES_DIRECTORY}/impi_${IMPI_VERSION} ${MODULE_FILES_DIRECTORY}/impi-2021 diff --git a/ubuntu/common/install_nccl.sh b/ubuntu/common/install_nccl.sh index 79511913..6b6b567e 100755 --- a/ubuntu/common/install_nccl.sh +++ b/ubuntu/common/install_nccl.sh @@ -1,28 +1,17 @@ #!/bin/bash set -ex +# Set NCCL versions +NCCL_VERSION=$(jq -r '.nccl."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) +NCCL_RDMA_SHARP_COMMIT=$(jq -r '.nccl."'"$DISTRIBUTION"'".rdmasharpplugins.commit' <<< $COMPONENT_VERSIONS) +CUDA_DRIVER_VERSION=$(jq -r '.cuda."'"$DISTRIBUTION"'".driver.version' <<< $COMPONENT_VERSIONS) + +TARBALL="v${NCCL_VERSION}.tar.gz"; +NCCL_DOWNLOAD_URL=https://github.com/NVIDIA/nccl/archive/refs/tags/${TARBALL}; + # Install NCCL apt install -y build-essential devscripts debhelper fakeroot -case ${DISTRIBUTION} in - "ubuntu18.04") NCCL_VERSION="2.18.3-1"; - CUDA_VERSION="12.1"; - TARBALL="v${NCCL_VERSION}.tar.gz"; - NCCL_DOWNLOAD_URL=https://github.com/NVIDIA/nccl/archive/refs/tags/${TARBALL}; - NCCL_RDMA_SHARP_COMMIT="575c1e0";; - "ubuntu20.04") NCCL_VERSION="2.19.3-1"; - CUDA_VERSION="12.2"; - TARBALL="v${NCCL_VERSION}.tar.gz"; - NCCL_DOWNLOAD_URL=https://github.com/NVIDIA/nccl/archive/refs/tags/${TARBALL}; - NCCL_RDMA_SHARP_COMMIT="575c1e0";; - "ubuntu22.04") NCCL_VERSION="2.19.3-1"; - CUDA_VERSION="12.2"; - TARBALL="v${NCCL_VERSION}.tar.gz"; - NCCL_DOWNLOAD_URL=https://github.com/NVIDIA/nccl/archive/refs/tags/${TARBALL}; - NCCL_RDMA_SHARP_COMMIT="575c1e0";; - *) ;; -esac - pushd /tmp wget ${NCCL_DOWNLOAD_URL} tar -xvf ${TARBALL} @@ -31,9 +20,9 @@ pushd nccl-${NCCL_VERSION} make -j src.build make pkg.debian.build pushd build/pkg/deb/ -dpkg -i libnccl2_${NCCL_VERSION}+cuda${CUDA_VERSION}_amd64.deb +dpkg -i 
libnccl2_${NCCL_VERSION}+cuda${CUDA_DRIVER_VERSION}_amd64.deb sudo apt-mark hold libnccl2 -dpkg -i libnccl-dev_${NCCL_VERSION}+cuda${CUDA_VERSION}_amd64.deb +dpkg -i libnccl-dev_${NCCL_VERSION}+cuda${CUDA_DRIVER_VERSION}_amd64.deb sudo apt-mark hold libnccl-dev popd popd diff --git a/ubuntu/common/install_nvidia_fabric_manager.sh b/ubuntu/common/install_nvidia_fabric_manager.sh index 1037b2eb..44ff69de 100755 --- a/ubuntu/common/install_nvidia_fabric_manager.sh +++ b/ubuntu/common/install_nvidia_fabric_manager.sh @@ -1,26 +1,15 @@ #!/bin/bash set -ex -# Parameter -# Ubuntu Version -VERSION=$1 +# Set NVIDIA fabricmanager version +nvidia_fabricmanager_metadata=$(jq -r '.nvidia."'"$DISTRIBUTION"'".fabricmanager' <<< $COMPONENT_VERSIONS) +NVIDIA_FABRICMANAGER_PREFIX=$(jq -r '.prefix' <<< $nvidia_fabricmanager_metadata) +NVIDIA_FABRICMANAGER_DISTRIBUTION=$(jq -r '.distribution' <<< $nvidia_fabricmanager_metadata) +NVIDIA_FABRICMANAGER_VERSION=$(jq -r '.version' <<< $nvidia_fabricmanager_metadata) +NVIDIA_FABRICMANAGER_SHA256=$(jq -r '.sha256' <<< $nvidia_fabricmanager_metadata) -# Install nvidia fabric manager -case ${VERSION} in - 1804) NVIDIA_FABRIC_MANAGER_VERSION="525_525.105.17-1"; - CHECKSUM="b487db5923194ba9f4d7c34891f4f8513a3f633a22a0c9f51fba3ef971681977"; - VERSION_PREFIX="525";; - 2004) NVIDIA_FABRIC_MANAGER_VERSION="535_535.86.10-1"; - CHECKSUM="d0c4662279301187614646650da07f34a6fe267d789d48bc9ed63181af06ac29"; - VERSION_PREFIX="535";; - 2204) NVIDIA_FABRIC_MANAGER_VERSION="535_535.86.10-1"; - CHECKSUM="d0c4662279301187614646650da07f34a6fe267d789d48bc9ed63181af06ac29"; - VERSION_PREFIX="535";; - *) ;; -esac - -NVIDIA_FABRIC_MNGR_URL=http://developer.download.nvidia.com/compute/cuda/repos/ubuntu${VERSION}/x86_64/nvidia-fabricmanager-${NVIDIA_FABRIC_MANAGER_VERSION}_amd64.deb -$COMMON_DIR/download_and_verify.sh $NVIDIA_FABRIC_MNGR_URL ${CHECKSUM} -apt install -y ./nvidia-fabricmanager-${NVIDIA_FABRIC_MANAGER_VERSION}_amd64.deb -apt-mark hold 
nvidia-fabricmanager-${VERSION_PREFIX} -$COMMON_DIR/write_component_version.sh "NVIDIA_FABRIC_MANAGER" ${NVIDIA_FABRIC_MANAGER_VERSION} +NVIDIA_FABRIC_MNGR_URL=http://developer.download.nvidia.com/compute/cuda/repos/${NVIDIA_FABRICMANAGER_DISTRIBUTION}/x86_64/nvidia-fabricmanager-${NVIDIA_FABRICMANAGER_VERSION}_amd64.deb +$COMMON_DIR/download_and_verify.sh $NVIDIA_FABRIC_MNGR_URL ${NVIDIA_FABRICMANAGER_SHA256} +apt install -y ./nvidia-fabricmanager-${NVIDIA_FABRICMANAGER_VERSION}_amd64.deb +apt-mark hold nvidia-fabricmanager-${NVIDIA_FABRICMANAGER_PREFIX} +$COMMON_DIR/write_component_version.sh "NVIDIA_FABRIC_MANAGER" ${NVIDIA_FABRICMANAGER_VERSION} diff --git a/ubuntu/common/install_nvidiagpudriver.sh b/ubuntu/common/install_nvidiagpudriver.sh index 8a5ccc57..56cbebd3 100755 --- a/ubuntu/common/install_nvidiagpudriver.sh +++ b/ubuntu/common/install_nvidiagpudriver.sh @@ -1,35 +1,22 @@ #!/bin/bash set -ex -# Parameters -VERSION=$1 - -case ${VERSION} in - 1804) NVIDIA_VERSION="525.105.17"; - CUDA_VERSION="12-1"; - CUDA_SAMPLES_VERSION="12.1"; - CHECKSUM="c635a21a282c9b53485f19ebb64a0f4b536a968b94d4d97629e0bc547a58142a";; - 2004) NVIDIA_VERSION="535.86.10"; - CUDA_VERSION="12-2"; - CUDA_SAMPLES_VERSION="12.2"; - CHECKSUM="cec37fd9317091a01fd0380309400a14284769790651fe021e1d5aaa2327a370";; - 2204) NVIDIA_VERSION="535.86.10"; - CUDA_VERSION="12-2"; - CUDA_SAMPLES_VERSION="12.2"; - CHECKSUM="cec37fd9317091a01fd0380309400a14284769790651fe021e1d5aaa2327a370";; - *) ;; -esac +# Set the driver versions +cuda_metadata=$(jq -r '.cuda."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +CUDA_DRIVER_VERSION=$(jq -r '.driver.version' <<< $cuda_metadata) +CUDA_DRIVER_DISTRIBUTION=$(jq -r '.driver.distribution' <<< $cuda_metadata) +CUDA_SAMPLES_VERSION=$(jq -r '.samples.version' <<< $cuda_metadata) # Reference - https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#ubuntu-installation # Install Cuda -wget 
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${VERSION}/x86_64/cuda-keyring_1.0-1_all.deb +wget https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DRIVER_DISTRIBUTION}/x86_64/cuda-keyring_1.0-1_all.deb dpkg -i ./cuda-keyring_1.0-1_all.deb apt-get update -apt install -y cuda-toolkit-${CUDA_VERSION} +apt install -y cuda-toolkit-${CUDA_DRIVER_VERSION} echo 'export PATH=$PATH:/usr/local/cuda/bin' | tee -a /etc/bash.bashrc echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64' | tee -a /etc/bash.bashrc -$COMMON_DIR/write_component_version.sh "CUDA" ${CUDA_VERSION} +$COMMON_DIR/write_component_version.sh "CUDA" ${CUDA_DRIVER_VERSION} # Download CUDA samples TARBALL="v${CUDA_SAMPLES_VERSION}.tar.gz" @@ -41,8 +28,12 @@ make mv -vT ./Samples /usr/local/cuda-${CUDA_SAMPLES_VERSION}/samples popd -# Nvidia driver -NVIDIA_DRIVER_URL=https://us.download.nvidia.com/tesla/${NVIDIA_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run -$COMMON_DIR/download_and_verify.sh $NVIDIA_DRIVER_URL ${CHECKSUM} -bash NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run --silent --dkms -$COMMON_DIR/write_component_version.sh "NVIDIA" ${NVIDIA_VERSION} +# Install NVIDIA driver +nvidia_driver_metadata=$(jq -r '.nvidia."'"$DISTRIBUTION"'".driver' <<< $COMPONENT_VERSIONS) +NVIDIA_DRIVER_VERSION=$(jq -r '.version' <<< $nvidia_driver_metadata) +NVIDIA_DRIVER_SHA256=$(jq -r '.sha256' <<< $nvidia_driver_metadata) +NVIDIA_DRIVER_URL=https://us.download.nvidia.com/tesla/${NVIDIA_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run + +$COMMON_DIR/download_and_verify.sh $NVIDIA_DRIVER_URL ${NVIDIA_DRIVER_SHA256} +bash NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run --silent --dkms +$COMMON_DIR/write_component_version.sh "NVIDIA" ${NVIDIA_DRIVER_VERSION} diff --git a/ubuntu/common/install_utils.sh b/ubuntu/common/install_utils.sh index 7397e1a5..5673f6d3 100755 --- a/ubuntu/common/install_utils.sh +++ b/ubuntu/common/install_utils.sh @@ -49,13 +49,13 @@ apt-get 
-y install numactl \ if [[ $DISTRIBUTION != "ubuntu22.04" ]]; then apt-get install -y python-dev; fi # Install azcopy tool -# To copy blobs or files to or from a storage account. -# Parameters - Version, Release Tag -VERSION=$1 -RELEASE_TAG=$2 -TARBALL="azcopy_linux_amd64_${VERSION}.tar.gz" -AZCOPY_DOWNLOAD_URL="https://azcopyvnext.azureedge.net/${RELEASE_TAG}/${TARBALL}" -AZCOPY_FOLDER=$(basename ${AZCOPY_DOWNLOAD_URL} .tgz) +# To copy blobs or files to or from a storage account +azcopy_metadata=$(jq -r '.azcopy."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +azcopy_version=$(jq -r '.version' <<< $azcopy_metadata) +azcopy_release=$(jq -r '.release' <<< $azcopy_metadata) +azcopy_sha256=$(jq -r '.sha256' <<< $azcopy_metadata) +TARBALL="azcopy_linux_amd64_$azcopy_version.tar.gz" +AZCOPY_DOWNLOAD_URL="https://azcopyvnext.azureedge.net/$azcopy_release/$TARBALL" wget ${AZCOPY_DOWNLOAD_URL} tar -xvf ${TARBALL} diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_mellanoxofed.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_mellanoxofed.sh index ec480a14..e8e2239b 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_mellanoxofed.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_mellanoxofed.sh @@ -1,16 +1,18 @@ #!/bin/bash set -ex -VERSION="23.07-0.5.1.2" -TARBALL="MLNX_OFED_LINUX-$VERSION-ubuntu20.04-x86_64.tgz" -MLNX_OFED_DOWNLOAD_URL=https://content.mellanox.com/ofed/MLNX_OFED-${VERSION}/$TARBALL +mofed_metadata=$(jq -r '.mofed."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +MOFED_VERSION=$(jq -r '.version' <<< $mofed_metadata) +MOFED_SHA256=$(jq -r '.sha256' <<< $mofed_metadata) +TARBALL="MLNX_OFED_LINUX-$MOFED_VERSION-ubuntu20.04-x86_64.tgz" +MLNX_OFED_DOWNLOAD_URL=https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/$TARBALL MOFED_FOLDER=$(basename ${MLNX_OFED_DOWNLOAD_URL} .tgz) -$COMMON_DIR/download_and_verify.sh
$MLNX_OFED_DOWNLOAD_URL $MOFED_SHA256 tar zxvf ${TARBALL} ./${MOFED_FOLDER}/mlnxofedinstall --add-kernel-support --skip-unsupported-devices-check --without-fw-update -$COMMON_DIR/write_component_version.sh "MOFED" $VERSION +$COMMON_DIR/write_component_version.sh "MOFED" $MOFED_VERSION # Restarting openibd /etc/init.d/openibd restart diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_mpis.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_mpis.sh index 7a9cc582..be07ccc1 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_mpis.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_mpis.sh @@ -2,4 +2,4 @@ set -ex # Install common MPIs for Ubuntu -$UBUNTU_COMMON_DIR/install_mpis.sh "addda11a710c52268b7a725c13d9cc54c93deddf259c87d8547ad3c0422b87e1" +$UBUNTU_COMMON_DIR/install_mpis.sh diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_nvidiagpudriver.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_nvidiagpudriver.sh index 155bc196..6b5b6c4c 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_nvidiagpudriver.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_nvidiagpudriver.sh @@ -1,11 +1,11 @@ #!/bin/bash set -ex -$UBUNTU_COMMON_DIR/install_nvidiagpudriver.sh 2004 +$UBUNTU_COMMON_DIR/install_nvidiagpudriver.sh # Install gdrcopy sudo apt install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -GDRCOPY_VERSION="2.3" +GDRCOPY_VERSION=$(jq -r '.gdrcopy."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) TARBALL="v${GDRCOPY_VERSION}.tar.gz" GDRCOPY_DOWNLOAD_URL=https://github.com/NVIDIA/gdrcopy/archive/refs/tags/${TARBALL} wget $GDRCOPY_DOWNLOAD_URL @@ -26,4 +26,4 @@ popd $COMMON_DIR/write_component_version.sh "GDRCOPY" ${GDRCOPY_VERSION} # Install nvidia fabric manager (required for ND96asr_v4) -$UBUNTU_COMMON_DIR/install_nvidia_fabric_manager.sh 2004 +$UBUNTU_COMMON_DIR/install_nvidia_fabric_manager.sh diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_utils.sh 
b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_utils.sh index 49a69183..714712f1 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_utils.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_utils.sh @@ -11,6 +11,5 @@ curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microso cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ #apt-get install packages -AZCOPY_VERSION="10.17.0" -AZCOPY_RELEASE_TAG="release20230123" -$UBUNTU_COMMON_DIR/install_utils.sh ${AZCOPY_VERSION} ${AZCOPY_RELEASE_TAG} + +$UBUNTU_COMMON_DIR/install_utils.sh diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/set_properties.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/set_properties.sh index 2ab0f97e..623425bf 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/set_properties.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/set_properties.sh @@ -5,3 +5,6 @@ export COMMON_DIR=../../../common export UBUNTU_COMMON_DIR=../../common export TEST_DIR=../../../tests export DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) + +# Component Versions +export COMPONENT_VERSIONS=$(jq -r . 
$TOP_DIR/requirements.json) \ No newline at end of file diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_mellanoxofed.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_mellanoxofed.sh index f040a324..9a4a266e 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_mellanoxofed.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_mellanoxofed.sh @@ -1,16 +1,18 @@ #!/bin/bash set -ex -VERSION="23.07-0.5.1.2" -TARBALL="MLNX_OFED_LINUX-$VERSION-ubuntu22.04-x86_64.tgz" -MLNX_OFED_DOWNLOAD_URL=https://content.mellanox.com/ofed/MLNX_OFED-${VERSION}/$TARBALL +mofed_metadata=$(jq -r '.mofed."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +MOFED_VERSION=$(jq -r '.version' <<< $mofed_metadata) +MOFED_SHA256=$(jq -r '.sha256' <<< $mofed_metadata) +TARBALL="MLNX_OFED_LINUX-$MOFED_VERSION-ubuntu22.04-x86_64.tgz" +MLNX_OFED_DOWNLOAD_URL=https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/$TARBALL MOFED_FOLDER=$(basename ${MLNX_OFED_DOWNLOAD_URL} .tgz) -$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL "77e032a48de4c040b2f2dd3bf2edd11921de7caff59c773ac35208514f72eff5" +$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL $MOFED_SHA256 tar zxvf ${TARBALL} ./${MOFED_FOLDER}/mlnxofedinstall --add-kernel-support --skip-unsupported-devices-check --without-fw-update -$COMMON_DIR/write_component_version.sh "MOFED" $VERSION +$COMMON_DIR/write_component_version.sh "MOFED" $MOFED_VERSION # Restarting openibd /etc/init.d/openibd restart diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_mpis.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_mpis.sh index 1c8daffc..be07ccc1 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_mpis.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_mpis.sh @@ -2,4 +2,4 @@ set -ex # Install common MPIs for Ubuntu -$UBUNTU_COMMON_DIR/install_mpis.sh "97eac5555d54f5fd8da1c354222a1aff2e85eb017682441e06287971a5b95772" +$UBUNTU_COMMON_DIR/install_mpis.sh diff --git 
a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_nvidiagpudriver.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_nvidiagpudriver.sh index 71211d57..4a21174e 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_nvidiagpudriver.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_nvidiagpudriver.sh @@ -4,11 +4,11 @@ set -ex # Dependency for nvidia driver installation apt-get install -y libvulkan1 -$UBUNTU_COMMON_DIR/install_nvidiagpudriver.sh 2204 +$UBUNTU_COMMON_DIR/install_nvidiagpudriver.sh # Install gdrcopy sudo apt install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -GDRCOPY_VERSION="2.3" +GDRCOPY_VERSION=$(jq -r '.gdrcopy."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) TARBALL="v${GDRCOPY_VERSION}.tar.gz" GDRCOPY_DOWNLOAD_URL=https://github.com/NVIDIA/gdrcopy/archive/refs/tags/${TARBALL} wget $GDRCOPY_DOWNLOAD_URL @@ -29,4 +29,4 @@ popd $COMMON_DIR/write_component_version.sh "GDRCOPY" ${GDRCOPY_VERSION} # Install nvidia fabric manager (required for ND96asr_v4) -$UBUNTU_COMMON_DIR/install_nvidia_fabric_manager.sh 2204 +$UBUNTU_COMMON_DIR/install_nvidia_fabric_manager.sh diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_utils.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_utils.sh index ee74352e..4f57c486 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_utils.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_utils.sh @@ -11,6 +11,5 @@ curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microso cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ #apt-get install packages -AZCOPY_VERSION="10.17.0" -AZCOPY_RELEASE_TAG="release20230123" -$UBUNTU_COMMON_DIR/install_utils.sh ${AZCOPY_VERSION} ${AZCOPY_RELEASE_TAG} + +$UBUNTU_COMMON_DIR/install_utils.sh diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/set_properties.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/set_properties.sh index 2ab0f97e..623425bf 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/set_properties.sh +++ 
b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/set_properties.sh @@ -5,3 +5,6 @@ export COMMON_DIR=../../../common export UBUNTU_COMMON_DIR=../../common export TEST_DIR=../../../tests export DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) + +# Component Versions +export COMPONENT_VERSIONS=$(jq -r . $TOP_DIR/requirements.json) \ No newline at end of file From 6a6aaa8d47f3dd5a61a85e62ee398c4eaa44d8a8 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 19 Mar 2024 20:15:23 -0700 Subject: [PATCH 03/76] cross reference from JSON --- alma/alma-8.x/common/install_utils.sh | 12 ++++++++---- requirements.json | 11 ----------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/alma/alma-8.x/common/install_utils.sh b/alma/alma-8.x/common/install_utils.sh index 01028094..f32d0d03 100755 --- a/alma/alma-8.x/common/install_utils.sh +++ b/alma/alma-8.x/common/install_utils.sh @@ -12,10 +12,14 @@ yum install -y python3.8 ln -fs /usr/bin/python3.8 /usr/bin/python3 # install pssh -PSSH_VER=2.3.1-29 -wget https://dl.fedoraproject.org/pub/epel/8/Everything/aarch64/Packages/p/pssh-$PSSH_VER.el8.noarch.rpm -yum install -y pssh-$PSSH_VER.el8.noarch.rpm -rm -f pssh-$PSSH_VER.el8.noarch.rpm +pssh_metadata=$(jq -r '.pssh."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +PSSH_VERSION=$(jq -r '.version' <<< $pssh_metadata) +PSSH_SHA256=$(jq -r '.sha256' <<< $pssh_metadata) +pssh_download_url="https://dl.fedoraproject.org/pub/epel/8/Everything/aarch64/Packages/p/pssh-$PSSH_VERSION.el8.noarch.rpm" +$COMMON_DIR/download_and_verify.sh $pssh_download_url $PSSH_SHA256 + +yum install -y pssh-$PSSH_VERSION.el8.noarch.rpm +rm -f pssh-$PSSH_VERSION.el8.noarch.rpm # Install pre-reqs and development tools yum groupinstall -y "Development Tools" diff --git a/requirements.json b/requirements.json index e3165650..d0e99169 100644 --- a/requirements.json +++ b/requirements.json @@ -1,15 +1,4 @@ { - "spack": { - "ubuntu20.04": { - "branch": "develop" - }, - "ubuntu22.04": { - "branch": "develop" - }, - 
"almalinux8.7": { - "branch": "develop" - } - }, "azcopy": { "ubuntu20.04": { "version": "10.17.0", From d10c56736aea293ccee100198fe8cdf6d4306fe2 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 19 Mar 2024 20:57:09 -0700 Subject: [PATCH 04/76] triple check --- alma/alma-8.x/common/install_amd_libs.sh | 2 +- alma/common/install_mpis.sh | 2 +- requirements.json | 88 ++++++++++++++---------- 3 files changed, 52 insertions(+), 40 deletions(-) diff --git a/alma/alma-8.x/common/install_amd_libs.sh b/alma/alma-8.x/common/install_amd_libs.sh index 9964a37e..2d47e099 100755 --- a/alma/alma-8.x/common/install_amd_libs.sh +++ b/alma/alma-8.x/common/install_amd_libs.sh @@ -7,7 +7,7 @@ mkdir -p ${INSTALL_PREFIX} # Set AOCL version amd_metadata=$(jq -r '.amd."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) AOCL_VERSION=$(jq -r '.aocl.version' <<< $amd_metadata) -AOCL_SHA256=$(jq -r '.aocl.version' <<< $amd_metadata) +AOCL_SHA256=$(jq -r '.aocl.sha256' <<< $amd_metadata) TARBALL="aocl-linux-aocc-${AOCL_VERSION}.tar.gz" AOCL_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/${TARBALL} diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 9068c1c5..fac4c23c 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -18,7 +18,7 @@ set GCC=/opt/${GCC_VERSION}/bin/gcc mvapich2_metadata=$(jq -r '.mvapich2."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) MVAPICH2_VERSION=$(jq -r '.version' <<< $mvapich2_metadata) MVAPICH2_SHA256=$(jq -r '.sha256' <<< $mvapich2_metadata) -MVAPICH2_DOWNLOAD_URL=$(jq -r '.url' <<< $mvapich2_metadata) +MVAPICH2_DOWNLOAD_URL="http://mvapich.cse.ohio-state.edu/download/mvapich/mv2/mvapich2-${MVAPICH2_VERSION}.tar.gz" TARBALL=$(basename $MVAPICH2_DOWNLOAD_URL) MVAPICH2_FOLDER=$(basename $MVAPICH2_DOWNLOAD_URL .tar.gz) diff --git a/requirements.json b/requirements.json index d0e99169..2742ba1d 100644 --- a/requirements.json +++ b/requirements.json @@ -49,35 +49,50 @@ }, "mvapich2": { "ubuntu20.04": 
{ - "version": "2.3.7-1" + "version": "2.3.7-1", + "sha256": "fdd971cf36d6476d007b5d63d19414546ca8a2937b66886f24a1d9ca154634e4" }, "ubuntu22.04": { - "version": "2.3.7-1" + "version": "2.3.7-1", + "sha256": "fdd971cf36d6476d007b5d63d19414546ca8a2937b66886f24a1d9ca154634e4" }, "almalinux8.7": { - "version": "2.3.7-1" + "version": "2.3.7-1", + "sha256": "fdd971cf36d6476d007b5d63d19414546ca8a2937b66886f24a1d9ca154634e4" } }, "ompi": { "ubuntu20.04": { - "version": "4.1.5" + "version": "4.1.5", + "sha256": "c018b127619d2a2a30c1931f316fc8a245926d0f5b4ebed4711f9695e7f70925", + "url": "https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz" }, "ubuntu22.04": { - "version": "4.1.5" + "version": "4.1.5", + "sha256": "c018b127619d2a2a30c1931f316fc8a245926d0f5b4ebed4711f9695e7f70925", + "url": "https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz" }, "almalinux8.7": { - "version": "4.1.5" + "version": "4.1.5", + "sha256": "c018b127619d2a2a30c1931f316fc8a245926d0f5b4ebed4711f9695e7f70925", + "url": "https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz" } }, "impi": { "ubuntu20.04": { - "version": "2021.9.0" + "version": "2021.9.0", + "sha256": "5c170cdf26901311408809ced28498b630a494428703685203ceef6e62735ef8", + "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/718d6f8f-2546-4b36-b97b-bc58d5482ebf/l_mpi_oneapi_p_2021.9.0.43482_offline.sh" }, "ubuntu22.04": { - "version": "2021.9.0" + "version": "2021.9.0", + "sha256": "5c170cdf26901311408809ced28498b630a494428703685203ceef6e62735ef8", + "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/718d6f8f-2546-4b36-b97b-bc58d5482ebf/l_mpi_oneapi_p_2021.9.0.43482_offline.sh" }, "almalinux8.7": { - "version": "2018.4.274" + "version": "2018.4.274", + "sha256": "a1114b3eb4149c2f108964b83cad02150d619e50032059d119ac4ffc9d5dd8e0", + "url": "http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/13651/l_mpi_2018.4.274.tgz" } }, "nvidia": { @@ 
-121,29 +136,29 @@ "cuda": { "ubuntu20.04": { "driver": { - "version": "12.2.1" + "version": "12.2.1", + "distribution": "ubuntu2004" }, "samples": { - "version": "12.2", - "sha256": "1823cfe28e97a9230107aa72b231f78952c0f178b71a920f036d360518480bdc" + "version": "12.2" } }, "ubuntu22.04": { "driver": { - "version": "12.2.1" + "version": "12.2.1", + "distribution": "ubuntu2204" }, "samples": { - "version": "12.2", - "sha256": "1823cfe28e97a9230107aa72b231f78952c0f178b71a920f036d360518480bdc" + "version": "12.2" } }, "almalinux8.7": { "driver": { - "version": "12.2.0" + "version": "12.2.0", + "distribution": "rhel8" }, "samples": { - "version": "12.2", - "sha256": "1823cfe28e97a9230107aa72b231f78952c0f178b71a920f036d360518480bdc" + "version": "12.2" } } }, @@ -180,27 +195,29 @@ }, "dcgm": { "ubuntu20.04": { - "version": "3.1.8", - "distribution": "ubuntu2004" + "version": "3.1.8" }, "ubuntu22.04": { - "version": "3.1.8", - "distribution": "ubuntu2204" + "version": "3.1.8" }, "almalinux8.7": { - "version": "3.1.8", - "distribution": "rhel8" + "version": "3.1.8" } }, "intel_one_mkl": { "ubuntu20.04": { - "version": "2023.2.0" - }, + "version": "2023.2.0.49497", + "sha256": "4a0d93da85a94d92e0ad35dc0fc3b3ab7f040bd55ad374c4d5ec81a57a2b872b", + "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh" }, "ubuntu22.04": { - "version": "2023.2.0" + "version": "2023.2.0.49497", + "sha256": "4a0d93da85a94d92e0ad35dc0fc3b3ab7f040bd55ad374c4d5ec81a57a2b872b", + "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh" }, "almalinux8.7": { - "version": "2023.2.0" + "version": "2023.2.0.49497", + "sha256": "4a0d93da85a94d92e0ad35dc0fc3b3ab7f040bd55ad374c4d5ec81a57a2b872b", + "url": 
"https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh" } }, "waagent": { @@ -233,26 +250,21 @@ "amd": { "almalinux8.7": { "aocl": { - "version": "4.1" + "version": "4.0", + "sha256": "c8000a66aaa2a257252cbb307732b4e66758b72b08f43b3723f4eb5404ba28c8" }, "aocc": { - "version": "4.1.0" + "version": "4.0.0-1" } }, "ubuntu20.04": { - "aocl": { - "version": "4.1" - }, "aocc": { - "version": "4.1.0" + "version": "4.0.0_1" } }, "ubuntu22.04": { - "aocl": { - "version": "4.1" - }, "aocc": { - "version": "4.1.0" + "version": "4.0.0_1" } } }, From 764e32e3ffa49aa2c25f5d16fd889cb1f02824ba Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 08:09:24 -0700 Subject: [PATCH 05/76] fix impi in alma --- alma/alma-8.x/common/install_mpis.sh | 21 ++++++++------------- alma/common/install_mpis.sh | 18 +++++++++++------- requirements.json | 6 +++--- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/alma/alma-8.x/common/install_mpis.sh b/alma/alma-8.x/common/install_mpis.sh index 3f712579..8ac9c9c6 100755 --- a/alma/alma-8.x/common/install_mpis.sh +++ b/alma/alma-8.x/common/install_mpis.sh @@ -14,18 +14,13 @@ export LD_LIBRARY_PATH=/opt/${GCC_VERSION}/lib64:$LD_LIBRARY_PATH set CC=/opt/${GCC_VERSION}/bin/gcc set GCC=/opt/${GCC_VERSION}/bin/gcc -# Install Intel MPI -impi_metadata=$(jq -r '.impi."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) -IMPI_VERSION=$(jq -r '.version' <<< $impi_metadata) -IMPI_SHA256=$(jq -r '.sha256' <<< $impi_metadata) -IMPI_DOWNLOAD_URL=$(jq -r '.url' <<< $impi_metadata) -TARBALL=$(basename $IMPI_DOWNLOAD_URL) -IMPI_FOLDER=$(basename $IMPI_DOWNLOAD_URL .tbz) - -$COMMON_DIR/write_component_version.sh "IMPI" ${IMPI_VERSION} -$COMMON_DIR/download_and_verify.sh $IMPI_DOWNLOAD_URL $IMPI_SHA256 -tar -xvf ${TARBALL} -cd ${IMPI_FOLDER} +# Intel MPI 2018 (update 4) +IMPI_VERSION="2018.4.274" +$COMMON_DIR/write_component_version.sh "IMPI_2018" ${IMPI_VERSION} 
+IMPI_2018_DOWNLOAD_URL=http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/13651/l_mpi_${IMPI_VERSION}.tgz +$COMMON_DIR/download_and_verify.sh $IMPI_2018_DOWNLOAD_URL "a1114b3eb4149c2f108964b83cad02150d619e50032059d119ac4ffc9d5dd8e0" +tar -xvf l_mpi_${IMPI_VERSION}.tgz +cd l_mpi_${IMPI_VERSION} # Update the silent.cfg file to proceed with installation sed -i -e 's/ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' silent.cfg ./install.sh --silent ./silent.cfg @@ -49,4 +44,4 @@ EOF # Create symlinks for modulefiles ln -s /usr/share/Modules/modulefiles/mpi/impi_${IMPI_VERSION} /usr/share/Modules/modulefiles/mpi/impi -../../common/install_mpis.sh ${GCC_VERSION} ${HPCX_PATH} +../../common/install_mpis.sh ${GCC_VERSION} ${HPCX_PATH} \ No newline at end of file diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index fac4c23c..1d1b9b31 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -47,13 +47,17 @@ $COMMON_DIR/write_component_version.sh "OMPI" ${OMPI_VERSION} # exclude openmpi, perftest from updates sed -i "$ s/$/ openmpi perftest/" /etc/dnf/dnf.conf -# Intel MPI 2021 (Update 9) -IMPI_2021_VERSION="2021.9.0" -IMPI_2021_DOWNLOAD_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/718d6f8f-2546-4b36-b97b-bc58d5482ebf/l_mpi_oneapi_p_${IMPI_2021_VERSION}.43482_offline.sh -$COMMON_DIR/download_and_verify.sh $IMPI_2021_DOWNLOAD_URL "5c170cdf26901311408809ced28498b630a494428703685203ceef6e62735ef8" -bash l_mpi_oneapi_p_${IMPI_2021_VERSION}.43482_offline.sh -s -a -s --eula accept -mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/impi -$COMMON_DIR/write_component_version.sh "IMPI_2021" ${IMPI_2021_VERSION} +# Install Intel MPI +impi_metadata=$(jq -r '.impi."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) +IMPI_VERSION=$(jq -r '.version' <<< $impi_metadata) +IMPI_SHA256=$(jq -r '.sha256' <<< $impi_metadata) 
+IMPI_DOWNLOAD_URL=$(jq -r '.url' <<< $impi_metadata) +IMPI_OFFLINE_INSTALLER=$(basename $IMPI_DOWNLOAD_URL) + +$COMMON_DIR/download_and_verify.sh $IMPI_DOWNLOAD_URL $IMPI_SHA256 +bash OFFLINE_INSTALLER -s -a -s --eula accept +mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/impi +$COMMON_DIR/write_component_version.sh "IMPI" ${IMPI_VERSION} # Setup module files for MPIs mkdir -p /usr/share/Modules/modulefiles/mpi/ diff --git a/requirements.json b/requirements.json index 2742ba1d..cccb3550 100644 --- a/requirements.json +++ b/requirements.json @@ -90,9 +90,9 @@ "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/718d6f8f-2546-4b36-b97b-bc58d5482ebf/l_mpi_oneapi_p_2021.9.0.43482_offline.sh" }, "almalinux8.7": { - "version": "2018.4.274", - "sha256": "a1114b3eb4149c2f108964b83cad02150d619e50032059d119ac4ffc9d5dd8e0", - "url": "http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/13651/l_mpi_2018.4.274.tgz" + "version": "2021.9.0", + "sha256": "5c170cdf26901311408809ced28498b630a494428703685203ceef6e62735ef8", + "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/718d6f8f-2546-4b36-b97b-bc58d5482ebf/l_mpi_oneapi_p_2021.9.0.43482_offline.sh" } }, "nvidia": { From 1db4f9832f4ce3b6e32d5f8f80883049ca2e0d9e Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 08:13:30 -0700 Subject: [PATCH 06/76] add prerequisites --- alma/alma-8.x/alma-8.7-hpc/install.sh | 3 +++ alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh | 8 ++++++++ ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install.sh | 3 +++ .../ubuntu-20.04-hpc/install_prerequisites.sh | 12 ++++++++++++ ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh | 3 +++ .../ubuntu-22.04-hpc/install_prerequisites.sh | 12 ++++++++++++ 6 files changed, 41 insertions(+) create mode 100644 alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh create mode 100644 
ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh create mode 100644 ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh diff --git a/alma/alma-8.x/alma-8.7-hpc/install.sh b/alma/alma-8.x/alma-8.7-hpc/install.sh index 10b82b81..279a951b 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install.sh @@ -1,6 +1,9 @@ #!/bin/bash set -ex +# install pre-requisites +./install_prerequisites.sh + # set properties source ./set_properties.sh diff --git a/alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh b/alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh new file mode 100644 index 00000000..e0252329 --- /dev/null +++ b/alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -ex + +# Import the newest AlmaLinux 8 GPG key +rpm --import https://repo.almalinux.org/almalinux/RPM-GPG-KEY-AlmaLinux + +# jq is needed to parse the component versions from the requirements.json file +yum install -y jq \ No newline at end of file diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install.sh index 1c3cc1de..271ad8dc 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install.sh @@ -1,6 +1,9 @@ #!/bin/bash set -ex +# install pre-requisites +./install_prerequisites.sh + # set properties source ./set_properties.sh diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh new file mode 100644 index 00000000..be321f89 --- /dev/null +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -ex + +# Don't allow the kernel to be updated +apt-mark hold linux-azure + +# upgrade pre-installed components +apt update +apt upgrade -y + +# jq is needed to parse the component versions from the requirements.json file +apt install -y jq \ No newline at end of file diff --git 
a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh index 1c3cc1de..271ad8dc 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh @@ -1,6 +1,9 @@ #!/bin/bash set -ex +# install pre-requisites +./install_prerequisites.sh + # set properties source ./set_properties.sh diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh new file mode 100644 index 00000000..be321f89 --- /dev/null +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -ex + +# Don't allow the kernel to be updated +apt-mark hold linux-azure + +# upgrade pre-installed components +apt update +apt upgrade -y + +# jq is needed to parse the component versions from the requirements.json file +apt install -y jq \ No newline at end of file From 0795b7ff16c090a54f4d1bc426224cfd41b7b27d Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 08:16:55 -0700 Subject: [PATCH 07/76] update execute permissions --- alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh | 0 ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh | 0 ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh | 0 3 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh mode change 100644 => 100755 ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh mode change 100644 => 100755 ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh diff --git a/alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh b/alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh old mode 100644 new mode 100755 diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh old mode 100644 new mode 100755 diff --git 
a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh old mode 100644 new mode 100755 From e731d63513acff50d1cdd660469a0da2247ab614 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 09:27:44 -0700 Subject: [PATCH 08/76] fix azcopy casing --- alma/alma-8.x/common/install_utils.sh | 2 +- ubuntu/common/install_utils.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/alma/alma-8.x/common/install_utils.sh b/alma/alma-8.x/common/install_utils.sh index f32d0d03..e41d80dc 100755 --- a/alma/alma-8.x/common/install_utils.sh +++ b/alma/alma-8.x/common/install_utils.sh @@ -88,7 +88,7 @@ azcopy_version=$(jq -r '.version' <<< $azcopy_metadata) azcopy_release=$(jq -r '.release' <<< $azcopy_metadata) azcopy_sha256=$(jq -r '.sha256' <<< $azcopy_metadata) TARBALL="azcopy_linux_amd64_$azcopy_version.tar.gz" -AZCOPY_DOWNLOAD_URL="https://azcopyvnext.azureedge.net/$azcopy_release/$tarball" +AZCOPY_DOWNLOAD_URL="https://azcopyvnext.azureedge.net/$azcopy_release/$TARBALL" wget ${AZCOPY_DOWNLOAD_URL} tar -xvf ${TARBALL} diff --git a/ubuntu/common/install_utils.sh b/ubuntu/common/install_utils.sh index 5673f6d3..70001fbe 100755 --- a/ubuntu/common/install_utils.sh +++ b/ubuntu/common/install_utils.sh @@ -55,7 +55,7 @@ azcopy_version=$(jq -r '.version' <<< $azcopy_metadata) azcopy_release=$(jq -r '.release' <<< $azcopy_metadata) azcopy_sha256=$(jq -r '.sha256' <<< $azcopy_metadata) TARBALL="azcopy_linux_amd64_$azcopy_version.tar.gz" -AZCOPY_DOWNLOAD_URL="https://azcopyvnext.azureedge.net/$azcopy_release/$tarball" +AZCOPY_DOWNLOAD_URL="https://azcopyvnext.azureedge.net/$azcopy_release/$TARBALL" wget ${AZCOPY_DOWNLOAD_URL} tar -xvf ${TARBALL} From 8ad80735e08b790e7bff1235e01b6c3364cf2d57 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 09:55:08 -0700 Subject: [PATCH 09/76] fix azcopy path --- alma/alma-8.x/common/install_utils.sh | 2 +- ubuntu/common/install_utils.sh | 
2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/alma/alma-8.x/common/install_utils.sh b/alma/alma-8.x/common/install_utils.sh index e41d80dc..9c63c022 100755 --- a/alma/alma-8.x/common/install_utils.sh +++ b/alma/alma-8.x/common/install_utils.sh @@ -93,7 +93,7 @@ wget ${AZCOPY_DOWNLOAD_URL} tar -xvf ${TARBALL} # copy the azcopy to the bin path -pushd azcopy_linux_amd64_${VERSION} +pushd azcopy_linux_amd64_${azcopy_version} cp azcopy /usr/bin/ popd diff --git a/ubuntu/common/install_utils.sh b/ubuntu/common/install_utils.sh index 70001fbe..1efb151a 100755 --- a/ubuntu/common/install_utils.sh +++ b/ubuntu/common/install_utils.sh @@ -60,7 +60,7 @@ wget ${AZCOPY_DOWNLOAD_URL} tar -xvf ${TARBALL} # copy the azcopy to the bin path -pushd azcopy_linux_amd64_${VERSION} +pushd azcopy_linux_amd64_${azcopy_version} cp azcopy /usr/bin/ popd From 48729eeb0d117fd8d8f632a79d66404b3d0a13b5 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 10:39:40 -0700 Subject: [PATCH 10/76] don't upgrade ubuntu 20.04 to see if it fixes lustre --- ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh index be321f89..ea499faf 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh @@ -6,7 +6,7 @@ apt-mark hold linux-azure # upgrade pre-installed components apt update -apt upgrade -y +# apt upgrade -y # test to see if this fixes lustre # jq is needed to parse the component versions from the requirements.json file apt install -y jq \ No newline at end of file From f324ea830975971e2466ca1649cdb939cab2437a Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 11:08:56 -0700 Subject: [PATCH 11/76] fix MVAPICH2 and IMPI --- alma/common/install_mpis.sh | 3 ++- 
ubuntu/common/install_mpis.sh | 13 ++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 1d1b9b31..1dfe4adf 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -22,6 +22,7 @@ MVAPICH2_DOWNLOAD_URL="http://mvapich.cse.ohio-state.edu/download/mvapich/mv2/mv TARBALL=$(basename $MVAPICH2_DOWNLOAD_URL) MVAPICH2_FOLDER=$(basename $MVAPICH2_DOWNLOAD_URL .tar.gz) +$COMMON_DIR/download_and_verify.sh $MVAPICH2_DOWNLOAD_URL $MVAPICH2_SHA256 tar -xvf ${TARBALL} cd ${MVAPICH2_FOLDER} ./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes && make -j$(nproc) && make install @@ -55,7 +56,7 @@ IMPI_DOWNLOAD_URL=$(jq -r '.url' <<< $impi_metadata) IMPI_OFFLINE_INSTALLER=$(basename $IMPI_DOWNLOAD_URL) $COMMON_DIR/download_and_verify.sh $IMPI_DOWNLOAD_URL $IMPI_SHA256 -bash OFFLINE_INSTALLER -s -a -s --eula accept +bash $IMPI_OFFLINE_INSTALLER -s -a -s --eula accept mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/impi $COMMON_DIR/write_component_version.sh "IMPI" ${IMPI_VERSION} diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index a8dcd6ff..4a55b9b1 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -25,10 +25,13 @@ $COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION mvapich2_metadata=$(jq -r '.mvapich2."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) MVAPICH2_VERSION=$(jq -r '.version' <<< $mvapich2_metadata) MVAPICH2_SHA256=$(jq -r '.sha256' <<< $mvapich2_metadata) -MV2_DOWNLOAD_URL=http://mvapich.cse.ohio-state.edu/download/mvapich/mv2/mvapich2-${MVAPICH2_VERSION}.tar.gz -$COMMON_DIR/download_and_verify.sh $MV2_DOWNLOAD_URL $MVAPICH2_SHA256 -tar -xvf mvapich2-${MVAPICH2_VERSION}.tar.gz -cd mvapich2-${MVAPICH2_VERSION} 
+MVAPICH2_DOWNLOAD_URL=http://mvapich.cse.ohio-state.edu/download/mvapich/mv2/mvapich2-${MVAPICH2_VERSION}.tar.gz +TARBALL=$(basename $MVAPICH2_DOWNLOAD_URL) +MVAPICH2_FOLDER=$(basename $MVAPICH2_DOWNLOAD_URL .tar.gz) + +$COMMON_DIR/download_and_verify.sh $MVAPICH2_DOWNLOAD_URL $MVAPICH2_SHA256 +tar -xvf ${TARBALL} +cd ${MVAPICH2_FOLDER} # Error exclusive to Ubuntu 22.04 # configure: error: The Fortran compiler gfortran will not compile files that call # the same routine with arguments of different types. @@ -59,7 +62,7 @@ IMPI_DOWNLOAD_URL=$(jq -r '.url' <<< $impi_metadata) IMPI_OFFLINE_INSTALLER=$(basename $IMPI_DOWNLOAD_URL) $COMMON_DIR/download_and_verify.sh $IMPI_DOWNLOAD_URL $IMPI_SHA256 -bash OFFLINE_INSTALLER -s -a -s --eula accept +bash $IMPI_OFFLINE_INSTALLER -s -a -s --eula accept mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/impi $COMMON_DIR/write_component_version.sh "IMPI" ${IMPI_VERSION} From 6570c27485abc6569af5e0505a9cdd4b5726c17b Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 11:44:24 -0700 Subject: [PATCH 12/76] put the 20.04 upgrade back --- ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh index ea499faf..be321f89 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh @@ -6,7 +6,7 @@ apt-mark hold linux-azure # upgrade pre-installed components apt update -# apt upgrade -y # test to see if this fixes lustre +apt upgrade -y # jq is needed to parse the component versions from the requirements.json file apt install -y jq \ No newline at end of file From 92b5f163798a8adad803596f9e05e1039677cc12 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 
12:04:41 -0700 Subject: [PATCH 13/76] update cuda version --- requirements.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements.json b/requirements.json index cccb3550..b8c9689a 100644 --- a/requirements.json +++ b/requirements.json @@ -136,29 +136,29 @@ "cuda": { "ubuntu20.04": { "driver": { - "version": "12.2.1", + "version": "12-1", "distribution": "ubuntu2004" }, "samples": { - "version": "12.2" + "version": "12.1" } }, "ubuntu22.04": { "driver": { - "version": "12.2.1", + "version": "12-1", "distribution": "ubuntu2204" }, "samples": { - "version": "12.2" + "version": "12.1" } }, "almalinux8.7": { "driver": { - "version": "12.2.0", + "version": "12-1", "distribution": "rhel8" }, "samples": { - "version": "12.2" + "version": "12.1" } } }, From 0c7ba59b618b206fafddbb02f4880f41fa7d0f5b Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 12:30:39 -0700 Subject: [PATCH 14/76] fix cuda and intel libs --- alma/common/install_intel_libs.sh | 4 ++-- requirements.json | 12 ++++++------ ubuntu/common/install_intel_libs.sh | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/alma/common/install_intel_libs.sh b/alma/common/install_intel_libs.sh index 3d1ddc23..ef4b8d5a 100755 --- a/alma/common/install_intel_libs.sh +++ b/alma/common/install_intel_libs.sh @@ -4,8 +4,8 @@ set -ex # Set Intel® oneAPI Math Kernel Library info intel_one_mkl_metadata=$(jq -r '.intel_one_mkl."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) INTEL_ONE_MKL_VERSION=$(jq -r '.version' <<< $intel_one_mkl_metadata) -INTEL_ONE_MKL_SHA256=$(jq -r '.sha256' <<< $impi_metadata) -INTEL_ONE_MKL_DOWNLOAD_URL=$(jq -r '.url' <<< $impi_metadata) +INTEL_ONE_MKL_SHA256=$(jq -r '.sha256' <<< $intel_one_mkl_metadata) +INTEL_ONE_MKL_DOWNLOAD_URL=$(jq -r '.url' <<< $intel_one_mkl_metadata) INTEL_ONE_MKL_OFFLINE_INSTALLER=$(basename $IMPI_DOWNLOAD_URL) # Intel® oneAPI Math Kernel Library diff --git a/requirements.json b/requirements.json index 
b8c9689a..b03b315e 100644 --- a/requirements.json +++ b/requirements.json @@ -136,29 +136,29 @@ "cuda": { "ubuntu20.04": { "driver": { - "version": "12-1", + "version": "12-2", "distribution": "ubuntu2004" }, "samples": { - "version": "12.1" + "version": "12.2" } }, "ubuntu22.04": { "driver": { - "version": "12-1", + "version": "12-2", "distribution": "ubuntu2204" }, "samples": { - "version": "12.1" + "version": "12.2" } }, "almalinux8.7": { "driver": { - "version": "12-1", + "version": "12-2", "distribution": "rhel8" }, "samples": { - "version": "12.1" + "version": "12.2" } } }, diff --git a/ubuntu/common/install_intel_libs.sh b/ubuntu/common/install_intel_libs.sh index 8dd83115..a3469363 100755 --- a/ubuntu/common/install_intel_libs.sh +++ b/ubuntu/common/install_intel_libs.sh @@ -4,8 +4,8 @@ set -ex # Set Intel® oneAPI Math Kernel Library info intel_one_mkl_metadata=$(jq -r '.intel_one_mkl."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) INTEL_ONE_MKL_VERSION=$(jq -r '.version' <<< $intel_one_mkl_metadata) -INTEL_ONE_MKL_SHA256=$(jq -r '.sha256' <<< $impi_metadata) -INTEL_ONE_MKL_DOWNLOAD_URL=$(jq -r '.url' <<< $impi_metadata) +INTEL_ONE_MKL_SHA256=$(jq -r '.sha256' <<< $intel_one_mkl_metadata) +INTEL_ONE_MKL_DOWNLOAD_URL=$(jq -r '.url' <<< $intel_one_mkl_metadata) INTEL_ONE_MKL_OFFLINE_INSTALLER=$(basename $IMPI_DOWNLOAD_URL) # Install Intel® oneAPI Math Kernel Library From 352429c8b02a6c225120ac45cd7788e8cb8151c5 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 12:59:56 -0700 Subject: [PATCH 15/76] update lustre version --- requirements.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.json b/requirements.json index b03b315e..13693b22 100644 --- a/requirements.json +++ b/requirements.json @@ -273,7 +273,7 @@ "version": "2.15.1_24_gbaa21ca" }, "ubuntu20.04": { - "version": "2.15.1-29-gbae0abe" + "version": "2.15.4-42-gd6d405d" }, "ubuntu22.04": { "version": "2.15.1-29-gbae0abe" From 
1211980d097e2ed01b485493cb8f3dfb3ad68e24 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 14:17:26 -0700 Subject: [PATCH 16/76] fix nccl install --- alma/common/install_nccl.sh | 7 ++++--- ubuntu/common/install_nccl.sh | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/alma/common/install_nccl.sh b/alma/common/install_nccl.sh index 5e5b42f3..54ea60c1 100755 --- a/alma/common/install_nccl.sh +++ b/alma/common/install_nccl.sh @@ -5,6 +5,7 @@ set -ex NCCL_VERSION=$(jq -r '.nccl."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) NCCL_RDMA_SHARP_COMMIT=$(jq -r '.nccl."'"$DISTRIBUTION"'".rdmasharpplugins.commit' <<< $COMPONENT_VERSIONS) CUDA_DRIVER_VERSION=$(jq -r '.cuda."'"$DISTRIBUTION"'".driver.version' <<< $COMPONENT_VERSIONS) +CUDA_VERSION="${CUDA_DRIVER_VERSION//-/.}" # Install NCCL yum install -y rpm-build rpmdevtools @@ -17,9 +18,9 @@ tar -xvf ${TARBALL} pushd nccl-${NCCL_VERSION} make -j src.build make pkg.redhat.build -rpm -i ./build/pkg/rpm/x86_64/libnccl-${NCCL_VERSION}+cuda${CUDA_DRIVER_VERSION}.x86_64.rpm -rpm -i ./build/pkg/rpm/x86_64/libnccl-devel-${NCCL_VERSION}+cuda${CUDA_DRIVER_VERSION}.x86_64.rpm -rpm -i ./build/pkg/rpm/x86_64/libnccl-static-${NCCL_VERSION}+cuda${CUDA_DRIVER_VERSION}.x86_64.rpm +rpm -i ./build/pkg/rpm/x86_64/libnccl-${NCCL_VERSION}+cuda${CUDA_VERSION}.x86_64.rpm +rpm -i ./build/pkg/rpm/x86_64/libnccl-devel-${NCCL_VERSION}+cuda${CUDA_VERSION}.x86_64.rpm +rpm -i ./build/pkg/rpm/x86_64/libnccl-static-${NCCL_VERSION}+cuda${CUDA_VERSION}.x86_64.rpm sed -i "$ s/$/ libnccl*/" /etc/dnf/dnf.conf popd diff --git a/ubuntu/common/install_nccl.sh b/ubuntu/common/install_nccl.sh index 6b6b567e..275942be 100755 --- a/ubuntu/common/install_nccl.sh +++ b/ubuntu/common/install_nccl.sh @@ -6,6 +6,7 @@ NCCL_VERSION=$(jq -r '.nccl."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS NCCL_RDMA_SHARP_COMMIT=$(jq -r '.nccl."'"$DISTRIBUTION"'".rdmasharpplugins.commit' <<< $COMPONENT_VERSIONS) CUDA_DRIVER_VERSION=$(jq 
-r '.cuda."'"$DISTRIBUTION"'".driver.version' <<< $COMPONENT_VERSIONS) +CUDA_VERSION="${CUDA_DRIVER_VERSION//-/.}" TARBALL="v${NCCL_VERSION}.tar.gz"; NCCL_DOWNLOAD_URL=https://github.com/NVIDIA/nccl/archive/refs/tags/${TARBALL}; @@ -20,9 +21,9 @@ pushd nccl-${NCCL_VERSION} make -j src.build make pkg.debian.build pushd build/pkg/deb/ -dpkg -i libnccl2_${NCCL_VERSION}+cuda${CUDA_DRIVER_VERSION}_amd64.deb +dpkg -i libnccl2_${NCCL_VERSION}+cuda${CUDA_VERSION}_amd64.deb sudo apt-mark hold libnccl2 -dpkg -i libnccl-dev_${NCCL_VERSION}+cuda${CUDA_DRIVER_VERSION}_amd64.deb +dpkg -i libnccl-dev_${NCCL_VERSION}+cuda${CUDA_VERSION}_amd64.deb sudo apt-mark hold libnccl-dev popd popd From 489fbef6250c097e0335063462270caf766db10a Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 14:24:53 -0700 Subject: [PATCH 17/76] fix intel lib on alma --- alma/common/install_intel_libs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alma/common/install_intel_libs.sh b/alma/common/install_intel_libs.sh index ef4b8d5a..7019f0f3 100755 --- a/alma/common/install_intel_libs.sh +++ b/alma/common/install_intel_libs.sh @@ -6,7 +6,7 @@ intel_one_mkl_metadata=$(jq -r '.intel_one_mkl."'"$DISTRIBUTION"'"' <<< $COMPONE INTEL_ONE_MKL_VERSION=$(jq -r '.version' <<< $intel_one_mkl_metadata) INTEL_ONE_MKL_SHA256=$(jq -r '.sha256' <<< $intel_one_mkl_metadata) INTEL_ONE_MKL_DOWNLOAD_URL=$(jq -r '.url' <<< $intel_one_mkl_metadata) -INTEL_ONE_MKL_OFFLINE_INSTALLER=$(basename $IMPI_DOWNLOAD_URL) +INTEL_ONE_MKL_OFFLINE_INSTALLER=$(basename $INTEL_ONE_MKL_DOWNLOAD_URL) # Intel® oneAPI Math Kernel Library $COMMON_DIR/write_component_version.sh "INTEL_ONE_MKL" ${INTEL_ONE_MKL_VERSION} From a7e3b42b19a667bb98be0faa3c22682be2a13e05 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 15:23:50 -0700 Subject: [PATCH 18/76] fix intel libs on ubuntu --- ubuntu/common/install_intel_libs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ubuntu/common/install_intel_libs.sh b/ubuntu/common/install_intel_libs.sh index a3469363..c1cace4f 100755 --- a/ubuntu/common/install_intel_libs.sh +++ b/ubuntu/common/install_intel_libs.sh @@ -6,7 +6,7 @@ intel_one_mkl_metadata=$(jq -r '.intel_one_mkl."'"$DISTRIBUTION"'"' <<< $COMPONE INTEL_ONE_MKL_VERSION=$(jq -r '.version' <<< $intel_one_mkl_metadata) INTEL_ONE_MKL_SHA256=$(jq -r '.sha256' <<< $intel_one_mkl_metadata) INTEL_ONE_MKL_DOWNLOAD_URL=$(jq -r '.url' <<< $intel_one_mkl_metadata) -INTEL_ONE_MKL_OFFLINE_INSTALLER=$(basename $IMPI_DOWNLOAD_URL) +INTEL_ONE_MKL_OFFLINE_INSTALLER=$(basename $INTEL_ONE_MKL_DOWNLOAD_URL) # Install Intel® oneAPI Math Kernel Library $COMMON_DIR/write_component_version.sh "INTEL_ONE_MKL" ${INTEL_ONE_MKL_VERSION} From 73bb63d146d76db5d7c6153336d7fa0214c584e4 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 15:40:29 -0700 Subject: [PATCH 19/76] add execute to disable_user_namespaces --- alma/alma-8.x/alma-8.7-hpc/disable_user_namespaces.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 alma/alma-8.x/alma-8.7-hpc/disable_user_namespaces.sh diff --git a/alma/alma-8.x/alma-8.7-hpc/disable_user_namespaces.sh b/alma/alma-8.x/alma-8.7-hpc/disable_user_namespaces.sh old mode 100644 new mode 100755 From 3385addad5fedd4d79a53bf680d9c2c3e20b0728 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 20 Mar 2024 17:15:29 -0700 Subject: [PATCH 20/76] fix alma impi --- alma/common/install_mpis.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 1dfe4adf..67999362 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -100,24 +100,24 @@ setenv MPI_HOME /opt/openmpi-${OMPI_VERSION} EOF #IntelMPI-v2021 -cat << EOF >> /usr/share/Modules/modulefiles/mpi/impi_${IMPI_2021_VERSION} +cat << EOF >> /usr/share/Modules/modulefiles/mpi/impi_${IMPI_VERSION} #%Module 1.0 # -# Intel MPI 
${IMPI_2021_VERSION} +# Intel MPI ${IMPI_VERSION} # conflict mpi -module load /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/impi -setenv MPI_BIN /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/bin -setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/include -setenv MPI_LIB /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/lib -setenv MPI_MAN /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/man -setenv MPI_HOME /opt/intel/oneapi/mpi/${IMPI_2021_VERSION} +module load /opt/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/impi +setenv MPI_BIN /opt/intel/oneapi/mpi/${IMPI_VERSION}/bin +setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${IMPI_VERSION}/include +setenv MPI_LIB /opt/intel/oneapi/mpi/${IMPI_VERSION}/lib +setenv MPI_MAN /opt/intel/oneapi/mpi/${IMPI_VERSION}/man +setenv MPI_HOME /opt/intel/oneapi/mpi/${IMPI_VERSION} EOF # Create symlinks for modulefiles ln -s /usr/share/Modules/modulefiles/mpi/mvapich2-${MVAPICH2_VERSION} /usr/share/Modules/modulefiles/mpi/mvapich2 ln -s /usr/share/Modules/modulefiles/mpi/openmpi-${OMPI_VERSION} /usr/share/Modules/modulefiles/mpi/openmpi -ln -s /usr/share/Modules/modulefiles/mpi/impi_${IMPI_2021_VERSION} /usr/share/Modules/modulefiles/mpi/impi-2021 +ln -s /usr/share/Modules/modulefiles/mpi/impi_${IMPI_VERSION} /usr/share/Modules/modulefiles/mpi/impi-2021 # cleanup downloaded tarballs and other installation files/folders rm -rf *.tar.gz *offline.sh From f9703cf84c1a4949a60f56278e5a8a3f0ae487c4 Mon Sep 17 00:00:00 2001 From: KimPhillips128 Date: Thu, 21 Mar 2024 15:33:51 -0700 Subject: [PATCH 21/76] Update requirments with packages for mar2024 --- requirements.json | 138 +++++++++++++++++++++++----------------------- 1 file changed, 69 insertions(+), 69 deletions(-) diff --git a/requirements.json b/requirements.json index 13693b22..1f51a561 100644 --- a/requirements.json +++ b/requirements.json @@ -18,33 +18,33 @@ }, "mofed": { "ubuntu20.04": { - "version": "23.07-0.5.1.2", - "sha256": 
"923ddbd48d250b25ba50098ad8852ad6a591df3e975f3e0b9922b752181bdd12" + "version": "24.01-0.3.3.1", + "sha256": "72e4b2961b73989793304eda9320ba3cdcdb226ea12fe1a7b8e7a179ea516e22" }, "ubuntu22.04": { - "version": "23.07-0.5.1.2", - "sha256": "77e032a48de4c040b2f2dd3bf2edd11921de7caff59c773ac35208514f72eff5" + "version": "24.01-0.3.3.1", + "sha256": "caa74c295ec88afd1f1c950b4bd6e32c609e3c5185199c7e344bf492ccef6429" }, "almalinux8.7": { - "version": "23.07-0.5.1.2", - "sha256": "59d318ea9814797f9196d16af06d7d1324114ea63015762b527478d8aec7d25e" + "version": "24.01-0.3.3.1", + "sha256": "34e826fea03d6505b50909959ed029282a6238c75830fbe5b2bc15442cf8f8b5" } }, "hpcx": { "ubuntu20.04": { - "version": "2.16", - "sha256": "addda11a710c52268b7a725c13d9cc54c93deddf259c87d8547ad3c0422b87e1", - "url": "https://content.mellanox.com/hpc/hpc-x/v2.16/hpcx-v2.16-gcc-mlnx_ofed-ubuntu20.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz" + "version": "2.18", + "sha256": "8e088f5ae94576e1d845fbb4ef67c7dc54983555d3583c244dde7e19dc082074", + "url": "https://content.mellanox.com/hpc/hpc-x/v2.18/hpcx-v2.18-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64.tbz" }, "ubuntu22.04": { - "version": "2.16", - "sha256": "97eac5555d54f5fd8da1c354222a1aff2e85eb017682441e06287971a5b95772", - "url": "https://content.mellanox.com/hpc/hpc-x/v2.16/hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz" + "version": "2.18", + "sha256": "1258c060d56a2b650dc697ce91a746976ab9f198e4d46cdba2a5315315214147", + "url": "https://content.mellanox.com/hpc/hpc-x/v2.18/hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64.tbz" }, "almalinux8.7": { - "version": "2.16", - "sha256": "78dc6bc152489decc8a4191121c7f070adadf657b0c90d8713dd8feb7e5e968e", - "url": "https://content.mellanox.com/hpc/hpc-x/v2.16/hpcx-v2.16-gcc-mlnx_ofed-redhat8-cuda12-gdrcopy2-nccl2.18-x86_64.tbz" + "version": "2.18", + "sha256": "45276ff7bd676cc668d1cc6a1fe926d5e157646aaf06201415e0aadb048be16d", + "url": 
"https://content.mellanox.com/hpc/hpc-x/v2.18/hpcx-v2.18-gcc-mlnx_ofed-redhat8-cuda12-x86_64.tbz" } }, "mvapich2": { @@ -63,102 +63,102 @@ }, "ompi": { "ubuntu20.04": { - "version": "4.1.5", - "sha256": "c018b127619d2a2a30c1931f316fc8a245926d0f5b4ebed4711f9695e7f70925", - "url": "https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz" + "version": "5.0.2", + "sha256": "095ab1cddb0fa0f9e7fc211a1d33185c6727c5237d0ee55f80a7e4311e5d279c", + "url": "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.2.tar.gz" }, "ubuntu22.04": { - "version": "4.1.5", - "sha256": "c018b127619d2a2a30c1931f316fc8a245926d0f5b4ebed4711f9695e7f70925", - "url": "https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz" + "version": "5.0.2", + "sha256": "095ab1cddb0fa0f9e7fc211a1d33185c6727c5237d0ee55f80a7e4311e5d279c", + "url": "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.2.tar.gz" }, "almalinux8.7": { - "version": "4.1.5", - "sha256": "c018b127619d2a2a30c1931f316fc8a245926d0f5b4ebed4711f9695e7f70925", - "url": "https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz" + "version": "5.0.2", + "sha256": "095ab1cddb0fa0f9e7fc211a1d33185c6727c5237d0ee55f80a7e4311e5d279c", + "url": "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.2.tar.gz" } }, "impi": { "ubuntu20.04": { - "version": "2021.9.0", - "sha256": "5c170cdf26901311408809ced28498b630a494428703685203ceef6e62735ef8", - "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/718d6f8f-2546-4b36-b97b-bc58d5482ebf/l_mpi_oneapi_p_2021.9.0.43482_offline.sh" + "version": "2021.11.0", + "sha256": "9a96caeb7abcf5aa08426216db38a2c7936462008b9825036266bc79cb0e30d8", + "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/2c45ede0-623c-4c8e-9e09-bed27d70fa33/l_mpi_oneapi_p_2021.11.0.49513_offline.sh" }, "ubuntu22.04": { - "version": "2021.9.0", - "sha256": "5c170cdf26901311408809ced28498b630a494428703685203ceef6e62735ef8", - 
"url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/718d6f8f-2546-4b36-b97b-bc58d5482ebf/l_mpi_oneapi_p_2021.9.0.43482_offline.sh" + "version": "2021.11.0", + "sha256": "9a96caeb7abcf5aa08426216db38a2c7936462008b9825036266bc79cb0e30d8", + "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/2c45ede0-623c-4c8e-9e09-bed27d70fa33/l_mpi_oneapi_p_2021.11.0.49513_offline.sh" }, "almalinux8.7": { - "version": "2021.9.0", - "sha256": "5c170cdf26901311408809ced28498b630a494428703685203ceef6e62735ef8", - "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/718d6f8f-2546-4b36-b97b-bc58d5482ebf/l_mpi_oneapi_p_2021.9.0.43482_offline.sh" + "version": "2021.11.0", + "sha256": "9a96caeb7abcf5aa08426216db38a2c7936462008b9825036266bc79cb0e30d8", + "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/2c45ede0-623c-4c8e-9e09-bed27d70fa33/l_mpi_oneapi_p_2021.11.0.49513_offline.sh" } }, "nvidia": { "ubuntu20.04": { "driver": { - "version": "535.86.10", - "sha256": "cec37fd9317091a01fd0380309400a14284769790651fe021e1d5aaa2327a370" + "version": "550.54.14", + "sha256": "8c497ff1cfc7c310fb875149bc30faa4fd26d2237b2cba6cd2e8b0780157cfe3" }, "fabricmanager": { - "prefix": "535", + "prefix": "550", "distribution": "ubuntu2004", - "version": "535_535.86.10-1", - "sha256": "d0c4662279301187614646650da07f34a6fe267d789d48bc9ed63181af06ac29" + "version": "550_550.54.14-1", + "sha256": "3f167f26a606cf4adb9f8c7b2afc3e39b1e689f185fbdbb1e76e21aeb4327d59" } }, "ubuntu22.04": { "driver": { - "version": "535.86.10", - "sha256": "cec37fd9317091a01fd0380309400a14284769790651fe021e1d5aaa2327a370" + "version": "550.54.14", + "sha256": "8c497ff1cfc7c310fb875149bc30faa4fd26d2237b2cba6cd2e8b0780157cfe3" }, "fabricmanager": { - "prefix": "535", + "prefix": "550", "distribution": "ubuntu2204", - "version": "535_535.86.10-1", - "sha256": "d0c4662279301187614646650da07f34a6fe267d789d48bc9ed63181af06ac29" + "version": "550_550.54.14-1", + "sha256": 
"3f167f26a606cf4adb9f8c7b2afc3e39b1e689f185fbdbb1e76e21aeb4327d59" } }, "almalinux8.7": { "driver": { - "version": "535.86.10", - "sha256": "cec37fd9317091a01fd0380309400a14284769790651fe021e1d5aaa2327a370" + "version": "550.54.14", + "sha256": "8c497ff1cfc7c310fb875149bc30faa4fd26d2237b2cba6cd2e8b0780157cfe3" }, "fabricmanager": { - "prefix": "535", + "prefix": "550", "distribution": "rhel8", - "version": "535.86.10-1", - "sha256": "4c3cfc9f410c5c3e8dd2c50f9cdfc0c7e807094020bce3555bf8f1e09c053045" + "version": "550.54.14-1", + "sha256": "31c54439b3abc03e98a2e29fa950253d0989f4591b432c0e6e1461809c2a9cb3" } } }, "cuda": { "ubuntu20.04": { "driver": { - "version": "12-2", + "version": "12-4", "distribution": "ubuntu2004" }, "samples": { - "version": "12.2" + "version": "12.4" } }, "ubuntu22.04": { "driver": { - "version": "12-2", + "version": "12-4", "distribution": "ubuntu2204" }, "samples": { - "version": "12.2" + "version": "12.4" } }, "almalinux8.7": { "driver": { - "version": "12-2", + "version": "12-4", "distribution": "rhel8" }, "samples": { - "version": "12.2" + "version": "12.4" } } }, @@ -175,19 +175,19 @@ }, "nccl": { "ubuntu20.04": { - "version": "2.19.3-1", + "version": "2.20.3-1", "rdmasharpplugins": { "commit": "575c1e0" } }, "ubuntu22.04": { - "version": "2.19.3-1", + "version": "2.20.3-1", "rdmasharpplugins": { "commit": "575c1e0" } }, "almalinux8.7": { - "version": "2.19.3-1", + "version": "2.20.3-1", "rdmasharpplugins": { "commit": "575c1e0" } @@ -195,29 +195,29 @@ }, "dcgm": { "ubuntu20.04": { - "version": "3.1.8" + "version": "3.3.3" }, "ubuntu22.04": { - "version": "3.1.8" + "version": "3.3.3" }, "almalinux8.7": { - "version": "3.1.8" + "version": "3.3.3" } }, "intel_one_mkl": { "ubuntu20.04": { - "version": "2023.2.0.49497", - "sha256": "4a0d93da85a94d92e0ad35dc0fc3b3ab7f040bd55ad374c4d5ec81a57a2b872b", - "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh" 
}, + "version": "2024.0.0.49673", + "sha256": "2a3be7d01d75ba8cc3059f9a32ae72e5bfc93e68e72e94e79d7fa6ea2f7814de", + "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/86d6a4c1-c998-4c6b-9fff-ca004e9f7455/l_onemkl_p_2024.0.0.49673_offline.sh" }, "ubuntu22.04": { - "version": "2023.2.0.49497", - "sha256": "4a0d93da85a94d92e0ad35dc0fc3b3ab7f040bd55ad374c4d5ec81a57a2b872b", - "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh" + "version": "2024.0.0.49673", + "sha256": "2a3be7d01d75ba8cc3059f9a32ae72e5bfc93e68e72e94e79d7fa6ea2f7814de", + "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/86d6a4c1-c998-4c6b-9fff-ca004e9f7455/l_onemkl_p_2024.0.0.49673_offline.sh" }, "almalinux8.7": { - "version": "2023.2.0.49497", - "sha256": "4a0d93da85a94d92e0ad35dc0fc3b3ab7f040bd55ad374c4d5ec81a57a2b872b", - "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh" + "version": "2024.0.0.49673", + "sha256": "2a3be7d01d75ba8cc3059f9a32ae72e5bfc93e68e72e94e79d7fa6ea2f7814de", + "url": "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/86d6a4c1-c998-4c6b-9fff-ca004e9f7455/l_onemkl_p_2024.0.0.49673_offline.sh" } }, "waagent": { From 1b54c51b37b22b6f3af6b8d7e20f565ffd491c3a Mon Sep 17 00:00:00 2001 From: KimPhillips128 Date: Thu, 21 Mar 2024 17:02:55 -0700 Subject: [PATCH 22/76] update mpis stripping trailing zero from version for mv statement-impi2021.11 --- alma/common/install_mpis.sh | 4 +++- ubuntu/common/install_mpis.sh | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 67999362..9505e57c 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -57,7 +57,9 @@ IMPI_OFFLINE_INSTALLER=$(basename $IMPI_DOWNLOAD_URL) $COMMON_DIR/download_and_verify.sh $IMPI_DOWNLOAD_URL 
$IMPI_SHA256 bash $IMPI_OFFLINE_INSTALLER -s -a -s --eula accept -mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/impi + +impi_2021_version=${IMPI_VERSION:0:-2} +mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${impi_2021_version}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${impi_2021_version}/etc/modulefiles/impi $COMMON_DIR/write_component_version.sh "IMPI" ${IMPI_VERSION} # Setup module files for MPIs diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index 4a55b9b1..2858510a 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -63,7 +63,9 @@ IMPI_OFFLINE_INSTALLER=$(basename $IMPI_DOWNLOAD_URL) $COMMON_DIR/download_and_verify.sh $IMPI_DOWNLOAD_URL $IMPI_SHA256 bash $IMPI_OFFLINE_INSTALLER -s -a -s --eula accept -mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/impi + +impi_2021_version=${IMPI_VERSION:0:-2} +mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${impi_2021_version}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${impi_2021_version}/etc/modulefiles/impi $COMMON_DIR/write_component_version.sh "IMPI" ${IMPI_VERSION} # Module Files From f2c42356941d0bfc90823d982c9940a2637175fd Mon Sep 17 00:00:00 2001 From: KimPhillips128 Date: Thu, 21 Mar 2024 18:03:59 -0700 Subject: [PATCH 23/76] fix path for impi_2021 --- alma/common/install_mpis.sh | 2 +- ubuntu/common/install_mpis.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 9505e57c..8a63404d 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -59,7 +59,7 @@ $COMMON_DIR/download_and_verify.sh $IMPI_DOWNLOAD_URL $IMPI_SHA256 bash $IMPI_OFFLINE_INSTALLER -s -a -s --eula accept impi_2021_version=${IMPI_VERSION:0:-2} -mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${impi_2021_version}/modulefiles/mpi 
${INSTALL_PREFIX}/intel/oneapi/mpi/${impi_2021_version}/etc/modulefiles/impi +mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${impi_2021_version}/etc/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${impi_2021_version}/etc/modulefiles/impi $COMMON_DIR/write_component_version.sh "IMPI" ${IMPI_VERSION} # Setup module files for MPIs diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index 2858510a..161d8a1b 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -65,7 +65,7 @@ $COMMON_DIR/download_and_verify.sh $IMPI_DOWNLOAD_URL $IMPI_SHA256 bash $IMPI_OFFLINE_INSTALLER -s -a -s --eula accept impi_2021_version=${IMPI_VERSION:0:-2} -mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${impi_2021_version}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${impi_2021_version}/etc/modulefiles/impi +mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${impi_2021_version}/etc/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${impi_2021_version}/etc/modulefiles/impi $COMMON_DIR/write_component_version.sh "IMPI" ${IMPI_VERSION} # Module Files From 5af1c03660f0f3952ec3d656df32dbb4e88b6e92 Mon Sep 17 00:00:00 2001 From: KimPhillips128 Date: Fri, 22 Mar 2024 08:49:26 -0700 Subject: [PATCH 24/76] Update run-tests with package versions --- tests/run-tests.sh | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 718be482..ae337fd9 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -84,27 +84,27 @@ echo "Detected distro: ${distro}" if [ "${MOFED_LTS}" = true ] then - HPCX_VERSION_UBUNTU="v2.7.0" - MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-4.9-6.0.6.0" - HPCX_MOFED_INTEGRATION_VERSION="MLNX_OFED_LINUX-4.7-1.0.0.1" + HPCX_VERSION_UBUNTU="v2.18" + MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-24.01-0.3.3.1" + HPCX_MOFED_INTEGRATION_VERSION="MLNX_OFED" 
HPCX_OMB_PATH_UBUNTU_1804="/opt/hpcx-${HPCX_VERSION_UBUNTU}-gcc-${HPCX_MOFED_INTEGRATION_VERSION}-ubuntu18.04-x86_64/ompi/tests/osu-micro-benchmarks-5.6.2" - IMPI_2021_VERSION_UBUNTU="2021.9.0" - OMPI_VERSION_UBUNTU="4.1.5" + IMPI_2021_VERSION_UBUNTU="2021.11" + OMPI_VERSION_UBUNTU="5.0.2" else - OMPI_VERSION_UBUNTU="4.1.5" - HPCX_MOFED_INTEGRATION_VERSION="MLNX_OFED_LINUX-5.4-1.0.3.0" + OMPI_VERSION_UBUNTU="5.0.2" + HPCX_MOFED_INTEGRATION_VERSION="MLNX_OFED_LINUX-24.01-0.3.3.1" case ${distro} in "Ubuntu 18.04") HPCX_VERSION_UBUNTU="v2.15"; MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-23.04-1.1.3.0"; IMPI_2021_VERSION_UBUNTU="2021.9.0"; ;; - "Ubuntu 20.04") HPCX_VERSION_UBUNTU="v2.16"; - MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-23.07-0.5.1.2"; - IMPI_2021_VERSION_UBUNTU="2021.9.0"; + "Ubuntu 20.04") HPCX_VERSION_UBUNTU="v2.18"; + MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-24.01-0.3.3.1"; + IMPI_2021_VERSION_UBUNTU="2021.11"; ;; - "Ubuntu 22.04") HPCX_VERSION_UBUNTU="v2.16"; - MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-23.07-0.5.1.2"; - IMPI_2021_VERSION_UBUNTU="2021.9.0"; + "Ubuntu 22.04") HPCX_VERSION_UBUNTU="v2.18"; + MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-24.01-0.3.3.1"; + IMPI_2021_VERSION_UBUNTU="2021.11"; ;; *) ;; esac @@ -121,12 +121,12 @@ MVAPICH2_VERSION_SUSE="2.3.6" OMPI_VERSION_CENTOS="4.1.1" OMPI_VERSION_ALMA_86="4.1.3" -OMPI_VERSION_ALMA_87="4.1.5" +OMPI_VERSION_ALMA_87="5.0.2" OMPI_VERSION_SUSE="4.1.1" IMPI_2021_VERSION_CENTOS="2021.4.0" IMPI_2021_VERSION_ALMA_86="2021.7.0" -IMPI_2021_VERSION_ALMA_87="2021.9.0" +IMPI_2021_VERSION_ALMA_87="2021.11" IMPI_2021_VERSION_SUSE="2021.9.0" MVAPICH2X_INSTALLATION_DIRECTORY="/opt/mvapich2-x" @@ -136,7 +136,7 @@ MOFED_VERSION_CENTOS="MLNX_OFED_LINUX-5.4-1.0.3.0" MOFED_VERSION_CENTOS_79="MLNX_OFED_LINUX-5.4-3.0.0.0" MOFED_VERSION_CENTOS_83="MLNX_OFED_LINUX-5.2-1.0.4.0" MOFED_VERSION_ALMA_86="MLNX_OFED_LINUX-5.8-1.0.1.1" -MOFED_VERSION_ALMA_87="MLNX_OFED_LINUX-23.07-0.5.1.2" +MOFED_VERSION_ALMA_87="MLNX_OFED_LINUX-24.01-0.3.3.1" 
MOFED_VERSION_SUSE="MLNX_OFED_INBOX_5.14.21-4.0.0" #MOFED_VERSION_SUSE="MLNX_OFED-5.7-1.0.2.0" @@ -154,7 +154,7 @@ MVAPICH2X_PATH_CENTOS="${MVAPICH2X_INSTALLATION_DIRECTORY}/gnu9.2.0/mofed5.1/azu OPENMPI_PATH_CENTOS="/opt/openmpi-${OMPI_VERSION_CENTOS}" HPCX_OMB_PATH_ALMA_86="/opt/hpcx-v2.14-gcc-MLNX_OFED_LINUX-5-redhat8-cuda11-gdrcopy2-nccl2.16-x86_64/ompi/tests/osu-micro-benchmarks-5.8" -HPCX_OMB_PATH_ALMA_87="/opt/hpcx-v2.16-gcc-mlnx_ofed-redhat8-cuda12-gdrcopy2-nccl2.18-x86_64/ompi/tests/osu-micro-benchmarks-5.8" +HPCX_OMB_PATH_ALMA_87="/opt/hpcx-v2.18-gcc-mlnx_ofed-redhat8-cuda12-x86_64/ompi/tests/osu-micro-benchmarks" MODULE_FILES_ROOT_ALMA="/usr/share/Modules/modulefiles" IMPI2021_PATH_ALMA_86="/opt/intel/oneapi/mpi/${IMPI_2021_VERSION_ALMA_86}" IMPI2021_PATH_ALMA_87="/opt/intel/oneapi/mpi/${IMPI_2021_VERSION_ALMA_87}" @@ -164,8 +164,8 @@ OPENMPI_PATH_ALMA_86="/opt/openmpi-${OMPI_VERSION_ALMA_86}" OPENMPI_PATH_ALMA_87="/opt/openmpi-${OMPI_VERSION_ALMA_87}" MODULE_FILES_ROOT_UBUNTU="/usr/share/modules/modulefiles" -HPCX_OMB_PATH_UBUNTU_2004="/opt/hpcx-${HPCX_VERSION_UBUNTU}-gcc-mlnx_ofed-ubuntu20.04-cuda12-gdrcopy2-nccl2.18-x86_64/ompi/tests/osu-micro-benchmarks-5.8" -HPCX_OMB_PATH_UBUNTU_2204="/opt/hpcx-${HPCX_VERSION_UBUNTU}-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64/ompi/tests/osu-micro-benchmarks-5.8" +HPCX_OMB_PATH_UBUNTU_2004="/opt/hpcx-${HPCX_VERSION_UBUNTU}-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64/ompi/tests/osu-micro-benchmarks" +HPCX_OMB_PATH_UBUNTU_2204="/opt/hpcx-${HPCX_VERSION_UBUNTU}-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64/ompi/tests/osu-micro-benchmarks" IMPI2021_PATH_UBUNTU="/opt/intel/oneapi/mpi/${IMPI_2021_VERSION_UBUNTU}" # added "libexec" to the path, as centos and ubuntu use "libexec", but SUSE only "lib" MVAPICH2_PATH_UBUNTU="/opt/mvapich2-${MVAPICH2_VERSION_UBUNTU}/libexec" @@ -200,13 +200,13 @@ then MKL_VERSION="2023.1.0" elif [[ $distro == "Ubuntu"* ]] then - MKL_VERSION="2023.2.0" + MKL_VERSION="2024.0.0.49673" elif [[ 
$distro == "AlmaLinux 8.6" ]] then MKL_VERSION="2022.1.0" elif [[ $distro == "AlmaLinux 8.7" ]] then - MKL_VERSION="2023.2.0" + MKL_VERSION="2024.0.0.49673" else MKL_VERSION="2023.1.0" fi From b99c80670ee7e6603eb193893c80d980db11d461 Mon Sep 17 00:00:00 2001 From: KimPhillips128 Date: Fri, 22 Mar 2024 10:09:05 -0700 Subject: [PATCH 25/76] update run-tests fixnng mkl version location --- tests/run-tests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index ae337fd9..f439369f 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -200,13 +200,13 @@ then MKL_VERSION="2023.1.0" elif [[ $distro == "Ubuntu"* ]] then - MKL_VERSION="2024.0.0.49673" + MKL_VERSION="2024.0" elif [[ $distro == "AlmaLinux 8.6" ]] then MKL_VERSION="2022.1.0" elif [[ $distro == "AlmaLinux 8.7" ]] then - MKL_VERSION="2024.0.0.49673" + MKL_VERSION="2024.0" else MKL_VERSION="2023.1.0" fi From 24005bc3e39e0462ac633046bded7f4715c53d19 Mon Sep 17 00:00:00 2001 From: KimPhillips128 Date: Fri, 22 Mar 2024 12:49:17 -0700 Subject: [PATCH 26/76] fix impi2021 path reference in run_tests --- tests/run-tests.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index f439369f..fafcfb3e 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -527,12 +527,12 @@ fi # impi 2021 if [ $CHECK_IMPI_2021 -eq 1 ] then - check_exists "${MODULE_FILES_ROOT}/mpi/impi-2021" + check_exists "${MODULE_FILES_ROOT}/etc/modulefiles/mpi" - module load mpi/impi-2021 + module load mpi/2021.11 mpiexec -np 2 -ppn 2 -env FI_PROVIDER=mlx -env I_MPI_SHM=0 ${IMPI2021_PATH}/bin/IMB-MPI1 pingpong check_exit_code "Intel MPI 2021" "Failed to run Intel MPI 2021" - module unload mpi/impi-2021 + module unload mpi/2021.11 fi # impi 2018 From 77c0cdfa733c7941f445e42c37a39184123ec98f Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Fri, 22 Mar 2024 14:13:28 -0700 Subject: [PATCH 27/76] delete old distros --- 
alma/alma-8.x/alma-8.6-hpc/README.md | 41 --- alma/alma-8.x/alma-8.6-hpc/hpc-tuning.sh | 4 - alma/alma-8.x/alma-8.6-hpc/install.sh | 72 ------ .../alma-8.x/alma-8.6-hpc/install_amd_libs.sh | 3 - alma/alma-8.x/alma-8.6-hpc/install_dcgm.sh | 29 --- alma/alma-8.x/alma-8.6-hpc/install_docker.sh | 65 ----- alma/alma-8.x/alma-8.6-hpc/install_gcc.sh | 7 - .../alma-8.6-hpc/install_intel_libs.sh | 3 - .../alma-8.6-hpc/install_mellanoxofed.sh | 28 -- alma/alma-8.x/alma-8.6-hpc/install_mpis.sh | 136 ---------- alma/alma-8.x/alma-8.6-hpc/install_nccl.sh | 3 - .../alma-8.6-hpc/install_nvidiagpudriver.sh | 3 - alma/alma-8.x/alma-8.6-hpc/install_utils.sh | 13 - alma/alma-8.x/alma-8.6-hpc/set_properties.sh | 7 - centos/centos-7.x/centos-7.6-hpc/README.md | 25 -- .../centos-7.x/centos-7.6-hpc/hpc-tuning.sh | 4 - centos/centos-7.x/centos-7.6-hpc/install.sh | 47 ---- .../centos-7.6-hpc/install_amd_libs.sh | 3 - .../centos-7.x/centos-7.6-hpc/install_gcc.sh | 8 - .../centos-7.6-hpc/install_intel_libs.sh | 4 - .../centos-7.6-hpc/install_lustre_client.sh | 4 - .../centos-7.6-hpc/install_mellanoxofed.sh | 25 -- .../centos-7.x/centos-7.6-hpc/install_mpis.sh | 50 ---- .../centos-7.6-hpc/install_nvidiagpudriver.sh | 13 - .../centos-7.6-hpc/install_utils.sh | 4 - .../centos-7.6-hpc/set_properties.sh | 5 - centos/centos-7.x/centos-7.7-hpc/README.md | 25 -- .../centos-7.x/centos-7.7-hpc/hpc-tuning.sh | 4 - centos/centos-7.x/centos-7.7-hpc/install.sh | 47 ---- .../centos-7.7-hpc/install_amd_libs.sh | 3 - .../centos-7.x/centos-7.7-hpc/install_gcc.sh | 7 - .../centos-7.7-hpc/install_intel_libs.sh | 4 - .../centos-7.7-hpc/install_lustre_client.sh | 4 - .../centos-7.7-hpc/install_mellanoxofed.sh | 25 -- .../centos-7.x/centos-7.7-hpc/install_mpis.sh | 50 ---- .../centos-7.7-hpc/install_nvidiagpudriver.sh | 13 - .../centos-7.7-hpc/install_utils.sh | 4 - .../centos-7.7-hpc/set_properties.sh | 5 - centos/centos-7.x/centos-7.8-hpc/README.md | 25 -- .../centos-7.x/centos-7.8-hpc/hpc-tuning.sh | 4 - 
centos/centos-7.x/centos-7.8-hpc/install.sh | 47 ---- .../centos-7.8-hpc/install_amd_libs.sh | 3 - .../centos-7.x/centos-7.8-hpc/install_gcc.sh | 7 - .../centos-7.8-hpc/install_intel_libs.sh | 4 - .../centos-7.8-hpc/install_lustre_client.sh | 4 - .../centos-7.8-hpc/install_mellanoxofed.sh | 25 -- .../centos-7.x/centos-7.8-hpc/install_mpis.sh | 50 ---- .../centos-7.8-hpc/install_nvidiagpudriver.sh | 13 - .../centos-7.8-hpc/install_utils.sh | 4 - .../centos-7.8-hpc/set_properties.sh | 5 - centos/centos-7.x/centos-7.9-hpc/README.md | 35 --- .../centos-7.9-hpc/disable_cloudinit.sh | 12 - .../centos-7.x/centos-7.9-hpc/hpc-tuning.sh | 4 - centos/centos-7.x/centos-7.9-hpc/install.sh | 70 ----- .../centos-7.9-hpc/install_amd_libs.sh | 3 - .../centos-7.x/centos-7.9-hpc/install_dcgm.sh | 3 - .../centos-7.x/centos-7.9-hpc/install_gcc.sh | 7 - .../centos-7.9-hpc/install_intel_libs.sh | 4 - .../centos-7.9-hpc/install_mellanoxofed.sh | 25 -- .../centos-7.x/centos-7.9-hpc/install_mpis.sh | 50 ---- .../centos-7.x/centos-7.9-hpc/install_nccl.sh | 3 - .../centos-7.9-hpc/install_nvidiagpudriver.sh | 3 - .../centos-7.9-hpc/install_utils.sh | 13 - .../centos-7.9-hpc/set_properties.sh | 5 - centos/centos-7.x/common/hpc-tuning.sh | 7 - centos/centos-7.x/common/install_amd_libs.sh | 33 --- centos/centos-7.x/common/install_dcgm.sh | 28 -- centos/centos-7.x/common/install_docker.sh | 52 ---- .../common/install_lustre_client.sh | 37 --- centos/centos-7.x/common/install_mpis.sh | 46 ---- .../common/install_nvidiagpudriver.sh | 53 ---- centos/centos-7.x/common/install_utils.sh | 69 ----- centos/centos-8.x/centos-8.1-hpc/README.md | 25 -- .../centos-8.x/centos-8.1-hpc/hpc-tuning.sh | 4 - centos/centos-8.x/centos-8.1-hpc/install.sh | 44 ---- .../centos-8.1-hpc/install_amd_libs.sh | 3 - .../centos-8.x/centos-8.1-hpc/install_gcc.sh | 7 - .../centos-8.1-hpc/install_intel_libs.sh | 4 - .../centos-8.1-hpc/install_mellanoxofed.sh | 31 --- .../centos-8.x/centos-8.1-hpc/install_mpis.sh | 49 ---- 
.../centos-8.1-hpc/install_nvidiagpudriver.sh | 16 -- .../centos-8.1-hpc/install_utils.sh | 4 - .../centos-8.1-hpc/set_properties.sh | 5 - centos/centos-8.x/centos-8.3-hpc/README.md | 25 -- .../centos-8.x/centos-8.3-hpc/hpc-tuning.sh | 4 - centos/centos-8.x/centos-8.3-hpc/install.sh | 44 ---- .../centos-8.3-hpc/install_amd_libs.sh | 3 - .../centos-8.x/centos-8.3-hpc/install_gcc.sh | 7 - .../centos-8.3-hpc/install_intel_libs.sh | 4 - .../centos-8.3-hpc/install_mellanoxofed.sh | 15 -- .../centos-8.x/centos-8.3-hpc/install_mpis.sh | 47 ---- .../centos-8.3-hpc/install_utils.sh | 4 - .../centos-8.3-hpc/set_properties.sh | 5 - centos/centos-8.x/common/install_amd_libs.sh | 33 --- centos/centos-8.x/common/install_utils.sh | 50 ---- centos/common/add-udev-rules.sh | 33 --- centos/common/hpc-tuning.sh | 36 --- centos/common/install_mpis.sh | 105 -------- centos/common/install_nccl.sh | 52 ---- centos/common/network-config.sh | 11 - common/clear_history.sh | 13 +- common/extract_distro.sh | 11 +- suse/sle-hpc-15.x/common/hpc-tuning.sh | 52 ---- suse/sle-hpc-15.x/common/install_amd_libs.sh | 48 ---- suse/sle-hpc-15.x/common/install_dcgm.sh | 37 --- suse/sle-hpc-15.x/common/install_docker.sh | 23 -- suse/sle-hpc-15.x/common/install_gcc.sh | 9 - .../sle-hpc-15.x/common/install_intel_libs.sh | 18 -- .../common/install_mellanoxofed.sh | 35 --- suse/sle-hpc-15.x/common/install_mpis.sh | 123 --------- suse/sle-hpc-15.x/common/install_nccl.sh | 70 ----- .../common/install_nvidiagpudriver.sh | 43 ---- suse/sle-hpc-15.x/common/install_utils.sh | 117 --------- suse/sle-hpc-15.x/sle-hpc-15-sp4/README.md | 62 ----- suse/sle-hpc-15.x/sle-hpc-15-sp4/config | 120 --------- suse/sle-hpc-15.x/sle-hpc-15-sp4/fixes.sh | 4 - suse/sle-hpc-15.x/sle-hpc-15-sp4/install.sh | 70 ----- .../sle-hpc-15-sp4/set_properties.sh | 5 - tests/run-tests.sh | 241 +----------------- ubuntu/ubuntu-18.x/common/install_mpis.sh | 123 --------- .../ubuntu-18.04-LTS-hpc/README.md | 42 --- 
.../ubuntu-18.04-LTS-hpc/install.sh | 52 ---- .../install_mellanoxofed.sh | 16 -- .../ubuntu-18.04-LTS-hpc/install_mpis.sh | 122 --------- .../ubuntu-18.04-LTS-hpc/install_nccl.sh | 5 - .../install_nvidiagpudriver.sh | 4 - .../ubuntu-18.04-LTS-hpc/install_utils.sh | 16 -- .../ubuntu-18.04-LTS-hpc/set_properties.sh | 7 - ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/README.md | 53 ---- .../ubuntu-18.x/ubuntu-18.04-hpc/install.sh | 63 ----- .../ubuntu-18.04-hpc/install_mellanoxofed.sh | 20 -- .../ubuntu-18.04-hpc/install_mpis.sh | 5 - .../install_nvidiagpudriver.sh | 29 --- .../ubuntu-18.04-hpc/install_utils.sh | 39 --- .../ubuntu-18.04-hpc/set_properties.sh | 7 - 135 files changed, 9 insertions(+), 3915 deletions(-) delete mode 100644 alma/alma-8.x/alma-8.6-hpc/README.md delete mode 100755 alma/alma-8.x/alma-8.6-hpc/hpc-tuning.sh delete mode 100755 alma/alma-8.x/alma-8.6-hpc/install.sh delete mode 100755 alma/alma-8.x/alma-8.6-hpc/install_amd_libs.sh delete mode 100755 alma/alma-8.x/alma-8.6-hpc/install_dcgm.sh delete mode 100644 alma/alma-8.x/alma-8.6-hpc/install_docker.sh delete mode 100755 alma/alma-8.x/alma-8.6-hpc/install_gcc.sh delete mode 100755 alma/alma-8.x/alma-8.6-hpc/install_intel_libs.sh delete mode 100755 alma/alma-8.x/alma-8.6-hpc/install_mellanoxofed.sh delete mode 100755 alma/alma-8.x/alma-8.6-hpc/install_mpis.sh delete mode 100755 alma/alma-8.x/alma-8.6-hpc/install_nccl.sh delete mode 100755 alma/alma-8.x/alma-8.6-hpc/install_nvidiagpudriver.sh delete mode 100755 alma/alma-8.x/alma-8.6-hpc/install_utils.sh delete mode 100755 alma/alma-8.x/alma-8.6-hpc/set_properties.sh delete mode 100644 centos/centos-7.x/centos-7.6-hpc/README.md delete mode 100755 centos/centos-7.x/centos-7.6-hpc/hpc-tuning.sh delete mode 100755 centos/centos-7.x/centos-7.6-hpc/install.sh delete mode 100755 centos/centos-7.x/centos-7.6-hpc/install_amd_libs.sh delete mode 100755 centos/centos-7.x/centos-7.6-hpc/install_gcc.sh delete mode 100755 
centos/centos-7.x/centos-7.6-hpc/install_intel_libs.sh delete mode 100755 centos/centos-7.x/centos-7.6-hpc/install_lustre_client.sh delete mode 100755 centos/centos-7.x/centos-7.6-hpc/install_mellanoxofed.sh delete mode 100755 centos/centos-7.x/centos-7.6-hpc/install_mpis.sh delete mode 100755 centos/centos-7.x/centos-7.6-hpc/install_nvidiagpudriver.sh delete mode 100755 centos/centos-7.x/centos-7.6-hpc/install_utils.sh delete mode 100755 centos/centos-7.x/centos-7.6-hpc/set_properties.sh delete mode 100644 centos/centos-7.x/centos-7.7-hpc/README.md delete mode 100755 centos/centos-7.x/centos-7.7-hpc/hpc-tuning.sh delete mode 100755 centos/centos-7.x/centos-7.7-hpc/install.sh delete mode 100755 centos/centos-7.x/centos-7.7-hpc/install_amd_libs.sh delete mode 100755 centos/centos-7.x/centos-7.7-hpc/install_gcc.sh delete mode 100755 centos/centos-7.x/centos-7.7-hpc/install_intel_libs.sh delete mode 100755 centos/centos-7.x/centos-7.7-hpc/install_lustre_client.sh delete mode 100755 centos/centos-7.x/centos-7.7-hpc/install_mellanoxofed.sh delete mode 100755 centos/centos-7.x/centos-7.7-hpc/install_mpis.sh delete mode 100755 centos/centos-7.x/centos-7.7-hpc/install_nvidiagpudriver.sh delete mode 100755 centos/centos-7.x/centos-7.7-hpc/install_utils.sh delete mode 100755 centos/centos-7.x/centos-7.7-hpc/set_properties.sh delete mode 100644 centos/centos-7.x/centos-7.8-hpc/README.md delete mode 100755 centos/centos-7.x/centos-7.8-hpc/hpc-tuning.sh delete mode 100755 centos/centos-7.x/centos-7.8-hpc/install.sh delete mode 100755 centos/centos-7.x/centos-7.8-hpc/install_amd_libs.sh delete mode 100755 centos/centos-7.x/centos-7.8-hpc/install_gcc.sh delete mode 100755 centos/centos-7.x/centos-7.8-hpc/install_intel_libs.sh delete mode 100755 centos/centos-7.x/centos-7.8-hpc/install_lustre_client.sh delete mode 100755 centos/centos-7.x/centos-7.8-hpc/install_mellanoxofed.sh delete mode 100755 centos/centos-7.x/centos-7.8-hpc/install_mpis.sh delete mode 100755 
centos/centos-7.x/centos-7.8-hpc/install_nvidiagpudriver.sh delete mode 100755 centos/centos-7.x/centos-7.8-hpc/install_utils.sh delete mode 100755 centos/centos-7.x/centos-7.8-hpc/set_properties.sh delete mode 100644 centos/centos-7.x/centos-7.9-hpc/README.md delete mode 100755 centos/centos-7.x/centos-7.9-hpc/disable_cloudinit.sh delete mode 100755 centos/centos-7.x/centos-7.9-hpc/hpc-tuning.sh delete mode 100755 centos/centos-7.x/centos-7.9-hpc/install.sh delete mode 100755 centos/centos-7.x/centos-7.9-hpc/install_amd_libs.sh delete mode 100755 centos/centos-7.x/centos-7.9-hpc/install_dcgm.sh delete mode 100755 centos/centos-7.x/centos-7.9-hpc/install_gcc.sh delete mode 100755 centos/centos-7.x/centos-7.9-hpc/install_intel_libs.sh delete mode 100755 centos/centos-7.x/centos-7.9-hpc/install_mellanoxofed.sh delete mode 100755 centos/centos-7.x/centos-7.9-hpc/install_mpis.sh delete mode 100755 centos/centos-7.x/centos-7.9-hpc/install_nccl.sh delete mode 100755 centos/centos-7.x/centos-7.9-hpc/install_nvidiagpudriver.sh delete mode 100755 centos/centos-7.x/centos-7.9-hpc/install_utils.sh delete mode 100755 centos/centos-7.x/centos-7.9-hpc/set_properties.sh delete mode 100755 centos/centos-7.x/common/hpc-tuning.sh delete mode 100755 centos/centos-7.x/common/install_amd_libs.sh delete mode 100755 centos/centos-7.x/common/install_dcgm.sh delete mode 100755 centos/centos-7.x/common/install_docker.sh delete mode 100755 centos/centos-7.x/common/install_lustre_client.sh delete mode 100755 centos/centos-7.x/common/install_mpis.sh delete mode 100755 centos/centos-7.x/common/install_nvidiagpudriver.sh delete mode 100755 centos/centos-7.x/common/install_utils.sh delete mode 100644 centos/centos-8.x/centos-8.1-hpc/README.md delete mode 100755 centos/centos-8.x/centos-8.1-hpc/hpc-tuning.sh delete mode 100755 centos/centos-8.x/centos-8.1-hpc/install.sh delete mode 100755 centos/centos-8.x/centos-8.1-hpc/install_amd_libs.sh delete mode 100755 
centos/centos-8.x/centos-8.1-hpc/install_gcc.sh delete mode 100755 centos/centos-8.x/centos-8.1-hpc/install_intel_libs.sh delete mode 100755 centos/centos-8.x/centos-8.1-hpc/install_mellanoxofed.sh delete mode 100755 centos/centos-8.x/centos-8.1-hpc/install_mpis.sh delete mode 100755 centos/centos-8.x/centos-8.1-hpc/install_nvidiagpudriver.sh delete mode 100755 centos/centos-8.x/centos-8.1-hpc/install_utils.sh delete mode 100755 centos/centos-8.x/centos-8.1-hpc/set_properties.sh delete mode 100644 centos/centos-8.x/centos-8.3-hpc/README.md delete mode 100755 centos/centos-8.x/centos-8.3-hpc/hpc-tuning.sh delete mode 100755 centos/centos-8.x/centos-8.3-hpc/install.sh delete mode 100755 centos/centos-8.x/centos-8.3-hpc/install_amd_libs.sh delete mode 100755 centos/centos-8.x/centos-8.3-hpc/install_gcc.sh delete mode 100755 centos/centos-8.x/centos-8.3-hpc/install_intel_libs.sh delete mode 100755 centos/centos-8.x/centos-8.3-hpc/install_mellanoxofed.sh delete mode 100755 centos/centos-8.x/centos-8.3-hpc/install_mpis.sh delete mode 100755 centos/centos-8.x/centos-8.3-hpc/install_utils.sh delete mode 100755 centos/centos-8.x/centos-8.3-hpc/set_properties.sh delete mode 100755 centos/centos-8.x/common/install_amd_libs.sh delete mode 100755 centos/centos-8.x/common/install_utils.sh delete mode 100755 centos/common/add-udev-rules.sh delete mode 100755 centos/common/hpc-tuning.sh delete mode 100755 centos/common/install_mpis.sh delete mode 100755 centos/common/install_nccl.sh delete mode 100755 centos/common/network-config.sh delete mode 100755 suse/sle-hpc-15.x/common/hpc-tuning.sh delete mode 100755 suse/sle-hpc-15.x/common/install_amd_libs.sh delete mode 100755 suse/sle-hpc-15.x/common/install_dcgm.sh delete mode 100755 suse/sle-hpc-15.x/common/install_docker.sh delete mode 100755 suse/sle-hpc-15.x/common/install_gcc.sh delete mode 100755 suse/sle-hpc-15.x/common/install_intel_libs.sh delete mode 100755 suse/sle-hpc-15.x/common/install_mellanoxofed.sh delete mode 100755 
suse/sle-hpc-15.x/common/install_mpis.sh delete mode 100755 suse/sle-hpc-15.x/common/install_nccl.sh delete mode 100755 suse/sle-hpc-15.x/common/install_nvidiagpudriver.sh delete mode 100755 suse/sle-hpc-15.x/common/install_utils.sh delete mode 100755 suse/sle-hpc-15.x/sle-hpc-15-sp4/README.md delete mode 100644 suse/sle-hpc-15.x/sle-hpc-15-sp4/config delete mode 100755 suse/sle-hpc-15.x/sle-hpc-15-sp4/fixes.sh delete mode 100755 suse/sle-hpc-15.x/sle-hpc-15-sp4/install.sh delete mode 100755 suse/sle-hpc-15.x/sle-hpc-15-sp4/set_properties.sh delete mode 100755 ubuntu/ubuntu-18.x/common/install_mpis.sh delete mode 100644 ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/README.md delete mode 100755 ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install.sh delete mode 100755 ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_mellanoxofed.sh delete mode 100755 ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_mpis.sh delete mode 100755 ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_nccl.sh delete mode 100755 ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_nvidiagpudriver.sh delete mode 100755 ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_utils.sh delete mode 100755 ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/set_properties.sh delete mode 100644 ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/README.md delete mode 100755 ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install.sh delete mode 100755 ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_mellanoxofed.sh delete mode 100755 ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_mpis.sh delete mode 100755 ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_nvidiagpudriver.sh delete mode 100755 ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_utils.sh delete mode 100755 ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/set_properties.sh diff --git a/alma/alma-8.x/alma-8.6-hpc/README.md b/alma/alma-8.x/alma-8.6-hpc/README.md deleted file mode 100644 index 4856f3df..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# AlmaLinux 8.6 HPC Image - -The AlmaLInux 8.6 HPC Image 
includes optimizations and recommended configurations to deliver optimal performance, -consistency, and reliability. This image consists of the following HPC tools and libraries: - -- Mellanox OFED -- Pre-configured IPoIB (IP-over-InfiniBand) -- Popular InfiniBand based MPI Libraries - - HPC-X - - IntelMPI - - MVAPICH2 - - OpenMPI -- Communication Runtimes - - Libfabric - - OpenUCX -- Optimized librares - - AMD Blis - - AMD FFTW - - AMD Flame - - Intel MKL -- GPU Drivers - - Nvidia GPU Driver -- NCCL - - NCCL RDMA Sharp Plugin - - NCCL Tests -- NV Peer Memory (GPU Direct RDMA) -- GDRCopy -- Data Center GPU Manager -- Azure HPC Diagnostics Tool -- Moby -- NVIDIA-Docker -- Moneo (Distributed HPC/AI system monitor) - -Software packages are configured as environment modules. Users can select preferred MPI or software packages as follows: - -`module load ` - -## Azure Managed Lustre -Users that wish to use [Azure Managed Lustre Filesystem](https://learn.microsoft.com/en-us/azure/azure-managed-lustre/amlfs-overview) offering on virtual machine images with the following Azure Marketplace URN: `almalinux:almalinux-hpc:8_6-hpc-gen2:xxxxx` will need to install the amlfs client.
- -Please refer to the [AlmaLinux HPC 8.6 installation instructions](https://learn.microsoft.com/en-us/azure/azure-managed-lustre/install-hpc-alma-86) for specific instructions. diff --git a/alma/alma-8.x/alma-8.6-hpc/hpc-tuning.sh b/alma/alma-8.x/alma-8.6-hpc/hpc-tuning.sh deleted file mode 100755 index a07165ca..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/hpc-tuning.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../common/hpc-tuning.sh - diff --git a/alma/alma-8.x/alma-8.6-hpc/install.sh b/alma/alma-8.x/alma-8.6-hpc/install.sh deleted file mode 100755 index d22d4505..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/install.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash -set -ex - -# set properties -source ./set_properties.sh - -# install utils -./install_utils.sh - -# install compilers -./install_gcc.sh - -# install mellanox ofed -./install_mellanoxofed.sh - -# install mpi libraries -./install_mpis.sh - -# install nvidia gpu driver -./install_nvidiagpudriver.sh - -# install AMD tuned libraries -./install_amd_libs.sh - -# install Intel libraries -./install_intel_libs.sh - -# Install NCCL -./install_nccl.sh - -# Install NVIDIA docker container -./install_docker.sh - -# Install DCGM -./install_dcgm.sh - -# optimizations -./hpc-tuning.sh - -# install persistent rdma naming -$COMMON_DIR/install_azure_persistent_rdma_naming.sh - -# add udev rule -$COMMON_DIR/../alma/common/add-udev-rules.sh - -# add interface rules -$COMMON_DIR/../alma/common/network-config.sh - -# install diagnostic script -$COMMON_DIR/install_hpcdiag.sh - -#install monitoring tools -$COMMON_DIR/../alma/common/install_monitoring_tools.sh - -# install AMD libs -$COMMON_DIR/../alma/common/install_amd_libs.sh - -# install Azure/NHC Health Checks -$COMMON_DIR/install_health_checks.sh - -# copy test file -$COMMON_DIR/copy_test_file.sh - -# disable cloud-init -$ALMA_COMMON_DIR/disable_cloudinit.sh - -# SKU Customization -$COMMON_DIR/setup_sku_customizations.sh - -# clear history -# Uncomment the line below if you 
are running this on a VM -# $COMMON_DIR/clear_history.sh diff --git a/alma/alma-8.x/alma-8.6-hpc/install_amd_libs.sh b/alma/alma-8.x/alma-8.6-hpc/install_amd_libs.sh deleted file mode 100755 index 3c53c5da..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/install_amd_libs.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/../alma/alma-8.x/common/install_amd_libs.sh diff --git a/alma/alma-8.x/alma-8.6-hpc/install_dcgm.sh b/alma/alma-8.x/alma-8.6-hpc/install_dcgm.sh deleted file mode 100755 index c2c1a7a8..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/install_dcgm.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -set -ex - -# Install DCGM -DCGM_VERSION=2.4.4 -DCGM_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/datacenter-gpu-manager-${DCGM_VERSION}-1-x86_64.rpm -$COMMON_DIR/download_and_verify.sh $DCGM_URL "1d8fbe97797fada8048a7832bfac4bc7d3ad661bb24163d21324965ae7e7817d" -rpm -i datacenter-gpu-manager-${DCGM_VERSION}-1-x86_64.rpm -sed -i "$ s/$/ datacenter-gpu-manager/" /etc/dnf/dnf.conf -rm -f datacenter-gpu-manager-${DCGM_VERSION}-1-x86_64.rpm -$COMMON_DIR/write_component_version.sh "DCGM" ${DCGM_VERSION} - -# Create service for dcgm to launch on bootup -bash -c "cat > /etc/systemd/system/dcgm.service" <<'EOF' -[Unit] -Description=DCGM service - -[Service] -User=root -PrivateTmp=false -ExecStart=/usr/bin/nv-hostengine -n -Restart=on-abort - -[Install] -WantedBy=multi-user.target -EOF - -systemctl enable dcgm -systemctl start dcgm diff --git a/alma/alma-8.x/alma-8.6-hpc/install_docker.sh b/alma/alma-8.x/alma-8.6-hpc/install_docker.sh deleted file mode 100644 index 992fcefc..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/install_docker.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -set -ex - -# Install Moby Engine + CLI -yum install -y moby-engine -yum install -y moby-cli - -# Install NVIDIA Docker -# Reference: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html -# Setting up NVIDIA Container Toolkit -case ${DISTRIBUTION} 
in - "almalinux8.6") distribution="rhel8.6" - ;; - "almalinux8.7") distribution="rhel8.7"; - ;; - *) ;; -esac - -curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | tee /etc/yum.repos.d/nvidia-docker.repo -# MIG Capability on A100 -# curl -s -L https://nvidia.github.io/nvidia-container-runtime/experimental/$distribution/nvidia-container-runtime.list | tee /etc/yum.repos.d/nvidia-container-runtime.list - -yum clean expire-cache -# Install nvidia-docker package -# Install NVIDIA container toolkit and mark NVIDIA packages on hold -yum install -y nvidia-container-toolkit - -# Install NVIDIA container runtime and mark NVIDIA packages on hold -yum install -y nvidia-container-runtime -# Mark the installed packages on hold to disable updates -sed -i "$ s/$/ *nvidia-container*/" /etc/dnf/dnf.conf - -wget https://raw.githubusercontent.com/NVIDIA/nvidia-docker/master/nvidia-docker -cp nvidia-docker /bin/ -chmod +x /bin/nvidia-docker -wget https://raw.githubusercontent.com/NVIDIA/nvidia-docker/master/daemon.json -cp daemon.json /etc/docker/ - -# Working setup can be tested by running a base CUDA container -# nvidia-docker run -e NVIDIA_VISIBLE_DEVICES=all nvidia/cuda:11.0-base nvidia-smi - -# disabling aufs, btrfs, zfs and devmapper snapshotter plugins -mkdir -p /etc/containerd -cat << EOF | tee -a /etc/containerd/config.toml -disabled_plugins = ["cri", "zfs", "aufs", "btrfs", "devmapper"] -EOF - -# restart containerd service -systemctl restart containerd - -# status of containerd snapshotter plugins -ctr plugin ls - -# enable and restart the docker daemon to complete the installation -systemctl enable docker -systemctl restart docker - -# Write the docker version to components file -docker_version=$(nvidia-docker --version | awk -F' ' '{print $3}') -$COMMON_DIR/write_component_version.sh "NVIDIA-DOCKER" ${docker_version::-1} - -# Clean repos -rm -rf /etc/yum.repos.d/nvidia-* -rm -rf /var/cache/yum/x86_64/8/nvidia-* -rm -rf 
/var/cache/yum/x86_64/8/libnvidia-container/ diff --git a/alma/alma-8.x/alma-8.6-hpc/install_gcc.sh b/alma/alma-8.x/alma-8.6-hpc/install_gcc.sh deleted file mode 100755 index dbc4bb01..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/install_gcc.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -MODULE_FILES_DIRECTORY=/usr/share/Modules/modulefiles - -mkdir -p ${MODULE_FILES_DIRECTORY} - -$COMMON_DIR/install_gcc-9.2.sh ${MODULE_FILES_DIRECTORY} diff --git a/alma/alma-8.x/alma-8.6-hpc/install_intel_libs.sh b/alma/alma-8.x/alma-8.6-hpc/install_intel_libs.sh deleted file mode 100755 index ab1a8b70..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/install_intel_libs.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -$ALMA_COMMON_DIR/install_intel_libs.sh diff --git a/alma/alma-8.x/alma-8.6-hpc/install_mellanoxofed.sh b/alma/alma-8.x/alma-8.6-hpc/install_mellanoxofed.sh deleted file mode 100755 index d8fd2a55..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/install_mellanoxofed.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -set -ex - -VERSION="5.8-1.0.1.1" -TARBALL="MLNX_OFED_LINUX-$VERSION-rhel8.6-x86_64.tgz" -MLNX_OFED_DOWNLOAD_URL=https://content.mellanox.com/ofed/MLNX_OFED-${VERSION}/$TARBALL -MOFED_FOLDER=$(basename ${MLNX_OFED_DOWNLOAD_URL} .tgz) - -$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL "f340317047cc815b0f41c3f3c52d2d83ddc7d81b29d9fff91abb859850f77945" -tar zxvf ${TARBALL} - -KERNEL=( $(rpm -q kernel | sed 's/kernel\-//g') ) -KERNEL=${KERNEL[-1]} -./${MOFED_FOLDER}/mlnxofedinstall --kernel $KERNEL --kernel-sources /usr/src/kernels/${KERNEL} --add-kernel-support --skip-repo --skip-unsupported-devices-check --without-fw-update --distro rhel8.6 - -# Issue: Module mlx5_ib belong to a kernel which is not a part of MLNX -# Resolution: set FORCE=1/ force-restart /etc/init.d/openibd -# This causes openibd to ignore the kernel difference but relies on weak-updates -# Restarting openibd -/etc/init.d/openibd force-restart -$COMMON_DIR/write_component_version.sh "MOFED" 
$VERSION - -# exclude opensm from updates -sed -i "$ s/$/ opensm*/" /etc/dnf/dnf.conf - -# cleanup downloaded files -rm -rf *.tgz -rm -rf -- */ diff --git a/alma/alma-8.x/alma-8.6-hpc/install_mpis.sh b/alma/alma-8.x/alma-8.6-hpc/install_mpis.sh deleted file mode 100755 index aa8e2514..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/install_mpis.sh +++ /dev/null @@ -1,136 +0,0 @@ -#!/bin/bash -set -ex - -# Load gcc -GCC_VERSION=gcc-9.2.0 -export PATH=/opt/${GCC_VERSION}/bin:$PATH -export LD_LIBRARY_PATH=/opt/${GCC_VERSION}/lib64:$LD_LIBRARY_PATH -set CC=/opt/${GCC_VERSION}/bin/gcc -set GCC=/opt/${GCC_VERSION}/bin/gcc - - -INSTALL_PREFIX=/opt - -# HPC-X v2.14 -HPCX_VERSION="v2.14" -TARBALL="hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-redhat8-cuda11-gdrcopy2-nccl2.16-x86_64.tbz" -HPCX_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/${TARBALL} -HPCX_FOLDER=$(basename ${HPCX_DOWNLOAD_URL} .tbz) - -$COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL "126d7dfd71a8e7095baea200c8be9ff9318ee41018fbef9ec6733a54023d6c60" -tar -xvf ${TARBALL} -mv ${HPCX_FOLDER} ${INSTALL_PREFIX} -HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} -$COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION - -# exclude ucx from updates -sed -i "$ s/$/ ucx*/" /etc/dnf/dnf.conf - -# Install platform independent MPIs -HCOLL_PATH=${HPCX_PATH}/hcoll -UCX_PATH=${HPCX_PATH}/ucx - -# MVAPICH2 2.3.7 -MV2_VERSION="2.3.7" -MV2_DOWNLOAD_URL=http://mvapich.cse.ohio-state.edu/download/mvapich/mv2/mvapich2-${MV2_VERSION}.tar.gz -$COMMON_DIR/download_and_verify.sh $MV2_DOWNLOAD_URL "c39a4492f4be50df6100785748ba2894e23ce450a94128181d516da5757751ae" -tar -xvf mvapich2-${MV2_VERSION}.tar.gz -cd mvapich2-${MV2_VERSION} -./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MV2_VERSION} --enable-g=none --enable-fast=yes && make -j$(nproc) && make install -cd .. 
-$COMMON_DIR/write_component_version.sh "MVAPICH2" ${MV2_VERSION} - -# OpenMPI 4.1.3 -OMPI_VERSION="4.1.3" -OMPI_DOWNLOAD_URL=https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz -$COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL "9c0fd1f78fc90ca9b69ae4ab704687d5544220005ccd7678bf58cc13135e67e0" -tar -xvf openmpi-${OMPI_VERSION}.tar.gz -cd openmpi-${OMPI_VERSION} -./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install -cd .. -$COMMON_DIR/write_component_version.sh "OMPI" ${OMPI_VERSION} - -# exclude openmpi, perftest from updates -sed -i "$ s/$/ openmpi perftest/" /etc/dnf/dnf.conf - -# Intel MPI 2021 (Update 7) -IMPI_2021_VERSION="2021.7.0" -IMPI_2021_DOWNLOAD_URL=https://registrationcenter-download.intel.com/akdlm/irc_nas/18926/l_mpi_oneapi_p_${IMPI_2021_VERSION}.8711_offline.sh -$COMMON_DIR/download_and_verify.sh $IMPI_2021_DOWNLOAD_URL "4eb1e1487b67b98857bc9b7b37bcac4998e0aa6d1b892b2c87b003bf84fb38e9" -bash l_mpi_oneapi_p_${IMPI_2021_VERSION}.8711_offline.sh -s -a -s --eula accept -mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/impi -$COMMON_DIR/write_component_version.sh "IMPI_2021" ${IMPI_2021_VERSION} - -# Setup module files for MPIs -mkdir -p /usr/share/Modules/modulefiles/mpi/ - -# HPC-X -cat << EOF >> /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} -#%Module 1.0 -# -# HPCx ${HPCX_VERSION} -# -conflict mpi -module load ${HPCX_PATH}/modulefiles/hpcx -EOF - -# MVAPICH2 -cat << EOF >> /usr/share/Modules/modulefiles/mpi/mvapich2-${MV2_VERSION} -#%Module 1.0 -# -# MVAPICH2 ${MV2_VERSION} -# -conflict mpi -module load ${GCC_VERSION} -prepend-path PATH /opt/mvapich2-${MV2_VERSION}/bin -prepend-path LD_LIBRARY_PATH /opt/mvapich2-${MV2_VERSION}/lib 
-prepend-path MANPATH /opt/mvapich2-${MV2_VERSION}/share/man -setenv MPI_BIN /opt/mvapich2-${MV2_VERSION}/bin -setenv MPI_INCLUDE /opt/mvapich2-${MV2_VERSION}/include -setenv MPI_LIB /opt/mvapich2-${MV2_VERSION}/lib -setenv MPI_MAN /opt/mvapich2-${MV2_VERSION}/share/man -setenv MPI_HOME /opt/mvapich2-${MV2_VERSION} -EOF - -# OpenMPI -cat << EOF >> /usr/share/Modules/modulefiles/mpi/openmpi-${OMPI_VERSION} -#%Module 1.0 -# -# OpenMPI ${OMPI_VERSION} -# -conflict mpi -module load ${GCC_VERSION} -prepend-path PATH /opt/openmpi-${OMPI_VERSION}/bin -prepend-path LD_LIBRARY_PATH /opt/openmpi-${OMPI_VERSION}/lib -prepend-path MANPATH /opt/openmpi-${OMPI_VERSION}/share/man -setenv MPI_BIN /opt/openmpi-${OMPI_VERSION}/bin -setenv MPI_INCLUDE /opt/openmpi-${OMPI_VERSION}/include -setenv MPI_LIB /opt/openmpi-${OMPI_VERSION}/lib -setenv MPI_MAN /opt/openmpi-${OMPI_VERSION}/share/man -setenv MPI_HOME /opt/openmpi-${OMPI_VERSION} -EOF - -#IntelMPI-v2021 -cat << EOF >> /usr/share/Modules/modulefiles/mpi/impi_${IMPI_2021_VERSION} -#%Module 1.0 -# -# Intel MPI ${IMPI_2021_VERSION} -# -conflict mpi -module load /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/impi -setenv MPI_BIN /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/bin -setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/include -setenv MPI_LIB /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/lib -setenv MPI_MAN /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/man -setenv MPI_HOME /opt/intel/oneapi/mpi/${IMPI_2021_VERSION} -EOF - -# Create symlinks for modulefiles -ln -s /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} /usr/share/Modules/modulefiles/mpi/hpcx -ln -s /usr/share/Modules/modulefiles/mpi/mvapich2-${MV2_VERSION} /usr/share/Modules/modulefiles/mpi/mvapich2 -ln -s /usr/share/Modules/modulefiles/mpi/openmpi-${OMPI_VERSION} /usr/share/Modules/modulefiles/mpi/openmpi -ln -s /usr/share/Modules/modulefiles/mpi/impi_${IMPI_2021_VERSION} /usr/share/Modules/modulefiles/mpi/impi-2021 - -# cleanup downloaded tarball 
for HPC-x -rm -rf *.tar.gz *offline.sh *.tbz -rm -rf -- */ diff --git a/alma/alma-8.x/alma-8.6-hpc/install_nccl.sh b/alma/alma-8.x/alma-8.6-hpc/install_nccl.sh deleted file mode 100755 index e441029b..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/install_nccl.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -$ALMA_COMMON_DIR/install_nccl.sh diff --git a/alma/alma-8.x/alma-8.6-hpc/install_nvidiagpudriver.sh b/alma/alma-8.x/alma-8.6-hpc/install_nvidiagpudriver.sh deleted file mode 100755 index 8ce261c5..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/install_nvidiagpudriver.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/../alma/alma-8.x/common/install_nvidiagpudriver.sh diff --git a/alma/alma-8.x/alma-8.6-hpc/install_utils.sh b/alma/alma-8.x/alma-8.6-hpc/install_utils.sh deleted file mode 100755 index 4292adaa..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/install_utils.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -ex - -# Setup microsoft packages repository for moby -# Download the repository configuration package -curl https://packages.microsoft.com/config/rhel/8/prod.repo > ./microsoft-prod.repo -# Copy the generated list to the sources.list.d directory -cp ./microsoft-prod.repo /etc/yum.repos.d/ - -yum repolist - -../common/install_utils.sh - diff --git a/alma/alma-8.x/alma-8.6-hpc/set_properties.sh b/alma/alma-8.x/alma-8.6-hpc/set_properties.sh deleted file mode 100755 index ea73c23a..00000000 --- a/alma/alma-8.x/alma-8.6-hpc/set_properties.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -export TOP_DIR=../../.. -export COMMON_DIR=../../../common -export ALMA_COMMON_DIR=../../common -export TEST_DIR=../../../tests -export DISTRIBUTION=$(. 
/etc/os-release;echo $ID$VERSION_ID) diff --git a/centos/centos-7.x/centos-7.6-hpc/README.md b/centos/centos-7.x/centos-7.6-hpc/README.md deleted file mode 100644 index ee678f40..00000000 --- a/centos/centos-7.x/centos-7.6-hpc/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# CentOS 7.6 HPC Image - -The CentOS 7.6 HPC Image includes optimizations and recommended configurations to deliver optimal performance, -consistency, and reliability. This image consists of the following HPC tools and libraries: - -- Mellanox OFED -- Pre-configured IPoIB (IP-over-InfiniBand) -- Popular InfiniBand based MPI Libraries - - HPC-X - - IntelMPI - - MVAPICH2 - - OpenMPI -- Communication Runtimes - - Libfabric - - OpenUCX -- Optimized librares - - AMD Blis - - AMD FFTW - - AMD Flame - - Intel MKL -- Azure HPC Diagnostics Tool - -Software packages are configured as environment modules. Users can select preferred MPI or software packages as follows: - -`module load ` diff --git a/centos/centos-7.x/centos-7.6-hpc/hpc-tuning.sh b/centos/centos-7.x/centos-7.6-hpc/hpc-tuning.sh deleted file mode 100755 index a07165ca..00000000 --- a/centos/centos-7.x/centos-7.6-hpc/hpc-tuning.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../common/hpc-tuning.sh - diff --git a/centos/centos-7.x/centos-7.6-hpc/install.sh b/centos/centos-7.x/centos-7.6-hpc/install.sh deleted file mode 100755 index 28d377c8..00000000 --- a/centos/centos-7.x/centos-7.6-hpc/install.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -ex - -# set properties -source ./set_properties.sh - -# install utils -./install_utils.sh - -# install compilers -./install_gcc.sh - -# install mellanox ofed -./install_mellanoxofed.sh - -# install mpi libraries -./install_mpis.sh - -# install nvidia gpu driver -#./install_nvidiagpudriver.sh - -# install AMD tuned libraries -./install_amd_libs.sh - -# install Intel libraries -./install_intel_libs.sh - -# add udev rule -$COMMON_DIR/../centos/common/add-udev-rules.sh - -# add interface rules 
-$COMMON_DIR/../centos/common/network-config.sh - -# optimizations -./hpc-tuning.sh - -# copy test file -$COMMON_DIR/copy_test_file.sh - -# lustre client -#./install_lustre_client.sh - -# install diagnostic script -$COMMON_DIR/install_hpcdiag.sh - -# install persistent rdma naming -$COMMON_DIR/install_azure_persistent_rdma_naming.sh diff --git a/centos/centos-7.x/centos-7.6-hpc/install_amd_libs.sh b/centos/centos-7.x/centos-7.6-hpc/install_amd_libs.sh deleted file mode 100755 index 11497c19..00000000 --- a/centos/centos-7.x/centos-7.6-hpc/install_amd_libs.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/../centos/centos-7.x/common/install_amd_libs.sh diff --git a/centos/centos-7.x/centos-7.6-hpc/install_gcc.sh b/centos/centos-7.x/centos-7.6-hpc/install_gcc.sh deleted file mode 100755 index abbe9056..00000000 --- a/centos/centos-7.x/centos-7.6-hpc/install_gcc.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -MODULE_FILES_DIRECTORY=/usr/share/Modules/modulefiles - -mkdir -p ${MODULE_FILES_DIRECTORY} - -$COMMON_DIR/install_gcc-9.2.sh ${MODULE_FILES_DIRECTORY} - diff --git a/centos/centos-7.x/centos-7.6-hpc/install_intel_libs.sh b/centos/centos-7.x/centos-7.6-hpc/install_intel_libs.sh deleted file mode 100755 index 2f69d765..00000000 --- a/centos/centos-7.x/centos-7.6-hpc/install_intel_libs.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/install_intel_libs.sh - diff --git a/centos/centos-7.x/centos-7.6-hpc/install_lustre_client.sh b/centos/centos-7.x/centos-7.6-hpc/install_lustre_client.sh deleted file mode 100755 index b224222b..00000000 --- a/centos/centos-7.x/centos-7.6-hpc/install_lustre_client.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../common/install_lustre_client.sh - diff --git a/centos/centos-7.x/centos-7.6-hpc/install_mellanoxofed.sh b/centos/centos-7.x/centos-7.6-hpc/install_mellanoxofed.sh deleted file mode 100755 index 2db821ce..00000000 --- a/centos/centos-7.x/centos-7.6-hpc/install_mellanoxofed.sh +++ /dev/null @@ -1,25 
+0,0 @@ -#!/bin/bash -set -ex - -VERSION="5.4-1.0.3.0" -$COMMON_DIR/write_component_version.sh "MOFED" $VERSION -TARBALL="MLNX_OFED_LINUX-$VERSION-rhel7.6-x86_64.tgz" -MLNX_OFED_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/${TARBALL} -MOFED_FOLDER=$(basename ${MLNX_OFED_DOWNLOAD_URL} .tgz) - -$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL "ae72826b3b9a57e4560f7f0a2975c0645eb2f31f47f1b3e2f388346e507f4c8b" -tar zxvf ${TARBALL} - -KERNEL=( $(rpm -q kernel | sed 's/kernel\-//g') ) -KERNEL=${KERNEL[-1]} -# Uncomment the lines below if you are running this on a VM -#RELEASE=( $(cat /etc/centos-release | awk '{print $4}') ) -#yum -y install http://olcentgbl.trafficmanager.net/centos/${RELEASE}/updates/x86_64/kernel-devel-${KERNEL}.rpm -yum install -y kernel-devel-${KERNEL} -./${MOFED_FOLDER}/mlnxofedinstall --kernel $KERNEL --kernel-sources /usr/src/kernels/${KERNEL} --add-kernel-support --skip-repo --skip-unsupported-devices-check --without-fw-update - -# Issue: Module mlx5_ib belong to a kernel which is not a part of MLNX -# Resolution: set FORCE=1/ force-restart /etc/init.d/openibd -# This causes openibd to ignore the kernel difference but relies on weak-updates -# Restarting openibd -/etc/init.d/openibd force-restart diff --git a/centos/centos-7.x/centos-7.6-hpc/install_mpis.sh b/centos/centos-7.x/centos-7.6-hpc/install_mpis.sh deleted file mode 100755 index 664d5c46..00000000 --- a/centos/centos-7.x/centos-7.6-hpc/install_mpis.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -set -ex - -# Load gcc -GCC_VERSION=gcc-9.2.0 -export PATH=/opt/${GCC_VERSION}/bin:$PATH -export LD_LIBRARY_PATH=/opt/${GCC_VERSION}/lib64:$LD_LIBRARY_PATH -set CC=/opt/${GCC_VERSION}/bin/gcc -set GCC=/opt/${GCC_VERSION}/bin/gcc - - -INSTALL_PREFIX=/opt - -# HPC-X v2.9.0 -MLNX_OFED_VERSION="5.4-1.0.3.0" -HPCX_VERSION="v2.9.0" -$COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION 
-TARBALL="hpcx-${HPCX_VERSION}-gcc9.2.0-MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-redhat7.6-x86_64.tbz" -HPCX_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/${TARBALL} -HPCX_FOLDER=$(basename ${HPCX_DOWNLOAD_URL} .tbz) - -$COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL "fb395b656d96d959ab833133163bd023c36b8f6d08808a78e4304c7e1e786f6e" -tar -xvf ${TARBALL} -mv ${HPCX_FOLDER} ${INSTALL_PREFIX} -HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} - -# Enable Sharpd -${HPCX_PATH}/sharp/sbin/sharp_daemons_setup.sh -s -d sharpd -systemctl enable sharpd -systemctl start sharpd - -# Setup module files for MPIs -mkdir -p /usr/share/Modules/modulefiles/mpi/ - -# HPC-X -cat << EOF >> /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} -#%Module 1.0 -# -# HPCx ${HPCX_VERSION} -# -conflict mpi -module load ${HPCX_PATH}/modulefiles/hpcx -EOF - -# Create symlinks for modulefiles -ln -s /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} /usr/share/Modules/modulefiles/mpi/hpcx - -# Install platform independent MPIs -../common/install_mpis.sh ${GCC_VERSION} ${HPCX_PATH} - diff --git a/centos/centos-7.x/centos-7.6-hpc/install_nvidiagpudriver.sh b/centos/centos-7.x/centos-7.6-hpc/install_nvidiagpudriver.sh deleted file mode 100755 index 27f42b3e..00000000 --- a/centos/centos-7.x/centos-7.6-hpc/install_nvidiagpudriver.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -# Install kernel libs, these should already be installed with Mellanox OFED installation -KERNEL=( $(rpm -q kernel | sed 's/kernel\-//g') ) -KERNEL=${KERNEL[-1]} -# Uncomment the lines below if you are running this on a VM -#RELEASE=( $(cat /etc/centos-release | awk '{print $4}') ) -#yum -y install http://olcentgbl.trafficmanager.net/centos/${RELEASE}/updates/x86_64/kernel-devel-${KERNEL}.rpm -#yum -y install http://olcentgbl.trafficmanager.net/centos/${RELEASE}/updates/x86_64/kernel-headers-${KERNEL}.rpm -yum install -y kernel-devel-${KERNEL} -yum install -y kernel-headers-${KERNEL} - 
-$COMMON_DIR/install_nvidiagpudriver.sh \ No newline at end of file diff --git a/centos/centos-7.x/centos-7.6-hpc/install_utils.sh b/centos/centos-7.x/centos-7.6-hpc/install_utils.sh deleted file mode 100755 index 268f487a..00000000 --- a/centos/centos-7.x/centos-7.6-hpc/install_utils.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../common/install_utils.sh - diff --git a/centos/centos-7.x/centos-7.6-hpc/set_properties.sh b/centos/centos-7.x/centos-7.6-hpc/set_properties.sh deleted file mode 100755 index 64cc71b8..00000000 --- a/centos/centos-7.x/centos-7.6-hpc/set_properties.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -export TOP_DIR=../../.. -export COMMON_DIR=../../../common -export TEST_DIR=../../../tests diff --git a/centos/centos-7.x/centos-7.7-hpc/README.md b/centos/centos-7.x/centos-7.7-hpc/README.md deleted file mode 100644 index 95461268..00000000 --- a/centos/centos-7.x/centos-7.7-hpc/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# CentOS 7.7 HPC Image - -The CentOS 7.7 HPC Image includes optimizations and recommended configurations to deliver optimal performance, -consistency, and reliability. This image consists of the following HPC tools and libraries: - -- Mellanox OFED -- Pre-configured IPoIB (IP-over-InfiniBand) -- Popular InfiniBand based MPI Libraries - - HPC-X - - IntelMPI - - MVAPICH2 - - OpenMPI -- Communication Runtimes - - Libfabric - - OpenUCX -- Optimized librares - - AMD Blis - - AMD FFTW - - AMD Flame - - Intel MKL -- Azure HPC Diagnostics Tool - -Software packages are configured as environment modules. 
Users can select preferred MPI or software packages as follows: - -`module load ` diff --git a/centos/centos-7.x/centos-7.7-hpc/hpc-tuning.sh b/centos/centos-7.x/centos-7.7-hpc/hpc-tuning.sh deleted file mode 100755 index a07165ca..00000000 --- a/centos/centos-7.x/centos-7.7-hpc/hpc-tuning.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../common/hpc-tuning.sh - diff --git a/centos/centos-7.x/centos-7.7-hpc/install.sh b/centos/centos-7.x/centos-7.7-hpc/install.sh deleted file mode 100755 index 28d377c8..00000000 --- a/centos/centos-7.x/centos-7.7-hpc/install.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -ex - -# set properties -source ./set_properties.sh - -# install utils -./install_utils.sh - -# install compilers -./install_gcc.sh - -# install mellanox ofed -./install_mellanoxofed.sh - -# install mpi libraries -./install_mpis.sh - -# install nvidia gpu driver -#./install_nvidiagpudriver.sh - -# install AMD tuned libraries -./install_amd_libs.sh - -# install Intel libraries -./install_intel_libs.sh - -# add udev rule -$COMMON_DIR/../centos/common/add-udev-rules.sh - -# add interface rules -$COMMON_DIR/../centos/common/network-config.sh - -# optimizations -./hpc-tuning.sh - -# copy test file -$COMMON_DIR/copy_test_file.sh - -# lustre client -#./install_lustre_client.sh - -# install diagnostic script -$COMMON_DIR/install_hpcdiag.sh - -# install persistent rdma naming -$COMMON_DIR/install_azure_persistent_rdma_naming.sh diff --git a/centos/centos-7.x/centos-7.7-hpc/install_amd_libs.sh b/centos/centos-7.x/centos-7.7-hpc/install_amd_libs.sh deleted file mode 100755 index 11497c19..00000000 --- a/centos/centos-7.x/centos-7.7-hpc/install_amd_libs.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/../centos/centos-7.x/common/install_amd_libs.sh diff --git a/centos/centos-7.x/centos-7.7-hpc/install_gcc.sh b/centos/centos-7.x/centos-7.7-hpc/install_gcc.sh deleted file mode 100755 index dbc4bb01..00000000 --- a/centos/centos-7.x/centos-7.7-hpc/install_gcc.sh 
+++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -MODULE_FILES_DIRECTORY=/usr/share/Modules/modulefiles - -mkdir -p ${MODULE_FILES_DIRECTORY} - -$COMMON_DIR/install_gcc-9.2.sh ${MODULE_FILES_DIRECTORY} diff --git a/centos/centos-7.x/centos-7.7-hpc/install_intel_libs.sh b/centos/centos-7.x/centos-7.7-hpc/install_intel_libs.sh deleted file mode 100755 index 2f69d765..00000000 --- a/centos/centos-7.x/centos-7.7-hpc/install_intel_libs.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/install_intel_libs.sh - diff --git a/centos/centos-7.x/centos-7.7-hpc/install_lustre_client.sh b/centos/centos-7.x/centos-7.7-hpc/install_lustre_client.sh deleted file mode 100755 index b224222b..00000000 --- a/centos/centos-7.x/centos-7.7-hpc/install_lustre_client.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../common/install_lustre_client.sh - diff --git a/centos/centos-7.x/centos-7.7-hpc/install_mellanoxofed.sh b/centos/centos-7.x/centos-7.7-hpc/install_mellanoxofed.sh deleted file mode 100755 index 6d5879b7..00000000 --- a/centos/centos-7.x/centos-7.7-hpc/install_mellanoxofed.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -set -ex - -VERSION="5.4-1.0.3.0" -$COMMON_DIR/write_component_version.sh "MOFED" ${VERSION} -TARBALL="MLNX_OFED_LINUX-${VERSION}-rhel7.7-x86_64.tgz" -MLNX_OFED_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/$TARBALL -MOFED_FOLDER=$(basename ${MLNX_OFED_DOWNLOAD_URL} .tgz) - -$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL "e12e7d1606c8ad875c61be261d45e7cd11b0e15258ca29f7a6830077c57c9792" -tar zxvf ${TARBALL} - -KERNEL=( $(rpm -q kernel | sed 's/kernel\-//g') ) -KERNEL=${KERNEL[-1]} -# Uncomment the lines below if you are running this on a VM -#RELEASE=( $(cat /etc/centos-release | awk '{print $4}') ) -#yum -y install http://olcentgbl.trafficmanager.net/centos/${RELEASE}/updates/x86_64/kernel-devel-${KERNEL}.rpm -yum install -y kernel-devel-${KERNEL} -./${MOFED_FOLDER}/mlnxofedinstall --kernel $KERNEL --kernel-sources 
/usr/src/kernels/${KERNEL} --add-kernel-support --skip-repo --skip-unsupported-devices-check --without-fw-update - -# Issue: Module mlx5_ib belong to a kernel which is not a part of MLNX -# Resolution: set FORCE=1/ force-restart /etc/init.d/openibd -# This causes openibd to ignore the kernel difference but relies on weak-updates -# Restarting openibd -/etc/init.d/openibd force-restart diff --git a/centos/centos-7.x/centos-7.7-hpc/install_mpis.sh b/centos/centos-7.x/centos-7.7-hpc/install_mpis.sh deleted file mode 100755 index 7026c7f6..00000000 --- a/centos/centos-7.x/centos-7.7-hpc/install_mpis.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -set -ex - -# Load gcc -GCC_VERSION=gcc-9.2.0 -export PATH=/opt/${GCC_VERSION}/bin:$PATH -export LD_LIBRARY_PATH=/opt/${GCC_VERSION}/lib64:$LD_LIBRARY_PATH -set CC=/opt/${GCC_VERSION}/bin/gcc -set GCC=/opt/${GCC_VERSION}/bin/gcc - - -INSTALL_PREFIX=/opt - -# HPC-X v2.9.0 -MLNX_OFED_VERSION="5.4-1.0.3.0" -HPCX_VERSION="v2.9.0" -$COMMON_DIR/write_component_version.sh "HPCX" ${HPCX_VERSION} -TARBALL="hpcx-${HPCX_VERSION}-gcc9.2.0-MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-redhat7.7-x86_64.tbz" -HPCX_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/${TARBALL} -HPCX_FOLDER=$(basename ${HPCX_DOWNLOAD_URL} .tbz) - -$COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL "e8ed81cfb48d65d2111eab5108206b161b626f711336e084c3312297bef118bb" -tar -xvf ${TARBALL} -mv ${HPCX_FOLDER} ${INSTALL_PREFIX} -HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} - -# Enable Sharpd -${HPCX_PATH}/sharp/sbin/sharp_daemons_setup.sh -s -d sharpd -systemctl enable sharpd -systemctl start sharpd - -# Setup module files for MPIs -mkdir -p /usr/share/Modules/modulefiles/mpi/ - -# HPC-X -cat << EOF >> /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} -#%Module 1.0 -# -# HPCx ${HPCX_VERSION} -# -conflict mpi -module load ${HPCX_PATH}/modulefiles/hpcx -EOF - -# Create symlinks for modulefiles -ln -s 
/usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} /usr/share/Modules/modulefiles/mpi/hpcx - -# Install platform independent MPIs -../common/install_mpis.sh ${GCC_VERSION} ${HPCX_PATH} - diff --git a/centos/centos-7.x/centos-7.7-hpc/install_nvidiagpudriver.sh b/centos/centos-7.x/centos-7.7-hpc/install_nvidiagpudriver.sh deleted file mode 100755 index 27f42b3e..00000000 --- a/centos/centos-7.x/centos-7.7-hpc/install_nvidiagpudriver.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -# Install kernel libs, these should already be installed with Mellanox OFED installation -KERNEL=( $(rpm -q kernel | sed 's/kernel\-//g') ) -KERNEL=${KERNEL[-1]} -# Uncomment the lines below if you are running this on a VM -#RELEASE=( $(cat /etc/centos-release | awk '{print $4}') ) -#yum -y install http://olcentgbl.trafficmanager.net/centos/${RELEASE}/updates/x86_64/kernel-devel-${KERNEL}.rpm -#yum -y install http://olcentgbl.trafficmanager.net/centos/${RELEASE}/updates/x86_64/kernel-headers-${KERNEL}.rpm -yum install -y kernel-devel-${KERNEL} -yum install -y kernel-headers-${KERNEL} - -$COMMON_DIR/install_nvidiagpudriver.sh \ No newline at end of file diff --git a/centos/centos-7.x/centos-7.7-hpc/install_utils.sh b/centos/centos-7.x/centos-7.7-hpc/install_utils.sh deleted file mode 100755 index 268f487a..00000000 --- a/centos/centos-7.x/centos-7.7-hpc/install_utils.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../common/install_utils.sh - diff --git a/centos/centos-7.x/centos-7.7-hpc/set_properties.sh b/centos/centos-7.x/centos-7.7-hpc/set_properties.sh deleted file mode 100755 index 64cc71b8..00000000 --- a/centos/centos-7.x/centos-7.7-hpc/set_properties.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -export TOP_DIR=../../.. 
-export COMMON_DIR=../../../common -export TEST_DIR=../../../tests diff --git a/centos/centos-7.x/centos-7.8-hpc/README.md b/centos/centos-7.x/centos-7.8-hpc/README.md deleted file mode 100644 index b9380fbb..00000000 --- a/centos/centos-7.x/centos-7.8-hpc/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# CentOS 7.8 HPC Image - -The CentOS 7.8 HPC Image includes optimizations and recommended configurations to deliver optimal performance, -consistency, and reliability. This image consists of the following HPC tools and libraries: - -- Mellanox OFED -- Pre-configured IPoIB (IP-over-InfiniBand) -- Popular InfiniBand based MPI Libraries - - HPC-X - - IntelMPI - - MVAPICH2 - - OpenMPI -- Communication Runtimes - - Libfabric - - OpenUCX -- Optimized librares - - AMD Blis - - AMD FFTW - - AMD Flame - - Intel MKL -- Azure HPC Diagnostics Tool - -Software packages are configured as environment modules. Users can select preferred MPI or software packages as follows: - -`module load ` diff --git a/centos/centos-7.x/centos-7.8-hpc/hpc-tuning.sh b/centos/centos-7.x/centos-7.8-hpc/hpc-tuning.sh deleted file mode 100755 index a07165ca..00000000 --- a/centos/centos-7.x/centos-7.8-hpc/hpc-tuning.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../common/hpc-tuning.sh - diff --git a/centos/centos-7.x/centos-7.8-hpc/install.sh b/centos/centos-7.x/centos-7.8-hpc/install.sh deleted file mode 100755 index 28d377c8..00000000 --- a/centos/centos-7.x/centos-7.8-hpc/install.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -ex - -# set properties -source ./set_properties.sh - -# install utils -./install_utils.sh - -# install compilers -./install_gcc.sh - -# install mellanox ofed -./install_mellanoxofed.sh - -# install mpi libraries -./install_mpis.sh - -# install nvidia gpu driver -#./install_nvidiagpudriver.sh - -# install AMD tuned libraries -./install_amd_libs.sh - -# install Intel libraries -./install_intel_libs.sh - -# add udev rule -$COMMON_DIR/../centos/common/add-udev-rules.sh - -# add 
interface rules -$COMMON_DIR/../centos/common/network-config.sh - -# optimizations -./hpc-tuning.sh - -# copy test file -$COMMON_DIR/copy_test_file.sh - -# lustre client -#./install_lustre_client.sh - -# install diagnostic script -$COMMON_DIR/install_hpcdiag.sh - -# install persistent rdma naming -$COMMON_DIR/install_azure_persistent_rdma_naming.sh diff --git a/centos/centos-7.x/centos-7.8-hpc/install_amd_libs.sh b/centos/centos-7.x/centos-7.8-hpc/install_amd_libs.sh deleted file mode 100755 index 11497c19..00000000 --- a/centos/centos-7.x/centos-7.8-hpc/install_amd_libs.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/../centos/centos-7.x/common/install_amd_libs.sh diff --git a/centos/centos-7.x/centos-7.8-hpc/install_gcc.sh b/centos/centos-7.x/centos-7.8-hpc/install_gcc.sh deleted file mode 100755 index dbc4bb01..00000000 --- a/centos/centos-7.x/centos-7.8-hpc/install_gcc.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -MODULE_FILES_DIRECTORY=/usr/share/Modules/modulefiles - -mkdir -p ${MODULE_FILES_DIRECTORY} - -$COMMON_DIR/install_gcc-9.2.sh ${MODULE_FILES_DIRECTORY} diff --git a/centos/centos-7.x/centos-7.8-hpc/install_intel_libs.sh b/centos/centos-7.x/centos-7.8-hpc/install_intel_libs.sh deleted file mode 100755 index 2f69d765..00000000 --- a/centos/centos-7.x/centos-7.8-hpc/install_intel_libs.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/install_intel_libs.sh - diff --git a/centos/centos-7.x/centos-7.8-hpc/install_lustre_client.sh b/centos/centos-7.x/centos-7.8-hpc/install_lustre_client.sh deleted file mode 100755 index b224222b..00000000 --- a/centos/centos-7.x/centos-7.8-hpc/install_lustre_client.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../common/install_lustre_client.sh - diff --git a/centos/centos-7.x/centos-7.8-hpc/install_mellanoxofed.sh b/centos/centos-7.x/centos-7.8-hpc/install_mellanoxofed.sh deleted file mode 100755 index fb308808..00000000 --- a/centos/centos-7.x/centos-7.8-hpc/install_mellanoxofed.sh +++ 
/dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -set -ex - -VERSION="5.4-1.0.3.0" -$COMMON_DIR/write_component_version.sh "MOFED" $VERSION -TARBALL="MLNX_OFED_LINUX-$VERSION-rhel7.8-x86_64.tgz" -MLNX_OFED_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/$TARBALL -MOFED_FOLDER=$(basename ${MLNX_OFED_DOWNLOAD_URL} .tgz) - -$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL "c78d9548b550fe2d8d1f217bab0b7697ed23885311c7b82baaf03f7a724cf7e2" -tar zxvf ${TARBALL} - -KERNEL=( $(rpm -q kernel | sed 's/kernel\-//g') ) -KERNEL=${KERNEL[-1]} -# Uncomment the lines below if you are running this on a VM -#RELEASE=( $(cat /etc/centos-release | awk '{print $4}') ) -#yum -y install http://olcentgbl.trafficmanager.net/centos/${RELEASE}/updates/x86_64/kernel-devel-${KERNEL}.rpm -yum install -y kernel-devel-${KERNEL} -./${MOFED_FOLDER}/mlnxofedinstall --kernel $KERNEL --kernel-sources /usr/src/kernels/${KERNEL} --add-kernel-support --skip-repo --skip-unsupported-devices-check --without-fw-update - -# Issue: Module mlx5_ib belong to a kernel which is not a part of MLNX -# Resolution: set FORCE=1/ force-restart /etc/init.d/openibd -# This causes openibd to ignore the kernel difference but relies on weak-updates -# Restarting openibd -/etc/init.d/openibd force-restart diff --git a/centos/centos-7.x/centos-7.8-hpc/install_mpis.sh b/centos/centos-7.x/centos-7.8-hpc/install_mpis.sh deleted file mode 100755 index f18cb699..00000000 --- a/centos/centos-7.x/centos-7.8-hpc/install_mpis.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -set -ex - -# Load gcc -GCC_VERSION=gcc-9.2.0 -export PATH=/opt/${GCC_VERSION}/bin:$PATH -export LD_LIBRARY_PATH=/opt/${GCC_VERSION}/lib64:$LD_LIBRARY_PATH -set CC=/opt/${GCC_VERSION}/bin/gcc -set GCC=/opt/${GCC_VERSION}/bin/gcc - - -INSTALL_PREFIX=/opt - -# HPC-X v2.9.0 -MLNX_OFED_VERSION="5.4-1.0.3.0" -HPCX_VERSION="v2.9.0" -$COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION 
-TARBALL="hpcx-${HPCX_VERSION}-gcc9.2.0-MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-redhat7.8-x86_64.tbz" -HPCX_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/${TARBALL} -HPCX_FOLDER=$(basename ${HPCX_DOWNLOAD_URL} .tbz) - -$COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL "7f5d7cc1f53b1f53cd999335a17ce4486898e1a80a37dad6fd15a8a4ab927bbb" -tar -xvf ${TARBALL} -mv ${HPCX_FOLDER} ${INSTALL_PREFIX} -HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} - -# Enable Sharpd -${HPCX_PATH}/sharp/sbin/sharp_daemons_setup.sh -s -d sharpd -systemctl enable sharpd -systemctl start sharpd - -# Setup module files for MPIs -mkdir -p /usr/share/Modules/modulefiles/mpi/ - -# HPC-X -cat << EOF >> /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} -#%Module 1.0 -# -# HPCx ${HPCX_VERSION} -# -conflict mpi -module load ${HPCX_PATH}/modulefiles/hpcx -EOF - -# Create symlinks for modulefiles -ln -s /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} /usr/share/Modules/modulefiles/mpi/hpcx - -# Install platform independent MPIs -../common/install_mpis.sh ${GCC_VERSION} ${HPCX_PATH} - diff --git a/centos/centos-7.x/centos-7.8-hpc/install_nvidiagpudriver.sh b/centos/centos-7.x/centos-7.8-hpc/install_nvidiagpudriver.sh deleted file mode 100755 index 27f42b3e..00000000 --- a/centos/centos-7.x/centos-7.8-hpc/install_nvidiagpudriver.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -# Install kernel libs, these should already be installed with Mellanox OFED installation -KERNEL=( $(rpm -q kernel | sed 's/kernel\-//g') ) -KERNEL=${KERNEL[-1]} -# Uncomment the lines below if you are running this on a VM -#RELEASE=( $(cat /etc/centos-release | awk '{print $4}') ) -#yum -y install http://olcentgbl.trafficmanager.net/centos/${RELEASE}/updates/x86_64/kernel-devel-${KERNEL}.rpm -#yum -y install http://olcentgbl.trafficmanager.net/centos/${RELEASE}/updates/x86_64/kernel-headers-${KERNEL}.rpm -yum install -y kernel-devel-${KERNEL} -yum install -y kernel-headers-${KERNEL} - 
-$COMMON_DIR/install_nvidiagpudriver.sh \ No newline at end of file diff --git a/centos/centos-7.x/centos-7.8-hpc/install_utils.sh b/centos/centos-7.x/centos-7.8-hpc/install_utils.sh deleted file mode 100755 index 268f487a..00000000 --- a/centos/centos-7.x/centos-7.8-hpc/install_utils.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../common/install_utils.sh - diff --git a/centos/centos-7.x/centos-7.8-hpc/set_properties.sh b/centos/centos-7.x/centos-7.8-hpc/set_properties.sh deleted file mode 100755 index 64cc71b8..00000000 --- a/centos/centos-7.x/centos-7.8-hpc/set_properties.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -export TOP_DIR=../../.. -export COMMON_DIR=../../../common -export TEST_DIR=../../../tests diff --git a/centos/centos-7.x/centos-7.9-hpc/README.md b/centos/centos-7.x/centos-7.9-hpc/README.md deleted file mode 100644 index 14db97a3..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# CentOS 7.9 HPC Image - -The CentOS 7.9 HPC Image includes optimizations and recommended configurations to deliver optimal performance, -consistency, and reliability. This image consists of the following HPC tools and libraries: - -- Mellanox OFED -- Pre-configured IPoIB (IP-over-InfiniBand) -- Popular InfiniBand based MPI Libraries - - HPC-X - - IntelMPI - - MVAPICH2 - - OpenMPI -- Communication Runtimes - - Libfabric - - OpenUCX -- Optimized librares - - AMD Blis - - AMD FFTW - - AMD Flame - - Intel MKL -- GPU Drivers - - Nvidia GPU Driver -- NCCL - - NCCL RDMA Sharp Plugin - - NCCL Tests -- NV Peer Memory (GPU Direct RDMA) -- GRD Copy -- Data Center GPU Manager -- Azure HPC Diagnostics Tool -- Moby -- NVIDIA-Docker - -Software packages are configured as environment modules. 
Users can select preferred MPI or software packages as follows: - -`module load ` diff --git a/centos/centos-7.x/centos-7.9-hpc/disable_cloudinit.sh b/centos/centos-7.x/centos-7.9-hpc/disable_cloudinit.sh deleted file mode 100755 index 33b5f4d8..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/disable_cloudinit.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -set -ex - -# Disable Cloud-Init -cat << EOF >> /etc/cloud/cloud.cfg.d/99-custom-networking.cfg -network: {config: disabled} -EOF - -# Remove Hardware Mac Address and DHCP Name -cp /etc/sysconfig/network-scripts/ifcfg-eth0 tempFile -grep -v -E "HWADDR=|DHCP_HOSTNAME=" /etc/sysconfig/network-scripts/ifcfg-eth0 > tempFile -mv tempFile /etc/sysconfig/network-scripts/ifcfg-eth0 diff --git a/centos/centos-7.x/centos-7.9-hpc/hpc-tuning.sh b/centos/centos-7.x/centos-7.9-hpc/hpc-tuning.sh deleted file mode 100755 index a07165ca..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/hpc-tuning.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../common/hpc-tuning.sh - diff --git a/centos/centos-7.x/centos-7.9-hpc/install.sh b/centos/centos-7.x/centos-7.9-hpc/install.sh deleted file mode 100755 index d7c2a6b4..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/install.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -set -ex - -# set properties -source ./set_properties.sh - -# install utils -./install_utils.sh - -# install compilers -./install_gcc.sh - -# install mellanox ofed -./install_mellanoxofed.sh - -# install mpi libraries -./install_mpis.sh - -# cleanup downloaded tarballs -rm -rf *.tgz *.bz2 *.tbz *.tar.gz -rm -rf -- */ - -# install nvidia gpu driver -./install_nvidiagpudriver.sh - -# install AMD tuned libraries -./install_amd_libs.sh - -# install Intel libraries -./install_intel_libs.sh - -# Install NCCL -./install_nccl.sh - -# Install NVIDIA docker container -$COMMON_DIR/../centos/centos-7.x/common/install_docker.sh - -# cleanup downloaded tarballs -rm -rf *.tar.gz *_offline.sh *.rpm *.run - -# Install DCGM 
-./install_dcgm.sh - -# optimizations -./hpc-tuning.sh - -# Network Optimization -$COMMON_DIR/network-tuning.sh - -# install persistent rdma naming -$COMMON_DIR/install_azure_persistent_rdma_naming.sh - -# add udev rule -$COMMON_DIR/../centos/common/add-udev-rules.sh - -# add interface rules -$COMMON_DIR/../centos/common/network-config.sh - -# install diagnostic script -$COMMON_DIR/install_hpcdiag.sh - -# copy test file -$COMMON_DIR/copy_test_file.sh - -# disable cloud-init -./disable_cloudinit.sh - -# clear history -# Uncomment the line below if you are running this on a VM -# $COMMON_DIR/clear_history.sh diff --git a/centos/centos-7.x/centos-7.9-hpc/install_amd_libs.sh b/centos/centos-7.x/centos-7.9-hpc/install_amd_libs.sh deleted file mode 100755 index 11497c19..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/install_amd_libs.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/../centos/centos-7.x/common/install_amd_libs.sh diff --git a/centos/centos-7.x/centos-7.9-hpc/install_dcgm.sh b/centos/centos-7.x/centos-7.9-hpc/install_dcgm.sh deleted file mode 100755 index cc32477f..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/install_dcgm.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/../centos/centos-7.x/common/install_dcgm.sh diff --git a/centos/centos-7.x/centos-7.9-hpc/install_gcc.sh b/centos/centos-7.x/centos-7.9-hpc/install_gcc.sh deleted file mode 100755 index dbc4bb01..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/install_gcc.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -MODULE_FILES_DIRECTORY=/usr/share/Modules/modulefiles - -mkdir -p ${MODULE_FILES_DIRECTORY} - -$COMMON_DIR/install_gcc-9.2.sh ${MODULE_FILES_DIRECTORY} diff --git a/centos/centos-7.x/centos-7.9-hpc/install_intel_libs.sh b/centos/centos-7.x/centos-7.9-hpc/install_intel_libs.sh deleted file mode 100755 index 2f69d765..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/install_intel_libs.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/install_intel_libs.sh 
- diff --git a/centos/centos-7.x/centos-7.9-hpc/install_mellanoxofed.sh b/centos/centos-7.x/centos-7.9-hpc/install_mellanoxofed.sh deleted file mode 100755 index 4b027cea..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/install_mellanoxofed.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -set -ex - -VERSION="5.4-3.0.0.0" -$COMMON_DIR/write_component_version.sh "MOFED" $VERSION -TARBALL="MLNX_OFED_LINUX-$VERSION-rhel7.9-x86_64.tgz" -MLNX_OFED_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/$TARBALL -MOFED_FOLDER=$(basename ${MLNX_OFED_DOWNLOAD_URL} .tgz) - -$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL "b11a653769bb7e05ab67181d6dea1b271d0be759e49ba5e6a64ffcca4a2ab80f" -tar zxvf ${TARBALL} - -KERNEL=( $(rpm -q kernel | sed 's/kernel\-//g') ) -KERNEL=${KERNEL[-1]} -# Uncomment the lines below if you are running this on a VM -#RELEASE=( $(cat /etc/centos-release | awk '{print $4}') ) -#yum -y install http://olcentgbl.trafficmanager.net/centos/${RELEASE}/updates/x86_64/kernel-devel-${KERNEL}.rpm -yum install -y kernel-devel-${KERNEL} -./${MOFED_FOLDER}/mlnxofedinstall --kernel $KERNEL --kernel-sources /usr/src/kernels/${KERNEL} --add-kernel-support --skip-repo --skip-unsupported-devices-check --without-fw-update - -# Issue: Module mlx5_ib belong to a kernel which is not a part of MLNX -# Resolution: set FORCE=1/ force-restart /etc/init.d/openibd -# This causes openibd to ignore the kernel difference but relies on weak-updates -# Restarting openibd -/etc/init.d/openibd force-restart diff --git a/centos/centos-7.x/centos-7.9-hpc/install_mpis.sh b/centos/centos-7.x/centos-7.9-hpc/install_mpis.sh deleted file mode 100755 index 545ace6f..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/install_mpis.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -set -ex - -# Load gcc -GCC_VERSION=gcc-9.2.0 -export PATH=/opt/${GCC_VERSION}/bin:$PATH -export LD_LIBRARY_PATH=/opt/${GCC_VERSION}/lib64:$LD_LIBRARY_PATH -set CC=/opt/${GCC_VERSION}/bin/gcc -set 
GCC=/opt/${GCC_VERSION}/bin/gcc - - -INSTALL_PREFIX=/opt - -# HPC-X v2.9.0 -MLNX_OFED_VERSION="5.4-1.0.3.0" -HPCX_VERSION="v2.9.0" -$COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION -TARBALL="hpcx-${HPCX_VERSION}-gcc9.2.0-MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-redhat7.9-x86_64.tbz" -HPCX_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/${TARBALL} -HPCX_FOLDER=$(basename ${HPCX_DOWNLOAD_URL} .tbz) - -$COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL "ed1288376f3296c7dd95edb64984d94f09d1c7e84f91af773f376b24f1b79c9b" -tar -xvf ${TARBALL} -mv ${HPCX_FOLDER} ${INSTALL_PREFIX} -HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} - -# Enable Sharpd -${HPCX_PATH}/sharp/sbin/sharp_daemons_setup.sh -s -d sharpd -systemctl enable sharpd -systemctl start sharpd - -# Setup module files for MPIs -mkdir -p /usr/share/Modules/modulefiles/mpi/ - -# HPC-X -cat << EOF >> /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} -#%Module 1.0 -# -# HPCx ${HPCX_VERSION} -# -conflict mpi -module load ${HPCX_PATH}/modulefiles/hpcx -EOF - -# Create symlinks for modulefiles -ln -s /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} /usr/share/Modules/modulefiles/mpi/hpcx - -# Install platform independent MPIs -../common/install_mpis.sh ${GCC_VERSION} ${HPCX_PATH} - diff --git a/centos/centos-7.x/centos-7.9-hpc/install_nccl.sh b/centos/centos-7.x/centos-7.9-hpc/install_nccl.sh deleted file mode 100755 index 75d93ebe..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/install_nccl.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/../centos/common/install_nccl.sh diff --git a/centos/centos-7.x/centos-7.9-hpc/install_nvidiagpudriver.sh b/centos/centos-7.x/centos-7.9-hpc/install_nvidiagpudriver.sh deleted file mode 100755 index b264fa72..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/install_nvidiagpudriver.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/../centos/centos-7.x/common/install_nvidiagpudriver.sh diff --git 
a/centos/centos-7.x/centos-7.9-hpc/install_utils.sh b/centos/centos-7.x/centos-7.9-hpc/install_utils.sh deleted file mode 100755 index 9dd81d42..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/install_utils.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -ex - -# Setup microsoft packages repository for moby -# Download the repository configuration package -curl https://packages.microsoft.com/config/centos/7/prod.repo > ./microsoft-prod.repo -# Copy the generated list to the sources.list.d directory -cp ./microsoft-prod.repo /etc/yum.repos.d/ - -yum repolist - -../common/install_utils.sh - diff --git a/centos/centos-7.x/centos-7.9-hpc/set_properties.sh b/centos/centos-7.x/centos-7.9-hpc/set_properties.sh deleted file mode 100755 index 64cc71b8..00000000 --- a/centos/centos-7.x/centos-7.9-hpc/set_properties.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -export TOP_DIR=../../.. -export COMMON_DIR=../../../common -export TEST_DIR=../../../tests diff --git a/centos/centos-7.x/common/hpc-tuning.sh b/centos/centos-7.x/common/hpc-tuning.sh deleted file mode 100755 index 2446d826..00000000 --- a/centos/centos-7.x/common/hpc-tuning.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -# Disable some unneeded services by default (administrators can re-enable if desired) -systemctl disable wpa_supplicant -systemctl disable abrtd - -../../common/hpc-tuning.sh diff --git a/centos/centos-7.x/common/install_amd_libs.sh b/centos/centos-7.x/common/install_amd_libs.sh deleted file mode 100755 index cc4abf49..00000000 --- a/centos/centos-7.x/common/install_amd_libs.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -set -ex - -INSTALL_PREFIX=/opt/amd -mkdir -p ${INSTALL_PREFIX} - -AOCL_VERSION="2.2-4" -$COMMON_DIR/write_component_version.sh "AOCL" ${AOCL_VERSION} -TARBALL="aocl-linux-aocc-${AOCL_VERSION}_centos7.tar.gz" -AOCL_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/${TARBALL} -$COMMON_DIR/download_and_verify.sh $AOCL_DOWNLOAD_URL 
"4bdc5caec0233066ded3e2bfedb1e03bdeec82725361764d8860075f64ff4031" -tar -xvf ${TARBALL} -cd aocl-linux-aocc-${AOCL_VERSION} - -./install.sh -t amd -l blis fftw libflame -cp -r amd/2.2/* ${INSTALL_PREFIX} -cd .. && rm -rf aocl-linux-aocc-${AOCL_VERSION} - -# Setup module files for AMD Libraries -mkdir -p /usr/share/Modules/modulefiles/amd/ - -# fftw -cat << EOF >> /usr/share/Modules/modulefiles/amd/aocl-${AOCL_VERSION} -#%Module 1.0 -# -# AOCL -# -prepend-path LD_LIBRARY_PATH ${INSTALL_PREFIX}/lib -setenv AMD_FFTW_INCLUDE ${INSTALL_PREFIX}/include -EOF - -# Create symlinks for modulefiles -ln -s /usr/share/Modules/modulefiles/amd/aocl-${AOCL_VERSION} /usr/share/Modules/modulefiles/amd/aocl diff --git a/centos/centos-7.x/common/install_dcgm.sh b/centos/centos-7.x/common/install_dcgm.sh deleted file mode 100755 index f7e0b4d3..00000000 --- a/centos/centos-7.x/common/install_dcgm.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -set -ex - -# Install DCGM -DCGM_VERSION=2.3.1 -$COMMON_DIR/write_component_version.sh "DCGM" ${DCGM_VERSION} -DCGM_URL=https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/datacenter-gpu-manager-${DCGM_VERSION}-1-x86_64.rpm -$COMMON_DIR/download_and_verify.sh $DCGM_URL "586bf03a7b0c9827c80dc0a82c6e8fe780ff1d76d82b103866906e4cdd191710" -sudo rpm -i datacenter-gpu-manager-${DCGM_VERSION}-1-x86_64.rpm -sudo rm -f datacenter-gpu-manager-${DCGM_VERSION}-1-x86_64.rpm - -# Create service for dcgm to launch on bootup -sudo bash -c "cat > /etc/systemd/system/dcgm.service" <<'EOF' -[Unit] -Description=DCGM service - -[Service] -User=root -PrivateTmp=false -ExecStart=/usr/bin/nv-hostengine -n -Restart=on-abort - -[Install] -WantedBy=multi-user.target -EOF - -sudo systemctl enable dcgm -sudo systemctl start dcgm diff --git a/centos/centos-7.x/common/install_docker.sh b/centos/centos-7.x/common/install_docker.sh deleted file mode 100755 index a327e14e..00000000 --- a/centos/centos-7.x/common/install_docker.sh +++ /dev/null @@ -1,52 +0,0 @@ 
-#!/bin/bash -set -ex - -# Install Moby Engine + CLI -yum install -y moby-engine -yum install -y moby-cli - -# Install NVIDIA Docker -# Reference: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html -# Setting up NVIDIA Container Toolkit -distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \ - && curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | tee /etc/yum.repos.d/nvidia-docker.repo -# MIG Capability on A100 -curl -s -L https://nvidia.github.io/nvidia-container-runtime/experimental/$distribution/nvidia-container-runtime.list | tee /etc/yum.repos.d/nvidia-container-runtime.list - -yum clean expire-cache -# Install nvidia-docker package -# Install NVIDIA container toolkit and mark NVIDIA packages on hold -yum install -y nvidia-container-toolkit -# Mark the installed packages on hold to disable updates -echo "exclude=nvidia-container-toolkit" | tee -a /etc/yum.conf -echo "exclude=libnvidia-container-tools" | tee -a /etc/yum.conf -echo "exclude=libnvidia-container1" | tee -a /etc/yum.conf - -# Install NVIDIA container runtime and mark NVIDIA packages on hold -yum install -y nvidia-container-runtime -echo "exclude=nvidia-container-runtime" | tee -a /etc/yum.conf - -wget https://raw.githubusercontent.com/NVIDIA/nvidia-docker/master/nvidia-docker -cp nvidia-docker /bin/ -chmod +x /bin/nvidia-docker -wget https://raw.githubusercontent.com/NVIDIA/nvidia-docker/master/daemon.json -cp daemon.json /etc/docker/ - -# Working setup can be tested by running a base CUDA container -# nvidia-docker run -e NVIDIA_VISIBLE_DEVICES=all nvidia/cuda:11.0-base nvidia-smi - -# enable and restart the docker daemon to complete the installation -systemctl enable docker -systemctl restart docker - -# Write the docker version to components file -docker_version=$(nvidia-docker --version | awk -F' ' '{print $3}') -$COMMON_DIR/write_component_version.sh "NVIDIA-DOCKER" ${docker_version::-1} - -# Clean repos -rm -rf 
/etc/yum.repos.d/nvidia-* -rm -rf /etc/yum.repos.d/microsoft-prod.repo - -rm -rf /var/cache/yum/x86_64/7/packages-microsoft-com-prod/ -rm -rf /var/cache/yum/x86_64/7/nvidia-* -rm -rf /var/cache/yum/x86_64/7/libnvidia-container/ diff --git a/centos/centos-7.x/common/install_lustre_client.sh b/centos/centos-7.x/common/install_lustre_client.sh deleted file mode 100755 index 9dbb82d8..00000000 --- a/centos/centos-7.x/common/install_lustre_client.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -set -ex - -# Install packages required to facilitate DKMS-based installations -yum install -y asciidoc audit-libs-devel automake bc \ - bison device-mapper-devel elfutils-devel \ - elfutils-libelf-devel expect flex gcc gcc-c++ git \ - glib2 glib2-devel hmaccalc keyutils-libs-devel krb5-devel ksh \ - libattr-devel libblkid-devel libselinux-devel libtool \ - libuuid-devel libyaml-devel lsscsi make ncurses-devel \ - net-snmp-devel net-tools newt-devel \ - parted patchutils pciutils-devel perl-ExtUtils-Embed \ - pesign redhat-rpm-config rpm-build systemd-devel \ - tcl-devel tk-devel wget xmlto yum-utils zlib-devel - -# Install the kernel packages -yum install -y kernel \ - kernel-devel \ - kernel-headers \ - kernel-abi-whitelists \ - kernel-tools \ - kernel-tools-libs \ - kernel-tools-libs-devel - -# Install the EPEL repository definition. 
EPEL provides the DKMS software -yum install -y epel-release - -cat << EOF >> /etc/yum.repos.d/lustre-client.repo -[lustre-client] -name=lustre-client -baseurl=https://downloads.whamcloud.com/public/lustre/latest-release/el7/client -# exclude=*debuginfo* -gpgcheck=0 -EOF - -# Install the Lustre client user-space tools and DKMS kernel module package -yum --nogpgcheck --enablerepo=lustre-client install -y lustre-client-dkms lustre-client diff --git a/centos/centos-7.x/common/install_mpis.sh b/centos/centos-7.x/common/install_mpis.sh deleted file mode 100755 index d514eb92..00000000 --- a/centos/centos-7.x/common/install_mpis.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -set -ex - -GCC_VERSION=$1 -HPCX_PATH=$2 - -HCOLL_PATH=${HPCX_PATH}/hcoll -UCX_PATH=${HPCX_PATH}/ucx -INSTALL_PREFIX=/opt - -# Load gcc -export PATH=/opt/${GCC_VERSION}/bin:$PATH -export LD_LIBRARY_PATH=/opt/${GCC_VERSION}/lib64:$LD_LIBRARY_PATH -set CC=/opt/${GCC_VERSION}/bin/gcc -set GCC=/opt/${GCC_VERSION}/bin/gcc - -# Intel MPI 2018 (update 4) -IMPI_VERSION="2018.4.274" -$COMMON_DIR/write_component_version.sh "IMPI_2018" ${IMPI_VERSION} -IMPI_2018_DOWNLOAD_URL=http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/13651/l_mpi_${IMPI_VERSION}.tgz -$COMMON_DIR/download_and_verify.sh $IMPI_2018_DOWNLOAD_URL "a1114b3eb4149c2f108964b83cad02150d619e50032059d119ac4ffc9d5dd8e0" -tar -xvf l_mpi_${IMPI_VERSION}.tgz -cd l_mpi_${IMPI_VERSION} -sed -i -e 's/ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' silent.cfg -./install.sh --silent ./silent.cfg -cd .. 
- -#IntelMPI-v2018 -cat << EOF >> /usr/share/Modules/modulefiles/mpi/impi_${IMPI_VERSION} -#%Module 1.0 -# -# Intel MPI ${IMPI_VERSION} -# -conflict mpi -module load /opt/intel/impi/${IMPI_VERSION}/intel64/modulefiles/mpi -setenv MPI_BIN /opt/intel/impi/${IMPI_VERSION}/intel64/bin -setenv MPI_INCLUDE /opt/intel/impi/${IMPI_VERSION}/intel64/include -setenv MPI_LIB /opt/intel/impi/${IMPI_VERSION}/intel64/lib -setenv MPI_MAN /opt/intel/impi/${IMPI_VERSION}/man -setenv MPI_HOME /opt/intel/impi/${IMPI_VERSION}/intel64 -EOF - -# Create symlinks for modulefiles -ln -s /usr/share/Modules/modulefiles/mpi/impi_${IMPI_VERSION} /usr/share/Modules/modulefiles/mpi/impi - -../../common/install_mpis.sh ${GCC_VERSION} ${HPCX_PATH} diff --git a/centos/centos-7.x/common/install_nvidiagpudriver.sh b/centos/centos-7.x/common/install_nvidiagpudriver.sh deleted file mode 100755 index f76e4b45..00000000 --- a/centos/centos-7.x/common/install_nvidiagpudriver.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -set -ex - -$COMMON_DIR/install_nvidiagpudriver.sh - -# Install NV Peer Memory (GPU Direct RDMA) -NV_PEER_MEMORY_VERSION="1.2-0" -$COMMON_DIR/write_component_version.sh "NV_PEER_MEMORY" ${NV_PEER_MEMORY_VERSION} -git clone https://github.com/gpudirect/nv_peer_memory.git - -pushd nv_peer_memory -yum install -y rpm-build -./build_module.sh -rpmbuild --rebuild /tmp/nvidia_peer_memory-${NV_PEER_MEMORY_VERSION}.src.rpm -rpm -ivh ~/rpmbuild/RPMS/x86_64/nvidia_peer_memory-${NV_PEER_MEMORY_VERSION}.x86_64.rpm -echo "exclude=nvidia_peer_memory" | sudo tee -a /etc/yum.conf -sudo modprobe nv_peer_mem -lsmod | grep nv -popd - -sudo bash -c "cat > /etc/modules-load.d/nv_peer_mem.conf" <<'EOF' -nv_peer_mem -EOF - -sudo systemctl enable nv_peer_mem.service - -# Install GDRCopy -GDRCOPY_VERSION="2.3" -$COMMON_DIR/write_component_version.sh "GDRCOPY" ${GDRCOPY_VERSION} -TARBALL="v${GDRCOPY_VERSION}.tar.gz" -GDRCOPY_DOWNLOAD_URL=https://github.com/NVIDIA/gdrcopy/archive/refs/tags/${TARBALL} -wget 
$GDRCOPY_DOWNLOAD_URL -tar -xvf $TARBALL - -pushd gdrcopy-${GDRCOPY_VERSION}/packages/ -CUDA=/usr/local/cuda ./build-rpm-packages.sh -rpm -Uvh gdrcopy-kmod-${GDRCOPY_VERSION}-1dkms.noarch.el7.rpm -echo "exclude=gdrcopy-kmod.noarch" | sudo tee -a /etc/yum.conf -rpm -Uvh gdrcopy-${GDRCOPY_VERSION}-1.x86_64.el7.rpm -echo "exclude=gdrcopy" | sudo tee -a /etc/yum.conf -rpm -Uvh gdrcopy-devel-${GDRCOPY_VERSION}-1.noarch.el7.rpm -echo "exclude=gdrcopy-devel.noarch" | sudo tee -a /etc/yum.conf -popd - -# Install Fabric Manager -NVIDIA_FABRIC_MANAGER_VERSION="470.82.01-1" -$COMMON_DIR/write_component_version.sh "NVIDIA_FABRIC_MANAGER" ${NVIDIA_FABRIC_MANAGER_VERSION} -NVIDIA_FABRIC_MNGR_URL=http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/nvidia-fabric-manager-${NVIDIA_FABRIC_MANAGER_VERSION}.x86_64.rpm -$COMMON_DIR/download_and_verify.sh ${NVIDIA_FABRIC_MNGR_URL} "ade1051a189fe84a326b8021d1446eb03d48e0a998e8cada85081b27a89923f1" -yum install -y ./nvidia-fabric-manager-${NVIDIA_FABRIC_MANAGER_VERSION}.x86_64.rpm -echo "exclude=nvidia-fabric-manager" | sudo tee -a /etc/yum.conf -systemctl enable nvidia-fabricmanager -systemctl start nvidia-fabricmanager diff --git a/centos/centos-7.x/common/install_utils.sh b/centos/centos-7.x/common/install_utils.sh deleted file mode 100755 index 21c806dc..00000000 --- a/centos/centos-7.x/common/install_utils.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -set -ex - -# Install pre-reqs and development tools -yum groupinstall -y "Development Tools" -yum install -y numactl \ - numactl-devel \ - libxml2-devel \ - byacc \ - environment-modules \ - python-devel \ - python-setuptools \ - gtk2 \ - atk \ - cairo \ - tcl \ - tk \ - m4 \ - texinfo \ - glibc-devel \ - glibc-static \ - libudev-devel \ - binutils \ - binutils-devel \ - selinux-policy-devel \ - kernel-headers \ - nfs-utils \ - fuse-libs \ - libpciaccess \ - cmake \ - libnl3-devel \ - libsecret \ - rpm-build \ - make \ - check \ - check-devel \ - subunit \ - 
subunit-devel - -## Install dkms from the EPEL repository -wget -r --no-parent -A "dkms-*.el7.noarch.rpm" https://dl.fedoraproject.org/pub/epel/7/x86_64/Packages/d/ -yum localinstall ./dl.fedoraproject.org/pub/epel/7/x86_64/Packages/d/dkms-*.el7.noarch.rpm -y - -## Install jq Utility -# Download dependency libonig.so for jq -wget -r --no-parent -A "oniguruma-*.el7.x86_64.rpm" https://dl.fedoraproject.org/pub/epel/7/x86_64/Packages/o/ -yum localinstall ./dl.fedoraproject.org/pub/epel/7/x86_64/Packages/o/oniguruma-*.el7.x86_64.rpm -y -# Download jq utility -wget -r --no-parent -A "jq-*.el7.x86_64.rpm" https://dl.fedoraproject.org/pub/epel/7/x86_64/Packages/j/ -yum localinstall ./dl.fedoraproject.org/pub/epel/7/x86_64/Packages/j/jq-*.el7.x86_64.rpm -y - -# Remove rpm files -rm -rf ./dl.fedoraproject.org/ - -# Install azcopy tool -# To copy blobs or files to or from a storage account. -wget https://azhpcstor.blob.core.windows.net/azhpc-images-store/azcopy_linux_se_amd64_10.12.2.tar.gz -tar -xvf azcopy_linux_se_amd64_10.12.2.tar.gz - -# copy the azcopy to the bin path -pushd azcopy_linux_se_amd64_10.12.2 -cp azcopy /usr/bin/ -popd - -# Allow execute permissions -chmod +x /usr/bin/azcopy - -# remove tarball from azcopy -rm -rf *.tar.gz diff --git a/centos/centos-8.x/centos-8.1-hpc/README.md b/centos/centos-8.x/centos-8.1-hpc/README.md deleted file mode 100644 index c726ef55..00000000 --- a/centos/centos-8.x/centos-8.1-hpc/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# CentOS 8.1 HPC Image - -The CentOS 8.1 HPC Image includes optimizations and recommended configurations to deliver optimal performance, -consistency, and reliability. 
This image consists of the following HPC tools and libraries: - -- Mellanox OFED -- Pre-configured IPoIB (IP-over-InfiniBand) -- Popular InfiniBand based MPI Libraries - - HPC-X - - IntelMPI - - MVAPICH2 - - OpenMPI -- Communication Runtimes - - Libfabric - - OpenUCX -- Optimized librares - - AMD Blis - - AMD FFTW - - AMD Flame - - Intel MKL -- Azure HPC Diagnostics Tool - -Software packages are configured as environment modules. Users can select preferred MPI or software packages as follows: - -`module load ` diff --git a/centos/centos-8.x/centos-8.1-hpc/hpc-tuning.sh b/centos/centos-8.x/centos-8.1-hpc/hpc-tuning.sh deleted file mode 100755 index 417db3e9..00000000 --- a/centos/centos-8.x/centos-8.1-hpc/hpc-tuning.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../../common/hpc-tuning.sh - diff --git a/centos/centos-8.x/centos-8.1-hpc/install.sh b/centos/centos-8.x/centos-8.1-hpc/install.sh deleted file mode 100755 index 7eb6d90c..00000000 --- a/centos/centos-8.x/centos-8.1-hpc/install.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -set -ex - -# set properties -source ./set_properties.sh - -# install utils -./install_utils.sh - -# install compilers -./install_gcc.sh - -# install mellanox ofed -./install_mellanoxofed.sh - -# install mpi libraries -./install_mpis.sh - -# install nvidia gpu driver -#./install_nvidiagpudriver.sh - -# install AMD tuned libraries -./install_amd_libs.sh - -# install Intel libraries -./install_intel_libs.sh - -# add udev rule -$COMMON_DIR/../centos/common/add-udev-rules.sh - -# add interface rules -$COMMON_DIR/../centos/common/network-config.sh - -# optimizations -./hpc-tuning.sh - -# copy test file -$COMMON_DIR/copy_test_file.sh - -# install diagnostic script -"$COMMON_DIR/install_hpcdiag.sh" - -# install persistent rdma naming -$COMMON_DIR/install_azure_persistent_rdma_naming.sh diff --git a/centos/centos-8.x/centos-8.1-hpc/install_amd_libs.sh b/centos/centos-8.x/centos-8.1-hpc/install_amd_libs.sh deleted file mode 100755 index 
657d2b5b..00000000 --- a/centos/centos-8.x/centos-8.1-hpc/install_amd_libs.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/../centos/centos-8.x/common/install_amd_libs.sh diff --git a/centos/centos-8.x/centos-8.1-hpc/install_gcc.sh b/centos/centos-8.x/centos-8.1-hpc/install_gcc.sh deleted file mode 100755 index dbc4bb01..00000000 --- a/centos/centos-8.x/centos-8.1-hpc/install_gcc.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -MODULE_FILES_DIRECTORY=/usr/share/Modules/modulefiles - -mkdir -p ${MODULE_FILES_DIRECTORY} - -$COMMON_DIR/install_gcc-9.2.sh ${MODULE_FILES_DIRECTORY} diff --git a/centos/centos-8.x/centos-8.1-hpc/install_intel_libs.sh b/centos/centos-8.x/centos-8.1-hpc/install_intel_libs.sh deleted file mode 100755 index 2f69d765..00000000 --- a/centos/centos-8.x/centos-8.1-hpc/install_intel_libs.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/install_intel_libs.sh - diff --git a/centos/centos-8.x/centos-8.1-hpc/install_mellanoxofed.sh b/centos/centos-8.x/centos-8.1-hpc/install_mellanoxofed.sh deleted file mode 100755 index 60f62f6d..00000000 --- a/centos/centos-8.x/centos-8.1-hpc/install_mellanoxofed.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -set -ex - -VERSION="5.4-1.0.3.0" -$COMMON_DIR/write_component_version.sh "MOFED" $VERSION -TARBALL="MLNX_OFED_LINUX-$VERSION-rhel8.1-x86_64.tgz" -MLNX_OFED_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/$TARBALL -MOFED_FOLDER=$(basename ${MLNX_OFED_DOWNLOAD_URL} .tgz) - -$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL "171b304d76f2e88ec62138b01691a5580c5fa3674209f262592669c637891c13" -tar zxvf ${TARBALL} - -# Uncomment the lines below if you are running this on a VM -#KERNEL=( $(rpm -q kernel | sed 's/kernel\-//g') ) -#KERNEL=${KERNEL[-1]} -#RELEASE=( $(cat /etc/centos-release | awk '{print $4}') ) -#yum install -y https://packages.microsoft.com/yumrepos/1.0-eval-base-x8664-rpms/Packages/k/kernel-devel-${KERNEL}.rpm -#yum install -y 
https://packages.microsoft.com/yumrepos/1.0-eval-base-x8664-rpms/Packages/k/kernel-modules-extra-${KERNEL}.rpm - -yum install -y kernel-devel -yum install -y kernel-modules-extra -KERNEL=( $(rpm -q kernel-devel | sed 's/kernel-devel\-//g') ) -KERNEL=${KERNEL[-1]} - -./${MOFED_FOLDER}/mlnxofedinstall --kernel $KERNEL --kernel-sources /usr/src/kernels/${KERNEL} --add-kernel-support --skip-repo --skip-unsupported-devices-check --without-fw-update - -# Issue: Module mlx5_ib belong to a kernel which is not a part of MLNX -# Resolution: set FORCE=1/ force-restart /etc/init.d/openibd -# This causes openibd to ignore the kernel difference but relies on weak-updates -# Restarting openibd -/etc/init.d/openibd force-restart diff --git a/centos/centos-8.x/centos-8.1-hpc/install_mpis.sh b/centos/centos-8.x/centos-8.1-hpc/install_mpis.sh deleted file mode 100755 index 4f60fa9e..00000000 --- a/centos/centos-8.x/centos-8.1-hpc/install_mpis.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -set -ex - -# Load gcc -GCC_VERSION=gcc-9.2.0 -export PATH=/opt/${GCC_VERSION}/bin:$PATH -export LD_LIBRARY_PATH=/opt/${GCC_VERSION}/lib64:$LD_LIBRARY_PATH -set CC=/opt/${GCC_VERSION}/bin/gcc -set GCC=/opt/${GCC_VERSION}/bin/gcc - - -INSTALL_PREFIX=/opt - -# HPC-X v2.9.0 -MLNX_OFED_VERSION="5.4-1.0.3.0" -HPCX_VERSION="v2.9.0" -$COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION -TARBALL="hpcx-${HPCX_VERSION}-gcc9.2.0-MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-redhat8.1-x86_64.tbz" -HPCX_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/${TARBALL} -HPCX_FOLDER=$(basename ${HPCX_DOWNLOAD_URL} .tbz) - -$COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL "e47d708f3b89b3fcff169dac15ff2c454bc4bcc07575d8add2a8348f3eabebeb" -tar -xvf ${TARBALL} -mv ${HPCX_FOLDER} ${INSTALL_PREFIX} -HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} - -# Enable Sharpd -${HPCX_PATH}/sharp/sbin/sharp_daemons_setup.sh -s -d sharpd -systemctl enable sharpd -systemctl start sharpd - -# Setup module files for 
MPIs -mkdir -p /usr/share/Modules/modulefiles/mpi/ - -# HPC-X -cat << EOF >> /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} -#%Module 1.0 -# -# HPCx ${HPCX_VERSION} -# -conflict mpi -module load ${HPCX_PATH}/modulefiles/hpcx -EOF - -# Create symlinks for modulefiles -ln -s /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} /usr/share/Modules/modulefiles/mpi/hpcx - -# Install platform independent MPIs -../../common/install_mpis.sh ${GCC_VERSION} ${HPCX_PATH} diff --git a/centos/centos-8.x/centos-8.1-hpc/install_nvidiagpudriver.sh b/centos/centos-8.x/centos-8.1-hpc/install_nvidiagpudriver.sh deleted file mode 100755 index f510ecfc..00000000 --- a/centos/centos-8.x/centos-8.1-hpc/install_nvidiagpudriver.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -# Install kernel libs, these should already be installed with Mellanox OFED installation -KERNEL=( $(rpm -q kernel | sed 's/kernel\-//g') ) -KERNEL=${KERNEL[-1]} -# Uncomment the lines below if you are running this on a VM -#RELEASE=( $(cat /etc/centos-release | awk '{print $4}') ) -#yum install -y http://olcentwus.cloudapp.net/centos/${RELEASE}/BaseOS/x86_64/os/kernel-devel-${KERNEL}.rpm -#yum install -y http://olcentwus.cloudapp.net/centos/${RELEASE}/BaseOS/x86_64/os/kernel-headers-${KERNEL}.rpm -yum install -y kernel-devel-${KERNEL} -yum install -y kernel-headers-${KERNEL} - -# Install DKMS -sudo dnf install -y dkms - -$COMMON_DIR/install_nvidiagpudriver.sh \ No newline at end of file diff --git a/centos/centos-8.x/centos-8.1-hpc/install_utils.sh b/centos/centos-8.x/centos-8.1-hpc/install_utils.sh deleted file mode 100755 index 268f487a..00000000 --- a/centos/centos-8.x/centos-8.1-hpc/install_utils.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../common/install_utils.sh - diff --git a/centos/centos-8.x/centos-8.1-hpc/set_properties.sh b/centos/centos-8.x/centos-8.1-hpc/set_properties.sh deleted file mode 100755 index 64cc71b8..00000000 --- a/centos/centos-8.x/centos-8.1-hpc/set_properties.sh +++ 
/dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -export TOP_DIR=../../.. -export COMMON_DIR=../../../common -export TEST_DIR=../../../tests diff --git a/centos/centos-8.x/centos-8.3-hpc/README.md b/centos/centos-8.x/centos-8.3-hpc/README.md deleted file mode 100644 index 61763b38..00000000 --- a/centos/centos-8.x/centos-8.3-hpc/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# CentOS 8.3 HPC Image - -The CentOS 8.3 HPC Image includes optimizations and recommended configurations to deliver optimal performance, -consistency, and reliability. This image consists of the following HPC tools and libraries: - -- Mellanox OFED -- Pre-configured IPoIB (IP-over-InfiniBand) -- Popular InfiniBand based MPI Libraries - - HPC-X - - IntelMPI - - MVAPICH2 - - OpenMPI -- Communication Runtimes - - Libfabric - - OpenUCX -- Optimized librares - - AMD Blis - - AMD FFTW - - AMD Flame - - Intel MKL -- Azure HPC Diagnostics Tool - -Software packages are configured as environment modules. Users can select preferred MPI or software packages as follows: - -`module load ` diff --git a/centos/centos-8.x/centos-8.3-hpc/hpc-tuning.sh b/centos/centos-8.x/centos-8.3-hpc/hpc-tuning.sh deleted file mode 100755 index 417db3e9..00000000 --- a/centos/centos-8.x/centos-8.3-hpc/hpc-tuning.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../../common/hpc-tuning.sh - diff --git a/centos/centos-8.x/centos-8.3-hpc/install.sh b/centos/centos-8.x/centos-8.3-hpc/install.sh deleted file mode 100755 index 7eb6d90c..00000000 --- a/centos/centos-8.x/centos-8.3-hpc/install.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -set -ex - -# set properties -source ./set_properties.sh - -# install utils -./install_utils.sh - -# install compilers -./install_gcc.sh - -# install mellanox ofed -./install_mellanoxofed.sh - -# install mpi libraries -./install_mpis.sh - -# install nvidia gpu driver -#./install_nvidiagpudriver.sh - -# install AMD tuned libraries -./install_amd_libs.sh - -# install Intel libraries -./install_intel_libs.sh - -# 
add udev rule -$COMMON_DIR/../centos/common/add-udev-rules.sh - -# add interface rules -$COMMON_DIR/../centos/common/network-config.sh - -# optimizations -./hpc-tuning.sh - -# copy test file -$COMMON_DIR/copy_test_file.sh - -# install diagnostic script -"$COMMON_DIR/install_hpcdiag.sh" - -# install persistent rdma naming -$COMMON_DIR/install_azure_persistent_rdma_naming.sh diff --git a/centos/centos-8.x/centos-8.3-hpc/install_amd_libs.sh b/centos/centos-8.x/centos-8.3-hpc/install_amd_libs.sh deleted file mode 100755 index 657d2b5b..00000000 --- a/centos/centos-8.x/centos-8.3-hpc/install_amd_libs.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/../centos/centos-8.x/common/install_amd_libs.sh diff --git a/centos/centos-8.x/centos-8.3-hpc/install_gcc.sh b/centos/centos-8.x/centos-8.3-hpc/install_gcc.sh deleted file mode 100755 index dbc4bb01..00000000 --- a/centos/centos-8.x/centos-8.3-hpc/install_gcc.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -MODULE_FILES_DIRECTORY=/usr/share/Modules/modulefiles - -mkdir -p ${MODULE_FILES_DIRECTORY} - -$COMMON_DIR/install_gcc-9.2.sh ${MODULE_FILES_DIRECTORY} diff --git a/centos/centos-8.x/centos-8.3-hpc/install_intel_libs.sh b/centos/centos-8.x/centos-8.3-hpc/install_intel_libs.sh deleted file mode 100755 index 2f69d765..00000000 --- a/centos/centos-8.x/centos-8.3-hpc/install_intel_libs.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -$COMMON_DIR/install_intel_libs.sh - diff --git a/centos/centos-8.x/centos-8.3-hpc/install_mellanoxofed.sh b/centos/centos-8.x/centos-8.3-hpc/install_mellanoxofed.sh deleted file mode 100755 index e1c7aa49..00000000 --- a/centos/centos-8.x/centos-8.3-hpc/install_mellanoxofed.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -set -ex - -MLNX_OFED_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/MLNX_OFED_LINUX-5.2-1.0.4.0-rhel8.3-x86_64.tgz -TARBALL=$(basename ${MLNX_OFED_DOWNLOAD_URL}) -MOFED_FOLDER=$(basename ${MLNX_OFED_DOWNLOAD_URL} .tgz) - 
-$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL "dd13ffa54524af9509e3de7288a863aa48ed7016bb3d15446523703020c41715" -tar zxvf ${TARBALL} - -KERNEL=( $(rpm -q kernel | sed 's/kernel\-//g') ) -KERNEL=${KERNEL[-1]} -yum install -y kernel-devel-${KERNEL} -./${MOFED_FOLDER}/mlnxofedinstall --kernel $KERNEL --kernel-sources /usr/src/kernels/${KERNEL} --add-kernel-support --skip-repo - diff --git a/centos/centos-8.x/centos-8.3-hpc/install_mpis.sh b/centos/centos-8.x/centos-8.3-hpc/install_mpis.sh deleted file mode 100755 index 170e1c3e..00000000 --- a/centos/centos-8.x/centos-8.3-hpc/install_mpis.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -ex - -# Load gcc -GCC_VERSION=gcc-9.2.0 -export PATH=/opt/${GCC_VERSION}/bin:$PATH -export LD_LIBRARY_PATH=/opt/${GCC_VERSION}/lib64:$LD_LIBRARY_PATH -set CC=/opt/${GCC_VERSION}/bin/gcc -set GCC=/opt/${GCC_VERSION}/bin/gcc - - -INSTALL_PREFIX=/opt - -# HPC-X v2.7.4 -HPCX_VERSION="v2.7.4" -HPCX_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/hpcx-v2.8.0-gcc-MLNX_OFED_LINUX-5.2-1.0.4.0-redhat8.3-x86_64.tbz -TARBALL=$(basename ${HPCX_DOWNLOAD_URL}) -HPCX_FOLDER=$(basename ${HPCX_DOWNLOAD_URL} .tbz) - -$COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL "ee1b0f3fe8d295ae662cd08794a890135b4014b9b3e8efa3f00aafe4678bae9c" -tar -xvf ${TARBALL} -mv ${HPCX_FOLDER} ${INSTALL_PREFIX} -HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} - -# Enable Sharpd -${HPCX_PATH}/sharp/sbin/sharp_daemons_setup.sh -s -d sharpd -systemctl enable sharpd -systemctl start sharpd - -# Setup module files for MPIs -mkdir -p /usr/share/Modules/modulefiles/mpi/ - -# HPC-X -cat << EOF >> /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} -#%Module 1.0 -# -# HPCx ${HPCX_VERSION} -# -conflict mpi -module load ${HPCX_PATH}/modulefiles/hpcx -EOF - -# Create symlinks for modulefiles -ln -s /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} /usr/share/Modules/modulefiles/mpi/hpcx - -# Install platform independent MPIs 
-../../common/install_mpis.sh ${GCC_VERSION} ${HPCX_PATH} diff --git a/centos/centos-8.x/centos-8.3-hpc/install_utils.sh b/centos/centos-8.x/centos-8.3-hpc/install_utils.sh deleted file mode 100755 index 268f487a..00000000 --- a/centos/centos-8.x/centos-8.3-hpc/install_utils.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -../common/install_utils.sh - diff --git a/centos/centos-8.x/centos-8.3-hpc/set_properties.sh b/centos/centos-8.x/centos-8.3-hpc/set_properties.sh deleted file mode 100755 index 64cc71b8..00000000 --- a/centos/centos-8.x/centos-8.3-hpc/set_properties.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -export TOP_DIR=../../.. -export COMMON_DIR=../../../common -export TEST_DIR=../../../tests diff --git a/centos/centos-8.x/common/install_amd_libs.sh b/centos/centos-8.x/common/install_amd_libs.sh deleted file mode 100755 index add62632..00000000 --- a/centos/centos-8.x/common/install_amd_libs.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -set -ex - -INSTALL_PREFIX=/opt/amd -mkdir -p ${INSTALL_PREFIX} - -AOCL_VERSION="2.2.1" -$COMMON_DIR/write_component_version.sh "AOCL" ${AOCL_VERSION} -TARBALL="aocl-linux-aocc-${AOCL_VERSION}_centos8.tar.gz" -AOCL_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/${TARBALL} -$COMMON_DIR/download_and_verify.sh $AOCL_DOWNLOAD_URL "cbe5afbdc241047a9d8814b5557be429aa0d9d2b83408eca8244e1ab9c8e2c87" -tar -xvf ${TARBALL} -cd aocl-linux-aocc-${AOCL_VERSION}_centos8 - -./install.sh -t amd -l blis fftw libflame -cp -r amd/${AOCL_VERSION}_centos8/* ${INSTALL_PREFIX} -cd .. 
&& rm -rf aocl-linux-aocc-${AOCL_VERSION}_centos8 - -# Setup module files for AMD Libraries -mkdir -p /usr/share/Modules/modulefiles/amd/ - -# fftw -cat << EOF >> /usr/share/Modules/modulefiles/amd/aocl-${AOCL_VERSION} -#%Module 1.0 -# -# AOCL -# -prepend-path LD_LIBRARY_PATH ${INSTALL_PREFIX}/lib -setenv AMD_FFTW_INCLUDE ${INSTALL_PREFIX}/include -EOF - -# Create symlinks for modulefiles -ln -s /usr/share/Modules/modulefiles/amd/aocl-${AOCL_VERSION} /usr/share/Modules/modulefiles/amd/aocl diff --git a/centos/centos-8.x/common/install_utils.sh b/centos/centos-8.x/common/install_utils.sh deleted file mode 100755 index e2584f9e..00000000 --- a/centos/centos-8.x/common/install_utils.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -set -ex - -# Install pre-reqs and development tools -yum groupinstall -y "Development Tools" -yum install -y numactl \ - numactl-devel \ - libxml2-devel \ - byacc \ - environment-modules \ - gtk2 \ - atk \ - cairo \ - tcl \ - tk \ - m4 \ - tcsh \ - gcc-gfortran \ - python36-devel \ - elfutils-libelf-devel \ - kernel-rpm-macros \ - glibc-devel \ - libudev-devel \ - binutils \ - binutils-devel \ - selinux-policy-devel \ - kernel-headers \ - nfs-utils \ - fuse-libs \ - libpciaccess \ - cmake \ - libnl3-devel \ - libarchive \ - libsecret - -# Install azcopy tool -# To copy blobs or files to or from a storage account. 
-wget https://azhpcstor.blob.core.windows.net/azhpc-images-store/azcopy_linux_se_amd64_10.12.2.tar.gz -tar -xvf azcopy_linux_se_amd64_10.12.2.tar.gz - -# copy the azcopy to the bin path -pushd azcopy_linux_se_amd64_10.12.2 -cp azcopy /usr/bin/ -popd - -# Allow execute permissions -chmod +x /usr/bin/azcopy - -# remove tarball from azcopy -rm -rf *.tar.gz diff --git a/centos/common/add-udev-rules.sh b/centos/common/add-udev-rules.sh deleted file mode 100755 index 71910c97..00000000 --- a/centos/common/add-udev-rules.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -set -e - -#cat << EOF >> /etc/udev/rules.d/60-ib.rules -# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) -# Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file -# -# Rename modes: -# NAME_FALLBACK - Try to name devices in the following order: -# by-pci -> by-guid -> kernel -# NAME_KERNEL - leave name as kernel provided -# NAME_PCI - based on PCI/slot/function location -# NAME_GUID - based on system image GUID -# -# The stable names are combination of device type technology and rename mode. 
-# Infiniband - ib* -# RoCE - roce* -# iWARP - iw* -# OPA - opa* -# Default (unknown protocol) - rdma* -# -# Example: -# * NAME_PCI -# pci = 0000:00:0c.4 -# Device type = IB -# mlx5_0 -> ibp0s12f4 -# * NAME_GUID -# GUID = 5254:00c0:fe12:3455 -# Device type = RoCE -# mlx5_0 -> rocex525400c0fe123455 -# -#ACTION=="add", SUBSYSTEM=="infiniband", PROGRAM="rdma_rename %k NAME_PCI" -#EOF diff --git a/centos/common/hpc-tuning.sh b/centos/common/hpc-tuning.sh deleted file mode 100755 index 7887d4a4..00000000 --- a/centos/common/hpc-tuning.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -# Disable some unneeded services by default (administrators can re-enable if desired) -systemctl disable firewalld - -# Update memory limits -cat << EOF >> /etc/security/limits.conf -* hard memlock unlimited -* soft memlock unlimited -* hard nofile 65535 -* soft nofile 65535 -* hard stack unlimited -* soft stack unlimited -EOF - -# Enable reclaim mode -echo "vm.zone_reclaim_mode = 1" >> /etc/sysctl.conf -sysctl -p - -# Remove auoms if exists - Prevent CPU utilization by auoms -if yum list installed azsec-monitor >/dev/null 2>&1; then yum remove -y azsec-monitor; fi - -# Update WALinuxAgent - for IPoIB -yum update -y WALinuxAgent - -# Configure WALinuxAgent -sudo sed -i -e 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf -echo "Extensions.GoalStatePeriod=300" | sudo tee -a /etc/waagent.conf -echo "Extensions.InitialGoalStatePeriod=6" | sudo tee -a /etc/waagent.conf -echo "OS.EnableFirewallPeriod=300" | sudo tee -a /etc/waagent.conf -echo "OS.RemovePersistentNetRulesPeriod=300" | sudo tee -a /etc/waagent.conf -echo "OS.RootDeviceScsiTimeoutPeriod=300" | sudo tee -a /etc/waagent.conf -echo "OS.MonitorDhcpClientRestartPeriod=60" | sudo tee -a /etc/waagent.conf -echo "Provisioning.MonitorHostNamePeriod=60" | sudo tee -a /etc/waagent.conf -sudo systemctl restart waagent -$COMMON_DIR/write_component_version.sh "WAAGENT" $(waagent --version | head -n 1 | awk -F' ' '{print $1}' | awk -F- 
'{print $2}') diff --git a/centos/common/install_mpis.sh b/centos/common/install_mpis.sh deleted file mode 100755 index 879009b7..00000000 --- a/centos/common/install_mpis.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash -set -e - -GCC_VERSION=$1 -HPCX_PATH=$2 - -HCOLL_PATH=${HPCX_PATH}/hcoll -UCX_PATH=${HPCX_PATH}/ucx -INSTALL_PREFIX=/opt - -# Load gcc -export PATH=/opt/${GCC_VERSION}/bin:$PATH -export LD_LIBRARY_PATH=/opt/${GCC_VERSION}/lib64:$LD_LIBRARY_PATH -set CC=/opt/${GCC_VERSION}/bin/gcc -set GCC=/opt/${GCC_VERSION}/bin/gcc - -# MVAPICH2 2.3.6 -MV2_VERSION="2.3.6" -$COMMON_DIR/write_component_version.sh "MVAPICH2" ${MV2_VERSION} -MV2_DOWNLOAD_URL=http://mvapich.cse.ohio-state.edu/download/mvapich/mv2/mvapich2-${MV2_VERSION}.tar.gz -$COMMON_DIR/download_and_verify.sh $MV2_DOWNLOAD_URL "b3a62f2a05407191b856485f99da05f5e769d6381cd63e2fcb83ee98fc46a249" -tar -xvf mvapich2-${MV2_VERSION}.tar.gz -cd mvapich2-${MV2_VERSION} -./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MV2_VERSION} --enable-g=none --enable-fast=yes && make -j$(nproc) && make install -cd .. - - -# OpenMPI 4.1.1 -OMPI_VERSION="4.1.1" -$COMMON_DIR/write_component_version.sh "OMPI" ${OMPI_VERSION} -OMPI_DOWNLOAD_URL=https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz -$COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL "d80b9219e80ea1f8bcfe5ad921bd9014285c4948c5965f4156a3831e60776444" -tar -xvf openmpi-${OMPI_VERSION}.tar.gz -cd openmpi-${OMPI_VERSION} -./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install -cd .. 
- -# Intel MPI 2021 (Update 2) -IMPI_2021_VERSION="2021.4.0" -$COMMON_DIR/write_component_version.sh "IMPI_2021" ${IMPI_2021_VERSION} -IMPI_2021_DOWNLOAD_URL=https://registrationcenter-download.intel.com/akdlm/irc_nas/18186/l_mpi_oneapi_p_2021.4.0.441_offline.sh -$COMMON_DIR/download_and_verify.sh $IMPI_2021_DOWNLOAD_URL "cc4b7072c61d0bd02b1c431b22d2ea3b84b967b59d2e587e77a9e7b2c24f2a29" -bash l_mpi_oneapi_p_2021.4.0.441_offline.sh -s -a -s --eula accept -mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/impi - -# Setup module files for MPIs -mkdir -p /usr/share/Modules/modulefiles/mpi/ - -# MVAPICH2 -cat << EOF >> /usr/share/Modules/modulefiles/mpi/mvapich2-${MV2_VERSION} -#%Module 1.0 -# -# MVAPICH2 ${MV2_VERSION} -# -conflict mpi -module load ${GCC_VERSION} -prepend-path PATH /opt/mvapich2-${MV2_VERSION}/bin -prepend-path LD_LIBRARY_PATH /opt/mvapich2-${MV2_VERSION}/lib -prepend-path MANPATH /opt/mvapich2-${MV2_VERSION}/share/man -setenv MPI_BIN /opt/mvapich2-${MV2_VERSION}/bin -setenv MPI_INCLUDE /opt/mvapich2-${MV2_VERSION}/include -setenv MPI_LIB /opt/mvapich2-${MV2_VERSION}/lib -setenv MPI_MAN /opt/mvapich2-${MV2_VERSION}/share/man -setenv MPI_HOME /opt/mvapich2-${MV2_VERSION} -EOF - -# OpenMPI -cat << EOF >> /usr/share/Modules/modulefiles/mpi/openmpi-${OMPI_VERSION} -#%Module 1.0 -# -# OpenMPI ${OMPI_VERSION} -# -conflict mpi -module load ${GCC_VERSION} -prepend-path PATH /opt/openmpi-${OMPI_VERSION}/bin -prepend-path LD_LIBRARY_PATH /opt/openmpi-${OMPI_VERSION}/lib -prepend-path MANPATH /opt/openmpi-${OMPI_VERSION}/share/man -setenv MPI_BIN /opt/openmpi-${OMPI_VERSION}/bin -setenv MPI_INCLUDE /opt/openmpi-${OMPI_VERSION}/include -setenv MPI_LIB /opt/openmpi-${OMPI_VERSION}/lib -setenv MPI_MAN /opt/openmpi-${OMPI_VERSION}/share/man -setenv MPI_HOME /opt/openmpi-${OMPI_VERSION} -EOF - -#IntelMPI-v2021 -cat << EOF >> 
/usr/share/Modules/modulefiles/mpi/impi_${IMPI_2021_VERSION} -#%Module 1.0 -# -# Intel MPI ${IMPI_2021_VERSION} -# -conflict mpi -module load /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/impi -setenv MPI_BIN /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/bin -setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/include -setenv MPI_LIB /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/lib -setenv MPI_MAN /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/man -setenv MPI_HOME /opt/intel/oneapi/mpi/${IMPI_2021_VERSION} -EOF - -# Create symlinks for modulefiles -ln -s /usr/share/Modules/modulefiles/mpi/mvapich2-${MV2_VERSION} /usr/share/Modules/modulefiles/mpi/mvapich2 -ln -s /usr/share/Modules/modulefiles/mpi/openmpi-${OMPI_VERSION} /usr/share/Modules/modulefiles/mpi/openmpi -ln -s /usr/share/Modules/modulefiles/mpi/impi_${IMPI_2021_VERSION} /usr/share/Modules/modulefiles/mpi/impi-2021 - - diff --git a/centos/common/install_nccl.sh b/centos/common/install_nccl.sh deleted file mode 100755 index d204d4c7..00000000 --- a/centos/common/install_nccl.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash -set -ex - -# Install NCCL -yum install -y rpm-build rpmdevtools -NCCL_VERSION="2.11.4-1" -$COMMON_DIR/write_component_version.sh "NCCL" ${NCCL_VERSION} -TARBALL="v${NCCL_VERSION}.tar.gz" -NCCL_DOWNLOAD_URL=https://github.com/NVIDIA/nccl/archive/refs/tags/${TARBALL} -pushd /tmp -wget ${NCCL_DOWNLOAD_URL} -tar -xvf ${TARBALL} - -pushd nccl-${NCCL_VERSION} -make -j src.build -make pkg.redhat.build -rpm -i ./build/pkg/rpm/x86_64/libnccl-${NCCL_VERSION}+cuda11.4.x86_64.rpm -echo "exclude=libnccl" | sudo tee -a /etc/yum.conf -rpm -i ./build/pkg/rpm/x86_64/libnccl-devel-${NCCL_VERSION}+cuda11.4.x86_64.rpm -echo "exclude=libnccl-devel" | sudo tee -a /etc/yum.conf -rpm -i ./build/pkg/rpm/x86_64/libnccl-static-${NCCL_VERSION}+cuda11.4.x86_64.rpm -echo "exclude=libnccl-static" | sudo tee -a /etc/yum.conf -popd - -# Install the nccl rdma sharp plugin -mkdir -p 
/usr/local/nccl-rdma-sharp-plugins -git clone https://github.com/Mellanox/nccl-rdma-sharp-plugins.git -pushd nccl-rdma-sharp-plugins -./autogen.sh -./configure --prefix=/usr/local/nccl-rdma-sharp-plugins --with-cuda=/usr/local/cuda -make -make install -popd -popd - -# Build the nccl tests -source /etc/profile.d/modules.sh -module load mpi/hpcx -NCCL_TESTS_VERSION="2.13.3" -TARBALL="v${NCCL_TESTS_VERSION}.tar.gz" -NCCL_TESTS_DOWNLOAD_URL="https://github.com/NVIDIA/nccl-tests/archive/refs/tags/${TARBALL}" -wget ${NCCL_TESTS_DOWNLOAD_URL} -tar -xvf ${TARBALL} -pushd nccl-tests-${NCCL_TESTS_VERSION} -make MPI=1 MPI_HOME=${HPCX_MPI_DIR} CUDA_HOME=/usr/local/cuda -popd -mv nccl-tests-${NCCL_TESTS_VERSION} /opt/nccl-tests -module unload mpi/hpcx - -# Remove installation files -rm -rf /tmp/${TARBALL} -rm -rf /tmp/nccl-${NCCL_VERSION} diff --git a/centos/common/network-config.sh b/centos/common/network-config.sh deleted file mode 100755 index 4ffa70df..00000000 --- a/centos/common/network-config.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -set -ex - -sed -i '/\[main\]/a no-auto-default=*' /etc/NetworkManager/NetworkManager.conf - -# update network config on reboot -mkdir -p /lib/systemd/system/cloud-init-local.service.d/ -cat < /lib/systemd/system/cloud-init-local.service.d/50-azure-clear-persistent-obj-pkl.conf -[Service] -ExecStartPre=-/bin/sh -xc 'if [ -e /var/lib/cloud/instance/obj.pkl ]; then echo "cleaning persistent cloud-init object"; rm /var/lib/cloud/instance/obj.pkl; fi; exit 0' -EOF diff --git a/common/clear_history.sh b/common/clear_history.sh index a7841a2e..53dced5c 100755 --- a/common/clear_history.sh +++ b/common/clear_history.sh @@ -4,11 +4,7 @@ set -ex # Find distro find_distro() { local os=`cat /etc/os-release | awk 'match($0, /^NAME="(.*)"/, result) { print result[1] }'` - if [[ $os == "CentOS Linux" ]] - then - local centos_distro=`find_centos_distro` - echo "${os} ${centos_distro}" - elif [[ $os == "AlmaLinux" ]] + if [[ $os == "AlmaLinux" ]] then 
local alma_distro=`find_alma_distro` echo "${os} ${alma_distro}" @@ -22,11 +18,6 @@ find_distro() { fi } -# Find CentOS distro -find_centos_distro() { - echo `cat /etc/redhat-release | awk '{print $4}'` -} - # Find Alma distro find_alma_distro() { echo `cat /etc/redhat-release | awk '{print $3}'` @@ -40,7 +31,7 @@ find_ubuntu_distro() { distro=`find_distro` echo "Detected distro: ${distro}" -if [[ $distro == *"CentOS Linux"* ]] || [[ $distro == *"AlmaLinux"* ]] +if [[ $distro == *"AlmaLinux"* ]] then # Sync yum and rpmdb after installing rpm's outside yum yum history sync diff --git a/common/extract_distro.sh b/common/extract_distro.sh index 5ecef830..2c5a18b6 100755 --- a/common/extract_distro.sh +++ b/common/extract_distro.sh @@ -1,10 +1,5 @@ #!/bin/bash -# Find CentOS distro -find_centos_distro() { - echo `cat /etc/redhat-release | awk '{print $4}'` -} - # Find Ubuntu distro find_ubuntu_distro() { echo `cat /etc/os-release | awk 'match($0, /^PRETTY_NAME="(.*)"/, result) { print result[1] }' | awk '{print $2}'` @@ -12,11 +7,7 @@ find_ubuntu_distro() { # Find distro os=`cat /etc/os-release | awk 'match($0, /^NAME="(.*)"/, result) { print result[1] }'` -if [[ $os == "CentOS Linux" ]] -then - centos_distro=`find_centos_distro` - echo "${os} ${centos_distro}" -elif [[ $os == "Ubuntu" ]] +if [[ $os == "Ubuntu" ]] then ubuntu_distro=`find_ubuntu_distro` echo "${os} ${ubuntu_distro}" diff --git a/suse/sle-hpc-15.x/common/hpc-tuning.sh b/suse/sle-hpc-15.x/common/hpc-tuning.sh deleted file mode 100755 index 8a448640..00000000 --- a/suse/sle-hpc-15.x/common/hpc-tuning.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash - -# Disable some unneeded services by default (administrators can re-enable if desired) -systemctl disable firewalld - -# Update memory limits -cat << EOF >> /etc/security/limits.conf -* hard memlock unlimited -* soft memlock unlimited -* hard nofile 65535 -* soft nofile 65535 -* hard stack unlimited -* soft stack unlimited -EOF - -# Enable reclaim mode -echo 
"vm.zone_reclaim_mode = 1" >> /etc/sysctl.conf -#echo "net.ipv4.neigh.default.gc_thresh1 = 4096" >> /etc/sysctl.conf -#echo "net.ipv4.neigh.default.gc_thresh2 = 8192" >> /etc/sysctl.conf -#echo "net.ipv4.neigh.default.gc_thresh3 = 16384" >> /etc/sysctl.conf -#echo "sunrpc.tcp_max_slot_table_entries = 128" >> /etc/sysctl.conf -sysctl -p - -# on SUSE sunrpc get automatically loaded with nfs-client -# if you have problems psl. look at https://www.suse.com/support/kb/doc/?id=000019178 - -# Remove auoms if exists - Prevent CPU utilization by auoms -if zypper se --installed-only azsec-monitor >/dev/null 2>&1; then zypper --non-interactive remove -y azsec-monitor; fi - -# Update WALinuxAgent - for IPoIB -zypper --non-interactive update -y python-azure-agent - -# Configure WALinuxAgent -# EnableRDMA=y is already set by default within the SLE HPC image - -cat << EOF | tee -a /etc/waagent.conf -# default 6 -Extensions.GoalStatePeriod=300 -# default 30 -OS.RemovePersistentNetRulesPeriod=300 -# default 30 -OS.MonitorDhcpClientRestartPeriod=60 -# default 30 -Provisioning.MonitorHostNamePeriod=60 -EOF -systemctl restart waagent -$COMMON_DIR/write_component_version.sh "WAAGENT" $(waagent --version | head -n 1 | awk -F' ' '{print $1}' | awk -F- '{print $2}') - -# NFS read-ahead limit should be ok, no need for change -# check settings with: cat /sys/class/bdi/*/read_ahead_kb -# https://learn.microsoft.com/en-us/azure/azure-netapp-files/performance-linux-nfs-read-ahead - diff --git a/suse/sle-hpc-15.x/common/install_amd_libs.sh b/suse/sle-hpc-15.x/common/install_amd_libs.sh deleted file mode 100755 index d677e716..00000000 --- a/suse/sle-hpc-15.x/common/install_amd_libs.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash -set -ex - -## AMD provides RPM packages, so in theory no need to use tarballs, -## but there is no way to get around the licence section at the website -## https://developer.amd.com/amd-aocl/#downloads -## there are two options, gcc 11.1 or AOCC3.2 -## aocl-linux-aocc- 
3.1.0-1.x86_64.rpm -## aocl-linux-gcc-3.1.0-1.x86_64.rpm -## do be able to download you need to agree to licence -# -# Additionally, AMD provides the Spack (https://spack.io/) recipes for optimally installing BLIS, -# libFLAME, ScaLAPACK, LibM, FFTW, and Sparse libraries - -INSTALL_PREFIX=/opt/amd -mkdir -p ${INSTALL_PREFIX} - -TARBALL=$(basename $AOCL_DOWNLOAD_URL) - -# TODO: this seems a workaround to accept the licence prior download -# should be fixed in readme and made be more general e.g. prior download of the rpm and not the tarball -$COMMON_DIR/download_and_verify.sh $AOCL_DOWNLOAD_URL $AOCL_CHKSUM -tar -xvf ${TARBALL} - -cd aocl-linux-aocc-${AOCL_VERSION} -./install.sh -t amd -l blis fftw libflame -i lp64 -cp -r amd/${AOCL_VERSION}/* ${INSTALL_PREFIX} -cd .. && rm -rf aocl-linux-aocc-${AOCL_VERSION} ${TARBALL} - -$COMMON_DIR/write_component_version.sh "AOCL" ${AOCL_VERSION} - -# Setup module files for AMD Libraries -# SUSE HPC uses lmod by default -mkdir -p ${MODULE_FILES_DIRECTORY}/amd/ - -cat << EOF >> ${MODULE_FILES_DIRECTORY}/amd/aocl-${AOCL_VERSION} - -#%Module 1.0 -# -# AOCL -# -prepend-path LD_LIBRARY_PATH ${INSTALL_PREFIX}/lib -setenv AMD_FFTW_INCLUDE ${INSTALL_PREFIX}/include -EOF - -# Create symlinks for modulefiles -ln -sf $(readlink --canonicalize ${MODULE_FILES_DIRECTORY}/amd/aocl-${AOCL_VERSION}) ${MODULE_FILES_DIRECTORY}/amd/aocl - diff --git a/suse/sle-hpc-15.x/common/install_dcgm.sh b/suse/sle-hpc-15.x/common/install_dcgm.sh deleted file mode 100755 index f9b69adc..00000000 --- a/suse/sle-hpc-15.x/common/install_dcgm.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -set -ex - -# Install DCGM - -# workaround to create group for serviceUser=nvidia-dcgm -# as the package expects that the variable "USERGROUPS_ENAB" in /etc/login.defs is set to yes -# in oder that the useradd create the group too, but the SUSE default is "no", so we would change the -# variable or simply use the parameter -U -# wrong cmd: useradd -r -M -s /usr/sbin/nologin 
${serviceUser} -# right cmd: useradd -r -M -U -s /usr/sbin/nologin ${serviceUser} -# see: man useradd -# bug reported to Nvidia -serviceUser="nvidia-dcgm" - -#check if user exists, if not create it -if ! id $serviceUser &>/dev/null; then - useradd -r -M -U -s /usr/sbin/nologin ${serviceUser} -fi -zypper --non-interactive install -y -l datacenter-gpu-manager = ${DCGM_VERSION} - -systemctl --now enable nvidia-dcgm - -# Check if the service is active -systemctl is-active --quiet nvidia-dcgm -error_code=$? -if [ ${error_code} -ne 0 ] -then - echo "DCGM is inactive!" - exit ${error_code} -fi - -# to verify the installation we can query the system -# You should see a listing of all supported GPUs (and any NVSwitches) found in the system: -# dcgmi discovery -l - -$COMMON_DIR/write_component_version.sh "DCGM" ${DCGM_VERSION} diff --git a/suse/sle-hpc-15.x/common/install_docker.sh b/suse/sle-hpc-15.x/common/install_docker.sh deleted file mode 100755 index b05f86a1..00000000 --- a/suse/sle-hpc-15.x/common/install_docker.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -set -ex - -# Install docker -zypper --non-interactive in -y -l docker - -DOCKER_VERSION=$(rpm -q --qf="%{VERSION}" docker) - -# Ensure the Docker service is running -systemctl --now enable docker - -# if experimental is needed -#zypper modifyrepo --enable libnvidia-container-experimental - -zypper --non-interactive install -y -l --replacefiles nvidia-docker2 nvidia-container-runtime - -systemctl restart docker - -# Test with -#docker run --rm --gpus all nvidia/cuda:11.0.3-base-ubuntu20.04 nvidia-smi - -# Write the docker version to components file -$COMMON_DIR/write_component_version.sh "NVIDIA-DOCKER" ${DOCKER_VERSION} diff --git a/suse/sle-hpc-15.x/common/install_gcc.sh b/suse/sle-hpc-15.x/common/install_gcc.sh deleted file mode 100755 index 69fd6759..00000000 --- a/suse/sle-hpc-15.x/common/install_gcc.sh +++ /dev/null @@ -1,9 +0,0 @@ - -# With SLE HPC 15 SP4 we have gcc7 and gcc11 provided as the base 
compiler toolchain -# and requires lua-lmod to supply environment module support. -# the default is gnu/7 within package gnu-compilers-hpc, gcc11 is in gnu11-compilers-hpc -# MODULE_FILES_DIRECTORY=/usr/share/lmod/modulefiles - -zypper in -y gnu${GNU_COMPILER_VERSION}-compilers-hpc-devel - -$COMMON_DIR/write_component_version.sh "GCC" ${GNU_COMPILER_VERSION} diff --git a/suse/sle-hpc-15.x/common/install_intel_libs.sh b/suse/sle-hpc-15.x/common/install_intel_libs.sh deleted file mode 100755 index dfb39c43..00000000 --- a/suse/sle-hpc-15.x/common/install_intel_libs.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -set -ex - -# Intel® oneAPI Math Kernel Library - -# Intel provides oneapi RPM packages for SUSE -# and the repository is set up in install_utils.sh -# so we can simply install the needed package - -# workaround for wrong multi-version repos of intel. -# instead of use the version for a package search and install -# we need to provide a fixed name including the version -# so instead of -# zypper install -y -l intel-oneapi-mkl = $INTEL_ONE_MKL_VERSION -# we forced to use -zypper --non-interactive install -y -l intel-oneapi-mkl-$INTEL_ONE_MKL_VERSION - -$COMMON_DIR/write_component_version.sh "INTEL_ONE_MKL" $INTEL_ONE_MKL_VERSION diff --git a/suse/sle-hpc-15.x/common/install_mellanoxofed.sh b/suse/sle-hpc-15.x/common/install_mellanoxofed.sh deleted file mode 100755 index d9101267..00000000 --- a/suse/sle-hpc-15.x/common/install_mellanoxofed.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -set -ex - -# SLES comes with mellanox inbox (kernel) drivers by default, so no need to install anything -# - -# the ibdev2netdev is only in the external mellanox package, so we do not have it with inbox drivers -wget https://raw.githubusercontent.com/Mellanox/container_scripts/master/ibdev2netdev -mv ibdev2netdev /usr/local/bin -chmod +x /usr/local/bin/ibdev2netdev - -# IF you want the external drivers provided by 
https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/ -# it provides packages for SLES with MLNX_OFED_LINUX-5.7-1.0.2.0-sles15sp4-x86_64.tgz -# SHA256: f3af9dd691dc07404fa07a1c3819de14361dc292d90a9b81aac6a7c729a2ea0f -# you need to agree to the eula and provide the file somewhere - -#MLNX_OFED_DOWNLOAD_URL=https://content.mellanox.com/ofed/MLNX_OFED-5.7-1.0.2.0/MLNX_OFED_LINUX-5.7-1.0.2.0-sles15sp4-x86_64.tgz -#TARBALL=$(basename ${MLNX_OFED_DOWNLOAD_URL}) -#MOFED_FOLDER=$(basename ${MLNX_OFED_DOWNLOAD_URL} .tgz) - -#$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL "f3af9dd691dc07404fa07a1c3819de14361dc292d90a9b81aac6a7c729a2ea0f" -#tar zxvf ${TARBALL} - -# SUSE - if you use the tarball -# The tarball contains modules for SLES default kernel already, there is no need for adding additional parameters to the installscript -# but our default for SLES HPC is the -azure kernel, so we need to add it or switch to kernel-default. - -# SUSE default kernel would be simply: -#./${MOFED_FOLDER}/mlnxofedinstall - -# SUSE azure kernel (manual check before if kernel and kernel-src fit together, could be not the same due to updates) -#KERNEL=$(uname -r) -#zypper in -y -l kernel-azure-devel kernel-source-azure -#make -C /usr/src/linux-azure oldconfig -#./${MOFED_FOLDER}/mlnxofedinstall --kernel $KERNEL --kernel-sources /usr/src/linux-${KERNEL} --add-kernel-support --skip-repo diff --git a/suse/sle-hpc-15.x/common/install_mpis.sh b/suse/sle-hpc-15.x/common/install_mpis.sh deleted file mode 100755 index 195226fc..00000000 --- a/suse/sle-hpc-15.x/common/install_mpis.sh +++ /dev/null @@ -1,123 +0,0 @@ -#!/bin/bash -set -ex - -# SLE HPC 15 SP4 comes with three different implementation of the Message Passing Interface (MPI) standard are provided standard with the HPC module: -# Open MPI 4 (and version 3) -# MVAPICH2 -# MPICH 4 -# MPICH-ofi 4.0.1 -# openblas -# Intel MPI Benchmarks -# These packages have been built with full environment module support (LMOD). 
-# PLEASE have a look at our documentation -# https://documentation.suse.com/sle-hpc/15-SP4/single-html/hpc-guide/#sec-compute-lib - -# https://docs.nvidia.com/networking/category/hpcx -# https://docs.nvidia.com/networking/display/HPCXv28 - - -# Load gcc -set CC=/usr/bin/gcc -set GCC=/usr/bin/gcc - -# -INSTALL_PREFIX=/opt - -# MVAPICH2 -# shipped with SLE HPC -zypper --non-interactive install -y mvapich2-gnu-hpc -MV2_VERSION=$(rpm -q --qf="%{VERSION}" mvapich2-gnu-hpc) -$COMMON_DIR/write_component_version.sh "MVAPICH2" ${MV2_VERSION} - -# OpenMPI 4 -# shipped with SLE HPC -zypper --non-interactive install -y ${OMPI}-gnu-hpc lib${OMPI}-gnu-hpc -OMPI_VERSION=$(rpm -q --qf="%{VERSION}" ${OMPI}-gnu-hpc) -$COMMON_DIR/write_component_version.sh "OMPI" ${OMPI_VERSION} - -# HPC-X -TARBALL=$(basename ${HPCX_DOWNLOAD_URL}) -HPCX_FOLDER=$(basename ${HPCX_DOWNLOAD_URL} .tbz) -# the web page said checksum is md5 but in reality is sha256 -$COMMON_DIR/download_and_verify.sh ${HPCX_DOWNLOAD_URL} ${HPCX_CHKSUM} -tar -xvf ${TARBALL} -rm -rf ${INSTALL_PREFIX}/${HPCX_FOLDER} -mv ${HPCX_FOLDER} ${INSTALL_PREFIX} -HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} -$COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION - -# Enable Sharpd -#${HPCX_PATH}/sharp/sbin/sharp_daemons_setup.sh -s -d sharpd -#systemctl enable sharpd -#systemctl start sharpd - -# Intel MPI -# as there are more versions in the repos we need to select one -# instead of always get the newest -# workaround for wrong multi-version repos of intel. 
-# instead of use the version for a package search and install -# we need to provide a fixed name including the version -# so instead of -# zypper install -y -l intel-oneapi-mpi = $INTEL_ONE_MKL_VERSION -# we forced to use -zypper install -y -l intel-oneapi-mpi-${INTEL_ONE_MPI_VERSION} -# Create modulesfiles -/opt/intel/oneapi/modulefiles-setup.sh --force - -$COMMON_DIR/write_component_version.sh "IMPI_${IMPI_MAJOR}" ${INTEL_ONE_MPI_VERSION} - -# -# # Setup module files for MPIs -# - -mkdir -p $MODULE_FILES_DIRECTORY/mpi/ - -# -# # HPC-X -cat << EOF >> $MODULE_FILES_DIRECTORY/mpi/hpcx-${HPCX_VERSION} -#%Module 1.0 -# -# HPCx ${HPCX_VERSION} -# -#module-whatis "Description: Mellanox HPC-X™ Software Toolkit" -set version ${HPCX_VERSION} -conflict mpi -module use ${HPCX_PATH}/modulefiles -module load hpcx -EOF - -# libraries are build against gnu-7 and not gnu-11, so we need to have the path hardcoded to gnu-7 -# Cannot use links because we need to load compiler as well. -# e.g. /usr/share/lmod/moduledeps/gnu-7/mvapich2/2.3.6 -# MVAPICH2 -> already provided by suse package, build with gcc7 -cat <> ${MODULE_FILES_DIRECTORY}/mpi/mvapich2-${MV2_VERSION} -#%Module 1.0 -set version ${MV2_VERSION} -conflict mpi -module use /usr/share/lmod/modulefiles -module load gnu/7 mvapich2/${MV2_VERSION} -EOF - -# OpenMPI -> already provided by suse package, build with gcc7 -cat <> ${MODULE_FILES_DIRECTORY}/mpi/openmpi-${OMPI_VERSION} -#%Module 1.0 -set version ${OMPI_VERSION} -conflict mpi -module use /usr/share/lmod/modulefiles -module load gnu/7 openmpi/${OMPI_VERSION} -EOF - -# Intel oneAPI -# oneapi provides its own modulefiles -ln -sf $(readlink --canonicalize $INTELLIBS/mpi/${INTEL_ONE_MPI_VERSION}/modulefiles/mpi) ${MODULE_FILES_DIRECTORY}/mpi/impi_${INTEL_ONE_MPI_VERSION} - - -# # Create symlinks for modulefiles -ln -sf $(readlink --canonicalize ${MODULE_FILES_DIRECTORY}/mpi/hpcx-${HPCX_VERSION}) ${MODULE_FILES_DIRECTORY}/mpi/hpcx -ln -sf $(readlink --canonicalize 
${MODULE_FILES_DIRECTORY}/mpi/mvapich2-${MV2_VERSION}) ${MODULE_FILES_DIRECTORY}/mpi/mvapich2 -ln -sf $(readlink --canonicalize ${MODULE_FILES_DIRECTORY}/mpi/openmpi-${OMPI_VERSION}) ${MODULE_FILES_DIRECTORY}/mpi/openmpi -ln -sf $(readlink --canonicalize ${MODULE_FILES_DIRECTORY}/mpi/impi_${INTEL_ONE_MPI_VERSION}) ${MODULE_FILES_DIRECTORY}/mpi/impi-${IMPI_MAJOR} - -# cleanup downloaded tarballs and other installation files/folders -rm -rf *.tar.gz *offline.sh -rm -rf -- */ diff --git a/suse/sle-hpc-15.x/common/install_nccl.sh b/suse/sle-hpc-15.x/common/install_nccl.sh deleted file mode 100755 index 1346572b..00000000 --- a/suse/sle-hpc-15.x/common/install_nccl.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -set -ex - -# Install NCCL -# Optimized primitives for inter-GPU communication. - -# add rpm build tools -zypper install -y -l rpm-build rpmdevtools git - -CUDA_MAJOR=$( echo ${CUDA_VERSION} | cut -d "." -f 1) -CUDA_MINOR=$( echo ${CUDA_VERSION} | cut -d "." -f 2) - -TARBALL=$(basename ${NCCL_DOWNLOAD_URL}) - -pushd /tmp -wget ${NCCL_DOWNLOAD_URL} -tar -xvf ${TARBALL} - -pushd nccl-${NCCL_VERSION} - -# if you need to limit the number of parallel runs on smaller machines -#mem=$(cat /proc/meminfo | head -1 | sed -e "s/^[^ ]\+[ ]\+\([^ ]\+\)[ ]\+.*/\\1/") -#core=$(cat /proc/cpuinfo | grep processor | wc -l) -#cnt=$(( a=mem/(512*1024), a < core ? a : core )) -#make -j $cnt src.build NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80" - -# You should define NVCC_GENCODE in your environment to the minimal set -# of archs to reduce compile time. 
-#make -j src.build NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80" - -make -j src.build - -# build rpm packages -make pkg.redhat.build -rpm -i ./build/pkg/rpm/x86_64/libnccl-${NCCL_VERSION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}.x86_64.rpm -rpm -i ./build/pkg/rpm/x86_64/libnccl-devel-${NCCL_VERSION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}.x86_64.rpm -rpm -i ./build/pkg/rpm/x86_64/libnccl-static-${NCCL_VERSION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}.x86_64.rpm -popd -rm -rf nccl-${NCCL_VERSION} $TARBALL - -# Install the nccl rdma sharp plugin -# we need the packages: autoconf automake libtool rdma-core-devel -mkdir -p /usr/local/nccl-rdma-sharp-plugins -git clone https://github.com/Mellanox/nccl-rdma-sharp-plugins.git -pushd nccl-rdma-sharp-plugins -./autogen.sh -./configure --prefix=/usr/local/nccl-rdma-sharp-plugins --with-cuda=/usr/local/cuda -make -make install -popd -rm -rf nccl-rdma-sharp-plugins - - -# Build the nccl tests -source /etc/profile.d/lmod.sh -module load mpi/hpcx -git clone https://github.com/NVIDIA/nccl-tests.git -pushd nccl-tests -make MPI=1 MPI_HOME=${HPCX_MPI_DIR} CUDA_HOME=/usr/local/cuda -popd -mv nccl-tests /opt -module purge -popd - - -$COMMON_DIR/write_component_version.sh "NCCL" ${NCCL_VERSION} - -# Remove installation files -rm -rf /tmp/${TARBALL} -rm -rf /tmp/nccl-${NCCL_VERSION} diff --git a/suse/sle-hpc-15.x/common/install_nvidiagpudriver.sh b/suse/sle-hpc-15.x/common/install_nvidiagpudriver.sh deleted file mode 100755 index 010fab04..00000000 --- a/suse/sle-hpc-15.x/common/install_nvidiagpudriver.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -set -ex - -# -## Nvidia provide certified packages for SLES 15 SP4, so we only need to add the repositories and install the packages -# -DRIVER_BRANCH_VERSION=${NVIDIA_VERSION%.*.*} # branch is like main version e.g. 
525 from 525.85.12 -CUDA_DASH_VERSION=${CUDA_VERSION/./-} - -# to check whats all available in the repo -# zypper se --repo cuda-sles15-x86_64 - -# install latest cuda package -# the repo contains more versions, so for example we want cuda 11.3 the package name is "cuda-11-3" -# -# cuda Installs all CUDA Toolkit and Driver packages. Handles upgrading to the next version of the cuda package when it's released. -# cuda-11-8 Installs all CUDA Toolkit and Driver packages. Remains at version 11.8 until an additional version of CUDA is installed. -# cuda-toolkit-11-8 Installs all CUDA Toolkit packages required to develop CUDA applications. Does not include the driver. -# cuda-tools-11-8 Installs all CUDA command line and visual tools. -# cuda-runtime-11-8 Installs all CUDA Toolkit packages required to run CUDA applications, as well as the Driver packages. -# cuda-compiler-11-8 Installs all CUDA compiler packages. -# cuda-libraries-11-8 Installs all runtime CUDA Library packages. -# cuda-libraries-devel-11-8 Installs all development CUDA Library packages. -# cuda-drivers Installs all Driver packages. Handles upgrading to the next version of the Driver packages when they're released. 
- -# due to NVIDIA bug in post-install of the nvidia-drivers for kernel-azure, we need to select and install nvidia-gfxG05-kmp-azure manually -# The cuda dependencies select packages with "-default" and then the (wrong) modules for kernel-default instead of kernel-azure got installed -# -# Don't install cuda-drivers: this introduces X11 and Wayland - instead install nvidia-computeGXX -# Don't install cuda-toolkit: this introduces visualization tools -# - instead install cuda-compilers, cuda-command-line-tools, gds-tools and cuda_libraries -zypper -n install -y -l --no-recommends cuda-toolkit-${CUDA_DASH_VERSION} cuda-compiler-${CUDA_DASH_VERSION} cuda-command-line-tools-${CUDA_DASH_VERSION} gds-tools-${CUDA_DASH_VERSION} cuda-libraries-${CUDA_DASH_VERSION} nvidia-fabricmanager = ${NVIDIA_VERSION} "nvidia-gfxG05-kmp-azure = ${NVIDIA_VERSION}" "nvidia-computeG05 = ${NVIDIA_VERSION}" - - -$COMMON_DIR/write_component_version.sh "CUDA" ${CUDA_VERSION} -$COMMON_DIR/write_component_version.sh "NVIDIA" ${NVIDIA_VERSION} - -# Post-install tasks (version its set through 'alternatives') -echo 'export PATH=$PATH:/usr/local/cuda/bin' | tee -a /etc/bash.bashrc.local -echo '/usr/local/cuda/lib64' | tee /etc/ld.so.conf.d/cuda.conf - -# start the fabricmanager - needed for run-tests on ND96asr_v4 -# systemctl start nvidia-fabricmanager.service diff --git a/suse/sle-hpc-15.x/common/install_utils.sh b/suse/sle-hpc-15.x/common/install_utils.sh deleted file mode 100755 index f5673e8a..00000000 --- a/suse/sle-hpc-15.x/common/install_utils.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash -set -ex - -#------------------------------------------------------------------- -# Container Repository -#------------------------------------------------------------------- -# Docker is shipped with SLES by default -# with SLE HPC we need to enable the Container repository -SUSEConnect -p sle-module-containers/${SLE_DOTV}/x86_64 - -#------------------------------------------------------------------- -# 
Add SUSE Package Hub -# byacc is only in packagehub -SUSEConnect -p PackageHub/${SLE_DOTV}/x86_64 -#------------------------------------------------------------------- - -#------------------------------------------------------------------- -# Nvidia provide certified packages for SLES 15, so we only need to add the repositories and install the packages -# add the repo key separately beforehand. -SUSEConnect -p sle-module-NVIDIA-compute/${SLE_MAJOR}/x86_64 --gpg-auto-import-keys -#------------------------------------------------------------------- - -# Install pre-reqs and development tools -# - -# Add additional repositories - -#------------------------------------------------------------------- -# Intel provides oneapi RPM packages for SUSE, so we only need to add the repositories and install the packages -#------------------------------------------------------------------- -# see -# https://www.intel.com/content/www/us/en/develop/documentation/installation-guide-for-intel-oneapi-toolkits-linux/top/installation/install-using-package-managers/yum-dnf-zypper.html - -# import package signing keys -rpm --import $INTEL_PUBKEY_URI -# delete if exists -zypper -n rr oneAPI &>/dev/null || : -# add repository -zypper -n addrepo -f -g $INTEL_REPO_URI oneAPI -# fetch key -zypper --non-interactive --gpg-auto-import-keys refresh oneAPI -# disable auto-refresh for the repo (mr -F) -zypper --non-interactive modifyrepo --no-refresh oneAPI - -# list all packages -# sudo -E zypper pa -ir oneAPI -#------------------------------------------------------------------- - -#------------------------------------------------------------------- -# Nvidia container repo -#------------------------------------------------------------------- -# see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html -# Check https://nvidia.github.io/libnvidia-container -zypper --non-interactive rr libnvidia-container &>/dev/null || : -zypper addrepo -f -g $NVIDIA_CONTAINER_REPO_URI 
-# fetch key -zypper --non-interactive --gpg-auto-import-keys refresh libnvidia-container - -#------------------------------------------------------------------- -# Add SUSE Package Hub -# byacc is only in packagehub -SUSEConnect -p PackageHub/${SLE_DOTV}/x86_64 -#------------------------------------------------------------------- - -# -## SLES HPC ship with many HPC packages already, so no need to build it - simple install is enough -# -# Install base compiler (this will pull in packages for HPC and Lmod as well) -zypper --non-interactive in -y gnu-compilers-hpc-devel - -# -# If you run kernel-default remove "-azure" from the kernel package names below -# -zypper install -y \ - numactl \ - byacc \ - atk \ - m4 \ - ${KERNEL_VERSION:+kernel-azure-devel = ${KERNEL_VERSION}} \ - ${KERNEL_VERSION:+kernel-source-azure = ${KERNEL_VERSION}} \ - binutils \ - fuse \ - cmake \ - libarchive13 \ - libsecret-1-0 \ - libnuma-devel \ - libibverbs-utils \ - perftest \ - mstflint \ - bzip2 \ - vim-data \ - clone-master-clean-up \ - insserv-compat \ - rpm-build \ - python3-devel\ - patch \ - python-rpm-macros \ - lshw \ - autoconf \ - automake \ - libtool \ - nfs-client \ - jq \ - rdma-core-devel \ - wget - -# Install azcopy tool -## To copy blobs or files to or from a storage account. 
-wget ${AZCOPY_DOWNLOAD_URL} -tar -xvf ${AZTARBALL} -## copy the azcopy to the bin path - better would be ${LOCALBIN} -mv azcopy_linux_amd64_${AZVERSION}/azcopy ${LOCALBIN} -chmod +x ${LOCALBIN}/azcopy -$COMMON_DIR/write_component_version.sh "azcopy" ${AZCOPY_VERSION} -## remove azcopy tarball and directory -rm -rf *.tar.gz azcopy_linux_amd64_${AZVERSION} - diff --git a/suse/sle-hpc-15.x/sle-hpc-15-sp4/README.md b/suse/sle-hpc-15.x/sle-hpc-15-sp4/README.md deleted file mode 100755 index 4c9443f6..00000000 --- a/suse/sle-hpc-15.x/sle-hpc-15-sp4/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# SUSE Linux Enterprise HPC 15 SP4 enhanced image - -SUSE Linux Enterprise HPC is a SUSE maintained and supported commercial product for the HPC market. -see https://www.suse.com/products/server/hpc/ - -SUSE provides images in the Azure Marketplace as PayAsYouGo (PAYG) or BringYourOwnSubscription (BYOS) model - -The SLE HPC 15 SP4 includes optimizations and recommended configurations to deliver optimal performance, -consistency, and reliability. As an enterprise distribution, SUSE provides stable long term support and many certifications with vendors. - -Many components come per default with the distribution like slurm, genders, pdsh, munge, hwloc, conman, numpy, scipy, PLlx,openblas, hdf5, openmpi, mvapich2, mpich, imb, papi, mpiP, spack, dolly, lmod. - -See documentation at https://documentation.suse.com/sle-hpc/15-SP4/ - -This setup here is build on top of **the PAYG image**, as it provides easy access without any registration to all SUSE provided software packages. - -The azhpc-images script will in addition add modules and libraries which are NOT supported by SUSE and could not delivered by SUSE. You need to own the subscriptions/licences and agree to the respective EULAs from the vendors by yourself. 
- -## Enhancements -This image consists of the following additional HPC tools and libraries: - -- *(S) provided and supported by SUSE* -- *(E) external sources, added by the script - not supported by SUSE* - -### Infiniband - -- (S) - Mellanox OFED (inbox drivers) -- (S) - Pre-configured IPoIB (IP-over-InfiniBand) - -### Popular InfiniBand based MPI Libraries - -- (E) - HPC-X -- (E) - IntelMPI (via Intel oneAPI) -- (S) - MVAPICH2 -- (S) - MPICH 4 -- (S) - OpenMPI (v3 and v4) - -### Optimized librares - -- (E) - AMD (via tarball AOCL ) - - Blis - - FFTW - - Flame - -- (E) - Intel MKL (Intel oneAPI via intel-oneapi-repo) - -- (E) - Nvida drivers -- (E) - CUDA -- (E) - NCCL - -- (E) - Data Center GPU Manager -- (E) - Azure HPC Diagnostics Tool - -- (S) - Docker -- (E) - NVIDIA Docker - -Software packages are configured as environment modules (lmod). Users can select preferred MPI or software packages as follows: -`module load ` - -Don't forget to set the group "video" for your user running nvidia cmds - -`sudo usermod -a -G video ` diff --git a/suse/sle-hpc-15.x/sle-hpc-15-sp4/config b/suse/sle-hpc-15.x/sle-hpc-15-sp4/config deleted file mode 100644 index 2ed0bdc8..00000000 --- a/suse/sle-hpc-15.x/sle-hpc-15-sp4/config +++ /dev/null @@ -1,120 +0,0 @@ -# SLE Setup - -# SLE Version -# it can be generated, but better is to go through this file -# manual and set the right versions as some have dependencies like nvidia, cuda and nccl - -# #source /etc/os-release -# #export SLE_DOTV=${VERSION_ID} -# #export SLE_MAJOR=${VERSION_ID%.*} -export SLE_DOTV=15.4 -export SLE_MAJOR=${SLE_DOTV%%.*} -export SLE_MINOR=${SLE_DOTV##*.} - - -# we need the running kernel version to get the right -devel packages -# PAYG SLES and SLE HPC use kernel-azure by default -# BYOS has kernel-default by default, so pls. 
replace name below if you use BYOS images -export KERNEL_VERSION=$(rpm -q --qf="%{VERSION}-%{RELEASE}" kernel-azure) -[[ $KERNEL_VERSION =~ "is not" ]] && KERNEL_VERSION= - - -# GNU compiler version -export GNU_COMPILER_VERSION=11 - -# SUSE uses lmod -export MODULE_FILES_DIRECTORY=/usr/share/lmod/modulefiles -export MODULE_DEPS_DIRECTORY=/usr/share/lmod/moduledeps - -# local directories -export LOCALDIR=/usr/local -export LOCALBIN=${LOCALDIR}/bin - -# azcopy -export AZVERSION="10.16.2" -RELEASE_TAG="release20221108" -# -export AZTARBALL="azcopy_linux_amd64_${AZVERSION}.tar.gz" -export AZCOPY_DOWNLOAD_URL="https://azcopyvnext.azureedge.net/${RELEASE_TAG}/${AZTARBALL}" - - -# HPC-X -# pls accept the EULA -export HPCX_VERSION="2.12" -export HPCX_DOWNLOAD_URL=https://content.mellanox.com/hpc/hpc-x/v${HPCX_VERSION}/hpcx-v${HPCX_VERSION}-gcc-inbox-suse${SLE_DOTV}-cuda11-gdrcopy2-nccl${HPCX_VERSION}-x86_64.tbz -export HPCX_CHKSUM="bc315d3b485d13c97cd174ef5c9cba5c2fa1fbc3e5175f96f1a406a6c0699bdb" -# URL need to be changed if you do not use the inbox drivers and want use MLNX_OFED -#export HPCX_DOWNLOAD_URL=https://content.mellanox.com/hpc/hpc-x/v${HPCX_VERSION}/hpcx-v${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-suse${SLE_DOTV}-cuda11-gdrcopy2-nccl${HPCX_VERSION}-x86_64.tbz -#export HPCX_CHKSUM="295376818a596b39196ca3843f4b5fc9e0607e50b0a0f96b4b97907af5f45fcb" - - -# NVIDIA - -## CUDA -## Nvidia provide certified packages for SLES, so we only need to add the repositories and install the packages -export CUDA_PUBKEY_URI=https://developer.download.nvidia.com/compute/cuda/repos/sles${SLE_MAJOR}/x86_64/D42D0685.pub -export CUDA_REPO_URI=https://developer.download.nvidia.com/compute/cuda/repos/sles${SLE_MAJOR}/x86_64/cuda-sles${SLE_MAJOR}.repo -# Version need to be with dot -export CUDA_VERSION="11.8" - -## NVIDIA DRIVER -export NVIDIA_VERSION="525.85.12" - -## NVIDIA CONTAINER -## see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html -export 
NVIDIA_CONTAINER_REPO_URI=https://nvidia.github.io/libnvidia-container/sles15.3/libnvidia-container.repo - -## NVIDIA DATACENTER MGR -export DCGM_VERSION="1:3.1.6" - -# NCCL -# actual is v2.17.1-1 (march 2023) -# v2.16.2-1 does support CUDA 12, and drop Kepler (sm_35) -export NCCL_VERSION="2.15.1-1" # for cuda-11.8 -export NCCL_DOWNLOAD_URL=https://github.com/NVIDIA/nccl/archive/refs/tags/v${NCCL_VERSION}.tar.gz - -# IBDEV2NETDEV -export IBDEV2NETDEV_BINARY_SOURCE=https://raw.githubusercontent.com/Mellanox/container_scripts/master/ibdev2netdev - -# AMD -export AOCL_VERSION="4.0" -export AOCL_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/aocl-linux-aocc-${AOCL_VERSION}.tar.gz -export AOCL_CHKSUM="c8000a66aaa2a257252cbb307732b4e66758b72b08f43b3723f4eb5404ba28c8" - -# OpenMPI 4 -export OMPI=openmpi4 - -# Intel oneAPI -# https://www.intel.com/content/www/us/en/develop/documentation/installation-guide-for-intel-oneapi-toolkits-linux/top/installation/install-using-package-managers/yum-dnf-zypper.html -export INTEL_PUBKEY_URI=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB -export INTEL_REPO_URI=https://yum.repos.intel.com/oneapi -export INTELLIBS=/opt/intel/oneapi - -## Intel MPI - to check whats available with: zypper se -s --repo oneAPI -# intel-oneapi-mpi-2021.1.1 -# intel-oneapi-mpi-2021.2.0 -# intel-oneapi-mpi-2021.3.0 -# intel-oneapi-mpi-2021.3.1 -# intel-oneapi-mpi-2021.4.0 -# intel-oneapi-mpi-2021.5.0 -# intel-oneapi-mpi-2021.5.1 -# intel-oneapi-mpi-2021.6.0 -# intel-oneapi-mpi-2021.7.0 -# intel-oneapi-mpi-2021.7.1 -# intel-oneapi-mpi-2021.8.0 -# intel-oneapi-mpi-2021.9.0 -export INTEL_ONE_MPI_VERSION="2021.9.0" -export IMPI_MAJOR=${INTEL_ONE_MPI_VERSION%.*.*} - -## Intel MKL (Math Kernel Library) - to check whats available with: zypper se -s --repo oneAPI -# intel-oneapi-mkl-2021.1.1 -# intel-oneapi-mkl-2021.2.0 -# intel-oneapi-mkl-2021.3.0 -# intel-oneapi-mkl-2021.4.0 -# intel-oneapi-mkl-2022.0.1 -# 
intel-oneapi-mkl-2022.0.2 -# intel-oneapi-mkl-2022.1.0 -# intel-oneapi-mkl-2022.2.0 -# intel-oneapi-mkl-2023.0.0 -# intel-oneapi-mkl-2023.1.0 -export INTEL_ONE_MKL_VERSION="2023.1.0" diff --git a/suse/sle-hpc-15.x/sle-hpc-15-sp4/fixes.sh b/suse/sle-hpc-15.x/sle-hpc-15-sp4/fixes.sh deleted file mode 100755 index 0257e6e8..00000000 --- a/suse/sle-hpc-15.x/sle-hpc-15-sp4/fixes.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -# Get the new SUSEConnect -zypper --non-interactive install -y suseconnect-ng diff --git a/suse/sle-hpc-15.x/sle-hpc-15-sp4/install.sh b/suse/sle-hpc-15.x/sle-hpc-15-sp4/install.sh deleted file mode 100755 index 21e539e9..00000000 --- a/suse/sle-hpc-15.x/sle-hpc-15-sp4/install.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -set -ex - -# set configuration -source ./config - -# set properties -source ./set_properties.sh - -# SLE version specific fixes -./fixes.sh - -# install utils -../common/install_utils.sh - -# install compilers -../common/install_gcc.sh - -# install mellanox ofed -../common/install_mellanoxofed.sh - -# install mpi libraries -../common/install_mpis.sh - -# install nvidia gpu driver -../common/install_nvidiagpudriver.sh - -# Install NCCL -../common/install_nccl.sh - -# install AMD tuned libraries -../common/install_amd_libs.sh - -# install Intel libraries -../common/install_intel_libs.sh - -# Install NVIDIA docker container -../common/install_docker.sh - -# Install Nvidia Datacenter GPU Manager (DCGM) -../common/install_dcgm.sh - -# optimizations -../common/hpc-tuning.sh - -# Network Optimization -$COMMON_DIR/network-tuning.sh - -# copy test file -$COMMON_DIR/copy_test_file.sh - -# install diagnostic script -$COMMON_DIR/install_hpcdiag.sh - -# install persistent rdma naming -# -# SUSE ships the rdma core userspace and libraries package by default -# and provides persistent naming through udev rules -# see for example /usr/lib/udev/rules.d/60-rdma-persistent-naming.rules - -# cleanup downloaded tarballs -rm -rf *.tgz *.bz2 *.tbz 
*.tar.gz *.run *.deb *_offline.sh -# cleanup directories -rm -rf -- */ - -# if you want to use it as golden-image pls. run -# -#/usr/sbin/clone-master-clean-up -#/usr/sbin/waagent -force -deprovision+user && export HISTSIZE=0 && sync - diff --git a/suse/sle-hpc-15.x/sle-hpc-15-sp4/set_properties.sh b/suse/sle-hpc-15.x/sle-hpc-15-sp4/set_properties.sh deleted file mode 100755 index 64cc71b8..00000000 --- a/suse/sle-hpc-15.x/sle-hpc-15-sp4/set_properties.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -export TOP_DIR=../../.. -export COMMON_DIR=../../../common -export TEST_DIR=../../../tests diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 718be482..88acdb3b 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -37,11 +37,7 @@ GCC_VERSION="9.2.0" # Find distro find_distro() { local os=`cat /etc/os-release | awk 'match($0, /^NAME="(.*)"/, result) { print result[1] }'` - if [[ $os == "CentOS Linux" ]] - then - local centos_distro=`find_centos_distro` - echo "${os} ${centos_distro}" - elif [[ $os == "AlmaLinux" ]] + if [[ $os == "AlmaLinux" ]] then local alma_distro=`find_alma_distro` echo "${os} ${alma_distro}" @@ -49,21 +45,12 @@ find_distro() { then local ubuntu_distro=`find_ubuntu_distro` echo "${os} ${ubuntu_distro}" - elif [[ $os == "SLE_HPC" ]] - then - local sle_hpc_distro=`find_sle_hpc_distro` - echo ${sle_hpc_distro} else echo "*** Error - invalid distro!" exit -1 fi } -# Find CentOS distro -find_centos_distro() { - echo `cat /etc/redhat-release | awk '{print $4}'` -} - # Find Alma distro find_alma_distro() { echo `cat /etc/redhat-release | awk '{print $3}'` @@ -74,11 +61,6 @@ find_ubuntu_distro() { echo `cat /etc/os-release | awk 'match($0, /^PRETTY_NAME="(.*)"/, result) { print result[1] }' | awk '{print $2}' | cut -d. 
-f1,2` } -# Find SUSE Linux Enterprise HPC distro -find_sle_hpc_distro() { - echo $(cat /etc/os-release | awk 'match($0, /^PRETTY_NAME="(.*)"/, result) { print result[1] }') -} - distro=`find_distro` echo "Detected distro: ${distro}" @@ -111,75 +93,33 @@ else HPCX_OMB_PATH_UBUNTU_1804="/opt/hpcx-${HPCX_VERSION_UBUNTU}-gcc-MLNX_OFED_LINUX-5-ubuntu18.04-cuda12-gdrcopy2-nccl2.17-x86_64/ompi/tests/osu-micro-benchmarks-5.8" fi -HPCX_VERSION_CENTOS="v2.9.0" -HPCX_VERSION_SUSE="2.12" - -MVAPICH2_VERSION_CENTOS="2.3.6" MVAPICH2_VERSION_ALMA="2.3.7-1" MVAPICH2_VERSION_UBUNTU="2.3.7-1" -MVAPICH2_VERSION_SUSE="2.3.6" -OMPI_VERSION_CENTOS="4.1.1" OMPI_VERSION_ALMA_86="4.1.3" OMPI_VERSION_ALMA_87="4.1.5" -OMPI_VERSION_SUSE="4.1.1" -IMPI_2021_VERSION_CENTOS="2021.4.0" IMPI_2021_VERSION_ALMA_86="2021.7.0" IMPI_2021_VERSION_ALMA_87="2021.9.0" -IMPI_2021_VERSION_SUSE="2021.9.0" MVAPICH2X_INSTALLATION_DIRECTORY="/opt/mvapich2-x" IMPI2018_PATH="/opt/intel/compilers_and_libraries_2018.5.274" -MOFED_VERSION_CENTOS="MLNX_OFED_LINUX-5.4-1.0.3.0" -MOFED_VERSION_CENTOS_79="MLNX_OFED_LINUX-5.4-3.0.0.0" -MOFED_VERSION_CENTOS_83="MLNX_OFED_LINUX-5.2-1.0.4.0" -MOFED_VERSION_ALMA_86="MLNX_OFED_LINUX-5.8-1.0.1.1" MOFED_VERSION_ALMA_87="MLNX_OFED_LINUX-23.07-0.5.1.2" -MOFED_VERSION_SUSE="MLNX_OFED_INBOX_5.14.21-4.0.0" -#MOFED_VERSION_SUSE="MLNX_OFED-5.7-1.0.2.0" - -HPCX_OMB_PATH_CENTOS_76="/opt/hpcx-${HPCX_VERSION_CENTOS}-gcc${GCC_VERSION}-${MOFED_VERSION_CENTOS}-redhat7.6-x86_64/ompi/tests/osu-micro-benchmarks-5.6.2" -HPCX_OMB_PATH_CENTOS_77="/opt/hpcx-${HPCX_VERSION_CENTOS}-gcc${GCC_VERSION}-${MOFED_VERSION_CENTOS}-redhat7.7-x86_64/ompi/tests/osu-micro-benchmarks-5.6.2" -HPCX_OMB_PATH_CENTOS_78="/opt/hpcx-${HPCX_VERSION_CENTOS}-gcc${GCC_VERSION}-${MOFED_VERSION_CENTOS}-redhat7.8-x86_64/ompi/tests/osu-micro-benchmarks-5.6.2" -HPCX_OMB_PATH_CENTOS_79="/opt/hpcx-${HPCX_VERSION_CENTOS}-gcc${GCC_VERSION}-${HPCX_MOFED_INTEGRATION_VERSION}-redhat7.9-x86_64/ompi/tests/osu-micro-benchmarks-5.6.2" 
-HPCX_OMB_PATH_CENTOS_81="/opt/hpcx-${HPCX_VERSION_CENTOS}-gcc${GCC_VERSION}-${MOFED_VERSION_CENTOS}-redhat8.1-x86_64/ompi/tests/osu-micro-benchmarks-5.6.2" -HPCX_OMB_PATH_CENTOS_83="/opt/hpcx-v2.8.0-gcc-${MOFED_VERSION_CENTOS_83}-redhat8.3-x86_64/ompi/tests/osu-micro-benchmarks-5.6.2" -MODULE_FILES_ROOT_CENTOS="/usr/share/Modules/modulefiles" -IMPI2021_PATH_CENTOS="/opt/intel/oneapi/mpi/${IMPI_2021_VERSION_CENTOS}" -# added "libexec" to the path, as centos and ubuntu use "libexec", but SUSE only "lib" -MVAPICH2_PATH_CENTOS="/opt/mvapich2-${MVAPICH2_VERSION_CENTOS}/libexec" -MVAPICH2X_PATH_CENTOS="${MVAPICH2X_INSTALLATION_DIRECTORY}/gnu9.2.0/mofed5.1/azure-xpmem/mpirun" -OPENMPI_PATH_CENTOS="/opt/openmpi-${OMPI_VERSION_CENTOS}" - -HPCX_OMB_PATH_ALMA_86="/opt/hpcx-v2.14-gcc-MLNX_OFED_LINUX-5-redhat8-cuda11-gdrcopy2-nccl2.16-x86_64/ompi/tests/osu-micro-benchmarks-5.8" HPCX_OMB_PATH_ALMA_87="/opt/hpcx-v2.16-gcc-mlnx_ofed-redhat8-cuda12-gdrcopy2-nccl2.18-x86_64/ompi/tests/osu-micro-benchmarks-5.8" MODULE_FILES_ROOT_ALMA="/usr/share/Modules/modulefiles" -IMPI2021_PATH_ALMA_86="/opt/intel/oneapi/mpi/${IMPI_2021_VERSION_ALMA_86}" IMPI2021_PATH_ALMA_87="/opt/intel/oneapi/mpi/${IMPI_2021_VERSION_ALMA_87}" -# added "libexec" to the path, as rh+clones and ubuntu use "libexec", but SUSE only "lib" MVAPICH2_PATH_ALMA="/opt/mvapich2-${MVAPICH2_VERSION_ALMA}/libexec" -OPENMPI_PATH_ALMA_86="/opt/openmpi-${OMPI_VERSION_ALMA_86}" OPENMPI_PATH_ALMA_87="/opt/openmpi-${OMPI_VERSION_ALMA_87}" MODULE_FILES_ROOT_UBUNTU="/usr/share/modules/modulefiles" HPCX_OMB_PATH_UBUNTU_2004="/opt/hpcx-${HPCX_VERSION_UBUNTU}-gcc-mlnx_ofed-ubuntu20.04-cuda12-gdrcopy2-nccl2.18-x86_64/ompi/tests/osu-micro-benchmarks-5.8" HPCX_OMB_PATH_UBUNTU_2204="/opt/hpcx-${HPCX_VERSION_UBUNTU}-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64/ompi/tests/osu-micro-benchmarks-5.8" IMPI2021_PATH_UBUNTU="/opt/intel/oneapi/mpi/${IMPI_2021_VERSION_UBUNTU}" -# added "libexec" to the path, as centos and ubuntu use 
"libexec", but SUSE only "lib" MVAPICH2_PATH_UBUNTU="/opt/mvapich2-${MVAPICH2_VERSION_UBUNTU}/libexec" MVAPICH2X_PATH_UBUNTU="${MVAPICH2X_INSTALLATION_DIRECTORY}/gnu9.2.0/mofed5.0/advanced-xpmem/mpirun" OPENMPI_PATH_UBUNTU="/opt/openmpi-${OMPI_VERSION_UBUNTU}" -MODULE_FILES_ROOT_SUSE="/usr/share/lmod/modulefiles" -HPCX_OMB_PATH_SUSE="/opt/hpcx-v${HPCX_VERSION_SUSE}-gcc-inbox-suse15.4-cuda11-gdrcopy2-nccl${HPCX_VERSION_SUSE}-x86_64/ompi/tests/osu-micro-benchmarks-5.8" -IMPI2021_PATH_SUSE="/opt/intel/oneapi/mpi/${IMPI_2021_VERSION_SUSE}" -# SUSE use lib instead of libexec -MVAPICH2_PATH_SUSE="/usr/lib/hpc/gnu7/mpi/mvapich2-psm2/2.3.6/lib" -MVAPICH2X_PATH_SUSE="" -OPENMPI_PATH_SUSE="/usr/lib/hpc/gnu7/mpi/openmpi/4.1.1" - CHECK_HPCX=0 CHECK_IMPI_2021=0 CHECK_IMPI_2018=0 @@ -192,18 +132,9 @@ CHECK_NCCL=0 CHECK_GCC=1 CHECK_DOCKER=0 -if [[ $distro == *"CentOS Linux"* ]] -then - MKL_VERSION="2021.1.1" -elif [[ $distro == "Ubuntu 18.04" ]] -then - MKL_VERSION="2023.1.0" -elif [[ $distro == "Ubuntu"* ]] +if [[ $distro == "Ubuntu"* ]] then MKL_VERSION="2023.2.0" -elif [[ $distro == "AlmaLinux 8.6" ]] -then - MKL_VERSION="2022.1.0" elif [[ $distro == "AlmaLinux 8.7" ]] then MKL_VERSION="2023.2.0" @@ -211,115 +142,7 @@ else MKL_VERSION="2023.1.0" fi -if [[ $distro == "CentOS Linux 7.6.1810" ]] -then - HPCX_OMB_PATH=${HPCX_OMB_PATH_CENTOS_76} - CHECK_HPCX=1 - CHECK_IMPI_2021=1 - CHECK_IMPI_2018=1 - CHECK_OMPI=1 - CHECK_MVAPICH2=1 - CHECK_MVAPICH2X=0 - MODULE_FILES_ROOT=${MODULE_FILES_ROOT_CENTOS} - MOFED_VERSION=${MOFED_VERSION_CENTOS} - IMPI2021_PATH=${IMPI2021_PATH_CENTOS} - MVAPICH2_PATH=${MVAPICH2_PATH_CENTOS} - MVAPICH2X_PATH=${MVAPICH2X_PATH_CENTOS} - OPENMPI_PATH=${OPENMPI_PATH_CENTOS} -elif [[ $distro == "CentOS Linux 7.7.1908" ]] -then - HPCX_OMB_PATH=${HPCX_OMB_PATH_CENTOS_77} - CHECK_HPCX=1 - CHECK_IMPI_2021=1 - CHECK_IMPI_2018=1 - CHECK_OMPI=1 - CHECK_MVAPICH2=1 - CHECK_MVAPICH2X=0 - MODULE_FILES_ROOT=${MODULE_FILES_ROOT_CENTOS} - 
MOFED_VERSION=${MOFED_VERSION_CENTOS} - IMPI2021_PATH=${IMPI2021_PATH_CENTOS} - MVAPICH2_PATH=${MVAPICH2_PATH_CENTOS} - MVAPICH2X_PATH=${MVAPICH2X_PATH_CENTOS} - OPENMPI_PATH=${OPENMPI_PATH_CENTOS} -elif [[ $distro == "CentOS Linux 7.8.2003" ]] -then - HPCX_OMB_PATH=${HPCX_OMB_PATH_CENTOS_78} - CHECK_HPCX=1 - CHECK_IMPI_2021=1 - CHECK_IMPI_2018=1 - CHECK_OMPI=1 - CHECK_MVAPICH2=1 - CHECK_MVAPICH2X=0 - MODULE_FILES_ROOT=${MODULE_FILES_ROOT_CENTOS} - MOFED_VERSION=${MOFED_VERSION_CENTOS} - IMPI2021_PATH=${IMPI2021_PATH_CENTOS} - MVAPICH2_PATH=${MVAPICH2_PATH_CENTOS} - MVAPICH2X_PATH=${MVAPICH2X_PATH_CENTOS} - OPENMPI_PATH=${OPENMPI_PATH_CENTOS} -elif [[ $distro == "CentOS Linux 7.9.2009" ]] -then - HPCX_OMB_PATH=${HPCX_OMB_PATH_CENTOS_79} - CHECK_HPCX=1 - CHECK_IMPI_2021=1 - CHECK_IMPI_2018=1 - CHECK_OMPI=1 - CHECK_MVAPICH2=1 - CHECK_MVAPICH2X=0 - CHECK_DOCKER=1 - MODULE_FILES_ROOT=${MODULE_FILES_ROOT_CENTOS} - MOFED_VERSION=${MOFED_VERSION_CENTOS_79} - IMPI2021_PATH=${IMPI2021_PATH_CENTOS} - MVAPICH2_PATH=${MVAPICH2_PATH_CENTOS} - MVAPICH2X_PATH=${MVAPICH2X_PATH_CENTOS} - OPENMPI_PATH=${OPENMPI_PATH_CENTOS} - CHECK_AOCL=1 - CHECK_NCCL=1 -elif [[ $distro == "CentOS Linux 8.1.1911" ]] -then - HPCX_OMB_PATH=${HPCX_OMB_PATH_CENTOS_81} - CHECK_HPCX=1 - CHECK_IMPI_2021=1 - CHECK_OMPI=1 - CHECK_MVAPICH2=1 - CHECK_MVAPICH2X=0 - MODULE_FILES_ROOT=${MODULE_FILES_ROOT_CENTOS} - MOFED_VERSION=${MOFED_VERSION_CENTOS} - IMPI2021_PATH=${IMPI2021_PATH_CENTOS} - MVAPICH2_PATH=${MVAPICH2_PATH_CENTOS} - MVAPICH2X_PATH=${MVAPICH2X_PATH_CENTOS} - OPENMPI_PATH=${OPENMPI_PATH_CENTOS} -elif [[ $distro == "CentOS Linux 8.3.2011" ]] -then - HPCX_OMB_PATH=${HPCX_OMB_PATH_CENTOS_83} - CHECK_HPCX=1 - CHECK_IMPI_2021=1 - CHECK_OMPI=1 - CHECK_MVAPICH2=1 - CHECK_MVAPICH2X=0 - MODULE_FILES_ROOT=${MODULE_FILES_ROOT_CENTOS} - MOFED_VERSION=${MOFED_VERSION_CENTOS} - IMPI2021_PATH=${IMPI2021_PATH_CENTOS} - MVAPICH2_PATH=${MVAPICH2_PATH_CENTOS} - MVAPICH2X_PATH=${MVAPICH2X_PATH_CENTOS} - 
OPENMPI_PATH=${OPENMPI_PATH_CENTOS} -elif [[ $distro == "AlmaLinux 8.6" ]] -then - HPCX_OMB_PATH=${HPCX_OMB_PATH_ALMA_86} - CHECK_HPCX=1 - CHECK_IMPI_2021=1 - CHECK_IMPI_2018=0 - CHECK_OMPI=1 - CHECK_MVAPICH2=1 - CHECK_MVAPICH2X=0 - MODULE_FILES_ROOT=${MODULE_FILES_ROOT_ALMA} - MOFED_VERSION=${MOFED_VERSION_ALMA_86} - IMPI2021_PATH=${IMPI2021_PATH_ALMA_86} - MVAPICH2_PATH=${MVAPICH2_PATH_ALMA} - OPENMPI_PATH=${OPENMPI_PATH_ALMA_86} - CHECK_AOCL=1 - CHECK_NCCL=1 - CHECK_DOCKER=1 -elif [[ $distro == "AlmaLinux 8.7" ]] +if [[ $distro == "AlmaLinux 8.7" ]] then HPCX_OMB_PATH=${HPCX_OMB_PATH_ALMA_87} CHECK_HPCX=1 @@ -336,24 +159,6 @@ then CHECK_AOCL=1 CHECK_NCCL=1 CHECK_DOCKER=1 -elif [[ $distro == "Ubuntu 18.04" ]] -then - HPCX_OMB_PATH=${HPCX_OMB_PATH_UBUNTU_1804} - CHECK_HPCX=1 - CHECK_IMPI_2021=1 - CHECK_MVAPICH2=1 - CHECK_OMPI=1 - CHECK_BLIS_MT=1 - MODULE_FILES_ROOT=${MODULE_FILES_ROOT_UBUNTU} - MOFED_VERSION=${MOFED_VERSION_UBUNTU} - IMPI2021_PATH=${IMPI2021_PATH_UBUNTU} - MVAPICH2_PATH=${MVAPICH2_PATH_UBUNTU} - MVAPICH2X_PATH=${MVAPICH2X_PATH_UBUNTU} - OPENMPI_PATH=${OPENMPI_PATH_UBUNTU} - CHECK_AOCL=0 - CHECK_GCC=0 - CHECK_NCCL=1 - CHECK_DOCKER=1 elif [[ $distro == "Ubuntu 20.04" ]] then HPCX_OMB_PATH=${HPCX_OMB_PATH_UBUNTU_2004} @@ -390,35 +195,6 @@ then CHECK_NCCL=1 CHECK_GCC=0 CHECK_DOCKER=1 -elif [[ $distro == "SUSE Linux Enterprise High Performance Computing 15 SP4" ]] -then - # add /sbin and /usr/sbin to the path to allow lscpi and ibstatus called without path - # as only UID=0 get it by default - export PATH=$PATH:/sbin:/usr/sbin - MKL_VERSION="2023.1.0" - # - CHECK_GCC=0 - CHECK_ONEAPI=1 - CHECK_AZURE_HPC_DIAG=1 - - HPCX_OMB_PATH=${HPCX_OMB_PATH_SUSE} - CHECK_HPCX=1 - CHECK_IMPI_2021=1 - CHECK_MVAPICH2=1 - CHECK_MVAPICH2X=0 - CHECK_CUDA=1 - CHECK_OMPI=1 - CHECK_BLIS_MT=1 - - MODULE_FILES_ROOT=${MODULE_FILES_ROOT_SUSE} - MOFED_VERSION=${MOFED_VERSION_SUSE} - IMPI2021_PATH=${IMPI2021_PATH_SUSE} - MVAPICH2_PATH=${MVAPICH2_PATH_SUSE} - 
OPENMPI_PATH=${OPENMPI_PATH_SUSE} - - CHECK_AOCL=1 - CHECK_NV_PMEM=0 - CHECK_NCCL=1 else echo "*** Error - invalid distro!" exit -1 @@ -452,19 +228,14 @@ check_exit_code() { # verify if package updates work case ${distro} in Ubuntu*) sudo apt-get -q --assume-no update;; - CentOS* | AlmaLinux*) sudo yum update -y --setopt tsflags=test;; + AlmaLinux*) sudo yum update -y --setopt tsflags=test;; * ) ;; esac check_exit_code "Package update works" "Package update fails!" # verify MOFED installation -if [[ $distro == "SUSE Linux Enterprise High Performance Computing 15 SP4" ]] -then - echo "ofed inbox driver does miss the ofed_info tool" -else - ofed_info | grep ${MOFED_VERSION} - check_exit_code "MOFED installed" "MOFED not installed" -fi +ofed_info | grep ${MOFED_VERSION} +check_exit_code "MOFED installed" "MOFED not installed" # verify IB device is listed lspci | grep "Infiniband controller\|Network controller" diff --git a/ubuntu/ubuntu-18.x/common/install_mpis.sh b/ubuntu/ubuntu-18.x/common/install_mpis.sh deleted file mode 100755 index 21572fd7..00000000 --- a/ubuntu/ubuntu-18.x/common/install_mpis.sh +++ /dev/null @@ -1,123 +0,0 @@ -#!/bin/bash -set -ex - -# Parameters -HPCX_CHECKSUM=$1 - -# Load gcc -set CC=/usr/bin/gcc -set GCC=/usr/bin/gcc - -INSTALL_PREFIX=/opt - -# HPC-X v2.15 -HPCX_VERSION="v2.15" -TARBALL="hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-$DISTRIBUTION-cuda12-gdrcopy2-nccl2.17-x86_64.tbz" -HPCX_DOWNLOAD_URL=https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/${TARBALL} -HPCX_FOLDER=$(basename ${HPCX_DOWNLOAD_URL} .tbz) - -$COMMON_DIR/download_and_verify.sh ${HPCX_DOWNLOAD_URL} ${HPCX_CHECKSUM} -tar -xvf ${TARBALL} -mv ${HPCX_FOLDER} ${INSTALL_PREFIX} -HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} -$COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION - -# MVAPICH2 2.3.7-1 -MV2_VERSION="2.3.7-1" -MV2_DOWNLOAD_URL=http://mvapich.cse.ohio-state.edu/download/mvapich/mv2/mvapich2-${MV2_VERSION}.tar.gz -$COMMON_DIR/download_and_verify.sh 
$MV2_DOWNLOAD_URL "fdd971cf36d6476d007b5d63d19414546ca8a2937b66886f24a1d9ca154634e4" -tar -xvf mvapich2-${MV2_VERSION}.tar.gz -cd mvapich2-${MV2_VERSION} -# Error exclusive to Ubuntu 22.04 -# configure: error: The Fortran compiler gfortran will not compile files that call -# the same routine with arguments of different types. -./configure $(if [[ ${DISTRIBUTION} == "ubuntu22.04" ]]; then echo "FFLAGS=-fallow-argument-mismatch"; fi) --prefix=${INSTALL_PREFIX}/mvapich2-${MV2_VERSION} --enable-g=none --enable-fast=yes && make -j$(nproc) && make install -cd .. -$COMMON_DIR/write_component_version.sh "MVAPICH2" ${MV2_VERSION} - -# OpenMPI 4.1.5 -OMPI_VERSION="4.1.5" -OMPI_DOWNLOAD_URL=https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz -$COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL "c018b127619d2a2a30c1931f316fc8a245926d0f5b4ebed4711f9695e7f70925" -tar -xvf openmpi-${OMPI_VERSION}.tar.gz -cd openmpi-${OMPI_VERSION} -./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install -cd .. 
-$COMMON_DIR/write_component_version.sh "OMPI" ${OMPI_VERSION} - -# Intel MPI 2021 (Update 9) -IMPI_2021_VERSION="2021.9.0" -IMPI_2021_DOWNLOAD_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/718d6f8f-2546-4b36-b97b-bc58d5482ebf/l_mpi_oneapi_p_${IMPI_2021_VERSION}.43482_offline.sh -$COMMON_DIR/download_and_verify.sh $IMPI_2021_DOWNLOAD_URL "5c170cdf26901311408809ced28498b630a494428703685203ceef6e62735ef8" -bash l_mpi_oneapi_p_${IMPI_2021_VERSION}.43482_offline.sh -s -a -s --eula accept -mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/impi -$COMMON_DIR/write_component_version.sh "IMPI_2021" ${IMPI_2021_VERSION} - -# Module Files -MODULE_FILES_DIRECTORY=/usr/share/modules/modulefiles/mpi -mkdir -p ${MODULE_FILES_DIRECTORY} - -# HPC-X -cat << EOF >> ${MODULE_FILES_DIRECTORY}/hpcx-${HPCX_VERSION} -#%Module 1.0 -# -# HPCx ${HPCX_VERSION} -# -conflict mpi -module load ${HPCX_PATH}/modulefiles/hpcx -EOF - -# MVAPICH2 -cat << EOF >> ${MODULE_FILES_DIRECTORY}/mvapich2-${MV2_VERSION} -#%Module 1.0 -# -# MVAPICH2 ${MV2_VERSION} -# -conflict mpi -prepend-path PATH /opt/mvapich2-${MV2_VERSION}/bin -prepend-path LD_LIBRARY_PATH /opt/mvapich2-${MV2_VERSION}/lib -prepend-path MANPATH /opt/mvapich2-${MV2_VERSION}/share/man -setenv MPI_BIN /opt/mvapich2-${MV2_VERSION}/bin -setenv MPI_INCLUDE /opt/mvapich2-${MV2_VERSION}/include -setenv MPI_LIB /opt/mvapich2-${MV2_VERSION}/lib -setenv MPI_MAN /opt/mvapich2-${MV2_VERSION}/share/man -setenv MPI_HOME /opt/mvapich2-${MV2_VERSION} -EOF - -# OpenMPI -cat << EOF >> ${MODULE_FILES_DIRECTORY}/openmpi-${OMPI_VERSION} -#%Module 1.0 -# -# OpenMPI ${OMPI_VERSION} -# -conflict mpi -prepend-path PATH /opt/openmpi-${OMPI_VERSION}/bin -prepend-path LD_LIBRARY_PATH /opt/openmpi-${OMPI_VERSION}/lib -prepend-path MANPATH /opt/openmpi-${OMPI_VERSION}/share/man -setenv MPI_BIN /opt/openmpi-${OMPI_VERSION}/bin -setenv MPI_INCLUDE 
/opt/openmpi-${OMPI_VERSION}/include -setenv MPI_LIB /opt/openmpi-${OMPI_VERSION}/lib -setenv MPI_MAN /opt/openmpi-${OMPI_VERSION}/share/man -setenv MPI_HOME /opt/openmpi-${OMPI_VERSION} -EOF - -# Intel 2021 -cat << EOF >> ${MODULE_FILES_DIRECTORY}/impi_${IMPI_2021_VERSION} -#%Module 1.0 -# -# Intel MPI ${IMPI_2021_VERSION} -# -conflict mpi -module load /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/impi -setenv MPI_BIN /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/bin -setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/include -setenv MPI_LIB /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/lib -setenv MPI_MAN /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/man -setenv MPI_HOME /opt/intel/oneapi/mpi/${IMPI_2021_VERSION} -EOF - -# Softlinks -ln -s ${MODULE_FILES_DIRECTORY}/hpcx-${HPCX_VERSION} ${MODULE_FILES_DIRECTORY}/hpcx -ln -s ${MODULE_FILES_DIRECTORY}/mvapich2-${MV2_VERSION} ${MODULE_FILES_DIRECTORY}/mvapich2 -ln -s ${MODULE_FILES_DIRECTORY}/openmpi-${OMPI_VERSION} ${MODULE_FILES_DIRECTORY}/openmpi -ln -s ${MODULE_FILES_DIRECTORY}/impi_${IMPI_2021_VERSION} ${MODULE_FILES_DIRECTORY}/impi-2021 diff --git a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/README.md b/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/README.md deleted file mode 100644 index dd0af5df..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# Ubuntu 18.04 HPC Image -# Intended for CX3-Pro cards - -The Ubuntu 18.04 HPC Image with MOFED LTS includes optimizations and recommended configurations to deliver optimal performance, -consistency, and reliability. 
This image consists of the following HPC tools and libraries: - -- Mellanox OFED LTS -- Pre-configured IPoIB (IP-over-InfiniBand) -- Popular InfiniBand based MPI Libraries - - HPC-X - - IntelMPI - - MVAPICH2 -- Communication Runtimes - - Libfabric - - OpenUCX -- Optimized librares - - Intel MKL -- GPU Drivers - - Nvidia GPU Driver -- Data Center GPU Manager -- Azure HPC Diagnostics Tool - -This Image is compliant with the Linux Kernel 5.4.0-1043-azure. - -Software packages (MPI / HPC libraries) are configured as environment modules. Users can select preferred MPI or software packages as follows: - -`module load ` - -Running Single Node NCCL Test (example): - -```sh -mpirun -np 4 \ - -x LD_LIBRARY_PATH \ - --allow-run-as-root \ - --map-by ppr:4:node \ - -mca coll_hcoll_enable 0 \ - -x UCX_TLS=tcp \ - -x CUDA_DEVICE_ORDER=PCI_BUS_ID \ - -x NCCL_SOCKET_IFNAME=eth0 \ - -x NCCL_DEBUG=WARN \ - /opt/nccl-tests/build/all_reduce_perf -b1K -f2 -g1 -e 4G -``` \ No newline at end of file diff --git a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install.sh b/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install.sh deleted file mode 100755 index 656b13dd..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash -set -ex - -# set properties -source ./set_properties.sh - -# install utils -./install_utils.sh - -# install Lustre client -$UBUNTU_COMMON_DIR/install_lustre_client.sh - -# install mellanox ofed -./install_mellanoxofed.sh - -# install mpi libraries -./install_mpis.sh - -# install nvidia gpu driver -./install_nvidiagpudriver.sh - -# Install NCCL -./install_nccl.sh - -# cleanup downloaded tarballs -rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh -rm -Rf -- */ - -# Install DCGM -$UBUNTU_COMMON_DIR/install_dcgm.sh - -# install Intel libraries -$UBUNTU_COMMON_DIR/install_intel_libs.sh - -# install diagnostic script -$COMMON_DIR/install_hpcdiag.sh - -# optimizations -$UBUNTU_COMMON_DIR/hpc-tuning.sh - -# SKU Customization 
-$COMMON_DIR/setup_sku_customizations.sh - -# copy test file -$COMMON_DIR/copy_test_file.sh - -# diable auto kernel updates -$UBUNTU_COMMON_DIR/disable_auto_upgrade.sh - -# clear history -# Uncomment the line below if you are running this on a VM -# $COMMON_DIR/clear_history.sh diff --git a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_mellanoxofed.sh b/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_mellanoxofed.sh deleted file mode 100755 index 848560c0..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_mellanoxofed.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -set -ex - -VERSION="4.9-6.0.6.0" -TARBALL="MLNX_OFED_LINUX-$VERSION-ubuntu18.04-x86_64.tgz" -MLNX_OFED_DOWNLOAD_URL=https://content.mellanox.com/ofed/MLNX_OFED-${VERSION}/$TARBALL -MOFED_FOLDER=$(basename ${MLNX_OFED_DOWNLOAD_URL} .tgz) - -$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL "ca56ddf53a22192d2c91d1c410db686a8f8faf339020cfe44ab0c664e63c8b99" -tar zxvf ${TARBALL} - -./${MOFED_FOLDER}/mlnxofedinstall --add-kernel-support --without-fw-update -$COMMON_DIR/write_component_version.sh "MOFED" $VERSION - -# Restarting openibd -/etc/init.d/openibd restart diff --git a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_mpis.sh b/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_mpis.sh deleted file mode 100755 index 88ac9f09..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_mpis.sh +++ /dev/null @@ -1,122 +0,0 @@ -#!/bin/bash -set -ex - -# Load gcc -GCC_VERSION=gcc-9.2.0 -set CC=/usr/bin/gcc -set GCC=/usr/bin/gcc - -INSTALL_PREFIX=/opt - -# HPC-X v2.7.0 -MLNX_OFED_VERSION="4.7-1.0.0.1" -HPCX_VERSION="v2.7.0" -TARBALL="hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${DISTRIBUTION}-x86_64.tbz" -HPCX_DOWNLOAD_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/${TARBALL} -HPCX_FOLDER=$(basename ${HPCX_DOWNLOAD_URL} .tbz) - -$COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL "83f03e2d01cb1e4198e50ed9bdadb742aebf5d247369071bc911096824147d7a" 
-tar -xvf ${TARBALL} -mv ${HPCX_FOLDER} ${INSTALL_PREFIX} -HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} -$COMMON_DIR/write_component_version.sh "HPCX" ${HPCX_VERSION} - -# MVAPICH2 2.3.7 -MV2_VERSION="2.3.7" -MV2_DOWNLOAD_URL=http://mvapich.cse.ohio-state.edu/download/mvapich/mv2/mvapich2-${MV2_VERSION}.tar.gz -$COMMON_DIR/download_and_verify.sh $MV2_DOWNLOAD_URL "c39a4492f4be50df6100785748ba2894e23ce450a94128181d516da5757751ae" -tar -xvf mvapich2-${MV2_VERSION}.tar.gz -cd mvapich2-${MV2_VERSION} -./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MV2_VERSION} --enable-g=none --enable-fast=yes && make -j$(nproc) && make install -cd .. -$COMMON_DIR/write_component_version.sh "MVAPICH2" ${MV2_VERSION} - -# OpenMPI 4.1.3 -OMPI_VERSION="4.1.5" -OMPI_DOWNLOAD_URL=https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz -$COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL "c018b127619d2a2a30c1931f316fc8a245926d0f5b4ebed4711f9695e7f70925" -tar -xvf openmpi-${OMPI_VERSION}.tar.gz -cd openmpi-${OMPI_VERSION} -# disable OpenSHMEM build -sed -i "s/enable_oshmem_fortran=yes/enable_oshmem_fortran=no/" contrib/platform/mellanox/optimized -sed -i "s/enable_oshmem=yes/enable_oshmem=no/" contrib/platform/mellanox/optimized -./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --enable-mpirun-prefix-by-default --disable-oshmem --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install -cd .. 
-$COMMON_DIR/write_component_version.sh "OMPI" ${OMPI_VERSION} - -# Intel MPI 2021 (Update 7) -IMPI_2021_VERSION="2021.9.0" -IMPI_2021_DOWNLOAD_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/718d6f8f-2546-4b36-b97b-bc58d5482ebf/l_mpi_oneapi_p_${IMPI_2021_VERSION}.43482_offline.sh -$COMMON_DIR/download_and_verify.sh $IMPI_2021_DOWNLOAD_URL "5c170cdf26901311408809ced28498b630a494428703685203ceef6e62735ef8" -bash l_mpi_oneapi_p_${IMPI_2021_VERSION}.43482_offline.sh -s -a -s --eula accept -mv ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/mpi ${INSTALL_PREFIX}/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/impi -$COMMON_DIR/write_component_version.sh "IMPI_2021" ${IMPI_2021_VERSION} - -# Module Files -MODULE_FILES_DIRECTORY=/usr/share/modules/modulefiles/mpi -mkdir -p ${MODULE_FILES_DIRECTORY} - -# HPC-X -cat << EOF >> ${MODULE_FILES_DIRECTORY}/hpcx-${HPCX_VERSION} -#%Module 1.0 -# -# HPCx ${HPCX_VERSION} -# -conflict mpi -module load ${HPCX_PATH}/modulefiles/hpcx -EOF - -# MVAPICH2 -cat << EOF >> ${MODULE_FILES_DIRECTORY}/mvapich2-${MV2_VERSION} -#%Module 1.0 -# -# MVAPICH2 ${MV2_VERSION} -# -conflict mpi -prepend-path PATH /opt/mvapich2-${MV2_VERSION}/bin -prepend-path LD_LIBRARY_PATH /opt/mvapich2-${MV2_VERSION}/lib -prepend-path MANPATH /opt/mvapich2-${MV2_VERSION}/share/man -setenv MPI_BIN /opt/mvapich2-${MV2_VERSION}/bin -setenv MPI_INCLUDE /opt/mvapich2-${MV2_VERSION}/include -setenv MPI_LIB /opt/mvapich2-${MV2_VERSION}/lib -setenv MPI_MAN /opt/mvapich2-${MV2_VERSION}/share/man -setenv MPI_HOME /opt/mvapich2-${MV2_VERSION} -EOF - -# OpenMPI -cat << EOF >> ${MODULE_FILES_DIRECTORY}/openmpi-${OMPI_VERSION} -#%Module 1.0 -# -# OpenMPI ${OMPI_VERSION} -# -conflict mpi -prepend-path PATH /opt/openmpi-${OMPI_VERSION}/bin -prepend-path LD_LIBRARY_PATH /opt/openmpi-${OMPI_VERSION}/lib -prepend-path MANPATH /opt/openmpi-${OMPI_VERSION}/share/man -setenv MPI_BIN /opt/openmpi-${OMPI_VERSION}/bin -setenv MPI_INCLUDE 
/opt/openmpi-${OMPI_VERSION}/include -setenv MPI_LIB /opt/openmpi-${OMPI_VERSION}/lib -setenv MPI_MAN /opt/openmpi-${OMPI_VERSION}/share/man -setenv MPI_HOME /opt/openmpi-${OMPI_VERSION} -EOF - -# Intel 2021 -cat << EOF >> ${MODULE_FILES_DIRECTORY}/impi_${IMPI_2021_VERSION} -#%Module 1.0 -# -# Intel MPI ${IMPI_2021_VERSION} -# -conflict mpi -module load /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/modulefiles/impi -setenv MPI_BIN /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/bin -setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/include -setenv MPI_LIB /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/lib -setenv MPI_MAN /opt/intel/oneapi/mpi/${IMPI_2021_VERSION}/man -setenv MPI_HOME /opt/intel/oneapi/mpi/${IMPI_2021_VERSION} -EOF - -# Softlinks -ln -s ${MODULE_FILES_DIRECTORY}/hpcx-${HPCX_VERSION} ${MODULE_FILES_DIRECTORY}/hpcx -ln -s ${MODULE_FILES_DIRECTORY}/mvapich2-${MV2_VERSION} ${MODULE_FILES_DIRECTORY}/mvapich2 -ln -s ${MODULE_FILES_DIRECTORY}/openmpi-${OMPI_VERSION} ${MODULE_FILES_DIRECTORY}/openmpi -ln -s ${MODULE_FILES_DIRECTORY}/impi_${IMPI_2021_VERSION} ${MODULE_FILES_DIRECTORY}/impi-2021 diff --git a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_nccl.sh b/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_nccl.sh deleted file mode 100755 index ce6f9e39..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_nccl.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -# Skip installation of NCCL RDMA sharp plugin for CX-3 pro -sed -i '35,43 s/^/#/' $UBUNTU_COMMON_DIR/install_nccl.sh -$UBUNTU_COMMON_DIR/install_nccl.sh diff --git a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_nvidiagpudriver.sh b/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_nvidiagpudriver.sh deleted file mode 100755 index 1528cca0..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_nvidiagpudriver.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -set -ex - -$UBUNTU_COMMON_DIR/install_nvidiagpudriver.sh 1804 diff --git 
a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_utils.sh b/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_utils.sh deleted file mode 100755 index de54457f..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/install_utils.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -set -ex - -# Setup microsoft packages repository for moby -# Download the repository configuration package -curl https://packages.microsoft.com/config/ubuntu/18.04/prod.list > ./microsoft-prod.list -# Copy the generated list to the sources.list.d directory -cp ./microsoft-prod.list /etc/apt/sources.list.d/ -# Install the Microsoft GPG public key -curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg -cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ - -#apt-get install packages -AZCOPY_VERSION="10.17.0" -AZCOPY_RELEASE_TAG="release20230123" -$UBUNTU_COMMON_DIR/install_utils.sh ${AZCOPY_VERSION} ${AZCOPY_RELEASE_TAG} diff --git a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/set_properties.sh b/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/set_properties.sh deleted file mode 100755 index 2ab0f97e..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-LTS-hpc/set_properties.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -export TOP_DIR=../../.. -export COMMON_DIR=../../../common -export UBUNTU_COMMON_DIR=../../common -export TEST_DIR=../../../tests -export DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) diff --git a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/README.md b/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/README.md deleted file mode 100644 index 06e6d0d0..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/README.md +++ /dev/null @@ -1,53 +0,0 @@ -# Ubuntu 18.04 HPC Image - -The Ubuntu 18.04 HPC Image includes optimizations and recommended configurations to deliver optimal performance, -consistency, and reliability. 
This image consists of the following HPC tools and libraries: - -- Mellanox OFED -- Pre-configured IPoIB (IP-over-InfiniBand) -- Popular InfiniBand based MPI Libraries - - HPC-X - - IntelMPI - - MVAPICH2 -- Communication Runtimes - - Libfabric - - OpenUCX -- Optimized librares - - Intel MKL -- GPU Drivers - - Nvidia GPU Driver -- SHARP Daemon (sharpd) -- NCCL - - NCCL RDMA Sharp Plugin - - NCCL Benchmarks - - Topology file for NDv4 -- NV Peer Memory (GPU Direct RDMA) -- GDR Copy -- Data Center GPU Manager -- Azure HPC Diagnostics Tool -- Moby -- NVIDIA-Docker -- Moneo (Distributed HPC/AI system monitor) - - -This Image is compliant with the Linux Kernel 5.4.0-1107-azure. - -Software packages (MPI / HPC libraries) are configured as environment modules. Users can select preferred MPI or software packages as follows: - -`module load ` - -Running Single Node NCCL Test (example): - -```sh -mpirun -np 8 \ - --bind-to numa \ - --map-by ppr:8:node \ - -x LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH \ - -mca coll_hcoll_enable 0 \ - -x UCX_TLS=tcp \ - -x UCX_NET_DEVICES=eth0 \ - -x CUDA_DEVICE_ORDER=PCI_BUS_ID \ - -x NCCL_SOCKET_IFNAME=eth0 \ - -x NCCL_DEBUG=WARN \ - /opt/nccl-tests/build/all_reduce_perf -b1K -f2 -g1 -e 4G -``` diff --git a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install.sh b/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install.sh deleted file mode 100755 index a4ba250f..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash -set -ex - -# set properties -source ./set_properties.sh - -# install utils -./install_utils.sh - -# install Lustre client -$UBUNTU_COMMON_DIR/install_lustre_client.sh - -# install mellanox ofed -./install_mellanoxofed.sh - -# install mpi libraries -./install_mpis.sh - -# install nvidia gpu driver -./install_nvidiagpudriver.sh - -# Install NCCL -$UBUNTU_COMMON_DIR/install_nccl.sh - -# Install NVIDIA docker container -$UBUNTU_COMMON_DIR/install_docker.sh - -# cleanup 
downloaded tarballs - clear some space -rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh -rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* -rm -rf /var/intel/ /var/cache/* -rm -Rf -- */ - -# Install DCGM -$UBUNTU_COMMON_DIR/install_dcgm.sh - -# install Intel libraries -$UBUNTU_COMMON_DIR/install_intel_libs.sh - -# install diagnostic script -$COMMON_DIR/install_hpcdiag.sh - -# install persistent rdma naming -$COMMON_DIR/install_azure_persistent_rdma_naming.sh - -# optimizations -$UBUNTU_COMMON_DIR/hpc-tuning.sh - -# copy test file -$COMMON_DIR/copy_test_file.sh - -# install monitor tools -$UBUNTU_COMMON_DIR/install_monitoring_tools.sh - -# diable auto kernel updates -$UBUNTU_COMMON_DIR/disable_auto_upgrade.sh - -# SKU Customization -$COMMON_DIR/setup_sku_customizations.sh - -# clear history -# Uncomment the line below if you are running this on a VM -# $COMMON_DIR/clear_history.sh diff --git a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_mellanoxofed.sh b/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_mellanoxofed.sh deleted file mode 100755 index 877e201b..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_mellanoxofed.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -set -ex - -VERSION="23.04-1.1.3.0" -TARBALL="MLNX_OFED_LINUX-$VERSION-ubuntu18.04-x86_64.tgz" -MLNX_OFED_DOWNLOAD_URL=https://content.mellanox.com/ofed/MLNX_OFED-${VERSION}/$TARBALL -MOFED_FOLDER=$(basename ${MLNX_OFED_DOWNLOAD_URL} .tgz) - -$COMMON_DIR/download_and_verify.sh $MLNX_OFED_DOWNLOAD_URL "634960b96698f845ccc8974987c71dfeced4553ddc5fb0aa1a79071307cbdd45" -tar zxvf ${TARBALL} - -./${MOFED_FOLDER}/mlnxofedinstall --add-kernel-support --skip-unsupported-devices-check --without-fw-update -$COMMON_DIR/write_component_version.sh "MOFED" $VERSION - -# Restarting openibd -/etc/init.d/openibd restart - -## Fix for systemd-modules-load service failing on boot -rm -rf /lib/modules/$(uname -r)/kernel/drivers/infiniband/ulp/iser/ib_iser.ko -depmod diff --git 
a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_mpis.sh b/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_mpis.sh deleted file mode 100755 index 0fcb7652..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_mpis.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -ex - -# Install common MPIs for Ubuntu 18 -../common/install_mpis.sh "c8868382a7b93286cd70ac46c63489f176bbfe0d76b3e5b2b91ecc0f93272414" diff --git a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_nvidiagpudriver.sh b/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_nvidiagpudriver.sh deleted file mode 100755 index f82d8504..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_nvidiagpudriver.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -set -ex - -$UBUNTU_COMMON_DIR/install_nvidiagpudriver.sh 1804 - -# Install gdrcopy -sudo apt install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -GDRCOPY_VERSION="2.3" -TARBALL="v${GDRCOPY_VERSION}.tar.gz" -GDRCOPY_DOWNLOAD_URL=https://github.com/NVIDIA/gdrcopy/archive/refs/tags/${TARBALL} -wget $GDRCOPY_DOWNLOAD_URL -tar -xvf $TARBALL - -pushd gdrcopy-${GDRCOPY_VERSION}/packages/ -CUDA=/usr/local/cuda ./build-deb-packages.sh -sudo dpkg -i gdrdrv-dkms_${GDRCOPY_VERSION}-1_amd64.Ubuntu18_04.deb -sudo apt-mark hold gdrdrv-dkms -sudo dpkg -i libgdrapi_${GDRCOPY_VERSION}-1_amd64.Ubuntu18_04.deb -sudo apt-mark hold libgdrapi -sudo dpkg -i gdrcopy-tests_${GDRCOPY_VERSION}-1_amd64.Ubuntu18_04.deb -sudo apt-mark hold gdrcopy-tests -sudo dpkg -i gdrcopy_${GDRCOPY_VERSION}-1_amd64.Ubuntu18_04.deb -sudo apt-mark hold gdrcopy -popd - -$COMMON_DIR/write_component_version.sh "GDRCOPY" ${GDRCOPY_VERSION} - -# Install nvidia fabric manager (required for ND96asr_v4) -$UBUNTU_COMMON_DIR/install_nvidia_fabric_manager.sh 1804 diff --git a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_utils.sh b/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_utils.sh deleted file mode 100755 index f9226b49..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/install_utils.sh 
+++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -set -ex - -# Setup microsoft packages repository for moby -# Download the repository configuration package -curl https://packages.microsoft.com/config/ubuntu/18.04/prod.list > ./microsoft-prod.list -# Copy the generated list to the sources.list.d directory -cp ./microsoft-prod.list /etc/apt/sources.list.d/ -# Install the Microsoft GPG public key -curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg -cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ - -#install apt pckages -AZCOPY_VERSION="10.16.2" -AZCOPY_RELEASE_TAG="release20221108" -$UBUNTU_COMMON_DIR/install_utils.sh ${AZCOPY_VERSION} ${AZCOPY_RELEASE_TAG} - -apt-get update -apt-get install -y python3.8 -update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 -apt-get -y install --reinstall python3-apt - -apt-get -y install python3-pip -DISTPACK=/usr/lib/python3/dist-packages -cp $DISTPACK/apt_pkg.cpython-36m-x86_64-linux-gnu.so $DISTPACK/apt_pkg.so -apt-get install -y libcairo2-dev -apt-get install -y python3-dev -apt-get install -y libpython3.8-dev -apt-get install -y libgirepository1.0-dev -python3.8 -m pip install --ignore-installed PyGObject -apt-get install -y software-properties-common - -# For networkd-dispatcher + unattended-upgrades services to work correctly. 
Specific to ubunut 18.04 -ln -sf /usr/lib/python3/dist-packages/_dbus_glib_bindings.cpython-36m-x86_64-linux-gnu.so /usr/lib/python3/dist-packages/_dbus_glib_bindings.so -ln -sf /usr/lib/python3/dist-packages/_dbus_bindings.cpython-36m-x86_64-linux-gnu.so /usr/lib/python3/dist-packages/_dbus_bindings.so -apt-get -y install libglib2.0-dev libdbus-1-3 libdbus-1-dev - -sudo python3 -m pip install meson ninja -sudo python3 -m pip install pgi dbus-python diff --git a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/set_properties.sh b/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/set_properties.sh deleted file mode 100755 index 2ab0f97e..00000000 --- a/ubuntu/ubuntu-18.x/ubuntu-18.04-hpc/set_properties.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -export TOP_DIR=../../.. -export COMMON_DIR=../../../common -export UBUNTU_COMMON_DIR=../../common -export TEST_DIR=../../../tests -export DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) From f22fb75186cc96a2aa3ae8dfb6d5bf3a2f23896f Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Fri, 22 Mar 2024 14:20:59 -0700 Subject: [PATCH 28/76] remove extension scripts --- extension_scripts/install_mellanoxofed.sh | 32 -------------------- extension_scripts/install_nvidiagpudriver.sh | 23 -------------- 2 files changed, 55 deletions(-) delete mode 100644 extension_scripts/install_mellanoxofed.sh delete mode 100644 extension_scripts/install_nvidiagpudriver.sh diff --git a/extension_scripts/install_mellanoxofed.sh b/extension_scripts/install_mellanoxofed.sh deleted file mode 100644 index 38cafb13..00000000 --- a/extension_scripts/install_mellanoxofed.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -distro=`./../common/extract_distro.sh` -if [ $? -eq 0 ] -then - echo "Detected distro: ${distro}" -else - echo "*** Error - invalid distro!" 
- exit -1 -fi - -if [[ $distro == "CentOS Linux 7.6.1810" ]] -then - pushd "../centos/centos-7.x/centos-7.6-hpc" -elif [[ $distro == "CentOS Linux 7.7.1908" ]] -then - pushd "../centos/centos-7.x/centos-7.7-hpc" -elif [[ $distro == "CentOS Linux 8.1.1911" ]] -then - pushd "../centos/centos-8.x/centos-8.1-hpc" -elif [[ $distro == "Ubuntu 18.04.4" ]] -then - pushd "../ubuntu/ubuntu-18.04-hpc" -else - echo "*** Error - unsupported distro!" - exit -1 -fi - -./install_mellanoxofed.sh -popd - -exit 0 diff --git a/extension_scripts/install_nvidiagpudriver.sh b/extension_scripts/install_nvidiagpudriver.sh deleted file mode 100644 index 2a141105..00000000 --- a/extension_scripts/install_nvidiagpudriver.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -distro=`./../common/extract_distro.sh` -if [ $? -eq 0 ] -then - echo "Detected distro: ${distro}" -else - echo "*** Error - invalid distro!" - exit -1 -fi - -if [[ $distro == "Ubuntu 18.04.4" ]] -then - pushd "../ubuntu/ubuntu-18.04-hpc" -else - echo "*** Error - unsupported distro!" 
- exit -1 -fi - -./install_nvidiagpudriver.sh -popd - -exit 0 From 2ac9ac33ccda06b9e9c03140c97dfe741a59d1ee Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Fri, 22 Mar 2024 14:23:31 -0700 Subject: [PATCH 29/76] remove Ubuntu 18.04 references --- tests/run-tests.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 88acdb3b..37620763 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -76,10 +76,6 @@ else OMPI_VERSION_UBUNTU="4.1.5" HPCX_MOFED_INTEGRATION_VERSION="MLNX_OFED_LINUX-5.4-1.0.3.0" case ${distro} in - "Ubuntu 18.04") HPCX_VERSION_UBUNTU="v2.15"; - MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-23.04-1.1.3.0"; - IMPI_2021_VERSION_UBUNTU="2021.9.0"; - ;; "Ubuntu 20.04") HPCX_VERSION_UBUNTU="v2.16"; MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-23.07-0.5.1.2"; IMPI_2021_VERSION_UBUNTU="2021.9.0"; @@ -90,7 +86,6 @@ else ;; *) ;; esac - HPCX_OMB_PATH_UBUNTU_1804="/opt/hpcx-${HPCX_VERSION_UBUNTU}-gcc-MLNX_OFED_LINUX-5-ubuntu18.04-cuda12-gdrcopy2-nccl2.17-x86_64/ompi/tests/osu-micro-benchmarks-5.8" fi MVAPICH2_VERSION_ALMA="2.3.7-1" From e0c48c7e748b6ba61cde9fc53a61e05e14aae489 Mon Sep 17 00:00:00 2001 From: KimPhillips128 Date: Fri, 22 Mar 2024 14:53:59 -0700 Subject: [PATCH 30/76] fix symbolic link for impi2021 --- alma/common/install_mpis.sh | 2 +- tests/run-tests.sh | 6 +++--- ubuntu/common/install_mpis.sh | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 8a63404d..1d6a473d 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -119,7 +119,7 @@ EOF # Create symlinks for modulefiles ln -s /usr/share/Modules/modulefiles/mpi/mvapich2-${MVAPICH2_VERSION} /usr/share/Modules/modulefiles/mpi/mvapich2 ln -s /usr/share/Modules/modulefiles/mpi/openmpi-${OMPI_VERSION} /usr/share/Modules/modulefiles/mpi/openmpi -ln -s /usr/share/Modules/modulefiles/mpi/impi_${IMPI_VERSION} /usr/share/Modules/modulefiles/mpi/impi-2021 +ln -s 
/usr/share/Modules/modulefiles/mpi/impi_${impi_2021_version} /usr/share/Modules/modulefiles/mpi/impi-2021 # cleanup downloaded tarballs and other installation files/folders rm -rf *.tar.gz *offline.sh diff --git a/tests/run-tests.sh b/tests/run-tests.sh index fafcfb3e..f439369f 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -527,12 +527,12 @@ fi # impi 2021 if [ $CHECK_IMPI_2021 -eq 1 ] then - check_exists "${MODULE_FILES_ROOT}/etc/modulefiles/mpi" + check_exists "${MODULE_FILES_ROOT}/mpi/impi-2021" - module load mpi/2021.11 + module load mpi/impi-2021 mpiexec -np 2 -ppn 2 -env FI_PROVIDER=mlx -env I_MPI_SHM=0 ${IMPI2021_PATH}/bin/IMB-MPI1 pingpong check_exit_code "Intel MPI 2021" "Failed to run Intel MPI 2021" - module unload mpi/2021.11 + module unload mpi/impi-2021 fi # impi 2018 diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index 161d8a1b..dea8d80d 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -135,4 +135,4 @@ EOF ln -s ${MODULE_FILES_DIRECTORY}/hpcx-${HPCX_VERSION} ${MODULE_FILES_DIRECTORY}/hpcx ln -s ${MODULE_FILES_DIRECTORY}/mvapich2-${MVAPICH2_VERSION} ${MODULE_FILES_DIRECTORY}/mvapich2 ln -s ${MODULE_FILES_DIRECTORY}/openmpi-${OMPI_VERSION} ${MODULE_FILES_DIRECTORY}/openmpi -ln -s ${MODULE_FILES_DIRECTORY}/impi_${IMPI_VERSION} ${MODULE_FILES_DIRECTORY}/impi-2021 +ln -s ${MODULE_FILES_DIRECTORY}/impi_${impi_2021_version} ${MODULE_FILES_DIRECTORY}/impi-2021 From 6fbc029c8b18c57f2ee2b28b0fabc1d6feea563c Mon Sep 17 00:00:00 2001 From: KimPhillips128 Date: Fri, 22 Mar 2024 16:44:01 -0700 Subject: [PATCH 31/76] removing .0 from impi_2021 folders. 
--- ubuntu/common/install_mpis.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index dea8d80d..8c8cdbe6 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -117,18 +117,18 @@ setenv MPI_HOME /opt/openmpi-${OMPI_VERSION} EOF # Intel 2021 -cat << EOF >> ${MODULE_FILES_DIRECTORY}/impi_${IMPI_VERSION} +cat << EOF >> ${MODULE_FILES_DIRECTORY}/impi_${impi_2021_version} #%Module 1.0 # # Intel MPI ${IMPI_VERSION} # conflict mpi -module load /opt/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/impi -setenv MPI_BIN /opt/intel/oneapi/mpi/${IMPI_VERSION}/bin -setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${IMPI_VERSION}/include -setenv MPI_LIB /opt/intel/oneapi/mpi/${IMPI_VERSION}/lib -setenv MPI_MAN /opt/intel/oneapi/mpi/${IMPI_VERSION}/man -setenv MPI_HOME /opt/intel/oneapi/mpi/${IMPI_VERSION} +module load /opt/intel/oneapi/mpi/${impi_2021_version}/modulefiles/impi +setenv MPI_BIN /opt/intel/oneapi/mpi/${impi_2021_version}/bin +setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${impi_2021_version}/include +setenv MPI_LIB /opt/intel/oneapi/mpi/${impi_2021_version}/lib +setenv MPI_MAN /opt/intel/oneapi/mpi/${impi_2021_version}/man +setenv MPI_HOME /opt/intel/oneapi/mpi/${impi_2021_version} EOF # Softlinks From 8c3846a80cb9c22e6928867aff90fca8c45c412f Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Fri, 22 Mar 2024 16:53:48 -0700 Subject: [PATCH 32/76] remove MOFED LTS --- tests/run-tests.sh | 76 +++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 51 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 37620763..29565cde 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -28,8 +28,6 @@ if [ -z "${l}" ]; then usage fi -MOFED_LTS=${l} # true/ false - source /etc/profile GCC_VERSION="9.2.0" @@ -64,29 +62,19 @@ find_ubuntu_distro() { distro=`find_distro` echo "Detected distro: ${distro}" -if [ "${MOFED_LTS}" = true 
] -then - HPCX_VERSION_UBUNTU="v2.7.0" - MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-4.9-6.0.6.0" - HPCX_MOFED_INTEGRATION_VERSION="MLNX_OFED_LINUX-4.7-1.0.0.1" - HPCX_OMB_PATH_UBUNTU_1804="/opt/hpcx-${HPCX_VERSION_UBUNTU}-gcc-${HPCX_MOFED_INTEGRATION_VERSION}-ubuntu18.04-x86_64/ompi/tests/osu-micro-benchmarks-5.6.2" - IMPI_2021_VERSION_UBUNTU="2021.9.0" - OMPI_VERSION_UBUNTU="4.1.5" -else - OMPI_VERSION_UBUNTU="4.1.5" - HPCX_MOFED_INTEGRATION_VERSION="MLNX_OFED_LINUX-5.4-1.0.3.0" - case ${distro} in - "Ubuntu 20.04") HPCX_VERSION_UBUNTU="v2.16"; - MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-23.07-0.5.1.2"; - IMPI_2021_VERSION_UBUNTU="2021.9.0"; - ;; - "Ubuntu 22.04") HPCX_VERSION_UBUNTU="v2.16"; - MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-23.07-0.5.1.2"; - IMPI_2021_VERSION_UBUNTU="2021.9.0"; - ;; - *) ;; - esac -fi +OMPI_VERSION_UBUNTU="4.1.5" +HPCX_MOFED_INTEGRATION_VERSION="MLNX_OFED_LINUX-5.4-1.0.3.0" +case ${distro} in + "Ubuntu 20.04") HPCX_VERSION_UBUNTU="v2.16"; + MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-23.07-0.5.1.2"; + IMPI_2021_VERSION_UBUNTU="2021.9.0"; + ;; + "Ubuntu 22.04") HPCX_VERSION_UBUNTU="v2.16"; + MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-23.07-0.5.1.2"; + IMPI_2021_VERSION_UBUNTU="2021.9.0"; + ;; + *) ;; +esac MVAPICH2_VERSION_ALMA="2.3.7-1" MVAPICH2_VERSION_UBUNTU="2.3.7-1" @@ -268,7 +256,7 @@ then check_exists "/opt/amd/include/" fi -if [ $CHECK_DOCKER -eq 1 ] && [ "${MOFED_LTS}" = false ] +if [ $CHECK_DOCKER -eq 1 ] then sudo docker pull hello-world sudo docker run hello-world @@ -355,31 +343,17 @@ if [ $CHECK_NCCL -eq 1 ] then module load mpi/hpcx - if [ "${MOFED_LTS}" = true ] - then - mpirun -np 4 \ - -x LD_LIBRARY_PATH \ - --allow-run-as-root \ - --map-by ppr:4:node \ - -mca coll_hcoll_enable 0 \ - -x UCX_TLS=tcp \ - -x CUDA_DEVICE_ORDER=PCI_BUS_ID \ - -x NCCL_SOCKET_IFNAME=eth0 \ - -x NCCL_DEBUG=WARN \ - /opt/nccl-tests/build/all_reduce_perf -b1K -f2 -g1 -e 4G - else - mpirun -np 8 \ - --allow-run-as-root \ - --map-by ppr:8:node \ - -x 
LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH \ - -mca coll_hcoll_enable 0 \ - -x UCX_TLS=tcp \ - -x CUDA_DEVICE_ORDER=PCI_BUS_ID \ - -x NCCL_SOCKET_IFNAME=eth0 \ - -x NCCL_DEBUG=WARN \ - -x NCCL_NET_GDR_LEVEL=5 \ - /opt/nccl-tests/build/all_reduce_perf -b1K -f2 -g1 -e 4G - fi + mpirun -np 8 \ + --allow-run-as-root \ + --map-by ppr:8:node \ + -x LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH \ + -mca coll_hcoll_enable 0 \ + -x UCX_TLS=tcp \ + -x CUDA_DEVICE_ORDER=PCI_BUS_ID \ + -x NCCL_SOCKET_IFNAME=eth0 \ + -x NCCL_DEBUG=WARN \ + -x NCCL_NET_GDR_LEVEL=5 \ + /opt/nccl-tests/build/all_reduce_perf -b1K -f2 -g1 -e 4G check_exit_code "Single Node NCCL Test" "Failed" From 3b95d07e35b89801d9df43e8aacef5e33d056dfc Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Fri, 22 Mar 2024 20:42:11 -0700 Subject: [PATCH 33/76] tweak impi config --- ubuntu/common/install_mpis.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index 8c8cdbe6..ef9c2c6c 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -120,14 +120,14 @@ EOF cat << EOF >> ${MODULE_FILES_DIRECTORY}/impi_${impi_2021_version} #%Module 1.0 # -# Intel MPI ${IMPI_VERSION} +# Intel MPI ${impi_2021_version} # conflict mpi -module load /opt/intel/oneapi/mpi/${impi_2021_version}/modulefiles/impi +module load /opt/intel/oneapi/mpi/${impi_2021_version}/etc/modulefiles/impi setenv MPI_BIN /opt/intel/oneapi/mpi/${impi_2021_version}/bin setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${impi_2021_version}/include setenv MPI_LIB /opt/intel/oneapi/mpi/${impi_2021_version}/lib -setenv MPI_MAN /opt/intel/oneapi/mpi/${impi_2021_version}/man +setenv MPI_MAN /opt/intel/oneapi/mpi/${impi_2021_version}/share/man setenv MPI_HOME /opt/intel/oneapi/mpi/${impi_2021_version} EOF From 03e1a9152ad47be1cb57bc40826ba71382208784 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 25 Mar 2024 
11:04:43 -0700 Subject: [PATCH 34/76] add version to end of module path --- ubuntu/common/install_mpis.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index ef9c2c6c..33d21113 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -123,7 +123,7 @@ cat << EOF >> ${MODULE_FILES_DIRECTORY}/impi_${impi_2021_version} # Intel MPI ${impi_2021_version} # conflict mpi -module load /opt/intel/oneapi/mpi/${impi_2021_version}/etc/modulefiles/impi +module load /opt/intel/oneapi/mpi/${impi_2021_version}/etc/modulefiles/impi/${impi_2021_version} setenv MPI_BIN /opt/intel/oneapi/mpi/${impi_2021_version}/bin setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${impi_2021_version}/include setenv MPI_LIB /opt/intel/oneapi/mpi/${impi_2021_version}/lib From 22f793dcf8ba89de7071c178bb00f8aee9489c5f Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 25 Mar 2024 14:07:01 -0700 Subject: [PATCH 35/76] test no apt upgrade --- ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh index 61d086b7..c060c514 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh @@ -2,11 +2,11 @@ set -ex # Don't allow the kernel to be updated -apt-mark hold linux-azure +# apt-mark hold linux-azure # upgrade pre-installed components apt update -apt upgrade -y +# apt upgrade -y # jq is needed to parse the component versions from the requirements.json file apt install -y jq From 2bd7c940cf1d6ba4d20cacdf03d2c89e6db91a3e Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 25 Mar 2024 16:23:38 -0700 Subject: [PATCH 36/76] Revert "test no apt upgrade" This reverts commit 22f793dcf8ba89de7071c178bb00f8aee9489c5f. 
--- ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh index c060c514..61d086b7 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh @@ -2,11 +2,11 @@ set -ex # Don't allow the kernel to be updated -# apt-mark hold linux-azure +apt-mark hold linux-azure # upgrade pre-installed components apt update -# apt upgrade -y +apt upgrade -y # jq is needed to parse the component versions from the requirements.json file apt install -y jq From 00991badff66eeccc8de8420ae072886572a7a8b Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 25 Mar 2024 16:25:01 -0700 Subject: [PATCH 37/76] stop fabric manager before starting --- customizations/ndv4.sh | 1 + customizations/ndv5.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/customizations/ndv4.sh b/customizations/ndv4.sh index 83890512..adedeff3 100755 --- a/customizations/ndv4.sh +++ b/customizations/ndv4.sh @@ -15,6 +15,7 @@ EOF ## NVIDIA Fabric manager systemctl enable nvidia-fabricmanager +systemctl stop nvidia-fabricmanager systemctl start nvidia-fabricmanager systemctl is-active --quiet nvidia-fabricmanager diff --git a/customizations/ndv5.sh b/customizations/ndv5.sh index dd82c4a6..53109a7a 100755 --- a/customizations/ndv5.sh +++ b/customizations/ndv5.sh @@ -15,6 +15,7 @@ EOF ## NVIDIA Fabric manager systemctl enable nvidia-fabricmanager +systemctl stop nvidia-fabricmanager systemctl start nvidia-fabricmanager systemctl is-active --quiet nvidia-fabricmanager From f163a9c0ce9d3af6401548934d7fe2aa2697e175 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 25 Mar 2024 17:22:12 -0700 Subject: [PATCH 38/76] upgrade nvidia drivers to 550.54.15 --- requirements.json | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 
deletions(-) diff --git a/requirements.json b/requirements.json index 1f51a561..527b808a 100644 --- a/requirements.json +++ b/requirements.json @@ -98,38 +98,38 @@ "nvidia": { "ubuntu20.04": { "driver": { - "version": "550.54.14", - "sha256": "8c497ff1cfc7c310fb875149bc30faa4fd26d2237b2cba6cd2e8b0780157cfe3" + "version": "550.54.15", + "sha256": "2e859ae5f912a9a47aaa9b2d40a94a14f6f486b5d3b67c0ddf8b72c1c9650385" }, "fabricmanager": { "prefix": "550", "distribution": "ubuntu2004", - "version": "550_550.54.14-1", - "sha256": "3f167f26a606cf4adb9f8c7b2afc3e39b1e689f185fbdbb1e76e21aeb4327d59" + "version": "550_550.54.15-1", + "sha256": "bc4c77ba0e0e88201afe1e44ee673ae55377a55c2f31e5c722d56646454b50ef" } }, "ubuntu22.04": { "driver": { - "version": "550.54.14", - "sha256": "8c497ff1cfc7c310fb875149bc30faa4fd26d2237b2cba6cd2e8b0780157cfe3" + "version": "550.54.15", + "sha256": "2e859ae5f912a9a47aaa9b2d40a94a14f6f486b5d3b67c0ddf8b72c1c9650385" }, "fabricmanager": { "prefix": "550", "distribution": "ubuntu2204", - "version": "550_550.54.14-1", - "sha256": "3f167f26a606cf4adb9f8c7b2afc3e39b1e689f185fbdbb1e76e21aeb4327d59" + "version": "550_550.54.15-1", + "sha256": "bc4c77ba0e0e88201afe1e44ee673ae55377a55c2f31e5c722d56646454b50ef" } }, "almalinux8.7": { "driver": { - "version": "550.54.14", - "sha256": "8c497ff1cfc7c310fb875149bc30faa4fd26d2237b2cba6cd2e8b0780157cfe3" + "version": "550.54.15", + "sha256": "2e859ae5f912a9a47aaa9b2d40a94a14f6f486b5d3b67c0ddf8b72c1c9650385" }, "fabricmanager": { "prefix": "550", "distribution": "rhel8", - "version": "550.54.14-1", - "sha256": "31c54439b3abc03e98a2e29fa950253d0989f4591b432c0e6e1461809c2a9cb3" + "version": "550.54.15-1", + "sha256": "a617d5f4f93f9f04698af41b261dee2e1280f9802a5ecb8c0a0de0b5c113a343" } } }, From 7e9c75e510698bd2c1aa8d144e63788b6078e090 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 25 Mar 2024 18:53:28 -0700 Subject: [PATCH 39/76] disable fabric manager initialization --- customizations/ndv4.sh | 8 ++++---- 
customizations/ndv5.sh | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/customizations/ndv4.sh b/customizations/ndv4.sh index adedeff3..9c5898a0 100755 --- a/customizations/ndv4.sh +++ b/customizations/ndv4.sh @@ -14,10 +14,10 @@ NCCL_TOPO_FILE=/opt/microsoft/ndv4/topo.xml EOF ## NVIDIA Fabric manager -systemctl enable nvidia-fabricmanager -systemctl stop nvidia-fabricmanager -systemctl start nvidia-fabricmanager -systemctl is-active --quiet nvidia-fabricmanager +# systemctl enable nvidia-fabricmanager +# systemctl stop nvidia-fabricmanager +# systemctl start nvidia-fabricmanager +# systemctl is-active --quiet nvidia-fabricmanager error_code=$? if [ ${error_code} -ne 0 ] diff --git a/customizations/ndv5.sh b/customizations/ndv5.sh index 53109a7a..dc70513e 100755 --- a/customizations/ndv5.sh +++ b/customizations/ndv5.sh @@ -14,10 +14,10 @@ NCCL_TOPO_FILE=/opt/microsoft/ndv5/topo.xml EOF ## NVIDIA Fabric manager -systemctl enable nvidia-fabricmanager -systemctl stop nvidia-fabricmanager -systemctl start nvidia-fabricmanager -systemctl is-active --quiet nvidia-fabricmanager +# systemctl enable nvidia-fabricmanager +# systemctl stop nvidia-fabricmanager +# systemctl start nvidia-fabricmanager +# systemctl is-active --quiet nvidia-fabricmanager error_code=$? 
if [ ${error_code} -ne 0 ] From 69702c0be5780f6e2c097b141f8b03ae5959a7ac Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 25 Mar 2024 20:43:09 -0700 Subject: [PATCH 40/76] enable fabric manager --- customizations/ndv4.sh | 2 +- customizations/ndv5.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/customizations/ndv4.sh b/customizations/ndv4.sh index 9c5898a0..bd9c45ca 100755 --- a/customizations/ndv4.sh +++ b/customizations/ndv4.sh @@ -14,7 +14,7 @@ NCCL_TOPO_FILE=/opt/microsoft/ndv4/topo.xml EOF ## NVIDIA Fabric manager -# systemctl enable nvidia-fabricmanager +systemctl enable nvidia-fabricmanager # systemctl stop nvidia-fabricmanager # systemctl start nvidia-fabricmanager # systemctl is-active --quiet nvidia-fabricmanager diff --git a/customizations/ndv5.sh b/customizations/ndv5.sh index dc70513e..68dc0fdd 100755 --- a/customizations/ndv5.sh +++ b/customizations/ndv5.sh @@ -14,7 +14,7 @@ NCCL_TOPO_FILE=/opt/microsoft/ndv5/topo.xml EOF ## NVIDIA Fabric manager -# systemctl enable nvidia-fabricmanager +systemctl enable nvidia-fabricmanager # systemctl stop nvidia-fabricmanager # systemctl start nvidia-fabricmanager # systemctl is-active --quiet nvidia-fabricmanager From ecbaef4ebf3b62b3ab9d1198fe4e18975538c754 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 25 Mar 2024 22:42:04 -0700 Subject: [PATCH 41/76] add starting fabric manager to install --- ubuntu/common/install_nvidia_fabric_manager.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ubuntu/common/install_nvidia_fabric_manager.sh b/ubuntu/common/install_nvidia_fabric_manager.sh index 44ff69de..6a570ca5 100755 --- a/ubuntu/common/install_nvidia_fabric_manager.sh +++ b/ubuntu/common/install_nvidia_fabric_manager.sh @@ -13,3 +13,7 @@ $COMMON_DIR/download_and_verify.sh $NVIDIA_FABRIC_MNGR_URL ${NVIDIA_FABRICMANAGE apt install -y ./nvidia-fabricmanager-${NVIDIA_FABRICMANAGER_VERSION}_amd64.deb apt-mark hold nvidia-fabricmanager-${NVIDIA_FABRICMANAGER_PREFIX} 
$COMMON_DIR/write_component_version.sh "NVIDIA_FABRIC_MANAGER" ${NVIDIA_FABRICMANAGER_VERSION} + +systemctl enable nvidia-fabricmanager +systemctl start nvidia-fabricmanager +systemctl is-active --quiet nvidia-fabricmanager From 9446c45374bf8de60796c79de513a2c39d27b20d Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 26 Mar 2024 00:21:21 -0700 Subject: [PATCH 42/76] update alma --- alma/alma-8.x/common/install_nvidiagpudriver.sh | 4 ++++ alma/common/install_mpis.sh | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/alma/alma-8.x/common/install_nvidiagpudriver.sh b/alma/alma-8.x/common/install_nvidiagpudriver.sh index 190f7456..aa5be140 100755 --- a/alma/alma-8.x/common/install_nvidiagpudriver.sh +++ b/alma/alma-8.x/common/install_nvidiagpudriver.sh @@ -72,6 +72,10 @@ yum install -y ./nvidia-fabric-manager-${NVIDIA_FABRICMANAGER_VERSION}.x86_64.rp sed -i "$ s/$/ nvidia-fabric-manager/" /etc/dnf/dnf.conf $COMMON_DIR/write_component_version.sh "NVIDIA_FABRIC_MANAGER" ${NVIDIA_FABRICMANAGER_VERSION} +systemctl enable nvidia-fabricmanager +systemctl start nvidia-fabricmanager +systemctl is-active --quiet nvidia-fabricmanager + # cleanup downloaded files rm -rf *.run *tar.gz *.rpm rm -rf -- */ diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 1d6a473d..79957aff 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -105,15 +105,15 @@ EOF cat << EOF >> /usr/share/Modules/modulefiles/mpi/impi_${IMPI_VERSION} #%Module 1.0 # -# Intel MPI ${IMPI_VERSION} +# Intel MPI ${impi_2021_version} # conflict mpi -module load /opt/intel/oneapi/mpi/${IMPI_VERSION}/modulefiles/impi -setenv MPI_BIN /opt/intel/oneapi/mpi/${IMPI_VERSION}/bin -setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${IMPI_VERSION}/include -setenv MPI_LIB /opt/intel/oneapi/mpi/${IMPI_VERSION}/lib -setenv MPI_MAN /opt/intel/oneapi/mpi/${IMPI_VERSION}/man -setenv MPI_HOME /opt/intel/oneapi/mpi/${IMPI_VERSION} +module load 
/opt/intel/oneapi/mpi/${impi_2021_version}/etc/modulefiles/impi/${impi_2021_version} +setenv MPI_BIN /opt/intel/oneapi/mpi/${impi_2021_version}/bin +setenv MPI_INCLUDE /opt/intel/oneapi/mpi/${impi_2021_version}/include +setenv MPI_LIB /opt/intel/oneapi/mpi/${impi_2021_version}/lib +setenv MPI_MAN /opt/intel/oneapi/mpi/${impi_2021_version}/share/man +setenv MPI_HOME /opt/intel/oneapi/mpi/${impi_2021_version} EOF # Create symlinks for modulefiles From ad2dc2d93c7120613e57c836a2c952747264af2c Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 26 Mar 2024 10:00:25 -0700 Subject: [PATCH 43/76] fix impi module filename in alma --- alma/common/install_mpis.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 79957aff..977b0509 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -102,7 +102,7 @@ setenv MPI_HOME /opt/openmpi-${OMPI_VERSION} EOF #IntelMPI-v2021 -cat << EOF >> /usr/share/Modules/modulefiles/mpi/impi_${IMPI_VERSION} +cat << EOF >> /usr/share/Modules/modulefiles/mpi/impi_${impi_2021_version} #%Module 1.0 # # Intel MPI ${impi_2021_version} From 1128732863fbc0ea34d5ac2d80ea0bcd736ab611 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 26 Mar 2024 12:10:33 -0700 Subject: [PATCH 44/76] add logging to mpi install --- alma/common/install_mpis.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 977b0509..e84c3618 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -1,5 +1,5 @@ #!/bin/bash -set -e +set -ex GCC_VERSION=$1 HPCX_PATH=$2 From d524d0b47f7831dcca244e3521cc1fd9fba6f14d Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 26 Mar 2024 19:15:50 -0700 Subject: [PATCH 45/76] add pmix scripts --- alma/alma-8.x/common/install_pmix.sh | 22 ++++++++++++ alma/alma-8.x/common/slurmel8.repo | 7 ++++ ubuntu/common/install_pmix.sh | 50 
++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+) create mode 100644 alma/alma-8.x/common/install_pmix.sh create mode 100644 alma/alma-8.x/common/slurmel8.repo create mode 100644 ubuntu/common/install_pmix.sh diff --git a/alma/alma-8.x/common/install_pmix.sh b/alma/alma-8.x/common/install_pmix.sh new file mode 100644 index 00000000..5626c6ee --- /dev/null +++ b/alma/alma-8.x/common/install_pmix.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +PMIX_VERSION=4.2.9-1 +OS_VERSION=$(cat /etc/os-release | grep VERSION_ID | cut -d= -f2 | cut -d\" -f2 | cut -d. -f1) +if [ $OS_VERSION == 8 ]; then + cp slurmel8.repo /etc/yum.repos.d/slurm.repo + release=el8 +elif [ $OS_VERSION == 7 ]; then + cp slurmel7.repo /etc/yum.repos.d/slurm.repo + release=el7 +else echo "unsupported version" +fi + +## This package is pre-installed in all hpc images used by cyclecloud, but if customer wants to +## build an image from generic marketplace images then this package sets up the right gpg keys for PMC. +if [ ! -e /etc/yum.repos.d/microsoft-prod.repo ];then + curl -sSL -O https://packages.microsoft.com/config/rhel/$OS_VERSION/packages-microsoft-prod.rpm + rpm -i packages-microsoft-prod.rpm + rm packages-microsoft-prod.rpm +fi + +yum -y install pmix-$PMIX_VERSION.$release \ No newline at end of file diff --git a/alma/alma-8.x/common/slurmel8.repo b/alma/alma-8.x/common/slurmel8.repo new file mode 100644 index 00000000..e83b1d42 --- /dev/null +++ b/alma/alma-8.x/common/slurmel8.repo @@ -0,0 +1,7 @@ +[slurm] +name=Slurm Workload Manager +baseurl=https://packages.microsoft.com/yumrepos/slurm-el8-insiders +enabled=1 +gpgcheck=1 +gpgkey=https://packages.microsoft.com/keys/microsoft.asc +priority=10 \ No newline at end of file diff --git a/ubuntu/common/install_pmix.sh b/ubuntu/common/install_pmix.sh new file mode 100644 index 00000000..46c34b48 --- /dev/null +++ b/ubuntu/common/install_pmix.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +set -x + +UBUNTU_VERSION=$(cat /etc/os-release | grep VERSION_ID | cut 
-d= -f2 | cut -d\" -f2) +PMIX_VERSION=4.2.9-1 + +if [ $UBUNTU_VERSION == 22.04 ]; then + REPO=slurm-ubuntu-jammy +elif [ $UBUNTU_VERSION == 20.04 ]; then + REPO=slurm-ubuntu-focal +else echo "$UBUNTU_VERSION not supported for pmix installation." +fi + +echo "deb [arch=amd64] https://packages.microsoft.com/repos/$REPO/ insiders main" > /etc/apt/sources.list.d/slurm.list + +# Set priority for pmix and slurm packages from PMC to be higher than upstream ubuntu. +echo "\ +Package: slurm-smd* +Pin: origin \"packages.microsoft.com\" +Pin-Priority: 990 + +Package: pmix +Pin: origin \"packages.microsoft.com\" +Pin-Priority: 990 + +Package: slurm* +Pin: origin *ubuntu.com* +Pin-Priority: -1 + +Package: pmix +Pin: origin *ubuntu.com* +Pin-Priority: -1" > /etc/apt/preferences.d/slurm-repository-pin-990 + +## This package is pre-installed in all hpc images used by cyclecloud, but if customer wants to +## use generic ubuntu marketplace image then this package sets up the right gpg keys for PMC. +if [ ! -e /etc/apt/sources.list.d/microsoft-prod.list ]; then + curl -sSL -O https://packages.microsoft.com/config/ubuntu/$UBUNTU_VERSION/packages-microsoft-prod.deb + dpkg -i packages-microsoft-prod.deb + rm packages-microsoft-prod.deb +fi + + +apt update + +apt install -y pmix=$PMIX_VERSION + +# Hold versions of packages to prevent accidental updates. Packages can still be upgraded explictly by +# '--allow-change-held-packages' flag. 
+apt-mark hold pmix=$PMIX_VERSION \ No newline at end of file From ac8d543aab2eb314f4788315bcc176a450949273 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 26 Mar 2024 19:18:42 -0700 Subject: [PATCH 46/76] update execute permissions --- alma/alma-8.x/common/hpc-tuning.sh | 14 +++++++------- alma/alma-8.x/common/install_pmix.sh | 0 ubuntu/common/install_pmix.sh | 0 3 files changed, 7 insertions(+), 7 deletions(-) mode change 100644 => 100755 alma/alma-8.x/common/install_pmix.sh mode change 100644 => 100755 ubuntu/common/install_pmix.sh diff --git a/alma/alma-8.x/common/hpc-tuning.sh b/alma/alma-8.x/common/hpc-tuning.sh index 2446d826..a0ab1fcf 100755 --- a/alma/alma-8.x/common/hpc-tuning.sh +++ b/alma/alma-8.x/common/hpc-tuning.sh @@ -1,7 +1,7 @@ -#!/bin/bash - -# Disable some unneeded services by default (administrators can re-enable if desired) -systemctl disable wpa_supplicant -systemctl disable abrtd - -../../common/hpc-tuning.sh +#!/bin/bash + +# Disable some unneeded services by default (administrators can re-enable if desired) +systemctl disable wpa_supplicant +systemctl disable abrtd + +../../common/hpc-tuning.sh diff --git a/alma/alma-8.x/common/install_pmix.sh b/alma/alma-8.x/common/install_pmix.sh old mode 100644 new mode 100755 diff --git a/ubuntu/common/install_pmix.sh b/ubuntu/common/install_pmix.sh old mode 100644 new mode 100755 From 8b1b2475a6e5e4137053e456f75c9c543cf24684 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 26 Mar 2024 19:51:13 -0700 Subject: [PATCH 47/76] add pmix version config --- alma/alma-8.x/common/install_pmix.sh | 20 ++++++++------------ requirements.json | 11 +++++++++++ ubuntu/common/install_pmix.sh | 9 +++++---- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/alma/alma-8.x/common/install_pmix.sh b/alma/alma-8.x/common/install_pmix.sh index 5626c6ee..6f3745ee 100755 --- a/alma/alma-8.x/common/install_pmix.sh +++ b/alma/alma-8.x/common/install_pmix.sh @@ -1,22 +1,18 @@ #!/bin/bash +set -ex 
-PMIX_VERSION=4.2.9-1 -OS_VERSION=$(cat /etc/os-release | grep VERSION_ID | cut -d= -f2 | cut -d\" -f2 | cut -d. -f1) -if [ $OS_VERSION == 8 ]; then - cp slurmel8.repo /etc/yum.repos.d/slurm.repo - release=el8 -elif [ $OS_VERSION == 7 ]; then - cp slurmel7.repo /etc/yum.repos.d/slurm.repo - release=el7 -else echo "unsupported version" -fi +PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) + +cp slurmel8.repo /etc/yum.repos.d/slurm.repo ## This package is pre-installed in all hpc images used by cyclecloud, but if customer wants to ## build an image from generic marketplace images then this package sets up the right gpg keys for PMC. if [ ! -e /etc/yum.repos.d/microsoft-prod.repo ];then - curl -sSL -O https://packages.microsoft.com/config/rhel/$OS_VERSION/packages-microsoft-prod.rpm + curl -sSL -O https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm rpm -i packages-microsoft-prod.rpm rm packages-microsoft-prod.rpm fi -yum -y install pmix-$PMIX_VERSION.$release \ No newline at end of file +yum -y install pmix-$PMIX_VERSION.el8 + +$COMMON_DIR/write_component_version.sh "PMIX" ${PMIX_VERSION} \ No newline at end of file diff --git a/requirements.json b/requirements.json index 527b808a..204b0411 100644 --- a/requirements.json +++ b/requirements.json @@ -300,5 +300,16 @@ "ubuntu22.04": { "branch": "stable-v48" } + }, + "pmix": { + "almalinux8.7": { + "version": "4.2.9-1" + }, + "ubuntu20.04": { + "version": "4.2.9-1" + }, + "ubuntu22.04": { + "version": "4.2.9-1" + } } } diff --git a/ubuntu/common/install_pmix.sh b/ubuntu/common/install_pmix.sh index 46c34b48..9cef7eee 100755 --- a/ubuntu/common/install_pmix.sh +++ b/ubuntu/common/install_pmix.sh @@ -1,9 +1,8 @@ #!/bin/bash +set -ex -set -x - +PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) UBUNTU_VERSION=$(cat /etc/os-release | grep VERSION_ID | cut -d= -f2 | cut -d\" -f2) -PMIX_VERSION=4.2.9-1 if [ $UBUNTU_VERSION == 22.04 ]; then 
REPO=slurm-ubuntu-jammy @@ -47,4 +46,6 @@ apt install -y pmix=$PMIX_VERSION # Hold versions of packages to prevent accidental updates. Packages can still be upgraded explictly by # '--allow-change-held-packages' flag. -apt-mark hold pmix=$PMIX_VERSION \ No newline at end of file +apt-mark hold pmix=$PMIX_VERSION + +$COMMON_DIR/write_component_version.sh "PMIX" ${PMIX_VERSION} \ No newline at end of file From 1e5073fb8ba3e87a8e67dd6ed9f05223e729c6e1 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 26 Mar 2024 19:54:11 -0700 Subject: [PATCH 48/76] call PMIX from install --- alma/alma-8.x/alma-8.7-hpc/install.sh | 3 +++ ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install.sh | 3 +++ ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh | 3 +++ 3 files changed, 9 insertions(+) diff --git a/alma/alma-8.x/alma-8.7-hpc/install.sh b/alma/alma-8.x/alma-8.7-hpc/install.sh index 279a951b..f5cbb4b0 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install.sh @@ -19,6 +19,9 @@ $ALMA_COMMON_DIR/install_lustre_client.sh "8" # install mellanox ofed ./install_mellanoxofed.sh +# install PMIX +$COMMON_DIR/../alma/alma-8.x/common/install_pmix.sh + # install mpi libraries ./install_mpis.sh diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install.sh index 271ad8dc..1a180eaf 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install.sh @@ -16,6 +16,9 @@ $UBUNTU_COMMON_DIR/install_lustre_client.sh # install mellanox ofed ./install_mellanoxofed.sh +# install PMIX +$UBUNTU_COMMON_DIR/install_pmix.sh + # install mpi libraries ./install_mpis.sh diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh index 271ad8dc..1a180eaf 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh @@ -16,6 +16,9 @@ $UBUNTU_COMMON_DIR/install_lustre_client.sh # install mellanox ofed 
./install_mellanoxofed.sh +# install PMIX +$UBUNTU_COMMON_DIR/install_pmix.sh + # install mpi libraries ./install_mpis.sh From 27573273e6255d817c707c8a7da6e4dca6aa5196 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Wed, 27 Mar 2024 10:10:09 -0700 Subject: [PATCH 49/76] clean up line endings --- alma/alma-8.x/common/hpc-tuning.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/alma/alma-8.x/common/hpc-tuning.sh b/alma/alma-8.x/common/hpc-tuning.sh index a0ab1fcf..2446d826 100755 --- a/alma/alma-8.x/common/hpc-tuning.sh +++ b/alma/alma-8.x/common/hpc-tuning.sh @@ -1,7 +1,7 @@ -#!/bin/bash - -# Disable some unneeded services by default (administrators can re-enable if desired) -systemctl disable wpa_supplicant -systemctl disable abrtd - -../../common/hpc-tuning.sh +#!/bin/bash + +# Disable some unneeded services by default (administrators can re-enable if desired) +systemctl disable wpa_supplicant +systemctl disable abrtd + +../../common/hpc-tuning.sh From 5e72617d7f5087aa00d953ce2778f7ac02dee8d9 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Thu, 28 Mar 2024 16:12:10 -0700 Subject: [PATCH 50/76] install developer libraries --- alma/alma-8.x/common/install_pmix.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/alma/alma-8.x/common/install_pmix.sh b/alma/alma-8.x/common/install_pmix.sh index 6f3745ee..5938658a 100755 --- a/alma/alma-8.x/common/install_pmix.sh +++ b/alma/alma-8.x/common/install_pmix.sh @@ -13,6 +13,7 @@ if [ ! 
-e /etc/yum.repos.d/microsoft-prod.repo ];then rm packages-microsoft-prod.rpm fi -yum -y install pmix-$PMIX_VERSION.el8 +dnf config-manager --set-enabled powertools +yum -y install pmix-$PMIX_VERSION.el8 hwloc-devel libevent-devel $COMMON_DIR/write_component_version.sh "PMIX" ${PMIX_VERSION} \ No newline at end of file From 466aef5b445eae707e4d871f3d426da67999b851 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Thu, 28 Mar 2024 16:14:33 -0700 Subject: [PATCH 51/76] update ubuntu to use the LTS kernel --- .../ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh | 7 +++---- .../ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh index 61d086b7..cc443b44 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh @@ -1,12 +1,11 @@ #!/bin/bash set -ex -# Don't allow the kernel to be updated -apt-mark hold linux-azure - # upgrade pre-installed components apt update -apt upgrade -y + +# install LTS kernel +apt install -y linux-azure-lts # jq is needed to parse the component versions from the requirements.json file apt install -y jq diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh index 61d086b7..cc443b44 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh @@ -1,12 +1,11 @@ #!/bin/bash set -ex -# Don't allow the kernel to be updated -apt-mark hold linux-azure - # upgrade pre-installed components apt update -apt upgrade -y + +# install LTS kernel +apt install -y linux-azure-lts # jq is needed to parse the component versions from the requirements.json file apt install -y jq From b4370aea9c73a6a079fe5d45dde26e4963eee4be Mon 
Sep 17 00:00:00 2001 From: Matt Fraser Date: Thu, 28 Mar 2024 16:55:20 -0700 Subject: [PATCH 52/76] move PMC repo to prerequisites --- .../ubuntu-20.04-hpc/install_prerequisites.sh | 9 +++++++++ .../ubuntu-20.04-hpc/install_utils.sh | 16 ++++++++-------- .../ubuntu-22.04-hpc/install_prerequisites.sh | 9 +++++++++ .../ubuntu-22.04-hpc/install_utils.sh | 16 ++++++++-------- 4 files changed, 34 insertions(+), 16 deletions(-) diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh index cc443b44..9bc89ca4 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh @@ -1,6 +1,15 @@ #!/bin/bash set -ex +# Setup microsoft packages repository for moby +# Download the repository configuration package +curl https://packages.microsoft.com/config/ubuntu/20.04/prod.list > ./microsoft-prod.list +# Copy the generated list to the sources.list.d directory +cp ./microsoft-prod.list /etc/apt/sources.list.d/ +# Install the Microsoft GPG public key +curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg +cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ + # upgrade pre-installed components apt update diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_utils.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_utils.sh index 714712f1..5d21e268 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_utils.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_utils.sh @@ -1,14 +1,14 @@ #!/bin/bash set -ex -# Setup microsoft packages repository for moby -# Download the repository configuration package -curl https://packages.microsoft.com/config/ubuntu/20.04/prod.list > ./microsoft-prod.list -# Copy the generated list to the sources.list.d directory -cp ./microsoft-prod.list /etc/apt/sources.list.d/ -# Install the Microsoft GPG public key -curl https://packages.microsoft.com/keys/microsoft.asc | gpg 
--dearmor > microsoft.gpg -cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ +# # Setup microsoft packages repository for moby +# # Download the repository configuration package +# curl https://packages.microsoft.com/config/ubuntu/20.04/prod.list > ./microsoft-prod.list +# # Copy the generated list to the sources.list.d directory +# cp ./microsoft-prod.list /etc/apt/sources.list.d/ +# # Install the Microsoft GPG public key +# curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg +# cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ #apt-get install packages diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh index cc443b44..f2f905c9 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh @@ -1,6 +1,15 @@ #!/bin/bash set -ex +# Setup microsoft packages repository for moby +# Download the repository configuration package +curl https://packages.microsoft.com/config/ubuntu/22.04/prod.list > ./microsoft-prod.list +# Copy the generated list to the sources.list.d directory +cp ./microsoft-prod.list /etc/apt/sources.list.d/ +# Install the Microsoft GPG public key +curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg +cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ + # upgrade pre-installed components apt update diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_utils.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_utils.sh index 4f57c486..e7544a1d 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_utils.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_utils.sh @@ -1,14 +1,14 @@ #!/bin/bash set -ex -# Setup microsoft packages repository for moby -# Download the repository configuration package -curl https://packages.microsoft.com/config/ubuntu/22.04/prod.list > ./microsoft-prod.list -# Copy the generated list to the sources.list.d directory -cp 
./microsoft-prod.list /etc/apt/sources.list.d/ -# Install the Microsoft GPG public key -curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg -cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ +# # Setup microsoft packages repository for moby +# # Download the repository configuration package +# curl https://packages.microsoft.com/config/ubuntu/22.04/prod.list > ./microsoft-prod.list +# # Copy the generated list to the sources.list.d directory +# cp ./microsoft-prod.list /etc/apt/sources.list.d/ +# # Install the Microsoft GPG public key +# curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg +# cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ #apt-get install packages From 533c6ab9457fd3911e1d9e2e1ba142e02080ba51 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Thu, 28 Mar 2024 17:12:51 -0700 Subject: [PATCH 53/76] update LTS kernel package name --- .../ubuntu-20.04-hpc/install_prerequisites.sh | 11 +---------- .../ubuntu-20.04-hpc/install_utils.sh | 16 ++++++++-------- .../ubuntu-22.04-hpc/install_prerequisites.sh | 11 +---------- .../ubuntu-22.04-hpc/install_utils.sh | 16 ++++++++-------- 4 files changed, 18 insertions(+), 36 deletions(-) diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh index 9bc89ca4..678262f3 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh @@ -1,20 +1,11 @@ #!/bin/bash set -ex -# Setup microsoft packages repository for moby -# Download the repository configuration package -curl https://packages.microsoft.com/config/ubuntu/20.04/prod.list > ./microsoft-prod.list -# Copy the generated list to the sources.list.d directory -cp ./microsoft-prod.list /etc/apt/sources.list.d/ -# Install the Microsoft GPG public key -curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg -cp ./microsoft.gpg 
/etc/apt/trusted.gpg.d/ - # upgrade pre-installed components apt update # install LTS kernel -apt install -y linux-azure-lts +apt install -y linux-azure-lts-20.04 # jq is needed to parse the component versions from the requirements.json file apt install -y jq diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_utils.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_utils.sh index 5d21e268..714712f1 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_utils.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_utils.sh @@ -1,14 +1,14 @@ #!/bin/bash set -ex -# # Setup microsoft packages repository for moby -# # Download the repository configuration package -# curl https://packages.microsoft.com/config/ubuntu/20.04/prod.list > ./microsoft-prod.list -# # Copy the generated list to the sources.list.d directory -# cp ./microsoft-prod.list /etc/apt/sources.list.d/ -# # Install the Microsoft GPG public key -# curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg -# cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ +# Setup microsoft packages repository for moby +# Download the repository configuration package +curl https://packages.microsoft.com/config/ubuntu/20.04/prod.list > ./microsoft-prod.list +# Copy the generated list to the sources.list.d directory +cp ./microsoft-prod.list /etc/apt/sources.list.d/ +# Install the Microsoft GPG public key +curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg +cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ #apt-get install packages diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh index f2f905c9..84c8e8bf 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh @@ -1,20 +1,11 @@ #!/bin/bash set -ex -# Setup microsoft packages repository for moby -# Download the repository configuration package -curl 
https://packages.microsoft.com/config/ubuntu/22.04/prod.list > ./microsoft-prod.list -# Copy the generated list to the sources.list.d directory -cp ./microsoft-prod.list /etc/apt/sources.list.d/ -# Install the Microsoft GPG public key -curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg -cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ - # upgrade pre-installed components apt update # install LTS kernel -apt install -y linux-azure-lts +apt install -y linux-azure-lts-22.04 # jq is needed to parse the component versions from the requirements.json file apt install -y jq diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_utils.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_utils.sh index e7544a1d..4f57c486 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_utils.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_utils.sh @@ -1,14 +1,14 @@ #!/bin/bash set -ex -# # Setup microsoft packages repository for moby -# # Download the repository configuration package -# curl https://packages.microsoft.com/config/ubuntu/22.04/prod.list > ./microsoft-prod.list -# # Copy the generated list to the sources.list.d directory -# cp ./microsoft-prod.list /etc/apt/sources.list.d/ -# # Install the Microsoft GPG public key -# curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg -# cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ +# Setup microsoft packages repository for moby +# Download the repository configuration package +curl https://packages.microsoft.com/config/ubuntu/22.04/prod.list > ./microsoft-prod.list +# Copy the generated list to the sources.list.d directory +cp ./microsoft-prod.list /etc/apt/sources.list.d/ +# Install the Microsoft GPG public key +curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg +cp ./microsoft.gpg /etc/apt/trusted.gpg.d/ #apt-get install packages From 507a334e6d84620fb728b6ca4cb6d6758793a74c Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Fri, 29 
Mar 2024 10:47:38 -0700 Subject: [PATCH 54/76] revert back to upgrading all packages --- alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh | 10 +++++++--- .../ubuntu-20.04-hpc/install_prerequisites.sh | 9 +++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh b/alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh index dc969167..be321f89 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh @@ -1,8 +1,12 @@ #!/bin/bash set -ex -# Import the newest AlmaLinux 8 GPG key -rpm --import https://repo.almalinux.org/almalinux/RPM-GPG-KEY-AlmaLinux +# Don't allow the kernel to be updated +apt-mark hold linux-azure + +# upgrade pre-installed components +apt update +apt upgrade -y # jq is needed to parse the component versions from the requirements.json file -yum install -y jq +apt install -y jq \ No newline at end of file diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh index 678262f3..be321f89 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh @@ -1,11 +1,12 @@ #!/bin/bash set -ex +# Don't allow the kernel to be updated +apt-mark hold linux-azure + # upgrade pre-installed components apt update - -# install LTS kernel -apt install -y linux-azure-lts-20.04 +apt upgrade -y # jq is needed to parse the component versions from the requirements.json file -apt install -y jq +apt install -y jq \ No newline at end of file From 095030ab6f9a1d3b1d7d603d1bb5b7b9db398f5c Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Fri, 29 Mar 2024 10:49:51 -0700 Subject: [PATCH 55/76] fix prerequisites --- alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh | 10 +++------- .../ubuntu-22.04-hpc/install_prerequisites.sh | 9 +++++---- 2 files changed, 8 insertions(+), 11 deletions(-) diff 
--git a/alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh b/alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh index be321f89..dc969167 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install_prerequisites.sh @@ -1,12 +1,8 @@ #!/bin/bash set -ex -# Don't allow the kernel to be updated -apt-mark hold linux-azure - -# upgrade pre-installed components -apt update -apt upgrade -y +# Import the newest AlmaLinux 8 GPG key +rpm --import https://repo.almalinux.org/almalinux/RPM-GPG-KEY-AlmaLinux # jq is needed to parse the component versions from the requirements.json file -apt install -y jq \ No newline at end of file +yum install -y jq diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh index 84c8e8bf..be321f89 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh @@ -1,11 +1,12 @@ #!/bin/bash set -ex +# Don't allow the kernel to be updated +apt-mark hold linux-azure + # upgrade pre-installed components apt update - -# install LTS kernel -apt install -y linux-azure-lts-22.04 +apt upgrade -y # jq is needed to parse the component versions from the requirements.json file -apt install -y jq +apt install -y jq \ No newline at end of file From d1488966147fdb37dbe47e9ccba5fbb8236e6900 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Fri, 29 Mar 2024 10:54:18 -0700 Subject: [PATCH 56/76] cleanup whitespace --- alma/alma-8.x/common/install_mpis.sh | 2 +- alma/alma-8.x/common/install_pmix.sh | 2 +- tests/run-tests.sh | 2 +- ubuntu/common/install_pmix.sh | 2 +- ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh | 2 +- ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/alma/alma-8.x/common/install_mpis.sh b/alma/alma-8.x/common/install_mpis.sh index 8ac9c9c6..d5e3f19c 100755 
--- a/alma/alma-8.x/common/install_mpis.sh +++ b/alma/alma-8.x/common/install_mpis.sh @@ -44,4 +44,4 @@ EOF # Create symlinks for modulefiles ln -s /usr/share/Modules/modulefiles/mpi/impi_${IMPI_VERSION} /usr/share/Modules/modulefiles/mpi/impi -../../common/install_mpis.sh ${GCC_VERSION} ${HPCX_PATH} \ No newline at end of file +../../common/install_mpis.sh ${GCC_VERSION} ${HPCX_PATH} diff --git a/alma/alma-8.x/common/install_pmix.sh b/alma/alma-8.x/common/install_pmix.sh index 5938658a..6e70a178 100755 --- a/alma/alma-8.x/common/install_pmix.sh +++ b/alma/alma-8.x/common/install_pmix.sh @@ -16,4 +16,4 @@ fi dnf config-manager --set-enabled powertools yum -y install pmix-$PMIX_VERSION.el8 hwloc-devel libevent-devel -$COMMON_DIR/write_component_version.sh "PMIX" ${PMIX_VERSION} \ No newline at end of file +$COMMON_DIR/write_component_version.sh "PMIX" ${PMIX_VERSION} diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 6d7c8476..d02c4414 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -62,9 +62,9 @@ find_ubuntu_distro() { distro=`find_distro` echo "Detected distro: ${distro}" - OMPI_VERSION_UBUNTU="5.0.2" HPCX_MOFED_INTEGRATION_VERSION="MLNX_OFED_LINUX-24.01-0.3.3.1" + case ${distro} in "Ubuntu 20.04") HPCX_VERSION_UBUNTU="v2.18"; MOFED_VERSION_UBUNTU="MLNX_OFED_LINUX-24.01-0.3.3.1"; diff --git a/ubuntu/common/install_pmix.sh b/ubuntu/common/install_pmix.sh index 9cef7eee..57069d33 100755 --- a/ubuntu/common/install_pmix.sh +++ b/ubuntu/common/install_pmix.sh @@ -48,4 +48,4 @@ apt install -y pmix=$PMIX_VERSION # '--allow-change-held-packages' flag. 
apt-mark hold pmix=$PMIX_VERSION -$COMMON_DIR/write_component_version.sh "PMIX" ${PMIX_VERSION} \ No newline at end of file +$COMMON_DIR/write_component_version.sh "PMIX" ${PMIX_VERSION} diff --git a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh index be321f89..61d086b7 100755 --- a/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-20.x/ubuntu-20.04-hpc/install_prerequisites.sh @@ -9,4 +9,4 @@ apt update apt upgrade -y # jq is needed to parse the component versions from the requirements.json file -apt install -y jq \ No newline at end of file +apt install -y jq diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh index be321f89..61d086b7 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh @@ -9,4 +9,4 @@ apt update apt upgrade -y # jq is needed to parse the component versions from the requirements.json file -apt install -y jq \ No newline at end of file +apt install -y jq From a465207640cd757bc082ef369a6128fc0081c270 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Fri, 29 Mar 2024 19:01:21 -0700 Subject: [PATCH 57/76] remove IMPI 2018 --- alma/alma-8.x/common/install_mpis.sh | 47 ---------------------------- 1 file changed, 47 deletions(-) delete mode 100755 alma/alma-8.x/common/install_mpis.sh diff --git a/alma/alma-8.x/common/install_mpis.sh b/alma/alma-8.x/common/install_mpis.sh deleted file mode 100755 index d5e3f19c..00000000 --- a/alma/alma-8.x/common/install_mpis.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -ex - -GCC_VERSION=$1 -HPCX_PATH=$2 - -HCOLL_PATH=${HPCX_PATH}/hcoll -UCX_PATH=${HPCX_PATH}/ucx -INSTALL_PREFIX=/opt - -# Load gcc -export PATH=/opt/${GCC_VERSION}/bin:$PATH -export LD_LIBRARY_PATH=/opt/${GCC_VERSION}/lib64:$LD_LIBRARY_PATH -set 
CC=/opt/${GCC_VERSION}/bin/gcc -set GCC=/opt/${GCC_VERSION}/bin/gcc - -# Intel MPI 2018 (update 4) -IMPI_VERSION="2018.4.274" -$COMMON_DIR/write_component_version.sh "IMPI_2018" ${IMPI_VERSION} -IMPI_2018_DOWNLOAD_URL=http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/13651/l_mpi_${IMPI_VERSION}.tgz -$COMMON_DIR/download_and_verify.sh $IMPI_2018_DOWNLOAD_URL "a1114b3eb4149c2f108964b83cad02150d619e50032059d119ac4ffc9d5dd8e0" -tar -xvf l_mpi_${IMPI_VERSION}.tgz -cd l_mpi_${IMPI_VERSION} -# Update the silent.cfg file to proceed with installation -sed -i -e 's/ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' silent.cfg -./install.sh --silent ./silent.cfg -cd .. - -#IntelMPI-v2018 -cat << EOF >> /usr/share/Modules/modulefiles/mpi/impi_${IMPI_VERSION} -#%Module 1.0 -# -# Intel MPI ${IMPI_VERSION} -# -conflict mpi -module load /opt/intel/impi/${IMPI_VERSION}/intel64/modulefiles/mpi -setenv MPI_BIN /opt/intel/impi/${IMPI_VERSION}/intel64/bin -setenv MPI_INCLUDE /opt/intel/impi/${IMPI_VERSION}/intel64/include -setenv MPI_LIB /opt/intel/impi/${IMPI_VERSION}/intel64/lib -setenv MPI_MAN /opt/intel/impi/${IMPI_VERSION}/man -setenv MPI_HOME /opt/intel/impi/${IMPI_VERSION}/intel64 -EOF - -# Create symlinks for modulefiles -ln -s /usr/share/Modules/modulefiles/mpi/impi_${IMPI_VERSION} /usr/share/Modules/modulefiles/mpi/impi - -../../common/install_mpis.sh ${GCC_VERSION} ${HPCX_PATH} From 0f37b6a548e0b569909496c9e1529ad87a4c4e16 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Fri, 29 Mar 2024 19:33:20 -0700 Subject: [PATCH 58/76] update mpi installs --- alma/alma-8.x/alma-8.7-hpc/install_mpis.sh | 8 +++++++- alma/alma-8.x/common/install_pmix.sh | 2 +- alma/common/install_mpis.sh | 7 +++++-- ubuntu/common/install_mpis.sh | 12 +++++++++--- ubuntu/common/install_pmix.sh | 4 ++-- 5 files changed, 24 insertions(+), 9 deletions(-) diff --git a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh index 7bc0cf97..27cd2f8f 100755 --- 
a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh @@ -19,12 +19,18 @@ HPCX_DOWNLOAD_URL=$(jq -r '.url' <<< $hpcx_metadata) TARBALL=$(basename $HPCX_DOWNLOAD_URL) HPCX_FOLDER=$(basename $HPCX_DOWNLOAD_URL .tbz) +PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) +PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION} + $COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL $HPCX_SHA256 tar -xvf ${TARBALL} mv ${HPCX_FOLDER} ${INSTALL_PREFIX} HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} $COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION +# rebuild HPCX with PMIx +${HPCX_PATH}/utils/hpcx_rebuild.sh --with-hcoll --ompi-extra-config --with-pmix=${PMIX_PATH} + # exclude ucx from updates sed -i "$ s/$/ ucx*/" /etc/dnf/dnf.conf @@ -38,7 +44,7 @@ cat << EOF >> /usr/share/Modules/modulefiles/mpi/hpcx-${HPCX_VERSION} # HPCx ${HPCX_VERSION} # conflict mpi -module load ${HPCX_PATH}/modulefiles/hpcx +module load ${HPCX_PATH}/modulefiles/hpcx-rebuild EOF # Create symlinks for modulefiles diff --git a/alma/alma-8.x/common/install_pmix.sh b/alma/alma-8.x/common/install_pmix.sh index 6e70a178..ac0d24fd 100755 --- a/alma/alma-8.x/common/install_pmix.sh +++ b/alma/alma-8.x/common/install_pmix.sh @@ -14,6 +14,6 @@ if [ ! 
-e /etc/yum.repos.d/microsoft-prod.repo ];then fi dnf config-manager --set-enabled powertools -yum -y install pmix-$PMIX_VERSION.el8 hwloc-devel libevent-devel +yum -y install pmix-$PMIX_VERSION.el8 hwloc-devel libevent-devel munge-devel $COMMON_DIR/write_component_version.sh "PMIX" ${PMIX_VERSION} diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index e84c3618..622613ed 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -4,9 +4,12 @@ set -ex GCC_VERSION=$1 HPCX_PATH=$2 +PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) + HCOLL_PATH=${HPCX_PATH}/hcoll UCX_PATH=${HPCX_PATH}/ucx INSTALL_PREFIX=/opt +PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION} # Load gcc export PATH=/opt/${GCC_VERSION}/bin:$PATH @@ -25,7 +28,7 @@ MVAPICH2_FOLDER=$(basename $MVAPICH2_DOWNLOAD_URL .tar.gz) $COMMON_DIR/download_and_verify.sh $MVAPICH2_DOWNLOAD_URL $MVAPICH2_SHA256 tar -xvf ${TARBALL} cd ${MVAPICH2_FOLDER} -./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes && make -j$(nproc) && make install +./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes --with-pmix=${PMIX_PATH} && make -j$(nproc) && make install cd .. 
$COMMON_DIR/write_component_version.sh "MVAPICH2" ${MVAPICH2_VERSION} @@ -41,7 +44,7 @@ OMPI_FOLDER=$(basename $OMPI_DOWNLOAD_URL .tar.gz) $COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL $OMPI_SHA256 tar -xvf $TARBALL cd $OMPI_FOLDER -./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install +./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --with-pmix=${PMIX_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install cd .. $COMMON_DIR/write_component_version.sh "OMPI" ${OMPI_VERSION} diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index 33d21113..783bcd4d 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -7,6 +7,9 @@ set GCC=/usr/bin/gcc INSTALL_PREFIX=/opt +PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) +PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION} + # HPC-X v2.16 hpcx_metadata=$(jq -r '.hpcx."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) HPCX_VERSION=$(jq -r '.version' <<< $hpcx_metadata) @@ -21,6 +24,9 @@ mv ${HPCX_FOLDER} ${INSTALL_PREFIX} HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} $COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION +# rebuild HPCX with PMIx +${HPCX_PATH}/utils/hpcx_rebuild.sh --with-hcoll --ompi-extra-config --with-pmix=${PMIX_PATH} + # Install MVAPICH2 mvapich2_metadata=$(jq -r '.mvapich2."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) MVAPICH2_VERSION=$(jq -r '.version' <<< $mvapich2_metadata) @@ -35,7 +41,7 @@ cd ${MVAPICH2_FOLDER} # Error exclusive to Ubuntu 22.04 # configure: error: The Fortran compiler gfortran will not compile files that call # the same routine with arguments of different types. 
-./configure $(if [[ ${DISTRIBUTION} == "ubuntu22.04" ]]; then echo "FFLAGS=-fallow-argument-mismatch"; fi) --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes && make -j$(nproc) && make install +./configure $(if [[ ${DISTRIBUTION} == "ubuntu22.04" ]]; then echo "FFLAGS=-fallow-argument-mismatch"; fi) --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes --with-pmix=${PMIX_PATH} && make -j$(nproc) && make install cd .. $COMMON_DIR/write_component_version.sh "MVAPICH2" ${MVAPICH2_VERSION} @@ -50,7 +56,7 @@ OMPI_FOLDER=$(basename $OMPI_DOWNLOAD_URL .tar.gz) $COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL $OMPI_SHA256 tar -xvf $TARBALL cd $OMPI_FOLDER -./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install +./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --with-pmix=${PMIX_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install cd .. $COMMON_DIR/write_component_version.sh "OMPI" ${OMPI_VERSION} @@ -79,7 +85,7 @@ cat << EOF >> ${MODULE_FILES_DIRECTORY}/hpcx-${HPCX_VERSION} # HPCx ${HPCX_VERSION} # conflict mpi -module load ${HPCX_PATH}/modulefiles/hpcx +module load ${HPCX_PATH}/modulefiles/hpcx-rebuild EOF # MVAPICH2 diff --git a/ubuntu/common/install_pmix.sh b/ubuntu/common/install_pmix.sh index 57069d33..255eb9aa 100755 --- a/ubuntu/common/install_pmix.sh +++ b/ubuntu/common/install_pmix.sh @@ -42,10 +42,10 @@ fi apt update -apt install -y pmix=$PMIX_VERSION +apt install -y pmix=$PMIX_VERSION libevent-dev libhwloc-dev # libmunge-dev # Hold versions of packages to prevent accidental updates. Packages can still be upgraded explictly by # '--allow-change-held-packages' flag. 
-apt-mark hold pmix=$PMIX_VERSION +apt-mark hold pmix=$PMIX_VERSION libevent-dev libhwloc-dev # libmunge-dev $COMMON_DIR/write_component_version.sh "PMIX" ${PMIX_VERSION} From 7d394e2da43dc40d64567efb686367fb33006206 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Fri, 29 Mar 2024 20:47:27 -0700 Subject: [PATCH 59/76] fix pmix path --- alma/alma-8.x/alma-8.7-hpc/install_mpis.sh | 2 +- alma/common/install_mpis.sh | 2 +- ubuntu/common/install_mpis.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh index 27cd2f8f..ab10a03f 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh @@ -20,7 +20,7 @@ TARBALL=$(basename $HPCX_DOWNLOAD_URL) HPCX_FOLDER=$(basename $HPCX_DOWNLOAD_URL .tbz) PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) -PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION} +PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2} $COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL $HPCX_SHA256 tar -xvf ${TARBALL} diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 622613ed..06e0e7e2 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -9,7 +9,7 @@ PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS HCOLL_PATH=${HPCX_PATH}/hcoll UCX_PATH=${HPCX_PATH}/ucx INSTALL_PREFIX=/opt -PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION} +PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2} # Load gcc export PATH=/opt/${GCC_VERSION}/bin:$PATH diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index 783bcd4d..82800919 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -8,7 +8,7 @@ set GCC=/usr/bin/gcc INSTALL_PREFIX=/opt PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) -PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION} 
+PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2} # HPC-X v2.16 hpcx_metadata=$(jq -r '.hpcx."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) From 871638547a91afdd83592b94b9e4a9cbbbd8c14f Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Sat, 30 Mar 2024 11:12:17 -0700 Subject: [PATCH 60/76] fix path to repo file --- alma/alma-8.x/common/install_pmix.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/alma/alma-8.x/common/install_pmix.sh b/alma/alma-8.x/common/install_pmix.sh index ac0d24fd..415cb0dd 100755 --- a/alma/alma-8.x/common/install_pmix.sh +++ b/alma/alma-8.x/common/install_pmix.sh @@ -2,8 +2,9 @@ set -ex PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) +SCRIPT_DIR=dirname "$0" -cp slurmel8.repo /etc/yum.repos.d/slurm.repo +cp ${SCRIPT_DIR}/slurmel8.repo /etc/yum.repos.d/slurm.repo ## This package is pre-installed in all hpc images used by cyclecloud, but if customer wants to ## build an image from generic marketplace images then this package sets up the right gpg keys for PMC. 
From e4c50da6095dff8b8a65a5e9c02ed3df6d78fbf8 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Sat, 30 Mar 2024 16:31:50 -0700 Subject: [PATCH 61/76] fix pmix pathing --- alma/alma-8.x/alma-8.7-hpc/install.sh | 2 +- alma/alma-8.x/common/install_pmix.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/alma/alma-8.x/alma-8.7-hpc/install.sh b/alma/alma-8.x/alma-8.7-hpc/install.sh index f5cbb4b0..fa309899 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install.sh @@ -20,7 +20,7 @@ $ALMA_COMMON_DIR/install_lustre_client.sh "8" ./install_mellanoxofed.sh # install PMIX -$COMMON_DIR/../alma/alma-8.x/common/install_pmix.sh +$ALMA_COMMON_DIR/../alma-8.x/common/install_pmix.sh # install mpi libraries ./install_mpis.sh diff --git a/alma/alma-8.x/common/install_pmix.sh b/alma/alma-8.x/common/install_pmix.sh index 415cb0dd..bd69b7fb 100755 --- a/alma/alma-8.x/common/install_pmix.sh +++ b/alma/alma-8.x/common/install_pmix.sh @@ -2,9 +2,9 @@ set -ex PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) -SCRIPT_DIR=dirname "$0" +# SCRIPT_DIR=dirname "$0" -cp ${SCRIPT_DIR}/slurmel8.repo /etc/yum.repos.d/slurm.repo +cp ./slurmel8.repo /etc/yum.repos.d/slurm.repo ## This package is pre-installed in all hpc images used by cyclecloud, but if customer wants to ## build an image from generic marketplace images then this package sets up the right gpg keys for PMC. 
From 46f8149328e5f42710ba5caec09ef02587ab68b7 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Sat, 30 Mar 2024 17:06:05 -0700 Subject: [PATCH 62/76] set repo dir --- alma/alma-8.x/common/install_pmix.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/alma/alma-8.x/common/install_pmix.sh b/alma/alma-8.x/common/install_pmix.sh index bd69b7fb..ec5b779b 100755 --- a/alma/alma-8.x/common/install_pmix.sh +++ b/alma/alma-8.x/common/install_pmix.sh @@ -2,9 +2,9 @@ set -ex PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) -# SCRIPT_DIR=dirname "$0" +REPO_DIR="$ALMA_COMMON_DIR/../alma-8.x/common" -cp ./slurmel8.repo /etc/yum.repos.d/slurm.repo +cp ${REPO_DIR}/slurmel8.repo /etc/yum.repos.d/slurm.repo ## This package is pre-installed in all hpc images used by cyclecloud, but if customer wants to ## build an image from generic marketplace images then this package sets up the right gpg keys for PMC. From e9e245636db3e5b58876e76d897cc3d1e7844d12 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Sun, 31 Mar 2024 12:40:28 -0700 Subject: [PATCH 63/76] roll back openmpi --- alma/common/install_mpis.sh | 4 +++- requirements.json | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 06e0e7e2..a1cb59db 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -44,7 +44,9 @@ OMPI_FOLDER=$(basename $OMPI_DOWNLOAD_URL .tar.gz) $COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL $OMPI_SHA256 tar -xvf $TARBALL cd $OMPI_FOLDER -./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --with-pmix=${PMIX_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install +./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --with-pmix=${PMIX_PATH} 
--enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized +make -j$(nproc) +make install cd .. $COMMON_DIR/write_component_version.sh "OMPI" ${OMPI_VERSION} diff --git a/requirements.json b/requirements.json index 204b0411..49eb6f62 100644 --- a/requirements.json +++ b/requirements.json @@ -73,9 +73,9 @@ "url": "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.2.tar.gz" }, "almalinux8.7": { - "version": "5.0.2", - "sha256": "095ab1cddb0fa0f9e7fc211a1d33185c6727c5237d0ee55f80a7e4311e5d279c", - "url": "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.2.tar.gz" + "version": "4.1.6", + "sha256": "44da277b8cdc234e71c62473305a09d63f4dcca292ca40335aab7c4bf0e6a566", + "url": "https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz" } }, "impi": { From 48e94878031f3dce830115f471c5b26ca648c7fe Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Sun, 31 Mar 2024 14:16:59 -0700 Subject: [PATCH 64/76] fix alma ompi test directory --- tests/run-tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index d02c4414..544553d0 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -80,7 +80,7 @@ esac MVAPICH2_VERSION_ALMA="2.3.7-1" MVAPICH2_VERSION_UBUNTU="2.3.7-1" -OMPI_VERSION_ALMA_87="5.0.2" +OMPI_VERSION_ALMA_87="4.1.6" IMPI_2021_VERSION_ALMA_87="2021.11" MVAPICH2X_INSTALLATION_DIRECTORY="/opt/mvapich2-x" From 0cd10b618de51d2a6255a6065730dd3e1bde7ac0 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 1 Apr 2024 11:14:26 -0700 Subject: [PATCH 65/76] fox for ompi 5 on alma --- alma/alma-8.x/alma-8.7-hpc/install_mpis.sh | 2 ++ requirements.json | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh index ab10a03f..3cfecf2e 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh @@ -24,6 +24,8 
@@ PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2} $COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL $HPCX_SHA256 tar -xvf ${TARBALL} + +sed -i "s/\/build-result\//\/opt\//" ${HPCX_FOLDER}/hcoll/lib/pkgconfig/hcoll.pc mv ${HPCX_FOLDER} ${INSTALL_PREFIX} HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} $COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION diff --git a/requirements.json b/requirements.json index 49eb6f62..204b0411 100644 --- a/requirements.json +++ b/requirements.json @@ -73,9 +73,9 @@ "url": "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.2.tar.gz" }, "almalinux8.7": { - "version": "4.1.6", - "sha256": "44da277b8cdc234e71c62473305a09d63f4dcca292ca40335aab7c4bf0e6a566", - "url": "https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz" + "version": "5.0.2", + "sha256": "095ab1cddb0fa0f9e7fc211a1d33185c6727c5237d0ee55f80a7e4311e5d279c", + "url": "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.2.tar.gz" } }, "impi": { From 1271d54bedcfe3e13a1c1e72ec9ea0257114cc16 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 1 Apr 2024 12:33:35 -0700 Subject: [PATCH 66/76] update alma ompi test --- tests/run-tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 544553d0..d02c4414 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -80,7 +80,7 @@ esac MVAPICH2_VERSION_ALMA="2.3.7-1" MVAPICH2_VERSION_UBUNTU="2.3.7-1" -OMPI_VERSION_ALMA_87="4.1.6" +OMPI_VERSION_ALMA_87="5.0.2" IMPI_2021_VERSION_ALMA_87="2021.11" MVAPICH2X_INSTALLATION_DIRECTORY="/opt/mvapich2-x" From 14af60cd919c7d835f812972a1f1563588a4f101 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 1 Apr 2024 12:52:28 -0700 Subject: [PATCH 67/76] use pmix-libdir --- alma/alma-8.x/alma-8.7-hpc/install_mpis.sh | 2 +- alma/common/install_mpis.sh | 4 ++-- ubuntu/common/install_mpis.sh | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh index 3cfecf2e..711f6f7a 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh @@ -31,7 +31,7 @@ HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} $COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION # rebuild HPCX with PMIx -${HPCX_PATH}/utils/hpcx_rebuild.sh --with-hcoll --ompi-extra-config --with-pmix=${PMIX_PATH} +${HPCX_PATH}/utils/hpcx_rebuild.sh --with-hcoll --ompi-extra-config --with-pmix-libdir=${PMIX_PATH} # exclude ucx from updates sed -i "$ s/$/ ucx*/" /etc/dnf/dnf.conf diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index a1cb59db..a3eabe9d 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -28,7 +28,7 @@ MVAPICH2_FOLDER=$(basename $MVAPICH2_DOWNLOAD_URL .tar.gz) $COMMON_DIR/download_and_verify.sh $MVAPICH2_DOWNLOAD_URL $MVAPICH2_SHA256 tar -xvf ${TARBALL} cd ${MVAPICH2_FOLDER} -./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes --with-pmix=${PMIX_PATH} && make -j$(nproc) && make install +./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes --with-pmix-libdir=${PMIX_PATH} && make -j$(nproc) && make install cd .. 
$COMMON_DIR/write_component_version.sh "MVAPICH2" ${MVAPICH2_VERSION} @@ -44,7 +44,7 @@ OMPI_FOLDER=$(basename $OMPI_DOWNLOAD_URL .tar.gz) $COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL $OMPI_SHA256 tar -xvf $TARBALL cd $OMPI_FOLDER -./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --with-pmix=${PMIX_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized +./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --with-pmix-libdir=${PMIX_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized make -j$(nproc) make install cd .. diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index 82800919..b39edf22 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -25,7 +25,7 @@ HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} $COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION # rebuild HPCX with PMIx -${HPCX_PATH}/utils/hpcx_rebuild.sh --with-hcoll --ompi-extra-config --with-pmix=${PMIX_PATH} +${HPCX_PATH}/utils/hpcx_rebuild.sh --with-hcoll --ompi-extra-config --with-pmix-libdir=${PMIX_PATH} # Install MVAPICH2 mvapich2_metadata=$(jq -r '.mvapich2."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) @@ -41,7 +41,7 @@ cd ${MVAPICH2_FOLDER} # Error exclusive to Ubuntu 22.04 # configure: error: The Fortran compiler gfortran will not compile files that call # the same routine with arguments of different types. 
-./configure $(if [[ ${DISTRIBUTION} == "ubuntu22.04" ]]; then echo "FFLAGS=-fallow-argument-mismatch"; fi) --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes --with-pmix=${PMIX_PATH} && make -j$(nproc) && make install +./configure $(if [[ ${DISTRIBUTION} == "ubuntu22.04" ]]; then echo "FFLAGS=-fallow-argument-mismatch"; fi) --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes --with-pmix-libdir=${PMIX_PATH} && make -j$(nproc) && make install cd .. $COMMON_DIR/write_component_version.sh "MVAPICH2" ${MVAPICH2_VERSION} @@ -56,7 +56,7 @@ OMPI_FOLDER=$(basename $OMPI_DOWNLOAD_URL .tar.gz) $COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL $OMPI_SHA256 tar -xvf $TARBALL cd $OMPI_FOLDER -./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --with-pmix=${PMIX_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install +./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --with-pmix-libdir=${PMIX_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install cd .. 
$COMMON_DIR/write_component_version.sh "OMPI" ${OMPI_VERSION} From bd7c1248d89304eda52f9b6027744ac6dd873bcd Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 1 Apr 2024 14:29:57 -0700 Subject: [PATCH 68/76] update pmix path --- alma/alma-8.x/alma-8.7-hpc/install_mpis.sh | 2 +- alma/common/install_mpis.sh | 2 +- ubuntu/common/install_mpis.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh index 711f6f7a..df20d8b1 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh @@ -20,7 +20,7 @@ TARBALL=$(basename $HPCX_DOWNLOAD_URL) HPCX_FOLDER=$(basename $HPCX_DOWNLOAD_URL .tbz) PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) -PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2} +PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2}/lib $COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL $HPCX_SHA256 tar -xvf ${TARBALL} diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index a3eabe9d..4387b1ef 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -9,7 +9,7 @@ PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS HCOLL_PATH=${HPCX_PATH}/hcoll UCX_PATH=${HPCX_PATH}/ucx INSTALL_PREFIX=/opt -PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2} +PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2}/lib # Load gcc export PATH=/opt/${GCC_VERSION}/bin:$PATH diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index b39edf22..51fd9030 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -8,7 +8,7 @@ set GCC=/usr/bin/gcc INSTALL_PREFIX=/opt PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) -PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2} +PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2}/lib # HPC-X v2.16 hpcx_metadata=$(jq 
-r '.hpcx."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) From 6efa1fbd895656fcfa2dd906ab2a135379c5a1b7 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 1 Apr 2024 14:47:03 -0700 Subject: [PATCH 69/76] don't remove files --- alma/alma-8.x/alma-8.7-hpc/install.sh | 8 ++++---- alma/common/install_mpis.sh | 4 ++-- ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/alma/alma-8.x/alma-8.7-hpc/install.sh b/alma/alma-8.x/alma-8.7-hpc/install.sh index fa309899..976ac199 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install.sh @@ -35,10 +35,10 @@ $ALMA_COMMON_DIR/../alma-8.x/common/install_pmix.sh ./install_intel_libs.sh # cleanup downloaded tarballs - clear some space -rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh -rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* -rm -rf /var/intel/ /var/cache/* -rm -Rf -- */ +#rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh +#rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* +#rm -rf /var/intel/ /var/cache/* +#rm -Rf -- */ # Install NCCL ./install_nccl.sh diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 4387b1ef..f66f68d1 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -127,5 +127,5 @@ ln -s /usr/share/Modules/modulefiles/mpi/openmpi-${OMPI_VERSION} /usr/share/Modu ln -s /usr/share/Modules/modulefiles/mpi/impi_${impi_2021_version} /usr/share/Modules/modulefiles/mpi/impi-2021 # cleanup downloaded tarballs and other installation files/folders -rm -rf *.tar.gz *offline.sh -rm -rf -- */ +# rm -rf *.tar.gz *offline.sh +# rm -rf -- */ diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh index 1a180eaf..e8a26408 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh @@ -32,10 +32,10 @@ $UBUNTU_COMMON_DIR/install_nccl.sh $UBUNTU_COMMON_DIR/install_docker.sh # 
cleanup downloaded tarballs - clear some space -rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh -rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* -rm -rf /var/intel/ /var/cache/* -rm -Rf -- */ +#rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh +#rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* +#rm -rf /var/intel/ /var/cache/* +#rm -Rf -- */ # Install DCGM $UBUNTU_COMMON_DIR/install_dcgm.sh From 7e47ab06c75b0980de7a60850b5d9168bcdf1ad0 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 1 Apr 2024 22:22:54 -0700 Subject: [PATCH 70/76] Revert "update pmix path" This reverts commit bd7c1248d89304eda52f9b6027744ac6dd873bcd. --- alma/alma-8.x/alma-8.7-hpc/install_mpis.sh | 2 +- alma/common/install_mpis.sh | 2 +- ubuntu/common/install_mpis.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh index df20d8b1..711f6f7a 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh @@ -20,7 +20,7 @@ TARBALL=$(basename $HPCX_DOWNLOAD_URL) HPCX_FOLDER=$(basename $HPCX_DOWNLOAD_URL .tbz) PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) -PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2}/lib +PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2} $COMMON_DIR/download_and_verify.sh $HPCX_DOWNLOAD_URL $HPCX_SHA256 tar -xvf ${TARBALL} diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index f66f68d1..675afd77 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -9,7 +9,7 @@ PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS HCOLL_PATH=${HPCX_PATH}/hcoll UCX_PATH=${HPCX_PATH}/ucx INSTALL_PREFIX=/opt -PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2}/lib +PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2} # Load gcc export PATH=/opt/${GCC_VERSION}/bin:$PATH diff --git a/ubuntu/common/install_mpis.sh 
b/ubuntu/common/install_mpis.sh index 51fd9030..b39edf22 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -8,7 +8,7 @@ set GCC=/usr/bin/gcc INSTALL_PREFIX=/opt PMIX_VERSION=$(jq -r '.pmix."'"$DISTRIBUTION"'".version' <<< $COMPONENT_VERSIONS) -PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2}/lib +PMIX_PATH=${INSTALL_PREFIX}/pmix/${PMIX_VERSION:0:-2} # HPC-X v2.16 hpcx_metadata=$(jq -r '.hpcx."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) From dd54c4dd6e78097a471980af82a8f7d9fb3bd380 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Mon, 1 Apr 2024 22:23:16 -0700 Subject: [PATCH 71/76] Revert "use pmix-libdir" This reverts commit 14af60cd919c7d835f812972a1f1563588a4f101. --- alma/alma-8.x/alma-8.7-hpc/install_mpis.sh | 2 +- alma/common/install_mpis.sh | 4 ++-- ubuntu/common/install_mpis.sh | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh index 711f6f7a..3cfecf2e 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install_mpis.sh @@ -31,7 +31,7 @@ HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} $COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION # rebuild HPCX with PMIx -${HPCX_PATH}/utils/hpcx_rebuild.sh --with-hcoll --ompi-extra-config --with-pmix-libdir=${PMIX_PATH} +${HPCX_PATH}/utils/hpcx_rebuild.sh --with-hcoll --ompi-extra-config --with-pmix=${PMIX_PATH} # exclude ucx from updates sed -i "$ s/$/ ucx*/" /etc/dnf/dnf.conf diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 675afd77..652ad58e 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -28,7 +28,7 @@ MVAPICH2_FOLDER=$(basename $MVAPICH2_DOWNLOAD_URL .tar.gz) $COMMON_DIR/download_and_verify.sh $MVAPICH2_DOWNLOAD_URL $MVAPICH2_SHA256 tar -xvf ${TARBALL} cd ${MVAPICH2_FOLDER} -./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none 
--enable-fast=yes --with-pmix-libdir=${PMIX_PATH} && make -j$(nproc) && make install +./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes --with-pmix=${PMIX_PATH} && make -j$(nproc) && make install cd .. $COMMON_DIR/write_component_version.sh "MVAPICH2" ${MVAPICH2_VERSION} @@ -44,7 +44,7 @@ OMPI_FOLDER=$(basename $OMPI_DOWNLOAD_URL .tar.gz) $COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL $OMPI_SHA256 tar -xvf $TARBALL cd $OMPI_FOLDER -./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --with-pmix-libdir=${PMIX_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized +./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --with-pmix=${PMIX_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized make -j$(nproc) make install cd .. diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index b39edf22..82800919 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -25,7 +25,7 @@ HPCX_PATH=${INSTALL_PREFIX}/${HPCX_FOLDER} $COMMON_DIR/write_component_version.sh "HPCX" $HPCX_VERSION # rebuild HPCX with PMIx -${HPCX_PATH}/utils/hpcx_rebuild.sh --with-hcoll --ompi-extra-config --with-pmix-libdir=${PMIX_PATH} +${HPCX_PATH}/utils/hpcx_rebuild.sh --with-hcoll --ompi-extra-config --with-pmix=${PMIX_PATH} # Install MVAPICH2 mvapich2_metadata=$(jq -r '.mvapich2."'"$DISTRIBUTION"'"' <<< $COMPONENT_VERSIONS) @@ -41,7 +41,7 @@ cd ${MVAPICH2_FOLDER} # Error exclusive to Ubuntu 22.04 # configure: error: The Fortran compiler gfortran will not compile files that call # the same routine with arguments of different types. 
-./configure $(if [[ ${DISTRIBUTION} == "ubuntu22.04" ]]; then echo "FFLAGS=-fallow-argument-mismatch"; fi) --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes --with-pmix-libdir=${PMIX_PATH} && make -j$(nproc) && make install +./configure $(if [[ ${DISTRIBUTION} == "ubuntu22.04" ]]; then echo "FFLAGS=-fallow-argument-mismatch"; fi) --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes --with-pmix=${PMIX_PATH} && make -j$(nproc) && make install cd .. $COMMON_DIR/write_component_version.sh "MVAPICH2" ${MVAPICH2_VERSION} @@ -56,7 +56,7 @@ OMPI_FOLDER=$(basename $OMPI_DOWNLOAD_URL .tar.gz) $COMMON_DIR/download_and_verify.sh $OMPI_DOWNLOAD_URL $OMPI_SHA256 tar -xvf $TARBALL cd $OMPI_FOLDER -./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --with-pmix-libdir=${PMIX_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install +./configure --prefix=${INSTALL_PREFIX}/openmpi-${OMPI_VERSION} --with-ucx=${UCX_PATH} --with-hcoll=${HCOLL_PATH} --with-pmix=${PMIX_PATH} --enable-mpirun-prefix-by-default --with-platform=contrib/platform/mellanox/optimized && make -j$(nproc) && make install cd .. $COMMON_DIR/write_component_version.sh "OMPI" ${OMPI_VERSION} From 27e228adae55b0f120bf3fdf07a8067181505605 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 2 Apr 2024 12:22:48 -0700 Subject: [PATCH 72/76] Revert "don't remove files" This reverts commit 6efa1fbd895656fcfa2dd906ab2a135379c5a1b7. 
--- alma/alma-8.x/alma-8.7-hpc/install.sh | 8 ++++---- alma/common/install_mpis.sh | 4 ++-- ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/alma/alma-8.x/alma-8.7-hpc/install.sh b/alma/alma-8.x/alma-8.7-hpc/install.sh index 976ac199..fa309899 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install.sh @@ -35,10 +35,10 @@ $ALMA_COMMON_DIR/../alma-8.x/common/install_pmix.sh ./install_intel_libs.sh # cleanup downloaded tarballs - clear some space -#rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh -#rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* -#rm -rf /var/intel/ /var/cache/* -#rm -Rf -- */ +rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh +rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* +rm -rf /var/intel/ /var/cache/* +rm -Rf -- */ # Install NCCL ./install_nccl.sh diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 652ad58e..a1cb59db 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -127,5 +127,5 @@ ln -s /usr/share/Modules/modulefiles/mpi/openmpi-${OMPI_VERSION} /usr/share/Modu ln -s /usr/share/Modules/modulefiles/mpi/impi_${impi_2021_version} /usr/share/Modules/modulefiles/mpi/impi-2021 # cleanup downloaded tarballs and other installation files/folders -# rm -rf *.tar.gz *offline.sh -# rm -rf -- */ +rm -rf *.tar.gz *offline.sh +rm -rf -- */ diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh index e8a26408..1a180eaf 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh @@ -32,10 +32,10 @@ $UBUNTU_COMMON_DIR/install_nccl.sh $UBUNTU_COMMON_DIR/install_docker.sh # cleanup downloaded tarballs - clear some space -#rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh -#rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* -#rm -rf /var/intel/ /var/cache/* -#rm -Rf -- */ +rm -rf *.tgz *.bz2 
*.tbz *.tar.gz *.run *.deb *_offline.sh +rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* +rm -rf /var/intel/ /var/cache/* +rm -Rf -- */ # Install DCGM $UBUNTU_COMMON_DIR/install_dcgm.sh From 95ebd1a91e50f0d0dccf24e818c67ebd82aea733 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 2 Apr 2024 12:24:27 -0700 Subject: [PATCH 73/76] Reapply "don't remove files" This reverts commit 27e228adae55b0f120bf3fdf07a8067181505605. --- alma/alma-8.x/alma-8.7-hpc/install.sh | 8 ++++---- alma/common/install_mpis.sh | 4 ++-- ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/alma/alma-8.x/alma-8.7-hpc/install.sh b/alma/alma-8.x/alma-8.7-hpc/install.sh index fa309899..976ac199 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install.sh @@ -35,10 +35,10 @@ $ALMA_COMMON_DIR/../alma-8.x/common/install_pmix.sh ./install_intel_libs.sh # cleanup downloaded tarballs - clear some space -rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh -rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* -rm -rf /var/intel/ /var/cache/* -rm -Rf -- */ +#rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh +#rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* +#rm -rf /var/intel/ /var/cache/* +#rm -Rf -- */ # Install NCCL ./install_nccl.sh diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index a1cb59db..652ad58e 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -127,5 +127,5 @@ ln -s /usr/share/Modules/modulefiles/mpi/openmpi-${OMPI_VERSION} /usr/share/Modu ln -s /usr/share/Modules/modulefiles/mpi/impi_${impi_2021_version} /usr/share/Modules/modulefiles/mpi/impi-2021 # cleanup downloaded tarballs and other installation files/folders -rm -rf *.tar.gz *offline.sh -rm -rf -- */ +# rm -rf *.tar.gz *offline.sh +# rm -rf -- */ diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh index 1a180eaf..e8a26408 100755 --- 
a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh @@ -32,10 +32,10 @@ $UBUNTU_COMMON_DIR/install_nccl.sh $UBUNTU_COMMON_DIR/install_docker.sh # cleanup downloaded tarballs - clear some space -rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh -rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* -rm -rf /var/intel/ /var/cache/* -rm -Rf -- */ +#rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh +#rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* +#rm -rf /var/intel/ /var/cache/* +#rm -Rf -- */ # Install DCGM $UBUNTU_COMMON_DIR/install_dcgm.sh From 54696a9cda990c3328a91052b3f7a9377d666961 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 2 Apr 2024 12:26:38 -0700 Subject: [PATCH 74/76] remove PMIX from mvapich2 --- alma/common/install_mpis.sh | 2 +- ubuntu/common/install_mpis.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 652ad58e..7aa2c5cb 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -28,7 +28,7 @@ MVAPICH2_FOLDER=$(basename $MVAPICH2_DOWNLOAD_URL .tar.gz) $COMMON_DIR/download_and_verify.sh $MVAPICH2_DOWNLOAD_URL $MVAPICH2_SHA256 tar -xvf ${TARBALL} cd ${MVAPICH2_FOLDER} -./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes --with-pmix=${PMIX_PATH} && make -j$(nproc) && make install +./configure --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes && make -j$(nproc) && make install cd .. 
$COMMON_DIR/write_component_version.sh "MVAPICH2" ${MVAPICH2_VERSION} diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh index 82800919..5e4187b8 100755 --- a/ubuntu/common/install_mpis.sh +++ b/ubuntu/common/install_mpis.sh @@ -41,7 +41,7 @@ cd ${MVAPICH2_FOLDER} # Error exclusive to Ubuntu 22.04 # configure: error: The Fortran compiler gfortran will not compile files that call # the same routine with arguments of different types. -./configure $(if [[ ${DISTRIBUTION} == "ubuntu22.04" ]]; then echo "FFLAGS=-fallow-argument-mismatch"; fi) --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes --with-pmix=${PMIX_PATH} && make -j$(nproc) && make install +./configure $(if [[ ${DISTRIBUTION} == "ubuntu22.04" ]]; then echo "FFLAGS=-fallow-argument-mismatch"; fi) --prefix=${INSTALL_PREFIX}/mvapich2-${MVAPICH2_VERSION} --enable-g=none --enable-fast=yes && make -j$(nproc) && make install cd .. $COMMON_DIR/write_component_version.sh "MVAPICH2" ${MVAPICH2_VERSION} From f34d85cfedfac84def37e054088032f98ca36f59 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 2 Apr 2024 15:22:43 -0700 Subject: [PATCH 75/76] Revert "Reapply "don't remove files"" This reverts commit 95ebd1a91e50f0d0dccf24e818c67ebd82aea733. 
--- alma/alma-8.x/alma-8.7-hpc/install.sh | 8 ++++---- alma/common/install_mpis.sh | 4 ++-- ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/alma/alma-8.x/alma-8.7-hpc/install.sh b/alma/alma-8.x/alma-8.7-hpc/install.sh index 976ac199..fa309899 100755 --- a/alma/alma-8.x/alma-8.7-hpc/install.sh +++ b/alma/alma-8.x/alma-8.7-hpc/install.sh @@ -35,10 +35,10 @@ $ALMA_COMMON_DIR/../alma-8.x/common/install_pmix.sh ./install_intel_libs.sh # cleanup downloaded tarballs - clear some space -#rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh -#rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* -#rm -rf /var/intel/ /var/cache/* -#rm -Rf -- */ +rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh +rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* +rm -rf /var/intel/ /var/cache/* +rm -Rf -- */ # Install NCCL ./install_nccl.sh diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh index 7aa2c5cb..9f8f5bfe 100755 --- a/alma/common/install_mpis.sh +++ b/alma/common/install_mpis.sh @@ -127,5 +127,5 @@ ln -s /usr/share/Modules/modulefiles/mpi/openmpi-${OMPI_VERSION} /usr/share/Modu ln -s /usr/share/Modules/modulefiles/mpi/impi_${impi_2021_version} /usr/share/Modules/modulefiles/mpi/impi-2021 # cleanup downloaded tarballs and other installation files/folders -# rm -rf *.tar.gz *offline.sh -# rm -rf -- */ +rm -rf *.tar.gz *offline.sh +rm -rf -- */ diff --git a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh index e8a26408..1a180eaf 100755 --- a/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh +++ b/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh @@ -32,10 +32,10 @@ $UBUNTU_COMMON_DIR/install_nccl.sh $UBUNTU_COMMON_DIR/install_docker.sh # cleanup downloaded tarballs - clear some space -#rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh -#rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* -#rm -rf /var/intel/ /var/cache/* -#rm -Rf -- */ +rm -rf *.tgz *.bz2 
*.tbz *.tar.gz *.run *.deb *_offline.sh +rm -rf /tmp/MLNX_OFED_LINUX* /tmp/*conf* +rm -rf /var/intel/ /var/cache/* +rm -Rf -- */ # Install DCGM $UBUNTU_COMMON_DIR/install_dcgm.sh From b953e8c316807f37a0a4145d47dc916db1843514 Mon Sep 17 00:00:00 2001 From: Matt Fraser Date: Tue, 2 Apr 2024 15:24:01 -0700 Subject: [PATCH 76/76] remove commented out lines --- customizations/ndv4.sh | 3 --- customizations/ndv5.sh | 3 --- 2 files changed, 6 deletions(-) diff --git a/customizations/ndv4.sh b/customizations/ndv4.sh index bd9c45ca..5c00213c 100755 --- a/customizations/ndv4.sh +++ b/customizations/ndv4.sh @@ -15,9 +15,6 @@ EOF ## NVIDIA Fabric manager systemctl enable nvidia-fabricmanager -# systemctl stop nvidia-fabricmanager -# systemctl start nvidia-fabricmanager -# systemctl is-active --quiet nvidia-fabricmanager error_code=$? if [ ${error_code} -ne 0 ] diff --git a/customizations/ndv5.sh b/customizations/ndv5.sh index 68dc0fdd..028d87cf 100755 --- a/customizations/ndv5.sh +++ b/customizations/ndv5.sh @@ -15,9 +15,6 @@ EOF ## NVIDIA Fabric manager systemctl enable nvidia-fabricmanager -# systemctl stop nvidia-fabricmanager -# systemctl start nvidia-fabricmanager -# systemctl is-active --quiet nvidia-fabricmanager error_code=$? if [ ${error_code} -ne 0 ]