diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index ec018bd049..dc18fd584a 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -35,7 +35,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: persist-credentials: false diff --git a/.github/workflows/test-software.eessi.io.yml b/.github/workflows/test-software.eessi.io.yml index 6756bd8854..8cfb023bc6 100644 --- a/.github/workflows/test-software.eessi.io.yml +++ b/.github/workflows/test-software.eessi.io.yml @@ -4,7 +4,7 @@ on: [push, pull_request, workflow_dispatch] permissions: contents: read # to fetch code (actions/checkout) jobs: - pilot: + check_missing: runs-on: ubuntu-22.04 strategy: fail-fast: false @@ -22,10 +22,10 @@ jobs: - x86_64/generic steps: - name: Check out software-layer repository - uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Mount EESSI CernVM-FS pilot repository - uses: cvmfs-contrib/github-action-cvmfs@d4641d0d591c9a5c3be23835ced2fb648b44c04b # v3.1 + uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c # v4.0 with: cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb cvmfs_http_proxy: DIRECT @@ -35,6 +35,9 @@ jobs: run: | export EESSI_SOFTWARE_SUBDIR_OVERRIDE=${{matrix.EESSI_SOFTWARE_SUBDIR_OVERRIDE}} source /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/init/bash + # set $EESSI_CPU_FAMILY to the CPU architecture that corresponds to $EESSI_SOFTWARE_SUBDIR_OVERRIDE (part before the first slash), + # to prevent issues with checks in the Easybuild configuration that use this variable + export EESSI_CPU_FAMILY=${EESSI_SOFTWARE_SUBDIR_OVERRIDE%%/*} module load EasyBuild which eb eb --version @@ -53,6 +56,9 @@ jobs: run: | export EESSI_SOFTWARE_SUBDIR_OVERRIDE=${{matrix.EESSI_SOFTWARE_SUBDIR_OVERRIDE}} source /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/init/bash + # set $EESSI_CPU_FAMILY to the CPU architecture that corresponds to $EESSI_SOFTWARE_SUBDIR_OVERRIDE (part before the first slash), + # to prevent issues with checks in the Easybuild configuration that use this variable + export EESSI_CPU_FAMILY=${EESSI_SOFTWARE_SUBDIR_OVERRIDE%%/*} module load EasyBuild which eb eb --version diff --git a/.github/workflows/test_eessi_container_script.yml b/.github/workflows/test_eessi_container_script.yml index 33122e6ff4..32120d0087 100644 --- a/.github/workflows/test_eessi_container_script.yml +++ b/.github/workflows/test_eessi_container_script.yml @@ -22,7 +22,7 @@ jobs: #- save steps: - name: Check out software-layer repository - uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: install Apptainer run: | diff --git a/.github/workflows/test_licenses.yml b/.github/workflows/test_licenses.yml new file mode 100644 index 0000000000..3b9675d523 --- /dev/null +++ b/.github/workflows/test_licenses.yml @@ -0,0 +1,20 @@ +# documentation: https://help.github.com/en/articles/workflow-syntax-for-github-actions +name: Test software licenses +on: [push, pull_request] +permissions: + contents: read # to fetch code (actions/checkout) +jobs: + build: + runs-on: ubuntu-20.04 + steps: + - name: 
Check out software-layer repository + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - name: set up Python + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + with: + python-version: '3.9' + + - name: Check software licenses + run: | + python licenses/spdx.py licenses/licenses.json diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index cc00685a40..8e74a4e844 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,10 +12,10 @@ jobs: fail-fast: false steps: - name: checkout - uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: set up Python - uses: actions/setup-python@13ae5bb136fac2878aff31522b9efb785519f984 # v4.3.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: ${{matrix.python}} architecture: x64 diff --git a/.github/workflows/tests_archdetect.yml b/.github/workflows/tests_archdetect.yml index 1e8b830e14..922c9a1bf0 100644 --- a/.github/workflows/tests_archdetect.yml +++ b/.github/workflows/tests_archdetect.yml @@ -22,10 +22,10 @@ jobs: fail-fast: false steps: - name: checkout - uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Mount EESSI CernVM-FS pilot repository - uses: cvmfs-contrib/github-action-cvmfs@d4641d0d591c9a5c3be23835ced2fb648b44c04b # v3.1 + uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c # v4.0 with: cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb cvmfs_http_proxy: DIRECT diff --git a/.github/workflows/tests_init.yml b/.github/workflows/tests_init.yml index 417b7851f1..38ccbbad31 100644 --- a/.github/workflows/tests_init.yml +++ b/.github/workflows/tests_init.yml @@ -12,10 +12,10 @@ jobs: fail-fast: false steps: - name: checkout - uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: set up Python - uses: actions/setup-python@13ae5bb136fac2878aff31522b9efb785519f984 # v4.3.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: ${{matrix.python}} architecture: x64 diff --git a/.github/workflows/tests_readme.yml b/.github/workflows/tests_readme.yml index d229879f67..efdb796e5e 100644 --- a/.github/workflows/tests_readme.yml +++ b/.github/workflows/tests_readme.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Check out software-layer repository - uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: verify if README.md is consistent with EESSI_VERSION from init/eessi_defaults run: | diff --git a/.github/workflows/tests_scripts.yml b/.github/workflows/tests_scripts.yml index a369f4f187..df1884dd8c 100644 --- a/.github/workflows/tests_scripts.yml +++ b/.github/workflows/tests_scripts.yml @@ -29,7 +29,7 @@ jobs: runs-on: ubuntu-20.04 steps: - name: checkout - uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: install Apptainer run: | diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index f6087b3cfe..69de9d1997 100755 --- 
a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -187,49 +187,70 @@ fi # assume there's only one diff file that corresponds to the PR patch file pr_diff=$(ls [0-9]*.diff | head -1) -# use PR patch file to determine in which easystack files stuff was added -for easystack_file in $(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing'); do - - echo -e "Processing easystack file ${easystack_file}...\n\n" - - # determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file - eb_version=$(echo ${easystack_file} | sed 's/.*eb-\([0-9.]*\).*/\1/g') - - # load EasyBuild module (will be installed if it's not available yet) - source ${TOPDIR}/load_easybuild_module.sh ${eb_version} +# install any additional required scripts +# order is important: these are needed to install a full CUDA SDK in host_injections +# for now, this just reinstalls all scripts. Not the most elegant, but works +${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX} - ${EB} --show-config +# Install full CUDA SDK in host_injections +# Hardcode this for now, see if it works +# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install +${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula - echo_green "All set, let's start installing some software with EasyBuild v${eb_version} in ${EASYBUILD_INSTALLPATH}..." +# Install drivers in host_injections +# TODO: this is commented out for now, because the script assumes that nvidia-smi is available and works; +# if not, an error is produced, and the bot flags the whole build as failed (even when not installing GPU software) +# ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh - if [ -f ${easystack_file} ]; then - echo_green "Feeding easystack file ${easystack_file} to EasyBuild..." - - ${EB} --easystack ${TOPDIR}/${easystack_file} --robot - ec=$? - - # copy EasyBuild log file if EasyBuild exited with an error - if [ ${ec} -ne 0 ]; then - eb_last_log=$(unset EB_VERBOSE; eb --last-log) - # copy to current working directory - cp -a ${eb_last_log} . - echo "Last EasyBuild log file copied from ${eb_last_log} to ${PWD}" - # copy to build logs dir (with context added) - copy_build_log "${eb_last_log}" "${build_logs_dir}" +# use PR patch file to determine in which easystack files stuff was added +changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing') +if [ -z ${changed_easystacks} ]; then + echo "No missing installations, party time!" # Ensure the bot reports success, as there was nothing to be built here +else + for easystack_file in ${changed_easystacks}; do + + echo -e "Processing easystack file ${easystack_file}...\n\n" + + # determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file + eb_version=$(echo ${easystack_file} | sed 's/.*eb-\([0-9.]*\).*/\1/g') + + # load EasyBuild module (will be installed if it's not available yet) + source ${TOPDIR}/load_easybuild_module.sh ${eb_version} + + ${EB} --show-config + + echo_green "All set, let's start installing some software with EasyBuild v${eb_version} in ${EASYBUILD_INSTALLPATH}..." + + if [ -f ${easystack_file} ]; then + echo_green "Feeding easystack file ${easystack_file} to EasyBuild..." + + ${EB} --easystack ${TOPDIR}/${easystack_file} --robot + ec=$? 
+ + # copy EasyBuild log file if EasyBuild exited with an error + if [ ${ec} -ne 0 ]; then + eb_last_log=$(unset EB_VERBOSE; eb --last-log) + # copy to current working directory + cp -a ${eb_last_log} . + echo "Last EasyBuild log file copied from ${eb_last_log} to ${PWD}" + # copy to build logs dir (with context added) + copy_build_log "${eb_last_log}" "${build_logs_dir}" + fi + + $TOPDIR/check_missing_installations.sh ${TOPDIR}/${easystack_file} + else + fatal_error "Easystack file ${easystack_file} not found!" fi - - $TOPDIR/check_missing_installations.sh ${TOPDIR}/${easystack_file} - else - fatal_error "Easystack file ${easystack_file} not found!" - fi - -done + + done +fi ### add packages here echo ">> Creating/updating Lmod cache..." export LMOD_RC="${EASYBUILD_INSTALLPATH}/.lmod/lmodrc.lua" -if [ ! -f $LMOD_RC ]; then +lmodrc_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodrc.py$' > /dev/null; echo $?) +if [ ! -f $LMOD_RC ] || [ ${lmodrc_changed} == '0' ]; then python3 $TOPDIR/create_lmodrc.py ${EASYBUILD_INSTALLPATH} check_exit_code $? "$LMOD_RC created" "Failed to create $LMOD_RC" fi diff --git a/bot/build.sh b/bot/build.sh index 4af217628e..1622e757e2 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -176,6 +176,11 @@ mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR} declare -a BUILD_STEP_ARGS=() BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}") BUILD_STEP_ARGS+=("--storage" "${STORAGE}") +# add options required to handle NVIDIA support +BUILD_STEP_ARGS+=("--nvidia" "all") +if [[ ! -z ${SHARED_FS_PATH} ]]; then + BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections") +fi # prepare arguments to install_software_layer.sh (specific to build step) declare -a INSTALL_SCRIPT_ARGS=() diff --git a/create_lmodrc.py b/create_lmodrc.py index ae65153a20..0e738a530e 100755 --- a/create_lmodrc.py +++ b/create_lmodrc.py @@ -17,6 +17,85 @@ } """ +GPU_LMOD_RC ="""require("strict") +local hook = require("Hook") +local open = io.open + +local function read_file(path) + local file = open(path, "rb") -- r read mode and b binary mode + if not file then return nil end + local content = file:read "*a" -- *a or *all reads the whole file + file:close() + return content +end + +local function cuda_enabled_load_hook(t) + local frameStk = require("FrameStk"):singleton() + local mt = frameStk:mt() + local simpleName = string.match(t.modFullName, "(.-)/") + -- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections. + -- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse + -- to load the CUDA module and print an informative message on how to set up GPU support for EESSI + local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n" + if simpleName == 'CUDA' then + -- get the full host_injections path + local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') + -- build final path where the CUDA software should be installed + local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" + local cudaDirExists = isDir(cudaEasyBuildDir) + if not cudaDirExists then + local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " + advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where EESSI " + advice = advice .. "can find it.\\n" + advice = advice .. 
refer_to_docs + LmodError("\\nYou requested to load ", simpleName, " ", advice) + end + end + -- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker, + -- otherwise, refuse to load the requested module and print error message + local haveGpu = mt:haveProperty(simpleName,"arch","gpu") + if haveGpu then + local arch = os.getenv("EESSI_CPU_FAMILY") or "" + local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt" + local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so" + local cudaDriverExists = isFile(cudaDriverFile) + local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") + if not (cudaDriverExists or singularityCudaExists) then + local advice = "which relies on the CUDA runtime environment and driver libraries. " + advice = advice .. "In order to be able to use the module, you will need " + advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system.\\n" + advice = advice .. refer_to_docs + LmodError("\\nYou requested to load ", simpleName, " ", advice) + else + -- CUDA driver exists, now we check its version to see if an update is needed + if cudaDriverExists then + local cudaVersion = read_file(cudaVersionFile) + local cudaVersion_req = os.getenv("EESSICUDAVERSION") + -- driver CUDA versions don't give a patch version for CUDA + local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)") + local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)") + local driver_libs_need_update = false + if major < major_req then + driver_libs_need_update = true + elseif major == major_req then + if minor < minor_req then + driver_libs_need_update = true + end + end + if driver_libs_need_update == true then + local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " + advice = advice .. "Please update your CUDA driver libraries and then " + advice = advice .. "let EESSI know about the update.\\n" + advice = advice .. refer_to_docs + LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice) + end + end + end + end +end + +hook.register("load", cuda_enabled_load_hook) +""" def error(msg): sys.stderr.write("ERROR: %s\n" % msg) @@ -36,6 +115,7 @@ def error(msg): 'dot_lmod': DOT_LMOD, 'prefix': prefix, } +lmodrc_txt += '\n' + GPU_LMOD_RC try: os.makedirs(os.path.dirname(lmodrc_path), exist_ok=True) with open(lmodrc_path, 'w') as fp: diff --git a/create_tarball.sh b/create_tarball.sh index 8510caebf1..a619df9439 100755 --- a/create_tarball.sh +++ b/create_tarball.sh @@ -43,8 +43,14 @@ module_files_list=${tmpdir}/module_files.list.txt if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/.lmod ]; then # include Lmod cache and configuration file (lmodrc.lua), # skip whiteout files and backup copies of Lmod cache (spiderT.old.*) - find ${eessi_version}/software/${os}/${cpu_arch_subdir}/.lmod -type f | egrep -v '/\.wh\.|spiderT.old' > ${files_list} + find ${eessi_version}/software/${os}/${cpu_arch_subdir}/.lmod -type f | egrep -v '/\.wh\.|spiderT.old' >> ${files_list} fi + +# include scripts that were copied by install_scripts.sh, which we want to ship in EESSI repository +if [ -d ${eessi_version}/scripts ]; then + find ${eessi_version}/scripts -type f | grep -v '/\.wh\.' 
>> ${files_list} +fi + if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/modules ]; then # module files find ${eessi_version}/software/${os}/${cpu_arch_subdir}/modules -type f | grep -v '/\.wh\.' >> ${files_list} @@ -55,6 +61,7 @@ if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/modules ]; then | grep -v '/\.wh\.' | grep -v '/\.modulerc\.lua' | sed -e 's/.lua$//' | sed -e 's@.*/modules/all/@@g' | sort -u \ >> ${module_files_list} fi + if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/software -a -r ${module_files_list} ]; then # installation directories but only those for which module files were created # Note, we assume that module names (as defined by 'PACKAGE_NAME/VERSION.lua' diff --git a/easystacks/software.eessi.io/2023.06/README.md b/easystacks/software.eessi.io/2023.06/README.md new file mode 100644 index 0000000000..733ebf9475 --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/README.md @@ -0,0 +1,7 @@ +File naming matters, since it determines the order in which easystack files are processed. + +Software installed with system toolchain should be installed first, +this includes EasyBuild itself, see `eessi-2023.06-eb-4.8.2-001-system.yml` . + +CUDA installations must be done before CUDA is required as dependency for something +built with a non-system toolchain, see `eessi-2023.06-eb-4.8.2-010-CUDA.yml` . diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-system.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-001-system.yml similarity index 54% rename from easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-system.yml rename to easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-001-system.yml index 5ce6a65913..f02b9f2802 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-system.yml +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-001-system.yml @@ -2,3 +2,6 @@ easyconfigs: - EasyBuild-4.8.2.eb: options: from-pr: 19105 + - Nextflow-23.10.0.eb: + options: + from-pr: 19172 diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-010-CUDA.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-010-CUDA.yml new file mode 100644 index 0000000000..dda274b8db --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-010-CUDA.yml @@ -0,0 +1,5 @@ +easyconfigs: + - CUDA-12.1.1.eb: + options: + include-easyblocks-from-pr: 3045 + accept-eula-for: CUDA diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2022b.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2022b.yml new file mode 100644 index 0000000000..fd88fafb0c --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2022b.yml @@ -0,0 +1,7 @@ +easyconfigs: + - foss-2022b.eb + - HarfBuzz-5.3.1-GCCcore-12.2.0.eb: + options: + from-pr: 19339 + - Qt5-5.15.7-GCCcore-12.2.0.eb + - QuantumESPRESSO-7.2-foss-2022b.eb diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml index 2a3ce50fde..912c54101e 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml @@ -12,8 +12,41 @@ easyconfigs: options: from-pr: 19270 - SciPy-bundle-2023.07-gfbf-2023a.eb - - GROMACS-2023.3-foss-2023a.eb: - # pull in easyconfig file not included yet with EasyBuild v4.8.2; - # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19321 
+ - TensorFlow-2.13.0-foss-2023a.eb: + # patch setup.py for grpcio extension in TensorFlow 2.13.0 easyconfigs to take into account alternate sysroot; + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19268 options: - from-pr: 19321 + from-pr: 19268 + - X11-20230603-GCCcore-12.3.0.eb + - HarfBuzz-5.3.1-GCCcore-12.3.0.eb: + options: + from-pr: 19339 + - Qt5-5.15.10-GCCcore-12.3.0.eb + - OSU-Micro-Benchmarks-7.1-1-gompi-2023a.eb + - LHAPDF-6.5.4-GCC-12.3.0.eb: + options: + from-pr: 19363 + - LoopTools-2.15-GCC-12.3.0.eb: + options: + from-pr: 19397 + - R-4.3.2-gfbf-2023a.eb: + options: + from-pr: 19185 + - Boost-1.82.0-GCC-12.3.0.eb + - netCDF-4.9.2-gompi-2023a.eb + - FFmpeg-6.0-GCCcore-12.3.0.eb + - CUDA-Samples-12.1-GCC-12.3.0-CUDA-12.1.1.eb: + # use easyconfig that only install subset of CUDA samples, + # to circumvent problem with nvcc linking to glibc of host OS, + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19189; + # and where additional samples are excluded because they fail to build on aarch64, + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19451; + options: + from-pr: 19451 + - ALL-0.9.2-foss-2023a.eb: + options: + from-pr: 19455 + - CDO-2.2.2-gompi-2023a.eb: + options: + from-pr: 19735 +>>>>>>> 2023.06-software.eessi.io diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.0-001-system.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.0-001-system.yml new file mode 100644 index 0000000000..25c13e49c9 --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.0-001-system.yml @@ -0,0 +1,5 @@ +easyconfigs: + - EasyBuild-4.9.0.eb: + options: + from-pr: 19464 + - ReFrame-4.3.3.eb diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.0-2022b.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.0-2022b.yml new file mode 100644 index 0000000000..fdacd95c55 --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.0-2022b.yml @@ -0,0 +1,2 @@ +easyconfigs: + - SciPy-bundle-2023.02-gfbf-2022b.eb diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.0-2023a.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.0-2023a.yml new file mode 100644 index 0000000000..72da7c8826 --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.0-2023a.yml @@ -0,0 +1,18 @@ +easyconfigs: + - OpenFOAM-11-foss-2023a.eb: + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19545 + options: + from-pr: 19545 + - at-spi2-core-2.49.91-GCCcore-12.3.0.eb + - ESPResSo-4.2.1-foss-2023a.eb: + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19592 + options: + from-pr: 19592 + - Rivet-3.1.9-gompi-2023a-HepMC3-3.2.6.eb: + options: + from-pr: 19679 + - PyTorch-2.1.2-foss-2023a.eb: + options: + from-pr: 19573 + - scikit-learn-1.3.1-gfbf-2023a.eb + - GROMACS-2023.3-foss-2023a.eb diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.0-2023b.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.0-2023b.yml new file mode 100644 index 0000000000..4dd31dbd5d --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.0-2023b.yml @@ -0,0 +1,10 @@ +easyconfigs: + - GCC-13.2.0.eb + - foss-2023b.eb + - SciPy-bundle-2023.11-gfbf-2023b.eb + - netCDF-4.9.2-gompi-2023b.eb: + options: + from-pr: 19534 + - matplotlib-3.8.2-gfbf-2023b.eb: + options: + from-pr: 19552 diff --git a/eb_hooks.py b/eb_hooks.py index 3d0e63f4c2..9a122c18ca 100644 --- a/eb_hooks.py +++ 
b/eb_hooks.py @@ -3,11 +3,12 @@ import os import re +import easybuild.tools.environment as env from easybuild.easyblocks.generic.configuremake import obtain_config_guess from easybuild.framework.easyconfig.constants import EASYCONFIG_CONSTANTS from easybuild.tools.build_log import EasyBuildError, print_msg from easybuild.tools.config import build_option, update_build_option -from easybuild.tools.filetools import apply_regex_substitutions, copy_file, which +from easybuild.tools.filetools import apply_regex_substitutions, copy_file, remove_file, symlink, which from easybuild.tools.run import run_cmd from easybuild.tools.systemtools import AARCH64, POWER, X86_64, get_cpu_architecture, get_cpu_features from easybuild.tools.toolchain.compiler import OPTARCH_GENERIC @@ -68,6 +69,24 @@ def parse_hook(ec, *args, **kwargs): if ec.name in PARSE_HOOKS: PARSE_HOOKS[ec.name](ec, eprefix) + # inject the GPU property (if required) + ec = inject_gpu_property(ec) + + +def post_ready_hook(self, *args, **kwargs): + """ + Post-ready hook: limit parallellism for selected builds, because they require a lot of memory per used core. + """ + # 'parallel' easyconfig parameter is set via EasyBlock.set_parallel in ready step based on available cores. + # here we reduce parallellism to only use half of that for selected software, + # to avoid failing builds/tests due to out-of-memory problems + if self.name in ['TensorFlow', 'libxc']: + parallel = self.cfg['parallel'] + if parallel > 1: + self.cfg['parallel'] = parallel // 2 + msg = "limiting parallelism to %s (was %s) for %s to avoid out-of-memory failures during building/testing" + print_msg(msg % (self.cfg['parallel'], parallel, self.name), log=self.log) + def pre_prepare_hook(self, *args, **kwargs): """Main pre-prepare hook: trigger custom functions.""" @@ -166,20 +185,21 @@ def parse_hook_fontconfig_add_fonts(ec, eprefix): def parse_hook_openblas_relax_lapack_tests_num_errors(ec, eprefix): - """Relax number of failing numerical LAPACK tests for aarch64/neoverse_v1 CPU target.""" + """Relax number of failing numerical LAPACK tests for aarch64/neoverse_v1 CPU target for OpenBLAS < 0.3.23""" cpu_target = get_eessi_envvar('EESSI_SOFTWARE_SUBDIR') if ec.name == 'OpenBLAS': - # relax maximum number of failed numerical LAPACK tests for aarch64/neoverse_v1 CPU target - # since the default setting of 150 that works well on other aarch64 targets and x86_64 is a bit too strict - # See https://github.com/EESSI/software-layer/issues/314 - cfg_option = 'max_failing_lapack_tests_num_errors' - if cpu_target == CPU_TARGET_NEOVERSE_V1: - orig_value = ec[cfg_option] - ec[cfg_option] = 400 - print_msg("Maximum number of failing LAPACK tests with numerical errors for %s relaxed to %s (was %s)", - ec.name, ec[cfg_option], orig_value) - else: - print_msg("Not changing option %s for %s on non-AARCH64", cfg_option, ec.name) + if LooseVersion(ec.version) < LooseVersion('0.3.23'): + # relax maximum number of failed numerical LAPACK tests for aarch64/neoverse_v1 CPU target + # since the default setting of 150 that works well on other aarch64 targets and x86_64 is a bit too strict + # See https://github.com/EESSI/software-layer/issues/314 + cfg_option = 'max_failing_lapack_tests_num_errors' + if cpu_target == CPU_TARGET_NEOVERSE_V1: + orig_value = ec[cfg_option] + ec[cfg_option] = 400 + print_msg("Maximum number of failing LAPACK tests with numerical errors for %s relaxed to %s (was %s)", + ec.name, ec[cfg_option], orig_value) + else: + print_msg("Not changing option %s for %s on 
non-AARCH64", cfg_option, ec.name) else: raise EasyBuildError("OpenBLAS-specific hook triggered for non-OpenBLAS easyconfig?!") @@ -240,6 +260,13 @@ def pre_configure_hook_openblas_optarch_generic(self, *args, **kwargs): if build_option('optarch') == OPTARCH_GENERIC: for step in ('build', 'test', 'install'): self.cfg.update(f'{step}opts', "DYNAMIC_ARCH=1") + + # use -mtune=generic rather than -mcpu=generic in $CFLAGS on aarch64, + # because -mcpu=generic implies a particular -march=armv* which clashes with those used by OpenBLAS + # when building with DYNAMIC_ARCH=1 + if get_cpu_architecture() == AARCH64: + cflags = os.getenv('CFLAGS').replace('-mcpu=generic', '-mtune=generic') + env.setvar('CFLAGS', cflags) else: raise EasyBuildError("OpenBLAS-specific hook triggered for non-OpenBLAS easyconfig?!") @@ -309,6 +336,21 @@ def pre_configure_hook_LAMMPS_aarch64(self, *args, **kwargs): raise EasyBuildError("LAMMPS-specific hook triggered for non-LAMMPS easyconfig?!") +def pre_configure_hook_atspi2core_filter_ld_library_path(self, *args, **kwargs): + """ + pre-configure hook for at-spi2-core: + - instruct GObject-Introspection's g-ir-scanner tool to not set $LD_LIBRARY_PATH + when EasyBuild is configured to filter it, see: + https://github.com/EESSI/software-layer/issues/196 + """ + if self.name == 'at-spi2-core': + if build_option('filter_env_vars') and 'LD_LIBRARY_PATH' in build_option('filter_env_vars'): + sed_cmd = 'sed -i "s/gir_extra_args = \[/gir_extra_args = \[\\n \'--lib-dirs-envvar=FILTER_LD_LIBRARY_PATH\',/g" %(start_dir)s/atspi/meson.build && ' + self.cfg.update('preconfigopts', sed_cmd) + else: + raise EasyBuildError("at-spi2-core-specific hook triggered for non-at-spi2-core easyconfig?!") + + def pre_test_hook(self,*args, **kwargs): """Main pre-test hook: trigger custom functions based on software name.""" if self.name in PRE_TEST_HOOKS: @@ -341,23 +383,84 @@ def pre_test_hook_ignore_failing_tests_SciPybundle(self, *args, **kwargs): FAILED optimize/tests/test_linprog.py::TestLinprogIPSparse::test_bug_6139 - A... FAILED optimize/tests/test_linprog.py::TestLinprogIPSparsePresolve::test_bug_6139 = 2 failed, 30554 passed, 2064 skipped, 10992 deselected, 76 xfailed, 7 xpassed, 40 warnings in 380.27s (0:06:20) = - In versions 2023.07, 2 failing tests in scipy 1.11.1: + In versions 2023.02, 2023.07, and 2023.11, 2 failing tests in scipy (versions 1.10.1, 1.11.1, 1.11.4): FAILED scipy/spatial/tests/test_distance.py::TestPdist::test_pdist_correlation_iris FAILED scipy/spatial/tests/test_distance.py::TestPdist::test_pdist_correlation_iris_float32 = 2 failed, 54409 passed, 3016 skipped, 223 xfailed, 13 xpassed, 10917 warnings in 892.04s (0:14:52) = In previous versions we were not as strict yet on the numpy/SciPy tests """ cpu_target = get_eessi_envvar('EESSI_SOFTWARE_SUBDIR') - if self.name == 'SciPy-bundle' and self.version in ['2021.10', '2023.07'] and cpu_target == CPU_TARGET_NEOVERSE_V1: + scipy_bundle_versions = ('2021.10', '2023.02', '2023.07', '2023.11') + if self.name == 'SciPy-bundle' and self.version in scipy_bundle_versions and cpu_target == CPU_TARGET_NEOVERSE_V1: + self.cfg['testopts'] = "|| echo ignoring failing tests" + +def pre_test_hook_ignore_failing_tests_netCDF(self, *args, **kwargs): + """ + Pre-test hook for netCDF: skip failing tests for selected netCDF versions on neoverse_v1 + cfr. 
https://github.com/EESSI/software-layer/issues/425 + The following tests are problematic: + 163 - nc_test4_run_par_test (Timeout) + 190 - h5_test_run_par_tests (Timeout) + A few other tests are skipped in the easyconfig and patches for similar issues, see above issue for details. + """ + cpu_target = get_eessi_envvar('EESSI_SOFTWARE_SUBDIR') + if self.name == 'netCDF' and self.version == '4.9.2' and cpu_target == CPU_TARGET_NEOVERSE_V1: self.cfg['testopts'] = "|| echo ignoring failing tests" +def pre_test_hook_increase_max_failed_tests_arm_PyTorch(self, *args, **kwargs): + """ + Pre-test hook for PyTorch: increase max failing tests for ARM for PyTorch 2.1.2 + See https://github.com/EESSI/software-layer/pull/444#issuecomment-1890416171 + """ + if self.name == 'PyTorch' and self.version == '2.1.2' and get_cpu_architecture() == AARCH64: + self.cfg['max_failed_tests'] = 10 + def pre_single_extension_hook(ext, *args, **kwargs): - """Main pre-configure hook: trigger custom functions based on software name.""" + """Main pre-extension: trigger custom functions based on software name.""" if ext.name in PRE_SINGLE_EXTENSION_HOOKS: PRE_SINGLE_EXTENSION_HOOKS[ext.name](ext, *args, **kwargs) +def post_single_extension_hook(ext, *args, **kwargs): + """Main post-extension hook: trigger custom functions based on software name.""" + if ext.name in POST_SINGLE_EXTENSION_HOOKS: + POST_SINGLE_EXTENSION_HOOKS[ext.name](ext, *args, **kwargs) + + +def pre_single_extension_isoband(ext, *args, **kwargs): + """ + Pre-extension hook for isoband R package, to fix build on top of recent glibc. + """ + if ext.name == 'isoband' and LooseVersion(ext.version) < LooseVersion('0.2.5'): + # use constant value instead of SIGSTKSZ for stack size in vendored testthat included in isoband sources, + # cfr. https://github.com/r-lib/isoband/commit/6984e6ce8d977f06e0b5ff73f5d88e5c9a44c027 + ext.cfg['preinstallopts'] = "sed -i 's/SIGSTKSZ/32768/g' src/testthat/vendor/catch.h && " + + +def pre_single_extension_numpy(ext, *args, **kwargs): + """ + Pre-extension hook for numpy, to change -march=native to -march=armv8.4-a for numpy 1.24.2 + when building for aarch64/neoverse_v1 CPU target. + """ + cpu_target = get_eessi_envvar('EESSI_SOFTWARE_SUBDIR') + if ext.name == 'numpy' and ext.version == '1.24.2' and cpu_target == CPU_TARGET_NEOVERSE_V1: + # note: this hook is called before build environment is set up (by calling toolchain.prepare()), + # so environment variables like $CFLAGS are not defined yet + # unsure which of these actually matter for numpy, so changing all of them + ext.orig_optarch = build_option('optarch') + update_build_option('optarch', 'march=armv8.4-a') + + +def post_single_extension_numpy(ext, *args, **kwargs): + """ + Post-extension hook for numpy, to reset 'optarch' build option. + """ + cpu_target = get_eessi_envvar('EESSI_SOFTWARE_SUBDIR') + if ext.name == 'numpy' and ext.version == '1.24.2' and cpu_target == CPU_TARGET_NEOVERSE_V1: + update_build_option('optarch', ext.orig_optarch) + + def pre_single_extension_testthat(ext, *args, **kwargs): """ Pre-extension hook for testthat R package, to fix build on top of recent glibc. 
@@ -368,14 +471,102 @@ def pre_single_extension_testthat(ext, *args, **kwargs): ext.cfg['preinstallopts'] = "sed -i 's/SIGSTKSZ/32768/g' inst/include/testthat/vendor/catch.h && " -def pre_single_extension_isoband(ext, *args, **kwargs): +def post_sanitycheck_hook(self, *args, **kwargs): + """Main post-sanity-check hook: trigger custom functions based on software name.""" + if self.name in POST_SANITYCHECK_HOOKS: + POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs) + + +def post_sanitycheck_cuda(self, *args, **kwargs): """ - Pre-extension hook for isoband R package, to fix build on top of recent glibc. + Remove files from CUDA installation that we are not allowed to ship, + and replace them with a symlink to a corresponding installation under host_injections. """ - if ext.name == 'isoband' and LooseVersion(ext.version) < LooseVersion('0.2.5'): - # use constant value instead of SIGSTKSZ for stack size in vendored testthat included in isoband sources, - # cfr. https://github.com/r-lib/isoband/commit/6984e6ce8d977f06e0b5ff73f5d88e5c9a44c027 - ext.cfg['preinstallopts'] = "sed -i 's/SIGSTKSZ/32768/g' src/testthat/vendor/catch.h && " + if self.name == 'CUDA': + print_msg("Replacing files in CUDA installation that we can not ship with symlinks to host_injections...") + + # read CUDA EULA, construct allowlist based on section 2.6 that specifies list of files that can be shipped + eula_path = os.path.join(self.installdir, 'EULA.txt') + relevant_eula_lines = [] + with open(eula_path) as infile: + copy = False + for line in infile: + if line.strip() == "2.6. Attachment A": + copy = True + continue + elif line.strip() == "2.7. Attachment B": + copy = False + continue + elif copy: + relevant_eula_lines.append(line) + + # create list without file extensions, they're not really needed and they only complicate things + allowlist = ['EULA', 'README'] + file_extensions = ['.so', '.a', '.h', '.bc'] + for line in relevant_eula_lines: + for word in line.split(): + if any(ext in word for ext in file_extensions): + allowlist.append(os.path.splitext(word)[0]) + allowlist = sorted(set(allowlist)) + self.log.info("Allowlist for files in CUDA installation that can be redistributed: " + ', '.join(allowlist)) + + # Do some quick sanity checks for things we should or shouldn't have in the list + if 'nvcc' in allowlist: + raise EasyBuildError("Found 'nvcc' in allowlist: %s" % allowlist) + if 'libcudart' not in allowlist: + raise EasyBuildError("Did not find 'libcudart' in allowlist: %s" % allowlist) + + # iterate over all files in the CUDA installation directory + for dir_path, _, files in os.walk(self.installdir): + for filename in files: + full_path = os.path.join(dir_path, filename) + # we only really care about real files, i.e. 
not symlinks + if not os.path.islink(full_path): + # check if the current file is part of the allowlist + basename = os.path.splitext(filename)[0] + if basename in allowlist: + self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) + else: + self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s", + basename, full_path) + # if it is not in the allowlist, delete the file and create a symlink to host_injections + host_inj_path = full_path.replace('versions', 'host_injections') + # make sure source and target of symlink are not the same + if full_path == host_inj_path: + raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " + "are using this hook for an EESSI installation?", + full_path, host_inj_path) + remove_file(full_path) + symlink(host_inj_path, full_path) + else: + raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!") + + +def inject_gpu_property(ec): + """ + Add 'gpu' property, via modluafooter easyconfig parameter + """ + ec_dict = ec.asdict() + # Check if CUDA is in the dependencies, if so add the 'gpu' Lmod property + if ('CUDA' in [dep[0] for dep in iter(ec_dict['dependencies'])]): + ec.log.info("Injecting gpu as Lmod arch property and envvar with CUDA version") + key = 'modluafooter' + value = 'add_property("arch","gpu")' + cuda_version = 0 + for dep in iter(ec_dict['dependencies']): + # Make CUDA a build dependency only (rpathing saves us from link errors) + if 'CUDA' in dep[0]: + cuda_version = dep[1] + ec_dict['dependencies'].remove(dep) + if dep not in ec_dict['builddependencies']: + ec_dict['builddependencies'].append(dep) + value = '\n'.join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version]) + if key in ec_dict: + if not value in ec_dict[key]: + ec[key] = '\n'.join([ec_dict[key], value]) + else: + ec[key] = value + return ec PARSE_HOOKS = { @@ -397,15 +588,27 @@ def pre_single_extension_isoband(ext, *args, **kwargs): 'OpenBLAS': pre_configure_hook_openblas_optarch_generic, 'WRF': pre_configure_hook_wrf_aarch64, 'LAMMPS': pre_configure_hook_LAMMPS_aarch64, + 'at-spi2-core': pre_configure_hook_atspi2core_filter_ld_library_path, } PRE_TEST_HOOKS = { 'ESPResSo': pre_test_hook_ignore_failing_tests_ESPResSo, 'FFTW.MPI': pre_test_hook_ignore_failing_tests_FFTWMPI, 'SciPy-bundle': pre_test_hook_ignore_failing_tests_SciPybundle, + 'netCDF': pre_test_hook_ignore_failing_tests_netCDF, + 'PyTorch': pre_test_hook_increase_max_failed_tests_arm_PyTorch, } PRE_SINGLE_EXTENSION_HOOKS = { 'isoband': pre_single_extension_isoband, + 'numpy': pre_single_extension_numpy, 'testthat': pre_single_extension_testthat, } + +POST_SINGLE_EXTENSION_HOOKS = { + 'numpy': post_single_extension_numpy, +} + +POST_SANITYCHECK_HOOKS = { + 'CUDA': post_sanitycheck_cuda, +} diff --git a/eessi-2023.06-known-issues.yml b/eessi-2023.06-known-issues.yml new file mode 100644 index 0000000000..569a0d9f56 --- /dev/null +++ b/eessi-2023.06-known-issues.yml @@ -0,0 +1,39 @@ +- aarch64/generic: + - PyTorch-2.1.2-foss-2023a: + - issue: https://github.com/EESSI/software-layer/issues/461 + - info: "8 failing tests (out of 209539) on aarch64/*" +- aarch64/neoverse_n1: + - PyTorch-2.1.2-foss-2023a: + - issue: https://github.com/EESSI/software-layer/issues/461 + - info: "8 failing tests (out of 209539) on aarch64/*" +- aarch64/neoverse_v1: + - ESPResSo-4.2.1-foss-2023a: + - issue: https://github.com/EESSI/software-layer/issues/363 + - info: "ESPResSo tests failing due to timeouts" + - 
FFTW.MPI-3.3.10-gompi-2023a: + - issue: https://github.com/EESSI/software-layer/issues/325 + - info: "Flaky FFTW tests, random failures" + - FFTW.MPI-3.3.10-gompi-2023b: + - issue: https://github.com/EESSI/software-layer/issues/325 + - info: "Flaky FFTW tests, random failures" + - netCDF-4.9.2-gompi-2023a.eb: + - issue: https://github.com/EESSI/software-layer/issues/425 + - info: "netCDF intermittent test failures" + - netCDF-4.9.2-gompi-2023b.eb: + - issue: https://github.com/EESSI/software-layer/issues/425 + - info: "netCDF intermittent test failures" + - OpenBLAS-0.3.21-GCC-12.2.0: + - issue: https://github.com/EESSI/software-layer/issues/314 + - info: "Increased number of numerical errors in OpenBLAS test suite (344 vs max. 150 on x86_64/*)" + - PyTorch-2.1.2-foss-2023a: + - issue: https://github.com/EESSI/software-layer/issues/461 + - info: "8 failing tests (out of 209539) on aarch64/*" + - SciPy-bundle-2023.02-gfbf-2022b: + - issue: https://github.com/EESSI/software-layer/issues/318 + - info: "numpy built with -march=armv8.4-a instead of -mcpu=native (no SVE) + 2 failing tests (vs 50005 passed) in scipy test suite" + - SciPy-bundle-2023.07-gfbf-2023a: + - issue: https://github.com/EESSI/software-layer/issues/318 + - info: "2 failing tests (vs 54409 passed) in scipy test suite" + - SciPy-bundle-2023.11-gfbf-2023b: + - issue: https://github.com/EESSI/software-layer/issues/318 + - info: "2 failing tests (vs 54876 passed) in scipy test suite" diff --git a/eessi_container.sh b/eessi_container.sh index bf0294c7bf..d6e9558202 100755 --- a/eessi_container.sh +++ b/eessi_container.sh @@ -30,8 +30,8 @@ # -. initial settings & exit codes TOPDIR=$(dirname $(realpath $0)) -source ${TOPDIR}/scripts/utils.sh -source ${TOPDIR}/scripts/cfg_files.sh +source "${TOPDIR}"/scripts/utils.sh +source "${TOPDIR}"/scripts/cfg_files.sh # exit codes: bitwise shift codes to allow for combination of exit codes # ANY_ERROR_EXITCODE is sourced from ${TOPDIR}/scripts/utils.sh @@ -46,6 +46,7 @@ SAVE_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 8)) HTTP_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 9)) HTTPS_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 10)) RUN_SCRIPT_MISSING_EXITCODE=$((${ANY_ERROR_EXITCODE} << 11)) +NVIDIA_MODE_UNKNOWN_EXITCODE=$((${ANY_ERROR_EXITCODE} << 12)) # CernVM-FS settings CVMFS_VAR_LIB="var-lib-cvmfs" @@ -72,12 +73,17 @@ display_help() { echo " -a | --access {ro,rw} - ro (read-only), rw (read & write) [default: ro]" echo " -c | --container IMG - image file or URL defining the container to use" echo " [default: docker://ghcr.io/eessi/build-node:debian11]" - echo " -h | --help - display this usage information [default: false]" echo " -g | --storage DIR - directory space on host machine (used for" echo " temporary data) [default: 1. TMPDIR, 2. 
/tmp]" + echo " -h | --help - display this usage information [default: false]" + echo " -i | --host-injections - directory to link to for host_injections " + echo " [default: /..storage../opt-eessi]" echo " -l | --list-repos - list available repository identifiers [default: false]" echo " -m | --mode MODE - with MODE==shell (launch interactive shell) or" echo " MODE==run (run a script or command) [default: shell]" + echo " -n | --nvidia MODE - configure the container to work with NVIDIA GPUs," + echo " MODE==install for a CUDA installation, MODE==run to" + echo " attach a GPU, MODE==all for both [default: false]" echo " -r | --repository CFG - configuration file or identifier defining the" echo " repository to use [default: EESSI via" echo " default container, see --container]" @@ -111,6 +117,7 @@ VERBOSE=0 STORAGE= LIST_REPOS=0 MODE="shell" +SETUP_NVIDIA=0 REPOSITORY="EESSI" RESUME= SAVE= @@ -141,6 +148,10 @@ while [[ $# -gt 0 ]]; do display_help exit 0 ;; + -i|--host-injections) + USER_HOST_INJECTIONS="$2" + shift 2 + ;; -l|--list-repos) LIST_REPOS=1 shift 1 @@ -149,6 +160,11 @@ while [[ $# -gt 0 ]]; do MODE="$2" shift 2 ;; + -n|--nvidia) + SETUP_NVIDIA=1 + NVIDIA_MODE="$2" + shift 2 + ;; -r|--repository) REPOSITORY="$2" shift 2 @@ -224,6 +240,13 @@ if [[ "${MODE}" != "shell" && "${MODE}" != "run" ]]; then fatal_error "unknown execution mode '${MODE}'" "${MODE_UNKNOWN_EXITCODE}" fi +# Also validate the NVIDIA GPU mode (if present) +if [[ ${SETUP_NVIDIA} -eq 1 ]]; then + if [[ "${NVIDIA_MODE}" != "run" && "${NVIDIA_MODE}" != "install" && "${NVIDIA_MODE}" != "all" ]]; then + fatal_error "unknown NVIDIA mode '${NVIDIA_MODE}'" "${NVIDIA_MODE_UNKNOWN_EXITCODE}" + fi +fi + # TODO (arg -r|--repository) check if repository is known # REPOSITORY_ERROR_EXITCODE if [[ ! -z "${REPOSITORY}" && "${REPOSITORY}" != "EESSI" && ! 
-r ${EESSI_REPOS_CFG_FILE} ]]; then @@ -310,12 +333,25 @@ fi # |-overlay-work # |-home # |-repos_cfg +# |-opt-eessi (unless otherwise specificed for host_injections) # tmp dir for EESSI EESSI_TMPDIR=${EESSI_HOST_STORAGE} mkdir -p ${EESSI_TMPDIR} [[ ${VERBOSE} -eq 1 ]] && echo "EESSI_TMPDIR=${EESSI_TMPDIR}" +# Set host_injections directory and ensure it is a writable directory (if user provided) +if [ -z ${USER_HOST_INJECTIONS+x} ]; then + # Not set, so use our default + HOST_INJECTIONS=${EESSI_TMPDIR}/opt-eessi + mkdir -p $HOST_INJECTIONS +else + # Make sure the host_injections directory specified exists and is a folder + mkdir -p ${USER_HOST_INJECTIONS} || fatal_error "host_injections directory ${USER_HOST_INJECTIONS} is either not a directory or cannot be created" + HOST_INJECTIONS=${USER_HOST_INJECTIONS} +fi +[[ ${VERBOSE} -eq 1 ]] && echo "HOST_INJECTIONS=${HOST_INJECTIONS}" + # configure Singularity: if SINGULARITY_CACHEDIR is already defined, use that # a global SINGULARITY_CACHEDIR would ensure that we don't consume # storage space again and again for the container & also speed-up @@ -394,12 +430,36 @@ fi [[ ${VERBOSE} -eq 1 ]] && echo "SINGULARITY_HOME=${SINGULARITY_HOME}" # define paths to add to SINGULARITY_BIND (added later when all BIND mounts are defined) -BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs" +BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs,${HOST_INJECTIONS}:/opt/eessi" # provide a '/tmp' inside the container BIND_PATHS="${BIND_PATHS},${EESSI_TMPDIR}:${TMP_IN_CONTAINER}" [[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}" +declare -a ADDITIONAL_CONTAINER_OPTIONS=() + +# Configure anything we need for NVIDIA GPUs and CUDA installation +if [[ ${SETUP_NVIDIA} -eq 1 ]]; then + if [[ "${NVIDIA_MODE}" == "run" || "${NVIDIA_MODE}" == "all" ]]; then + # Give singularity the appropriate flag + ADDITIONAL_CONTAINER_OPTIONS+=("--nv") + [[ ${VERBOSE} -eq 1 ]] && echo "ADDITIONAL_CONTAINER_OPTIONS=${ADDITIONAL_CONTAINER_OPTIONS[@]}" + fi + if [[ "${NVIDIA_MODE}" == "install" || "${NVIDIA_MODE}" == "all" ]]; then + # Add additional bind mounts to allow CUDA to install within a container + # (Experience tells us that these are necessary, but we don't know _why_ + # as the CUDA installer is a black box. The suspicion is that the CUDA + # installer gets confused by the permissions on these directories when + # inside a container) + EESSI_VAR_LOG=${EESSI_TMPDIR}/var-log + EESSI_USR_LOCAL_CUDA=${EESSI_TMPDIR}/usr-local-cuda + mkdir -p ${EESSI_VAR_LOG} + mkdir -p ${EESSI_USR_LOCAL_CUDA} + BIND_PATHS="${BIND_PATHS},${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda" + [[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}" + fi +fi + # set up repository config (always create directory repos_cfg and populate it with info when # arg -r|--repository is used) mkdir -p ${EESSI_TMPDIR}/repos_cfg @@ -562,8 +622,8 @@ if [ ! -z ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} ]; then fi echo "Launching container with command (next line):" -echo "singularity ${RUN_QUIET} ${MODE} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@" -singularity ${RUN_QUIET} ${MODE} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@" +echo "singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_CONTAINER_OPTIONS[@]} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@" +singularity ${RUN_QUIET} ${MODE} "${ADDITIONAL_CONTAINER_OPTIONS[@]}" "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@" exit_code=$? # 6. 
save tmp if requested (arg -s|--save) diff --git a/init/bash b/init/bash index 84fe783bce..0029f91454 100644 --- a/init/bash +++ b/init/bash @@ -1,10 +1,10 @@ -# Allow for a silent mode -if [[ -v EESSI_SILENT ]]; then - # EESSI_SILENT set - output=/dev/null -else - output=/dev/stdout -fi +function show_msg { + # only echo msg if EESSI_SILENT is unset + msg=$1 + if [[ ! -v EESSI_SILENT ]]; then + echo "$msg" + fi +} # The following method should be safe, but might break if file is a symlink # (could switch to $(dirname "$(readlink -f "$BASH_SOURCE")") in that case) @@ -13,26 +13,26 @@ source $(dirname "$BASH_SOURCE")/eessi_environment_variables # only continue if setting EESSI environment variables worked fine if [ $? -eq 0 ]; then - export PS1="[EESSI $EESSI_VERSION] $ " + export PS1="{EESSI $EESSI_VERSION} $PS1" # add location of commands provided by compat layer to $PATH; # see https://github.com/EESSI/software-layer/issues/52 export PATH=$EPREFIX/usr/bin:$EPREFIX/bin:$PATH # init Lmod - echo "Initializing Lmod..." >> $output + show_msg "Initializing Lmod..." source $EESSI_EPREFIX/usr/share/Lmod/init/bash # prepend location of modules for EESSI software stack to $MODULEPATH - echo "Prepending $EESSI_MODULEPATH to \$MODULEPATH..." >> $output + show_msg "Prepending $EESSI_MODULEPATH to \$MODULEPATH..." module use $EESSI_MODULEPATH - #echo >> $output - #echo "*** Known problems in the ${EESSI_VERSION} software stack ***" >> $output - #echo >> $output - #echo "1) ..." >> $output - #echo >> $output - #echo >> $output + #show_msg "" + #show_msg "*** Known problems in the ${EESSI_VERSION} software stack ***" + #show_msg "" + #show_msg "1) ..." + #show_msg "" + #show_msg "" echo "Environment set up to use EESSI (${EESSI_VERSION}), have fun!" diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index 42f4b6b76a..af5222e7b9 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -2,24 +2,24 @@ # $BASH_SOURCE points to correct path, see also http://mywiki.wooledge.org/BashFAQ/028 EESSI_INIT_DIR_PATH=$(dirname $(realpath $BASH_SOURCE)) -# Allow for a silent mode -if [[ -v EESSI_SILENT ]]; then - # EESSI_SILENT set - output=/dev/null -else - output=/dev/stdout -fi - function error() { echo -e "\e[31mERROR: $1\e[0m" >&2 false } +function show_msg { + # only echo msg if EESSI_SILENT is unset + msg=$1 + if [[ ! -v EESSI_SILENT ]]; then + echo "$msg" + fi +} + # set up minimal environment: $EESSI_PREFIX, $EESSI_VERSION, $EESSI_OS_TYPE, $EESSI_CPU_FAMILY, $EPREFIX source $EESSI_INIT_DIR_PATH/minimal_eessi_env if [ -d $EESSI_PREFIX ]; then - echo "Found EESSI repo @ $EESSI_PREFIX!" >> $output + show_msg "Found EESSI repo @ $EESSI_PREFIX!" 
export EESSI_EPREFIX=$EPREFIX if [ -d $EESSI_EPREFIX ]; then @@ -27,22 +27,31 @@ if [ -d $EESSI_PREFIX ]; then # determine subdirectory in software layer if [ "$EESSI_USE_ARCHDETECT" == "1" ]; then # if archdetect is enabled, use internal code - export EESSI_SOFTWARE_SUBDIR=$(${EESSI_INIT_DIR_PATH}/eessi_archdetect.sh cpupath) - echo "archdetect says ${EESSI_SOFTWARE_SUBDIR}" >> $output + all_cpupaths=$(${EESSI_INIT_DIR_PATH}/eessi_archdetect.sh -a cpupath) + # iterate over colon-separated list verifying if the architecture is present + # under $EESSI_PREFIX/software/$EESSI_OS_TYPE; if so use the architecture as best match + IFS=: read -r -a archs <<< "${all_cpupaths}" + for arch in "${archs[@]}"; do + if [ -d ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${arch} ]; then + export EESSI_SOFTWARE_SUBDIR=${arch} + show_msg "archdetect says ${EESSI_SOFTWARE_SUBDIR}" + break + fi + done elif [ "$EESSI_USE_ARCHSPEC" == "1" ]; then # note: eessi_software_subdir_for_host.py will pick up value from $EESSI_SOFTWARE_SUBDIR_OVERRIDE if it's defined! export EESSI_EPREFIX_PYTHON=$EESSI_EPREFIX/usr/bin/python3 export EESSI_SOFTWARE_SUBDIR=$($EESSI_EPREFIX_PYTHON ${EESSI_INIT_DIR_PATH}/eessi_software_subdir_for_host.py $EESSI_PREFIX) - echo "archspec says ${EESSI_SOFTWARE_SUBDIR}" >> $output + show_msg "archspec says ${EESSI_SOFTWARE_SUBDIR}" else error "Don't know how to detect host CPU, giving up!" fi if [ ! -z $EESSI_SOFTWARE_SUBDIR ]; then - echo "Using ${EESSI_SOFTWARE_SUBDIR} as software subdirectory." >> $output + show_msg "Using ${EESSI_SOFTWARE_SUBDIR} as software subdirectory." export EESSI_SOFTWARE_PATH=$EESSI_PREFIX/software/$EESSI_OS_TYPE/$EESSI_SOFTWARE_SUBDIR if [ ! -z $EESSI_BASIC_ENV ]; then - echo "Only setting up basic environment, so we're done" >> $output + show_msg "Only setting up basic environment, so we're done" elif [ -d $EESSI_SOFTWARE_PATH ]; then # Allow for the use of a custom MNS if [ -z ${EESSI_CUSTOM_MODULEPATH+x} ]; then @@ -55,13 +64,13 @@ if [ -d $EESSI_PREFIX ]; then fi EESSI_MODULEPATH=$EESSI_SOFTWARE_PATH/$EESSI_MODULE_SUBDIR else - echo "Using defined environment variable \$EESSI_CUSTOM_MODULEPATH to set EESSI_MODULEPATH." >> $output + show_msg "Using defined environment variable \$EESSI_CUSTOM_MODULEPATH to set EESSI_MODULEPATH." EESSI_MODULEPATH=$EESSI_CUSTOM_MODULEPATH fi if [ -d $EESSI_MODULEPATH ]; then export EESSI_MODULEPATH=$EESSI_MODULEPATH - echo "Using ${EESSI_MODULEPATH} as the directory to be added to MODULEPATH." >> $output + show_msg "Using ${EESSI_MODULEPATH} as the directory to be added to MODULEPATH." else error "EESSI module path at $EESSI_MODULEPATH not found!" false @@ -69,7 +78,7 @@ if [ -d $EESSI_PREFIX ]; then export LMOD_RC="$EESSI_SOFTWARE_PATH/.lmod/lmodrc.lua" if [ -f $LMOD_RC ]; then - echo "Found Lmod configuration file at $LMOD_RC" >> $output + show_msg "Found Lmod configuration file at $LMOD_RC" else error "Lmod configuration file not found at $LMOD_RC" fi diff --git a/install_scripts.sh b/install_scripts.sh new file mode 100755 index 0000000000..6e6cd825ac --- /dev/null +++ b/install_scripts.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# +# Script to install scripts from the software-layer repo into the EESSI software stack + +display_help() { + echo "usage: $0 [OPTIONS]" + echo " -p | --prefix - prefix to copy the scripts to" + echo " -h | --help - display this usage information" +} + +compare_and_copy() { + if [ "$#" -ne 2 ]; then + echo "Usage of function: compare_and_copy <source_file> <destination_file>" + return 1 + fi + + source_file="$1" + destination_file="$2" + + if [ ! 
-f "$destination_file" ] || ! diff -q "$source_file" "$destination_file" ; then + cp "$source_file" "$destination_file" + echo "File $1 copied to $2" + else + echo "Files $1 and $2 are identical. No copy needed." + fi +} + + +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + -p|--prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + -h|--help) + display_help # Call your function + # no shifting needed here, we're done. + exit 0 + ;; + -*|--*) + echo "Error: Unknown option: $1" >&2 + exit 1 + ;; + *) # No more options + POSITIONAL_ARGS+=("$1") # save positional arg + shift + ;; + esac +done + +set -- "${POSITIONAL_ARGS[@]}" + +TOPDIR=$(dirname $(realpath $0)) + +# Subdirs for generic scripts +SCRIPTS_DIR_SOURCE=${TOPDIR}/scripts # Source dir +SCRIPTS_DIR_TARGET=${INSTALL_PREFIX}/scripts # Target dir + +# Create target dir +mkdir -p ${SCRIPTS_DIR_TARGET} + +# Copy scripts into this prefix +echo "copying scripts from ${SCRIPTS_DIR_SOURCE} to ${SCRIPTS_DIR_TARGET}" +for file in utils.sh; do + compare_and_copy ${SCRIPTS_DIR_SOURCE}/${file} ${SCRIPTS_DIR_TARGET}/${file} +done +# Subdirs for GPU support +NVIDIA_GPU_SUPPORT_DIR_SOURCE=${SCRIPTS_DIR_SOURCE}/gpu_support/nvidia # Source dir +NVIDIA_GPU_SUPPORT_DIR_TARGET=${SCRIPTS_DIR_TARGET}/gpu_support/nvidia # Target dir + +# Create target dir +mkdir -p ${NVIDIA_GPU_SUPPORT_DIR_TARGET} + +# Copy files from this directory into the prefix +# To be on the safe side, we dont do recursive copies, but we are explicitely copying each individual file we want to add +echo "copying scripts from ${NVIDIA_GPU_SUPPORT_DIR_SOURCE} to ${NVIDIA_GPU_SUPPORT_DIR_TARGET}" +for file in install_cuda_host_injections.sh link_nvidia_host_libraries.sh; do + compare_and_copy ${NVIDIA_GPU_SUPPORT_DIR_SOURCE}/${file} ${NVIDIA_GPU_SUPPORT_DIR_TARGET}/${file} +done diff --git a/licenses/README.md b/licenses/README.md new file mode 100644 index 0000000000..36a7615b21 --- /dev/null +++ b/licenses/README.md @@ -0,0 +1,3 @@ +see https://spdx.org/licenses + +Python function to download SPDX list of licenses is available in `spdx.py` diff --git a/licenses/licenses.json b/licenses/licenses.json new file mode 100644 index 0000000000..8831ed368c --- /dev/null +++ b/licenses/licenses.json @@ -0,0 +1,10 @@ +{ + "EasyBuild": { + "spdx": "GPL-2.0-only", + "license_url": "https://easybuild.io" + }, + "GCCcore": { + "spdx": "GPL-2.0-with-GCC-exception", + "license_url": "https://github.com/gcc-mirror/gcc/blob/master/COPYING" + } +} diff --git a/licenses/spdx.py b/licenses/spdx.py new file mode 100644 index 0000000000..06c3edb4e6 --- /dev/null +++ b/licenses/spdx.py @@ -0,0 +1,100 @@ +import json +import logging +import sys +import urllib.request + +SPDX_LICENSE_LIST_URL = 'https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json' + +LICENSE_URL = 'license_url' +SPDX = 'spdx' + +spdx_license_list = None + +# Configure the logging module +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") + + +def get_spdx_license_list(): + """ + Download JSON file with current list of SPDX licenses, parse it, and return it as a Python dictionary. 
+ """ + global spdx_license_list + + if spdx_license_list is None: + with urllib.request.urlopen(SPDX_LICENSE_LIST_URL) as fp: + spdx_license_list = json.load(fp) + version, release_date = spdx_license_list['licenseListVersion'], spdx_license_list['releaseDate'] + logging.info(f"Downloaded version {version} of SPDX license list (release date: {release_date})") + licenses = spdx_license_list['licenses'] + logging.info(f"Found info on {len(licenses)} licenses!") + + return spdx_license_list + + +def license_info(spdx_id): + """Find license with specified SPDX identifier.""" + + spdx_license_list = get_spdx_license_list() + + licenses = spdx_license_list['licenses'] + for lic in licenses: + if lic['licenseId'] == spdx_id: + return lic + + # if no match is found, return None as result + return None + + +def read_licenses(path): + """ + Read software project to license mapping from specified path + """ + with open(path) as fp: + licenses = json.loads(fp.read()) + + return licenses + + +def check_licenses(licenses): + """ + Check mapping of software licenses: make sure SPDX identifiers are valid. + """ + faulty_licenses = {} + + for software_name in licenses: + spdx_lic_id = licenses[software_name][SPDX] + lic_info = license_info(spdx_lic_id) + if lic_info: + lic_url = licenses[software_name][LICENSE_URL] + logging.info(f"License for software '{software_name}': {lic_info['name']} (see {lic_url})") + else: + logging.warning(f"Found faulty SPDX license ID for {software_name}: {spdx_lic_id}") + faulty_licenses[software_name] = spdx_lic_id + + if faulty_licenses: + logging.warning(f"Found {len(faulty_licenses)} faulty SPDX license IDs (out of {len(licenses)})!") + result = False + else: + logging.info(f"License check passed for {len(licenses)} licenses!") + result = True + + return result + + +def main(args): + if len(args) == 1: + licenses_path = args[0] + else: + logging.error("Usage: python spdx.py <path_to_licenses.json>") + sys.exit(1) + + licenses = read_licenses(licenses_path) + if check_licenses(licenses): + logging.info("All license checks PASSED!") + else: + logging.error("One or more license checks failed!") + sys.exit(2) + + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/scripts/gpu_support/nvidia/install_cuda_host_injections.sh b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh new file mode 100755 index 0000000000..a9310d817a --- /dev/null +++ b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh @@ -0,0 +1,211 @@ +#!/usr/bin/env bash + +# This script can be used to install CUDA under the `.../host_injections` directory. +# This provides the parts of the CUDA installation that cannot be redistributed as +# part of EESSI due to license limitations. While GPU-based software from EESSI will +# _run_ without these, installation of additional CUDA software requires the CUDA +# installation(s) under `host_injections` to be present. +# +# The `host_injections` directory is a variant symlink that by default points to +# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see +# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the +# installation to be successful, this directory needs to be writeable by the user +# executing this script.
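# An illustrative invocation (example values only; it assumes EESSI has been initialised
# and that the requested version has a matching CUDA easyconfig available to the
# EasyBuild installation being used):
#   ./install_cuda_host_injections.sh --cuda-version 12.1.1 --accept-cuda-eula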
+ +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $BASH_SOURCE)) +source "$TOPDIR"/../../utils.sh + +# Function to display help message +show_help() { + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --help Display this help message" + echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install" + echo " CUDA, see the EULA at" + echo " https://docs.nvidia.com/cuda/eula/index.html" + echo " -c, --cuda-version CUDA_VERSION Specify a version of CUDA to install (must" + echo " have a corresponding easyconfig in the" + echo " EasyBuild release)" + echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" + echo " storage during the CUDA install" + echo " (must have >10GB available)" +} + +# Initialize variables +install_cuda_version="" +eula_accepted=0 + +# Parse command-line options +while [[ $# -gt 0 ]]; do + case "$1" in + --help) + show_help + exit 0 + ;; + -c|--cuda-version) + if [ -n "$2" ]; then + install_cuda_version="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + --accept-cuda-eula) + eula_accepted=1 + shift 1 + ;; + -t|--temp-dir) + if [ -n "$2" ]; then + CUDA_TEMP_DIR="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + *) + show_help + fatal_error "Error: Unknown option: $1" + ;; + esac +done + +# Make sure EESSI is initialised +check_eessi_initialised + +# Make sure the CUDA version supplied is a semantic version +is_semantic_version() { + local version=$1 + local regex='^[0-9]+\.[0-9]+\.[0-9]+$' + + if [[ $version =~ $regex ]]; then + return 0 # Return success (0) if it's a semantic version + else + return 1 # Return failure (1) if it's not a semantic version + fi +} +if ! is_semantic_version "$install_cuda_version"; then + show_help + error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n" + error="${error}command line option. This script is intended for use with EESSI so the 'correct'\n" + error="${error}version to provide is probably one of those available under\n" + error="${error}$EESSI_SOFTWARE_PATH/software/CUDA\n" + fatal_error "${error}" +fi + +# Make sure they have accepted the CUDA EULA +if [ "$eula_accepted" -ne 1 ]; then + show_help + error="\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n" + fatal_error "${error}" +fi + +# As the installation location, just use $EESSI_SOFTWARE_PATH with `versions` replaced by `host_injections` +# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) +cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} + +# Only install CUDA if the specified version is not found. +# (existence of easybuild subdir implies a successful install) +if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then + echo_green "CUDA software found! No need to install CUDA again." +else + # We need to be able to write to the installation space so let's make sure we can + if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then + fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" + fi + + # we need a directory we can use for temporary storage + if [[ -z "${CUDA_TEMP_DIR}" ]]; then + tmpdir=$(mktemp -d) + else + tmpdir="${CUDA_TEMP_DIR}"/temp + if !
mkdir "$tmpdir" ; then + fatal_error "Could not create directory ${tmpdir}" + fi + fi + + required_space_in_tmpdir=50000 + # Let's see if we have sources and build locations defined; if not, we use the temporary space + if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then + export EASYBUILD_BUILDPATH=${tmpdir}/build + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) + fi + if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then + export EASYBUILD_SOURCEPATH=${tmpdir}/sources + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) + fi + + # The install is pretty big: you need lots of space for download/unpack/install (~3*5GB), + # so we need to do a space check before we proceed + avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < 5000000 )); then + fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." + fi + avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < required_space_in_tmpdir )); then + error="Need at least ${required_space_in_tmpdir} KiB of disk space under ${tmpdir}.\n" + error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check. " + error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " + error="${error}to reduce this requirement. Exiting now..." + fatal_error "${error}" + fi + + if ! command -v "eb" &>/dev/null; then + echo_yellow "Attempting to load an EasyBuild module to do actual install" + module load EasyBuild + # There are some scenarios where this may fail + if [ $? -ne 0 ]; then + error="'eb' command not found in your environment and\n" + error="${error} module load EasyBuild\n" + error="${error}failed for some reason.\n" + error="${error}Please re-run this script with the 'eb' command available." + fatal_error "${error}" + fi + fi + + cuda_easyconfig="CUDA-${install_cuda_version}.eb" + + # Check that the easyconfig file is available in the release + # (eb search always returns 0, so we need a grep to ensure a usable exit code) + eb --search ^${cuda_easyconfig}|grep CUDA > /dev/null 2>&1 + # Check the exit code + if [ $? -ne 0 ]; then + eb_version=$(eb --version) + available_cuda_easyconfigs=$(eb --search ^CUDA-*.eb|grep CUDA) + + error="The easyconfig ${cuda_easyconfig} was not found in EasyBuild version:\n" + error="${error} ${eb_version}\n" + error="${error}You either need to give a different version of CUDA to install _or_ \n" + error="${error}use a different version of EasyBuild for the installation.\n" + error="${error}\nThe versions of CUDA available with the current eb command are:\n" + error="${error}${available_cuda_easyconfigs}" + fatal_error "${error}" + fi + + # We need the --rebuild option, as the CUDA module may or may not be on the + # `MODULEPATH` yet. Even if it is, we still want to redo this installation + # since it will provide the symlinked targets for the parts of the CUDA + # installation in the `.../versions/...` prefix + # We install the module in our `tmpdir` since we do not need the modulefile, + # we only care about providing the targets for the symlinks.
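  # (Illustrative note on the eb invocation below: --installpath-modules redirects only the
  #  generated module file to the temporary directory, while the CUDA software itself still
  #  lands under ${cuda_install_parent}/software/CUDA/<version>, whose 'easybuild' subdirectory
  #  is what the existence check above uses to detect a completed installation.)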
+ extra_args="--rebuild --installpath-modules=${tmpdir}" + + # We don't want hooks used in this install, we need a vanilla CUDA installation + touch "$tmpdir"/none.py + # shellcheck disable=SC2086 # Intended splitting of extra_args + eb --prefix="$tmpdir" ${extra_args} --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}" + ret=$? + if [ $ret -ne 0 ]; then + eb_last_log=$(unset EB_VERBOSE; eb --last-log) + cp -a ${eb_last_log} . + fatal_error "CUDA installation failed, please check EasyBuild logs $(basename ${eb_last_log})..." + else + echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" + fi + # clean up tmpdir + rm -rf "${tmpdir}" +fi diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh new file mode 100755 index 0000000000..e8d7f0d0a7 --- /dev/null +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +# This script links host libraries related to GPU drivers to a location where +# they can be found by the EESSI linker + +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $BASH_SOURCE)) +source "$TOPDIR"/../../utils.sh + +# We rely on ldconfig to give us the location of the libraries on the host +command_name="ldconfig" +# We cannot use a version of ldconfig that's being shipped under CVMFS +exclude_prefix="/cvmfs" + +found_paths=() +# Always attempt to use /sbin/ldconfig +if [ -x "/sbin/$command_name" ]; then + found_paths+=("/sbin/$command_name") +fi +IFS=':' read -ra path_dirs <<< "$PATH" +for dir in "${path_dirs[@]}"; do + if [ "$dir" = "/sbin" ]; then + continue # we've already checked for $command_name in /sbin, don't need to do it twice + fi + if [[ ! "$dir" =~ ^$exclude_prefix ]]; then + if [ -x "$dir/$command_name" ]; then + found_paths+=("$dir/$command_name") + fi + fi +done + +if [ ${#found_paths[@]} -gt 0 ]; then + echo "Found $command_name in the following locations:" + printf -- "- %s\n" "${found_paths[@]}" + echo "Using first version" + host_ldconfig=${found_paths[0]} +else + error="$command_name not found in PATH or only found in paths starting with $exclude_prefix." + fatal_error "$error" +fi + +# Make sure EESSI is initialised (doesn't matter what version) +check_eessi_initialised + +# Find the CUDA version of the host CUDA drivers +# (making sure that this can still work inside prefix environment inside a container) +export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH +nvidia_smi_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" +if $nvidia_smi_command > /dev/null; then + host_driver_version=$($nvidia_smi_command | tail -n1) + echo_green "Found NVIDIA GPU driver version ${host_driver_version}" + # If the first worked, this should work too + host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') + echo_green "Found host CUDA version ${host_cuda_version}" +else + error="Failed to successfully execute\n $nvidia_smi_command\n" + fatal_error "$error" +fi + +# Let's make sure the driver libraries are not already in place +link_drivers=1 + +# first make sure that target of host_injections variant symlink is an existing directory +host_injections_target=$(realpath -m ${EESSI_CVMFS_REPO}/host_injections) +if [ ! 
-d ${host_injections_target} ]; then + create_directory_structure ${host_injections_target} +fi + +host_injections_nvidia_dir="${EESSI_CVMFS_REPO}/host_injections/nvidia/${EESSI_CPU_FAMILY}" +host_injection_driver_dir="${host_injections_nvidia_dir}/host" +host_injection_driver_version_file="$host_injection_driver_dir/driver_version.txt" +if [ -e "$host_injection_driver_version_file" ]; then + if grep -q "$host_driver_version" "$host_injection_driver_version_file"; then + echo_green "The host GPU driver libraries (v${host_driver_version}) have already been linked! (based on ${host_injection_driver_version_file})" + link_drivers=0 + else + # There's something there but it is out of date + echo_yellow "Cleaning out outdated symlinks" + rm $host_injection_driver_dir/* + if [ $? -ne 0 ]; then + error="Unable to remove files under '$host_injection_driver_dir'." + fatal_error "$error" + fi + fi +fi + +drivers_linked=0 +if [ "$link_drivers" -eq 1 ]; then + if ! create_directory_structure "${host_injection_driver_dir}" ; then + fatal_error "No write permissions to directory ${host_injection_driver_dir}" + fi + cd ${host_injection_driver_dir} + # Need a small temporary space to hold a couple of files + temp_dir=$(mktemp -d) + + # Gather libraries on the host (_must_ be host ldconfig) + $host_ldconfig -p | awk '{print $NF}' > "$temp_dir"/libs.txt + # Allow for the fact that we may be in a container so the CUDA libs might be in there + ls /.singularity.d/libs/* >> "$temp_dir"/libs.txt 2>/dev/null + + # Leverage Apptainer to find the full list of libraries we should be linking to + echo_yellow "Downloading latest version of nvliblist.conf from Apptainer to ${temp_dir}/nvliblist.conf" + curl --silent --output "$temp_dir"/nvliblist.conf https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf + + # Make symlinks to all the interesting libraries + grep '.so$' "$temp_dir"/nvliblist.conf | xargs -i grep {} "$temp_dir"/libs.txt | xargs -i ln -s {} + + # Inject driver and CUDA versions into dir + echo $host_driver_version > driver_version.txt + echo $host_cuda_version > cuda_version.txt + drivers_linked=1 + + # Remove the temporary directory when done + rm -r "$temp_dir" +fi + +# Make latest symlink for NVIDIA drivers +cd $host_injections_nvidia_dir +symlink="latest" +if [ -L "$symlink" ]; then + # Unless the drivers have been installed, leave the symlink alone + if [ "$drivers_linked" -eq 1 ]; then + ln -sf host latest + fi +else + # No link exists yet + ln -s host latest +fi + +# Make sure the libraries can be found by the EESSI linker +host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections} +if [ -L "$host_injection_linker_dir/lib" ]; then + target_path=$(readlink -f "$host_injection_linker_dir/lib") + if [ "$target_path" != "$host_injections_nvidia_dir/latest" ]; then + cd $host_injection_linker_dir + ln -sf $host_injections_nvidia_dir/latest lib + fi +else + create_directory_structure $host_injection_linker_dir + cd $host_injection_linker_dir + ln -s $host_injections_nvidia_dir/latest lib +fi + +echo_green "Host NVIDIA GPU drivers linked successfully for EESSI" diff --git a/scripts/utils.sh b/scripts/utils.sh index d0da95e87f..b2be3f6221 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -14,7 +14,7 @@ ANY_ERROR_EXITCODE=1 function fatal_error() { echo_red "ERROR: $1" >&2 if [[ $# -gt 1 ]]; then - exit $2 + exit "$2" else exit "${ANY_ERROR_EXITCODE}" fi @@ -32,11 +32,57 @@ function check_exit_code { fi } +function check_eessi_initialised() { + if [[ -z
"${EESSI_SOFTWARE_PATH}" ]]; then + fatal_error "EESSI has not been initialised!" + else + return 0 + fi +} + +function check_in_prefix_shell() { + # Make sure EPREFIX is defined + if [[ -z "${EPREFIX}" ]]; then + fatal_error "This script cannot be used without having first defined EPREFIX" + fi + if [[ ! ${SHELL} = ${EPREFIX}/bin/bash ]]; then + fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!" + fi +} + +function create_directory_structure() { + # Ensure we are given a single path argument + if [ $# -ne 1 ]; then + echo_red "Function requires a single (relative or absolute) path argument" >&2 + return $ANY_ERROR_EXITCODE + fi + dir_structure="$1" + + # Attempt to create the directory structure + error_message=$(mkdir -p "$dir_structure" 2>&1) + return_code=$? + # If it fails be explicit about the error + if [ ${return_code} -ne 0 ]; then + real_dir=$(realpath -m "$dir_structure") + echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2 + else + # If we're creating it, our use case is that we want to be able to write there + # (this is a check in case the directory already existed) + if [ ! -w "${dir_structure}" ]; then + real_dir=$(realpath -m "$dir_structure") + echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" + return_code=$ANY_ERROR_EXITCODE + fi + fi + + return $return_code +} + function get_path_for_tool { tool_name=$1 tool_envvar_name=$2 - which_out=$(which ${tool_name} 2>&1) + which_out=$(which "${tool_name}" 2>&1) exit_code=$? if [[ ${exit_code} -eq 0 ]]; then echo "INFO: found tool ${tool_name} in PATH (${which_out})" >&2 @@ -68,7 +114,7 @@ function get_host_from_url { url=$1 re="(http|https)://([^/:]+)" if [[ $url =~ $re ]]; then - echo ${BASH_REMATCH[2]} + echo "${BASH_REMATCH[2]}" return 0 else echo "" @@ -80,7 +126,7 @@ function get_port_from_url { url=$1 re="(http|https)://[^:]+:([0-9]+)" if [[ $url =~ $re ]]; then - echo ${BASH_REMATCH[2]} + echo "${BASH_REMATCH[2]}" return 0 else echo "" @@ -90,7 +136,7 @@ function get_port_from_url { function get_ipv4_address { hname=$1 - hipv4=$(grep ${hname} /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1) + hipv4=$(grep "${hname}" /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1) # TODO try other methods if the one above does not work --> tool that verifies # what method can be used? echo "${hipv4}"