diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 9586abde1d..de921b8855 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -229,7 +229,8 @@ done echo ">> Creating/updating Lmod cache..." export LMOD_RC="${EASYBUILD_INSTALLPATH}/.lmod/lmodrc.lua" -if [ ! -f $LMOD_RC ]; then +lmodrc_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodrc.py$' > /dev/null; echo $?) +if [ ! -f $LMOD_RC ] || [ ${lmodrc_changed} == '0' ]; then python3 $TOPDIR/create_lmodrc.py ${EASYBUILD_INSTALLPATH} check_exit_code $? "$LMOD_RC created" "Failed to create $LMOD_RC" fi diff --git a/create_lmodrc.py b/create_lmodrc.py index ae65153a20..adf221ecba 100755 --- a/create_lmodrc.py +++ b/create_lmodrc.py @@ -17,6 +17,81 @@ } """ +GPU_LMOD_RC ="""require("strict") +local hook = require("Hook") +local open = io.open + +local function read_file(path) + local file = open(path, "rb") -- r read mode and b binary mode + if not file then return nil end + local content = file:read "*a" -- *a or *all reads the whole file + file:close() + return content +end + +local function cuda_enabled_load_hook(t) + local frameStk = require("FrameStk"):singleton() + local mt = frameStk:mt() + local simpleName = string.match(t.modFullName, "(.-)/") + -- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections. + -- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse + -- to load the CUDA module and print an informative message on how to set up GPU support for EESSI + if simpleName == 'CUDA' then + -- get the full host_injections path + local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') + -- build final path where the CUDA software should be installed + local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" + local cudaDirExists = isDir(cudaEasyBuildDir) + if not cudaDirExists then + local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " + advice = advice .. "due to licencing. In order to be able to use the CUDA module, please follow the " + advice = advice .. "instructions available under https://www.eessi.io/docs/gpu/ \\n" + LmodError("\\nYou requested to load ", simpleName, " ", advice) + end + end + -- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker, + -- otherwise, refuse to load the requested module and print error message + local haveGpu = mt:haveProperty(simpleName,"arch","gpu") + if haveGpu then + local arch = os.getenv("EESSI_CPU_FAMILY") or "" + local cudaVersionFile = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt" + local cudaDriverFile = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/" .. arch .. "/latest/libcuda.so" + local cudaDriverExists = isFile(cudaDriverFile) + local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") + if not (cudaDriverExists or singularityCudaExists) then + local advice = "which relies on the CUDA runtime environment and driver libraries. " + advice = advice .. "In order to be able to use the module, please follow the instructions " + advice = advice .. "available under https://www.eessi.io/docs/gpu/ \\n" + LmodError("\\nYou requested to load ", simpleName, " ", advice) + else + -- CUDA driver exists, now we check its version to see if an update is needed + if cudaDriverExists then + local cudaVersion = read_file(cudaVersionFile) + local cudaVersion_req = os.getenv("EESSICUDAVERSION") + -- driver CUDA versions don't give a patch version for CUDA + local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)") + local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)") + local driver_libs_need_update = false + if major < major_req then + driver_libs_need_update = true + elseif major == major_req then + if minor < minor_req then + driver_libs_need_update = true + end + end + if driver_libs_need_update == true then + local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " + advice = advice .. "Please update your CUDA driver libraries and then follow the instructions " + advice = advice .. "under https://www.eessi.io/docs/gpu/ to let EESSI know about the update.\\n" + LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice) + end + end + end + end +end + +hook.register("load", cuda_enabled_load_hook) +""" def error(msg): sys.stderr.write("ERROR: %s\n" % msg) @@ -36,6 +111,7 @@ def error(msg): 'dot_lmod': DOT_LMOD, 'prefix': prefix, } +lmodrc_txt += '\n' + GPU_LMOD_RC try: os.makedirs(os.path.dirname(lmodrc_path), exist_ok=True) with open(lmodrc_path, 'w') as fp: diff --git a/eb_hooks.py b/eb_hooks.py index 8d05523c2b..aa02a9b0db 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -65,6 +65,8 @@ def parse_hook(ec, *args, **kwargs): if ec.name in PARSE_HOOKS: PARSE_HOOKS[ec.name](ec, eprefix) + # inject the GPU property (if required) + ec = inject_gpu_property(ec) def pre_prepare_hook(self, *args, **kwargs): """Main pre-prepare hook: trigger custom functions.""" @@ -210,6 +212,12 @@ def pre_configure_hook(self, *args, **kwargs): PRE_CONFIGURE_HOOKS[self.name](self, *args, **kwargs) +def post_sanitycheck_hook(self, *args, **kwargs): + """Main post-sanity-check hook: trigger custom functions based on software name.""" + if self.name in POST_SANITYCHECK_HOOKS: + POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs) + + def pre_configure_hook_openblas_optarch_generic(self, *args, **kwargs): """ Pre-configure hook for OpenBLAS: add DYNAMIC_ARCH=1 to build/test/install options when using --optarch=GENERIC @@ -347,6 +355,80 @@ def pre_single_extension_isoband(ext, *args, **kwargs): # cfr. https://github.com/r-lib/isoband/commit/6984e6ce8d977f06e0b5ff73f5d88e5c9a44c027 ext.cfg['preinstallopts'] = "sed -i 's/SIGSTKSZ/32768/g' src/testthat/vendor/catch.h && " +def post_sanitycheck_cuda(self, *args, **kwargs): + """Delete CUDA files we are not allowed to ship and replace them with a symlink to a possible installation under host_injections.""" + print_msg("Replacing CUDA stuff we cannot ship with symlinks...") + # read CUDA EULA + eula_path = os.path.join(self.installdir, "EULA.txt") + tmp_buffer = [] + with open(eula_path) as infile: + copy = False + for line in infile: + if line.strip() == "2.6. Attachment A": + copy = True + continue + elif line.strip() == "2.7. Attachment B": + copy = False + continue + elif copy: + tmp_buffer.append(line) + # create whitelist without file extensions, they're not really needed and they only complicate things + whitelist = ['EULA', 'README'] + file_extensions = [".so", ".a", ".h", ".bc"] + for tmp in tmp_buffer: + for word in tmp.split(): + if any(ext in word for ext in file_extensions): + whitelist.append(word.split(".")[0]) + whitelist = list(set(whitelist)) + # Do some quick checks for things we should or shouldn't have in the list + if "nvcc" in whitelist: + raise EasyBuildError("Found 'nvcc' in whitelist: %s" % whitelist) + if "libcudart" not in whitelist: + raise EasyBuildError("Did not find 'libcudart' in whitelist: %s" % whitelist) + # iterate over all files in the CUDA path + for root, dirs, files in os.walk(self.installdir): + for filename in files: + # we only really care about real files, i.e. not symlinks + if not os.path.islink(os.path.join(root, filename)): + # check if the current file is part of the whitelist + basename = filename.split(".")[0] + if basename not in whitelist: + # if it is not in the whitelist, delete the file and create a symlink to host_injections + source = os.path.join(root, filename) + target = source.replace("versions", "host_injections") + # Make sure source and target are not the same + if source == target: + raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you are" + "using this hook for an EESSI installation?") + os.remove(source) + # Using os.symlink requires the existence of the target directory, so we use os.system + system_command="ln -s '%s' '%s'" % (target, source) + if os.system(system_command) != 0: + raise EasyBuildError("Failed to create symbolic link: %s" % system_command) + + +def inject_gpu_property(ec): + ec_dict = ec.asdict() + # Check if CUDA is in the dependencies, if so add the GPU Lmod tag + if ("CUDA" in [dep[0] for dep in iter(ec_dict["dependencies"])]): + ec.log.info("[parse hook] Injecting gpu as Lmod arch property and envvar with CUDA version") + key = "modluafooter" + value = 'add_property("arch","gpu")' + cuda_version = 0 + for dep in iter(ec_dict["dependencies"]): + # Make CUDA a build dependency only (rpathing saves us from link errors) + if "CUDA" in dep[0]: + cuda_version = dep[1] + ec_dict["dependencies"].remove(dep) + ec_dict["builddependencies"].append(dep) if dep not in ec_dict["builddependencies"] else ec_dict["builddependencies"] + value = "\n".join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version]) + if key in ec_dict: + if not value in ec_dict[key]: + ec[key] = "\n".join([ec_dict[key], value]) + else: + ec[key] = value + return ec + PARSE_HOOKS = { 'CGAL': parse_hook_cgal_toolchainopts_precise, @@ -378,3 +460,7 @@ def pre_single_extension_isoband(ext, *args, **kwargs): 'isoband': pre_single_extension_isoband, 'testthat': pre_single_extension_testthat, } + +POST_SANITYCHECK_HOOKS = { + 'CUDA': post_sanitycheck_cuda, +} diff --git a/eessi-2023.06-eb-4.8.1-system.yml b/eessi-2023.06-eb-4.8.1-system.yml index b0731d2534..59199dc597 100644 --- a/eessi-2023.06-eb-4.8.1-system.yml +++ b/eessi-2023.06-eb-4.8.1-system.yml @@ -7,3 +7,7 @@ easyconfigs: - EasyBuild-4.8.2.eb: options: from-pr: 19105 + - CUDA-12.1.1.eb: + options: + include-easyblocks-from-pr: 3045 + accept-eula-for: CUDA diff --git a/eessi-2023.06-eb-4.8.2-2023a.yml b/eessi-2023.06-eb-4.8.2-2023a.yml new file mode 100644 index 0000000000..cbcc3caca0 --- /dev/null +++ b/eessi-2023.06-eb-4.8.2-2023a.yml @@ -0,0 +1,7 @@ +easyconfigs: + - CUDA-Samples-12.1-GCC-12.3.0-CUDA-12.1.1.eb: + # use easyconfig that only install subset of CUDA samples, + # to circumvent problem with nvcc linking to glibc of host OS; + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19189 + options: + from-pr: 19189