From cac7d8ce4448e9f56ec398a12c881cf528cc8036 Mon Sep 17 00:00:00 2001
From: Alan O'Cais
Date: Fri, 10 Nov 2023 15:58:33 +0100
Subject: [PATCH] Add support for installing CUDA and CUDA-Samples

---
 EESSI-pilot-install-software.sh   |   2 +-
 create_lmodrc.py                  | 116 ++++++++++++++++++++++++++++++
 eb_hooks.py                       |  87 +++++++++++++++++++++++
 eessi-2023.06-eb-4.8.1-system.yml |   1 +
 eessi-2023.06-eb-4.8.2-2023a.yml  |   2 +
 5 files changed, 207 insertions(+), 1 deletion(-)
 create mode 100644 eessi-2023.06-eb-4.8.2-2023a.yml

diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh
index 9586abde1d..9baa987b01 100755
--- a/EESSI-pilot-install-software.sh
+++ b/EESSI-pilot-install-software.sh
@@ -229,7 +229,7 @@ done
 
 echo ">> Creating/updating Lmod cache..."
 export LMOD_RC="${EASYBUILD_INSTALLPATH}/.lmod/lmodrc.lua"
-if [ ! -f $LMOD_RC ]; then
+if [ ! -f "$LMOD_RC" ] || grep '^+++' "${pr_diff}" | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodrc\.py$' > /dev/null; then
     python3 $TOPDIR/create_lmodrc.py ${EASYBUILD_INSTALLPATH}
     check_exit_code $? "$LMOD_RC created" "Failed to create $LMOD_RC"
 fi
diff --git a/create_lmodrc.py b/create_lmodrc.py
index ae65153a20..47195e5c1f 100755
--- a/create_lmodrc.py
+++ b/create_lmodrc.py
@@ -17,6 +17,121 @@
 }
 """
 
+GPU_LMOD_RC ="""require("strict")
+local hook = require("Hook")
+local open = io.open
+
+-- Return the full contents of the file at path, or nil if it cannot be opened
+local function read_file(path)
+    local file = open(path, "rb") -- r read mode and b binary mode
+    if not file then return nil end
+    local content = file:read "*a" -- *a or *all reads the whole file
+    file:close()
+    return content
+end
+
+-- from https://stackoverflow.com/a/40195356
+--- Check if a file or directory exists in this path
+local function exists(file)
+    local ok, err, code = os.rename(file, file)
+    if not ok then
+        if code == 13 then
+            -- Permission denied, but it exists
+            return true
+        end
+    end
+    return ok, err
+end
+
+-- Hide modules with the "gpu" arch property when the host_injections directory is missing
+local function visible_hook(modT)
+    local frameStk = require("FrameStk"):singleton()
+    local mt = frameStk:mt()
+    local cudaDir = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
+    local cudaDirExists = exists(cudaDir)
+    if not cudaDirExists then
+        local haveGpu = mt:haveProperty(modT.sn,"arch","gpu")
+        if haveGpu then
+            modT.isVisible = false
+        end
+    end
+end
+
+-- Refuse to load CUDA, or a module with the "gpu" arch property, when the software or the
+-- CUDA compatibility libraries it needs are not available
+local function cuda_enabled_load_hook(t)
+    local frameStk = require("FrameStk"):singleton()
+    local mt = frameStk:mt()
+    -- fall back to the full name for modules without a "/version" part
+    local simpleName = string.match(t.modFullName, "(.-)/") or t.modFullName
+    -- if we try to load CUDA itself, check if the software exists in host_injections
+    -- otherwise, refuse to load CUDA and print error message
+    if simpleName == 'CUDA' then
+        -- get the full host_injections path
+        local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
+        -- build final path where the CUDA software should be installed
+        local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
+        local cudaDirExists = exists(cudaEasyBuildDir)
+        if not cudaDirExists then
+            io.stderr:write("You requested to load ",simpleName,"\\n")
+            io.stderr:write("While the module file exists, the actual software is not shipped with EESSI.\\n")
+            io.stderr:write("In order to be able to use the CUDA module, please follow the instructions in the\\n")
+            io.stderr:write("gpu_support folder. Adding the CUDA software can be as easy as:\\n")
+            io.stderr:write("export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh\\n")
+            frameStk:__clear()
+        end
+    end
+    -- when loading CUDA enabled modules check if the necessary matching compatibility libraries are installed
+    -- otherwise, refuse to load the requested module and print error message
+    local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
+    if haveGpu then
+        local arch = os.getenv("EESSI_CPU_FAMILY") or ""
+        local cudaVersionFile = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/" .. arch .. "/latest/version.txt"
+        local cudaDriverExists = exists(cudaVersionFile)
+        local singularityCudaExists = exists("/.singularity.d/libs/libcuda.so")
+        if not (cudaDriverExists or singularityCudaExists) then
+            io.stderr:write("You requested to load ",simpleName,"\\n")
+            io.stderr:write("which relies on the CUDA runtime environment and its compatibility libraries.\\n")
+            io.stderr:write("In order to be able to use the module, please follow the instructions in the\\n")
+            io.stderr:write("gpu_support folder. Installing the needed compatibility libraries can be as easy as:\\n")
+            io.stderr:write("./add_nvidia_gpu_support.sh\\n")
+            frameStk:__clear()
+        else
+            if cudaDriverExists then
+                local cudaVersion = read_file(cudaVersionFile) or ""
+                local cudaVersion_req = os.getenv("EESSICUDAVERSION") or ""
+                local major, minor, patch = string.match(cudaVersion, "(%d+)%.(%d+)%.(%d+)")
+                local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
+                local compat_libs_need_update = false
+                -- only compare when both versions could be parsed; the captures are strings, so
+                -- convert them to numbers first (as strings "9" < "12" is false)
+                if major and major_req then
+                    major, minor, patch = tonumber(major), tonumber(minor), tonumber(patch)
+                    major_req, minor_req, patch_req = tonumber(major_req), tonumber(minor_req), tonumber(patch_req)
+                    if major < major_req then
+                        compat_libs_need_update = true
+                    elseif major == major_req then
+                        if minor < minor_req then
+                            compat_libs_need_update = true
+                        elseif minor == minor_req and patch < patch_req then
+                            compat_libs_need_update = true
+                        end
+                    end
+                end
+                if compat_libs_need_update then
+                    io.stderr:write("Your CUDA compatibility libraries provide CUDA version ",major,".",minor,".",patch,"\\n")
+                    io.stderr:write("but the module you want to load requires CUDA version ",cudaVersion_req,".\\n")
+                    io.stderr:write("Please update your CUDA compatibility libraries in order to use ",simpleName,".\\n")
+                    frameStk:__clear()
+                end
+            end
+        end
+    end
+end
+
+hook.register("load", cuda_enabled_load_hook)
+hook.register("isVisibleHook", visible_hook)
+"""
 
 def error(msg):
     sys.stderr.write("ERROR: %s\n" % msg)
@@ -36,6 +151,7 @@ def error(msg):
     'dot_lmod': DOT_LMOD,
     'prefix': prefix,
 }
+lmodrc_txt += '\n' + GPU_LMOD_RC
 try:
     os.makedirs(os.path.dirname(lmodrc_path), exist_ok=True)
     with open(lmodrc_path, 'w') as fp:
diff --git a/eb_hooks.py b/eb_hooks.py
index 31f2b9588d..c99ff2b436 100644
--- a/eb_hooks.py
+++ b/eb_hooks.py
@@ -64,6 +64,8 @@ def parse_hook(ec, *args, **kwargs):
     if ec.name in PARSE_HOOKS:
         PARSE_HOOKS[ec.name](ec, eprefix)
 
+    # inject the GPU property (if required)
+    ec = inject_gpu_property(ec)
 
 def pre_prepare_hook(self, *args, **kwargs):
     """Main pre-prepare hook: trigger custom functions."""
@@ -209,6 +211,12 @@ def pre_configure_hook(self, *args, **kwargs):
         PRE_CONFIGURE_HOOKS[self.name](self, *args, **kwargs)
 
 
+def post_sanitycheck_hook(self, *args, **kwargs):
+    """Main post-sanity-check hook: trigger custom functions based on software name."""
+    if self.name in POST_SANITYCHECK_HOOKS:
+        POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs)
+
+
 def pre_configure_hook_openblas_optarch_generic(self, *args, **kwargs):
     """
     Pre-configure hook for OpenBLAS: add DYNAMIC_ARCH=1 to build/test/install options when using --optarch=GENERIC
@@ -328,6 +336,81 @@ def pre_single_extension_isoband(ext, *args, **kwargs):
     # cfr. https://github.com/r-lib/isoband/commit/6984e6ce8d977f06e0b5ff73f5d88e5c9a44c027
     ext.cfg['preinstallopts'] = "sed -i 's/SIGSTKSZ/32768/g' src/testthat/vendor/catch.h && "
 
+def post_sanitycheck_cuda(self, *args, **kwargs):
+    """Delete CUDA files we are not allowed to ship and replace them with a symlink to a possible installation under host_injections."""
+    print_msg("Replacing CUDA stuff we cannot ship with symlinks...")
+    # read CUDA EULA
+    eula_path = os.path.join(self.installdir, "EULA.txt")
+    tmp_buffer = []
+    with open(eula_path) as infile:
+        copy = False
+        for line in infile:
+            if line.strip() == "2.6. Attachment A":
+                copy = True
+                continue
+            elif line.strip() == "2.7. Attachment B":
+                copy = False
+                continue
+            elif copy:
+                tmp_buffer.append(line)
+    # create whitelist without file extensions, they're not really needed and they only complicate things
+    whitelist = ['EULA', 'README']
+    file_extensions = [".so", ".a", ".h", ".bc"]
+    for tmp in tmp_buffer:
+        for word in tmp.split():
+            if any(ext in word for ext in file_extensions):
+                whitelist.append(word.split(".")[0])
+    whitelist = list(set(whitelist))
+    # Do some quick checks for things we should or shouldn't have in the list
+    if "nvcc" in whitelist:
+        raise EasyBuildError("Found 'nvcc' in whitelist: %s" % whitelist)
+    if "libcudart" not in whitelist:
+        raise EasyBuildError("Did not find 'libcudart' in whitelist: %s" % whitelist)
+    # iterate over all files in the CUDA path
+    for root, dirs, files in os.walk(self.installdir):
+        for filename in files:
+            # we only really care about real files, i.e. not symlinks
+            if not os.path.islink(os.path.join(root, filename)):
+                # check if the current file is part of the whitelist
+                basename = filename.split(".")[0]
+                if basename not in whitelist:
+                    # if it is not in the whitelist, delete the file and create a symlink to host_injections
+                    source = os.path.join(root, filename)
+                    # replace only the first occurrence of 'versions', later matches belong to the file path itself
+                    target = source.replace("versions", "host_injections", 1)
+                    os.remove(source)
+                    # os.symlink creates dangling links too, so the target does not have to exist (yet)
+                    try:
+                        os.symlink(target, source)
+                    except OSError as err:
+                        raise EasyBuildError("Failed to create symbolic link from %s to %s: %s" % (source, target, err))
+
+
+def inject_gpu_property(ec):
+    """Tag easyconfigs that depend on CUDA: add the 'gpu' Lmod arch property and EESSICUDAVERSION via modluafooter."""
+    ec_dict = ec.asdict()
+    # Check if CUDA is in the dependencies, if so add the GPU Lmod tag
+    if ("CUDA" in [dep[0] for dep in iter(ec_dict["dependencies"])]):
+        ec.log.info("[parse hook] Injecting gpu as Lmod arch property and envvar with CUDA version")
+        key = "modluafooter"
+        value = 'add_property("arch","gpu")'
+        cuda_version = 0
+        # iterate over a copy: removing entries from the list being iterated would skip dependencies
+        for dep in list(ec_dict["dependencies"]):
+            # Make CUDA (exact name, not e.g. UCX-CUDA) a build dependency only (rpathing saves us from link errors)
+            if dep[0] == "CUDA":
+                cuda_version = dep[1]
+                ec_dict["dependencies"].remove(dep)
+                if dep not in ec_dict["builddependencies"]:
+                    ec_dict["builddependencies"].append(dep)
+        value = "\n".join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version])
+        if key in ec_dict:
+            if value not in ec_dict[key]:
+                ec[key] = "\n".join([ec_dict[key], value])
+        else:
+            ec[key] = value
+    return ec
+
 
 PARSE_HOOKS = {
     'CGAL': parse_hook_cgal_toolchainopts_precise,
@@ -358,3 +441,7 @@ def pre_single_extension_isoband(ext, *args, **kwargs):
     'isoband': pre_single_extension_isoband,
     'testthat': pre_single_extension_testthat,
 }
+
+POST_SANITYCHECK_HOOKS = {
+    'CUDA': post_sanitycheck_cuda,
+}
diff --git a/eessi-2023.06-eb-4.8.1-system.yml b/eessi-2023.06-eb-4.8.1-system.yml
index b0731d2534..64ed1abc5f 100644
--- a/eessi-2023.06-eb-4.8.1-system.yml
+++ b/eessi-2023.06-eb-4.8.1-system.yml
@@ -7,3 +7,4 @@ easyconfigs:
   - EasyBuild-4.8.2.eb:
       options:
         from-pr: 19105
+  - CUDA-12.1.1.eb
diff --git a/eessi-2023.06-eb-4.8.2-2023a.yml b/eessi-2023.06-eb-4.8.2-2023a.yml
new file mode 100644
index 0000000000..01a47bbc99
--- /dev/null
+++ b/eessi-2023.06-eb-4.8.2-2023a.yml
@@ -0,0 +1,2 @@
+easyconfigs:
+  - CUDA-Samples-12.1-GCC-12.3.0-CUDA-12.1.1.eb