Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Install CUDA and CUDA-Samples via the bot #381

Closed
wants to merge 13 commits into from
3 changes: 2 additions & 1 deletion EESSI-pilot-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,8 @@ done

echo ">> Creating/updating Lmod cache..."
export LMOD_RC="${EASYBUILD_INSTALLPATH}/.lmod/lmodrc.lua"
# Determine whether the PR diff touches create_lmodrc.py: take the '+++ b/<path>' lines,
# strip the single-letter 'a/' or 'b/' prefix and look for an exact match.
# lmodrc_changed holds the exit status of the final grep (0 = file was changed).
# ${pr_diff} is quoted and passed straight to grep so that an empty value results in a
# grep error (non-zero status) instead of a command that silently waits on stdin.
lmodrc_changed=$(grep '^+++' "${pr_diff}" | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep -q '^create_lmodrc\.py$'; echo $?)
# (Re)generate the lmodrc file when it does not exist yet, or when its generator changed in this PR
if [ ! -f "$LMOD_RC" ] || [ "${lmodrc_changed}" = '0' ]; then
    python3 "$TOPDIR/create_lmodrc.py" "${EASYBUILD_INSTALLPATH}"
    check_exit_code $? "$LMOD_RC created" "Failed to create $LMOD_RC"
fi
Expand Down
108 changes: 108 additions & 0 deletions create_lmodrc.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,113 @@
}
"""

GPU_LMOD_RC ="""require("strict")
local hook = require("Hook")
local open = io.open

--- Read a whole file into a string.
-- @param path path of the file to read
-- @return the complete file contents, or nil when the file cannot be opened
local function read_file(path)
    -- "rb": read mode, binary
    local handle = open(path, "rb")
    if not handle then
        return nil
    end
    -- "*a" reads everything up to the end of the file
    local data = handle:read("*a")
    handle:close()
    return data
end

-- from https://stackoverflow.com/a/40195356
--- Check whether a file or directory exists at the given path.
-- The path is renamed onto itself, which leaves it untouched but fails when it is missing.
-- @param file path to probe
-- @return true when the path exists; otherwise the (falsy) result and error message of os.rename
function exists(file)
    local renamed, message, errno = os.rename(file, file)
    if not renamed and errno == 13 then
        -- Permission denied, but it exists
        return true
    end
    return renamed, message
end

--- isVisibleHook callback: hide GPU modules when there is no host_injections tree.
-- The host_injections location is derived from $EESSI_SOFTWARE_PATH by replacing
-- 'versions' with 'host_injections'. When that path does not exist, every module that
-- carries the ("arch", "gpu") property is marked as not visible.
-- @param modT module table handed over by Lmod; modT.sn is read, modT.isVisible may be set to false
local function visible_hook(modT)
    local frameStk = require("FrameStk"):singleton()
    local mt = frameStk:mt()
    local softwarePath = os.getenv('EESSI_SOFTWARE_PATH') or ""
    -- only the first result of gsub (the rewritten string) is kept
    local hostInjectionsDir = string.gsub(softwarePath, 'versions', 'host_injections')
    if exists(hostInjectionsDir) then
        return
    end
    if mt:haveProperty(modT.sn,"arch","gpu") then
        modT.isVisible = false
    end
end

--- Extract the first "major.minor.patch" triple found in a string.
-- @param text string to scan (anything else yields nil)
-- @return major, minor, patch as numbers, or nil when no triple is present
local function parse_cuda_version(text)
    if type(text) ~= "string" then
        return nil
    end
    local major, minor, patch = string.match(text, "(%d+)%.(%d+)%.(%d+)")
    if major == nil then
        return nil
    end
    -- convert to numbers so that e.g. 9 < 12 (as strings, "9" sorts after "12")
    return tonumber(major), tonumber(minor), tonumber(patch)
end

--- Load hook: refuse to load CUDA (or modules depending on it) when the host-provided parts are missing.
-- Three checks are made, each printing an explanation to stderr and clearing the frame stack on failure:
--   1. for the CUDA module itself, the installation must exist under host_injections;
--   2. for any module with the ("arch", "gpu") property, either the compatibility libraries'
--      version.txt or the Singularity-provided libcuda.so must exist;
--   3. when version.txt exists, the version it holds must not be older than $EESSICUDAVERSION.
-- @param t table passed to the load hook; only t.modFullName is read
local function cuda_enabled_load_hook(t)
    local frameStk = require("FrameStk"):singleton()
    local mt = frameStk:mt()
    -- short name is everything before the first "/"; fall back to the full name when there is no "/"
    local simpleName = string.match(t.modFullName, "(.-)/") or t.modFullName
    -- if we try to load CUDA itself, check if the software exists in host_injections
    -- otherwise, refuse to load CUDA and print error message
    if simpleName == 'CUDA' then
        -- get the full host_injections path
        local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
        -- build final path where the CUDA software should be installed
        local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
        local cudaDirExists = exists(cudaEasyBuildDir)
        if not cudaDirExists then
            io.stderr:write("You requested to load ",simpleName,"\\n")
            io.stderr:write("While the module file exists, the actual software is not shipped with EESSI.\\n")
            io.stderr:write("In order to be able to use the CUDA module, please follow the instructions in the\\n")
            io.stderr:write("gpu_support folder. Adding the CUDA software can be as easy as:\\n")
            io.stderr:write("export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh\\n")
            frameStk:__clear()
        end
    end
    -- when loading CUDA enabled modules check if the necessary matching compatibility libraries are installed
    -- otherwise, refuse to load the requested module and print error message
    local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
    if not haveGpu then
        return
    end
    local arch = os.getenv("EESSI_CPU_FAMILY") or ""
    local cudaVersionFile = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/" .. arch .. "/latest/version.txt"
    local cudaDriverExists = exists(cudaVersionFile)
    local singularityCudaExists = exists("/.singularity.d/libs/libcuda.so")
    if not (cudaDriverExists or singularityCudaExists) then
        io.stderr:write("You requested to load ",simpleName,"\\n")
        io.stderr:write("which relies on the CUDA runtime environment and its compatibility libraries.\\n")
        io.stderr:write("In order to be able to use the module, please follow the instructions in the\\n")
        io.stderr:write("gpu_support folder. Installing the needed compatibility libraries can be as easy as:\\n")
        io.stderr:write("./add_nvidia_gpu_support.sh\\n")
        frameStk:__clear()
        return
    end
    if not cudaDriverExists then
        return
    end
    local cudaVersion = read_file(cudaVersionFile)
    local cudaVersion_req = os.getenv("EESSICUDAVERSION")
    local major, minor, patch = parse_cuda_version(cudaVersion)
    local major_req, minor_req, patch_req = parse_cuda_version(cudaVersion_req)
    -- the comparison is only possible when both versions could be parsed; an unreadable
    -- version file or an unset/malformed EESSICUDAVERSION skips the check instead of raising
    if major == nil or major_req == nil then
        return
    end
    local compat_libs_need_update = major < major_req
        or (major == major_req and (minor < minor_req
        or (minor == minor_req and patch < patch_req)))
    if compat_libs_need_update then
        io.stderr:write("You requested to load CUDA version ",cudaVersion)
        io.stderr:write("but the module you want to load requires CUDA version ",cudaVersion_req,".\\n")
        io.stderr:write("Please update your CUDA compatibility libraries in order to use ",simpleName,".\\n")
        frameStk:__clear()
    end
end

-- Register both callbacks defined above with the Hook module:
-- cuda_enabled_load_hook under "load" and visible_hook under "isVisibleHook".
hook.register("load", cuda_enabled_load_hook)
hook.register("isVisibleHook", visible_hook)
"""

def error(msg):
sys.stderr.write("ERROR: %s\n" % msg)
Expand All @@ -36,6 +143,7 @@ def error(msg):
'dot_lmod': DOT_LMOD,
'prefix': prefix,
}
lmodrc_txt += '\n' + GPU_LMOD_RC
try:
os.makedirs(os.path.dirname(lmodrc_path), exist_ok=True)
with open(lmodrc_path, 'w') as fp:
Expand Down
82 changes: 82 additions & 0 deletions eb_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def parse_hook(ec, *args, **kwargs):
if ec.name in PARSE_HOOKS:
PARSE_HOOKS[ec.name](ec, eprefix)

# inject the GPU property (if required)
ec = inject_gpu_property(ec)

def pre_prepare_hook(self, *args, **kwargs):
"""Main pre-prepare hook: trigger custom functions."""
Expand Down Expand Up @@ -209,6 +211,12 @@ def pre_configure_hook(self, *args, **kwargs):
PRE_CONFIGURE_HOOKS[self.name](self, *args, **kwargs)


def post_sanitycheck_hook(self, *args, **kwargs):
    """Main post-sanity-check hook: dispatch to the custom hook registered for this software name, if any."""
    try:
        custom_hook = POST_SANITYCHECK_HOOKS[self.name]
    except KeyError:
        # no custom post-sanity-check hook registered for this software
        return
    custom_hook(self, *args, **kwargs)


def pre_configure_hook_openblas_optarch_generic(self, *args, **kwargs):
"""
Pre-configure hook for OpenBLAS: add DYNAMIC_ARCH=1 to build/test/install options when using --optarch=GENERIC
Expand Down Expand Up @@ -328,6 +336,76 @@ def pre_single_extension_isoband(ext, *args, **kwargs):
# cfr. https://github.com/r-lib/isoband/commit/6984e6ce8d977f06e0b5ff73f5d88e5c9a44c027
ext.cfg['preinstallopts'] = "sed -i 's/SIGSTKSZ/32768/g' src/testthat/vendor/catch.h && "

def post_sanitycheck_cuda(self, *args, **kwargs):
    """
    Delete CUDA files we are not allowed to ship and replace them with a symlink
    to a possible installation under host_injections.

    The set of files that may be kept is derived from the EULA.txt found in the installation
    directory: every word between the "2.6. Attachment A" and "2.7. Attachment B" lines that
    contains one of the known file extensions contributes its name up to the first dot.

    :param self: object providing the ``installdir`` attribute (the only attribute read here)
    :raises EasyBuildError: if the derived whitelist fails the sanity checks,
                            or if a symlink cannot be created
    """
    print_msg("Replacing CUDA stuff we cannot ship with symlinks...")
    # read CUDA EULA, keeping only the lines of Attachment A
    eula_path = os.path.join(self.installdir, "EULA.txt")
    attachment_a_lines = []
    with open(eula_path) as infile:
        copy = False
        for line in infile:
            stripped = line.strip()
            if stripped == "2.6. Attachment A":
                copy = True
            elif stripped == "2.7. Attachment B":
                copy = False
            elif copy:
                attachment_a_lines.append(line)
    # create whitelist without file extensions, they're not really needed and they only complicate things
    # (a set, since it is consulted once per installed file)
    whitelist = {'EULA', 'README'}
    file_extensions = (".so", ".a", ".h", ".bc")
    for line in attachment_a_lines:
        for word in line.split():
            if any(ext in word for ext in file_extensions):
                whitelist.add(word.split(".")[0])
    # Do some quick checks for things we should or shouldn't have in the list
    if "nvcc" in whitelist:
        raise EasyBuildError("Found 'nvcc' in whitelist: %s" % sorted(whitelist))
    if "libcudart" not in whitelist:
        raise EasyBuildError("Did not find 'libcudart' in whitelist: %s" % sorted(whitelist))
    # Only the installation prefix is mapped to host_injections; the path of a file relative to
    # the installation directory is kept as is, so that names like 'versions.json' are not mangled
    host_injections_dir = self.installdir.replace("versions", "host_injections")
    # iterate over all files in the CUDA path
    for root, dirs, files in os.walk(self.installdir):
        for filename in files:
            source = os.path.join(root, filename)
            # we only really care about real files, i.e. not symlinks
            if os.path.islink(source):
                continue
            # check if the current file is part of the whitelist
            if filename.split(".")[0] in whitelist:
                continue
            # if it is not in the whitelist, delete the file and create a symlink to host_injections
            target = host_injections_dir + source[len(self.installdir):]
            os.remove(source)
            # os.symlink does not require the target to exist, so a (possibly dangling) link
            # can be created directly, without going through a shell
            try:
                os.symlink(target, source)
            except OSError as err:
                raise EasyBuildError("Failed to create symbolic link: %s -> %s (%s)" % (source, target, err))


def inject_gpu_property(ec):
    """
    Tag easyconfigs that depend on CUDA as GPU software.

    If a dependency named exactly "CUDA" is listed, it is moved from the dependencies to the
    build dependencies, and the module footer ('modluafooter') is extended with the
    ("arch", "gpu") Lmod property plus a setenv of EESSICUDAVERSION to the CUDA version.

    NOTE(review): the dependency lists that are modified are the ones returned by ec.asdict();
    whether that also updates `ec` itself depends on asdict() handing out live references - confirm.

    :param ec: easyconfig object; must provide asdict(), item assignment and a `log` attribute
    :return: the (possibly updated) easyconfig object that was passed in
    """
    ec_dict = ec.asdict()
    dependencies = ec_dict["dependencies"]
    # Only an exact name match counts: dependencies such as UCX-CUDA are left alone,
    # and must not be mistaken for the source of the CUDA version
    cuda_deps = [dep for dep in dependencies if dep[0] == "CUDA"]
    if not cuda_deps:
        return ec

    ec.log.info("[parse hook] Injecting gpu as Lmod arch property and envvar with CUDA version")
    key = "modluafooter"
    value = 'add_property("arch","gpu")'
    builddependencies = ec_dict["builddependencies"]
    cuda_version = 0
    # iterate over the separately collected matches, so removing entries from
    # the dependency list cannot cause elements to be skipped
    for dep in cuda_deps:
        # Make CUDA a build dependency only (rpathing saves us from link errors)
        cuda_version = dep[1]
        dependencies.remove(dep)
        if dep not in builddependencies:
            builddependencies.append(dep)
    value = "\n".join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version])

    existing = ec_dict.get(key)
    if existing:
        # append to an existing footer, unless it already holds the exact same statements
        if value not in existing:
            ec[key] = "\n".join([existing, value])
    else:
        # no footer yet (missing, None or empty)
        ec[key] = value
    return ec


PARSE_HOOKS = {
'CGAL': parse_hook_cgal_toolchainopts_precise,
Expand Down Expand Up @@ -358,3 +436,7 @@ def pre_single_extension_isoband(ext, *args, **kwargs):
'isoband': pre_single_extension_isoband,
'testthat': pre_single_extension_testthat,
}

# Custom post-sanity-check hooks, keyed by software name;
# post_sanitycheck_hook looks up self.name in this dict and calls the matching function.
POST_SANITYCHECK_HOOKS = {
'CUDA': post_sanitycheck_cuda,
}
1 change: 1 addition & 0 deletions eessi-2023.06-eb-4.8.1-system.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ easyconfigs:
- EasyBuild-4.8.2.eb:
options:
from-pr: 19105
- CUDA-12.1.1.eb
7 changes: 7 additions & 0 deletions eessi-2023.06-eb-4.8.2-2023a.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
easyconfigs:
- CUDA-Samples-12.1-GCC-12.3.0-CUDA-12.1.1.eb:
# use an easyconfig that only installs a subset of the CUDA samples,
# to circumvent problem with nvcc linking to glibc of host OS;
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/19189
options:
from-pr: 19189
Loading