Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add a CUDA device code sanity check #4692

Draft
wants to merge 1 commit into
base: 5.0.x
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 65 additions & 2 deletions easybuild/framework/easyblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
* Maxime Boissonneault (Compute Canada)
* Davide Vanzo (Vanderbilt University)
* Caspar van Leeuwen (SURF)
* Jasper Grimm (UoY)
"""
import concurrent
import copy
Expand Down Expand Up @@ -101,8 +102,9 @@
from easybuild.tools.output import show_progress_bars, start_progress_bar, stop_progress_bar, update_progress_bar
from easybuild.tools.package.utilities import package
from easybuild.tools.repository.repository import init_repository
from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_linked_libs_raw
from easybuild.tools.systemtools import get_shared_lib_ext, pick_system_specific_value, use_group
from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_cuda_device_code_architectures
from easybuild.tools.systemtools import get_linked_libs_raw, get_shared_lib_ext, pick_system_specific_value, use_group
from easybuild.tools.toolchain.toolchain import TOOLCHAIN_CAPABILITY_CUDA
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'easybuild.tools.toolchain.toolchain.TOOLCHAIN_CAPABILITY_CUDA' imported but unused

from easybuild.tools.utilities import INDENT_4SPACES, get_class_for, nub, quote_str
from easybuild.tools.utilities import remove_unwanted_chars, time2str, trace_msg
from easybuild.tools.version import this_is_easybuild, VERBOSE_VERSION, VERSION
Expand Down Expand Up @@ -3193,6 +3195,59 @@ def _sanity_check_step_multi_deps(self, *args, **kwargs):
self.cfg['builddependencies'] = builddeps
self.cfg.iterating = False

def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True):
    """
    Sanity check that binaries/libraries contain device code for the correct architecture targets.

    The compute capabilities found in the device code of each file are compared with the configured
    'cuda_compute_capabilities' (build option first, easyconfig parameter as fallback).

    :param cuda_dirs: subdirectories of the installation directory to scan; when None, the 'bin_lib_subdirs'
                      easyconfig parameter is used, then self.bin_lib_subdirs(), then DEFAULT_BIN_LIB_SUBDIRS
    :param check_cuobjdump: not used by this method at the moment; kept so the signature stays stable
    :return: list of failure messages, one per file whose device code does not match (empty if no mismatch)
    """

    self.log.info("Checking binaries/libraries for CUDA device code...")

    fails = []
    cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None)

    # without configured compute capabilities there is nothing to compare against;
    # bail out early rather than crashing on set(None) further down
    if not cfg_ccs:
        self.log.info("No CUDA compute capabilities specified, so skipping CUDA device code sanity check")
        return fails
    cfg_ccs = set(cfg_ccs)

    if cuda_dirs is None:
        cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs()

    if cuda_dirs:
        self.log.info("Using specified subdirectories for binaries/libraries to verify CUDA device code: %s",
                      cuda_dirs)
    else:
        cuda_dirs = DEFAULT_BIN_LIB_SUBDIRS
        self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s",
                      cuda_dirs)

    for dirpath in (os.path.join(self.installdir, d) for d in cuda_dirs):
        # isdir (rather than exists) so that a regular file with this name never reaches os.listdir
        if not os.path.isdir(dirpath):
            self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}")
            continue

        self.log.debug(f"Sanity checking files for CUDA device code in {dirpath}")

        # sorted so that the order of failure messages is reproducible
        for path in (os.path.join(dirpath, x) for x in sorted(os.listdir(dirpath))):
            self.log.debug("Sanity checking for CUDA device code in %s", path)

            derived_ccs = get_cuda_device_code_architectures(path)

            # both None and an empty list mean that no device code architectures were found
            if not derived_ccs:
                msg = f"No CUDA device code found in {path}, so skipping it in CUDA device code sanity check"
                self.log.debug(msg)
                continue

            # check whether device code architectures match cuda_compute_capabilities
            derived_ccs = set(derived_ccs)
            additional_ccs = sorted(derived_ccs - cfg_ccs)
            missing_ccs = sorted(cfg_ccs - derived_ccs)

            if additional_ccs or missing_ccs:
                fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. "
                if additional_ccs:
                    fail_msg += "Surplus compute capabilities: %s. " % ', '.join(additional_ccs)
                if missing_ccs:
                    fail_msg += "Missing compute capabilities: %s." % ', '.join(missing_ccs)
                self.log.warning(fail_msg)
                fails.append(fail_msg)
            else:
                msg = (f"Output of 'cuobjdump' checked for {path}; device code architectures match "
                       "those in cuda_compute_capabilities")
                self.log.debug(msg)

    return fails

def sanity_check_rpath(self, rpath_dirs=None, check_readelf_rpath=True):
"""Sanity check binaries/libraries w.r.t. RPATH linking."""

Expand Down Expand Up @@ -3782,6 +3837,14 @@ def xs2str(xs):
else:
self.log.debug("Skipping RPATH sanity check")

if get_software_root('CUDA'):
cuda_fails = self.sanity_check_cuda()
if cuda_fails:
self.log.warning("CUDA device code sanity check failed!")
self.sanity_check_fail_msgs.extend(cuda_fails)
else:
self.log.debug("Skipping CUDA device code sanity check")

# pass or fail
if self.sanity_check_fail_msgs:
raise EasyBuildError(
Expand Down
59 changes: 59 additions & 0 deletions easybuild/tools/systemtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

* Jens Timmerman (Ghent University)
* Ward Poelmans (Ghent University)
* Jasper Grimm (UoY)
"""
import ctypes
import errno
Expand Down Expand Up @@ -963,6 +964,64 @@ def get_glibc_version():
return glibc_ver


def get_cuda_object_dump_raw(path):
    """
    Get raw output from command which extracts information from CUDA binary files in a human-readable format,
    or None for files containing no CUDA device code.
    See https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html#cuobjdump

    :param path: path to the file to inspect
    :return: output of 'cuobjdump' as a string; None if the 'file' command fails, if the file is not reported
             as an executable/object/library, or if 'cuobjdump' exits with a non-zero exit code
    """
    # function-scope import so this block does not depend on the module-level import list
    import shlex

    # quote the path so that spaces or shell metacharacters in it cannot break the commands
    quoted_path = shlex.quote(path)

    file_cmd = f"file {quoted_path}"
    res = run_shell_cmd(file_cmd, fail_on_error=False, hidden=True, output_file=False, stream_output=False)
    if res.exit_code != EasyBuildExit.SUCCESS:
        fail_msg = "Failed to run '%s': %s" % (file_cmd, res.output)
        _log.warning(fail_msg)
        # the output of a failed command says nothing reliable about the file type
        return None

    # check that the file is an executable or library/object
    if not any(x in res.output for x in ['executable', 'object', 'library']):
        return None

    cuda_cmd = f"cuobjdump {quoted_path}"
    res = run_shell_cmd(cuda_cmd, fail_on_error=False, hidden=True, output_file=False, stream_output=False)
    if res.exit_code == EasyBuildExit.SUCCESS:
        return res.output

    msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'"
    _log.debug(msg % (path, cuda_cmd, res.output))
    return None


def get_cuda_device_code_architectures(path):
    """
    Get list of supported CUDA architectures, by inspecting the device code of an executable/library. The format is the
    same as cuda_compute_capabilities (e.g. ['8.6', '9.0'] for sm_86 sm_90).
    Returns None if no CUDA device code is present in the file.

    :param path: path to the executable/library to inspect (symlinks are resolved first)
    :return: sorted list of unique compute capabilities as strings, or None if none could be determined
    """

    # cuobjdump uses the sm_XY format; the last digit (plus optional 'a' suffix) is the minor version,
    # all digits before it form the major version, so sm_86 -> 8.6, sm_90a -> 9.0a, sm_100 -> 10.0
    device_code_regex = re.compile(r'(?<=arch = sm_)([0-9]+)([0-9]a?)')

    # resolve symlinks
    if os.path.islink(path) and os.path.exists(path):
        path = os.path.realpath(path)

    cuda_raw = get_cuda_object_dump_raw(path)
    if cuda_raw is None:
        return None

    # extract unique architectures from raw dump;
    # findall returns an empty list (never None) when nothing matches
    matches = device_code_regex.findall(cuda_raw)
    if not matches:
        fail_msg = f"Failed to determine supported CUDA architectures from {path}"
        _log.warning(fail_msg)
        return None

    # convert match tuples into unique list of cuda compute capabilities
    # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0']
    return sorted('.'.join(m) for m in set(matches))


def get_linked_libs_raw(path):
"""
Get raw output from command that reports linked libraries for dynamically linked executables/libraries,
Expand Down
Loading