Skip to content

Commit

Permalink
Merge pull request #242 from KernelTuner/register_observer
Browse files Browse the repository at this point in the history
Register observer & correct clock setting
  • Loading branch information
benvanwerkhoven authored Apr 22, 2024
2 parents eceb226 + dfd3da9 commit d792304
Show file tree
Hide file tree
Showing 12 changed files with 154 additions and 124 deletions.
2 changes: 1 addition & 1 deletion CONTRIBUTING.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ Steps without :bash:`sudo` access (e.g. on a cluster):
- /path/to/directory
* [Optional] both Mamba and Miniconda can be automatically activated via :bash:`~/.bashrc`. Do not forget to add these (usually provided at the end of the installation).
* Exit the shell and re-enter to make sure Conda is available. :bash:`cd` to the kernel tuner directory.
* [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`.
* [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. On Linux, to relocate the entire default :bash:`~/.cache` directory, set the :bash:`XDG_CACHE_HOME` environment variable.
* [Optional] update Conda if available before continuing: :bash:`conda update -n base -c conda-forge conda`.
#. Setup a virtual environment: :bash:`conda create --name kerneltuner python=3.11` (or whatever Python version and environment name you prefer).
#. Activate the virtual environment: :bash:`conda activate kerneltuner`.
Expand Down
5 changes: 5 additions & 0 deletions kernel_tuner/backends/cupy.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""This module contains all Cupy specific kernel_tuner functions."""
from __future__ import print_function
from warnings import warn

import numpy as np

Expand Down Expand Up @@ -124,6 +125,7 @@ def compile(self, kernel_instance):
compiler_options = self.compiler_options
if not any(["-std=" in opt for opt in self.compiler_options]):
compiler_options = ["--std=c++11"] + self.compiler_options
# CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/main/cupy/cuda/compiler.py#L145

options = tuple(compiler_options)

Expand All @@ -132,6 +134,7 @@ def compile(self, kernel_instance):
)

self.func = self.current_module.get_function(kernel_name)
self.num_regs = self.func.num_regs
return self.func

def start_event(self):
Expand Down Expand Up @@ -197,6 +200,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
of the grid
:type grid: tuple(int, int)
"""
if stream is None:
stream = self.stream
func(grid, threads, gpu_args, stream=stream, shared_mem=self.smem_size)

def memset(self, allocation, value, size):
Expand Down
19 changes: 14 additions & 5 deletions kernel_tuner/backends/nvcuda.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""This module contains all NVIDIA cuda-python specific kernel_tuner functions."""
from warnings import warn

import numpy as np

from kernel_tuner.backends.backend import GPUBackend
from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
from kernel_tuner.util import SkippableFailure, cuda_error_check
from kernel_tuner.util import SkippableFailure, cuda_error_check, to_valid_nvrtc_gpu_arch_cc

# embedded in try block to be able to generate documentation
# and run tests without cuda-python installed
Expand Down Expand Up @@ -161,12 +163,12 @@ def compile(self, kernel_instance):
compiler_options.append(b"--std=c++11")
if not any(["--std=" in opt for opt in self.compiler_options]):
self.compiler_options.append("--std=c++11")
if not any([b"--gpu-architecture=" in opt for opt in compiler_options]):
if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
compiler_options.append(
f"--gpu-architecture=compute_{self.cc}".encode("UTF-8")
f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")
)
if not any(["--gpu-architecture=" in opt for opt in self.compiler_options]):
self.compiler_options.append(f"--gpu-architecture=compute_{self.cc}")
if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")

err, program = nvrtc.nvrtcCreateProgram(
str.encode(kernel_string), b"CUDAProgram", 0, [], []
Expand All @@ -192,6 +194,11 @@ def compile(self, kernel_instance):
)
cuda_error_check(err)

# get the number of registers per thread used in this kernel
num_regs = cuda.cuFuncGetAttribute(cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS, self.func)
assert num_regs[0] == 0, f"Retrieving number of registers per thread unsuccesful: code {num_regs[0]}"
self.num_regs = num_regs[1]

except RuntimeError as re:
_, n = nvrtc.nvrtcGetProgramLogSize(program)
log = b" " * n
Expand Down Expand Up @@ -273,6 +280,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
of the grid
:type grid: tuple(int, int)
"""
if stream is None:
stream = self.stream
arg_types = list()
for arg in gpu_args:
if isinstance(arg, cuda.CUdeviceptr):
Expand Down
2 changes: 2 additions & 0 deletions kernel_tuner/backends/pycuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,8 @@ def compile(self, kernel_instance):
)

self.func = self.current_module.get_function(kernel_name)
if not isinstance(self.func, str):
self.num_regs = self.func.num_regs
return self.func
except drv.CompileError as e:
if "uses too much shared data" in e.stderr:
Expand Down
24 changes: 16 additions & 8 deletions kernel_tuner/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ def __init__(
print("Using: " + self.dev.name)

def benchmark_default(self, func, gpu_args, threads, grid, result):
"""Benchmark one kernel execution at a time"""
"""Benchmark one kernel execution at a time."""
observers = [
obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver)
]
Expand Down Expand Up @@ -391,12 +391,8 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
for obs in self.continuous_observers:
result.update(obs.get_results())

def benchmark(self, func, gpu_args, instance, verbose, objective):
"""benchmark the kernel instance"""
logging.debug("benchmark " + instance.name)
logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads)
logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid)

def set_nvml_parameters(self, instance):
"""Set the NVML parameters. Avoids setting time leaking into benchmark time."""
if self.use_nvml:
if "nvml_pwr_limit" in instance.params:
new_limit = int(
Expand All @@ -409,6 +405,15 @@ def benchmark(self, func, gpu_args, instance, verbose, objective):
if "nvml_mem_clock" in instance.params:
self.nvml.mem_clock = instance.params["nvml_mem_clock"]

def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False):
"""Benchmark the kernel instance."""
logging.debug("benchmark " + instance.name)
logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads)
logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid)

if self.use_nvml and not skip_nvml_setting:
self.set_nvml_parameters(instance)

# Call the observers to register the configuration to be benchmarked
for obs in self.dev.observers:
obs.register_configuration(instance.params)
Expand Down Expand Up @@ -577,9 +582,12 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,

# benchmark
if func:
# setting the NVML parameters here keeps the time this takes from leaking into the benchmark time; it ends up in the framework time instead
if self.use_nvml:
self.set_nvml_parameters(instance)
start_benchmark = time.perf_counter()
result.update(
self.benchmark(func, gpu_args, instance, verbose, to.objective)
self.benchmark(func, gpu_args, instance, verbose, to.objective, skip_nvml_setting=False)
)
last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark)

Expand Down
Loading

0 comments on commit d792304

Please sign in to comment.