Merge pull request #275 from KernelTuner/tegra_observer
Tegra observer with continuous observer
benvanwerkhoven authored Oct 3, 2024
2 parents f307f50 + 1e9a55b commit 0b5fffc
Showing 5 changed files with 447 additions and 76 deletions.
46 changes: 46 additions & 0 deletions examples/cuda/vector_add_tegra_observer.py
@@ -0,0 +1,46 @@
#!/usr/bin/env python
"""This is the minimal example from the README"""

import json

import numpy
from kernel_tuner import tune_kernel
from kernel_tuner.observers.tegra import TegraObserver

def tune():

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
int i = blockIdx.x * block_size_x + threadIdx.x;
if (i<n) {
c[i] = a[i] + b[i];
}
}
"""

size = 800000

a = numpy.random.randn(size).astype(numpy.float32)
b = numpy.random.randn(size).astype(numpy.float32)
c = numpy.zeros_like(b)
n = numpy.int32(size)

args = [c, a, b, n]

tune_params = dict()
tune_params["block_size_x"] = [128+64*i for i in range(15)]

tegraobserver = TegraObserver(["core_freq"])

metrics = dict()
metrics["f"] = lambda p: p["core_freq"]

results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, observers=[tegraobserver], metrics=metrics)

print(results)

return results


if __name__ == "__main__":
tune()
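
For context, each entry in the results list returned by tune_kernel above should carry the core_freq reading collected by the TegraObserver together with the derived metric f. A minimal sketch of inspecting that output, assuming only the keys used in the example (block_size_x, time, core_freq, f):

# sketch: inspect the tuning results produced by the example above
for entry in results:
    print(entry["block_size_x"], entry["time"], entry["core_freq"], entry["f"])
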
16 changes: 13 additions & 3 deletions kernel_tuner/core.py
@@ -20,9 +20,8 @@
from kernel_tuner.backends.nvcuda import CudaFunctions
from kernel_tuner.backends.opencl import OpenCLFunctions
from kernel_tuner.backends.compiler import CompilerFunctions
from kernel_tuner.backends.opencl import OpenCLFunctions
from kernel_tuner.backends.hip import HipFunctions
from kernel_tuner.observers.nvml import NVMLObserver
from kernel_tuner.observers.tegra import TegraObserver
from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver, PrologueObserver

try:
@@ -316,8 +315,9 @@ def __init__(
raise ValueError("Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet")
self.dev = dev

# look for NVMLObserver in observers, if present, enable special tunable parameters through nvml
# look for NVMLObserver and TegraObserver in observers, if present, enable special tunable parameters through nvml/tegra
self.use_nvml = False
self.use_tegra = False
self.continuous_observers = []
self.output_observers = []
self.prologue_observers = []
@@ -326,6 +326,9 @@ def __init__(
if isinstance(obs, NVMLObserver):
self.nvml = obs.nvml
self.use_nvml = True
if isinstance(obs, TegraObserver):
self.tegra = obs.tegra
self.use_tegra = True
if hasattr(obs, "continuous_observer"):
self.continuous_observers.append(obs.continuous_observer)
if isinstance(obs, OutputObserver):
@@ -382,6 +385,7 @@ def benchmark_default(self, func, gpu_args, threads, grid, result):
for obs in self.benchmark_observers:
result.update(obs.get_results())


def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
"""Benchmark continuously for at least 'duration' seconds"""
iterations = int(np.ceil(duration / (result["time"] / 1000)))
@@ -405,6 +409,7 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
for obs in self.continuous_observers:
result.update(obs.get_results())


def set_nvml_parameters(self, instance):
"""Set the NVML parameters. Avoids setting time leaking into benchmark time."""
if self.use_nvml:
@@ -419,6 +424,11 @@ def set_nvml_parameters(self, instance):
if "nvml_mem_clock" in instance.params:
self.nvml.mem_clock = instance.params["nvml_mem_clock"]

if self.use_tegra:
if "tegra_gr_clock" in instance.params:
self.tegra.gr_clock = instance.params["tegra_gr_clock"]


def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False):
"""Benchmark the kernel instance."""
logging.debug("benchmark " + instance.name)
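
The set_nvml_parameters change above means that, when a TegraObserver is attached, tegra_gr_clock becomes a special tunable parameter that is applied to the device before benchmarking. Below is a hedged sketch of a tuning script that uses it; the listed clock frequencies are placeholders (the supported values are board-specific) and the unit accepted by tegra.gr_clock is assumed here, not confirmed by this diff:

#!/usr/bin/env python
"""Sketch: tuning the Tegra GPU clock through the tegra_gr_clock parameter."""
import numpy
from kernel_tuner import tune_kernel
from kernel_tuner.observers.tegra import TegraObserver

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
    int i = blockIdx.x * block_size_x + threadIdx.x;
    if (i<n) {
        c[i] = a[i] + b[i];
    }
}
"""

size = 800000
a = numpy.random.randn(size).astype(numpy.float32)
b = numpy.random.randn(size).astype(numpy.float32)
c = numpy.zeros_like(b)
args = [c, a, b, numpy.int32(size)]

tune_params = {
    "block_size_x": [128, 256, 512],
    # picked up by set_nvml_parameters() above and applied through self.tegra.gr_clock;
    # these frequencies are placeholders, query the board for the actually supported ones
    "tegra_gr_clock": [306000000, 612000000, 1300500000],
}

tegraobserver = TegraObserver(["core_freq"])

results, env = tune_kernel("vector_add", kernel_string, size, args,
                           tune_params, observers=[tegraobserver])
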
77 changes: 6 additions & 71 deletions kernel_tuner/observers/nvml.py
@@ -323,7 +323,7 @@ def __init__(
save_all=False,
nvidia_smi_fallback=None,
use_locked_clocks=False,
continous_duration=1,
continuous_duration=1,
):
"""Create an NVMLObserver."""
if nvidia_smi_fallback:
@@ -355,7 +355,7 @@ def __init__(
if any([obs in self.needs_power for obs in observables]):
self.measure_power = True
power_observables = [obs for obs in observables if obs in self.needs_power]
self.continuous_observer = NVMLPowerObserver(power_observables, self, self.nvml, continous_duration)
self.continuous_observer = ContinuousObserver("nvml", power_observables, self, continuous_duration=continuous_duration)

# remove power observables
self.observables = [obs for obs in observables if obs not in self.needs_power]
@@ -373,6 +373,10 @@ def __init__(
self.during_obs = [obs for obs in observables if obs in ["core_freq", "mem_freq", "temperature"]]
self.iteration = {obs: [] for obs in self.during_obs}

def read_power(self):
""" Return power in Watt """
return self.nvml.pwr_usage() / 1e3

def before_start(self):
# clear results of the observables for next measurement
self.iteration = {obs: [] for obs in self.during_obs}
@@ -428,75 +432,6 @@ def get_results(self):
return averaged_results


class NVMLPowerObserver(ContinuousObserver):
"""Observer that measures power using NVML and continuous benchmarking."""

def __init__(self, observables, parent, nvml_instance, continous_duration=1):
self.parent = parent
self.nvml = nvml_instance

supported = ["power_readings", "nvml_power", "nvml_energy"]
for obs in observables:
if obs not in supported:
raise ValueError(f"Observable {obs} not in supported: {supported}")
self.observables = observables

# duration in seconds
self.continuous_duration = continous_duration

self.power = 0
self.energy = 0
self.power_readings = []
self.t0 = 0

# results from the last iteration-based benchmark
self.results = None

def before_start(self):
self.parent.before_start()
self.power = 0
self.energy = 0
self.power_readings = []

def after_start(self):
self.parent.after_start()
self.t0 = time.perf_counter()

def during(self):
self.parent.during()
power_usage = self.nvml.pwr_usage()
timestamp = time.perf_counter() - self.t0
# only store the result if we get a new measurement from NVML
if len(self.power_readings) == 0 or (
self.power_readings[-1][1] != power_usage or timestamp - self.power_readings[-1][0] > 0.01
):
self.power_readings.append([timestamp, power_usage])

def after_finish(self):
self.parent.after_finish()
# safeguard in case we have no measurements, perhaps the kernel was too short to measure anything
if not self.power_readings:
return

# convert to seconds from milliseconds
execution_time = self.results["time"] / 1e3
self.power = np.median([d[1] / 1e3 for d in self.power_readings])
self.energy = self.power * execution_time

def get_results(self):
results = self.parent.get_results()
keys = list(results.keys())
for key in keys:
results["pwr_" + key] = results.pop(key)
if "nvml_energy" in self.observables:
results["nvml_energy"] = self.energy
if "nvml_power" in self.observables:
results["nvml_power"] = self.power
if "power_readings" in self.observables:
results["power_readings"] = self.power_readings
return results


# High-level Helper functions


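
With NVMLPowerObserver removed, the power-related observables are now handled by the generic ContinuousObserver constructed in NVMLObserver.__init__ above, so usage from a tuning script stays the same. A brief sketch, with observable names taken from the supported list above and the continuous_duration keyword as renamed in this commit:

from kernel_tuner.observers.nvml import NVMLObserver

# power-related observables are routed to the generic ContinuousObserver,
# the remaining ones are handled by the NVMLObserver itself
nvmlobserver = NVMLObserver(["nvml_power", "nvml_energy", "core_freq", "temperature"],
                            continuous_duration=2)
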
75 changes: 73 additions & 2 deletions kernel_tuner/observers/observer.py
@@ -1,5 +1,6 @@
from abc import ABC, abstractmethod

import time
import numpy as np

class BenchmarkObserver(ABC):
"""Base class for Benchmark Observers"""
@@ -44,8 +45,78 @@ class IterationObserver(BenchmarkObserver):


class ContinuousObserver(BenchmarkObserver):
pass
"""Generic observer that measures power while and continuous benchmarking.
To support continuous benchmarking an Observer should support:
a .read_power() method, which the ContinuousObserver can call to read power in Watt
"""
def __init__(self, name, observables, parent, continuous_duration=1):
self.parent = parent
self.name = name

supported = [self.name + "_power", self.name + "_energy", "power_readings"]
for obs in observables:
if obs not in supported:
raise ValueError(f"Observable {obs} not in supported: {supported}")
self.observables = observables

# duration in seconds
self.continuous_duration = continuous_duration

self.power = 0
self.energy = 0
self.power_readings = []
self.t0 = 0

# results from the last iteration-based benchmark
# these are set by the benchmarking function of Kernel Tuner before
# the continuous observer is called.
self.results = None

def before_start(self):
self.parent.before_start()
self.power = 0
self.energy = 0
self.power_readings = []

def after_start(self):
self.parent.after_start()
self.t0 = time.perf_counter()

def during(self):
self.parent.during()
power_usage = self.parent.read_power()
timestamp = time.perf_counter() - self.t0
# only store the result if we get a new measurement from the GPU
if len(self.power_readings) == 0 or (
self.power_readings[-1][1] != power_usage
or timestamp - self.power_readings[-1][0] > 0.01
):
self.power_readings.append([timestamp, power_usage])

def after_finish(self):
self.parent.after_finish()
# safeguard in case we have no measurements, perhaps the kernel was too short to measure anything
if not self.power_readings:
return

# convert to seconds from milliseconds
execution_time = self.results["time"] / 1e3
self.power = np.median([d[1] for d in self.power_readings])
self.energy = self.power * execution_time

def get_results(self):
results = self.parent.get_results()
keys = list(results.keys())
for key in keys:
results["pwr_" + key] = results.pop(key)
if self.name + "_power" in self.observables:
results[self.name + "_power"] = self.power
if self.name + "_energy" in self.observables:
results[self.name + "_energy"] = self.energy
if "power_readings" in self.observables:
results["power_readings"] = self.power_readings
return results

class OutputObserver(BenchmarkObserver):
"""Observer that can verify or measure something about the output produced by a kernel."""
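
The refactored ContinuousObserver above is a generic wrapper: any observer that exposes a read_power() method returning Watts and a continuous_observer attribute (the hook core.py checks for) can reuse the continuous power and energy measurement. A hedged sketch of a minimal custom observer that would plug into this mechanism; ExampleMeterObserver and its fixed power reading are hypothetical:

from kernel_tuner.observers.observer import BenchmarkObserver, ContinuousObserver

class ExampleMeterObserver(BenchmarkObserver):
    """Hypothetical observer backed by an external power meter."""

    def __init__(self, observables, continuous_duration=1):
        # power-related observables are delegated to the generic ContinuousObserver
        power_observables = [obs for obs in observables
                             if obs in ("meter_power", "meter_energy", "power_readings")]
        self.observables = [obs for obs in observables if obs not in power_observables]
        if power_observables:
            self.continuous_observer = ContinuousObserver(
                "meter", power_observables, self, continuous_duration=continuous_duration)

    def read_power(self):
        """Return the current power draw in Watts (placeholder reading)."""
        return 42.0  # a real implementation would query the meter here

    def get_results(self):
        # nothing besides the power observables is measured in this sketch
        return {}
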
