Tegra observer with continuous observer #275

Merged (17 commits, Oct 3, 2024)
46 changes: 46 additions & 0 deletions examples/cuda/vector_add_tegra_observer.py
@@ -0,0 +1,46 @@
#!/usr/bin/env python
"""Minimal vector_add example demonstrating the TegraObserver."""

import json

import numpy
from kernel_tuner import tune_kernel
from kernel_tuner.observers.tegra import TegraObserver


def tune():

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 800000

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    args = [c, a, b, n]

    tune_params = dict()
    tune_params["block_size_x"] = [128 + 64 * i for i in range(15)]

    tegraobserver = TegraObserver(["core_freq"])

    metrics = dict()
    metrics["f"] = lambda p: p["core_freq"]

    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, observers=[tegraobserver], metrics=metrics)

    print(results)

    return results


if __name__ == "__main__":
    tune()
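The change to kernel_tuner/core.py below also recognizes a special tegra_gr_clock tunable parameter, which is applied through the observer's tegra helper before benchmarking. A minimal sketch of extending the example above to also sweep the graphics clock; the frequency values and their unit are placeholders, since the accepted values depend on the board and on the tegra helper, which is not part of this diff:

# Hypothetical extension of the example above: also sweep the Tegra graphics clock.
# The values below are placeholders; a real run must use the frequencies supported
# by the device, in the unit expected by the tegra helper (not shown in this diff).
tune_params["tegra_gr_clock"] = [306000000, 612000000, 918000000]

results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params,
                           observers=[tegraobserver], metrics=metrics)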
16 changes: 13 additions & 3 deletions kernel_tuner/core.py
@@ -20,9 +20,8 @@
from kernel_tuner.backends.nvcuda import CudaFunctions
from kernel_tuner.backends.opencl import OpenCLFunctions
from kernel_tuner.backends.compiler import CompilerFunctions
from kernel_tuner.backends.opencl import OpenCLFunctions
from kernel_tuner.backends.hip import HipFunctions
from kernel_tuner.observers.nvml import NVMLObserver
from kernel_tuner.observers.tegra import TegraObserver
from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver, PrologueObserver

try:
@@ -316,8 +315,9 @@ def __init__(
raise ValueError("Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet")
self.dev = dev

# look for NVMLObserver in observers, if present, enable special tunable parameters through nvml
# look for NVMLObserver and TegraObserver in observers, if present, enable special tunable parameters through nvml/tegra
self.use_nvml = False
self.use_tegra = False
self.continuous_observers = []
self.output_observers = []
self.prologue_observers = []
@@ -326,6 +326,9 @@
if isinstance(obs, NVMLObserver):
self.nvml = obs.nvml
self.use_nvml = True
if isinstance(obs, TegraObserver):
self.tegra = obs.tegra
self.use_tegra = True
if hasattr(obs, "continuous_observer"):
self.continuous_observers.append(obs.continuous_observer)
if isinstance(obs, OutputObserver):
@@ -382,6 +385,7 @@ def benchmark_default(self, func, gpu_args, threads, grid, result):
for obs in self.benchmark_observers:
result.update(obs.get_results())


def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
"""Benchmark continuously for at least 'duration' seconds"""
iterations = int(np.ceil(duration / (result["time"] / 1000)))
@@ -405,6 +409,7 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
for obs in self.continuous_observers:
result.update(obs.get_results())


def set_nvml_parameters(self, instance):
"""Set the NVML parameters. Avoids setting time leaking into benchmark time."""
if self.use_nvml:
@@ -419,6 +424,11 @@ def set_nvml_parameters(self, instance):
if "nvml_mem_clock" in instance.params:
self.nvml.mem_clock = instance.params["nvml_mem_clock"]

if self.use_tegra:
if "tegra_gr_clock" in instance.params:
self.tegra.gr_clock = instance.params["tegra_gr_clock"]


def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False):
"""Benchmark the kernel instance."""
logging.debug("benchmark " + instance.name)
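As a side note on the benchmark_continuous hunk above: the iteration count simply scales the measured single-launch time (reported in milliseconds) up to at least the requested duration in seconds. A small worked sketch of that arithmetic, using an assumed 0.8 ms kernel:

import numpy as np

kernel_time_ms = 0.8   # result["time"], in milliseconds (assumed value)
duration_s = 1         # requested continuous benchmarking duration, in seconds
iterations = int(np.ceil(duration_s / (kernel_time_ms / 1000)))
print(iterations)      # 1250 launches to cover at least one second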
77 changes: 6 additions & 71 deletions kernel_tuner/observers/nvml.py
@@ -323,7 +323,7 @@ def __init__(
save_all=False,
nvidia_smi_fallback=None,
use_locked_clocks=False,
continous_duration=1,
continuous_duration=1,
):
"""Create an NVMLObserver."""
if nvidia_smi_fallback:
@@ -355,7 +355,7 @@ def __init__(
if any([obs in self.needs_power for obs in observables]):
self.measure_power = True
power_observables = [obs for obs in observables if obs in self.needs_power]
self.continuous_observer = NVMLPowerObserver(power_observables, self, self.nvml, continous_duration)
self.continuous_observer = ContinuousObserver("nvml", power_observables, self, continuous_duration=continuous_duration)

# remove power observables
self.observables = [obs for obs in observables if obs not in self.needs_power]
@@ -373,6 +373,10 @@ def __init__(
self.during_obs = [obs for obs in observables if obs in ["core_freq", "mem_freq", "temperature"]]
self.iteration = {obs: [] for obs in self.during_obs}

def read_power(self):
""" Return power in Watt """
return self.nvml.pwr_usage() / 1e3

def before_start(self):
# clear results of the observables for next measurement
self.iteration = {obs: [] for obs in self.during_obs}
@@ -428,75 +432,6 @@ def get_results(self):
return averaged_results


class NVMLPowerObserver(ContinuousObserver):
"""Observer that measures power using NVML and continuous benchmarking."""

def __init__(self, observables, parent, nvml_instance, continous_duration=1):
self.parent = parent
self.nvml = nvml_instance

supported = ["power_readings", "nvml_power", "nvml_energy"]
for obs in observables:
if obs not in supported:
raise ValueError(f"Observable {obs} not in supported: {supported}")
self.observables = observables

# duration in seconds
self.continuous_duration = continous_duration

self.power = 0
self.energy = 0
self.power_readings = []
self.t0 = 0

# results from the last iteration-based benchmark
self.results = None

def before_start(self):
self.parent.before_start()
self.power = 0
self.energy = 0
self.power_readings = []

def after_start(self):
self.parent.after_start()
self.t0 = time.perf_counter()

def during(self):
self.parent.during()
power_usage = self.nvml.pwr_usage()
timestamp = time.perf_counter() - self.t0
# only store the result if we get a new measurement from NVML
if len(self.power_readings) == 0 or (
self.power_readings[-1][1] != power_usage or timestamp - self.power_readings[-1][0] > 0.01
):
self.power_readings.append([timestamp, power_usage])

def after_finish(self):
self.parent.after_finish()
# safeguard in case we have no measurements, perhaps the kernel was too short to measure anything
if not self.power_readings:
return

# convert to seconds from milliseconds
execution_time = self.results["time"] / 1e3
self.power = np.median([d[1] / 1e3 for d in self.power_readings])
self.energy = self.power * execution_time

def get_results(self):
results = self.parent.get_results()
keys = list(results.keys())
for key in keys:
results["pwr_" + key] = results.pop(key)
if "nvml_energy" in self.observables:
results["nvml_energy"] = self.energy
if "nvml_power" in self.observables:
results["nvml_power"] = self.power
if "power_readings" in self.observables:
results["power_readings"] = self.power_readings
return results


# High-level Helper functions


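With NVMLPowerObserver removed, the power-related observables are now collected by the generic ContinuousObserver constructed above. A hedged usage sketch, assuming (as in the existing Kernel Tuner examples) that observables is the first constructor argument, since only the tail of the signature is visible in this hunk, and reusing the kernel setup from the example file at the top of this PR:

from kernel_tuner import tune_kernel
from kernel_tuner.observers.nvml import NVMLObserver

# power-related observables are delegated to the generic ContinuousObserver
nvmlobserver = NVMLObserver(["nvml_power", "nvml_energy", "power_readings"],
                            continuous_duration=2)  # benchmark for at least 2 seconds

# kernel_string, size, args and tune_params as defined in the example file above;
# the results then gain "nvml_power" (W), "nvml_energy" (J) and the raw
# [timestamp, power] pairs under "power_readings"
results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params,
                           observers=[nvmlobserver])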
75 changes: 73 additions & 2 deletions kernel_tuner/observers/observer.py
@@ -1,5 +1,6 @@
from abc import ABC, abstractmethod

import time
import numpy as np

class BenchmarkObserver(ABC):
"""Base class for Benchmark Observers"""
Expand Down Expand Up @@ -44,8 +45,78 @@ class IterationObserver(BenchmarkObserver):


class ContinuousObserver(BenchmarkObserver):
    pass
    """Generic observer that measures power during continuous benchmarking.

    To support continuous benchmarking, an observer should implement a
    .read_power() method, which the ContinuousObserver calls to read the
    current power in Watts.
    """

    def __init__(self, name, observables, parent, continuous_duration=1):
        self.parent = parent
        self.name = name

        supported = [self.name + "_power", self.name + "_energy", "power_readings"]
        for obs in observables:
            if obs not in supported:
                raise ValueError(f"Observable {obs} not in supported: {supported}")
        self.observables = observables

        # duration in seconds
        self.continuous_duration = continuous_duration

        self.power = 0
        self.energy = 0
        self.power_readings = []
        self.t0 = 0

        # results from the last iteration-based benchmark
        # these are set by the benchmarking function of Kernel Tuner before
        # the continuous observer is called
        self.results = None

    def before_start(self):
        self.parent.before_start()
        self.power = 0
        self.energy = 0
        self.power_readings = []

    def after_start(self):
        self.parent.after_start()
        self.t0 = time.perf_counter()

    def during(self):
        self.parent.during()
        power_usage = self.parent.read_power()
        timestamp = time.perf_counter() - self.t0
        # only store the result if we get a new measurement from the GPU
        if len(self.power_readings) == 0 or (
            self.power_readings[-1][1] != power_usage
            or timestamp - self.power_readings[-1][0] > 0.01
        ):
            self.power_readings.append([timestamp, power_usage])

    def after_finish(self):
        self.parent.after_finish()
        # safeguard in case we have no measurements, perhaps the kernel was too short to measure anything
        if not self.power_readings:
            return

        # convert execution time from milliseconds to seconds
        execution_time = self.results["time"] / 1e3
        self.power = np.median([d[1] for d in self.power_readings])
        self.energy = self.power * execution_time

    def get_results(self):
        results = self.parent.get_results()
        keys = list(results.keys())
        for key in keys:
            results["pwr_" + key] = results.pop(key)
        if self.name + "_power" in self.observables:
            results[self.name + "_power"] = self.power
        if self.name + "_energy" in self.observables:
            results[self.name + "_energy"] = self.energy
        if "power_readings" in self.observables:
            results["power_readings"] = self.power_readings
        return results

class OutputObserver(BenchmarkObserver):
"""Observer that can verify or measure something about the output produced by a kernel."""
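To show how another backend could hook into the new ContinuousObserver, here is a hedged sketch of a custom observer. The class name and sensor path are hypothetical, and it assumes the BenchmarkObserver base class provides default no-op hooks (before_start, after_start, during, after_finish) that the ContinuousObserver forwards to. The wiring mirrors what NVMLObserver does in this PR: implement read_power() and expose a continuous_observer attribute, which DeviceInterface picks up via hasattr(obs, "continuous_observer").

from kernel_tuner.observers.observer import BenchmarkObserver, ContinuousObserver


class MyPowerObserver(BenchmarkObserver):
    """Hypothetical observer for a board exposing its power draw via sysfs."""

    def __init__(self, continuous_duration=1):
        # delegate power/energy measurement to the generic ContinuousObserver
        self.continuous_observer = ContinuousObserver(
            "my", ["my_power", "my_energy", "power_readings"], self,
            continuous_duration=continuous_duration)

    def read_power(self):
        """Return the current power draw in Watts (placeholder sensor path)."""
        with open("/sys/class/hwmon/hwmon0/power1_input") as f:  # microwatts, hypothetical
            return int(f.read()) / 1e6

    def get_results(self):
        # nothing extra to report per kernel launch in this minimal sketch
        return {}

Passing observers=[MyPowerObserver()] to tune_kernel would then add "my_power" (in Watts) and "my_energy" (in Joules) to each configuration's results, measured over the continuous benchmarking run.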