Skip to content

Commit

Permalink
Tests for RAPL and ZeusMonitor (#100)
Browse files Browse the repository at this point in the history
Co-authored-by: Jae-Won Chung <jwnchung@umich.edu>
  • Loading branch information
wbjin and jaywonchung authored Aug 15, 2024
1 parent 71af89c commit b19fb14
Show file tree
Hide file tree
Showing 4 changed files with 246 additions and 16 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ pydocstyle.convention = "google"
"zeus/optimizer/batch_size/common.py" = ["N805"]
"zeus/device/gpu/*.py" = ["N802", "N803"]
"zeus/device/cpu/*.py" = ["N802"]
"zeus/utils/testing.py" = ["N802"]

[tool.pytest.ini_options]
addopts = "--numprocesses auto"
179 changes: 166 additions & 13 deletions tests/device/cpu/test_rapl.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,12 @@
import os
import pytest
from typing import Generator, TYPE_CHECKING, Sequence
from time import sleep
from unittest.mock import patch, mock_open, create_autospec, MagicMock
import unittest.mock as mock
from sys import stdout, stderr
import warnings

import multiprocessing as mp


if TYPE_CHECKING:
from pathlib import Path

Expand Down Expand Up @@ -56,24 +54,32 @@ def __exit__(self, exc_type, exc_value, traceback):
@pytest.fixture
def mock_rapl_values():
rapl_values = [
"1000",
"900",
"800",
"700",
"600",
"500",
"400",
"500",
"200",
"100",
"100000",
"90000",
"80000",
"70000",
"60000",
"50000",
"40000",
"50000",
"20000",
"10000",
]
mocked_rapl_file = MockRaplFile(RAPL_DIR + "/intel-rapl:0/energy_uj", rapl_values)
mocked_rapl_file_name = mock_open()
mocked_rapl_file_name.return_value.read.return_value = "package"
mocked_rapl_file_max = mock_open()
mocked_rapl_file_max.return_value.read.return_value = "100000"

real_open = builtins.open

def mock_file_open(filepath, *args, **kwargs):
if filepath == (RAPL_DIR + "/intel-rapl:0/energy_uj"):
return mocked_rapl_file
if filepath == (RAPL_DIR + "/intel-rapl:0/name"):
return mocked_rapl_file_name()
if filepath == (RAPL_DIR + "/intel-rapl:0/max_energy_range_uj"):
return mocked_rapl_file_max()
else:
return real_open(filepath, *args, **kwargs)

Expand All @@ -92,8 +98,155 @@ def mock_file_open(filepath, *args, **kwargs):
patch_sleep.stop()


@pytest.fixture()
def mock_rapl_wraparound_tracker():
    """Patch `RaplWraparoundTracker` in `zeus.device.cpu.rapl` and yield its mocked instance.

    The yielded mock's `get_num_wraparounds` returns 0 on the first call and 5
    on the second call. The patch is reverted when the fixture is torn down.
    """
    with patch("zeus.device.cpu.rapl.RaplWraparoundTracker") as tracker_cls:
        # `return_value` is the object produced when the patched class is instantiated.
        tracker = tracker_cls.return_value
        tracker.get_num_wraparounds.side_effect = [0, 5]
        yield tracker


def test_rapl_polling_process(mock_rapl_values):
    """Run `_polling_process` against the mocked energy file and check the wraparound count."""
    energy_file = RAPL_DIR + "/intel-rapl:0/energy_uj"
    counter = mp.Value("i", 0)

    # The call is expected to terminate by raising once the mocked file has no values left.
    with pytest.raises(MockRaplFileOutOfValues):
        _polling_process(energy_file, 1000, counter)

    assert counter.value == 8


# RAPLFile tests
@patch("os.path.exists", return_value=False)
def test_rapl_available(mock_exists):
    """Check that `rapl_is_available` returns False when `os.path.exists` reports False.

    This function must not be decorated with `@pytest.fixture`: pytest does not
    collect fixture functions as tests, so the assertion would never run.

    Args:
        mock_exists: The mock injected by the `patch` decorator in place of
            `os.path.exists`.
    """
    assert rapl_is_available() is False


def test_rapl_file_class(mock_rapl_values, mock_rapl_wraparound_tracker):
    """Check `RAPLFile` construction and two consecutive `read` calls."""
    zone = RAPLFile("/sys/class/powercap/intel-rapl/intel-rapl:0")

    # State captured at construction time.
    assert zone.name == "package"
    assert zone.last_energy == 100000.0
    assert zone.max_energy_range_uj == 100000.0

    # First read: the mocked tracker reports 0 wraparounds.
    first_reading = zone.read()
    assert first_reading == 90.0

    # Second read: the mocked tracker reports 5 wraparounds,
    # i.e. (80000 + 5 * 100000) / 1000.
    second_reading = zone.read()
    assert second_reading == 580.0


def test_rapl_file_class_exceptions():
    """Check that `RAPLFile` raises `ZeusRAPLFileInitError` when any file open fails."""
    zone_path = "/sys/class/powercap/intel-rapl/intel-rapl:0"

    def readable(contents):
        """Return a mocked file handle whose reads yield `contents`."""
        return mock_open(read_data=contents).return_value

    with patch("builtins.open", mock_open()) as fake_open:

        def assert_init_fails(open_behavior):
            """Install `open_behavior` as the side effect of `open` and expect an init error."""
            fake_open.side_effect = open_behavior
            with pytest.raises(ZeusRAPLFileInitError):
                RAPLFile(zone_path)

        # Fails on the very first open (the name file).
        assert_init_fails(FileNotFoundError)

        # First open succeeds; fails on the second open (the energy_uj file).
        assert_init_fails([readable("package"), FileNotFoundError])

        # First two opens succeed; fails on the third open (the max energy range file).
        assert_init_fails(
            [readable("package"), readable("1000000"), FileNotFoundError]
        )


# RAPLCPU tests
@pytest.fixture()
def mock_os_listdir_cpu(mocker):
    """Patch `os.listdir` to return one package entry and one subpackage entry."""
    fake_entries = ["intel-rapl:0", "intel-rapl:0:0"]
    listdir_patch = mocker.patch("os.listdir", return_value=fake_entries)
    return listdir_patch


def create_rapl_file_mock(name="package", read_value=1000.0):
    """Build an autospecced `RAPLFile` instance mock.

    Args:
        name: Value assigned to the mock's `name` attribute.
        read_value: Value returned by the mock's `read()` method.

    Returns:
        The configured mock instance.
    """
    fake_file = create_autospec(RAPLFile, instance=True)
    fake_file.read.return_value = read_value
    fake_file.name = name
    return fake_file


def test_rapl_cpu_class(mocker, mock_os_listdir_cpu):
    """Check `RAPLCPU` when both the package and the DRAM subpackage initialize."""
    package_file = create_rapl_file_mock()
    dram_file = create_rapl_file_mock(name="dram", read_value=500.0)

    def pick_rapl_file(path):
        """Return the DRAM mock for paths containing "0:0", the package mock otherwise."""
        return dram_file if "0:0" in path else package_file

    mocker.patch("zeus.device.cpu.rapl.RAPLFile", side_effect=pick_rapl_file)

    cpu = RAPLCPU(cpu_index=0)
    measurement = cpu.getTotalEnergyConsumption()

    expected_path = os.path.join(RAPL_DIR, "intel-rapl:0")
    assert cpu.path == expected_path
    assert cpu.rapl_file == package_file
    assert cpu.dram == dram_file
    # The measurement should carry whatever each mocked file's `read()` returns.
    assert measurement.cpu_mj == package_file.read.return_value
    assert measurement.dram_mj == dram_file.read.return_value


def test_rapl_cpu_class_exceptions(mocker, mock_os_listdir_cpu):
    """Check that `RAPLCPU` warns and leaves `dram` unset when a subpackage fails to initialize."""
    mock_rapl_file_package = create_rapl_file_mock()

    def rapl_file_side_effect(path):
        """Raise for paths containing "0:0"; return the package mock otherwise."""
        if "0:0" in path:
            raise ZeusRAPLFileInitError("Initilization Error")
        return mock_rapl_file_package

    mocker.patch("zeus.device.cpu.rapl.RAPLFile", side_effect=rapl_file_side_effect)
    with warnings.catch_warnings(record=True) as caught:
        # Record every warning regardless of ambient filters or the
        # once-per-location registry, so the check below is deterministic.
        warnings.simplefilter("always")
        cpu = RAPLCPU(cpu_index=0)

    # Fail with a clear message instead of an IndexError if nothing was recorded.
    assert caught, "expected a warning about the failed subpackage initialization"
    assert "Failed to initialize subpackage" in str(caught[-1].message)

    assert cpu.path == os.path.join(RAPL_DIR, "intel-rapl:0")
    assert cpu.rapl_file == mock_rapl_file_package
    assert cpu.dram is None


# RAPLCPUs tests
def test_rapl_cpus_class(mocker):
    """Check that `RAPLCPUs` builds one CPU object per globbed RAPL path when RAPL is available."""
    zone_paths = [f"{RAPL_DIR}/intel-rapl:0", f"{RAPL_DIR}/intel-rapl:1"]

    mocker.patch("zeus.device.cpu.rapl.rapl_is_available", return_value=True)
    mocker.patch("zeus.device.cpu.rapl.glob", return_value=zone_paths)

    # Replace the `RAPLCPU` constructor so each call hands back a prepared stub.
    cpu_ctor = mocker.patch("zeus.device.cpu.rapl.RAPLCPU")
    cpu_stub = MagicMock(spec=RAPLCPU)
    cpu_ctor.side_effect = [cpu_stub, cpu_stub]

    manager = RAPLCPUs()

    assert len(manager.cpus) == 2
    for cpu in manager.cpus:
        assert isinstance(cpu, MagicMock)
    assert cpu_ctor.call_count == 2


def test_rapl_cpus_class_init_error(mocker):
    """Check that constructing `RAPLCPUs` raises when RAPL is reported unavailable."""
    mocker.patch("zeus.device.cpu.rapl.rapl_is_available", return_value=False)

    expect_unsupported = pytest.raises(
        ZeusRAPLNotSupportedError, match="RAPL is not supported on this CPU."
    )
    with expect_unsupported:
        RAPLCPUs()
81 changes: 78 additions & 3 deletions tests/test_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
from zeus.monitor import Measurement, ZeusMonitor
from zeus.utils.testing import ReplayZeusMonitor
import zeus.device.gpu
import zeus.device.cpu
from zeus.device.cpu.common import CpuDramMeasurement, CPUs, CPU

if TYPE_CHECKING:
from pathlib import Path
Expand All @@ -37,12 +39,56 @@
pynvml.NVML_DEVICE_ARCH_VOLTA,
pynvml.NVML_DEVICE_ARCH_AMPERE,
]
# Number of mock CPUs created by `MockCPUs` and passed to `ZeusMonitor` as `cpu_indices`.
NUM_CPUS = 2


class MockCPU(CPU):
    """A single mock CPU whose energy readings come from deterministic counters."""

    def __init__(self, index):
        """Set up the counters backing the mock CPU with the given index.

        The CPU counter starts at 1000 and advances by 10 per reading. Only
        even-indexed CPUs get a DRAM counter (starting at 200, step 5);
        odd-indexed CPUs have `dram_energy` set to None.
        """
        self.index = index
        self.cpu_energy = itertools.count(start=1000, step=10)
        if index % 2 == 0:
            self.dram_energy = itertools.count(start=200, step=5)
        else:
            self.dram_energy = None

    def getTotalEnergyConsumption(self):
        """Advance the counters and return the next CPU (and DRAM, if any) reading. Units: mJ."""
        cpu_mj = float(next(self.cpu_energy))
        # The DRAM counter is only advanced when this CPU has one.
        dram_mj = None if self.dram_energy is None else float(next(self.dram_energy))
        return CpuDramMeasurement(cpu_mj=cpu_mj, dram_mj=dram_mj)

    def supportsGetDramEnergyConsumption(self):
        """Return True if this mock CPU has a DRAM energy counter."""
        has_dram_counter = self.dram_energy is not None
        return has_dram_counter


class MockCPUs(CPUs):
    """Mock CPU manager holding `NUM_CPUS` `MockCPU` objects for testing."""

    def __init__(self):
        """Create one `MockCPU` per index in `range(NUM_CPUS)`."""
        self._cpus = list(map(MockCPU, range(NUM_CPUS)))

    @property
    def cpus(self) -> Sequence[CPU]:
        """Return the mock CPU objects held by this manager."""
        return self._cpus

    def __del__(self) -> None:
        """Do nothing on deletion; the mock manager does no cleanup of its own."""


@pytest.fixture(autouse=True, scope="function")
def reset_gpus() -> None:
"""Reset the global variable `_gpus` to None on every test."""
def reset_gpus_and_cpus() -> None:
"""Reset the global variable `_gpus` and `_cpus` to None on every test."""
zeus.device.gpu._gpus = None
zeus.device.cpu._cpus = None


@pytest.fixture
Expand Down Expand Up @@ -209,11 +255,14 @@ def get_energy(self, start: float, end: float) -> dict[int, float]:

# Patch zeus.device.gpu.nvml_is_available so that it always returns True during testing.
mocker.patch("zeus.device.gpu.nvml_is_available", return_value=True)
mocker.patch("zeus.device.cpu._cpus", new=MockCPUs())

########################################
# Test ZeusMonitor initialization.
########################################
monitor = ZeusMonitor(gpu_indices=gpu_indices, log_file=log_file)
monitor = ZeusMonitor(
gpu_indices=gpu_indices, cpu_indices=list(range(NUM_CPUS)), log_file=log_file
)

# Check GPU index parsing from the log file.
replay_monitor = ReplayZeusMonitor(gpu_indices=None, log_file=log_file)
Expand All @@ -227,6 +276,10 @@ def tick():
next(time_counter)
for counter in energy_counters.values():
next(counter)
for i in range(len(monitor.cpu_indices)):
next(monitor.cpus._cpus[i].cpu_energy)
if i % 2 == 0:
next(monitor.cpus._cpus[i].dram_energy)

def assert_window_begin(name: str, begin_time: int):
"""Assert monitor measurement states right after a window begins."""
Expand All @@ -237,6 +290,16 @@ def assert_window_begin(name: str, begin_time: int):
for i in torch_gpu_indices
if not is_old_torch[i]
}
assert monitor.measurement_states[name].cpu_energy == {
i: pytest.approx((1000 + 10 * (begin_time - 4)) / 1000.0)
for i in range(len(monitor.cpu_indices))
}
assert monitor.measurement_states[name].dram_energy == {
i: pytest.approx((200 + 5 * (begin_time - 4)) / 1000.0)
if i % 2 == 0
else None
for i in range(0, len(monitor.cpu_indices), 2)
}
pynvml_mock.nvmlDeviceGetTotalEnergyConsumption.assert_has_calls(
[call(f"handle{i}") for i in nvml_gpu_indices if not is_old_nvml[i]]
)
Expand Down Expand Up @@ -269,6 +332,18 @@ def assert_measurement(
elapsed_time * 3 / 1000.0
)

if measurement.cpu_energy is not None:
for i in measurement.cpu_energy.keys():
assert measurement.cpu_energy[i] == pytest.approx(
elapsed_time * 10 / 1000.0
)

if measurement.dram_energy is not None:
for i in measurement.dram_energy.keys():
assert measurement.dram_energy[i] == pytest.approx(
elapsed_time * 5 / 1000.0
)

if not assert_calls:
return

Expand Down
1 change: 1 addition & 0 deletions zeus/device/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class ZeusBaseCPUError(ZeusBaseError):

def __init__(self, message: str) -> None:
"""Initialize Base Zeus Exception."""
super().__init__(message)


class ZeusdError(ZeusBaseGPUError):
Expand Down

0 comments on commit b19fb14

Please sign in to comment.