Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tests for CPU monitoring on ZeusMonitor #100

Merged
merged 8 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ pydocstyle.convention = "google"
"zeus/optimizer/batch_size/common.py" = ["N805"]
"zeus/device/gpu/*.py" = ["N802", "N803"]
"zeus/device/cpu/*.py" = ["N802"]
"zeus/utils/testing.py" = ["N802"]

[tool.pytest.ini_options]
addopts = "--numprocesses auto"
176 changes: 166 additions & 10 deletions tests/device/cpu/test_rapl.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
RaplWraparoundTracker,
_polling_process,
)

wbjin marked this conversation as resolved.
Show resolved Hide resolved
from zeus.device.cpu.common import CpuDramMeasurement


Expand Down Expand Up @@ -56,24 +57,32 @@ def __exit__(self, exc_type, exc_value, traceback):
@pytest.fixture
def mock_rapl_values():
rapl_values = [
"1000",
"900",
"800",
"700",
"600",
"500",
"400",
"500",
"200",
"100",
"100000",
"90000",
"80000",
"70000",
"60000",
"50000",
"40000",
"50000",
"20000",
"10000",
]
mocked_rapl_file = MockRaplFile(RAPL_DIR + "/intel-rapl:0/energy_uj", rapl_values)
mocked_rapl_file_name = mock_open()
mocked_rapl_file_name.return_value.read.return_value = "package"
mocked_rapl_file_max = mock_open()
mocked_rapl_file_max.return_value.read.return_value = "100000"

real_open = builtins.open

def mock_file_open(filepath, *args, **kwargs):
if filepath == (RAPL_DIR + "/intel-rapl:0/energy_uj"):
return mocked_rapl_file
if filepath == (RAPL_DIR + "/intel-rapl:0/name"):
return mocked_rapl_file_name()
if filepath == (RAPL_DIR + "/intel-rapl:0/max_energy_range_uj"):
return mocked_rapl_file_max()
else:
return real_open(filepath, *args, **kwargs)

Expand All @@ -92,8 +101,155 @@ def mock_file_open(filepath, *args, **kwargs):
patch_sleep.stop()


@pytest.fixture()
def mock_rapl_wraparound_tracker():
    """Patch `RaplWraparoundTracker` in `zeus.device.cpu.rapl` and yield its mock instance.

    The yielded mock's `get_num_wraparounds` returns 0 on its first call and
    5 on its second call. The patch is undone once the fixture is torn down.
    """
    with patch("zeus.device.cpu.rapl.RaplWraparoundTracker") as tracker_cls:
        tracker = tracker_cls.return_value
        tracker.get_num_wraparounds.side_effect = [0, 5]
        yield tracker


def test_rapl_polling_process(mock_rapl_values):
    """Run `_polling_process` on the mocked energy file and check the wraparound count.

    The polling call is expected to end by raising `MockRaplFileOutOfValues`;
    the shared counter is inspected afterwards.
    """
    energy_file = RAPL_DIR + "/intel-rapl:0/energy_uj"
    counter = mp.Value("i", 0)

    with pytest.raises(MockRaplFileOutOfValues):
        _polling_process(energy_file, 1000, counter)

    # NOTE(review): 8 presumably equals the number of decreases between
    # consecutive mocked readings -- confirm against `_polling_process`.
    assert counter.value == 8


# RAPLFile tests
@patch("os.path.exists", return_value=False)
def test_rapl_available(mock_exists):
    """Check that `rapl_is_available` returns False when `os.path.exists` returns False.

    This function must not be decorated with `@pytest.fixture`: doing so
    registers it as a fixture, and it is then never executed as a test.
    """
    assert rapl_is_available() is False


def test_rapl_file_class(mock_rapl_values, mock_rapl_wraparound_tracker):
    """Test `RAPLFile` initialization and two consecutive `read` calls."""
    rapl_file = RAPLFile("/sys/class/powercap/intel-rapl/intel-rapl:0")

    # Attributes populated during initialization from the mocked files.
    assert rapl_file.name == "package"
    assert rapl_file.last_energy == 100000.0
    assert rapl_file.max_energy_range_uj == 100000.0

    # First read: the mocked tracker reports 0 wraparounds.
    first_reading = rapl_file.read()
    assert first_reading == 90.0

    # Second read: the mocked tracker reports 5 wraparounds.
    second_reading = rapl_file.read()
    assert second_reading == 580.0  # (80000+5*100000)/1000


def test_rapl_file_class_exceptions():
    """Test that `RAPLFile` raises `ZeusRAPLFileInitError` when an `open` call fails.

    `builtins.open` is patched, and three scenarios are exercised in turn:
    the first, the second, and the third `open` call raising
    `FileNotFoundError`.
    """
    rapl_path = "/sys/class/powercap/intel-rapl/intel-rapl:0"

    def opened(content):
        # File handle mock whose `read()` yields `content`.
        return mock_open(read_data=content).return_value

    with patch("builtins.open", mock_open()) as mock_file:
        # Scenario 1: the very first open (the name file) fails.
        mock_file.side_effect = FileNotFoundError
        with pytest.raises(ZeusRAPLFileInitError):
            RAPLFile(rapl_path)

        # Scenario 2: the name is read, then opening the energy_uj file fails.
        mock_file.side_effect = [opened("package"), FileNotFoundError]
        with pytest.raises(ZeusRAPLFileInitError):
            RAPLFile(rapl_path)

        # Scenario 3: name and energy are read, then opening the maximum
        # energy range file fails.
        mock_file.side_effect = [
            opened("package"),
            opened("1000000"),
            FileNotFoundError,
        ]
        with pytest.raises(ZeusRAPLFileInitError):
            RAPLFile(rapl_path)


# RAPLCPU tests
@pytest.fixture()
def mock_os_listdir_cpu(mocker):
    """Patch `os.listdir` to return one package entry and one subpackage entry."""
    fake_entries = ["intel-rapl:0", "intel-rapl:0:0"]
    return mocker.patch("os.listdir", return_value=fake_entries)


def create_rapl_file_mock(name="package", read_value=1000.0):
    """Build an autospecced `RAPLFile` instance mock.

    Args:
        name: Value assigned to the mock's `name` attribute.
        read_value: Value the mock's `read()` method returns.

    Returns:
        The configured mock instance.
    """
    rapl_file = create_autospec(RAPLFile, instance=True)
    rapl_file.read.return_value = read_value
    rapl_file.name = name
    return rapl_file


def test_rapl_cpu_class(mocker, mock_os_listdir_cpu):
    """Test `RAPLCPU` when both a package file and a DRAM subpackage file exist."""
    package_file = create_rapl_file_mock()
    dram_file = create_rapl_file_mock(name="dram", read_value=500.0)

    def pick_rapl_file(path):
        # Paths containing "0:0" get the DRAM mock; all others get the package mock.
        return dram_file if "0:0" in path else package_file

    mocker.patch("zeus.device.cpu.rapl.RAPLFile", side_effect=pick_rapl_file)

    cpu = RAPLCPU(cpu_index=0)
    measurement = cpu.getTotalEnergyConsumption()

    # The CPU object should point at the package directory and hold both mocks.
    expected_path = os.path.join(RAPL_DIR, "intel-rapl:0")
    assert cpu.path == expected_path
    assert cpu.rapl_file == package_file
    assert cpu.dram == dram_file

    # The measurement should carry the values produced by the mocked `read()` calls.
    assert measurement.cpu_mj == package_file.read.return_value
    assert measurement.dram_mj == dram_file.read.return_value


def test_rapl_cpu_class_exceptions(mocker, mock_os_listdir_cpu):
    """Test `RAPLCPU` when initializing the subpackage `RAPLFile` fails.

    The patched `RAPLFile` raises `ZeusRAPLFileInitError` for any path
    containing "0:0". `RAPLCPU` is expected to emit a warning rather than
    raise, and to leave its `dram` attribute as None.
    """
    mock_rapl_file_package = create_rapl_file_mock()

    def rapl_file_side_effect(path):
        if "0:0" in path:
            raise ZeusRAPLFileInitError("Initilization Error")
        return mock_rapl_file_package

    mocker.patch("zeus.device.cpu.rapl.RAPLFile", side_effect=rapl_file_side_effect)
    with warnings.catch_warnings(record=True) as w:
        # Make sure the warning is recorded no matter which filters are
        # active or whether it was already triggered earlier in the process.
        warnings.simplefilter("always")
        cpu = RAPLCPU(cpu_index=0)

    # Fail with a clear message instead of an IndexError on `w[-1]`.
    assert w, "Expected RAPLCPU to emit a warning, but none was recorded."
    assert "Failed to initialize subpackage" in str(w[-1].message)

    assert cpu.path == os.path.join(RAPL_DIR, "intel-rapl:0")
    assert cpu.rapl_file == mock_rapl_file_package
    assert cpu.dram is None


# RAPLCPUs tests
def test_rapl_cpus_class(mocker):
    """Test `RAPLCPUs` initialization when RAPL is reported as available.

    `glob` is patched to return two package directories, so two `RAPLCPU`
    objects are expected to be constructed.
    """
    package_dirs = [f"{RAPL_DIR}/intel-rapl:0", f"{RAPL_DIR}/intel-rapl:1"]

    mocker.patch("zeus.device.cpu.rapl.rapl_is_available", return_value=True)
    mocker.patch("zeus.device.cpu.rapl.glob", return_value=package_dirs)

    # Every constructor call hands back the same spec'd mock instance.
    cpu_constructor = mocker.patch("zeus.device.cpu.rapl.RAPLCPU")
    cpu_instance = MagicMock(spec=RAPLCPU)
    cpu_constructor.side_effect = [cpu_instance] * 2

    rapl_cpus = RAPLCPUs()

    assert len(rapl_cpus.cpus) == 2
    assert all(isinstance(cpu, MagicMock) for cpu in rapl_cpus.cpus)
    assert cpu_constructor.call_count == 2


def test_rapl_cpus_class_init_error(mocker):
    """Test that `RAPLCPUs` raises `ZeusRAPLNotSupportedError` when RAPL is unavailable."""
    mocker.patch("zeus.device.cpu.rapl.rapl_is_available", return_value=False)

    expected_message = "RAPL is not supported on this CPU."
    with pytest.raises(ZeusRAPLNotSupportedError, match=expected_message):
        RAPLCPUs()
39 changes: 35 additions & 4 deletions tests/test_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@
import pytest

from zeus.monitor import Measurement, ZeusMonitor
from zeus.utils.testing import ReplayZeusMonitor
from zeus.utils.testing import ReplayZeusMonitor, MOCKCPUs, NUM_CPUS
import zeus.device.gpu
import zeus.device.cpu

if TYPE_CHECKING:
from pathlib import Path
Expand All @@ -40,9 +41,10 @@


@pytest.fixture(autouse=True, scope="function")
def reset_gpus() -> None:
"""Reset the global variable `_gpus` to None on every test."""
def reset_gpus_and_cpus() -> None:
"""Reset the global variable `_gpus` and `_cpus` to None on every test."""
zeus.device.gpu._gpus = None
zeus.device.cpu._cpus = None


@pytest.fixture
Expand Down Expand Up @@ -209,11 +211,14 @@ def get_energy(self, start: float, end: float) -> dict[int, float]:

# `zeus.device.gpu.nvml_is_available` is a function; patch it so that it always returns True during testing.
mocker.patch("zeus.device.gpu.nvml_is_available", return_value=True)
mocker.patch("zeus.device.cpu._cpus", new=MOCKCPUs())

########################################
# Test ZeusMonitor initialization.
########################################
monitor = ZeusMonitor(gpu_indices=gpu_indices, log_file=log_file)
monitor = ZeusMonitor(
gpu_indices=gpu_indices, cpu_indices=list(range(NUM_CPUS)), log_file=log_file
)

# Check GPU index parsing from the log file.
replay_monitor = ReplayZeusMonitor(gpu_indices=None, log_file=log_file)
Expand All @@ -227,6 +232,10 @@ def tick():
next(time_counter)
for counter in energy_counters.values():
next(counter)
for i in range(len(monitor.cpu_indices)):
next(monitor.cpus._cpus[i].cpu_energy)
if i % 2 == 0:
next(monitor.cpus._cpus[i].dram_energy)
jaywonchung marked this conversation as resolved.
Show resolved Hide resolved

def assert_window_begin(name: str, begin_time: int):
"""Assert monitor measurement states right after a window begins."""
Expand All @@ -237,6 +246,16 @@ def assert_window_begin(name: str, begin_time: int):
for i in torch_gpu_indices
if not is_old_torch[i]
}
assert monitor.measurement_states[name].cpu_energy == {
i: pytest.approx((1000 + 10 * (begin_time - 4)) / 1000.0)
for i in range(len(monitor.cpu_indices))
}
assert monitor.measurement_states[name].dram_energy == {
i: pytest.approx((200 + 5 * (begin_time - 4)) / 1000.0)
if i % 2 == 0
else None
for i in range(0, len(monitor.cpu_indices), 2)
}
pynvml_mock.nvmlDeviceGetTotalEnergyConsumption.assert_has_calls(
[call(f"handle{i}") for i in nvml_gpu_indices if not is_old_nvml[i]]
)
Expand Down Expand Up @@ -269,6 +288,18 @@ def assert_measurement(
elapsed_time * 3 / 1000.0
)

if measurement.cpu_energy is not None:
for i in measurement.cpu_energy.keys():
assert measurement.cpu_energy[i] == pytest.approx(
elapsed_time * 10 / 1000.0
)

if measurement.dram_energy is not None:
for i in measurement.dram_energy.keys():
assert measurement.dram_energy[i] == pytest.approx(
elapsed_time * 5 / 1000.0
)

if not assert_calls:
return

Expand Down
1 change: 1 addition & 0 deletions zeus/device/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class ZeusBaseCPUError(ZeusBaseError):

def __init__(self, message: str) -> None:
    """Initialize the exception.

    Args:
        message: Error message, forwarded to the parent class initializer.
    """
    super().__init__(message)


class ZeusdError(ZeusBaseGPUError):
Expand Down
48 changes: 48 additions & 0 deletions zeus/utils/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@
from __future__ import annotations

from pathlib import Path
from typing import Sequence
import itertools

from zeus.monitor import Measurement, ZeusMonitor
from zeus.utils.framework import sync_execution as sync_execution_fn
from zeus.device.cpu.common import CpuDramMeasurement, CPUs, CPU
from zeus.utils.logging import get_logger


Expand Down Expand Up @@ -159,3 +162,48 @@ def end_window(
self.logger.info("Measurement window '%s' ended (%s).", key, measurement)

return measurement


NUM_CPUS = 2


class MOCKCPU(CPU):
    """A single fake CPU whose energy readings come from deterministic counters."""

    def __init__(self, index):
        """Set up the energy counters for the mock CPU at `index`.

        The CPU counter starts at 1000 and advances by 10 per reading. Only
        even-indexed CPUs get a DRAM counter (starting at 200, advancing by
        5); for odd indices `dram_energy` is None.
        """
        self.index = index
        self.cpu_energy = itertools.count(start=1000, step=10)
        if self.index % 2 == 0:
            self.dram_energy = itertools.count(start=200, step=5)
        else:
            self.dram_energy = None

    def getTotalEnergyConsumption(self):
        """Advance the counters and return the next readings as a `CpuDramMeasurement`.

        `dram_mj` is None when this mock CPU has no DRAM counter.
        """
        # Consume the CPU counter first, then the DRAM counter (if any).
        cpu_reading = float(next(self.cpu_energy))
        if self.dram_energy is None:
            dram_reading = None
        else:
            dram_reading = float(next(self.dram_energy))
        return CpuDramMeasurement(cpu_mj=cpu_reading, dram_mj=dram_reading)

    def supportsGetDramEnergyConsumption(self):
        """Return True if this mock CPU has a DRAM energy counter."""
        has_dram_counter = self.dram_energy is not None
        return has_dram_counter


class MOCKCPUs(CPUs):
    """Fake CPU manager holding `NUM_CPUS` `MOCKCPU` objects for testing."""

    def __init__(self):
        """Create one `MOCKCPU` per index in `range(NUM_CPUS)`."""
        self._cpus = list(map(MOCKCPU, range(NUM_CPUS)))

    @property
    def cpus(self) -> Sequence[CPU]:
        """Return the list of mock CPU objects held by this manager."""
        return self._cpus

    def __del__(self) -> None:
        """Do nothing on deletion; the mock manager performs no cleanup."""
Loading