From b19fb14a3267ca86f82a0bb5851f060ab27c503a Mon Sep 17 00:00:00 2001 From: Wonbin Jin <116508975+wbjin@users.noreply.github.com> Date: Fri, 16 Aug 2024 00:23:50 +0900 Subject: [PATCH] Tests for RAPL and `ZeusMonitor` (#100) Co-authored-by: Jae-Won Chung --- pyproject.toml | 1 + tests/device/cpu/test_rapl.py | 179 +++++++++++++++++++++++++++++++--- tests/test_monitor.py | 81 ++++++++++++++- zeus/device/exception.py | 1 + 4 files changed, 246 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c8ee976d..c0d1e684 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,6 +94,7 @@ pydocstyle.convention = "google" "zeus/optimizer/batch_size/common.py" = ["N805"] "zeus/device/gpu/*.py" = ["N802", "N803"] "zeus/device/cpu/*.py" = ["N802"] +"zeus/utils/testing.py" = ["N802"] [tool.pytest.ini_options] addopts = "--numprocesses auto" diff --git a/tests/device/cpu/test_rapl.py b/tests/device/cpu/test_rapl.py index 3d2e03b8..d89fe9a1 100644 --- a/tests/device/cpu/test_rapl.py +++ b/tests/device/cpu/test_rapl.py @@ -4,14 +4,12 @@ import os import pytest from typing import Generator, TYPE_CHECKING, Sequence -from time import sleep from unittest.mock import patch, mock_open, create_autospec, MagicMock -import unittest.mock as mock -from sys import stdout, stderr import warnings import multiprocessing as mp + if TYPE_CHECKING: from pathlib import Path @@ -56,24 +54,32 @@ def __exit__(self, exc_type, exc_value, traceback): @pytest.fixture def mock_rapl_values(): rapl_values = [ - "1000", - "900", - "800", - "700", - "600", - "500", - "400", - "500", - "200", - "100", + "100000", + "90000", + "80000", + "70000", + "60000", + "50000", + "40000", + "50000", + "20000", + "10000", ] mocked_rapl_file = MockRaplFile(RAPL_DIR + "/intel-rapl:0/energy_uj", rapl_values) + mocked_rapl_file_name = mock_open() + mocked_rapl_file_name.return_value.read.return_value = "package" + mocked_rapl_file_max = mock_open() + mocked_rapl_file_max.return_value.read.return_value = "100000" real_open = builtins.open def mock_file_open(filepath, *args, **kwargs): if filepath == (RAPL_DIR + "/intel-rapl:0/energy_uj"): return mocked_rapl_file + if filepath == (RAPL_DIR + "/intel-rapl:0/name"): + return mocked_rapl_file_name() + if filepath == (RAPL_DIR + "/intel-rapl:0/max_energy_range_uj"): + return mocked_rapl_file_max() else: return real_open(filepath, *args, **kwargs) @@ -92,8 +98,155 @@ def mock_file_open(filepath, *args, **kwargs): patch_sleep.stop() +@pytest.fixture() +def mock_rapl_wraparound_tracker(): + patch_tracker = patch("zeus.device.cpu.rapl.RaplWraparoundTracker") + MockRaplWraparoundTracker = patch_tracker.start() + + mock_tracker = MockRaplWraparoundTracker.return_value + mock_tracker.get_num_wraparounds.side_effect = [0, 5] + + yield mock_tracker + + patch_tracker.stop() + + def test_rapl_polling_process(mock_rapl_values): wraparound_counter = mp.Value("i", 0) with pytest.raises(MockRaplFileOutOfValues) as exception: _polling_process(RAPL_DIR + "/intel-rapl:0/energy_uj", 1000, wraparound_counter) assert wraparound_counter.value == 8 + + +# RAPLFile tests +@pytest.fixture +@patch("os.path.exists", return_value=False) +def test_rapl_available(mock_exists): + assert rapl_is_available() == False + + +def test_rapl_file_class(mock_rapl_values, mock_rapl_wraparound_tracker): + """Test the `RAPLFile` class.""" + # Test initialization + raplFile = RAPLFile("/sys/class/powercap/intel-rapl/intel-rapl:0") + assert raplFile.name == "package" + assert raplFile.last_energy == 100000.0 + assert raplFile.max_energy_range_uj == 100000.0 + + # Test read method where get_num_wraparounds is 0 + assert raplFile.read() == 90.0 + + # Test read method where get_num_wraparounds is 5 + assert raplFile.read() == 580.0 # (80000+5*100000)/1000 + + +def test_rapl_file_class_exceptions(): + """Test `RAPLFile` Init errors""" + with patch("builtins.open", mock_open()) as mock_file: + # Fails to open name file + mock_file.side_effect = FileNotFoundError + with pytest.raises(ZeusRAPLFileInitError): + RAPLFile("/sys/class/powercap/intel-rapl/intel-rapl:0") + + # Fails to read energy_uj file + mock_file.side_effect = [ + mock_open(read_data="package").return_value, + FileNotFoundError, + ] + with pytest.raises(ZeusRAPLFileInitError): + RAPLFile("/sys/class/powercap/intel-rapl/intel-rapl:0") + + # Fails to read max_energy_uj file + mock_file.side_effect = [ + mock_open(read_data="package").return_value, + mock_open(read_data="1000000").return_value, + FileNotFoundError, + ] + with pytest.raises(ZeusRAPLFileInitError): + RAPLFile("/sys/class/powercap/intel-rapl/intel-rapl:0") + + +# RAPLCPU tests +@pytest.fixture() +def mock_os_listdir_cpu(mocker): + return mocker.patch("os.listdir", return_value=["intel-rapl:0", "intel-rapl:0:0"]) + + +def create_rapl_file_mock(name="package", read_value=1000.0): + """Create a mock `RAPLFile` class""" + mock_rapl_file = create_autospec(RAPLFile, instance=True) + mock_rapl_file.name = name + mock_rapl_file.read.return_value = read_value + return mock_rapl_file + + +def test_rapl_cpu_class(mocker, mock_os_listdir_cpu): + """Test `RAPLCPU` with `DRAM`""" + mock_rapl_file_package = create_rapl_file_mock() + mock_rapl_file_dram = create_rapl_file_mock(name="dram", read_value=500.0) + + def rapl_file_side_effect(path): + if "0:0" in path: + return mock_rapl_file_dram + return mock_rapl_file_package + + mocker.patch("zeus.device.cpu.rapl.RAPLFile", side_effect=rapl_file_side_effect) + cpu = RAPLCPU(cpu_index=0) + measurement = cpu.getTotalEnergyConsumption() + + assert cpu.path == os.path.join(RAPL_DIR, "intel-rapl:0") + assert cpu.rapl_file == mock_rapl_file_package + assert cpu.dram == mock_rapl_file_dram + assert measurement.cpu_mj == mock_rapl_file_package.read.return_value + assert measurement.dram_mj == mock_rapl_file_dram.read.return_value + + +def test_rapl_cpu_class_exceptions(mocker, mock_os_listdir_cpu): + """Test `RAPLCPU` subpackage init error""" + mock_rapl_file_package = create_rapl_file_mock() + mock_rapl_file_dram = create_rapl_file_mock(name="dram", read_value=500.0) + + def rapl_file_side_effect(path): + if "0:0" in path: + raise ZeusRAPLFileInitError("Initilization Error") + return mock_rapl_file_package + + mocker.patch("zeus.device.cpu.rapl.RAPLFile", side_effect=rapl_file_side_effect) + with warnings.catch_warnings(record=True) as w: + cpu = RAPLCPU(cpu_index=0) + assert "Failed to initialize subpackage" in str(w[-1].message) + + assert cpu.path == os.path.join(RAPL_DIR, "intel-rapl:0") + assert cpu.rapl_file == mock_rapl_file_package + assert cpu.dram is None + + +# RAPLCPUs tests +def test_rapl_cpus_class(mocker): + """Test initialization when RAPL is available.""" + mocker.patch("zeus.device.cpu.rapl.rapl_is_available", return_value=True) + mocker.patch( + "zeus.device.cpu.rapl.glob", + return_value=[f"{RAPL_DIR}/intel-rapl:0", f"{RAPL_DIR}/intel-rapl:1"], + ) + mock_rapl_cpu_constructor = mocker.patch("zeus.device.cpu.rapl.RAPLCPU") + mock_rapl_cpu_instance = MagicMock(spec=RAPLCPU) + mock_rapl_cpu_constructor.side_effect = [ + mock_rapl_cpu_instance, + mock_rapl_cpu_instance, + ] + rapl_cpus = RAPLCPUs() + + assert len(rapl_cpus.cpus) == 2 + assert all(isinstance(cpu, MagicMock) for cpu in rapl_cpus.cpus) + assert mock_rapl_cpu_constructor.call_count == 2 + + +def test_rapl_cpus_class_init_error(mocker): + """Test initialization when RAPL is not available.""" + mocker.patch("zeus.device.cpu.rapl.rapl_is_available", return_value=False) + + with pytest.raises( + ZeusRAPLNotSupportedError, match="RAPL is not supported on this CPU." + ): + RAPLCPUs() diff --git a/tests/test_monitor.py b/tests/test_monitor.py index e5607255..4daf7027 100644 --- a/tests/test_monitor.py +++ b/tests/test_monitor.py @@ -25,6 +25,8 @@ from zeus.monitor import Measurement, ZeusMonitor from zeus.utils.testing import ReplayZeusMonitor import zeus.device.gpu +import zeus.device.cpu +from zeus.device.cpu.common import CpuDramMeasurement, CPUs, CPU if TYPE_CHECKING: from pathlib import Path @@ -37,12 +39,56 @@ pynvml.NVML_DEVICE_ARCH_VOLTA, pynvml.NVML_DEVICE_ARCH_AMPERE, ] +NUM_CPUS = 2 + + +class MockCPU(CPU): + """Control a single MOCK CPU for testing.""" + + def __init__(self, index): + """Initialize the MOCKCPU with a specified index for testing.""" + self.index = index + self.cpu_energy = itertools.count(start=1000, step=10) + self.dram_energy = ( + itertools.count(start=200, step=5) if self.index % 2 == 0 else None + ) + + def getTotalEnergyConsumption(self): + """Returns the total energy consumption of the specified powerzone. Units: mJ.""" + return CpuDramMeasurement( + cpu_mj=float(next(self.cpu_energy)), + dram_mj=float(next(self.dram_energy)) + if self.dram_energy is not None + else None, + ) + + def supportsGetDramEnergyConsumption(self): + """Returns True if the specified CPU powerzone supports retrieving the subpackage energy consumption.""" + return self.dram_energy is not None + + +class MockCPUs(CPUs): + """MOCK CPU Manager object, containing individual MOCKCPU objects for testing.""" + + def __init__(self): + """Instantiates MOCKCPUs object for testing.""" + self._cpus = [MockCPU(i) for i in range(NUM_CPUS)] + + @property + def cpus(self) -> Sequence[CPU]: + """Returns a list of CPU objects being tracked.""" + return self._cpus + + def __del__(self) -> None: + """Shuts down the Mock CPU monitoring.""" + return @pytest.fixture(autouse=True, scope="function") -def reset_gpus() -> None: - """Reset the global variable `_gpus` to None on every test.""" +def reset_gpus_and_cpus() -> None: + """Reset the global variable `_gpus` and `_cpus` to None on every test.""" zeus.device.gpu._gpus = None + zeus.device.cpu._cpus = None @pytest.fixture @@ -209,11 +255,14 @@ def get_energy(self, start: float, end: float) -> dict[int, float]: # want to make zeus.device.gpu.nvml_is_available is a function, want it to always return true when testing mocker.patch("zeus.device.gpu.nvml_is_available", return_value=True) + mocker.patch("zeus.device.cpu._cpus", new=MockCPUs()) ######################################## # Test ZeusMonitor initialization. ######################################## - monitor = ZeusMonitor(gpu_indices=gpu_indices, log_file=log_file) + monitor = ZeusMonitor( + gpu_indices=gpu_indices, cpu_indices=list(range(NUM_CPUS)), log_file=log_file + ) # Check GPU index parsing from the log file. replay_monitor = ReplayZeusMonitor(gpu_indices=None, log_file=log_file) @@ -227,6 +276,10 @@ def tick(): next(time_counter) for counter in energy_counters.values(): next(counter) + for i in range(len(monitor.cpu_indices)): + next(monitor.cpus._cpus[i].cpu_energy) + if i % 2 == 0: + next(monitor.cpus._cpus[i].dram_energy) def assert_window_begin(name: str, begin_time: int): """Assert monitor measurement states right after a window begins.""" @@ -237,6 +290,16 @@ def assert_window_begin(name: str, begin_time: int): for i in torch_gpu_indices if not is_old_torch[i] } + assert monitor.measurement_states[name].cpu_energy == { + i: pytest.approx((1000 + 10 * (begin_time - 4)) / 1000.0) + for i in range(len(monitor.cpu_indices)) + } + assert monitor.measurement_states[name].dram_energy == { + i: pytest.approx((200 + 5 * (begin_time - 4)) / 1000.0) + if i % 2 == 0 + else None + for i in range(0, len(monitor.cpu_indices), 2) + } pynvml_mock.nvmlDeviceGetTotalEnergyConsumption.assert_has_calls( [call(f"handle{i}") for i in nvml_gpu_indices if not is_old_nvml[i]] ) @@ -269,6 +332,18 @@ def assert_measurement( elapsed_time * 3 / 1000.0 ) + if measurement.cpu_energy is not None: + for i in measurement.cpu_energy.keys(): + assert measurement.cpu_energy[i] == pytest.approx( + elapsed_time * 10 / 1000.0 + ) + + if measurement.dram_energy is not None: + for i in measurement.dram_energy.keys(): + assert measurement.dram_energy[i] == pytest.approx( + elapsed_time * 5 / 1000.0 + ) + if not assert_calls: return diff --git a/zeus/device/exception.py b/zeus/device/exception.py index dda19ab5..3bbd504d 100644 --- a/zeus/device/exception.py +++ b/zeus/device/exception.py @@ -16,6 +16,7 @@ class ZeusBaseCPUError(ZeusBaseError): def __init__(self, message: str) -> None: """Initialize Base Zeus Exception.""" + super().__init__(message) class ZeusdError(ZeusBaseGPUError):