diff --git a/tests/test_monitor.py b/tests/test_monitor.py index 9b3d7f81..b9a7dbd8 100644 --- a/tests/test_monitor.py +++ b/tests/test_monitor.py @@ -440,3 +440,15 @@ def assert_log_file_row(row: str, name: str, begin_time: int, elapsed_time: int) assert_measurement( "window5", measurement, begin_time=25, elapsed_time=8, assert_calls=False ) + + # Calling `end_window` when the energy consumption of one or more GPUs was measured as zero should raise a warning. + pynvml_mock.nvmlDeviceGetTotalEnergyConsumption.side_effect = lambda handle: 0.0 + + monitor.begin_window("window0", sync_cuda=False) + + with pytest.warns( + match="The energy consumption of one or more GPUs was measured as zero. This means that the time duration of the measurement window was shorter than the GPU's energy counter update period. Consider turning on the `approx_instant_energy` option in `ZeusMonitor`, which approximates the energy consumption of a short time window as instant power draw x window duration.", + ): + test_measurement = monitor.end_window("window0", sync_cuda=False) + + assert all(value == 0.0 for value in test_measurement.gpu_energy.values()) diff --git a/zeus/monitor/energy.py b/zeus/monitor/energy.py index 18905296..113281d5 100644 --- a/zeus/monitor/energy.py +++ b/zeus/monitor/energy.py @@ -17,6 +17,7 @@ from __future__ import annotations import os +import warnings from time import time from pathlib import Path from dataclasses import dataclass @@ -143,7 +144,6 @@ def training(): result = monitor.end_window("entire_training") # Print the measurement result. - print(f"Training took {result.time} seconds.") print(f"Training consumed {result.total_energy} Joules.") for gpu_idx, gpu_energy in result.gpu_energy.items(): print(f"GPU {gpu_idx} consumed {gpu_energy} Joules.") @@ -395,6 +395,15 @@ def end_window( time_consumption - power_measurement_time ) + # Trigger a warning if energy consumption is zero and approx_instant_energy is not enabled. 
+ if not self.approx_instant_energy and any( + energy == 0.0 for energy in gpu_energy_consumption.values() + ): + warnings.warn( + "The energy consumption of one or more GPUs was measured as zero. This means that the time duration of the measurement window was shorter than the GPU's energy counter update period. Consider turning on the `approx_instant_energy` option in `ZeusMonitor`, which approximates the energy consumption of a short time window as instant power draw x window duration.", + stacklevel=2, + ) + logger.debug("Measurement window '%s' ended.", key) # Add to log file.