Skip to content

Commit

Permalink
Feature: ZeusMonitor warns when energy is measured as zero (#93)
Browse files Browse the repository at this point in the history
Co-authored-by: Jae-Won Chung <jwnchung@umich.edu>
  • Loading branch information
sharonsyh and jaywonchung authored Jul 11, 2024
1 parent 59bcfc6 commit 869a675
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 1 deletion.
12 changes: 12 additions & 0 deletions tests/test_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,3 +440,15 @@ def assert_log_file_row(row: str, name: str, begin_time: int, elapsed_time: int)
assert_measurement(
"window5", measurement, begin_time=25, elapsed_time=8, assert_calls=False
)

# Calling `end_window` when the energy consumption of one or more GPUs was measured as zero should raise a warning.
pynvml_mock.nvmlDeviceGetTotalEnergyConsumption.side_effect = lambda handle: 0.0

monitor.begin_window("window0", sync_cuda=False)

with pytest.warns(
match="The energy consumption of one or more GPUs was measured as zero. This means that the time duration of the measurement window was shorter than the GPU's energy counter update period. Consider turning on the `approx_instant_energy` option in `ZeusMonitor`, which approximates the energy consumption of a short time window as instant power draw x window duration.",
):
test_measurement = monitor.end_window("window0", sync_cuda=False)

assert all(value == 0.0 for value in test_measurement.gpu_energy.values())
11 changes: 10 additions & 1 deletion zeus/monitor/energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from __future__ import annotations

import os
import warnings
from time import time
from pathlib import Path
from dataclasses import dataclass
Expand Down Expand Up @@ -143,7 +144,6 @@ def training():
result = monitor.end_window("entire_training")
# Print the measurement result.
print(f"Training took {result.time} seconds.")
print(f"Training consumed {result.total_energy} Joules.")
for gpu_idx, gpu_energy in result.gpu_energy.items():
print(f"GPU {gpu_idx} consumed {gpu_energy} Joules.")
Expand Down Expand Up @@ -395,6 +395,15 @@ def end_window(
time_consumption - power_measurement_time
)

# Warn when any GPU's energy was measured as exactly zero and `approx_instant_energy` is disabled; the window was likely shorter than the GPU's energy counter update period.
if not self.approx_instant_energy and any(
energy == 0.0 for energy in gpu_energy_consumption.values()
):
warnings.warn(
"The energy consumption of one or more GPUs was measured as zero. This means that the time duration of the measurement window was shorter than the GPU's energy counter update period. Consider turning on the `approx_instant_energy` option in `ZeusMonitor`, which approximates the energy consumption of a short time window as instant power draw x window duration.",
stacklevel=1,
)

logger.debug("Measurement window '%s' ended.", key)

# Add to log file.
Expand Down

0 comments on commit 869a675

Please sign in to comment.