Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

min/max flagging added to system_metrics_monitor with only non-redundant, necessary gpu metrics logged #3373

Merged
merged 26 commits into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
1eb8e4f
implemented min_max flag
JackZ-db Jun 6, 2024
659c905
fixed string parsing
JackZ-db Jun 6, 2024
1f3b289
refactoring compute_system_metrics for all_reduce
JackZ-db Jun 6, 2024
d9bfca3
keep track of rank within dict
JackZ-db Jun 6, 2024
a2bd879
added compute_min_max
JackZ-db Jun 6, 2024
5553e0a
added flag for both min_max and all_logging
JackZ-db Jun 6, 2024
36623bc
corrected min_max call with model_device
JackZ-db Jun 6, 2024
0de15b4
removing total bytes (always going ot be constant)
JackZ-db Jun 6, 2024
e8bf93c
handled no gpu case in min_max flag
JackZ-db Jun 6, 2024
b3b859c
removed unnecessary imports, patched unit tests
JackZ-db Jun 6, 2024
d652633
fixed assert statement for with gpu case, world size 1
JackZ-db Jun 6, 2024
d795550
case min_rank and max_rank as int to guarantee them working as indices
JackZ-db Jun 6, 2024
8958c40
fixed indent issue from fixing font
JackZ-db Jun 6, 2024
5cd20c0
Merge branch 'dev' into jz/metrics_monitor
mvpatel2000 Jun 6, 2024
8f7d273
Merge branch 'mosaicml:dev' into jz/metrics_monitor
JackZ-db Jun 6, 2024
15cdb09
made docs more concise and readable
JackZ-db Jun 6, 2024
3746bab
fixing unexpected unindent
JackZ-db Jun 6, 2024
3988f41
fixing unit test device
JackZ-db Jun 7, 2024
988195d
modifying device to equal model_device.type
JackZ-db Jun 7, 2024
80df26c
reverting to device=model_device
JackZ-db Jun 7, 2024
9e41dc8
setting device in unit test = 'gpu'
JackZ-db Jun 7, 2024
1d0ad04
setting device = 'cuda' in unit testing
JackZ-db Jun 7, 2024
91f1ac4
reverting to next(state.model.parameters()).device
JackZ-db Jun 7, 2024
56db297
removed torch as a dependecy for unit_testing
JackZ-db Jun 7, 2024
0477b4a
cleaned up UI to be consistent + removed calling next to obtain device
JackZ-db Jun 7, 2024
bfd0a12
Merge branch 'dev' into jz/metrics_monitor
j316chuck Jun 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 79 additions & 14 deletions composer/callbacks/system_metrics_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import os

import psutil
import torch

from composer.core import Callback, Event, State
from composer.loggers import Logger
Expand All @@ -19,13 +20,52 @@

__all__ = ['SystemMetricsMonitor']

# Keys of the per-device GPU metrics written by ``compute_system_metrics``.
# ``run_event`` treats these keys specially: they are logged either once per
# rank (suffixed with ``_rank_{rank}``) or reduced to a min/max across ranks,
# while every other key is logged under its plain name.
_GPU_METRICS = [
'gpu_percentage',
'memory_percentage',
'gpu_temperature_C',
'gpu_power_usage_W',
]


class SystemMetricsMonitor(Callback):
"""Track system metrics."""
"""Logs GPU/CPU metrics.

GPU Metrics:
gpu_percentage: Occupancy rate, percent of time over sampling period during which one or more kernels was executing on the GPU.
memory_percentage: Percent of time over sampling period during which global memory was being read or written.
gpu_temperature_C: Temperature of device, in Celsius.
gpu_power_usage_W: Power usage of device, in Watts.

By default, only the maximum and minimum values for these metrics, alongside their respective ranks in the key names,
are logged on the :attr:`.Event.BATCH_START`, :attr:`.Event.EVAL_BATCH_START`, :attr:`.Event.PREDICT_BATCH_START`
events for every batch. If log_all_data is set to True, all values for these metrics across all ranks are logged on the
above events for every batch.

Example:
.. doctest::

def __init__(self, gpu_available: bool = False) -> None:
>>> from composer import Trainer
>>> from composer.callbacks import SystemMetricsMonitor
>>> # constructing trainer object with this callback
>>> trainer = Trainer(
... model=model,
... train_dataloader=train_dataloader,
... eval_dataloader=eval_dataloader,
... optimizers=optimizer,
... max_duration='1ep',
... callbacks=[SystemMetricsMonitor()],
... )

Args:
log_all_data (bool, optional): True if user wants to log data for all ranks, not just the min/max.
Defaults to False.
"""

def __init__(self, log_all_data: bool = False) -> None:
super().__init__()
self.gpu_available = gpu_available
self.gpu_available = torch.cuda.is_available()
self.log_all_data = log_all_data
if self.gpu_available:
try:
import pynvml
Expand All @@ -46,9 +86,23 @@ def run_event(self, event: Event, state: State, logger: Logger):
]:
local_node_system_metrics = self.compute_system_metrics()
all_system_metrics = dist.all_gather_object(local_node_system_metrics)
system_metrics = {
key: value for local_metrics in all_system_metrics for key, value in local_metrics.items()
}
system_metrics = {}

if self.log_all_data:
for rank, metrics in enumerate(all_system_metrics):
for key, value in metrics.items():
if key in _GPU_METRICS:
system_metrics[f'{key}_rank_{rank}'] = value
else:
system_metrics[key] = value

else:
system_metrics = self.compute_gpu_min_max_metrics(all_system_metrics, state)
for rank, metrics in enumerate(all_system_metrics):
for key, value in metrics.items():
if key not in _GPU_METRICS:
system_metrics[key] = value

logger.log_metrics(system_metrics)

def compute_system_metrics(self):
Expand All @@ -58,17 +112,14 @@ def compute_system_metrics(self):
if self.gpu_available:
import pynvml
local_rank = dist.get_local_rank()
global_rank = dist.get_global_rank()
handle = pynvml.nvmlDeviceGetHandleByIndex(local_rank)
memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
system_metrics[f'device{global_rank}_memory_total'] = memory.total
system_metrics[f'device{global_rank}_memory_free'] = memory.free
system_metrics[f'device{global_rank}_memory_used'] = memory.used
device_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
system_metrics[f'device{global_rank}_gpu_percentage'] = device_utilization.gpu
system_metrics[f'device{global_rank}_memory_percentage'] = device_utilization.memory
system_metrics['gpu_percentage'] = device_utilization.gpu
system_metrics['memory_percentage'] = device_utilization.memory
temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
system_metrics[f'device{global_rank}_gpu_temperature'] = temperature
system_metrics['gpu_temperature_C'] = temperature
power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0 # convert from mW to W
system_metrics['gpu_power_usage_W'] = power

# Get metrics for the system
cpu_percent = psutil.cpu_percent()
Expand All @@ -83,3 +134,17 @@ def compute_system_metrics(self):
for k, v in network_usage.items():
system_metrics[f'network_{k}'] = v
return system_metrics

def compute_gpu_min_max_metrics(self, all_metrics, state):
    """Reduce per-rank GPU metrics to their minimum and maximum across ranks.

    For every key in ``_GPU_METRICS`` the smallest and largest reported value
    are selected, and the index of the reporting entry is embedded in the
    logged key name.

    Args:
        all_metrics: Sequence of metric dicts, one per entry gathered from the
            ranks. The position in the sequence is used as the rank in the
            key name (presumably the global rank -- confirm against the
            gather call in ``run_event``).
        state: Unused. Kept so existing callers that pass the training state
            continue to work.

    Returns:
        dict: ``min_{key}_rank_{rank}`` and ``max_{key}_rank_{rank}`` entries
        mapped to the corresponding values. Empty when no GPU is available or
        when no entry reports a given key.
    """
    min_max_metrics = {}

    if not self.gpu_available:
        return min_max_metrics

    for key in _GPU_METRICS:
        # Reduce on the host with plain Python values: the list holds one small
        # number per rank, so a device tensor only adds a transfer plus a sync
        # on every ``.item()`` call, and float values (e.g. power in W) would be
        # rounded to float32 before being logged.
        per_rank = [(rank, metrics[key]) for rank, metrics in enumerate(all_metrics) if key in metrics]
        if not per_rank:
            # Nothing reported for this key (empty gather or ranks without GPU data).
            continue

        # ``min``/``max`` return the first extreme on ties, so the lowest rank
        # wins, matching the behavior of argmin/argmax.
        min_rank, min_value = min(per_rank, key=lambda pair: pair[1])
        max_rank, max_value = max(per_rank, key=lambda pair: pair[1])
        min_max_metrics[f'min_{key}_rank_{min_rank}'] = min_value
        min_max_metrics[f'max_{key}_rank_{max_rank}'] = max_value

    return min_max_metrics
4 changes: 2 additions & 2 deletions tests/callbacks/test_system_metrics_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
@pytest.mark.gpu
def test_system_metrics_monitor_gpu():
# Construct the trainer
system_metrics_monitor = SystemMetricsMonitor(gpu_available=True)
system_metrics_monitor = SystemMetricsMonitor()
in_memory_logger = InMemoryLogger()
trainer = Trainer(
model=SimpleModel(),
Expand All @@ -24,7 +24,7 @@ def test_system_metrics_monitor_gpu():
)
trainer.fit()

assert 'device0_gpu_percentage' in in_memory_logger.data
assert 'min_gpu_percentage_rank_0' in in_memory_logger.data
assert 'cpu_percentage' in in_memory_logger.data


Expand Down
Loading