Skip to content

Commit

Permalink
managing nvml exception
Browse files (browse the repository at this point in the history)
fix lint issue
  • Loading branch information
lromor committed Aug 25, 2022
1 parent a916ee5 commit 5d9e578
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion ts/metrics/system_metrics.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,7 +4,10 @@
import logging
import types
from builtins import str

import psutil
import pynvml

from ts.metrics.dimension import Dimension
from ts.metrics.metric import Metric

Expand Down Expand Up @@ -60,6 +63,7 @@ def gpu_utilization(num_of_gpu):
# pylint: disable=import-outside-toplevel
import nvgpu
from nvgpu import list_gpus

# pylint: enable=wrong-import-position
# pylint: enable=import-outside-toplevel

Expand All @@ -69,7 +73,12 @@ def gpu_utilization(num_of_gpu):
system_metrics.append(Metric('GPUMemoryUtilization', value['mem_used_percent'], 'percent', dimension_gpu))
system_metrics.append(Metric('GPUMemoryUsed', value['mem_used'], 'MB', dimension_gpu))

statuses = list_gpus.device_statuses()
try:
statuses = list_gpus.device_statuses()
except pynvml.nvml.NVMLError_NotSupported:
logging.warning('gpu device monitoring not supported')
statuses = []

for idx, status in enumerate(statuses):
dimension_gpu = [Dimension('Level', 'Host'), Dimension("device_id", idx)]
system_metrics.append(Metric('GPUUtilization', status['utilization'], 'percent', dimension_gpu))
Expand Down

0 comments on commit 5d9e578

Please sign in to comment.