Skip to content

Commit

Permalink
managing nvml exception
Browse files (browse the repository at this point in the history)
  • Loading branch information
lromor committed Aug 23, 2022
1 parent 8c23585 commit 5aea057
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion ts/metrics/system_metrics.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,6 +7,7 @@
import psutil
from ts.metrics.dimension import Dimension
from ts.metrics.metric import Metric
import pynvml

system_metrics = []
dimension = [Dimension('Level', 'Host')]
Expand Down Expand Up @@ -69,7 +70,12 @@ def gpu_utilization(num_of_gpu):
system_metrics.append(Metric('GPUMemoryUtilization', value['mem_used_percent'], 'percent', dimension_gpu))
system_metrics.append(Metric('GPUMemoryUsed', value['mem_used'], 'MB', dimension_gpu))

statuses = list_gpus.device_statuses()
try:
statuses = list_gpus.device_statuses()
except pynvml.nvml.NVMLError_NotSupported:
logging.warning("gpu device monitoring not supported")
statuses = []

for idx, status in enumerate(statuses):
dimension_gpu = [Dimension('Level', 'Host'), Dimension("device_id", idx)]
system_metrics.append(Metric('GPUUtilization', status['utilization'], 'percent', dimension_gpu))
Expand Down

0 comments on commit 5aea057

Please sign in to comment.