Skip to content

Commit

Permalink
managing nvml exception (#1809)
Browse files Browse the repository at this point in the history
* managing nvml exception
  • Loading branch information
lromor authored Aug 26, 2022
1 parent b8c23e6 commit 696442b
Showing 1 changed file with 50 additions and 22 deletions.
72 changes: 50 additions & 22 deletions ts/metrics/system_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,75 +4,100 @@
import logging
import types
from builtins import str

import psutil
import pynvml

from ts.metrics.dimension import Dimension
from ts.metrics.metric import Metric

# Accumulator that every collector function in this module appends its
# Metric objects to.
system_metrics = []
# Dimension list shared by the host-level (CPU, memory, disk) metrics.
dimension = [Dimension("Level", "Host")]


def cpu_utilization():
    """Record the current CPU utilization.

    Reads ``psutil.cpu_percent()`` and appends exactly one
    ``CPUUtilization`` metric (unit ``percent``) carrying the shared
    host-level ``dimension`` to the module-level ``system_metrics`` list.

    :return: None
    """
    data = psutil.cpu_percent()
    system_metrics.append(Metric("CPUUtilization", data, "percent", dimension))


def memory_used():
    """Record the amount of virtual memory in use.

    Takes the ``used`` field of ``psutil.virtual_memory()``, scales it by
    1024 * 1024 and appends exactly one ``MemoryUsed`` metric (unit ``MB``)
    to the module-level ``system_metrics`` list.

    :return: None
    """
    data = psutil.virtual_memory().used / (1024 * 1024)  # in MB
    system_metrics.append(Metric("MemoryUsed", data, "MB", dimension))


def memory_available():
    """Record the amount of virtual memory still available.

    Takes the ``available`` field of ``psutil.virtual_memory()``, scales it
    by 1024 * 1024 and appends exactly one ``MemoryAvailable`` metric
    (unit ``MB``) to the module-level ``system_metrics`` list.

    :return: None
    """
    data = psutil.virtual_memory().available / (1024 * 1024)  # in MB
    system_metrics.append(Metric("MemoryAvailable", data, "MB", dimension))


def memory_utilization():
    """Record virtual memory utilization.

    Takes the ``percent`` field of ``psutil.virtual_memory()`` and appends
    exactly one ``MemoryUtilization`` metric (unit ``percent``) to the
    module-level ``system_metrics`` list.

    :return: None
    """
    data = psutil.virtual_memory().percent
    system_metrics.append(Metric("MemoryUtilization", data, "percent", dimension))


def disk_used():
    """Record the space used on the root filesystem.

    Takes the ``used`` field of ``psutil.disk_usage("/")``, scales it by
    1024 ** 3 and appends exactly one ``DiskUsage`` metric (unit ``GB``) to
    the module-level ``system_metrics`` list.

    :return: None
    """
    data = psutil.disk_usage("/").used / (1024 * 1024 * 1024)  # in GB
    system_metrics.append(Metric("DiskUsage", data, "GB", dimension))


def disk_utilization():
    """Record utilization of the root filesystem.

    Takes the ``percent`` field of ``psutil.disk_usage("/")`` and appends
    exactly one ``DiskUtilization`` metric (unit ``percent``) to the
    module-level ``system_metrics`` list.

    :return: None
    """
    data = psutil.disk_usage("/").percent
    system_metrics.append(Metric("DiskUtilization", data, "percent", dimension))


def disk_available():
    """Record the free space on the root filesystem.

    Takes the ``free`` field of ``psutil.disk_usage("/")``, scales it by
    1024 ** 3 and appends exactly one ``DiskAvailable`` metric (unit ``GB``)
    to the module-level ``system_metrics`` list.

    :return: None
    """
    data = psutil.disk_usage("/").free / (1024 * 1024 * 1024)  # in GB
    system_metrics.append(Metric("DiskAvailable", data, "GB", dimension))


def gpu_utilization(num_of_gpu):
    """
    Collect gpu metrics.

    For every entry returned by ``nvgpu.gpu_info()`` one
    ``GPUMemoryUtilization`` (percent) and one ``GPUMemoryUsed`` (MB) metric
    is appended to ``system_metrics``. Afterwards one ``GPUUtilization``
    (percent) metric is appended per status returned by
    ``list_gpus.device_statuses()``. If that call raises
    ``pynvml.nvml.NVMLError_NotSupported`` a warning is logged and no
    ``GPUUtilization`` metrics are emitted.

    :param num_of_gpu: number of GPUs; nothing is collected when it is <= 0
    :return: None
    """
    if num_of_gpu <= 0:
        return

    # Imported after the early return so that nvgpu is only loaded when GPU
    # metrics are actually requested.
    # pylint: disable=wrong-import-position
    # pylint: disable=import-outside-toplevel
    import nvgpu
    from nvgpu import list_gpus

    # pylint: enable=wrong-import-position
    # pylint: enable=import-outside-toplevel

    info = nvgpu.gpu_info()
    for value in info:
        dimension_gpu = [
            Dimension("Level", "Host"),
            Dimension("device_id", value["index"]),
        ]
        system_metrics.append(
            Metric(
                "GPUMemoryUtilization",
                value["mem_used_percent"],
                "percent",
                dimension_gpu,
            )
        )
        system_metrics.append(
            Metric("GPUMemoryUsed", value["mem_used"], "MB", dimension_gpu)
        )

    # Query device statuses exactly once, inside the guard: an unguarded
    # second call would re-raise the very error handled here.
    try:
        statuses = list_gpus.device_statuses()
    except pynvml.nvml.NVMLError_NotSupported:
        logging.warning("gpu device monitoring not supported")
        statuses = []

    for idx, status in enumerate(statuses):
        dimension_gpu = [Dimension("Level", "Host"), Dimension("device_id", idx)]
        system_metrics.append(
            Metric("GPUUtilization", status["utilization"], "percent", dimension_gpu)
        )


def collect_all(mod, num_of_gpu):
"""
Expand All @@ -86,7 +111,10 @@ def collect_all(mod, num_of_gpu):
members = dir(mod)
for i in members:
value = getattr(mod, i)
if isinstance(value, types.FunctionType) and value.__name__ not in ('collect_all', 'log_msg'):
if isinstance(value, types.FunctionType) and value.__name__ not in (
"collect_all",
"log_msg",
):
if value.__name__ == "gpu_utilization":
value(num_of_gpu)
else:
Expand Down

0 comments on commit 696442b

Please sign in to comment.