Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

managing nvml exception #1809

Merged
merged 3 commits into from
Aug 26, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 50 additions & 22 deletions ts/metrics/system_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,75 +4,100 @@
import logging
import types
from builtins import str

import psutil
import pynvml

from ts.metrics.dimension import Dimension
from ts.metrics.metric import Metric

 system_metrics = []
-dimension = [Dimension('Level', 'Host')]
+dimension = [Dimension("Level", "Host")]


 def cpu_utilization():
     data = psutil.cpu_percent()
-    system_metrics.append(Metric('CPUUtilization', data, 'percent', dimension))
+    system_metrics.append(Metric("CPUUtilization", data, "percent", dimension))


 def memory_used():
     data = psutil.virtual_memory().used / (1024 * 1024) # in MB
-    system_metrics.append(Metric('MemoryUsed', data, 'MB', dimension))
+    system_metrics.append(Metric("MemoryUsed", data, "MB", dimension))


 def memory_available():
     data = psutil.virtual_memory().available / (1024 * 1024) # in MB
-    system_metrics.append(Metric('MemoryAvailable', data, 'MB', dimension))
+    system_metrics.append(Metric("MemoryAvailable", data, "MB", dimension))


 def memory_utilization():
     data = psutil.virtual_memory().percent
-    system_metrics.append(Metric('MemoryUtilization', data, 'percent', dimension))
+    system_metrics.append(Metric("MemoryUtilization", data, "percent", dimension))


 def disk_used():
-    data = psutil.disk_usage('/').used / (1024 * 1024 * 1024) # in GB
-    system_metrics.append(Metric('DiskUsage', data, 'GB', dimension))
+    data = psutil.disk_usage("/").used / (1024 * 1024 * 1024) # in GB
+    system_metrics.append(Metric("DiskUsage", data, "GB", dimension))


 def disk_utilization():
-    data = psutil.disk_usage('/').percent
-    system_metrics.append(Metric('DiskUtilization', data, 'percent', dimension))
+    data = psutil.disk_usage("/").percent
+    system_metrics.append(Metric("DiskUtilization", data, "percent", dimension))


 def disk_available():
-    data = psutil.disk_usage('/').free / (1024 * 1024 * 1024) # in GB
-    system_metrics.append(Metric('DiskAvailable', data, 'GB', dimension))
+    data = psutil.disk_usage("/").free / (1024 * 1024 * 1024) # in GB
+    system_metrics.append(Metric("DiskAvailable", data, "GB", dimension))


 def gpu_utilization(num_of_gpu):
     """
-        Collect gpu metrics.
+    Collect gpu metrics.

-        :param num_of_gpu:
-        :return:
-        """
+    :param num_of_gpu:
+    :return:
+    """
     if num_of_gpu <= 0:
         return

     # pylint: disable=wrong-import-position
     # pylint: disable=import-outside-toplevel
     import nvgpu
     from nvgpu import list_gpus

     # pylint: enable=wrong-import-position
     # pylint: enable=import-outside-toplevel

     info = nvgpu.gpu_info()
     for value in info:
-        dimension_gpu = [Dimension('Level', 'Host'), Dimension("device_id", value['index'])]
-        system_metrics.append(Metric('GPUMemoryUtilization', value['mem_used_percent'], 'percent', dimension_gpu))
-        system_metrics.append(Metric('GPUMemoryUsed', value['mem_used'], 'MB', dimension_gpu))
+        dimension_gpu = [
+            Dimension("Level", "Host"),
+            Dimension("device_id", value["index"]),
+        ]
+        system_metrics.append(
+            Metric(
+                "GPUMemoryUtilization",
+                value["mem_used_percent"],
+                "percent",
+                dimension_gpu,
+            )
+        )
+        system_metrics.append(
+            Metric("GPUMemoryUsed", value["mem_used"], "MB", dimension_gpu)
+        )

+    try:
+        statuses = list_gpus.device_statuses()
+    except pynvml.nvml.NVMLError_NotSupported:
+        logging.warning("gpu device monitoring not supported")
+        statuses = []
+
-    statuses = list_gpus.device_statuses()
     for idx, status in enumerate(statuses):
-        dimension_gpu = [Dimension('Level', 'Host'), Dimension("device_id", idx)]
-        system_metrics.append(Metric('GPUUtilization', status['utilization'], 'percent', dimension_gpu))
+        dimension_gpu = [Dimension("Level", "Host"), Dimension("device_id", idx)]
+        system_metrics.append(
+            Metric("GPUUtilization", status["utilization"], "percent", dimension_gpu)
+        )


def collect_all(mod, num_of_gpu):
"""
Expand All @@ -86,7 +111,10 @@ def collect_all(mod, num_of_gpu):
     members = dir(mod)
     for i in members:
         value = getattr(mod, i)
-        if isinstance(value, types.FunctionType) and value.__name__ not in ('collect_all', 'log_msg'):
+        if isinstance(value, types.FunctionType) and value.__name__ not in (
+            "collect_all",
+            "log_msg",
+        ):
             if value.__name__ == "gpu_utilization":
                 value(num_of_gpu)
             else:
Expand Down