CTX-6655: Fix node container losing access to gpu randomly. #259

Merged: 7 commits, Aug 27, 2024
coretex/cli/modules/node.py (34 additions, 1 deletion)
@@ -23,6 +23,7 @@
 import os
 import logging
 import requests
+import platform

 import click

@@ -376,6 +377,7 @@ def checkResourceLimitations() -> None:
 def configureNode(advanced: bool) -> NodeConfiguration:
     ui.highlightEcho("[Node Configuration]")
     nodeConfig = NodeConfiguration({}) # create new empty node config
+    currentOS = platform.system().lower()

     cpuLimit, ramLimit = docker.getResourceLimits()
     swapLimit = docker.getDockerSwapLimit()
@@ -388,11 +390,42 @@ def configureNode(advanced: bool) -> NodeConfiguration:
     else:
         nodeConfig.image = "coretexai/coretex-node"

-    if isGPUAvailable():
+    # GPU Access is supported for:
+    # - Linux (Docker Engine)
+    # - Windows (Docker Desktop)
+
+    if isGPUAvailable() and not (docker.isDockerDesktop() and currentOS != "windows"):
         nodeConfig.allowGpu = ui.clickPrompt("Do you want to allow the Node to access your GPU? (Y/n)", type = bool, default = True)
     else:
         nodeConfig.allowGpu = False

+    if nodeConfig.allowGpu and platform.system().lower() == "linux" and not docker.isDaemonFileUpdated():
+        shouldUpdateDockerConfig = ui.clickPrompt(
+            "NVIDIA has a bug where a docker container running Coretex Node can lose access to GPU "
+            "(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48). "
+            "\nDo you want Coretex CLI to apply a workaround for this bug "
+            "(NOTE: This requires docker daemon restart)? (Y/n)",
+            type = bool,
+            default = True
+        )
+
+        if shouldUpdateDockerConfig:
+            docker.updateDaemonFile()
+            shouldRestartDocker = ui.clickPrompt("Do you want to restart Docker to apply the changes? (Y/n)", type = bool, default = True)
+
+            if shouldRestartDocker:
+                docker.restartDocker()
+            else:
+                ui.warningEcho(
+                    "Warning: The changes will not take effect until Docker is restarted. "
+                    "(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)"
+                )
+        else:
+            ui.warningEcho(
+                "Warning: Not updating the daemon.json file may lead to GPU access issues in Docker "
+                "containers. (https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)"
+            )
+
     if imageType == ImageType.official:
         tag = "gpu" if nodeConfig.allowGpu else "cpu"
         nodeConfig.image += f":latest-{tag}"
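For reference, a minimal sketch of what the workaround prompted above is expected to leave in /etc/docker/daemon.json, assuming the file did not exist or held no "exec-opts" entries beforehand; updateDaemonFile() (added in coretex/utils/docker.py below) merges the entry into any existing "exec-opts" list rather than replacing it:

import json

# Expected daemon.json content after the workaround, assuming no prior
# "exec-opts" entries were present (see updateDaemonFile() below).
expectedDaemonConfig = {
    "exec-opts": ["native.cgroupdriver=cgroupfs"]
}

print(json.dumps(expectedDaemonConfig, indent = 4))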
coretex/utils/docker.py (71 additions, 2 deletions)
@@ -3,6 +3,7 @@

 import json
 import platform
+import tempfile

 from .process import command, CommandException
 from ..statistics import getTotalSwapMemory
@@ -22,7 +23,7 @@ def isDockerAvailable() -> None:

 def networkExists(name: str) -> bool:
     # This function inspects the specified Docker network using the
-    # 'docker network inspect' command. If the command exits with a return code
+    # "docker network inspect" command. If the command exits with a return code
     # of 0, indicating success, the function returns True, meaning the network exists.
     # If the command exits with a non-zero return code, indicating failure,
     # the function returns False, meaning the network doesn't exist.
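The body of networkExists() is collapsed in this diff; the following is a hypothetical sketch of the exit-code pattern the comment describes, assuming command() raises CommandException on a non-zero return code (it is imported together with CommandException at the top of this file) and that the module is importable as coretex.utils.process:

from coretex.utils.process import command, CommandException

def networkExistsSketch(name: str) -> bool:
    # Hypothetical stand-in for the collapsed body: if "docker network inspect"
    # succeeds the network exists, otherwise it does not.
    try:
        command(["docker", "network", "inspect", name], ignoreStdout = True, ignoreStderr = True)
        return True
    except CommandException:
        return False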
@@ -98,7 +99,7 @@ def start(

     runCommand = [
         "docker", "run", "-d",
-        "--restart", 'always',
+        "--restart", "always",
         "-p", "21000:21000",
         "--cap-add", "SYS_PTRACE",
         "--network", name,
@@ -198,3 +199,71 @@ def getLogs(name: str, tail: Optional[int], follow: bool, timestamps: bool) -> None:
         runCommand.append("-f")

     command(runCommand)
+
+
+def isDockerDesktop() -> bool:
+    try:
+        _, output, _ = command(["docker", "info", "--format", "{{json .}}"], ignoreStdout = True, ignoreStderr = True)
+        jsonOutput = json.loads(output)
+
+        clientInfo = jsonOutput.get("ClientInfo")
+        if not isinstance(clientInfo, dict):
+            return False
+
+        pluginsInfo = clientInfo.get("Plugins")
+        if not isinstance(pluginsInfo, dict):
+            return False
+
+        versionInfo = pluginsInfo.get("Version")
+        if not isinstance(versionInfo, str):
+            return False
+
+        return "desktop" in versionInfo
+    except:
+        return False
+
+
+def isDaemonFileUpdated() -> bool:
+    daemonFile = Path("/etc/docker/daemon.json")
+    cGroupFix = "native.cgroupdriver=cgroupfs"
+
+    if not daemonFile.exists():
+        return False
+
+    with daemonFile.open("r") as file:
+        try:
+            config = json.load(file)
+            execOpts = config.get("exec-opts", [])
+            return cGroupFix in execOpts
+        except json.JSONDecodeError:
+            return False
+
+
+def updateDaemonFile() -> None:
+    daemonFile = Path("/etc/docker/daemon.json")
+    cGroupFix = "native.cgroupdriver=cgroupfs"
+    config: Dict[str, Any] = {}
+
+    if not daemonFile.exists():
+        config = {}
+    else:
+        with daemonFile.open("r") as file:
+            try:
+                config = json.load(file)
+            except json.JSONDecodeError:
+                config = {}
+
+    execOpts: List[str] = config.get("exec-opts", [])
+    execOpts.append(cGroupFix)
+    config["exec-opts"] = execOpts
+
+    with tempfile.NamedTemporaryFile("w", delete = False) as tempFile:
+        json.dump(config, tempFile, indent = 4)
+        tempFilePath = tempFile.name
+
+    # Use sudo to move the temporary file to the protected location
+    command(["sudo", "mv", tempFilePath, str(daemonFile)], ignoreStderr = True, ignoreStdout = True)
+
+
+def restartDocker() -> None:
+    command(["sudo", "systemctl", "restart", "docker"], ignoreStderr = True, ignoreStdout = True)