diff --git a/coretex/cli/modules/node.py b/coretex/cli/modules/node.py index b545b260..80696556 100644 --- a/coretex/cli/modules/node.py +++ b/coretex/cli/modules/node.py @@ -23,6 +23,7 @@ import os import logging import requests +import platform import click @@ -376,6 +377,7 @@ def checkResourceLimitations() -> None: def configureNode(advanced: bool) -> NodeConfiguration: ui.highlightEcho("[Node Configuration]") nodeConfig = NodeConfiguration({}) # create new empty node config + currentOS = platform.system().lower() cpuLimit, ramLimit = docker.getResourceLimits() swapLimit = docker.getDockerSwapLimit() @@ -388,11 +390,42 @@ def configureNode(advanced: bool) -> NodeConfiguration: else: nodeConfig.image = "coretexai/coretex-node" - if isGPUAvailable(): + # GPU Access is supported for: + # - Linux (Docker Engine) + # - Windows (Docker Desktop) + + if isGPUAvailable() and not (docker.isDockerDesktop() and currentOS != "windows"): nodeConfig.allowGpu = ui.clickPrompt("Do you want to allow the Node to access your GPU? (Y/n)", type = bool, default = True) else: nodeConfig.allowGpu = False + if nodeConfig.allowGpu and platform.system().lower() == "linux" and not docker.isDaemonFileUpdated(): + shouldUpdateDockerConfig = ui.clickPrompt( + "NVIDIA has a bug where a docker container running Coretex Node can lose access to GPU " + "(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48). " + "\nDo you want Coretex CLI to apply a workaround for this bug " + "(NOTE: This requires docker daemon restart)? (Y/n)", + type = bool, + default = True + ) + + if shouldUpdateDockerConfig: + docker.updateDaemonFile() + shouldRestartDocker = ui.clickPrompt("Do you want to restart Docker to apply the changes? (Y/n)", type = bool, default = True) + + if shouldRestartDocker: + docker.restartDocker() + else: + ui.warningEcho( + "Warning: The changes will not take effect until Docker is restarted. 
" + "(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)" + ) + else: + ui.warningEcho( + "Warning: Not updating the daemon.json file may lead to GPU access issues in Docker " + "containers. (https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)" + ) + if imageType == ImageType.official: tag = "gpu" if nodeConfig.allowGpu else "cpu" nodeConfig.image += f":latest-{tag}" diff --git a/coretex/utils/docker.py b/coretex/utils/docker.py index 8bd867fa..09e6426d 100644 --- a/coretex/utils/docker.py +++ b/coretex/utils/docker.py @@ -3,6 +3,7 @@ import json import platform +import tempfile from .process import command, CommandException from ..statistics import getTotalSwapMemory @@ -22,7 +23,7 @@ def isDockerAvailable() -> None: def networkExists(name: str) -> bool: # This function inspects the specified Docker network using the - # 'docker network inspect' command. If the command exits with a return code + # "docker network inspect" command. If the command exits with a return code # of 0, indicating success, the function returns True, meaning the network exists. # If the command exits with a non-zero return code, indicating failure, # the function returns False, meaning the network doesn't exist. 
def isDockerDesktop() -> bool:
    """
        Best-effort check for whether the local Docker installation
        is Docker Desktop.

        Runs "docker info --format {{json .}}" and looks for the word
        "desktop" in the "Version" of the client plugins reported under
        "ClientInfo" -> "Plugins".

        Returns
        -------
        bool -> True if a client plugin version contains "desktop",
        False otherwise (including when the docker command fails or its
        output can't be parsed)
    """

    try:
        _, output, _ = command(["docker", "info", "--format", "{{json .}}"], ignoreStdout = True, ignoreStderr = True)
        jsonOutput = json.loads(output)

        if not isinstance(jsonOutput, dict):
            return False

        clientInfo = jsonOutput.get("ClientInfo")
        if not isinstance(clientInfo, dict):
            return False

        # NOTE(review): "Plugins" appears to be reported as a list of plugin
        # objects by current Docker CLIs - confirm. A single dict is still
        # accepted so the previously handled shape keeps working.
        pluginsInfo = clientInfo.get("Plugins")
        if isinstance(pluginsInfo, dict):
            plugins = [pluginsInfo]
        elif isinstance(pluginsInfo, list):
            plugins = pluginsInfo
        else:
            return False

        for plugin in plugins:
            if not isinstance(plugin, dict):
                continue

            versionInfo = plugin.get("Version")
            if isinstance(versionInfo, str) and "desktop" in versionInfo:
                return True

        return False
    except Exception:
        # Detection is deliberately best-effort: any failure means
        # "not Docker Desktop". "Exception" (instead of a bare except)
        # lets KeyboardInterrupt/SystemExit propagate.
        return False


def isDaemonFileUpdated(daemonFile: Path = Path("/etc/docker/daemon.json")) -> bool:
    """
        Checks if the Docker daemon configuration already contains
        the cgroup driver workaround.

        Parameters
        ----------
        daemonFile : Path
            path to the Docker daemon configuration file

        Returns
        -------
        bool -> True if "exec-opts" contains "native.cgroupdriver=cgroupfs",
        False if the file is missing, is not valid JSON, or does not
        have the expected structure
    """

    cGroupFix = "native.cgroupdriver=cgroupfs"

    if not daemonFile.exists():
        return False

    with daemonFile.open("r") as file:
        try:
            config = json.load(file)
        except json.JSONDecodeError:
            return False

    # Valid JSON is not necessarily an object with a list of exec options
    if not isinstance(config, dict):
        return False

    execOpts = config.get("exec-opts", [])
    if not isinstance(execOpts, list):
        return False

    return cGroupFix in execOpts


def updateDaemonFile(daemonFile: Path = Path("/etc/docker/daemon.json")) -> None:
    """
        Adds "native.cgroupdriver=cgroupfs" to the "exec-opts" of the
        Docker daemon configuration, keeping all other settings.

        The new configuration is written to a temporary file which is then
        moved over the daemon file with "sudo mv". A missing or unparsable
        daemon file is treated as an empty configuration.

        Parameters
        ----------
        daemonFile : Path
            path to the Docker daemon configuration file
    """

    cGroupFix = "native.cgroupdriver=cgroupfs"
    config: Dict[str, Any] = {}

    # Only read the file if it is there, opening a missing file would raise
    if daemonFile.exists():
        with daemonFile.open("r") as file:
            try:
                loadedConfig = json.load(file)
            except json.JSONDecodeError:
                loadedConfig = None

        if isinstance(loadedConfig, dict):
            config = loadedConfig

    execOpts = config.get("exec-opts")
    if not isinstance(execOpts, list):
        execOpts = []

    # Don't add the option twice if this function is called again
    if cGroupFix not in execOpts:
        execOpts.append(cGroupFix)

    config["exec-opts"] = execOpts

    # delete = False: the file has to be closed (fully flushed) and still
    # present on disk when "mv" runs, so it is cleaned up manually below
    with tempfile.NamedTemporaryFile("w", delete = False) as tempFile:
        json.dump(config, tempFile, indent = 4)
        tempFilePath = tempFile.name

    try:
        # Use sudo to move the temporary file to the protected location
        # NOTE(review): the moved file keeps the owner and mode of the
        # temporary file - confirm that is acceptable for daemon.json
        command(["sudo", "mv", tempFilePath, str(daemonFile)], ignoreStderr = True, ignoreStdout = True)
    finally:
        # After a successful move there is nothing left to remove,
        # after a failed one the temporary file must not be leaked
        leftover = Path(tempFilePath)
        if leftover.exists():
            leftover.unlink()
def restartDocker() -> None:
    """
        Restarts the Docker service by running
        "sudo systemctl restart docker" through the "command" helper,
        with both stdout and stderr flagged as ignored.

        NOTE(review): failure handling is whatever "command" does
        for a failing process - confirm against its implementation.
    """

    restartCommand = ["sudo", "systemctl", "restart", "docker"]
    command(restartCommand, ignoreStdout = True, ignoreStderr = True)