From b7cc261bd1b1102e413fe4fb88de38f68b5c6135 Mon Sep 17 00:00:00 2001 From: Bogdan Tintor Date: Fri, 23 Aug 2024 16:21:44 +0200 Subject: [PATCH 1/7] CTX-6655: Fix regarding bug where node container could lose access to gpu randomly (https://github.com/NVIDIA/nvidia-container-toolkit/issues/48). --- coretex/cli/modules/node.py | 31 ++++++++++++++++++++++++++ coretex/utils/docker.py | 44 +++++++++++++++++++++++++++++++++++-- 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/coretex/cli/modules/node.py b/coretex/cli/modules/node.py index b545b260..824857d0 100644 --- a/coretex/cli/modules/node.py +++ b/coretex/cli/modules/node.py @@ -23,6 +23,7 @@ import os import logging import requests +import platform import click @@ -393,6 +394,36 @@ def configureNode(advanced: bool) -> NodeConfiguration: else: nodeConfig.allowGpu = False + if nodeConfig.allowGpu and platform.system().lower() == "linux": + shouldUpdateDaemon = not docker.isDaemonFileUpdated() + + if shouldUpdateDaemon: + userApproval = ui.clickPrompt( + "Node container could randomly lose access to GPU. " + "The daemon.json file does not contain the necessary cgroup fix " + "(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48). " + "Do you want to update the file to include the cgroup fix? (Y/n)", + type = bool, + default = True + ) + + if userApproval: + docker.updateDaemonFile() + restartPrompt = ui.clickPrompt("Do you want to restart Docker to apply the changes? (Y/n)", type = bool, default = True) + + if restartPrompt: + docker.restartDocker() + else: + ui.warningEcho( + "Warning: The changes will not take effect until Docker is restarted. " + "(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)" + ) + else: + ui.warningEcho( + "Warning: Not updating the daemon.json file may lead to GPU access issues in Docker " + "containers. (https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)" + ) + if imageType == ImageType.official: tag = "gpu" if nodeConfig.allowGpu else "cpu" nodeConfig.image += f":latest-{tag}" diff --git a/coretex/utils/docker.py b/coretex/utils/docker.py index 8bd867fa..567a45ba 100644 --- a/coretex/utils/docker.py +++ b/coretex/utils/docker.py @@ -1,6 +1,7 @@ from typing import Dict, Any, List, Tuple, Optional from pathlib import Path +import os import json import platform @@ -22,7 +23,7 @@ def isDockerAvailable() -> None: def networkExists(name: str) -> bool: # This function inspects the specified Docker network using the - # 'docker network inspect' command. If the command exits with a return code + # "docker network inspect" command. If the command exits with a return code # of 0, indicating success, the function returns True, meaning the network exists. # If the command exits with a non-zero return code, indicating failure, # the function returns False, meaning the network doesn't exist. @@ -98,7 +99,7 @@ def start( runCommand = [ "docker", "run", "-d", - "--restart", 'always', + "--restart", "always", "-p", "21000:21000", "--cap-add", "SYS_PTRACE", "--network", name, @@ -198,3 +199,42 @@ def getLogs(name: str, tail: Optional[int], follow: bool, timestamps: bool) -> N runCommand.append("-f") command(runCommand) + + +def isDaemonFileUpdated() -> bool: + daemonFile = "/etc/docker/daemon.json" + cGroupFix = "native.cgroupdriver=cgroupfs" + + if os.path.exists(daemonFile): + with open(daemonFile, "r") as file: + try: + config = json.load(file) + execOpts = config.get("exec-opts", []) + return cGroupFix in execOpts + except json.JSONDecodeError: + return False + return False + + +def updateDaemonFile() -> None: + daemonFile = "/etc/docker/daemon.json" + cGroupFix = "native.cgroupdriver=cgroupfs" + config = {} + + if os.path.exists(daemonFile): + with open(daemonFile, "r") as file: + try: + config = json.load(file) + except json.JSONDecodeError: + config = {} + + execOpts: List[str] = config.get("exec-opts", []) + execOpts.append(cGroupFix) + config["exec-opts"] = execOpts + with open(daemonFile, "w") as file: + json.dump(config, file, indent = 4) + + +def restartDocker() -> None: + command(["sudo", "systemctl", "restart", "docker"], ignoreStderr = True, ignoreStdout = True) + print("Docker restarted successfully.") From 6dc58a3b36c1cb49ee801085740f23e1a1b28d49 Mon Sep 17 00:00:00 2001 From: Bogdan Tintor Date: Fri, 23 Aug 2024 16:54:35 +0200 Subject: [PATCH 2/7] CTX-6655: Workaround because we need sudo privileges to change daemon.json file --- coretex/utils/docker.py | 47 +++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/coretex/utils/docker.py b/coretex/utils/docker.py index 567a45ba..11558290 100644 --- a/coretex/utils/docker.py +++ b/coretex/utils/docker.py @@ -1,9 +1,9 @@ from typing import Dict, Any, List, Tuple, Optional from pathlib import Path -import os import json import platform +import tempfile from .process import command, CommandException from ..statistics import getTotalSwapMemory @@ -205,36 +205,43 @@ def isDaemonFileUpdated() -> bool: daemonFile = "/etc/docker/daemon.json" cGroupFix = "native.cgroupdriver=cgroupfs" - if os.path.exists(daemonFile): - with open(daemonFile, "r") as file: - try: - config = json.load(file) - execOpts = config.get("exec-opts", []) - return cGroupFix in execOpts - except json.JSONDecodeError: - return False - return False + if not Path(daemonFile).exists(): + return False + + with open(daemonFile, "r") as file: + try: + config = json.load(file) + execOpts = config.get("exec-opts", []) + return cGroupFix in execOpts + except json.JSONDecodeError: + return False def updateDaemonFile() -> None: daemonFile = "/etc/docker/daemon.json" cGroupFix = "native.cgroupdriver=cgroupfs" - config = {} + config: Dict[str, Any] = {} + + if not Path(daemonFile).exists(): + config = {} - if os.path.exists(daemonFile): - with open(daemonFile, "r") as file: - try: - config = json.load(file) - except json.JSONDecodeError: - config = {} + with open(daemonFile, "r") as file: + try: + config = json.load(file) + except json.JSONDecodeError: + config = {} execOpts: List[str] = config.get("exec-opts", []) execOpts.append(cGroupFix) config["exec-opts"] = execOpts - with open(daemonFile, "w") as file: - json.dump(config, file, indent = 4) + + with tempfile.NamedTemporaryFile("w", delete = False) as tempFile: + json.dump(config, tempFile, indent = 4) + tempFilePath = tempFile.name + + # Use sudo to move the temporary file to the protected location + command(["sudo", "mv", tempFilePath, daemonFile], ignoreStderr = True, ignoreStdout = True) def restartDocker() -> None: command(["sudo", "systemctl", "restart", "docker"], ignoreStderr = True, ignoreStdout = True) - print("Docker restarted successfully.") From 7b84feadc26fa0890e3f9fca6a437ad9bd7b933c Mon Sep 17 00:00:00 2001 From: Bogdan Tintor Date: Mon, 26 Aug 2024 09:58:16 +0200 Subject: [PATCH 3/7] CTX-6655: Changes regarding discussions with Dusko. If user is using docker desktop on linux he can't start coretex gpu image, prompt is edited with appropriate message. --- coretex/cli/modules/node.py | 8 ++++---- coretex/utils/docker.py | 8 ++++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/coretex/cli/modules/node.py b/coretex/cli/modules/node.py index 824857d0..368b2536 100644 --- a/coretex/cli/modules/node.py +++ b/coretex/cli/modules/node.py @@ -389,7 +389,7 @@ def configureNode(advanced: bool) -> NodeConfiguration: else: nodeConfig.image = "coretexai/coretex-node" - if isGPUAvailable(): + if isGPUAvailable() and not docker.isDockerDesktop(): nodeConfig.allowGpu = ui.clickPrompt("Do you want to allow the Node to access your GPU? (Y/n)", type = bool, default = True) else: nodeConfig.allowGpu = False @@ -399,10 +399,10 @@ def configureNode(advanced: bool) -> NodeConfiguration: if shouldUpdateDaemon: userApproval = ui.clickPrompt( - "Node container could randomly lose access to GPU. " - "The daemon.json file does not contain the necessary cgroup fix " + "NVIDIA has a bug where a docker container running Coretex Node can lose access to GPU " "(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48). " - "Do you want to update the file to include the cgroup fix? (Y/n)", + "\nDo you want Coretex CLI to apply a workaround for this bug " + "(NOTE: This requires docker daemon restart)? (Y/n)", type = bool, default = True ) diff --git a/coretex/utils/docker.py b/coretex/utils/docker.py index 11558290..c5eb91dc 100644 --- a/coretex/utils/docker.py +++ b/coretex/utils/docker.py @@ -201,6 +201,14 @@ def getLogs(name: str, tail: Optional[int], follow: bool, timestamps: bool) -> N command(runCommand) +def isDockerDesktop() -> bool: + try: + _, output, _ = command(["docker", "context", "show"], ignoreStdout = True, ignoreStderr = True) + return output.strip() == "desktop-linux" + except: + return False + + def isDaemonFileUpdated() -> bool: daemonFile = "/etc/docker/daemon.json" cGroupFix = "native.cgroupdriver=cgroupfs" From 97533bb66a97f5aae589c673d9cef03e2d2fbb23 Mon Sep 17 00:00:00 2001 From: Bogdan Tintor Date: Mon, 26 Aug 2024 11:08:50 +0200 Subject: [PATCH 4/7] CTX-6655: Changes regarding discussion with Dusko. --- coretex/cli/modules/node.py | 11 +++++------ coretex/utils/docker.py | 36 +++++++++++++++++++++++++----------- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/coretex/cli/modules/node.py b/coretex/cli/modules/node.py index 368b2536..87138e83 100644 --- a/coretex/cli/modules/node.py +++ b/coretex/cli/modules/node.py @@ -377,6 +377,7 @@ def checkResourceLimitations() -> None: def configureNode(advanced: bool) -> NodeConfiguration: ui.highlightEcho("[Node Configuration]") nodeConfig = NodeConfiguration({}) # create new empty node config + currentOS = platform.system().lower() cpuLimit, ramLimit = docker.getResourceLimits() swapLimit = docker.getDockerSwapLimit() @@ -389,16 +390,14 @@ def configureNode(advanced: bool) -> NodeConfiguration: else: nodeConfig.image = "coretexai/coretex-node" - if isGPUAvailable() and not docker.isDockerDesktop(): + if isGPUAvailable() and (currentOS != "linux" or not docker.isDockerDesktop()): nodeConfig.allowGpu = ui.clickPrompt("Do you want to allow the Node to access your GPU? (Y/n)", type = bool, default = True) else: nodeConfig.allowGpu = False if nodeConfig.allowGpu and platform.system().lower() == "linux": - shouldUpdateDaemon = not docker.isDaemonFileUpdated() - - if shouldUpdateDaemon: - userApproval = ui.clickPrompt( + if not docker.isDaemonFileUpdated(): + shouldUpdateDockerConfig = ui.clickPrompt( "NVIDIA has a bug where a docker container running Coretex Node can lose access to GPU " "(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48). " "\nDo you want Coretex CLI to apply a workaround for this bug " @@ -407,7 +406,7 @@ def configureNode(advanced: bool) -> NodeConfiguration: default = True ) - if userApproval: + if shouldUpdateDockerConfig: docker.updateDaemonFile() restartPrompt = ui.clickPrompt("Do you want to restart Docker to apply the changes? (Y/n)", type = bool, default = True) diff --git a/coretex/utils/docker.py b/coretex/utils/docker.py index c5eb91dc..09e6426d 100644 --- a/coretex/utils/docker.py +++ b/coretex/utils/docker.py @@ -203,20 +203,34 @@ def getLogs(name: str, tail: Optional[int], follow: bool, timestamps: bool) -> N def isDockerDesktop() -> bool: try: - _, output, _ = command(["docker", "context", "show"], ignoreStdout = True, ignoreStderr = True) - return output.strip() == "desktop-linux" + _, output, _ = command(["docker", "info", "--format", "{{json .}}"], ignoreStdout = True, ignoreStderr = True) + jsonOutput = json.loads(output) + + clientInfo = jsonOutput.get("ClientInfo") + if not isinstance(clientInfo, dict): + return False + + pluginsInfo = clientInfo.get("Plugins") + if not isinstance(pluginsInfo, dict): + return False + + versionInfo = pluginsInfo.get("Version") + if not isinstance(versionInfo, str): + return False + + return "desktop" in versionInfo except: return False def isDaemonFileUpdated() -> bool: - daemonFile = "/etc/docker/daemon.json" + daemonFile = Path("/etc/docker/daemon.json") cGroupFix = "native.cgroupdriver=cgroupfs" - if not Path(daemonFile).exists(): + if not daemonFile.exists(): return False - with open(daemonFile, "r") as file: + with daemonFile.open("r") as file: try: config = json.load(file) execOpts = config.get("exec-opts", []) @@ -226,14 +240,14 @@ def isDaemonFileUpdated() -> bool: def updateDaemonFile() -> None: - daemonFile = "/etc/docker/daemon.json" + daemonFile = Path("/etc/docker/daemon.json") cGroupFix = "native.cgroupdriver=cgroupfs" config: Dict[str, Any] = {} - if not Path(daemonFile).exists(): + if not daemonFile.exists(): config = {} - with open(daemonFile, "r") as file: + with daemonFile.open("r") as file: try: config = json.load(file) except json.JSONDecodeError: @@ -243,12 +257,12 @@ def updateDaemonFile() -> None: execOpts.append(cGroupFix) config["exec-opts"] = execOpts - with tempfile.NamedTemporaryFile("w", delete = False) as tempFile: + with tempfile.NamedTemporaryFile("w", delete = True) as tempFile: json.dump(config, tempFile, indent = 4) tempFilePath = tempFile.name - # Use sudo to move the temporary file to the protected location - command(["sudo", "mv", tempFilePath, daemonFile], ignoreStderr = True, ignoreStdout = True) + # Use sudo to move the temporary file to the protected location + command(["sudo", "mv", tempFilePath, str(daemonFile)], ignoreStderr = True, ignoreStdout = True) def restartDocker() -> None: From e63414accd4e6ce912d052e29984abbbda715292 Mon Sep 17 00:00:00 2001 From: Bogdan Tintor Date: Tue, 27 Aug 2024 10:00:18 +0200 Subject: [PATCH 5/7] CTX-6655: Simplify if condition and explain it. --- coretex/cli/modules/node.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/coretex/cli/modules/node.py b/coretex/cli/modules/node.py index 87138e83..9b507bd4 100644 --- a/coretex/cli/modules/node.py +++ b/coretex/cli/modules/node.py @@ -390,7 +390,11 @@ def configureNode(advanced: bool) -> NodeConfiguration: else: nodeConfig.image = "coretexai/coretex-node" - if isGPUAvailable() and (currentOS != "linux" or not docker.isDockerDesktop()): + # NOTE: If node OS is linux and Docker Desktop is being used CLI won't prompt for GPU access + # Reason is that we cannot solve the NVIDIA bug without additionals changes to daemon.json file + # which doesn't exist when using Docker Desktop + + if isGPUAvailable() and not (docker.isDockerDesktop() and currentOS != "windows"): nodeConfig.allowGpu = ui.clickPrompt("Do you want to allow the Node to access your GPU? (Y/n)", type = bool, default = True) else: nodeConfig.allowGpu = False From 15592b92dca4400eda707c5e4e11486b3e26582f Mon Sep 17 00:00:00 2001 From: Bogdan Tintor Date: Tue, 27 Aug 2024 10:20:05 +0200 Subject: [PATCH 6/7] CTX-6655: Other discussion changes --- coretex/cli/modules/node.py | 45 ++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/coretex/cli/modules/node.py b/coretex/cli/modules/node.py index 9b507bd4..0f4d6b7d 100644 --- a/coretex/cli/modules/node.py +++ b/coretex/cli/modules/node.py @@ -399,33 +399,32 @@ def configureNode(advanced: bool) -> NodeConfiguration: else: nodeConfig.allowGpu = False - if nodeConfig.allowGpu and platform.system().lower() == "linux": - if not docker.isDaemonFileUpdated(): - shouldUpdateDockerConfig = ui.clickPrompt( - "NVIDIA has a bug where a docker container running Coretex Node can lose access to GPU " - "(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48). " - "\nDo you want Coretex CLI to apply a workaround for this bug " - "(NOTE: This requires docker daemon restart)? (Y/n)", - type = bool, - default = True - ) + if nodeConfig.allowGpu and platform.system().lower() == "linux" and not docker.isDaemonFileUpdated(): + shouldUpdateDockerConfig = ui.clickPrompt( + "NVIDIA has a bug where a docker container running Coretex Node can lose access to GPU " + "(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48). " + "\nDo you want Coretex CLI to apply a workaround for this bug " + "(NOTE: This requires docker daemon restart)? (Y/n)", + type = bool, + default = True + ) - if shouldUpdateDockerConfig: - docker.updateDaemonFile() - restartPrompt = ui.clickPrompt("Do you want to restart Docker to apply the changes? (Y/n)", type = bool, default = True) - - if restartPrompt: - docker.restartDocker() - else: - ui.warningEcho( - "Warning: The changes will not take effect until Docker is restarted. " - "(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)" - ) + if shouldUpdateDockerConfig: + docker.updateDaemonFile() + shouldRestartDocker = ui.clickPrompt("Do you want to restart Docker to apply the changes? (Y/n)", type = bool, default = True) + + if shouldRestartDocker: + docker.restartDocker() else: ui.warningEcho( - "Warning: Not updating the daemon.json file may lead to GPU access issues in Docker " - "containers. (https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)" + "Warning: The changes will not take effect until Docker is restarted. " + "(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)" ) + else: + ui.warningEcho( + "Warning: Not updating the daemon.json file may lead to GPU access issues in Docker " + "containers. (https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)" + ) if imageType == ImageType.official: tag = "gpu" if nodeConfig.allowGpu else "cpu" From 8d0b69e3ee4637c85587cf720fa595d203129389 Mon Sep 17 00:00:00 2001 From: Bogdan Tintor Date: Tue, 27 Aug 2024 14:30:09 +0200 Subject: [PATCH 7/7] CTX-6655: Change incorrect comment to correct one. --- coretex/cli/modules/node.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/coretex/cli/modules/node.py b/coretex/cli/modules/node.py index 0f4d6b7d..80696556 100644 --- a/coretex/cli/modules/node.py +++ b/coretex/cli/modules/node.py @@ -390,9 +390,9 @@ def configureNode(advanced: bool) -> NodeConfiguration: else: nodeConfig.image = "coretexai/coretex-node" - # NOTE: If node OS is linux and Docker Desktop is being used CLI won't prompt for GPU access - # Reason is that we cannot solve the NVIDIA bug without additionals changes to daemon.json file - # which doesn't exist when using Docker Desktop + # GPU Access is supported for: + # - Linux (Docker Engine) + # - Windows (Docker Desktop) if isGPUAvailable() and not (docker.isDockerDesktop() and currentOS != "windows"): nodeConfig.allowGpu = ui.clickPrompt("Do you want to allow the Node to access your GPU? (Y/n)", type = bool, default = True)