Filter out malformed nvidia-smi process_name XML tag
jfennick authored and mr-c committed Oct 2, 2023
1 parent 83b99b4 commit ebaeb74
Showing 3 changed files with 65 additions and 86 deletions.
51 changes: 36 additions & 15 deletions cwltool/cuda.py
@@ -8,23 +8,47 @@
 from .utils import CWLObjectType
 
 
+def cuda_device_count() -> str:
+    """Determine the number of attached CUDA GPUs."""
+    # For the number of GPUs, we can use the following query
+    cmd = ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"]
+    try:
+        # This is equivalent to subprocess.check_output, but use
+        # subprocess.run so we can use separate MagicMocks in test_cuda.py
+        proc = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)  # nosec
+    except Exception as e:
+        _logger.warning("Error checking number of GPUs with nvidia-smi: %s", e)
+        return "0"
+    return proc.stdout.decode("utf-8")



 def cuda_version_and_device_count() -> Tuple[str, int]:
     """Determine the CUDA version and number of attached CUDA GPUs."""
+    count = int(cuda_device_count())
+
+    # Since there is no specific query for the cuda version, we have to use
+    # `nvidia-smi -q -x`
+    # However, apparently nvidia-smi is not safe to call concurrently.
+    # With --parallel, sometimes the returned XML will contain
+    # <process_name>\xff...\xff</process_name>
+    # (or other arbitrary bytes) and xml.dom.minidom.parseString will raise
+    # "xml.parsers.expat.ExpatError: not well-formed (invalid token)"
+    # So we either need to use `grep -v process_name` to blacklist that tag,
+    # (and hope that no other tags cause problems in the future)
+    # or better yet use `grep cuda_version` to only grab the tags we will use.
+    cmd = "nvidia-smi -q -x | grep cuda_version"
     try:
-        out = subprocess.check_output(["nvidia-smi", "-q", "-x"])  # nosec
+        out = subprocess.check_output(cmd, shell=True)  # nosec
     except Exception as e:
         _logger.warning("Error checking CUDA version with nvidia-smi: %s", e)
         return ("", 0)
-    dm = xml.dom.minidom.parseString(out)  # nosec
-
-    ag = dm.getElementsByTagName("attached_gpus")
-    if len(ag) < 1 or ag[0].firstChild is None:
-        _logger.warning(
-            "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty.: %s",
-            out,
-        )
-        return ("", 0)
-    ag_element = ag[0].firstChild
+    try:
+        dm = xml.dom.minidom.parseString(out)  # nosec
+    except xml.parsers.expat.ExpatError as e:
+        _logger.warning("Error parsing XML stdout of nvidia-smi: %s", e)
+        _logger.warning("stdout: %s", out)
+        return ("", 0)
 
     cv = dm.getElementsByTagName("cuda_version")
     if len(cv) < 1 or cv[0].firstChild is None:
@@ -35,13 +35,10 @@ def cuda_version_and_device_count() -> Tuple[str, int]:
         return ("", 0)
     cv_element = cv[0].firstChild
 
-    if isinstance(cv_element, xml.dom.minidom.Text) and isinstance(
-        ag_element, xml.dom.minidom.Text
-    ):
-        return (cv_element.data, int(ag_element.data))
+    if isinstance(cv_element, xml.dom.minidom.Text):
+        return (cv_element.data, count)
     _logger.warning(
-        "Error checking CUDA version with nvidia-smi. "
-        "Either 'attached_gpus' or 'cuda_version' was not a text node: %s",
+        "Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node: %s",
         out,
     )
     return ("", 0)
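For context (not part of the commit), a minimal sketch of the failure mode and of a shell-free equivalent of the `nvidia-smi -q -x | grep cuda_version` pipeline. The `malformed` bytes are a made-up stand-in for what a concurrent nvidia-smi run can emit, and the second half assumes nvidia-smi is on PATH:

import subprocess
import xml.dom.minidom
import xml.parsers.expat

# Arbitrary bytes inside a tag are not valid UTF-8, so expat rejects the document.
malformed = b"<nvidia><process_name>\xff\xff</process_name></nvidia>"
try:
    xml.dom.minidom.parseString(malformed)
except xml.parsers.expat.ExpatError as err:
    print("parse failed:", err)  # e.g. "not well-formed (invalid token)"

# Shell-free equivalent of `nvidia-smi -q -x | grep cuda_version`: keep only
# the line naming the tag we read, so stray bytes elsewhere cannot break parsing.
out = subprocess.check_output(["nvidia-smi", "-q", "-x"])  # nosec
filtered = b"\n".join(line for line in out.splitlines() if b"cuda_version" in line)
dm = xml.dom.minidom.parseString(filtered)  # nosec
print(dm.getElementsByTagName("cuda_version")[0].firstChild.data)

Like the commit's grep, this assumes exactly one matching line; a second match would yield two root elements and fail to parse.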
98 changes: 28 additions & 70 deletions tests/test_cuda.py
@@ -90,8 +90,11 @@ def _makebuilder(cudaReq: CWLObjectType) -> Builder:
 
 
 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
-def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> None:
+def test_cuda_job_setup_check(
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock
+) -> None:
     runtime_context = RuntimeContext({})
 
     cudaReq: CWLObjectType = {
@@ -101,74 +104,43 @@
     }
     builder = _makebuilder(cudaReq)
 
+    count.return_value = "1"
     check_output.return_value = """
 <nvidia>
-    <attached_gpus>1</attached_gpus>
     <cuda_version>1.0</cuda_version>
 </nvidia>
 """
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
     jb._setup(runtime_context)


 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
-def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock) -> None:
-    runtime_context = RuntimeContext({})
-
-    cudaReq: CWLObjectType = {
-        "class": "http://commonwl.org/cwltool#CUDARequirement",
-        "cudaVersionMin": "2.0",
-        "cudaComputeCapability": "1.0",
-    }
-    builder = _makebuilder(cudaReq)
-
-    check_output.return_value = """
-<nvidia>
-    <attached_gpus>1</attached_gpus>
-    <cuda_version>1.0</cuda_version>
-</nvidia>
-"""
-    jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
-    with pytest.raises(WorkflowException):
-        jb._setup(runtime_context)
-
-
-@mock.patch("subprocess.check_output")
-@mock.patch("os.makedirs")
-def test_cuda_job_setup_check_err_empty_attached_gpus(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+def test_cuda_job_setup_check_err(
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock
 ) -> None:
     runtime_context = RuntimeContext({})
 
     cudaReq: CWLObjectType = {
         "class": "http://commonwl.org/cwltool#CUDARequirement",
-        "cudaVersionMin": "1.0",
+        "cudaVersionMin": "2.0",
         "cudaComputeCapability": "1.0",
     }
     builder = _makebuilder(cudaReq)
 
+    count.return_value = "1"
     check_output.return_value = """
 <nvidia>
-    <attached_gpus></attached_gpus>
     <cuda_version>1.0</cuda_version>
 </nvidia>
 """
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
     with pytest.raises(WorkflowException):
         jb._setup(runtime_context)
-    assert (
-        "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
-        in caplog.text
-    )


-@mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
-def test_cuda_job_setup_check_err_empty_missing_attached_gpus(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+def test_cuda_job_setup_check_err_missing_query_gpu_count(
+    makedirs: MagicMock, count: MagicMock, caplog: pytest.LogCaptureFixture
 ) -> None:
     runtime_context = RuntimeContext({})
 
@@ -179,25 +151,19 @@ def test_cuda_job_setup_check_err_empty_missing_attached_gpus(
     }
     builder = _makebuilder(cudaReq)
 
-    check_output.return_value = """
-<nvidia>
-    <cuda_version>1.0</cuda_version>
-</nvidia>
-"""
+    count.return_value = ""
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
     with pytest.raises(WorkflowException):
         jb._setup(runtime_context)
-    assert (
-        "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
-        in caplog.text
-    )
+    assert "invalid literal for int() with base 10" in caplog.text


 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
 def test_cuda_job_setup_check_err_empty_cuda_version(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
 ) -> None:
     runtime_context = RuntimeContext({})
 
@@ -208,11 +174,9 @@ def test_cuda_job_setup_check_err_empty_cuda_version(
     }
     builder = _makebuilder(cudaReq)
 
+    count.return_value = "1"
     check_output.return_value = """
 <nvidia>
-    <attached_gpus>1</attached_gpus>
     <cuda_version></cuda_version>
 </nvidia>
 """
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
@@ -225,9 +189,10 @@


 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
 def test_cuda_job_setup_check_err_missing_cuda_version(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
 ) -> None:
     runtime_context = RuntimeContext({})
 
@@ -238,25 +203,20 @@ def test_cuda_job_setup_check_err_missing_cuda_version(
     }
     builder = _makebuilder(cudaReq)
 
-    check_output.return_value = """
-<nvidia>
-    <attached_gpus>1</attached_gpus>
-</nvidia>
-"""
+    count.return_value = "1"
+    check_output.return_value = ""
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
     with pytest.raises(WorkflowException):
         jb._setup(runtime_context)
-    assert (
-        "Error checking CUDA version with nvidia-smi. Missing 'cuda_version' or it is empty."
-        in caplog.text
-    )
+    assert "Error parsing XML stdout of nvidia-smi" in caplog.text


 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
 def test_cuda_job_setup_check_err_wrong_type_cuda_version(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
 ) -> None:
     runtime_context = RuntimeContext({})
 
@@ -267,19 +227,17 @@ def test_cuda_job_setup_check_err_wrong_type_cuda_version(
     }
     builder = _makebuilder(cudaReq)
 
+    count.return_value = "1"
     check_output.return_value = """
 <nvidia>
-    <attached_gpus>1</attached_gpus>
     <cuda_version><subelement /></cuda_version>
 </nvidia>
 """
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
     with pytest.raises(WorkflowException):
         jb._setup(runtime_context)
     assert (
-        "Error checking CUDA version with nvidia-smi. "
-        "Either 'attached_gpus' or 'cuda_version' was not a text node" in caplog.text
+        "Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node"
+        in caplog.text
    )
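An aside rather than part of the diff: since `cuda_version_and_device_count` now obtains the count from `cuda_device_count()`, the tests above patch the two lookups independently. A self-contained sketch of that pattern, assuming a local cwltool install (the literal values are arbitrary):

from unittest import mock

with mock.patch("cwltool.cuda.cuda_device_count") as count, mock.patch(
    "subprocess.check_output"
) as check_output:
    count.return_value = "2"  # what the count query would print
    check_output.return_value = "<nvidia><cuda_version>11.4</cuda_version></nvidia>"
    from cwltool.cuda import cuda_version_and_device_count

    print(cuda_version_and_device_count())  # expected: ("11.4", 2)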


2 changes: 1 addition & 1 deletion tests/wf/nvidia-smi-container.cwl
@@ -7,7 +7,7 @@ requirements:
     cudaVersionMin: "1.0"
     cudaComputeCapability: "1.0"
   DockerRequirement:
-    dockerPull: "nvidia/cuda:11.4.2-runtime-ubuntu20.04"
+    dockerPull: "nvidia/cuda:11.4.3-runtime-ubuntu20.04"
 inputs: []
 outputs: []
 # Assume this will exit non-zero (resulting in a failing test case) if
