Skip to content

Commit

Permalink
[FIX] Must validate ENV settings or wrong gpu selected by nvidia-smi (a…
Browse files Browse the repository at this point in the history
…pache#59)

* Raise an error if the CUDA_DEVICE_ORDER=PCI_BUS_ID environment variable is not set on a multi-GPU system

* quotes

* protect bad gpu_id

* fix logic

* clean comment

* Update target_detector.py

---------

Co-authored-by: Lei Wang <34334180+LeiWang1999@users.noreply.github.com>
  • Loading branch information
Qubitium and LeiWang1999 authored Jun 21, 2024
1 parent cca477e commit 2634815
Showing 1 changed file with 12 additions and 4 deletions.
16 changes: 12 additions & 4 deletions python/bitblas/utils/target_detector.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import os
import subprocess
from typing import List
from thefuzz import process
Expand All @@ -26,16 +26,24 @@ def get_gpu_model_from_nvidia_smi(gpu_id: int = 0):
try:
# Execute nvidia-smi command to get the GPU name
output = subprocess.check_output(
["nvidia-smi", f"--id={gpu_id}", "--query-gpu=gpu_name", "--format=csv,noheader"],
["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader"],
encoding="utf-8",
).strip()
except subprocess.CalledProcessError as e:
logger.info("nvidia-smi failed with error: %s", e)
return None

# Return the name of the first GPU if multiple are present
return output.split("\n")[0]
gpus = output.split("\n")

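# With multiple GPUs, CUDA_DEVICE_ORDER=PCI_BUS_ID must be set so that gpu_id refers to the same
# device ordering that nvidia-smi reports; otherwise the wrong GPU name is returned for gpu_id.
# NOTE(review): the check below uses `len(gpus) > 0`, so it also triggers with one GPU - confirm.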
if len(gpus) > 0 and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID":
raise EnvironmentError("Multi-gpu environment must set `CUDA_DEVICE_ORDER=PCI_BUS_ID`.")

if gpu_id >= len(gpus) or gpu_id < 0:
raise ValueError(f"Passed gpu_id:{gpu_id} but there are {len(gpus)} detected Nvidia gpus.")

return gpus[gpu_id]

def find_best_match(tags, query):
"""
Expand Down

0 comments on commit 2634815

Please sign in to comment.