From b615a12d8ab2a2413022be8de34252dd32bc5385 Mon Sep 17 00:00:00 2001 From: Sergey Mezentsev Date: Wed, 8 May 2024 15:04:54 +0400 Subject: [PATCH] Improvements for adding SSH instances (#1202) * Change the GPU's full name to a short one * SSH instances must be filtered with other instances * Fixed the region for the instance * Check the instance in the existing ones before adding them --- .../_internal/core/backends/__init__.py | 1 + .../core/backends/remote/provisioning.py | 4 ++- .../background/tasks/process_instances.py | 8 +++--- src/dstack/_internal/server/services/pools.py | 14 ++++++---- src/dstack/_internal/utils/gpu.py | 27 +++++++++++++++++++ src/tests/_internal/utils/test_gpu.py | 19 +++++++++++++ 6 files changed, 63 insertions(+), 10 deletions(-) create mode 100644 src/dstack/_internal/utils/gpu.py create mode 100644 src/tests/_internal/utils/test_gpu.py diff --git a/src/dstack/_internal/core/backends/__init__.py b/src/dstack/_internal/core/backends/__init__.py index de82afea4..de5d4af91 100644 --- a/src/dstack/_internal/core/backends/__init__.py +++ b/src/dstack/_internal/core/backends/__init__.py @@ -4,6 +4,7 @@ BackendType.AWS, BackendType.AZURE, BackendType.GCP, + BackendType.REMOTE, ] BACKENDS_WITH_CREATE_INSTANCE_SUPPORT = [ BackendType.AWS, diff --git a/src/dstack/_internal/core/backends/remote/provisioning.py b/src/dstack/_internal/core/backends/remote/provisioning.py index 0b1cfec8a..5ba76f8e8 100644 --- a/src/dstack/_internal/core/backends/remote/provisioning.py +++ b/src/dstack/_internal/core/backends/remote/provisioning.py @@ -13,6 +13,7 @@ InstanceType, Resources, ) +from dstack._internal.utils.gpu import convert_gpu_name from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) @@ -188,9 +189,10 @@ def get_shim_healthcheck(client: paramiko.SSHClient) -> str: def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType: + gpu_name = convert_gpu_name(host_info["gpu_name"]) if host_info.get("gpu_count", 0): gpu_memory = int(host_info["gpu_memory"].lower().replace("mib", "").strip()) - gpus = [Gpu(name=host_info["gpu_name"], memory_mib=gpu_memory)] * host_info["gpu_count"] + gpus = [Gpu(name=gpu_name, memory_mib=gpu_memory)] * host_info["gpu_count"] else: gpus = [] instance_type = InstanceType( diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index b67ecd079..1e4b0f673 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -252,12 +252,14 @@ async def add_remote(instance_id: UUID) -> None: continue internal_ip = addresses[0] if addresses else None + region = instance.region + jpd = JobProvisioningData( backend=BackendType.REMOTE, instance_type=instance_type, instance_id="instance_id", hostname=remote_details.host, - region="remote", + region=region, price=0, internal_ip=internal_ip, username=remote_details.ssh_user, @@ -270,12 +272,10 @@ async def add_remote(instance_id: UUID) -> None: instance.status = InstanceStatus.IDLE if health else InstanceStatus.PROVISIONING instance.backend = BackendType.REMOTE - instance.region = "remote" - instance_offer = InstanceOfferWithAvailability( backend=BackendType.REMOTE, instance=instance_type, - region="remote", + region=region, price=0, availability=InstanceAvailability.AVAILABLE, instance_runtime=InstanceRuntime.SHIM, diff --git a/src/dstack/_internal/server/services/pools.py b/src/dstack/_internal/server/services/pools.py index 267c5cb98..c27705c24 100644 --- a/src/dstack/_internal/server/services/pools.py +++ b/src/dstack/_internal/server/services/pools.py @@ -258,6 +258,8 @@ async def add_remote( pools = await list_project_pool_models(session, project) for pool in pools: for instance in pool.instances: + if instance.deleted: + continue if instance.remote_connection_info is not None: rci = RemoteConnectionInfo.__response__.parse_raw(instance.remote_connection_info) if rci.host == host and rci.port == port and rci.ssh_user == ssh_user: @@ -272,12 +274,14 @@ async def add_remote( instance_resource = Resources(cpus=2, memory_mib=8, gpus=[], spot=False) instance_type = InstanceType(name="ssh", resources=instance_resource) - local = JobProvisioningData( + host_region = region if region is not None else "remote" + + remote = JobProvisioningData( backend=BackendType.REMOTE, instance_type=instance_type, instance_id=instance_name, hostname=host, - region=region or "remote", + region=host_region, internal_ip=None, price=0, username=ssh_user, @@ -289,7 +293,7 @@ async def add_remote( offer = InstanceOfferWithAvailability( backend=BackendType.REMOTE, instance=instance_type, - region=region or "remote", + region=host_region, price=0.0, availability=InstanceAvailability.AVAILABLE, ) @@ -306,7 +310,7 @@ async def add_remote( created_at=common_utils.get_current_datetime(), started_at=common_utils.get_current_datetime(), status=InstanceStatus.PENDING, - job_provisioning_data=local.json(), + job_provisioning_data=remote.json(), remote_connection_info=ssh_connection_info, offer=offer.json(), region=offer.region, @@ -342,7 +346,7 @@ def filter_pool_instances( continue if instance.backend == BackendType.REMOTE: - instances.append(instance) + candidates.append(instance) continue # TODO: remove on prod diff --git a/src/dstack/_internal/utils/gpu.py b/src/dstack/_internal/utils/gpu.py new file mode 100644 index 000000000..2455c545f --- /dev/null +++ b/src/dstack/_internal/utils/gpu.py @@ -0,0 +1,27 @@ +import re + + +def convert_gpu_name(name: str) -> str: + """Convert gpu_name from nvidia-smi to short version""" + # https://github.com/NVIDIA/open-gpu-kernel-modules/ + name = name.replace("NVIDIA ", "") + name = name.replace("Tesla ", "") + name = name.replace("Quadro ", "") + name = name.replace("GeForce ", "") + + if "GH200" in name: + return "GH200" + + if "RTX A" in name: + name = name.replace("RTX A", "A") + m = re.search(r"(A\d+)", name) + if m is not None: + return m.group(0) + return name.replace(" ", "") + + name = name.replace(" Ti", "Ti") + name = name.replace("RTX ", "RTX") + m = re.search(r"([A|H|L|P|T|V]\d+[Ti]?)", name) + if m is not None: + return m.group(0) + return name.replace(" ", "") diff --git a/src/tests/_internal/utils/test_gpu.py b/src/tests/_internal/utils/test_gpu.py new file mode 100644 index 000000000..fc7bd39a9 --- /dev/null +++ b/src/tests/_internal/utils/test_gpu.py @@ -0,0 +1,19 @@ +import pytest + +from dstack._internal.utils.gpu import convert_gpu_name + +TESTS = [ + ("NVIDIA GeForce RTX 4060 Ti", "RTX4060Ti"), + ("NVIDIA GeForce RTX 4060", "RTX4060"), + ("NVIDIA L4", "L4"), + ("NVIDIA GH200 120GB", "GH200"), + ("NVIDIA A100-SXM4-80GB", "A100"), + ("NVIDIA A10G", "A10"), + ("Tesla T4", "T4"), +] + + +class TestConvertGpuName: + @pytest.mark.parametrize("test_input,expected", TESTS) + def test_convert_gpu_name(self, test_input, expected): + assert convert_gpu_name(test_input) == expected