diff --git a/docs/docs/concepts/fleets.md b/docs/docs/concepts/fleets.md index 876112a4a..f148316bd 100644 --- a/docs/docs/concepts/fleets.md +++ b/docs/docs/concepts/fleets.md @@ -47,8 +47,8 @@ This ensures all instances are provisioned in the same backend and region with o ??? info "AWS" `dstack` automatically enables the Elastic Fabric Adapter for all [EFA-capable instance types :material-arrow-top-right-thin:{ .external }](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types){:target="_blank"}. - Currently, only one EFA interface is enabled per instance, regardless of its maximum capacity. - This will change once [this issue :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/1804){:target="_blank"} is resolved. + If the `aws` backend config has `public_ips: false` set, `dstack` enables the maximum number of interfaces supported by the instance. + Otherwise, if instances have public IPs, only one EFA interface is enabled per instance, because AWS does not allow auto-assigning a public IP when an instance is launched with more than one network interface. > The `cluster` placement is supported only for `aws`, `azure`, `gcp`, `oci`, and `vultr` > backends. 
diff --git a/src/dstack/_internal/core/backends/aws/compute.py b/src/dstack/_internal/core/backends/aws/compute.py index 164f6ed25..7b7d2374d 100644 --- a/src/dstack/_internal/core/backends/aws/compute.py +++ b/src/dstack/_internal/core/backends/aws/compute.py @@ -240,6 +240,7 @@ def create_instance( allocate_public_ip=allocate_public_ip, placement_group_name=instance_config.placement_group_name, enable_efa=enable_efa, + max_efa_interfaces=max_efa_interfaces, reservation_id=instance_config.reservation, is_capacity_block=is_capacity_block, ) diff --git a/src/dstack/_internal/core/backends/aws/resources.py b/src/dstack/_internal/core/backends/aws/resources.py index 1819b972f..ae45d1634 100644 --- a/src/dstack/_internal/core/backends/aws/resources.py +++ b/src/dstack/_internal/core/backends/aws/resources.py @@ -140,6 +140,7 @@ def create_instances_struct( allocate_public_ip: bool = True, placement_group_name: Optional[str] = None, enable_efa: bool = False, + max_efa_interfaces: int = 0, reservation_id: Optional[str] = None, is_capacity_block: bool = False, ) -> Dict[str, Any]: @@ -183,7 +184,7 @@ def create_instances_struct( # AWS allows specifying either NetworkInterfaces for specific subnet_id # or instance-level SecurityGroupIds in case of no specific subnet_id, not both. if subnet_id is not None: - # Even if the instance type supports multiple cards, we always request only one interface + # If the instance type supports multiple cards, we request multiple interfaces only if not allocate_public_ip # due to the limitation: "AssociatePublicIpAddress [...] You cannot specify more than one # network interface in the request". 
# Error message: "(InvalidParameterCombination) when calling the RunInstances operation: @@ -199,9 +200,28 @@ def create_instances_struct( "DeviceIndex": 0, "SubnetId": subnet_id, "Groups": [security_group_id], - "InterfaceType": "efa" if enable_efa else "interface", + "InterfaceType": "efa" if max_efa_interfaces > 0 else "interface", }, ] + + if max_efa_interfaces > 1 and allocate_public_ip is False: + for i in range(1, max_efa_interfaces): + # Set to efa-only to use interfaces exclusively for GPU-to-GPU communication + interface_type = "efa-only" + if instance_type == "p5.48xlarge": + # EFA configuration for P5 instances: + # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html#efa-for-p5 + interface_type = "efa" if i % 4 == 0 else "efa-only" + struct["NetworkInterfaces"].append( + { + "AssociatePublicIpAddress": allocate_public_ip, + "NetworkCardIndex": i, + "DeviceIndex": 1, + "SubnetId": subnet_id, + "Groups": [security_group_id], + "InterfaceType": interface_type, + } + ) else: struct["SecurityGroupIds"] = [security_group_id]