Skip to content

Commit

Permalink
prov/verbs: Allow RDMACM to connect using GIDs
Browse files Browse the repository at this point in the history
The patch allows the Verbs provider to connect directly to the
network adapters using the GID. In other words, the patch makes it
possible to use Libfabric even if no IP address is set for the
InfiniBand interfaces.

There are significant issues with using IP addresses for connection
establishment:
- It requires setting up and maintaining IP addresses for every IB interface.
- In the context of multirail (multiple local interfaces that belong
  to the same network subnet), it requires specific IP routes to
  prevent one interface from replying on behalf of another. Connection
  establishment would fail otherwise.

The GID can be obtained by looking at the src_addr field returned
by "fi_info -p verbs -v".

Example of output:
src_addr: fi_sockaddr_ib://fe80:0000:0000:0000:248a:0703:003f:1f6a

The patch also modifies fabtest so that anybody can start testing this
new feature. A new option -F allows specifying the address format
that is used for the source/destination addresses.

After figuring out the GID of the interface that will be used for the
server, one can run the following commands with fabtest:

Server:
fi_msg_bw -s fe80:0000:0000:0000:248a:0703:003f:1f6a -e msg \
-p verbs -F FI_SOCKADDR_IB

Client:
fi_msg_bw -e msg -p verbs \
-F FI_SOCKADDR_IB fe80:0000:0000:0000:248a:0703:003f:1f6a

Signed-off-by: Sylvain Didelot <sdidelot@ddn.com>
  • Loading branch information
sydidelot committed Feb 5, 2020
1 parent 1b8ed78 commit 0326c82
Show file tree
Hide file tree
Showing 10 changed files with 214 additions and 13 deletions.
1 change: 1 addition & 0 deletions fabtests/benchmarks/msg_bw.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ int main(int argc, char **argv)
hints->domain_attr->resource_mgmt = FI_RM_ENABLED;
hints->domain_attr->mr_mode = opts.mr_mode;
hints->domain_attr->threading = FI_THREAD_DOMAIN;
hints->addr_format = opts.address_format;

ret = run();

Expand Down
1 change: 1 addition & 0 deletions fabtests/benchmarks/msg_pingpong.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ int main(int argc, char **argv)
hints->caps = FI_MSG;
hints->domain_attr->mr_mode = opts.mr_mode;
hints->domain_attr->threading = FI_THREAD_DOMAIN;
hints->addr_format = opts.address_format;

ret = run();

Expand Down
1 change: 1 addition & 0 deletions fabtests/benchmarks/rma_bw.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ int main(int argc, char **argv)
hints->domain_attr->resource_mgmt = FI_RM_ENABLED;
hints->mode = FI_CONTEXT;
hints->domain_attr->threading = FI_THREAD_DOMAIN;
hints->addr_format = opts.address_format;

while ((op = getopt(argc, argv, "ho:" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) {
switch (op) {
Expand Down
7 changes: 7 additions & 0 deletions fabtests/common/shared.c
Original file line number Diff line number Diff line change
Expand Up @@ -2704,6 +2704,7 @@ void ft_addr_usage()
"over the, optional, port");
FT_PRINT_OPTS_USAGE("-C <number>", "number of connections to accept before "
"cleaning up a server");
FT_PRINT_OPTS_USAGE("-F <addr_format>", "Address format (default:FI_FORMAT_UNSPEC)");
}

void ft_usage(char *name, char *desc)
Expand Down Expand Up @@ -2850,6 +2851,12 @@ void ft_parse_addr_opts(int op, char *optarg, struct ft_opts *opts)
else
opts->oob_port = default_oob_port;
break;
case 'F':
if (strcmp(optarg, "FI_FORMAT_UNSPEC") == 0)
opts->address_format = FI_FORMAT_UNSPEC;
else if (strcmp(optarg, "FI_SOCKADDR_IB") == 0)
opts->address_format = FI_SOCKADDR_IB;
break;
case 'C':
opts->options |= FT_OPT_SERVER_PERSIST;
opts->num_connections = atoi(optarg);
Expand Down
6 changes: 4 additions & 2 deletions fabtests/include/shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ struct ft_opts {
char *oob_port;
int argc;
int num_connections;
int address_format;

uint64_t mr_mode;
/* Fail if the selected provider does not support FI_MSG_PREFIX. */
Expand Down Expand Up @@ -239,7 +240,7 @@ extern int ft_parent_proc;
extern int ft_socket_pair[2];
extern int sock;
extern int listen_sock;
#define ADDR_OPTS "B:P:s:a:b::E::C:"
#define ADDR_OPTS "B:P:s:a:b::E::C:F:"
#define FAB_OPTS "f:d:p:"
#define INFO_OPTS FAB_OPTS "e:M:"
#define CS_OPTS ADDR_OPTS "I:S:mc:t:w:l"
Expand All @@ -261,7 +262,8 @@ extern char default_port[8];
.rma_op = FT_RMA_WRITE, \
.oob_port = NULL, \
.mr_mode = FI_MR_LOCAL | OFI_MR_BASIC_MAP, \
.argc = argc, .argv = argv \
.argc = argc, .argv = argv, \
.address_format = FI_FORMAT_UNSPEC \
}

#define FT_STR_LEN 32
Expand Down
4 changes: 2 additions & 2 deletions prov/verbs/src/fi_verbs.c
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ int vrb_get_rai_id(const char *node, const char *service, uint64_t flags,
if (ret)
return ret;

ret = rdma_create_id(NULL, id, NULL, RDMA_PS_TCP);
ret = rdma_create_id(NULL, id, NULL, vrb_get_port_space(hints));
if (ret) {
VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_create_id", errno);
ret = -errno;
Expand Down Expand Up @@ -217,7 +217,7 @@ int vrb_create_ep(const struct fi_info *hints, enum rdma_port_space ps,
return ret;
}

if (rdma_create_id(NULL, id, NULL, ps)) {
if (rdma_create_id(NULL, id, NULL, vrb_get_port_space(hints))) {
ret = -errno;
FI_WARN(&vrb_prov, FI_LOG_FABRIC, "rdma_create_id failed: "
"%s (%d)\n", strerror(-ret), -ret);
Expand Down
1 change: 1 addition & 0 deletions prov/verbs/src/fi_verbs.h
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,7 @@ int vrb_get_rdma_rai(const char *node, const char *service, uint64_t flags,
int vrb_get_matching_info(uint32_t version, const struct fi_info *hints,
struct fi_info **info, const struct fi_info *verbs_info,
uint8_t passive);
int vrb_get_port_space(const struct fi_info *info);
void vrb_alter_info(const struct fi_info *hints, struct fi_info *info);

struct verbs_ep_domain {
Expand Down
11 changes: 6 additions & 5 deletions prov/verbs/src/verbs_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ static int vrb_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
VERBS_WARN(FI_LOG_DOMAIN,
"Rx CQ is fully reserved\n");
ep->rx_cq_size = 0;
}
}
cq->credits -= ep->rx_cq_size;
cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
}
Expand Down Expand Up @@ -1013,11 +1013,11 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
goto err1;
}

if (info->ep_attr->rx_ctx_cnt == 0 ||
if (info->ep_attr->rx_ctx_cnt == 0 ||
info->ep_attr->rx_ctx_cnt == 1)
ep->rx_cq_size = info->rx_attr->size;
if (info->ep_attr->tx_ctx_cnt == 0 ||

if (info->ep_attr->tx_ctx_cnt == 0 ||
info->ep_attr->tx_ctx_cnt == 1)
ep->tx_credits = info->tx_attr->size;

Expand Down Expand Up @@ -1154,7 +1154,8 @@ int vrb_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
_pep->info->dest_addrlen = 0;
}

ret = rdma_create_id(NULL, &_pep->id, &_pep->pep_fid.fid, RDMA_PS_TCP);
ret = rdma_create_id(NULL, &_pep->id, &_pep->pep_fid.fid,
vrb_get_port_space(info));
if (ret) {
VERBS_INFO(FI_LOG_DOMAIN, "Unable to create PEP rdma_cm_id\n");
goto err2;
Expand Down
169 changes: 166 additions & 3 deletions prov/verbs/src/verbs_info.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include <ifaddrs.h>
#include <net/if.h>
#include <stdint.h>
#include <rdma/rdma_cma.h>

#include "fi_verbs.h"

Expand Down Expand Up @@ -291,26 +292,30 @@ int vrb_fi_to_rai(const struct fi_info *fi, uint64_t flags,
rai->ai_flags |= RAI_NUMERICHOST;

rai->ai_qp_type = IBV_QPT_RC;
rai->ai_port_space = RDMA_PS_TCP;

if (!fi)
return 0;

switch(fi->addr_format) {
case FI_SOCKADDR_IN:
case FI_FORMAT_UNSPEC:
rai->ai_port_space = RDMA_PS_TCP;
rai->ai_family = AF_INET;
rai->ai_flags |= RAI_FAMILY;
break;
case FI_SOCKADDR_IN6:
rai->ai_port_space = RDMA_PS_TCP;
rai->ai_family = AF_INET6;
rai->ai_flags |= RAI_FAMILY;
break;
case FI_SOCKADDR_IB:
rai->ai_port_space = RDMA_PS_IB;
rai->ai_family = AF_IB;
rai->ai_flags |= RAI_FAMILY;
/* FIXME: FI_SOCKADDR_IB requires RAI_NUMERICHOST */
rai->ai_flags |= RAI_FAMILY | RAI_NUMERICHOST;
break;
case FI_SOCKADDR:
rai->ai_port_space = RDMA_PS_TCP;
if (fi->src_addrlen) {
rai->ai_family = ((struct sockaddr *)fi->src_addr)->sa_family;
rai->ai_flags |= RAI_FAMILY;
Expand Down Expand Up @@ -788,7 +793,7 @@ static int vrb_alloc_info(struct ibv_context *ctx, struct fi_info **info,
assert(0);
return -FI_EINVAL;
}


*(fi->fabric_attr) = verbs_fabric_attr;

Expand Down Expand Up @@ -1007,6 +1012,162 @@ static int vrb_ifa_rdma_info(const struct ifaddrs *ifa, char **dev_name,
return ret;
}

/*
 * Select the RDMA CM port space that matches the address format of @info.
 *
 * Returns RDMA_PS_IB only when @info is non-NULL and its addr_format is
 * FI_SOCKADDR_IB; every other case (including a NULL @info) yields
 * RDMA_PS_TCP.
 */
int vrb_get_port_space(const struct fi_info *info)
{
	/* No hints, or any non-IB address format: use the TCP port space. */
	if (!info || info->addr_format != FI_SOCKADDR_IB)
		return RDMA_PS_TCP;

	return RDMA_PS_IB;
}

/*
 * Report whether the trailing eight bytes (raw[8]..raw[15]) of @gid are
 * all zero.
 *
 * Returns 1 when all of those bytes are zero, 0 as soon as any of them is
 * set. The leading eight bytes (raw[0]..raw[7]) are not inspected.
 * NOTE(review): the trailing half is presumably the interface-identifier
 * portion of the GID -- confirm against the IB spec.
 */
static int vrb_gid_is_null(const union ibv_gid *gid)
{
	int idx;

	for (idx = 8; idx < 16; idx++) {
		if (gid->raw[idx])
			return 0;
	}

	return 1;
}

/*
 * Format the 16 raw bytes of @gid as eight colon-separated groups of four
 * lowercase hex digits (e.g. "fe80:0000:0000:0000:248a:0703:003f:1f6a").
 *
 * @gid:     GID to format.
 * @gid_str: caller-provided buffer of at least 40 bytes (39 characters
 *           plus the terminating NUL).
 *
 * Returns 0 on success, or -FI_EINVAL if the formatted length is not the
 * expected 39 characters.
 */
static int vrb_gid_to_str(const union ibv_gid *gid, char gid_str[40])
{
	/* The array parameter decays to a pointer, so spell the size out. */
	enum { GID_STR_SIZE = 40 };
	int len;

	/* snprintf bounds the write to the buffer the caller promised. */
	len = snprintf(gid_str, GID_STR_SIZE,
		       "%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x",
		       gid->raw[ 0], gid->raw[ 1],
		       gid->raw[ 2], gid->raw[ 3],
		       gid->raw[ 4], gid->raw[ 5],
		       gid->raw[ 6], gid->raw[ 7],
		       gid->raw[ 8], gid->raw[ 9],
		       gid->raw[10], gid->raw[11],
		       gid->raw[12], gid->raw[13],
		       gid->raw[14], gid->raw[15]);
	if (len != GID_STR_SIZE - 1)
		return -FI_EINVAL;

	return 0;
}

/*
 * Resolve a textual GID into an rdma_addrinfo and the name of the verbs
 * device that owns it.
 *
 * A temporary rdma_cm_id is created in the RDMA_PS_IB port space and bound
 * to the address that rdma_getaddrinfo() returns for @gid_str; the device
 * name is read from the verbs context attached to the id by the bind.
 *
 * @gid_str:  GID string passed to rdma_getaddrinfo() as the node.
 * @dev_name: on success, set to a strdup()'ed device name; the caller
 *            must free() it.
 * @rai:      on success, set to the rdma_addrinfo for the GID; the caller
 *            must release it with rdma_freeaddrinfo().
 *
 * Returns 0 on success. On failure returns a negative value (-errno from
 * the failing rdma_* call, -FI_EINVAL if no device is associated with the
 * bound id or its name is unavailable, -FI_ENOMEM if the name cannot be
 * duplicated) and leaves *dev_name and *rai untouched.
 */
static int vrb_gid_rdma_info(char gid_str[40], char **dev_name,
			     struct rdma_addrinfo **rai)
{
	struct rdma_cm_id *id = NULL;
	struct rdma_addrinfo rai_hints = {
		.ai_flags = RAI_PASSIVE | RAI_NUMERICHOST | RAI_FAMILY,
		.ai_family = AF_IB,
		.ai_port_space = RDMA_PS_IB,
	}, *rai_ = NULL;
	const char *name;
	char *name_copy;
	int ret;

	ret = rdma_create_id(NULL, &id, NULL, rai_hints.ai_port_space);
	if (ret) {
		ret = -errno;
		goto err;
	}

	ret = rdma_getaddrinfo(gid_str, NULL, &rai_hints, &rai_);
	if (ret) {
		ret = -errno;
		goto err;
	}

	ret = rdma_bind_addr(id, rai_->ai_src_addr);
	if (ret) {
		ret = -errno;
		goto err;
	}

	/* The bind did not attach the id to any verbs device. */
	if (!id->verbs) {
		ret = -FI_EINVAL;
		goto err;
	}

	/* Guard against a NULL name so strdup() is never handed NULL. */
	name = ibv_get_device_name(id->verbs->device);
	if (!name) {
		ret = -FI_EINVAL;
		goto err;
	}

	/*
	 * Check the result of the duplication itself (not the out-parameter
	 * pointer) so that only a real allocation failure is reported.
	 */
	name_copy = strdup(name);
	if (!name_copy) {
		ret = -FI_ENOMEM;
		goto err;
	}

	rdma_destroy_id(id);
	*dev_name = name_copy;
	*rai = rai_;
	return 0;

err:
	if (rai_)
		rdma_freeaddrinfo(rai_);
	if (id)
		rdma_destroy_id(id);
	return ret;
}

/*
 * Walk every verbs device, port and GID table entry and register each
 * usable GID in @verbs_devs via verbs_devs_add().
 *
 * Entries for which vrb_gid_is_null() is true are skipped, as are devices,
 * ports or table slots whose query fails. A failure to format a GID or to
 * resolve it through vrb_gid_rdma_info() aborts the walk and is returned
 * to the caller.
 *
 * Every device context opened here is closed again, and the device list
 * is released on all exit paths.
 *
 * Returns 0 if at least one GID was added, -FI_ENODATA if none was, -errno
 * if the device list cannot be obtained, or the error that aborted the
 * walk.
 */
static int vrb_get_gids(struct dlist_entry *verbs_devs)
{
	struct rdma_addrinfo *rai = NULL;
	struct ibv_device **devices;
	char *dev_name = NULL;
	int num_devices;
	struct ibv_context *context;
	int ret, num_verbs_ifs = 0;
	struct ibv_device_attr device_attr;
	struct ibv_port_attr port_attr;
	union ibv_gid gid;
	char gid_str[40];

	devices = ibv_get_device_list(&num_devices);
	if (!devices)
		return -errno;

	for (int dev = 0; dev < num_devices; dev++) {
		context = ibv_open_device(devices[dev]);
		/* A device that cannot be opened cannot be queried. */
		if (!context)
			continue;

		ret = ibv_query_device(context, &device_attr);
		if (ret) {
			ibv_close_device(context);
			continue;
		}

		/* The loop starts at 1: port 0 is never queried. */
		for (int port = 1; port <= device_attr.phys_port_cnt; port++) {
			ret = ibv_query_port(context, port, &port_attr);
			if (ret)
				continue;

			for (int tbl = 0; tbl < port_attr.gid_tbl_len; tbl++) {
				ret = ibv_query_gid(context, port, tbl, &gid);
				if (ret)
					continue;

				if (vrb_gid_is_null(&gid))
					continue;

				ret = vrb_gid_to_str(&gid, gid_str);
				if (ret) {
					FI_DBG(&vrb_prov, FI_LOG_FABRIC,
					       "GID parsing failed with error code: %d for "
					       "interface %s",
					       ret, ibv_get_device_name(devices[dev]));
					goto err;
				}

				ret = vrb_gid_rdma_info(gid_str, &dev_name, &rai);
				if (ret) {
					FI_DBG(&vrb_prov, FI_LOG_FABRIC,
					       "Conversion to RDMA info failed with error code: %d "
					       "for %s (gid %s)",
					       ret, ibv_get_device_name(devices[dev]), gid_str);
					goto err;
				}

				/* If the add fails, release both objects here. */
				ret = verbs_devs_add(verbs_devs, dev_name, rai);
				if (ret) {
					free(dev_name);
					rdma_freeaddrinfo(rai);
					continue;
				}

				num_verbs_ifs++;
			}
		}

		ibv_close_device(context);
	}

	ibv_free_device_list(devices);
	return num_verbs_ifs ? 0 : -FI_ENODATA;

err:
	/* Abort path: release the context of the device being walked. */
	ibv_close_device(context);
	ibv_free_device_list(devices);
	return ret;
}

/* Builds a list of interfaces that correspond to active verbs devices */
static int vrb_getifaddrs(struct dlist_entry *verbs_devs)
{
Expand Down Expand Up @@ -1265,6 +1426,8 @@ int vrb_init_info(const struct fi_info **all_infos)

vrb_getifaddrs(&verbs_devs);

vrb_get_gids(&verbs_devs);

if (dlist_empty(&verbs_devs))
FI_WARN(&vrb_prov, FI_LOG_FABRIC,
"no valid IPoIB interfaces found, FI_EP_MSG endpoint "
Expand Down
Loading

0 comments on commit 0326c82

Please sign in to comment.