diff --git a/fabtests/benchmarks/msg_bw.c b/fabtests/benchmarks/msg_bw.c
index f273d6a1f54..2e36d0ff1d3 100644
--- a/fabtests/benchmarks/msg_bw.c
+++ b/fabtests/benchmarks/msg_bw.c
@@ -107,6 +107,7 @@ int main(int argc, char **argv)
 	hints->domain_attr->resource_mgmt = FI_RM_ENABLED;
 	hints->domain_attr->mr_mode = opts.mr_mode;
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
+	hints->addr_format = opts.address_format;
 
 	ret = run();
 
diff --git a/fabtests/benchmarks/msg_pingpong.c b/fabtests/benchmarks/msg_pingpong.c
index ef342eae898..a0386472429 100644
--- a/fabtests/benchmarks/msg_pingpong.c
+++ b/fabtests/benchmarks/msg_pingpong.c
@@ -107,6 +107,7 @@ int main(int argc, char **argv)
 	hints->caps = FI_MSG;
 	hints->domain_attr->mr_mode = opts.mr_mode;
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
+	hints->addr_format = opts.address_format;
 
 	ret = run();
 
diff --git a/fabtests/benchmarks/rma_bw.c b/fabtests/benchmarks/rma_bw.c
index e4351c89bb2..a8ace33bcc1 100644
--- a/fabtests/benchmarks/rma_bw.c
+++ b/fabtests/benchmarks/rma_bw.c
@@ -95,6 +95,7 @@ int main(int argc, char **argv)
 	hints->domain_attr->resource_mgmt = FI_RM_ENABLED;
 	hints->mode = FI_CONTEXT;
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
+	hints->addr_format = opts.address_format;
 
 	while ((op = getopt(argc, argv, "ho:" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) {
 		switch (op) {
diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c
index ba32da25238..51e83c77329 100644
--- a/fabtests/common/shared.c
+++ b/fabtests/common/shared.c
@@ -2764,6 +2764,7 @@ void ft_addr_usage()
 			"over the, optional, port");
 	FT_PRINT_OPTS_USAGE("-C ", "number of connections to accept before "
 			"cleaning up a server");
+	FT_PRINT_OPTS_USAGE("-F ", "Address format (default:FI_FORMAT_UNSPEC)");
 }
 
 void ft_usage(char *name, char *desc)
@@ -2923,6 +2924,17 @@ void ft_parse_addr_opts(int op, char *optarg, struct ft_opts *opts)
 		else
 			opts->oob_port = default_oob_port;
 		break;
+	case 'F':
+		/* "fi_sockaddr_in" is a prefix of "fi_sockaddr_in6": test the longer name first */
+		if (!strncasecmp("fi_sockaddr_in6", optarg, 15))
+			opts->address_format = FI_SOCKADDR_IN6;
+		else if (!strncasecmp("fi_sockaddr_in", optarg, 14))
+			opts->address_format = FI_SOCKADDR_IN;
+		else if (!strncasecmp("fi_sockaddr_ib", optarg, 14))
+			opts->address_format = FI_SOCKADDR_IB;
+		else if (!strncasecmp("fi_sockaddr", optarg, 11)) /* keep me last */
+			opts->address_format = FI_SOCKADDR;
+		break;
 	case 'C':
 		opts->options |= FT_OPT_SERVER_PERSIST;
 		opts->num_connections = atoi(optarg);
diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h
index 85e27bbeb1f..ebb6d4daefb 100644
--- a/fabtests/include/shared.h
+++ b/fabtests/include/shared.h
@@ -165,13 +165,14 @@ struct ft_opts {
 	char *oob_port;
 	int argc;
 	int num_connections;
+	int address_format;
 
 	uint64_t mr_mode;
 	/* Fail if the selected provider does not support FI_MSG_PREFIX. */
 	int force_prefix;
 	enum fi_hmem_iface iface;
 	uint64_t device;
-	
+
 	char **argv;
 };
 
@@ -241,7 +242,7 @@ extern int ft_parent_proc;
 extern int ft_socket_pair[2];
 extern int sock;
 extern int listen_sock;
-#define ADDR_OPTS "B:P:s:a:b::E::C:"
+#define ADDR_OPTS "B:P:s:a:b::E::C:F:"
 #define FAB_OPTS "f:d:p:D:i:H"
 #define INFO_OPTS FAB_OPTS "e:M:"
 #define CS_OPTS ADDR_OPTS "I:S:mc:t:w:l"
@@ -265,7 +266,8 @@ extern char default_port[8];
 		.mr_mode = FI_MR_LOCAL | OFI_MR_BASIC_MAP, \
 		.iface = FI_HMEM_SYSTEM, \
 		.device = 0, \
-		.argc = argc, .argv = argv \
+		.argc = argc, .argv = argv, \
+		.address_format = FI_FORMAT_UNSPEC \
 	}
 
 #define FT_STR_LEN 32
diff --git a/fabtests/man/fabtests.7.md b/fabtests/man/fabtests.7.md
index 72592563003..f64b2353900 100644
--- a/fabtests/man/fabtests.7.md
+++ b/fabtests/man/fabtests.7.md
@@ -382,6 +382,9 @@ the list available for that test.
 *-s
* : Specifies the address of the local endpoint. +*-F +: Specifies the address format. + *-b[=oob_port]* : Enables out-of-band (via sockets) address exchange and test synchronization. A port for the out-of-band connection may be specified diff --git a/include/ofi_net.h b/include/ofi_net.h index 7a924df2ac5..dff0903ff8f 100644 --- a/include/ofi_net.h +++ b/include/ofi_net.h @@ -127,11 +127,33 @@ int ofi_discard_socket(SOCKET sock, size_t len); #define OFI_ADDRSTRLEN (INET6_ADDRSTRLEN + 50) +/* values taken from librdmacm/rdma_cma.h */ +#define OFI_IB_IP_PS_MASK 0xFFFFFFFFFFFF0000ULL +#define OFI_IB_IP_PORT_MASK 0x000000000000FFFFULL + +struct ofi_sockaddr_ib { + unsigned short int sib_family; /* AF_IB */ + uint16_t sib_pkey; + uint32_t sib_flowinfo; + uint8_t sib_addr[16]; + uint64_t sib_sid; + uint64_t sib_sid_mask; + uint64_t sib_scope_id; +}; + +enum ofi_rdma_port_space { + OFI_RDMA_PS_IPOIB = 0x0002, + OFI_RDMA_PS_IB = 0x013F, + OFI_RDMA_PS_TCP = 0x0106, + OFI_RDMA_PS_UDP = 0x0111, +}; + union ofi_sock_ip { - struct sockaddr sa; - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - uint8_t align[32]; + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + struct ofi_sockaddr_ib sib; + uint8_t align[48]; }; struct ofi_addr_list_entry { @@ -160,6 +182,7 @@ void ofi_free_list_of_addr(struct slist *addr_list); #define ofi_sin6_addr(addr) (((struct sockaddr_in6 *)(addr))->sin6_addr) #define ofi_sin6_port(addr) (((struct sockaddr_in6 *)(addr))->sin6_port) +#define ofi_sib_addr(addr) (((struct ofi_sockaddr_ib *)(addr))->sib_addr) static inline size_t ofi_sizeofaddr(const struct sockaddr *addr) { @@ -168,6 +191,8 @@ static inline size_t ofi_sizeofaddr(const struct sockaddr *addr) return sizeof(struct sockaddr_in); case AF_INET6: return sizeof(struct sockaddr_in6); + case AF_IB: + return sizeof(struct ofi_sockaddr_ib); default: FI_WARN(&core_prov, FI_LOG_CORE, "Unknown address format\n"); return 0; @@ -181,6 +206,8 @@ static inline size_t 
ofi_sizeofip(const struct sockaddr *addr) return sizeof(struct in_addr); case AF_INET6: return sizeof(struct in6_addr); + case AF_IB: + return sizeof(ofi_sib_addr(addr)); default: FI_WARN(&core_prov, FI_LOG_CORE, "Unknown address format\n"); return 0; @@ -203,7 +230,7 @@ static inline int ofi_translate_addr_format(int family) uint16_t ofi_get_sa_family(const struct fi_info *info); -static inline int ofi_ipv4_is_any_addr(struct sockaddr *sa) +static inline int ofi_sin_is_any_addr(struct sockaddr *sa) { struct in_addr ia_any = { .s_addr = INADDR_ANY, @@ -216,7 +243,7 @@ static inline int ofi_ipv4_is_any_addr(struct sockaddr *sa) } -static inline int ofi_ipv6_is_any_addr(struct sockaddr *sa) +static inline int ofi_sin6_is_any_addr(struct sockaddr *sa) { struct in6_addr ia6_any = IN6ADDR_ANY_INIT; @@ -226,6 +253,16 @@ static inline int ofi_ipv6_is_any_addr(struct sockaddr *sa) return !memcmp(&ofi_sin6_addr(sa), &ia6_any, sizeof(ia6_any)); } +static inline int ofi_sib_is_any_addr(struct sockaddr *sa) +{ + struct in6_addr ia6_any = IN6ADDR_ANY_INIT; + + if (!sa) + return 0; + + return !memcmp(&ofi_sib_addr(sa), &ia6_any, sizeof(ia6_any)); +} + static inline int ofi_is_any_addr(struct sockaddr *sa) { if (!sa) @@ -233,9 +270,11 @@ static inline int ofi_is_any_addr(struct sockaddr *sa) switch(sa->sa_family) { case AF_INET: - return ofi_ipv4_is_any_addr(sa); + return ofi_sin_is_any_addr(sa); case AF_INET6: - return ofi_ipv6_is_any_addr(sa); + return ofi_sin6_is_any_addr(sa); + case AF_IB: + return ofi_sib_is_any_addr(sa); default: FI_WARN(&core_prov, FI_LOG_CORE, "Unknown address format!\n"); return 0; @@ -252,6 +291,8 @@ static inline uint16_t ofi_addr_get_port(const struct sockaddr *addr) return ntohs(ofi_sin_port((const struct sockaddr_in *) addr)); case AF_INET6: return ntohs(ofi_sin6_port((const struct sockaddr_in6 *) addr)); + case AF_IB: + return (uint16_t)ntohll(((const struct ofi_sockaddr_ib *)addr)->sib_sid); default: FI_WARN(&core_prov, FI_LOG_FABRIC, "Unknown 
address format\n");
 		assert(0);
@@ -261,6 +302,8 @@ static inline uint16_t ofi_addr_get_port(const struct sockaddr *addr)
 
 static inline void ofi_addr_set_port(struct sockaddr *addr, uint16_t port)
 {
+	struct ofi_sockaddr_ib *sib;
+
 	switch (ofi_sa_family(addr)) {
 	case AF_INET:
 		ofi_sin_port(addr) = htons(port);
@@ -268,6 +311,12 @@ static inline void ofi_addr_set_port(struct sockaddr *addr, uint16_t port)
 	case AF_INET6:
 		ofi_sin6_port(addr) = htons(port);
 		break;
+	case AF_IB:
+		sib = (struct ofi_sockaddr_ib *)addr;
+		/* port is in host byte order; htonll() converts the whole service ID */
+		sib->sib_sid = htonll(((uint64_t)OFI_RDMA_PS_IB << 16) + port);
+		sib->sib_sid_mask = htonll(OFI_IB_IP_PS_MASK | OFI_IB_IP_PORT_MASK);
+		break;
 	default:
 		FI_WARN(&core_prov, FI_LOG_FABRIC, "Unknown address format\n");
 		assert(0);
@@ -281,6 +330,8 @@ static inline void * ofi_get_ipaddr(const struct sockaddr *addr)
 		return &ofi_sin_addr((const struct sockaddr_in *) addr);
 	case AF_INET6:
 		return &ofi_sin6_addr((const struct sockaddr_in6 *) addr);
+	case AF_IB:
+		return &ofi_sib_addr((const struct ofi_sockaddr_ib *) addr);
 	default:
 		return NULL;
 	}
@@ -299,6 +350,9 @@ static inline int ofi_equals_ipaddr(const struct sockaddr *addr1,
 	case AF_INET6:
 		return !memcmp(&ofi_sin6_addr(addr1), &ofi_sin6_addr(addr2),
 				sizeof(ofi_sin6_addr(addr1)));
+	case AF_IB:
+		return !memcmp(&ofi_sib_addr(addr1), &ofi_sib_addr(addr2),
+				sizeof(ofi_sib_addr(addr1)));
 	default:
 		return 0;
 	}
@@ -323,6 +377,7 @@ size_t ofi_mask_addr(struct sockaddr *maskaddr, const struct sockaddr *srcaddr,
  */
 const char *ofi_straddr(char *buf, size_t *len,
 			uint32_t addr_format, const void *addr);
+uint32_t ofi_addr_format(const char *str);
 
 /* Returns allocated address to caller. Caller must free.
*/
 int ofi_str_toaddr(const char *str, uint32_t *addr_format,
diff --git a/include/windows/osd.h b/include/windows/osd.h
index 9ee9d0d60e5..d3cabcebb65 100644
--- a/include/windows/osd.h
+++ b/include/windows/osd.h
@@ -261,6 +261,7 @@ do \
 #define strcasecmp _stricmp
 #define snprintf _snprintf
 #define sleep(x) Sleep(x * 1000)
+#define strtok_r strtok_s
 
 #define __PRI64_PREFIX "ll"
 
diff --git a/prov/verbs/src/fi_verbs.c b/prov/verbs/src/fi_verbs.c
index e4a4ee8b86f..b1c95f8a6db 100644
--- a/prov/verbs/src/fi_verbs.c
+++ b/prov/verbs/src/fi_verbs.c
@@ -103,7 +103,7 @@ int vrb_sockaddr_len(struct sockaddr *addr)
 }
 
 static int
-vrb_get_rdma_rai(const char *node, const char *service, uint64_t flags,
+vrb_get_rdmacm_rai(const char *node, const char *service, uint64_t flags,
 		   const struct fi_info *hints, struct rdma_addrinfo **rai)
 {
 	struct rdma_addrinfo rai_hints, *_rai;
@@ -154,6 +154,122 @@ vrb_get_rdma_rai(const char *node, const char *service, uint64_t flags,
 	return ret;
 }
 
+/*
+ * Build the rdma_addrinfo for an FI_SOCKADDR_IB request without calling
+ * rdma_getaddrinfo(). A node is parsed as an IB address string (the
+ * "fi_sockaddr_ib://" prefix is optional, service is appended as the port).
+ * With a service only, the port is set on the address that vrb_fi_to_rai()
+ * left in *rai. On failure *rai is released and reset to NULL.
+ */
+static int vrb_get_sib_rai(const char *node, const char *service, uint64_t flags,
+		   const struct fi_info *hints, struct rdma_addrinfo **rai)
+{
+	struct sockaddr_ib *sib;
+	size_t sib_len;
+	char *straddr;
+	uint32_t fmt;
+	int ret;
+	bool has_prefix;
+	const char *prefix = "fi_sockaddr_ib://";
+
+	*rai = calloc(1, sizeof(struct rdma_addrinfo));
+	if (*rai == NULL)
+		return -FI_ENOMEM;
+
+	ret = vrb_fi_to_rai(hints, flags, *rai);
+	if (ret)
+		goto err;
+
+	if (node) {
+		fmt = ofi_addr_format(node);
+		if (fmt == FI_SOCKADDR_IB) {
+			has_prefix = true;
+		} else if (fmt == FI_FORMAT_UNSPEC) {
+			has_prefix = false;
+		} else {
+			ret = -FI_EINVAL;
+			goto err;
+		}
+
+		if (service) {
+			ret = asprintf(&straddr, "%s%s:%s", has_prefix ? "" : prefix,
+				       node, service);
+		} else {
+			ret = asprintf(&straddr, "%s%s", has_prefix ? "" : prefix, node);
+		}
+
+		if (ret == -1) {
+			ret = -FI_ENOMEM;
+			goto err;
+		}
+
+		ret = ofi_str_toaddr(straddr, &fmt, (void **)&sib, &sib_len);
+		free(straddr);
+
+		if (ret) {
+			ret = -FI_EINVAL;
+			goto err;
+		}
+		if (fmt != FI_SOCKADDR_IB) {
+			/* parsed successfully, but into some other address type */
+			free(sib);
+			ret = -FI_EINVAL;
+			goto err;
+		}
+
+		if (flags & FI_SOURCE) {
+			(*rai)->ai_flags |= RAI_PASSIVE;
+			free((*rai)->ai_src_addr);
+			(*rai)->ai_src_addr = (void *)sib;
+			(*rai)->ai_src_len = sizeof(struct sockaddr_ib);
+		} else {
+			free((*rai)->ai_dst_addr);
+			(*rai)->ai_dst_addr = (void *)sib;
+			(*rai)->ai_dst_len = sizeof(struct sockaddr_ib);
+		}
+	} else if (service) {
+		if ((flags & FI_SOURCE) && (*rai)->ai_src_addr) {
+			if ((*rai)->ai_src_len < sizeof(struct sockaddr_ib)) {
+				ret = -FI_EINVAL;
+				goto err;
+			}
+
+			(*rai)->ai_src_len = sizeof(struct sockaddr_ib);
+			sib = (struct sockaddr_ib *)(*rai)->ai_src_addr;
+		} else {
+			if ((*rai)->ai_dst_len < sizeof(struct sockaddr_ib)) {
+				ret = -FI_EINVAL;
+				goto err;
+			}
+
+			(*rai)->ai_dst_len = sizeof(struct sockaddr_ib);
+			sib = (struct sockaddr_ib *)(*rai)->ai_dst_addr;
+		}
+
+		sib->sib_sid = htonll(((uint64_t) RDMA_PS_IB << 16) + (uint16_t)atoi(service));
+		sib->sib_sid_mask = htonll(OFI_IB_IP_PS_MASK | OFI_IB_IP_PORT_MASK);
+	}
+
+	return 0;
+
+err:
+	/* releases any addresses already attached to *rai as well as *rai itself */
+	rdma_freeaddrinfo(*rai);
+	*rai = NULL;
+	return ret;
+}
+
+int vrb_get_rdma_rai(const char *node, const char *service, uint64_t flags,
+		   const struct fi_info *hints, struct rdma_addrinfo **rai)
+{
+	if (hints && hints->addr_format == FI_SOCKADDR_IB &&
+	    (node || hints->src_addr || hints->dest_addr)) {
+		return vrb_get_sib_rai(node, service, flags, hints, rai);
+	}
+
+	return vrb_get_rdmacm_rai(node, service, flags, hints, rai);
+}
+
 int vrb_get_rai_id(const char *node, const char *service, uint64_t flags,
 		   const struct fi_info *hints, struct rdma_addrinfo **rai,
 		   struct rdma_cm_id **id)
@@ -165,7 +281,7 @@ int vrb_get_rai_id(const char *node, const char *service, uint64_t flags,
 	if (ret)
 		return ret;
 
-	ret = rdma_create_id(NULL, id, NULL, RDMA_PS_TCP);
+	ret = rdma_create_id(NULL, id, NULL, vrb_get_port_space(hints));
 	if (ret) {
VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_create_id", errno); ret = -errno; diff --git a/prov/verbs/src/fi_verbs.h b/prov/verbs/src/fi_verbs.h index 80af7fecb82..6bc137aef81 100644 --- a/prov/verbs/src/fi_verbs.h +++ b/prov/verbs/src/fi_verbs.h @@ -594,6 +594,7 @@ struct vrb_ep { size_t rx_cq_size; struct rdma_conn_param conn_param; struct vrb_cm_data_hdr *cm_hdr; + void *cm_priv_data; }; @@ -700,6 +701,17 @@ struct vrb_connreq { struct vrb_xrc_conn_info xrc; }; +/* Structure below is a copy of the RDMA CM header (structure ib_connect_hdr in + * file librdmacm/cma.h) + * DO NOT MODIFY! */ +struct vrb_rdma_cm_hdr { + uint8_t cma_version; /* Set by the kernel */ + uint8_t ip_version; /* IP version: 7:4 */ + uint16_t port; + uint32_t src_addr[4]; + uint32_t dst_addr[4]; +}; + struct vrb_cm_data_hdr { uint8_t size; char data[]; @@ -755,7 +767,6 @@ void vrb_ep_tgt_conn_done(struct vrb_xrc_ep *qp); int vrb_ep_destroy_xrc_qp(struct vrb_xrc_ep *ep); int vrb_xrc_close_srq(struct vrb_srq_ep *srq_ep); -int vrb_sockaddr_len(struct sockaddr *addr); int vrb_init_info(const struct fi_info **all_infos); int vrb_getinfo(uint32_t version, const char *node, const char *service, @@ -768,6 +779,7 @@ int vrb_fi_to_rai(const struct fi_info *fi, uint64_t flags, int vrb_get_matching_info(uint32_t version, const struct fi_info *hints, struct fi_info **info, const struct fi_info *verbs_info, uint8_t passive); +int vrb_get_port_space(const struct fi_info *info); void vrb_alter_info(const struct fi_info *hints, struct fi_info *info); struct verbs_ep_domain { diff --git a/prov/verbs/src/verbs_cm.c b/prov/verbs/src/verbs_cm.c index 63aea5feed2..8ae1599f11f 100644 --- a/prov/verbs/src/verbs_cm.c +++ b/prov/verbs/src/verbs_cm.c @@ -37,7 +37,7 @@ static int vrb_copy_addr(void *dst_addr, size_t *dst_addrlen, void *src_addr) { - size_t src_addrlen = vrb_sockaddr_len(src_addr); + size_t src_addrlen = ofi_sizeofaddr(src_addr); if (*dst_addrlen == 0) { *dst_addrlen = src_addrlen; @@ -123,23 +123,41 @@ 
vrb_msg_ep_prepare_cm_data(const void *param, size_t param_size, static inline void vrb_ep_prepare_rdma_cm_param(struct rdma_conn_param *conn_param, - struct vrb_cm_data_hdr *cm_hdr, - size_t cm_hdr_data_size) + void *priv_data, size_t priv_data_size) { - conn_param->private_data = cm_hdr; - conn_param->private_data_len = (uint8_t)cm_hdr_data_size; + conn_param->private_data = priv_data; + conn_param->private_data_len = (uint8_t)priv_data_size; conn_param->responder_resources = RDMA_MAX_RESP_RES; conn_param->initiator_depth = RDMA_MAX_INIT_DEPTH; conn_param->flow_control = 1; conn_param->rnr_retry_count = 7; } +static void +vrb_msg_ep_prepare_rdma_cm_hdr(void *priv_data, + const struct rdma_cm_id *id) +{ + struct vrb_rdma_cm_hdr *rdma_cm_hdr = priv_data; + + rdma_cm_hdr->ip_version = 6 << 4; /* IPv6 */ + rdma_cm_hdr->port = htons(ofi_addr_get_port(&id->route.addr.src_addr)); + + /* Record the GIDs */ + memcpy(rdma_cm_hdr->src_addr, + &((struct ofi_sockaddr_ib *)&id->route.addr.src_addr)->sib_addr, 16); + memcpy(rdma_cm_hdr->dst_addr, + &((struct ofi_sockaddr_ib *)&id->route.addr.dst_addr)->sib_addr, 16); +} + static int vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr, const void *param, size_t paramlen) { struct vrb_ep *ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); + size_t priv_data_len; + struct vrb_cm_data_hdr *cm_hdr; + off_t rdma_cm_hdr_len = 0; int ret; if (OFI_UNLIKELY(paramlen > VERBS_CM_DATA_SIZE)) @@ -151,13 +169,21 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr, return ret; } - ep->cm_hdr = malloc(sizeof(*(ep->cm_hdr)) + paramlen); - if (!ep->cm_hdr) + if (ep->id->route.addr.src_addr.sa_family == AF_IB) + rdma_cm_hdr_len = sizeof(struct vrb_rdma_cm_hdr); + + priv_data_len = sizeof(*cm_hdr) + paramlen + rdma_cm_hdr_len; + ep->cm_priv_data = malloc(priv_data_len); + if (!ep->cm_priv_data) return -FI_ENOMEM; - vrb_msg_ep_prepare_cm_data(param, paramlen, ep->cm_hdr); - vrb_ep_prepare_rdma_cm_param(&ep->conn_param, 
ep->cm_hdr, - sizeof(*(ep->cm_hdr)) + paramlen); + if (rdma_cm_hdr_len) + vrb_msg_ep_prepare_rdma_cm_hdr(ep->cm_priv_data, ep->id); + + cm_hdr = (void *)((char *)ep->cm_priv_data + rdma_cm_hdr_len); + vrb_msg_ep_prepare_cm_data(param, paramlen, cm_hdr); + vrb_ep_prepare_rdma_cm_param(&ep->conn_param, ep->cm_priv_data, + priv_data_len); ep->conn_param.retry_count = 15; if (ep->srq_ep) @@ -168,8 +194,8 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr, FI_WARN(&vrb_prov, FI_LOG_EP_CTRL, "rdma_resolve_route failed: %s (%d)\n", strerror(-ret), -ret); - free(ep->cm_hdr); - ep->cm_hdr = NULL; + free(ep->cm_priv_data); + ep->cm_priv_data = NULL; return ret; } return 0; diff --git a/prov/verbs/src/verbs_ep.c b/prov/verbs/src/verbs_ep.c index 18348cc2dbd..a5993c36617 100644 --- a/prov/verbs/src/verbs_ep.c +++ b/prov/verbs/src/verbs_ep.c @@ -355,7 +355,7 @@ static int vrb_close_free_ep(struct vrb_ep *ep) free(ep->util_ep.ep_fid.msg); ep->util_ep.ep_fid.msg = NULL; - free(ep->cm_hdr); + free(ep->cm_priv_data); if (ep->util_ep.rx_cq) { cq = container_of(ep->util_ep.rx_cq, struct vrb_cq, util_cq); @@ -1046,7 +1046,7 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info, if (!info->handle) { /* Only RC, XRC active RDMA CM ID is created at connect */ if (!(dom->flags & VRB_USE_XRC)) { - ret = vrb_create_ep(info, RDMA_PS_TCP, + ret = vrb_create_ep(info, vrb_get_port_space(info), &ep->id); if (ret) goto err1; @@ -1252,7 +1252,8 @@ int vrb_passive_ep(struct fid_fabric *fabric, struct fi_info *info, _pep->info->dest_addrlen = 0; } - ret = rdma_create_id(NULL, &_pep->id, &_pep->pep_fid.fid, RDMA_PS_TCP); + ret = rdma_create_id(NULL, &_pep->id, &_pep->pep_fid.fid, + vrb_get_port_space(info)); if (ret) { VERBS_INFO(FI_LOG_DOMAIN, "Unable to create PEP rdma_cm_id\n"); goto err2; diff --git a/prov/verbs/src/verbs_eq.c b/prov/verbs/src/verbs_eq.c index c97b6fa2a67..002c8a9b053 100644 --- a/prov/verbs/src/verbs_eq.c +++ b/prov/verbs/src/verbs_eq.c @@ -226,13 +226,13 
@@ vrb_eq_cm_getinfo(struct rdma_cm_event *event, struct fi_info *pep_info, free((*info)->src_addr); - (*info)->src_addrlen = vrb_sockaddr_len(rdma_get_local_addr(event->id)); + (*info)->src_addrlen = ofi_sizeofaddr(rdma_get_local_addr(event->id)); (*info)->src_addr = malloc((*info)->src_addrlen); if (!((*info)->src_addr)) goto err2; memcpy((*info)->src_addr, rdma_get_local_addr(event->id), (*info)->src_addrlen); - (*info)->dest_addrlen = vrb_sockaddr_len(rdma_get_peer_addr(event->id)); + (*info)->dest_addrlen = ofi_sizeofaddr(rdma_get_peer_addr(event->id)); (*info)->dest_addr = malloc((*info)->dest_addrlen); if (!((*info)->dest_addr)) goto err2; @@ -284,6 +284,17 @@ static inline int vrb_eq_copy_event_data(struct fi_eq_cm_entry *entry, return datalen; } +static void vrb_eq_skip_rdma_cm_hdr(const void **priv_data, + size_t *priv_data_len) +{ + size_t rdma_cm_hdr_len = sizeof(struct vrb_rdma_cm_hdr); + + if (*priv_data_len > rdma_cm_hdr_len) { + *priv_data = (void*)((char *)*priv_data + rdma_cm_hdr_len); + *priv_data_len -= rdma_cm_hdr_len; + } +} + static void vrb_eq_skip_xrc_cm_data(const void **priv_data, size_t *priv_data_len) { @@ -896,6 +907,8 @@ vrb_eq_cm_process_event(struct vrb_eq *eq, } if (*event == FI_CONNECTED) goto ack; + } else if (cma_event->id->route.addr.src_addr.sa_family == AF_IB) { + vrb_eq_skip_rdma_cm_hdr(&priv_data, &priv_datalen); } break; case RDMA_CM_EVENT_CONNECT_RESPONSE: diff --git a/prov/verbs/src/verbs_info.c b/prov/verbs/src/verbs_info.c index 7cbf05ffa2c..e9eae49a6ee 100644 --- a/prov/verbs/src/verbs_info.c +++ b/prov/verbs/src/verbs_info.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "fi_verbs.h" @@ -302,7 +303,6 @@ int vrb_fi_to_rai(const struct fi_info *fi, uint64_t flags, rai->ai_flags |= RAI_NUMERICHOST; rai->ai_qp_type = IBV_QPT_RC; - rai->ai_port_space = RDMA_PS_TCP; if (!fi) return 0; @@ -310,18 +310,22 @@ int vrb_fi_to_rai(const struct fi_info *fi, uint64_t flags, switch(fi->addr_format) { case 
FI_SOCKADDR_IN:
 	case FI_FORMAT_UNSPEC:
+		rai->ai_port_space = RDMA_PS_TCP;
 		rai->ai_family = AF_INET;
 		rai->ai_flags |= RAI_FAMILY;
 		break;
 	case FI_SOCKADDR_IN6:
+		rai->ai_port_space = RDMA_PS_TCP;
 		rai->ai_family = AF_INET6;
 		rai->ai_flags |= RAI_FAMILY;
 		break;
 	case FI_SOCKADDR_IB:
+		rai->ai_port_space = RDMA_PS_IB;
 		rai->ai_family = AF_IB;
 		rai->ai_flags |= RAI_FAMILY;
 		break;
 	case FI_SOCKADDR:
+		rai->ai_port_space = RDMA_PS_TCP;
 		if (fi->src_addrlen) {
 			rai->ai_family = ((struct sockaddr *)fi->src_addr)->sa_family;
 			rai->ai_flags |= RAI_FAMILY;
@@ -983,6 +987,127 @@ static int vrb_ifa_rdma_info(const struct ifaddrs *ifa, char **dev_name,
 	return ret;
 }
 
+int vrb_get_port_space(const struct fi_info *info)
+{
+	if (info != NULL && info->addr_format == FI_SOCKADDR_IB)
+		return RDMA_PS_IB;
+	else
+		return RDMA_PS_TCP;
+}
+
+static struct rdma_addrinfo *vrb_alloc_ib_addrinfo(uint8_t port_num,
+				const union ibv_gid *gid, uint16_t pkey)
+{
+	struct rdma_addrinfo *rai;
+	struct sockaddr_ib *sib;
+
+	rai = calloc(1, sizeof(struct rdma_addrinfo));
+	if (!rai)
+		return NULL;
+
+	rai->ai_flags = RAI_PASSIVE | RAI_NUMERICHOST | RAI_FAMILY;
+	rai->ai_family = AF_IB;
+	rai->ai_port_space = RDMA_PS_IB;
+
+	sib = calloc(1, sizeof(struct sockaddr_ib));
+	if (!sib) {
+		free(rai);
+		return NULL;
+	}
+	rai->ai_src_addr = (struct sockaddr *) sib;
+	rai->ai_src_len = sizeof(struct sockaddr_ib);
+
+	sib->sib_family = AF_IB;
+	memcpy(&sib->sib_addr.sib_raw, &gid->raw, sizeof(*gid));
+	sib->sib_pkey = pkey;
+	sib->sib_scope_id = port_num;
+
+	ofi_addr_set_port((struct sockaddr *)sib, 0);
+
+	return rai;
+}
+
+/*
+ * Adds a verbs_devs entry with an AF_IB source address for every GID (non-zero
+ * subnet prefix and interface id) / non-zero P_Key pair on every port of
+ * every verbs device. Returns 0 when at least one entry was added and
+ * -FI_ENODATA when none was; -errno if the device list is unavailable,
+ * -FI_ENOMEM if strdup() fails.
+ */
+static int vrb_get_sib(struct dlist_entry *verbs_devs)
+{
+	struct rdma_addrinfo *rai = NULL;
+	struct ibv_device **devices;
+	char *dev_name = NULL;
+	int num_devices;
+	struct ibv_context *context;
+	int ret, num_verbs_ifs = 0;
+	struct ibv_device_attr device_attr;
+	struct ibv_port_attr port_attr;
+	union ibv_gid gid;
+	uint16_t pkey;
+
+	devices = ibv_get_device_list(&num_devices);
+	if (!devices)
+		return -errno;
+
+	for (int dev = 0; dev < num_devices; dev++) {
+		context = ibv_open_device(devices[dev]);
+		if (!context)
+			continue;
+
+		ret = ibv_query_device(context, &device_attr);
+		if (ret)
+			goto close_dev;
+
+		for (int port = 1; port <= device_attr.phys_port_cnt; port++) {
+			ret = ibv_query_port(context, port, &port_attr);
+			if (ret)
+				continue;
+
+			for (int gidx = 0; gidx < port_attr.gid_tbl_len; gidx++) {
+				/* gid_tbl_len may contain GID entries that are NULL (fe80::),
+				 * so we need to filter them out */
+				ret = ibv_query_gid(context, port, gidx, &gid);
+				if (ret || !gid.global.interface_id || !gid.global.subnet_prefix)
+					continue;
+
+				for (int pidx = 0; pidx < port_attr.pkey_tbl_len; pidx++) {
+					ret = ibv_query_pkey(context, port, pidx, &pkey);
+					if (ret || !pkey)
+						continue;
+
+					rai = vrb_alloc_ib_addrinfo(port, &gid, pkey);
+					if (!rai)
+						continue;
+
+					dev_name = strdup(ibv_get_device_name(context->device));
+					if (!dev_name) {
+						rdma_freeaddrinfo(rai);
+						ibv_close_device(context);
+						ibv_free_device_list(devices);
+						return -FI_ENOMEM;
+					}
+
+					ret = verbs_devs_add(verbs_devs, dev_name, rai);
+					if (ret) {
+						free(dev_name);
+						rdma_freeaddrinfo(rai);
+						continue;
+					}
+
+					num_verbs_ifs++;
+				}
+			}
+		}
+close_dev:
+		/* verbs_devs_add() was handed only the name copy and rai, not the context */
+		ibv_close_device(context);
+	}
+
+	ibv_free_device_list(devices);
+	return num_verbs_ifs ? 0 : -FI_ENODATA;
+}
+
 /* Builds a list of interfaces that correspond to active verbs devices */
 static int vrb_getifaddrs(struct dlist_entry *verbs_devs)
 {
@@ -1109,18 +1234,6 @@ static int vrb_get_srcaddr_devs(struct fi_info **info)
 	return 0;
 }
 
-static void vrb_sockaddr_set_port(struct sockaddr *sa, uint16_t port)
-{
-	switch(sa->sa_family) {
-	case AF_INET:
-		((struct sockaddr_in *)sa)->sin_port = port;
-		break;
-	case AF_INET6:
-		((struct sockaddr_in6 *)sa)->sin6_port = port;
-		break;
-	}
-}
-
 /* the `rai` parameter is used for the MSG EP type */
 /* the `fmt`, `[src | dest]_addr` parameters are used for the DGRAM EP type */
 /* if the `fmt` parameter isn't used, pass FI_FORMAT_UNSPEC */
@@ -1181,7 +1294,7 @@ static int vrb_fill_addr(struct rdma_addrinfo *rai, struct fi_info **info,
 	 * corresponds to a valid dest addr)
 	 */
 	local_addr = rdma_get_local_addr(id);
-	rai->ai_src_len = vrb_sockaddr_len(local_addr);
+	rai->ai_src_len = ofi_sizeofaddr(local_addr);
 	rai->ai_src_addr = malloc(rai->ai_src_len);
 	if (!rai->ai_src_addr)
 		return -FI_ENOMEM;
@@ -1190,7 +1303,7 @@ static int vrb_fill_addr(struct rdma_addrinfo *rai, struct fi_info **info,
 	/* User didn't specify a port. Zero out the random port
 	 * assigned by rdmamcm so that this rai/fi_info can be
 	 * used multiple times to create rdma endpoints.*/
-	vrb_sockaddr_set_port(rai->ai_src_addr, 0);
+	ofi_addr_set_port(rai->ai_src_addr, 0);
 
 rai_to_fi:
 	return vrb_set_info_addrs(*info, rai, FI_FORMAT_UNSPEC,
@@ -1237,6 +1350,8 @@ int vrb_init_info(const struct fi_info **all_infos)
 
 	vrb_getifaddrs(&verbs_devs);
 
+	vrb_get_sib(&verbs_devs);
+
 	if (dlist_empty(&verbs_devs))
 		FI_WARN(&vrb_prov, FI_LOG_FABRIC,
 			"no valid IPoIB interfaces found, FI_EP_MSG endpoint "
diff --git a/src/common.c b/src/common.c
index cf65c8024cd..b351c5845a2 100644
--- a/src/common.c
+++ b/src/common.c
@@ -284,6 +284,7 @@ const char *ofi_straddr(char *buf, size_t *len,
 	const struct sockaddr *sock_addr;
 	const struct sockaddr_in6 *sin6;
 	const struct sockaddr_in *sin;
+	const struct ofi_sockaddr_ib *sib;
 	char str[INET6_ADDRSTRLEN + 8];
 	size_t size;
 
@@ -332,7 +333,19 @@ const char *ofi_straddr(char *buf, size_t *len,
 				str, *((uint16_t *)addr + 8), *((uint32_t *)addr + 5));
 		break;
 	case FI_SOCKADDR_IB:
-		size = snprintf(buf, *len, "fi_sockaddr_ib://%p", addr);
+		sib = addr;
+		memset(str, 0, sizeof(str));
+		if (!inet_ntop(AF_INET6, sib->sib_addr, str, INET6_ADDRSTRLEN))
+			return NULL;
+
+		size = snprintf(buf, *len, "fi_sockaddr_ib://[%s]" /* GID */
+				":0x%" PRIx16 /* P_Key */
+				":0x%" PRIx16 /* port space */
+				":0x%" PRIx8 /* Scope ID */,
+				str, /* GID */
+				ntohs(sib->sib_pkey), /* P_Key */
+				(uint16_t)(ntohll(sib->sib_sid) >> 16) & 0xfff, /* port space */
+				(uint8_t)ntohll(sib->sib_scope_id) & 0xff);
 		break;
 	case FI_ADDR_PSMX:
 		size = snprintf(buf, *len, "fi_addr_psmx://%" PRIx64,
@@ -380,7 +393,7 @@ const char *ofi_straddr(char *buf, size_t *len,
 	return buf;
 }
 
-static uint32_t ofi_addr_format(const char *str)
+uint32_t ofi_addr_format(const char *str)
 {
 	char fmt[16];
 	int ret;
@@ -476,6 +489,101 @@ static int ofi_str_to_ib_ud(const char *str, void **addr, size_t *len)
 	return -FI_EINVAL;
 }
 
+static int ofi_str_to_sib(const char *str,
void **addr, size_t *len) +{ + int ret; + char *tok, *endptr, *saveptr; + struct ofi_sockaddr_ib *sib; + uint16_t pkey; + uint16_t ps; + uint64_t scope_id; + uint16_t port; + char gid[64 + 1]; + char extra_str[64 + 1]; + + memset(gid, 0, sizeof(gid)); + + ret = sscanf(str, "%*[^:]://[%64[^]]]" /* GID */ + ":%64s", /* P_Key : port_space : Scope ID : port */ + gid, extra_str); + if (ret != 2) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid GID in address: %s\n", str); + return -FI_EINVAL; + } + + tok = strtok_r(extra_str, ":", &saveptr); + if (!tok) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid pkey in address: %s\n", str); + return -FI_EINVAL; + } + + pkey = strtol(tok, &endptr, 0); + if (*endptr) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid pkey in address: %s\n", str); + return -FI_EINVAL; + } + + tok = strtok_r(NULL, ":", &saveptr); + if (!tok) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid port space in address: %s\n", str); + return -FI_EINVAL; + } + + ps = strtol(tok, &endptr, 0); + if (*endptr) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid port space in address: %s\n", str); + return -FI_EINVAL; + } + + tok = strtok_r(NULL, ":", &saveptr); + if (!tok) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid scope id in address: %s\n", str); + return -FI_EINVAL; + } + + scope_id = strtol(tok, &endptr, 0); + if (*endptr) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid scope id in address: %s\n", str); + return -FI_EINVAL; + } + + /* Port is optional */ + tok = strtok_r(NULL, ":", &saveptr); + if (tok) + port = strtol(tok, &endptr, 0); + else + port = 0; + + *len = sizeof(struct ofi_sockaddr_ib); + *addr = calloc(1, *len); + if (!*addr) + return -FI_ENOMEM; + + sib = (struct ofi_sockaddr_ib *)(*addr); + + if (inet_pton(AF_INET6, gid, sib->sib_addr) > 0) { + sib->sib_family = AF_IB; + sib->sib_pkey = htons(pkey); + if (ps && port) { + sib->sib_sid = htonll(((uint64_t) ps << 16) + port); + sib->sib_sid_mask = htonll(OFI_IB_IP_PS_MASK | + OFI_IB_IP_PORT_MASK); + } + 
sib->sib_scope_id = htonll(scope_id); + return FI_SUCCESS; + } + + free(*addr); + return -FI_EINVAL; +} + static int ofi_str_to_efa(const char *str, void **addr, size_t *len) { char gid[INET6_ADDRSTRLEN]; @@ -691,6 +799,7 @@ int ofi_str_toaddr(const char *str, uint32_t *addr_format, case FI_ADDR_EFA: return ofi_str_to_efa(str, addr, len); case FI_SOCKADDR_IB: + return ofi_str_to_sib(str, addr, len); case FI_ADDR_GNI: case FI_ADDR_BGQ: case FI_ADDR_MLX: @@ -749,10 +858,10 @@ static int ofi_is_any_addr_port(struct sockaddr *addr) { switch (ofi_sa_family(addr)) { case AF_INET: - return (ofi_ipv4_is_any_addr(addr) && + return (ofi_sin_is_any_addr(addr) && ofi_sin_port(addr)); case AF_INET6: - return (ofi_ipv6_is_any_addr(addr) && + return (ofi_sin6_is_any_addr(addr) && ofi_sin6_port(addr)); default: FI_WARN(&core_prov, FI_LOG_CORE,