nvmf/rdma: Add WR batch rdma parameter
On x86 with QD < 64 there is a benefit from disabling batching when the
IO pattern is random read, initiators run on many cores, and BS = 4K.
With QD = 4 we also see a benefit from disabling batching for small
block sizes (BS < 2K).

Batching is configurable with the optional WRBatching parameter in the
configuration file (default: True).
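
For illustration, the same behavior can also be controlled through the new
wr_batching RPC parameter added by this patch. Below is a minimal sketch
using the Python RPC client from scripts/rpc; the socket path and import
layout are assumptions, not part of this patch:

    # Sketch: create the RDMA transport with WR batching disabled.
    # Assumes it runs from the scripts/ directory and the SPDK target
    # listens on the default /var/tmp/spdk.sock Unix socket.
    from rpc.client import JSONRPCClient
    import rpc.nvmf

    client = JSONRPCClient('/var/tmp/spdk.sock')
    rpc.nvmf.nvmf_create_transport(client, trtype='RDMA', wr_batching=False)

The legacy configuration file path should achieve the same with
"WRBatching False" in the RDMA [Transport] section (see etc/spdk/nvmf.conf.in
below).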

Signed-off-by: Ivan Betsis <c_ivanb@mellanox.com>
Signed-off-by: Evgeniy Kochetov <evgeniik@mellanox.com>
Signed-off-by: Sasha Kotchubievsky <sashakot@mellanox.com>
Ivan Betsis committed Feb 11, 2020
1 parent a8d0169 commit c3f17a8
Showing 8 changed files with 54 additions and 14 deletions.
1 change: 1 addition & 0 deletions doc/jsonrpc.md
@@ -3844,6 +3844,7 @@ no_srq | Optional | boolean | Disable shared receive queue
c2h_success | Optional | boolean | Disable C2H success optimization (TCP only)
dif_insert_or_strip | Optional | boolean | Enable DIF insert for write I/O and DIF strip for read I/O DIF (TCP only)
sock_priority | Optional | number | The socket priority of the connection owned by this transport (TCP only)
wr_batching | Optional | boolean | Disable work request batching (RDMA only)

### Example:

3 changes: 3 additions & 0 deletions etc/spdk/nvmf.conf.in
@@ -111,6 +111,9 @@
# Set the maximum number outstanding I/O per shared receive queue. Relevant only for RDMA transport
#MaxSRQDepth 4096

# Enable or disable work request batching. Relevant only for RDMA transport
#WRBatching True

[Transport]
# Set TCP transport type.
Type TCP
1 change: 1 addition & 0 deletions include/spdk/nvmf.h
@@ -83,6 +83,7 @@ struct spdk_nvmf_transport_opts {
bool no_srq;
bool c2h_success;
bool dif_insert_or_strip;
bool wr_batching;
uint32_t sock_priority;
};

5 changes: 5 additions & 0 deletions lib/nvmf/nvmf_rpc.c
@@ -1602,6 +1602,10 @@ static const struct spdk_json_object_decoder nvmf_rpc_create_transport_decoder[]
"tgt_name", offsetof(struct nvmf_rpc_create_transport_ctx, tgt_name),
spdk_json_decode_string, true
},
{
"wr_batching", offsetof(struct nvmf_rpc_create_transport_ctx, opts.wr_batching),
spdk_json_decode_bool, true
},
};

static void
@@ -1745,6 +1749,7 @@ dump_nvmf_transport(struct spdk_json_write_ctx *w, struct spdk_nvmf_transport *t
if (type == SPDK_NVME_TRANSPORT_RDMA) {
spdk_json_write_named_uint32(w, "max_srq_depth", opts->max_srq_depth);
spdk_json_write_named_bool(w, "no_srq", opts->no_srq);
spdk_json_write_named_bool(w, "wr_batching", opts->wr_batching);
} else if (type == SPDK_NVME_TRANSPORT_TCP) {
spdk_json_write_named_bool(w, "c2h_success", opts->c2h_success);
spdk_json_write_named_uint32(w, "sock_priority", opts->sock_priority);
45 changes: 34 additions & 11 deletions lib/nvmf/rdma.c
@@ -522,6 +522,14 @@ struct spdk_nvmf_rdma_transport {
static inline void
spdk_nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair);

static void
_poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_poller *rpoller);

static void
_poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_poller *rpoller);

static inline int
spdk_nvmf_rdma_check_ibv_state(enum ibv_qp_state state)
{
@@ -1102,7 +1110,8 @@ spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
/* Append the given recv wr structure to the resource structs outstanding recvs list. */
/* This function accepts either a single wr or the first wr in a linked list. */
static void
nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first)
nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first,
struct spdk_nvmf_rdma_transport *rtransport)
{
struct ibv_recv_wr *last;

@@ -1121,12 +1130,17 @@ nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_r
rqpair->resources->recvs_to_post.last->next = first;
rqpair->resources->recvs_to_post.last = last;
}

if (!rtransport->transport.opts.wr_batching) {
_poller_submit_recvs(rtransport, rqpair->poller);
}
}

/* Append the given send wr structure to the qpair's outstanding sends list. */
/* This function accepts either a single wr or the first wr in a linked list. */
static void
nvmf_rdma_qpair_queue_send_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *first)
nvmf_rdma_qpair_queue_send_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *first,
struct spdk_nvmf_rdma_transport *rtransport)
{
struct ibv_send_wr *last;

@@ -1143,10 +1157,14 @@ nvmf_rdma_qpair_queue_send_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_s
rqpair->sends_to_post.last->next = first;
rqpair->sends_to_post.last = last;
}

if (!rtransport->transport.opts.wr_batching) {
_poller_submit_sends(rtransport, rqpair->poller);
}
}

static int
request_transfer_in(struct spdk_nvmf_request *req)
request_transfer_in(struct spdk_nvmf_request *req, struct spdk_nvmf_rdma_transport *rtransport)
{
struct spdk_nvmf_rdma_request *rdma_req;
struct spdk_nvmf_qpair *qpair;
@@ -1159,14 +1177,15 @@ request_transfer_in(struct spdk_nvmf_request *req)
assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
assert(rdma_req != NULL);

nvmf_rdma_qpair_queue_send_wrs(rqpair, &rdma_req->data.wr);
nvmf_rdma_qpair_queue_send_wrs(rqpair, &rdma_req->data.wr, rtransport);
rqpair->current_read_depth += rdma_req->num_outstanding_data_wr;
rqpair->current_send_depth += rdma_req->num_outstanding_data_wr;
return 0;
}

static int
request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
request_transfer_out(struct spdk_nvmf_request *req, int *data_posted,
struct spdk_nvmf_rdma_transport *rtransport)
{
int num_outstanding_data_wr = 0;
struct spdk_nvmf_rdma_request *rdma_req;
@@ -1192,7 +1211,7 @@ request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
/* queue the capsule for the recv buffer */
assert(rdma_req->recv != NULL);

nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr);
nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr, rtransport);

rdma_req->recv = NULL;
assert(rqpair->current_recv_depth > 0);
Expand All @@ -1210,7 +1229,7 @@ request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
*data_posted = 1;
num_outstanding_data_wr = rdma_req->num_outstanding_data_wr;
}
nvmf_rdma_qpair_queue_send_wrs(rqpair, first);
nvmf_rdma_qpair_queue_send_wrs(rqpair, first, rtransport);
/* +1 for the rsp wr */
rqpair->current_send_depth += num_outstanding_data_wr + 1;

@@ -2132,7 +2151,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
/* We have already verified that this request is the head of the queue. */
STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_read_queue, state_link);

rc = request_transfer_in(&rdma_req->req);
rc = request_transfer_in(&rdma_req->req, rtransport);
if (!rc) {
rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER;
} else {
@@ -2240,7 +2259,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
case RDMA_REQUEST_STATE_READY_TO_COMPLETE:
spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0,
(uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
rc = request_transfer_out(&rdma_req->req, &data_posted);
rc = request_transfer_out(&rdma_req->req, &data_posted, rtransport);
assert(rc == 0); /* No good way to handle this currently */
if (rc) {
rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
@@ -2295,6 +2314,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
#define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32
#define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false
#define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false
#define SPDK_NVMF_RDMA_WR_BATCHING true

static void
spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
@@ -2310,6 +2330,7 @@ spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
opts->max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH;
opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ;
opts->dif_insert_or_strip = SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP;
opts->wr_batching = SPDK_NVMF_RDMA_WR_BATCHING;
}

const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = {
@@ -2370,7 +2391,8 @@ spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
" Transport opts: max_ioq_depth=%d, max_io_size=%d,\n"
" max_qpairs_per_ctrlr=%d, io_unit_size=%d,\n"
" in_capsule_data_size=%d, max_aq_depth=%d,\n"
" num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d\n",
" num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d,\n"
" wr_batching=%d\n",
opts->max_queue_depth,
opts->max_io_size,
opts->max_qpairs_per_ctrlr,
@@ -2379,7 +2401,8 @@ spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
opts->max_aq_depth,
opts->num_shared_buffers,
opts->max_srq_depth,
opts->no_srq);
opts->no_srq,
opts->wr_batching);

/* I/O unit size cannot be larger than max I/O size */
if (opts->io_unit_size > opts->max_io_size) {
2 changes: 2 additions & 0 deletions module/event/subsystems/nvmf/conf.c
@@ -642,6 +642,8 @@ spdk_nvmf_parse_transport(struct spdk_nvmf_parse_transport_ctx *ctx)
}
bval = spdk_conf_section_get_boolval(ctx->sp, "NoSRQ", false);
opts.no_srq = bval;
bval = spdk_conf_section_get_boolval(ctx->sp, "WRBatching", true);
opts.wr_batching = bval;
}

if (trtype == SPDK_NVME_TRANSPORT_TCP) {
4 changes: 3 additions & 1 deletion scripts/rpc.py
@@ -1684,7 +1684,8 @@ def nvmf_create_transport(args):
no_srq=args.no_srq,
c2h_success=args.c2h_success,
dif_insert_or_strip=args.dif_insert_or_strip,
sock_priority=args.sock_priority)
sock_priority=args.sock_priority,
wr_batching=args.wr_batching)

p = subparsers.add_parser('nvmf_create_transport', help='Create NVMf transport')
p.add_argument('-t', '--trtype', help='Transport type (ex. RDMA)', type=str, required=True)
@@ -1702,6 +1703,7 @@ def nvmf_create_transport(args):
p.add_argument('-o', '--c2h-success', action='store_false', help='Disable C2H success optimization. Relevant only for TCP transport')
p.add_argument('-f', '--dif-insert-or-strip', action='store_true', help='Enable DIF insert/strip. Relevant only for TCP transport')
p.add_argument('-y', '--sock-priority', help='The sock priority of the tcp connection. Relevant only for TCP transport', type=int)
p.add_argument('-b', '--wr-batching', action='store_false', help='Disable work request batching. Relevant only for RDMA transport')
p.set_defaults(func=nvmf_create_transport)

def nvmf_get_transports(args):
7 changes: 5 additions & 2 deletions scripts/rpc/nvmf.py
@@ -106,7 +106,8 @@ def nvmf_create_transport(client,
no_srq=False,
c2h_success=True,
dif_insert_or_strip=None,
sock_priority=None):
sock_priority=None,
wr_batching=True):
"""NVMf Transport Create options.
Args:
@@ -123,7 +124,7 @@ def nvmf_create_transport(client,
no_srq: Boolean flag to disable SRQ even for devices that support it - RDMA specific (optional)
c2h_success: Boolean flag to disable the C2H success optimization - TCP specific (optional)
dif_insert_or_strip: Boolean flag to enable DIF insert/strip for I/O - TCP specific (optional)
wr_batching: Boolean flag to disable work request batching - RDMA specific (optional)
Returns:
True or False
"""
@@ -158,6 +159,8 @@ def nvmf_create_transport(client,
params['dif_insert_or_strip'] = dif_insert_or_strip
if sock_priority:
params['sock_priority'] = sock_priority
if wr_batching is not None:
params['wr_batching'] = wr_batching
return client.call('nvmf_create_transport', params)

