nvmf/rdma: Add WR batch rdma parameter
On x86 with QD < 64 there is a benefit from disabling batching when the
IO pattern is random read, initiators run on many cores, and BS = 4K.
With QD = 4 we also see a benefit from disabling batching for small
block sizes (BS < 2K).

Batching is configurable with the optional WRBatching parameter in the
configuration file (default: True).
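
For illustration, the same behavior can also be controlled through the new
wr_batching RPC parameter added by this patch. Below is a minimal sketch
using the Python RPC client from scripts/rpc; the socket path and import
layout are assumptions, not part of this patch:

    # Sketch: create the RDMA transport with WR batching disabled.
    # Assumes it runs from the scripts/ directory and the SPDK target
    # listens on the default /var/tmp/spdk.sock Unix socket.
    from rpc.client import JSONRPCClient
    import rpc.nvmf

    client = JSONRPCClient('/var/tmp/spdk.sock')
    rpc.nvmf.nvmf_create_transport(client, trtype='RDMA', wr_batching=False)

The legacy configuration file path should achieve the same with
"WRBatching False" in the RDMA [Transport] section (see etc/spdk/nvmf.conf.in
below).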

Signed-off-by: Ivan Betsis <c_ivanb@mellanox.com>
Signed-off-by: Evgeniy Kochetov <evgeniik@mellanox.com>
Signed-off-by: Sasha Kotchubievsky <sashakot@mellanox.com>
Ivan Betsis committed Feb 11, 2020
1 parent a8d0169 commit c3f17a8
Showing 8 changed files with 54 additions and 14 deletions.
1 change: 1 addition & 0 deletions doc/jsonrpc.md
@@ -3844,6 +3844,7 @@ no_srq | Optional | boolean | Disable shared receive queue
c2h_success | Optional | boolean | Disable C2H success optimization (TCP only)
dif_insert_or_strip | Optional | boolean | Enable DIF insert for write I/O and DIF strip for read I/O DIF (TCP only)
sock_priority | Optional | number | The socket priority of the connection owned by this transport (TCP only)
wr_batching | Optional | boolean | Disable work request batching (RDMA only)

### Example:

3 changes: 3 additions & 0 deletions etc/spdk/nvmf.conf.in
@@ -111,6 +111,9 @@
# Set the maximum number outstanding I/O per shared receive queue. Relevant only for RDMA transport
#MaxSRQDepth 4096

# Enable or disable work request batching. Relevant only for RDMA transport
#WRBatching True

[Transport]
# Set TCP transport type.
Type TCP
1 change: 1 addition & 0 deletions include/spdk/nvmf.h
@@ -83,6 +83,7 @@ struct spdk_nvmf_transport_opts {
bool no_srq;
bool c2h_success;
bool dif_insert_or_strip;
bool wr_batching;
uint32_t sock_priority;
};

5 changes: 5 additions & 0 deletions lib/nvmf/nvmf_rpc.c
@@ -1602,6 +1602,10 @@ static const struct spdk_json_object_decoder nvmf_rpc_create_transport_decoder[]
"tgt_name", offsetof(struct nvmf_rpc_create_transport_ctx, tgt_name),
spdk_json_decode_string, true
},
{
"wr_batching", offsetof(struct nvmf_rpc_create_transport_ctx, opts.wr_batching),
spdk_json_decode_bool, true
},
};

static void
@@ -1745,6 +1749,7 @@ dump_nvmf_transport(struct spdk_json_write_ctx *w, struct spdk_nvmf_transport *t
if (type == SPDK_NVME_TRANSPORT_RDMA) {
spdk_json_write_named_uint32(w, "max_srq_depth", opts->max_srq_depth);
spdk_json_write_named_bool(w, "no_srq", opts->no_srq);
spdk_json_write_named_bool(w, "wr_batching", opts->wr_batching);
} else if (type == SPDK_NVME_TRANSPORT_TCP) {
spdk_json_write_named_bool(w, "c2h_success", opts->c2h_success);
spdk_json_write_named_uint32(w, "sock_priority", opts->sock_priority);
45 changes: 34 additions & 11 deletions lib/nvmf/rdma.c
@@ -522,6 +522,14 @@ struct spdk_nvmf_rdma_transport {
static inline void
spdk_nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair);

static void
_poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_poller *rpoller);

static void
_poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_poller *rpoller);

static inline int
spdk_nvmf_rdma_check_ibv_state(enum ibv_qp_state state)
{
@@ -1102,7 +1110,8 @@ spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
/* Append the given recv wr structure to the resource structs outstanding recvs list. */
/* This function accepts either a single wr or the first wr in a linked list. */
static void
nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first)
nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first,
struct spdk_nvmf_rdma_transport *rtransport)
{
struct ibv_recv_wr *last;

@@ -1121,12 +1130,17 @@ nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_r
rqpair->resources->recvs_to_post.last->next = first;
rqpair->resources->recvs_to_post.last = last;
}

if (!rtransport->transport.opts.wr_batching) {
_poller_submit_recvs(rtransport, rqpair->poller);
}
}

/* Append the given send wr structure to the qpair's outstanding sends list. */
/* This function accepts either a single wr or the first wr in a linked list. */
static void
nvmf_rdma_qpair_queue_send_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *first)
nvmf_rdma_qpair_queue_send_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *first,
struct spdk_nvmf_rdma_transport *rtransport)
{
struct ibv_send_wr *last;

@@ -1143,10 +1157,14 @@ nvmf_rdma_qpair_queue_send_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_s
rqpair->sends_to_post.last->next = first;
rqpair->sends_to_post.last = last;
}

if (!rtransport->transport.opts.wr_batching) {
_poller_submit_sends(rtransport, rqpair->poller);
}
}

static int
request_transfer_in(struct spdk_nvmf_request *req)
request_transfer_in(struct spdk_nvmf_request *req, struct spdk_nvmf_rdma_transport *rtransport)
{
struct spdk_nvmf_rdma_request *rdma_req;
struct spdk_nvmf_qpair *qpair;
@@ -1159,14 +1177,15 @@ request_transfer_in(struct spdk_nvmf_request *req)
assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
assert(rdma_req != NULL);

nvmf_rdma_qpair_queue_send_wrs(rqpair, &rdma_req->data.wr);
nvmf_rdma_qpair_queue_send_wrs(rqpair, &rdma_req->data.wr, rtransport);
rqpair->current_read_depth += rdma_req->num_outstanding_data_wr;
rqpair->current_send_depth += rdma_req->num_outstanding_data_wr;
return 0;
}

static int
request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
request_transfer_out(struct spdk_nvmf_request *req, int *data_posted,
struct spdk_nvmf_rdma_transport *rtransport)
{
int num_outstanding_data_wr = 0;
struct spdk_nvmf_rdma_request *rdma_req;
@@ -1192,7 +1211,7 @@ request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
/* queue the capsule for the recv buffer */
assert(rdma_req->recv != NULL);

nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr);
nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr, rtransport);

rdma_req->recv = NULL;
assert(rqpair->current_recv_depth > 0);
Expand All @@ -1210,7 +1229,7 @@ request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
*data_posted = 1;
num_outstanding_data_wr = rdma_req->num_outstanding_data_wr;
}
nvmf_rdma_qpair_queue_send_wrs(rqpair, first);
nvmf_rdma_qpair_queue_send_wrs(rqpair, first, rtransport);
/* +1 for the rsp wr */
rqpair->current_send_depth += num_outstanding_data_wr + 1;

@@ -2132,7 +2151,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
/* We have already verified that this request is the head of the queue. */
STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_read_queue, state_link);

rc = request_transfer_in(&rdma_req->req);
rc = request_transfer_in(&rdma_req->req, rtransport);
if (!rc) {
rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER;
} else {
@@ -2240,7 +2259,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
case RDMA_REQUEST_STATE_READY_TO_COMPLETE:
spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0,
(uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
rc = request_transfer_out(&rdma_req->req, &data_posted);
rc = request_transfer_out(&rdma_req->req, &data_posted, rtransport);
assert(rc == 0); /* No good way to handle this currently */
if (rc) {
rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
@@ -2295,6 +2314,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
#define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32
#define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false
#define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false
#define SPDK_NVMF_RDMA_WR_BATCHING true

static void
spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
@@ -2310,6 +2330,7 @@ spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
opts->max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH;
opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ;
opts->dif_insert_or_strip = SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP;
opts->wr_batching = SPDK_NVMF_RDMA_WR_BATCHING;
}

const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = {
@@ -2370,7 +2391,8 @@ spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
" Transport opts: max_ioq_depth=%d, max_io_size=%d,\n"
" max_qpairs_per_ctrlr=%d, io_unit_size=%d,\n"
" in_capsule_data_size=%d, max_aq_depth=%d,\n"
" num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d\n",
" num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d,\n"
" wr_batching=%d\n",
opts->max_queue_depth,
opts->max_io_size,
opts->max_qpairs_per_ctrlr,
@@ -2379,7 +2401,8 @@ spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
opts->max_aq_depth,
opts->num_shared_buffers,
opts->max_srq_depth,
opts->no_srq);
opts->no_srq,
opts->wr_batching);

/* I/O unit size cannot be larger than max I/O size */
if (opts->io_unit_size > opts->max_io_size) {
2 changes: 2 additions & 0 deletions module/event/subsystems/nvmf/conf.c
@@ -642,6 +642,8 @@ spdk_nvmf_parse_transport(struct spdk_nvmf_parse_transport_ctx *ctx)
}
bval = spdk_conf_section_get_boolval(ctx->sp, "NoSRQ", false);
opts.no_srq = bval;
bval = spdk_conf_section_get_boolval(ctx->sp, "WRBatching", true);
opts.wr_batching = bval;
}

if (trtype == SPDK_NVME_TRANSPORT_TCP) {
4 changes: 3 additions & 1 deletion scripts/rpc.py
@@ -1684,7 +1684,8 @@ def nvmf_create_transport(args):
no_srq=args.no_srq,
c2h_success=args.c2h_success,
dif_insert_or_strip=args.dif_insert_or_strip,
sock_priority=args.sock_priority)
sock_priority=args.sock_priority,
wr_batching=args.wr_batching)

p = subparsers.add_parser('nvmf_create_transport', help='Create NVMf transport')
p.add_argument('-t', '--trtype', help='Transport type (ex. RDMA)', type=str, required=True)
@@ -1702,6 +1703,7 @@ def nvmf_create_transport(args):
p.add_argument('-o', '--c2h-success', action='store_false', help='Disable C2H success optimization. Relevant only for TCP transport')
p.add_argument('-f', '--dif-insert-or-strip', action='store_true', help='Enable DIF insert/strip. Relevant only for TCP transport')
p.add_argument('-y', '--sock-priority', help='The sock priority of the tcp connection. Relevant only for TCP transport', type=int)
p.add_argument('-b', '--wr-batching', action='store_false', help='Disable work request batching. Relevant only for RDMA transport')
p.set_defaults(func=nvmf_create_transport)

def nvmf_get_transports(args):
7 changes: 5 additions & 2 deletions scripts/rpc/nvmf.py
@@ -106,7 +106,8 @@ def nvmf_create_transport(client,
no_srq=False,
c2h_success=True,
dif_insert_or_strip=None,
sock_priority=None):
sock_priority=None,
wr_batching=True):
"""NVMf Transport Create options.
Args:
@@ -123,7 +124,7 @@ def nvmf_create_transport(client,
no_srq: Boolean flag to disable SRQ even for devices that support it - RDMA specific (optional)
c2h_success: Boolean flag to disable the C2H success optimization - TCP specific (optional)
dif_insert_or_strip: Boolean flag to enable DIF insert/strip for I/O - TCP specific (optional)
wr_batching: Boolean flag to disable work request batching - RDMA specific (optional)
Returns:
True or False
"""
@@ -158,6 +159,8 @@ def nvmf_create_transport(client,
params['dif_insert_or_strip'] = dif_insert_or_strip
if sock_priority:
params['sock_priority'] = sock_priority
if wr_batching is not None:
params['wr_batching'] = wr_batching
return client.call('nvmf_create_transport', params)

