Skip to content

Commit

Permalink
zvol: use multiple taskq
Browse files Browse the repository at this point in the history
Currently, zvol uses a single taskq, resulting in throughput bottleneck
under heavy load due to lock contention on the single taskq. This patch
addresses the performance bottleneck under heavy load conditions by
utilizing multiple taskqs, thus mitigating lock contention. The number
of taskqs scale dynamically based on the available CPUs in the system,
as illustrated below:

                taskq   total
cpus    taskqs  threads threads
------- ------- ------- -------
1       1       32       32
2       1       32       32
4       1       32       32
8       2       16       32
16      3       11       33
32      5       7        35
64      8       8        64
128     11      12       132
256     16      16       256

Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
  • Loading branch information
ixhamza committed Mar 14, 2024
1 parent 8f2f6cd commit 3e5bf3a
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 10 deletions.
7 changes: 7 additions & 0 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -2350,6 +2350,13 @@ The number of requests which can be handled concurrently is controlled by
is ignored when running on a kernel that supports block multiqueue
.Pq Li blk-mq .
.
.It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint
Number of zvol taskqs.
If
.Sy 0
(the default) then scaling is done internally to prefer 6 threads per taskq.
This only applies on Linux.
.
.It Sy zvol_threads Ns = Ns Sy 0 Pq uint
The number of system wide threads to use for processing zvol block IOs.
If
Expand Down
102 changes: 92 additions & 10 deletions module/os/linux/zfs/zvol_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
Expand All @@ -53,6 +54,12 @@ static unsigned int zvol_request_sync = 0;
static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;

/*
* Switch taskq at multiple of 512 MB offset. This can be set to a lower value
* to utilize more threads for small files but may affect prefetch hits.
*/
#define ZVOL_TASKQ_OFFSET_SHIFT 29

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif
Expand All @@ -74,6 +81,7 @@ static boolean_t zvol_use_blk_mq = B_FALSE;
* read and write tests to a zvol in an NVMe pool (with 16 CPUs).
*/
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
static unsigned int zvol_num_taskqs = 0;
#endif

#ifndef BLKDEV_DEFAULT_RQ
Expand Down Expand Up @@ -114,7 +122,11 @@ struct zvol_state_os {
boolean_t use_blk_mq;
};

static taskq_t *zvol_taskq;
typedef struct zv_taskq {
uint_t tqs_cnt;
taskq_t **tqs_taskq;
} zv_taskq_t;
static zv_taskq_t zvol_taskqs;
static struct ida zvol_ida;

typedef struct zv_request_stack {
Expand Down Expand Up @@ -532,6 +544,17 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
}

zv_request_task_t *task;
zv_taskq_t *ztqs = &zvol_taskqs;
uint_t blk_mq_hw_queue = 0;
uint_t tq_idx;
uint_t taskq_hash;
#ifdef HAVE_BLK_MQ
if (rq)
blk_mq_hw_queue = rq->mq_hctx->queue_num;
#endif
taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
blk_mq_hw_queue, 0);
tq_idx = taskq_hash % ztqs->tqs_cnt;

if (rw == WRITE) {
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
Expand Down Expand Up @@ -601,15 +624,15 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
zvol_discard(&zvr);
} else {
task = zv_request_task_create(zvr);
taskq_dispatch_ent(zvol_taskq,
taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
zvol_discard_task, task, 0, &task->ent);
}
} else {
if (force_sync) {
zvol_write(&zvr);
} else {
task = zv_request_task_create(zvr);
taskq_dispatch_ent(zvol_taskq,
taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
zvol_write_task, task, 0, &task->ent);
}
}
Expand All @@ -631,7 +654,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
zvol_read(&zvr);
} else {
task = zv_request_task_create(zvr);
taskq_dispatch_ent(zvol_taskq,
taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
zvol_read_task, task, 0, &task->ent);
}
}
Expand Down Expand Up @@ -1570,8 +1593,40 @@ zvol_init(void)
zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
}

/*
* Use atleast 32 zvol_threads but for many core system,
* prefer 6 threads per taskq, but no more taskqs
* than threads in them on large systems.
*
* taskq total
* cpus taskqs threads threads
* ------- ------- ------- -------
* 1 1 32 32
* 2 1 32 32
* 4 1 32 32
* 8 2 16 32
* 16 3 11 33
* 32 5 7 35
* 64 8 8 64
* 128 11 12 132
* 256 16 16 256
*/
zv_taskq_t *ztqs = &zvol_taskqs;
uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
if (num_tqs == 0) {
num_tqs = 1 + num_online_cpus() / 6;
while (num_tqs * num_tqs > zvol_actual_threads)
num_tqs--;
}
uint_t per_tq_thread = zvol_actual_threads / num_tqs;
if (per_tq_thread * num_tqs < zvol_actual_threads)
per_tq_thread++;
ztqs->tqs_cnt = num_tqs;
ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
error = register_blkdev(zvol_major, ZVOL_DRIVER);
if (error) {
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
ztqs->tqs_taskq = NULL;
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
return (error);
}
Expand All @@ -1591,11 +1646,22 @@ zvol_init(void)
1024);
}
#endif
zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri,
zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
if (zvol_taskq == NULL) {
unregister_blkdev(zvol_major, ZVOL_DRIVER);
return (-ENOMEM);
for (uint_t i = 0; i < num_tqs; i++) {
char name[32];
(void) snprintf(name, sizeof (name), "%s_tq-%u",
ZVOL_DRIVER, i);
ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
maxclsyspri, per_tq_thread, INT_MAX,
TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
if (ztqs->tqs_taskq[i] == NULL) {
for (int j = i - 1; j >= 0; j--)
taskq_destroy(ztqs->tqs_taskq[j]);
unregister_blkdev(zvol_major, ZVOL_DRIVER);
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
sizeof (taskq_t *));
ztqs->tqs_taskq = NULL;
return (-ENOMEM);
}
}

zvol_init_impl();
Expand All @@ -1606,9 +1672,22 @@ zvol_init(void)
void
zvol_fini(void)
{
zv_taskq_t *ztqs = &zvol_taskqs;
zvol_fini_impl();
unregister_blkdev(zvol_major, ZVOL_DRIVER);
taskq_destroy(zvol_taskq);

if (ztqs->tqs_taskq == NULL) {
ASSERT3U(ztqs->tqs_cnt, ==, 0);
} else {
for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
taskq_destroy(ztqs->tqs_taskq[i]);
}
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
sizeof (taskq_t *));
ztqs->tqs_taskq = NULL;
}

ida_destroy(&zvol_ida);
}

Expand All @@ -1629,6 +1708,9 @@ MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_num_taskqs, uint, 0444);
MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

Expand Down

0 comments on commit 3e5bf3a

Please sign in to comment.