From 3bd4df3841529316e5145590cc67076467b6abb7 Mon Sep 17 00:00:00 2001
From: ednadolski-ix <137826107+ednadolski-ix@users.noreply.github.com>
Date: Mon, 6 Nov 2023 11:38:42 -0700
Subject: [PATCH] Improve ZFS objset sync parallelism

As part of transaction group commit, dsl_pool_sync() sequentially calls
dsl_dataset_sync() for each dirty dataset, which subsequently calls
dmu_objset_sync().  dmu_objset_sync() in turn uses up to 75% of CPU
cores to run sync_dnodes_task() in taskq threads to sync the dirty
dnodes (files).

There are two problems:

1. Each ZVOL in a pool is a separate dataset/objset having a single
   dnode.  This means the objsets are synchronized serially, which
   leads to a bottleneck of ~330K blocks written per second per pool.

2. In the case of multiple dirty dnodes/files on a dataset/objset on a
   big system they will be sync'd in parallel taskq threads. However,
   it is inefficient to to use 75% of CPU cores of a big system to do
   that, because of (a) bottlenecks on a single write issue taskq, and
   (b) allocation throttling.  In addition, if not for the allocation
   throttling sorting write requests by bookmarks (logical address),
   writes for different files may reach space allocators interleaved,
   leading to unwanted fragmentation.

The solution to both problems is to always sync no more and (if
possible) no fewer dnodes at the same time than there are allocators
the pool.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Edmund Nadolski <edmund.nadolski@ixsystems.com>
Closes #15197
---
 include/os/freebsd/spl/sys/taskq.h |   3 +
 include/os/linux/spl/sys/taskq.h   |   2 +
 include/sys/spa.h                  |   5 +
 include/sys/spa_impl.h             |  12 +-
 include/sys/zfs_context.h          |   2 +
 include/sys/zio.h                  |   6 +
 lib/libzpool/taskq.c               |  30 +++++
 man/man4/zfs.4                     |  22 ++--
 module/os/freebsd/spl/spl_taskq.c  |  82 +++++++++++++
 module/os/linux/spl/spl-taskq.c    |  36 ++++++
 module/zfs/dbuf.c                  |   9 +-
 module/zfs/dmu_objset.c            | 138 +++++++++++++++------
 module/zfs/dnode_sync.c            |   1 +
 module/zfs/dsl_dataset.c           |   5 +-
 module/zfs/dsl_pool.c              |  31 ++---
 module/zfs/spa.c                   | 190 +++++++++++++++++++++++------
 module/zfs/spa_misc.c              |  14 ++-
 module/zfs/zio.c                   |  43 ++++---
 18 files changed, 513 insertions(+), 118 deletions(-)

diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h
index b23a939b3aa7..0f23eafe3d4e 100644
--- a/include/os/freebsd/spl/sys/taskq.h
+++ b/include/os/freebsd/spl/sys/taskq.h
@@ -42,6 +42,7 @@ extern "C" {
 
 typedef struct taskq {
 	struct taskqueue	*tq_queue;
+	int			tq_nthreads;
 } taskq_t;
 
 typedef uintptr_t taskqid_t;
@@ -93,6 +94,8 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
     taskq_ent_t *);
 extern int taskq_empty_ent(taskq_ent_t *);
 taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
+taskq_t	*taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
+    kthread_t ***);
 taskq_t	*taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t);
 taskq_t	*taskq_create_proc(const char *, int, pri_t, int, int,
     struct proc *, uint_t);
diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h
index 6c1b4377a98a..aa5860c56e83 100644
--- a/include/os/linux/spl/sys/taskq.h
+++ b/include/os/linux/spl/sys/taskq.h
@@ -150,6 +150,8 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
 extern int taskq_empty_ent(taskq_ent_t *);
 extern void taskq_init_ent(taskq_ent_t *);
 extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
+extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
+    kthread_t ***);
 extern void taskq_destroy(taskq_t *);
 extern void taskq_wait_id(taskq_t *, taskqid_t);
 extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 88ef510b744b..cef7933df441 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -825,6 +825,11 @@ extern void spa_sync_allpools(void);
 
 extern uint_t zfs_sync_pass_deferred_free;
 
+/* spa sync taskqueues */
+taskq_t *spa_sync_tq_create(spa_t *spa, const char *name);
+void spa_sync_tq_destroy(spa_t *spa);
+void spa_select_allocator(zio_t *zio);
+
 /* spa namespace global mutex */
 extern kmutex_t spa_namespace_lock;
 
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 094258d47a48..b1eb06f94fcc 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -188,6 +188,12 @@ typedef struct spa_taskqs {
 	taskq_t **stqs_taskq;
 } spa_taskqs_t;
 
+/* one for each thread in the spa sync taskq */
+typedef struct spa_syncthread_info {
+	kthread_t	*sti_thread;
+	taskq_t		*sti_wr_iss_tq;		/* assigned wr_iss taskq */
+} spa_syncthread_info_t;
+
 typedef enum spa_all_vdev_zap_action {
 	AVZ_ACTION_NONE = 0,
 	AVZ_ACTION_DESTROY,	/* Destroy all per-vdev ZAPs and the AVZ. */
@@ -265,6 +271,10 @@ struct spa {
 	int		spa_alloc_count;
 	int		spa_active_allocator;	/* selectable allocator */
 
+	/* per-allocator sync thread taskqs */
+	taskq_t		*spa_sync_tq;
+	spa_syncthread_info_t *spa_syncthreads;
+
 	spa_aux_vdev_t	spa_spares;		/* hot spares */
 	spa_aux_vdev_t	spa_l2cache;		/* L2ARC cache devices */
 	nvlist_t	*spa_label_features;	/* Features for reading MOS */
@@ -456,7 +466,7 @@ extern char *spa_config_path;
 extern const char *zfs_deadman_failmode;
 extern uint_t spa_slop_shift;
 extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
-    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
+    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio);
 extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q,
     task_func_t *func, void *arg, uint_t flags);
 extern void spa_load_spares(spa_t *spa);
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 750ca612b962..9ec2f73b366c 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -496,6 +496,8 @@ extern taskq_t *system_taskq;
 extern taskq_t *system_delay_taskq;
 
 extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
+extern taskq_t	*taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
+    kthread_t ***);
 #define	taskq_create_proc(a, b, c, d, e, p, f) \
 	    (taskq_create(a, b, c, d, e, f))
 #define	taskq_create_sysdc(a, b, d, e, p, dc, f) \
diff --git a/include/sys/zio.h b/include/sys/zio.h
index e1f4d5c04499..25a4b221f05e 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -223,6 +223,9 @@ typedef uint64_t zio_flag_t;
 #define	ZIO_FLAG_REEXECUTED	(1ULL << 29)
 #define	ZIO_FLAG_DELEGATED	(1ULL << 30)
 
+#define	ZIO_ALLOCATOR_NONE	(-1)
+#define	ZIO_HAS_ALLOCATOR(zio)	((zio)->io_allocator != ZIO_ALLOCATOR_NONE)
+
 #define	ZIO_FLAG_MUSTSUCCEED		0
 #define	ZIO_FLAG_RAW	(ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)
 
@@ -526,6 +529,9 @@ struct zio {
 
 	/* Taskq dispatching state */
 	taskq_ent_t	io_tqent;
+
+	/* write issue taskq selection, based upon sync thread */
+	taskq_t		*io_wr_iss_tq;
 };
 
 enum blk_verify_flag {
diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c
index a2e457ef9e60..99a181ec3c93 100644
--- a/lib/libzpool/taskq.c
+++ b/lib/libzpool/taskq.c
@@ -337,6 +337,36 @@ taskq_destroy(taskq_t *tq)
 	kmem_free(tq, sizeof (taskq_t));
 }
 
+/*
+ * Create a taskq with a specified number of pool threads. Allocate
+ * and return an array of nthreads kthread_t pointers, one for each
+ * thread in the pool. The array is not ordered and must be freed
+ * by the caller.
+ */
+taskq_t *
+taskq_create_synced(const char *name, int nthreads, pri_t pri,
+    int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
+{
+	taskq_t *tq;
+	kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads,
+	    KM_SLEEP);
+
+	(void) pri; (void) minalloc; (void) maxalloc;
+
+	flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
+
+	tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
+	    flags | TASKQ_PREPOPULATE);
+	VERIFY(tq != NULL);
+	VERIFY(tq->tq_nthreads == nthreads);
+
+	for (int i = 0; i < nthreads; i++) {
+		kthreads[i] = tq->tq_threadlist[i];
+	}
+	*ktpp = kthreads;
+	return (tq);
+}
+
 int
 taskq_member(taskq_t *tq, kthread_t *t)
 {
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index ddad00be412d..f9824ac170ea 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -496,6 +496,13 @@ If we have less than this amount of free space,
 most ZPL operations (e.g. write, create) will return
 .Sy ENOSPC .
 .
+.It Sy spa_num_allocators Ns = Ns Sy 4 Pq int
+Determines the number of block alloctators to use per spa instance.
+Capped by the number of actual CPUs in the system.
+.Pp
+Note that setting this value too high could result in performance
+degredation and/or excess fragmentation.
+.
 .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint
 Limits the number of on-disk error log entries that will be converted to the
 new format when enabling the
@@ -1974,13 +1981,6 @@ and may need to load new metaslabs to satisfy these allocations.
 .It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint
 Rewrite new block pointers starting in this pass.
 .
-.It Sy zfs_sync_taskq_batch_pct Ns = Ns Sy 75 Ns % Pq int
-This controls the number of threads used by
-.Sy dp_sync_taskq .
-The default value of
-.Sy 75%
-will create a maximum of one thread per CPU.
-.
 .It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
 Maximum size of TRIM command.
 Larger ranges will be split into chunks no larger than this value before
@@ -2265,6 +2265,14 @@ If
 .Sy 0 ,
 generate a system-dependent value close to 6 threads per taskq.
 .
+.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint
+Determines the number of CPUs to run write issue taskqs.
+.Pp
+When 0 (the default), the value to use is computed internally
+as the number of actual CPUs in the system divided by the
+.Sy spa_num_allocators
+value.
+.
 .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 Do not create zvol device nodes.
 This may slightly improve startup time on
diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c
index 842b80ade1fb..6912b220a94e 100644
--- a/module/os/freebsd/spl/spl_taskq.c
+++ b/module/os/freebsd/spl/spl_taskq.c
@@ -220,6 +220,7 @@ taskq_create_impl(const char *name, int nthreads, pri_t pri,
 		nthreads = MAX((mp_ncpus * nthreads) / 100, 1);
 
 	tq = kmem_alloc(sizeof (*tq), KM_SLEEP);
+	tq->tq_nthreads = nthreads;
 	tq->tq_queue = taskqueue_create(name, M_WAITOK,
 	    taskqueue_thread_enqueue, &tq->tq_queue);
 	taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT,
@@ -254,6 +255,87 @@ taskq_destroy(taskq_t *tq)
 	kmem_free(tq, sizeof (*tq));
 }
 
+static void taskq_sync_assign(void *arg);
+
+typedef struct taskq_sync_arg {
+	kthread_t	*tqa_thread;
+	kcondvar_t	tqa_cv;
+	kmutex_t 	tqa_lock;
+	int		tqa_ready;
+} taskq_sync_arg_t;
+
+static void
+taskq_sync_assign(void *arg)
+{
+	taskq_sync_arg_t *tqa = arg;
+
+	mutex_enter(&tqa->tqa_lock);
+	tqa->tqa_thread = curthread;
+	tqa->tqa_ready = 1;
+	cv_signal(&tqa->tqa_cv);
+	while (tqa->tqa_ready == 1)
+		cv_wait(&tqa->tqa_cv, &tqa->tqa_lock);
+	mutex_exit(&tqa->tqa_lock);
+}
+
+/*
+ * Create a taskq with a specified number of pool threads. Allocate
+ * and return an array of nthreads kthread_t pointers, one for each
+ * thread in the pool. The array is not ordered and must be freed
+ * by the caller.
+ */
+taskq_t *
+taskq_create_synced(const char *name, int nthreads, pri_t pri,
+    int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
+{
+	taskq_t *tq;
+	taskq_sync_arg_t *tqs = kmem_zalloc(sizeof (*tqs) * nthreads, KM_SLEEP);
+	kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads,
+	    KM_SLEEP);
+
+	flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
+
+	tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
+	    flags | TASKQ_PREPOPULATE);
+	VERIFY(tq != NULL);
+	VERIFY(tq->tq_nthreads == nthreads);
+
+	/* spawn all syncthreads */
+	for (int i = 0; i < nthreads; i++) {
+		cv_init(&tqs[i].tqa_cv, NULL, CV_DEFAULT, NULL);
+		mutex_init(&tqs[i].tqa_lock, NULL, MUTEX_DEFAULT, NULL);
+		(void) taskq_dispatch(tq, taskq_sync_assign,
+		    &tqs[i], TQ_FRONT);
+	}
+
+	/* wait on all syncthreads to start */
+	for (int i = 0; i < nthreads; i++) {
+		mutex_enter(&tqs[i].tqa_lock);
+		while (tqs[i].tqa_ready == 0)
+			cv_wait(&tqs[i].tqa_cv, &tqs[i].tqa_lock);
+		mutex_exit(&tqs[i].tqa_lock);
+	}
+
+	/* let all syncthreads resume, finish */
+	for (int i = 0; i < nthreads; i++) {
+		mutex_enter(&tqs[i].tqa_lock);
+		tqs[i].tqa_ready = 2;
+		cv_broadcast(&tqs[i].tqa_cv);
+		mutex_exit(&tqs[i].tqa_lock);
+	}
+	taskq_wait(tq);
+
+	for (int i = 0; i < nthreads; i++) {
+		kthreads[i] = tqs[i].tqa_thread;
+		mutex_destroy(&tqs[i].tqa_lock);
+		cv_destroy(&tqs[i].tqa_cv);
+	}
+	kmem_free(tqs, sizeof (*tqs) * nthreads);
+
+	*ktpp = kthreads;
+	return (tq);
+}
+
 int
 taskq_member(taskq_t *tq, kthread_t *thread)
 {
diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c
index d18f935b167c..79a1a8e5a5aa 100644
--- a/module/os/linux/spl/spl-taskq.c
+++ b/module/os/linux/spl/spl-taskq.c
@@ -1262,6 +1262,42 @@ taskq_destroy(taskq_t *tq)
 }
 EXPORT_SYMBOL(taskq_destroy);
 
+/*
+ * Create a taskq with a specified number of pool threads. Allocate
+ * and return an array of nthreads kthread_t pointers, one for each
+ * thread in the pool. The array is not ordered and must be freed
+ * by the caller.
+ */
+taskq_t *
+taskq_create_synced(const char *name, int nthreads, pri_t pri,
+    int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
+{
+	taskq_t *tq;
+	taskq_thread_t *tqt;
+	int i = 0;
+	kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads,
+	    KM_SLEEP);
+
+	flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
+
+	/* taskq_create spawns all the threads before returning */
+	tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
+	    flags | TASKQ_PREPOPULATE);
+	VERIFY(tq != NULL);
+	VERIFY(tq->tq_nthreads == nthreads);
+
+	list_for_each_entry(tqt, &tq->tq_thread_list, tqt_thread_list) {
+		kthreads[i] = tqt->tqt_thread;
+		i++;
+	}
+
+	ASSERT3S(i, ==, nthreads);
+	*ktpp = kthreads;
+
+	return (tq);
+}
+EXPORT_SYMBOL(taskq_create_synced);
+
 static unsigned int spl_taskq_kick = 0;
 
 /*
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index c0c2692c113a..0a179fffb16a 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -4587,6 +4587,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	}
 }
 
+/*
+ * Syncs out a range of dirty records for indirect or leaf dbufs.  May be
+ * called recursively from dbuf_sync_indirect().
+ */
 void
 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
@@ -5005,7 +5009,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 }
 
 
-/* Issue I/O to commit a dirty buffer to disk. */
+/*
+ * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
+ * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio).
+ */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 76e65b5506a9..f098e1daa44b 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1639,28 +1639,90 @@ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
 	kmem_free(bp, sizeof (*bp));
 }
 
+typedef struct sync_objset_arg {
+	zio_t		*soa_zio;
+	objset_t	*soa_os;
+	dmu_tx_t	*soa_tx;
+	kmutex_t	soa_mutex;
+	int		soa_count;
+	taskq_ent_t	soa_tq_ent;
+} sync_objset_arg_t;
+
 typedef struct sync_dnodes_arg {
-	multilist_t *sda_list;
-	int sda_sublist_idx;
-	multilist_t *sda_newlist;
-	dmu_tx_t *sda_tx;
+	multilist_t	*sda_list;
+	int		sda_sublist_idx;
+	multilist_t	*sda_newlist;
+	sync_objset_arg_t *sda_soa;
 } sync_dnodes_arg_t;
 
+static void sync_meta_dnode_task(void *arg);
+
 static void
 sync_dnodes_task(void *arg)
 {
 	sync_dnodes_arg_t *sda = arg;
+	sync_objset_arg_t *soa = sda->sda_soa;
+	objset_t *os = soa->soa_os;
 
 	multilist_sublist_t *ms =
 	    multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
 
-	dmu_objset_sync_dnodes(ms, sda->sda_tx);
+	dmu_objset_sync_dnodes(ms, soa->soa_tx);
 
 	multilist_sublist_unlock(ms);
 
 	kmem_free(sda, sizeof (*sda));
+
+	mutex_enter(&soa->soa_mutex);
+	ASSERT(soa->soa_count != 0);
+	if (--soa->soa_count != 0) {
+		mutex_exit(&soa->soa_mutex);
+		return;
+	}
+	mutex_exit(&soa->soa_mutex);
+
+	taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
+	    sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
 }
 
+/*
+ * Issue the zio_nowait() for all dirty record zios on the meta dnode,
+ * then trigger the callback for the zil_sync. This runs once for each
+ * objset, only after any/all sublists in the objset have been synced.
+ */
+static void
+sync_meta_dnode_task(void *arg)
+{
+	sync_objset_arg_t *soa = arg;
+	objset_t *os = soa->soa_os;
+	dmu_tx_t *tx = soa->soa_tx;
+	int txgoff = tx->tx_txg & TXG_MASK;
+	dbuf_dirty_record_t *dr;
+
+	ASSERT0(soa->soa_count);
+
+	list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
+	while ((dr = list_remove_head(list)) != NULL) {
+		ASSERT0(dr->dr_dbuf->db_level);
+		zio_nowait(dr->dr_zio);
+	}
+
+	/* Enable dnode backfill if enough objects have been freed. */
+	if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
+		os->os_rescan_dnodes = B_TRUE;
+		os->os_freed_dnodes = 0;
+	}
+
+	/*
+	 * Free intent log blocks up to this tx.
+	 */
+	zil_sync(os->os_zil, tx);
+	os->os_phys->os_zil_header = os->os_zil_header;
+	zio_nowait(soa->soa_zio);
+
+	mutex_destroy(&soa->soa_mutex);
+	kmem_free(soa, sizeof (*soa));
+}
 
 /* called from dsl */
 void
@@ -1670,8 +1732,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
-	list_t *list;
-	dbuf_dirty_record_t *dr;
 	int num_sublists;
 	multilist_t *ml;
 	blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
@@ -1758,39 +1818,49 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 		    offsetof(dnode_t, dn_dirty_link[txgoff]));
 	}
 
+	/*
+	 * zio_nowait(zio) is done after any/all sublist and meta dnode
+	 * zios have been nowaited, and the zil_sync() has been performed.
+	 * The soa is freed at the end of sync_meta_dnode_task.
+	 */
+	sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP);
+	soa->soa_zio = zio;
+	soa->soa_os = os;
+	soa->soa_tx = tx;
+	taskq_init_ent(&soa->soa_tq_ent);
+	mutex_init(&soa->soa_mutex, NULL, MUTEX_DEFAULT, NULL);
+
 	ml = &os->os_dirty_dnodes[txgoff];
-	num_sublists = multilist_get_num_sublists(ml);
+	soa->soa_count = num_sublists = multilist_get_num_sublists(ml);
+
 	for (int i = 0; i < num_sublists; i++) {
 		if (multilist_sublist_is_empty_idx(ml, i))
-			continue;
-		sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
-		sda->sda_list = ml;
-		sda->sda_sublist_idx = i;
-		sda->sda_tx = tx;
-		(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
-		    sync_dnodes_task, sda, 0);
-		/* callback frees sda */
-	}
-	taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
-
-	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
-	while ((dr = list_remove_head(list)) != NULL) {
-		ASSERT0(dr->dr_dbuf->db_level);
-		zio_nowait(dr->dr_zio);
+			soa->soa_count--;
 	}
 
-	/* Enable dnode backfill if enough objects have been freed. */
-	if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
-		os->os_rescan_dnodes = B_TRUE;
-		os->os_freed_dnodes = 0;
+	if (soa->soa_count == 0) {
+		taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
+		    sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
+	} else {
+		/*
+		 * Sync sublists in parallel. The last to finish
+		 * (i.e., when soa->soa_count reaches zero) must
+		 *  dispatch sync_meta_dnode_task.
+		 */
+		for (int i = 0; i < num_sublists; i++) {
+			if (multilist_sublist_is_empty_idx(ml, i))
+				continue;
+			sync_dnodes_arg_t *sda =
+			    kmem_alloc(sizeof (*sda), KM_SLEEP);
+			sda->sda_list = ml;
+			sda->sda_sublist_idx = i;
+			sda->sda_soa = soa;
+			(void) taskq_dispatch(
+			    dmu_objset_pool(os)->dp_sync_taskq,
+			    sync_dnodes_task, sda, 0);
+			/* sync_dnodes_task frees sda */
+		}
 	}
-
-	/*
-	 * Free intent log blocks up to this tx.
-	 */
-	zil_sync(os->os_zil, tx);
-	os->os_phys->os_zil_header = os->os_zil_header;
-	zio_nowait(zio);
 }
 
 boolean_t
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 8e39af83bb0a..8cffbdb9d20b 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -627,6 +627,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
 
 /*
  * Write out the dnode's dirty buffers.
+ * Does not wait for zio completions.
  */
 void
 dnode_sync(dnode_t *dn, dmu_tx_t *tx)
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index d6db61729223..62a1649d3786 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -2069,8 +2069,9 @@ dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
 	return (error);
 }
 
+/* Nonblocking dataset sync. Assumes dataset:objset is always 1:1 */
 void
-dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *rio, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(ds->ds_objset != NULL);
@@ -2098,7 +2099,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 		ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
 	}
 
-	dmu_objset_sync(ds->ds_objset, zio, tx);
+	dmu_objset_sync(ds->ds_objset, rio, tx);
 }
 
 /*
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 17b971248283..370c6a010dca 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -140,11 +140,6 @@ uint_t zfs_delay_min_dirty_percent = 60;
  */
 uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
 
-/*
- * This determines the number of threads used by the dp_sync_taskq.
- */
-static int zfs_sync_taskq_batch_pct = 75;
-
 /*
  * These tunables determine the behavior of how zil_itxg_clean() is
  * called via zil_clean() in the context of spa_sync(). When an itxg
@@ -214,9 +209,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	txg_list_create(&dp->dp_early_sync_tasks, spa,
 	    offsetof(dsl_sync_task_t, dst_node));
 
-	dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
-	    zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
-	    TASKQ_THREADS_CPU_PCT);
+	dp->dp_sync_taskq = spa_sync_tq_create(spa, "dp_sync_taskq");
 
 	dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
 	    zfs_zil_clean_taskq_nthr_pct, minclsyspri,
@@ -409,7 +402,7 @@ dsl_pool_close(dsl_pool_t *dp)
 	txg_list_destroy(&dp->dp_dirty_dirs);
 
 	taskq_destroy(dp->dp_zil_clean_taskq);
-	taskq_destroy(dp->dp_sync_taskq);
+	spa_sync_tq_destroy(dp->dp_spa);
 
 	/*
 	 * We can't set retry to TRUE since we're explicitly specifying
@@ -674,7 +667,7 @@ dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
-	zio_t *zio;
+	zio_t *rio;	/* root zio for all dirty dataset syncs */
 	dmu_tx_t *tx;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
@@ -704,9 +697,10 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	}
 
 	/*
-	 * Write out all dirty blocks of dirty datasets.
+	 * Write out all dirty blocks of dirty datasets. Note, this could
+	 * create a very large (+10k) zio tree.
 	 */
-	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		/*
 		 * We must not sync any non-MOS datasets twice, because
@@ -715,9 +709,9 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 		 */
 		ASSERT(!list_link_active(&ds->ds_synced_link));
 		list_insert_tail(&synced_datasets, ds);
-		dsl_dataset_sync(ds, zio, tx);
+		dsl_dataset_sync(ds, rio, tx);
 	}
-	VERIFY0(zio_wait(zio));
+	VERIFY0(zio_wait(rio));
 
 	/*
 	 * Update the long range free counter after
@@ -748,13 +742,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	 * user accounting information (and we won't get confused
 	 * about which blocks are part of the snapshot).
 	 */
-	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		objset_t *os = ds->ds_objset;
 
 		ASSERT(list_link_active(&ds->ds_synced_link));
 		dmu_buf_rele(ds->ds_dbuf, ds);
-		dsl_dataset_sync(ds, zio, tx);
+		dsl_dataset_sync(ds, rio, tx);
 
 		/*
 		 * Release any key mappings created by calls to
@@ -767,7 +761,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 			key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
 		}
 	}
-	VERIFY0(zio_wait(zio));
+	VERIFY0(zio_wait(rio));
 
 	/*
 	 * Now that the datasets have been completely synced, we can
@@ -1481,9 +1475,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW,
 	"How quickly delay approaches infinity");
 
-ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW,
-	"Max percent of CPUs that are used to sync dirty data");
-
 ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW,
 	"Max percent of CPUs that are used per dp_sync_taskq");
 
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index aa97144f16e4..68f367c1c744 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -99,6 +99,7 @@
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
+#include <cityhash.h>
 
 /*
  * spa_thread() existed on Illumos as a parent thread for the various worker
@@ -128,16 +129,16 @@ int zfs_ccw_retry_interval = 300;
 
 typedef enum zti_modes {
 	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
-	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
 	ZTI_MODE_SCALE,			/* Taskqs scale with CPUs. */
+	ZTI_MODE_SYNC,			/* sync thread assigned */
 	ZTI_MODE_NULL,			/* don't create a taskq */
 	ZTI_NMODES
 } zti_modes_t;
 
 #define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
 #define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
-#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
 #define	ZTI_SCALE	{ ZTI_MODE_SCALE, 0, 1 }
+#define	ZTI_SYNC	{ ZTI_MODE_SYNC, 0, 1 }
 #define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
 
 #define	ZTI_N(n)	ZTI_P(n, 1)
@@ -158,14 +159,14 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
  * initializing a pool, we use this table to create an appropriately sized
  * taskq. Some operations are low volume and therefore have a small, static
  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
- * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macros. Other operations process a large amount of data; the ZTI_SCALE
  * macro causes us to create a taskq oriented for throughput. Some operations
  * are so high frequency and short-lived that the taskq itself can become a
  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  * additional degree of parallelism specified by the number of threads per-
  * taskq and the number of taskqs; when dispatching an event in this case, the
- * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH,
- * but with number of taskqs also scaling with number of CPUs.
+ * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
+ * that scales with the number of CPUs.
  *
  * The different taskq priorities are to handle the different contexts (issue
  * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
@@ -175,7 +176,7 @@ static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
 	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* READ */
-	{ ZTI_BATCH,	ZTI_N(5),	ZTI_SCALE,	ZTI_N(5) }, /* WRITE */
+	{ ZTI_SYNC,	ZTI_N(5),	ZTI_SCALE,	ZTI_N(5) }, /* WRITE */
 	{ ZTI_SCALE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
@@ -206,6 +207,8 @@ static const uint_t	zio_taskq_basedc = 80;	  /* base duty cycle */
 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
 #endif
 
+static uint_t	zio_taskq_wr_iss_ncpus = 0;
+
 /*
  * Report any spa_load_verify errors found, but do not fail spa_load.
  * This is used by zdb to analyze non-idle pools.
@@ -1054,21 +1057,34 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 	uint_t count = ztip->zti_count;
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	uint_t cpus, flags = TASKQ_DYNAMIC;
-#ifdef HAVE_SYSDC
-	boolean_t batch = B_FALSE;
-#endif
 
 	switch (mode) {
 	case ZTI_MODE_FIXED:
 		ASSERT3U(value, >, 0);
 		break;
 
-	case ZTI_MODE_BATCH:
-#ifdef HAVE_SYSDC
-		batch = B_TRUE;
-#endif
+	case ZTI_MODE_SYNC:
+
+		/*
+		 * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
+		 * not to exceed the number of spa allocators.
+		 */
+		if (zio_taskq_wr_iss_ncpus == 0) {
+			count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
+		} else {
+			count = MAX(1,
+			    boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
+		}
+		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
+		count = MIN(count, spa->spa_alloc_count);
+
+		/*
+		 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
+		 * single taskq may have more threads than 100% of online cpus.
+		 */
+		value = (zio_taskq_batch_pct + count / 2) / count;
+		value = MIN(value, 100);
 		flags |= TASKQ_THREADS_CPU_PCT;
-		value = MIN(zio_taskq_batch_pct, 100);
 		break;
 
 	case ZTI_MODE_SCALE:
@@ -1115,7 +1131,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 
 	default:
 		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
-		    "spa_activate()",
+		    "spa_taskqs_init()",
 		    zio_type_name[t], zio_taskq_types[q], mode, value);
 		break;
 	}
@@ -1137,9 +1153,6 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 
 #ifdef HAVE_SYSDC
 		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
-			if (batch)
-				flags |= TASKQ_DC_BATCH;
-
 			(void) zio_taskq_basedc;
 			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
 			    spa->spa_proc, zio_taskq_basedc, flags);
@@ -1200,12 +1213,11 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 /*
  * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
  * Note that a type may have multiple discrete taskqs to avoid lock contention
- * on the taskq itself. In that case we choose which taskq at random by using
- * the low bits of gethrtime().
+ * on the taskq itself.
  */
-void
-spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
-    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
+static taskq_t *
+spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+    zio_t *zio)
 {
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	taskq_t *tq;
@@ -1213,12 +1225,27 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
 	ASSERT3P(tqs->stqs_taskq, !=, NULL);
 	ASSERT3U(tqs->stqs_count, !=, 0);
 
+	if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+	    (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
+		/* dispatch to assigned write issue taskq */
+		tq = zio->io_wr_iss_tq;
+		return (tq);
+	}
+
 	if (tqs->stqs_count == 1) {
 		tq = tqs->stqs_taskq[0];
 	} else {
 		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
 	}
+	return (tq);
+}
 
+void
+spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent,
+    zio_t *zio)
+{
+	taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio);
 	taskq_dispatch_ent(tq, func, arg, flags, ent);
 }
 
@@ -1229,20 +1256,8 @@ void
 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
     task_func_t *func, void *arg, uint_t flags)
 {
-	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
-	taskq_t *tq;
-	taskqid_t id;
-
-	ASSERT3P(tqs->stqs_taskq, !=, NULL);
-	ASSERT3U(tqs->stqs_count, !=, 0);
-
-	if (tqs->stqs_count == 1) {
-		tq = tqs->stqs_taskq[0];
-	} else {
-		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
-	}
-
-	id = taskq_dispatch(tq, func, arg, flags);
+	taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL);
+	taskqid_t id = taskq_dispatch(tq, func, arg, flags);
 	if (id)
 		taskq_wait_id(tq, id);
 }
@@ -9649,6 +9664,104 @@ spa_sync_allpools(void)
 	mutex_exit(&spa_namespace_lock);
 }
 
+taskq_t *
+spa_sync_tq_create(spa_t *spa, const char *name)
+{
+	kthread_t **kthreads;
+
+	ASSERT(spa->spa_sync_tq == NULL);
+	ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus);
+
+	/*
+	 * - do not allow more allocators than cpus.
+	 * - there may be more cpus than allocators.
+	 * - do not allow more sync taskq threads than allocators or cpus.
+	 */
+	int nthreads = spa->spa_alloc_count;
+	spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) *
+	    nthreads, KM_SLEEP);
+
+	spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri,
+	    nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads);
+	VERIFY(spa->spa_sync_tq != NULL);
+	VERIFY(kthreads != NULL);
+
+	spa_taskqs_t *tqs =
+	    &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
+
+	spa_syncthread_info_t *ti = spa->spa_syncthreads;
+	for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
+		ti->sti_thread = kthreads[i];
+		if (w == tqs->stqs_count) {
+			w = 0;
+		}
+		ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
+	}
+
+	kmem_free(kthreads, sizeof (*kthreads) * nthreads);
+	return (spa->spa_sync_tq);
+}
+
+void
+spa_sync_tq_destroy(spa_t *spa)
+{
+	ASSERT(spa->spa_sync_tq != NULL);
+
+	taskq_wait(spa->spa_sync_tq);
+	taskq_destroy(spa->spa_sync_tq);
+	kmem_free(spa->spa_syncthreads,
+	    sizeof (spa_syncthread_info_t) * spa->spa_alloc_count);
+	spa->spa_sync_tq = NULL;
+}
+
+void
+spa_select_allocator(zio_t *zio)
+{
+	zbookmark_phys_t *bm = &zio->io_bookmark;
+	spa_t *spa = zio->io_spa;
+
+	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+	/*
+	 * A gang block (for example) may have inherited its parent's
+	 * allocator, in which case there is nothing further to do here.
+	 */
+	if (ZIO_HAS_ALLOCATOR(zio))
+		return;
+
+	ASSERT(spa != NULL);
+	ASSERT(bm != NULL);
+
+	/*
+	 * First try to use an allocator assigned to the syncthread, and set
+	 * the corresponding write issue taskq for the allocator.
+	 * Note, we must have an open pool to do this.
+	 */
+	if (spa->spa_sync_tq != NULL) {
+		spa_syncthread_info_t *ti = spa->spa_syncthreads;
+		for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
+			if (ti->sti_thread == curthread) {
+				zio->io_allocator = i;
+				zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
+				return;
+			}
+		}
+	}
+
+	/*
+	 * We want to try to use as many allocators as possible to help improve
+	 * performance, but we also want logically adjacent IOs to be physically
+	 * adjacent to improve sequential read performance. We chunk each object
+	 * into 2^20 block regions, and then hash based on the objset, object,
+	 * level, and region to accomplish both of these goals.
+	 */
+	uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
+	    bm->zb_blkid >> 20);
+
+	zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
+	zio->io_wr_iss_tq = NULL;
+}
+
 /*
  * ==========================================================================
  * Miscellaneous routines
@@ -10242,3 +10355,6 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
 	"Whether extra ALLOC blkptrs were added to a livelist entry while it "
 	"was being condensed");
 /* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
+	"Number of CPUs to run write issue taskqs");
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index c7472f972cc2..3990af98c732 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -388,7 +388,11 @@ uint_t spa_asize_inflation = 24;
 uint_t spa_slop_shift = 5;
 static const uint64_t spa_min_slop = 128ULL * 1024 * 1024;
 static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
-static const int spa_allocators = 4;
+
+/*
+ * Number of allocators to use, per spa instance
+ */
+static int spa_num_allocators = 4;
 
 /*
  * Spa active allocator.
@@ -730,7 +734,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	if (altroot)
 		spa->spa_root = spa_strdup(altroot);
 
-	spa->spa_alloc_count = spa_allocators;
+	/* Do not allow more allocators than CPUs. */
+	spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus);
+
 	spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
 	    sizeof (spa_alloc_t), KM_SLEEP);
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
@@ -739,6 +745,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 		avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
 		    sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
 	}
+
 	avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
 	avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg,
@@ -3009,3 +3016,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
 	param_get_uint, ZMOD_RW, "Reserved free space in pool");
+
+ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW,
+	"Number of allocators per spa, capped by ncpus");
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 4eb276352a23..2f5b423ee72e 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -899,6 +899,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 	zio->io_pipeline_trace = ZIO_STAGE_OPEN;
+	zio->io_allocator = ZIO_ALLOCATOR_NONE;
 
 	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) ||
 	    (pipeline & ZIO_STAGE_READY) == 0;
@@ -2007,7 +2008,7 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
 	 */
 	ASSERT(taskq_empty_ent(&zio->io_tqent));
 	spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags,
-	    &zio->io_tqent);
+	    &zio->io_tqent, zio);
 }
 
 static boolean_t
@@ -2032,8 +2033,8 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
 static zio_t *
 zio_issue_async(zio_t *zio)
 {
+	ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio));
 	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
-
 	return (NULL);
 }
 
@@ -2347,6 +2348,9 @@ zio_wait(zio_t *zio)
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
 
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		spa_select_allocator(zio);
+	}
 	__zio_execute(zio);
 
 	mutex_enter(&zio->io_lock);
@@ -2399,6 +2403,9 @@ zio_nowait(zio_t *zio)
 
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		spa_select_allocator(zio);
+	}
 	__zio_execute(zio);
 }
 
@@ -2863,6 +2870,13 @@ zio_gang_issue(zio_t *zio)
 	return (zio);
 }
 
+static void
+zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
+{
+	cio->io_allocator = pio->io_allocator;
+	cio->io_wr_iss_tq = pio->io_wr_iss_tq;
+}
+
 static void
 zio_write_gang_member_ready(zio_t *zio)
 {
@@ -2934,6 +2948,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 		gbh_copies = MIN(2, spa_max_replication(spa));
 	}
 
+	ASSERT(ZIO_HAS_ALLOCATOR(pio));
 	int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
 	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 		ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
@@ -2997,6 +3012,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 	    zio_write_gang_done, NULL, pio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
+	zio_gang_inherit_allocator(pio, zio);
+
 	/*
 	 * Create and nowait the gang children.
 	 */
@@ -3027,6 +3044,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
+		zio_gang_inherit_allocator(zio, cio);
+
 		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(has_data);
@@ -3539,6 +3558,7 @@ zio_io_to_allocate(spa_t *spa, int allocator)
 		return (NULL);
 
 	ASSERT(IO_IS_ALLOCATING(zio));
+	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 	/*
 	 * Try to place a reservation for this zio. If we're unable to
@@ -3575,21 +3595,12 @@ zio_dva_throttle(zio_t *zio)
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
 
-	zbookmark_phys_t *bm = &zio->io_bookmark;
-	/*
-	 * We want to try to use as many allocators as possible to help improve
-	 * performance, but we also want logically adjacent IOs to be physically
-	 * adjacent to improve sequential read performance. We chunk each object
-	 * into 2^20 block regions, and then hash based on the objset, object,
-	 * level, and region to accomplish both of these goals.
-	 */
-	int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object,
-	    bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
-	zio->io_allocator = allocator;
+	int allocator = zio->io_allocator;
 	zio->io_metaslab_class = mc;
 	mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
 	avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
@@ -3663,6 +3674,7 @@ zio_dva_allocate(zio_t *zio)
 	 * sync write performance.  If a log allocation fails, we will fall
 	 * back to spa_sync() which is abysmal for performance.
 	 */
+	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
 	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
 	    &zio->io_alloc_list, zio, zio->io_allocator);
@@ -4515,6 +4527,7 @@ zio_ready(zio_t *zio)
 			ASSERT(IO_IS_ALLOCATING(zio));
 			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(zio->io_metaslab_class != NULL);
+			ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 			/*
 			 * We were unable to allocate anything, unreserve and
@@ -4601,6 +4614,7 @@ zio_dva_throttle_done(zio_t *zio)
 	}
 
 	ASSERT(IO_IS_ALLOCATING(pio));
+	ASSERT(ZIO_HAS_ALLOCATOR(pio));
 	ASSERT3P(zio, !=, zio->io_logical);
 	ASSERT(zio->io_logical != NULL);
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
@@ -4663,6 +4677,7 @@ zio_done(zio_t *zio)
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(zio->io_bp != NULL);
+		ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 		metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
 		    zio->io_allocator);
@@ -4928,7 +4943,7 @@ zio_done(zio_t *zio)
 			ASSERT(taskq_empty_ent(&zio->io_tqent));
 			spa_taskq_dispatch_ent(zio->io_spa,
 			    ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
-			    zio_reexecute, zio, 0, &zio->io_tqent);
+			    zio_reexecute, zio, 0, &zio->io_tqent, NULL);
 		}
 		return (NULL);
 	}