From 0e8a142837469a31a3d4fc556aa7ef8242abe96a Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Thu, 1 Jun 2023 15:40:31 -0400
Subject: [PATCH] Another set of vdev queue optimizations.

Switch the FIFO queues (SYNC/TRIM) and the active queue of the vdev
queue from time-sorted AVL-trees to simple lists.  AVL-trees are too
expensive for such a simple task.  To allow changing I/O priority
without searching through the trees, add an io_queue_state field to
struct zio.

To avoid checking the number of queued I/Os for each priority, add a
vq_cqueued bitmap to struct vdev_queue and update it when adding or
removing I/Os.  Make vq_cactive a separate array instead of a struct
vdev_queue_class member.  Together these avoid many cache misses when
looking for work in vdev_queue_class_to_issue().

Introduce a deadline of ~0.5s for the LBA-sorted queues.  Before this
change I saw some I/Os waiting in a queue for up to 8 seconds and
possibly more due to starvation.  With this change I no longer see it.
I had to make the comparison function slightly more complicated, but
since it uses the same cache lines the difference is minimal.  For
sequential I/Os the new code in vdev_queue_io_to_issue() often uses the
simpler avl_first(), falling back to avl_find() and avl_nearest() only
when needed.

Arrange the members of struct zio so that only one cache line is
accessed when searching through the vdev queues.  While there, remove
io_alloc_node and reuse io_queue_node instead; the two are never used
at the same time.

Remove the zfs_vdev_aggregate_trim parameter.  It has been disabled
since it was implemented 4 years ago, yet we still wasted time
maintaining the offset-sorted tree of TRIM requests.  Just remove the
tree.

Remove locking from txg_all_lists_empty().  It is racy by design, and
the two pairs of lock/unlock operations take noticeable time under the
vdev queue lock.

With these changes, in my tests with volblocksize=4KB I measure a 50%
reduction in vdev queue lock spin time on reads and 75% on writes.

Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
---
 include/sys/vdev.h      |   3 +-
 include/sys/vdev_impl.h |  17 +--
 include/sys/zio.h       |  15 +-
 man/man4/zfs.4          |   6 -
 module/zfs/spa_misc.c   |   2 +-
 module/zfs/txg.c        |  13 +-
 module/zfs/vdev.c       |  16 +--
 module/zfs/vdev_queue.c | 305 ++++++++++++++++++++++------------------
 8 files changed, 205 insertions(+), 172 deletions(-)

diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 26c834ff57cf..03e1f438aaf9 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -164,8 +164,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
 extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);

-extern int vdev_queue_length(vdev_t *vd);
+extern uint32_t vdev_queue_length(vdev_t *vd);
 extern uint64_t vdev_queue_last_offset(vdev_t *vd);
+extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);

 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 74b3737d8ee5..2b22b973ba49 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -130,27 +130,24 @@ typedef const struct vdev_ops {
 /*
  * Virtual device properties
  */
-typedef struct vdev_queue_class {
- uint32_t vqc_active;
-
- /*
- * Sorted by offset or timestamp, depending on if the queue is
- * LBA-ordered vs FIFO.
- */ - avl_tree_t vqc_queued_tree; +typedef union vdev_queue_class { + list_t vqc_list; + avl_tree_t vqc_tree; } vdev_queue_class_t; struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; - avl_tree_t vq_active_tree; avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; - avl_tree_t vq_trim_offset_tree; uint64_t vq_last_offset; zio_priority_t vq_last_prio; /* Last sent I/O priority. */ + uint32_t vq_cqueued; /* Classes with queued I/Os. */ + uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE]; + uint32_t vq_active; /* Number of active I/Os. */ uint32_t vq_ia_active; /* Active interactive I/Os. */ uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */ + list_t vq_active_list; /* List of active I/Os. */ hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 6b1352a72b9a..1e5163976bc9 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -436,6 +436,12 @@ typedef struct zio_link { list_node_t zl_child_node; } zio_link_t; +enum zio_qstate { + ZIO_QS_NONE = 0, + ZIO_QS_QUEUED, + ZIO_QS_ACTIVE, +}; + struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; @@ -480,6 +486,12 @@ struct zio { const zio_vsd_ops_t *io_vsd_ops; metaslab_class_t *io_metaslab_class; /* dva throttle class */ + enum zio_qstate io_queue_state; /* vdev queue state */ + union { + list_node_t l; + avl_node_t a; + } io_queue_node ____cacheline_aligned; /* allocator and vdev queues */ + avl_node_t io_offset_node; /* vdev offset queues */ uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_queued_timestamp; @@ -487,9 +499,6 @@ struct zio { hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). */ - avl_node_t io_queue_node; - avl_node_t io_offset_node; - avl_node_t io_alloc_node; zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5fbd9d7db93f..04bbbc5fdf59 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2016,12 +2016,6 @@ Historical statistics for this many latest TXGs will be available in Flush dirty data to disk at least every this many seconds (maximum TXG duration). . -.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint -Allow TRIM I/O operations to be aggregated. -This is normally not helpful because the extents to be trimmed -will have been already been aggregated by the metaslab. -This option is provided for debugging and performance analysis. -. .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint Max vdev I/O aggregation size. . 
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 9ef948e9e434..8dc83445e198 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index ec61cabcaab2..a67c043446f5 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl) boolean_t txg_all_lists_empty(txg_list_t *tl) { - mutex_enter(&tl->tl_lock); - for (int i = 0; i < TXG_SIZE; i++) { - if (!txg_list_empty_impl(tl, i)) { - mutex_exit(&tl->tl_lock); - return (B_FALSE); - } - } - mutex_exit(&tl->tl_lock); - return (B_TRUE); + boolean_t res = B_TRUE; + for (int i = 0; i < TXG_SIZE; i++) + res &= (tl->tl_head[i] == NULL); + return (res); } /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 612e66c3a8a8..30551feb6322 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4608,11 +4608,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); - for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { - vsx->vsx_active_queue[t] = - vd->vdev_queue.vq_class[t].vqc_active; - vsx->vsx_pend_queue[t] = avl_numnodes( - &vd->vdev_queue.vq_class[t].vqc_queued_tree); + for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { + vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; + vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } @@ -5470,20 +5468,20 @@ vdev_deadman(vdev_t *vd, const char *tag) vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); - if (avl_numnodes(&vq->vq_active_tree) > 0) { + if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; - zfs_dbgmsg("slow vdev: %s has %lu active IOs", - vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); + zfs_dbgmsg("slow vdev: %s has %u active IOs", + vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ - fio = avl_first(&vq->vq_active_tree); + fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index abb7d0662b8c..08d918467d03 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -228,13 +228,6 @@ uint_t zfs_vdev_queue_depth_pct = 300; */ uint_t zfs_vdev_def_queue_depth = 32; -/* - * Allow TRIM I/Os to be aggregated. This should normally not be needed since - * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted - * by the TRIM code in zfs_trim.c. 
- */ -static uint_t zfs_vdev_aggregate_trim = 0; - static int vdev_queue_offset_compare(const void *x1, const void *x2) { @@ -249,38 +242,60 @@ vdev_queue_offset_compare(const void *x1, const void *x2) return (TREE_PCMP(z1, z2)); } -static inline avl_tree_t * -vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) -{ - return (&vq->vq_class[p].vqc_queued_tree); -} - -static inline avl_tree_t * -vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) -{ - ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM); - if (t == ZIO_TYPE_READ) - return (&vq->vq_read_offset_tree); - else if (t == ZIO_TYPE_WRITE) - return (&vq->vq_write_offset_tree); - else - return (&vq->vq_trim_offset_tree); -} +#define VDQ_T_SHIFT 29 static int -vdev_queue_timestamp_compare(const void *x1, const void *x2) +vdev_queue_to_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); + int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT, + z2->io_timestamp >> VDQ_T_SHIFT); + int ocmp = TREE_CMP(z1->io_offset, z2->io_offset); + int cmp = tcmp ? tcmp : ocmp; - if (likely(cmp)) + if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE))) return (cmp); return (TREE_PCMP(z1, z2)); } +static inline boolean_t +vdev_queue_class_fifo(zio_priority_t p) +{ + return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE || + p == ZIO_PRIORITY_TRIM); +} + +static void +vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + vq->vq_cqueued |= 1U << p; + if (vdev_queue_class_fifo(p)) + list_insert_tail(&vq->vq_class[p].vqc_list, zio); + else + avl_add(&vq->vq_class[p].vqc_tree, zio); +} + +static void +vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + uint32_t empty; + if (vdev_queue_class_fifo(p)) { + list_t *list = &vq->vq_class[p].vqc_list; + list_remove(list, zio); + empty = list_is_empty(list); + } else { + avl_tree_t *tree = &vq->vq_class[p].vqc_tree; + avl_remove(tree, zio); + empty = avl_is_empty(tree); + } + vq->vq_cqueued &= ~(empty << p); +} + static uint_t vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { @@ -360,7 +375,7 @@ vdev_queue_max_async_writes(spa_t *spa) } static uint_t -vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) +vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -370,7 +385,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: - return (vdev_queue_max_async_writes(spa)); + return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa)); case ZIO_PRIORITY_SCRUB: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, @@ -414,10 +429,10 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { - spa_t *spa = vq->vq_vdev->vdev_spa; - zio_priority_t p, n; + uint32_t cq = vq->vq_cqueued; + zio_priority_t p, p1; - if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) + if (cq == 0 || vq->vq_active >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* @@ -425,14 +440,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * Do round-robin to reduce starvation due to zfs_vdev_max_active * and vq_nia_credit limits. 
*/ - for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) { - p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE; - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_min_active(vq, p)) { - vq->vq_last_prio = p; - return (p); - } + p1 = vq->vq_last_prio + 1; + if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE) + p1 = 0; + for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; + } + for (p = 0; p < p1; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; } /* @@ -440,16 +459,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * maximum # outstanding i/os. */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_max_active(spa, vq, p)) { - vq->vq_last_prio = p; - return (p); - } + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_max_active(vq, p)) + break; } - /* No eligible queued i/os */ - return (ZIO_PRIORITY_NUM_QUEUEABLE); +found: + vq->vq_last_prio = p; + return (p); } void @@ -458,42 +475,30 @@ vdev_queue_init(vdev_t *vd) vdev_queue_t *vq = &vd->vdev_queue; zio_priority_t p; - mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); vq->vq_vdev = vd; - taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent); - - avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - int (*compfn) (const void *, const void *); - - /* - * The synchronous/trim i/o queues are dispatched in FIFO rather - * than LBA order. This provides more consistent latency for - * these i/os. 
- */ - if (p == ZIO_PRIORITY_SYNC_READ || - p == ZIO_PRIORITY_SYNC_WRITE || - p == ZIO_PRIORITY_TRIM) { - compfn = vdev_queue_timestamp_compare; + if (vdev_queue_class_fifo(p)) { + list_create(&vq->vq_class[p].vqc_list, + sizeof (zio_t), + offsetof(struct zio, io_queue_node.l)); } else { - compfn = vdev_queue_offset_compare; + avl_create(&vq->vq_class[p].vqc_tree, + vdev_queue_to_compare, sizeof (zio_t), + offsetof(struct zio, io_queue_node.a)); } - avl_create(vdev_queue_class_tree(vq, p), compfn, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); } + avl_create(&vq->vq_read_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(&vq->vq_write_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); vq->vq_last_offset = 0; + list_create(&vq->vq_active_list, sizeof (struct zio), + offsetof(struct zio, io_queue_node.l)); + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); } void @@ -501,30 +506,39 @@ vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; - for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) - avl_destroy(vdev_queue_class_tree(vq, p)); - avl_destroy(&vq->vq_active_tree); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM)); + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (vdev_queue_class_fifo(p)) + list_destroy(&vq->vq_class[p].vqc_list); + else + avl_destroy(&vq->vq_class[p].vqc_tree); + } + avl_destroy(&vq->vq_read_offset_tree); + avl_destroy(&vq->vq_write_offset_tree); + list_destroy(&vq->vq_active_list); mutex_destroy(&vq->vq_lock); } static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); + zio->io_queue_state = ZIO_QS_QUEUED; + vdev_queue_class_add(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_add(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_add(&vq->vq_write_offset_tree, zio); } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); + vdev_queue_class_remove(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_remove(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_remove(&vq->vq_write_offset_tree, zio); + zio->io_queue_state = ZIO_QS_NONE; } static boolean_t @@ -546,14 +560,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active++; + vq->vq_cactive[zio->io_priority]++; + vq->vq_active++; if (vdev_queue_is_interactive(zio->io_priority)) { if (++vq->vq_ia_active == 1) vq->vq_nia_credit = 1; } else if (vq->vq_ia_active > 0) { vq->vq_nia_credit--; } - avl_add(&vq->vq_active_tree, zio); + zio->io_queue_state = ZIO_QS_ACTIVE; + list_insert_tail(&vq->vq_active_list, zio); } static void @@ -561,7 +577,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active--; + 
vq->vq_cactive[zio->io_priority]--; + vq->vq_active--; if (vdev_queue_is_interactive(zio->io_priority)) { if (--vq->vq_ia_active == 0) vq->vq_nia_credit = 0; @@ -569,7 +586,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) vq->vq_nia_credit = zfs_vdev_nia_credit; } else if (vq->vq_ia_active == 0) vq->vq_nia_credit++; - avl_remove(&vq->vq_active_tree, zio); + list_remove(&vq->vq_active_list, zio); + zio->io_queue_state = ZIO_QS_NONE; } static void @@ -602,29 +620,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) uint64_t maxgap = 0; uint64_t size; uint64_t limit; - int maxblocksize; boolean_t stretch = B_FALSE; - avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); - zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; uint64_t next_offset; abd_t *abd; + avl_tree_t *t; + + /* + * TRIM aggregation should not be needed since code in zfs_trim.c can + * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M). + */ + if (zio->io_type == ZIO_TYPE_TRIM) + return (NULL); + + if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) + return (NULL); - maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); if (vq->vq_vdev->vdev_nonrot) limit = zfs_vdev_aggregation_limit_non_rotating; else limit = zfs_vdev_aggregation_limit; - limit = MIN(limit, maxblocksize); - - if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0) - return (NULL); - - /* - * While TRIM commands could be aggregated based on offset this - * behavior is disabled until it's determined to be beneficial. - */ - if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) + if (limit == 0) return (NULL); + limit = MIN(limit, SPA_MAXBLOCKSIZE); /* * I/Os to distributed spares are directly dispatched to the dRAID @@ -635,8 +652,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) first = last = zio; - if (zio->io_type == ZIO_TYPE_READ) + if (zio->io_type == ZIO_TYPE_READ) { maxgap = zfs_vdev_read_gap_limit; + t = &vq->vq_read_offset_tree; + } else { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + t = &vq->vq_write_offset_tree; + } /* * We can aggregate I/Os that are sufficiently adjacent and of @@ -657,6 +679,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) * Walk backwards through sufficiently contiguous I/Os * recording the last non-optional I/O. */ + zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= limit && @@ -686,7 +709,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (IO_SPAN(first, dio) <= limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && - IO_SPAN(first, dio) <= maxblocksize && + IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE && IO_GAP(last, dio) <= maxgap && dio->io_type == zio->io_type) { last = dio; @@ -740,7 +763,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) return (NULL); size = IO_SPAN(first, last); - ASSERT3U(size, <=, maxblocksize); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); abd = abd_alloc_gang(); if (abd == NULL) @@ -824,19 +847,30 @@ vdev_queue_io_to_issue(vdev_queue_t *vq) return (NULL); } - /* - * For LBA-ordered queues (async / scrub / initializing), issue the - * i/o which follows the most recently issued i/o in LBA (offset) order. - * - * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp. 
- */ - tree = vdev_queue_class_tree(vq, p); - vq->vq_io_search.io_timestamp = 0; - vq->vq_io_search.io_offset = vq->vq_last_offset - 1; - VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL); - zio = avl_nearest(tree, idx, AVL_AFTER); - if (zio == NULL) - zio = avl_first(tree); + if (vdev_queue_class_fifo(p)) { + zio = list_head(&vq->vq_class[p].vqc_list); + } else { + /* + * For LBA-ordered queues (async / scrub / initializing), + * issue the I/O which follows the most recently issued I/O + * in LBA (offset) order, but to avoid starvation only within + * the same 0.5 second interval as the first I/O. + */ + tree = &vq->vq_class[p].vqc_tree; + zio = aio = avl_first(tree); + if (zio->io_offset < vq->vq_last_offset) { + vq->vq_io_search.io_timestamp = zio->io_timestamp; + vq->vq_io_search.io_offset = vq->vq_last_offset; + zio = avl_find(tree, &vq->vq_io_search, &idx); + if (zio == NULL) { + zio = avl_nearest(tree, idx, AVL_AFTER); + if (zio == NULL || + (zio->io_timestamp >> VDQ_T_SHIFT) != + (aio->io_timestamp >> VDQ_T_SHIFT)) + zio = aio; + } + } + } ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); @@ -967,7 +1001,6 @@ void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; - avl_tree_t *tree; /* * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio @@ -1002,12 +1035,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * Otherwise, the zio is currently active and we cannot change its * priority. */ - tree = vdev_queue_class_tree(vq, zio->io_priority); - if (avl_find(tree, zio, NULL) == zio) { - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + if (zio->io_queue_state == ZIO_QS_QUEUED) { + vdev_queue_class_remove(vq, zio); zio->io_priority = priority; - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { + vdev_queue_class_add(vq, zio); + } else if (zio->io_queue_state == ZIO_QS_NONE) { zio->io_priority = priority; } @@ -1020,10 +1052,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * vq_lock mutex use here, instead we prefer to keep it lock free for * performance. */ -int +uint32_t vdev_queue_length(vdev_t *vd) { - return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); + return (vd->vdev_queue.vq_active); } uint64_t @@ -1032,15 +1064,22 @@ vdev_queue_last_offset(vdev_t *vd) return (vd->vdev_queue.vq_last_offset); } +uint64_t +vdev_queue_class_length(vdev_t *vd, zio_priority_t p) +{ + vdev_queue_t *vq = &vd->vdev_queue; + if (vdev_queue_class_fifo(p)) + return (list_is_empty(&vq->vq_class[p].vqc_list) == 0); + else + return (avl_numnodes(&vq->vq_class[p].vqc_tree)); +} + ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW, "Max vdev I/O aggregation size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT, ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, UINT, ZMOD_RW, - "Allow TRIM I/O to be aggregated"); - ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW, "Aggregate read I/O over gap");
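
The VDQ_T_SHIFT constant above is what implements the ~0.5 second deadline from
the commit message: nanosecond timestamps shifted right by 29 bits fall into
buckets of 2^29 ns (about 0.54 s), so within vdev_queue_to_compare() older
buckets are always drained before newer ones and LBA sorting applies only among
I/Os of the same bucket, which bounds starvation.  Below is a minimal
standalone sketch of that ordering; the toy_zio structure and the
toy_compare()/cmp() helpers are made up for illustration and are not part of
the patch.

/*
 * Standalone sketch (not from the patch) of the time-bucket ordering used
 * by vdev_queue_to_compare(): compare the 2^29 ns (~0.54 s) time bucket
 * first, then fall back to LBA (offset) within a bucket.
 */
#include <stdio.h>
#include <stdint.h>

#define	VDQ_T_SHIFT	29

struct toy_zio {
	uint64_t io_timestamp;	/* submission time, ns */
	uint64_t io_offset;	/* LBA, bytes */
};

static int
cmp(uint64_t a, uint64_t b)
{
	return (a < b ? -1 : a > b ? 1 : 0);
}

static int
toy_compare(const struct toy_zio *z1, const struct toy_zio *z2)
{
	int tcmp = cmp(z1->io_timestamp >> VDQ_T_SHIFT,
	    z2->io_timestamp >> VDQ_T_SHIFT);
	return (tcmp ? tcmp : cmp(z1->io_offset, z2->io_offset));
}

int
main(void)
{
	const double bucket_sec = (double)(1ULL << VDQ_T_SHIFT) / 1e9;
	printf("bucket size: %.3f s\n", bucket_sec);	/* ~0.537 s */

	/* An old, high-LBA I/O still sorts before a newer, low-LBA one. */
	struct toy_zio old_hi = { .io_timestamp = 0, .io_offset = 1 << 30 };
	struct toy_zio new_lo = { .io_timestamp = 2ULL << VDQ_T_SHIFT,
	    .io_offset = 0 };
	printf("old vs new: %d\n", toy_compare(&old_hi, &new_lo));	/* -1 */

	/* Within the same bucket, ordering falls back to LBA. */
	struct toy_zio a = { .io_timestamp = 100, .io_offset = 4096 };
	struct toy_zio b = { .io_timestamp = 200, .io_offset = 0 };
	printf("same bucket: %d\n", toy_compare(&a, &b));	/* 1 */
	return (0);
}

Using a shift rather than a division keeps the comparator cheap, and since the
bucket is derived from io_timestamp, which the patch keeps in the same cache
line as io_offset, the extra comparison adds little cost.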