From 0e8a142837469a31a3d4fc556aa7ef8242abe96a Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Thu, 1 Jun 2023 15:40:31 -0400
Subject: [PATCH] Another set of vdev queue optimizations.

Switch the FIFO queues (SYNC/TRIM) and the active queue of the vdev
queue from time-sorted AVL-trees to simple lists.  AVL-trees are too
expensive for such a simple task.  To allow changing I/O priority
without searching through the trees, add an io_queue_state field to
struct zio.

To avoid checking the number of queued I/Os for each priority, add a
vq_cqueued bitmap to struct vdev_queue and update it when adding or
removing I/Os.  Make vq_cactive a separate array instead of a struct
vdev_queue_class member.  Together these avoid many cache misses when
looking for work in vdev_queue_class_to_issue().

Introduce a deadline of ~0.5s for the LBA-sorted queues.  Before this
change I saw some I/Os waiting in a queue for up to 8 seconds and
possibly more due to starvation.  With this change I no longer see it.
I had to make the comparison function slightly more complicated, but
since it uses the same cache lines the difference is minimal.  For
sequential I/Os the new code in vdev_queue_io_to_issue() often uses the
simpler avl_first(), falling back to avl_find() and avl_nearest() only
when needed.

Arrange the members of struct zio so that only one cache line is
accessed when searching through the vdev queues.  While there, remove
io_alloc_node and reuse io_queue_node instead; the two are never used
at the same time.

Remove the zfs_vdev_aggregate_trim parameter.  It has been disabled
since it was implemented 4 years ago, yet we still wasted time
maintaining the offset-sorted tree of TRIM requests.  Just remove the
tree.

Remove locking from txg_all_lists_empty().  It is racy by design, and
the two pairs of lock/unlock operations take noticeable time under the
vdev queue lock.

With these changes, in my tests with volblocksize=4KB I measure a 50%
reduction in vdev queue lock spin time on reads and 75% on writes.

Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
---
 include/sys/vdev.h      |   3 +-
 include/sys/vdev_impl.h |  17 +--
 include/sys/zio.h       |  15 +-
 man/man4/zfs.4          |   6 -
 module/zfs/spa_misc.c   |   2 +-
 module/zfs/txg.c        |  13 +-
 module/zfs/vdev.c       |  16 +--
 module/zfs/vdev_queue.c | 305 ++++++++++++++++++++++------------------
 8 files changed, 205 insertions(+), 172 deletions(-)

diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 26c834ff57cf..03e1f438aaf9 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -164,8 +164,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
 extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);

-extern int vdev_queue_length(vdev_t *vd);
+extern uint32_t vdev_queue_length(vdev_t *vd);
 extern uint64_t vdev_queue_last_offset(vdev_t *vd);
+extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);

 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 74b3737d8ee5..2b22b973ba49 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -130,27 +130,24 @@ typedef const struct vdev_ops {
 /*
  * Virtual device properties
  */
-typedef struct vdev_queue_class {
- uint32_t vqc_active;
-
- /*
- * Sorted by offset or timestamp, depending on if the queue is
- * LBA-ordered vs FIFO.
- */ - avl_tree_t vqc_queued_tree; +typedef union vdev_queue_class { + list_t vqc_list; + avl_tree_t vqc_tree; } vdev_queue_class_t; struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; - avl_tree_t vq_active_tree; avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; - avl_tree_t vq_trim_offset_tree; uint64_t vq_last_offset; zio_priority_t vq_last_prio; /* Last sent I/O priority. */ + uint32_t vq_cqueued; /* Classes with queued I/Os. */ + uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE]; + uint32_t vq_active; /* Number of active I/Os. */ uint32_t vq_ia_active; /* Active interactive I/Os. */ uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */ + list_t vq_active_list; /* List of active I/Os. */ hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 6b1352a72b9a..1e5163976bc9 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -436,6 +436,12 @@ typedef struct zio_link { list_node_t zl_child_node; } zio_link_t; +enum zio_qstate { + ZIO_QS_NONE = 0, + ZIO_QS_QUEUED, + ZIO_QS_ACTIVE, +}; + struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; @@ -480,6 +486,12 @@ struct zio { const zio_vsd_ops_t *io_vsd_ops; metaslab_class_t *io_metaslab_class; /* dva throttle class */ + enum zio_qstate io_queue_state; /* vdev queue state */ + union { + list_node_t l; + avl_node_t a; + } io_queue_node ____cacheline_aligned; /* allocator and vdev queues */ + avl_node_t io_offset_node; /* vdev offset queues */ uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_queued_timestamp; @@ -487,9 +499,6 @@ struct zio { hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). */ - avl_node_t io_queue_node; - avl_node_t io_offset_node; - avl_node_t io_alloc_node; zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5fbd9d7db93f..04bbbc5fdf59 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2016,12 +2016,6 @@ Historical statistics for this many latest TXGs will be available in Flush dirty data to disk at least every this many seconds (maximum TXG duration). . -.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint -Allow TRIM I/O operations to be aggregated. -This is normally not helpful because the extents to be trimmed -will have been already been aggregated by the metaslab. -This option is provided for debugging and performance analysis. -. .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint Max vdev I/O aggregation size. . 
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 9ef948e9e434..8dc83445e198 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index ec61cabcaab2..a67c043446f5 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl) boolean_t txg_all_lists_empty(txg_list_t *tl) { - mutex_enter(&tl->tl_lock); - for (int i = 0; i < TXG_SIZE; i++) { - if (!txg_list_empty_impl(tl, i)) { - mutex_exit(&tl->tl_lock); - return (B_FALSE); - } - } - mutex_exit(&tl->tl_lock); - return (B_TRUE); + boolean_t res = B_TRUE; + for (int i = 0; i < TXG_SIZE; i++) + res &= (tl->tl_head[i] == NULL); + return (res); } /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 612e66c3a8a8..30551feb6322 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4608,11 +4608,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); - for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { - vsx->vsx_active_queue[t] = - vd->vdev_queue.vq_class[t].vqc_active; - vsx->vsx_pend_queue[t] = avl_numnodes( - &vd->vdev_queue.vq_class[t].vqc_queued_tree); + for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { + vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; + vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } @@ -5470,20 +5468,20 @@ vdev_deadman(vdev_t *vd, const char *tag) vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); - if (avl_numnodes(&vq->vq_active_tree) > 0) { + if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; - zfs_dbgmsg("slow vdev: %s has %lu active IOs", - vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); + zfs_dbgmsg("slow vdev: %s has %u active IOs", + vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ - fio = avl_first(&vq->vq_active_tree); + fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index abb7d0662b8c..08d918467d03 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -228,13 +228,6 @@ uint_t zfs_vdev_queue_depth_pct = 300; */ uint_t zfs_vdev_def_queue_depth = 32; -/* - * Allow TRIM I/Os to be aggregated. This should normally not be needed since - * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted - * by the TRIM code in zfs_trim.c. 
- */ -static uint_t zfs_vdev_aggregate_trim = 0; - static int vdev_queue_offset_compare(const void *x1, const void *x2) { @@ -249,38 +242,60 @@ vdev_queue_offset_compare(const void *x1, const void *x2) return (TREE_PCMP(z1, z2)); } -static inline avl_tree_t * -vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) -{ - return (&vq->vq_class[p].vqc_queued_tree); -} - -static inline avl_tree_t * -vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) -{ - ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM); - if (t == ZIO_TYPE_READ) - return (&vq->vq_read_offset_tree); - else if (t == ZIO_TYPE_WRITE) - return (&vq->vq_write_offset_tree); - else - return (&vq->vq_trim_offset_tree); -} +#define VDQ_T_SHIFT 29 static int -vdev_queue_timestamp_compare(const void *x1, const void *x2) +vdev_queue_to_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); + int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT, + z2->io_timestamp >> VDQ_T_SHIFT); + int ocmp = TREE_CMP(z1->io_offset, z2->io_offset); + int cmp = tcmp ? tcmp : ocmp; - if (likely(cmp)) + if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE))) return (cmp); return (TREE_PCMP(z1, z2)); } +static inline boolean_t +vdev_queue_class_fifo(zio_priority_t p) +{ + return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE || + p == ZIO_PRIORITY_TRIM); +} + +static void +vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + vq->vq_cqueued |= 1U << p; + if (vdev_queue_class_fifo(p)) + list_insert_tail(&vq->vq_class[p].vqc_list, zio); + else + avl_add(&vq->vq_class[p].vqc_tree, zio); +} + +static void +vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + uint32_t empty; + if (vdev_queue_class_fifo(p)) { + list_t *list = &vq->vq_class[p].vqc_list; + list_remove(list, zio); + empty = list_is_empty(list); + } else { + avl_tree_t *tree = &vq->vq_class[p].vqc_tree; + avl_remove(tree, zio); + empty = avl_is_empty(tree); + } + vq->vq_cqueued &= ~(empty << p); +} + static uint_t vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { @@ -360,7 +375,7 @@ vdev_queue_max_async_writes(spa_t *spa) } static uint_t -vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) +vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -370,7 +385,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: - return (vdev_queue_max_async_writes(spa)); + return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa)); case ZIO_PRIORITY_SCRUB: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, @@ -414,10 +429,10 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { - spa_t *spa = vq->vq_vdev->vdev_spa; - zio_priority_t p, n; + uint32_t cq = vq->vq_cqueued; + zio_priority_t p, p1; - if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) + if (cq == 0 || vq->vq_active >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* @@ -425,14 +440,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * Do round-robin to reduce starvation due to zfs_vdev_max_active * and vq_nia_credit limits. 
*/ - for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) { - p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE; - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_min_active(vq, p)) { - vq->vq_last_prio = p; - return (p); - } + p1 = vq->vq_last_prio + 1; + if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE) + p1 = 0; + for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; + } + for (p = 0; p < p1; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; } /* @@ -440,16 +459,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * maximum # outstanding i/os. */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_max_active(spa, vq, p)) { - vq->vq_last_prio = p; - return (p); - } + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_max_active(vq, p)) + break; } - /* No eligible queued i/os */ - return (ZIO_PRIORITY_NUM_QUEUEABLE); +found: + vq->vq_last_prio = p; + return (p); } void @@ -458,42 +475,30 @@ vdev_queue_init(vdev_t *vd) vdev_queue_t *vq = &vd->vdev_queue; zio_priority_t p; - mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); vq->vq_vdev = vd; - taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent); - - avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - int (*compfn) (const void *, const void *); - - /* - * The synchronous/trim i/o queues are dispatched in FIFO rather - * than LBA order. This provides more consistent latency for - * these i/os. 
- */ - if (p == ZIO_PRIORITY_SYNC_READ || - p == ZIO_PRIORITY_SYNC_WRITE || - p == ZIO_PRIORITY_TRIM) { - compfn = vdev_queue_timestamp_compare; + if (vdev_queue_class_fifo(p)) { + list_create(&vq->vq_class[p].vqc_list, + sizeof (zio_t), + offsetof(struct zio, io_queue_node.l)); } else { - compfn = vdev_queue_offset_compare; + avl_create(&vq->vq_class[p].vqc_tree, + vdev_queue_to_compare, sizeof (zio_t), + offsetof(struct zio, io_queue_node.a)); } - avl_create(vdev_queue_class_tree(vq, p), compfn, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); } + avl_create(&vq->vq_read_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(&vq->vq_write_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); vq->vq_last_offset = 0; + list_create(&vq->vq_active_list, sizeof (struct zio), + offsetof(struct zio, io_queue_node.l)); + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); } void @@ -501,30 +506,39 @@ vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; - for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) - avl_destroy(vdev_queue_class_tree(vq, p)); - avl_destroy(&vq->vq_active_tree); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM)); + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (vdev_queue_class_fifo(p)) + list_destroy(&vq->vq_class[p].vqc_list); + else + avl_destroy(&vq->vq_class[p].vqc_tree); + } + avl_destroy(&vq->vq_read_offset_tree); + avl_destroy(&vq->vq_write_offset_tree); + list_destroy(&vq->vq_active_list); mutex_destroy(&vq->vq_lock); } static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); + zio->io_queue_state = ZIO_QS_QUEUED; + vdev_queue_class_add(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_add(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_add(&vq->vq_write_offset_tree, zio); } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); + vdev_queue_class_remove(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_remove(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_remove(&vq->vq_write_offset_tree, zio); + zio->io_queue_state = ZIO_QS_NONE; } static boolean_t @@ -546,14 +560,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active++; + vq->vq_cactive[zio->io_priority]++; + vq->vq_active++; if (vdev_queue_is_interactive(zio->io_priority)) { if (++vq->vq_ia_active == 1) vq->vq_nia_credit = 1; } else if (vq->vq_ia_active > 0) { vq->vq_nia_credit--; } - avl_add(&vq->vq_active_tree, zio); + zio->io_queue_state = ZIO_QS_ACTIVE; + list_insert_tail(&vq->vq_active_list, zio); } static void @@ -561,7 +577,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active--; + 
vq->vq_cactive[zio->io_priority]--; + vq->vq_active--; if (vdev_queue_is_interactive(zio->io_priority)) { if (--vq->vq_ia_active == 0) vq->vq_nia_credit = 0; @@ -569,7 +586,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) vq->vq_nia_credit = zfs_vdev_nia_credit; } else if (vq->vq_ia_active == 0) vq->vq_nia_credit++; - avl_remove(&vq->vq_active_tree, zio); + list_remove(&vq->vq_active_list, zio); + zio->io_queue_state = ZIO_QS_NONE; } static void @@ -602,29 +620,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) uint64_t maxgap = 0; uint64_t size; uint64_t limit; - int maxblocksize; boolean_t stretch = B_FALSE; - avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); - zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; uint64_t next_offset; abd_t *abd; + avl_tree_t *t; + + /* + * TRIM aggregation should not be needed since code in zfs_trim.c can + * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M). + */ + if (zio->io_type == ZIO_TYPE_TRIM) + return (NULL); + + if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) + return (NULL); - maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); if (vq->vq_vdev->vdev_nonrot) limit = zfs_vdev_aggregation_limit_non_rotating; else limit = zfs_vdev_aggregation_limit; - limit = MIN(limit, maxblocksize); - - if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0) - return (NULL); - - /* - * While TRIM commands could be aggregated based on offset this - * behavior is disabled until it's determined to be beneficial. - */ - if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) + if (limit == 0) return (NULL); + limit = MIN(limit, SPA_MAXBLOCKSIZE); /* * I/Os to distributed spares are directly dispatched to the dRAID @@ -635,8 +652,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) first = last = zio; - if (zio->io_type == ZIO_TYPE_READ) + if (zio->io_type == ZIO_TYPE_READ) { maxgap = zfs_vdev_read_gap_limit; + t = &vq->vq_read_offset_tree; + } else { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + t = &vq->vq_write_offset_tree; + } /* * We can aggregate I/Os that are sufficiently adjacent and of @@ -657,6 +679,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) * Walk backwards through sufficiently contiguous I/Os * recording the last non-optional I/O. */ + zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= limit && @@ -686,7 +709,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (IO_SPAN(first, dio) <= limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && - IO_SPAN(first, dio) <= maxblocksize && + IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE && IO_GAP(last, dio) <= maxgap && dio->io_type == zio->io_type) { last = dio; @@ -740,7 +763,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) return (NULL); size = IO_SPAN(first, last); - ASSERT3U(size, <=, maxblocksize); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); abd = abd_alloc_gang(); if (abd == NULL) @@ -824,19 +847,30 @@ vdev_queue_io_to_issue(vdev_queue_t *vq) return (NULL); } - /* - * For LBA-ordered queues (async / scrub / initializing), issue the - * i/o which follows the most recently issued i/o in LBA (offset) order. - * - * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp. 
- */ - tree = vdev_queue_class_tree(vq, p); - vq->vq_io_search.io_timestamp = 0; - vq->vq_io_search.io_offset = vq->vq_last_offset - 1; - VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL); - zio = avl_nearest(tree, idx, AVL_AFTER); - if (zio == NULL) - zio = avl_first(tree); + if (vdev_queue_class_fifo(p)) { + zio = list_head(&vq->vq_class[p].vqc_list); + } else { + /* + * For LBA-ordered queues (async / scrub / initializing), + * issue the I/O which follows the most recently issued I/O + * in LBA (offset) order, but to avoid starvation only within + * the same 0.5 second interval as the first I/O. + */ + tree = &vq->vq_class[p].vqc_tree; + zio = aio = avl_first(tree); + if (zio->io_offset < vq->vq_last_offset) { + vq->vq_io_search.io_timestamp = zio->io_timestamp; + vq->vq_io_search.io_offset = vq->vq_last_offset; + zio = avl_find(tree, &vq->vq_io_search, &idx); + if (zio == NULL) { + zio = avl_nearest(tree, idx, AVL_AFTER); + if (zio == NULL || + (zio->io_timestamp >> VDQ_T_SHIFT) != + (aio->io_timestamp >> VDQ_T_SHIFT)) + zio = aio; + } + } + } ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); @@ -967,7 +1001,6 @@ void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; - avl_tree_t *tree; /* * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio @@ -1002,12 +1035,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * Otherwise, the zio is currently active and we cannot change its * priority. */ - tree = vdev_queue_class_tree(vq, zio->io_priority); - if (avl_find(tree, zio, NULL) == zio) { - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + if (zio->io_queue_state == ZIO_QS_QUEUED) { + vdev_queue_class_remove(vq, zio); zio->io_priority = priority; - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { + vdev_queue_class_add(vq, zio); + } else if (zio->io_queue_state == ZIO_QS_NONE) { zio->io_priority = priority; } @@ -1020,10 +1052,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * vq_lock mutex use here, instead we prefer to keep it lock free for * performance. */ -int +uint32_t vdev_queue_length(vdev_t *vd) { - return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); + return (vd->vdev_queue.vq_active); } uint64_t @@ -1032,15 +1064,22 @@ vdev_queue_last_offset(vdev_t *vd) return (vd->vdev_queue.vq_last_offset); } +uint64_t +vdev_queue_class_length(vdev_t *vd, zio_priority_t p) +{ + vdev_queue_t *vq = &vd->vdev_queue; + if (vdev_queue_class_fifo(p)) + return (list_is_empty(&vq->vq_class[p].vqc_list) == 0); + else + return (avl_numnodes(&vq->vq_class[p].vqc_tree)); +} + ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW, "Max vdev I/O aggregation size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT, ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, UINT, ZMOD_RW, - "Allow TRIM I/O to be aggregated"); - ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW, "Aggregate read I/O over gap");
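
The VDQ_T_SHIFT constant above is what implements the ~0.5 second deadline from
the commit message: nanosecond timestamps shifted right by 29 bits fall into
buckets of 2^29 ns (about 0.54 s), so within vdev_queue_to_compare() older
buckets are always drained before newer ones and LBA sorting applies only among
I/Os of the same bucket, which bounds starvation.  Below is a minimal
standalone sketch of that ordering; the toy_zio structure and the
toy_compare()/cmp() helpers are made up for illustration and are not part of
the patch.

/*
 * Standalone sketch (not from the patch) of the time-bucket ordering used
 * by vdev_queue_to_compare(): compare the 2^29 ns (~0.54 s) time bucket
 * first, then fall back to LBA (offset) within a bucket.
 */
#include <stdio.h>
#include <stdint.h>

#define	VDQ_T_SHIFT	29

struct toy_zio {
	uint64_t io_timestamp;	/* submission time, ns */
	uint64_t io_offset;	/* LBA, bytes */
};

static int
cmp(uint64_t a, uint64_t b)
{
	return (a < b ? -1 : a > b ? 1 : 0);
}

static int
toy_compare(const struct toy_zio *z1, const struct toy_zio *z2)
{
	int tcmp = cmp(z1->io_timestamp >> VDQ_T_SHIFT,
	    z2->io_timestamp >> VDQ_T_SHIFT);
	return (tcmp ? tcmp : cmp(z1->io_offset, z2->io_offset));
}

int
main(void)
{
	const double bucket_sec = (double)(1ULL << VDQ_T_SHIFT) / 1e9;
	printf("bucket size: %.3f s\n", bucket_sec);	/* ~0.537 s */

	/* An old, high-LBA I/O still sorts before a newer, low-LBA one. */
	struct toy_zio old_hi = { .io_timestamp = 0, .io_offset = 1 << 30 };
	struct toy_zio new_lo = { .io_timestamp = 2ULL << VDQ_T_SHIFT,
	    .io_offset = 0 };
	printf("old vs new: %d\n", toy_compare(&old_hi, &new_lo));	/* -1 */

	/* Within the same bucket, ordering falls back to LBA. */
	struct toy_zio a = { .io_timestamp = 100, .io_offset = 4096 };
	struct toy_zio b = { .io_timestamp = 200, .io_offset = 0 };
	printf("same bucket: %d\n", toy_compare(&a, &b));	/* 1 */
	return (0);
}

Using a shift rather than a division keeps the comparator cheap, and since the
bucket is derived from io_timestamp, which the patch keeps in the same cache
line as io_offset, the extra comparison adds little cost.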