Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Another set of vdev queue optimizations. #14925

Merged
merged 1 commit into from
Jun 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/sys/vdev.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);

extern int vdev_queue_length(vdev_t *vd);
extern uint32_t vdev_queue_length(vdev_t *vd);
extern uint64_t vdev_queue_last_offset(vdev_t *vd);
extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);

extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
Expand Down
17 changes: 7 additions & 10 deletions include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,27 +130,24 @@ typedef const struct vdev_ops {
/*
* Virtual device properties
*/
typedef struct vdev_queue_class {
uint32_t vqc_active;

/*
* Sorted by offset or timestamp, depending on whether the queue is
* LBA-ordered or FIFO.
*/
avl_tree_t vqc_queued_tree;
typedef union vdev_queue_class {
list_t vqc_list;
avl_tree_t vqc_tree;
} vdev_queue_class_t;

struct vdev_queue {
vdev_t *vq_vdev;
vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
avl_tree_t vq_active_tree;
avl_tree_t vq_read_offset_tree;
avl_tree_t vq_write_offset_tree;
avl_tree_t vq_trim_offset_tree;
uint64_t vq_last_offset;
zio_priority_t vq_last_prio; /* Last sent I/O priority. */
uint32_t vq_cqueued; /* Classes with queued I/Os. */
uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
uint32_t vq_active; /* Number of active I/Os. */
uint32_t vq_ia_active; /* Active interactive I/Os. */
uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */
list_t vq_active_list; /* List of active I/Os. */
hrtime_t vq_io_complete_ts; /* time last i/o completed */
hrtime_t vq_io_delta_ts;
zio_t vq_io_search; /* used as local for stack reduction */
Expand Down
15 changes: 12 additions & 3 deletions include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,12 @@ typedef struct zio_link {
list_node_t zl_child_node;
} zio_link_t;

/*
 * State of a zio with respect to the vdev queue; stored in the
 * io_queue_state field of struct zio.
 * NOTE(review): presumably NONE means the zio is on no vdev queue, QUEUED
 * means it is waiting in a class queue, and ACTIVE means it has been issued
 * to the device -- confirm against vdev_queue.c.
 */
enum zio_qstate {
ZIO_QS_NONE = 0,
ZIO_QS_QUEUED,
ZIO_QS_ACTIVE,
};

struct zio {
/* Core information about this I/O */
zbookmark_phys_t io_bookmark;
Expand Down Expand Up @@ -480,16 +486,19 @@ struct zio {
const zio_vsd_ops_t *io_vsd_ops;
metaslab_class_t *io_metaslab_class; /* dva throttle class */

enum zio_qstate io_queue_state; /* vdev queue state */
union {
list_node_t l;
avl_node_t a;
} io_queue_node ____cacheline_aligned; /* allocator and vdev queues */
avl_node_t io_offset_node; /* vdev offset queues */
uint64_t io_offset;
hrtime_t io_timestamp; /* submitted at */
hrtime_t io_queued_timestamp;
hrtime_t io_target_timestamp;
hrtime_t io_delta; /* vdev queue service delta */
hrtime_t io_delay; /* Device access time (disk or */
/* file). */
avl_node_t io_queue_node;
avl_node_t io_offset_node;
avl_node_t io_alloc_node;
zio_alloc_list_t io_alloc_list;

/* Internal pipeline state */
Expand Down
6 changes: 0 additions & 6 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -2016,12 +2016,6 @@ Historical statistics for this many latest TXGs will be available in
Flush dirty data to disk at least every this many seconds (maximum TXG
duration).
.
.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint
Allow TRIM I/O operations to be aggregated.
This is normally not helpful because the extents to be trimmed
will have already been aggregated by the metaslab.
This option is provided for debugging and performance analysis.
.
.It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint
Max vdev I/O aggregation size.
.
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/spa_misc.c
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
NULL);
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
}
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
Expand Down
13 changes: 4 additions & 9 deletions module/zfs/txg.c
Original file line number Diff line number Diff line change
Expand Up @@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl)
boolean_t
txg_all_lists_empty(txg_list_t *tl)
{
mutex_enter(&tl->tl_lock);
for (int i = 0; i < TXG_SIZE; i++) {
if (!txg_list_empty_impl(tl, i)) {
mutex_exit(&tl->tl_lock);
return (B_FALSE);
}
}
mutex_exit(&tl->tl_lock);
return (B_TRUE);
boolean_t res = B_TRUE;
for (int i = 0; i < TXG_SIZE; i++)
res &= (tl->tl_head[i] == NULL);
behlendorf marked this conversation as resolved.
Show resolved Hide resolved
return (res);
}

/*
Expand Down
16 changes: 7 additions & 9 deletions module/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -4608,11 +4608,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)

memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));

for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
vsx->vsx_active_queue[t] =
vd->vdev_queue.vq_class[t].vqc_active;
vsx->vsx_pend_queue[t] = avl_numnodes(
&vd->vdev_queue.vq_class[t].vqc_queued_tree);
for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
}
}
}
Expand Down Expand Up @@ -5470,20 +5468,20 @@ vdev_deadman(vdev_t *vd, const char *tag)
vdev_queue_t *vq = &vd->vdev_queue;

mutex_enter(&vq->vq_lock);
if (avl_numnodes(&vq->vq_active_tree) > 0) {
if (vq->vq_active > 0) {
spa_t *spa = vd->vdev_spa;
zio_t *fio;
uint64_t delta;

zfs_dbgmsg("slow vdev: %s has %lu active IOs",
vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
zfs_dbgmsg("slow vdev: %s has %u active IOs",
vd->vdev_path, vq->vq_active);

/*
* Look at the head of all the pending queues;
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime, invoke the deadman logic.
*/
fio = avl_first(&vq->vq_active_tree);
fio = list_head(&vq->vq_active_list);
delta = gethrtime() - fio->io_timestamp;
if (delta > spa_deadman_synctime(spa))
zio_deadman(fio, tag);
Expand Down
Loading