Skip to content

Commit

Permalink
vdev probe to slow disk can stall mmp write checker
Browse files Browse the repository at this point in the history
Simplify vdev probes in the zio_vdev_io_done context to
avoid holding the spa config lock for long durations.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.

Signed-off-by: Don Brady <don.brady@klarasystems.com>
  • Loading branch information
don-brady committed Feb 8, 2024
1 parent a5a7254 commit 0757ab9
Show file tree
Hide file tree
Showing 10 changed files with 156 additions and 26 deletions.
2 changes: 1 addition & 1 deletion include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);

#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
#define SPA_ASYNC_PROBE 0x04
#define SPA_ASYNC_FAULT_VDEV 0x04
#define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10
#define SPA_ASYNC_AUTOEXPAND 0x20
Expand Down
2 changes: 1 addition & 1 deletion include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ struct vdev {
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
boolean_t vdev_probe_wanted; /* async probe wanted? */
boolean_t vdev_fault_wanted; /* async faulted wanted? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
Expand Down
17 changes: 9 additions & 8 deletions module/zfs/spa.c
Original file line number Diff line number Diff line change
Expand Up @@ -8738,15 +8738,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
}

static void
spa_async_probe(spa_t *spa, vdev_t *vd)
spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
{
if (vd->vdev_probe_wanted) {
vd->vdev_probe_wanted = B_FALSE;
vdev_reopen(vd); /* vdev_open() does the actual probe */
if (vd->vdev_fault_wanted) {
vd->vdev_fault_wanted = B_FALSE;
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
VDEV_AUX_ERR_EXCEEDED);
}

for (int c = 0; c < vd->vdev_children; c++)
spa_async_probe(spa, vd->vdev_child[c]);
spa_async_fault_vdev(spa, vd->vdev_child[c]);
}

static void
Expand Down Expand Up @@ -8834,11 +8835,11 @@ spa_async_thread(void *arg)
}

/*
* See if any devices need to be probed.
* See if any devices need to be marked faulted.
*/
if (tasks & SPA_ASYNC_PROBE) {
if (tasks & SPA_ASYNC_FAULT_VDEV) {
spa_vdev_state_enter(spa, SCL_NONE);
spa_async_probe(spa, spa->spa_root_vdev);
spa_async_fault_vdev(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}

Expand Down
22 changes: 13 additions & 9 deletions module/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -1664,6 +1664,7 @@ vdev_metaslab_fini(vdev_t *vd)
typedef struct vdev_probe_stats {
boolean_t vps_readable;
boolean_t vps_writeable;
boolean_t vps_zio_done_probe;
int vps_flags;
} vdev_probe_stats_t;

Expand Down Expand Up @@ -1709,6 +1710,17 @@ vdev_probe_done(zio_t *zio)
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, NULL, 0);
zio->io_error = SET_ERROR(ENXIO);

/*
* If this probe was initiated from zio pipeline, then
* change the state in a spa_async_request. Probes that
* were initiated from a vdev_open can change the state
* as part of the open call.
*/
if (vps->vps_zio_done_probe) {
vd->vdev_fault_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
}
}

mutex_enter(&vd->vdev_probe_lock);
Expand Down Expand Up @@ -1759,6 +1771,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)

vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
vps->vps_zio_done_probe = (zio != NULL);

if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
Expand All @@ -1785,15 +1798,6 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

/*
* We can't change the vdev state in this context, so we
* kick off an async task to do it on our behalf.
*/
if (zio != NULL) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_PROBE);
}
}

if (zio != NULL)
Expand Down
22 changes: 20 additions & 2 deletions module/zfs/zfs_ioctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -5763,6 +5763,24 @@ zfs_ioc_error_log(zfs_cmd_t *zc)
return (error);
}

/*
* Is this a test pool?
*
* Used by ZTS functional/mmp/mmp_write_slow_disk to tear down a
* suspended mmp pool.
*/
static boolean_t
spa_is_test_pool(spa_t *spa)
{
if (strstr(spa_name(spa), "test") == NULL)
return (B_FALSE);

uint64_t size = metaslab_class_get_space(spa_normal_class(spa));
size += metaslab_class_get_space(spa_embedded_log_class(spa));

return (size <= 1024 * 1024 * 1024);
}

static int
zfs_ioc_clear(zfs_cmd_t *zc)
{
Expand Down Expand Up @@ -5815,9 +5833,9 @@ zfs_ioc_clear(zfs_cmd_t *zc)

/*
* If multihost is enabled, resuming I/O is unsafe as another
* host may have imported the pool.
* host may have imported the pool. (allowed for mini test pool)
*/
if (spa_multihost(spa) && spa_suspended(spa))
if (spa_multihost(spa) && spa_suspended(spa) && !spa_is_test_pool(spa))
return (SET_ERROR(EINVAL));

spa_vdev_state_enter(spa, SCL_NONE);
Expand Down
6 changes: 4 additions & 2 deletions module/zfs/zio.c
Original file line number Diff line number Diff line change
Expand Up @@ -2532,8 +2532,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"failure and the failure mode property for this pool "
"is set to panic.", spa_name(spa));

cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
"failure and has been suspended.\n", spa_name(spa));
if (reason != ZIO_SUSPEND_MMP) {
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable "
"I/O failure and has been suspended.\n", spa_name(spa));
}

(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
NULL, NULL, 0);
Expand Down
6 changes: 4 additions & 2 deletions module/zfs/zio_inject.c
Original file line number Diff line number Diff line change
Expand Up @@ -605,9 +605,11 @@ zio_handle_io_delay(zio_t *zio)
if (vd->vdev_guid != handler->zi_record.zi_guid)
continue;

/* also match on I/O type (e.g., -T read) */
if (handler->zi_record.zi_iotype != ZIO_TYPES &&
handler->zi_record.zi_iotype != zio->io_type)
continue;
handler->zi_record.zi_iotype != zio->io_type) {
continue;
}

/*
* Defensive; should never happen as the array allocation
Expand Down
2 changes: 1 addition & 1 deletion tests/runfiles/linux.run
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ tags = ['functional', 'mmap']
tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval',
'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import',
'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history',
'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid']
'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk']
tags = ['functional', 'mmp']

[tests/functional/mount:Linux]
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -1587,6 +1587,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/mmp/mmp_on_zdb.ksh \
functional/mmp/mmp_reset_interval.ksh \
functional/mmp/mmp_write_distribution.ksh \
functional/mmp/mmp_write_slow_disk.ksh \
functional/mmp/mmp_write_uberblocks.ksh \
functional/mmp/multihost_history.ksh \
functional/mmp/setup.ksh \
Expand Down
102 changes: 102 additions & 0 deletions tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#

#
# Copyright (c) 2024, Klara Inc
#

# DESCRIPTION:
# Verify that long VDEV probes do not cause MMP checks to suspend pool
# Note: without PR-15839 fix, this test will suspend the pool.
#
# A device that is returning unexpeced errors will trigger a vdev_probe.
# When the device additionally has slow response times, the probe can hold
# the spa config loack as a writer for a long period of time such that the
# mmp uberblock updates stall when trying to aquire the spa config lock.
#
# STRATEGY:
# 1. Create a pool with multiple leaf vdevs
# 2. Enable multihost and multihost_history
# 3. Delay for MMP writes to occur
# 4. Verify that a long VDEV probe didn't cause MMP check to suspend pool
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/mmp/mmp.cfg
. $STF_SUITE/tests/functional/mmp/mmp.kshlib

verify_runnable "both"

# Note pool name must contain 'test' to allow pool clear to resume pool io
MMP_POOL="test-mmp-pool"

function cleanup
{
log_must zinject -c all

if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then
# Note this clear works for suspended small test pools
log_must zpool clear $MMP_POOL
zpool get state $MMP_POOL $MMP_DIR/file.3
zpool events | grep ".fs.zfs." | grep -v "history_event"
fi

log_must zpool destroy $MMP_POOL
log_must rm -r $MMP_DIR
log_must mmp_clear_hostid
}

log_assert "A long VDEV probe doesn't cause a MMP check suspend"
log_onexit cleanup

MMP_HISTORY=/proc/spl/kstat/zfs/$MMP_POOL/multihost

# Create a multiple drive pool
log_must zpool events -c
log_must mkdir -p $MMP_DIR
log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5}
log_must zpool create -f $MMP_POOL \
mirror $MMP_DIR/file.{0,1,2} \
mirror $MMP_DIR/file.{3,4,5}

# Enable MMP
log_must mmp_set_hostid $HOSTID1
log_must zpool set multihost=on $MMP_POOL
set_tunable64 MULTIHOST_HISTORY 0
set_tunable64 MULTIHOST_HISTORY 40

# Inject vdev write error along with a delay
log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL
log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL
log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL

log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5
sleep 10
log_must zpool sync

# Confirm mmp writes to the non-slow disks have taken place
for x in {0,1,2,4}; do
write_count=$(grep -c file.${x} $MMP_HISTORY)
[[ $write_count -gt 0 ]] || log_fail "expecting mmp writes"
done

# Expect that the pool was not suspended
log_must check_state $MMP_POOL "" "ONLINE"
health=$(zpool list -H -o health $MMP_POOL)
log_note "$MMP_POOL health is $health"
[[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected"

log_pass "A long VDEV probe doesn't cause a MMP check suspend"

0 comments on commit 0757ab9

Please sign in to comment.