diff --git a/include/sys/spa.h b/include/sys/spa.h index 08099cd4fa29..265469ddd076 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -769,7 +769,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 -#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_FAULT_VDEV 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index f39ebf031cea..cb4d81a5e14d 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -274,7 +274,7 @@ struct vdev { txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ - boolean_t vdev_probe_wanted; /* async probe wanted? */ + boolean_t vdev_fault_wanted; /* async faulted wanted? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b144d0652930..1379083fb82f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8738,15 +8738,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd) } static void -spa_async_probe(spa_t *spa, vdev_t *vd) +spa_async_fault_vdev(spa_t *spa, vdev_t *vd) { - if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = B_FALSE; - vdev_reopen(vd); /* vdev_open() does the actual probe */ + if (vd->vdev_fault_wanted) { + vd->vdev_fault_wanted = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); } for (int c = 0; c < vd->vdev_children; c++) - spa_async_probe(spa, vd->vdev_child[c]); + spa_async_fault_vdev(spa, vd->vdev_child[c]); } static void @@ -8834,11 +8835,11 @@ spa_async_thread(void *arg) } /* - * See if any devices need to be probed. + * See if any devices need to be marked faulted. */ - if (tasks & SPA_ASYNC_PROBE) { + if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); - spa_async_probe(spa, spa->spa_root_vdev); + spa_async_fault_vdev(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index ebba453e2b14..862015f70b71 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1664,6 +1664,7 @@ vdev_metaslab_fini(vdev_t *vd) typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; + boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; @@ -1709,6 +1710,17 @@ vdev_probe_done(zio_t *zio) (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); + + /* + * If this probe was initiated from zio pipeline, then + * change the state in a spa_async_request. Probes that + * were initiated from a vdev_open can change the state + * as part of the open call. + */ + if (vps->vps_zio_done_probe) { + vd->vdev_fault_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); + } } mutex_enter(&vd->vdev_probe_lock); @@ -1759,6 +1771,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; + vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -1785,15 +1798,6 @@ vdev_probe(vdev_t *vd, zio_t *zio) vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); - - /* - * We can't change the vdev state in this context, so we - * kick off an async task to do it on our behalf. - */ - if (zio != NULL) { - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(spa, SPA_ASYNC_PROBE); - } } if (zio != NULL) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index b2b06881bdd4..d4b60f7c1614 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -5763,6 +5763,24 @@ zfs_ioc_error_log(zfs_cmd_t *zc) return (error); } +/* + * Is this a test pool? + * + * Used by ZTS functional/mmp/mmp_write_slow_disk to tear down a + * suspended mmp pool. + */ +static boolean_t +spa_is_test_pool(spa_t *spa) +{ + if (strstr(spa_name(spa), "test") == NULL) + return (B_FALSE); + + uint64_t size = metaslab_class_get_space(spa_normal_class(spa)); + size += metaslab_class_get_space(spa_embedded_log_class(spa)); + + return (size <= 1024 * 1024 * 1024); +} + static int zfs_ioc_clear(zfs_cmd_t *zc) { @@ -5815,9 +5833,9 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * If multihost is enabled, resuming I/O is unsafe as another - * host may have imported the pool. + * host may have imported the pool. (allowed for mini test pool) */ - if (spa_multihost(spa) && spa_suspended(spa)) + if (spa_multihost(spa) && spa_suspended(spa) && !spa_is_test_pool(spa)) return (SET_ERROR(EINVAL)); spa_vdev_state_enter(spa, SCL_NONE); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 213fe5c483f2..ba7774413843 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2532,8 +2532,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); - cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " - "failure and has been suspended.\n", spa_name(spa)); + if (reason != ZIO_SUSPEND_MMP) { + cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " + "I/O failure and has been suspended.\n", spa_name(spa)); + } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 609182f4a2cd..66ad72fb88e9 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -605,9 +605,11 @@ zio_handle_io_delay(zio_t *zio) if (vd->vdev_guid != handler->zi_record.zi_guid) continue; + /* also match on I/O type (e.g., -T read) */ if (handler->zi_record.zi_iotype != ZIO_TYPES && - handler->zi_record.zi_iotype != zio->io_type) - continue; + handler->zi_record.zi_iotype != zio->io_type) { + continue; + } /* * Defensive; should never happen as the array allocation diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index a0b74ef4a8c6..92ce09ec6fcb 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -146,7 +146,7 @@ tags = ['functional', 'mmap'] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', - 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] + 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk'] tags = ['functional', 'mmp'] [tests/functional/mount:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index fe9c92108725..5bb933d44875 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1587,6 +1587,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/mmp/mmp_on_zdb.ksh \ functional/mmp/mmp_reset_interval.ksh \ functional/mmp/mmp_write_distribution.ksh \ + functional/mmp/mmp_write_slow_disk.ksh \ functional/mmp/mmp_write_uberblocks.ksh \ functional/mmp/multihost_history.ksh \ functional/mmp/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh new file mode 100755 index 000000000000..3fb11b9a5a7d --- /dev/null +++ b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh @@ -0,0 +1,102 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara Inc +# + +# DESCRIPTION: +# Verify that long VDEV probes do not cause MMP checks to suspend pool +# Note: without PR-15839 fix, this test will suspend the pool. +# +# A device that is returning unexpeced errors will trigger a vdev_probe. +# When the device additionally has slow response times, the probe can hold +# the spa config loack as a writer for a long period of time such that the +# mmp uberblock updates stall when trying to aquire the spa config lock. +# +# STRATEGY: +# 1. Create a pool with multiple leaf vdevs +# 2. Enable multihost and multihost_history +# 3. Delay for MMP writes to occur +# 4. Verify that a long VDEV probe didn't cause MMP check to suspend pool +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/mmp/mmp.cfg +. $STF_SUITE/tests/functional/mmp/mmp.kshlib + +verify_runnable "both" + +# Note pool name must contain 'test' to allow pool clear to resume pool io +MMP_POOL="test-mmp-pool" + +function cleanup +{ + log_must zinject -c all + + if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then + # Note this clear works for suspended small test pools + log_must zpool clear $MMP_POOL + zpool get state $MMP_POOL $MMP_DIR/file.3 + zpool events | grep ".fs.zfs." | grep -v "history_event" + fi + + log_must zpool destroy $MMP_POOL + log_must rm -r $MMP_DIR + log_must mmp_clear_hostid +} + +log_assert "A long VDEV probe doesn't cause a MMP check suspend" +log_onexit cleanup + +MMP_HISTORY=/proc/spl/kstat/zfs/$MMP_POOL/multihost + +# Create a multiple drive pool +log_must zpool events -c +log_must mkdir -p $MMP_DIR +log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5} +log_must zpool create -f $MMP_POOL \ + mirror $MMP_DIR/file.{0,1,2} \ + mirror $MMP_DIR/file.{3,4,5} + +# Enable MMP +log_must mmp_set_hostid $HOSTID1 +log_must zpool set multihost=on $MMP_POOL +set_tunable64 MULTIHOST_HISTORY 0 +set_tunable64 MULTIHOST_HISTORY 40 + +# Inject vdev write error along with a delay +log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL +log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL +log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL + +log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5 +sleep 10 +log_must zpool sync + +# Confirm mmp writes to the non-slow disks have taken place +for x in {0,1,2,4}; do + write_count=$(grep -c file.${x} $MMP_HISTORY) + [[ $write_count -gt 0 ]] || log_fail "expecting mmp writes" +done + +# Expect that the pool was not suspended +log_must check_state $MMP_POOL "" "ONLINE" +health=$(zpool list -H -o health $MMP_POOL) +log_note "$MMP_POOL health is $health" +[[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected" + +log_pass "A long VDEV probe doesn't cause a MMP check suspend"