Skip to content

Commit

Permalink
Add scrub after resilver zed script
Browse files Browse the repository at this point in the history
* Add a zed script to kick off a scrub after a resilver.  The script is
disabled by default.

* Tweak the resilver_finish event timing so that it happens after the
bad disk has been detached.  Previously you would see the
resilver_finish event and then the vdev_detach event.

* Add a test mode (-t) option to zed to allow it to use the native
paths to the ZFS utilities.  This is needed when you're running zed
under the ZTS in a local workspace.

Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes: openzfs#4662
  • Loading branch information
tonyhutter committed Jan 25, 2018
1 parent 522db29 commit 9901d20
Show file tree
Hide file tree
Showing 13 changed files with 182 additions and 19 deletions.
16 changes: 16 additions & 0 deletions cmd/zed/zed.d/resilver_finish-start-scrub.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/sh
# resilver_finish-start-scrub.sh
# Run a scrub after a resilver
#
# The zedlet only acts when ZED_SCRUB_AFTER_RESILVER=1 is set in zed.rc;
# otherwise it exits without touching the pool.
#
# Exit codes:
# 2: scrub after resilver is not enabled in zed.rc
# 9: internal error
#
[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
. "${ZED_ZEDLET_DIR}/zed-functions.sh"

# Honor the zed.rc switch, which ships commented out (i.e. disabled).
[ "${ZED_SCRUB_AFTER_RESILVER}" = "1" ] || exit 2

[ -n "${ZEVENT_POOL}" ] || exit 9
[ -n "${ZEVENT_SUBCLASS}" ] || exit 9
zed_check_cmd "${ZPOOL}" || exit 9

zed_log_msg "Starting scrub after resilver on ${ZEVENT_POOL}"
# Quote both words so the command path and the pool name are each passed
# as a single argument, with no word splitting or globbing.
"${ZPOOL}" scrub "${ZEVENT_POOL}"
3 changes: 3 additions & 0 deletions cmd/zed/zed.d/zed.rc
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@
#
ZED_USE_ENCLOSURE_LEDS=1

##
# Run a scrub after every resilver
#ZED_SCRUB_AFTER_RESILVER=1

##
# The syslog priority (e.g., specified as a "facility.level" pair).
Expand Down
7 changes: 6 additions & 1 deletion cmd/zed/zed_conf.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ _zed_conf_display_help(const char *prog, int got_err)
"Run daemon in the foreground.");
fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-M",
"Lock all pages in memory.");
fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-t",
"Testing mode (only used by ZTS).");
fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-Z",
"Zero state file.");
fprintf(fp, "\n");
Expand Down Expand Up @@ -247,7 +249,7 @@ _zed_conf_parse_path(char **resultp, const char *path)
void
zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv)
{
const char * const opts = ":hLVc:d:p:s:vfFMZ";
const char * const opts = ":hLVc:d:p:s:vftFMZ";
int opt;

if (!zcp || !argv || !argv[0])
Expand Down Expand Up @@ -290,6 +292,9 @@ zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv)
case 'M':
zcp->do_memlock = 1;
break;
case 't':
zcp->do_testmode = 1;
break;
case 'Z':
zcp->do_zero = 1;
break;
Expand Down
1 change: 1 addition & 0 deletions cmd/zed/zed_conf.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ struct zed_conf {
unsigned do_memlock:1; /* true if locking memory */
unsigned do_verbose:1; /* true if verbosity enabled */
unsigned do_zero:1; /* true if zeroing state */
unsigned do_testmode:1; /* true if testmode is set */
int syslog_facility; /* syslog facility value */
int min_events; /* RESERVED FOR FUTURE USE */
int max_events; /* RESERVED FOR FUTURE USE */
Expand Down
34 changes: 30 additions & 4 deletions cmd/zed/zed_event.c
Original file line number Diff line number Diff line change
Expand Up @@ -733,12 +733,14 @@ _zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp)

/*
* Restrict various environment variables to safe and sane values
* when constructing the environment for the child process.
* when constructing the environment for the child process, unless
* we're running in testmode (like under the ZFS test suite).
*
* Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1.
*/
static void
_zed_event_add_env_restrict(uint64_t eid, zed_strings_t *zsp)
_zed_event_add_env_restrict(uint64_t eid, zed_strings_t *zsp,
boolean_t testmode)
{
const char *env_restrict[][2] = {
{ "IFS", " \t\n" },
Expand All @@ -753,11 +755,35 @@ _zed_event_add_env_restrict(uint64_t eid, zed_strings_t *zsp)
{ "ZFS_RELEASE", ZFS_META_RELEASE },
{ NULL, NULL }
};

/*
* In test mode, use the ZFS binaries from $PATH instead of the
* hard-coded ones.
*/
const char *env_testmode[][2] = {
{ "IFS", " \t\n" },
{ "PATH", NULL }, /* $PATH copied in later on */
{ "ZDB", "zdb" },
{ "ZED", "zed" },
{ "ZFS", "zfs" },
{ "ZINJECT", "zinject" },
{ "ZPOOL", "zpool" },
{ "ZFS_ALIAS", ZFS_META_ALIAS },
{ "ZFS_VERSION", ZFS_META_VERSION },
{ "ZFS_RELEASE", ZFS_META_RELEASE },
{ NULL, NULL }
};
const char *(*pa)[2];

assert(zsp != NULL);

for (pa = env_restrict; *(*pa); pa++) {
pa = testmode ? env_testmode : env_restrict;

for (; *(*pa); pa++) {
/* In testmode, use our native $PATH */
if (testmode && strcmp((*pa)[0], "PATH") == 0)
(*pa)[1] = getenv("PATH");

_zed_event_add_var(eid, zsp, NULL, (*pa)[0], "%s", (*pa)[1]);
}
}
Expand Down Expand Up @@ -902,7 +928,7 @@ zed_event_service(struct zed_conf *zcp)
while ((nvp = nvlist_next_nvpair(nvl, nvp)))
_zed_event_add_nvpair(eid, zsp, nvp);

_zed_event_add_env_restrict(eid, zsp);
_zed_event_add_env_restrict(eid, zsp, zcp->do_testmode);
_zed_event_add_env_preserve(eid, zsp);

_zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "PID",
Expand Down
26 changes: 18 additions & 8 deletions module/zfs/dsl_scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -804,17 +804,27 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
if (complete) {
spa_event_notify(spa, NULL, NULL,
scn->scn_phys.scn_min_txg ?
ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
/*
* scn->scn_phys.scn_min_txg == 0 means we finished
* a scrub. scn_phys.scn_min_txg != 0 means we
* finished a resilver of the new disk.
*
* The actual resilver_finish event happens later in
* spa_async_thread() after the old vdev is removed.
*/
if (scn->scn_phys.scn_min_txg == 0)
spa_event_notify(spa, NULL, NULL,
ESC_ZFS_SCRUB_FINISH);
}
spa_errlog_rotate(spa);

/*
* We may have finished replacing a device.
* Let the async thread assess this and handle the detach.
*/
spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
if (complete && scn->scn_phys.scn_min_txg != 0) {
/*
* We may have finished replacing a device.
* Let the async thread assess this and handle the detach.
*/
spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
}
}

scn->scn_phys.scn_end_time = gethrestime_sec();
Expand Down
4 changes: 3 additions & 1 deletion module/zfs/spa.c
Original file line number Diff line number Diff line change
Expand Up @@ -6160,8 +6160,10 @@ spa_async_thread(void *arg)
/*
* If any devices are done replacing, detach them.
*/
if (tasks & SPA_ASYNC_RESILVER_DONE)
if (tasks & SPA_ASYNC_RESILVER_DONE) {
spa_vdev_resilver_done(spa);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_RESILVER_FINISH);
}

/*
* Kick off a resilver.
Expand Down
3 changes: 2 additions & 1 deletion tests/runfiles/linux.run
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,8 @@ tags = ['functional', 'exec']

[tests/functional/fault]
tests = ['auto_online_001_pos', 'auto_replace_001_pos', 'auto_spare_001_pos',
'auto_spare_002_pos', 'auto_spare_ashift', 'auto_spare_multiple']
'auto_spare_002_pos', 'auto_spare_ashift', 'auto_spare_multiple',
'scrub_after_resilver']
tags = ['functional', 'fault']

[tests/functional/features/async_destroy]
Expand Down
34 changes: 33 additions & 1 deletion tests/zfs-tests/include/libtest.shlib
Original file line number Diff line number Diff line change
Expand Up @@ -3053,9 +3053,32 @@ function wait_replacing #pool
done
}

#
# Wait for a pool to be scrubbed
#
# $1 pool name (optional, defaults to $TESTPOOL)
# $2 number of seconds to wait (optional, defaults to 10)
#
# Polls is_pool_scrubbed once per second, at most $2 times.
#
# Returns true when pool has been scrubbed, or false if there's a timeout or if
# no scrub was done.
#
function wait_scrubbed
{
	typeset pool=${1:-$TESTPOOL}
	typeset iter=${2:-10}
	# Keep the counter local so callers' variables are not clobbered.
	typeset i

	# An arithmetic loop runs exactly $iter polls; a "{1..$iter}" range
	# depends on brace expansion of a variable and counts downwards when
	# $iter is smaller than 1.
	for ((i = 0; i < iter; i++)); do
		if is_pool_scrubbed $pool ; then
			return 0
		fi
		sleep 1
	done
	return 1
}

#
# Setup custom environment for the ZED.
#
# $1 Optional zedlet script to copy into our zedlet test directory.
function zed_setup
{
if ! is_linux; then
Expand All @@ -3073,6 +3096,7 @@ function zed_setup
if [[ -e $VDEVID_CONF_ETC ]]; then
log_fail "Must not have $VDEVID_CONF_ETC file present on system"
fi
EXTRA_ZEDLET="$1"

# Create a symlink for /etc/zfs/vdev_id.conf file.
log_must ln -s $VDEVID_CONF $VDEVID_CONF_ETC
Expand All @@ -3081,6 +3105,9 @@ function zed_setup
# add additional ZEDLETs as needed for their specific test.
log_must cp ${ZEDLET_ETC_DIR}/zed.rc $ZEDLET_DIR
log_must cp ${ZEDLET_ETC_DIR}/zed-functions.sh $ZEDLET_DIR
if [[ ! -z "$EXTRA_ZEDLET" ]] ; then
log_must cp ${ZEDLET_ETC_DIR}/$EXTRA_ZEDLET $ZEDLET_DIR
fi

# Customize the zed.rc file to enable the full debug log.
log_must sed -i '/\#ZED_DEBUG_LOG=.*/d' $ZEDLET_DIR/zed.rc
Expand All @@ -3097,17 +3124,22 @@ function zed_setup
#
# Cleanup custom ZED environment.
#
# $1 Optional zedlet script(s) to remove from our zedlet test directory.
function zed_cleanup
{
if ! is_linux; then
return
fi
EXTRA_ZEDLET="$1"

log_must rm -f ${ZEDLET_DIR}/zed.rc
log_must rm -f ${ZEDLET_DIR}/zed-functions.sh
log_must rm -f ${ZEDLET_DIR}/all-syslog.sh
log_must rm -f ${ZEDLET_DIR}/all-debug.sh
log_must rm -f ${ZEDLET_DIR}/state
if [[ ! -z "$EXTRA_ZEDLET" ]] ; then
log_must rm -f ${ZEDLET_DIR}/$EXTRA_ZEDLET
fi
log_must rm -f $ZED_LOG
log_must rm -f $ZED_DEBUG_LOG
log_must rm -f $VDEVID_CONF_ETC
Expand Down Expand Up @@ -3139,7 +3171,7 @@ function zed_start
# run ZED in the background and redirect foreground logging
# output to $ZED_LOG.
log_must truncate -s 0 $ZED_DEBUG_LOG
log_must eval "zed -vF -d $ZEDLET_DIR -p $ZEDLET_DIR/zed.pid" \
log_must eval "zed -vF -t -d $ZEDLET_DIR -p $ZEDLET_DIR/zed.pid" \
"-s $ZEDLET_DIR/state 2>$ZED_LOG &"

return 0
Expand Down
3 changes: 2 additions & 1 deletion tests/zfs-tests/tests/functional/fault/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ dist_pkgdata_SCRIPTS = \
auto_spare_001_pos.ksh \
auto_spare_002_pos.ksh \
auto_spare_ashift.ksh \
auto_spare_multiple.ksh
auto_spare_multiple.ksh \
scrub_after_resilver.ksh
2 changes: 1 addition & 1 deletion tests/zfs-tests/tests/functional/fault/cleanup.ksh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,6 @@ verify_runnable "global"
cleanup_devices $DISKS

zed_stop
zed_cleanup
zed_cleanup resilver_finish-start-scrub.sh

log_pass
66 changes: 66 additions & 0 deletions tests/zfs-tests/tests/functional/fault/scrub_after_resilver.ksh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#

#
# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
# All rights reserved.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/fault/fault.cfg

#
# DESCRIPTION:
# Test the scrub after resilver zedlet
#
# STRATEGY:
# 1. Create a mirrored pool
# 2. Fault a disk
# 3. Replace the disk, starting a resilver
# 4. Verify that a scrub happens after the resilver finishes
#

log_assert "Testing the scrub after resilver zedlet"

# Backup our zed.rc: keep its current contents in a shell variable so that
# cleanup() can write them back once the test is over.
old_zedrc="$(cat $ZEDLET_DIR/zed.rc)"

# Enable ZED_SCRUB_AFTER_RESILVER by stripping the leading '#' from the
# commented-out setting in the test copy of zed.rc.
log_must sed -i 's/\#ZED_SCRUB_AFTER_RESILVER/ZED_SCRUB_AFTER_RESILVER/g' $ZEDLET_DIR/zed.rc

# Undo the zed.rc change made by this test, then run the default cleanup.
function cleanup
{
	# Restore our zed.rc.  The redirection must be part of the command
	# that log_must runs (hence the eval): written as
	# "log_must echo ... > file" it applies to log_must itself, so
	# anything log_must prints on stdout would end up in zed.rc too.
	log_must eval "echo \"\$old_zedrc\" > $ZEDLET_DIR/zed.rc"

	default_cleanup
}

# Register cleanup() with the test harness so zed.rc gets restored.
# NOTE(review): presumably invoked on every exit path -- confirm in logapi.
log_onexit cleanup

# NOTE(review): presumably checks that $DISKS names at least 3 disks and then
# builds the mirrored test pool from them -- confirm in libtest.shlib.
verify_disk_count "$DISKS" 3
default_mirror_setup_noexit $DISKS

# Fault $DISK1 (STRATEGY step 2) and detach $DISK2 from the mirror so that it
# is free to be used as the replacement device below.
log_must zpool offline -f $TESTPOOL $DISK1
log_must zpool detach $TESTPOOL $DISK2

# Write to our degraded pool so we have some data to resilver
log_must mkfile 16M $TESTDIR/file1

# Replace the faulted $DISK1 with $DISK2, forcing a resilver
log_must zpool replace $TESTPOOL $DISK1 $DISK2

# Wait for the resilver to finish, and then the subsequent scrub to finish.
# Waiting for the scrub has the effect of waiting for both. Timeout after 10
# seconds if nothing is happening.
log_must wait_scrubbed $TESTPOOL 10
log_pass "Successfully ran the scrub after resilver zedlet"
2 changes: 1 addition & 1 deletion tests/zfs-tests/tests/functional/fault/setup.ksh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

verify_runnable "global"

zed_setup
zed_setup resilver_finish-start-scrub.sh
zed_start

log_pass

0 comments on commit 9901d20

Please sign in to comment.