Skip to content

Commit

Permalink
zpool import progress reports for long imports
Browse files Browse the repository at this point in the history
When an import requires a long MMP activity check, or when the user
requests pool recovery, the import make take a long time.  The user may
not know why, or be able to tell whether the import is progressing or is
hung.

Add a new option to "zpool import", "-v" (for verbose).

When the -v option is used, create an additional thread which
periodically checks the contents of kstat/zfs/import_progress.  If the
import does not finish quickly, it reports the following on stdout:
* Name and guid of pool being imported
* User-friendly version of spa_load_state
* Expected time import will complete
* the max txg if recovering the pool

If one or more of these values change, the thread prints a new record.

A new record will be printed to stdout with a maximum frequency of one
per second so as not to spam the user.  As a result the printed output
may reflect only some of the import states transitioned through.

Use import kstat to check for Multihost activity check in relevant
tests, instead of using import duration, which works poorly when testing
on a slow machine.

Sample output:

	Pool tank1 (guid 4591991398949870326):
	Checking for a remote import.
	Check will complete by Tue Apr 16 08:43:58 2019

	Pool tank1 (guid 4591991398949870326):
	Checking for a remote import.
	Check will complete by Tue Apr 16 08:44:43 2019
	Recovering Pool max txg 745

Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
  • Loading branch information
ofaaland committed Jul 21, 2021
1 parent a7bd20e commit ddcc1c1
Show file tree
Hide file tree
Showing 3 changed files with 205 additions and 9 deletions.
181 changes: 180 additions & 1 deletion cmd/zpool/zpool_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -3398,6 +3398,152 @@ zpool_do_checkpoint(int argc, char **argv)

#define CHECKPOINT_OPT 1024

/*
* User-friendly strings for spa_load_state_t
*/
static char *load_state_names[] = {
[SPA_LOAD_NONE] = "Import Complete",
[SPA_LOAD_OPEN] = "Importing",
[SPA_LOAD_IMPORT] = "Importing",
[SPA_LOAD_TRYIMPORT] = "Verifying Configuration",
[SPA_LOAD_RECOVER] = "Recovering Pool",
[SPA_LOAD_ERROR] = "Import Error",
[SPA_LOAD_CREATE] = "Importing"
};

typedef struct import_progress_args {
boolean_t import_done;
} import_progress_args_t;

/*
* SILENT_SECS is amount of time we think user will wait before
* thinking something is broken. SLEEP_USEC is time between
* checks to see if import is done or state changed.
*/
#define IMPORT_PROGRESS_SILENT_SECS 2
#define IMPORT_PROGRESS_PERIOD_SECS 1
#define IMPORT_PROGRESS_SLEEP_USEC 100*1000

static void *
report_import_progress_thread(void *arg)
{
char path[] = "/proc/spl/kstat/zfs/import_progress";
import_progress_args_t *pa = arg;
time_t start_time = time(NULL);
time_t last_report = 0;
unsigned long long last_pool_guid = 0;
unsigned long long last_multihost_secs_time = 0;
unsigned long long last_max_txg = 0;
int rc = 0;
int import_kstat;

ASSERT(arg);

import_kstat = open(path, O_RDONLY);
if (import_kstat < 0)
rc = -1;

while (pa->import_done == B_FALSE && rc >= 0) {
char *pool_name;
ssize_t count;
unsigned long long load_state;
unsigned long long pool_guid;
unsigned long long multihost_secs;
unsigned long long max_txg;
char kstat_buf[512];
char mmp_status_buf[100];
char pool_rewind_buf[100];
char *bufp;
off_t off;

if ((time(NULL) - start_time) < IMPORT_PROGRESS_SILENT_SECS)
continue;

if ((time(NULL) - last_report) < IMPORT_PROGRESS_PERIOD_SECS)
continue;

last_report = time(NULL);

off = lseek(import_kstat, 0, SEEK_SET);
if (off < 0) {
rc = -1;
break;
}

/* Read into a buffer first for debugging parse failure */
bzero(kstat_buf, sizeof (kstat_buf));
count = read(import_kstat, kstat_buf, sizeof (kstat_buf) - 1);
if (count < 0) {
fprintf(stderr, "Import monitor: read kstat fail %d\n",
errno);
continue;
}

/* Skip header line */
for (bufp = kstat_buf; *bufp && *bufp != '\n'; bufp++)
;

count = sscanf(bufp, "%llu %llu %llu %llu %ms\n",
&pool_guid, &load_state, &multihost_secs,
&max_txg, &pool_name);

if (count == 5) {
time_t multihost_secs_time = time(NULL) +
multihost_secs;

mmp_status_buf[0] = '\0';
pool_rewind_buf[0] = '\0';

if (pool_guid != last_pool_guid ||
multihost_secs_time != last_multihost_secs_time)
snprintf(mmp_status_buf,
sizeof (mmp_status_buf), "Multihost is "
"checking for a remote import.\nCheck will "
"complete by %s",
ctime(&multihost_secs_time));

if ((pool_guid != last_pool_guid ||
max_txg != last_max_txg) &&
max_txg != 0) {
snprintf(pool_rewind_buf,
sizeof (pool_rewind_buf), "%s rewind txg "
"%llu\n", load_state_names[load_state],
max_txg);
}

if (mmp_status_buf[0] != '\0' ||
pool_rewind_buf[0] != '\0') {
printf("Pool %s (guid %llu):\n%s%s\n",
pool_name, pool_guid, mmp_status_buf,
pool_rewind_buf);
}

last_pool_guid = pool_guid;
last_multihost_secs_time = multihost_secs_time;
last_max_txg = max_txg;

if (pool_name)
free(pool_name);
} else {
fprintf(stderr, "Import monitor failure: "
"unable to parse %s\n", kstat_buf);
}

usleep(IMPORT_PROGRESS_SLEEP_USEC);
}

if (rc < 0) {
fprintf(stderr, "%s is not available to monitor the "
"progress of the import. The import is "
"continuing.\n", path);
}

if (import_kstat > 0)
close(import_kstat);

return (NULL);
}

/*
* zpool import [-d dir] [-D]
* import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l]
Expand Down Expand Up @@ -3450,6 +3596,9 @@ zpool_do_checkpoint(int argc, char **argv)
* --rewind-to-checkpoint
* Import the pool and revert back to the checkpoint.
*
* -v Report import progress periodically, for slow imports.
*
*
* The import command scans for pools to import, and import pools based on pool
* name and GUID. The pool can also be renamed as part of the import process.
*/
Expand All @@ -3476,20 +3625,24 @@ zpool_do_import(int argc, char **argv)
boolean_t do_rewind = B_FALSE;
boolean_t xtreme_rewind = B_FALSE;
boolean_t do_scan = B_FALSE;
boolean_t do_verbose = B_FALSE;
boolean_t pool_exists = B_FALSE;
boolean_t pool_specified = B_FALSE;
uint64_t txg = -1ULL;
char *cachefile = NULL;
importargs_t idata = { 0 };
char *endptr;

pthread_t tid;
import_progress_args_t pa = { B_FALSE };

struct option long_options[] = {
{"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT},
{0, 0, 0, 0}
};

/* check options */
while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX",
while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:vVX",
long_options, NULL)) != -1) {
switch (c) {
case 'a':
Expand Down Expand Up @@ -3563,6 +3716,9 @@ zpool_do_import(int argc, char **argv)
}
rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND;
break;
case 'v':
do_verbose = B_TRUE;
break;
case 'V':
flags |= ZFS_IMPORT_VERBATIM;
break;
Expand Down Expand Up @@ -3711,6 +3867,15 @@ zpool_do_import(int argc, char **argv)
idata.scan = do_scan;
idata.policy = policy;

if (do_verbose) {
int pthread_err;
pthread_err = pthread_create(&tid, NULL,
report_import_progress_thread, &pa);
if (err)
fprintf(stderr, "Unable to report progress: "
"err %d. Import will continue.\n", pthread_err);
}

pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops);

if (pools != NULL && pool_exists &&
Expand Down Expand Up @@ -3773,6 +3938,20 @@ zpool_do_import(int argc, char **argv)
do_all, &idata);
}

if (do_verbose) {
pa.import_done = B_TRUE;
(void) pthread_cancel(tid);
(void) pthread_join(tid, NULL);
}

/*
* If we were just looking for pools, report an error if none were
* found.
*/
if (argc == 0)
(void) fprintf(stderr,
gettext("no pools available to import\n"));

error:
nvlist_free(props);
nvlist_free(pools);
Expand Down
26 changes: 18 additions & 8 deletions tests/zfs-tests/tests/functional/mmp/mmp.kshlib
Original file line number Diff line number Diff line change
Expand Up @@ -206,36 +206,46 @@ function import_no_activity_check # pool opts
{
typeset pool=$1
typeset opts=$2

typeset max_duration=$((MMP_TEST_DURATION_DEFAULT-1))
typeset tmpfile=$(mktemp)

SECONDS=0
zpool import $opts $pool
zpool import $opts -v $pool > $tmpfile 2>&1
typeset rc=$?
cat $tmpfile

if [[ $SECONDS -gt $max_duration ]]; then
log_fail "ERROR: import_no_activity_check unexpected activity \
check (${SECONDS}s gt $max_duration)"
if grep "Multihost is checking" $tmpfile; then
log_note $(grep "Multihost is checking" $tmpfile)
log_fail "ERROR: import_no_activity_check unexpected \
activity check reported"
fi

rm $tmpfile
return $rc
}

function import_activity_check # pool opts act_test_duration
{
typeset pool=$1
typeset opts=$2
typeset min_duration=${3:-$MMP_TEST_DURATION_DEFAULT}
typeset min_duration=${4:-$MMP_TEST_DURATION_DEFAULT}
typeset tmpfile=$(mktemp)

SECONDS=0
zpool import $opts $pool
zpool import -v $opts $pool > $tmpfile 2>&1
typeset rc=$?
cat $tmpfile

if [[ $SECONDS -le $min_duration ]]; then
log_fail "ERROR: import_activity_check expected activity check \
(${SECONDS}s le min_duration $min_duration)"
fi

if ! grep "use .zpool import -f" $tmpfile && ! grep "Multihost is checking" $tmpfile; then
log_fail "ERROR: import_activity_check expected \
Multihost status message"
fi

rm $tmpfile
return $rc
}

Expand Down
7 changes: 7 additions & 0 deletions tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,22 @@ for opt in "" "-f"; do
done

# 3. Verify multihost=off and hostids differ (no activity check)
log_note "3. Verify multihost=off and hostids differ"
log_must zpool export -F $TESTPOOL
log_must mmp_clear_hostid
log_must mmp_set_hostid $HOSTID2
log_mustnot import_no_activity_check $TESTPOOL ""
log_must import_no_activity_check $TESTPOOL "-f"

# 4. Verify multihost=off and hostid zero allowed (no activity check)
log_note "4. Verify multihost=off and hostid zero allowed"
log_must zpool export -F $TESTPOOL
log_must mmp_clear_hostid
log_mustnot import_no_activity_check $TESTPOOL ""
log_must import_no_activity_check $TESTPOOL "-f"

# 5. Verify multihost=on and hostids match (no activity check)
log_note "5. Verify multihost=on and hostids match"
log_must mmp_pool_set_hostid $TESTPOOL $HOSTID1
log_must zpool set multihost=on $TESTPOOL

Expand All @@ -84,23 +87,27 @@ for opt in "" "-f"; do
done

# 6. Verify multihost=on and hostids differ (activity check)
log_note "6. Verify multihost=on and hostids differ"
log_must zpool export -F $TESTPOOL
log_must mmp_clear_hostid
log_must mmp_set_hostid $HOSTID2
log_mustnot import_activity_check $TESTPOOL ""
log_must import_activity_check $TESTPOOL "-f"

# 7. Verify mmp_write and mmp_fail are set correctly
log_note "7. Verify mmp_write and mmp_fail are set correctly"
log_must zpool export -F $TESTPOOL
log_must verify_mmp_write_fail_present ${DISK[0]}

# 8. Verify multihost=on and hostid zero fails (no activity check)
log_note "8. Verify multihost=on and hostid zero fails (no activity check)"
log_must mmp_clear_hostid
MMP_IMPORTED_MSG="Set a unique system hostid"
log_must check_pool_import $TESTPOOL "-f" "action" "$MMP_IMPORTED_MSG"
log_mustnot import_no_activity_check $TESTPOOL "-f"

# 9. Verify activity check duration based on mmp_write and mmp_fail
log_note "9. Verify activity check duration based on mmp_write and mmp_fail"
# Specify a short test via tunables but import pool imported while
# tunables set to default duration.
log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_MIN
Expand Down

0 comments on commit ddcc1c1

Please sign in to comment.