Skip to content

Commit

Permalink
Parallel pool import
Browse files Browse the repository at this point in the history
This commit allow spa_load() to drop the spa_namespace_lock so
that imports can happen concurrently. Prior to dropping the
spa_namespace_lock, the import logic will set the spa_load_thread
value to track the thread which is doing the import.

Consumers of spa_lookup() retain the same behavior by blocking
when either a thread is holding the spa_namespace_lock or the
spa_load_thread value is set. This will ensure that critical
concurrent operations cannot take place while a pool is being
imported.

The zpool command is also enhanced to provide multi-threaded support
when invoking zpool import -a.

Lastly, zinject provides a mechanism to insert artificial delays
when importing a pool and new zfs tests are added to verify parallel
import functionality.

Contributions-by: Don Brady <don.brady@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: George Wilson <gwilson@delphix.com>
Closes openzfs#16093
  • Loading branch information
grwilson authored Apr 22, 2024
1 parent f4f1561 commit c183d16
Show file tree
Hide file tree
Showing 19 changed files with 818 additions and 72 deletions.
115 changes: 106 additions & 9 deletions cmd/zinject/zinject.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2024, Klara Inc.
* Copyright (c) 2023-2024, Klara Inc.
*/

/*
Expand Down Expand Up @@ -310,6 +310,11 @@ usage(void)
"\t\tcreate 3 lanes on the device; one lane with a latency\n"
"\t\tof 10 ms and two lanes with a 25 ms latency.\n"
"\n"
"\tzinject -P import|export -s <seconds> pool\n"
"\t\tAdd an artificial delay to a future pool import or export,\n"
"\t\tsuch that the operation takes a minimum of supplied seconds\n"
"\t\tto complete.\n"
"\n"
"\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
"\t\tCause the pool to stop writing blocks yet not\n"
"\t\treport errors for a duration. Simulates buggy hardware\n"
Expand Down Expand Up @@ -392,8 +397,10 @@ print_data_handler(int id, const char *pool, zinject_record_t *record,
{
int *count = data;

if (record->zi_guid != 0 || record->zi_func[0] != '\0')
if (record->zi_guid != 0 || record->zi_func[0] != '\0' ||
record->zi_duration != 0) {
return (0);
}

if (*count == 0) {
(void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s "
Expand Down Expand Up @@ -507,6 +514,33 @@ print_panic_handler(int id, const char *pool, zinject_record_t *record,
return (0);
}

static int
print_pool_delay_handler(int id, const char *pool, zinject_record_t *record,
void *data)
{
int *count = data;

if (record->zi_cmd != ZINJECT_DELAY_IMPORT &&
record->zi_cmd != ZINJECT_DELAY_EXPORT) {
return (0);
}

if (*count == 0) {
(void) printf("%3s %-19s %-11s %s\n",
"ID", "POOL", "DELAY (sec)", "COMMAND");
(void) printf("--- ------------------- -----------"
" -------\n");
}

*count += 1;

(void) printf("%3d %-19s %-11llu %s\n",
id, pool, (u_longlong_t)record->zi_duration,
record->zi_cmd == ZINJECT_DELAY_IMPORT ? "import": "export");

return (0);
}

/*
* Print all registered error handlers. Returns the number of handlers
* registered.
Expand Down Expand Up @@ -537,6 +571,13 @@ print_all_handlers(void)
count = 0;
}

(void) iter_handlers(print_pool_delay_handler, &count);
if (count > 0) {
total += count;
(void) printf("\n");
count = 0;
}

(void) iter_handlers(print_panic_handler, &count);

return (count + total);
Expand Down Expand Up @@ -609,9 +650,27 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
zc.zc_guid = flags;

if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
(void) fprintf(stderr, "failed to add handler: %s\n",
errno == EDOM ? "block level exceeds max level of object" :
strerror(errno));
const char *errmsg = strerror(errno);

switch (errno) {
case EDOM:
errmsg = "block level exceeds max level of object";
break;
case EEXIST:
if (record->zi_cmd == ZINJECT_DELAY_IMPORT)
errmsg = "pool already imported";
if (record->zi_cmd == ZINJECT_DELAY_EXPORT)
errmsg = "a handler already exists";
break;
case ENOENT:
/* import delay injector running on older zfs module */
if (record->zi_cmd == ZINJECT_DELAY_IMPORT)
errmsg = "import delay injector not supported";
break;
default:
break;
}
(void) fprintf(stderr, "failed to add handler: %s\n", errmsg);
return (1);
}

Expand All @@ -636,6 +695,9 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
} else if (record->zi_duration < 0) {
(void) printf(" txgs: %lld \n",
(u_longlong_t)-record->zi_duration);
} else if (record->zi_timer > 0) {
(void) printf(" timer: %lld ms\n",
(u_longlong_t)NSEC2MSEC(record->zi_timer));
} else {
(void) printf("objset: %llu\n",
(u_longlong_t)record->zi_objset);
Expand Down Expand Up @@ -834,7 +896,7 @@ main(int argc, char **argv)
}

while ((c = getopt(argc, argv,
":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) {
switch (c) {
case 'a':
flags |= ZINJECT_FLUSH_ARC;
Expand Down Expand Up @@ -952,6 +1014,19 @@ main(int argc, char **argv)
sizeof (record.zi_func));
record.zi_cmd = ZINJECT_PANIC;
break;
case 'P':
if (strcasecmp(optarg, "import") == 0) {
record.zi_cmd = ZINJECT_DELAY_IMPORT;
} else if (strcasecmp(optarg, "export") == 0) {
record.zi_cmd = ZINJECT_DELAY_EXPORT;
} else {
(void) fprintf(stderr, "invalid command '%s': "
"must be 'import' or 'export'\n", optarg);
usage();
libzfs_fini(g_zfs);
return (1);
}
break;
case 'q':
quiet = 1;
break;
Expand Down Expand Up @@ -1033,7 +1108,7 @@ main(int argc, char **argv)
argc -= optind;
argv += optind;

if (record.zi_duration != 0)
if (record.zi_duration != 0 && record.zi_cmd == 0)
record.zi_cmd = ZINJECT_IGNORED_WRITES;

if (cancel != NULL) {
Expand Down Expand Up @@ -1179,8 +1254,8 @@ main(int argc, char **argv)
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || device != NULL || record.zi_freq > 0 ||
dvas != 0) {
(void) fprintf(stderr, "panic (-p) incompatible with "
"other options\n");
(void) fprintf(stderr, "%s incompatible with other "
"options\n", "import|export delay (-P)");
usage();
libzfs_fini(g_zfs);
return (2);
Expand All @@ -1198,6 +1273,28 @@ main(int argc, char **argv)
if (argv[1] != NULL)
record.zi_type = atoi(argv[1]);
dataset[0] = '\0';
} else if (record.zi_cmd == ZINJECT_DELAY_IMPORT ||
record.zi_cmd == ZINJECT_DELAY_EXPORT) {
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || device != NULL || record.zi_freq > 0 ||
dvas != 0) {
(void) fprintf(stderr, "%s incompatible with other "
"options\n", "import|export delay (-P)");
usage();
libzfs_fini(g_zfs);
return (2);
}

if (argc != 1 || record.zi_duration <= 0) {
(void) fprintf(stderr, "import|export delay (-P) "
"injection requires a duration (-s) and a single "
"pool name\n");
usage();
libzfs_fini(g_zfs);
return (2);
}

(void) strlcpy(pool, argv[0], sizeof (pool));
} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || record.zi_freq > 0 || dvas != 0) {
Expand Down
72 changes: 55 additions & 17 deletions cmd/zpool/zpool_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <thread_pool.h>
#include <time.h>
#include <unistd.h>
#include <pwd.h>
Expand Down Expand Up @@ -3455,15 +3456,40 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
return (ret);
}

typedef struct import_parameters {
nvlist_t *ip_config;
const char *ip_mntopts;
nvlist_t *ip_props;
int ip_flags;
int *ip_err;
} import_parameters_t;

static void
do_import_task(void *arg)
{
import_parameters_t *ip = arg;
*ip->ip_err |= do_import(ip->ip_config, NULL, ip->ip_mntopts,
ip->ip_props, ip->ip_flags);
free(ip);
}


static int
import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
char *orig_name, char *new_name,
boolean_t do_destroyed, boolean_t pool_specified, boolean_t do_all,
importargs_t *import)
char *orig_name, char *new_name, importargs_t *import)
{
nvlist_t *config = NULL;
nvlist_t *found_config = NULL;
uint64_t pool_state;
boolean_t pool_specified = (import->poolname != NULL ||
import->guid != 0);


tpool_t *tp = NULL;
if (import->do_all) {
tp = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN),
0, NULL);
}

/*
* At this point we have a list of import candidate configs. Even if
Expand All @@ -3480,9 +3506,11 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,

verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
&pool_state) == 0);
if (!do_destroyed && pool_state == POOL_STATE_DESTROYED)
if (!import->do_destroyed &&
pool_state == POOL_STATE_DESTROYED)
continue;
if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
if (import->do_destroyed &&
pool_state != POOL_STATE_DESTROYED)
continue;

verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
Expand All @@ -3491,12 +3519,21 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
if (!pool_specified) {
if (first)
first = B_FALSE;
else if (!do_all)
else if (!import->do_all)
(void) fputc('\n', stdout);

if (do_all) {
err |= do_import(config, NULL, mntopts,
props, flags);
if (import->do_all) {
import_parameters_t *ip = safe_malloc(
sizeof (import_parameters_t));

ip->ip_config = config;
ip->ip_mntopts = mntopts;
ip->ip_props = props;
ip->ip_flags = flags;
ip->ip_err = &err;

(void) tpool_dispatch(tp, do_import_task,
(void *)ip);
} else {
/*
* If we're importing from cachefile, then
Expand Down Expand Up @@ -3544,6 +3581,10 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
found_config = config;
}
}
if (import->do_all) {
tpool_wait(tp);
tpool_destroy(tp);
}

/*
* If we were searching for a specific pool, verify that we found a
Expand Down Expand Up @@ -3773,7 +3814,6 @@ zpool_do_import(int argc, char **argv)
boolean_t xtreme_rewind = B_FALSE;
boolean_t do_scan = B_FALSE;
boolean_t pool_exists = B_FALSE;
boolean_t pool_specified = B_FALSE;
uint64_t txg = -1ULL;
char *cachefile = NULL;
importargs_t idata = { 0 };
Expand Down Expand Up @@ -3972,7 +4012,6 @@ zpool_do_import(int argc, char **argv)
searchname = argv[0];
searchguid = 0;
}
pool_specified = B_TRUE;

/*
* User specified a name or guid. Ensure it's unique.
Expand Down Expand Up @@ -4005,6 +4044,8 @@ zpool_do_import(int argc, char **argv)
idata.cachefile = cachefile;
idata.scan = do_scan;
idata.policy = policy;
idata.do_destroyed = do_destroyed;
idata.do_all = do_all;

libpc_handle_t lpch = {
.lpc_lib_handle = g_zfs,
Expand Down Expand Up @@ -4047,9 +4088,7 @@ zpool_do_import(int argc, char **argv)
}

err = import_pools(pools, props, mntopts, flags,
argc >= 1 ? argv[0] : NULL,
argc >= 2 ? argv[1] : NULL,
do_destroyed, pool_specified, do_all, &idata);
argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, &idata);

/*
* If we're using the cachefile and we failed to import, then
Expand All @@ -4070,9 +4109,8 @@ zpool_do_import(int argc, char **argv)
pools = zpool_search_import(&lpch, &idata);

err = import_pools(pools, props, mntopts, flags,
argc >= 1 ? argv[0] : NULL,
argc >= 2 ? argv[1] : NULL,
do_destroyed, pool_specified, do_all, &idata);
argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL,
&idata);
}

error:
Expand Down
4 changes: 3 additions & 1 deletion include/libzutil.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018 by Delphix. All rights reserved.
* Copyright (c) 2018, 2024 by Delphix. All rights reserved.
*/

#ifndef _LIBZUTIL_H
Expand Down Expand Up @@ -79,6 +79,8 @@ typedef struct importargs {
boolean_t can_be_active; /* can the pool be active? */
boolean_t scan; /* prefer scanning to libblkid cache */
nvlist_t *policy; /* load policy (max txg, rewind, etc.) */
boolean_t do_destroyed;
boolean_t do_all;
} importargs_t;

typedef struct libpc_handle {
Expand Down
2 changes: 2 additions & 0 deletions include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,8 @@ void spa_select_allocator(zio_t *zio);

/* spa namespace global mutex */
extern kmutex_t spa_namespace_lock;
extern avl_tree_t spa_namespace_avl;
extern kcondvar_t spa_namespace_cv;

/*
* SPA configuration functions in spa_config.c
Expand Down
3 changes: 2 additions & 1 deletion include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2019 by Delphix. All rights reserved.
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
Expand Down Expand Up @@ -237,6 +237,7 @@ struct spa {
dsl_pool_t *spa_dsl_pool;
boolean_t spa_is_initializing; /* true while opening pool */
boolean_t spa_is_exporting; /* true while exporting pool */
kthread_t *spa_load_thread; /* loading, no namespace lock */
metaslab_class_t *spa_normal_class; /* normal data class */
metaslab_class_t *spa_log_class; /* intent log data class */
metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */
Expand Down
Loading

0 comments on commit c183d16

Please sign in to comment.