Add support for parallel pool exports
Changed spa_export_common() such that it no longer holds the
spa_namespace_lock for the entire duration and instead sets
spa_export_thread to indicate that an export is in progress on
the spa.  This allows an export of a different pool to proceed
in parallel while an export is still processing potentially
long operations like spa_unload_log_sm_flush_all().
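
In outline, the reworked spa_export_common() flow looks like the
sketch below (a condensed sketch only: error paths and the actual
unload work are elided, and spa_export_sketch() is an invented
name for illustration):

    static int
    spa_export_sketch(const char *pool)
    {
            spa_t *spa;

            mutex_enter(&spa_namespace_lock);
            if ((spa = spa_lookup(pool)) == NULL) {
                    mutex_exit(&spa_namespace_lock);
                    return (ENOENT);
            }
            spa->spa_is_exporting = B_TRUE;
            spa_open_ref(spa, FTAG);
            mutex_exit(&spa_namespace_lock);

            /* stop async tasks with the namespace lock dropped */

            mutex_enter(&spa_namespace_lock);
            spa->spa_export_thread = curthread;  /* claim the export */
            spa_close(spa, FTAG);
            mutex_exit(&spa_namespace_lock);

            /*
             * Long operations such as spa_unload_log_sm_flush_all()
             * now run unlocked; other pools can import/export freely.
             */

            mutex_enter(&spa_namespace_lock);
            spa->spa_is_exporting = B_FALSE;
            spa->spa_export_thread = NULL;
            cv_broadcast(&spa_namespace_cv);  /* wake any waiters */
            mutex_exit(&spa_namespace_lock);
            return (0);
    }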

Calls like spa_lookup() and spa_vdev_enter() that rely on
the spa_namespace_lock to serialize against a concurrent
export now wait for any in-progress export thread to complete
before proceeding.
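
On the waiting side this reduces to a standard condition-variable
loop under the spa_namespace_lock (a sketch of the pattern; the
real hunks are in spa_misc.c below):

    /* wait out a concurrent export before touching this spa */
    while (spa->spa_export_thread != NULL &&
        spa->spa_export_thread != curthread) {
            cv_wait(&spa_namespace_cv, &spa_namespace_lock);
    }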

The 'zpool export -a' sub-command also provides multi-threaded
support, using a thread pool to submit the exports in parallel.
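
The dispatch side is plain libtpool.  A minimal userland sketch
(export_task(), export_all_sketch() and the pool name "tank" are
invented for illustration; the committed code is in zpool_main.c
below):

    #include <thread_pool.h>
    #include <unistd.h>

    static void
    export_task(void *arg)
    {
            /* exports one pool; compare zpool_export_task() */
            (void) arg;
    }

    static int
    export_all_sketch(void)
    {
            tpool_t *tp = tpool_create(1,
                5 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL);

            if (tp == NULL)
                    return (-1);
            /* queue one task per pool to export */
            if (tpool_dispatch(tp, export_task, "tank") != 0)
                    return (-1);
            tpool_wait(tp);         /* block until all tasks finish */
            tpool_destroy(tp);
            return (0);
    }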

Sponsored-by: Klara Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Wilson <gwilson@delphix.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes openzfs#16153
don-brady authored and behlendorf committed May 14, 2024
1 parent abec7dc commit 975a132
Showing 12 changed files with 373 additions and 33 deletions.
88 changes: 82 additions & 6 deletions cmd/zpool/zpool_main.c
@@ -2030,10 +2030,19 @@ zpool_do_destroy(int argc, char **argv)
}

typedef struct export_cbdata {
tpool_t *tpool;
pthread_mutex_t mnttab_lock;
boolean_t force;
boolean_t hardforce;
int retval;
} export_cbdata_t;


typedef struct {
char *aea_poolname;
export_cbdata_t *aea_cbdata;
} async_export_args_t;

/*
* Export one pool
*/
@@ -2042,11 +2051,20 @@ zpool_export_one(zpool_handle_t *zhp, void *data)
{
export_cbdata_t *cb = data;

/*
* zpool_disable_datasets() is not thread-safe for mnttab access.
* So we serialize access here for 'zpool export -a' parallel case.
*/
if (cb->tpool != NULL)
pthread_mutex_lock(&cb->mnttab_lock);

int retval = zpool_disable_datasets(zhp, cb->force);

if (cb->tpool != NULL)
pthread_mutex_unlock(&cb->mnttab_lock);

if (retval)
return (1);

if (cb->hardforce) {
if (zpool_export_force(zhp, history_str) != 0)
@@ -2058,6 +2076,48 @@ zpool_export_one(zpool_handle_t *zhp, void *data)
return (0);
}

/*
* Asynchronous export request
*/
static void
zpool_export_task(void *arg)
{
async_export_args_t *aea = arg;

zpool_handle_t *zhp = zpool_open(g_zfs, aea->aea_poolname);
if (zhp != NULL) {
int ret = zpool_export_one(zhp, aea->aea_cbdata);
if (ret != 0)
aea->aea_cbdata->retval = ret;
zpool_close(zhp);
} else {
aea->aea_cbdata->retval = 1;
}

free(aea->aea_poolname);
free(aea);
}

/*
* Process an export request in parallel
*/
static int
zpool_export_one_async(zpool_handle_t *zhp, void *data)
{
tpool_t *tpool = ((export_cbdata_t *)data)->tpool;
async_export_args_t *aea = safe_malloc(sizeof (async_export_args_t));

/* save pool name since zhp will go out of scope */
aea->aea_poolname = strdup(zpool_get_name(zhp));
aea->aea_cbdata = data;

/* ship off actual export to another thread */
if (tpool_dispatch(tpool, zpool_export_task, (void *)aea) != 0)
return (errno); /* unlikely */
else
return (0);
}

/*
* zpool export [-f] <pool> ...
*
@@ -2098,17 +2158,33 @@ zpool_do_export(int argc, char **argv)

cb.force = force;
cb.hardforce = hardforce;
cb.tpool = NULL;
cb.retval = 0;
argc -= optind;
argv += optind;

/* The history will be logged as part of the export itself */
log_history = B_FALSE;

if (do_all) {
if (argc != 0) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}

cb.tpool = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN),
0, NULL);
pthread_mutex_init(&cb.mnttab_lock, NULL);

/* Asynchronously call zpool_export_one using thread pool */
ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL,
B_FALSE, zpool_export_one_async, &cb);

tpool_wait(cb.tpool);
tpool_destroy(cb.tpool);
(void) pthread_mutex_destroy(&cb.mnttab_lock);

return (ret | cb.retval);
}

/* check arguments */
1 change: 1 addition & 0 deletions include/sys/spa_impl.h
@@ -243,6 +243,7 @@ struct spa {
dsl_pool_t *spa_dsl_pool;
boolean_t spa_is_initializing; /* true while opening pool */
boolean_t spa_is_exporting; /* true while exporting pool */
kthread_t *spa_export_thread; /* valid during pool export */
kthread_t *spa_load_thread; /* loading, no namespace lock */
metaslab_class_t *spa_normal_class; /* normal data class */
metaslab_class_t *spa_log_class; /* intent log data class */
4 changes: 2 additions & 2 deletions module/zfs/arc.c
@@ -8143,11 +8143,11 @@ l2arc_dev_get_next(void)

ASSERT3P(next, !=, NULL);
} while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting);

/* if we were unable to find any usable vdevs, return NULL */
if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting)
next = NULL;

l2arc_dev_last = next;
36 changes: 29 additions & 7 deletions module/zfs/spa.c
@@ -34,6 +34,7 @@
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
* Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
* Copyright (c) 2024, Klara Inc.
*/

/*
@@ -1991,7 +1992,8 @@ spa_destroy_aux_threads(spa_t *spa)
static void
spa_unload(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);

spa_import_progress_remove(spa_guid(spa));
@@ -6955,7 +6957,7 @@ static int
spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
boolean_t force, boolean_t hardforce)
{
int error = 0;
spa_t *spa;
hrtime_t export_start = gethrtime();

@@ -6979,8 +6981,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
spa->spa_is_exporting = B_TRUE;

/*
* Put a hold on the pool, drop the namespace lock, stop async tasks
* and see if we can export.
*/
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
@@ -6990,10 +6992,18 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
taskq_wait(spa->spa_zvol_taskq);
}
mutex_enter(&spa_namespace_lock);
spa->spa_export_thread = curthread;
spa_close(spa, FTAG);
mutex_exit(&spa_namespace_lock);

/*
* At this point we no longer hold the spa_namespace_lock and
* the spa_export_thread indicates that an export is in progress.
*/

if (spa->spa_state == POOL_STATE_UNINITIALIZED)
goto export_spa;

/*
* The pool will be in core if it's openable, in which case we can
* modify its state. Objsets may be open only because they're dirty,
@@ -7089,6 +7099,10 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
if (oldconfig && spa->spa_config)
*oldconfig = fnvlist_dup(spa->spa_config);

if (new_state == POOL_STATE_EXPORTED)
zio_handle_export_delay(spa, gethrtime() - export_start);

mutex_enter(&spa_namespace_lock);
if (new_state != POOL_STATE_UNINITIALIZED) {
if (!hardforce)
spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
@@ -7100,17 +7114,25 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
* we make sure to reset the exporting flag.
*/
spa->spa_is_exporting = B_FALSE;
spa->spa_export_thread = NULL;
}

/*
* Wake up any waiters on spa_namespace_lock
* They need to re-attempt a spa_lookup()
*/
cv_broadcast(&spa_namespace_cv);
mutex_exit(&spa_namespace_lock);
return (0);

fail:
mutex_enter(&spa_namespace_lock);
spa->spa_is_exporting = B_FALSE;
spa->spa_export_thread = NULL;
spa_async_resume(spa);

/* Wake up any waiters on spa_namespace_lock */
cv_broadcast(&spa_namespace_cv);
mutex_exit(&spa_namespace_lock);
return (error);
}
50 changes: 40 additions & 10 deletions module/zfs/spa_misc.c
@@ -27,7 +27,7 @@
* Copyright (c) 2017 Datto Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/

#include <sys/zfs_context.h>
@@ -82,8 +82,8 @@
* - Check if spa_refcount is zero
* - Rename a spa_t
* - add/remove/attach/detach devices
* - Held for the duration of create/destroy
* - Held at the start and end of import and export
*
* It does not need to handle recursion. A create or destroy may
* reference objects (files or zvols) in other pools, but by
@@ -636,8 +636,14 @@ spa_lookup(const char *name)
if (spa == NULL)
return (NULL);

/*
* Avoid racing with import/export, which don't hold the namespace
* lock for their entire duration.
*/
if ((spa->spa_load_thread != NULL &&
spa->spa_load_thread != curthread) ||
(spa->spa_export_thread != NULL &&
spa->spa_export_thread != curthread)) {
cv_wait(&spa_namespace_cv, &spa_namespace_lock);
goto retry;
}
@@ -950,14 +956,15 @@ spa_open_ref(spa_t *spa, const void *tag)

/*
* Remove a reference to the given spa_t. Must have at least one reference, or
* have the namespace lock held or be part of a pool import/export.
*/
void
spa_close(spa_t *spa, const void *tag)
{
ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref ||
MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_load_thread == curthread ||
spa->spa_export_thread == curthread);
(void) zfs_refcount_remove(&spa->spa_refcount, tag);
}

@@ -977,13 +984,15 @@ spa_async_close(spa_t *spa, const void *tag)

/*
* Check to see if the spa refcount is zero. Must be called with
* spa_namespace_lock held or be the spa export thread. We really
* compare against spa_minref, which is the number of references
* acquired when opening a pool
*/
boolean_t
spa_refcount_zero(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);

return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref);
}
@@ -1231,6 +1240,21 @@ spa_vdev_enter(spa_t *spa)
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);

/*
* We have a reference on the spa and a spa export could be
* starting but no longer holding the spa_namespace_lock. So
* check if there is an export and if so wait. It will fail
* fast (EBUSY) since we are still holding a spa reference.
*
* Note that we can be woken by a different spa transitioning
* through an import/export, so we must wait for our condition
* to change before proceeding.
*/
while (spa->spa_export_thread != NULL &&
spa->spa_export_thread != curthread) {
cv_wait(&spa_namespace_cv, &spa_namespace_lock);
}

vdev_autotrim_stop_all(spa);

return (spa_vdev_config_enter(spa));
@@ -1248,6 +1272,12 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);

/* See comment in spa_vdev_enter() */
while (spa->spa_export_thread != NULL &&
spa->spa_export_thread != curthread) {
cv_wait(&spa_namespace_cv, &spa_namespace_lock);
}

vdev_autotrim_stop_all(spa);

if (guid != 0) {
9 changes: 6 additions & 3 deletions module/zfs/vdev_initialize.c
@@ -682,7 +682,8 @@ vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
(void) spa;
vdev_t *vd;

ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);

while ((vd = list_remove_head(vd_list)) != NULL) {
mutex_enter(&vd->vdev_initialize_lock);
@@ -724,7 +725,8 @@ vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
if (vd_list == NULL) {
vdev_initialize_stop_wait_impl(vd);
} else {
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
vd->vdev_spa->spa_export_thread == curthread);
list_insert_tail(vd_list, vd);
}
}
@@ -756,7 +758,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
spa_t *spa = vd->vdev_spa;
list_t vd_list;

ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);

list_create(&vd_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_initialize_node));
3 changes: 2 additions & 1 deletion module/zfs/vdev_rebuild.c
@@ -1087,7 +1087,8 @@ vdev_rebuild_stop_wait(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;

ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);

if (vd == spa->spa_root_vdev) {
for (uint64_t i = 0; i < vd->vdev_children; i++)
Expand Down