Skip to content

Commit

Permalink
Remove spa_namespace_lock from zpool status
Browse files Browse the repository at this point in the history
This commit removes spa_namespace_lock from the zpool status codepath.
This means that zpool status will not hang if a pool fails while holding
the spa_namespace_lock.

Background:

The spa_namespace_lock was originally meant to protect the
spa_namespace_avl AVL tree.  The spa_namespace_avl tree held the
mappings from pool names to spa_t's.  So if you wanted to lookup the
spa_t for the "tank" pool, you would do an AVL search for "tank" while
holding spa_namespace_lock.

Over time though the spa_namespace_lock was re-purposed to protect other
critical codepaths in the spa subsystem as well.  In many cases we don't
know what the original authors meant to protect with it, or if they
needed it for read-only or read-write protection.  It is simply "too big
and risky to fix properly".

The workaround is to add a new lightweight version of the
spa_namespace_lock called spa_namespace_lite_lock.  The
spa_namespace_lite_lock protects a parallel copy of the
spa_namespace_avl AVL tree called spa_namespace_lite_avl.  Anytime
spa_namespace_avl gets updated, so does spa_namespace_lite_avl.  They
are always in sync.  This allows us to conceptually use
spa_namespace_lite_avl as a protected, read-only version
spa_namespace_avl.  Specifically, calls to spa_lookup_lite() and
spa_next_lite() only need to acquire a reader lock on
spa_namespace_lite_lock; they do not need to also acquire the old
spa_namespace_lock.  This allows us to still run zpool status even if
the zfs module has spa_namespace_lock held.  Note that these AVL tree
locks only protect the tree, not the actual spa_t contents.

Signed-off-by: Tony Hutter <hutter2@llnl.gov>
  • Loading branch information
tonyhutter committed Sep 4, 2024
1 parent b3b7491 commit 5eed8aa
Show file tree
Hide file tree
Showing 19 changed files with 300 additions and 76 deletions.
20 changes: 20 additions & 0 deletions include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,23 @@ extern kmutex_t spa_namespace_lock;
extern avl_tree_t spa_namespace_avl;
extern kcondvar_t spa_namespace_cv;

extern krwlock_t spa_namespace_lite_lock;
extern avl_tree_t spa_namespace_lite_avl;

extern uint_t spa_namespace_delay_ms;
/*
* Special version of mutex_enter() used for testing the spa_namespace_lock.
* It inserts an artificial delay after acquiring the lock to simulate a failure
* while holding the lock. The delay is controlled by the
* spa_namespace_lock_delay module param and is only to be used by ZTS.
*/
#define mutex_enter_ns(lock) do { \
ASSERT(lock == &spa_namespace_lock); \
mutex_enter(lock); \
if (unlikely(spa_namespace_delay_ms != 0)) \
zfs_msleep(spa_namespace_delay_ms); \
} while (0);

/*
* SPA configuration functions in spa_config.c
*/
Expand All @@ -866,9 +883,11 @@ extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv,

/* Namespace manipulation */
extern spa_t *spa_lookup(const char *name);
extern spa_t *spa_lookup_lite(const char *name);
extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
extern void spa_remove(spa_t *spa);
extern spa_t *spa_next(spa_t *prev);
extern spa_t *spa_next_lite(spa_t *prev);

/* Refcount functions */
extern void spa_open_ref(spa_t *spa, const void *tag);
Expand Down Expand Up @@ -1086,6 +1105,7 @@ extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
dmu_tx_t *tx);
extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
extern spa_t *spa_by_guid_lite(uint64_t pool_guid, uint64_t device_guid);
extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
extern char *spa_strdup(const char *);
extern void spa_strfree(char *);
Expand Down
2 changes: 2 additions & 0 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ struct spa {
char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */
char *spa_comment; /* comment */
avl_node_t spa_avl; /* node in spa_namespace_avl */
avl_node_t spa_lite_avl; /* spa_namespace_lite_avl */
nvlist_t *spa_config; /* last synced config */
nvlist_t *spa_config_syncing; /* currently syncing config */
nvlist_t *spa_config_splitting; /* config for splitting */
Expand Down Expand Up @@ -483,6 +484,7 @@ struct spa {
extern char *spa_config_path;
extern const char *zfs_deadman_failmode;
extern uint_t spa_slop_shift;
extern uint_t spa_namespace_delay_ms;
extern void spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, zio_t *zio, boolean_t cutinline);
extern void spa_load_spares(spa_t *spa);
Expand Down
2 changes: 2 additions & 0 deletions include/sys/zfs_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -769,6 +769,8 @@ void ksiddomain_rele(ksiddomain_t *);
(void) nanosleep(&ts, NULL); \
} while (0)

#define zfs_msleep(ms) zfs_sleep_until(gethrtime() + (MSEC2NSEC(ms)))

typedef int fstrans_cookie_t;

extern fstrans_cookie_t spl_fstrans_mark(void);
Expand Down
1 change: 1 addition & 0 deletions include/sys/zfs_delay.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,6 @@
usleep_range(delta_us, delta_us + 100); \
} \
} while (0)
#define zfs_msleep(ms) zfs_sleep_until(gethrtime() + (MSEC2NSEC(ms)))

#endif /* _SYS_FS_ZFS_DELAY_H */
2 changes: 1 addition & 1 deletion module/os/freebsd/zfs/spa_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ spa_import_rootpool(const char *name, bool checkpointrewind)
*/
config = spa_generate_rootconf(name);

mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
if (config != NULL) {
pname = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME);
VERIFY0(strcmp(name, pname));
Expand Down
2 changes: 1 addition & 1 deletion module/os/freebsd/zfs/zfs_ioctl_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
"command", &command) != 0)
return (EINVAL);

mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa = spa_by_guid(pool_guid, vdev_guid);
if (spa != NULL)
strcpy(name, spa_name(spa));
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -8205,7 +8205,7 @@ l2arc_dev_get_next(void)
* of cache devices (l2arc_dev_mtx). Once a device has been selected,
* both locks will be dropped and a spa config lock held instead.
*/
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
mutex_enter(&l2arc_dev_mtx);

/* if there are no vdevs, there is nothing to do */
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/mmp.c
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,7 @@ mmp_signal_all_threads(void)
{
spa_t *spa = NULL;

mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
while ((spa = spa_next(spa))) {
if (spa->spa_state == POOL_STATE_ACTIVE)
mmp_signal_thread(spa);
Expand Down
Loading

0 comments on commit 5eed8aa

Please sign in to comment.