Skip to content

Commit

Permalink
Remove spa_namespace_lock from zpool status
Browse files Browse the repository at this point in the history
This commit removes spa_namespace_lock from the zpool status codepath.
This means that zpool status will not hang if a pool fails while holding
the spa_namespace_lock.

Background:

The spa_namespace_lock was originally meant to protect the
spa_namespace_avl AVL tree.  The spa_namespace_avl tree held the
mappings from pool names to spa_t's.  So if you wanted to lookup the
spa_t for the "tank" pool, you would do an AVL search for "tank" while
holding spa_namespace_lock.

Over time though the spa_namespace_lock was re-purposed to protect other
critical codepaths in the spa subsystem as well.  In many cases we don't
know what the original authors meant to protect with it, or if they
needed it for read-only or read-write protection.  It is simply "too big
and risky to fix properly".

The workaround is to add a new lightweight version of the
spa_namespace_lock called spa_namespace_lite_lock.
spa_namespace_lite_lock only protects the AVL tree, and nothing else.
It can be used for read-only access to the AVL tree without requiring
the spa_namespace_lock.  Calls to spa_lookup_lite() and spa_next_lite()
only need to acquire a reader lock on spa_namespace_lite_lock; they do
not need to also acquire the old spa_namespace_lock.  This allows us to
still run zpool status even if the zfs module has spa_namespace_lock
held.  Note that these AVL tree locks only protect the tree, not the
actual spa_t contents.

Signed-off-by: Tony Hutter <hutter2@llnl.gov>
  • Loading branch information
tonyhutter committed Sep 10, 2024
1 parent 5c67820 commit 9df9d5e
Show file tree
Hide file tree
Showing 20 changed files with 376 additions and 74 deletions.
4 changes: 2 additions & 2 deletions TEST
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@
#TEST_ZFSTESTS_DISKS="vdb vdc vdd"
#TEST_ZFSTESTS_DISKSIZE="8G"
#TEST_ZFSTESTS_ITERS="1"
#TEST_ZFSTESTS_OPTIONS="-vx"
#TEST_ZFSTESTS_RUNFILE="linux.run"
TEST_ZFSTESTS_OPTIONS="-vx -T zpool_status"
TEST_ZFSTESTS_RUNFILE="linux.run"
#TEST_ZFSTESTS_TAGS="functional"

### zfsstress
Expand Down
18 changes: 18 additions & 0 deletions include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,22 @@ extern kmutex_t spa_namespace_lock;
extern avl_tree_t spa_namespace_avl;
extern kcondvar_t spa_namespace_cv;

extern krwlock_t spa_namespace_lite_lock;

extern uint_t spa_namespace_delay_ms;
/*
* Special version of mutex_enter() used for testing the spa_namespace_lock.
* It inserts an artificial delay after acquiring the lock to simulate a failure
* while holding the lock. The delay is controlled by the
* spa_namespace_lock_delay module param and is only to be used by ZTS.
*/
#define mutex_enter_ns(lock) do { \
ASSERT(lock == &spa_namespace_lock); \
mutex_enter(lock); \
if (unlikely(spa_namespace_delay_ms != 0)) \
zfs_msleep(spa_namespace_delay_ms); \
} while (0);

/*
* SPA configuration functions in spa_config.c
*/
Expand All @@ -866,9 +882,11 @@ extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv,

/* Namespace manipulation */
extern spa_t *spa_lookup(const char *name);
extern spa_t *spa_lookup_lite(const char *name);
extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
extern void spa_remove(spa_t *spa);
extern spa_t *spa_next(spa_t *prev);
extern spa_t *spa_next_lite(spa_t *prev);

/* Refcount functions */
extern void spa_open_ref(spa_t *spa, const void *tag);
Expand Down
1 change: 1 addition & 0 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,7 @@ struct spa {
extern char *spa_config_path;
extern const char *zfs_deadman_failmode;
extern uint_t spa_slop_shift;
extern uint_t spa_namespace_delay_ms;
extern void spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, zio_t *zio, boolean_t cutinline);
extern void spa_load_spares(spa_t *spa);
Expand Down
2 changes: 2 additions & 0 deletions include/sys/zfs_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -769,6 +769,8 @@ void ksiddomain_rele(ksiddomain_t *);
(void) nanosleep(&ts, NULL); \
} while (0)

#define zfs_msleep(ms) zfs_sleep_until(gethrtime() + (MSEC2NSEC(ms)))

typedef int fstrans_cookie_t;

extern fstrans_cookie_t spl_fstrans_mark(void);
Expand Down
1 change: 1 addition & 0 deletions include/sys/zfs_delay.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,6 @@
usleep_range(delta_us, delta_us + 100); \
} \
} while (0)
#define zfs_msleep(ms) zfs_sleep_until(gethrtime() + (MSEC2NSEC(ms)))

#endif /* _SYS_FS_ZFS_DELAY_H */
2 changes: 1 addition & 1 deletion module/os/freebsd/zfs/spa_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ spa_import_rootpool(const char *name, bool checkpointrewind)
*/
config = spa_generate_rootconf(name);

mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
if (config != NULL) {
pname = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME);
VERIFY0(strcmp(name, pname));
Expand Down
2 changes: 1 addition & 1 deletion module/os/freebsd/zfs/zfs_ioctl_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
"command", &command) != 0)
return (EINVAL);

mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa = spa_by_guid(pool_guid, vdev_guid);
if (spa != NULL)
strcpy(name, spa_name(spa));
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -8205,7 +8205,7 @@ l2arc_dev_get_next(void)
* of cache devices (l2arc_dev_mtx). Once a device has been selected,
* both locks will be dropped and a spa config lock held instead.
*/
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
mutex_enter(&l2arc_dev_mtx);

/* if there are no vdevs, there is nothing to do */
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/mmp.c
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,7 @@ mmp_signal_all_threads(void)
{
spa_t *spa = NULL;

mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
while ((spa = spa_next(spa))) {
if (spa->spa_state == POOL_STATE_ACTIVE)
mmp_signal_thread(spa);
Expand Down
Loading

0 comments on commit 9df9d5e

Please sign in to comment.