use a struct to organize metaslab-group-allocator fields #10213

Merged
merged 1 commit on Apr 22, 2020
15 changes: 11 additions & 4 deletions include/sys/metaslab_impl.h
@@ -203,6 +203,16 @@ struct metaslab_class {
multilist_t *mc_metaslab_txg_list;
};

/*
* Per-allocator data structure.
*/
typedef struct metaslab_group_allocator {
uint64_t mga_cur_max_alloc_queue_depth;
zfs_refcount_t mga_alloc_queue_depth;
metaslab_t *mga_primary;
metaslab_t *mga_secondary;
} metaslab_group_allocator_t;

/*
* Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
* of a top-level vdev. They are linked together to form a circular linked
@@ -214,8 +224,6 @@ struct metaslab_class {
*/
struct metaslab_group {
kmutex_t mg_lock;
metaslab_t **mg_primaries;
metaslab_t **mg_secondaries;
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
boolean_t mg_allocatable; /* can we allocate? */
@@ -263,9 +271,8 @@ struct metaslab_group {
* groups are unable to handle their share of allocations.
*/
uint64_t mg_max_alloc_queue_depth;
uint64_t *mg_cur_max_alloc_queue_depth;
zfs_refcount_t *mg_alloc_queue_depth;
int mg_allocators;
metaslab_group_allocator_t *mg_allocator; /* array */
/*
* A metaslab group that can no longer allocate the minimum block
* size will set mg_no_free_space. Once a metaslab group is out
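For context, this header change replaces the per-allocator parallel arrays that used to hang off struct metaslab_group (mg_primaries, mg_secondaries, mg_cur_max_alloc_queue_depth, and the mg_alloc_queue_depth refcounts) with a single mg_allocator array of metaslab_group_allocator_t. Below is a minimal standalone sketch of the resulting layout and access pattern; it uses stubbed types and calloc() rather than the real ZFS headers and kmem_zalloc(), so it only illustrates the indexing, not the kernel code.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct metaslab { int ms_id; } metaslab_t;
typedef int64_t zfs_refcount_t;	/* stand-in for the real tracked refcount */

/* All per-allocator state now lives in one struct. */
typedef struct metaslab_group_allocator {
	uint64_t	mga_cur_max_alloc_queue_depth;
	zfs_refcount_t	mga_alloc_queue_depth;
	metaslab_t	*mga_primary;
	metaslab_t	*mga_secondary;
} metaslab_group_allocator_t;

typedef struct metaslab_group {
	int				mg_allocators;
	metaslab_group_allocator_t	*mg_allocator;	/* array, one entry per allocator */
} metaslab_group_t;

int
main(void)
{
	metaslab_group_t mg = { .mg_allocators = 4, .mg_allocator = NULL };

	/* One allocation replaces the separate kmem_zalloc() calls of the old layout. */
	mg.mg_allocator = calloc(mg.mg_allocators,
	    sizeof (metaslab_group_allocator_t));

	/*
	 * Old access pattern (parallel arrays):
	 *   mg->mg_primaries[i], mg->mg_secondaries[i],
	 *   mg->mg_cur_max_alloc_queue_depth[i]
	 * New access pattern: one pointer into the array of structs.
	 */
	for (int i = 0; i < mg.mg_allocators; i++) {
		metaslab_group_allocator_t *mga = &mg.mg_allocator[i];
		mga->mga_cur_max_alloc_queue_depth = 0;
		printf("allocator %d: primary=%p secondary=%p\n", i,
		    (void *)mga->mga_primary, (void *)mga->mga_secondary);
	}

	free(mg.mg_allocator);
	return (0);
}
```

Grouping the fields this way keeps everything one allocator touches behind a single pointer (and, presumably, on fewer cache lines), and adding future per-allocator state only requires a new struct field rather than another parallel array.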
95 changes: 46 additions & 49 deletions module/zfs/metaslab.c
@@ -814,10 +814,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
KM_SLEEP);
mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
KM_SLEEP);
avl_create(&mg->mg_metaslab_tree, metaslab_compare,
sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
mg->mg_vd = vd;
@@ -827,13 +823,11 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
mg->mg_no_free_space = B_TRUE;
mg->mg_allocators = allocators;

mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
sizeof (zfs_refcount_t), KM_SLEEP);
mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
sizeof (uint64_t), KM_SLEEP);
mg->mg_allocator = kmem_zalloc(allocators *
sizeof (metaslab_group_allocator_t), KM_SLEEP);
for (int i = 0; i < allocators; i++) {
zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
mg->mg_cur_max_alloc_queue_depth[i] = 0;
metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
}

mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
@@ -856,21 +850,16 @@ metaslab_group_destroy(metaslab_group_t *mg)

taskq_destroy(mg->mg_taskq);
avl_destroy(&mg->mg_metaslab_tree);
kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
kmem_free(mg->mg_secondaries, mg->mg_allocators *
sizeof (metaslab_t *));
mutex_destroy(&mg->mg_lock);
mutex_destroy(&mg->mg_ms_disabled_lock);
cv_destroy(&mg->mg_ms_disabled_cv);

for (int i = 0; i < mg->mg_allocators; i++) {
zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
mg->mg_cur_max_alloc_queue_depth[i] = 0;
metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
}
kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
sizeof (zfs_refcount_t));
kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
sizeof (uint64_t));
kmem_free(mg->mg_allocator, mg->mg_allocators *
sizeof (metaslab_group_allocator_t));

kmem_free(mg, sizeof (metaslab_group_t));
}
@@ -951,14 +940,15 @@ metaslab_group_passivate(metaslab_group_t *mg)
spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
metaslab_group_alloc_update(mg);
for (int i = 0; i < mg->mg_allocators; i++) {
metaslab_t *msp = mg->mg_primaries[i];
metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
metaslab_t *msp = mga->mga_primary;
if (msp != NULL) {
mutex_enter(&msp->ms_lock);
metaslab_passivate(msp,
metaslab_weight_from_range_tree(msp));
mutex_exit(&msp->ms_lock);
}
msp = mg->mg_secondaries[i];
msp = mga->mga_secondary;
if (msp != NULL) {
mutex_enter(&msp->ms_lock);
metaslab_passivate(msp,
@@ -1218,9 +1208,9 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* regardless of the mg_allocatable or throttle settings.
*/
if (mg->mg_allocatable) {
metaslab_group_t *mgp;
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
int64_t qdepth;
uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;

if (!mc->mc_alloc_throttle_enabled)
return (B_TRUE);
@@ -1239,8 +1229,7 @@
*/
qmax = qmax * (4 + d) / 4;

qdepth = zfs_refcount_count(
&mg->mg_alloc_queue_depth[allocator]);
qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);

/*
* If this metaslab group is below its qmax or it's
@@ -1258,11 +1247,14 @@
* racy since we can't hold the locks for all metaslab
* groups at the same time when we make this check.
*/
for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
for (metaslab_group_t *mgp = mg->mg_next;
mgp != rotor; mgp = mgp->mg_next) {
metaslab_group_allocator_t *mgap =
&mgp->mg_allocator[allocator];
qmax = mgap->mga_cur_max_alloc_queue_depth;
qmax = qmax * (4 + d) / 4;
qdepth = zfs_refcount_count(
&mgp->mg_alloc_queue_depth[allocator]);
qdepth =
zfs_refcount_count(&mgap->mga_alloc_queue_depth);

/*
* If there is another metaslab group that
@@ -3205,6 +3197,7 @@ static int
metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
int allocator, uint64_t activation_weight)
{
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
ASSERT(MUTEX_HELD(&msp->ms_lock));

/*
@@ -3219,16 +3212,16 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
return (0);
}

metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
mg->mg_primaries : mg->mg_secondaries);
metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
&mga->mga_primary : &mga->mga_secondary);

mutex_enter(&mg->mg_lock);
if (arr[allocator] != NULL) {
if (*mspp != NULL) {
mutex_exit(&mg->mg_lock);
return (EEXIST);
}

arr[allocator] = msp;
*mspp = msp;
ASSERT3S(msp->ms_allocator, ==, -1);
msp->ms_allocator = allocator;
msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
@@ -3237,7 +3230,6 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
msp->ms_activation_weight = msp->ms_weight;
metaslab_group_sort_impl(mg, msp,
msp->ms_weight | activation_weight);

mutex_exit(&mg->mg_lock);

return (0);
@@ -3337,14 +3329,15 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
ASSERT3S(0, <=, msp->ms_allocator);
ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);

metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
if (msp->ms_primary) {
ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
ASSERT3P(mga->mga_primary, ==, msp);
ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
mg->mg_primaries[msp->ms_allocator] = NULL;
mga->mga_primary = NULL;
} else {
ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
ASSERT3P(mga->mga_secondary, ==, msp);
ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
mg->mg_secondaries[msp->ms_allocator] = NULL;
mga->mga_secondary = NULL;
}
msp->ms_allocator = -1;
metaslab_group_sort_impl(mg, msp, weight);
@@ -4493,22 +4486,24 @@ metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;

(void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
(void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag);
}

static void
metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
{
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
uint64_t max = mg->mg_max_alloc_queue_depth;
uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
while (cur < max) {
if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
cur, cur + 1) == cur) {
atomic_inc_64(
&mg->mg_class->mc_alloc_max_slots[allocator]);
return;
}
cur = mg->mg_cur_max_alloc_queue_depth[allocator];
cur = mga->mga_cur_max_alloc_queue_depth;
}
}
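metaslab_group_increment_qdepth() above is a small lock-free loop: it tries to raise the allocator's current queue-depth cap by one, but only while the cap is still below the group-wide maximum, and retries if another thread changed the value first. Here is a standalone sketch of the same pattern, using C11 atomics in place of the kernel's atomic_cas_64() and omitting the mc_alloc_max_slots bump the real function also performs on success.

```c
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Bounded lock-free increment: raise *cur_max by one unless it has
 * already reached max.  Mirrors the CAS loop in
 * metaslab_group_increment_qdepth(), with C11 atomics standing in for
 * atomic_cas_64().
 */
static void
bump_cur_max(_Atomic uint64_t *cur_max, uint64_t max)
{
	uint64_t cur = atomic_load(cur_max);
	while (cur < max) {
		/*
		 * Succeeds only if *cur_max still equals cur; on failure
		 * cur is reloaded and the bound is re-checked.
		 */
		if (atomic_compare_exchange_weak(cur_max, &cur, cur + 1))
			return;	/* real code also bumps mc_alloc_max_slots here */
	}
}

int
main(void)
{
	_Atomic uint64_t cur_max = 0;

	for (int i = 0; i < 10; i++)
		bump_cur_max(&cur_max, 8);	/* capped at 8 despite 10 attempts */

	printf("cur_max = %llu\n", (unsigned long long)atomic_load(&cur_max));
	return (0);
}
```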

@@ -4524,7 +4519,8 @@ metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;

(void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
(void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag);
if (io_complete)
metaslab_group_increment_qdepth(mg, allocator);
}
@@ -4540,8 +4536,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
for (int d = 0; d < ndvas; d++) {
uint64_t vdev = DVA_GET_VDEV(&dva[d]);
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
VERIFY(zfs_refcount_not_held(
&mg->mg_alloc_queue_depth[allocator], tag));
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
}
#endif
}
@@ -4716,6 +4712,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
*/
if (mg->mg_ms_ready < mg->mg_allocators * 3)
allocator = 0;
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];

ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);

@@ -4737,8 +4734,8 @@
mutex_enter(&mg->mg_lock);

if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
mg->mg_primaries[allocator] != NULL) {
msp = mg->mg_primaries[allocator];
mga->mga_primary != NULL) {
msp = mga->mga_primary;

/*
* Even though we don't hold the ms_lock for the
@@ -4753,8 +4750,8 @@
was_active = B_TRUE;
ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
mg->mg_secondaries[allocator] != NULL) {
msp = mg->mg_secondaries[allocator];
mga->mga_secondary != NULL) {
msp = mga->mga_secondary;

/*
* See comment above about the similar assertions
9 changes: 5 additions & 4 deletions module/zfs/spa.c
@@ -8720,13 +8720,14 @@ spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
* allocations look at mg_max_alloc_queue_depth, and async
* allocations all happen from spa_sync().
*/
for (int i = 0; i < spa->spa_alloc_count; i++)
for (int i = 0; i < mg->mg_allocators; i++) {
ASSERT0(zfs_refcount_count(
&(mg->mg_alloc_queue_depth[i])));
&(mg->mg_allocator[i].mga_alloc_queue_depth)));
}
mg->mg_max_alloc_queue_depth = max_queue_depth;

for (int i = 0; i < spa->spa_alloc_count; i++) {
mg->mg_cur_max_alloc_queue_depth[i] =
for (int i = 0; i < mg->mg_allocators; i++) {
mg->mg_allocator[i].mga_cur_max_alloc_queue_depth =
zfs_vdev_def_queue_depth;
}
slots_per_allocator += zfs_vdev_def_queue_depth;
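The spa.c hunk also switches the loop bound from spa->spa_alloc_count to mg->mg_allocators, so the reset iterates over exactly the entries that exist in the group's mg_allocator array. A standalone sketch of the per-group reset it performs, with stubbed types; the constant DEF_QUEUE_DEPTH stands in for zfs_vdev_def_queue_depth, and the assert() replaces the kernel's ASSERT0 on the drained refcounts.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define	DEF_QUEUE_DEPTH	32	/* stand-in for zfs_vdev_def_queue_depth */

typedef struct {
	uint64_t	mga_cur_max_alloc_queue_depth;
	int64_t		mga_alloc_queue_depth;	/* stub for the refcount */
} metaslab_group_allocator_t;

typedef struct {
	int				mg_allocators;
	uint64_t			mg_max_alloc_queue_depth;
	metaslab_group_allocator_t	*mg_allocator;
} metaslab_group_t;

/*
 * Mirrors the per-group body of spa_sync_adjust_vdev_max_queue_depth():
 * nothing may be queued on any allocator at this point, the group-wide
 * maximum is updated, and each allocator's ramping cap is reset to the
 * default so it can grow again during the next txg.
 */
static void
reset_group_queue_depths(metaslab_group_t *mg, uint64_t max_queue_depth)
{
	for (int i = 0; i < mg->mg_allocators; i++)
		assert(mg->mg_allocator[i].mga_alloc_queue_depth == 0);

	mg->mg_max_alloc_queue_depth = max_queue_depth;
	for (int i = 0; i < mg->mg_allocators; i++) {
		mg->mg_allocator[i].mga_cur_max_alloc_queue_depth =
		    DEF_QUEUE_DEPTH;
	}
}

int
main(void)
{
	metaslab_group_t mg = { .mg_allocators = 4 };

	mg.mg_allocator = calloc(mg.mg_allocators,
	    sizeof (metaslab_group_allocator_t));
	reset_group_queue_depths(&mg, 1000);
	printf("cap[0] = %llu\n", (unsigned long long)
	    mg.mg_allocator[0].mga_cur_max_alloc_queue_depth);
	free(mg.mg_allocator);
	return (0);
}
```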