use a struct to organize metaslab-group-allocator fields #10213

Merged
merged 1 commit on Apr 22, 2020
15 changes: 11 additions & 4 deletions include/sys/metaslab_impl.h
@@ -203,6 +203,16 @@ struct metaslab_class {
multilist_t *mc_metaslab_txg_list;
};

/*
* Per-allocator data structure.
*/
typedef struct metaslab_group_allocator {
uint64_t mga_cur_max_alloc_queue_depth;
zfs_refcount_t mga_alloc_queue_depth;
metaslab_t *mga_primary;
metaslab_t *mga_secondary;
} metaslab_group_allocator_t;

/*
* Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
* of a top-level vdev. They are linked together to form a circular linked
@@ -214,8 +224,6 @@ struct metaslab_class {
*/
struct metaslab_group {
kmutex_t mg_lock;
metaslab_t **mg_primaries;
metaslab_t **mg_secondaries;
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
boolean_t mg_allocatable; /* can we allocate? */
@@ -263,9 +271,8 @@ struct metaslab_group {
* groups are unable to handle their share of allocations.
*/
uint64_t mg_max_alloc_queue_depth;
uint64_t *mg_cur_max_alloc_queue_depth;
zfs_refcount_t *mg_alloc_queue_depth;
int mg_allocators;
metaslab_group_allocator_t *mg_allocator; /* array */
/*
* A metaslab group that can no longer allocate the minimum block
* size will set mg_no_free_space. Once a metaslab group is out
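For context, this header change replaces the per-allocator parallel arrays that used to hang off struct metaslab_group (mg_primaries, mg_secondaries, mg_cur_max_alloc_queue_depth, and the mg_alloc_queue_depth refcounts) with a single mg_allocator array of metaslab_group_allocator_t. Below is a minimal standalone sketch of the resulting layout and access pattern; it uses stubbed types and calloc() rather than the real ZFS headers and kmem_zalloc(), so it only illustrates the indexing, not the kernel code.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct metaslab { int ms_id; } metaslab_t;
typedef int64_t zfs_refcount_t;	/* stand-in for the real tracked refcount */

/* All per-allocator state now lives in one struct. */
typedef struct metaslab_group_allocator {
	uint64_t	mga_cur_max_alloc_queue_depth;
	zfs_refcount_t	mga_alloc_queue_depth;
	metaslab_t	*mga_primary;
	metaslab_t	*mga_secondary;
} metaslab_group_allocator_t;

typedef struct metaslab_group {
	int				mg_allocators;
	metaslab_group_allocator_t	*mg_allocator;	/* array, one entry per allocator */
} metaslab_group_t;

int
main(void)
{
	metaslab_group_t mg = { .mg_allocators = 4, .mg_allocator = NULL };

	/* One allocation replaces the separate kmem_zalloc() calls of the old layout. */
	mg.mg_allocator = calloc(mg.mg_allocators,
	    sizeof (metaslab_group_allocator_t));

	/*
	 * Old access pattern (parallel arrays):
	 *   mg->mg_primaries[i], mg->mg_secondaries[i],
	 *   mg->mg_cur_max_alloc_queue_depth[i]
	 * New access pattern: one pointer into the array of structs.
	 */
	for (int i = 0; i < mg.mg_allocators; i++) {
		metaslab_group_allocator_t *mga = &mg.mg_allocator[i];
		mga->mga_cur_max_alloc_queue_depth = 0;
		printf("allocator %d: primary=%p secondary=%p\n", i,
		    (void *)mga->mga_primary, (void *)mga->mga_secondary);
	}

	free(mg.mg_allocator);
	return (0);
}
```

Grouping the fields this way keeps everything one allocator touches behind a single pointer (and, presumably, on fewer cache lines), and adding future per-allocator state only requires a new struct field rather than another parallel array.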
95 changes: 46 additions & 49 deletions module/zfs/metaslab.c
@@ -814,10 +814,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
KM_SLEEP);
mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
KM_SLEEP);
avl_create(&mg->mg_metaslab_tree, metaslab_compare,
sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
mg->mg_vd = vd;
@@ -827,13 +823,11 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
mg->mg_no_free_space = B_TRUE;
mg->mg_allocators = allocators;

mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
sizeof (zfs_refcount_t), KM_SLEEP);
mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
sizeof (uint64_t), KM_SLEEP);
mg->mg_allocator = kmem_zalloc(allocators *
sizeof (metaslab_group_allocator_t), KM_SLEEP);
for (int i = 0; i < allocators; i++) {
zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
mg->mg_cur_max_alloc_queue_depth[i] = 0;
metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
}

mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
@@ -856,21 +850,16 @@ metaslab_group_destroy(metaslab_group_t *mg)

taskq_destroy(mg->mg_taskq);
avl_destroy(&mg->mg_metaslab_tree);
kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
kmem_free(mg->mg_secondaries, mg->mg_allocators *
sizeof (metaslab_t *));
mutex_destroy(&mg->mg_lock);
mutex_destroy(&mg->mg_ms_disabled_lock);
cv_destroy(&mg->mg_ms_disabled_cv);

for (int i = 0; i < mg->mg_allocators; i++) {
zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
mg->mg_cur_max_alloc_queue_depth[i] = 0;
metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
}
kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
sizeof (zfs_refcount_t));
kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
sizeof (uint64_t));
kmem_free(mg->mg_allocator, mg->mg_allocators *
sizeof (metaslab_group_allocator_t));

kmem_free(mg, sizeof (metaslab_group_t));
}
@@ -951,14 +940,15 @@ metaslab_group_passivate(metaslab_group_t *mg)
spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
metaslab_group_alloc_update(mg);
for (int i = 0; i < mg->mg_allocators; i++) {
metaslab_t *msp = mg->mg_primaries[i];
metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
metaslab_t *msp = mga->mga_primary;
if (msp != NULL) {
mutex_enter(&msp->ms_lock);
metaslab_passivate(msp,
metaslab_weight_from_range_tree(msp));
mutex_exit(&msp->ms_lock);
}
msp = mg->mg_secondaries[i];
msp = mga->mga_secondary;
if (msp != NULL) {
mutex_enter(&msp->ms_lock);
metaslab_passivate(msp,
@@ -1218,9 +1208,9 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* regardless of the mg_allocatable or throttle settings.
*/
if (mg->mg_allocatable) {
metaslab_group_t *mgp;
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
int64_t qdepth;
uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;

if (!mc->mc_alloc_throttle_enabled)
return (B_TRUE);
@@ -1239,8 +1229,7 @@
*/
qmax = qmax * (4 + d) / 4;

qdepth = zfs_refcount_count(
&mg->mg_alloc_queue_depth[allocator]);
qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);

/*
* If this metaslab group is below its qmax or it's
@@ -1258,11 +1247,14 @@
* racy since we can't hold the locks for all metaslab
* groups at the same time when we make this check.
*/
for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
for (metaslab_group_t *mgp = mg->mg_next;
mgp != rotor; mgp = mgp->mg_next) {
metaslab_group_allocator_t *mgap =
&mgp->mg_allocator[allocator];
qmax = mgap->mga_cur_max_alloc_queue_depth;
qmax = qmax * (4 + d) / 4;
qdepth = zfs_refcount_count(
&mgp->mg_alloc_queue_depth[allocator]);
qdepth =
zfs_refcount_count(&mgap->mga_alloc_queue_depth);

/*
* If there is another metaslab group that
@@ -3205,6 +3197,7 @@ static int
metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
int allocator, uint64_t activation_weight)
{
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
ASSERT(MUTEX_HELD(&msp->ms_lock));

/*
@@ -3219,16 +3212,16 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
return (0);
}

metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
mg->mg_primaries : mg->mg_secondaries);
metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
&mga->mga_primary : &mga->mga_secondary);

mutex_enter(&mg->mg_lock);
if (arr[allocator] != NULL) {
if (*mspp != NULL) {
mutex_exit(&mg->mg_lock);
return (EEXIST);
}

arr[allocator] = msp;
*mspp = msp;
ASSERT3S(msp->ms_allocator, ==, -1);
msp->ms_allocator = allocator;
msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
@@ -3237,7 +3230,6 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
msp->ms_activation_weight = msp->ms_weight;
metaslab_group_sort_impl(mg, msp,
msp->ms_weight | activation_weight);

mutex_exit(&mg->mg_lock);

return (0);
@@ -3337,14 +3329,15 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
ASSERT3S(0, <=, msp->ms_allocator);
ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);

metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
if (msp->ms_primary) {
ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
ASSERT3P(mga->mga_primary, ==, msp);
ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
mg->mg_primaries[msp->ms_allocator] = NULL;
mga->mga_primary = NULL;
} else {
ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
ASSERT3P(mga->mga_secondary, ==, msp);
ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
mg->mg_secondaries[msp->ms_allocator] = NULL;
mga->mga_secondary = NULL;
}
msp->ms_allocator = -1;
metaslab_group_sort_impl(mg, msp, weight);
@@ -4493,22 +4486,24 @@ metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;

(void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
(void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag);
}

static void
metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
{
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
uint64_t max = mg->mg_max_alloc_queue_depth;
uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
while (cur < max) {
if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
cur, cur + 1) == cur) {
atomic_inc_64(
&mg->mg_class->mc_alloc_max_slots[allocator]);
return;
}
cur = mg->mg_cur_max_alloc_queue_depth[allocator];
cur = mga->mga_cur_max_alloc_queue_depth;
}
}
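metaslab_group_increment_qdepth() above is a small lock-free loop: it tries to raise the allocator's current queue-depth cap by one, but only while the cap is still below the group-wide maximum, and retries if another thread changed the value first. Here is a standalone sketch of the same pattern, using C11 atomics in place of the kernel's atomic_cas_64() and omitting the mc_alloc_max_slots bump the real function also performs on success.

```c
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Bounded lock-free increment: raise *cur_max by one unless it has
 * already reached max.  Mirrors the CAS loop in
 * metaslab_group_increment_qdepth(), with C11 atomics standing in for
 * atomic_cas_64().
 */
static void
bump_cur_max(_Atomic uint64_t *cur_max, uint64_t max)
{
	uint64_t cur = atomic_load(cur_max);
	while (cur < max) {
		/*
		 * Succeeds only if *cur_max still equals cur; on failure
		 * cur is reloaded and the bound is re-checked.
		 */
		if (atomic_compare_exchange_weak(cur_max, &cur, cur + 1))
			return;	/* real code also bumps mc_alloc_max_slots here */
	}
}

int
main(void)
{
	_Atomic uint64_t cur_max = 0;

	for (int i = 0; i < 10; i++)
		bump_cur_max(&cur_max, 8);	/* capped at 8 despite 10 attempts */

	printf("cur_max = %llu\n", (unsigned long long)atomic_load(&cur_max));
	return (0);
}
```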

@@ -4524,7 +4519,8 @@ metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;

(void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
(void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag);
if (io_complete)
metaslab_group_increment_qdepth(mg, allocator);
}
@@ -4540,8 +4536,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
for (int d = 0; d < ndvas; d++) {
uint64_t vdev = DVA_GET_VDEV(&dva[d]);
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
VERIFY(zfs_refcount_not_held(
&mg->mg_alloc_queue_depth[allocator], tag));
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
}
#endif
}
@@ -4716,6 +4712,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
*/
if (mg->mg_ms_ready < mg->mg_allocators * 3)
allocator = 0;
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];

ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);

@@ -4737,8 +4734,8 @@
mutex_enter(&mg->mg_lock);

if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
mg->mg_primaries[allocator] != NULL) {
msp = mg->mg_primaries[allocator];
mga->mga_primary != NULL) {
msp = mga->mga_primary;

/*
* Even though we don't hold the ms_lock for the
@@ -4753,8 +4750,8 @@
was_active = B_TRUE;
ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
mg->mg_secondaries[allocator] != NULL) {
msp = mg->mg_secondaries[allocator];
mga->mga_secondary != NULL) {
msp = mga->mga_secondary;

/*
* See comment above about the similar assertions
9 changes: 5 additions & 4 deletions module/zfs/spa.c
@@ -8720,13 +8720,14 @@ spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
* allocations look at mg_max_alloc_queue_depth, and async
* allocations all happen from spa_sync().
*/
for (int i = 0; i < spa->spa_alloc_count; i++)
for (int i = 0; i < mg->mg_allocators; i++) {
ASSERT0(zfs_refcount_count(
&(mg->mg_alloc_queue_depth[i])));
&(mg->mg_allocator[i].mga_alloc_queue_depth)));
}
mg->mg_max_alloc_queue_depth = max_queue_depth;

for (int i = 0; i < spa->spa_alloc_count; i++) {
mg->mg_cur_max_alloc_queue_depth[i] =
for (int i = 0; i < mg->mg_allocators; i++) {
mg->mg_allocator[i].mga_cur_max_alloc_queue_depth =
zfs_vdev_def_queue_depth;
}
slots_per_allocator += zfs_vdev_def_queue_depth;
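The spa.c hunk also switches the loop bound from spa->spa_alloc_count to mg->mg_allocators, so the reset iterates over exactly the entries that exist in the group's mg_allocator array. A standalone sketch of the per-group reset it performs, with stubbed types; the constant DEF_QUEUE_DEPTH stands in for zfs_vdev_def_queue_depth, and the assert() replaces the kernel's ASSERT0 on the drained refcounts.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define	DEF_QUEUE_DEPTH	32	/* stand-in for zfs_vdev_def_queue_depth */

typedef struct {
	uint64_t	mga_cur_max_alloc_queue_depth;
	int64_t		mga_alloc_queue_depth;	/* stub for the refcount */
} metaslab_group_allocator_t;

typedef struct {
	int				mg_allocators;
	uint64_t			mg_max_alloc_queue_depth;
	metaslab_group_allocator_t	*mg_allocator;
} metaslab_group_t;

/*
 * Mirrors the per-group body of spa_sync_adjust_vdev_max_queue_depth():
 * nothing may be queued on any allocator at this point, the group-wide
 * maximum is updated, and each allocator's ramping cap is reset to the
 * default so it can grow again during the next txg.
 */
static void
reset_group_queue_depths(metaslab_group_t *mg, uint64_t max_queue_depth)
{
	for (int i = 0; i < mg->mg_allocators; i++)
		assert(mg->mg_allocator[i].mga_alloc_queue_depth == 0);

	mg->mg_max_alloc_queue_depth = max_queue_depth;
	for (int i = 0; i < mg->mg_allocators; i++) {
		mg->mg_allocator[i].mga_cur_max_alloc_queue_depth =
		    DEF_QUEUE_DEPTH;
	}
}

int
main(void)
{
	metaslab_group_t mg = { .mg_allocators = 4 };

	mg.mg_allocator = calloc(mg.mg_allocators,
	    sizeof (metaslab_group_allocator_t));
	reset_group_queue_depths(&mg, 1000);
	printf("cap[0] = %llu\n", (unsigned long long)
	    mg.mg_allocator[0].mga_cur_max_alloc_queue_depth);
	free(mg.mg_allocator);
	return (0);
}
```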