diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index e45bff26944..77d7441e8a1 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -5844,6 +5844,7 @@ zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) * metaslabs. We want to set them up for * zio_claim(). */ + vdev_metaslab_group_create(vd); VERIFY0(vdev_metaslab_init(vd, 0)); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; @@ -5882,6 +5883,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) */ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; spa->spa_log_class->mc_ops = &zdb_metaslab_ops; + spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops; zcb->zcb_vd_obsolete_counts = umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), @@ -6015,7 +6017,6 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) vdev_t *rvd = spa->spa_root_vdev; for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; - metaslab_group_t *mg __maybe_unused = vd->vdev_mg; if (zcb->zcb_vd_obsolete_counts[c] != NULL) { leaks |= zdb_check_for_obsolete_leaks(vd, zcb); @@ -6023,7 +6024,9 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - ASSERT3P(mg, ==, msp->ms_group); + ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class == + spa_embedded_log_class(spa)) ? + vd->vdev_log_mg : vd->vdev_mg); /* * ms_allocatable has been overloaded @@ -6230,6 +6233,8 @@ dump_block_stats(spa_t *spa) zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); + zcb.zcb_totalasize += + metaslab_class_get_alloc(spa_embedded_log_class(spa)); zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); @@ -6277,6 +6282,7 @@ dump_block_stats(spa_t *spa) total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)) + + metaslab_class_get_alloc(spa_embedded_log_class(spa)) + metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); @@ -6344,6 +6350,17 @@ dump_block_stats(spa_t *spa) 100.0 * alloc / space); } + if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_embedded_log_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_embedded_log_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Embedded log class", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb.zcb_embedded_blocks[i] == 0) continue; diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 0c50d0409b2..052dcfb10d4 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -142,9 +142,6 @@ typedef enum dmu_object_byteswap { #define DMU_OT_IS_DDT(ot) \ ((ot) == DMU_OT_DDT_ZAP) -#define DMU_OT_IS_ZIL(ot) \ - ((ot) == DMU_OT_INTENT_LOG) - /* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */ #define DMU_OT_IS_FILE(ot) \ ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) diff --git a/include/sys/spa.h b/include/sys/spa.h index 045431c2096..0762ae8a3e1 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1047,6 +1047,7 @@ extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); +extern metaslab_class_t *spa_embedded_log_class(spa_t *spa); 
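The new "Embedded log class" line in dump_block_stats() reports the class's allocated bytes and a utilization percentage, and is only printed when the class actually has an allocator rotor. A small standalone sketch of that reporting arithmetic follows; the class_summary_t struct and the sample byte counts are invented stand-ins for metaslab_class_get_alloc()/metaslab_class_get_space(), not zdb code.

#include <stdio.h>
#include <stdint.h>

/* Toy stand-in for per-class alloc/space totals. */
typedef struct {
	const char *name;
	uint64_t alloc;		/* bytes allocated in this class */
	uint64_t space;		/* total bytes in this class */
} class_summary_t;

int
main(void)
{
	/* Sample numbers; a real pool gets these from its metaslab classes. */
	class_summary_t classes[] = {
		{ "Normal class",	900ULL << 20,	10ULL << 30 },
		{ "Embedded log class",	  8ULL << 20,	 1ULL << 30 },
	};

	for (unsigned i = 0; i < sizeof (classes) / sizeof (classes[0]); i++) {
		class_summary_t *cs = &classes[i];

		/* Skip empty/unused classes (zdb tests the rotor instead). */
		if (cs->space == 0)
			continue;
		(void) printf("\t%-16s %14llu used: %5.2f%%\n", cs->name,
		    (unsigned long long)cs->alloc,
		    100.0 * cs->alloc / cs->space);
	}
	return (0);
}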
extern metaslab_class_t *spa_special_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index a3afaef3872..7f15fd030fa 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -226,6 +226,7 @@ struct spa { boolean_t spa_is_exporting; /* true while exporting pool */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ + metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ metaslab_class_t *spa_special_class; /* special allocation class */ metaslab_class_t *spa_dedup_class; /* dedup allocation class */ uint64_t spa_first_txg; /* first txg after spa_open() */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 7bc72a03db1..d1ef6b5b59b 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #ifdef __cplusplus @@ -113,6 +114,9 @@ extern void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg); extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); + +extern metaslab_group_t *vdev_get_mg(vdev_t *vd, metaslab_class_t *mc); + extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index fc169842a86..1239451bf42 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -280,6 +280,7 @@ struct vdev { uint64_t vdev_ms_shift; /* metaslab size shift */ uint64_t vdev_ms_count; /* number of metaslabs */ metaslab_group_t *vdev_mg; /* metaslab group */ + metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ uint64_t vdev_pending_fastwrite; /* allocated fastwrites */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ @@ -636,6 +637,7 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); * Other miscellaneous functions */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); +void vdev_metaslab_group_create(vdev_t *vd); /* * Vdev ashift optimization tunables diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index 89587876f94..8b9629fb5e2 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -56,6 +56,7 @@ extern int zfs_dbgmsg_enable; #define ZFS_DEBUG_INDIRECT_REMAP (1 << 10) #define ZFS_DEBUG_TRIM (1 << 11) #define ZFS_DEBUG_LOG_SPACEMAP (1 << 12) +#define ZFS_DEBUG_METASLAB_ALLOC (1 << 13) extern void __set_error(const char *file, const char *func, int line, int err); extern void __zfs_dbgmsg(char *buf); diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 41e8ffa7958..8fec44dd37e 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -3936,6 +3936,22 @@ to limit potential SLOG device abuse by single active ZIL writer. Default value: \fB786,432\fR. .RE +.sp +.ne 2 +.na +\fBzfs_embedded_slog_min_ms\fR (int) +.ad +.RS 12n +Usually, one metaslab from each (normal-class) vdev is dedicated for use by +the ZIL (to log synchronous writes). +However, if there are fewer than zfs_embedded_slog_min_ms metaslabs in the +vdev, this functionality is disabled. +This ensures that we don't set aside an unreasonable amount of space for the +ZIL. 
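ZFS_DEBUG_METASLAB_ALLOC is a new bit in the zfs_flags debug mask; the zio.c hunks later in this patch test it before emitting the extra zfs_dbgmsg() output on allocation fallback. Below is a minimal standalone sketch of that bit-flag gating pattern; the DBG_* values copy the ones visible in zfs_debug.h, but the dbg() helper, the dbg_flags variable, and the sample message are invented for illustration.

#include <stdio.h>
#include <stdarg.h>

/* One bit per debug category, as in zfs_debug.h. */
#define	DBG_TRIM		(1 << 11)
#define	DBG_LOG_SPACEMAP	(1 << 12)
#define	DBG_METASLAB_ALLOC	(1 << 13)	/* new in this change */

static unsigned int dbg_flags = DBG_METASLAB_ALLOC;	/* like zfs_flags */

/* Invented helper standing in for zfs_dbgmsg(). */
static void
dbg(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	(void) vfprintf(stderr, fmt, ap);
	va_end(ap);
	(void) fputc('\n', stderr);
}

int
main(void)
{
	/* Only log the noisy allocation-fallback path when the bit is set. */
	if (dbg_flags & DBG_METASLAB_ALLOC)
		dbg("metaslab allocation failure, trying normal class: "
		    "size %llu, error %d", 131072ULL, 28);
	return (0);
}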
+.sp +Default value: \fB64\fR. +.RE + .sp .ne 2 .na diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index bed6bf64c92..fdc6922b001 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -524,7 +524,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc) for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; + metaslab_group_t *mg = vdev_get_mg(tvd, mc); /* * Skip any holes, uninitialized top-levels, or @@ -535,12 +535,16 @@ metaslab_class_histogram_verify(metaslab_class_t *mc) continue; } + IMPLY(mg == mg->mg_vd->vdev_log_mg, + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; } - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); + } kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } @@ -1004,16 +1008,22 @@ metaslab_group_initialized(metaslab_group_t *mg) uint64_t metaslab_group_get_space(metaslab_group_t *mg) { - return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); + /* + * Note that the number of nodes in mg_metaslab_tree may be one less + * than vdev_ms_count, due to the embedded log metaslab. + */ + mutex_enter(&mg->mg_lock); + uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree); + mutex_exit(&mg->mg_lock); + return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count); } void metaslab_group_histogram_verify(metaslab_group_t *mg) { uint64_t *mg_hist; - vdev_t *vd = mg->mg_vd; - uint64_t ashift = vd->vdev_ashift; - int i; + avl_tree_t *t = &mg->mg_metaslab_tree; + uint64_t ashift = mg->mg_vd->vdev_ashift; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; @@ -1024,21 +1034,25 @@ metaslab_group_histogram_verify(metaslab_group_t *mg) ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, SPACE_MAP_HISTOGRAM_SIZE + ashift); - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - - /* skip if not active or not a member */ - if (msp->ms_sm == NULL || msp->ms_group != mg) + mutex_enter(&mg->mg_lock); + for (metaslab_t *msp = avl_first(t); + msp != NULL; msp = AVL_NEXT(t, msp)) { + VERIFY3P(msp->ms_group, ==, mg); + /* skip if not active */ + if (msp->ms_sm == NULL) continue; - for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { mg_hist[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; + } } - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); + mutex_exit(&mg->mg_lock); + kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } @@ -1054,6 +1068,8 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) mutex_enter(&mg->mg_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + IMPLY(mg == mg->mg_vd->vdev_log_mg, + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] += @@ -1078,6 +1094,8 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) msp->ms_sm->sm_phys->smp_histogram[i]); ASSERT3U(mc->mc_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); + IMPLY(mg == mg->mg_vd->vdev_log_mg, + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; @@ -2741,37 +2759,47 @@ 
metaslab_fini(metaslab_t *msp) mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); - metaslab_space_update(vd, mg->mg_class, - -metaslab_allocated_space(msp), 0, -msp->ms_size); + /* + * If the range trees haven't been allocated, this metaslab hasn't + * been through metaslab_sync_done() for the first time yet, so its + * space hasn't been accounted for in its vdev and doesn't need to be + * subtracted. + */ + if (msp->ms_freed != NULL) { + metaslab_space_update(vd, mg->mg_class, + -metaslab_allocated_space(msp), 0, -msp->ms_size); + } space_map_close(msp->ms_sm); msp->ms_sm = NULL; metaslab_unload(msp); + range_tree_destroy(msp->ms_allocatable); - range_tree_destroy(msp->ms_freeing); - range_tree_destroy(msp->ms_freed); - ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, - metaslab_unflushed_changes_memused(msp)); - spa->spa_unflushed_stats.sus_memused -= - metaslab_unflushed_changes_memused(msp); - range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); - range_tree_destroy(msp->ms_unflushed_allocs); - range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); - range_tree_destroy(msp->ms_unflushed_frees); + if (msp->ms_freed != NULL) { + range_tree_destroy(msp->ms_freeing); + range_tree_destroy(msp->ms_freed); - for (int t = 0; t < TXG_SIZE; t++) { - range_tree_destroy(msp->ms_allocating[t]); - } + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_allocs); + range_tree_destroy(msp->ms_checkpointing); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_frees); - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_destroy(msp->ms_defer[t]); + for (int t = 0; t < TXG_SIZE; t++) { + range_tree_destroy(msp->ms_allocating[t]); + } + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_destroy(msp->ms_defer[t]); + } } ASSERT0(msp->ms_deferspace); - range_tree_destroy(msp->ms_checkpointing); - for (int t = 0; t < TXG_SIZE; t++) ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); @@ -5113,7 +5141,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * all else fails. 
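metaslab_fini() now unwinds the vdev space accounting and the sync-related range trees only if ms_freed was ever allocated, i.e. only if the metaslab made it through its first metaslab_sync_done(). That matters because vdev_metaslab_init(), later in this patch, may fini a freshly created metaslab and immediately re-init it under the embedded log group. A standalone sketch of the same "only undo what was actually set up" pattern; object_t and its members are invented stand-ins, not metaslab_t.

#include <stdio.h>
#include <stdlib.h>

/* Toy object whose sync-related members are only allocated on first sync. */
typedef struct {
	int	*freeing;	/* stands in for ms_freeing */
	int	*freed;		/* stands in for ms_freed: the sentinel */
	long	accounted;	/* space charged against the parent */
} object_t;

static void
object_first_sync(object_t *o, long size)
{
	o->freeing = calloc(1, sizeof (int));
	o->freed = calloc(1, sizeof (int));
	o->accounted = size;
}

static void
object_fini(object_t *o, long *parent_space)
{
	/*
	 * If 'freed' was never allocated, the object never went through its
	 * first sync, so there is nothing to unaccount and nothing to free.
	 */
	if (o->freed != NULL) {
		*parent_space -= o->accounted;
		free(o->freeing);
		free(o->freed);
	}
}

int
main(void)
{
	long parent_space = 0;
	object_t fresh = { 0 };		/* fini'd before ever syncing */
	object_t synced = { 0 };

	object_first_sync(&synced, 1 << 20);
	parent_space += synced.accounted;

	object_fini(&fresh, &parent_space);	/* safe: nothing to undo */
	object_fini(&synced, &parent_space);
	(void) printf("parent space after fini: %ld\n", parent_space);
	return (0);
}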
*/ if (vd != NULL && vd->vdev_mg != NULL) { - mg = vd->vdev_mg; + mg = vdev_get_mg(vd, mc); if (flags & METASLAB_HINTBP_AVOID && mg->mg_next != NULL) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 53ffbc31c18..57a492993ea 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -303,10 +303,12 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) alloc = metaslab_class_get_alloc(mc); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); + alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); + size += metaslab_class_get_space(spa_embedded_log_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); @@ -1196,6 +1198,8 @@ spa_activate(spa_t *spa, spa_mode_t mode) spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_embedded_log_class = + metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); @@ -1347,6 +1351,9 @@ spa_deactivate(spa_t *spa) metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; + metaslab_class_destroy(spa->spa_embedded_log_class); + spa->spa_embedded_log_class = NULL; + metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; @@ -2103,6 +2110,9 @@ spa_check_logs(spa_t *spa) return (rv); } +/* + * Passivate any log vdevs (note, does not apply to embedded log metaslabs). + */ static boolean_t spa_passivate_log(spa_t *spa) { @@ -2113,10 +2123,10 @@ spa_passivate_log(spa_t *spa) for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) { - metaslab_group_passivate(mg); + ASSERT3P(tvd->vdev_log_mg, ==, NULL); + metaslab_group_passivate(tvd->vdev_mg); slog_found = B_TRUE; } } @@ -2124,6 +2134,9 @@ spa_passivate_log(spa_t *spa) return (slog_found); } +/* + * Activate any log vdevs (note, does not apply to embedded log metaslabs). 
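In the metaslab.c hunk above, metaslab_alloc_dva() now resolves the hint vdev's group through vdev_get_mg(), which hands back vdev_log_mg only when the allocation is for the embedded log class and the vdev actually has a log group, and vdev_mg in every other case (including dedicated log vdevs, which never get one). A standalone sketch of that selection; the toy types and toy_get_mg() are invented, while the real function lives in module/zfs/vdev.c.

#include <stdio.h>
#include <stddef.h>

typedef struct mclass { const char *mc_name; } mclass_t;
typedef struct mgroup { const char *mg_name; } mgroup_t;

/* Toy vdev: one normal group, and optionally one embedded-log group. */
typedef struct tvdev {
	mgroup_t	*mg;
	mgroup_t	*log_mg;	/* NULL on dedicated log vdevs */
} tvdev_t;

static mclass_t embedded_log_class = { "embedded_log" };
static mclass_t normal_class = { "normal" };

/* Mirrors the shape of vdev_get_mg(): class -> metaslab group. */
static mgroup_t *
toy_get_mg(tvdev_t *vd, mclass_t *mc)
{
	if (mc == &embedded_log_class && vd->log_mg != NULL)
		return (vd->log_mg);
	return (vd->mg);
}

int
main(void)
{
	mgroup_t normal_mg = { "vdev_mg" };
	mgroup_t log_mg = { "vdev_log_mg" };
	tvdev_t data_vdev = { &normal_mg, &log_mg };
	tvdev_t slog_vdev = { &normal_mg, NULL };

	(void) printf("data vdev, embedded log class -> %s\n",
	    toy_get_mg(&data_vdev, &embedded_log_class)->mg_name);
	(void) printf("data vdev, normal class       -> %s\n",
	    toy_get_mg(&data_vdev, &normal_class)->mg_name);
	(void) printf("slog vdev, embedded log class -> %s\n",
	    toy_get_mg(&slog_vdev, &embedded_log_class)->mg_name);
	return (0);
}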
+ */ static void spa_activate_log(spa_t *spa) { @@ -2133,10 +2146,11 @@ spa_activate_log(spa_t *spa) for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - if (tvd->vdev_islog) - metaslab_group_activate(mg); + if (tvd->vdev_islog) { + ASSERT3P(tvd->vdev_log_mg, ==, NULL); + metaslab_group_activate(tvd->vdev_mg); + } } } @@ -8033,12 +8047,16 @@ spa_async_thread(void *arg) old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); + old_space += metaslab_class_get_space( + spa_embedded_log_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); new_space += metaslab_class_get_space(spa_special_class(spa)); new_space += metaslab_class_get_space(spa_dedup_class(spa)); + new_space += metaslab_class_get_space( + spa_embedded_log_class(spa)); mutex_exit(&spa_namespace_lock); /* diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index f49be8eec01..b4c73f58d3b 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -349,9 +349,11 @@ int spa_asize_inflation = 24; * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in * the pool to be consumed. This ensures that we don't run the pool * completely out of space, due to unaccounted changes (e.g. to the MOS). - * It also limits the worst-case time to allocate space. If we have - * less than this amount of free space, most ZPL operations (e.g. write, - * create) will return ENOSPC. + * It also limits the worst-case time to allocate space. If we have less than + * this amount of free space, most ZPL operations (e.g. write, create) will + * return ENOSPC. The ZIL metaslabs (spa_embedded_log_class) are also part of + * this 3.2% of space which can't be consumed by normal writes; the slop space + * "proper" (spa_get_slop_space()) is decreased by the embedded log space. * * Certain operations (e.g. file removal, most administrative actions) can * use half the slop space. They will only return ENOSPC if less than half @@ -1026,10 +1028,10 @@ spa_aux_activate(vdev_t *vd, avl_tree_t *avl) /* * Spares are tracked globally due to the following constraints: * - * - A spare may be part of multiple pools. - * - A spare may be added to a pool even if it's actively in use within + * - A spare may be part of multiple pools. + * - A spare may be added to a pool even if it's actively in use within * another pool. - * - A spare in use in any pool can only be the source of a replacement if + * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. * * We keep track of all spares on the system through the use of a reference @@ -1236,6 +1238,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) */ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); + ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); @@ -1776,17 +1779,37 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) } /* - * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%), - * or at least 128MB, unless that would cause it to be more than half the - * pool size. 
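The spa.c hunks above fold the embedded log class into the pool-wide totals: spa_prop_get_config() adds it to both the allocated and size sums, and spa_async_thread() includes it when measuring how much space a config update added. A standalone sketch of that aggregation; the mclass_t struct and the byte counts are invented sample values for the normal, special, dedup, and embedded log classes.

#include <stdio.h>
#include <stdint.h>

/* Per-class totals, as metaslab_class_get_alloc()/get_space() would return. */
typedef struct {
	const char *name;
	uint64_t alloc;
	uint64_t space;
} mclass_t;

int
main(void)
{
	mclass_t classes[] = {
		{ "normal",		40ULL << 30,	100ULL << 30 },
		{ "special",		 2ULL << 30,	 10ULL << 30 },
		{ "dedup",		 1ULL << 30,	  5ULL << 30 },
		{ "embedded_log",	16ULL << 20,	  2ULL << 30 },
	};
	uint64_t alloc = 0, size = 0;

	/*
	 * The pool-wide ZPOOL_PROP_ALLOCATED and ZPOOL_PROP_SIZE values are
	 * simply the sums over all of these classes.
	 */
	for (unsigned i = 0; i < sizeof (classes) / sizeof (classes[0]); i++) {
		alloc += classes[i].alloc;
		size += classes[i].space;
	}
	(void) printf("allocated=%llu size=%llu free=%llu\n",
	    (unsigned long long)alloc, (unsigned long long)size,
	    (unsigned long long)(size - alloc));
	return (0);
}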
- * - * See the comment above spa_slop_shift for details. + * Return the amount of slop space in bytes. It is typically 1/32 of the pool + * (3.2%), minus the embedded log space. On very small pools, it may be + * slightly larger than this. The embedded log space is not included in + * spa_dspace. By subtracting it, the usable space (per "zfs list") is a + * constant 97% of the total space, regardless of metaslab size (assuming the + * default spa_slop_shift=5 and a non-tiny pool). + * + * See the comment above spa_slop_shift for more details. */ uint64_t spa_get_slop_space(spa_t *spa) { uint64_t space = spa_get_dspace(spa); - return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop))); + uint64_t slop = space >> spa_slop_shift; + + /* + * Subtract the embedded log space, but no more than half the (3.2%) + * unusable space. Note, the "no more than half" is only relevant if + * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by + * default. + */ + uint64_t embedded_log = + metaslab_class_get_dspace(spa_embedded_log_class(spa)); + slop -= MIN(embedded_log, slop >> 1); + + /* + * Slop space should be at least spa_min_slop, but no more than half + * the entire pool. + */ + slop = MAX(slop, MIN(space >> 1, spa_min_slop)); + return (slop); } uint64_t @@ -1872,6 +1895,12 @@ spa_log_class(spa_t *spa) return (spa->spa_log_class); } +metaslab_class_t * +spa_embedded_log_class(spa_t *spa) +{ + return (spa->spa_embedded_log_class); +} + metaslab_class_t * spa_special_class(spa_t *spa) { @@ -1891,12 +1920,10 @@ metaslab_class_t * spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk) { - if (DMU_OT_IS_ZIL(objtype)) { - if (spa->spa_log_class->mc_groups != 0) - return (spa_log_class(spa)); - else - return (spa_normal_class(spa)); - } + /* + * ZIL allocations determine their class in zio_alloc_zil(). + */ + ASSERT(objtype != DMU_OT_INTENT_LOG); boolean_t has_special_class = spa->spa_special_class->mc_groups != 0; @@ -2432,9 +2459,9 @@ spa_fini(void) } /* - * Return whether this pool has slogs. No locking needed. + * Return whether this pool has a dedicated slog device. No locking needed. * It's not a problem if the wrong answer is returned as it's only for - * performance and not correctness + * performance and not correctness. */ boolean_t spa_has_slogs(spa_t *spa) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 7ffe924212d..f305da6f56e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -59,6 +59,27 @@ #include #include +/* + * One metaslab from each (normal-class) vdev is used by the ZIL. These are + * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are + * part of the spa_embedded_log_class. The metaslab with the most free space + * in each vdev is selected for this purpose when the pool is opened (or a + * vdev is added). See vdev_metaslab_init(). + * + * Log blocks can be allocated from the following locations. Each one is tried + * in order until the allocation succeeds: + * 1. dedicated log vdevs, aka "slog" (spa_log_class) + * 2. embedded slog metaslabs (spa_embedded_log_class) + * 3. other metaslabs in normal vdevs (spa_normal_class) + * + * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer + * than this number of metaslabs in the vdev. This ensures that we don't set + * aside an unreasonable amount of space for the ZIL. 
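The rewritten spa_get_slop_space() above takes 1/2^spa_slop_shift of dspace, subtracts the embedded log class's space (capped at half the slop), and then keeps the result at no less than spa_min_slop and no more than half the pool. A standalone sketch of exactly that arithmetic; the shift of 5 and the 128 MiB floor follow the spa_slop_shift/spa_min_slop defaults, and the pool sizes in main() are made up.

#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

static int slop_shift = 5;			/* like spa_slop_shift */
static uint64_t min_slop = 128ULL << 20;	/* like spa_min_slop (128MB) */

/* Same shape as the new spa_get_slop_space() computation. */
static uint64_t
slop_space(uint64_t dspace, uint64_t embedded_log_space)
{
	uint64_t slop = dspace >> slop_shift;

	/* Subtract embedded log space, but no more than half the slop. */
	slop -= MIN(embedded_log_space, slop >> 1);

	/* At least min_slop, but never more than half the pool. */
	return (MAX(slop, MIN(dspace >> 1, min_slop)));
}

int
main(void)
{
	/* A 1 TiB pool with one 512 MiB embedded log metaslab (made up). */
	uint64_t dspace = 1ULL << 40;
	uint64_t embedded = 512ULL << 20;

	(void) printf("slop = %llu MiB\n",
	    (unsigned long long)(slop_space(dspace, embedded) >> 20));
	return (0);
}

With the default spa_slop_shift of 5 this is what keeps the space visible to "zfs list" at a steady ~97% of the pool, as the new comment explains.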
If set to less than + * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced + * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab. + */ +int zfs_embedded_slog_min_ms = 64; + +metaslab_group_t * +vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) +{ + if (mc == spa_embedded_log_class(vd->vdev_spa) && + vd->vdev_log_mg != NULL) + return (vd->vdev_log_mg); + else + return (vd->vdev_mg); +} + /* ARGSUSED */ void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, @@ -978,6 +1015,11 @@ vdev_free(vdev_t *vd) metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } + if (vd->vdev_log_mg != NULL) { + ASSERT0(vd->vdev_ms_count); + metaslab_group_destroy(vd->vdev_log_mg); + vd->vdev_log_mg = NULL; + } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); @@ -1098,14 +1140,20 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); + if (tvd->vdev_log_mg) + ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); tvd->vdev_mg = svd->vdev_mg; + tvd->vdev_log_mg = svd->vdev_log_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; + svd->vdev_log_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; + if (tvd->vdev_log_mg != NULL) + tvd->vdev_log_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; @@ -1283,7 +1331,7 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } -static void +void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; @@ -1317,6 +1365,11 @@ vdev_metaslab_group_create(vdev_t *vd) vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); + if (!vd->vdev_islog) { + vd->vdev_log_mg = metaslab_group_create( + spa_embedded_log_class(spa), vd, 1); + } + /* * The spa ashift min/max only apply for the normal metaslab * class. Class destination is late binding so ashift boundry @@ -1340,8 +1393,6 @@ int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; @@ -1369,16 +1420,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) vd->vdev_ms = mspp; vd->vdev_ms_count = newc; - for (m = oldc; m < newc; m++) { - uint64_t object = 0; + for (uint64_t m = oldc; m < newc; m++) { + uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { - error = dmu_read(mos, vd->vdev_ms_array, + error = dmu_read(spa->spa_meta_objset, + vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { @@ -1388,17 +1440,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) } } -#ifndef _KERNEL - /* - * To accommodate zdb_leak_init() fake indirect - * metaslabs, we allocate a metaslab group for - * indirect vdevs which normally don't have one. - */ - if (vd->vdev_mg == NULL) { - ASSERT0(vdev_is_concrete(vd)); - vdev_metaslab_group_create(vd); - } -#endif error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { @@ -1408,6 +1449,47 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) } } + /* + * Find the emptiest metaslab on the vdev and mark it for use for + * embedded slog by moving it from the regular to the log metaslab + * group.
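The block that follows in vdev_metaslab_init() scans only the newly added metaslabs, picks the one with the least allocated space, and re-initializes it under vdev_log_mg; it only does this for normal-class vdevs that have more than zfs_embedded_slog_min_ms metaslabs and whose log group is still empty. A standalone sketch of just the selection rule; the allocation-total array and pick_embedded_slog_ms() are invented, whereas the real code reads space_map_allocated() and then calls metaslab_fini()/metaslab_init() on the winner.

#include <stdio.h>
#include <stdint.h>

#define	EMBEDDED_SLOG_MIN_MS	64	/* like zfs_embedded_slog_min_ms */

/*
 * Given allocated-byte totals for a vdev's metaslabs, return the index of
 * the new metaslab (oldc..newc-1) to dedicate to the embedded slog, or -1
 * if the vdev is too small to give one up.
 */
static int64_t
pick_embedded_slog_ms(const uint64_t *allocated, uint64_t oldc, uint64_t newc)
{
	if (newc <= EMBEDDED_SLOG_MIN_MS)
		return (-1);

	uint64_t smallest = UINT64_MAX;
	int64_t pick = -1;

	/* Only new metaslabs are candidates; old ones may be active. */
	for (uint64_t m = oldc; m < newc; m++) {
		if (allocated[m] < smallest) {
			smallest = allocated[m];
			pick = (int64_t)m;
		}
	}
	return (pick);
}

int
main(void)
{
	/* Made-up space-map allocation totals for a 100-metaslab vdev. */
	uint64_t allocated[100];

	for (uint64_t m = 0; m < 100; m++)
		allocated[m] = (m * 37 + 11) % 97;	/* arbitrary pattern */
	allocated[42] = 0;				/* the emptiest one */

	(void) printf("embedded slog metaslab: %lld\n",
	    (long long)pick_embedded_slog_ms(allocated, 0, 100));
	return (0);
}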
+ */ + if (vd->vdev_mg->mg_class == spa_normal_class(spa) && + vd->vdev_ms_count > zfs_embedded_slog_min_ms && + avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { + uint64_t slog_msid = 0; + uint64_t smallest = UINT64_MAX; + + /* + * Note, we only search the new metaslabs, because the old + * (pre-existing) ones may be active (e.g. have non-empty + * range_tree's), and we don't move them to the new + * metaslab_t. + */ + for (uint64_t m = oldc; m < newc; m++) { + uint64_t alloc = + space_map_allocated(vd->vdev_ms[m]->ms_sm); + if (alloc < smallest) { + slog_msid = m; + smallest = alloc; + } + } + metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; + /* + * The metaslab was marked as dirty at the end of + * metaslab_init(). Remove it from the dirty list so that we + * can uninitialize and reinitialize it to the new class. + */ + if (txg != 0) { + (void) txg_list_remove_this(&vd->vdev_ms_list, + slog_ms, txg); + } + uint64_t sm_obj = space_map_object(slog_ms->ms_sm); + metaslab_fini(slog_ms); + VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, + &vd->vdev_ms[slog_msid])); + } + if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); @@ -1418,6 +1500,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) */ if (!expanding && !vd->vdev_removing) { metaslab_group_activate(vd->vdev_mg); + if (vd->vdev_log_mg != NULL) + metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) @@ -1453,7 +1537,12 @@ vdev_metaslab_fini(vdev_t *vd) if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; + metaslab_group_passivate(mg); + if (vd->vdev_log_mg != NULL) { + ASSERT(!vd->vdev_islog); + metaslab_group_passivate(vd->vdev_log_mg); + } uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { @@ -1463,11 +1552,13 @@ vdev_metaslab_fini(vdev_t *vd) } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; - vd->vdev_ms_count = 0; - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); + if (vd->vdev_log_mg != NULL) + ASSERT0(vd->vdev_log_mg->mg_histogram[i]); + } } ASSERT0(vd->vdev_ms_count); ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); @@ -3531,8 +3622,11 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) != NULL) metaslab_sync_done(msp, txg); - if (reassess) + if (reassess) { metaslab_sync_reassess(vd->vdev_mg); + if (vd->vdev_log_mg != NULL) + metaslab_sync_reassess(vd->vdev_log_mg); + } } void @@ -3856,6 +3950,7 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) /* * Prevent any future allocations. */ + ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); @@ -4256,6 +4351,12 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { + /* + * The vdev fragmentation rating doesn't take into + * account the embedded slog metaslab (vdev_log_mg). + * Since it's only one metaslab, it would have a tiny + * impact on the overall fragmentation. + */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? 
vd->vdev_mg->mg_fragmentation : 0; } @@ -5234,6 +5335,9 @@ ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); +ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW, + "Minimum number of metaslabs required to dedicate one for log blocks"); + ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_ulong, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 6eaaddd3979..a758fe4fb34 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1207,6 +1207,11 @@ vdev_remove_complete(spa_t *spa) vd->vdev_mg = NULL; spa_log_sm_set_blocklimit(spa); } + if (vd->vdev_log_mg != NULL) { + ASSERT0(vd->vdev_ms_count); + metaslab_group_destroy(vd->vdev_log_mg); + vd->vdev_log_mg = NULL; + } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); @@ -1780,6 +1785,8 @@ spa_vdev_remove_cancel_impl(spa_t *spa) spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); vdev_t *vd = vdev_lookup_top(spa, vdid); metaslab_group_activate(vd->vdev_mg); + ASSERT(!vd->vdev_islog); + metaslab_group_activate(vd->vdev_log_mg); spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); } @@ -1858,6 +1865,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); + ASSERT3P(vd->vdev_log_mg, ==, NULL); ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* @@ -1893,6 +1901,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) if (error != 0) { metaslab_group_activate(mg); + ASSERT3P(vd->vdev_log_mg, ==, NULL); return (error); } ASSERT0(vd->vdev_stat.vs_alloc); @@ -2121,6 +2130,8 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) */ metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); + ASSERT(!vd->vdev_islog); + metaslab_group_passivate(vd->vdev_log_mg); /* * Wait for the youngest allocations and frees to sync, @@ -2157,6 +2168,8 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) if (error != 0) { metaslab_group_activate(mg); + ASSERT(!vd->vdev_islog); + metaslab_group_activate(vd->vdev_log_mg); spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3c2b731f7c4..bb343960973 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2750,10 +2750,9 @@ zio_write_gang_done(zio_t *zio) } static zio_t * -zio_write_gang_block(zio_t *pio) +zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { spa_t *spa = pio->io_spa; - metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; @@ -3470,6 +3469,17 @@ zio_dva_allocate(zio_t *zio) zio->io_metaslab_class = mc; } + /* + * Try allocating the block in the usual metaslab class. + * If that's full, allocate it in the normal class. + * If that's full, allocate as a gang block, + * and if all are full, the allocation fails (which shouldn't happen). + * + * Note that we do not fall back on embedded slog (ZIL) space, to + * preserve unfragmented slog space, which is critical for decent + * sync write performance. If a log allocation fails, we will fall + * back to spa_sync() which is abysmal for performance. 
+ */ error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); @@ -3489,26 +3499,38 @@ zio_dva_allocate(zio_t *zio) zio->io_prop.zp_copies, zio->io_allocator, zio); zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; - mc = spa_normal_class(spa); - VERIFY(metaslab_class_throttle_reserve(mc, + VERIFY(metaslab_class_throttle_reserve( + spa_normal_class(spa), zio->io_prop.zp_copies, zio->io_allocator, zio, flags | METASLAB_MUST_RESERVE)); - } else { - mc = spa_normal_class(spa); } - zio->io_metaslab_class = mc; + zio->io_metaslab_class = mc = spa_normal_class(spa); + if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { + zfs_dbgmsg("%s: metaslab allocation failure, " + "trying normal class: zio %px, size %llu, error %d", + spa_name(spa), zio, zio->io_size, error); + } error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); } + if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { + if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { + zfs_dbgmsg("%s: metaslab allocation failure, " + "trying ganging: zio %px, size %llu, error %d", + spa_name(spa), zio, zio->io_size, error); + } + return (zio_write_gang_block(zio, mc)); + } if (error != 0) { - zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " - "size %llu, error %d", spa_name(spa), zio, zio->io_size, - error); - if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) - return (zio_write_gang_block(zio)); + if (error != ENOSPC || + (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) { + zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " + "size %llu, error %d", + spa_name(spa), zio, zio->io_size, error); + } zio->io_error = error; } @@ -3588,15 +3610,18 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, int flags = METASLAB_FASTWRITE | METASLAB_ZIL; int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; - error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, - 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); - if (error == 0) { - *slog = TRUE; - } else { - error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, - 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); - if (error == 0) - *slog = FALSE; + error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, + txg, NULL, flags, &io_alloc_list, NULL, allocator); + *slog = (error == 0); + if (error != 0) { + error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, + new_bp, 1, txg, NULL, flags, + &io_alloc_list, NULL, allocator); + } + if (error != 0) { + error = metaslab_alloc(spa, spa_normal_class(spa), size, + new_bp, 1, txg, NULL, flags, + &io_alloc_list, NULL, allocator); } metaslab_trace_fini(&io_alloc_list);