diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 1ddff0d4e4e7..03513f9f2c6c 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1067,6 +1067,8 @@ int dmu_diff(const char *tosnap_name, const char *fromsnap_name, #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; +extern int dmu_prefetch_max; + #ifdef __cplusplus } #endif diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 129a68be41c5..b777a3cae439 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -49,11 +49,14 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, metaslab_t **); void metaslab_fini(metaslab_t *); +void metaslab_set_unflushed_dirty(metaslab_t *, boolean_t); void metaslab_set_unflushed_txg(metaslab_t *, uint64_t, dmu_tx_t *); void metaslab_set_estimated_condensed_size(metaslab_t *, uint64_t, dmu_tx_t *); +boolean_t metaslab_unflushed_dirty(metaslab_t *); uint64_t metaslab_unflushed_txg(metaslab_t *); uint64_t metaslab_estimated_condensed_size(metaslab_t *); int metaslab_sort_by_flushed(const void *, const void *); +void metaslab_unflushed_bump(metaslab_t *, dmu_tx_t *, boolean_t); uint64_t metaslab_unflushed_changes_memused(metaslab_t *); int metaslab_load(metaslab_t *); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 3dbee4c17fef..820c61a252e2 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -553,6 +553,7 @@ struct metaslab { * log space maps. */ uint64_t ms_unflushed_txg; + boolean_t ms_unflushed_dirty; /* updated every time we are done syncing the metaslab's space map */ uint64_t ms_synced_length; diff --git a/include/sys/spa_log_spacemap.h b/include/sys/spa_log_spacemap.h index b2ed77fac3e4..72229df6cd16 100644 --- a/include/sys/spa_log_spacemap.h +++ b/include/sys/spa_log_spacemap.h @@ -30,7 +30,10 @@ typedef struct log_summary_entry { uint64_t lse_start; /* start TXG */ + uint64_t lse_end; /* last TXG */ + uint64_t lse_txgcount; /* # of TXGs */ uint64_t lse_mscount; /* # of metaslabs needed to be flushed */ + uint64_t lse_msdcount; /* # of dirty metaslabs needed to be flushed */ uint64_t lse_blkcount; /* blocks held by this entry */ list_node_t lse_node; } log_summary_entry_t; @@ -50,6 +53,7 @@ typedef struct spa_log_sm { uint64_t sls_nblocks; /* number of blocks in this log */ uint64_t sls_mscount; /* # of metaslabs flushed in the log's txg */ avl_node_t sls_node; /* node in spa_sm_logs_by_txg */ + space_map_t *sls_sm; /* space map pointer, if open */ } spa_log_sm_t; int spa_ld_log_spacemaps(spa_t *); @@ -68,8 +72,9 @@ uint64_t spa_log_sm_memused(spa_t *); void spa_log_sm_decrement_mscount(spa_t *, uint64_t); void spa_log_sm_increment_current_mscount(spa_t *); -void spa_log_summary_add_flushed_metaslab(spa_t *); -void spa_log_summary_decrement_mscount(spa_t *, uint64_t); +void spa_log_summary_add_flushed_metaslab(spa_t *, boolean_t); +void spa_log_summary_dirty_flushed_metaslab(spa_t *, uint64_t); +void spa_log_summary_decrement_mscount(spa_t *, uint64_t, boolean_t); void spa_log_summary_decrement_blkcount(spa_t *, uint64_t); boolean_t spa_flush_all_logs_requested(spa_t *); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 01e9c5de445d..a7168a7ce2e2 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1011,7 +1011,12 @@ Thus we always allow at least this many log blocks. 
.It Sy zfs_unflushed_log_block_pct Ns = Ns Sy 400 Ns % Pq ulong Tunable used to determine the number of blocks that can be used for the spacemap log, expressed as a percentage of the total number of -metaslabs in the pool. +unflushed metaslabs in the pool. +. +.It Sy zfs_unflushed_log_txg_max Ns = Ns Sy 1000 Pq ulong +Tunable limiting the maximum time in TXGs any metaslab may remain unflushed. +It effectively limits the maximum number of unflushed per-TXG spacemap logs +that need to be read after an unclean pool export. . .It Sy zfs_unlink_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint When enabled, files will not be asynchronously removed from the list of pending diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 461feeffb6a3..7d8b2c96bd74 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -86,7 +86,7 @@ static int zfs_dmu_offset_next_sync = 1; * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. */ -static int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; +int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 7ed83b305db7..f9c16f00d51a 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -2750,7 +2750,8 @@ metaslab_fini_flush_data(metaslab_t *msp) mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); - spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); + spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp), + metaslab_unflushed_dirty(msp)); } uint64_t @@ -3728,50 +3729,45 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) metaslab_flush_update(msp, tx); } -/* - * Called when the metaslab has been flushed (its own spacemap now reflects - * all the contents of the pool-wide spacemap log). Updates the metaslab's - * metadata and any pool-wide related log space map data (e.g. summary, - * obsolete logs, etc..) to reflect that. - */ static void -metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx) { - metaslab_group_t *mg = msp->ms_group; - spa_t *spa = mg->mg_vd->vdev_spa; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - ASSERT3U(spa_sync_pass(spa), ==, 1); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + ASSERT(spa_syncing_log_sm(spa) != NULL); + ASSERT(msp->ms_sm != NULL); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); - /* - * Just because a metaslab got flushed, that doesn't mean that - * it will pass through metaslab_sync_done(). Thus, make sure to - * update ms_synced_length here in case it doesn't. - */ - msp->ms_synced_length = space_map_length(msp->ms_sm); + mutex_enter(&spa->spa_flushed_ms_lock); + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + metaslab_set_unflushed_dirty(msp, B_TRUE); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); - /* - * We may end up here from metaslab_condense() without the - * feature being active. In that case this is a no-op. 
- */ - if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) - return; + spa_log_sm_increment_current_mscount(spa); + spa_log_summary_add_flushed_metaslab(spa, B_TRUE); +} +void +metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); /* update metaslab's position in our flushing tree */ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); + boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + metaslab_set_unflushed_dirty(msp, dirty); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); @@ -3779,17 +3775,47 @@ metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); spa_log_sm_increment_current_mscount(spa); + /* update log space map summary */ + spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg, + ms_prev_flushed_dirty); + spa_log_summary_add_flushed_metaslab(spa, dirty); + /* cleanup obsolete logs if any */ - uint64_t log_blocks_before = spa_log_sm_nblocks(spa); spa_cleanup_old_sm_logs(spa, tx); - uint64_t log_blocks_after = spa_log_sm_nblocks(spa); - VERIFY3U(log_blocks_after, <=, log_blocks_before); +} - /* update log space map summary */ - uint64_t blocks_gone = log_blocks_before - log_blocks_after; - spa_log_summary_add_flushed_metaslab(spa); - spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); - spa_log_summary_decrement_blkcount(spa, blocks_gone); +/* + * Called when the metaslab has been flushed (its own spacemap now reflects + * all the contents of the pool-wide spacemap log). Updates the metaslab's + * metadata and any pool-wide related log space map data (e.g. summary, + * obsolete logs, etc..) to reflect that. + */ +static void +metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +{ + metaslab_group_t *mg = msp->ms_group; + spa_t *spa = mg->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + ASSERT3U(spa_sync_pass(spa), ==, 1); + + /* + * Just because a metaslab got flushed, that doesn't mean that + * it will pass through metaslab_sync_done(). Thus, make sure to + * update ms_synced_length here in case it doesn't. + */ + msp->ms_synced_length = space_map_length(msp->ms_sm); + + /* + * We may end up here from metaslab_condense() without the + * feature being active. In that case this is a no-op. 
+ */ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) || + metaslab_unflushed_txg(msp) == 0) + return; + + metaslab_unflushed_bump(msp, tx, B_FALSE); } boolean_t @@ -4005,23 +4031,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) ASSERT0(metaslab_allocated_space(msp)); } - if (metaslab_unflushed_txg(msp) == 0 && - spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { - ASSERT(spa_syncing_log_sm(spa) != NULL); - - metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); - spa_log_sm_increment_current_mscount(spa); - spa_log_summary_add_flushed_metaslab(spa); - - ASSERT(msp->ms_sm != NULL); - mutex_enter(&spa->spa_flushed_ms_lock); - avl_add(&spa->spa_metaslabs_by_flushed, msp); - mutex_exit(&spa->spa_flushed_ms_lock); - - ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); - ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); - } - if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); @@ -4069,6 +4078,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_t *log_sm = spa_syncing_log_sm(spa); if (log_sm != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + if (metaslab_unflushed_txg(msp) == 0) + metaslab_unflushed_add(msp, tx); + else if (!metaslab_unflushed_dirty(msp)) + metaslab_unflushed_bump(msp, tx, B_TRUE); space_map_write(log_sm, alloctree, SM_ALLOC, vd->vdev_id, tx); @@ -6131,6 +6144,12 @@ metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) mutex_exit(&mg->mg_ms_disabled_lock); } +void +metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty) +{ + ms->ms_unflushed_dirty = dirty; +} + static void metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) { @@ -6167,15 +6186,16 @@ metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) void metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) { - spa_t *spa = ms->ms_group->mg_vd->vdev_spa; - - if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) - return; - ms->ms_unflushed_txg = txg; metaslab_update_ondisk_flush_data(ms, tx); } +boolean_t +metaslab_unflushed_dirty(metaslab_t *ms) +{ + return (ms->ms_unflushed_dirty); +} + uint64_t metaslab_unflushed_txg(metaslab_t *ms) { diff --git a/module/zfs/spa.c b/module/zfs/spa.c index e69cb5527be8..01114dedef48 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -4355,7 +4355,7 @@ spa_ld_load_vdev_metadata(spa_t *spa) error = spa_ld_log_spacemaps(spa); if (error != 0) { - spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", + spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 110a4eab99f9..0e7a0bfe5773 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -259,6 +259,11 @@ static unsigned long zfs_unflushed_log_block_min = 1000; */ static unsigned long zfs_unflushed_log_block_max = (1ULL << 18); +/* + * We also have a hard limit on the size of the log in terms of dirty TXGs. + */ +static unsigned long zfs_unflushed_log_txg_max = 1000; + /* * Max # of rows allowed for the log_summary. 
The tradeoff here is accuracy and * stability of the flushing algorithm (longer summary) vs its runtime overhead @@ -333,9 +338,13 @@ spa_log_sm_set_blocklimit(spa_t *spa) return; } - uint64_t calculated_limit = - (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100; - spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit, + uint64_t msdcount = 0; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) + msdcount += e->lse_msdcount; + + uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100; + spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit, zfs_unflushed_log_block_min), zfs_unflushed_log_block_max); } @@ -380,8 +389,13 @@ spa_log_summary_verify_counts(spa_t *spa) } static boolean_t -summary_entry_is_full(spa_t *spa, log_summary_entry_t *e) +summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg) { + if (e->lse_end == txg) + return (0); + if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max, + zfs_max_logsm_summary_length)) + return (1); uint64_t blocks_per_row = MAX(1, DIV_ROUND_UP(spa_log_sm_blocklimit(spa), zfs_max_logsm_summary_length)); @@ -401,7 +415,7 @@ summary_entry_is_full(spa_t *spa, log_summary_entry_t *e) * the metaslab. */ void -spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) +spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty) { /* * We don't track summary data for read-only pools and this function @@ -429,6 +443,8 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) } target->lse_mscount--; + if (dirty) + target->lse_msdcount--; } /* @@ -490,8 +506,10 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) void spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone) { - for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); - e != NULL; e = list_head(&spa->spa_log_summary)) { + log_summary_entry_t *e = list_head(&spa->spa_log_summary); + if (e->lse_txgcount > 0) + e->lse_txgcount--; + for (; e != NULL; e = list_head(&spa->spa_log_summary)) { if (e->lse_blkcount > blocks_gone) { /* * Assert that we stopped at an entry that is not @@ -560,31 +578,52 @@ spa_log_sm_increment_current_mscount(spa_t *spa) static void summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed, - uint64_t nblocks) + uint64_t metaslabs_dirty, uint64_t nblocks) { log_summary_entry_t *e = list_tail(&spa->spa_log_summary); - if (e == NULL || summary_entry_is_full(spa, e)) { + if (e == NULL || summary_entry_is_full(spa, e, txg)) { e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP); - e->lse_start = txg; + e->lse_start = e->lse_end = txg; + e->lse_txgcount = 1; list_insert_tail(&spa->spa_log_summary, e); } ASSERT3U(e->lse_start, <=, txg); + if (e->lse_end < txg) { + e->lse_end = txg; + e->lse_txgcount++; + } e->lse_mscount += metaslabs_flushed; + e->lse_msdcount += metaslabs_dirty; e->lse_blkcount += nblocks; } static void spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks) { - summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks); + summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks); } void -spa_log_summary_add_flushed_metaslab(spa_t *spa) +spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty) { - summary_add_data(spa, spa_syncing_txg(spa), 1, 0); + summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 
1 : 0, 0); +} + +void +spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg) +{ + log_summary_entry_t *target = NULL; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_next(&spa->spa_log_summary, e)) { + if (e->lse_start > txg) + break; + target = e; + } + ASSERT3P(target, !=, NULL); + ASSERT3U(target->lse_mscount, !=, 0); + target->lse_msdcount++; } /* @@ -630,6 +669,11 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) int64_t available_blocks = spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming; + int64_t available_txgs = zfs_unflushed_log_txg_max; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) + available_txgs -= e->lse_txgcount; + /* * This variable tells us the total number of flushes needed to * keep the log size within the limit when we reach txgs_in_future. @@ -637,9 +681,7 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) uint64_t total_flushes = 0; /* Holds the current maximum of our estimates so far. */ - uint64_t max_flushes_pertxg = - MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed), - zfs_min_metaslabs_to_flush); + uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush; /* * For our estimations we only look as far in the future @@ -653,11 +695,14 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) * then keep skipping TXGs accumulating more blocks * based on the incoming rate until we exceed it. */ - if (available_blocks >= 0) { - uint64_t skip_txgs = (available_blocks / incoming) + 1; + if (available_blocks >= 0 && available_txgs >= 0) { + uint64_t skip_txgs = MIN(available_txgs + 1, + (available_blocks / incoming) + 1); available_blocks -= (skip_txgs * incoming); + available_txgs -= skip_txgs; txgs_in_future += skip_txgs; ASSERT3S(available_blocks, >=, -incoming); + ASSERT3S(available_txgs, >=, -1); } /* @@ -666,9 +711,10 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) * based on the current entry in the summary, updating * our available_blocks. 
*/ - ASSERT3S(available_blocks, <, 0); + ASSERT(available_blocks < 0 || available_txgs < 0); available_blocks += e->lse_blkcount; - total_flushes += e->lse_mscount; + available_txgs += e->lse_txgcount; + total_flushes += e->lse_msdcount; /* * Keep the running maximum of the total_flushes that @@ -680,8 +726,6 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) */ max_flushes_pertxg = MAX(max_flushes_pertxg, DIV_ROUND_UP(total_flushes, txgs_in_future)); - ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, - max_flushes_pertxg); } return (max_flushes_pertxg); } @@ -771,14 +815,11 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) uint64_t want_to_flush; if (spa_flush_all_logs_requested(spa)) { ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); - want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed); + want_to_flush = UINT64_MAX; } else { want_to_flush = spa_estimate_metaslabs_to_flush(spa); } - ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, - want_to_flush); - /* Used purely for verification purposes */ uint64_t visited = 0; @@ -809,31 +850,22 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa)) break; - mutex_enter(&curr->ms_sync_lock); - mutex_enter(&curr->ms_lock); - boolean_t flushed = metaslab_flush(curr, tx); - mutex_exit(&curr->ms_lock); - mutex_exit(&curr->ms_sync_lock); - - /* - * If we failed to flush a metaslab (because it was loading), - * then we are done with the block heuristic as it's not - * possible to destroy any log space maps once you've skipped - * a metaslab. In that case we just set our counter to 0 but - * we continue looping in case there is still memory pressure - * due to unflushed changes. Note that, flushing a metaslab - * that is not the oldest flushed in the pool, will never - * destroy any log space maps [see spa_cleanup_old_sm_logs()]. - */ - if (!flushed) { - want_to_flush = 0; - } else if (want_to_flush > 0) { - want_to_flush--; - } + if (metaslab_unflushed_dirty(curr)) { + mutex_enter(&curr->ms_sync_lock); + mutex_enter(&curr->ms_lock); + metaslab_flush(curr, tx); + mutex_exit(&curr->ms_lock); + mutex_exit(&curr->ms_sync_lock); + if (want_to_flush > 0) + want_to_flush--; + } else + metaslab_unflushed_bump(curr, tx, B_FALSE); visited++; } ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited); + + spa_log_sm_set_blocklimit(spa); } /* @@ -904,6 +936,7 @@ spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx) avl_remove(&spa->spa_sm_logs_by_txg, sls); space_map_free_obj(mos, sls->sls_sm_obj, tx); VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx)); + spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks); spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks; kmem_free(sls, sizeof (spa_log_sm_t)); } @@ -963,12 +996,7 @@ spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx) VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); - /* - * If the log space map feature was just enabled, the blocklimit - * has not yet been set. 
- */ - if (spa_log_sm_blocklimit(spa) == 0) - spa_log_sm_set_blocklimit(spa); + spa_log_sm_set_blocklimit(spa); } /* @@ -1094,12 +1122,18 @@ spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg) panic("invalid maptype_t"); break; } + if (!metaslab_unflushed_dirty(ms)) { + metaslab_set_unflushed_dirty(ms, B_TRUE); + spa_log_summary_dirty_flushed_metaslab(spa, + metaslab_unflushed_txg(ms)); + } return (0); } static int spa_ld_log_sm_data(spa_t *spa) { + spa_log_sm_t *sls, *psls; int error = 0; /* @@ -1113,41 +1147,71 @@ spa_ld_log_sm_data(spa_t *spa) ASSERT0(spa->spa_unflushed_stats.sus_memused); hrtime_t read_logs_starttime = gethrtime(); - /* this is a no-op when we don't have space map logs */ - for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); - sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { - space_map_t *sm = NULL; - error = space_map_open(&sm, spa_meta_objset(spa), - sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT); - if (error != 0) { - spa_load_failed(spa, "spa_ld_log_sm_data(): failed at " - "space_map_open(obj=%llu) [error %d]", - (u_longlong_t)sls->sls_sm_obj, error); - goto out; + + /* Prefetch log spacemap dnodes. */ + for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls; + sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj, + 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + } + + uint_t pn = 0; + uint64_t ps = 0; + psls = sls = avl_first(&spa->spa_sm_logs_by_txg); + while (sls != NULL) { + /* Prefetch log spacemaps up to 16 TXGs or ~2 * dmu_prefetch_max bytes ahead. */ + if (psls != NULL && pn < 16 && + (pn < 2 || ps < 2 * dmu_prefetch_max)) { + error = space_map_open(&psls->sls_sm, + spa_meta_objset(spa), psls->sls_sm_obj, 0, + UINT64_MAX, SPA_MINBLOCKSHIFT); + if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_data(): " + "failed at space_map_open(obj=%llu) " + "[error %d]", + (u_longlong_t)psls->sls_sm_obj, error); + goto out; + } + dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj, + 0, 0, space_map_length(psls->sls_sm), + ZIO_PRIORITY_ASYNC_READ); + pn++; + ps += space_map_length(psls->sls_sm); + psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls); + continue; } + /* Load TXG log spacemap into ms_unflushed_allocs/frees. */ + cond_resched(); + ASSERT0(sls->sls_nblocks); + sls->sls_nblocks = space_map_nblocks(sls->sls_sm); + spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; + summary_add_data(spa, sls->sls_txg, + sls->sls_mscount, 0, sls->sls_nblocks); + struct spa_ld_log_sm_arg vla = { .slls_spa = spa, .slls_txg = sls->sls_txg }; - error = space_map_iterate(sm, space_map_length(sm), - spa_ld_log_sm_cb, &vla); + error = space_map_iterate(sls->sls_sm, + space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla); if (error != 0) { - space_map_close(sm); spa_load_failed(spa, "spa_ld_log_sm_data(): failed " "at space_map_iterate(obj=%llu) [error %d]", (u_longlong_t)sls->sls_sm_obj, error); goto out; } - ASSERT0(sls->sls_nblocks); - sls->sls_nblocks = space_map_nblocks(sm); - spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; - summary_add_data(spa, sls->sls_txg, - sls->sls_mscount, sls->sls_nblocks); + pn--; + ps -= space_map_length(sls->sls_sm); + space_map_close(sls->sls_sm); + sls->sls_sm = NULL; + sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls); - space_map_close(sm); + /* Update log block limits considering the log just loaded. 
*/ + spa_log_sm_set_blocklimit(spa); } + hrtime_t read_logs_endtime = gethrtime(); spa_load_note(spa, "read %llu log space maps (%llu total blocks - blksz = %llu bytes) " @@ -1157,6 +1221,18 @@ out: + if (error != 0) { + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + if (sls->sls_sm) { + space_map_close(sls->sls_sm); + sls->sls_sm = NULL; + } + } + } else { + ASSERT0(pn); + ASSERT0(ps); + } /* * Now that the metaslabs contain their unflushed changes: * [1] recalculate their actual allocated space @@ -1237,6 +1313,9 @@ spa_ld_unflushed_txgs(vdev_t *vd) } ms->ms_unflushed_txg = entry.msp_unflushed_txg; + ms->ms_unflushed_dirty = B_FALSE; + ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(ms->ms_unflushed_frees)); if (ms->ms_unflushed_txg != 0) { mutex_enter(&spa->spa_flushed_ms_lock); avl_add(&spa->spa_metaslabs_by_flushed, ms); @@ -1300,6 +1379,10 @@ ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW, "Lower-bound limit for the maximum amount of blocks allowed in " "log spacemap (see zfs_unflushed_log_block_max)"); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, ULONG, ZMOD_RW, + "Hard limit (upper bound) on the size of the space map log " + "in terms of dirty TXGs."); + ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW, "Tunable used to determine the number of blocks that can be used for " "the spacemap log, expressed as a percentage of the total number of " diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index db2d2c5e44fb..ce7f020a0d86 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1523,13 +1523,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); - /* - * Regardless whether this vdev was just added or it is being - * expanded, the metaslab count has changed. Recalculate the - * block limit. - */ - spa_log_sm_set_blocklimit(spa); - return (0); } diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index f988ca22fa4a..5508d273758d 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1386,7 +1386,6 @@ vdev_remove_complete(spa_t *spa) vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; - spa_log_sm_set_blocklimit(spa); } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); @@ -2131,7 +2130,6 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) * metaslab_class_histogram_verify() */ vdev_metaslab_fini(vd); - spa_log_sm_set_blocklimit(spa); spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); *txg = spa_vdev_config_enter(spa);
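
To make the new summary accounting concrete: summary_add_data() now stamps each summary row with the TXG range it covers (lse_start, lse_end, lse_txgcount), and summary_entry_is_full() closes a row once it holds its share of the zfs_unflushed_log_txg_max budget. Below is a minimal standalone sketch of that logic, not patch code; it assumes the pre-existing default zfs_max_logsm_summary_length of 10, under which a row spans at most DIV_ROUND_UP(1000, 10) = 100 TXGs.

/*
 * Standalone sketch of the per-row TXG accounting added to the log
 * summary.  row_t and the harness are illustrative; the predicate
 * mirrors the new checks in summary_entry_is_full().
 */
#include <stdint.h>
#include <stdio.h>

#define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static uint64_t txg_max = 1000;		/* zfs_unflushed_log_txg_max */
static uint64_t summary_length = 10;	/* zfs_max_logsm_summary_length */

typedef struct row {
	uint64_t start, end, txgcount;
} row_t;

static int
row_is_full(const row_t *r, uint64_t txg)
{
	if (r->end == txg)
		return (0);	/* the syncing TXG always fits its row */
	return (r->txgcount >= DIV_ROUND_UP(txg_max, summary_length));
}

int
main(void)
{
	row_t r = { .start = 100, .end = 100, .txgcount = 1 };

	for (uint64_t txg = r.end + 1; !row_is_full(&r, txg); txg++) {
		r.end = txg;	/* account one more TXG into this row */
		r.txgcount++;
	}
	/* With the defaults this prints: row covers TXGs 100-199 (100). */
	printf("row covers TXGs %llu-%llu (%llu)\n",
	    (unsigned long long)r.start, (unsigned long long)r.end,
	    (unsigned long long)r.txgcount);
	return (0);
}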
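
The block limit itself is now derived from the summary instead of spa_total_metaslabs(): spa_log_sm_set_blocklimit() sums lse_msdcount over all rows and scales the result by zfs_unflushed_log_block_pct, which is also why the recalculation call sites in vdev.c and vdev_removal.c could be dropped. A sketch of the computation with the module defaults; blocklimit() is a hypothetical stand-in:

/*
 * Sketch of the reworked block-limit computation in
 * spa_log_sm_set_blocklimit().  Constants are the module defaults;
 * msdcount is the sum of lse_msdcount over the summary.
 */
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

static uint64_t block_min = 1000;	/* zfs_unflushed_log_block_min */
static uint64_t block_max = 1ULL << 18;	/* zfs_unflushed_log_block_max */
static uint64_t block_pct = 400;	/* zfs_unflushed_log_block_pct */

static uint64_t
blocklimit(uint64_t msdcount)
{
	uint64_t limit = msdcount * block_pct / 100;

	/*
	 * 100 dirty metaslabs -> 400 blocks, clamped up to the
	 * 1000-block floor; 200000 dirty metaslabs -> 800000 blocks,
	 * clamped down to the 262144-block ceiling.
	 */
	return (MIN(MAX(limit, block_min), block_max));
}

int
main(void)
{
	return (blocklimit(100) == 1000 ? 0 : 1);
}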
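
spa_estimate_metaslabs_to_flush() now walks the summary against two budgets, blocks and TXGs, and counts only dirty metaslabs (lse_msdcount) as required work, since clean ones can be bumped for free. A standalone sketch with invented numbers follows; in this example the TXG budget, not the block budget, is what forces the flushing pace:

/*
 * Standalone sketch of the dual-budget walk in
 * spa_estimate_metaslabs_to_flush().  All numbers are invented: two
 * summary rows, the first of which pins 990 of the 1000-TXG budget.
 */
#include <stdint.h>
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

typedef struct row {
	int64_t txgcount, blkcount, msdcount;
} row_t;

int
main(void)
{
	row_t summary[] = {		/* oldest row first */
		{ 990, 600, 40 },
		{ 9, 500, 30 },
	};
	int64_t incoming = 10;		/* log blocks added per TXG */
	int64_t available_blocks = 100000 - 1100 - incoming;
	int64_t available_txgs = 1000;	/* zfs_unflushed_log_txg_max */
	uint64_t txgs_in_future = 1, total_flushes = 0;
	uint64_t max_flushes = 1;	/* zfs_min_metaslabs_to_flush */

	for (unsigned i = 0; i < 2; i++)
		available_txgs -= summary[i].txgcount;

	for (unsigned i = 0; i < 2; i++) {
		/* While both budgets hold, skip TXGs at the incoming rate. */
		if (available_blocks >= 0 && available_txgs >= 0) {
			int64_t skip = MIN(available_txgs + 1,
			    available_blocks / incoming + 1);
			available_blocks -= skip * incoming;
			available_txgs -= skip;
			txgs_in_future += skip;
		}
		/* Retiring this row returns its blocks and TXGs ... */
		available_blocks += summary[i].blkcount;
		available_txgs += summary[i].txgcount;
		/* ... at the price of flushing its dirty metaslabs. */
		total_flushes += summary[i].msdcount;
		max_flushes = MAX(max_flushes,
		    DIV_ROUND_UP(total_flushes, txgs_in_future));
	}
	/* Prints 14: row 0's 40 dirty metaslabs over the ~3 TXGs left. */
	printf("flush at least %llu metaslabs per TXG\n",
	    (unsigned long long)max_flushes);
	return (0);
}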
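
On the flushing side, spa_flush_metaslabs() now checks ms_unflushed_dirty before doing any work: metaslabs with real unflushed changes are flushed and counted against the want_to_flush budget, while clean ones are merely re-registered at the syncing TXG with metaslab_unflushed_bump(), so the oldest logs still become destroyable. A sketch of that decision; ms_t and the helpers are stand-ins for the ZFS primitives:

/*
 * Sketch of the per-metaslab decision now made by spa_flush_metaslabs().
 * ms_t, flush() and bump() model metaslab_t, metaslab_flush() and
 * metaslab_unflushed_bump() respectively; locking is omitted.
 */
#include <stdint.h>

typedef struct ms {
	uint64_t unflushed_txg;	/* models ms_unflushed_txg */
	int dirty;		/* models ms_unflushed_dirty */
} ms_t;

static void
flush(ms_t *m, uint64_t syncing_txg)
{
	/* Unflushed changes get written to the metaslab's own spacemap. */
	m->unflushed_txg = syncing_txg;
	m->dirty = 0;
}

static void
bump(ms_t *m, uint64_t syncing_txg)
{
	/* Nothing to write; just move the metaslab forward in the tree. */
	m->unflushed_txg = syncing_txg;
}

static void
visit_oldest(ms_t *m, uint64_t syncing_txg, uint64_t *want_to_flush)
{
	if (m->dirty) {
		flush(m, syncing_txg);
		if (*want_to_flush > 0)
			(*want_to_flush)--;	/* budget only real work */
	} else {
		bump(m, syncing_txg);		/* free: no I/O issued */
	}
}

int
main(void)
{
	ms_t a = { .unflushed_txg = 100, .dirty = 1 };
	ms_t b = { .unflushed_txg = 101, .dirty = 0 };
	uint64_t want_to_flush = 1;

	visit_oldest(&a, 205, &want_to_flush);	/* flushed, budget spent */
	visit_oldest(&b, 205, &want_to_flush);	/* clean, bumped for free */
	return ((int)want_to_flush);
}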
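
Finally, the import-time rework in spa_ld_log_sm_data() replaces the serial open/iterate loop with a sliding window: up to 16 log spacemaps are opened and prefetched ahead of the one being iterated, and past the first two the window stops growing at roughly 2 * dmu_prefetch_max outstanding bytes. A sketch of the window bookkeeping; logsm_t and both helpers are stand-ins, and the dmu_prefetch_max value assumes SPA_MAXBLOCKSIZE of 16 MiB:

/*
 * Sketch of the sliding prefetch window in spa_ld_log_sm_data().
 * prefetch() models space_map_open() + dmu_prefetch(); load() models
 * space_map_iterate() + space_map_close().
 */
#include <stdint.h>
#include <stddef.h>

/* dmu.c above exports this; 8 * SPA_MAXBLOCKSIZE = 128 MiB assumed. */
static uint64_t dmu_prefetch_max = 8ULL * 16 * 1024 * 1024;

typedef struct logsm {
	struct logsm *next;	/* next TXG's log, oldest first */
	uint64_t length;	/* on-disk space map length in bytes */
} logsm_t;

static uint64_t
prefetch(logsm_t *s)
{
	return (s->length);	/* real code issues an async read here */
}

static void
load(logsm_t *s)
{
	(void) s;		/* real code iterates the (cached) map */
}

static void
load_log_spacemaps(logsm_t *head)
{
	logsm_t *sls = head, *psls = head;
	unsigned pn = 0;	/* maps currently in the window */
	uint64_t ps = 0;	/* bytes currently in the window */

	while (sls != NULL) {
		/* Grow the window while it is within both caps. */
		if (psls != NULL && pn < 16 &&
		    (pn < 2 || ps < 2 * dmu_prefetch_max)) {
			ps += prefetch(psls);
			pn++;
			psls = psls->next;
			continue;
		}
		/* Consume the head of the window; it is likely cached. */
		load(sls);
		pn--;
		ps -= sls->length;
		sls = sls->next;
	}
}

int
main(void)
{
	logsm_t logs[3] = {
		{ &logs[1], 1 << 20 },
		{ &logs[2], 1 << 20 },
		{ NULL, 1 << 20 },
	};
	load_log_spacemaps(&logs[0]);
	return (0);
}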