From d77525f7c09232677a933d561b694c656c900454 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Wed, 19 Aug 2020 11:01:57 +0300 Subject: [PATCH 1/2] Add scratch object creation at the beginning of the reflow process --- include/sys/fs/zfs.h | 1 + include/sys/vdev_raidz.h | 5 ++ module/zfs/vdev.c | 10 +++ module/zfs/vdev_raidz.c | 146 ++++++++++++++++++++++++++++++++++----- 4 files changed, 145 insertions(+), 17 deletions(-) diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 9c570aca176d..94ccb93f0b32 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -688,6 +688,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH "raidz_logical_width" #define ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET "raidz_expand_offset" +#define ZPOOL_CONFIG_RAIDZ_EXPAND_SCROBJ "raidz_expand_scratch_object" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 146dd3c29660..54dfd64b1ffd 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -84,6 +84,11 @@ typedef struct vdev_raidz_expand { uint64_t vre_offset_pertxg[TXG_SIZE]; + /* + * Scratch object built on top of this number of leaf devices. + */ + uint64_t vre_scratch_devices; + dsl_scan_state_t vre_state; time_t vre_start_time; time_t vre_end_time; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 8edd786331ff..0b1e78090ffa 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1346,6 +1346,16 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) } } + /* + * Disable first metaslab if raidz expansion scratch object exist. 
+ */ + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_raidz_t *vdrz = (vdev_raidz_t *)vd->vdev_tsd; + if (vdrz->vd_physical_width - 1 == + vdrz->vn_vre.vre_scratch_devices) + metaslab_disable(vd->vdev_ms[0]); + } + if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index e1b1950f04ec..7e3e98417f9d 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2936,6 +2936,74 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); } + +static void +raidz_scratch_read_done(zio_t *zio) +{ + zio_nowait(zio_unique_parent(zio)); +} + +static void +raidz_scratch_write_done(zio_t *zio) +{ + vdev_raidz_expand_t *vre = zio->io_private; + + if (zio->io_error == 0) + vre->vre_scratch_devices++; + + abd_free(zio->io_abd); + + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +static void +raidz_scratch_object(spa_t *spa, vdev_raidz_expand_t *vre) +{ + (void)vre; + + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + for (int i = 0; i < raidvd->vdev_children - 1; i++) { + dmu_tx_t *tx = + dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + zio_t *pio = spa->spa_txg_zio[txgoff]; + abd_t *abd = abd_alloc_for_io(VDEV_BOOT_SIZE, B_FALSE); + zio_t *write_zio = zio_vdev_child_io(pio, NULL, + raidvd->vdev_child[i], + -VDEV_BOOT_SIZE, + abd, VDEV_BOOT_SIZE, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_scratch_write_done, vre); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + raidvd->vdev_child[i], + 0, + abd, VDEV_BOOT_SIZE, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + 
raidz_scratch_read_done, NULL)); + + dmu_tx_commit(tx); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + txg_wait_synced(spa->spa_dsl_pool, txg); + } + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + vdev_config_dirty(vd); +} + static void raidz_reflow_sync(void *arg, dmu_tx_t *tx) { @@ -2969,6 +3037,14 @@ raidz_reflow_sync(void *arg, dmu_tx_t *tx) * real offset from the MOS? Or rely on ditto blocks? */ vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + + /* + * Invalidate scratch object on first vre_offset_phys update. + * Enable first metaslab. + */ + vre->vre_scratch_devices = 0; + metaslab_enable(vd->vdev_ms[0], B_FALSE, B_FALSE); + vdev_config_dirty(vd); } @@ -3055,7 +3131,7 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, { spa_t *spa = vd->vdev_spa; int ashift = vd->vdev_top->vdev_ashift; - uint64_t offset, size; + uint64_t offset, roffset, size; if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, &offset, &size)) { @@ -3141,9 +3217,12 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra); + roffset = (blkid / old_children) << ashift; + if (vre->vre_scratch_devices != 0) + roffset -= VDEV_BOOT_SIZE; zio_nowait(zio_vdev_child_io(write_zio, NULL, vd->vdev_child[blkid % old_children], - (blkid / old_children) << ashift, + roffset, abd, length, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, @@ -3152,25 +3231,11 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, return (B_FALSE); } -/* ARGSUSED */ -static boolean_t -spa_raidz_expand_cb_check(void *arg, zthr_t *zthr) -{ - spa_t *spa = arg; - - return (spa->spa_raidz_expand != NULL); -} - -/* ARGSUSED */ static void -spa_raidz_expand_cb(void *arg, zthr_t *zthr) +raidz_reflow(spa_t *spa, vdev_raidz_expand_t *vre) { - spa_t *spa = arg; - vdev_raidz_expand_t *vre = spa->spa_raidz_expand; - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_t *raidvd = 
vdev_lookup_top(spa, vre->vre_vdev_id); - uint64_t guid = raidvd->vdev_guid; for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; @@ -3311,10 +3376,44 @@ spa_raidz_expand_cb(void *arg, zthr_t *zthr) } else { txg_wait_synced(spa->spa_dsl_pool, 0); } +} + +/* ARGSUSED */ +static void +spa_raidz_expand_cb(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + metaslab_t *msp = raidvd->vdev_ms[0]; + ASSERT(raidvd->vdev_children*VDEV_BOOT_SIZE < msp->ms_size); + metaslab_disable(msp); + + if (vre->vre_scratch_devices != raidvd->vdev_children - 1) + raidz_scratch_object(spa, vre); + + /* + * XXX Handle inconsistent or unavailable scratch object. + */ + ASSERT(vre->vre_scratch_devices == + raidvd->vdev_children - 1); + + raidz_reflow(spa, vre); spa->spa_raidz_expand = NULL; } +/* ARGSUSED */ +static boolean_t +spa_raidz_expand_cb_check(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + + return (spa->spa_raidz_expand != NULL); +} + void spa_start_raidz_expansion_thread(spa_t *spa) { @@ -3402,6 +3501,9 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET, vdrz->vn_vre.vre_offset_phys); } + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_SCROBJ, + vdrz->vn_vre.vre_scratch_devices); } /* @@ -3450,6 +3552,16 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv) */ } + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_SCROBJ, + &vdrz->vn_vre.vre_scratch_devices) == 0) { + /* + * XXX If we got inconsistent scratch object, assert for now. 
+ */ + ASSERT(vdrz->vn_vre.vre_scratch_devices == 0 || + vdrz->vn_vre.vre_scratch_devices == + vdrz->vd_physical_width - 1); + } + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) From b72fc80360941de913e5bf24cd69a3319c2681fa Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 14 Sep 2020 09:58:07 +0300 Subject: [PATCH 2/2] Scratch object improvements: - make scratch_devices variable increment atomic - improve metaslab disabling logic - invalidate scratch object when its offset is exceeded instead of first reflow sync --- include/sys/vdev_raidz.h | 2 + module/zfs/vdev.c | 4 +- module/zfs/vdev_raidz.c | 89 +++++++++++++++++++++------------------- 3 files changed, 51 insertions(+), 44 deletions(-) diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 54dfd64b1ffd..38767a02c2eb 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -88,6 +88,7 @@ typedef struct vdev_raidz_expand { * Scratch object built on top of this number of leaf devices. */ uint64_t vre_scratch_devices; + int64_t vre_scratch_metaslabs_cnt; dsl_scan_state_t vre_state; time_t vre_start_time; @@ -108,6 +109,7 @@ typedef struct vdev_raidz { extern void vdev_raidz_attach_sync(void *, dmu_tx_t *); extern void vdev_raidz_config_generate(vdev_t *, nvlist_t *); +extern void raidz_disable_scratch_metaslabs(spa_t *); extern void *vdev_raidz_get_tsd(spa_t *, nvlist_t *); extern void spa_start_raidz_expansion_thread(spa_t *); extern int spa_raidz_expand_get_stats(spa_t *, pool_raidz_expand_stat_t *); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 0b1e78090ffa..6743c70c9a11 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1347,13 +1347,13 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) } /* - * Disable first metaslab if raidz expansion scratch object exist. + * Disable metaslabs included in raidz scratch object. 
*/ if (vd->vdev_ops == &vdev_raidz_ops) { vdev_raidz_t *vdrz = (vdev_raidz_t *)vd->vdev_tsd; if (vdrz->vd_physical_width - 1 == vdrz->vn_vre.vre_scratch_devices) - metaslab_disable(vd->vdev_ms[0]); + raidz_disable_scratch_metaslabs(spa); } if (txg == 0) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 7e3e98417f9d..989d09fb7d83 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2949,7 +2949,7 @@ raidz_scratch_write_done(zio_t *zio) vdev_raidz_expand_t *vre = zio->io_private; if (zio->io_error == 0) - vre->vre_scratch_devices++; + atomic_inc_64(&vre->vre_scratch_devices); abd_free(zio->io_abd); @@ -2957,26 +2957,23 @@ raidz_scratch_write_done(zio_t *zio) } static void -raidz_scratch_object(spa_t *spa, vdev_raidz_expand_t *vre) +raidz_build_scratch_object(spa_t *spa) { - (void)vre; - + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + spa_config_exit(spa, SCL_CONFIG, FTAG); + for (int i = 0; i < raidvd->vdev_children - 1; i++) { + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); - - int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; - spa_config_enter(spa, SCL_STATE, spa, RW_READER); - zio_t *pio = spa->spa_txg_zio[txgoff]; + zio_t *pio = spa->spa_txg_zio[txg & TXG_MASK]; abd_t *abd = abd_alloc_for_io(VDEV_BOOT_SIZE, B_FALSE); zio_t *write_zio = zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], @@ -2996,12 +2993,33 @@ raidz_scratch_object(spa_t *spa, vdev_raidz_expand_t *vre) dmu_tx_commit(tx); - spa_config_exit(spa, SCL_CONFIG, FTAG); txg_wait_synced(spa->spa_dsl_pool, txg); } + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); +} + +static void +raidz_enable_scratch_metaslabs(spa_t *spa) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; vdev_t *vd = vdev_lookup_top(spa, 
vre->vre_vdev_id); - vdev_config_dirty(vd); + + while (vre->vre_scratch_metaslabs_cnt >= 0) + metaslab_enable(vd->vdev_ms[vre->vre_scratch_metaslabs_cnt--], + B_FALSE, B_FALSE); +} + +void +raidz_disable_scratch_metaslabs(spa_t *spa) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + + do { + metaslab_disable(vd->vdev_ms[vre->vre_scratch_metaslabs_cnt++]); + } while ((vd->vdev_children - 1) * VDEV_BOOT_SIZE > + vre->vre_scratch_metaslabs_cnt * vd->vdev_ms[0]->ms_size); } static void @@ -3039,11 +3057,13 @@ raidz_reflow_sync(void *arg, dmu_tx_t *tx) vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); /* - * Invalidate scratch object on first vre_offset_phys update. - * Enable first metaslab. + * Invalidate scratch object, enable all disabled metaslabs. */ - vre->vre_scratch_devices = 0; - metaslab_enable(vd->vdev_ms[0], B_FALSE, B_FALSE); + if (vre->vre_scratch_devices && + vre->vre_offset_phys / vd->vdev_children > VDEV_BOOT_SIZE) { + vre->vre_scratch_devices = 0; + raidz_enable_scratch_metaslabs(spa); + } vdev_config_dirty(vd); } @@ -3231,13 +3251,23 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, return (B_FALSE); } +/* ARGSUSED */ static void -raidz_reflow(spa_t *spa, vdev_raidz_expand_t *vre) +spa_raidz_expand_cb(void *arg, zthr_t *zthr) { + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); uint64_t guid = raidvd->vdev_guid; + /* Build scratch object */ + if (vre->vre_offset == 0) { + raidz_disable_scratch_metaslabs(spa); + raidz_build_scratch_object(spa); + } + for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; i < raidvd->vdev_ms_count && !zthr_iscancelled(spa->spa_raidz_expand_zthr); i++) { @@ -3376,31 +3406,6 @@ raidz_reflow(spa_t *spa, vdev_raidz_expand_t *vre) } else { txg_wait_synced(spa->spa_dsl_pool, 0); } -} - -/* ARGSUSED 
*/ -static void -spa_raidz_expand_cb(void *arg, zthr_t *zthr) -{ - spa_t *spa = arg; - - vdev_raidz_expand_t *vre = spa->spa_raidz_expand; - vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); - - metaslab_t *msp = raidvd->vdev_ms[0]; - ASSERT(raidvd->vdev_children*VDEV_BOOT_SIZE < msp->ms_size); - metaslab_disable(msp); - - if (vre->vre_scratch_devices != raidvd->vdev_children - 1) - raidz_scratch_object(spa, vre); - - /* - * XXX Handle inconsistent or unavailable scratch object. - */ - ASSERT(vre->vre_scratch_devices == - raidvd->vdev_children - 1); - - raidz_reflow(spa, vre); spa->spa_raidz_expand = NULL; }