Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add scratch object creation at the beginning of the reflow process #10

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_NPARITY "nparity"
#define ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH "raidz_logical_width"
#define ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET "raidz_expand_offset"
#define ZPOOL_CONFIG_RAIDZ_EXPAND_SCROBJ "raidz_expand_scratch_object"
#define ZPOOL_CONFIG_HOSTID "hostid"
#define ZPOOL_CONFIG_HOSTNAME "hostname"
#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
Expand Down
7 changes: 7 additions & 0 deletions include/sys/vdev_raidz.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,12 @@ typedef struct vdev_raidz_expand {

uint64_t vre_offset_pertxg[TXG_SIZE];

/*
* Scratch object built on top of this number of leaf devices.
*/
uint64_t vre_scratch_devices;
int64_t vre_scratch_metaslabs_cnt;

dsl_scan_state_t vre_state;
time_t vre_start_time;
time_t vre_end_time;
Expand All @@ -103,6 +109,7 @@ typedef struct vdev_raidz {

extern void vdev_raidz_attach_sync(void *, dmu_tx_t *);
extern void vdev_raidz_config_generate(vdev_t *, nvlist_t *);
extern void raidz_disable_scratch_metaslabs(spa_t *);
extern void *vdev_raidz_get_tsd(spa_t *, nvlist_t *);
extern void spa_start_raidz_expansion_thread(spa_t *);
extern int spa_raidz_expand_get_stats(spa_t *, pool_raidz_expand_stat_t *);
Expand Down
10 changes: 10 additions & 0 deletions module/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -1346,6 +1346,16 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
}
}

/*
 * Disable the metaslabs included in the raidz scratch object.
 */
if (vd->vdev_ops == &vdev_raidz_ops) {
vdev_raidz_t *vdrz = (vdev_raidz_t *)vd->vdev_tsd;
if (vdrz->vd_physical_width - 1 ==
vdrz->vn_vre.vre_scratch_devices)
raidz_disable_scratch_metaslabs(spa);
}

if (txg == 0)
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);

Expand Down
141 changes: 129 additions & 12 deletions module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -2936,6 +2936,92 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res)
ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
}


/*
 * Completion callback for the scratch-object read child I/O: once the old
 * data has been read into the shared abd, kick off the parent (the write
 * zio that copies it into the scratch region).
 */
static void
raidz_scratch_read_done(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);

	zio_nowait(pio);
}

/*
 * Completion callback for a scratch-object write child I/O.  Counts the
 * leaf device as having a valid scratch copy (only on success), releases
 * the data buffer, and drops the SCL_STATE hold taken before the I/O was
 * issued.  NOTE(review): write failures are silently ignored here
 * (ZIO_FLAG_CANFAIL) -- the device simply is not counted; confirm that is
 * the intended error policy.
 */
static void
raidz_scratch_write_done(zio_t *zio)
{
	vdev_raidz_expand_t *vre = zio->io_private;

	/* Only a successful write contributes to the scratch-device count. */
	if (zio->io_error == 0)
		atomic_inc_64(&vre->vre_scratch_devices);

	abd_free(zio->io_abd);

	/* Tag must match the spa pointer used at spa_config_enter(). */
	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
}

static void
raidz_build_scratch_object(spa_t *spa)
{
vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);

spa_config_exit(spa, SCL_CONFIG, FTAG);

for (int i = 0; i < raidvd->vdev_children - 1; i++) {

dmu_tx_t *tx =
dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
uint64_t txg = dmu_tx_get_txg(tx);

spa_config_enter(spa, SCL_STATE, spa, RW_READER);

zio_t *pio = spa->spa_txg_zio[txg & TXG_MASK];
abd_t *abd = abd_alloc_for_io(VDEV_BOOT_SIZE, B_FALSE);
zio_t *write_zio = zio_vdev_child_io(pio, NULL,
raidvd->vdev_child[i],
-VDEV_BOOT_SIZE,
abd, VDEV_BOOT_SIZE,
ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
ZIO_FLAG_CANFAIL,
raidz_scratch_write_done, vre);

zio_nowait(zio_vdev_child_io(write_zio, NULL,
raidvd->vdev_child[i],
0,
abd, VDEV_BOOT_SIZE,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure that the boot region can't be used with RAIDZ?

Can we add some checks that the boot region is big enough that we won't have overlapping writes? I think it just has to be at least nchildren <<ashift.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I considered boot sector as scratch object, the next calculations were used:
VDEV_BOOT_SIZE eq 3.5M (7ULL << 19) per child. Max number of children is 255.
255*(7ULL << 19) = 935854080 bytes => max scratch object size

nchildren <<ashift in case of 4k ashift:
255 << 12 = 1044480 bytes

So, if I am correct, it should be enough — of course excluding the bootloader if it is present, but I prefer not to account for it for now.
Later it will be necessary to add logic which will detect the bootloader and check its size.
The simplest way would be to zero the boot sector during the pool creation phase and check the non-zeroed data size to account for it.

Also, you mentioned "Copy first new_ncols^2 sectors to scratch object".
Could you please explain why new_ncols^2 and from other side "nchildren <<ashift". I cannot get it clear.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I think the design doc talks about "Copy first new_ncols^2 sectors to scratch object". This is the same as each device having "nchildren << ashift" bytes copied to it. Since each device has nchildren sectors, and there are nchildren devices, we have nchildren * nchildren = nchildren^2 sectors copied total (across all devices).

The reason we need to copy this much data is so that after the scratch data, there is at least one whole stripe (aka one whole row, which is at most old_nchildren << ashift bytes) of not-yet-overwritten old data before the next old block to copy. We need this because we read this old data (from just before the "next old block to copy" when there's a stripe that crosses the "next block to copy" boundary, vre_offset_phys. This is implemented by the code in vdev_raidz_map_alloc_expanded() following this comment:

		 * If we are in the middle of a reflow, and any part of this
		 * row has not been copied, then use the old location of
		 * this row.

ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
ZIO_FLAG_CANFAIL,
raidz_scratch_read_done, NULL));

dmu_tx_commit(tx);

txg_wait_synced(spa->spa_dsl_pool, txg);
}

spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
}

/*
 * Re-enable the metaslabs that raidz_disable_scratch_metaslabs() disabled
 * once the reflow has moved past the scratch region and the scratch copy
 * is no longer needed.
 *
 * raidz_disable_scratch_metaslabs() post-increments the counter, so on
 * entry vre_scratch_metaslabs_cnt equals the NUMBER of disabled metaslabs
 * (one past the last disabled index).  Pre-decrement before indexing so we
 * enable exactly the metaslabs [0, cnt), leaving the counter at 0 --
 * enabling ms[cnt] (never disabled) or running the counter to -1 would
 * unbalance the metaslab_disable()/metaslab_enable() pairing.
 */
static void
raidz_enable_scratch_metaslabs(spa_t *spa)
{
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);

	while (vre->vre_scratch_metaslabs_cnt > 0) {
		metaslab_enable(
		    vd->vdev_ms[--vre->vre_scratch_metaslabs_cnt],
		    B_FALSE, B_FALSE);
	}
}

/*
 * Disable allocations from the metaslabs that overlap the raidz scratch
 * region, so the scratch copy at the start of each leaf is not overwritten
 * by new allocations.  At least one metaslab is always disabled; we keep
 * going until the disabled metaslabs cover the whole scratch area
 * ((children - 1) * VDEV_BOOT_SIZE bytes).  vre_scratch_metaslabs_cnt is
 * left equal to the number of metaslabs disabled.
 */
void
raidz_disable_scratch_metaslabs(spa_t *spa)
{
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
	uint64_t scratch_size = (vd->vdev_children - 1) * VDEV_BOOT_SIZE;

	do {
		metaslab_disable(vd->vdev_ms[vre->vre_scratch_metaslabs_cnt]);
		vre->vre_scratch_metaslabs_cnt++;
	} while (scratch_size >
	    vre->vre_scratch_metaslabs_cnt * vd->vdev_ms[0]->ms_size);
}

static void
raidz_reflow_sync(void *arg, dmu_tx_t *tx)
{
Expand Down Expand Up @@ -2969,6 +3055,16 @@ raidz_reflow_sync(void *arg, dmu_tx_t *tx)
* real offset from the MOS? Or rely on ditto blocks?
*/
vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);

/*
* Invalidate scratch object, enable all disabled metaslabs.
*/
if (vre->vre_scratch_devices &&
vre->vre_offset_phys / vd->vdev_children > VDEV_BOOT_SIZE) {
vre->vre_scratch_devices = 0;
raidz_enable_scratch_metaslabs(spa);
}

vdev_config_dirty(vd);
}

Expand Down Expand Up @@ -3055,7 +3151,7 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
{
spa_t *spa = vd->vdev_spa;
int ashift = vd->vdev_top->vdev_ashift;
uint64_t offset, size;
uint64_t offset, roffset, size;

if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize,
&offset, &size)) {
Expand Down Expand Up @@ -3141,9 +3237,12 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
ZIO_FLAG_CANFAIL,
raidz_reflow_write_done, rra);

roffset = (blkid / old_children) << ashift;
if (vre->vre_scratch_devices != 0)
roffset -= VDEV_BOOT_SIZE;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in this case the roffset has to be < VDEV_BOOT_SIZE, and this goes negative, right?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because of next expression in the zio_vdev_child_io():
if (vd->vdev_ops->vdev_op_leaf) {
ASSERT0(vd->vdev_children);
offset += VDEV_LABEL_START_SIZE;
}

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem that the scratch space is addressing is:

  • we just started an expansion
  • a single stripe has overwritten itself, such that it now has 2 sectors on the same disk
  • that disk dies
  • we now need to reconstruct this stripe, but we don't have enough of its sectors

To address this we need to ensure that when moving blocks, a stripe never overwrites itself, so that the stripe is always available intact at its old location.

So the place where we need to use the scratch copy is when doing a normal (or scrub/resilver) read. We don't need to use the scratch copy when doing the reflow read (although it shouldn't hurt).

I don't think we can test this case currently, because we don't handle disk failure during reflow. That's another thing on the to-do list. We need to pause the reflow and wait for the disk to be replaced and resilvered.

However, I think we can test this by reading blocks that are in the beginning of the disk, while we are reflowing the beginning of the disk. Today this will simply read from the old location, see this code in vdev_raidz_map_alloc_expanded():

		/*
		 * If we are in the middle of a reflow, and any part of this
		 * row has not been copied, then use the old location of
		 * this row.
		 */
		int row_phys_cols = physical_cols;
		if (b + (logical_cols - nparity) > reflow_offset >> ashift)
			row_phys_cols--;

I think that if the expansion has overwritten a stripe with itself, the read will not get the right data (because the old location has been partially overwritten). We could test this by pausing the reflow in the very beginning, and then reading the block containing the partially-overwritten stripe. I think that @stuartmaybee is working on some code for pausing the reflow for testing purposes.

Note that in this test case, we actually still have all the data and we could write code to read this "split stripe" partially from the old location and partially from the new location. But that code doesn't exist, because it won't be needed once we solve the problem of handling resilvering while in the middle of reflow (by using the scratch copy).

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, the reflow-pausing code is already there, zfs_raidz_expand_max_offset_pause

zio_nowait(zio_vdev_child_io(write_zio, NULL,
vd->vdev_child[blkid % old_children],
(blkid / old_children) << ashift,
roffset,
abd, length,
ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
ZIO_FLAG_CANFAIL,
Expand All @@ -3152,15 +3251,6 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
return (B_FALSE);
}

/* ARGSUSED */
/*
 * zthr predicate: the expansion thread should run only while a raidz
 * expansion is in progress (spa_raidz_expand is non-NULL).
 */
static boolean_t
spa_raidz_expand_cb_check(void *arg, zthr_t *zthr)
{
	return (((spa_t *)arg)->spa_raidz_expand != NULL);
}

/* ARGSUSED */
static void
spa_raidz_expand_cb(void *arg, zthr_t *zthr)
Expand All @@ -3170,9 +3260,14 @@ spa_raidz_expand_cb(void *arg, zthr_t *zthr)

spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);

uint64_t guid = raidvd->vdev_guid;

/* Build scratch object */
if (vre->vre_offset == 0) {
raidz_disable_scratch_metaslabs(spa);
raidz_build_scratch_object(spa);
}

for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
i < raidvd->vdev_ms_count &&
!zthr_iscancelled(spa->spa_raidz_expand_zthr); i++) {
Expand Down Expand Up @@ -3315,6 +3410,15 @@ spa_raidz_expand_cb(void *arg, zthr_t *zthr)
spa->spa_raidz_expand = NULL;
}

/* ARGSUSED */
/*
 * zthr predicate for the expansion thread: keep running while a raidz
 * expansion is in progress (spa_raidz_expand is non-NULL).
 */
static boolean_t
spa_raidz_expand_cb_check(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;

	return (spa->spa_raidz_expand != NULL);
}

void
spa_start_raidz_expansion_thread(spa_t *spa)
{
Expand Down Expand Up @@ -3402,6 +3506,9 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET,
vdrz->vn_vre.vre_offset_phys);
}

fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_SCROBJ,
vdrz->vn_vre.vre_scratch_devices);
}

/*
Expand Down Expand Up @@ -3450,6 +3557,16 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
*/
}

if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_SCROBJ,
&vdrz->vn_vre.vre_scratch_devices) == 0) {
/*
* XXX If we got inconsistent scratch object, assert for now.
*/
ASSERT(vdrz->vn_vre.vre_scratch_devices == 0 ||
vdrz->vn_vre.vre_scratch_devices ==
vdrz->vd_physical_width - 1);
}

if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&nparity) == 0) {
if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
Expand Down