Skip to content

Commit

Permalink
Single IO issue for raidz writes with skip sector
Browse files Browse the repository at this point in the history
In order to reduce contention on the vq_lock, optional skip sectors
for RAIDZ writes can be placed into a single IO request. This is done by
padding out the linear ABD for a parity column to contain the skip
sector and by creating a gang ABD to contain the data and skip sector for
data columns.

The vdev_raidz_map_alloc() function now calls separate helper functions
for reads and writes to allocate the ABDs that will be issued down to
the VDEV children.

Signed-off-by: Brian Atkinson <batkinson@lanl.gov>
  • Loading branch information
bwatkinson committed Sep 14, 2021
1 parent acb2046 commit 2c09b66
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 92 deletions.
1 change: 1 addition & 0 deletions include/sys/vdev_raidz.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ void vdev_raidz_map_free(struct raidz_map *);
void vdev_raidz_generate_parity_row(struct raidz_map *, struct raidz_row *);
void vdev_raidz_generate_parity(struct raidz_map *);
void vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
void vdev_raidz_child_done(zio_t *);
void vdev_raidz_io_done(zio_t *);

extern const zio_vsd_ops_t vdev_raidz_vsd_ops;
Expand Down
14 changes: 2 additions & 12 deletions module/zfs/vdev_draid.c
Original file line number Diff line number Diff line change
Expand Up @@ -1798,16 +1798,6 @@ vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
#endif
}

/*
 * Completion callback for a dRAID child I/O.
 *
 * The column the child I/O was issued for is carried in zio->io_private.
 * Record the child's error on that column and mark the column as tried
 * and not skipped.
 */
static void
vdev_draid_child_done(zio_t *zio)
{
	raidz_col_t *col = zio->io_private;

	col->rc_skipped = 0;
	col->rc_tried = 1;
	col->rc_error = zio->io_error;
}

/*
* For write operations:
* 1. Generate the parity data
Expand Down Expand Up @@ -1838,7 +1828,7 @@ vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr)
zio_nowait(zio_vdev_child_io(zio, NULL,
vd->vdev_child[rc->rc_devidx], rc->rc_offset,
rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
0, vdev_draid_child_done, rc));
0, vdev_raidz_child_done, rc));
}
}

Expand Down Expand Up @@ -1988,7 +1978,7 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_draid_child_done, rc));
vdev_raidz_child_done, rc));
}
}
}
Expand Down
211 changes: 131 additions & 80 deletions module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,115 @@ const zio_vsd_ops_t vdev_raidz_vsd_ops = {
.vsd_free = vdev_raidz_map_free_vsd,
};

/*
 * Allocate the column ABDs of a RAIDZ map for a write.
 *
 * Only single-row maps are handled (asserted below).  Skip sectors that
 * fall on a column which also carries parity or data are folded into that
 * column's ABD, so the column and its skip sector can be issued as one
 * child I/O.  Columns with rc_size == 0 (optional single skip sectors) get
 * no ABD here; they are handled in vdev_raidz_io_start_write().
 *
 *   zio    - the write zio whose io_abd supplies the data columns
 *   rm     - the map whose first row's columns are populated
 *   ashift - log2 of the skip sector size; each skip sector is
 *            (1ULL << ashift) bytes
 */
static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
	raidz_row_t *rr = rm->rm_row[0];
	uint64_t off = 0;
	int nwrapped = 0;
	int c = 0;

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3U(rm->rm_nrows, ==, 1);

	/*
	 * Work out how many skip sectors land on the leading columns, so
	 * that those (parity) columns can be padded with additional space.
	 */
	if (rm->rm_skipstart < rr->rr_firstdatacol) {
		ASSERT0(rm->rm_skipstart);
		nwrapped = rm->rm_nskip;
	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
		nwrapped =
		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
	}

	/*
	 * Start the count at the number of skip-only columns
	 * (rc_size == 0); those are issued separately by
	 * vdev_raidz_io_start_write().
	 */
	int skipped = rr->rr_scols - rr->rr_cols;

	/*
	 * Parity columns.
	 *
	 * Each one is backed by a linear ABD because the parity
	 * calculations operate on the ABD buffer directly, which avoids a
	 * memcpy back into the ABD once parity has been generated.  When a
	 * skip sector lands on a parity column, the linear ABD is enlarged
	 * by one zero-filled sector so that parity and skip sector go out
	 * as one I/O, reducing contention on the child VDEV queue locks
	 * (vq_lock).
	 */
	while (c < rr->rr_firstdatacol) {
		raidz_col_t *pcol = &rr->rr_col[c];

		if (c >= nwrapped) {
			pcol->rc_abd =
			    abd_alloc_linear(pcol->rc_size, B_FALSE);
		} else {
			pcol->rc_abd = abd_alloc_linear(
			    pcol->rc_size + (1ULL << ashift), B_FALSE);
			abd_zero_off(pcol->rc_abd, pcol->rc_size,
			    1ULL << ashift);
			skipped++;
		}
		c++;
	}

	/*
	 * Data columns.
	 *
	 * Every data column references its slice of zio->io_abd.  Where a
	 * skip sector still has to be placed, the slice is wrapped in a
	 * gang ABD followed by a sector of zeros, so that a single I/O
	 * carries both the data and the skip sector.  This improves
	 * aggregation continuity and reduces contention on the child VDEV
	 * queue locks (vq_lock).
	 *
	 * rc_size is deliberately left untouched even though the ABD grows
	 * by a skip sector: vdev_raidz_generate_parity_row() iterates over
	 * the ABDs using rc_size, and the zeroed skip sectors must not
	 * take part in the parity calculation because those same sectors
	 * are not used during reconstruction.
	 */
	while (c < rr->rr_cols) {
		raidz_col_t *dcol = &rr->rr_col[c];
		abd_t *slice = abd_get_offset_struct(&dcol->rc_abdstruct,
		    zio->io_abd, off, dcol->rc_size);

		if (c < rm->rm_skipstart || skipped >= rm->rm_nskip) {
			/* No skip sector to attach to this column. */
			dcol->rc_abd = slice;
		} else {
			dcol->rc_abd = abd_alloc_gang();
			abd_gang_add(dcol->rc_abd, slice, B_TRUE);
			abd_gang_add(dcol->rc_abd,
			    abd_get_zeros(1ULL << ashift), B_TRUE);
			skipped++;
		}

		off += dcol->rc_size;
		c++;
	}

	/* All of the zio's data and every skip sector must be covered. */
	ASSERT3U(off, ==, zio->io_size);
	ASSERT3S(skipped, ==, rm->rm_nskip);
}

/*
 * Allocate the column ABDs of a RAIDZ map for any I/O that is not a write
 * (vdev_raidz_map_alloc() calls this whenever io_type is not
 * ZIO_TYPE_WRITE).
 *
 * Only single-row maps are handled (asserted below).  No skip sector
 * padding is added to any column.
 *
 *   zio - the zio whose io_abd backs the data columns
 *   rm  - the map whose first row's columns are populated
 */
static void
vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
{
	raidz_row_t *rr = rm->rm_row[0];
	uint64_t off = 0;
	int c = 0;

	ASSERT3U(rm->rm_nrows, ==, 1);

	/* Each parity column gets its own linear buffer of rc_size bytes. */
	while (c < rr->rr_firstdatacol) {
		raidz_col_t *pcol = &rr->rr_col[c];

		pcol->rc_abd = abd_alloc_linear(pcol->rc_size, B_FALSE);
		c++;
	}

	/*
	 * Each data column references the next rc_size bytes of
	 * zio->io_abd, starting at offset 0.
	 */
	while (c < rr->rr_cols) {
		raidz_col_t *dcol = &rr->rr_col[c];

		dcol->rc_abd = abd_get_offset_struct(&dcol->rc_abdstruct,
		    zio->io_abd, off, dcol->rc_size);
		off += dcol->rc_size;
		c++;
	}
}

/*
* Divides the IO evenly across all child vdevs; usually, dcols is
* the number of children in the target vdev.
Expand Down Expand Up @@ -287,17 +396,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
rm->rm_nskip = roundup(tot, nparity + 1) - tot;
rm->rm_skipstart = bc;

for (c = 0; c < rr->rr_firstdatacol; c++)
rr->rr_col[c].rc_abd =
abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);

for (uint64_t off = 0; c < acols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
zio->io_abd, off, rc->rc_size);
off += rc->rc_size;
}

/*
* If all data stored spans all columns, there's a danger that parity
* will always be on the same device and, since parity isn't read
Expand Down Expand Up @@ -333,6 +431,12 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
rm->rm_skipstart = 1;
}

if (zio->io_type == ZIO_TYPE_WRITE) {
vdev_raidz_map_alloc_write(zio, rm, ashift);
} else {
vdev_raidz_map_alloc_read(zio, rm);
}

/* init RAIDZ parity ops */
rm->rm_ops = vdev_raidz_math_get_ops();

Expand Down Expand Up @@ -1477,7 +1581,7 @@ vdev_raidz_min_asize(vdev_t *vd)
vd->vdev_children);
}

static void
void
vdev_raidz_child_done(zio_t *zio)
{
raidz_col_t *rc = zio->io_private;
Expand All @@ -1486,18 +1590,6 @@ vdev_raidz_child_done(zio_t *zio)
rc->rc_error = zio->io_error;
rc->rc_tried = 1;
rc->rc_skipped = 0;

/*
* If we created a gang ABD to aggregate IO's for writes we will
* free the gang ABD here and reset the column's ABD to the original
* ABD.
*/
if (zio->io_type == ZIO_TYPE_WRITE && abd_is_gang(rc->rc_abd)) {
ASSERT3P(rc->rc_orig_data, !=, rc->rc_abd);
abd_free(rc->rc_abd);
rc->rc_abd = rc->rc_orig_data;
rc->rc_orig_data = NULL;
}
}

static void
Expand Down Expand Up @@ -1538,78 +1630,37 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
{
vdev_t *vd = zio->io_vd;
raidz_map_t *rm = zio->io_vsd;
int c, skipped = 0, nwrapped = 0;

vdev_raidz_generate_parity_row(rm, rr);

if (rm->rm_skipstart < rr->rr_firstdatacol) {
ASSERT0(rm->rm_skipstart);
nwrapped = rm->rm_nskip;
} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
nwrapped =
(rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
}

for (c = 0; c < rr->rr_scols; c++) {
abd_t *abd = NULL;
for (int c = 0; c < rr->rr_scols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

/* Verify physical to logical translation */
vdev_raidz_io_verify(vd, rr, c);

/*
* Generate I/O for skip sectors to improve aggregation
* contiguity. We will use gang ABD's to reduce contention
* on the children VDEV queue locks (vq_lock) by issuing
* a single I/O that contains the data and skip sectors.
* Generate optional I/O for single skip sector to improve
* aggregation contiguity.
*/
if (c < nwrapped || (c >= rm->rm_skipstart &&
skipped < rm->rm_nskip)) {
skipped++;
if (rc->rc_size > 0) {
abd = abd_alloc_gang();
abd_gang_add(abd, rc->rc_abd, B_FALSE);
abd_gang_add(abd,
abd_get_zeros(1ULL << ashift), B_TRUE);

/*
* Store original ABD so the gang ABD can be
* freed in vdev_raidz_child_done().
*
* Because rc_orig_data is only used for
* reconstruction during reads, we can safely
* stash the original raidz_col_t's ABD in it
* for writes.
*/
ASSERT3P(rc->rc_orig_data, ==, NULL);
rc->rc_orig_data = rc->rc_abd;
rc->rc_abd = abd;
} else {
ASSERT3P(rc->rc_abd, ==, NULL);
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, NULL, 1ULL << ashift,
zio->io_type, zio->io_priority,
ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL,
NULL));
continue;
}

if (rc->rc_size == 0) {
ASSERT3P(rc->rc_abd, ==, NULL);
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, NULL, 1ULL << ashift,
zio->io_type, zio->io_priority,
ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL,
NULL));
} else {
/*
* I/O does not contain any skip sectors.
*/
abd = rc->rc_abd;
ASSERT3P(rc->rc_abd, !=, NULL);
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd,
abd_get_size(rc->rc_abd), zio->io_type,
zio->io_priority, 0, vdev_raidz_child_done, rc));
}

ASSERT3P(abd, !=, NULL);

zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
abd, abd_get_size(abd), zio->io_type, zio->io_priority,
0, vdev_raidz_child_done, rc));
}

ASSERT3S(skipped, ==, rm->rm_nskip);
}

static void
Expand Down

0 comments on commit 2c09b66

Please sign in to comment.