Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add IO aggregation in case of expanded read #7

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions include/sys/vdev_raidz_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,13 @@ typedef struct raidz_map {
uintptr_t rm_reports; /* # of referencing checksum reports */
boolean_t rm_freed; /* map no longer has referencing ZIO */
boolean_t rm_ecksuminjected; /* checksum error was injected */
boolean_t rm_io_aggregation;
int rm_nrows;
int rm_nskip; /* Sectors skipped for padding */
int rm_nphys_cols; /* Number of leaf devices */
zfs_locked_range_t *rm_lr;
const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
raidz_col_t *rm_phys_col; /* leaf devices array */
raidz_row_t *rm_row[0]; /* flexible array of rows */
} raidz_map_t;

Expand Down
153 changes: 141 additions & 12 deletions module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,18 @@
uint64_t zfs_raidz_expand_max_offset_pause = UINT64_MAX;
uint64_t zfs_raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;

/*
 * Row-count threshold for IO aggregation of expanded reads.
 *
 * vdev_raidz_map_alloc_expanded() turns aggregation on for a map when its
 * number of rows is greater than or equal to this value.  The variable is
 * exported read-write through ZFS_MODULE_PARAM at the end of this file.
 */
unsigned long raidz_io_aggregate_rows = 8;

/*
 * Aggregated IO statistics, exported read-only through ZFS_MODULE_PARAM.
 *
 * raidz_io_count_total is bumped once for every expanded map that is
 * allocated; raidz_io_count_aggregated is bumped only for those maps that
 * had aggregation enabled.
 *
 * NOTE(review): both counters are updated with a plain "++"; confirm
 * whether concurrent allocations can make the values approximate.
 */
unsigned long raidz_io_count_total = 0;
unsigned long raidz_io_count_aggregated = 0;

static void
vdev_raidz_row_free(raidz_row_t *rr)
{
Expand Down Expand Up @@ -179,6 +191,16 @@ vdev_raidz_map_free(raidz_map_t *rm)
for (int i = 0; i < rm->rm_nrows; i++) {
vdev_raidz_row_free(rm->rm_row[i]);
}

if (rm->rm_nphys_cols) {
for (int i = 0; i < rm->rm_nphys_cols; i++)
if (rm->rm_phys_col[i].rc_abd)
abd_free(rm->rm_phys_col[i].rc_abd);

kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
rm->rm_nphys_cols);
}

ASSERT3P(rm->rm_lr, ==, NULL);
kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}
Expand Down Expand Up @@ -562,6 +584,20 @@ vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
rm->rm_nskip = roundup(tot, nparity + 1) - tot;
asize = 0;

if (rows >= raidz_io_aggregate_rows) {
rm->rm_io_aggregation = B_TRUE;
rm->rm_nphys_cols = physical_cols;
rm->rm_phys_col =
kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
KM_SLEEP);
for (int i = 0; i < rm->rm_nphys_cols; i++)
rm->rm_phys_col[i].rc_offset = UINT64_MAX;

raidz_io_count_aggregated++;
}

raidz_io_count_total++;

zfs_dbgmsg("rm=%p s=%d q=%d r=%d bc=%d nrows=%d cols=%d rfo=%llx",
rm, (int)s, (int)q, (int)r, (int)bc, (int)rows, (int)cols,
(long long)reflow_offset);
Expand Down Expand Up @@ -1967,7 +2003,22 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
}

static void
vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
vdev_raidz_get_phys_col_offset(raidz_map_t *rm, raidz_row_t *rr)
{
	/*
	 * Fold every non-empty column of row rr into the per-device
	 * ("physical column") extent kept in rm->rm_phys_col[], indexed by
	 * the column's rc_devidx.  Calling this for each row of the map
	 * leaves every physical column describing a single read,
	 * [rc_offset, rc_offset + rc_size), that covers all row columns
	 * located on that device.
	 *
	 * The extent is grown to the span covering all pieces rather than to
	 * the sum of their sizes.  The two are equal when the pieces of a
	 * device are back-to-back, but with a gap between pieces the sum
	 * would be smaller than the span, and the later copy out of the
	 * aggregated buffer at (rc_offset - prc->rc_offset) could then run
	 * past the end of that buffer.
	 */
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/* Empty columns contribute nothing to the extent. */
		if (rc->rc_size == 0)
			continue;

		raidz_col_t *prc = &rm->rm_phys_col[rc->rc_devidx];

		if (prc->rc_size == 0) {
			/* First piece seen for this device. */
			prc->rc_offset = rc->rc_offset;
			prc->rc_size = rc->rc_size;
			continue;
		}

		uint64_t start = MIN(prc->rc_offset, rc->rc_offset);
		uint64_t end = prc->rc_offset + prc->rc_size;
		uint64_t rc_end = rc->rc_offset + rc->rc_size;

		if (end < rc_end)
			end = rc_end;

		prc->rc_offset = start;
		prc->rc_size = end - start;
	}
}

static void
vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
{
vdev_t *vd = zio->io_vd;

Expand Down Expand Up @@ -2010,6 +2061,66 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
}
}

/*
 * Issue the aggregated reads of a raidz map: at most one child read per
 * physical column (entry of rm->rm_phys_col[]) whose rc_size is non-zero.
 *
 * A linear ABD of rc_size bytes is allocated into rc_abd for every such
 * column before the child vdev is examined, so the buffer exists even when
 * the read itself ends up being skipped.
 *
 * zio - parent read ZIO; its io_vd supplies the child vdevs.
 * rm  - raidz map whose physical columns already carry offset and size.
 */
static void
vdev_raidz_io_start_read_phys_col(zio_t *zio, raidz_map_t *rm)
{
	vdev_t *raidz_vd = zio->io_vd;

	for (int dev = 0; dev < rm->rm_nphys_cols; dev++) {
		raidz_col_t *pcol = &rm->rm_phys_col[dev];
		vdev_t *child;

		/* Nothing of this map lives on this device. */
		if (pcol->rc_size == 0)
			continue;

		pcol->rc_abd = abd_alloc_linear(pcol->rc_size, B_FALSE);
		child = raidz_vd->vdev_child[dev];

		/*
		 * XXX Per the original author's note: errors recorded on a
		 * physical column are ignored by *_io_done().  If
		 * raidz_checksum_verify() fails, the data is read again by
		 * vdev_raidz_read_all() without IO aggregation.
		 */
		if (!vdev_readable(child)) {
			pcol->rc_error = SET_ERROR(ENXIO);
			pcol->rc_tried = 1;	/* don't even try */
			pcol->rc_skipped = 1;
		} else if (vdev_dtl_contains(child, DTL_MISSING,
		    zio->io_txg, 1)) {
			pcol->rc_error = SET_ERROR(ESTALE);
			pcol->rc_skipped = 1;
		} else {
			zio_nowait(zio_vdev_child_io(zio, NULL, child,
			    pcol->rc_offset, pcol->rc_abd, pcol->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, pcol));
		}
	}
}

/*
 * Start the child reads for a raidz map.
 *
 * With rm_io_aggregation set, every row is first folded into the map's
 * physical columns and the reads are then issued per physical column.
 * Otherwise each row is started on its own.
 */
static void
vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
{
	if (rm->rm_io_aggregation) {
		for (int r = 0; r < rm->rm_nrows; r++)
			vdev_raidz_get_phys_col_offset(rm, rm->rm_row[r]);

		vdev_raidz_io_start_read_phys_col(zio, rm);
		return;
	}

	/*
	 * A map with more than one row touches all disks anyway, so the
	 * parity is read as well to keep the reads in decent size chunks.
	 * XXX maybe doesn't really matter?
	 */
	boolean_t forceparity = (rm->rm_nrows > 1);

	for (int r = 0; r < rm->rm_nrows; r++)
		vdev_raidz_io_start_read_row(zio, rm->rm_row[r], forceparity);
}

/*
* Start an IO operation on a RAIDZ VDev
*
Expand Down Expand Up @@ -2067,17 +2178,7 @@ vdev_raidz_io_start(zio_t *zio)
}
} else {
ASSERT(zio->io_type == ZIO_TYPE_READ);
/*
* If there are multiple rows, we will be hitting
* all disks, so go ahead and read the parity so
* that we are reading in decent size chunks.
* XXX maybe doesn't really matter?
*/
boolean_t forceparity = rm->rm_nrows > 1;
for (int i = 0; i < rm->rm_nrows; i++) {
vdev_raidz_io_start_read(zio,
rm->rm_row[i], forceparity);
}
vdev_raidz_io_start_read(zio, rm);
}

zio_execute(zio);
Expand Down Expand Up @@ -2720,6 +2821,25 @@ vdev_raidz_io_done(zio_t *zio)
vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
}
} else {
if (rm->rm_io_aggregation) {
for (int i = 0; i < rm->rm_nrows; i++) {
raidz_row_t *rr = rm->rm_row[i];
for (int c = 0; c < rr->rr_cols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
if (rc->rc_size == 0)
continue;

int idx = rc->rc_devidx;
raidz_col_t *prc = &rm->rm_phys_col[idx];

abd_copy_off(rc->rc_abd, prc->rc_abd,
0,
rc->rc_offset - prc->rc_offset,
rc->rc_size);
}
}
}

for (int i = 0; i < rm->rm_nrows; i++) {
raidz_row_t *rr = rm->rm_row[i];
rr->rr_code =
Expand All @@ -2743,6 +2863,7 @@ vdev_raidz_io_done(zio_t *zio)
* be marked as tried so we'll proceed to combinatorial
* reconstruction.
*/
rm->rm_io_aggregation = B_FALSE;
int nread = 0;
for (int i = 0; i < rm->rm_nrows; i++) {
nread += vdev_raidz_read_all(zio,
Expand Down Expand Up @@ -3592,3 +3713,11 @@ vdev_ops_t vdev_raidz_ops = {
.vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};

/*
 * Module parameters for IO aggregation of expanded reads.
 *
 * io_aggregate_rows is the read-write threshold; the map allocation code
 * compares with ">=", so the description says "at least".  The two
 * io_count_* parameters are read-only counters.
 */
ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
	"Apply raidz map abds aggregation if the map contains at least "
	"this many rows");

ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_count_total, ULONG, ZMOD_RD,
	"Expanded reads total");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_count_aggregated, ULONG, ZMOD_RD,
	"Expanded reads aggregated");