Resilver performance tuning #14428

Closed
4 changes: 2 additions & 2 deletions man/man4/zfs.4
@@ -1769,7 +1769,7 @@ completes in order to verify the checksums of all blocks which have been
resilvered.
This is enabled by default and strongly recommended.
.
-.It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq u64
+.It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64
Maximum amount of I/O that can be concurrently issued for a sequential
resilver per leaf device, given in bytes.
.
@@ -1898,7 +1898,7 @@ When disabled, the memory limit may be exceeded by fast disks.
Freezes a scrub/resilver in progress without actually pausing it.
Intended for testing/debugging.
.
-.It Sy zfs_scan_vdev_limit Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq int
+.It Sy zfs_scan_vdev_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int
Maximum amount of data that can be concurrently issued at once for scrubs and
resilvers per leaf device, given in bytes.
.
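
Both tunables above are per-leaf-device budgets, so the totals actually kept in flight scale with pool width: the scan code multiplies zfs_scan_vdev_limit by the pool's data disks, while the rebuild code multiplies zfs_rebuild_vdev_limit by the children of the top-level vdev being rebuilt, each with a 1 MiB floor. The sketch below shows that aggregation under the new defaults; it is illustrative only, the 90-leaf count is hypothetical and aggregate_limit() is not a function in the patch. The C hunks that follow additionally cap these totals at a quarter of arc_c_max.

/* Illustrative sketch (not part of this patch): aggregate a per-leaf
 * byte budget across a hypothetical 90-leaf pool with the 1 MiB floor. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define MIB (1024ULL * 1024ULL)

static uint64_t
aggregate_limit(uint64_t per_leaf_bytes, uint64_t leaves)
{
	uint64_t total = per_leaf_bytes * leaves;
	return (total < MIB ? MIB : total);	/* minimum 1 MiB */
}

int
main(void)
{
	uint64_t leaves = 90;	/* hypothetical leaf count */

	printf("scan    (16 MiB per leaf): %" PRIu64 " MiB\n",
	    aggregate_limit(16 * MIB, leaves) / MIB);	/* 1440 MiB */
	printf("rebuild (64 MiB per leaf): %" PRIu64 " MiB\n",
	    aggregate_limit(64 * MIB, leaves) / MIB);	/* 5760 MiB */
	return (0);
}
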
28 changes: 16 additions & 12 deletions module/zfs/dsl_scan.c
@@ -37,6 +37,7 @@
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
+#include <sys/arc_impl.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
@@ -126,7 +127,7 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
-static uint64_t dsl_scan_count_data_disks(vdev_t *vd);
+static uint64_t dsl_scan_count_data_disks(spa_t *spa);

extern uint_t zfs_vdev_async_write_active_min_dirty_percent;
static int zfs_scan_blkstats = 0;
@@ -147,7 +148,7 @@ static int zfs_scan_strict_mem_lim = B_FALSE;
* overload the drives with I/O, since that is protected by
* zfs_vdev_scrub_max_active.
*/
-static uint64_t zfs_scan_vdev_limit = 4 << 20;
+static uint64_t zfs_scan_vdev_limit = 16 << 20;

static uint_t zfs_scan_issue_strategy = 0;

@@ -466,11 +467,12 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)

/*
* Calculate the max number of in-flight bytes for pool-wide
-* scanning operations (minimum 1MB). Limits for the issuing
-* phase are done per top-level vdev and are handled separately.
+* scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
+* Limits for the issuing phase are done per top-level vdev and
+* are handled separately.
*/
-scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
-dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
+scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
+zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));

avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
offsetof(scan_ds_t, sds_node));
@@ -2811,8 +2813,9 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
}

static uint64_t
-dsl_scan_count_data_disks(vdev_t *rvd)
+dsl_scan_count_data_disks(spa_t *spa)
{
+vdev_t *rvd = spa->spa_root_vdev;
uint64_t i, leaves = 0;

for (i = 0; i < rvd->vdev_children; i++) {
@@ -3711,12 +3714,13 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
taskqid_t prefetch_tqid;

/*
-* Recalculate the max number of in-flight bytes for pool-wide
-* scanning operations (minimum 1MB). Limits for the issuing
-* phase are done per top-level vdev and are handled separately.
+* Calculate the max number of in-flight bytes for pool-wide
+* scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
+* Limits for the issuing phase are done per top-level vdev and
+* are handled separately.
*/
-scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
-dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
+scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
+zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));

if (scnp->scn_ddt_bookmark.ddb_class <=
scnp->scn_ddt_class_max) {
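
To make the effect of the new MIN(arc_c_max / 4, ...) clamp concrete, here is a standalone sketch, not the ZFS code itself, that evaluates the old and new scn_maxinflight_bytes expressions side by side; the ARC size and disk count are assumed example values.

/* Standalone sketch of scn_maxinflight_bytes before and after this change.
 * The ARC cap and disk count are assumed example values. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define MIB		(1024ULL * 1024ULL)
#define GIB		(1024ULL * MIB)
#define MAX64(a, b)	((a) > (b) ? (a) : (b))
#define MIN64(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t zfs_scan_vdev_limit = 16 * MIB;	/* new default */
	uint64_t data_disks = 90;			/* hypothetical pool */
	uint64_t arc_c_max = 4 * GIB;			/* hypothetical ARC cap */

	/* Old: per-leaf limit times data disks, floored at 1 MiB. */
	uint64_t old_bytes = MAX64(zfs_scan_vdev_limit * data_disks, 1 * MIB);

	/* New: additionally capped at a quarter of arc_c_max so in-flight
	 * scan buffers cannot overwhelm a small ARC. */
	uint64_t new_bytes = MIN64(arc_c_max / 4,
	    MAX64(1 * MIB, zfs_scan_vdev_limit * data_disks));

	/* Prints: old 1440 MiB, new 1024 MiB (the ARC cap engages). */
	printf("old %" PRIu64 " MiB, new %" PRIu64 " MiB\n",
	    old_bytes / MIB, new_bytes / MIB);
	return (0);
}

With a large ARC the MIN() term is a no-op, and the change simply issues four times more per data disk than the old 4 MiB default.
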
27 changes: 18 additions & 9 deletions module/zfs/vdev_rebuild.c
@@ -34,6 +34,7 @@
#include <sys/zio.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
+#include <sys/arc_impl.h>
#include <sys/zap.h>

/*
@@ -116,13 +117,12 @@ static uint64_t zfs_rebuild_max_segment = 1024 * 1024;
* segment size is also large (zfs_rebuild_max_segment=1M). This helps keep
* the queue depth short.
*
-* 32MB was selected as the default value to achieve good performance with
-* a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential
-* rebuild was unable to saturate all of the drives using smaller values.
-* With a value of 32MB the sequential resilver write rate was measured at
-* 800MB/s sustained while rebuilding to a distributed spare.
+* 64MB was observed to deliver the best performance and set as the default.
+* Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c)
+* and a rebuild rate of 1.2GB/s was measured to the distributed spare.
+* Smaller values were unable to fully saturate the available pool I/O.
*/
-static uint64_t zfs_rebuild_vdev_limit = 32 << 20;
+static uint64_t zfs_rebuild_vdev_limit = 64 << 20;

/*
* Automatically start a pool scrub when the last active sequential resilver
@@ -754,6 +754,7 @@ vdev_rebuild_thread(void *arg)
{
vdev_t *vd = arg;
spa_t *spa = vd->vdev_spa;
+vdev_t *rvd = spa->spa_root_vdev;
int error = 0;

/*
@@ -786,9 +787,6 @@
vr->vr_pass_bytes_scanned = 0;
vr->vr_pass_bytes_issued = 0;

-vr->vr_bytes_inflight_max = MAX(1ULL << 20,
-zfs_rebuild_vdev_limit * vd->vdev_children);

uint64_t update_est_time = gethrtime();
vdev_rebuild_update_bytes_est(vd, 0);

@@ -804,6 +802,17 @@
metaslab_t *msp = vd->vdev_ms[i];
vr->vr_scan_msp = msp;

+/*
+* Calculate the max number of in-flight bytes for top-level
+* vdev scanning operations (minimum 1MB, maximum 1/4 of
+* arc_c_max shared by all top-level vdevs). Limits for the
+* issuing phase are done per top-level vdev and are handled
+* separately.
+*/
+uint64_t limit = (arc_c_max / 4) / MAX(rvd->vdev_children, 1);
+vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20,
+zfs_rebuild_vdev_limit * vd->vdev_children));

/*
* Removal of vdevs from the vdev tree may eliminate the need
* for the rebuild, in which case it should be canceled. The
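
The rebuild path gets the same treatment, with one extra step: the arc_c_max / 4 budget is divided evenly among the top-level vdevs before it is applied as a ceiling on vr_bytes_inflight_max, and because the assignment now sits inside the per-metaslab loop, changes to the tunable or to arc_c_max take effect as the rebuild advances to the next metaslab. The sketch below is illustrative only; the ARC size, top-level vdev count, and dRAID child count are assumed values.

/* Standalone sketch of the new vr_bytes_inflight_max clamp: a quarter of
 * arc_c_max is shared evenly across top-level vdevs, then used as a
 * ceiling on the per-vdev rebuild budget.  All inputs are assumed values. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define MIB		(1024ULL * 1024ULL)
#define GIB		(1024ULL * MIB)
#define MAX64(a, b)	((a) > (b) ? (a) : (b))
#define MIN64(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t zfs_rebuild_vdev_limit = 64 * MIB;	/* new default */
	uint64_t arc_c_max = 8 * GIB;		/* hypothetical ARC cap */
	uint64_t top_level_vdevs = 2;		/* hypothetical rvd->vdev_children */
	uint64_t children = 106;		/* hypothetical dRAID children */

	/* Each top-level vdev gets an equal share of arc_c_max / 4. */
	uint64_t share = (arc_c_max / 4) / MAX64(top_level_vdevs, 1);

	/* Per-vdev budget: per-child limit times children, floored at
	 * 1 MiB, then capped at the ARC share computed above. */
	uint64_t inflight_max = MIN64(share,
	    MAX64(1 * MIB, zfs_rebuild_vdev_limit * children));

	/* Prints: share 1024 MiB, inflight_max 1024 MiB (ARC-bound). */
	printf("share %" PRIu64 " MiB, inflight_max %" PRIu64 " MiB\n",
	    share / MIB, inflight_max / MIB);
	return (0);
}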