From d73da60c677a754639df357d2d9cc1db15c63a90 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Tue, 24 Jan 2023 14:05:45 -0800
Subject: [PATCH 1/2] Increase default zfs_scan_vdev_limit to 16MB

For HDD based pools the default zfs_scan_vdev_limit of 4M per-vdev
can significantly limit the maximum scrub performance. Increasing
the default to 16M can double the scrub speed from 80 MB/s per disk
to 160 MB/s per disk.

This does increase the memory footprint during scrub/resilver, but
given the performance win this is a reasonable trade-off. Memory
usage is capped at 1/4 of arc_c_max. Note that the number of
outstanding I/Os has not changed and is still limited by
zfs_vdev_scrub_max_active.

Signed-off-by: Brian Behlendorf
---
 man/man4/zfs.4        |  2 +-
 module/zfs/dsl_scan.c | 28 ++++++++++++++++------------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index e20d601340c6..677eb9a8df3a 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1898,7 +1898,7 @@ When disabled, the memory limit may be exceeded by fast disks.
 Freezes a scrub/resilver in progress without actually pausing it.
 Intended for testing/debugging.
 .
-.It Sy zfs_scan_vdev_limit Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq int
+.It Sy zfs_scan_vdev_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int
 Maximum amount of data that can be concurrently issued at once for scrubs and
 resilvers per leaf device, given in bytes.
 .
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index f9e437f0c947..a680d95fd6e3 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -126,7 +127,7 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
 static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
 static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
 static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
-static uint64_t dsl_scan_count_data_disks(vdev_t *vd);
+static uint64_t dsl_scan_count_data_disks(spa_t *spa);
 
 extern uint_t zfs_vdev_async_write_active_min_dirty_percent;
 static int zfs_scan_blkstats = 0;
@@ -147,7 +148,7 @@ static int zfs_scan_strict_mem_lim = B_FALSE;
  * overload the drives with I/O, since that is protected by
  * zfs_vdev_scrub_max_active.
  */
-static uint64_t zfs_scan_vdev_limit = 4 << 20;
+static uint64_t zfs_scan_vdev_limit = 16 << 20;
 
 static uint_t zfs_scan_issue_strategy = 0;
 
@@ -466,11 +467,12 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 
         /*
          * Calculate the max number of in-flight bytes for pool-wide
-         * scanning operations (minimum 1MB). Limits for the issuing
-         * phase are done per top-level vdev and are handled separately.
+         * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
+         * Limits for the issuing phase are done per top-level vdev and
+         * are handled separately.
          */
-        scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
-            dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
+        scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
+            zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
 
         avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
             offsetof(scan_ds_t, sds_node));
@@ -2811,8 +2813,9 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 }
 
 static uint64_t
-dsl_scan_count_data_disks(vdev_t *rvd)
+dsl_scan_count_data_disks(spa_t *spa)
 {
+        vdev_t *rvd = spa->spa_root_vdev;
         uint64_t i, leaves = 0;
 
         for (i = 0; i < rvd->vdev_children; i++) {
@@ -3711,12 +3714,13 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
         taskqid_t prefetch_tqid;
 
         /*
-         * Recalculate the max number of in-flight bytes for pool-wide
-         * scanning operations (minimum 1MB). Limits for the issuing
-         * phase are done per top-level vdev and are handled separately.
+         * Calculate the max number of in-flight bytes for pool-wide
+         * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
+         * Limits for the issuing phase are done per top-level vdev and
+         * are handled separately.
          */
-        scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
-            dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
+        scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
+            zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
 
         if (scnp->scn_ddt_bookmark.ddb_class <=
             scnp->scn_ddt_class_max) {

From 30842083e4935d40a2d91798e12846ca6db6ac76 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Tue, 24 Jan 2023 15:23:32 -0800
Subject: [PATCH 2/2] Increase default zfs_rebuild_vdev_limit to 64MB

When testing distributed rebuild performance with more capable
hardware it was observed that increasing the zfs_rebuild_vdev_limit
to 64M reduced the rebuild time by 17%. Beyond 64MB there was some
improvement (~2%) but it was not significant when weighed against
the increased memory usage. Memory usage is capped at 1/4 of
arc_c_max.

Additionally, vr_bytes_inflight_max has been moved so it's updated
per-metaslab to allow the size to be adjusted while a rebuild is
running.

Signed-off-by: Brian Behlendorf
---
 man/man4/zfs.4            |  2 +-
 module/zfs/vdev_rebuild.c | 27 ++++++++++++++++++---------
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 677eb9a8df3a..aa4328783ed8 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1769,7 +1769,7 @@ completes in order to verify the checksums of all blocks which have been
 resilvered.
 This is enabled by default and strongly recommended.
 .
-.It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq u64
+.It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64
 Maximum amount of I/O that can be concurrently issued for a sequential
 resilver per leaf device, given in bytes.
 .
diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c
index 1f56275c853b..62aa61b3b9e7 100644
--- a/module/zfs/vdev_rebuild.c
+++ b/module/zfs/vdev_rebuild.c
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include
 #include
 
 /*
@@ -116,13 +117,12 @@ static uint64_t zfs_rebuild_max_segment = 1024 * 1024;
  * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep
  * the queue depth short.
  *
- * 32MB was selected as the default value to achieve good performance with
- * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential
- * rebuild was unable to saturate all of the drives using smaller values.
- * With a value of 32MB the sequential resilver write rate was measured at
- * 800MB/s sustained while rebuilding to a distributed spare.
+ * 64MB was observed to deliver the best performance and set as the default.
+ * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c)
+ * and a rebuild rate of 1.2GB/s was measured to the distributed spare.
+ * Smaller values were unable to fully saturate the available pool I/O.
  */
-static uint64_t zfs_rebuild_vdev_limit = 32 << 20;
+static uint64_t zfs_rebuild_vdev_limit = 64 << 20;
 
 /*
  * Automatically start a pool scrub when the last active sequential resilver
@@ -754,6 +754,7 @@ vdev_rebuild_thread(void *arg)
 {
         vdev_t *vd = arg;
         spa_t *spa = vd->vdev_spa;
+        vdev_t *rvd = spa->spa_root_vdev;
         int error = 0;
 
         /*
@@ -786,9 +787,6 @@ vdev_rebuild_thread(void *arg)
         vr->vr_pass_bytes_scanned = 0;
         vr->vr_pass_bytes_issued = 0;
 
-        vr->vr_bytes_inflight_max = MAX(1ULL << 20,
-            zfs_rebuild_vdev_limit * vd->vdev_children);
-
         uint64_t update_est_time = gethrtime();
         vdev_rebuild_update_bytes_est(vd, 0);
 
@@ -804,6 +802,17 @@ vdev_rebuild_thread(void *arg)
                 metaslab_t *msp = vd->vdev_ms[i];
                 vr->vr_scan_msp = msp;
 
+                /*
+                 * Calculate the max number of in-flight bytes for top-level
+                 * vdev scanning operations (minimum 1MB, maximum 1/4 of
+                 * arc_c_max shared by all top-level vdevs). Limits for the
+                 * issuing phase are done per top-level vdev and are handled
+                 * separately.
+                 */
+                uint64_t limit = (arc_c_max / 4) / MAX(rvd->vdev_children, 1);
+                vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20,
+                    zfs_rebuild_vdev_limit * vd->vdev_children));
+
                 /*
                  * Removal of vdevs from the vdev tree may eliminate the need
                  * for the rebuild, in which case it should be canceled. The