Scrub mirror children without BPs

When scrubbing a raidz/draid pool, which contains a replacing or sparing mirror with multiple online children, only one child will be read. This is not normally a serious concern because the DTL records are used to determine where a good copy of the data is. As long as the data can be read from one child the mirror vdev will use it to repair gaps in any of its children. Furthermore, even if the data which was read is corrupt the raidz code will detect this and issue its own repair I/O to correct the damage in the mirror vdev. However, in the scenario where the DTL is wrong due to silent data corruption (say due to overwritting one child) and the scrub happens to read from a child with good data, then the other damaged mirror child will not be detected nor repaired. While this is possible for both raidz and draid vdevs, it's most pronouced when using draid. This is because sequential rebuilds may be used with draid and they rely solely on parity to resilver. And because distributed spares are usually available and always preferred, which means damage on other mirror child will go undetected. For system administrations this behavior is non-intuitive and in a worst case scenario could result in the only good copy of the data being unknowingly detached from the mirror. This change resolves the issue by reading all replacing/sparing mirror children when scrubbing. When the BP isn't available for verification, then compare the data buffers from each child. They must all be indentical, if not there's silent damage and an error is returned to prompt the top-level vdev to issue a repair I/O to rewrite the data on all of the mirror children. Since we can't tell which child was wrong a checksum error is logged against the replacing or sparing mirror vdev. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
openzfs · Jun 16, 2022 · f672ce5 · f672ce5
1 parent 4ff7a8f
commit f672ce5
Show file tree

Hide file tree

Showing 7 changed files with 248 additions and 34 deletions.
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
@@ -35,6 +35,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/zio.h>
+#include <sys/zio_checksum.h>
 #include <sys/abd.h>
 #include <sys/fs/zfs.h>
 
@@ -102,6 +103,7 @@ vdev_mirror_stat_fini(void)
  */
 typedef struct mirror_child {
 	vdev_t		*mc_vd;
+	abd_t		*mc_abd;
 	uint64_t	mc_offset;
 	int		mc_error;
 	int		mc_load;
@@ -434,6 +436,10 @@ vdev_mirror_child_done(zio_t *zio)
 {
 	mirror_child_t *mc = zio->io_private;
 
+	/* See scrubbing read comment in vdev_mirror_io_start() */
+	if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL)
+		mc->mc_abd = zio->io_abd;
+
 	mc->mc_error = zio->io_error;
 	mc->mc_tried = 1;
 	mc->mc_skipped = 0;
@@ -637,15 +643,16 @@ vdev_mirror_io_start(zio_t *zio)
 	}
 
 	if (zio->io_type == ZIO_TYPE_READ) {
-		if (zio->io_bp != NULL &&
-		    (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
+		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
 			/*
-			 * For scrubbing reads (if we can verify the
-			 * checksum here, as indicated by io_bp being
-			 * non-NULL) we need to allocate a read buffer for
-			 * each child and issue reads to all children.  If
-			 * any child succeeds, it will copy its data into
-			 * zio->io_data in vdev_mirror_scrub_done.
+			 * For scrubbing reads we need to allocate a buffer
+			 * for each child and issue reads to all children.
+			 * If we can verify the checksum here, as indicated
+			 * by io_bp being non-NULL, the data will be copied
+			 * into zio->io_data in vdev_mirror_scrub_done().
+			 * If not, then vdev_mirror_io_done() will compare
+			 * all of the read buffers and return a checksum
+			 * error if they aren't all identical.
 			 */
 			for (c = 0; c < mm->mm_children; c++) {
 				mc = &mm->mm_child[c];
@@ -663,7 +670,8 @@ vdev_mirror_io_start(zio_t *zio)
 				    abd_alloc_sametype(zio->io_abd,
 				    zio->io_size), zio->io_size,
 				    zio->io_type, zio->io_priority, 0,
-				    vdev_mirror_scrub_done, mc));
+				    zio->io_bp ? vdev_mirror_scrub_done :
+				    vdev_mirror_child_done, mc));
 			}
 			zio_execute(zio);
 			return;
@@ -731,6 +739,7 @@ vdev_mirror_io_done(zio_t *zio)
 	int c;
 	int good_copies = 0;
 	int unexpected_errors = 0;
+	int last_good_copy = -1;
 
 	if (mm == NULL)
 		return;
@@ -742,6 +751,7 @@ vdev_mirror_io_done(zio_t *zio)
 			if (!mc->mc_skipped)
 				unexpected_errors++;
 		} else if (mc->mc_tried) {
+			last_good_copy = c;
 			good_copies++;
 		}
 	}
@@ -755,7 +765,6 @@ vdev_mirror_io_done(zio_t *zio)
 		 * no non-degraded top-level vdevs left, and not update DTLs
 		 * if we intend to reallocate.
 		 */
-		/* XXPOLICY */
 		if (good_copies != mm->mm_children) {
 			/*
 			 * Always require at least one good copy.
@@ -782,7 +791,6 @@ vdev_mirror_io_done(zio_t *zio)
 	/*
 	 * If we don't have a good copy yet, keep trying other children.
 	 */
-	/* XXPOLICY */
 	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
 		ASSERT(c >= 0 && c < mm->mm_children);
 		mc = &mm->mm_child[c];
@@ -794,7 +802,54 @@ vdev_mirror_io_done(zio_t *zio)
 		return;
 	}
 
-	/* XXPOLICY */
+	/*
+	 * If we're scrubbing but don't have a BP available (because this
+	 * vdev is under a raidz or draid vdev) then the best we can do is
+	 * compare all of the copies read.  If they're not identical then
+	 * return a checksum error and the most likely correct data.  The
+	 * raidz code will issue a repair I/O if possible.
+	 */
+	if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL) {
+		abd_t *last_good_abd;
+
+		ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops ||
+		    zio->io_vd->vdev_ops == &vdev_spare_ops);
+
+		if (good_copies > 1) {
+			last_good_abd = mm->mm_child[last_good_copy].mc_abd;
+			abd_t *best_abd = NULL;
+
+			for (c = 0; c < last_good_copy; c++) {
+				mc = &mm->mm_child[c];
+
+				if (mc->mc_error || !mc->mc_tried)
+					continue;
+
+				if (abd_cmp(mc->mc_abd, last_good_abd) != 0)
+					zio->io_error = SET_ERROR(ECKSUM);
+
+				if (best_abd == NULL &&
+				    mc->mc_vd->vdev_ops ==
+				    &vdev_draid_spare_ops) {
+					best_abd = mc->mc_abd;
+				}
+			}
+
+			abd_copy(zio->io_abd, best_abd ? best_abd :
+			    last_good_abd, zio->io_size);
+
+		} else if (good_copies == 1) {
+			last_good_abd = mm->mm_child[last_good_copy].mc_abd;
+			abd_copy(zio->io_abd, last_good_abd, zio->io_size);
+		}
+
+		for (c = 0; c < mm->mm_children; c++) {
+			mc = &mm->mm_child[c];
+			abd_free(mc->mc_abd);
+			mc->mc_abd = NULL;
+		}
+	}
+
 	if (good_copies == 0) {
 		zio->io_error = vdev_mirror_worst_error(mm);
 		ASSERT(zio->io_error != 0);

diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
@@ -1885,6 +1885,9 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
 			parity_untried++;
 		}
+
+		if (rc->rc_force_repair)
+			unexpected_errors++;
 	}
 
 	/*
@@ -2249,9 +2252,20 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
-		if (rc->rc_error) {
-			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
+		/*
+		 * While scrubbing a replacing/sparing vdev determined
+		 * that not all children have identical copies of the data.
+		 * Clear the error so the buffer is treated like any other
+		 * read, then force a repair to correct the damage.
+		 */
+		if (rc->rc_error == ECKSUM) {
+			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
+			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
+			rc->rc_force_repair = 1;
+			rc->rc_error = 0;
+		}
 
+		if (rc->rc_error) {
 			if (c < rr->rr_firstdatacol)
 				parity_errors++;
 			else

diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
@@ -763,7 +763,8 @@ tags = ['functional', 'raidz']
 
 [tests/functional/redundancy]
 tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
-    'redundancy_draid3', 'redundancy_draid_damaged', 'redundancy_draid_spare1',
+    'redundancy_draid3', 'redundancy_draid_damaged1',
+    'redundancy_draid_damaged2', 'redundancy_draid_spare1',
     'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror',
     'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',
     'redundancy_raidz3', 'redundancy_stripe']

diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
@@ -227,6 +227,7 @@ maybe = {
     'pool_checkpoint/checkpoint_discard_busy': ['FAIL', 11946],
     'projectquota/setup': ['SKIP', exec_reason],
     'redundancy/redundancy_004_neg': ['FAIL', 7290],
+    'redundancy/redundancy_draid_damaged2': ['FAIL', known_reason],
     'redundancy/redundancy_draid_spare1': ['FAIL', known_reason],
     'redundancy/redundancy_draid_spare3': ['FAIL', known_reason],
     'removal/removal_condense_export': ['FAIL', known_reason],

diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
@@ -1632,7 +1632,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/redundancy/redundancy_draid1.ksh \
 	functional/redundancy/redundancy_draid2.ksh \
 	functional/redundancy/redundancy_draid3.ksh \
-	functional/redundancy/redundancy_draid_damaged.ksh \
+	functional/redundancy/redundancy_draid_damaged1.ksh \
+	functional/redundancy/redundancy_draid_damaged2.ksh \
 	functional/redundancy/redundancy_draid.ksh \
 	functional/redundancy/redundancy_draid_spare1.ksh \
 	functional/redundancy/redundancy_draid_spare2.ksh \

diff --git a/...l/redundancy/redundancy_draid_damaged.ksh → .../redundancy/redundancy_draid_damaged1.ksh b/...l/redundancy/redundancy_draid_damaged.ksh → .../redundancy/redundancy_draid_damaged1.ksh
@@ -85,28 +85,13 @@ function test_sequential_resilver # <pool> <parity> <dir>
 
 	for (( i=0; i<$nparity; i=i+1 )); do
 		spare=draid${nparity}-0-$i
-		zpool status $pool
-		zpool replace -fsw $pool $dir/dev-$i $spare
-		zpool status $pool
+		log_must zpool replace -fsw $pool $dir/dev-$i $spare
 	done
 
 	log_must zpool scrub -w $pool
+	log_must zpool status $pool
 
-	# When only a single child was overwritten the sequential resilver
-	# can fully repair the damange from parity and the scrub will have
-	# nothing to repair. When multiple children are silently damaged
-	# the sequential resilver will calculate the wrong data since only
-	# the parity information is used and it cannot be verified with
-	# the checksum. However, since only the resilvering devices are
-	# written to with the bad data a subsequent scrub will be able to
-	# fully repair the pool.
-	#
-	if [[ $nparity == 1 ]]; then
-		log_must check_pool_status $pool "scan" "repaired 0B"
-	else
-		log_mustnot check_pool_status $pool "scan" "repaired 0B"
-	fi
-
+	log_mustnot check_pool_status $pool "scan" "repaired 0B"
 	log_must check_pool_status $pool "errors" "No known data errors"
 	log_must check_pool_status $pool "scan" "with 0 errors"
 }