cleanup

ahrens · Aug 31, 2020 · a0baa47 · a0baa47
1 parent 0ccbc9e
commit a0baa47
Showing 1 changed file with 32 additions and 60 deletions.
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
@@ -3119,6 +3119,28 @@ raidz_reflow_read_done(zio_t *zio)
 	zio_nowait(zio_unique_parent(zio));
 }
 
+static void	/* record reflow progress up to `offset` in tx's txg */
+raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
+    dmu_tx_t *tx)
+{
+	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;	/* per-txg slot */
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+	if (offset == 0)	/* 0 means "empty" in the slot check below */
+		return;
+
+	mutex_enter(&vre->vre_lock);	/* vre_offset updated under lock */
+	ASSERT3U(vre->vre_offset, <=, offset);	/* must not move backwards */
+	vre->vre_offset = offset;
+	mutex_exit(&vre->vre_lock);
+
+	if (vre->vre_offset_pertxg[txgoff] == 0) {	/* slot still empty */
+		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
+		    spa, 0, ZFS_SPACE_CHECK_NONE, tx);
+	}	/* NOTE(review): presumably raidz_reflow_sync zeroes slot */
+	vre->vre_offset_pertxg[txgoff] = offset;	/* newest for txg */
+}
+
 static boolean_t
 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
     dmu_tx_t *tx)
@@ -3141,28 +3163,20 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
 	int old_children = vd->vdev_children - 1;
 
 	/*
-	 * If this would cause us to pass a block whose progress has not
-	 * yet been committed to disk, return TRUE indicating that we need
-	 * to try again in the next txg, and advance only to the point we
-	 * are able.  Otherwise a subsequent write into the unallocated region
-	 * we are skipping could cause an overlap.
+	 * We can only progress to the point that writes will not overlap with
+	 * blocks whose progress has not yet been recorded on disk
+	 * (vre_offset_phys).  Note that even if we are skipping over a large
+	 * unallocated region, we can't move the on-disk progress to `offset`,
+	 * because concurrent writes/allocations could still use the
+	 * currently-unallocated region.
 	 */
 	uint64_t vre_offset_phys_blkid =
 	    MAX(old_children, vre->vre_offset_phys >> ashift);
-	/*
-	 * We can't overwrite this block.
-	 */
 	uint64_t next_overwrite_blkid = vre_offset_phys_blkid +
 	    vre_offset_phys_blkid / old_children;
 	if (blkid >= next_overwrite_blkid) {
-		mutex_enter(&vre->vre_lock);
-		vre->vre_offset = next_overwrite_blkid << ashift;
-		mutex_exit(&vre->vre_lock);
-		if (vre->vre_offset > 0 && vre->vre_offset_pertxg[txgoff] == 0) {
-			dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
-			    spa, 0, ZFS_SPACE_CHECK_NONE, tx);
-		}
-		vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
+		raidz_reflow_record_progress(vre,
+		    next_overwrite_blkid << ashift, tx);
 
 		zfs_dbgmsg("copying offset %llu, vre_offset_phys %llu, "
 		    "max_overwrite = %llu wait for txg %llu",
@@ -3173,40 +3187,6 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
 		return (B_TRUE);
 	}
 
-	/*
-	 * Record the fact that we've completed up to the beginning
-	 * of this segment.  This is important since there could be
-	 * an unallocated segment preceding this, and the overwrite-check
-	 * code needs to know that we have processed up to this point.
-	 */
-	mutex_enter(&vre->vre_lock);
-	vre->vre_offset = offset;
-	mutex_exit(&vre->vre_lock);
-	if (vre->vre_offset > 0 && vre->vre_offset_pertxg[txgoff] == 0) {
-		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
-		    spa, 0, ZFS_SPACE_CHECK_NONE, tx);
-	}
-	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
-
-	/*
-	 * If this would cause us to overwrite a block whose progress has not
-	 * yet been committed to disk, return TRUE indicating that we need
-	 * to try again in the next txg.
-	 */
-	uint64_t overwrite_blkid =
-	    (blkid / vd->vdev_children) * old_children +
-	    (blkid % vd->vdev_children);
-	/* XXX allow overwrite of first row for now */
-	if (blkid > vd->vdev_children &&
-	    overwrite_blkid << ashift >= vre->vre_offset_phys) {
-		zfs_dbgmsg("copying offset %llu, vre_offset_phys %llu, "
-		    "wait for txg %llu",
-		    (long long)offset,
-		    (long long)vre->vre_offset_phys,
-		    (long long)dmu_tx_get_txg(tx));
-		return (B_TRUE);
-	}
-
 	range_tree_remove(rt, offset, length);
 
 	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
@@ -3217,20 +3197,12 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
 	zfs_dbgmsg("initiating reflow write offset=%llu length=%llu",
 	    offset, length);
 
+	raidz_reflow_record_progress(vre, offset + length, tx);
+
 	mutex_enter(&vre->vre_lock);
-	ASSERT3U(vre->vre_offset, <=, offset);
-	vre->vre_offset = offset + length;
 	vre->vre_outstanding_bytes += length;
 	mutex_exit(&vre->vre_lock);
 
-#if 0 /* XXX already done above */
-	if (vre->vre_offset_pertxg[txgoff] == 0) {
-		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
-		    spa, 0, ZFS_SPACE_CHECK_NONE, tx);
-	}
-	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
-#endif
-
 	/*
 	 * SCL_STATE will be released when the read and write are done,
 	 * by raidz_reflow_write_done().