From 4ad1561e350d80578605191a1334a7acc72936cf Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 2 Apr 2024 17:40:49 -0400 Subject: [PATCH] Fix read errors race after block cloning Investigating read errors triggering panic fixed in #16042 I've found that we have a race in a sync process between the moment dirty record for cloned block is removed and the moment dbuf is destroyed. If dmu_buf_hold_array_by_dnode() take a hold on a cloned dbuf before it is synced/destroyed, then dbuf_read_impl() may see it still in DB_NOFILL state, but without the dirty record. Such case is not an error, but equivalent to DB_UNCACHED, since the dbuf block pointer is already updated by dbuf_write_ready(). Unfortunately it is impossible to safely change the dbuf state to DB_UNCACHED there, since there may already be another cloning in progress, that dropped dbuf lock before creating a new dirty record, protected only by the range lock. Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. --- module/zfs/dbuf.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 4e190c131e1d..7e6c73fec583 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1564,7 +1564,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; - blkptr_t bp, *bpp; + blkptr_t bp, *bpp = NULL; DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -1580,29 +1580,28 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, goto early_unlock; } - if (db->db_state == DB_UNCACHED) { - if (db->db_blkptr == NULL) { - bpp = NULL; - } else { - bp = *db->db_blkptr; + /* + * If we have a pending block clone, we don't want to read the + * underlying block, but the content of the block being cloned, + * pointed by the dirty record, so we have the most recent data. + * If there is no dirty record, then we hit a race in a sync + * process when the dirty record is already removed, while the + * dbuf is not yet destroyed. Such case is equivalent to uncached. + */ + if (db->db_state == DB_NOFILL) { + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + if (dr != NULL) { + if (!dr->dt.dl.dr_brtwrite) { + err = EIO; + goto early_unlock; + } + bp = dr->dt.dl.dr_overridden_by; bpp = &bp; } - } else { - dbuf_dirty_record_t *dr; - - ASSERT3S(db->db_state, ==, DB_NOFILL); + } - /* - * Block cloning: If we have a pending block clone, - * we don't want to read the underlying block, but the content - * of the block being cloned, so we have the most recent data. - */ - dr = list_head(&db->db_dirty_records); - if (dr == NULL || !dr->dt.dl.dr_brtwrite) { - err = EIO; - goto early_unlock; - } - bp = dr->dt.dl.dr_overridden_by; + if (bpp == NULL && db->db_blkptr != NULL) { + bp = *db->db_blkptr; bpp = &bp; }