diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 0518205f9906..feadbbf7deea 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1890,9 +1890,11 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) db->db.db_size = size; if (db->db_level == 0) { - ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); db->db_last_dirty->dt.dl.dr_data = buf; } + ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); + ASSERT3U(db->db_last_dirty->dr_accounted, ==, osize); + db->db_last_dirty->dr_accounted = size; mutex_exit(&db->db_mtx); dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); @@ -2105,7 +2107,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } - if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) + if (db->db_blkid != DMU_BONUS_BLKID) dr->dr_accounted = db->db.db_size; dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; @@ -4312,8 +4314,7 @@ dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) /* * The callback will be called io_phys_children times. Retire one * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dsl_pool_sync()'s call to - * dsl_pool_undirty_space(). + * error will be cleaned up by dbuf_write_done(). */ delta = dr->dr_accounted / zio->io_phys_children; dsl_pool_undirty_space(dp, delta, zio->io_txg); @@ -4396,13 +4397,36 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); + + /* + * If we didn't do a physical write in this ZIO and we + * still ended up here, it means that the space of the + * dbuf that we just released (and undirtied) above hasn't + * been marked as undirtied in the pool's accounting. + * + * Thus, we undirty that space in the pool's view of the + * world here. For physical writes this type of update + * happens in dbuf_write_physdone(). + * + * If we did a physical write, cleanup any rounding errors + * that came up due to writing multiple copies of a block + * on disk [see dbuf_write_physdone()]. + */ + if (zio->io_phys_children == 0) { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted, zio->io_txg); + } else { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted % zio->io_phys_children, zio->io_txg); + } + + kmem_free(dr, sizeof (dbuf_dirty_record_t)); } static void diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 955588fb7b6a..aa3ef6458d47 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1090,6 +1090,9 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } +/* + * Note: Lustre is an external consumer of this interface. + */ void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 7a540bdfaf10..3afafd1827ac 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -2908,9 +2908,17 @@ dmu_fsname(const char *snapname, char *buf) } /* - * Call when we think we're going to write/free space in open context to track - * the amount of dirty data in the open txg, which is also the amount - * of memory that can not be evicted until this txg syncs. + * Call when we think we're going to write/free space in open context + * to track the amount of dirty data in the open txg, which is also the + * amount of memory that can not be evicted until this txg syncs. + * + * Note that there are two conditions where this can be called from + * syncing context: + * + * [1] When we just created the dataset, in which case we go on with + * updating any accounting of dirty data as usual. + * [2] When we are dirtying MOS data, in which case we only update the + * pool's accounting of dirty data. */ void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) @@ -2920,8 +2928,9 @@ dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) if (ds != NULL) { dsl_dir_willuse_space(ds->ds_dir, aspace, tx); - dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); } + + dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); } #if defined(_KERNEL) diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 9fb3a061d946..1f1fd6462720 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -658,15 +658,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) } VERIFY0(zio_wait(zio)); - /* - * We have written all of the accounted dirty data, so our - * dp_space_towrite should now be zero. However, some seldom-used - * code paths do not adhere to this (e.g. dbuf_undirty(), also - * rounding error in dbuf_write_physdone). - * Shore up the accounting of any dirtied space now. - */ - dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); - /* * Update the long range free counter after * we're done syncing user data @@ -762,6 +753,21 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dsl_pool_sync_mos(dp, tx); } + /* + * We have written all of the accounted dirty data, so our + * dp_space_towrite should now be zero. However, some seldom-used + * code paths do not adhere to this (e.g. dbuf_undirty()). Shore up + * the accounting of any dirtied space now. + * + * Note that, besides any dirty data from datasets, the amount of + * dirty data in the MOS is also accounted by the pool. Therefore, + * we want to do this cleanup after dsl_pool_sync_mos() so we don't + * attempt to update the accounting for the same dirty data twice. + * (i.e. at this point we only update the accounting for the space + * that we know that we "leaked"). + */ + dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); + /* * If we modify a dataset in the same txg that we want to destroy it, * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.