Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dmu_tx_wait() hang likely due to cv_signal() in dsl_pool_dirty_delta() #9137

Merged
merged 1 commit into from
Aug 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 29 additions & 5 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1890,9 +1890,11 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
db->db.db_size = size;

if (db->db_level == 0) {
ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
db->db_last_dirty->dt.dl.dr_data = buf;
}
ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
ASSERT3U(db->db_last_dirty->dr_accounted, ==, osize);
db->db_last_dirty->dr_accounted = size;
mutex_exit(&db->db_mtx);

dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
Expand Down Expand Up @@ -2105,7 +2107,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
if (db->db_blkid != DMU_BONUS_BLKID)
dr->dr_accounted = db->db.db_size;
dr->dr_dbuf = db;
dr->dr_txg = tx->tx_txg;
Expand Down Expand Up @@ -4312,8 +4314,7 @@ dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
/*
* The callback will be called io_phys_children times. Retire one
* portion of our dirty space each time we are called. Any rounding
* error will be cleaned up by dsl_pool_sync()'s call to
* dsl_pool_undirty_space().
* error will be cleaned up by dbuf_write_done().
*/
delta = dr->dr_accounted / zio->io_phys_children;
dsl_pool_undirty_space(dp, delta, zio->io_txg);
Expand Down Expand Up @@ -4396,13 +4397,36 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
mutex_destroy(&dr->dt.di.dr_mtx);
list_destroy(&dr->dt.di.dr_children);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));

cv_broadcast(&db->db_changed);
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
db->db_data_pending = NULL;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);

/*
* If we didn't do a physical write in this ZIO and we
* still ended up here, it means that the space of the
* dbuf that we just released (and undirtied) above hasn't
* been marked as undirtied in the pool's accounting.
*
* Thus, we undirty that space in the pool's view of the
* world here. For physical writes this type of update
* happens in dbuf_write_physdone().
*
* If we did a physical write, cleanup any rounding errors
* that came up due to writing multiple copies of a block
* on disk [see dbuf_write_physdone()].
*/
if (zio->io_phys_children == 0) {
dsl_pool_undirty_space(dmu_objset_pool(os),
dr->dr_accounted, zio->io_txg);
} else {
dsl_pool_undirty_space(dmu_objset_pool(os),
dr->dr_accounted % zio->io_phys_children, zio->io_txg);
}

kmem_free(dr, sizeof (dbuf_dirty_record_t));
}

static void
Expand Down
3 changes: 3 additions & 0 deletions module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1090,6 +1090,9 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
}

/*
* Note: Lustre is an external consumer of this interface.
*/
void
dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx)
Expand Down
17 changes: 13 additions & 4 deletions module/zfs/dmu_objset.c
Original file line number Diff line number Diff line change
Expand Up @@ -2908,9 +2908,17 @@ dmu_fsname(const char *snapname, char *buf)
}

/*
* Call when we think we're going to write/free space in open context to track
* the amount of dirty data in the open txg, which is also the amount
* of memory that can not be evicted until this txg syncs.
* Call when we think we're going to write/free space in open context
* to track the amount of dirty data in the open txg, which is also the
* amount of memory that can not be evicted until this txg syncs.
*
* Note that there are two conditions where this can be called from
* syncing context:
*
* [1] When we just created the dataset, in which case we go on with
* updating any accounting of dirty data as usual.
* [2] When we are dirtying MOS data, in which case we only update the
* pool's accounting of dirty data.
*/
void
dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
Expand All @@ -2920,8 +2928,9 @@ dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)

if (ds != NULL) {
dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
}

dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
}

#if defined(_KERNEL)
Expand Down
24 changes: 15 additions & 9 deletions module/zfs/dsl_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -658,15 +658,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
}
VERIFY0(zio_wait(zio));

/*
* We have written all of the accounted dirty data, so our
* dp_space_towrite should now be zero. However, some seldom-used
* code paths do not adhere to this (e.g. dbuf_undirty(), also
* rounding error in dbuf_write_physdone).
* Shore up the accounting of any dirtied space now.
*/
dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);

/*
* Update the long range free counter after
* we're done syncing user data
Expand Down Expand Up @@ -762,6 +753,21 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dsl_pool_sync_mos(dp, tx);
}

/*
* We have written all of the accounted dirty data, so our
* dp_space_towrite should now be zero. However, some seldom-used
* code paths do not adhere to this (e.g. dbuf_undirty()). Shore up
* the accounting of any dirtied space now.
*
* Note that, besides any dirty data from datasets, the amount of
* dirty data in the MOS is also accounted by the pool. Therefore,
* we want to do this cleanup after dsl_pool_sync_mos() so we don't
* attempt to update the accounting for the same dirty data twice.
* (i.e. at this point we only update the accounting for the space
* that we know that we "leaked").
*/
dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);

/*
* If we modify a dataset in the same txg that we want to destroy it,
* its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
Expand Down