Switch KM_SLEEP to KM_PUSHPAGE
Differences between how paging is done on Solaris and Linux can cause
deadlocks if KM_SLEEP is used in any of the following contexts.

  * The z_wr_* threads
  * The txg_sync thread
  * The zvol write/discard threads
  * The zpl_putpage() VFS callback

This is because KM_SLEEP will allow for direct reclaim, which may result
in the VM calling back into the filesystem or block layer to write out
pages.  If a lock is held over this operation the potential exists to
deadlock the system.  To ensure forward progress all memory allocations
in these contexts must use KM_PUSHPAGE, which disables performing any I/O
to accomplish the memory allocation.
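
For illustration, the sketch below shows the allocation pattern in
isolation.  The structure and function are hypothetical and are not part
of this patch; only the KM_PUSHPAGE usage reflects the rule described
above.

  #include <sys/types.h>
  #include <sys/kmem.h>

  typedef struct sync_ctx_rec {
          uint64_t        scr_txg;
          void            *scr_data;
  } sync_ctx_rec_t;

  /* Hypothetical helper called from a sync-path context such as the
   * txg_sync thread or a z_wr_* thread. */
  static sync_ctx_rec_t *
  sync_ctx_rec_alloc(uint64_t txg)
  {
          sync_ctx_rec_t *scr;

          /*
           * KM_PUSHPAGE may sleep, but it will not start new filesystem
           * or block-layer I/O to satisfy the allocation, so it cannot
           * recurse into the thread that is already holding locks over
           * this call.  KM_SLEEP here could enter direct reclaim and
           * deadlock under memory pressure.
           */
          scr = kmem_zalloc(sizeof (sync_ctx_rec_t), KM_PUSHPAGE);
          scr->scr_txg = txg;

          return (scr);
  }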

Previously, this behavior was achieved by setting PF_MEMALLOC on the
thread.  However, that resulted in unexpected side effects such as the
exhaustion of pages in ZONE_DMA.  This approach touches more of the zfs
code, but it is more consistent with the right way to handle these cases
under Linux.
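
For comparison, the earlier workaround roughly amounted to the sketch
below: wrapping the thread body in PF_MEMALLOC so every allocation it
makes can dip into reserved pages.  This is illustrative only and is not
the exact code from the commits listed below.

  #include <linux/sched.h>

  /* Rough sketch of the previous PF_MEMALLOC approach; illustrative only. */
  static void
  example_sync_thread_body(void *arg)
  {
          (void) arg;

          /*
           * PF_MEMALLOC lets this task allocate from reserved pages and
           * keeps it from being pushed into reclaim-driven I/O, but it
           * applies to every allocation the thread makes, which is how
           * ZONE_DMA pages could be exhausted as a side effect.
           */
          current->flags |= PF_MEMALLOC;

          /* ... txg sync / write work happens here ... */

          current->flags &= ~PF_MEMALLOC;
  }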

This patch lays the groundwork for being able to safely revert the
following commits which used PF_MEMALLOC:

  21ade34 Disable direct reclaim for z_wr_* threads
  cfc9a5c Fix zpl_writepage() deadlock
  eec8164 Fix ASSERTION(!dsl_pool_sync_context(tx->tx_pool))

Signed-off-by: Richard Yao <ryao@cs.stonybrook.edu>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #726
ryao authored and behlendorf committed May 21, 2012
1 parent c421831 commit 48279ed
Showing 7 changed files with 24 additions and 24 deletions.
6 changes: 3 additions & 3 deletions module/zfs/dbuf.c
@@ -298,7 +298,7 @@ dbuf_init(void)
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Large allocations which do not require contiguous pages
* should be using vmem_alloc() in the linux kernel */
- h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+ h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_PUSHPAGE);
#else
h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
#endif
@@ -1719,7 +1719,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_type != DMU_OT_NONE);

- db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+ db = kmem_cache_alloc(dbuf_cache, KM_PUSHPAGE);

db->db_objset = os;
db->db.db_object = dn->dn_object;
@@ -2019,7 +2019,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
int error;

dh = kmem_zalloc(sizeof(struct dbuf_hold_impl_data) *
- DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP);
+ DBUF_HOLD_IMPL_MAX_DEPTH, KM_PUSHPAGE);
__dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0);

error = __dbuf_hold_impl(dh);
12 changes: 6 additions & 6 deletions module/zfs/dmu.c
@@ -381,7 +381,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
}
nblks = 1;
}
- dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP | KM_NODEBUG);
+ dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_PUSHPAGE | KM_NODEBUG);

if (dn->dn_objset->os_dsl_dataset)
dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
@@ -863,11 +863,11 @@ dmu_xuio_init(xuio_t *xuio, int nblk)
uio_t *uio = &xuio->xu_uio;

uio->uio_iovcnt = nblk;
- uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
+ uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_PUSHPAGE);

- priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
+ priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_PUSHPAGE);
priv->cnt = nblk;
- priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
+ priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_PUSHPAGE);
priv->iovp = uio->uio_iov;
XUIO_XUZC_PRIV(xuio) = priv;

@@ -1431,7 +1431,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
return (EIO); /* Make zl_get_data do txg_waited_synced() */
}

- dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_PUSHPAGE);
dsa->dsa_dr = NULL;
dsa->dsa_done = done;
dsa->dsa_zgd = zgd;
@@ -1555,7 +1555,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
mutex_exit(&db->db_mtx);

- dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_PUSHPAGE);
dsa->dsa_dr = dr;
dsa->dsa_done = done;
dsa->dsa_zgd = zgd;
6 changes: 3 additions & 3 deletions module/zfs/dmu_tx.c
@@ -63,7 +63,7 @@ static kstat_t *dmu_tx_ksp;
dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
- dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
+ dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_PUSHPAGE);
tx->tx_dir = dd;
if (dd)
tx->tx_pool = dd->dd_pool;
@@ -141,7 +141,7 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
}
}

- txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
+ txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_PUSHPAGE);
txh->txh_tx = tx;
txh->txh_dnode = dn;
#ifdef DEBUG_DMU_TX
@@ -1241,7 +1241,7 @@ dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
{
dmu_tx_callback_t *dcb;

- dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
+ dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_PUSHPAGE);

dcb->dcb_func = func;
dcb->dcb_data = data;
2 changes: 1 addition & 1 deletion module/zfs/dmu_zfetch.c
@@ -699,7 +699,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
if (cur_streams >= max_streams) {
return;
}
- newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
+ newstream = kmem_zalloc(sizeof (zstream_t), KM_PUSHPAGE);
}

newstream->zst_offset = zst.zst_offset;
6 changes: 3 additions & 3 deletions module/zfs/dnode.c
@@ -372,7 +372,7 @@ static dnode_t *
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
uint64_t object, dnode_handle_t *dnh)
{
- dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+ dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_PUSHPAGE);

ASSERT(!POINTER_IS_VALID(dn->dn_objset));
dn->dn_moved = 0;
@@ -1491,7 +1491,7 @@ dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
} else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
/* clear a chunk out of this range */
free_range_t *new_rp =
- kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+ kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE);

new_rp->fr_blkid = endblk;
new_rp->fr_nblks = fr_endblk - endblk;
@@ -1669,7 +1669,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];

/* Add new range to dn_ranges */
- rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+ rp = kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE);
rp->fr_blkid = blkid;
rp->fr_nblks = nblks;
found = avl_find(tree, rp, &where);
2 changes: 1 addition & 1 deletion module/zfs/txg.c
@@ -339,7 +339,7 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
TASKQ_THREADS_CPU_PCT | TASKQ_PREPOPULATE);
}

- cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ cb_list = kmem_alloc(sizeof (list_t), KM_PUSHPAGE);
list_create(cb_list, sizeof (dmu_tx_callback_t),
offsetof(dmu_tx_callback_t, dcb_node));

14 changes: 7 additions & 7 deletions module/zfs/zil.c
@@ -144,7 +144,7 @@ zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
if (avl_find(t, dva, &where) != NULL)
return (EEXIST);

- zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
+ zn = kmem_alloc(sizeof (zil_bp_node_t), KM_PUSHPAGE);
zn->zn_dva = *dva;
avl_insert(t, zn, where);

@@ -434,7 +434,7 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
{
lwb_t *lwb;

- lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+ lwb = kmem_cache_alloc(zil_lwb_cache, KM_PUSHPAGE);
lwb->lwb_zilog = zilog;
lwb->lwb_blk = *bp;
lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
@@ -731,7 +731,7 @@ zil_add_block(zilog_t *zilog, const blkptr_t *bp)
for (i = 0; i < ndvas; i++) {
zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
if (avl_find(t, &zvsearch, &where) == NULL) {
- zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
+ zv = kmem_alloc(sizeof (*zv), KM_PUSHPAGE);
zv->zv_vdev = zvsearch.zv_vdev;
avl_insert(t, zv, where);
}
@@ -1235,7 +1235,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
}
ASSERT(itxg->itxg_sod == 0);
itxg->itxg_txg = txg;
- itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
+ itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_PUSHPAGE);

list_create(&itxs->i_sync_list, sizeof (itx_t),
offsetof(itx_t, itx_node));
@@ -1255,7 +1255,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)

ian = avl_find(t, &foid, &where);
if (ian == NULL) {
- ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
+ ian = kmem_alloc(sizeof (itx_async_node_t), KM_PUSHPAGE);
list_create(&ian->ia_list, sizeof (itx_t),
offsetof(itx_t, itx_node));
ian->ia_foid = foid;
@@ -1626,7 +1626,7 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
zilog_t *zilog;
int i;

- zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
+ zilog = kmem_zalloc(sizeof (zilog_t), KM_PUSHPAGE);

zilog->zl_header = zh_phys;
zilog->zl_os = os;
@@ -1948,7 +1948,7 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
zr.zr_replay = replay_func;
zr.zr_arg = arg;
zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
- zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+ zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_PUSHPAGE);

/*
* Wait for in-progress removes to sync before starting replay.