From 48279eda95adbec525237f1d05118028a44b954a Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 7 May 2012 13:49:51 -0400 Subject: [PATCH] Switch KM_SLEEP to KM_PUSHPAGE Differences between how paging is done on Solaris and Linux can cause deadlocks if KM_SLEEP is used in any of the following contexts. * The z_wr_* threads * The txg_sync thread * The zvol write/discard threads * The zpl_putpage() VFS callback This is because KM_SLEEP will allow for direct reclaim which may result in the VM calling back in to the filesystem or block layer to write out pages. If a lock is held over this operation the potential exists to deadlock the system. To ensure forward progress all memory allocations in these contexts must use KM_PUSHPAGE which disables performing any I/O to accomplish the memory allocation. Previously, this behavior was achieved by setting PF_MEMALLOC on the thread. However, that resulted in unexpected side effects such as the exhaustion of pages in ZONE_DMA. This approach touches more of the zfs code, but it is more consistent with the right way to handle these cases under Linux. 
This patch lays the groundwork for being able to safely revert the following commits which used PF_MEMALLOC: 21ade34 Disable direct reclaim for z_wr_* threads cfc9a5c Fix zpl_writepage() deadlock eec8164 Fix ASSERTION(!dsl_pool_sync_context(tx->tx_pool)) Signed-off-by: Richard Yao Signed-off-by: Brian Behlendorf Issue #726 --- module/zfs/dbuf.c | 6 +++--- module/zfs/dmu.c | 12 ++++++------ module/zfs/dmu_tx.c | 6 +++--- module/zfs/dmu_zfetch.c | 2 +- module/zfs/dnode.c | 6 +++--- module/zfs/txg.c | 2 +- module/zfs/zil.c | 14 +++++++------- 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 34ce2f62bd92..d5b469f67123 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -298,7 +298,7 @@ dbuf_init(void) #if defined(_KERNEL) && defined(HAVE_SPL) /* Large allocations which do not require contiguous pages * should be using vmem_alloc() in the linux kernel */ - h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); + h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_PUSHPAGE); #else h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); #endif @@ -1719,7 +1719,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); - db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + db = kmem_cache_alloc(dbuf_cache, KM_PUSHPAGE); db->db_objset = os; db->db.db_object = dn->dn_object; @@ -2019,7 +2019,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, int error; dh = kmem_zalloc(sizeof(struct dbuf_hold_impl_data) * - DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP); + DBUF_HOLD_IMPL_MAX_DEPTH, KM_PUSHPAGE); __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0); error = __dbuf_hold_impl(dh); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index cda4f8428483..1d4d1257d54e 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -381,7 +381,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, 
uint64_t offset, uint64_t length, } nblks = 1; } - dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP | KM_NODEBUG); + dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_PUSHPAGE | KM_NODEBUG); if (dn->dn_objset->os_dsl_dataset) dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; @@ -863,11 +863,11 @@ dmu_xuio_init(xuio_t *xuio, int nblk) uio_t *uio = &xuio->xu_uio; uio->uio_iovcnt = nblk; - uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); + uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_PUSHPAGE); - priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); + priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_PUSHPAGE); priv->cnt = nblk; - priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); + priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_PUSHPAGE); priv->iovp = uio->uio_iov; XUIO_XUZC_PRIV(xuio) = priv; @@ -1431,7 +1431,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, return (EIO); /* Make zl_get_data do txg_waited_synced() */ } - dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_PUSHPAGE); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; @@ -1555,7 +1555,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); - dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_PUSHPAGE); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index ead0f3e2a140..81c6dfea2eab 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -63,7 +63,7 @@ static kstat_t *dmu_tx_ksp; dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { - dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); + dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_PUSHPAGE); tx->tx_dir = dd; if (dd) tx->tx_pool = dd->dd_pool; @@ -141,7 +141,7 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, 
objset_t *os, uint64_t object, } } - txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); + txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_PUSHPAGE); txh->txh_tx = tx; txh->txh_dnode = dn; #ifdef DEBUG_DMU_TX @@ -1241,7 +1241,7 @@ dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) { dmu_tx_callback_t *dcb; - dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); + dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_PUSHPAGE); dcb->dcb_func = func; dcb->dcb_data = data; diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index 897ea8adbcb1..1763bae5184a 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -699,7 +699,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) if (cur_streams >= max_streams) { return; } - newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP); + newstream = kmem_zalloc(sizeof (zstream_t), KM_PUSHPAGE); } newstream->zst_offset = zst.zst_offset; diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 5438f60d0003..99ac6256561d 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -372,7 +372,7 @@ static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { - dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); + dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_PUSHPAGE); ASSERT(!POINTER_IS_VALID(dn->dn_objset)); dn->dn_moved = 0; @@ -1491,7 +1491,7 @@ dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) } else if (blkid > rp->fr_blkid && endblk < fr_endblk) { /* clear a chunk out of this range */ free_range_t *new_rp = - kmem_alloc(sizeof (free_range_t), KM_SLEEP); + kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE); new_rp->fr_blkid = endblk; new_rp->fr_nblks = fr_endblk - endblk; @@ -1669,7 +1669,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK]; /* Add new range to dn_ranges */ - rp = 
kmem_alloc(sizeof (free_range_t), KM_SLEEP); + rp = kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE); rp->fr_blkid = blkid; rp->fr_nblks = nblks; found = avl_find(tree, rp, &where); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 6e64adf9376e..5021e441fab7 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -339,7 +339,7 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) TASKQ_THREADS_CPU_PCT | TASKQ_PREPOPULATE); } - cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); + cb_list = kmem_alloc(sizeof (list_t), KM_PUSHPAGE); list_create(cb_list, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 5296b38be726..895ba52de748 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -144,7 +144,7 @@ zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) if (avl_find(t, dva, &where) != NULL) return (EEXIST); - zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); + zn = kmem_alloc(sizeof (zil_bp_node_t), KM_PUSHPAGE); zn->zn_dva = *dva; avl_insert(t, zn, where); @@ -434,7 +434,7 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg) { lwb_t *lwb; - lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + lwb = kmem_cache_alloc(zil_lwb_cache, KM_PUSHPAGE); lwb->lwb_zilog = zilog; lwb->lwb_blk = *bp; lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); @@ -731,7 +731,7 @@ zil_add_block(zilog_t *zilog, const blkptr_t *bp) for (i = 0; i < ndvas; i++) { zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); if (avl_find(t, &zvsearch, &where) == NULL) { - zv = kmem_alloc(sizeof (*zv), KM_SLEEP); + zv = kmem_alloc(sizeof (*zv), KM_PUSHPAGE); zv->zv_vdev = zvsearch.zv_vdev; avl_insert(t, zv, where); } @@ -1235,7 +1235,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) } ASSERT(itxg->itxg_sod == 0); itxg->itxg_txg = txg; - itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); + itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_PUSHPAGE); list_create(&itxs->i_sync_list, sizeof (itx_t), 
offsetof(itx_t, itx_node)); @@ -1255,7 +1255,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) ian = avl_find(t, &foid, &where); if (ian == NULL) { - ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); + ian = kmem_alloc(sizeof (itx_async_node_t), KM_PUSHPAGE); list_create(&ian->ia_list, sizeof (itx_t), offsetof(itx_t, itx_node)); ian->ia_foid = foid; @@ -1626,7 +1626,7 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog_t *zilog; int i; - zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); + zilog = kmem_zalloc(sizeof (zilog_t), KM_PUSHPAGE); zilog->zl_header = zh_phys; zilog->zl_os = os; @@ -1948,7 +1948,7 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) zr.zr_replay = replay_func; zr.zr_arg = arg; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); - zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); + zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_PUSHPAGE); /* * Wait for in-progress removes to sync before starting replay.