Make callers responsible for memory allocation in zfs_range_lock()
zfs_range_lock() is used in zvols, and previously it could deadlock due
to an allocation using KM_SLEEP. We avoid this by moving responsibility
for the memory allocation from zfs_range_lock() to the caller. This
lets callers use stack allocations, which are more efficient and avoid
such deadlocks. The contexts in which stack allocations are done do not
appear to be stack heavy, so we do not risk overflowing the stack by
doing this.

Signed-off-by: Richard Yao <ryao@cs.stonybrook.edu>

Conflicts:

	module/zfs/zvol.c
ryao committed Jun 25, 2012
1 parent 95b2930 commit 513ad2a
Showing 6 changed files with 65 additions and 62 deletions.
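
Before the file-by-file diff, here is a minimal sketch of the calling-convention change described above, modeled on the zfs_read() call site changed below (surrounding code and error handling omitted; this fragment is illustrative and not part of the patch):

	/* Old convention: zfs_range_lock() heap-allocated the rl_t itself with KM_SLEEP. */
	rl_t *rl;
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
	/* ... I/O under the range lock ... */
	zfs_range_unlock(rl);

	/* New convention: the caller owns the rl_t, typically on its stack. */
	rl_t rl;
	zfs_range_lock(&rl, zp, uio->uio_loffset, uio->uio_resid, RL_READER);
	/* ... I/O under the range lock ... */
	zfs_range_unlock(&rl);

With no KM_SLEEP allocation taken while acquiring the range lock, the zvol I/O path no longer blocks on the allocator at this point.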
32 changes: 17 additions & 15 deletions cmd/ztest/ztest.c
@@ -973,12 +973,11 @@ ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
}

static rl_t *
-ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
+ztest_range_lock(rl_t *rl, ztest_ds_t *zd, uint64_t object, uint64_t offset,
uint64_t size, rl_type_t type)
{
uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
-rl_t *rl;

rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
rl->rl_object = object;
@@ -1389,7 +1388,7 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
dmu_tx_t *tx;
dmu_buf_t *db;
arc_buf_t *abuf = NULL;
-rl_t *rl;
+rl_t rl;

if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -1413,7 +1412,7 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
bt = NULL;

ztest_object_lock(zd, lr->lr_foid, RL_READER);
-rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+ztest_range_lock(&rl, zd, lr->lr_foid, offset, length, RL_WRITER);

VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));

@@ -1438,7 +1437,7 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
if (abuf != NULL)
dmu_return_arcbuf(abuf);
dmu_buf_rele(db, FTAG);
-ztest_range_unlock(rl);
+ztest_range_unlock(&rl);
ztest_object_unlock(zd, lr->lr_foid);
return (ENOSPC);
}
@@ -1495,7 +1494,7 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)

dmu_tx_commit(tx);

-ztest_range_unlock(rl);
+ztest_range_unlock(&rl);
ztest_object_unlock(zd, lr->lr_foid);

return (0);
@@ -1507,13 +1506,13 @@ ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
objset_t *os = zd->zd_os;
dmu_tx_t *tx;
uint64_t txg;
-rl_t *rl;
+rl_t rl;

if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));

ztest_object_lock(zd, lr->lr_foid, RL_READER);
-rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
+ztest_range_lock(&rl, zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
RL_WRITER);

tx = dmu_tx_create(os);
@@ -1522,7 +1521,7 @@ ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)

txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
if (txg == 0) {
-ztest_range_unlock(rl);
+ztest_range_unlock(&rl);
ztest_object_unlock(zd, lr->lr_foid);
return (ENOSPC);
}
@@ -1534,7 +1533,7 @@ ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)

dmu_tx_commit(tx);

-ztest_range_unlock(rl);
+ztest_range_unlock(&rl);
ztest_object_unlock(zd, lr->lr_foid);

return (0);
@@ -1670,6 +1669,8 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
dmu_object_info_t doi;
dmu_buf_t *db;
zgd_t *zgd;
+rl_t rl;
+
int error;

ztest_object_lock(zd, object, RL_READER);
@@ -1694,9 +1695,10 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
zgd->zgd_zilog = zd->zd_zilog;
zgd->zgd_private = zd;
+zgd->zgd_rl = &rl;

if (buf != NULL) { /* immediate write */
-zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+ztest_range_lock(zgd->zgd_rl, zd, object, offset, size,
RL_READER);

error = dmu_read(os, object, offset, size, buf,
@@ -1711,7 +1713,7 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
offset = 0;
}

-zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+ztest_range_lock(zgd->zgd_rl, zd, object, offset, size,
RL_READER);

error = dmu_buf_hold(os, object, offset, zgd, &db,
@@ -1953,12 +1955,12 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
objset_t *os = zd->zd_os;
dmu_tx_t *tx;
uint64_t txg;
-rl_t *rl;
+rl_t rl;

txg_wait_synced(dmu_objset_pool(os), 0);

ztest_object_lock(zd, object, RL_READER);
-rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
+ztest_range_lock(&rl, zd, object, offset, size, RL_WRITER);

tx = dmu_tx_create(os);

@@ -1974,7 +1976,7 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
(void) dmu_free_long_range(os, object, offset, size);
}

-ztest_range_unlock(rl);
+ztest_range_unlock(&rl);
ztest_object_unlock(zd, object);
}

2 changes: 1 addition & 1 deletion include/sys/zfs_rlock.h
@@ -63,7 +63,7 @@ typedef struct rl {
* is converted to WRITER that specified to lock from the start of the
* end of file. zfs_range_lock() returns the range lock structure.
*/
-rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);
+rl_t *zfs_range_lock(rl_t *rl, znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);

/*
* Unlock range and destroy range lock structure.
10 changes: 4 additions & 6 deletions module/zfs/zfs_rlock.c
@@ -31,9 +31,9 @@
* Interface
* ---------
* Defined in zfs_rlock.h but essentially:
- * rl = zfs_range_lock(zp, off, len, lock_type);
- * zfs_range_unlock(rl);
- * zfs_range_reduce(rl, off, len);
+ * zfs_range_lock(&rl, zp, off, len, lock_type);
+ * zfs_range_unlock(&rl);
+ * zfs_range_reduce(&rl, off, len);
*
* AVL tree
* --------
@@ -420,13 +420,11 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new)
* previously locked as RL_WRITER).
*/
rl_t *
-zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
+zfs_range_lock(rl_t *new, znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
{
-rl_t *new;

ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);

-new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
new->r_zp = zp;
new->r_off = off;
if (len + off < off) /* overflow */
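
For reference, the callee side after this change, paraphrased from the zfs_rlock.c hunks above (field initialization is abbreviated, and the AVL insertion and waiting logic, which are unchanged, are omitted; treat this as a sketch rather than the exact patched function):

	rl_t *
	zfs_range_lock(rl_t *new, znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
	{
		ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);

		/* 'new' is caller-owned storage; no kmem_alloc(KM_SLEEP) here anymore. */
		new->r_zp = zp;
		new->r_off = off;
		if (len + off < off)	/* overflow */
			len = UINT64_MAX - off;
		/* ... remaining field setup, AVL insertion, and waiting as before ... */
		return (new);
	}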
28 changes: 14 additions & 14 deletions module/zfs/zfs_vnops.c
@@ -370,7 +370,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
objset_t *os;
ssize_t n, nbytes;
int error = 0;
-rl_t *rl;
+rl_t rl;
#ifdef HAVE_UIO_ZEROCOPY
xuio_t *xuio = NULL;
#endif /* HAVE_UIO_ZEROCOPY */
@@ -418,7 +418,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
/*
* Lock the range against changes.
*/
-rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
+zfs_range_lock(&rl, zp, uio->uio_loffset, uio->uio_resid, RL_READER);

/*
* If we are reading past end-of-file we can skip
@@ -482,7 +482,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
n -= nbytes;
}
out:
-zfs_range_unlock(rl);
+zfs_range_unlock(&rl);

ZFS_ACCESSTIME_STAMP(zsb, zp);
zfs_inode_update(zp);
@@ -524,7 +524,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
zilog_t *zilog;
offset_t woff;
ssize_t n, nbytes;
-rl_t *rl;
+rl_t rl;
int max_blksz = zsb->z_max_blksz;
int error = 0;
arc_buf_t *abuf;
@@ -608,9 +608,9 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
* Obtain an appending range lock to guarantee file append
* semantics. We reset the write offset once we have the lock.
*/
-rl = zfs_range_lock(zp, 0, n, RL_APPEND);
-woff = rl->r_off;
-if (rl->r_len == UINT64_MAX) {
+zfs_range_lock(&rl, zp, 0, n, RL_APPEND);
+woff = rl.r_off;
+if (rl.r_len == UINT64_MAX) {
/*
* We overlocked the file because this write will cause
* the file block size to increase.
@@ -625,11 +625,11 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
* this write, then this range lock will lock the entire file
* so that we can re-write the block safely.
*/
-rl = zfs_range_lock(zp, woff, n, RL_WRITER);
+zfs_range_lock(&rl, zp, woff, n, RL_WRITER);
}

if (woff >= limit) {
-zfs_range_unlock(rl);
+zfs_range_unlock(&rl);
ZFS_EXIT(zsb);
return (EFBIG);
}
@@ -719,7 +719,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
* on the first iteration since zfs_range_reduce() will
* shrink down r_len to the appropriate size.
*/
-if (rl->r_len == UINT64_MAX) {
+if (rl.r_len == UINT64_MAX) {
uint64_t new_blksz;

if (zp->z_blksz > max_blksz) {
@@ -729,7 +729,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
new_blksz = MIN(end_size, max_blksz);
}
zfs_grow_blocksize(zp, new_blksz, tx);
-zfs_range_reduce(rl, woff, n);
+zfs_range_reduce(&rl, woff, n);
}

/*
@@ -842,7 +842,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
uio_prefaultpages(MIN(n, max_blksz), uio);
}

-zfs_range_unlock(rl);
+zfs_range_unlock(&rl);

/*
* If we're in replay mode, or we made no progress, return error.
@@ -946,7 +946,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
-zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
+zfs_range_lock(zgd->zgd_rl, zp, offset, size, RL_READER);
/* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) {
error = ENOENT;
@@ -967,7 +967,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
size = zp->z_blksz;
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
offset -= blkoff;
-zgd->zgd_rl = zfs_range_lock(zp, offset, size,
+zfs_range_lock(zgd->zgd_rl, zp, offset, size,
RL_READER);
if (zp->z_blksz == size)
break;
