Skip to content

Commit

Permalink
zfs_rename: pick up and finish renameat2 flags support
Browse files Browse the repository at this point in the history
Removing new txtypes in favor of compound ZIL operations, see comment in
module/zfs/zfs_log.c.

Signed-off-by: Pavel Snajdr <snajpa@snajpa.net>
  • Loading branch information
snajpa committed Oct 5, 2019
1 parent 612cab3 commit 008bc65
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 68 deletions.
7 changes: 7 additions & 0 deletions include/sys/zfs_znode.h
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,13 @@ extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *dzp, znode_t *zp, char *name);
extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *dzp, znode_t *zp, char *name, char *link);
extern void zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx,
uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp,
char *dname, znode_t *szp);
extern void zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx,
uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp,
char *dname, znode_t *szp, vsecattr_t *vsecp,
zfs_fuid_info_t *fuidp, vattr_t *vap);
extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
Expand Down
4 changes: 1 addition & 3 deletions include/sys/zil.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,7 @@ typedef enum zil_create {
#define TX_MKDIR_ATTR 18 /* mkdir with attr */
#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
#define TX_WRITE2 20 /* dmu_sync EALREADY write */
#define TX_EXCHANGE 21 /* Exchange two paths */
#define TX_WHITEOUT 22 /* Rename a file, leaving a whiteout */
#define TX_MAX_TYPE 23 /* Max transaction type */
#define TX_MAX_TYPE 21 /* Max transaction type */

/*
* The transactions for mkdir, symlink, remove, rmdir, link, and rename
Expand Down
83 changes: 42 additions & 41 deletions module/os/linux/zfs/zfs_vnops.c
Original file line number Diff line number Diff line change
Expand Up @@ -3671,7 +3671,6 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
int error = 0;
int zflg = 0;
boolean_t waited = B_FALSE;
uint64_t txtype;
/* Needed for whiteout inode creation. */
vattr_t wo_vap;
uint64_t wo_projid;
Expand All @@ -3682,13 +3681,6 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
if (snm == NULL || tnm == NULL)
return (SET_ERROR(EINVAL));

if (flags & RENAME_EXCHANGE)
txtype = TX_EXCHANGE;
else if (flags & RENAME_WHITEOUT)
txtype = TX_WHITEOUT;
else
txtype = TX_RENAME;

ZFS_ENTER(zfsvfs);
zilog = zfsvfs->z_log;

Expand Down Expand Up @@ -3887,7 +3879,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
/*
* Source and target must be the same type (unless exchanging).
*/
if (txtype != TX_EXCHANGE) {
if (!(flags & RENAME_EXCHANGE)) {
boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;

Expand All @@ -3907,13 +3899,13 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
}
}
/* Target must exist for RENAME_EXCHANGE. */
if (!tzp && txtype == TX_EXCHANGE) {
if ((flags & RENAME_EXCHANGE) && !tzp) {
error = SET_ERROR(ENOENT);
goto out;
}

/* Set up inode creation for RENAME_WHITEOUT. */
if (txtype == TX_WHITEOUT) {
if (flags & RENAME_WHITEOUT) {
error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr);
if (error)
goto out;
Expand All @@ -3937,7 +3929,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, sdzp->z_id, txtype == TX_EXCHANGE, snm);
dmu_tx_hold_zap(tx, sdzp->z_id, !!(flags & RENAME_EXCHANGE), snm);
dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
if (sdzp != tdzp) {
dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
Expand All @@ -3947,7 +3939,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, tzp);
}
if (txtype == TX_WHITEOUT) {
if (flags & RENAME_WHITEOUT) {
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
ZFS_SA_BASE_ATTR_SIZE);

Expand Down Expand Up @@ -4011,7 +4003,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
if (tzp) {
int tzflg = zflg;

if (txtype == TX_EXCHANGE) {
if (flags & RENAME_EXCHANGE) {
/* This inode will be re-linked soon. */
tzflg |= ZRENAMING;

Expand Down Expand Up @@ -4042,38 +4034,47 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
*/
ASSERT0(error);

switch (txtype) {
case TX_EXCHANGE:
error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
/*
* The same argument as zfs_link_create() failing for
* szp applies here, since the source directory must
* have had an entry we are replacing.
*/
ASSERT3U(error, ==, 0);
if (error)
goto commit_unlink_td_szp;
break;
case TX_WHITEOUT: {
znode_t *wzp;

zfs_mknode(sdzp, &wo_vap, tx, cr, 0, &wzp, &acl_ids);
error = zfs_link_create(sdl, wzp, tx, ZNEW);
if (error) {
zfs_znode_delete(wzp, tx);
remove_inode_hash(ZTOI(wzp));
goto commit_unlink_td_szp;
}
/* No need to zfs_log_create_txtype here. */
}
if (flags & RENAME_EXCHANGE) {
error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
/*
* The same argument as above, we're
* holding dirent_lock, this shouldn't fail.
*/
ASSERT0(error);
} else if (flags & RENAME_WHITEOUT) {
znode_t *wzp;

zfs_mknode(sdzp, &wo_vap, tx, cr, 0, &wzp, &acl_ids);
error = zfs_link_create(sdl, wzp, tx, ZNEW);
/*
* The same argument as above, we're
* holding dirent_lock, this shouldn't fail.
*/
ASSERT0(error);
}

if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);

zfs_log_rename(zilog, tx, txtype |
(flags & FIGNORECASE ? TX_CI : 0), sdzp,
sdl->dl_name, tdzp, tdl->dl_name, szp);
if (flags & RENAME_EXCHANGE) {
zfs_log_rename_exchange(zilog, tx,
(flags & FIGNORECASE ? TX_CI : 0), sdzp,
sdl->dl_name, tdzp, tdl->dl_name, szp);
} else if (flags & RENAME_WHITEOUT) {
vsecattr_t vsecp;

vsecp.vsa_mask |= VSA_ACE_ALLTYPES;
error = zfs_getacl(szp, &vsecp, B_TRUE, cr);

zfs_log_rename_whiteout(zilog, tx,
(flags & FIGNORECASE ? TX_CI : 0), sdzp,
sdl->dl_name, tdzp, tdl->dl_name, szp,
&vsecp, acl_ids.z_fuidp, &wo_vap);
} else {
zfs_log_rename(zilog, tx,
(flags & FIGNORECASE ? TX_CI : 0), sdzp,
sdl->dl_name, tdzp, tdl->dl_name, szp);
}

dmu_tx_commit(tx);
out:
Expand Down
92 changes: 89 additions & 3 deletions module/zfs/zfs_log.c
Original file line number Diff line number Diff line change
Expand Up @@ -474,9 +474,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
}

/*
* Handles TX_{RENAME,EXCHANGE,WHITEOUT} transactions. They all have the same
* underyling structure (lr_rename_t) but have different txtypes to indicate
* different renameat2(2) flags.
* Handles TX_RENAME transactions.
*/
void
zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
Expand All @@ -490,6 +488,7 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
if (zil_replaying(zilog, tx))
return;

txtype |= TX_RENAME;
itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
lr = (lr_rename_t *)&itx->itx_lr;
lr->lr_sdoid = sdzp->z_id;
Expand All @@ -501,6 +500,93 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
zil_itx_assign(zilog, itx, tx);
}

/*
* At the moment, only Linux supports renameat2 variant of renameat, which
* adds three new flags of interest for us:
* RENAME_NOREPLACE: if the target name at the moment of the call exists,
* don't rewrite it and return error
* RENAME_EXCHANGE: atomically swap the two names on the filesystem
* RENAME_WHITEOUT: creates a whiteout inode in place of renamed file as
* an atomic operation
*
* Ideally, these operations should be represented as new ZFS Intent Log record
* types, which should mandate a new ZFS feature flag due to the on-disk format
* change. One would use spa_feature_incr/decr functions to indicate that we're
* actually actively using the new on-disk txtypes - but these functions are
* only supposed to be called from the txg syncing context.
* This means that would need to force out in-progress txg to disk and start
* a new one before writing any ZIL records, just so we can be sure that ZIL
* replaying ZFS gets told it should expect potentially incompatible ZIL
* txtypes. Doing this would hurt performance.
* Alternatively, we could just activate the feature on a pool when these
* renameat2 flags get first used and leave it at that - which would render the
* pool importable read-only on implementations without the new feature flag,
* even when no new txtypes IL records would be present on-disk - which on most
* setups could be 'almost' all the time, so it'd be a shame to have them all
* read-only on non-Linux platforms.
* As a third option, at least until more platforms implement renameat2, we
* choose to rely on the fact that the ZIL is replayed in single-threaded mode
* before the dataset is mounted. This way, we can represent the otherwise
* atomic operations as a series of plain good old txtypes known to all current
* OpenZFS implementations. To do that, we use these hacky functions:
*
* zfs_log_rename_exchange
* zfs_log_rename_whiteout
*
* To represent atomic rename with old non-atomic operations, we need
* a temporary new name; so we try picking a name until we succeed, then
* we get a dirent lock for that temp name until the final itx gets queued
*/

void
zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
{
zfs_dirlock_t *tmpdl;
znode_t *tmpzp = NULL;
char rndname[16];
char *tmpname;
int retries = 0;
int error;

tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
ASSERT3P(tmpname, !=, NULL);

for (int i = 0; i < 16; i++) {
int r = 0xFF;
while (r > 127 || r == 0 || r == '/')
random_get_pseudo_bytes((void *)&r, 1);
rndname[i] = (char)r;
}

do {
retries++;
(void) snprintf(tmpname, MAXPATHLEN,
"%s.zfs_renameat2_emul_%s%4d",
dname, rndname, retries);
error = zfs_dirent_lock(&tmpdl, tdzp, tmpname,
&tmpzp, ZNEW, NULL, NULL);
} while (error != 0 && retries != INT_MAX);
ASSERT3U(retries, !=, INT_MAX);

zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, tmpname, szp);
zfs_log_rename(zilog, tx, txtype, tdzp, tmpname, tdzp, dname, szp);

zfs_dirent_unlock(tmpdl);
kmem_free(tmpname, MAXPATHLEN);
}

/* See comment above zfs_log_rename_exchange */
void
zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp,
vsecattr_t *vsecp, zfs_fuid_info_t *fuidp, vattr_t *vap)
{
zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp);
txtype |= TX_CREATE_ATTR;
zfs_log_create(zilog, tx, txtype, sdzp, szp, sname, vsecp, fuidp, vap);
}

/*
* zfs_log_write() handles TX_WRITE transactions. The specified callback is
* called as soon as the write is on stable storage (be it via a DMU sync or a
Expand Down
23 changes: 2 additions & 21 deletions module/zfs/zfs_replay.c
Original file line number Diff line number Diff line change
Expand Up @@ -625,14 +625,15 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
}

static int
_zfs_replay_renameat2(void *arg1, void *arg2, boolean_t byteswap, int vflg)
zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
{
zfsvfs_t *zfsvfs = arg1;
lr_rename_t *lr = arg2;
char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
char *tname = sname + strlen(sname) + 1;
znode_t *sdzp, *tdzp;
int error;
int vflg = 0;

if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
Expand All @@ -656,24 +657,6 @@ _zfs_replay_renameat2(void *arg1, void *arg2, boolean_t byteswap, int vflg)
return (error);
}

static int
zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
{
return (_zfs_replay_renameat2(arg1, arg2, byteswap, 0));
}

static int
zfs_replay_exchange(void *arg1, void *arg2, boolean_t byteswap)
{
return (_zfs_replay_renameat2(arg1, arg2, byteswap, RENAME_EXCHANGE));
}

static int
zfs_replay_whiteout(void *arg1, void *arg2, boolean_t byteswap)
{
return (_zfs_replay_renameat2(arg1, arg2, byteswap, RENAME_WHITEOUT));
}

static int
zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
Expand Down Expand Up @@ -998,6 +981,4 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
zfs_replay_create, /* TX_MKDIR_ATTR */
zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
zfs_replay_write2, /* TX_WRITE2 */
zfs_replay_exchange, /* TX_EXCHANGE */
zfs_replay_whiteout, /* TX_WHITEOUT */
};

0 comments on commit 008bc65

Please sign in to comment.