diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index 734499341fd1..6bfdc05cf434 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -402,6 +402,13 @@ extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name); extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name, char *link); +extern void zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, + uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, + char *dname, znode_t *szp); +extern void zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, + uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, + char *dname, znode_t *szp, vsecattr_t *vsecp, + zfs_fuid_info_t *fuidp, vattr_t *vap); extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, diff --git a/include/sys/zil.h b/include/sys/zil.h index f27345b364e1..6b038a9dd228 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -162,9 +162,7 @@ typedef enum zil_create { #define TX_MKDIR_ATTR 18 /* mkdir with attr */ #define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ #define TX_WRITE2 20 /* dmu_sync EALREADY write */ -#define TX_EXCHANGE 21 /* Exchange two paths */ -#define TX_WHITEOUT 22 /* Rename a file, leaving a whiteout */ -#define TX_MAX_TYPE 23 /* Max transaction type */ +#define TX_MAX_TYPE 21 /* Max transaction type */ /* * The transactions for mkdir, symlink, remove, rmdir, link, and rename diff --git a/module/os/linux/zfs/zfs_vnops.c b/module/os/linux/zfs/zfs_vnops.c index a11a18126f22..423a89dee069 100644 --- a/module/os/linux/zfs/zfs_vnops.c +++ b/module/os/linux/zfs/zfs_vnops.c @@ -3671,7 +3671,6 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, int error = 0; int zflg = 0; boolean_t waited = B_FALSE; - uint64_t txtype; /* Needed for whiteout inode creation. */ vattr_t wo_vap; uint64_t wo_projid; @@ -3682,13 +3681,6 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); - if (flags & RENAME_EXCHANGE) - txtype = TX_EXCHANGE; - else if (flags & RENAME_WHITEOUT) - txtype = TX_WHITEOUT; - else - txtype = TX_RENAME; - ZFS_ENTER(zfsvfs); zilog = zfsvfs->z_log; @@ -3887,7 +3879,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, /* * Source and target must be the same type (unless exchanging). */ - if (txtype != TX_EXCHANGE) { + if (!(flags & RENAME_EXCHANGE)) { boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; @@ -3907,13 +3899,13 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, } } /* Target must exist for RENAME_EXCHANGE. */ - if (!tzp && txtype == TX_EXCHANGE) { + if ((flags & RENAME_EXCHANGE) && !tzp) { error = SET_ERROR(ENOENT); goto out; } /* Set up inode creation for RENAME_WHITEOUT. */ - if (txtype == TX_WHITEOUT) { + if (flags & RENAME_WHITEOUT) { error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr); if (error) goto out; @@ -3937,7 +3929,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, sdzp->z_id, txtype == TX_EXCHANGE, snm); + dmu_tx_hold_zap(tx, sdzp->z_id, !!(flags & RENAME_EXCHANGE), snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); @@ -3947,7 +3939,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } - if (txtype == TX_WHITEOUT) { + if (flags & RENAME_WHITEOUT) { dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); @@ -4011,7 +4003,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, if (tzp) { int tzflg = zflg; - if (txtype == TX_EXCHANGE) { + if (flags & RENAME_EXCHANGE) { /* This inode will be re-linked soon. */ tzflg |= ZRENAMING; @@ -4042,38 +4034,47 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, */ ASSERT0(error); - switch (txtype) { - case TX_EXCHANGE: - error = zfs_link_create(sdl, tzp, tx, ZRENAMING); - /* - * The same argument as zfs_link_create() failing for - * szp applies here, since the source directory must - * have had an entry we are replacing. - */ - ASSERT3U(error, ==, 0); - if (error) - goto commit_unlink_td_szp; - break; - case TX_WHITEOUT: { - znode_t *wzp; - - zfs_mknode(sdzp, &wo_vap, tx, cr, 0, &wzp, &acl_ids); - error = zfs_link_create(sdl, wzp, tx, ZNEW); - if (error) { - zfs_znode_delete(wzp, tx); - remove_inode_hash(ZTOI(wzp)); - goto commit_unlink_td_szp; - } - /* No need to zfs_log_create_txtype here. */ - } + if (flags & RENAME_EXCHANGE) { + error = zfs_link_create(sdl, tzp, tx, ZRENAMING); + /* + * The same argument as above, we're + * holding dirent_lock, this shouldn't fail. + */ + ASSERT0(error); + } else if (flags & RENAME_WHITEOUT) { + znode_t *wzp; + + zfs_mknode(sdzp, &wo_vap, tx, cr, 0, &wzp, &acl_ids); + error = zfs_link_create(sdl, wzp, tx, ZNEW); + /* + * The same argument as above, we're + * holding dirent_lock, this shouldn't fail. + */ + ASSERT0(error); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - zfs_log_rename(zilog, tx, txtype | - (flags & FIGNORECASE ? TX_CI : 0), sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); + if (flags & RENAME_EXCHANGE) { + zfs_log_rename_exchange(zilog, tx, + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp); + } else if (flags & RENAME_WHITEOUT) { + vsecattr_t vsecp; + + vsecp.vsa_mask |= VSA_ACE_ALLTYPES; + error = zfs_getacl(szp, &vsecp, B_TRUE, cr); + + zfs_log_rename_whiteout(zilog, tx, + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp, + &vsecp, acl_ids.z_fuidp, &wo_vap); + } else { + zfs_log_rename(zilog, tx, + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp); + } dmu_tx_commit(tx); out: diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 328faf30d5df..aae8f69f1446 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -474,9 +474,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, } /* - * Handles TX_{RENAME,EXCHANGE,WHITEOUT} transactions. They all have the same - * underyling structure (lr_rename_t) but have different txtypes to indicate - * different renameat2(2) flags. + * Handles TX_RENAME transactions. */ void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, @@ -490,6 +488,7 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, if (zil_replaying(zilog, tx)) return; + txtype |= TX_RENAME; itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); lr = (lr_rename_t *)&itx->itx_lr; lr->lr_sdoid = sdzp->z_id; @@ -501,6 +500,93 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, zil_itx_assign(zilog, itx, tx); } +/* + * At the moment, only Linux supports renameat2 variant of renameat, which + * adds three new flags of interest for us: + * RENAME_NOREPLACE: if the target name at the moment of the call exists, + * don't rewrite it and return error + * RENAME_EXCHANGE: atomically swap the two names on the filesystem + * RENAME_WHITEOUT: creates a whiteout inode in place of renamed file as + * an atomic operation + * + * Ideally, these operations should be represented as new ZFS Intent Log record + * types, which should mandate a new ZFS feature flag due to the on-disk format + * change. One would use spa_feature_incr/decr functions to indicate that we're + * actually actively using the new on-disk txtypes - but these functions are + * only supposed to be called from the txg syncing context. + * This means that would need to force out in-progress txg to disk and start + * a new one before writing any ZIL records, just so we can be sure that ZIL + * replaying ZFS gets told it should expect potentially incompatible ZIL + * txtypes. Doing this would hurt performance. + * Alternatively, we could just activate the feature on a pool when these + * renameat2 flags get first used and leave it at that - which would render the + * pool importable read-only on implementations without the new feature flag, + * even when no new txtypes IL records would be present on-disk - which on most + * setups could be 'almost' all the time, so it'd be a shame to have them all + * read-only on non-Linux platforms. + * As a third option, at least until more platforms implement renameat2, we + * choose to rely on the fact that the ZIL is replayed in single-threaded mode + * before the dataset is mounted. This way, we can represent the otherwise + * atomic operations as a series of plain good old txtypes known to all current + * OpenZFS implementations. To do that, we use these hacky functions: + * + * zfs_log_rename_exchange + * zfs_log_rename_whiteout + * + * To represent atomic rename with old non-atomic operations, we need + * a temporary new name; so we try picking a name until we succeed, then + * we get a dirent lock for that temp name until the final itx gets queued + */ + +void +zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) +{ + zfs_dirlock_t *tmpdl; + znode_t *tmpzp = NULL; + char rndname[16]; + char *tmpname; + int retries = 0; + int error; + + tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + ASSERT3P(tmpname, !=, NULL); + + for (int i = 0; i < 16; i++) { + int r = 0xFF; + while (r > 127 || r == 0 || r == '/') + random_get_pseudo_bytes((void *)&r, 1); + rndname[i] = (char)r; + } + + do { + retries++; + (void) snprintf(tmpname, MAXPATHLEN, + "%s.zfs_renameat2_emul_%s%4d", + dname, rndname, retries); + error = zfs_dirent_lock(&tmpdl, tdzp, tmpname, + &tmpzp, ZNEW, NULL, NULL); + } while (error != 0 && retries != INT_MAX); + ASSERT3U(retries, !=, INT_MAX); + + zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, tmpname, szp); + zfs_log_rename(zilog, tx, txtype, tdzp, tmpname, tdzp, dname, szp); + + zfs_dirent_unlock(tmpdl); + kmem_free(tmpname, MAXPATHLEN); +} + +/* See comment above zfs_log_rename_exchange */ +void +zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp, + vsecattr_t *vsecp, zfs_fuid_info_t *fuidp, vattr_t *vap) +{ + zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); + txtype |= TX_CREATE_ATTR; + zfs_log_create(zilog, tx, txtype, sdzp, szp, sname, vsecp, fuidp, vap); +} + /* * zfs_log_write() handles TX_WRITE transactions. The specified callback is * called as soon as the write is on stable storage (be it via a DMU sync or a diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index fd31417c102e..7dea85bb6614 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -625,7 +625,7 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) } static int -_zfs_replay_renameat2(void *arg1, void *arg2, boolean_t byteswap, int vflg) +zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; @@ -633,6 +633,7 @@ _zfs_replay_renameat2(void *arg1, void *arg2, boolean_t byteswap, int vflg) char *tname = sname + strlen(sname) + 1; znode_t *sdzp, *tdzp; int error; + int vflg = 0; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -656,24 +657,6 @@ _zfs_replay_renameat2(void *arg1, void *arg2, boolean_t byteswap, int vflg) return (error); } -static int -zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) -{ - return (_zfs_replay_renameat2(arg1, arg2, byteswap, 0)); -} - -static int -zfs_replay_exchange(void *arg1, void *arg2, boolean_t byteswap) -{ - return (_zfs_replay_renameat2(arg1, arg2, byteswap, RENAME_EXCHANGE)); -} - -static int -zfs_replay_whiteout(void *arg1, void *arg2, boolean_t byteswap) -{ - return (_zfs_replay_renameat2(arg1, arg2, byteswap, RENAME_WHITEOUT)); -} - static int zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) { @@ -998,6 +981,4 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ zfs_replay_write2, /* TX_WRITE2 */ - zfs_replay_exchange, /* TX_EXCHANGE */ - zfs_replay_whiteout, /* TX_WHITEOUT */ };