From 147e9581256cce16eafac20c331250798e6d0339 Mon Sep 17 00:00:00 2001 From: Pavel Snajdr Date: Thu, 5 Dec 2019 01:52:27 +0100 Subject: [PATCH 1/4] Remove zpl_revalidate: fix snapshot rollback Open files which aren't present in the snapshot being rolled back to need to disappear from the visible VFS image of the dataset. The kernel provides the d_drop function to drop an invalid entry from the dcache, but an inode can be referenced by multiple dentries. The introduced zpl_d_drop_aliases function walks and invalidates all aliases of an inode. Signed-off-by: Pavel Snajdr --- config/kernel-dentry-alias.m4 | 30 +++++++++++++ config/kernel.m4 | 2 + include/os/linux/kernel/linux/dcache_compat.h | 21 +++++++++ include/os/linux/zfs/sys/trace_acl.h | 9 ++-- include/os/linux/zfs/sys/zpl.h | 3 +- include/sys/zfs_znode.h | 1 - module/os/linux/zfs/zfs_ctldir.c | 1 - module/os/linux/zfs/zfs_vfsops.c | 3 +- module/os/linux/zfs/zfs_znode.c | 1 - module/os/linux/zfs/zpl_inode.c | 44 ------------------- 10 files changed, 59 insertions(+), 56 deletions(-) create mode 100644 config/kernel-dentry-alias.m4 diff --git a/config/kernel-dentry-alias.m4 b/config/kernel-dentry-alias.m4 new file mode 100644 index 000000000000..f0ddb8d010b0 --- /dev/null +++ b/config/kernel-dentry-alias.m4 @@ -0,0 +1,30 @@ +dnl # +dnl # 3.18 API change +dnl # Dentry aliases are in d_u struct dentry member +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_DENTRY_ALIAS_D_U], [ + ZFS_LINUX_TEST_SRC([dentry_alias_d_u], [ + #include + #include + #include + ], [ + struct inode *inode __attribute__ ((unused)) = NULL; + struct dentry *dentry __attribute__ ((unused)) = NULL; + hlist_for_each_entry(dentry, &inode->i_dentry, + d_u.d_alias) { + d_drop(dentry); + } + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_DENTRY_ALIAS_D_U], [ + AC_MSG_CHECKING([whether dentry aliases are in d_u member]) + ZFS_LINUX_TEST_RESULT([dentry_alias_d_u], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_DENTRY_D_U_ALIASES, 1, + [dentry aliases are in d_u member]) + ],[ + 
AC_MSG_RESULT(no) + ]) +]) + diff --git a/config/kernel.m4 b/config/kernel.m4 index d4d13ddd1d1a..b573881c4400 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -96,6 +96,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_SETATTR_PREPARE ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED ZFS_AC_KERNEL_SRC_DENTRY + ZFS_AC_KERNEL_SRC_DENTRY_ALIAS_D_U ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SRC_SECURITY_INODE ZFS_AC_KERNEL_SRC_FST_MOUNT @@ -217,6 +218,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_SETATTR_PREPARE ZFS_AC_KERNEL_INSERT_INODE_LOCKED ZFS_AC_KERNEL_DENTRY + ZFS_AC_KERNEL_DENTRY_ALIAS_D_U ZFS_AC_KERNEL_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SECURITY_INODE ZFS_AC_KERNEL_FST_MOUNT diff --git a/include/os/linux/kernel/linux/dcache_compat.h b/include/os/linux/kernel/linux/dcache_compat.h index 0fbd92458679..4de1118daafa 100644 --- a/include/os/linux/kernel/linux/dcache_compat.h +++ b/include/os/linux/kernel/linux/dcache_compat.h @@ -61,4 +61,25 @@ d_clear_d_op(struct dentry *dentry) DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE); } +/* + * Walk and invalidate all dentry aliases of an inode + * unless it's a mountpoint + */ +static inline void +zpl_d_drop_aliases(struct inode *inode) +{ + struct dentry *dentry; + spin_lock(&inode->i_lock); +#ifdef HAVE_DENTRY_D_U_ALIASES + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { +#else + hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { +#endif + if (!IS_ROOT(dentry) && !d_mountpoint(dentry) && + (dentry->d_inode == inode)) { + d_drop(dentry); + } + } + spin_unlock(&inode->i_lock); +} #endif /* _ZFS_DCACHE_H */ diff --git a/include/os/linux/zfs/sys/trace_acl.h b/include/os/linux/zfs/sys/trace_acl.h index 6a73545fecda..2c734322267a 100644 --- a/include/os/linux/zfs/sys/trace_acl.h +++ b/include/os/linux/zfs/sys/trace_acl.h @@ -64,7 +64,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __field(boolean_t, z_is_sa) __field(boolean_t, z_is_mapped) __field(boolean_t, z_is_ctldir) - __field(boolean_t, 
z_is_stale) __field(uint32_t, i_uid) __field(uint32_t, i_gid) @@ -99,7 +98,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_is_sa = zn->z_is_sa; __entry->z_is_mapped = zn->z_is_mapped; __entry->z_is_ctldir = zn->z_is_ctldir; - __entry->z_is_stale = zn->z_is_stale; __entry->i_uid = KUID_TO_SUID(ZTOI(zn)->i_uid); __entry->i_gid = KGID_TO_SGID(ZTOI(zn)->i_gid); @@ -121,9 +119,8 @@ DECLARE_EVENT_CLASS(zfs_ace_class, "zn_prefetch %u blksz %u seq %u " "mapcnt %llu size %llu pflags %llu " "sync_cnt %u sync_writes_cnt %u async_writes_cnt %u " - "mode 0x%x is_sa %d is_mapped %d " - "is_ctldir %d is_stale %d inode { " - "uid %u gid %u ino %lu nlink %u size %lli " + "mode 0x%x is_sa %d is_mapped %d is_ctldir %d " + "inode { uid %u gid %u ino %lu nlink %u size %lli " "blkbits %u bytes %u mode 0x%x generation %x } } " "ace { type %u flags %u access_mask %u } mask_matched %u", __entry->z_id, __entry->z_unlinked, __entry->z_atime_dirty, @@ -132,7 +129,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_pflags, __entry->z_sync_cnt, __entry->z_sync_writes_cnt, __entry->z_async_writes_cnt, __entry->z_mode, __entry->z_is_sa, __entry->z_is_mapped, - __entry->z_is_ctldir, __entry->z_is_stale, __entry->i_uid, + __entry->z_is_ctldir, __entry->i_uid, __entry->i_gid, __entry->i_ino, __entry->i_nlink, __entry->i_size, __entry->i_blkbits, __entry->i_bytes, __entry->i_mode, __entry->i_generation, diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 30d73db6b9e8..83416d64744c 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -45,7 +45,8 @@ extern const struct inode_operations zpl_inode_operations; extern const struct inode_operations zpl_dir_inode_operations; extern const struct inode_operations zpl_symlink_inode_operations; extern const struct inode_operations zpl_special_inode_operations; -extern dentry_operations_t zpl_dentry_operations; + +/* zpl_file.c */ extern const struct address_space_operations 
zpl_address_space_operations; extern const struct file_operations zpl_file_operations; extern const struct file_operations zpl_dir_file_operations; diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index 7c906050bc47..c8656b3f6162 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -190,7 +190,6 @@ typedef struct znode { boolean_t z_is_sa; /* are we native sa? */ boolean_t z_is_mapped; /* are we mmap'ed */ boolean_t z_is_ctldir; /* are we .zfs entry */ - boolean_t z_is_stale; /* are we stale due to rollback? */ boolean_t z_suspended; /* extra ref from a suspend? */ uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 4ae0a65370e5..519f13212fac 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -487,7 +487,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, zp->z_is_sa = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_TRUE; - zp->z_is_stale = B_FALSE; zp->z_sa_hdl = NULL; zp->z_blksz = 0; zp->z_seq = 0; diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 7b682e49d84e..c921e587c75c 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1522,7 +1522,6 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) sb->s_op = &zpl_super_operations; sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; - sb->s_d_op = &zpl_dentry_operations; /* Set features for file system. 
*/ zfs_set_fuid_feature(zfsvfs); @@ -1881,8 +1880,8 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) zp = list_next(&zfsvfs->z_all_znodes, zp)) { err2 = zfs_rezget(zp); if (err2) { + zpl_d_drop_aliases(ZTOI(zp)); remove_inode_hash(ZTOI(zp)); - zp->z_is_stale = B_TRUE; } /* see comment in zfs_suspend_fs() */ diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index 9aeffba86150..3ded79a30a6f 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -552,7 +552,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, zp->z_atime_dirty = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_FALSE; - zp->z_is_stale = B_FALSE; zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; zp->z_mapcnt = 0; diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index 8d073ff8cbd3..9b702c535ea7 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -728,46 +728,6 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) return (error); } -static int -#ifdef HAVE_D_REVALIDATE_NAMEIDATA -zpl_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - unsigned int flags = (nd ? nd->flags : 0); -#else -zpl_revalidate(struct dentry *dentry, unsigned int flags) -{ -#endif /* HAVE_D_REVALIDATE_NAMEIDATA */ - /* CSTYLED */ - zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; - int error; - - if (flags & LOOKUP_RCU) - return (-ECHILD); - - /* - * After a rollback negative dentries created before the rollback - * time must be invalidated. Otherwise they can obscure files which - * are only present in the rolled back dataset. 
- */ - if (dentry->d_inode == NULL) { - spin_lock(&dentry->d_lock); - error = time_before(dentry->d_time, zfsvfs->z_rollback_time); - spin_unlock(&dentry->d_lock); - - if (error) - return (0); - } - - /* - * The dentry may reference a stale inode if a mounted file system - * was rolled back to a point in time where the object didn't exist. - */ - if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale) - return (0); - - return (1); -} - const struct inode_operations zpl_inode_operations = { .setattr = zpl_setattr, .getattr = zpl_getattr, @@ -856,7 +816,3 @@ const struct inode_operations zpl_special_inode_operations = { .get_acl = zpl_get_acl, #endif /* CONFIG_FS_POSIX_ACL */ }; - -dentry_operations_t zpl_dentry_operations = { - .d_revalidate = zpl_revalidate, -}; From 7d57ff21807531617c75011c38cd672af01e87c2 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 18 May 2022 20:29:33 +1000 Subject: [PATCH 2/4] debug: add VERIFY_{IMPLY,EQUIV} variants This allows for much cleaner VERIFY-level assertions. 
Signed-off-by: Aleksa Sarai --- include/os/linux/spl/sys/debug.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/include/os/linux/spl/sys/debug.h b/include/os/linux/spl/sys/debug.h index 3c6f6d1b83bb..8bdc0b1d72d5 100644 --- a/include/os/linux/spl/sys/debug.h +++ b/include/os/linux/spl/sys/debug.h @@ -140,6 +140,16 @@ spl_assert(const char *buf, const char *file, const char *func, int line) (long long) (_verify3_right)); \ } while (0) +#define VERIFY_IMPLY(A, B) \ + ((void)(likely((!(A)) || (B)) || \ + spl_assert("(" #A ") implies (" #B ")", \ + __FILE__, __FUNCTION__, __LINE__))) + +#define VERIFY_EQUIV(A, B) \ + ((void)(likely(!!(A) == !!(B)) || \ + spl_assert("(" #A ") is equivalent to (" #B ")", \ + __FILE__, __FUNCTION__, __LINE__))) + /* * Debugging disabled (--disable-debug) */ @@ -171,14 +181,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #define ASSERT3P VERIFY3P #define ASSERT0 VERIFY0 #define ASSERT VERIFY -#define IMPLY(A, B) \ - ((void)(likely((!(A)) || (B)) || \ - spl_assert("(" #A ") implies (" #B ")", \ - __FILE__, __FUNCTION__, __LINE__))) -#define EQUIV(A, B) \ - ((void)(likely(!!(A) == !!(B)) || \ - spl_assert("(" #A ") is equivalent to (" #B ")", \ - __FILE__, __FUNCTION__, __LINE__))) +#define IMPLY VERIFY_IMPLY +#define EQUIV VERIFY_EQUIV #endif /* NDEBUG */ From 8b8e122aebf205a6802b76195808b04d0f69489a Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 26 Apr 2019 23:23:07 +1000 Subject: [PATCH 3/4] zfs_rename: restructure to have cleaner fallbacks This is in preparation for RENAME_EXCHANGE and RENAME_WHITEOUT support for ZoL, but the changes here allow for far nicer fallbacks than the previous implementation (the source and target are re-linked in case of the final link failing). In addition, a small cleanup was done for the "target exists but is a different type" codepath so that it's more understandable. 
Signed-off-by: Aleksa Sarai --- include/os/linux/zfs/sys/zfs_dir.h | 1 + module/os/linux/zfs/zfs_dir.c | 95 +++++++++++++++++------ module/os/linux/zfs/zfs_vnops_os.c | 119 ++++++++++++++++------------- 3 files changed, 139 insertions(+), 76 deletions(-) diff --git a/include/os/linux/zfs/sys/zfs_dir.h b/include/os/linux/zfs/sys/zfs_dir.h index 91d873ea4b7f..9b2232c68ba4 100644 --- a/include/os/linux/zfs/sys/zfs_dir.h +++ b/include/os/linux/zfs/sys/zfs_dir.h @@ -52,6 +52,7 @@ extern "C" { extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, int, int *, pathname_t *); extern void zfs_dirent_unlock(zfs_dirlock_t *); +extern int zfs_drop_nlink(znode_t *, dmu_tx_t *, boolean_t *); extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, boolean_t *); diff --git a/module/os/linux/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c index 611a2471dd94..fb6c28f95c3b 100644 --- a/module/os/linux/zfs/zfs_dir.c +++ b/module/os/linux/zfs/zfs_dir.c @@ -926,6 +926,74 @@ zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, return (error); } +static int +zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); + boolean_t unlinked = B_FALSE; + sa_bulk_attr_t bulk[3]; + uint64_t mtime[2], ctime[2]; + uint64_t links; + int count = 0; + int error; + + if (zp_is_dir && !zfs_dirempty(zp)) + return (SET_ERROR(ENOTEMPTY)); + + if (ZTOI(zp)->i_nlink <= zp_is_dir) { + zfs_panic_recover("zfs: link count on %lu is %u, " + "should be at least %u", zp->z_id, + (int)ZTOI(zp)->i_nlink, zp_is_dir + 1); + set_nlink(ZTOI(zp), zp_is_dir + 1); + } + drop_nlink(ZTOI(zp)); + if (ZTOI(zp)->i_nlink == zp_is_dir) { + zp->z_unlinked = B_TRUE; + clear_nlink(ZTOI(zp)); + unlinked = B_TRUE; + } else { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, sizeof (ctime)); + 
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + links = ZTOI(zp)->i_nlink; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &links, sizeof (links)); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT3U(error, ==, 0); + + if (unlinkedp != NULL) + *unlinkedp = unlinked; + else if (unlinked) + zfs_unlinked_add(zp, tx); + + return (0); +} + +/* + * Forcefully drop an nlink reference from (zp) and mark it for deletion if it + * was the last link. This *must* only be done to znodes which have already + * been zfs_link_destroy()'d with ZRENAMING. This is explicitly only used in + * the error path of zfs_rename(), where we have to correct the nlink count if + * we failed to link the target as well as failing to re-link the original + * znodes. + */ +int +zfs_drop_nlink(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) +{ + int error; + + mutex_enter(&zp->z_lock); + error = zfs_drop_nlink_locked(zp, tx, unlinkedp); + mutex_exit(&zp->z_lock); + + return (error); +} + /* * Unlink zp from dl, and mark zp for deletion if this was the last link. Can * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY). 
@@ -966,31 +1034,8 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, return (error); } - if (ZTOI(zp)->i_nlink <= zp_is_dir) { - zfs_panic_recover("zfs: link count on %lu is %u, " - "should be at least %u", zp->z_id, - (int)ZTOI(zp)->i_nlink, zp_is_dir + 1); - set_nlink(ZTOI(zp), zp_is_dir + 1); - } - drop_nlink(ZTOI(zp)); - if (ZTOI(zp)->i_nlink == zp_is_dir) { - zp->z_unlinked = B_TRUE; - clear_nlink(ZTOI(zp)); - unlinked = B_TRUE; - } else { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), - NULL, &ctime, sizeof (ctime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), - NULL, &zp->z_pflags, sizeof (zp->z_pflags)); - zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, - ctime); - } - links = ZTOI(zp)->i_nlink; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), - NULL, &links, sizeof (links)); - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - count = 0; - ASSERT(error == 0); + /* The only error is !zfs_dirempty() and we checked earlier. */ + ASSERT3U(zfs_drop_nlink_locked(zp, tx, &unlinked), ==, 0); mutex_exit(&zp->z_lock); } else { error = zfs_dropname(dl, zp, dzp, tx, flag); diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 9160f3e77390..f02cefea222b 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -2876,16 +2876,12 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, /* * Source and target must be the same type. */ - if (S_ISDIR(ZTOI(szp)->i_mode)) { - if (!S_ISDIR(ZTOI(tzp)->i_mode)) { - error = SET_ERROR(ENOTDIR); - goto out; - } - } else { - if (S_ISDIR(ZTOI(tzp)->i_mode)) { - error = SET_ERROR(EISDIR); - goto out; - } + boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; + boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; + + if (s_is_dir != t_is_dir) { + error = SET_ERROR(s_is_dir ? 
ENOTDIR : EISDIR); + goto out; } /* * POSIX dictates that when the source and target @@ -2941,51 +2937,49 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, return (error); } - if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + /* + * Unlink the source. + */ + szp->z_pflags |= ZFS_AV_MODIFIED; + if (tdzp->z_pflags & ZFS_PROJINHERIT) + szp->z_pflags |= ZFS_PROJINHERIT; - if (error == 0) { - error = zfs_link_create(tdl, szp, tx, ZRENAMING); - if (error == 0) { - szp->z_pflags |= ZFS_AV_MODIFIED; - if (tdzp->z_pflags & ZFS_PROJINHERIT) - szp->z_pflags |= ZFS_PROJINHERIT; - - error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), - (void *)&szp->z_pflags, sizeof (uint64_t), tx); - ASSERT0(error); + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + ASSERT0(error); - error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); - if (error == 0) { - zfs_log_rename(zilog, tx, TX_RENAME | - (flags & FIGNORECASE ? TX_CI : 0), sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); - } else { - /* - * At this point, we have successfully created - * the target name, but have failed to remove - * the source name. Since the create was done - * with the ZRENAMING flag, there are - * complications; for one, the link count is - * wrong. The easiest way to deal with this - * is to remove the newly created target, and - * return the original error. This must - * succeed; fortunately, it is very unlikely to - * fail, since we just created it. - */ - VERIFY3U(zfs_link_destroy(tdl, szp, tx, - ZRENAMING, NULL), ==, 0); - } - } else { - /* - * If we had removed the existing target, subsequent - * call to zfs_link_create() to add back the same entry - * but, the new dnode (szp) should not fail. - */ - ASSERT(tzp == NULL); - } + error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); + if (error) + goto commit; + + /* + * Unlink the target. 
+ */ + if (tzp) { + error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + if (error) + goto commit_link_szp; + } + + /* + * Create a new link at the target. + */ + error = zfs_link_create(tdl, szp, tx, ZRENAMING); + if (error) { + /* + * If we have removed the existing target, a subsequent call to + * zfs_link_create() to add back the same entry, but with a new + * dnode (szp), should not fail. + */ + ASSERT3P(tzp, ==, NULL); + goto commit_link_tzp; } + zfs_log_rename(zilog, tx, TX_RENAME | + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp); + +commit: dmu_tx_commit(tx); out: if (zl != NULL) @@ -3013,6 +3007,29 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, zfs_exit(zfsvfs, FTAG); return (error); + + /* + * Clean-up path for broken link state. + * + * At this point we are in a (very) bad state, so we need to do our + * best to correct the state. In particular, the nlink of szp is wrong + * because we were destroying and creating links with ZRENAMING. + * + * link_create()s are allowed to fail (though they shouldn't because we + * only just unlinked them and are putting the entries back during + * clean-up). But if they fail, we can just forcefully drop the nlink + * value to (at the very least) avoid broken nlink values -- though in + * the case of non-empty directories we will have to panic. + */ +commit_link_tzp: + if (tzp) { + if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) + VERIFY3U(zfs_drop_nlink(tzp, tx, NULL), ==, 0); + } +commit_link_szp: + if (zfs_link_create(sdl, szp, tx, ZRENAMING)) + VERIFY3U(zfs_drop_nlink(szp, tx, NULL), ==, 0); + goto commit; } /* From 0f88b9a396ad6aaa264dadd6e5b56a35d0b4e99b Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 22 Jun 2019 10:35:11 +1000 Subject: [PATCH 4/4] zfs_rename: support RENAME_* flags Implement support for Linux's RENAME_* flags (for renameat2). 
Aside from being quite useful for userspace (providing race-free ways to exchange paths and implement mv --no-clobber), they are used by overlayfs and are thus required in order to use overlayfs-on-ZFS. In order for us to represent the new renameat2(2) flags in the ZIL, we create two new transaction types for the two flags which need transactional-level support (RENAME_EXCHANGE and RENAME_WHITEOUT). RENAME_NOREPLACE does not need any ZIL support because we know that if the operation succeeded before creating the ZIL entry, there was no file to be clobbered and thus it can be treated as a regular TX_RENAME. Cc: Pavel Snajdr Signed-off-by: Aleksa Sarai --- AUTHORS | 2 + cmd/zdb/zdb_il.c | 10 + cmd/ztest.c | 2 + config/kernel-rename.m4 | 71 +++++- include/os/freebsd/zfs/sys/zfs_vnops_os.h | 3 +- include/os/linux/kernel/linux/vfs_compat.h | 13 ++ include/os/linux/spl/sys/sysmacros.h | 10 + include/os/linux/zfs/sys/zfs_vnops_os.h | 3 +- include/os/linux/zfs/sys/zpl.h | 4 + include/sys/zfs_znode.h | 6 + include/sys/zil.h | 17 +- module/os/freebsd/zfs/zfs_vnops_os.c | 5 +- module/os/linux/zfs/zfs_dir.c | 3 +- module/os/linux/zfs/zfs_vnops_os.c | 205 +++++++++++++++--- module/os/linux/zfs/zfs_znode.c | 5 + module/os/linux/zfs/zpl_inode.c | 37 +++- module/zfs/zfs_log.c | 84 ++++++- module/zfs/zfs_replay.c | 106 ++++++++- module/zfs/zil.c | 6 +- module/zfs/zvol.c | 2 + tests/runfiles/linux.run | 4 + tests/test-runner/bin/zts-report.py.in | 6 + tests/zfs-tests/cmd/.gitignore | 1 + tests/zfs-tests/cmd/Makefile.am | 3 +- tests/zfs-tests/cmd/renameat2.c | 128 +++++++++++ tests/zfs-tests/include/commands.cfg | 1 + .../tests/functional/renameat2/Makefile.am | 7 + .../tests/functional/renameat2/cleanup.ksh | 34 +++ .../renameat2/renameat2_exchange.ksh | 61 ++++++ .../renameat2/renameat2_noreplace.ksh | 51 +++++ .../renameat2/renameat2_whiteout.ksh | 50 +++++ .../tests/functional/renameat2/setup.ksh | 37 ++++ .../functional/slog/slog_replay_fs_001.ksh | 23 ++ 33 files changed, 929 
insertions(+), 71 deletions(-) create mode 100644 tests/zfs-tests/cmd/renameat2.c create mode 100644 tests/zfs-tests/tests/functional/renameat2/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/renameat2/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/renameat2/renameat2_exchange.ksh create mode 100755 tests/zfs-tests/tests/functional/renameat2/renameat2_noreplace.ksh create mode 100755 tests/zfs-tests/tests/functional/renameat2/renameat2_whiteout.ksh create mode 100755 tests/zfs-tests/tests/functional/renameat2/setup.ksh diff --git a/AUTHORS b/AUTHORS index 86083ba87715..c2af58d75085 100644 --- a/AUTHORS +++ b/AUTHORS @@ -20,6 +20,7 @@ CONTRIBUTORS: Alec Salazar Alejandro R. Sedeño Alek Pinchuk + Aleksa Sarai Alex Braunegg Alex McWhirter Alex Reece @@ -236,6 +237,7 @@ CONTRIBUTORS: Paul Dagnelie Paul Zuchowski Pavel Boldin + Pavel Snajdr Pavel Zakharov Pawel Jakub Dawidek Pedro Giffuni diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index 02cc10fb7817..55df1f559f6e 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -128,6 +128,14 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, const void *arg) (void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix, (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid); (void) printf("%ssrc %s tgt %s\n", tab_prefix, snm, tnm); + switch (txtype) { + case TX_RENAME_EXCHANGE: + (void) printf("%sflags RENAME_EXCHANGE\n", tab_prefix); + break; + case TX_RENAME_WHITEOUT: + (void) printf("%sflags RENAME_WHITEOUT\n", tab_prefix); + break; + } } static int @@ -330,6 +338,8 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE2 "}, {.zri_print = zil_prt_rec_setsaxattr, .zri_name = "TX_SETSAXATTR "}, + {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_EXCHANGE "}, + {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_WHITEOUT "}, }; static int diff --git a/cmd/ztest.c b/cmd/ztest.c index a8f9e6b8760a..19edab4eb7a2 100644 --- 
a/cmd/ztest.c +++ b/cmd/ztest.c @@ -2368,6 +2368,8 @@ static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { NULL, /* TX_MKDIR_ACL_ATTR */ NULL, /* TX_WRITE2 */ NULL, /* TX_SETSAXATTR */ + NULL, /* TX_RENAME_EXCHANGE */ + NULL, /* TX_RENAME_WHITEOUT */ }; /* diff --git a/config/kernel-rename.m4 b/config/kernel-rename.m4 index 302db43f5748..a2b0800ab4d2 100644 --- a/config/kernel-rename.m4 +++ b/config/kernel-rename.m4 @@ -1,8 +1,28 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [ + dnl # + dnl # 3.9 (to 4.9) API change, + dnl # + dnl # A new version of iops->rename() was added (rename2) that takes a flag + dnl # argument (to support renameat2). However this separate function was + dnl # merged back into iops->rename() in Linux 4.9. + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_rename2], [ + #include + int rename2_fn(struct inode *sip, struct dentry *sdp, + struct inode *tip, struct dentry *tdp, + unsigned int flags) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .rename2 = rename2_fn, + }; + ],[]) + dnl # dnl # 4.9 API change, - dnl # iops->rename2() merged into iops->rename(), and iops->rename() now wants - dnl # flags. + dnl # + dnl # iops->rename2() merged into iops->rename(), and iops->rename() now + dnl # wants flags. dnl # ZFS_LINUX_TEST_SRC([inode_operations_rename_flags], [ #include @@ -16,11 +36,29 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [ }; ],[]) + dnl # + dnl # EL7 compatibility + dnl # + dnl # EL7 has backported renameat2 support, but it's done by defining a + dnl # separate iops wrapper structure that takes the .rename2 function. 
+ dnl # + ZFS_LINUX_TEST_SRC([dir_inode_operations_wrapper_rename2], [ + #include + int rename2_fn(struct inode *sip, struct dentry *sdp, + struct inode *tip, struct dentry *tdp, + unsigned int flags) { return 0; } + + static const struct inode_operations_wrapper + iops __attribute__ ((unused)) = { + .rename2 = rename2_fn, + }; + ],[]) + dnl # dnl # 5.12 API change, dnl # - dnl # Linux 5.12 introduced passing struct user_namespace* as the first argument - dnl # of the rename() and other inode_operations members. + dnl # Linux 5.12 introduced passing struct user_namespace* as the first + dnl # argument of the rename() and other inode_operations members. dnl # ZFS_LINUX_TEST_SRC([inode_operations_rename_userns], [ #include @@ -44,13 +82,30 @@ AC_DEFUN([ZFS_AC_KERNEL_RENAME], [ ],[ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether iop->rename() wants flags]) - ZFS_LINUX_TEST_RESULT([inode_operations_rename_flags], [ + AC_MSG_CHECKING([whether iops->rename2() exists]) + ZFS_LINUX_TEST_RESULT([inode_operations_rename2], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1, - [iops->rename() wants flags]) + AC_DEFINE(HAVE_RENAME2, 1, [iops->rename2() exists]) ],[ AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether iops->rename() wants flags]) + ZFS_LINUX_TEST_RESULT([inode_operations_rename_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1, + [iops->rename() wants flags]) + ],[ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether struct inode_operations_wrapper takes .rename2()]) + ZFS_LINUX_TEST_RESULT([dir_inode_operations_wrapper_rename2], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_RENAME2_OPERATIONS_WRAPPER, 1, + [struct inode_operations_wrapper takes .rename2()]) + ],[ + AC_MSG_RESULT(no) + ]) + ]) ]) ]) ]) diff --git a/include/os/freebsd/zfs/sys/zfs_vnops_os.h b/include/os/freebsd/zfs/sys/zfs_vnops_os.h index 460aecd2e708..839ee629a5ab 100644 --- a/include/os/freebsd/zfs/sys/zfs_vnops_os.h +++ b/include/os/freebsd/zfs/sys/zfs_vnops_os.h @@ -41,7 +41,8 @@ 
extern int zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr, zuserns_t *mnt_ns); extern int zfs_rename(znode_t *sdzp, const char *snm, znode_t *tdzp, - const char *tnm, cred_t *cr, int flags, zuserns_t *mnt_ns); + const char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, + zuserns_t *mnt_ns); extern int zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, const char *link, znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns); extern int zfs_link(znode_t *tdzp, znode_t *sp, diff --git a/include/os/linux/kernel/linux/vfs_compat.h b/include/os/linux/kernel/linux/vfs_compat.h index eeed0a388ce4..fd0b9e8e1068 100644 --- a/include/os/linux/kernel/linux/vfs_compat.h +++ b/include/os/linux/kernel/linux/vfs_compat.h @@ -324,6 +324,19 @@ static inline void zfs_gid_write(struct inode *ip, gid_t gid) ip->i_gid = make_kgid(kcred->user_ns, gid); } +/* + * 3.15 API change + */ +#ifndef RENAME_NOREPLACE +#define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */ +#endif +#ifndef RENAME_EXCHANGE +#define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ +#endif +#ifndef RENAME_WHITEOUT +#define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ +#endif + /* * 4.9 API change */ diff --git a/include/os/linux/spl/sys/sysmacros.h b/include/os/linux/spl/sys/sysmacros.h index be1f77e43bda..99e3a6fb41c6 100644 --- a/include/os/linux/spl/sys/sysmacros.h +++ b/include/os/linux/spl/sys/sysmacros.h @@ -120,6 +120,16 @@ extern uint32_t zone_get_hostid(void *zone); extern void spl_setup(void); extern void spl_cleanup(void); +/* + * Only handles the first 4096 majors and first 256 minors. We don't have a + * libc for the kernel module so we define this inline. 
+ */ +static inline dev_t +makedev(unsigned int major, unsigned int minor) +{ + return ((major & 0xFFF) << 8) | (minor & 0xFF); +} + #define highbit(x) __fls(x) #define lowbit(x) __ffs(x) diff --git a/include/os/linux/zfs/sys/zfs_vnops_os.h b/include/os/linux/zfs/sys/zfs_vnops_os.h index 787d258e1388..197ea9bec500 100644 --- a/include/os/linux/zfs/sys/zfs_vnops_os.h +++ b/include/os/linux/zfs/sys/zfs_vnops_os.h @@ -61,7 +61,8 @@ extern int zfs_getattr_fast(struct user_namespace *, struct inode *ip, extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr, zuserns_t *mnt_ns); extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, - char *tnm, cred_t *cr, int flags, zuserns_t *mnt_ns); + char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, + zuserns_t *mnt_ns); extern int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns); extern int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr); diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 83416d64744c..c3ee0ae4a600 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -42,7 +42,11 @@ extern void zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr, zuserns_t *mnt_ns); extern const struct inode_operations zpl_inode_operations; +#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER +extern const struct inode_operations_wrapper zpl_dir_inode_operations; +#else extern const struct inode_operations zpl_dir_inode_operations; +#endif extern const struct inode_operations zpl_symlink_inode_operations; extern const struct inode_operations zpl_special_inode_operations; diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index c8656b3f6162..88d642350691 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -299,6 +299,12 @@ extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, extern void 
zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, znode_t *szp); +extern void zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, + uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp, + const char *dname, znode_t *szp); +extern void zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, + uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp, + const char *dname, znode_t *szp, znode_t *wzp); extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t len, int ioflag, zil_callback_t callback, void *callback_data); diff --git a/include/sys/zil.h b/include/sys/zil.h index cec04f120ce3..9591fb4f6440 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -164,7 +164,9 @@ typedef enum zil_create { #define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ #define TX_WRITE2 20 /* dmu_sync EALREADY write */ #define TX_SETSAXATTR 21 /* Set sa xattrs on file */ -#define TX_MAX_TYPE 22 /* Max transaction type */ +#define TX_RENAME_EXCHANGE 22 /* Atomic swap via renameat2 */ +#define TX_RENAME_WHITEOUT 23 /* Atomic whiteout via renameat2 */ +#define TX_MAX_TYPE 24 /* Max transaction type */ /* * The transactions for mkdir, symlink, remove, rmdir, link, and rename @@ -317,6 +319,19 @@ typedef struct { /* 2 strings: names of source and destination follow this */ } lr_rename_t; +typedef struct { + lr_rename_t lr_rename; /* common rename portion */ + /* members related to the whiteout file (based on lr_create_t) */ + uint64_t lr_wfoid; /* obj id of the new whiteout file */ + uint64_t lr_wmode; /* mode of object */ + uint64_t lr_wuid; /* uid of whiteout */ + uint64_t lr_wgid; /* gid of whiteout */ + uint64_t lr_wgen; /* generation (txg of creation) */ + uint64_t lr_wcrtime[2]; /* creation time */ + uint64_t lr_wrdev; /* always makedev(0, 0) */ + /* 2 strings: names of source and destination follow this */ +} lr_rename_whiteout_t; + 
typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_foid; /* file object to write */ diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 362e02751ee4..bcf4e2f18d83 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -3420,7 +3420,7 @@ zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, int zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname, - cred_t *cr, int flags, zuserns_t *mnt_ns) + cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns) { struct componentname scn, tcn; vnode_t *sdvp, *tdvp; @@ -3428,6 +3428,9 @@ zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname, int error; svp = tvp = NULL; + if (rflags != 0 || wo_vap != NULL) + return (SET_ERROR(EINVAL)); + sdvp = ZTOV(sdzp); tdvp = ZTOV(tdzp); error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE); diff --git a/module/os/linux/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c index fb6c28f95c3b..b4e4146b09e9 100644 --- a/module/os/linux/zfs/zfs_dir.c +++ b/module/os/linux/zfs/zfs_dir.c @@ -1035,7 +1035,8 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, } /* The only error is !zfs_dirempty() and we checked earlier. */ - ASSERT3U(zfs_drop_nlink_locked(zp, tx, &unlinked), ==, 0); + error = zfs_drop_nlink_locked(zp, tx, &unlinked); + ASSERT3U(error, ==, 0); mutex_exit(&zp->z_lock); } else { error = zfs_dropname(dl, zp, dzp, tx, flag); diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index f02cefea222b..545d8ad8d79c 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -2655,6 +2655,8 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) * tnm - New entry name. * cr - credentials of caller. 
* flags - case flags + * rflags - RENAME_* flags + * wo_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). * mnt_ns - user namespace of the mount * * RETURN: 0 on success, error code on failure. @@ -2664,7 +2666,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) */ int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, - cred_t *cr, int flags, zuserns_t *mnt_ns) + cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns) { znode_t *szp, *tzp; zfsvfs_t *zfsvfs = ZTOZSB(sdzp); @@ -2676,10 +2678,33 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, int error = 0; int zflg = 0; boolean_t waited = B_FALSE; + /* Needed for whiteout inode creation. */ + boolean_t fuid_dirtied; + zfs_acl_ids_t acl_ids; + boolean_t have_acl = B_FALSE; + znode_t *wzp = NULL; + if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); + if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return (SET_ERROR(EINVAL)); + + /* Already checked by Linux VFS, but just to make sure. */ + if (rflags & RENAME_EXCHANGE && + (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) + return (SET_ERROR(EINVAL)); + + /* + * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the + * right kind of vattr_t for the whiteout file. These are set + * internally by ZFS so should never be incorrect. + */ + VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); + VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); + VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); + if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; @@ -2856,7 +2881,6 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, * Note that if target and source are the same, this can be * done in a single check.
*/ - if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) goto out; @@ -2873,15 +2897,21 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, * Does target exist? */ if (tzp) { + if (rflags & RENAME_NOREPLACE) { + error = SET_ERROR(EEXIST); + goto out; + } /* - * Source and target must be the same type. + * Source and target must be the same type (unless exchanging). */ - boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; - boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; + if (!(rflags & RENAME_EXCHANGE)) { + boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; + boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; - if (s_is_dir != t_is_dir) { - error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); - goto out; + if (s_is_dir != t_is_dir) { + error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); + goto out; + } } /* * POSIX dictates that when the source and target @@ -2892,12 +2922,43 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, error = 0; goto out; } + } else if (rflags & RENAME_EXCHANGE) { + /* Target must exist for RENAME_EXCHANGE. */ + error = SET_ERROR(ENOENT); + goto out; + } + + /* Set up inode creation for RENAME_WHITEOUT. */ + if (rflags & RENAME_WHITEOUT) { + /* + * Whiteout files are not regular files or directories, so to + * match zfs_create() we do not inherit the project id. + */ + uint64_t wo_projid = ZFS_DEFAULT_PROJID; + + error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); + if (error) + goto out; + + if (!have_acl) { + error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, + &acl_ids, mnt_ns); + if (error) + goto out; + have_acl = B_TRUE; + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { + error = SET_ERROR(EDQUOT); + goto out; + } } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, sdzp->z_id, + (rflags & RENAME_EXCHANGE) ? 
TRUE : FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); @@ -2907,7 +2968,21 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } + if (rflags & RENAME_WHITEOUT) { + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); @@ -2946,7 +3021,7 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); - ASSERT0(error); + VERIFY0(error); error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); if (error) @@ -2956,13 +3031,30 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, * Unlink the target. */ if (tzp) { - error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + int tzflg = zflg; + + if (rflags & RENAME_EXCHANGE) { + /* This inode will be re-linked soon. */ + tzflg |= ZRENAMING; + + tzp->z_pflags |= ZFS_AV_MODIFIED; + if (sdzp->z_pflags & ZFS_PROJINHERIT) + tzp->z_pflags |= ZFS_PROJINHERIT; + + error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&tzp->z_pflags, sizeof (uint64_t), tx); + ASSERT0(error); + } + error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); if (error) goto commit_link_szp; } /* - * Create a new link at the target. + * Create the new target links: + * * We always link the target. 
+ * * RENAME_EXCHANGE: Link the old target to the source. + * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. */ error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error) { @@ -2975,18 +3067,55 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, goto commit_link_tzp; } - zfs_log_rename(zilog, tx, TX_RENAME | - (flags & FIGNORECASE ? TX_CI : 0), sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); + switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { + case RENAME_EXCHANGE: + error = zfs_link_create(sdl, tzp, tx, ZRENAMING); + /* + * The same argument as zfs_link_create() failing for + * szp applies here, since the source directory must + * have had an entry we are replacing. + */ + ASSERT0(error); + if (error) + goto commit_unlink_td_szp; + break; + case RENAME_WHITEOUT: + zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); + error = zfs_link_create(sdl, wzp, tx, ZNEW); + if (error) { + zfs_znode_delete(wzp, tx); + remove_inode_hash(ZTOI(wzp)); + goto commit_unlink_td_szp; + } + break; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { + case RENAME_EXCHANGE: + zfs_log_rename_exchange(zilog, tx, + (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, + tdzp, tdl->dl_name, szp); + break; + case RENAME_WHITEOUT: + zfs_log_rename_whiteout(zilog, tx, + (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, + tdzp, tdl->dl_name, szp, wzp); + break; + default: + ASSERT0(rflags & ~RENAME_NOREPLACE); + zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? 
TX_CI : 0), + sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); + break; + } commit: dmu_tx_commit(tx); out: - if (zl != NULL) - zfs_rename_unlock(&zl); - - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); + if (have_acl) + zfs_acl_ids_free(&acl_ids); zfs_znode_update_vfs(sdzp); if (sdzp == tdzp) @@ -2997,11 +3126,21 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, zfs_znode_update_vfs(szp); zrele(szp); + if (wzp) { + zfs_znode_update_vfs(wzp); + zrele(wzp); + } if (tzp) { zfs_znode_update_vfs(tzp); zrele(tzp); } + if (zl != NULL) + zfs_rename_unlock(&zl); + + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); @@ -3012,23 +3151,31 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, * Clean-up path for broken link state. * * At this point we are in a (very) bad state, so we need to do our - * best to correct the state. In particular, the nlink of szp is wrong - * because we were destroying and creating links with ZRENAMING. + * best to correct the state. In particular, all of the nlinks are + * wrong because we were destroying and creating links with ZRENAMING. + * + * In some form, all of these operations have to resolve the state: + * + * * link_destroy() *must* succeed. Fortunately, this is very likely + * since we only just created it. * - * link_create()s are allowed to fail (though they shouldn't because we - * only just unlinked them and are putting the entries back during - * clean-up). But if they fail, we can just forcefully drop the nlink - * value to (at the very least) avoid broken nlink values -- though in - * the case of non-empty directories we will have to panic. + * * link_create()s are allowed to fail (though they shouldn't because + * we only just unlinked them and are putting the entries back + * during clean-up). 
But if they fail, we can just forcefully drop + * the nlink value to (at the very least) avoid broken nlink values + * -- though in the case of non-empty directories we will have to + * panic (otherwise we'd have a leaked directory with a broken ..). */ +commit_unlink_td_szp: + VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); commit_link_tzp: if (tzp) { if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) - VERIFY3U(zfs_drop_nlink(tzp, tx, NULL), ==, 0); + VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); } commit_link_szp: if (zfs_link_create(sdl, szp, tx, ZRENAMING)) - VERIFY3U(zfs_drop_nlink(szp, tx, NULL), ==, 0); + VERIFY0(zfs_drop_nlink(szp, tx, NULL)); goto commit; } diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index 3ded79a30a6f..c8f6e02bd224 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -422,7 +422,12 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) break; case S_IFDIR: +#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER + ip->i_flags |= S_IOPS_WRAPPER; + ip->i_op = &zpl_dir_inode_operations.ops; +#else ip->i_op = &zpl_dir_inode_operations; +#endif ip->i_fop = &zpl_dir_file_operations; ITOZ(ip)->z_zn_prefetch = B_TRUE; break; diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index 9b702c535ea7..64016f9ac1de 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -24,6 +24,7 @@ */ +#include #include #include #include @@ -498,35 +499,42 @@ static int #ifdef HAVE_IOPS_RENAME_USERNS zpl_rename2(struct user_namespace *user_ns, struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, - unsigned int flags) + unsigned int rflags) #else zpl_rename2(struct inode *sdip, struct dentry *sdentry, - struct inode *tdip, struct dentry *tdentry, unsigned int flags) + struct inode *tdip, struct dentry *tdentry, unsigned int rflags) #endif { cred_t *cr = CRED(); + vattr_t *wo_vap = NULL; int error; fstrans_cookie_t cookie; 
#ifndef HAVE_IOPS_RENAME_USERNS zuserns_t *user_ns = NULL; #endif - /* We don't have renameat2(2) support */ - if (flags) - return (-EINVAL); - crhold(cr); + if (rflags & RENAME_WHITEOUT) { + wo_vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(wo_vap, sdip, S_IFCHR, cr, user_ns); + wo_vap->va_rdev = makedevice(0, 0); + } + cookie = spl_fstrans_mark(); error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip), - dname(tdentry), cr, 0, user_ns); + dname(tdentry), cr, 0, rflags, wo_vap, user_ns); spl_fstrans_unmark(cookie); + if (wo_vap) + kmem_free(wo_vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } -#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS) +#if !defined(HAVE_IOPS_RENAME_USERNS) && \ + !defined(HAVE_RENAME_WANTS_FLAGS) && \ + !defined(HAVE_RENAME2) static int zpl_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) @@ -745,7 +753,12 @@ const struct inode_operations zpl_inode_operations = { #endif /* CONFIG_FS_POSIX_ACL */ }; +#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER +const struct inode_operations_wrapper zpl_dir_inode_operations = { + .ops = { +#else const struct inode_operations zpl_dir_inode_operations = { +#endif .create = zpl_create, .lookup = zpl_lookup, .link = zpl_link, @@ -754,7 +767,9 @@ const struct inode_operations zpl_dir_inode_operations = { .mkdir = zpl_mkdir, .rmdir = zpl_rmdir, .mknod = zpl_mknod, -#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) +#ifdef HAVE_RENAME2 + .rename2 = zpl_rename2, +#elif defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) .rename = zpl_rename2, #else .rename = zpl_rename, @@ -776,6 +791,10 @@ const struct inode_operations zpl_dir_inode_operations = { #endif /* HAVE_SET_ACL */ .get_acl = zpl_get_acl, #endif /* CONFIG_FS_POSIX_ACL */ +#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER + }, + .rename2 = zpl_rename2, +#endif }; const struct inode_operations 
zpl_symlink_inode_operations = { diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 245699882aa9..77bf9140d52d 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -494,25 +494,101 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, zil_itx_assign(zilog, itx, tx); } +static void +do_zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, + const char *sname, znode_t *tdzp, const char *dname, znode_t *szp) +{ + itx_t *itx; + lr_rename_t *lr; + size_t snamesize = strlen(sname) + 1; + size_t dnamesize = strlen(dname) + 1; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); + lr = (lr_rename_t *)&itx->itx_lr; + lr->lr_sdoid = sdzp->z_id; + lr->lr_tdoid = tdzp->z_id; + memcpy((char *)(lr + 1), sname, snamesize); + memcpy((char *)(lr + 1) + snamesize, dname, dnamesize); + itx->itx_oid = szp->z_id; + + zil_itx_assign(zilog, itx, tx); +} + /* * Handles TX_RENAME transactions. */ void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, znode_t *szp) +{ + txtype |= TX_RENAME; + do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); +} + +/* + * Handles TX_RENAME_EXCHANGE transactions. + */ +void +zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, + znode_t *szp) +{ + txtype |= TX_RENAME_EXCHANGE; + do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); +} + +/* + * Handles TX_RENAME_WHITEOUT transactions. + * + * Unfortunately we cannot reuse do_zfs_log_rename because we need to call + * zfs_mknode() on replay which requires stashing bits as with TX_CREATE.
+ */ +void +zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, + znode_t *szp, znode_t *wzp) { itx_t *itx; - lr_rename_t *lr; + lr_rename_whiteout_t *lr; size_t snamesize = strlen(sname) + 1; size_t dnamesize = strlen(dname) + 1; if (zil_replaying(zilog, tx)) return; + txtype |= TX_RENAME_WHITEOUT; itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); - lr = (lr_rename_t *)&itx->itx_lr; - lr->lr_sdoid = sdzp->z_id; - lr->lr_tdoid = tdzp->z_id; + lr = (lr_rename_whiteout_t *)&itx->itx_lr; + lr->lr_rename.lr_sdoid = sdzp->z_id; + lr->lr_rename.lr_tdoid = tdzp->z_id; + + /* + * RENAME_WHITEOUT will create an entry at the source znode, so we need + * to store the same data that the equivalent call to zfs_log_create() + * would. + */ + lr->lr_wfoid = wzp->z_id; + LR_FOID_SET_SLOTS(lr->lr_wfoid, wzp->z_dnodesize >> DNODE_SHIFT); + (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(wzp)), &lr->lr_wgen, + sizeof (uint64_t)); + (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(wzp)), + lr->lr_wcrtime, sizeof (uint64_t) * 2); + lr->lr_wmode = wzp->z_mode; + lr->lr_wuid = (uint64_t)KUID_TO_SUID(ZTOUID(wzp)); + lr->lr_wgid = (uint64_t)KGID_TO_SGID(ZTOGID(wzp)); + + /* + * This rdev will always be makedevice(0, 0) but because the ZIL log and + * replay code needs to be platform independent (and there is no + * platform independent makedev()) we need to copy the one created + * during the rename operation.
+ */ + (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(wzp)), &lr->lr_wrdev, + sizeof (lr->lr_wrdev)); + memcpy((char *)(lr + 1), sname, snamesize); memcpy((char *)(lr + 1) + snamesize, dname, dnamesize); itx->itx_oid = szp->z_id; diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 45c2fa3720cf..5e20ce3319b4 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -643,18 +643,21 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) } static int -zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) +do_zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, char *sname, + char *tname, uint64_t rflags, vattr_t *wo_vap) { - zfsvfs_t *zfsvfs = arg1; - lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; znode_t *sdzp, *tdzp; - int error; - int vflg = 0; + int error, vflg = 0; - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); + /* Only Linux currently supports RENAME_* flags. */ +#ifdef __linux__ + VERIFY0(rflags & ~(RENAME_EXCHANGE | RENAME_WHITEOUT)); + + /* wo_vap must be non-NULL iff. 
we're doing RENAME_WHITEOUT */ + VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); +#else + VERIFY0(rflags); +#endif if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) return (error); @@ -667,13 +670,94 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; - error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, NULL); + error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags, + wo_vap, NULL); zrele(tdzp); zrele(sdzp); return (error); } +static int +zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_rename_t *lr = arg2; + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL)); +} + +static int +zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap) +{ +#ifdef __linux__ + zfsvfs_t *zfsvfs = arg1; + lr_rename_t *lr = arg2; + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE, + NULL)); +#else + return (SET_ERROR(ENOTSUP)); +#endif +} + +static int +zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap) +{ +#ifdef __linux__ + zfsvfs_t *zfsvfs = arg1; + lr_rename_whiteout_t *lr = arg2; + int error; + /* sname and tname follow lr_rename_whiteout_t */ + char *sname = (char *)(lr + 1); + char *tname = sname + strlen(sname) + 1; + /* For the whiteout file. 
*/ + xvattr_t xva; + uint64_t objid; + uint64_t dnodesize; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + objid = LR_FOID_GET_OBJ(lr->lr_wfoid); + dnodesize = LR_FOID_GET_SLOTS(lr->lr_wfoid) << DNODE_SHIFT; + + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, + lr->lr_wmode, lr->lr_wuid, lr->lr_wgid, lr->lr_wrdev, objid); + + /* + * As with TX_CREATE, RENAME_WHITEOUT ends up in zfs_mknode(), which + * assigns the object's creation time, generation number, and dnode + * slot count. The generic zfs_rename() has no concept of these + * attributes, so we smuggle the values inside the vattr's otherwise + * unused va_ctime, va_nblocks, and va_fsid fields. + */ + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_wcrtime); + xva.xva_vattr.va_nblocks = lr->lr_wgen; + xva.xva_vattr.va_fsid = dnodesize; + + error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); + if (error) + return (error); + + return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname, + RENAME_WHITEOUT, &xva.xva_vattr)); +#else + return (SET_ERROR(ENOTSUP)); +#endif +} + static int zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) { @@ -1069,4 +1153,6 @@ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ zfs_replay_write2, /* TX_WRITE2 */ zfs_replay_setsaxattr, /* TX_SETSAXATTR */ + zfs_replay_rename_exchange, /* TX_RENAME_EXCHANGE */ + zfs_replay_rename_whiteout, /* TX_RENAME_WHITEOUT */ }; diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 6bb99c4b1cdf..23afc8a40bb4 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -759,11 +759,9 @@ zil_commit_activate_saxattr_feature(zilog_t *zilog) uint64_t txg = 0; dmu_tx_t *tx = NULL; - if (spa_feature_is_enabled(zilog->zl_spa, - SPA_FEATURE_ZILSAXATTR) && + if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL && - !dsl_dataset_feature_is_active(ds, - 
SPA_FEATURE_ZILSAXATTR)) { + !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(ds, tx); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index be8ee34f27ae..20578a8223b2 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -514,6 +514,8 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ zvol_replay_err, /* TX_SETSAXATTR */ + zvol_replay_err, /* TX_RENAME_EXCHANGE */ + zvol_replay_err, /* TX_RENAME_WHITEOUT */ }; /* diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 21e0f882dc40..13f7efd96bd3 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -157,6 +157,10 @@ tags = ['functional', 'projectquota'] tests = ['read_dos_attrs_001', 'write_dos_attrs_001'] tags = ['functional', 'dos_attributes'] +[tests/functional/renameat2:Linux] +tests = ['renameat2_noreplace', 'renameat2_exchange', 'renameat2_whiteout'] +tags = ['functional', 'renameat2'] + [tests/functional/rsend:Linux] tests = ['send_realloc_dnode_size', 'send_encrypted_files'] tags = ['functional', 'rsend'] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index e7d338fcf8a9..1cebf50827b9 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -69,6 +69,11 @@ exec_reason = 'Test user execute permissions required for utilities' # python_deps_reason = 'Python modules missing: python3-cffi' +# +# Some tests require that the kernel supports renameat2 syscall. +# +renameat2_reason = 'Kernel renameat2 support required' + # # Some tests require the O_TMPFILE flag which was first introduced in the # 3.11 kernel. 
@@ -231,6 +236,7 @@ maybe = { 'pool_checkpoint/checkpoint_discard_busy': ['FAIL', 11946], 'projectquota/setup': ['SKIP', exec_reason], 'removal/removal_condense_export': ['FAIL', known_reason], + 'renameat2/setup': ['SKIP', renameat2_reason], 'reservation/reservation_008_pos': ['FAIL', 7741], 'reservation/reservation_018_pos': ['FAIL', 5642], 'snapshot/clone_001_pos': ['FAIL', known_reason], diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index 0ec450e248db..f68f58072818 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -27,6 +27,7 @@ /randwritecomp /read_dos_attributes /readmmap +/renameat2 /rename_dir /rm_lnkcnt_zero_file /send_doall diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 673a18b4c083..066abb6ce3b5 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -112,10 +112,10 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/edonr_test %D%/skein_test \ %C%_edonr_test_LDADD = $(%C%_skein_test_LDADD) %C%_blake3_test_LDADD = $(%C%_skein_test_LDADD) - if BUILD_LINUX scripts_zfs_tests_bin_PROGRAMS += %D%/getversion scripts_zfs_tests_bin_PROGRAMS += %D%/user_ns_exec +scripts_zfs_tests_bin_PROGRAMS += %D%/renameat2 scripts_zfs_tests_bin_PROGRAMS += %D%/xattrtest scripts_zfs_tests_bin_PROGRAMS += %D%/zed_fd_spill-zedlet scripts_zfs_tests_bin_PROGRAMS += %D%/idmap_util @@ -127,7 +127,6 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/read_dos_attributes %D%/write_dos_attribu %C%_read_dos_attributes_SOURCES = %D%/linux_dos_attributes/read_dos_attributes.c %C%_write_dos_attributes_SOURCES = %D%/linux_dos_attributes/write_dos_attributes.c - scripts_zfs_tests_bin_PROGRAMS += %D%/randfree_file %C%_randfree_file_SOURCES = %D%/file/randfree_file.c diff --git a/tests/zfs-tests/cmd/renameat2.c b/tests/zfs-tests/cmd/renameat2.c new file mode 100644 index 000000000000..a9d0a8b20adf --- /dev/null +++ b/tests/zfs-tests/cmd/renameat2.c @@ -0,0 +1,128 @@ +/* 
SPDX-License-Identifier: CDDL-1.0 OR MPL-2.0 */ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2019 Aleksa Sarai + * Copyright (C) 2019 SUSE LLC + */ + +/* + * mv(1) doesn't currently support RENAME_{EXCHANGE,WHITEOUT} so this is a very + * simple renameat2(2) wrapper for the OpenZFS self-tests. + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/syscall.h> + +#ifndef SYS_renameat2 +#ifdef __NR_renameat2 +#define SYS_renameat2 __NR_renameat2 +#elif defined(__x86_64__) +#define SYS_renameat2 316 +#elif defined(__i386__) +#define SYS_renameat2 353 +#elif defined(__arm__) || defined(__aarch64__) +#define SYS_renameat2 382 +#else +#error "SYS_renameat2 not known for this architecture." 
+#endif +#endif + +#ifndef RENAME_NOREPLACE +#define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */ +#endif +#ifndef RENAME_EXCHANGE +#define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ +#endif +#ifndef RENAME_WHITEOUT +#define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ +#endif + +/* glibc before 2.28 has no renameat2() wrapper; returns 0 or -errno */ +static int +sys_renameat2(int olddirfd, const char *oldpath, + int newdirfd, const char *newpath, unsigned int flags) +{ + int ret = syscall(SYS_renameat2, olddirfd, oldpath, newdirfd, newpath, + flags); + return ((ret < 0) ? -errno : ret); +} + +static void +usage(void) +{ + fprintf(stderr, "usage: renameat2 [-Cnwx] src dst\n"); + exit(1); +} + +static void +check(void) +{ + int err = sys_renameat2(AT_FDCWD, ".", AT_FDCWD, ".", RENAME_EXCHANGE); + exit(err == -ENOSYS); +} + +int +main(int argc, char **argv) +{ + char *src, *dst; + int ch, err; + unsigned int flags = 0; + + while ((ch = getopt(argc, argv, "Cnwx")) >= 0) { + switch (ch) { + case 'C': + check(); + break; + case 'n': + flags |= RENAME_NOREPLACE; + break; + case 'w': + flags |= RENAME_WHITEOUT; + break; + case 'x': + flags |= RENAME_EXCHANGE; + break; + default: + usage(); + break; + } + } + + argc -= optind; + argv += optind; + + if (argc != 2) + usage(); + src = argv[0]; + dst = argv[1]; + + err = sys_renameat2(AT_FDCWD, src, AT_FDCWD, dst, flags); + if (err < 0) + fprintf(stderr, "renameat2: %s\n", strerror(-err)); + return (err != 0); +} diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 30514361ad57..b3cfe149ffa7 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -208,6 +208,7 @@ export ZFSTEST_FILES='badsend randwritecomp readmmap read_dos_attributes + renameat2 rename_dir rm_lnkcnt_zero_file send_doall diff --git a/tests/zfs-tests/tests/functional/renameat2/Makefile.am b/tests/zfs-tests/tests/functional/renameat2/Makefile.am new file mode 100644 
index 000000000000..bd8d6c9d68bf --- /dev/null +++ b/tests/zfs-tests/tests/functional/renameat2/Makefile.am @@ -0,0 +1,7 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/renameat2 +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + renameat2_noreplace.ksh \ + renameat2_exchange.ksh \ + renameat2_whiteout.ksh diff --git a/tests/zfs-tests/tests/functional/renameat2/cleanup.ksh b/tests/zfs-tests/tests/functional/renameat2/cleanup.ksh new file mode 100755 index 000000000000..3166bd6ec16e --- /dev/null +++ b/tests/zfs-tests/tests/functional/renameat2/cleanup.ksh @@ -0,0 +1,34 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/renameat2/renameat2_exchange.ksh b/tests/zfs-tests/tests/functional/renameat2/renameat2_exchange.ksh new file mode 100755 index 000000000000..94e56231feb1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/renameat2/renameat2_exchange.ksh @@ -0,0 +1,61 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (C) 2019 Aleksa Sarai +# Copyright (C) 2019 SUSE LLC +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "both" + +function cleanup +{ + log_must rm -rf $TESTDIR/* +} + +log_assert "ZFS supports RENAME_EXCHANGE." +log_onexit cleanup + +cd $TESTDIR +echo "foo" > foo +echo "bar" > bar + +# Self-exchange is a no-op. +log_must renameat2 -x foo foo +log_must grep '^foo$' foo + +# Basic exchange. +log_must renameat2 -x foo bar +log_must grep '^bar$' foo +log_must grep '^foo$' bar + +# And exchange back. +log_must renameat2 -x foo bar +log_must grep '^foo$' foo +log_must grep '^bar$' bar + +# Exchange with a bad path should fail. +log_mustnot renameat2 -x bar baz + +log_pass "ZFS supports RENAME_EXCHANGE as expected." 
diff --git a/tests/zfs-tests/tests/functional/renameat2/renameat2_noreplace.ksh b/tests/zfs-tests/tests/functional/renameat2/renameat2_noreplace.ksh new file mode 100755 index 000000000000..d75b94fab465 --- /dev/null +++ b/tests/zfs-tests/tests/functional/renameat2/renameat2_noreplace.ksh @@ -0,0 +1,51 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (C) 2019 Aleksa Sarai +# Copyright (C) 2019 SUSE LLC +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "both" + +function cleanup +{ + log_must rm -rf $TESTDIR/* +} + +log_assert "ZFS supports RENAME_NOREPLACE." +log_onexit cleanup + +cd $TESTDIR +touch foo bar + +# Clobbers should always fail. +log_mustnot renameat2 -n foo foo +log_mustnot renameat2 -n foo bar +log_mustnot renameat2 -n bar foo + +# Regular renames should succeed. +log_must renameat2 -n bar baz + +log_pass "ZFS supports RENAME_NOREPLACE as expected." 
diff --git a/tests/zfs-tests/tests/functional/renameat2/renameat2_whiteout.ksh b/tests/zfs-tests/tests/functional/renameat2/renameat2_whiteout.ksh new file mode 100755 index 000000000000..8ecb074dbbdb --- /dev/null +++ b/tests/zfs-tests/tests/functional/renameat2/renameat2_whiteout.ksh @@ -0,0 +1,50 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (C) 2019 Aleksa Sarai +# Copyright (C) 2019 SUSE LLC +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "both" + +function cleanup +{ + log_must rm -rf $TESTDIR/* +} + +log_assert "ZFS supports RENAME_WHITEOUT." +log_onexit cleanup + +cd $TESTDIR +echo "whiteout" > whiteout + +# Straight-forward rename-with-whiteout. +log_must renameat2 -w whiteout new +# Check new file. +log_must grep '^whiteout$' new +# Check that the whiteout is actually a {0,0} char device. +log_must grep '^character special file:0:0$' <<<"$(stat -c '%F:%t:%T' whiteout)" + +log_pass "ZFS supports RENAME_WHITEOUT as expected." 
diff --git a/tests/zfs-tests/tests/functional/renameat2/setup.ksh b/tests/zfs-tests/tests/functional/renameat2/setup.ksh new file mode 100755 index 000000000000..b8c26d5ba062 --- /dev/null +++ b/tests/zfs-tests/tests/functional/renameat2/setup.ksh @@ -0,0 +1,37 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (C) 2019 Aleksa Sarai +# Copyright (C) 2019 SUSE LLC +# + +. $STF_SUITE/include/libtest.shlib + +if ! is_linux ; then + log_unsupported "renameat2 is linux-only" +elif ! renameat2 -C ; then + log_unsupported "renameat2 not supported on this (pre-3.15) linux kernel" +fi + +DISK=${DISKS%% *} +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh index eddecbc2db7e..8f3585a5997f 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh @@ -175,6 +175,29 @@ log_must ln /$TESTPOOL/$TESTFS/link_and_unlink \ /$TESTPOOL/$TESTFS/link_and_unlink.link log_must rm /$TESTPOOL/$TESTFS/link_and_unlink.link +# We can't test RENAME_* flags without renameat2(2) support. +if ! 
is_linux ; then + log_note "renameat2 is linux-only" +elif ! renameat2 -C ; then + log_note "renameat2 not supported on this (pre-3.15) linux kernel" +else + # TX_RENAME_EXCHANGE + log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/xchg-a bs=1k count=1 + log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/xchg-b bs=1k count=1 + log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/xchg-c bs=1k count=1 + log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/xchg-d bs=1k count=1 + # rotate the files around + log_must renameat2 -x /$TESTPOOL/$TESTFS/xchg-{a,b} + log_must renameat2 -x /$TESTPOOL/$TESTFS/xchg-{b,c} + log_must renameat2 -x /$TESTPOOL/$TESTFS/xchg-{c,a} + # exchange same path + log_must renameat2 -x /$TESTPOOL/$TESTFS/xchg-{d,d} + + # TX_RENAME_WHITEOUT + log_must mkfile 1k /$TESTPOOL/$TESTFS/whiteout + log_must renameat2 -w /$TESTPOOL/$TESTFS/whiteout{,-moved} +fi + # # 4. Copy TESTFS to temporary location (TESTDIR/copy) #