From e0cd6c28a38bb514351eb696e613e0e36755f867 Mon Sep 17 00:00:00 2001 From: Rafael Kitover Date: Thu, 23 May 2019 14:40:28 -0700 Subject: [PATCH 001/109] kernel timer API rework In `config/kernel-timer.m4` refactor slightly to check more generally for the new `timer_setup()` APIs, but also check the callback signature because some kernels (notably 4.14) have the new `timer_setup()` API but use the old callback signature. Also add a check for a `flags` member in `struct timer_list`, which was added in 4.1-rc8. Add compatibility shims to `include/spl/sys/timer.h` to allow using the new timer APIs with the only two caveats being that the callback argument type must be declared as `spl_timer_list_t` and an explicit assignment is required to get the timer variable for the `timer_of()` macro. So the callback would look like this: ```c __cv_wakeup(spl_timer_list_t t) { struct timer_list *tmr = (struct timer_list *)t; struct thing *parent = from_timer(parent, tmr, parent_timer_field); ... /* do stuff with parent */ ``` Make some minor changes to `spl-condvar.c` and `spl-taskq.c` to use the new timer APIs instead of conditional code. Reviewed-by: Tomohiro Kusumi Reviewed-by: Brian Behlendorf Signed-off-by: Rafael Kitover Closes #8647 --- config/kernel-timer.m4 | 63 +++++++++++++++++++++++++++++++++------- config/kernel.m4 | 2 +- include/spl/sys/timer.h | 25 ++++++++++++++++ module/spl/spl-condvar.c | 29 +++++++++++++----- module/spl/spl-taskq.c | 24 +++------------ 5 files changed, 103 insertions(+), 40 deletions(-) diff --git a/config/kernel-timer.m4 b/config/kernel-timer.m4 index 4dc3f84ed47e..b0e1afa153ab 100644 --- a/config/kernel-timer.m4 +++ b/config/kernel-timer.m4 @@ -1,26 +1,51 @@ +dnl # 4.14-rc3 API change +dnl # https://lwn.net/Articles/735887/ dnl # -dnl # 4.15 API change -dnl # https://lkml.org/lkml/2017/11/25/90 dnl # Check if timer_list.func get passed a timer_list or an unsigned long dnl # (older kernels). 
Also sanity check the from_timer() and timer_setup() dnl # macros are available as well, since they will be used in the same newer dnl # kernels that support the new timer_list.func signature. dnl # -AC_DEFUN([ZFS_AC_KERNEL_TIMER_FUNCTION_TIMER_LIST], [ - AC_MSG_CHECKING([whether timer_list.function gets a timer_list]) +dnl # Also check for the existance of flags in struct timer_list, they were +dnl # added in 4.1-rc8 via 0eeda71bc30d. + +AC_DEFUN([ZFS_AC_KERNEL_TIMER_SETUP], [ + AC_MSG_CHECKING([whether timer_setup() is available]) tmp_flags="$EXTRA_KCFLAGS" EXTRA_KCFLAGS="-Werror" + ZFS_LINUX_TRY_COMPILE([ #include - void task_expire(struct timer_list *tl) {} + + struct my_task_timer { + struct timer_list timer; + int data; + }; + + void task_expire(struct timer_list *tl) + { + struct my_task_timer *task_timer = from_timer(task_timer, tl, timer); + task_timer->data = 42; + } + ],[ + struct my_task_timer task_timer; + timer_setup(&task_timer.timer, task_expire, 0); ],[ - #ifndef from_timer - #error "No from_timer() macro" - #endif + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KERNEL_TIMER_SETUP, 1, + [timer_setup() is available]) + ],[ + AC_MSG_RESULT(no) + ]) - struct timer_list timer; - timer.function = task_expire; - timer_setup(&timer, NULL, 0); + AC_MSG_CHECKING([whether timer function expects timer_list]) + + ZFS_LINUX_TRY_COMPILE([ + #include + void task_expire(struct timer_list *tl) {} + ],[ + struct timer_list tl; + tl.function = task_expire; ],[ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST, 1, @@ -28,5 +53,21 @@ AC_DEFUN([ZFS_AC_KERNEL_TIMER_FUNCTION_TIMER_LIST], [ ],[ AC_MSG_RESULT(no) ]) + + AC_MSG_CHECKING([whether struct timer_list has flags]) + + ZFS_LINUX_TRY_COMPILE([ + #include + ],[ + struct timer_list tl; + tl.flags = 2; + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KERNEL_TIMER_LIST_FLAGS, 1, + [struct timer_list has a flags member]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" ]) diff --git a/config/kernel.m4 
b/config/kernel.m4 index 9a36302c0489..fbc04bdf7d70 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -36,7 +36,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ ZFS_AC_KERNEL_GROUP_INFO_GID ZFS_AC_KERNEL_WRITE ZFS_AC_KERNEL_READ - ZFS_AC_KERNEL_TIMER_FUNCTION_TIMER_LIST + ZFS_AC_KERNEL_TIMER_SETUP ZFS_AC_KERNEL_DECLARE_EVENT_CLASS ZFS_AC_KERNEL_CURRENT_BIO_TAIL ZFS_AC_KERNEL_SUPER_USER_NS diff --git a/include/spl/sys/timer.h b/include/spl/sys/timer.h index a6b134570cd8..31d89d3b97d6 100644 --- a/include/spl/sys/timer.h +++ b/include/spl/sys/timer.h @@ -72,4 +72,29 @@ usleep_range(unsigned long min, unsigned long max) #define USEC_TO_TICK(us) usecs_to_jiffies(us) #define NSEC_TO_TICK(ns) usecs_to_jiffies(ns / NSEC_PER_USEC) +#ifndef from_timer +#define from_timer(var, timer, timer_field) \ + container_of(timer, typeof(*var), timer_field) +#endif + +#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST +typedef struct timer_list *spl_timer_list_t; +#else +typedef unsigned long spl_timer_list_t; +#endif + +#ifndef HAVE_KERNEL_TIMER_SETUP + +static inline void +timer_setup(struct timer_list *timer, void (*func)(spl_timer_list_t), u32 fl) +{ +#ifdef HAVE_KERNEL_TIMER_LIST_FLAGS + (timer)->flags = fl; +#endif + init_timer(timer); + setup_timer(timer, func, (spl_timer_list_t)(timer)); +} + +#endif /* HAVE_KERNEL_TIMER_SETUP */ + #endif /* _SPL_TIMER_H */ diff --git a/module/spl/spl-condvar.c b/module/spl/spl-condvar.c index 1e6e38b7874b..a7a9d1db9a98 100644 --- a/module/spl/spl-condvar.c +++ b/module/spl/spl-condvar.c @@ -154,26 +154,39 @@ EXPORT_SYMBOL(__cv_wait_sig); #if defined(HAVE_IO_SCHEDULE_TIMEOUT) #define spl_io_schedule_timeout(t) io_schedule_timeout(t) #else + +struct spl_task_timer { + struct timer_list timer; + struct task_struct *task; +}; + static void -__cv_wakeup(unsigned long data) +__cv_wakeup(spl_timer_list_t t) { - wake_up_process((struct task_struct *)data); + struct timer_list *tmr = (struct timer_list *)t; + struct spl_task_timer *task_timer = 
from_timer(task_timer, tmr, timer); + + wake_up_process(task_timer->task); } static long spl_io_schedule_timeout(long time_left) { long expire_time = jiffies + time_left; - struct timer_list timer; + struct spl_task_timer task_timer; + struct timer_list *timer = &task_timer.timer; + + task_timer.task = current; - init_timer(&timer); - setup_timer(&timer, __cv_wakeup, (unsigned long)current); - timer.expires = expire_time; - add_timer(&timer); + timer_setup(timer, __cv_wakeup, 0); + + timer->expires = expire_time; + add_timer(timer); io_schedule(); - del_timer_sync(&timer); + del_timer_sync(timer); + time_left = expire_time - jiffies; return (time_left < 0 ? 0 : time_left); diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c index 7684257be7ad..a39f94e4cc20 100644 --- a/module/spl/spl-taskq.c +++ b/module/spl/spl-taskq.c @@ -24,6 +24,7 @@ * Solaris Porting Layer (SPL) Task Queue Implementation. */ +#include #include #include #include @@ -242,20 +243,13 @@ task_expire_impl(taskq_ent_t *t) wake_up(&tq->tq_work_waitq); } -#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST static void -task_expire(struct timer_list *tl) +task_expire(spl_timer_list_t tl) { - taskq_ent_t *t = from_timer(t, tl, tqent_timer); + struct timer_list *tmr = (struct timer_list *)tl; + taskq_ent_t *t = from_timer(t, tmr, tqent_timer); task_expire_impl(t); } -#else -static void -task_expire(unsigned long data) -{ - task_expire_impl((taskq_ent_t *)data); -} -#endif /* * Returns the lowest incomplete taskqid_t. 
The taskqid_t may @@ -597,9 +591,6 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) t->tqent_func = func; t->tqent_arg = arg; t->tqent_taskq = tq; -#ifndef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST - t->tqent_timer.data = 0; -#endif t->tqent_timer.function = NULL; t->tqent_timer.expires = 0; t->tqent_birth = jiffies; @@ -649,9 +640,6 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, t->tqent_func = func; t->tqent_arg = arg; t->tqent_taskq = tq; -#ifndef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST - t->tqent_timer.data = (unsigned long)t; -#endif t->tqent_timer.function = task_expire; t->tqent_timer.expires = (unsigned long)expire_time; add_timer(&t->tqent_timer); @@ -744,11 +732,7 @@ taskq_init_ent(taskq_ent_t *t) { spin_lock_init(&t->tqent_lock); init_waitqueue_head(&t->tqent_waitq); -#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST timer_setup(&t->tqent_timer, NULL, 0); -#else - init_timer(&t->tqent_timer); -#endif INIT_LIST_HEAD(&t->tqent_list); t->tqent_id = 0; t->tqent_func = NULL; From 4933b0a25b24fbfe79d1495871cd9ed3eeae97ea Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Sat, 25 May 2019 08:43:23 +0900 Subject: [PATCH 002/109] Drop local definition of MOUNT_BUSY It's accessible via . 
Reviewed-by: Brian Behlendorf Reviewed-by: Tom Caputi Signed-off-by: Tomohiro Kusumi Closes #8765 --- module/zfs/zfs_ctldir.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index 46e6e19b91d5..c8071a7c215f 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -85,6 +85,7 @@ #include #include #include +#include #include "zfs_namecheck.h" /* @@ -1047,8 +1048,6 @@ zfsctl_snapshot_unmount(char *snapname, int flags) return (error); } -#define MOUNT_BUSY 0x80 /* Mount failed due to EBUSY (from mntent.h) */ - int zfsctl_snapshot_mount(struct path *path, int flags) { From e5a877c5d09cd6002cd5375f298570ac38a5b19d Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Sun, 26 May 2019 06:29:10 +0900 Subject: [PATCH 003/109] Update descriptions for vnops These descriptions are not up to date with the code. Reviewed-by: Igor Kozhukhov Reviewed-by: Brian Behlendorf Signed-off-by: Tomohiro Kusumi Closes #8767 --- module/zfs/zfs_vnops.c | 20 +++++++++++--------- module/zfs/zfs_znode.c | 7 +++---- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 885d9633b01f..9d8a9cbc5419 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1676,6 +1676,7 @@ zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, * IN: dip - inode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. + * flags - case flags. * * RETURN: 0 if success * error code if failure @@ -1917,6 +1918,7 @@ zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags) * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. + * flags - case flags. * vsecp - ACL to be set * * OUT: ipp - inode of created directory.
@@ -2235,13 +2237,12 @@ zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr, } /* - * Read as many directory entries as will fit into the provided - * dirent buffer from the given directory cursor position. + * Read directory entries from the given directory cursor position and emit + * name and position for each entry. * * IN: ip - inode of directory to read. - * dirent - buffer for directory entries. - * - * OUT: dirent - filler buffer of directory entries. + * ctx - directory entry context. + * cr - credentials of caller. * * RETURN: 0 if success * error code if failure @@ -4006,13 +4007,14 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, * Insert the indicated symbolic reference entry into the directory. * * IN: dip - Directory to contain new symbolic link. - * link - Name for new symlink entry. + * name - Name of directory entry in dip. * vap - Attributes of new entry. - * target - Target path of new symlink. - * + * link - Name for new symlink entry. * cr - credentials of caller. * flags - case flags * + * OUT: ipp - Inode for new symbolic link. + * * RETURN: 0 on success, error code on failure. * * Timestamps: @@ -4216,6 +4218,7 @@ zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr) * sip - inode of new entry. * name - name of new entry. * cr - credentials of caller. + * flags - case flags. 
* * RETURN: 0 if success * error code if failure @@ -4729,7 +4732,6 @@ zfs_inactive(struct inode *ip) * IN: ip - inode seeking within * ooff - old file offset * noffp - pointer to new file offset - * ct - caller context * * RETURN: 0 if success * EINVAL if new offset invalid diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index d5ed4af7029d..a27129b7992b 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -651,12 +651,11 @@ static zfs_acl_phys_t acl_phys; * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root + * IS_TMPFILE - new object is of O_TMPFILE * IS_XATTR - new object is an attribute - * bonuslen - length of bonus buffer - * setaclp - File/Dir initial ACL - * fuidp - Tracks fuid allocation. + * acl_ids - ACL related attributes * - * OUT: zpp - allocated znode + * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE) * */ void From 90d8067a77977184cbd99d18582984b9a767fb7f Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Tue, 28 May 2019 15:18:31 -0700 Subject: [PATCH 004/109] Update comments to match code s/get_vdev_spec/make_root_vdev The former doesn't exist anymore. Sponsored by: iXsystems, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Tom Caputi Signed-off-by: Ryan Moeller Closes #8759 --- cmd/zpool/zpool_main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 2cb6774b9adb..a3c76030d634 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -785,7 +785,7 @@ add_prop_list_default(const char *propname, char *propval, nvlist_t **props, * -P Display full path for vdev name. * * Adds the given vdevs to 'pool'. As with create, the bulk of this work is - * handled by get_vdev_spec(), which constructs the nvlist needed to pass to + * handled by make_root_vdev(), which constructs the nvlist needed to pass to * libzfs. 
*/ int @@ -883,7 +883,7 @@ zpool_do_add(int argc, char **argv) } } - /* pass off to get_vdev_spec for processing */ + /* pass off to make_root_vdev for processing */ nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { @@ -1232,9 +1232,9 @@ zpool_do_labelclear(int argc, char **argv) * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. The - * bulk of the vdev processing is done in get_vdev_spec() in zpool_vdev.c. Once - * we get the nvlist back from get_vdev_spec(), we either print out the contents - * (if '-n' was specified), or pass it to libzfs to do the creation. + * bulk of the vdev processing is done in make_root_vdev() in zpool_vdev.c. + * Once we get the nvlist back from make_root_vdev(), we either print out the + * contents (if '-n' was specified), or pass it to libzfs to do the creation. */ int zpool_do_create(int argc, char **argv) @@ -1388,7 +1388,7 @@ zpool_do_create(int argc, char **argv) goto errout; } - /* pass off to get_vdev_spec for bulk processing */ + /* pass off to make_root_vdev for bulk processing */ nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun, argc - 1, argv + 1); if (nvroot == NULL) From e4a11acfac078b21f1b84c95d8ddb7a99306eb34 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Wed, 29 May 2019 07:31:39 +0900 Subject: [PATCH 005/109] Refactor parent dataset handling in libzfs zfs_rename() For recursive renaming, simplify the code by moving `zhrp` and `parentname` to inner scope. `zhrp` is only used to test existence of a parent dataset for recursive dataset dir scan since ba6a24026c. 
Reviewed-by: Brian Behlendorf Reviewed-by: Richard Laager Reviewed-by: Giuseppe Di Natale Signed-off-by: Tomohiro Kusumi Closes #8815 --- lib/libzfs/libzfs_dataset.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index e26b32786db5..93af50b99cdd 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -4470,8 +4470,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, zfs_cmd_t zc = {"\0"}; char *delim; prop_changelist_t *cl = NULL; - zfs_handle_t *zhrp = NULL; - char *parentname = NULL; char parent[ZFS_MAX_DATASET_NAME_LEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[1024]; @@ -4566,7 +4564,8 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, } if (recursive) { - parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); + zfs_handle_t *zhrp; + char *parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); if (parentname == NULL) { ret = -1; goto error; @@ -4574,10 +4573,12 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, delim = strchr(parentname, '@'); *delim = '\0'; zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET); + free(parentname); if (zhrp == NULL) { ret = -1; goto error; } + zfs_close(zhrp); } else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) { if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, CL_GATHER_ITER_MOUNTED, @@ -4650,12 +4651,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, } error: - if (parentname != NULL) { - free(parentname); - } - if (zhrp != NULL) { - zfs_close(zhrp); - } if (cl != NULL) { changelist_free(cl); } From 6ce10fdabb0c071b1cf5d7c21564c076d9882ec9 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 28 May 2019 18:58:32 -0400 Subject: [PATCH 006/109] grammar: it is / plural agreement Reviewed-by: Richard Laager Reviewed-by: Matt Ahrens Reviewed-by: Chris Dunlop Signed-off-by: Josh Soref Closes #8818 --- cmd/zfs/zfs_main.c |
4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index d75f089acd1f..214a437c5dd1 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -6733,8 +6733,8 @@ unshare_unmount_compare(const void *larg, const void *rarg, void *unused) /* * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an - * absolute path, find the entry /proc/self/mounts, verify that its a - * ZFS filesystems, and unmount it appropriately. + * absolute path, find the entry /proc/self/mounts, verify that it's a + * ZFS filesystem, and unmount it appropriately. */ static int unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) From 328c95e391ed775ab781392ab57cb64200caa928 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Thu, 30 May 2019 08:18:14 +0900 Subject: [PATCH 007/109] Remove vn_set_fs_pwd()/vn_set_pwd() (no need to be at / during insmod) Per suggestion from @behlendorf in #8777, remove vn_set_fs_pwd() and vn_set_pwd() which are only used in zfs_ioctl.c:_init() while loading zfs.ko. The rest of initialization functions being called here after cwd set to / don't depend on cwd of the process except for spa_config_load(). spa_config_load() uses a relative path ".//etc/zfs/zpool.cache" when `rootdir` is non-NULL, which is "/etc/zfs/zpool.cache" given cwd is /, so just unconditionally use the absolute path without "./", so that `vn_set_pwd("/")` as well as the entire functions can be removed. This is also what FreeBSD does. 
Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Signed-off-by: Tomohiro Kusumi Closes #8826 --- config/kernel-spinlock.m4 | 24 ---------------- config/kernel.m4 | 1 - include/spl/sys/vnode.h | 1 - module/spl/spl-vnode.c | 58 --------------------------------------- module/zfs/spa_config.c | 3 +- module/zfs/zfs_ioctl.c | 7 ----- 6 files changed, 1 insertion(+), 93 deletions(-) delete mode 100644 config/kernel-spinlock.m4 diff --git a/config/kernel-spinlock.m4 b/config/kernel-spinlock.m4 deleted file mode 100644 index d6d6640070b5..000000000000 --- a/config/kernel-spinlock.m4 +++ /dev/null @@ -1,24 +0,0 @@ -dnl # -dnl # 2.6.36 API change, -dnl # The 'struct fs_struct->lock' was changed from a rwlock_t to -dnl # a spinlock_t to improve the fastpath performance. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_FS_STRUCT_SPINLOCK], [ - AC_MSG_CHECKING([whether struct fs_struct uses spinlock_t]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" - ZFS_LINUX_TRY_COMPILE([ - #include - #include - ],[ - static struct fs_struct fs; - spin_lock_init(&fs.lock); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_FS_STRUCT_SPINLOCK, 1, - [struct fs_struct uses spinlock_t]) - ],[ - AC_MSG_RESULT(no) - ]) - EXTRA_KCFLAGS="$tmp_flags" -]) diff --git a/config/kernel.m4 b/config/kernel.m4 index fbc04bdf7d70..8e89c8014d8a 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -12,7 +12,6 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ ZFS_AC_KERNEL_CTL_NAME ZFS_AC_KERNEL_PDE_DATA ZFS_AC_KERNEL_2ARGS_VFS_FSYNC - ZFS_AC_KERNEL_FS_STRUCT_SPINLOCK ZFS_AC_KERNEL_KUIDGID_T ZFS_AC_KERNEL_FALLOCATE ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE diff --git a/include/spl/sys/vnode.h b/include/spl/sys/vnode.h index 71278b08c867..7bd278e4e13b 100644 --- a/include/spl/sys/vnode.h +++ b/include/spl/sys/vnode.h @@ -182,7 +182,6 @@ extern int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag, extern file_t *vn_getf(int fd); extern void vn_releasef(int fd); extern void vn_areleasef(int fd, uf_info_t *fip); -extern 
int vn_set_pwd(const char *filename); int spl_vn_init(void); void spl_vn_fini(void); diff --git a/module/spl/spl-vnode.c b/module/spl/spl-vnode.c index 11b5e4e5a2f2..d9056c964e5a 100644 --- a/module/spl/spl-vnode.c +++ b/module/spl/spl-vnode.c @@ -641,64 +641,6 @@ vn_areleasef(int fd, uf_info_t *fip) } /* releasef() */ EXPORT_SYMBOL(areleasef); - -static void -vn_set_fs_pwd(struct fs_struct *fs, struct path *path) -{ - struct path old_pwd; - -#ifdef HAVE_FS_STRUCT_SPINLOCK - spin_lock(&fs->lock); - old_pwd = fs->pwd; - fs->pwd = *path; - path_get(path); - spin_unlock(&fs->lock); -#else - write_lock(&fs->lock); - old_pwd = fs->pwd; - fs->pwd = *path; - path_get(path); - write_unlock(&fs->lock); -#endif /* HAVE_FS_STRUCT_SPINLOCK */ - - if (old_pwd.dentry) - path_put(&old_pwd); -} - -int -vn_set_pwd(const char *filename) -{ - struct path path; - mm_segment_t saved_fs; - int rc; - - /* - * user_path_dir() and __user_walk() both expect 'filename' to be - * a user space address so we must briefly increase the data segment - * size to ensure strncpy_from_user() does not fail with -EFAULT. - */ - saved_fs = get_fs(); - set_fs(KERNEL_DS); - - rc = user_path_dir(filename, &path); - if (rc) - goto out; - - rc = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); - if (rc) - goto dput_and_out; - - vn_set_fs_pwd(current->fs, &path); - -dput_and_out: - path_put(&path); -out: - set_fs(saved_fs); - - return (-rc); -} /* vn_set_pwd() */ -EXPORT_SYMBOL(vn_set_pwd); - static int vn_cache_constructor(void *buf, void *cdrarg, int kmflags) { diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 8616abda37bd..6c0894338e25 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -93,8 +93,7 @@ spa_config_load(void) */ pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) snprintf(pathname, MAXPATHLEN, "%s%s", - (rootdir != NULL) ? 
"./" : "", spa_config_path); + (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); file = kobj_open_file(pathname); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index f30d0a894414..c6b55d24f7ef 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -7380,13 +7380,6 @@ _init(void) { int error; - error = -vn_set_pwd("/"); - if (error) { - printk(KERN_NOTICE - "ZFS: Warning unable to set pwd to '/': %d\n", error); - return (error); - } - if ((error = -zvol_init()) != 0) return (error); From fafe72712afbbedd9bcf6cd4b3d7b2b2f168b054 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Thu, 6 Jun 2019 06:18:46 +0900 Subject: [PATCH 008/109] Drop objid argument in zfs_znode_alloc() (sync with OpenZFS) Since zfs_znode_alloc() already takes dmu_buf_t*, taking another uint64_t argument for objid is redundant. inode's ->i_ino does and needs to match znode's ->z_id. zfs_znode_alloc() in FreeBSD and illumos doesn't have this argument since vnode doesn't have vnode# in VFS (hence ->z_id exists). 
Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Signed-off-by: Tomohiro Kusumi Closes #8841 --- module/zfs/zfs_znode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index a27129b7992b..3dd299942202 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -515,7 +515,7 @@ zfs_inode_update(znode_t *zp) */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, - dmu_object_type_t obj_type, uint64_t obj, sa_handle_t *hdl) + dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; struct inode *ip; @@ -596,7 +596,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ZFS_TIME_DECODE(&ip->i_mtime, mtime); ZFS_TIME_DECODE(&ip->i_ctime, ctime); - ip->i_ino = obj; + ip->i_ino = zp->z_id; zfs_inode_update(zp); zfs_inode_set_ops(zfsvfs, ip); @@ -910,8 +910,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, * not fail retry until sufficient memory has been reclaimed. */ do { - *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, obj, - sa_hdl); + *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); } while (*zpp == NULL); VERIFY(*zpp != NULL); @@ -1134,7 +1133,7 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) * bonus buffer. */ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, - doi.doi_bonus_type, obj_num, NULL); + doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { From b63ed49c2996d3fe400ddd5e032a521cf05a7d10 Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Thu, 6 Jun 2019 13:08:41 -0700 Subject: [PATCH 009/109] Reduced IOPS when all vdevs are in the zfs_mg_fragmentation_threshold Historically while doing performance testing we've noticed that IOPS can be significantly reduced when all vdevs in the pool are hitting the zfs_mg_fragmentation_threshold percentage. 
Specifically in a hypothetical pool with two vdevs, what can happen is the following: Vdev A would go above that threshold and only vdev B would be used. Then vdev B would pass that threshold but vdev A would go below it (we've been freeing from A to allocate to B). The allocations would go back and forth utilizing one vdev at a time with IOPS taking a hit. Empirically, we've seen that our vdev selection for allocations is good enough that fragmentation increases uniformly across all vdevs the majority of the time. Thus we set the threshold percentage high enough to avoid hitting the speed bump on pools that are being pushed to the edge. We effectively disable its effect in the majority of the cases but we don't remove it (at least for now) just in case we hit any weird behavior in the future. Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Reviewed-by: Matt Ahrens Signed-off-by: Serapheim Dimitropoulos Closes #8859 --- man/man5/zfs-module-parameters.5 | 2 +- module/zfs/metaslab.c | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 5bca12e06ea2..282563f13723 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1817,7 +1817,7 @@ this value. If a metaslab group exceeds this threshold then it will be skipped unless all metaslab groups within the metaslab class have also crossed this threshold. .sp -Default value: \fB85\fR. +Default value: \fB95\fR. .RE .sp diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index ec89810b48ab..d1d5a243f403 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -103,12 +103,27 @@ int zfs_mg_noalloc_threshold = 0; /* * Metaslab groups are considered eligible for allocations if their - * fragmenation metric (measured as a percentage) is less than or equal to - * zfs_mg_fragmentation_threshold.
If a metaslab group exceeds this threshold - * then it will be skipped unless all metaslab groups within the metaslab - * class have also crossed this threshold. + * fragmenation metric (measured as a percentage) is less than or + * equal to zfs_mg_fragmentation_threshold. If a metaslab group + * exceeds this threshold then it will be skipped unless all metaslab + * groups within the metaslab class have also crossed this threshold. + * + * This tunable was introduced to avoid edge cases where we continue + * allocating from very fragmented disks in our pool while other, less + * fragmented disks, exists. On the other hand, if all disks in the + * pool are uniformly approaching the threshold, the threshold can + * be a speed bump in performance, where we keep switching the disks + * that we allocate from (e.g. we allocate some segments from disk A + * making it bypassing the threshold while freeing segments from disk + * B getting its fragmentation below the threshold). + * + * Empirically, we've seen that our vdev selection for allocations is + * good enough that fragmentation increases uniformly across all vdevs + * the majority of the time. Thus we set the threshold percentage high + * enough to avoid hitting the speed bump on pools that are being pushed + * to the edge. 
*/ -int zfs_mg_fragmentation_threshold = 85; +int zfs_mg_fragmentation_threshold = 95; /* * Allow metaslabs to keep their active state as long as their fragmentation From 60cbc18136d8a5c389ec3e6f3da703f30b9687be Mon Sep 17 00:00:00 2001 From: Allan Jude Date: Thu, 6 Jun 2019 16:14:48 -0400 Subject: [PATCH 010/109] l2arc_apply_transforms: Fix typo in comment Reviewed-by: Chris Dunlop Reviewed-by: Matt Ahrens Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Reviewed-by: Richard Laager Signed-off-by: Allan Jude Closes #8822 --- module/zfs/arc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 3dfa6ca202d1..946ea3415eda 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8760,7 +8760,7 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, /* * If this data simply needs its own buffer, we simply allocate it - * and copy the data. This may be done to elimiate a depedency on a + * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ if (HDR_HAS_RABD(hdr) && asize != psize) { From 06900c409ba9dd62ace0fec5aa0558ca4f115f18 Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Fri, 7 Jun 2019 11:01:41 +0900 Subject: [PATCH 011/109] Avoid updating zfs_gitrev.h when rev is unchanged Build process would always re-compile spa_history.c due to touching zfs_gitrev.h - avoid if no change in gitrev. 
Reviewed-by: Brian Behlendorf Reviewed-by: Chris Dunlop Reviewed-by: Allan Jude Signed-off-by: Jorgen Lundman Closes #8860 --- scripts/make_gitrev.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/make_gitrev.sh b/scripts/make_gitrev.sh index bab9be88d734..1cf143794b26 100755 --- a/scripts/make_gitrev.sh +++ b/scripts/make_gitrev.sh @@ -39,3 +39,7 @@ trap cleanup EXIT git rev-parse --git-dir > /dev/null 2>&1 # Get the git current git revision ZFS_GIT_REV=$(git describe --always --long --dirty 2>/dev/null) +# Check if header file already contain the exact string +grep -sq "\"${ZFS_GIT_REV}\"" "$(dirname "$0")"/../include/zfs_gitrev.h && + trap - EXIT +exit 0 From 6f7bc7582539048c2280b7d7892a06e4c7f917f8 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 6 Jun 2019 19:10:43 -0700 Subject: [PATCH 012/109] Allow metaslab to be unloaded even when not freed from On large systems, the memory used by loaded metaslabs can become a concern. While range trees are a fairly efficient data structure, on heavily fragmented pools they can still consume a significant amount of memory. This problem is amplified when we fail to unload metaslabs that we aren't using. Currently, we only unload a metaslab during metaslab_sync_done; in order for that function to be called on a given metaslab in a given txg, we have to have dirtied that metaslab in that txg. If the dirtying was the result of an allocation, we wouldn't be unloading it (since it wouldn't be 8 txgs since it was selected), so in effect we only unload a metaslab during txgs where it's being freed from. We move the unload logic from sync_done to a new function, and call that function on all metaslabs in a given vdev during vdev_sync_done(). 
Reviewed-by: Richard Elling Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Closes #8837 --- include/sys/metaslab.h | 1 + module/zfs/metaslab.c | 47 ++++++++++++++++++++++-------------------- module/zfs/vdev.c | 14 +++++++++++++ 3 files changed, 40 insertions(+), 22 deletions(-) diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 2790d06c71d2..330902529664 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -50,6 +50,7 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, void metaslab_fini(metaslab_t *); int metaslab_load(metaslab_t *); +void metaslab_potentially_unload(metaslab_t *, uint64_t); void metaslab_unload(metaslab_t *); uint64_t metaslab_allocated_space(metaslab_t *); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index d1d5a243f403..41cbaad5f8df 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ @@ -2949,6 +2949,30 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) dmu_tx_commit(tx); } +void +metaslab_potentially_unload(metaslab_t *msp, uint64_t txg) +{ + /* + * If the metaslab is loaded and we've not tried to load or allocate + * from it in 'metaslab_unload_delay' txgs, then unload it. 
+ */ + if (msp->ms_loaded && + msp->ms_disabled == 0 && + msp->ms_selected_txg + metaslab_unload_delay < txg) { + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + VERIFY0(range_tree_space( + msp->ms_allocating[(txg + t) & TXG_MASK])); + } + if (msp->ms_allocator != -1) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } + + if (!metaslab_debug_unload) + metaslab_unload(msp); + } +} + /* * Called after a transaction group has completely synced to mark * all of the metaslab's free space as usable. @@ -3086,27 +3110,6 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) */ metaslab_recalculate_weight_and_sort(msp); - /* - * If the metaslab is loaded and we've not tried to load or allocate - * from it in 'metaslab_unload_delay' txgs, then unload it. - */ - if (msp->ms_loaded && - msp->ms_disabled == 0 && - msp->ms_selected_txg + metaslab_unload_delay < txg) { - - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { - VERIFY0(range_tree_space( - msp->ms_allocating[(txg + t) & TXG_MASK])); - } - if (msp->ms_allocator != -1) { - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); - } - - if (!metaslab_debug_unload) - metaslab_unload(msp); - } - ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_freed)); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 1c4812cd86d9..81ef87e254a8 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -3234,6 +3234,20 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) != NULL) metaslab_sync_done(msp, txg); + /* + * Because this function is only called on dirty vdevs, it's possible + * we won't consider all metaslabs for unloading on every + * txg. However, unless the system is largely idle it is likely that + * we will dirty all vdevs within a few txgs. 
+ */ + for (int i = 0; i < vd->vdev_ms_count; i++) { + msp = vd->vdev_ms[i]; + mutex_enter(&msp->ms_lock); + if (msp->ms_sm != NULL) + metaslab_potentially_unload(msp, txg); + mutex_exit(&msp->ms_lock); + } + if (reassess) metaslab_sync_reassess(vd->vdev_mg); } From c350e62309edc413f9f2312338e5a0b084ebeb8d Mon Sep 17 00:00:00 2001 From: Richard Elling Date: Wed, 5 Jun 2019 16:13:57 -0700 Subject: [PATCH 013/109] Fix logic error in setpartition function Reviewed by: John Kennedy Reviewed-by: Giuseppe Di Natale Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Richard Elling Closes #8839 --- tests/zfs-tests/include/libtest.shlib | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 57d0880cc9bb..b3893c2c3812 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -861,7 +861,8 @@ function zero_partitions # # best to retire this interface and replace it with something more flexible. # At the moment a best effort is made. # -function set_partition # +# arguments: +function set_partition { typeset -i slicenum=$1 typeset start=$2 @@ -872,6 +873,7 @@ function set_partition # /dev/null + parted $disk -s -- print 1 >/dev/null typeset ret_val=$? if [[ $slicenum -eq 0 || $ret_val -ne 0 ]]; then - parted $DEV_DSKDIR/$disk -s -- mklabel gpt + parted $disk -s -- mklabel gpt if [[ $? -ne 0 ]]; then log_note "Failed to create GPT partition table on $disk" return 1 @@ -899,20 +901,21 @@ function set_partition # /dev/null - block_device_wait + blockdev --rereadpt $disk 2>/dev/null + block_device_wait $disk else if [[ -z $slicenum || -z $size || -z $disk ]]; then log_fail "The slice, size or disk name is unspecified." @@ -932,9 +935,10 @@ function set_partition # > $format_file format -e -s -d $disk -f $format_file + typeset ret_val=$? + rm -f $format_file fi - typeset ret_val=$? 
rm -f $format_file if [[ $ret_val -ne 0 ]]; then log_note "Unable to format $disk slice $slicenum to $size" From a22b00f92480b7341859266176b23c4a801e462b Mon Sep 17 00:00:00 2001 From: Richard Elling Date: Wed, 5 Jun 2019 16:22:04 -0700 Subject: [PATCH 014/109] Remove redundant redundant remove Reviewed by: John Kennedy Reviewed-by: Giuseppe Di Natale Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Richard Elling Closes #8839 --- tests/zfs-tests/include/libtest.shlib | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index b3893c2c3812..1b841d7ba02c 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -939,7 +939,6 @@ function set_partition rm -f $format_file fi - rm -f $format_file if [[ $ret_val -ne 0 ]]; then log_note "Unable to format $disk slice $slicenum to $size" return 1 From fb52bf9b1daf237e23e49a6ba43eb9d3e300f758 Mon Sep 17 00:00:00 2001 From: Richard Elling Date: Fri, 7 Jun 2019 10:12:42 -0700 Subject: [PATCH 015/109] Block_device_wait does not return an error code Reviewed by: John Kennedy Reviewed-by: Giuseppe Di Natale Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Richard Elling Closes #8839 --- tests/zfs-tests/include/blkdev.shlib | 3 +++ .../tests/functional/rsend/send-wDR_encrypted_zvol.ksh | 4 ++-- tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh | 4 ++-- .../zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh | 2 +- .../zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh | 4 ++-- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/zfs-tests/include/blkdev.shlib b/tests/zfs-tests/include/blkdev.shlib index 9cac7184f9fc..e9d584af4b6a 100644 --- a/tests/zfs-tests/include/blkdev.shlib +++ b/tests/zfs-tests/include/blkdev.shlib @@ -56,6 +56,9 @@ function scan_scsi_hosts # # Wait for newly created block devices to have their minors created. 
# +# Note: there is no meaningful return code if udevadm fails. Consumers +# should not expect a return code (do not call as argument to log_must) +# function block_device_wait { if is_linux; then diff --git a/tests/zfs-tests/tests/functional/rsend/send-wDR_encrypted_zvol.ksh b/tests/zfs-tests/tests/functional/rsend/send-wDR_encrypted_zvol.ksh index 49b846e9c332..443887bfa238 100755 --- a/tests/zfs-tests/tests/functional/rsend/send-wDR_encrypted_zvol.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send-wDR_encrypted_zvol.ksh @@ -62,7 +62,7 @@ log_must eval "echo 'password' > $keyfile" log_must zfs create -o dedup=on -o encryption=on -o keyformat=passphrase \ -o keylocation=file://$keyfile -V 128M $TESTPOOL/$TESTVOL -log_must block_device_wait +block_device_wait log_must eval "echo 'y' | newfs -t ext4 -v $zdev" log_must mkdir -p $mntpnt @@ -82,7 +82,7 @@ done log_must eval "zfs send -wDR $TESTPOOL/$TESTVOL@snap$snap_count > $sendfile" log_must eval "zfs recv $TESTPOOL/recv < $sendfile" log_must zfs load-key $TESTPOOL/recv -log_must block_device_wait +block_device_wait log_must mount $recvdev $recvmnt diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh index 2cdcb38dc257..c8a3cbbf43c4 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh @@ -86,7 +86,7 @@ log_must zfs create -V 128M $TESTPOOL/$TESTVOL log_must zfs set compression=on $TESTPOOL/$TESTVOL log_must zfs set sync=always $TESTPOOL/$TESTVOL log_must mkdir -p $TESTDIR -log_must block_device_wait +block_device_wait echo "y" | newfs -t ext4 -v $VOLUME log_must mkdir -p $MNTPNT log_must mount -o discard $VOLUME $MNTPNT @@ -149,7 +149,7 @@ log_must zpool export $TESTPOOL # `zpool import -f` because we can't write a frozen pool's labels! 
# log_must zpool import -f $TESTPOOL -log_must block_device_wait +block_device_wait log_must mount $VOLUME $MNTPNT # diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh index 6607d4ca4974..1ee7e33c2ac2 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh @@ -88,7 +88,7 @@ else fi log_must zfs snapshot -r $snappool -log_must block_device_wait +block_device_wait #verify the snapshot -r results for snap in $snappool $snapfs $snapvol $snapctr $snapctrvol \ diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh index 0f876ad6d61e..128b443c6fc9 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh @@ -83,7 +83,7 @@ else fi log_must zfs snapshot -r $snappool -log_must block_device_wait +block_device_wait #select the $TESTCTR as destroy point, $TESTCTR is a child of $TESTPOOL log_must zfs destroy -r $snapctr @@ -92,7 +92,7 @@ for snap in $snapctr $snapctrvol $snapctrclone $snapctrfs; do log_fail "The snapshot $snap is not destroyed correctly." done -for snap in $snappool $snapfs $snapvol $ctrfs@$TESTSNAP1;do +for snap in $snappool $snapfs $snapvol $ctrfs@$TESTSNAP1; do ! snapexists $snap && \ log_fail "The snapshot $snap should be not destroyed." done From 4be4dedb9f50edb35b18db4eef5c277bd93d23fa Mon Sep 17 00:00:00 2001 From: Richard Elling Date: Thu, 30 May 2019 16:38:51 -0700 Subject: [PATCH 016/109] Improve ZTS block_device_wait debugging The udevadm settle timeout can be 120 or 180 seconds by default for some distributions. If a long delay is experienced, it could be due to some strangeness in a malfunctioning device that isn't related to the devices under test. 
To help debug this condition, a notice is given if settle takes too long. Arguments can now be passed to block_device_wait. The expected arguments are block device pathnames. Reviewed by: John Kennedy Reviewed-by: Giuseppe Di Natale Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Richard Elling Closes #8839 --- tests/zfs-tests/include/blkdev.shlib | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/zfs-tests/include/blkdev.shlib b/tests/zfs-tests/include/blkdev.shlib index e9d584af4b6a..ca8807e82c6a 100644 --- a/tests/zfs-tests/include/blkdev.shlib +++ b/tests/zfs-tests/include/blkdev.shlib @@ -18,6 +18,7 @@ # Copyright (c) 2017 Lawrence Livermore National Security, LLC. # Copyright (c) 2017 Datto Inc. # Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +# Copyright 2019 Richard Elling # # @@ -55,6 +56,16 @@ function scan_scsi_hosts # # Wait for newly created block devices to have their minors created. +# Additional arguments can be passed to udevadm trigger, with the expected +# arguments to typically be a block device pathname. This is useful when +# checking waiting on a specific device to settle rather than triggering +# all devices and waiting for them all to settle. +# +# The udevadm settle timeout can be 120 or 180 seconds by default for +# some distros. If a long delay is experienced, it could be due to some +# strangeness in a malfunctioning device that isn't related to the devices +# under test. To help debug this condition, a notice is given if settle takes +# too long. # # Note: there is no meaningful return code if udevadm fails. 
Consumers # should not expect a return code (do not call as argument to log_must) @@ -62,8 +73,12 @@ function scan_scsi_hosts function block_device_wait { if is_linux; then - udevadm trigger + udevadm trigger $* + typeset local start=$SECONDS udevadm settle + typeset local elapsed=$((SECONDS - start)) + [[ $elapsed > 60 ]] && \ + log_note udevadm settle time too long: $elapsed fi } From fe11968bbfb6bd825790a51228483f51b3d30d1f Mon Sep 17 00:00:00 2001 From: bnjf Date: Thu, 13 Jun 2019 06:03:33 +1000 Subject: [PATCH 017/109] Fix typo in vdev_raidz_math.c Fix typo in vdev_raidz_math.c Reviewed by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Brad Forschinger Closes #8875 Closes #8880 --- module/zfs/vdev_raidz_math.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c index e6112bc02137..3ef67768f916 100644 --- a/module/zfs/vdev_raidz_math.c +++ b/module/zfs/vdev_raidz_math.c @@ -472,7 +472,7 @@ vdev_raidz_math_init(void) return; #endif - /* Fake an zio and run the benchmark on a warmed up buffer */ + /* Fake a zio and run the benchmark on a warmed up buffer */ bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); bench_zio->io_offset = 0; bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ From 812c36fc711b5f1dc7b41f27761b5e283f16df19 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Wed, 12 Jun 2019 13:06:55 -0700 Subject: [PATCH 018/109] Target ARC size can get reduced to arc_c_min Sometimes the target ARC size is reduced to arc_c_min, which impacts performance. We've seen this happen as part of the random_reads performance regression test, where the ARC size is reduced before the reads test starts which impacts how long it takes for system to reach good IOPS performance. We call arc_reduce_target_size when arc_reap_cb_check() returns TRUE, and arc_available_memory() is less than arc_c>>arc_shrink_shift. 
However, arc_available_memory() could easily be low, even when arc_c is low, because we can have tons of unused bufs in the abd kmem cache. This would be especially true just after the DMU requests a bunch of stuff be evicted from the ARC (e.g. due to "zpool export"). To fix this, the ARC should reduce arc_c by the requested amount, not all the way down to arc_size (or arc_c_min), which can be very small. Reviewed-by: Tim Chase Reviewed by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Matthew Ahrens External-issue: DLPX-59431 Closes #8864 --- module/zfs/arc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 946ea3415eda..a7e7d26996f8 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4801,8 +4801,6 @@ arc_reduce_target_size(int64_t to_free) if (c > to_free && c - to_free > arc_c_min) { arc_c = c - to_free; atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); - if (asize < arc_c) - arc_c = MAX(asize, arc_c_min); if (arc_p > arc_c) arc_p = (arc_c >> 1); ASSERT(arc_c >= arc_c_min); From 516a08ebb4e24e09fc9ec39a7204d2f9d20d043d Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Wed, 12 Jun 2019 13:13:09 -0700 Subject: [PATCH 019/109] fat zap should prefetch when iterating When iterating over a ZAP object, we're almost always certain to iterate over the entire object. If there are multiple leaf blocks, we can realize a performance win by issuing reads for all the leaf blocks in parallel when the iteration begins. For example, if we have 10,000 snapshots, "zfs destroy -nv pool/fs@1%9999" can take 30 minutes when the cache is cold. This change provides a >3x performance improvement, by issuing the reads for all ~64 blocks of each ZAP object in parallel. 
Reviewed-by: Andreas Dilger Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens External-issue: DLPX-58347 Closes #8862 --- include/sys/zap.h | 7 ++-- man/man5/zfs-module-parameters.5 | 25 ++++++++++++++ module/zfs/ddt_zap.c | 14 +++++++- module/zfs/dmu.c | 16 +++++++++ module/zfs/zap.c | 56 +++++++++++++++++++++++++++++++- module/zfs/zap_micro.c | 31 +++++++++++++++--- 6 files changed, 140 insertions(+), 9 deletions(-) diff --git a/include/sys/zap.h b/include/sys/zap.h index ab13652d8c07..b19b4643879c 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ @@ -350,6 +350,7 @@ typedef struct zap_cursor { uint64_t zc_serialized; uint64_t zc_hash; uint32_t zc_cd; + boolean_t zc_prefetch; } zap_cursor_t; typedef struct { @@ -375,7 +376,9 @@ typedef struct { * Initialize a zap cursor, pointing to the "first" attribute of the * zapobj. You must _fini the cursor when you are done with it. */ -void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); +void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj); +void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, + uint64_t zapobj); void zap_cursor_fini(zap_cursor_t *zc); /* diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 282563f13723..29374a9d3965 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -104,6 +104,18 @@ to a log2 fraction of the target arc size. Default value: \fB6\fR. .RE +.sp +.ne 2 +.na +\fBdmu_prefetch_max\fR (int) +.ad +.RS 12n +Limit the amount we can prefetch with one call to this amount (in bytes). +This helps to limit the amount of memory that can be used by prefetching. +.sp +Default value: \fB134,217,728\fR (128MB). 
+.RE + .sp .ne 2 .na @@ -502,6 +514,19 @@ regular reads (but there's no reason it has to be the same). Default value: \fB32,768\fR. .RE +.sp +.ne 2 +.na +\fBzap_iterate_prefetch\fR (int) +.ad +.RS 12n +If this is set, when we start iterating over a ZAP object, zfs will prefetch +the entire object (all leaf blocks). However, this is limited by +\fBdmu_prefetch_max\fR. +.sp +Use \fB1\fR for on (default) and \fB0\fR for off. +.RE + .sp .ne 2 .na diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 77c0784cca0b..3489d31d9c9e 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018 by Delphix. All rights reserved. */ #include @@ -117,7 +118,18 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) zap_attribute_t za; int error; - zap_cursor_init_serialized(&zc, os, object, *walk); + if (*walk == 0) { + /* + * We don't want to prefetch the entire ZAP object, because + * it can be enormous. Also the primary use of DDT iteration + * is for scrubbing, in which case we will be issuing many + * scrub I/Os for each ZAP block that we read in, so + * reading the ZAP is unlikely to be the bottleneck. + */ + zap_cursor_init_noprefetch(&zc, os, object); + } else { + zap_cursor_init_serialized(&zc, os, object, *walk); + } if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { uchar_t cbuf[sizeof (dde->dde_phys) + 1]; uint64_t csize = za.za_num_integers; diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 2d6740576bb6..b4131d91781a 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -81,6 +81,13 @@ int zfs_dmu_offset_next_sync = 0; */ int zfs_object_remap_one_indirect_delay_ms = 0; +/* + * Limit the amount we can prefetch with one call to this amount. This + * helps to limit the amount of memory that can be used by prefetching. + * Larger objects should be prefetched a bit at a time. 
+ */ +int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; + const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, @@ -667,6 +674,11 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, return; } + /* + * See comment before the definition of dmu_prefetch_max. + */ + len = MIN(len, dmu_prefetch_max); + /* * XXX - Note, if the dnode for the requested object is not * already cached, we will do a *synchronous* read in the @@ -2629,6 +2641,10 @@ module_param(zfs_dmu_offset_next_sync, int, 0644); MODULE_PARM_DESC(zfs_dmu_offset_next_sync, "Enable forcing txg sync to find holes"); +module_param(dmu_prefetch_max, int, 0644); +MODULE_PARM_DESC(dmu_prefetch_max, + "Limit one prefetch call to this size"); + /* END CSTYLED */ #endif diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 6d8c498042c9..30f62ac43b62 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -49,6 +49,36 @@ #include #include +/* + * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object + * (all leaf blocks) when we start iterating over it. + * + * For zap_cursor_init(), the callers all intend to iterate through all the + * entries. There are a few cases where an error (typically i/o error) could + * cause it to bail out early. + * + * For zap_cursor_init_serialized(), there are callers that do the iteration + * outside of ZFS. Typically they would iterate over everything, but we + * don't have control of that. E.g. 
zfs_ioc_snapshot_list_next(), + * zcp_snapshots_iter(), and other iterators over things in the MOS - these + * are called by /sbin/zfs and channel programs. The other example is + * zfs_readdir() which iterates over directory entries for the getdents() + * syscall. /sbin/ls iterates to the end (unless it receives a signal), but + * userland doesn't have to. + * + * Given that the ZAP entries aren't returned in a specific order, the only + * legitimate use cases for partial iteration would be: + * + * 1. Pagination: e.g. you only want to display 100 entries at a time, so you + * get the first 100 and then wait for the user to hit "next page", which + * they may never do). + * + * 2. You want to know if there are more than X entries, without relying on + * the zfs-specific implementation of the directory's st_size (which is + * the number of entries). + */ +int zap_iterate_prefetch = B_TRUE; + int fzap_default_block_shift = 14; /* 16k blocksize */ extern inline zap_phys_t *zap_f_phys(zap_t *zap); @@ -1189,6 +1219,21 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) /* retrieve the next entry at or after zc_hash/zc_cd */ /* if no entry, return ENOENT */ + /* + * If we are reading from the beginning, we're almost certain to + * iterate over the entire ZAP object. If there are multiple leaf + * blocks (freeblk > 2), prefetch the whole object (up to + * dmu_prefetch_max bytes), so that we read the leaf blocks + * concurrently. (Unless noprefetch was requested via + * zap_cursor_init_noprefetch()). 
+ */ + if (zc->zc_hash == 0 && zap_iterate_prefetch && + zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { + dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, + zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), + ZIO_PRIORITY_ASYNC_READ); + } + if (zc->zc_leaf && (ZAP_HASH_IDX(zc->zc_hash, zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != @@ -1333,3 +1378,12 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } } } + +#if defined(_KERNEL) +/* BEGIN CSTYLED */ +module_param(zap_iterate_prefetch, int, 0644); +MODULE_PARM_DESC(zap_iterate_prefetch, + "When iterating ZAP object, prefetch it"); + +/* END CSTYLED */ +#endif diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index fa369f797548..467812ff637c 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ @@ -1472,9 +1472,9 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, * Routines for iterating over the attributes. */ -void -zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, - uint64_t serialized) +static void +zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized, boolean_t prefetch) { zc->zc_objset = os; zc->zc_zap = NULL; @@ -1483,12 +1483,33 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, zc->zc_serialized = serialized; zc->zc_hash = 0; zc->zc_cd = 0; + zc->zc_prefetch = prefetch; +} +void +zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized) +{ + zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); } +/* + * Initialize a cursor at the beginning of the ZAP object. 
The entire + * ZAP object will be prefetched. + */ void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { - zap_cursor_init_serialized(zc, os, zapobj, 0); + zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); +} + +/* + * Initialize a cursor at the beginning, but request that we not prefetch + * the entire ZAP object. + */ +void +zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) +{ + zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); } void From 4f809bddc67b152afd9e9a52a01d1af132151a9f Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Thu, 13 Jun 2019 09:15:06 +0900 Subject: [PATCH 020/109] Fix lockdep warning on insmod sysfs_attr_init() is required to make lockdep happy for dynamically allocated sysfs attributes. This fixed #8868 on Fedora 29 running kernel-debug. This requirement was introduced in 2.6.34. See include/linux/sysfs.h for what it actually does. Reviewed-by: Brian Behlendorf Reviewed-by: Olaf Faaland Signed-off-by: Tomohiro Kusumi Closes #8868 Closes #8884 --- module/zfs/zfs_sysfs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/module/zfs/zfs_sysfs.c b/module/zfs/zfs_sysfs.c index 30b5edb01e18..2f5bea9aa996 100644 --- a/module/zfs/zfs_sysfs.c +++ b/module/zfs/zfs_sysfs.c @@ -144,6 +144,10 @@ zfs_kobj_release(struct kobject *kobj) zkobj->zko_attr_count = 0; } +#ifndef sysfs_attr_init +#define sysfs_attr_init(attr) do {} while (0) +#endif + static void zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name) { @@ -154,6 +158,7 @@ zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name) zkobj->zko_attr_list[attr_num].name = attr_name; zkobj->zko_attr_list[attr_num].mode = 0444; zkobj->zko_default_attrs[attr_num] = &zkobj->zko_attr_list[attr_num]; + sysfs_attr_init(&zkobj->zko_attr_list[attr_num]); } static int From 77e64c6fffa2af6c3b8aeb8b486873a3fca91e53 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 13 Jun 2019 08:48:43 -0700 Subject: [PATCH 021/109] 
ztest: dmu_tx_assign() gets ENOSPC in spa_vdev_remove_thread() When running zloop, we occasionally see the following crash: dmu_tx_assign(tx, TXG_WAIT) == 0 (0x1c == 0) ASSERT at ../../module/zfs/vdev_removal.c:1507:spa_vdev_remove_thread()/sbin/ztest(+0x89c3)[0x55faf567b9c3] The error value 0x1c is ENOSPC. The transaction used by spa_vdev_remove_thread() should not be able to fail due to being out of space. i.e. we should not call dmu_tx_hold_space(). This will allow the removal thread to schedule its work even when the pool is low on space. The "slop space" will provide enough free space to sync out the txg. Reviewed-by: Igor Kozhukhov Reviewed-by: Paul Dagnelie Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens External-issue: DLPX-37853 Closes #8889 --- module/zfs/vdev_removal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index f2d18d9257bd..536a982eca2b 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1498,7 +1498,7 @@ spa_vdev_remove_thread(void *arg) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - dmu_tx_hold_space(tx, SPA_MAXBLOCKSIZE); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); From 19cebf05187d60605ae38ddef9cdf7b10a51deba Mon Sep 17 00:00:00 2001 From: Tulsi Jain Date: Thu, 13 Jun 2019 08:56:15 -0700 Subject: [PATCH 022/109] Restrict filesystem creation if name referred either '.' or '..' This change restricts filesystem creation if the given name contains either '.' or '..' 
Reviewed-by: Matt Ahrens Reviewed-by: Brian Behlendorf Reviewed-by: Richard Elling Signed-off-by: TulsiJain Closes #8842 Closes #8564 --- include/zfs_namecheck.h | 2 ++ lib/libzfs/libzfs_dataset.c | 10 +++++++++ module/zcommon/zfs_namecheck.c | 21 +++++++++++++++++++ .../zfs_create/zfs_create_009_neg.ksh | 4 +++- 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/include/zfs_namecheck.h b/include/zfs_namecheck.h index 527db92b0cfa..56d3d36f026e 100644 --- a/include/zfs_namecheck.h +++ b/include/zfs_namecheck.h @@ -43,6 +43,8 @@ typedef enum { NAME_ERR_RESERVED, /* entire name is reserved */ NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */ NAME_ERR_TOOLONG, /* name is too long */ + NAME_ERR_SELF_REF, /* reserved self path name ('.') */ + NAME_ERR_PARENT_REF, /* reserved parent path name ('..') */ NAME_ERR_NO_AT, /* permission set is missing '@' */ } namecheck_err_t; diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 93af50b99cdd..3be205f1f437 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -197,6 +197,16 @@ zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, "reserved disk name")); break; + case NAME_ERR_SELF_REF: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "self reference, '.' is found in name")); + break; + + case NAME_ERR_PARENT_REF: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "parent reference, '..' is found in name")); + break; + default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index 58b23b0e00b0..b1e0de6d8181 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -232,6 +232,27 @@ entity_namecheck(const char *path, namecheck_err_t *why, char *what) } } + if (*end == '\0' || *end == '/') { + int component_length = end - start; + /* Validate the contents of this component is not '.' 
*/ + if (component_length == 1) { + if (start[0] == '.') { + if (why) + *why = NAME_ERR_SELF_REF; + return (-1); + } + } + + /* Validate the content of this component is not '..' */ + if (component_length == 2) { + if (start[0] == '.' && start[1] == '.') { + if (why) + *why = NAME_ERR_PARENT_REF; + return (-1); + } + } + } + /* Snapshot or bookmark delimiter found */ if (*end == '@' || *end == '#') { /* Multiple delimiters are not allowed */ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh index b8190626c7b3..63f5e595ea38 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh @@ -90,7 +90,9 @@ set -A args "$TESTPOOL/" "$TESTPOOL//blah" "$TESTPOOL/@blah" \ "$TESTPOOL/blah*blah" "$TESTPOOL/blah blah" \ "-s $TESTPOOL/$TESTFS1" "-b 1092 $TESTPOOL/$TESTFS1" \ "-b 64k $TESTPOOL/$TESTFS1" "-s -b 32k $TESTPOOL/$TESTFS1" \ - "$TESTPOOL/$BYND_MAX_NAME" "$TESTPOOL/$BYND_NEST_LIMIT" + "$TESTPOOL/$BYND_MAX_NAME" "$TESTPOOL/$BYND_NEST_LIMIT" \ + "$TESTPOOL/." "$TESTPOOL/.." "$TESTPOOL/../blah" "$TESTPOOL/./blah" \ + "$TESTPOOL/blah/./blah" "$TESTPOOL/blah/../blah" log_assert "Verify 'zfs create ' fails with bad argument." From cab7d856ea619db0d5d17e0a17fedac273f9945d Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 13 Jun 2019 16:08:24 -0400 Subject: [PATCH 023/109] Move write aggregation memory copy out of vq_lock Memory copy is too heavy an operation to do under the congested lock. Moving it out reduces the congestion many times over, to the point where it is almost invisible. Since the original zio has been removed from the queue, and the child zio is not executed yet, I don't see why the copy would need protection. My guess is that it just remained like this from the time when the lock was not dropped here; the drop was added later to fix a lock ordering issue.
Multi-threaded sequential write tests with both HDD and SSD pools with ZVOL block sizes of 4KB, 16KB, 64KB and 128KB all show major reduction of lock congestion, saving from 15% to 35% of CPU time and increasing throughput from 10% to 40%. Reviewed-by: Richard Yao Reviewed-by: Matt Ahrens Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Closes #8890 --- module/zfs/vdev_queue.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index e74df76b7530..86b20f134834 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -709,6 +709,18 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) do { dio = nio; nio = AVL_NEXT(t, dio); + zio_add_child(dio, aio); + vdev_queue_io_remove(vq, dio); + } while (dio != last); + + /* + * We need to drop the vdev queue's lock during zio_execute() to + * avoid a deadlock that we could encounter due to lock order + * reversal between vq_lock and io_lock in zio_change_priority(). + * Use the dropped lock to do memory copy without congestion. + */ + mutex_exit(&vq->vq_lock); + while ((dio = zio_walk_parents(aio, &zl)) != NULL) { ASSERT3U(dio->io_type, ==, aio->io_type); if (dio->io_flags & ZIO_FLAG_NODATA) { @@ -720,16 +732,6 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) dio->io_offset - aio->io_offset, 0, dio->io_size); } - zio_add_child(dio, aio); - vdev_queue_io_remove(vq, dio); - } while (dio != last); - - /* - * We need to drop the vdev queue's lock to avoid a deadlock that we - * could encounter since this I/O will complete immediately. 
- */ - mutex_exit(&vq->vq_lock); - while ((dio = zio_walk_parents(aio, &zl)) != NULL) { zio_vdev_io_bypass(dio); zio_execute(dio); } From 592ee2e6ddcad339398e825bdb39569167c550ab Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 13 Jun 2019 13:10:19 -0700 Subject: [PATCH 024/109] compress metadata in later sync passes Starting in sync pass 5 (zfs_sync_pass_dont_compress), we disable compression (including of metadata). Ostensibly this helps the sync passes to converge (i.e. for a sync pass to not need to allocate anything because it is 100% overwrites). However, in practice it increases the average number of sync passes, because when we turn compression off, the sizes of many blocks will change and thus we have to re-allocate (not overwrite) them. It also increases the number of 128KB allocations (e.g. for indirect blocks and spacemaps) because these will not be compressed. The 128K allocations are especially detrimental to performance on highly fragmented systems, which may have very few free segments of this size, and may need to load new metaslabs to satisfy 128K allocations. We should increase zfs_sync_pass_dont_compress. In practice on a highly fragmented system we see a few 5-pass txg's, a tiny number of 6-pass txg's, and no txg's with more than 6 passes. Reviewed-by: Brian Behlendorf Reviewed-by: Richard Elling Reviewed by: Pavel Zakharov Reviewed-by: Serapheim Dimitropoulos Reviewed-by: George Wilson Signed-off-by: Matthew Ahrens External-issue: DLPX-63431 Closes #8892 --- man/man5/zfs-module-parameters.5 | 16 ++++++++++++++-- module/zfs/zio.c | 18 ++++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 29374a9d3965..2d2a79413d97 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -2444,9 +2444,21 @@ Default value: \fB25\fR.
\fBzfs_sync_pass_dont_compress\fR (int) .ad .RS 12n -Don't compress starting in this pass +Starting in this sync pass, we disable compression (including of metadata). +With the default setting, in practice, we don't have this many sync passes, +so this has no effect. +.sp +The original intent was that disabling compression would help the sync passes +to converge. However, in practice disabling compression increases the average +number of sync passes, because when we turn compression off, a lot of block's +size will change and thus we have to re-allocate (not overwrite) them. It +also increases the number of 128KB allocations (e.g. for indirect blocks and +spacemaps) because these will not be compressed. The 128K allocations are +especially detrimental to performance on highly fragmented systems, which may +have very few free segments of this size, and may need to load new metaslabs +to satisfy 128K allocations. .sp -Default value: \fB5\fR. +Default value: \fB8\fR. .RE .sp diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 016ac07eabd9..5bfff37eb3b5 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ @@ -96,9 +96,23 @@ int zio_slow_io_ms = (30 * MILLISEC); * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. + * + * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable + * compression (including of metadata). In practice, we don't have this + * many sync passes, so this has no effect. + * + * The original intent was that disabling compression would help the sync + * passes to converge. 
However, in practice disabling compression increases + * the average number of sync passes, because when we turn compression off, a + * lot of block's size will change and thus we have to re-allocate (not + * overwrite) them. It also increases the number of 128KB allocations (e.g. + * for indirect blocks and spacemaps) because these will not be compressed. + * The 128K allocations are especially detrimental to performance on highly + * fragmented systems, which may have very few free segments of this size, + * and may need to load new metaslabs to satisfy 128K allocations. */ int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ -int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ +int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */ int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ /* From 6083f403873f5e427ee8d86f903aa08c7b69daab Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 13 Jun 2019 13:12:39 -0700 Subject: [PATCH 025/109] panic in removal_remap test on 4K devices If the zfs_remove_max_segment tunable is changed to be not a multiple of the sector size, then the device removal code will malfunction and try to create mappings that are smaller than one sector, leading to a panic. On debug bits this assertion will fail in spa_vdev_copy_segment(): ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); On nondebug, the system panics with a stack like: metaslab_free_concrete() metaslab_free_impl() metaslab_free_impl_cb() vdev_indirect_remap() free_from_removing_vdev() metaslab_free_impl() metaslab_free_dva() metaslab_free() Fortunately, the default for zfs_remove_max_segment is 1MB, so this can't occur by default. We hit it during this test because removal_remap.ksh changes zfs_remove_max_segment to 1KB. When testing on 4KB-sector disks, we hit the bug. 
This change makes the zfs_remove_max_segment tunable more robust, automatically rounding it up to a multiple of the sector size. We also turn some key assertions into VERIFY's so that similar bugs would be caught before they are encoded on disk (and thus avoid a panic-reboot-loop). Reviewed-by: Sean Eric Fagan Reviewed-by: Pavel Zakharov Reviewed-by: Serapheim Dimitropoulos Reviewed-by: Sebastien Roy Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens External-issue: DLPX-61342 Closes #8893 --- include/sys/vdev_removal.h | 8 ++++---- man/man5/zfs-module-parameters.5 | 27 ++++++++++++++++++++++++++ module/zfs/vdev_label.c | 5 ++--- module/zfs/vdev_removal.c | 33 +++++++++++++++++++++++++------- 4 files changed, 59 insertions(+), 14 deletions(-) diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h index 3962237afdab..e3bab0658d62 100644 --- a/include/sys/vdev_removal.h +++ b/include/sys/vdev_removal.h @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2014, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014, 2019 by Delphix. All rights reserved. 
*/ #ifndef _SYS_VDEV_REMOVAL_H @@ -81,13 +81,13 @@ extern void spa_vdev_condense_suspend(spa_t *); extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t); extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t); extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *); -extern void svr_sync(spa_t *spa, dmu_tx_t *tx); +extern void svr_sync(spa_t *, dmu_tx_t *); extern void spa_vdev_remove_suspend(spa_t *); extern int spa_vdev_remove_cancel(spa_t *); -extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr); +extern void spa_vdev_removal_destroy(spa_vdev_removal_t *); +extern uint64_t spa_remove_max_segment(spa_t *); extern int vdev_removal_max_span; -extern int zfs_remove_max_segment; #ifdef __cplusplus } diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 2d2a79413d97..8ad3ce466ce5 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -2194,6 +2194,33 @@ pool cannot be returned to a healthy state prior to removing the device. Default value: \fB0\fR. .RE +.sp +.ne 2 +.na +\fBzfs_removal_suspend_progress\fR (int) +.ad +.RS 12n +.sp +This is used by the test suite so that it can ensure that certain actions +happen while in the middle of a removal. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_remove_max_segment\fR (int) +.ad +.RS 12n +.sp +The largest contiguous segment that we will attempt to allocate when removing +a device. This can be no larger than 16MB. If there is a performance +problem with attempting to allocate large blocks, consider decreasing this. +.sp +Default value: \fB16,777,216\fR (16MB). +.RE + .sp .ne 2 .na diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index a0e373b3dfc5..6320732ed6da 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -21,8 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ @@ -613,7 +612,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, * zfs_remove_max_segment, so we need at least one entry * per zfs_remove_max_segment of allocated data. */ - seg_count += to_alloc / zfs_remove_max_segment; + seg_count += to_alloc / spa_remove_max_segment(spa); fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, seg_count * diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 536a982eca2b..6f64edd8c473 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. */ #include @@ -100,6 +100,8 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If * there is a performance problem with attempting to allocate large blocks, * consider decreasing this. + * + * See also the accessor function spa_remove_max_segment(). 
*/ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; @@ -951,8 +953,10 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, vdev_indirect_mapping_entry_t *entry; dva_t dst = {{ 0 }}; uint64_t start = range_tree_min(segs); + ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift)); ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); + ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift)); uint64_t size = range_tree_span(segs); if (range_tree_span(segs) > maxalloc) { @@ -983,6 +987,7 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, } } ASSERT3U(size, <=, maxalloc); + ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift)); /* * An allocation class might not have any remaining vdevs or space @@ -1026,11 +1031,11 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, /* * We can't have any padding of the allocated size, otherwise we will - * misunderstand what's allocated, and the size of the mapping. - * The caller ensures this will be true by passing in a size that is - * aligned to the worst (highest) ashift in the pool. + * misunderstand what's allocated, and the size of the mapping. We + * prevent padding by ensuring that all devices in the pool have the + * same ashift, and the allocation size is a multiple of the ashift. */ - ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); + VERIFY3U(DVA_GET_ASIZE(&dst), ==, size); entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); @@ -1363,6 +1368,20 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, range_tree_destroy(segs); } +/* + * The size of each removal mapping is limited by the tunable + * zfs_remove_max_segment, but we must adjust this to be a multiple of the + * pool's ashift, so that we don't try to split individual sectors regardless + * of the tunable value. (Note that device removal requires that all devices + * have the same ashift, so there's no difference between spa_min_ashift and + * spa_max_ashift.) 
The raw tunable should not be used elsewhere. + */ +uint64_t +spa_remove_max_segment(spa_t *spa) +{ + return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift)); +} + /* * The removal thread operates in open context. It iterates over all * allocated space in the vdev, by loading each metaslab's spacemap. @@ -1385,7 +1404,7 @@ spa_vdev_remove_thread(void *arg) spa_t *spa = arg; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_copy_arg_t vca; - uint64_t max_alloc = zfs_remove_max_segment; + uint64_t max_alloc = spa_remove_max_segment(spa); uint64_t last_txg = 0; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1511,7 +1530,7 @@ spa_vdev_remove_thread(void *arg) vd = vdev_lookup_top(spa, svr->svr_vdev_id); if (txg != last_txg) - max_alloc = zfs_remove_max_segment; + max_alloc = spa_remove_max_segment(spa); last_txg = txg; spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); From b033353b2548a357a7e2bbde2cf68b2ccf8f0054 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 13 Jun 2019 13:14:35 -0700 Subject: [PATCH 026/109] lz4_decompress_abd declared but not defined `lz4_decompress_abd` is declared in zio_compress.h but it is not defined anywhere. The declaration should be removed. Reviewed by: Dan Kimmel Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens External-issue: DLPX-47477 Closes #8894 --- include/sys/zio_compress.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index 1642823d3d42..208117eee4b5 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -105,8 +105,7 @@ extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len, int level); extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, int level); -extern int lz4_decompress_abd(abd_t *src, void *dst, size_t s_len, size_t d_len, - int level); + /* * Compress and decompress data if necessary. 
*/ From 9e54b9d930849e2ccb9ae12d729c7f20e54c670f Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Thu, 13 Jun 2019 13:15:46 -0700 Subject: [PATCH 027/109] Python config cleanup Don't require Python at configure/build unless building pyzfs. Move ZFS_AC_PYTHON_MODULE to always-pyzfs.m4 where it is used. Make test syntax more consistent. Sponsored by: iXsystems, Inc. Reviewed-by: Neal Gompa Reviewed-by: Brian Behlendorf Signed-off-by: Ryan Moeller Closes #8895 --- config/always-python.m4 | 87 +++++++++++------------------------------ config/always-pyzfs.m4 | 45 ++++++++++++++------- 2 files changed, 53 insertions(+), 79 deletions(-) diff --git a/config/always-python.m4 b/config/always-python.m4 index 7cfefd9ebcae..c1c07597e688 100644 --- a/config/always-python.m4 +++ b/config/always-python.m4 @@ -1,47 +1,3 @@ -dnl # -dnl # ZFS_AC_PYTHON_VERSION(version, [action-if-true], [action-if-false]) -dnl # -dnl # Verify Python version -dnl # -AC_DEFUN([ZFS_AC_PYTHON_VERSION], [ - ver_check=`$PYTHON -c "import sys; print (sys.version.split()[[0]] $1)"` - AS_IF([test "$ver_check" = "True"], [ - m4_ifvaln([$2], [$2]) - ], [ - m4_ifvaln([$3], [$3]) - ]) -]) - -dnl # -dnl # ZFS_AC_PYTHON_VERSION_IS_2 -dnl # ZFS_AC_PYTHON_VERSION_IS_3 -dnl # -dnl # Tests if the $PYTHON_VERSION matches 2.x or 3.x. -dnl # -AC_DEFUN([ZFS_AC_PYTHON_VERSION_IS_2], - [test "${PYTHON_VERSION%%\.*}" = "2"]) -AC_DEFUN([ZFS_AC_PYTHON_VERSION_IS_3], - [test "${PYTHON_VERSION%%\.*}" = "3"]) - -dnl # -dnl # ZFS_AC_PYTHON_MODULE(module_name, [action-if-true], [action-if-false]) -dnl # -dnl # Checks for Python module. Freely inspired by AX_PYTHON_MODULE -dnl # https://www.gnu.org/software/autoconf-archive/ax_python_module.html -dnl # Required by ZFS_AC_CONFIG_ALWAYS_PYZFS. 
-dnl # -AC_DEFUN([ZFS_AC_PYTHON_MODULE], [ - PYTHON_NAME=`basename $PYTHON` - AC_MSG_CHECKING([for $PYTHON_NAME module: $1]) - AS_IF([$PYTHON -c "import $1" 2>/dev/null], [ - AC_MSG_RESULT(yes) - m4_ifvaln([$2], [$2]) - ], [ - AC_MSG_RESULT(no) - m4_ifvaln([$3], [$3]) - ]) -]) - dnl # dnl # The majority of the python scripts are written to be compatible dnl # with Python 2.6 and Python 3.4. Therefore, they may be installed @@ -66,35 +22,38 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYTHON], [ [AC_MSG_ERROR([Unknown --with-python value '$with_python'])] ) - AS_IF([test $PYTHON != :], [ - AS_IF([$PYTHON --version >/dev/null 2>&1], - [AM_PATH_PYTHON([2.6], [], [:])], - [AC_MSG_ERROR([Cannot find $PYTHON in your system path])] - ) - ]) - AM_CONDITIONAL([USING_PYTHON], [test "$PYTHON" != :]) - AM_CONDITIONAL([USING_PYTHON_2], [ZFS_AC_PYTHON_VERSION_IS_2]) - AM_CONDITIONAL([USING_PYTHON_3], [ZFS_AC_PYTHON_VERSION_IS_3]) - dnl # dnl # Minimum supported Python versions for utilities: - dnl # Python 2.6.x, or Python 3.4.x + dnl # Python 2.6 or Python 3.4 dnl # - AS_IF([ZFS_AC_PYTHON_VERSION_IS_2], [ - ZFS_AC_PYTHON_VERSION([>= '2.6'], [ true ], - [AC_MSG_ERROR("Python >= 2.6.x is not available")]) + AM_PATH_PYTHON([], [], [:]) + AS_IF([test -z "$PYTHON_VERSION"], [ + PYTHON_VERSION=$(basename $PYTHON | tr -cd 0-9.) 
]) + PYTHON_MINOR=${PYTHON_VERSION#*\.} - AS_IF([ZFS_AC_PYTHON_VERSION_IS_3], [ - ZFS_AC_PYTHON_VERSION([>= '3.4'], [ true ], - [AC_MSG_ERROR("Python >= 3.4.x is not available")]) - ]) + AS_CASE([$PYTHON_VERSION], + [2.*], [ + AS_IF([test $PYTHON_MINOR -lt 6], + [AC_MSG_ERROR("Python >= 2.6 is required")]) + ], + [3.*], [ + AS_IF([test $PYTHON_MINOR -lt 4], + [AC_MSG_ERROR("Python >= 3.4 is required")]) + ], + [:|2|3], [], + [PYTHON_VERSION=3] + ) + + AM_CONDITIONAL([USING_PYTHON], [test "$PYTHON" != :]) + AM_CONDITIONAL([USING_PYTHON_2], [test "x${PYTHON_VERSION%%\.*}" = x2]) + AM_CONDITIONAL([USING_PYTHON_3], [test "x${PYTHON_VERSION%%\.*}" = x3]) dnl # dnl # Request that packages be built for a specific Python version. dnl # - AS_IF([test $with_python != check], [ - PYTHON_PKG_VERSION=`echo ${PYTHON} | tr -d 'a-zA-Z.'` + AS_IF([test "x$with_python" != xcheck], [ + PYTHON_PKG_VERSION=$(echo $PYTHON_VERSION | tr -d .) DEFINE_PYTHON_PKG_VERSION='--define "__use_python_pkg_version '${PYTHON_PKG_VERSION}'"' DEFINE_PYTHON_VERSION='--define "__use_python '${PYTHON}'"' ], [ diff --git a/config/always-pyzfs.m4 b/config/always-pyzfs.m4 index 6f32e98feed2..f620a8f9a18b 100644 --- a/config/always-pyzfs.m4 +++ b/config/always-pyzfs.m4 @@ -1,5 +1,24 @@ dnl # -dnl # Determines if pyzfs can be built, requires Python 2.7 or latter. +dnl # ZFS_AC_PYTHON_MODULE(module_name, [action-if-true], [action-if-false]) +dnl # +dnl # Checks for Python module. Freely inspired by AX_PYTHON_MODULE +dnl # https://www.gnu.org/software/autoconf-archive/ax_python_module.html +dnl # Required by ZFS_AC_CONFIG_ALWAYS_PYZFS. +dnl # +AC_DEFUN([ZFS_AC_PYTHON_MODULE], [ + PYTHON_NAME=$(basename $PYTHON) + AC_MSG_CHECKING([for $PYTHON_NAME module: $1]) + AS_IF([$PYTHON -c "import $1" 2>/dev/null], [ + AC_MSG_RESULT(yes) + m4_ifvaln([$2], [$2]) + ], [ + AC_MSG_RESULT(no) + m4_ifvaln([$3], [$3]) + ]) +]) + +dnl # +dnl # Determines if pyzfs can be built, requires Python 2.7 or later. 
dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ AC_ARG_ENABLE([pyzfs], @@ -18,7 +37,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ DEFINE_PYZFS='--without pyzfs' ]) ], [ - AS_IF([test $PYTHON != :], [ + AS_IF([test "$PYTHON" != :], [ DEFINE_PYZFS='' ], [ enable_pyzfs=no @@ -31,20 +50,16 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ dnl # Require python-devel libraries dnl # AS_IF([test "x$enable_pyzfs" = xcheck -o "x$enable_pyzfs" = xyes], [ - AS_IF([ZFS_AC_PYTHON_VERSION_IS_2], [ - PYTHON_REQUIRED_VERSION=">= '2.7.0'" - ], [ - AS_IF([ZFS_AC_PYTHON_VERSION_IS_3], [ - PYTHON_REQUIRED_VERSION=">= '3.4.0'" - ], [ - AC_MSG_ERROR("Python $PYTHON_VERSION unknown") - ]) - ]) + AS_CASE([$PYTHON_VERSION], + [3.*], [PYTHON_REQUIRED_VERSION=">= '3.4.0'"], + [2.*], [PYTHON_REQUIRED_VERSION=">= '2.7.0'"], + [AC_MSG_ERROR("Python $PYTHON_VERSION unknown")] + ) AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [ AS_IF([test "x$enable_pyzfs" = xyes], [ AC_MSG_ERROR("Python $PYTHON_REQUIRED_VERSION development library is not installed") - ], [test ! "x$enable_pyzfs" = xno], [ + ], [test "x$enable_pyzfs" != xno], [ enable_pyzfs=no ]) ]) @@ -57,7 +72,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ ZFS_AC_PYTHON_MODULE([setuptools], [], [ AS_IF([test "x$enable_pyzfs" = xyes], [ AC_MSG_ERROR("Python $PYTHON_VERSION setuptools is not installed") - ], [test ! "x$enable_pyzfs" = xno], [ + ], [test "x$enable_pyzfs" != xno], [ enable_pyzfs=no ]) ]) @@ -70,7 +85,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ ZFS_AC_PYTHON_MODULE([cffi], [], [ AS_IF([test "x$enable_pyzfs" = xyes], [ AC_MSG_ERROR("Python $PYTHON_VERSION cffi is not installed") - ], [test ! 
"x$enable_pyzfs" = xno], [ + ], [test "x$enable_pyzfs" != xno], [ enable_pyzfs=no ]) ]) @@ -81,7 +96,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ dnl # AS_IF([test "x$enable_pyzfs" = xcheck], [enable_pyzfs=yes]) - AM_CONDITIONAL([PYZFS_ENABLED], [test x$enable_pyzfs = xyes]) + AM_CONDITIONAL([PYZFS_ENABLED], [test "x$enable_pyzfs" = xyes]) AC_SUBST([PYZFS_ENABLED], [$enable_pyzfs]) AC_SUBST(pythonsitedir, [$PYTHON_SITE_PKG]) From ed7b0d357a070d28710abe9a6c6fc22c4fcbe854 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 14 Jun 2019 17:07:34 -0400 Subject: [PATCH 028/109] Minimize aggsum_compare(&arc_size, arc_c) calls. For busy ARC situation when arc_size close to arc_c is desired. But then it is quite likely that aggsum_compare(&arc_size, arc_c) will need to flush per-CPU buckets to find exact comparison result. Doing that often in a hot path penalizes whole idea of aggsum usage there, since it replaces few simple atomic additions with dozens of lock acquisitions. Replacing aggsum_compare() with aggsum_upper_bound() in code increasing arc_p when ARC is growing (arc_size < arc_c) according to PMC profiles allows to save ~5% of CPU time in aggsum code during sequential write to 12 ZVOLs with 16KB block size on large dual-socket system. I suppose there some minor arc_p behavior change due to lower precision of the new code, but I don't think it is a big deal, since it should affect only very small window in time (aggsum buckets are flushed every second) and in ARC size (buckets are limited to 10 average ARC blocks per CPU). 
Reviewed-by: Chris Dunlop Reviewed-by: Richard Elling Reviewed-by: George Melikov Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Closes #8901 --- module/zfs/arc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index a7e7d26996f8..720365c4a935 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5606,7 +5606,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ - if (aggsum_compare(&arc_size, arc_c) < 0 && + if (aggsum_upper_bound(&arc_size) < arc_c && hdr->b_l1hdr.b_state == arc_anon && (zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) > arc_p)) From b5e8d14a4b0c25b19c4e148123e5d579add0cfa5 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 19 Jun 2019 10:39:28 -0700 Subject: [PATCH 029/109] ZTS: Fix mmp_interval failure The mmp_interval test case was failing on Fedora 30 due to the built-in 'echo' command terminating the script when it was unable to write to the sysfs module parameter. This change in behavior was observed with ksh-2020.0.0-alpha1. Resolve the issue by using the external cat command which fails gracefully as expected. Additionally, remove some incorrect quotes around the $? return values. 
Reviewed-by: Giuseppe Di Natale Reviewed-by: Tony Hutter Reviewed-by: Olaf Faaland Reviewed-by: Richard Elling Reviewed-by: George Melikov Signed-off-by: Brian Behlendorf Closes #8906 --- tests/zfs-tests/include/libtest.shlib | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 1b841d7ba02c..c7cb36a8d0ee 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3494,13 +3494,13 @@ function set_tunable_impl Linux) typeset zfs_tunables="/sys/module/$module/parameters" [[ -w "$zfs_tunables/$tunable" ]] || return 1 - echo -n "$value" > "$zfs_tunables/$tunable" - return "$?" + cat >"$zfs_tunables/$tunable" <<<"$value" + return $? ;; SunOS) [[ "$module" -eq "zfs" ]] || return 1 echo "${tunable}/${mdb_cmd}0t${value}" | mdb -kw - return "$?" + return $? ;; esac } @@ -3527,7 +3527,7 @@ function get_tunable_impl typeset zfs_tunables="/sys/module/$module/parameters" [[ -f "$zfs_tunables/$tunable" ]] || return 1 cat $zfs_tunables/$tunable - return "$?" + return $? ;; SunOS) [[ "$module" -eq "zfs" ]] || return 1 From 5b0327bc5795b5ae8b1926d90a9b6b8b10433f72 Mon Sep 17 00:00:00 2001 From: Olaf Faaland Date: Wed, 19 Jun 2019 11:44:44 -0700 Subject: [PATCH 030/109] kmod-zfs-devel rpm should provide kmod-spl-devel When configure is run with --with-spec=redhat, and rpms are built, the kmod-zfs-devel package is missing Provides: kmod-spl-devel = %{version} which is required by software such as Lustre which builds against zfs kmods. Adding it makes it easier for such software to build against both zfs-0.7 (where SPL is separate and may be missing) and zfs-0.8. 
Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Olaf Faaland Closes #8930 --- rpm/redhat/zfs-kmod.spec.in | 1 + 1 file changed, 1 insertion(+) diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in index 473f2d032509..f632c4867e63 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -41,6 +41,7 @@ This package contains the ZFS kernel modules. %package -n kmod-%{kmod_name}-devel Summary: ZFS kernel module(s) devel common Group: System Environment/Kernel +Provides: kmod-spl-devel = %{version} %description -n kmod-%{kmod_name}-devel This package provides the header files and objects to build kernel modules. From 2087b6cf4941b936583b48471a79b252dc0a9dbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Niew=C3=B6hner?= Date: Wed, 19 Jun 2019 20:53:37 +0200 Subject: [PATCH 031/109] Fix memory leak in check_disk() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Allan Jude Reviewed-by: Tony Hutter Reviewed-by: Richard Elling Signed-off-by: Michael Niewöhner Closes #8897 Closes #8911 --- cmd/zpool/zpool_vdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 7ea9d742006d..52c696816f73 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -433,6 +433,7 @@ check_disk(const char *path, blkid_cache cache, int force, char *value = blkid_get_tag_value(cache, "TYPE", path); (void) fprintf(stderr, gettext("%s is in use and contains " "a %s filesystem.\n"), path, value ? value : "unknown"); + free(value); return (-1); } From fb6f6b47d6f9b63e5768635b74160d94b3fe33f5 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Thu, 20 Jun 2019 04:27:31 +0900 Subject: [PATCH 032/109] Use ZFS_DEV macro instead of literals The rest of the code/comments use ZFS_DEV, so sync with that. 
Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Richard Elling Signed-off-by: Tomohiro Kusumi Closes #8912 --- lib/libzfs_core/libzfs_core.c | 6 +++--- lib/libzpool/util.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 99fc84d04614..eb332bc94e8c 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -52,7 +52,7 @@ * * - Thin Layer. libzfs_core is a thin layer, marshaling arguments * to/from the kernel ioctls. There is generally a 1:1 correspondence - * between libzfs_core functions and ioctls to /dev/zfs. + * between libzfs_core functions and ioctls to ZFS_DEV. * * - Clear Atomicity. Because libzfs_core functions are generally 1:1 * with kernel ioctls, and kernel ioctls are general atomic, each @@ -135,7 +135,7 @@ libzfs_core_init(void) { (void) pthread_mutex_lock(&g_lock); if (g_refcount == 0) { - g_fd = open("/dev/zfs", O_RDWR); + g_fd = open(ZFS_DEV, O_RDWR); if (g_fd < 0) { (void) pthread_mutex_unlock(&g_lock); return (errno); @@ -499,7 +499,7 @@ lzc_sync(const char *pool_name, nvlist_t *innvl, nvlist_t **outnvl) * The snapshots must all be in the same pool. * The value is the name of the hold (string type). * - * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL). + * If cleanup_fd is not -1, it must be the result of open(ZFS_DEV, O_EXCL). * In this case, when the cleanup_fd is closed (including on process * termination), the holds will be released. 
If the system is shut down * uncleanly, the holds will be released when the pool is next opened diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c index ad05d2239ae0..67bc209ceec9 100644 --- a/lib/libzpool/util.c +++ b/lib/libzpool/util.c @@ -223,7 +223,7 @@ pool_active(void *unused, const char *name, uint64_t guid, * Use ZFS_IOC_POOL_SYNC to confirm if a pool is active */ - fd = open("/dev/zfs", O_RDWR); + fd = open(ZFS_DEV, O_RDWR); if (fd < 0) return (-1); From 01cc94f68d89c71943ecc5bd3dfaff6171dfe157 Mon Sep 17 00:00:00 2001 From: dacianstremtan <35844628+dacianstremtan@users.noreply.github.com> Date: Thu, 20 Jun 2019 15:27:14 -0400 Subject: [PATCH 033/109] Replace whereis with type in zfs-lib.sh The whereis command should not be used since it may not exist in the initramfs. The dracut plymouth module also uses the type command instead of whereis. Reviewed-by: Brian Behlendorf Reviewed-by: Garrett Fields Signed-off-by: Dacian Reece-Stremtan Closes #8920 Closes #8938 --- contrib/dracut/90zfs/zfs-lib.sh.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/dracut/90zfs/zfs-lib.sh.in b/contrib/dracut/90zfs/zfs-lib.sh.in index 23c07af9e86f..44021c6e5fc1 100755 --- a/contrib/dracut/90zfs/zfs-lib.sh.in +++ b/contrib/dracut/90zfs/zfs-lib.sh.in @@ -144,7 +144,7 @@ ask_for_password() { { flock -s 9; # Prompt for password with plymouth, if installed and running. - if whereis plymouth >/dev/null 2>&1 && plymouth --ping 2>/dev/null; then + if type plymouth >/dev/null 2>&1 && plymouth --ping 2>/dev/null; then plymouth ask-for-password \ --prompt "$ply_prompt" --number-of-tries="$ply_tries" \ --command="$ply_cmd" From b96ceeead2a9c7e0973fcef58356defb10f6df26 Mon Sep 17 00:00:00 2001 From: Tom Caputi Date: Thu, 20 Jun 2019 15:29:51 -0400 Subject: [PATCH 034/109] Allow unencrypted children of encrypted datasets When encryption was first added to ZFS, we made a decision to prevent users from creating unencrypted children of encrypted datasets. 
The idea was to prevent users from inadvertently leaving some of their data unencrypted. However, since the release of 0.8.0, some legitimate reasons have been brought up for this behavior to be allowed. This patch simply removes this limitation from all code paths that had checks for it and updates the tests accordingly. Reviewed-by: Jason King Reviewed-by: Sean Eric Fagan Reviewed-by: Richard Laager Reviewed-by: Brian Behlendorf Signed-off-by: Tom Caputi Closes #8737 Closes #8870 --- include/sys/dsl_crypt.h | 1 - lib/libzfs/libzfs_crypto.c | 41 +--------------- lib/libzfs/libzfs_dataset.c | 13 ++--- lib/libzfs/libzfs_sendrecv.c | 48 ++++++++----------- module/zfs/dmu_objset.c | 7 --- module/zfs/dmu_recv.c | 24 +++++----- module/zfs/dsl_crypt.c | 44 +---------------- .../zfs_create/zfs_create_encrypted.ksh | 20 ++++---- .../zfs_receive/zfs_receive_to_encrypted.ksh | 14 +++--- .../zfs_rename/zfs_rename_to_encrypted.ksh | 14 +++--- 10 files changed, 63 insertions(+), 163 deletions(-) diff --git a/include/sys/dsl_crypt.h b/include/sys/dsl_crypt.h index c2c0a548a488..0f73ea6c6df8 100644 --- a/include/sys/dsl_crypt.h +++ b/include/sys/dsl_crypt.h @@ -209,7 +209,6 @@ void dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd, struct dsl_dataset *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx); uint64_t dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey, dmu_tx_t *tx); -int dmu_objset_clone_crypt_check(dsl_dir_t *parentdd, dsl_dir_t *origindd); uint64_t dsl_crypto_key_clone_sync(dsl_dir_t *origindd, dmu_tx_t *tx); void dsl_crypto_key_destroy_sync(uint64_t dckobj, dmu_tx_t *tx); diff --git a/lib/libzfs/libzfs_crypto.c b/lib/libzfs/libzfs_crypto.c index 3318a6bd2e11..d31f43b1fdf2 100644 --- a/lib/libzfs/libzfs_crypto.c +++ b/lib/libzfs/libzfs_crypto.c @@ -740,14 +740,6 @@ zfs_crypto_create(libzfs_handle_t *hdl, char *parent_name, nvlist_t *props, pcrypt = ZIO_CRYPT_OFF; } - /* Check for encryption being explicitly truned off */ - if (crypt == 
ZIO_CRYPT_OFF && pcrypt != ZIO_CRYPT_OFF) { - ret = EINVAL; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Invalid encryption value. Dataset must be encrypted.")); - goto out; - } - /* Get the inherited encryption property if we don't have it locally */ if (!local_crypt) crypt = pcrypt; @@ -849,10 +841,7 @@ int zfs_crypto_clone_check(libzfs_handle_t *hdl, zfs_handle_t *origin_zhp, char *parent_name, nvlist_t *props) { - int ret; char errbuf[1024]; - zfs_handle_t *pzhp = NULL; - uint64_t pcrypt, ocrypt; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "Encryption clone error")); @@ -865,40 +854,12 @@ zfs_crypto_clone_check(libzfs_handle_t *hdl, zfs_handle_t *origin_zhp, nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)) || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION)) || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS))) { - ret = EINVAL; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Encryption properties must inherit from origin dataset.")); - goto out; - } - - /* get a reference to parent dataset, should never be NULL */ - pzhp = make_dataset_handle(hdl, parent_name); - if (pzhp == NULL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Failed to lookup parent.")); - return (ENOENT); + return (EINVAL); } - /* Lookup parent's crypt */ - pcrypt = zfs_prop_get_int(pzhp, ZFS_PROP_ENCRYPTION); - ocrypt = zfs_prop_get_int(origin_zhp, ZFS_PROP_ENCRYPTION); - - /* all children of encrypted parents must be encrypted */ - if (pcrypt != ZIO_CRYPT_OFF && ocrypt == ZIO_CRYPT_OFF) { - ret = EINVAL; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Cannot create unencrypted clone as a child " - "of encrypted parent.")); - goto out; - } - - zfs_close(pzhp); return (0); - -out: - if (pzhp != NULL) - zfs_close(pzhp); - return (ret); } typedef struct loadkeys_cbdata { diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 3be205f1f437..ee5a6412ead5 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c 
@@ -4632,16 +4632,9 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, "with the new name")); (void) zfs_error(hdl, EZFS_EXISTS, errbuf); } else if (errno == EACCES) { - if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) == - ZIO_CRYPT_OFF) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "cannot rename an unencrypted dataset to " - "be a decendent of an encrypted one")); - } else { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "cannot move encryption child outside of " - "its encryption root")); - } + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot move encrypted child outside of " + "its encryption root")); (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); } else { (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index f69a46430bbe..052b96b9b653 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -2827,7 +2827,7 @@ recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *destname, is_clone = zhp->zfs_dmustats.dds_origin[0] != '\0'; (void) zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); - /* we don't need to do anything for unencrypted filesystems */ + /* we don't need to do anything for unencrypted datasets */ if (crypt == ZIO_CRYPT_OFF) { zfs_close(zhp); continue; @@ -4210,34 +4210,6 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, goto out; } - /* - * It is invalid to receive a properties stream that was - * unencrypted on the send side as a child of an encrypted - * parent. Technically there is nothing preventing this, but - * it would mean that the encryption=off property which is - * locally set on the send side would not be received correctly. - * We can infer encryption=off if the stream is not raw and - * properties were included since the send side will only ever - * send the encryption property in a raw nvlist header. 
This - * check will be avoided if the user specifically overrides - * the encryption property on the command line. - */ - if (!raw && rcvprops != NULL && - !nvlist_exists(cmdprops, - zfs_prop_to_name(ZFS_PROP_ENCRYPTION))) { - uint64_t crypt; - - crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); - - if (crypt != ZIO_CRYPT_OFF) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "parent '%s' must not be encrypted to " - "receive unenecrypted property"), name); - err = zfs_error(hdl, EZFS_BADPROP, errbuf); - zfs_close(zhp); - goto out; - } - } zfs_close(zhp); newfs = B_TRUE; @@ -4274,6 +4246,24 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, &oxprops, &wkeydata, &wkeylen, errbuf)) != 0) goto out; + /* + * When sending with properties (zfs send -p), the encryption property + * is not included because it is a SETONCE property and therefore + * treated as read only. However, we are always able to determine its + * value because raw sends will include it in the DRR_BEGIN payload + * and non-raw sends with properties are not allowed for encrypted + * datasets. Therefore, if this is a non-raw properties stream, we can + * infer that the value should be ZIO_CRYPT_OFF and manually add that + * to the received properties.
+ */ + if (stream_wantsnewfs && !raw && rcvprops != NULL && + !nvlist_exists(cmdprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION))) { + if (oxprops == NULL) + oxprops = fnvlist_alloc(); + fnvlist_add_uint64(oxprops, + zfs_prop_to_name(ZFS_PROP_ENCRYPTION), ZIO_CRYPT_OFF); + } + err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable, raw, infd, drr_noswap, cleanup_fd, &read_bytes, &errflags, diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index f95915b9e253..30436b188fc4 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1348,13 +1348,6 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx) return (SET_ERROR(EINVAL)); } - error = dmu_objset_clone_crypt_check(pdd, origin->ds_dir); - if (error != 0) { - dsl_dataset_rele(origin, FTAG); - dsl_dir_rele(pdd, FTAG); - return (error); - } - dsl_dataset_rele(origin, FTAG); dsl_dir_rele(pdd, FTAG); diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 65a031b42cc6..3481feb21dbc 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -327,7 +327,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) /* Open the parent of tofs */ ASSERT3U(strlen(tofs), <, sizeof (buf)); (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); - error = dsl_dataset_hold_flags(dp, buf, dsflags, FTAG, &ds); + error = dsl_dataset_hold(dp, buf, FTAG, &ds); if (error != 0) return (error); @@ -345,13 +345,13 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) error = dmu_objset_create_crypt_check(ds->ds_dir, drba->drba_dcp, &will_encrypt); if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } if (will_encrypt && (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } } @@ -364,25 +364,25 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 
ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } /* can't recv below anything but filesystems (eg. no ZVOLs) */ error = dmu_objset_from_ds(ds, &os); if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } if (dmu_objset_type(os) != DMU_OST_ZFS) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); } @@ -392,31 +392,31 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) error = dsl_dataset_hold_flags(dp, drba->drba_origin, dsflags, FTAG, &origin); if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } if (!origin->ds_is_snapshot) { dsl_dataset_rele_flags(origin, dsflags, FTAG); - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } if (dsl_dataset_phys(origin)->ds_guid != fromguid && fromguid != 0) { dsl_dataset_rele_flags(origin, dsflags, FTAG); - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENODEV)); } if (origin->ds_dir->dd_crypto_obj != 0 && (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { dsl_dataset_rele_flags(origin, dsflags, FTAG); - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } dsl_dataset_rele_flags(origin, dsflags, FTAG); } - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); error = 0; } return (error); diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c index 21db8e51ffd0..0c0ffaadd8fb 100644 --- a/module/zfs/dsl_crypt.c +++ b/module/zfs/dsl_crypt.c @@ -1610,15 +1610,8 @@ 
dsl_dir_rename_crypt_check(dsl_dir_t *dd, dsl_dir_t *newparent) int ret; uint64_t curr_rddobj, parent_rddobj; - if (dd->dd_crypto_obj == 0) { - /* children of encrypted parents must be encrypted */ - if (newparent->dd_crypto_obj != 0) { - ret = SET_ERROR(EACCES); - goto error; - } - + if (dd->dd_crypto_obj == 0) return (0); - } ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj); if (ret != 0) @@ -1747,34 +1740,6 @@ dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, kmem_free(keylocation, ZAP_MAXVALUELEN); } -int -dmu_objset_clone_crypt_check(dsl_dir_t *parentdd, dsl_dir_t *origindd) -{ - int ret; - uint64_t pcrypt, crypt; - - /* - * Check that we are not making an unencrypted child of an - * encrypted parent. - */ - ret = dsl_dir_get_crypt(parentdd, &pcrypt); - if (ret != 0) - return (ret); - - ret = dsl_dir_get_crypt(origindd, &crypt); - if (ret != 0) - return (ret); - - ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT); - ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT); - - if (crypt == ZIO_CRYPT_OFF && pcrypt != ZIO_CRYPT_OFF) - return (SET_ERROR(EINVAL)); - - return (0); -} - - int dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp, boolean_t *will_encrypt) @@ -1805,13 +1770,6 @@ dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp, ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT); ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT); - /* - * We can't create an unencrypted child of an encrypted parent - * under any circumstances. 
- */ - if (crypt == ZIO_CRYPT_OFF && pcrypt != ZIO_CRYPT_OFF) - return (SET_ERROR(EINVAL)); - /* check for valid dcp with no encryption (inherited or local) */ if (crypt == ZIO_CRYPT_OFF) { /* Must not specify encryption params */ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_encrypted.ksh index 9d5ecab0dfee..7e5072f0d5fd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_encrypted.ksh @@ -51,10 +51,10 @@ # yes unspec 0 1 no no keyformat specified # yes unspec 1 0 yes new encryption root, crypt inherited # yes unspec 1 1 yes new encryption root, crypt inherited -# yes off 0 0 no unencrypted child of encrypted parent -# yes off 0 1 no unencrypted child of encrypted parent -# yes off 1 0 no unencrypted child of encrypted parent -# yes off 1 1 no unencrypted child of encrypted parent +# yes off 0 0 yes unencrypted child of encrypted parent +# yes off 0 1 no keylocation given, but crypt off +# yes off 1 0 no keyformat given, but crypt off +# yes off 1 1 no keyformat given, but crypt off # yes on 0 0 yes inherited encryption, local crypt # yes on 0 1 no no keyformat specified for new key # yes on 1 0 yes new encryption root @@ -113,7 +113,9 @@ log_must eval "echo $PASSPHRASE | zfs create -o keyformat=passphrase" \ log_must eval "echo $PASSPHRASE | zfs create -o keyformat=passphrase" \ "-o keylocation=prompt $TESTPOOL/$TESTFS2/c4" -log_mustnot zfs create -o encryption=off $TESTPOOL/$TESTFS2/c5 +log_must zfs create -o encryption=off $TESTPOOL/$TESTFS2/c5 +log_must test "$(get_prop 'encryption' $TESTPOOL/$TESTFS2/c5)" == "off" + log_mustnot zfs create -o encryption=off -o keylocation=prompt \ $TESTPOOL/$TESTFS2/c5 log_mustnot zfs create -o encryption=off -o keyformat=passphrase \ @@ -122,13 +124,13 @@ log_mustnot zfs create -o encryption=off -o 
keyformat=passphrase \ -o keylocation=prompt $TESTPOOL/$TESTFS2/c5 log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ - "$TESTPOOL/$TESTFS2/c5" + "$TESTPOOL/$TESTFS2/c6" log_mustnot zfs create -o encryption=on -o keylocation=prompt \ - $TESTPOOL/$TESTFS2/c6 + $TESTPOOL/$TESTFS2/c7 log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ - "-o keyformat=passphrase $TESTPOOL/$TESTFS2/c6" + "-o keyformat=passphrase $TESTPOOL/$TESTFS2/c7" log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ - "-o keyformat=passphrase -o keylocation=prompt $TESTPOOL/$TESTFS2/c7" + "-o keyformat=passphrase -o keylocation=prompt $TESTPOOL/$TESTFS2/c8" log_pass "ZFS creates datasets only if they have a valid combination of" \ "encryption properties set." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh index 57896c6fd305..f8e53f02c23d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh @@ -46,7 +46,7 @@ function cleanup log_onexit cleanup -log_assert "ZFS should receive to an encrypted child dataset" +log_assert "ZFS should receive encrypted filesystems into child dataset" typeset passphrase="password" typeset snap="$TESTPOOL/$TESTFS@snap" @@ -60,11 +60,13 @@ log_must eval "echo $passphrase | zfs create -o encryption=on" \ log_note "Verifying ZFS will receive to an encrypted child" log_must eval "zfs send $snap | zfs receive $TESTPOOL/$TESTFS1/c1" -log_note "Verifying 'send -p' will not receive to an encrypted child" -log_mustnot eval "zfs send -p $snap | zfs receive $TESTPOOL/$TESTFS1/c2" +log_note "Verifying 'send -p' will receive to an encrypted child" +log_must eval "zfs send -p $snap | zfs receive $TESTPOOL/$TESTFS1/c2" +log_must test "$(get_prop 'encryption' $TESTPOOL/$TESTFS1/c2)" == "off" 
-log_note "Verifying 'send -R' will not receive to an encrypted child" -log_mustnot eval "zfs send -R $snap | zfs receive $TESTPOOL/$TESTFS1/c3" +log_note "Verifying 'send -R' will receive to an encrypted child" +log_must eval "zfs send -R $snap | zfs receive $TESTPOOL/$TESTFS1/c3" +log_must test "$(get_prop 'encryption' $TESTPOOL/$TESTFS1/c3)" == "off" log_note "Verifying ZFS will not receive to an encrypted child when the" \ "parent key is unloaded" @@ -72,4 +74,4 @@ log_must zfs unmount $TESTPOOL/$TESTFS1 log_must zfs unload-key $TESTPOOL/$TESTFS1 log_mustnot eval "zfs send $snap | zfs receive $TESTPOOL/$TESTFS1/c4" -log_pass "ZFS can receive to an encrypted child dataset" +log_pass "ZFS can receive encrypted filesystems into child dataset" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh index 400592aaca2c..1b9c6e3c704f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh @@ -23,12 +23,13 @@ # # DESCRIPTION: -# 'zfs rename' should not rename an unencrypted dataset to a child +# 'zfs rename' should be able to move an unencrypted dataset to a child # of an encrypted dataset # # STRATEGY: # 1. Create an encrypted dataset -# 2. Attempt to rename the default dataset to a child of the encrypted dataset +# 2. Rename the default dataset to a child of the encrypted dataset +# 3. 
Confirm the child dataset doesn't have any encryption properties # verify_runnable "both" @@ -36,16 +37,17 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy $TESTPOOL/$TESTFS2 + log_must zfs destroy -r $TESTPOOL/$TESTFS2 } log_onexit cleanup -log_assert "'zfs rename' should not rename an unencrypted dataset to a" \ +log_assert "'zfs rename' should allow renaming an unencrypted dataset to a" \ "child of an encrypted dataset" log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ "-o keyformat=passphrase -o keylocation=prompt $TESTPOOL/$TESTFS2" -log_mustnot zfs rename $TESTPOOL/$TESTFS $TESTPOOL/$TESTFS2/$TESTFS +log_must zfs rename $TESTPOOL/$TESTFS $TESTPOOL/$TESTFS2/$TESTFS +log_must test "$(get_prop 'encryption' $TESTPOOL/$TESTFS2/$TESTFS)" == "off" -log_pass "'zfs rename' does not rename an unencrypted dataset to a child" \ +log_pass "'zfs rename' allows renaming an unencrypted dataset to a child" \ "of an encrypted dataset" From 9af524b0ee26c821cf412b796ef178e108c5cb10 Mon Sep 17 00:00:00 2001 From: Igor K Date: Fri, 21 Jun 2019 04:29:02 +0300 Subject: [PATCH 035/109] Update vdev_ops_t from illumos Align vdev_ops_t from illumos for better compatibility. 
Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Igor Kozhukhov Closes #8925 --- module/zfs/vdev_disk.c | 26 ++++++------- module/zfs/vdev_file.c | 52 ++++++++++++------------- module/zfs/vdev_indirect.c | 26 ++++++------- module/zfs/vdev_mirror.c | 78 +++++++++++++++++++------------------- module/zfs/vdev_missing.c | 52 ++++++++++++------------- module/zfs/vdev_raidz.c | 26 ++++++------- module/zfs/vdev_root.c | 26 ++++++------- 7 files changed, 143 insertions(+), 143 deletions(-) diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 1419ae6ad54a..1686ddfce77d 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -935,19 +935,19 @@ param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) } vdev_ops_t vdev_disk_ops = { - vdev_disk_open, - vdev_disk_close, - vdev_default_asize, - vdev_disk_io_start, - vdev_disk_io_done, - NULL, - NULL, - vdev_disk_hold, - vdev_disk_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_disk_open, + .vdev_op_close = vdev_disk_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_disk_io_start, + .vdev_op_io_done = vdev_disk_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_disk_hold, + .vdev_op_rele = vdev_disk_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index c155057852a3..b79017f3a610 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -277,19 +277,19 @@ vdev_file_io_done(zio_t *zio) } vdev_ops_t vdev_file_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - 
NULL, - vdev_default_xlate, - VDEV_TYPE_FILE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; void @@ -313,19 +313,19 @@ vdev_file_fini(void) #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; #endif diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 4d18e33c0ab7..4539fa638ada 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1842,19 +1842,19 @@ vdev_indirect_io_done(zio_t *zio) } vdev_ops_t vdev_indirect_ops = { - vdev_indirect_open, - vdev_indirect_close, - vdev_default_asize, - vdev_indirect_io_start, - vdev_indirect_io_done, - NULL, - NULL, - NULL, - NULL, - vdev_indirect_remap, - NULL, - VDEV_TYPE_INDIRECT, /* name of this vdev type */ - B_FALSE /* leaf vdev */ + .vdev_op_open = 
vdev_indirect_open, + .vdev_op_close = vdev_indirect_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_indirect_io_start, + .vdev_op_io_done = vdev_indirect_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = vdev_indirect_remap, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* leaf vdev */ }; #if defined(_KERNEL) diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 59cc2dcdd2ca..23ff75bfc96f 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -786,51 +786,51 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) } vdev_ops_t vdev_mirror_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_MIRROR, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_replacing_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_REPLACING, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + 
.vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_spare_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_SPARE, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; #if defined(_KERNEL) diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c index d85993bff052..205b23eba7f5 100644 --- a/module/zfs/vdev_missing.c +++ b/module/zfs/vdev_missing.c @@ -80,33 +80,33 @@ vdev_missing_io_done(zio_t *zio) } vdev_ops_t vdev_missing_ops = { - vdev_missing_open, - vdev_missing_close, - vdev_default_asize, - vdev_missing_io_start, - vdev_missing_io_done, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_MISSING, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_missing_open, + .vdev_op_close = vdev_missing_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_missing_io_start, + .vdev_op_io_done = vdev_missing_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + 
.vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; vdev_ops_t vdev_hole_ops = { - vdev_missing_open, - vdev_missing_close, - vdev_default_asize, - vdev_missing_io_start, - vdev_missing_io_done, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_HOLE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_missing_open, + .vdev_op_close = vdev_missing_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_missing_io_start, + .vdev_op_io_done = vdev_missing_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 215cd1c12064..327b186713fa 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2403,17 +2403,17 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) } vdev_ops_t vdev_raidz_ops = { - vdev_raidz_open, - vdev_raidz_close, - vdev_raidz_asize, - vdev_raidz_io_start, - vdev_raidz_io_done, - vdev_raidz_state_change, - vdev_raidz_need_resilver, - NULL, - NULL, - NULL, - vdev_raidz_xlate, - VDEV_TYPE_RAIDZ, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_raidz_open, + .vdev_op_close = vdev_raidz_close, + .vdev_op_asize = vdev_raidz_asize, + .vdev_op_io_start = vdev_raidz_io_start, + .vdev_op_io_done = vdev_raidz_io_done, + .vdev_op_state_change = vdev_raidz_state_change, + .vdev_op_need_resilver = vdev_raidz_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_raidz_xlate, + .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ + 
.vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index e40b7ce8e4e8..7170f7013608 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -140,17 +140,17 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded) } vdev_ops_t vdev_root_ops = { - vdev_root_open, - vdev_root_close, - vdev_default_asize, - NULL, /* io_start - not applicable to the root */ - NULL, /* io_done - not applicable to the root */ - vdev_root_state_change, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_ROOT, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_root_open, + .vdev_op_close = vdev_root_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = NULL, /* not applicable to the root */ + .vdev_op_io_done = NULL, /* not applicable to the root */ + .vdev_op_state_change = vdev_root_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; From 3c2a42fd254917db78484c428bd317ec7189c968 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 20 Jun 2019 18:30:40 -0700 Subject: [PATCH 036/109] dedup=verify doesn't clear the blkptr's dedup flag The logic to handle strong checksum collisions where the data doesn't match is incorrect. It is not clearing the dedup bit of the blkptr, which can cause a panic later in zio_ddt_free() due to the dedup table not matching what is in the blkptr. 
Reviewed-by: Tom Caputi Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens External-issue: DLPX-48097 Closes #8936 --- module/zfs/zio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 5bfff37eb3b5..f9503bd3ff81 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3192,7 +3192,9 @@ zio_ddt_write(zio_t *zio) BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; + BP_SET_DEDUP(bp, B_FALSE); } + ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); From ab24c9cd4cbba2c4d5cb68f3e1e08dcf2275dc34 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Fri, 21 Jun 2019 10:31:53 +0900 Subject: [PATCH 037/109] Prevent pointer to an out-of-scope local variable `show_str` could be a pointer to a local variable in stack which is out-of-scope by the time `return (snprintf(buf, buflen, "%s\n", show_str));` is called. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Tomohiro Kusumi Closes #8924 Closes #8940 --- module/zfs/zfs_sysfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/module/zfs/zfs_sysfs.c b/module/zfs/zfs_sysfs.c index 2f5bea9aa996..bb7f3b69a662 100644 --- a/module/zfs/zfs_sysfs.c +++ b/module/zfs/zfs_sysfs.c @@ -264,6 +264,7 @@ zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property, char *buf, size_t buflen) { const char *show_str; + char number[32]; /* For dataset properties list the dataset types that apply */ if (strcmp(attr_name, "datasets") == 0 && @@ -291,8 +292,6 @@ zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property, } else if (strcmp(attr_name, "values") == 0) { show_str = property->pd_values ? 
property->pd_values : ""; } else if (strcmp(attr_name, "default") == 0) { - char number[32]; - switch (property->pd_proptype) { case PROP_TYPE_NUMBER: (void) snprintf(number, sizeof (number), "%llu", From 1fd28bd8d4e102a4ce5e4910427f612c7cf73e68 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Fri, 21 Jun 2019 09:40:56 -0700 Subject: [PATCH 038/109] Add SCSI_PASSTHROUGH to zvols to enable UNMAP support When exporting ZVOLs as SCSI LUNs, by default Windows will not issue them UNMAP commands. This reduces storage efficiency in many cases. We add the SCSI_PASSTHROUGH flag to the zvol's device queue, which lets the SCSI target logic know that it can handle SCSI commands. Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Reviewed-by: John Gallagher Signed-off-by: Paul Dagnelie Closes #8933 --- module/zfs/zvol.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index c29f65f676b9..7c7500dbaaf7 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -1876,6 +1876,10 @@ zvol_create_minor_impl(const char *name) #ifdef QUEUE_FLAG_ADD_RANDOM blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue); #endif + /* This flag was introduced in kernel version 4.12. 
*/ +#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH + blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_queue); +#endif if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) From 7a5f4656ce76dbb2c7f3c6810f670a713da48a9e Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Sat, 22 Jun 2019 16:32:26 -0700 Subject: [PATCH 039/109] Fix comments on zfs_bookmark_phys Reviewed-by: Brian Behlendorf Reviewed-by: Matt Ahrens Reviewed-by: George Melikov Signed-off-by: Paul Dagnelie Closes #8945 --- include/sys/dsl_bookmark.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/sys/dsl_bookmark.h b/include/sys/dsl_bookmark.h index 3cdad7441407..ea7d70cf3232 100644 --- a/include/sys/dsl_bookmark.h +++ b/include/sys/dsl_bookmark.h @@ -37,9 +37,11 @@ typedef struct zfs_bookmark_phys { uint64_t zbm_creation_txg; /* birth transaction group */ uint64_t zbm_creation_time; /* bookmark creation time */ - /* the following fields are reserved for redacted send / recv */ + /* fields used for redacted send / recv */ uint64_t zbm_redaction_obj; /* redaction list object */ uint64_t zbm_flags; /* ZBM_FLAG_* */ + + /* fields used for bookmark written size */ uint64_t zbm_referenced_bytes_refd; uint64_t zbm_compressed_bytes_refd; uint64_t zbm_uncompressed_bytes_refd; From d053481523b369b7c00f5fd1c1b1ae54876b8f69 Mon Sep 17 00:00:00 2001 From: Allan Jude Date: Sat, 22 Jun 2019 19:33:44 -0400 Subject: [PATCH 040/109] zstreamdump: add per-record-type counters and an overhead counter Count the bytes of payload for each replication record type Count the bytes of overhead (replication records themselves) Include these counters in the output summary at the end of the run. 
Reviewed-by: Brian Behlendorf Reviewed-by: Matt Ahrens Signed-off-by: Allan Jude Sponsored-By: Klara Systems and Catalogic Closes #8432 --- cmd/zstreamdump/zstreamdump.c | 63 ++++++++++++------- .../tests/functional/rsend/rsend.kshlib | 2 +- 2 files changed, 42 insertions(+), 23 deletions(-) diff --git a/cmd/zstreamdump/zstreamdump.c b/cmd/zstreamdump/zstreamdump.c index a162eceda58f..a65b4cef3d31 100644 --- a/cmd/zstreamdump/zstreamdump.c +++ b/cmd/zstreamdump/zstreamdump.c @@ -53,7 +53,6 @@ */ #define DUMP_GROUPING 4 -uint64_t total_write_size = 0; uint64_t total_stream_len = 0; FILE *send_stream = 0; boolean_t do_byteswap = B_FALSE; @@ -219,6 +218,9 @@ main(int argc, char *argv[]) { char *buf = safe_malloc(SPA_MAXBLOCKSIZE); uint64_t drr_record_count[DRR_NUMTYPES] = { 0 }; + uint64_t total_payload_size = 0; + uint64_t total_overhead_size = 0; + uint64_t drr_byte_count[DRR_NUMTYPES] = { 0 }; char salt[ZIO_DATA_SALT_LEN * 2 + 1]; char iv[ZIO_DATA_IV_LEN * 2 + 1]; char mac[ZIO_DATA_MAC_LEN * 2 + 1]; @@ -336,7 +338,9 @@ main(int argc, char *argv[]) } drr_record_count[drr->drr_type]++; + total_overhead_size += sizeof (*drr); total_records++; + payload_size = 0; switch (drr->drr_type) { case DRR_BEGIN: @@ -390,6 +394,7 @@ main(int argc, char *argv[]) nvlist_print(stdout, nv); nvlist_free(nv); } + payload_size = sz; } break; @@ -554,7 +559,6 @@ main(int argc, char *argv[]) if (dump) { print_block(buf, payload_size); } - total_write_size += payload_size; break; case DRR_WRITE_BYREF: @@ -683,6 +687,7 @@ main(int argc, char *argv[]) print_block(buf, P2ROUNDUP(drrwe->drr_psize, 8)); } + payload_size = P2ROUNDUP(drrwe->drr_psize, 8); break; case DRR_OBJECT_RANGE: if (do_byteswap) { @@ -723,6 +728,8 @@ main(int argc, char *argv[]) (longlong_t)drrc->drr_checksum.zc_word[3]); } pcksum = zc; + drr_byte_count[drr->drr_type] += payload_size; + total_payload_size += payload_size; } free(buf); fletcher_4_fini(); @@ -730,28 +737,40 @@ main(int argc, char *argv[]) /* Print final 
summary */ (void) printf("SUMMARY:\n"); - (void) printf("\tTotal DRR_BEGIN records = %lld\n", - (u_longlong_t)drr_record_count[DRR_BEGIN]); - (void) printf("\tTotal DRR_END records = %lld\n", - (u_longlong_t)drr_record_count[DRR_END]); - (void) printf("\tTotal DRR_OBJECT records = %lld\n", - (u_longlong_t)drr_record_count[DRR_OBJECT]); - (void) printf("\tTotal DRR_FREEOBJECTS records = %lld\n", - (u_longlong_t)drr_record_count[DRR_FREEOBJECTS]); - (void) printf("\tTotal DRR_WRITE records = %lld\n", - (u_longlong_t)drr_record_count[DRR_WRITE]); - (void) printf("\tTotal DRR_WRITE_BYREF records = %lld\n", - (u_longlong_t)drr_record_count[DRR_WRITE_BYREF]); - (void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld\n", - (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED]); - (void) printf("\tTotal DRR_FREE records = %lld\n", - (u_longlong_t)drr_record_count[DRR_FREE]); - (void) printf("\tTotal DRR_SPILL records = %lld\n", - (u_longlong_t)drr_record_count[DRR_SPILL]); + (void) printf("\tTotal DRR_BEGIN records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_BEGIN], + (u_longlong_t)drr_byte_count[DRR_BEGIN]); + (void) printf("\tTotal DRR_END records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_END], + (u_longlong_t)drr_byte_count[DRR_END]); + (void) printf("\tTotal DRR_OBJECT records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_OBJECT], + (u_longlong_t)drr_byte_count[DRR_OBJECT]); + (void) printf("\tTotal DRR_FREEOBJECTS records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_FREEOBJECTS], + (u_longlong_t)drr_byte_count[DRR_FREEOBJECTS]); + (void) printf("\tTotal DRR_WRITE records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_WRITE], + (u_longlong_t)drr_byte_count[DRR_WRITE]); + (void) printf("\tTotal DRR_WRITE_BYREF records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_WRITE_BYREF], + (u_longlong_t)drr_byte_count[DRR_WRITE_BYREF]); + (void) printf("\tTotal DRR_WRITE_EMBEDDED records 
= %lld (%llu " + "bytes)\n", (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED], + (u_longlong_t)drr_byte_count[DRR_WRITE_EMBEDDED]); + (void) printf("\tTotal DRR_FREE records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_FREE], + (u_longlong_t)drr_byte_count[DRR_FREE]); + (void) printf("\tTotal DRR_SPILL records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_SPILL], + (u_longlong_t)drr_byte_count[DRR_SPILL]); (void) printf("\tTotal records = %lld\n", (u_longlong_t)total_records); - (void) printf("\tTotal write size = %lld (0x%llx)\n", - (u_longlong_t)total_write_size, (u_longlong_t)total_write_size); + (void) printf("\tTotal payload size = %lld (0x%llx)\n", + (u_longlong_t)total_payload_size, (u_longlong_t)total_payload_size); + (void) printf("\tTotal header overhead = %lld (0x%llx)\n", + (u_longlong_t)total_overhead_size, + (u_longlong_t)total_overhead_size); (void) printf("\tTotal stream length = %lld (0x%llx)\n", (u_longlong_t)total_stream_len, (u_longlong_t)total_stream_len); return (0); diff --git a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib index 521a1c7eb63c..8737ae55abfa 100644 --- a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib +++ b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib @@ -754,7 +754,7 @@ function verify_stream_size datasetexists $ds || log_fail "No such dataset: $ds" typeset stream_size=$(cat $stream | zstreamdump | sed -n \ - 's/ Total write size = \(.*\) (0x.*)/\1/p') + 's/ Total payload size = \(.*\) (0x.*)/\1/p') typeset inc_size=0 if [[ -n $inc_src ]]; then From 95fcb04215015950b3388ba0a6edad8e1b463415 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Sat, 22 Jun 2019 16:41:21 -0700 Subject: [PATCH 041/109] Let zfs mount all tolerate in-progress mounts The zfs-mount service can unexpectedly fail to start when zfs encounters a mount that is in progress. 
This service uses zfs mount -a, which has a window between the time it checks if the dataset was mounted and when the actual mount (via mount.zfs binary) occurs. The reason for the racing mounts is that both zfs-mount.target and zfs-share.target are allowed to execute concurrently after the import. This is more of an issue with the relatively recent addition of parallel mounting, and we should consider serializing the mount and share targets. Reviewed-by: Brian Behlendorf Reviewed by: John Kennedy Reviewed-by: Allan Jude Signed-off-by: Don Brady Closes #8881 --- cmd/zfs/zfs_main.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 214a437c5dd1..074216055227 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -6446,8 +6446,25 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, return (1); } - if (zfs_mount(zhp, options, flags) != 0) + if (zfs_mount(zhp, options, flags) != 0) { + /* + * Check if a mount sneaked in after we checked + */ + if (!explicit && + libzfs_errno(g_zfs) == EZFS_MOUNTFAILED) { + usleep(10 * MILLISEC); + libzfs_mnttab_cache(g_zfs, B_FALSE); + + if (zfs_is_mounted(zhp, NULL)) { + (void) fprintf(stderr, gettext( + "Ignoring previous 'already " + "mounted' error for '%s'\n"), + zfs_get_name(zhp)); + return (0); + } + } return (1); + } break; } From 2d88230d97d9f9f4f3b89d1081eeab86fe3d9373 Mon Sep 17 00:00:00 2001 From: Harry Mallon <1816667+hjmallon@users.noreply.github.com> Date: Sun, 23 Jun 2019 00:43:11 +0100 Subject: [PATCH 042/109] Add libnvpair to libzfs pkg-config Functions such as `fnvlist_lookup_nvlist` need libnvpair to be linked. Default pkg-config file did not contain it. 
Reviewed-by: Brian Behlendorf Signed-off-by: Harry Mallon Closes #8919 --- lib/libzfs/libzfs.pc.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/libzfs/libzfs.pc.in b/lib/libzfs/libzfs.pc.in index 0e83f7a64be0..1122401a6eb9 100644 --- a/lib/libzfs/libzfs.pc.in +++ b/lib/libzfs/libzfs.pc.in @@ -9,4 +9,4 @@ Version: @VERSION@ URL: http://zfsonlinux.org Requires: libzfs_core Cflags: -I${includedir}/libzfs -I${includedir}/libspl -Libs: -L${libdir} -lzfs +Libs: -L${libdir} -lzfs -lnvpair From be4a282a8ddc6bf42ec1ba9c3b99d06052f1d625 Mon Sep 17 00:00:00 2001 From: gordan-bobic Date: Sun, 23 Jun 2019 00:47:19 +0100 Subject: [PATCH 043/109] Remove arch and relax version dependency Remove arch and relax version dependency for zfs-dracut package. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Gordan Bobic Issue #8913 Closes #8914 --- rpm/generic/zfs.spec.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 9faa3ba771a1..0b16cd0e886b 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -255,7 +255,8 @@ validating the file system. %package dracut Summary: Dracut module Group: System Environment/Kernel -Requires: %{name}%{?_isa} = %{version}-%{release} +BuildArch: noarch +Requires: %{name} >= %{version} Requires: dracut Requires: /usr/bin/awk Requires: grep From 7d64595c251682f4a38809ecd44e81b4d1af8b74 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Sat, 22 Jun 2019 16:48:54 -0700 Subject: [PATCH 044/109] dn_struct_rwlock can not be held in dmu_tx_try_assign() The thread calling dmu_tx_try_assign() can't hold the dn_struct_rwlock while assigning the tx, because this can lead to deadlock. Specifically, if this dnode is already assigned to an earlier txg, this thread may need to wait for that txg to sync (the ERESTART case below). 
The other thread that has assigned this dnode to an earlier txg prevents this txg from syncing until its tx can complete (calling dmu_tx_commit()), but it may need to acquire the dn_struct_rwlock to do so (e.g. via dmu_buf_hold*()). This commit adds an assertion to dmu_tx_try_assign() to ensure that this deadlock is not inadvertently introduced. Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens Closes #8929 --- module/zfs/dmu_tx.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index cbadcc86fc61..7d65e842ff03 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -925,6 +925,25 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn != NULL) { + /* + * This thread can't hold the dn_struct_rwlock + * while assigning the tx, because this can lead to + * deadlock. Specifically, if this dnode is already + * assigned to an earlier txg, this thread may need + * to wait for that txg to sync (the ERESTART case + * below). The other thread that has assigned this + * dnode to an earlier txg prevents this txg from + * syncing until its tx can complete (calling + * dmu_tx_commit()), but it may need to acquire the + * dn_struct_rwlock to do so (e.g. via + * dmu_buf_hold*()). + * + * Note that this thread can't hold the lock for + * read either, but the rwlock doesn't record + * enough information to make that assertion. + */ + ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock)); + mutex_enter(&dn->dn_mtx); if (dn->dn_assigned_txg == tx->tx_txg - 1) { mutex_exit(&dn->dn_mtx); From cc7fe8a59967092a9b42355794a1859feb30548f Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 24 Jun 2019 09:32:47 -0700 Subject: [PATCH 045/109] Fix out-of-tree build failures Resolve the incorrect use of srcdir and builddir references for various files in the build system. 
These have crept in over time and went unnoticed because when building in the top level directory srcdir and builddir are identical. With this change it's again possible to build in a subdirectory. $ mkdir obj $ cd obj $ ../configure $ make Reviewed-by: loli10K Reviewed-by: Tony Hutter Reviewed-by: Don Brady Signed-off-by: Brian Behlendorf Closes #8921 Closes #8943 --- Makefile.am | 3 +- cmd/zed/Makefile.am | 57 +------------------ cmd/zed/zed.d/Makefile.am | 57 +++++++++++++++++++ configure.ac | 1 + contrib/initramfs/Makefile.am | 21 ++++--- contrib/pyzfs/Makefile.am | 2 +- module/Makefile.in | 5 +- scripts/Makefile.am | 5 +- tests/runfiles/Makefile.am | 5 +- .../tests/functional/checksum/Makefile.am | 2 +- .../tests/functional/hkdf/Makefile.am | 2 +- 11 files changed, 88 insertions(+), 72 deletions(-) create mode 100644 cmd/zed/zed.d/Makefile.am diff --git a/Makefile.am b/Makefile.am index 1ec2514922a9..9afe22954101 100644 --- a/Makefile.am +++ b/Makefile.am @@ -52,7 +52,8 @@ distclean-local:: -type f -print | xargs $(RM) all-local: - -${top_srcdir}/scripts/zfs-tests.sh -c + -[ -x ${top_builddir}/scripts/zfs-tests.sh ] && \ + ${top_builddir}/scripts/zfs-tests.sh -c dist-hook: gitrev cp ${top_srcdir}/include/zfs_gitrev.h $(distdir)/include; \ diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am index 9c11315f2a58..fb479f9b5c79 100644 --- a/cmd/zed/Makefile.am +++ b/cmd/zed/Makefile.am @@ -1,12 +1,11 @@ +SUBDIRS = zed.d + include $(top_srcdir)/config/Rules.am DEFAULT_INCLUDES += \ -I$(top_srcdir)/include \ -I$(top_srcdir)/lib/libspl/include -EXTRA_DIST = zed.d/README \ - zed.d/history_event-zfs-list-cacher.sh.in - sbin_PROGRAMS = zed ZED_SRC = \ @@ -47,55 +46,3 @@ zed_LDADD = \ zed_LDADD += -lrt zed_LDFLAGS = -pthread - -zedconfdir = $(sysconfdir)/zfs/zed.d - -dist_zedconf_DATA = \ - zed.d/zed-functions.sh \ - zed.d/zed.rc - -zedexecdir = $(zfsexecdir)/zed.d - -dist_zedexec_SCRIPTS = \ - zed.d/all-debug.sh \ - zed.d/all-syslog.sh \ - zed.d/data-notify.sh \ - 
zed.d/generic-notify.sh \ - zed.d/resilver_finish-notify.sh \ - zed.d/scrub_finish-notify.sh \ - zed.d/statechange-led.sh \ - zed.d/statechange-notify.sh \ - zed.d/vdev_clear-led.sh \ - zed.d/vdev_attach-led.sh \ - zed.d/pool_import-led.sh \ - zed.d/resilver_finish-start-scrub.sh - -nodist_zedexec_SCRIPTS = zed.d/history_event-zfs-list-cacher.sh - -$(nodist_zedexec_SCRIPTS): %: %.in - -$(SED) -e 's,@bindir\@,$(bindir),g' \ - -e 's,@runstatedir\@,$(runstatedir),g' \ - -e 's,@sbindir\@,$(sbindir),g' \ - -e 's,@sysconfdir\@,$(sysconfdir),g' \ - $< >'$@' - -zedconfdefaults = \ - all-syslog.sh \ - data-notify.sh \ - resilver_finish-notify.sh \ - scrub_finish-notify.sh \ - statechange-led.sh \ - statechange-notify.sh \ - vdev_clear-led.sh \ - vdev_attach-led.sh \ - pool_import-led.sh \ - resilver_finish-start-scrub.sh - -install-data-hook: - $(MKDIR_P) "$(DESTDIR)$(zedconfdir)" - for f in $(zedconfdefaults); do \ - test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \ - -L "$(DESTDIR)$(zedconfdir)/$${f}" || \ - ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \ - done - chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc" diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am new file mode 100644 index 000000000000..716db2b2f215 --- /dev/null +++ b/cmd/zed/zed.d/Makefile.am @@ -0,0 +1,57 @@ +include $(top_srcdir)/config/Rules.am + +EXTRA_DIST = \ + README \ + history_event-zfs-list-cacher.sh.in + +zedconfdir = $(sysconfdir)/zfs/zed.d + +dist_zedconf_DATA = \ + zed-functions.sh \ + zed.rc + +zedexecdir = $(zfsexecdir)/zed.d + +dist_zedexec_SCRIPTS = \ + all-debug.sh \ + all-syslog.sh \ + data-notify.sh \ + generic-notify.sh \ + resilver_finish-notify.sh \ + scrub_finish-notify.sh \ + statechange-led.sh \ + statechange-notify.sh \ + vdev_clear-led.sh \ + vdev_attach-led.sh \ + pool_import-led.sh \ + resilver_finish-start-scrub.sh + +nodist_zedexec_SCRIPTS = history_event-zfs-list-cacher.sh + +$(nodist_zedexec_SCRIPTS): %: %.in + -$(SED) -e 's,@bindir\@,$(bindir),g' \ + -e 
's,@runstatedir\@,$(runstatedir),g' \ + -e 's,@sbindir\@,$(sbindir),g' \ + -e 's,@sysconfdir\@,$(sysconfdir),g' \ + $< >'$@' + +zedconfdefaults = \ + all-syslog.sh \ + data-notify.sh \ + resilver_finish-notify.sh \ + scrub_finish-notify.sh \ + statechange-led.sh \ + statechange-notify.sh \ + vdev_clear-led.sh \ + vdev_attach-led.sh \ + pool_import-led.sh \ + resilver_finish-start-scrub.sh + +install-data-hook: + $(MKDIR_P) "$(DESTDIR)$(zedconfdir)" + for f in $(zedconfdefaults); do \ + test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \ + -L "$(DESTDIR)$(zedconfdir)/$${f}" || \ + ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \ + done + chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc" diff --git a/configure.ac b/configure.ac index db614084e37e..ea2e355c70bf 100644 --- a/configure.ac +++ b/configure.ac @@ -120,6 +120,7 @@ AC_CONFIG_FILES([ cmd/dbufstat/Makefile cmd/arc_summary/Makefile cmd/zed/Makefile + cmd/zed/zed.d/Makefile cmd/raidz_test/Makefile cmd/zgenhostid/Makefile contrib/Makefile diff --git a/contrib/initramfs/Makefile.am b/contrib/initramfs/Makefile.am index 87ec7a86f5ac..9f912d946649 100644 --- a/contrib/initramfs/Makefile.am +++ b/contrib/initramfs/Makefile.am @@ -11,13 +11,18 @@ EXTRA_DIST = \ $(top_srcdir)/contrib/initramfs/README.initramfs.markdown install-initrdSCRIPTS: $(EXTRA_DIST) - for d in conf.d conf-hooks.d hooks scripts scripts/local-top; do \ - $(MKDIR_P) $(DESTDIR)$(initrddir)/$$d; \ - cp $(top_srcdir)/contrib/initramfs/$$d/zfs \ - $(DESTDIR)$(initrddir)/$$d/; \ + for d in conf.d conf-hooks.d scripts/local-top; do \ + $(MKDIR_P) $(DESTDIR)$(initrddir)/$$d; \ + cp $(top_srcdir)/contrib/initramfs/$$d/zfs \ + $(DESTDIR)$(initrddir)/$$d/; \ done - if [ -f etc/init.d/zfs ]; then \ - $(MKDIR_P) $(DESTDIR)$(DEFAULT_INITCONF_DIR); \ - cp $(top_srcdir)/etc/init.d/zfs \ - $(DESTDIR)$(DEFAULT_INITCONF_DIR)/; \ + for d in hooks scripts; do \ + $(MKDIR_P) $(DESTDIR)$(initrddir)/$$d; \ + cp $(top_builddir)/contrib/initramfs/$$d/zfs \ + 
$(DESTDIR)$(initrddir)/$$d/; \ + done + if [ -f $(top_builddir)/etc/init.d/zfs ]; then \ + $(MKDIR_P) $(DESTDIR)$(DEFAULT_INITCONF_DIR); \ + cp $(top_builddir)/etc/init.d/zfs \ + $(DESTDIR)$(DEFAULT_INITCONF_DIR)/; \ fi diff --git a/contrib/pyzfs/Makefile.am b/contrib/pyzfs/Makefile.am index 1549bf237932..fa1bb32ce2eb 100644 --- a/contrib/pyzfs/Makefile.am +++ b/contrib/pyzfs/Makefile.am @@ -24,7 +24,7 @@ all-local: # files are later created by manually loading the Python modules. # install-exec-local: - $(PYTHON) $(srcdir)/setup.py install \ + $(PYTHON) $(builddir)/setup.py install \ --prefix $(prefix) \ --root $(DESTDIR)/ \ --install-lib $(pythonsitedir) \ diff --git a/module/Makefile.in b/module/Makefile.in index 935bd2663062..eca7691aedbb 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -66,8 +66,9 @@ modules_uninstall: distdir: list='$(subdir-m)'; for subdir in $$list; do \ - (cd @top_srcdir@/module && find $$subdir -name '*.c' -o -name '*.h' -o -name '*.S' |\ - xargs cp --parents -t $$distdir); \ + (cd @top_srcdir@/module && find $$subdir \ + -name '*.c' -o -name '*.h' -o -name '*.S' | \ + xargs cp --parents -t @abs_top_builddir@/module/$$distdir); \ done distclean maintainer-clean: clean diff --git a/scripts/Makefile.am b/scripts/Makefile.am index 11e963c527a8..d275a41c4e04 100644 --- a/scripts/Makefile.am +++ b/scripts/Makefile.am @@ -60,7 +60,7 @@ all-local: -e '\|^export SBIN_DIR=|s|$$|@abs_top_builddir@/bin|' \ -e '\|^export ZTS_DIR=|s|$$|@abs_top_srcdir@/tests|' \ -e '\|^export SCRIPT_DIR=|s|$$|@abs_top_srcdir@/scripts|' \ - common.sh.in >common.sh + $(abs_top_srcdir)/scripts/common.sh.in >common.sh -echo "$$EXTRA_ENVIRONMENT" >>common.sh clean-local: @@ -71,4 +71,5 @@ install-data-hook: -e '\|^export SBIN_DIR=|s|$$|@sbindir@|' \ -e '\|^export ZTS_DIR=|s|$$|@datadir@/@PACKAGE@|' \ -e '\|^export SCRIPT_DIR=|s|$$|@datadir@/@PACKAGE@|' \ - common.sh.in >$(DESTDIR)$(datadir)/@PACKAGE@/common.sh + $(abs_top_srcdir)/scripts/common.sh.in \ + 
>$(DESTDIR)$(datadir)/@PACKAGE@/common.sh diff --git a/tests/runfiles/Makefile.am b/tests/runfiles/Makefile.am index 138d905a5722..4625806ff8ba 100644 --- a/tests/runfiles/Makefile.am +++ b/tests/runfiles/Makefile.am @@ -1,2 +1,5 @@ pkgdatadir = $(datadir)/@PACKAGE@/runfiles -dist_pkgdata_DATA = *.run +dist_pkgdata_DATA = \ + linux.run \ + longevity.run \ + perf-regression.run diff --git a/tests/zfs-tests/tests/functional/checksum/Makefile.am b/tests/zfs-tests/tests/functional/checksum/Makefile.am index f72546b22590..905d991ed75f 100644 --- a/tests/zfs-tests/tests/functional/checksum/Makefile.am +++ b/tests/zfs-tests/tests/functional/checksum/Makefile.am @@ -1,7 +1,7 @@ include $(top_srcdir)/config/Rules.am AM_CPPFLAGS += -I$(top_srcdir)/include -LDADD = $(top_srcdir)/lib/libicp/libicp.la +LDADD = $(top_builddir)/lib/libicp/libicp.la AUTOMAKE_OPTIONS = subdir-objects diff --git a/tests/zfs-tests/tests/functional/hkdf/Makefile.am b/tests/zfs-tests/tests/functional/hkdf/Makefile.am index 3ac26ed21c16..b54e353cd963 100644 --- a/tests/zfs-tests/tests/functional/hkdf/Makefile.am +++ b/tests/zfs-tests/tests/functional/hkdf/Makefile.am @@ -2,7 +2,7 @@ include $(top_srcdir)/config/Rules.am AM_CPPFLAGS += -I$(top_srcdir)/include AM_CPPFLAGS += -I$(top_srcdir)/lib/libspl/include -LDADD = $(top_srcdir)/lib/libzpool/libzpool.la +LDADD = $(top_builddir)/lib/libzpool/libzpool.la AUTOMAKE_OPTIONS = subdir-objects From bfe5f029cfb0ae5e246898baf928c944c220ff46 Mon Sep 17 00:00:00 2001 From: Tom Caputi Date: Mon, 24 Jun 2019 19:42:52 -0400 Subject: [PATCH 046/109] Fix error message on promoting encrypted dataset This patch corrects the error message reported when attempting to promote a dataset outside of its encryption root. 
Reviewed-by: Brian Behlendorf Signed-off-by: Tom Caputi Closes #8905 Closes #8935 --- lib/libzfs/libzfs_dataset.c | 10 ++++++++++ module/zfs/dsl_crypt.c | 8 ++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index ee5a6412ead5..0d0194e68453 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -4117,6 +4117,16 @@ zfs_promote(zfs_handle_t *zhp) if (ret != 0) { switch (ret) { + case EACCES: + /* + * Promoting encrypted dataset outside its + * encryption root. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot promote dataset outside its " + "encryption root")); + return (zfs_error(hdl, EZFS_EXISTS, errbuf)); + case EEXIST: /* There is a conflicting snapshot name. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c index 0c0ffaadd8fb..568fe7aa3263 100644 --- a/module/zfs/dsl_crypt.c +++ b/module/zfs/dsl_crypt.c @@ -1676,11 +1676,15 @@ dsl_dataset_promote_crypt_check(dsl_dir_t *target, dsl_dir_t *origin) * Check that the parent of the target has the same encryption root. */ ret = dsl_dir_get_encryption_root_ddobj(origin->dd_parent, &op_rddobj); - if (ret != 0) + if (ret == ENOENT) + return (SET_ERROR(EACCES)); + else if (ret != 0) return (ret); ret = dsl_dir_get_encryption_root_ddobj(target->dd_parent, &tp_rddobj); - if (ret != 0) + if (ret == ENOENT) + return (SET_ERROR(EACCES)); + else if (ret != 0) return (ret); if (op_rddobj != tp_rddobj) From 05006f125ccd97851d5f673483fb4ba606bdf0d3 Mon Sep 17 00:00:00 2001 From: Igor K Date: Tue, 25 Jun 2019 03:58:12 +0300 Subject: [PATCH 047/109] -Y option for zdb is valid The -Y option was added for ztest to test split block reconstruction. 
Reviewed-by: Brian Behlendorf Reviewed-by: Richard Elling Signed-off-by: Igor Kozhukhov Closes #8926 --- tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh index a5f827b5642f..e69779bd4b4c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh @@ -59,7 +59,7 @@ set -A args "create" "add" "destroy" "import fakepool" \ "-a" "-f" "-g" "-h" "-j" "-m" "-n" "-o" "-p" \ "-p /tmp" "-r" "-t" "-w" "-x" "-y" "-z" \ "-D" "-E" "-G" "-H" "-I" "-J" "-K" "-M" \ - "-N" "-Q" "-R" "-S" "-T" "-W" "-Y" "-Z" + "-N" "-Q" "-R" "-S" "-T" "-W" "-Z" log_assert "Execute zdb using invalid parameters." From 04d4df89f4526eecd66fa1c380dba5ee3aff261c Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 25 Jun 2019 15:03:38 -0400 Subject: [PATCH 048/109] Avoid extra taskq_dispatch() calls by DMU DMU sync code calls taskq_dispatch() for each sublist of os_dirty_dnodes and os_synced_dnodes. Since the number of sublists by default is equal to the number of CPUs, it will dispatch an equal, potentially large, number of tasks, waking up many CPUs to handle them, even if only one or a few of the sublists actually have any work to do. This change adds a check for empty sublists to avoid this.
Reviewed by: Sean Eric Fagan Reviewed by: Matt Ahrens Reviewed by: Brian Behlendorf Signed-off-by: Alexander Motin Closes #8909 --- include/sys/multilist.h | 2 ++ module/zfs/dmu_objset.c | 19 ++++++++++++++----- module/zfs/multilist.c | 22 ++++++++++++++++++++++ 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/include/sys/multilist.h b/include/sys/multilist.h index 439540685971..0c7b4075d9a3 100644 --- a/include/sys/multilist.h +++ b/include/sys/multilist.h @@ -89,6 +89,8 @@ void multilist_sublist_insert_head(multilist_sublist_t *, void *); void multilist_sublist_insert_tail(multilist_sublist_t *, void *); void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); void multilist_sublist_remove(multilist_sublist_t *, void *); +int multilist_sublist_is_empty(multilist_sublist_t *); +int multilist_sublist_is_empty_idx(multilist_t *, unsigned int); void *multilist_sublist_head(multilist_sublist_t *); void *multilist_sublist_tail(multilist_sublist_t *); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 30436b188fc4..29ed45a55dc7 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1692,6 +1692,8 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zio_t *zio; list_t *list; dbuf_dirty_record_t *dr; + int num_sublists; + multilist_t *ml; blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); *blkptr_copy = *os->os_rootbp; @@ -1780,10 +1782,13 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) } } - for (int i = 0; - i < multilist_get_num_sublists(os->os_dirty_dnodes[txgoff]); i++) { + ml = os->os_dirty_dnodes[txgoff]; + num_sublists = multilist_get_num_sublists(ml); + for (int i = 0; i < num_sublists; i++) { + if (multilist_sublist_is_empty_idx(ml, i)) + continue; sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); - sda->sda_list = os->os_dirty_dnodes[txgoff]; + sda->sda_list = ml; sda->sda_sublist_idx = i; sda->sda_tx = tx; (void) 
taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, @@ -2086,6 +2091,8 @@ userquota_updates_task(void *arg) void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) { + int num_sublists; + if (!dmu_objset_userused_enabled(os)) return; @@ -2118,8 +2125,10 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } - for (int i = 0; - i < multilist_get_num_sublists(os->os_synced_dnodes); i++) { + num_sublists = multilist_get_num_sublists(os->os_synced_dnodes); + for (int i = 0; i < num_sublists; i++) { + if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i)) + continue; userquota_updates_arg_t *uua = kmem_alloc(sizeof (*uua), KM_SLEEP); uua->uua_os = os; diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c index 2a594c56cbd5..b74ee0f0670a 100644 --- a/module/zfs/multilist.c +++ b/module/zfs/multilist.c @@ -363,6 +363,28 @@ multilist_sublist_remove(multilist_sublist_t *mls, void *obj) list_remove(&mls->mls_list, obj); } +int +multilist_sublist_is_empty(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_is_empty(&mls->mls_list)); +} + +int +multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx) +{ + multilist_sublist_t *mls; + int empty; + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + mls = &ml->ml_sublists[sublist_idx]; + ASSERT(!MUTEX_HELD(&mls->mls_lock)); + mutex_enter(&mls->mls_lock); + empty = list_is_empty(&mls->mls_list); + mutex_exit(&mls->mls_lock); + return (empty); +} + void * multilist_sublist_head(multilist_sublist_t *mls) { From 7d2489cfad1b04c1b22292d0a9a58f85195ce00c Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 28 Jun 2019 15:40:24 -0400 Subject: [PATCH 049/109] nopwrites on dmu_sync-ed blocks can result in a panic After device removal, performing nopwrites on a dmu_sync-ed block will result in a panic. This panic can show up in two ways: 1. an attempt to issue an IOCTL in vdev_indirect_io_start() 2. 
a failed comparison of zio->io_bp and zio->io_bp_orig in zio_done() To resolve both of these panics, nopwrites of blocks on indirect vdevs should be ignored and new allocations should be performed on concrete vdevs. Reviewed-by: Igor Kozhukhov Reviewed-by: Pavel Zakharov Reviewed-by: Brian Behlendorf Reviewed-by: Don Brady Signed-off-by: George Wilson Closes #8957 --- module/zfs/zio.c | 14 +++ tests/runfiles/linux.run | 2 +- .../tests/functional/removal/Makefile.am | 2 +- .../functional/removal/removal_nopwrite.ksh | 87 +++++++++++++++++++ 4 files changed, 103 insertions(+), 2 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/removal/removal_nopwrite.ksh diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f9503bd3ff81..94eaa5888a9c 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2862,6 +2862,20 @@ zio_nop_write(zio_t *zio) ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, sizeof (uint64_t)) == 0); + /* + * If we're overwriting a block that is currently on an + * indirect vdev, then ignore the nopwrite request and + * allow a new block to be allocated on a concrete vdev. 
+ */ + spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); + vdev_t *tvd = vdev_lookup_top(zio->io_spa, + DVA_GET_VDEV(&bp->blk_dva[0])); + if (tvd->vdev_ops == &vdev_indirect_ops) { + spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); + return (zio); + } + spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); + *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 22fc26212c0d..3f82676ef218 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -758,7 +758,7 @@ tags = ['functional', 'refreserv'] pre = tests = ['removal_all_vdev', 'removal_check_space', 'removal_condense_export', 'removal_multiple_indirection', - 'removal_remap', 'removal_remap_deadlists', + 'removal_remap', 'removal_nopwrite', 'removal_remap_deadlists', 'removal_resume_export', 'removal_sanity', 'removal_with_add', 'removal_with_create_fs', 'removal_with_dedup', 'removal_with_errors', 'removal_with_export', diff --git a/tests/zfs-tests/tests/functional/removal/Makefile.am b/tests/zfs-tests/tests/functional/removal/Makefile.am index ba42b899acac..df92e0b5ed44 100644 --- a/tests/zfs-tests/tests/functional/removal/Makefile.am +++ b/tests/zfs-tests/tests/functional/removal/Makefile.am @@ -18,7 +18,7 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/removal dist_pkgdata_SCRIPTS = \ cleanup.ksh removal_all_vdev.ksh removal_check_space.ksh \ removal_condense_export.ksh removal_multiple_indirection.ksh \ - removal_remap_deadlists.ksh removal_remap.ksh \ + removal_remap_deadlists.ksh removal_nopwrite.ksh removal_remap.ksh \ removal_reservation.ksh removal_resume_export.ksh \ removal_sanity.ksh removal_with_add.ksh removal_with_create_fs.ksh \ removal_with_dedup.ksh removal_with_errors.ksh \ diff --git a/tests/zfs-tests/tests/functional/removal/removal_nopwrite.ksh b/tests/zfs-tests/tests/functional/removal/removal_nopwrite.ksh new file mode 100755 index 
000000000000..cb8bd6b810c1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/removal/removal_nopwrite.ksh @@ -0,0 +1,87 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/removal/removal.kshlib +. $STF_SUITE/tests/functional/nopwrite/nopwrite.shlib + +default_setup_noexit "$DISKS" +log_onexit default_cleanup_noexit +BLOCKSIZE=8192 + +origin="$TESTPOOL/$TESTFS" + +log_must zfs set compress=on $origin +log_must zfs set checksum=edonr $origin + +log_must zfs set recordsize=8k $origin +dd if=/dev/urandom of=$TESTDIR/file_8k bs=1024k count=$MEGS oflag=sync \ + conv=notrunc >/dev/null 2>&1 || log_fail "dd into $TESTDIR/file failed." +log_must zfs set recordsize=128k $origin +dd if=/dev/urandom of=$TESTDIR/file_128k bs=1024k count=$MEGS oflag=sync \ + conv=notrunc >/dev/null 2>&1 || log_fail "dd into $TESTDIR/file failed." + +zfs snapshot $origin@a || log_fail "zfs snap failed" +log_must zfs clone $origin@a $origin/clone + +# +# Verify that nopwrites work prior to removal +# +log_must zfs set recordsize=8k $origin/clone +dd if=/$TESTDIR/file_8k of=/$TESTDIR/clone/file_8k bs=1024k \ + oflag=sync conv=notrunc >/dev/null 2>&1 || log_fail "dd failed." +log_must verify_nopwrite $origin $origin@a $origin/clone + +log_must zfs set recordsize=128k $origin/clone +dd if=/$TESTDIR/file_128k of=/$TESTDIR/clone/file_128k bs=1024k \ + oflag=sync conv=notrunc >/dev/null 2>&1 || log_fail "dd failed." 
+log_must verify_nopwrite $origin $origin@a $origin/clone + +# +# Remove a device before testing nopwrites again +# +log_must zpool remove $TESTPOOL $REMOVEDISK +log_must wait_for_removal $TESTPOOL +log_mustnot vdevs_in_pool $TESTPOOL $REMOVEDISK + +# +# Normally, we expect nopwrites to avoid allocating new blocks, but +# after a device has been removed the DVAs will get remapped when +# a L0's indirect block is written. This will negate the effects +# of nopwrite and should result in new allocations. +# + +# +# Perform a direct zil nopwrite test +# +log_must zfs set recordsize=8k $origin/clone +dd if=/$TESTDIR/file_8k of=/$TESTDIR/clone/file_8k bs=1024k \ + oflag=sync conv=notrunc >/dev/null 2>&1 || log_fail "dd failed." +log_mustnot verify_nopwrite $origin $origin@a $origin/clone + +# +# Perform an indirect zil nopwrite test +# +log_must zfs set recordsize=128k $origin/clone +dd if=/$TESTDIR/file_128k of=/$TESTDIR/clone/file_128k bs=1024k \ + oflag=sync conv=notrunc >/dev/null 2>&1 || log_fail "dd failed." +log_mustnot verify_nopwrite $origin $origin@a $origin/clone + +log_pass "Remove works with nopwrite." From 093bb6446120c50a7109ed7e7a0f2e76730b3160 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Wed, 3 Jul 2019 00:25:23 +0900 Subject: [PATCH 050/109] Don't use d_path() for automount mount point for chroot'd process Chroot'd process fails to automount snapshots due to realpath(3) failure in mount.zfs(8). Construct a mount point path from sb of the ctldir inode and dirent name, instead of from d_path(), so that chroot'd process doesn't get affected by its view of fs.
Reviewed-by: Brian Behlendorf Signed-off-by: Tomohiro Kusumi Closes #8903 Closes #8966 --- module/zfs/zfs_ctldir.c | 41 +++++++---------------------------------- 1 file changed, 7 insertions(+), 34 deletions(-) diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index c8071a7c215f..aa50646fef83 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -703,37 +703,6 @@ zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len, return (0); } -/* - * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/" - */ -static int -zfsctl_snapshot_path(struct path *path, int len, char *full_path) -{ - char *path_buffer, *path_ptr; - int path_len, error = 0; - - path_buffer = kmem_alloc(len, KM_SLEEP); - - path_ptr = d_path(path, path_buffer, len); - if (IS_ERR(path_ptr)) { - error = -PTR_ERR(path_ptr); - goto out; - } - - path_len = path_buffer + len - 1 - path_ptr; - if (path_len > len) { - error = SET_ERROR(EFAULT); - goto out; - } - - memcpy(full_path, path_ptr, path_len); - full_path[path_len] = '\0'; -out: - kmem_free(path_buffer, len); - - return (error); -} - /* * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/" */ @@ -1077,9 +1046,13 @@ zfsctl_snapshot_mount(struct path *path, int flags) if (error) goto error; - error = zfsctl_snapshot_path(path, MAXPATHLEN, full_path); - if (error) - goto error; + /* + * Construct a mount point path from sb of the ctldir inode and dirent + * name, instead of from d_path(), so that chroot'd process doesn't fail + * on mount.zfs(8). + */ + snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s", + zfsvfs->z_vfs->vfs_mntpoint, dname(dentry)); /* * Multiple concurrent automounts of a snapshot are never allowed. From 9e09826b33092bfe41dce14e098b2d2f4931da2f Mon Sep 17 00:00:00 2001 From: Tom Caputi Date: Tue, 2 Jul 2019 20:30:00 -0400 Subject: [PATCH 051/109] Fix error text for EINVAL in zfs_receive_one() This small patch fixes the EINVAL case for zfs_receive_one(). 
A missing 'else' has been added to the two possible cases, which will ensure the intended error message is printed. Reviewed-by: Brian Behlendorf Reviewed-by: loli10K Signed-off-by: Tom Caputi Closes #8977 --- lib/libzfs/libzfs_sendrecv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 052b96b9b653..0d3853e0a1c4 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -4418,14 +4418,15 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, *cp = '@'; break; case EINVAL: - if (flags->resumable) + if (flags->resumable) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "kernel modules must be upgraded to " "receive this stream.")); - if (embedded && !raw) + } else if (embedded && !raw) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incompatible embedded data stream " "feature with encrypted receive.")); + } (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: From 7a03d7c73cec63e3c3e771c8cf34d8876a0f0532 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 3 Jul 2019 13:01:54 -0700 Subject: [PATCH 052/109] Check b_freeze_cksum under ZFS_DEBUG_MODIFY conditional The b_freeze_cksum field can only have data when ZFS_DEBUG_MODIFY is set. Therefore, the EQUIV check must be wrapped accordingly. For the same reason the ASSERT in arc_buf_fill() is unsafe. However, since it's largely redundant it has simply been removed. Reviewed-by: George Wilson Reviewed-by: Allan Jude Reviewed-by: Igor Kozhukhov Signed-off-by: Brian Behlendorf Closes #8979 --- module/zfs/arc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 720365c4a935..f125ca6a4d14 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1872,7 +1872,8 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf) * There were no decompressed bufs, so there should not be a * checksum on the hdr either.
*/ - EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); + if (zfs_flags & ZFS_DEBUG_MODIFY) + EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); return (copied); } @@ -2253,7 +2254,6 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, */ if (arc_buf_try_copy_decompressed_data(buf)) { /* Skip byteswapping and checksumming (already done) */ - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); return (0); } else { error = zio_decompress_data(HDR_GET_COMPRESS(hdr), From 14a11bf2f6052413cdaa5cf8193d16ce8f2fa388 Mon Sep 17 00:00:00 2001 From: Paul Zuchowski <31706010+PaulZ-98@users.noreply.github.com> Date: Wed, 3 Jul 2019 16:05:02 -0400 Subject: [PATCH 053/109] Improve "Unable to automount" error message. Having the mountpoint and dataset name both in the message made it confusing to read. Additionally, convert this to a zfs_dbgmsg rather than sending it to the console. Reviewed-by: Tom Caputi Reviewed-by: Brian Behlendorf Signed-off-by: Paul Zuchowski Closes #8959 --- module/zfs/zfs_ctldir.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index aa50646fef83..52314f4e1bdb 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -30,6 +30,7 @@ * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2018 George Melikov. All Rights Reserved. + * Copyright (c) 2019 Datto, Inc. All rights reserved. 
*/ /* @@ -1081,8 +1082,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { if (!(error & MOUNT_BUSY << 8)) { - cmn_err(CE_WARN, "Unable to automount %s/%s: %d", - full_path, full_name, error); + zfs_dbgmsg("Unable to automount %s error=%d", + full_path, error); error = SET_ERROR(EISDIR); } else { /* From 1f72a18f59d73f6e09ea052fb51cc7e19eaa3250 Mon Sep 17 00:00:00 2001 From: Tom Caputi Date: Fri, 5 Jul 2019 19:53:14 -0400 Subject: [PATCH 054/109] Remove VERIFY from dsl_dataset_crypt_stats() This patch fixes an issue where dsl_dataset_crypt_stats() would VERIFY that it was able to hold the encryption root. This function should instead silently continue without populating the related field in the nvlist, as is the convention for this code. Reviewed-by: Brian Behlendorf Signed-off-by: Tom Caputi Closes #8976 --- module/zfs/dsl_crypt.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c index 568fe7aa3263..24711227ba55 100644 --- a/module/zfs/dsl_crypt.c +++ b/module/zfs/dsl_crypt.c @@ -2624,11 +2624,13 @@ dsl_dataset_crypt_stats(dsl_dataset_t *ds, nvlist_t *nv) } if (dsl_dir_get_encryption_root_ddobj(dd, &intval) == 0) { - VERIFY0(dsl_dir_hold_obj(dd->dd_pool, intval, NULL, FTAG, - &enc_root)); - dsl_dir_name(enc_root, buf); - dsl_dir_rele(enc_root, FTAG); - dsl_prop_nvlist_add_string(nv, ZFS_PROP_ENCRYPTION_ROOT, buf); + if (dsl_dir_hold_obj(dd->dd_pool, intval, NULL, FTAG, + &enc_root) == 0) { + dsl_dir_name(enc_root, buf); + dsl_dir_rele(enc_root, FTAG); + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_ENCRYPTION_ROOT, buf); + } } } From 2ac233c633e9bce36df8e7a3d7501cf4a0e227bb Mon Sep 17 00:00:00 2001 From: loli10K Date: Tue, 9 Jul 2019 18:28:05 +0200 Subject: [PATCH 055/109] Fix dracut Debian/Ubuntu packaging This commit ensures make(1) targets that build .deb packages fail if alien(1) can't convert all .rpm 
files; additionally it also updates the zfs-dracut package name which was changed to "noarch" in ca4e5a7. Reviewed-by: Neal Gompa Reviewed-by: Brian Behlendorf Reviewed-by: Olaf Faaland Signed-off-by: loli10K Closes #8990 Closes #8991 --- config/deb.am | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/deb.am b/config/deb.am index e405547aa949..83059a923493 100644 --- a/config/deb.am +++ b/config/deb.am @@ -20,7 +20,7 @@ deb-kmod: deb-local rpm-kmod arch=`$(RPM) -qp $${name}-kmod-$${version}.src.rpm --qf %{arch} | tail -1`; \ debarch=`$(DPKG) --print-architecture`; \ pkg1=kmod-$${name}*$${version}.$${arch}.rpm; \ - fakeroot $(ALIEN) --bump=0 --scripts --to-deb --target=$$debarch $$pkg1; \ + fakeroot $(ALIEN) --bump=0 --scripts --to-deb --target=$$debarch $$pkg1 || exit 1; \ $(RM) $$pkg1 @@ -30,7 +30,7 @@ deb-dkms: deb-local rpm-dkms arch=`$(RPM) -qp $${name}-dkms-$${version}.src.rpm --qf %{arch} | tail -1`; \ debarch=`$(DPKG) --print-architecture`; \ pkg1=$${name}-dkms-$${version}.$${arch}.rpm; \ - fakeroot $(ALIEN) --bump=0 --scripts --to-deb --target=$$debarch $$pkg1; \ + fakeroot $(ALIEN) --bump=0 --scripts --to-deb --target=$$debarch $$pkg1 || exit 1; \ $(RM) $$pkg1 deb-utils: deb-local rpm-utils @@ -45,7 +45,7 @@ deb-utils: deb-local rpm-utils pkg5=libzpool2-$${version}.$${arch}.rpm; \ pkg6=libzfs2-devel-$${version}.$${arch}.rpm; \ pkg7=$${name}-test-$${version}.$${arch}.rpm; \ - pkg8=$${name}-dracut-$${version}.$${arch}.rpm; \ + pkg8=$${name}-dracut-$${version}.noarch.rpm; \ pkg9=$${name}-initramfs-$${version}.$${arch}.rpm; \ pkg10=`ls python*-pyzfs-$${version}* | tail -1`; \ ## Arguments need to be passed to dh_shlibdeps. 
Alien provides no mechanism @@ -63,7 +63,7 @@ deb-utils: deb-local rpm-utils env PATH=$${path_prepend}:$${PATH} \ fakeroot $(ALIEN) --bump=0 --scripts --to-deb --target=$$debarch \ $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 \ - $$pkg8 $$pkg9 $$pkg10; \ + $$pkg8 $$pkg9 $$pkg10 || exit 1; \ $(RM) $${path_prepend}/dh_shlibdeps; \ rmdir $${path_prepend}; \ $(RM) $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 \ From ccd8125e450c2968b2878fd887da7fac5b9a49f1 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Wed, 10 Jul 2019 01:31:46 +0900 Subject: [PATCH 056/109] Fix race in parallel mount's thread dispatching algorithm Strategy of parallel mount is as follows. 1) Initial thread dispatching is to select sets of mount points that don't have dependencies on other sets, hence threads can/should run lock-less and shouldn't race with other threads for other sets. Each thread dispatched corresponds to top level directory which may or may not have datasets to be mounted on sub directories. 2) Subsequent recursive thread dispatching for each thread from 1) is to mount datasets for each set of mount points. The mount points within each set have dependencies (i.e. child directories), so child directories are processed only after parent directory completes. The problem is that the initial thread dispatching in zfs_foreach_mountpoint() can be multi-threaded when it needs to be single-threaded, and this puts threads under race condition. This race appeared as mount/unmount issues on ZoL for ZoL having different timing regarding mount(2) execution due to fork(2)/exec(2) of mount(8). `zfs unmount -a` which expects proper mount order can't unmount if the mounts were reordered by the race condition. There are currently two known patterns of input list `handles` in `zfs_foreach_mountpoint(..,handles,..)` which cause the race condition. 1) #8833 case where input is `/a /a /a/b` after sorting. 
The problem is that libzfs_path_contains() can't correctly handle an input list with two identical top level directories. There is a race between two POSIX threads A and B, * ThreadA for "/a" for test1 and "/a/b" * ThreadB for "/a" for test0/a and in case of #8833, ThreadA won the race. Two threads were created because "/a" wasn't considered as `"/a" contains "/a"`. 2) #8450 case where input is `/ /var/data /var/data/test` after sorting. The problem is that libzfs_path_contains() can't correctly handle an input list containing "/". There is a race between two POSIX threads A and B, * ThreadA for "/" and "/var/data/test" * ThreadB for "/var/data" and in case of #8450, ThreadA won the race. Two threads were created because "/var/data" wasn't considered as `"/" contains "/var/data"`. In other words, if there is (at least one) "/" in the input list, the initial thread dispatching must be single-threaded since every directory is a child of "/", meaning they all directly or indirectly depend on "/". In both cases, the first non_descendant_idx() call fails to correctly determine "path1-contains-path2", and as a result the initial thread dispatching creates another thread when it needs to be single-threaded. Fix a conditional in libzfs_path_contains() to handle the above two cases.
Reviewed-by: Brian Behlendorf Reviewed by: Sebastien Roy Signed-off-by: Tomohiro Kusumi Closes #8450 Closes #8833 Closes #8878 --- lib/libzfs/libzfs_mount.c | 6 +- tests/runfiles/linux.run | 3 +- .../functional/cli_root/zfs_mount/Makefile.am | 1 + .../cli_root/zfs_mount/zfs_mount_test_race.sh | 116 ++++++++++++++++++ 4 files changed, 123 insertions(+), 3 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index 649c232aa3e5..d62801cfdaca 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -1302,12 +1302,14 @@ mountpoint_cmp(const void *arga, const void *argb) } /* - * Return true if path2 is a child of path1. + * Return true if path2 is a child of path1 or path2 equals path1 or + * path1 is "/" (path2 is always a child of "/"). */ static boolean_t libzfs_path_contains(const char *path1, const char *path2) { - return (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/'); + return (strcmp(path1, path2) == 0 || strcmp(path1, "/") == 0 || + (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/')); } /* diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 3f82676ef218..27e36b594ab5 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -182,7 +182,8 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_007_pos', 'zfs_mount_008_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_neg', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', - 'zfs_multi_mount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints'] + 'zfs_multi_mount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', + 'zfs_mount_test_race'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am 
b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am index b2de98934b74..c208a1c378dc 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am @@ -19,6 +19,7 @@ dist_pkgdata_SCRIPTS = \ zfs_mount_all_mountpoints.ksh \ zfs_mount_encrypted.ksh \ zfs_mount_remount.ksh \ + zfs_mount_test_race.sh \ zfs_multi_mount.ksh dist_pkgdata_DATA = \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh new file mode 100755 index 000000000000..404770b2727f --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh @@ -0,0 +1,116 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.cfg + +# +# DESCRIPTION: +# Verify parallel mount ordering is consistent. +# +# There was a bug in initial thread dispatching algorithm which put threads +# under race condition which resulted in undefined mount order. The purpose +# of this test is to verify `zfs unmount -a` succeeds (not `zfs mount -a` +# succeeds, it always does) after `zfs mount -a`, which could fail if threads +# race. See github.com/zfsonlinux/zfs/issues/{8450,8833,8878} for details. +# +# STRATEGY: +# 1. Create pools and filesystems. +# 2. Set same mount point for >1 datasets. +# 3. Unmount all datasets. +# 4. Mount all datasets. +# 5. 
Unmount all datasets (verify this succeeds). +# + +verify_runnable "both" + +TMPDIR=${TMPDIR:-$TEST_BASE_DIR} +MNTPT=$TMPDIR/zfs_mount_test_race_mntpt +DISK1="$TMPDIR/zfs_mount_test_race_disk1" +DISK2="$TMPDIR/zfs_mount_test_race_disk2" + +TESTPOOL1=zfs_mount_test_race_tp1 +TESTPOOL2=zfs_mount_test_race_tp2 + +export __ZFS_POOL_RESTRICT="$TESTPOOL1 $TESTPOOL2" +log_must zfs $unmountall +unset __ZFS_POOL_RESTRICT + +function cleanup +{ + zpool destroy $TESTPOOL1 + zpool destroy $TESTPOOL2 + rm -rf $MNTPT + rm -rf /$TESTPOOL1 + rm -rf /$TESTPOOL2 + rm -f $DISK1 + rm -f $DISK2 + export __ZFS_POOL_RESTRICT="$TESTPOOL1 $TESTPOOL2" + log_must zfs $mountall + unset __ZFS_POOL_RESTRICT +} +log_onexit cleanup + +log_note "Verify parallel mount ordering is consistent" + +log_must truncate -s $MINVDEVSIZE $DISK1 +log_must truncate -s $MINVDEVSIZE $DISK2 + +log_must zpool create -f $TESTPOOL1 $DISK1 +log_must zpool create -f $TESTPOOL2 $DISK2 + +log_must zfs create $TESTPOOL1/$TESTFS1 +log_must zfs create $TESTPOOL2/$TESTFS2 + +log_must zfs set mountpoint=none $TESTPOOL1 +log_must zfs set mountpoint=$MNTPT $TESTPOOL1/$TESTFS1 + +# Note that unmount can fail (due to race condition on `zfs mount -a`) with or +# without `canmount=off`. The race has nothing to do with canmount property, +# but turn it off for convenience of mount layout used in this test case. +log_must zfs set canmount=off $TESTPOOL2 +log_must zfs set mountpoint=$MNTPT $TESTPOOL2 + +# At this point, layout of datasets in two pools will look like below. +# Previously, on next `zfs mount -a`, pthreads assigned to TESTFS1 and TESTFS2 +# could race, and TESTFS2 usually (actually always) won in ZoL. Note that the +# problem is how two or more threads could initially be assigned to the same +# top level directory, not this specific layout. This layout is just an example +# that can reproduce race, and is also the layout reported in #8833. 
+# +# NAME MOUNTED MOUNTPOINT +# ---------------------------------------------- +# /$TESTPOOL1 no none +# /$TESTPOOL1/$TESTFS1 yes $MNTPT +# /$TESTPOOL2 no $MNTPT +# /$TESTPOOL2/$TESTFS2 yes $MNTPT/$TESTFS2 + +# Apparently two datasets must be mounted. +log_must ismounted $TESTPOOL1/$TESTFS1 +log_must ismounted $TESTPOOL2/$TESTFS2 +# This unmount always succeeds, because potential race hasn't happened yet. +log_must zfs unmount -a +# This mount always succeeds, whether threads are under race condition or not. +log_must zfs mount -a + +# Verify datasets are mounted (TESTFS2 fails if the race broke mount order). +log_must ismounted $TESTPOOL1/$TESTFS1 +log_must ismounted $TESTPOOL2/$TESTFS2 +# Verify unmount succeeds (fails if the race broke mount order). +log_must zfs unmount -a + +log_pass "Verify parallel mount ordering is consistent passed" From c3a3c5a30fea98f640e23b0f3c2c10d5606ba9fc Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Tue, 9 Jul 2019 15:02:40 -0500 Subject: [PATCH 057/109] pkg-utils python sitelib for SLES15 Use python -Esc to set __python_sitelib. Reviewed-by: Neal Gompa Reviewed-by: Brian Behlendorf Signed-off-by: Shaun Tancheff Closes #8969 --- rpm/generic/zfs.spec.in | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 0b16cd0e886b..0864a72a1155 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -99,6 +99,7 @@ %define __python_cffi_pkg python%{__python_pkg_version}-cffi %define __python_setuptools_pkg python%{__python_pkg_version}-setuptools %endif +%define __python_sitelib %(%{__python} -Esc "from distutils.sysconfig import get_python_lib; print(get_python_lib())") # By default python-pyzfs is enabled, with the exception of # RHEL 6 which by default uses Python 2.6 which is too old. 
@@ -474,8 +475,8 @@ systemctl --system daemon-reload >/dev/null || true %doc contrib/pyzfs/README %doc contrib/pyzfs/LICENSE %defattr(-,root,root,-) -%{python_sitelib}/libzfs_core/* -%{python_sitelib}/pyzfs* +%{__python_sitelib}/libzfs_core/* +%{__python_sitelib}/pyzfs* %endif %if 0%{?_initramfs} From 6e19cc77cfd10a8587181f57ef4f9d7a1a7bc5b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20F=C3=BCl=C3=B6p?= Date: Wed, 10 Jul 2019 20:44:52 +0200 Subject: [PATCH 058/109] Fix ZTS killed processes detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit log_neg_expect was using the wrong exit status to detect if a process got killed by SIGSEGV or SIGBUS, resulting in false positives. Reviewed-by: loli10K Reviewed by: John Kennedy Reviewed by: Brian Behlendorf Signed-off-by: Attila Fülöp Closes #9003 --- tests/test-runner/include/logapi.shlib | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test-runner/include/logapi.shlib b/tests/test-runner/include/logapi.shlib index 32fc00616180..cd7982a94a0b 100644 --- a/tests/test-runner/include/logapi.shlib +++ b/tests/test-runner/include/logapi.shlib @@ -198,12 +198,12 @@ function log_neg_expect elif (( $status == 127 )); then print -u2 $($out) _printerror "$@" "unexpectedly exited $status (File not found)" - # bus error - core dump - elif (( $status == 138 )); then + # bus error - core dump (256+signal, SIGBUS=7) + elif (( $status == 263 )); then print -u2 $($out) _printerror "$@" "unexpectedly exited $status (Bus Error)" - # segmentation violation - core dump - elif (( $status == 139 )); then + # segmentation violation - core dump (256+signal, SIGSEGV=11) + elif (( $status == 267 )); then print -u2 $($out) _printerror "$@" "unexpectedly exited $status (SEGV)" else From cf966cb19ae63f65c518678ce57642c716808ef6 Mon Sep 17 00:00:00 2001 From: Nick Mattis Date: Wed, 10 Jul 2019 18:54:49 -0400 Subject: [PATCH 059/109] Fixes: #8934 Large kmem_alloc Large 
allocation over the spl_kmem_alloc_warn value was being performed. Switched to vmem_alloc interface as specified for large allocations. Changed the subsequent frees to match. Reviewed-by: Tom Caputi Reviewed-by: Brian Behlendorf Signed-off-by: nmattis Closes #8934 Closes #9011 --- module/zfs/vdev_indirect_births.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/module/zfs/vdev_indirect_births.c b/module/zfs/vdev_indirect_births.c index 1c44a64287d3..99b83c392257 100644 --- a/module/zfs/vdev_indirect_births.c +++ b/module/zfs/vdev_indirect_births.c @@ -70,7 +70,7 @@ vdev_indirect_births_close(vdev_indirect_births_t *vib) if (vib->vib_phys->vib_count > 0) { uint64_t births_size = vdev_indirect_births_size_impl(vib); - kmem_free(vib->vib_entries, births_size); + vmem_free(vib->vib_entries, births_size); vib->vib_entries = NULL; } @@ -108,7 +108,7 @@ vdev_indirect_births_open(objset_t *os, uint64_t births_object) if (vib->vib_phys->vib_count > 0) { uint64_t births_size = vdev_indirect_births_size_impl(vib); - vib->vib_entries = kmem_alloc(births_size, KM_SLEEP); + vib->vib_entries = vmem_alloc(births_size, KM_SLEEP); VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0, births_size, vib->vib_entries, DMU_READ_PREFETCH)); } @@ -148,10 +148,10 @@ vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, vib->vib_phys->vib_count++; new_size = vdev_indirect_births_size_impl(vib); - new_entries = kmem_alloc(new_size, KM_SLEEP); + new_entries = vmem_alloc(new_size, KM_SLEEP); if (old_size > 0) { bcopy(vib->vib_entries, new_entries, old_size); - kmem_free(vib->vib_entries, old_size); + vmem_free(vib->vib_entries, old_size); } new_entries[vib->vib_phys->vib_count - 1] = vibe; vib->vib_entries = new_entries; From 0a223246e124e68bbd2ee2cd7ddcd0bbcd6fa3a5 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 16 Jul 2019 05:57:56 +0900 Subject: [PATCH 060/109] Disable unused pathname::pn_path* (unneeded in Linux) struct pathname is originally from 
Solaris VFS, and it has been used in ZoL to merely call VOP from Linux VFS interface without API change, therefore pathname::pn_path* are unused and unneeded. Technically, struct pathname is a wrapper for C string in ZoL. Saves stack a bit on lookup and unlink. (#if0'd members instead of removing since comments refer to them.) Reviewed-by: Brian Behlendorf Reviewed-by: Richard Elling Reviewed-by: George Melikov Signed-off-by: Tomohiro Kusumi Closes #9025 --- include/sys/pathname.h | 2 ++ module/zfs/pathname.c | 15 +++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/sys/pathname.h b/include/sys/pathname.h index 5db69b1784c9..d79cc5c01afd 100644 --- a/include/sys/pathname.h +++ b/include/sys/pathname.h @@ -54,8 +54,10 @@ extern "C" { */ typedef struct pathname { char *pn_buf; /* underlying storage */ +#if 0 /* unused in ZoL */ char *pn_path; /* remaining pathname */ size_t pn_pathlen; /* remaining length */ +#endif size_t pn_bufsize; /* total size of pn_buf */ } pathname_t; diff --git a/module/zfs/pathname.c b/module/zfs/pathname.c index e3e97c9bb365..4766762f37d1 100644 --- a/module/zfs/pathname.c +++ b/module/zfs/pathname.c @@ -71,9 +71,12 @@ pn_alloc(struct pathname *pnp) void pn_alloc_sz(struct pathname *pnp, size_t sz) { - pnp->pn_path = pnp->pn_buf = kmem_alloc(sz, KM_SLEEP); - pnp->pn_pathlen = 0; + pnp->pn_buf = kmem_alloc(sz, KM_SLEEP); pnp->pn_bufsize = sz; +#if 0 /* unused in ZoL */ + pnp->pn_path = pnp->pn_buf; + pnp->pn_pathlen = 0; +#endif } /* @@ -84,6 +87,10 @@ pn_free(struct pathname *pnp) { /* pn_bufsize is usually MAXPATHLEN, but may not be */ kmem_free(pnp->pn_buf, pnp->pn_bufsize); - pnp->pn_path = pnp->pn_buf = NULL; - pnp->pn_pathlen = pnp->pn_bufsize = 0; + pnp->pn_buf = NULL; + pnp->pn_bufsize = 0; +#if 0 /* unused in ZoL */ + pnp->pn_path = NULL; + pnp->pn_pathlen = 0; +#endif } From 78831d42906436c93570a7181548faaf456eb60f Mon Sep 17 00:00:00 2001 From: Tom Caputi Date: Mon, 15 Jul 2019 16:08:42 -0700 
Subject: [PATCH 061/109] Ensure dsl_destroy_head() decrypts objsets This patch corrects a small issue where the dsl_destroy_head() code that runs when the async_destroy feature is disabled would not properly decrypt the dataset before beginning processing. If the dataset is not able to be decrypted, the optimization code now simply does not run and the dataset is completely destroyed in the DSL sync task. Reviewed-by: Brian Behlendorf Signed-off-by: Tom Caputi Closes #9021 --- module/zfs/dsl_destroy.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 465b3dfac890..a01abfa0038d 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -1059,9 +1059,10 @@ dsl_destroy_head(const char *name) /* * Head deletion is processed in one txg on old pools; * remove the objects from open context so that the txg sync - * is not too long. + * is not too long. This optimization can only work for + * encrypted datasets if the wrapping key is loaded. */ - error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_FALSE, + error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_TRUE, FTAG, &os); if (error == 0) { uint64_t prev_snap_txg = @@ -1073,7 +1074,7 @@ dsl_destroy_head(const char *name) (void) dmu_free_long_object(os, obj); /* sync out all frees */ txg_wait_synced(dmu_objset_pool(os), 0); - dmu_objset_disown(os, B_FALSE, FTAG); + dmu_objset_disown(os, B_TRUE, FTAG); } } From d751b12a9d927d71a1c584be25bf705bb8decda2 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 15 Jul 2019 16:11:55 -0700 Subject: [PATCH 062/109] Export dnode symbols External consumers such as Lustre require access to the dnode interfaces in order to correctly manipulate dnodes. 
Reviewed-by: James Simmons Reviewed-by: Olaf Faaland Reviewed-by: George Melikov Signed-off-by: Brian Behlendorf Issue #8994 Closes #9027 --- module/zfs/dnode.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index c06f614e1993..5fd473303d7d 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -2483,3 +2483,13 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, return (error); } + +#if defined(_KERNEL) +EXPORT_SYMBOL(dnode_hold); +EXPORT_SYMBOL(dnode_rele); +EXPORT_SYMBOL(dnode_set_nlevels); +EXPORT_SYMBOL(dnode_set_blksz); +EXPORT_SYMBOL(dnode_free_range); +EXPORT_SYMBOL(dnode_evict_dbufs); +EXPORT_SYMBOL(dnode_evict_bonus); +#endif From 73e50a7d5ddb20e20fd1eab23f00f26f85bd717a Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 16 Jul 2019 08:26:52 +0900 Subject: [PATCH 063/109] Drop redundant POSIX ACL check in zpl_init_acl() ZFS_ACLTYPE_POSIXACL has already been tested in zpl_init_acl(), so no need to test again on POSIX ACL access. 
Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Signed-off-by: Tomohiro Kusumi Closes #9009 --- module/zfs/zpl_xattr.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/module/zfs/zpl_xattr.c b/module/zfs/zpl_xattr.c index 8ee6e9a97f0a..95523f28e3b4 100644 --- a/module/zfs/zpl_xattr.c +++ b/module/zfs/zpl_xattr.c @@ -1130,12 +1130,9 @@ zpl_init_acl(struct inode *ip, struct inode *dir) return (0); if (!S_ISLNK(ip->i_mode)) { - if (ITOZSB(ip)->z_acl_type == ZFS_ACLTYPE_POSIXACL) { - acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return (PTR_ERR(acl)); - } - + acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return (PTR_ERR(acl)); if (!acl) { ip->i_mode &= ~current_umask(); ip->i_ctime = current_time(ip); @@ -1144,7 +1141,7 @@ zpl_init_acl(struct inode *ip, struct inode *dir) } } - if ((ITOZSB(ip)->z_acl_type == ZFS_ACLTYPE_POSIXACL) && acl) { + if (acl) { umode_t mode; if (S_ISDIR(ip->i_mode)) { From af7a5672c3d1ef17d352627e64c24d762da919e3 Mon Sep 17 00:00:00 2001 From: Antonio Russo Date: Sun, 2 Jun 2019 08:57:10 -0400 Subject: [PATCH 064/109] systemd encryption key support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modify zfs-mount-generator to produce a dependency on new zfs-load-key-*.service units, dynamically created at boot to call zfs load-key for the encryption root, before attempting to mount any encrypted datasets. These units are created by zfs-mount-generator, and RequiresMountsFor on the keyfile, if present, or call systemd-ask-password if a passphrase is requested. This patch includes suggestions from @Fabian-Gruenbichler, @ryanjaeb and @rlaager, as well as an adaptation of @rlaager's script to retry on incorrect password entry.
Reviewed-by: Richard Laager Reviewed-by: Fabian Grünbichler Reviewed-by: Brian Behlendorf Signed-off-by: Antonio Russo Closes #8750 Closes #8848 --- .../zed.d/history_event-zfs-list-cacher.sh.in | 4 +- .../system-generators/zfs-mount-generator.in | 54 ++++++++++++++++++- man/man8/zfs-mount-generator.8.in | 2 +- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in b/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in index c1513cf3a01f..6d0f44ab3260 100755 --- a/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in +++ b/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in @@ -47,7 +47,7 @@ case "${ZEVENT_HISTORY_INTERNAL_NAME}" in # Only act if one of the tracked properties is altered. case "${ZEVENT_HISTORY_INTERNAL_STR%%=*}" in canmount|mountpoint|atime|relatime|devices|exec| \ - readonly|setuid|nbmand) ;; + readonly|setuid|nbmand|encroot|keylocation) ;; *) exit 0 ;; esac ;; @@ -62,7 +62,7 @@ zed_lock zfs-list trap abort_alter EXIT PROPS="name,mountpoint,canmount,atime,relatime,devices,exec,readonly" -PROPS="${PROPS},setuid,nbmand" +PROPS="${PROPS},setuid,nbmand,encroot,keylocation" "${ZFS}" list -H -t filesystem -o $PROPS -r "${ZEVENT_POOL}" > "${FSLIST_TMP}" diff --git a/etc/systemd/system-generators/zfs-mount-generator.in b/etc/systemd/system-generators/zfs-mount-generator.in index 5428eb25d92c..ae208c965f97 100755 --- a/etc/systemd/system-generators/zfs-mount-generator.in +++ b/etc/systemd/system-generators/zfs-mount-generator.in @@ -71,6 +71,8 @@ process_line() { p_readonly="${8}" p_setuid="${9}" p_nbmand="${10}" + p_encroot="${11}" + p_keyloc="${12}" # Check for canmount=off . 
if [ "${p_canmount}" = "off" ] ; then @@ -168,6 +170,54 @@ process_line() { "${dataset}" >/dev/kmsg fi + # Minimal pre-requisites to mount a ZFS dataset + wants="zfs-import.target" + if [ -n "${p_encroot}" ] && + [ "${p_encroot}" != "-" ] ; then + keyloadunit="zfs-load-key-$(systemd-escape "${p_encroot}").service" + if [ "${p_encroot}" = "${dataset}" ] ; then + pathdep="" + if [ "${p_keyloc%%://*}" = "file" ] ; then + pathdep="RequiresMountsFor='${p_keyloc#file://}'" + keyloadcmd="@sbindir@/zfs load-key '${dataset}'" + elif [ "${p_keyloc}" = "prompt" ] ; then + keyloadcmd="sh -c 'set -eu;"\ +"count=0;"\ +"while [ \$\$count -lt 3 ];do"\ +" systemd-ask-password --id=\"zfs:${dataset}\""\ +" \"Enter passphrase for ${dataset}:\"|"\ +" @sbindir@/zfs load-key \"${dataset}\" && exit 0;"\ +" count=\$\$((count + 1));"\ +"done;"\ +"exit 1'" + else + printf 'zfs-mount-generator: (%s) invalid keylocation\n' \ + "${dataset}" >/dev/kmsg + fi + cat > "${dest_norm}/${keyloadunit}" << EOF +# Automatically generated by zfs-mount-generator + +[Unit] +Description=Load ZFS key for ${dataset} +SourcePath=${cachefile} +Documentation=man:zfs-mount-generator(8) +DefaultDependencies=no +Wants=${wants} +After=${wants} +${pathdep} + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=${keyloadcmd} +ExecStop=@sbindir@/zfs unload-key '${dataset}' +EOF + fi + # Update the dependencies for the mount file to require the + # key-loading unit. + wants="${wants},${keyloadunit}" + fi + # If the mountpoint has already been created, give it precedence. 
if [ -e "${dest_norm}/${mountfile}" ] ; then printf 'zfs-mount-generator: %s already exists\n' "${mountfile}" \ @@ -183,8 +233,8 @@ process_line() { SourcePath=${cachefile} Documentation=man:zfs-mount-generator(8) Before=local-fs.target zfs-mount.service -After=zfs-import.target -Wants=zfs-import.target +After=${wants} +Wants=${wants} [Mount] Where=${p_mountpoint} diff --git a/man/man8/zfs-mount-generator.8.in b/man/man8/zfs-mount-generator.8.in index 79720601d62a..48e4e2dfac29 100644 --- a/man/man8/zfs-mount-generator.8.in +++ b/man/man8/zfs-mount-generator.8.in @@ -26,7 +26,7 @@ information on ZFS mountpoints must be stored separately. The output of the command .PP .RS 4 -zfs list -H -o name,mountpoint,canmount,atime,relatime,devices,exec,readonly,setuid,nbmand +zfs list -H -o name,mountpoint,canmount,atime,relatime,devices,exec,readonly,setuid,nbmand,encroot,keylocation .RE .PP for datasets that should be mounted by systemd, should be kept From 446d08fba4f2a795a278906167157bb6378176a1 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 16 Jul 2019 14:14:12 -0700 Subject: [PATCH 065/109] Fix get_special_prop() build failure The cast of the size_t returned by strlcpy() to a uint64_t by the VERIFY3U can result in a build failure when CONFIG_FORTIFY_SOURCE is set. This is due to the additional hardening. Since the token is expected to always fit in strval the VERIFY3U has been removed. If somehow it doesn't, it will still be safely truncated. 
Reviewed-by: Tony Hutter Reviewed-by: Don Brady Signed-off-by: Brian Behlendorf Issue #8999 Closes #9020 --- module/zfs/zcp_get.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/module/zfs/zcp_get.c b/module/zfs/zcp_get.c index ed98f0d1025b..0a5f0b8242ab 100644 --- a/module/zfs/zcp_get.c +++ b/module/zfs/zcp_get.c @@ -423,13 +423,11 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, case ZFS_PROP_RECEIVE_RESUME_TOKEN: { char *token = get_receive_resume_stats_impl(ds); - VERIFY3U(strlcpy(strval, token, ZAP_MAXVALUELEN), - <, ZAP_MAXVALUELEN); + (void) strlcpy(strval, token, ZAP_MAXVALUELEN); if (strcmp(strval, "") == 0) { char *childval = get_child_receive_stats(ds); - VERIFY3U(strlcpy(strval, childval, ZAP_MAXVALUELEN), - <, ZAP_MAXVALUELEN); + (void) strlcpy(strval, childval, ZAP_MAXVALUELEN); if (strcmp(strval, "") == 0) error = ENOENT; From 984bfb373fe7816e7c1b3ea0bf3fa937bc34d5d8 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 16 Jul 2019 17:22:31 -0700 Subject: [PATCH 066/109] Minor style cleanup Resolve an assortment of style inconsistencies including use of white space, typos, capitalization, and line wrapping. There is no functional change. 
Reviewed-by: Tony Hutter Reviewed-by: George Melikov Signed-off-by: Brian Behlendorf Closes #9030 --- config/kernel-fpu.m4 | 9 ++++-- include/linux/simd_aarch64.h | 6 ++-- include/linux/simd_x86.h | 48 +++++++++++++++-------------- module/icp/algs/aes/aes_impl.c | 11 +++++-- module/icp/algs/modes/gcm.c | 10 +++--- module/icp/include/aes/aes_impl.h | 2 +- module/icp/include/modes/gcm_impl.h | 4 +-- module/spl/spl-thread.c | 3 +- module/zcommon/zfs_fletcher.c | 6 ++-- 9 files changed, 57 insertions(+), 42 deletions(-) diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4 index 5fff79a74c70..ebb02fb09a28 100644 --- a/config/kernel-fpu.m4 +++ b/config/kernel-fpu.m4 @@ -18,7 +18,8 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [ #include ],[ ],[ - AC_DEFINE(HAVE_KERNEL_FPU_API_HEADER, 1, [kernel has asm/fpu/api.h]) + AC_DEFINE(HAVE_KERNEL_FPU_API_HEADER, 1, + [kernel has asm/fpu/api.h]) AC_MSG_RESULT(asm/fpu/api.h) ],[ AC_MSG_RESULT(i387.h & xcr.h) @@ -39,8 +40,10 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [ kernel_fpu_end(); ], [kernel_fpu_begin], [arch/x86/kernel/fpu/core.c], [ AC_MSG_RESULT(kernel_fpu_*) - AC_DEFINE(HAVE_KERNEL_FPU, 1, [kernel has kernel_fpu_* functions]) - AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions]) + AC_DEFINE(HAVE_KERNEL_FPU, 1, + [kernel has kernel_fpu_* functions]) + AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, + [kernel exports FPU functions]) ],[ ZFS_LINUX_TRY_COMPILE_SYMBOL([ #include diff --git a/include/linux/simd_aarch64.h b/include/linux/simd_aarch64.h index 155ef6205599..56153a16072e 100644 --- a/include/linux/simd_aarch64.h +++ b/include/linux/simd_aarch64.h @@ -26,8 +26,10 @@ * USER API: * * Kernel fpu methods: - * kfpu_begin() - * kfpu_end() + * kfpu_allowed() + * kfpu_initialize() + * kfpu_begin() + * kfpu_end() */ #ifndef _SIMD_AARCH64_H diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h index 12cd7467788e..0489bfaa3a70 100644 --- a/include/linux/simd_x86.h +++ b/include/linux/simd_x86.h @@ -26,8 +26,10 @@ * USER API: * * 
Kernel fpu methods: - * kfpu_begin() - * kfpu_end() + * kfpu_allowed() + * kfpu_initialize() + * kfpu_begin() + * kfpu_end() * * SIMD support: * @@ -37,31 +39,31 @@ * all relevant feature test functions should be called. * * Supported features: - * zfs_sse_available() - * zfs_sse2_available() - * zfs_sse3_available() - * zfs_ssse3_available() - * zfs_sse4_1_available() - * zfs_sse4_2_available() + * zfs_sse_available() + * zfs_sse2_available() + * zfs_sse3_available() + * zfs_ssse3_available() + * zfs_sse4_1_available() + * zfs_sse4_2_available() * - * zfs_avx_available() - * zfs_avx2_available() + * zfs_avx_available() + * zfs_avx2_available() * - * zfs_bmi1_available() - * zfs_bmi2_available() + * zfs_bmi1_available() + * zfs_bmi2_available() * - * zfs_avx512f_available() - * zfs_avx512cd_available() - * zfs_avx512er_available() - * zfs_avx512pf_available() - * zfs_avx512bw_available() - * zfs_avx512dq_available() - * zfs_avx512vl_available() - * zfs_avx512ifma_available() - * zfs_avx512vbmi_available() + * zfs_avx512f_available() + * zfs_avx512cd_available() + * zfs_avx512er_available() + * zfs_avx512pf_available() + * zfs_avx512bw_available() + * zfs_avx512dq_available() + * zfs_avx512vl_available() + * zfs_avx512ifma_available() + * zfs_avx512vbmi_available() * * NOTE(AVX-512VL): If using AVX-512 instructions with 128Bit registers - * also add zfs_avx512vl_available() to feature check. + * also add zfs_avx512vl_available() to feature check. 
*/ #ifndef _SIMD_X86_H @@ -190,7 +192,7 @@ typedef struct cpuid_feature_desc { * Descriptions of supported instruction sets */ static const cpuid_feature_desc_t cpuid_features[] = { - [SSE] = {1U, 0U, 1U << 25, EDX }, + [SSE] = {1U, 0U, 1U << 25, EDX }, [SSE2] = {1U, 0U, 1U << 26, EDX }, [SSE3] = {1U, 0U, 1U << 0, ECX }, [SSSE3] = {1U, 0U, 1U << 9, ECX }, diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c index e15050635741..36e0686a51c2 100644 --- a/module/icp/algs/aes/aes_impl.c +++ b/module/icp/algs/aes/aes_impl.c @@ -303,16 +303,21 @@ aes_impl_init(void) } aes_supp_impl_cnt = c; - /* set fastest implementation. assume hardware accelerated is fastest */ + /* + * Set the fastest implementation given the assumption that the + * hardware accelerated version is the fastest. + */ #if defined(__x86_64) #if defined(HAVE_AES) - if (aes_aesni_impl.is_supported()) + if (aes_aesni_impl.is_supported()) { memcpy(&aes_fastest_impl, &aes_aesni_impl, sizeof (aes_fastest_impl)); - else + } else #endif + { memcpy(&aes_fastest_impl, &aes_x86_64_impl, sizeof (aes_fastest_impl)); + } #else memcpy(&aes_fastest_impl, &aes_generic_impl, sizeof (aes_fastest_impl)); diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c index 13bceef0f170..0afd957f0cf9 100644 --- a/module/icp/algs/modes/gcm.c +++ b/module/icp/algs/modes/gcm.c @@ -646,7 +646,7 @@ const gcm_impl_ops_t *gcm_all_impl[] = { /* Indicate that benchmark has been completed */ static boolean_t gcm_impl_initialized = B_FALSE; -/* Select aes implementation */ +/* Select GCM implementation */ #define IMPL_FASTEST (UINT32_MAX) #define IMPL_CYCLE (UINT32_MAX-1) @@ -713,13 +713,15 @@ gcm_impl_init(void) /* set fastest implementation. 
assume hardware accelerated is fastest */ #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) - if (gcm_pclmulqdq_impl.is_supported()) + if (gcm_pclmulqdq_impl.is_supported()) { memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl, sizeof (gcm_fastest_impl)); - else + } else #endif + { memcpy(&gcm_fastest_impl, &gcm_generic_impl, sizeof (gcm_fastest_impl)); + } strcpy(gcm_fastest_impl.name, "fastest"); @@ -742,7 +744,7 @@ static const struct { * If we are called before init(), user preference will be saved in * user_sel_impl, and applied in later init() call. This occurs when module * parameter is specified on module load. Otherwise, directly update - * icp_aes_impl. + * icp_gcm_impl. * * @val Name of gcm implementation to use * @param Unused. diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h index 95cfddf9e0a4..3a3de91cf6a5 100644 --- a/module/icp/include/aes/aes_impl.h +++ b/module/icp/include/aes/aes_impl.h @@ -162,7 +162,7 @@ typedef enum aes_mech_type { #endif /* _AES_IMPL */ /* - * Methods used to define aes implementation + * Methods used to define AES implementation * * @aes_gen_f Key generation * @aes_enc_f Function encrypts one block diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h index cbb904c059b7..b78cc8aab010 100644 --- a/module/icp/include/modes/gcm_impl.h +++ b/module/icp/include/modes/gcm_impl.h @@ -37,12 +37,12 @@ extern "C" { #include /* - * Methods used to define gcm implementation + * Methods used to define GCM implementation * * @gcm_mul_f Perform carry-less multiplication * @gcm_will_work_f Function tests whether implementation will function */ -typedef void (*gcm_mul_f)(uint64_t *, uint64_t *, uint64_t *); +typedef void (*gcm_mul_f)(uint64_t *, uint64_t *, uint64_t *); typedef boolean_t (*gcm_will_work_f)(void); #define GCM_IMPL_NAME_MAX (16) diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c index d441ad65f317..0352a31ea835 100644 --- a/module/spl/spl-thread.c +++ 
b/module/spl/spl-thread.c @@ -153,8 +153,9 @@ spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...) if (PTR_ERR(tsk) == -ENOMEM) continue; return (NULL); - } else + } else { return (tsk); + } } while (1); } EXPORT_SYMBOL(spl_kthread_create); diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index 5a991ba6073a..f712ce40c6ea 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -592,8 +592,9 @@ fletcher_4_incremental_byteswap(void *buf, size_t size, void *data) } #if defined(_KERNEL) -/* Fletcher 4 kstats */ - +/* + * Fletcher 4 kstats + */ static int fletcher_4_kstat_headers(char *buf, size_t size) { @@ -669,7 +670,6 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) zio_cksum_t zc; uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen); - fletcher_checksum_func_t *fletcher_4_test = native ? fletcher_4_native : fletcher_4_byteswap; From 2b9f73e5e6ae6f210b1b316bbd7bcbf8c6c62d61 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Thu, 18 Jul 2019 01:07:53 +0900 Subject: [PATCH 067/109] Use zfsctl_snapshot_hold() wrapper zfs_refcount_*() are to be wrapped by zfsctl_snapshot_*() in this file. 
Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Signed-off-by: Tomohiro Kusumi Closes #9039 --- module/zfs/zfs_ctldir.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index 52314f4e1bdb..8acbbb61ca9d 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -192,7 +192,7 @@ static void zfsctl_snapshot_add(zfs_snapentry_t *se) { ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); - zfs_refcount_add(&se->se_refcount, NULL); + zfsctl_snapshot_hold(se); avl_add(&zfs_snapshots_by_name, se); avl_add(&zfs_snapshots_by_objsetid, se); } @@ -269,7 +269,7 @@ zfsctl_snapshot_find_by_name(char *snapname) search.se_name = snapname; se = avl_find(&zfs_snapshots_by_name, &search, NULL); if (se) - zfs_refcount_add(&se->se_refcount, NULL); + zfsctl_snapshot_hold(se); return (se); } @@ -290,7 +290,7 @@ zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid) search.se_objsetid = objsetid; se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL); if (se) - zfs_refcount_add(&se->se_refcount, NULL); + zfsctl_snapshot_hold(se); return (se); } From ceb516ac2f4c2ddffcea8a6d282312dd941d3296 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Niew=C3=B6hner?= Date: Wed, 17 Jul 2019 18:09:22 +0200 Subject: [PATCH 068/109] Add missing __GFP_HIGHMEM flag to vmalloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make use of __GFP_HIGHMEM flag in vmem_alloc, which is required for some 32-bit systems to make use of full available memory. While kernel versions >=4.12-rc1 add this flag implicitly, older kernels do not. 
Reviewed-by: Brian Behlendorf Signed-off-by: Sebastian Gottschall Signed-off-by: Michael Niewöhner Closes #9031 --- module/spl/spl-kmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c index 1fdb61e6fce1..824b5e89f507 100644 --- a/module/spl/spl-kmem.c +++ b/module/spl/spl-kmem.c @@ -180,7 +180,8 @@ spl_kmem_alloc_impl(size_t size, int flags, int node) */ if ((size > spl_kmem_alloc_max) || use_vmem) { if (flags & KM_VMEM) { - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, + PAGE_KERNEL); } else { return (NULL); } From 4c98586dafab4518a7eea8db9a19271e99ae3110 Mon Sep 17 00:00:00 2001 From: jdike <52420226+jdike@users.noreply.github.com> Date: Wed, 17 Jul 2019 12:18:24 -0400 Subject: [PATCH 069/109] Fix lockdep recursive locking false positive in dbuf_destroy lockdep reports a possible recursive lock in dbuf_destroy. It is true that dbuf_destroy is acquiring the dn_dbufs_mtx on one dnode while holding it on another dnode. However, it is impossible for these to be the same dnode because, among other things, dbuf_destroy checks MUTEX_HELD before acquiring the mutex. This fix defines a class NESTED_SINGLE == 1 and changes that lock to call mutex_enter_nested with a subclass of NESTED_SINGLE. In order to make the userspace code compile, include/sys/zfs_context.h now defines mutex_enter_nested and NESTED_SINGLE.
This is the lockdep report: [ 122.950921] ============================================ [ 122.950921] WARNING: possible recursive locking detected [ 122.950921] 4.19.29-4.19.0-debug-d69edad5368c1166 #1 Tainted: G O [ 122.950921] -------------------------------------------- [ 122.950921] dbu_evict/1457 is trying to acquire lock: [ 122.950921] 0000000083e9cbcf (&dn->dn_dbufs_mtx){+.+.}, at: dbuf_destroy+0x3c0/0xdb0 [zfs] [ 122.950921] but task is already holding lock: [ 122.950921] 0000000055523987 (&dn->dn_dbufs_mtx){+.+.}, at: dnode_evict_dbufs+0x90/0x740 [zfs] [ 122.950921] other info that might help us debug this: [ 122.950921] Possible unsafe locking scenario: [ 122.950921] CPU0 [ 122.950921] ---- [ 122.950921] lock(&dn->dn_dbufs_mtx); [ 122.950921] lock(&dn->dn_dbufs_mtx); [ 122.950921] *** DEADLOCK *** [ 122.950921] May be due to missing lock nesting notation [ 122.950921] 1 lock held by dbu_evict/1457: [ 122.950921] #0: 0000000055523987 (&dn->dn_dbufs_mtx){+.+.}, at: dnode_evict_dbufs+0x90/0x740 [zfs] [ 122.950921] stack backtrace: [ 122.950921] CPU: 0 PID: 1457 Comm: dbu_evict Tainted: G O 4.19.29-4.19.0-debug-d69edad5368c1166 #1 [ 122.950921] Hardware name: Supermicro H8SSL-I2/H8SSL-I2, BIOS 080011 03/13/2009 [ 122.950921] Call Trace: [ 122.950921] dump_stack+0x91/0xeb [ 122.950921] __lock_acquire+0x2ca7/0x4f10 [ 122.950921] lock_acquire+0x153/0x330 [ 122.950921] dbuf_destroy+0x3c0/0xdb0 [zfs] [ 122.950921] dbuf_evict_one+0x1cc/0x3d0 [zfs] [ 122.950921] dbuf_rele_and_unlock+0xb84/0xd60 [zfs] [ 122.950921] dnode_evict_dbufs+0x3a6/0x740 [zfs] [ 122.950921] dmu_objset_evict+0x7a/0x500 [zfs] [ 122.950921] dsl_dataset_evict_async+0x70/0x480 [zfs] [ 122.950921] taskq_thread+0x979/0x1480 [spl] [ 122.950921] kthread+0x2e7/0x3e0 [ 122.950921] ret_from_fork+0x27/0x50 Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Jeff Dike Closes #8984 --- include/spl/sys/mutex.h | 2 ++ include/sys/zfs_context.h | 2 ++ module/zfs/dbuf.c | 3 ++- 3 files changed, 
6 insertions(+), 1 deletion(-) diff --git a/include/spl/sys/mutex.h b/include/spl/sys/mutex.h index ed0cd4932cfa..a61f35c61eb1 100644 --- a/include/spl/sys/mutex.h +++ b/include/spl/sys/mutex.h @@ -127,6 +127,8 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ }) /* END CSTYLED */ +#define NESTED_SINGLE 1 + #ifdef CONFIG_DEBUG_LOCK_ALLOC #define mutex_enter_nested(mp, subclass) \ { \ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index e3fa2e61bdc9..598b86a7a659 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -257,6 +257,8 @@ extern void mutex_enter(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); +#define NESTED_SINGLE 1 +#define mutex_enter_nested(mp, class) mutex_enter(mp) /* * RW locks */ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 07e616f6f0de..94c49b3ef0a9 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2591,7 +2591,8 @@ dbuf_destroy(dmu_buf_impl_t *db) if (db->db_blkid != DMU_BONUS_BLKID) { boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); if (needlock) - mutex_enter(&dn->dn_dbufs_mtx); + mutex_enter_nested(&dn->dn_dbufs_mtx, + NESTED_SINGLE); avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); From 54561073e7f6e258f6c9e96be60821d51db2ac34 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 12 Jul 2019 13:27:24 -0700 Subject: [PATCH 070/109] Linux 5.3 compat: rw_semaphore owner Commit https://github.com/torvalds/linux/commit/94a9717b updated the rwsem's owner field to contain additional flags describing the rwsem's state. Rather than update the wrappers to mask out these bits, the code no longer relies on the owner stored by the kernel. This does increase the size of a krwlock_t but it makes the implementation less sensitive to future kernel changes.
Reviewed-by: Tony Hutter Reviewed-by: Tomohiro Kusumi Signed-off-by: Brian Behlendorf Closes #9029 --- include/spl/sys/rwlock.h | 68 +++------------------------------------- module/spl/spl-rwlock.c | 3 -- 2 files changed, 5 insertions(+), 66 deletions(-) diff --git a/include/spl/sys/rwlock.h b/include/spl/sys/rwlock.h index 408defac20d3..5e052b532a42 100644 --- a/include/spl/sys/rwlock.h +++ b/include/spl/sys/rwlock.h @@ -78,15 +78,9 @@ typedef enum { RW_READER = 2 } krw_t; -/* - * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, rw_semaphore will have an owner - * field, so we don't need our own. - */ typedef struct { struct rw_semaphore rw_rwlock; -#ifndef CONFIG_RWSEM_SPIN_ON_OWNER kthread_t *rw_owner; -#endif #ifdef CONFIG_LOCKDEP krw_type_t rw_type; #endif /* CONFIG_LOCKDEP */ @@ -97,31 +91,19 @@ typedef struct { static inline void spl_rw_set_owner(krwlock_t *rwp) { -/* - * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, down_write, up_write, - * downgrade_write and __init_rwsem will set/clear owner for us. 
- */ -#ifndef CONFIG_RWSEM_SPIN_ON_OWNER rwp->rw_owner = current; -#endif } static inline void spl_rw_clear_owner(krwlock_t *rwp) { -#ifndef CONFIG_RWSEM_SPIN_ON_OWNER rwp->rw_owner = NULL; -#endif } static inline kthread_t * rw_owner(krwlock_t *rwp) { -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER - return (SEM(rwp)->owner); -#else return (rwp->rw_owner); -#endif } #ifdef CONFIG_LOCKDEP @@ -148,62 +130,22 @@ spl_rw_lockdep_on_maybe(krwlock_t *rwp) \ #define spl_rw_lockdep_on_maybe(rwp) #endif /* CONFIG_LOCKDEP */ - static inline int -RW_WRITE_HELD(krwlock_t *rwp) +RW_LOCK_HELD(krwlock_t *rwp) { - return (rw_owner(rwp) == current); + return (spl_rwsem_is_locked(SEM(rwp))); } static inline int -RW_LOCK_HELD(krwlock_t *rwp) +RW_WRITE_HELD(krwlock_t *rwp) { - return (spl_rwsem_is_locked(SEM(rwp))); + return (rw_owner(rwp) == current); } static inline int RW_READ_HELD(krwlock_t *rwp) { - if (!RW_LOCK_HELD(rwp)) - return (0); - - /* - * rw_semaphore cheat sheet: - * - * < 3.16: - * There's no rw_semaphore.owner, so use rwp.owner instead. - * If rwp.owner == NULL then it's a reader - * - * 3.16 - 4.7: - * rw_semaphore.owner added (https://lwn.net/Articles/596656/) - * and CONFIG_RWSEM_SPIN_ON_OWNER introduced. - * If rw_semaphore.owner == NULL then it's a reader - * - * 4.8 - 4.16.16: - * RWSEM_READER_OWNED added as an internal #define. - * (https://lore.kernel.org/patchwork/patch/678590/) - * If rw_semaphore.owner == 1 then it's a reader - * - * 4.16.17 - 4.19: - * RWSEM_OWNER_UNKNOWN introduced as ((struct task_struct *)-1L) - * (https://do-db2.lkml.org/lkml/2018/5/15/985) - * If rw_semaphore.owner == 1 then it's a reader. - * - * 4.20+: - * RWSEM_OWNER_UNKNOWN changed to ((struct task_struct *)-2L) - * (https://lkml.org/lkml/2018/9/6/986) - * If rw_semaphore.owner & 1 then it's a reader, and also the reader's - * task_struct may be embedded in rw_semaphore->owner. 
- */ -#if defined(CONFIG_RWSEM_SPIN_ON_OWNER) && defined(RWSEM_OWNER_UNKNOWN) - if (RWSEM_OWNER_UNKNOWN == (struct task_struct *)-2L) { - /* 4.20+ kernels with CONFIG_RWSEM_SPIN_ON_OWNER */ - return ((unsigned long) SEM(rwp)->owner & 1); - } -#endif - - /* < 4.20 kernel or !CONFIG_RWSEM_SPIN_ON_OWNER */ - return (rw_owner(rwp) == NULL || (unsigned long) rw_owner(rwp) == 1); + return (RW_LOCK_HELD(rwp) && rw_owner(rwp) == NULL); } /* diff --git a/module/spl/spl-rwlock.c b/module/spl/spl-rwlock.c index 86727ed1957c..886e16924e65 100644 --- a/module/spl/spl-rwlock.c +++ b/module/spl/spl-rwlock.c @@ -119,9 +119,6 @@ rwsem_tryupgrade(struct rw_semaphore *rwsem) if (__rwsem_tryupgrade(rwsem)) { rwsem_release(&rwsem->dep_map, 1, _RET_IP_); rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER - rwsem->owner = current; -#endif return (1); } return (0); From 3982d959c5b8577993740c03392c4efa750c0479 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 12 Jul 2019 14:06:36 -0700 Subject: [PATCH 071/109] Linux 5.3 compat: retire rw_tryupgrade() The Linux kernel's rwsems have never provided an interface to allow a reader to be upgraded to a writer. Historically, this functionality has been implemented by a SPL wrapper function. However, this approach depends on internal knowledge of the rw_semaphore and is therefore rather brittle. Since the ZFS code must always be able to fall back to rw_exit() and rw_enter() when an rw_tryupgrade() fails, this functionality isn't critical. Furthermore, the only potentially performance sensitive consumer is dmu_zfetch() and no decrease in performance was observed with this change applied. See the PR comments for additional testing details. Therefore, it is being retired to make the build more robust and to simplify the rwlock implementation.
Reviewed-by: Tony Hutter Reviewed-by: Tomohiro Kusumi Signed-off-by: Brian Behlendorf Closes #9029 --- include/spl/sys/rwlock.h | 60 +++-------------------- module/spl/spl-rwlock.c | 101 --------------------------------------- 2 files changed, 7 insertions(+), 154 deletions(-) diff --git a/include/spl/sys/rwlock.h b/include/spl/sys/rwlock.h index 5e052b532a42..89e02fa8f044 100644 --- a/include/spl/sys/rwlock.h +++ b/include/spl/sys/rwlock.h @@ -29,43 +29,6 @@ #include #include -/* Linux kernel compatibility */ -#if defined(CONFIG_PREEMPT_RT_FULL) -#define SPL_RWSEM_SINGLE_READER_VALUE (1) -#define SPL_RWSEM_SINGLE_WRITER_VALUE (0) -#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK) -#define SPL_RWSEM_SINGLE_READER_VALUE (1) -#define SPL_RWSEM_SINGLE_WRITER_VALUE (-1) -#elif defined(RWSEM_ACTIVE_MASK) -#define SPL_RWSEM_SINGLE_READER_VALUE (RWSEM_ACTIVE_READ_BIAS) -#define SPL_RWSEM_SINGLE_WRITER_VALUE (RWSEM_ACTIVE_WRITE_BIAS) -#endif - -/* Linux 3.16 changed activity to count for rwsem-spinlock */ -#if defined(CONFIG_PREEMPT_RT_FULL) -#define RWSEM_COUNT(sem) sem->read_depth -#elif defined(HAVE_RWSEM_ACTIVITY) -#define RWSEM_COUNT(sem) sem->activity -/* Linux 4.8 changed count to an atomic_long_t for !rwsem-spinlock */ -#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT) -#define RWSEM_COUNT(sem) atomic_long_read(&(sem)->count) -#else -#define RWSEM_COUNT(sem) sem->count -#endif - -#if defined(RWSEM_SPINLOCK_IS_RAW) -#define spl_rwsem_lock_irqsave(lk, fl) raw_spin_lock_irqsave(lk, fl) -#define spl_rwsem_unlock_irqrestore(lk, fl) \ - raw_spin_unlock_irqrestore(lk, fl) -#define spl_rwsem_trylock_irqsave(lk, fl) raw_spin_trylock_irqsave(lk, fl) -#else -#define spl_rwsem_lock_irqsave(lk, fl) spin_lock_irqsave(lk, fl) -#define spl_rwsem_unlock_irqrestore(lk, fl) spin_unlock_irqrestore(lk, fl) -#define spl_rwsem_trylock_irqsave(lk, fl) spin_trylock_irqsave(lk, fl) -#endif /* RWSEM_SPINLOCK_IS_RAW */ - -#define spl_rwsem_is_locked(rwsem) rwsem_is_locked(rwsem) - typedef enum { 
RW_DRIVER = 2, RW_DEFAULT = 4, @@ -133,7 +96,7 @@ spl_rw_lockdep_on_maybe(krwlock_t *rwp) \ static inline int RW_LOCK_HELD(krwlock_t *rwp) { - return (spl_rwsem_is_locked(SEM(rwp))); + return (rwsem_is_locked(SEM(rwp))); } static inline int @@ -170,6 +133,12 @@ RW_READ_HELD(krwlock_t *rwp) */ #define rw_destroy(rwp) ((void) 0) +/* + * Upgrading a rwsem from a reader to a writer is not supported by the + * Linux kernel. The lock must be dropped and reacquired as a writer. + */ +#define rw_tryupgrade(rwp) RW_WRITE_HELD(rwp) + #define rw_tryenter(rwp, rw) \ ({ \ int _rc_ = 0; \ @@ -228,24 +197,9 @@ RW_READ_HELD(krwlock_t *rwp) spl_rw_lockdep_on_maybe(rwp); \ }) -#define rw_tryupgrade(rwp) \ -({ \ - int _rc_ = 0; \ - \ - if (RW_WRITE_HELD(rwp)) { \ - _rc_ = 1; \ - } else { \ - spl_rw_lockdep_off_maybe(rwp); \ - if ((_rc_ = rwsem_tryupgrade(SEM(rwp)))) \ - spl_rw_set_owner(rwp); \ - spl_rw_lockdep_on_maybe(rwp); \ - } \ - _rc_; \ -}) /* END CSTYLED */ int spl_rw_init(void); void spl_rw_fini(void); -int rwsem_tryupgrade(struct rw_semaphore *rwsem); #endif /* _SPL_RWLOCK_H */ diff --git a/module/spl/spl-rwlock.c b/module/spl/spl-rwlock.c index 886e16924e65..10f7c38db4eb 100644 --- a/module/spl/spl-rwlock.c +++ b/module/spl/spl-rwlock.c @@ -24,106 +24,5 @@ * Solaris Porting Layer (SPL) Reader/Writer Lock Implementation. */ -#include -#include - -#if defined(CONFIG_PREEMPT_RT_FULL) - -#include -#define RT_MUTEX_OWNER_MASKALL 1UL - -static int -__rwsem_tryupgrade(struct rw_semaphore *rwsem) -{ -#if defined(READER_BIAS) && defined(WRITER_BIAS) - /* - * After the 4.9.20-rt16 kernel the realtime patch series lifted the - * single reader restriction. While this could be accommodated by - * adding additional compatibility code assume the rwsem can never - * be upgraded. All caller must already cleanly handle this case. 
- */ - return (0); -#else - ASSERT((struct task_struct *) - ((unsigned long)rwsem->lock.owner & ~RT_MUTEX_OWNER_MASKALL) == - current); - - /* - * Prior to 4.9.20-rt16 kernel the realtime patch series, rwsem is - * implemented as a single mutex held by readers and writers alike. - * However, this implementation would prevent a thread from taking - * a read lock twice, as the mutex would already be locked on - * the second attempt. Therefore the implementation allows a - * single thread to take a rwsem as read lock multiple times - * tracking that nesting as read_depth counter. - */ - if (rwsem->read_depth <= 1) { - /* - * In case, the current thread has not taken the lock - * more than once as read lock, we can allow an - * upgrade to a write lock. rwsem_rt.h implements - * write locks as read_depth == 0. - */ - rwsem->read_depth = 0; - return (1); - } - return (0); -#endif -} -#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK) -static int -__rwsem_tryupgrade(struct rw_semaphore *rwsem) -{ - int ret = 0; - unsigned long flags; - spl_rwsem_lock_irqsave(&rwsem->wait_lock, flags); - if (RWSEM_COUNT(rwsem) == SPL_RWSEM_SINGLE_READER_VALUE && - list_empty(&rwsem->wait_list)) { - ret = 1; - RWSEM_COUNT(rwsem) = SPL_RWSEM_SINGLE_WRITER_VALUE; - } - spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, flags); - return (ret); -} -#elif defined(RWSEM_ACTIVE_MASK) -#if defined(HAVE_RWSEM_ATOMIC_LONG_COUNT) -static int -__rwsem_tryupgrade(struct rw_semaphore *rwsem) -{ - long val; - val = atomic_long_cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE, - SPL_RWSEM_SINGLE_WRITER_VALUE); - return (val == SPL_RWSEM_SINGLE_READER_VALUE); -} -#else -static int -__rwsem_tryupgrade(struct rw_semaphore *rwsem) -{ - typeof(rwsem->count) val; - val = cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE, - SPL_RWSEM_SINGLE_WRITER_VALUE); - return (val == SPL_RWSEM_SINGLE_READER_VALUE); -} -#endif -#else -static int -__rwsem_tryupgrade(struct rw_semaphore *rwsem) -{ - return (0); -} -#endif - -int 
-rwsem_tryupgrade(struct rw_semaphore *rwsem) -{ - if (__rwsem_tryupgrade(rwsem)) { - rwsem_release(&rwsem->dep_map, 1, _RET_IP_); - rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); - return (1); - } - return (0); -} -EXPORT_SYMBOL(rwsem_tryupgrade); - int spl_rw_init(void) { return 0; } void spl_rw_fini(void) { } From 428a63cc62c31056b602e80ec072d8093ca049c8 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 12 Jul 2019 14:40:15 -0700 Subject: [PATCH 072/109] Retire unused spl_{mutex,rwlock}_{init_fini} These functions are unused and can be removed along with the spl-mutex.c and spl-rwlock.c source files. Reviewed-by: Tony Hutter Reviewed-by: Tomohiro Kusumi Signed-off-by: Brian Behlendorf Closes #9029 --- include/spl/sys/mutex.h | 3 --- include/spl/sys/rwlock.h | 4 ---- module/spl/Makefile.in | 2 -- module/spl/spl-generic.c | 38 +++++++++++++------------------------- module/spl/spl-mutex.c | 30 ------------------------------ module/spl/spl-rwlock.c | 28 ---------------------------- 6 files changed, 13 insertions(+), 92 deletions(-) delete mode 100644 module/spl/spl-mutex.c delete mode 100644 module/spl/spl-rwlock.c diff --git a/include/spl/sys/mutex.h b/include/spl/sys/mutex.h index a61f35c61eb1..73da23685590 100644 --- a/include/spl/sys/mutex.h +++ b/include/spl/sys/mutex.h @@ -181,7 +181,4 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ /* NOTE: do not dereference mp after this point */ \ } -int spl_mutex_init(void); -void spl_mutex_fini(void); - #endif /* _SPL_MUTEX_H */ diff --git a/include/spl/sys/rwlock.h b/include/spl/sys/rwlock.h index 89e02fa8f044..60f5bfd986b4 100644 --- a/include/spl/sys/rwlock.h +++ b/include/spl/sys/rwlock.h @@ -196,10 +196,6 @@ RW_READ_HELD(krwlock_t *rwp) downgrade_write(SEM(rwp)); \ spl_rw_lockdep_on_maybe(rwp); \ }) - /* END CSTYLED */ -int spl_rw_init(void); -void spl_rw_fini(void); - #endif /* _SPL_RWLOCK_H */ diff --git a/module/spl/Makefile.in b/module/spl/Makefile.in index 3bcbf63cbc63..e16666aa94f3 100644 --- 
a/module/spl/Makefile.in +++ b/module/spl/Makefile.in @@ -16,10 +16,8 @@ $(MODULE)-objs += spl-kmem.o $(MODULE)-objs += spl-kmem-cache.o $(MODULE)-objs += spl-kobj.o $(MODULE)-objs += spl-kstat.o -$(MODULE)-objs += spl-mutex.o $(MODULE)-objs += spl-proc.o $(MODULE)-objs += spl-procfs-list.o -$(MODULE)-objs += spl-rwlock.o $(MODULE)-objs += spl-taskq.o $(MODULE)-objs += spl-thread.o $(MODULE)-objs += spl-tsd.o diff --git a/module/spl/spl-generic.c b/module/spl/spl-generic.c index cd2fa2020510..3c5ef60bd1a4 100644 --- a/module/spl/spl-generic.c +++ b/module/spl/spl-generic.c @@ -694,51 +694,41 @@ spl_init(void) if ((rc = spl_kvmem_init())) goto out1; - if ((rc = spl_mutex_init())) - goto out2; - - if ((rc = spl_rw_init())) - goto out3; - if ((rc = spl_tsd_init())) - goto out4; + goto out2; if ((rc = spl_taskq_init())) - goto out5; + goto out3; if ((rc = spl_kmem_cache_init())) - goto out6; + goto out4; if ((rc = spl_vn_init())) - goto out7; + goto out5; if ((rc = spl_proc_init())) - goto out8; + goto out6; if ((rc = spl_kstat_init())) - goto out9; + goto out7; if ((rc = spl_zlib_init())) - goto out10; + goto out8; return (rc); -out10: - spl_kstat_fini(); -out9: - spl_proc_fini(); out8: - spl_vn_fini(); + spl_kstat_fini(); out7: - spl_kmem_cache_fini(); + spl_proc_fini(); out6: - spl_taskq_fini(); + spl_vn_fini(); out5: - spl_tsd_fini(); + spl_kmem_cache_fini(); out4: - spl_rw_fini(); + spl_taskq_fini(); out3: - spl_mutex_fini(); + spl_tsd_fini(); out2: spl_kvmem_fini(); out1: @@ -755,8 +745,6 @@ spl_fini(void) spl_kmem_cache_fini(); spl_taskq_fini(); spl_tsd_fini(); - spl_rw_fini(); - spl_mutex_fini(); spl_kvmem_fini(); } diff --git a/module/spl/spl-mutex.c b/module/spl/spl-mutex.c deleted file mode 100644 index ba818862b679..000000000000 --- a/module/spl/spl-mutex.c +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. 
- * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see . - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . - * - * Solaris Porting Layer (SPL) Mutex Implementation. - */ - -#include - -int spl_mutex_init(void) { return 0; } -void spl_mutex_fini(void) { } diff --git a/module/spl/spl-rwlock.c b/module/spl/spl-rwlock.c deleted file mode 100644 index 10f7c38db4eb..000000000000 --- a/module/spl/spl-rwlock.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see . - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . - * - * Solaris Porting Layer (SPL) Reader/Writer Lock Implementation. - */ - -int spl_rw_init(void) { return 0; } -void spl_rw_fini(void) { } From 3c144b92671df9c6e9d926e6c19a34893645500e Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Fri, 19 Jul 2019 04:48:46 +0900 Subject: [PATCH 073/109] Fix wrong comment on zcr_blksz_{min,max} These aren't tunable; illumos has this comment fixed in "3742 zfs comments need cleaner, more consistent style", so sync with that. Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Signed-off-by: Tomohiro Kusumi Closes #9052 --- module/zfs/zfs_vnops.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 9d8a9cbc5419..4f07111f25e3 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -5074,13 +5074,14 @@ zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) #ifdef HAVE_UIO_ZEROCOPY /* - * Tunable, both must be a power of 2. - * - * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf - * zcr_blksz_max: if set to less than the file block size, allow loaning out of - * an arcbuf for a partial block read + * The smallest read we may consider to loan out an arcbuf. + * This must be a power of 2. */ int zcr_blksz_min = (1 << 10); /* 1K */ +/* + * If set to less than the file block size, allow loaning out of an + * arcbuf for a partial block read. This must be a power of 2. + */ int zcr_blksz_max = (1 << 17); /* 128K */ /*ARGSUSED*/ From bbbe4b0a9885fb671186da86b63c09f262852c65 Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Thu, 18 Jul 2019 12:55:29 -0700 Subject: [PATCH 074/109] hdr_recl calls zthr_wakeup() on destroyed zthr There exists a race condition where hdr_recl() calls zthr_wakeup() on a destroyed zthr.
The timeline is the following: [1] hdr_recl() runs first and goes into zthr_wakeup() because arc_initialized is set. [2] arc_fini() is called by another thread, zeroes that flag, destroying the zthr, and goes into buf_fini(). [3] hdr_recl() tries to enter the destroyed mutex and we blow up. This patch ensures that the ARC's zthrs are not offloaded any new work once arc_initialized is cleared and then destroys them after all of the ARC state has been deleted. Reviewed by: Matt Ahrens Reviewed by: Brian Behlendorf Signed-off-by: Serapheim Dimitropoulos Closes #9047 --- module/zfs/arc.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index f125ca6a4d14..53a44bdaf44c 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
*/ @@ -5079,6 +5079,9 @@ arc_kmem_reap_soon(void) static boolean_t arc_adjust_cb_check(void *arg, zthr_t *zthr) { + if (!arc_initialized) + return (B_FALSE); + /* * This is necessary so that any changes which may have been made to * many of the zfs_arc_* module parameters will be propagated to @@ -5166,6 +5169,9 @@ arc_adjust_cb(void *arg, zthr_t *zthr) static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { + if (!arc_initialized) + return (B_FALSE); + int64_t free_memory = arc_available_memory(); /* @@ -7924,11 +7930,9 @@ arc_fini(void) list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); - (void) zthr_cancel(arc_adjust_zthr); - zthr_destroy(arc_adjust_zthr); + (void) zthr_cancel(arc_adjust_zthr); (void) zthr_cancel(arc_reap_zthr); - zthr_destroy(arc_reap_zthr); mutex_destroy(&arc_adjust_lock); cv_destroy(&arc_adjust_waiters_cv); @@ -7941,6 +7945,14 @@ arc_fini(void) buf_fini(); arc_state_fini(); + /* + * We destroy the zthrs after all the ARC state has been + * torn down to avoid the case of them receiving any + * wakeup() signals after they are destroyed. + */ + zthr_destroy(arc_adjust_zthr); + zthr_destroy(arc_reap_zthr); + ASSERT0(arc_loaned_bytes); } From 1c4b0fc7457d6c6dac801f4a4a694ffe954bb91f Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Thu, 18 Jul 2019 13:02:33 -0700 Subject: [PATCH 075/109] Race condition between spa async threads and export In the past we've seen multiple race conditions that have to do with open-context threads, async threads, and concurrent calls to spa_export()/spa_destroy() (including the one referenced in issue #9015). This patch ensures that only one thread can execute the main body of spa_export_common() at a time, with subsequent threads returning with a new error code created just for this situation, thereby eliminating any race condition bugs introduced by concurrent calls to this function.
Reviewed by: Matt Ahrens Reviewed by: Brian Behlendorf Signed-off-by: Serapheim Dimitropoulos Closes #9015 Closes #9044 --- cmd/ztest/ztest.c | 18 +++++++++++++++++- include/libzfs.h | 1 + include/sys/fs/zfs.h | 1 + include/sys/spa_impl.h | 1 + lib/libzfs/libzfs_util.c | 5 +++++ module/zfs/spa.c | 18 +++++++++++++++++- 6 files changed, 42 insertions(+), 2 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 9c2cf9501831..3bf840d88ed6 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -2745,8 +2745,24 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); nvlist_free(nvroot); + + /* + * We open a reference to the spa and then we try to export it + * expecting one of the following errors: + * + * EBUSY + * Because of the reference we just opened. + * + * ZFS_ERR_EXPORT_IN_PROGRESS + * For the case that there is another ztest thread doing + * an export concurrently. + */ VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG)); - VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool)); + int error = spa_destroy(zo->zo_pool); + if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { + fatal(0, "spa_destroy(%s) returned unexpected value %d", + spa->spa_name, error); + } spa_close(spa, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); diff --git a/include/libzfs.h b/include/libzfs.h index e2ec2d9bce7b..a5b2a8393f43 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -147,6 +147,7 @@ typedef enum zfs_error { EZFS_NO_TRIM, /* no active trim */ EZFS_TRIM_NOTSUP, /* device does not support trim */ EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */ + EZFS_EXPORT_IN_PROGRESS, /* currently exporting the pool */ EZFS_UNKNOWN } zfs_error_t; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 3bcefdbfd775..c167a594a7d4 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1318,6 +1318,7 @@ typedef enum { ZFS_ERR_FROM_IVSET_GUID_MISSING, 
ZFS_ERR_FROM_IVSET_GUID_MISMATCH, ZFS_ERR_SPILL_BLOCK_FLAG_MISSING, + ZFS_ERR_EXPORT_IN_PROGRESS, } zfs_errno_t; /* diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 66032d9aad7a..0de8613d3eb8 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -219,6 +219,7 @@ struct spa { spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ + boolean_t spa_is_exporting; /* true while exporting pool */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_special_class; /* special allocation class */ diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 19bb57ad4378..dc2d68ebebbe 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -303,6 +303,8 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_NO_RESILVER_DEFER: return (dgettext(TEXT_DOMAIN, "this action requires the " "resilver_defer feature")); + case EZFS_EXPORT_IN_PROGRESS: + return (dgettext(TEXT_DOMAIN, "pool export in progress")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -598,6 +600,9 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ZFS_ERR_VDEV_TOO_BIG: zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap); break; + case ZFS_ERR_EXPORT_IN_PROGRESS: + zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap); + break; case ZFS_ERR_IOC_CMD_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support this operation. 
A reboot may " diff --git a/module/zfs/spa.c b/module/zfs/spa.c index eb3ff91a073c..ce622cee88b0 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -5722,6 +5722,13 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, return (SET_ERROR(ENOENT)); } + if (spa->spa_is_exporting) { + /* the pool is being exported by another thread */ + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); + } + spa->spa_is_exporting = B_TRUE; + /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. @@ -5757,6 +5764,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, (spa->spa_inject_ref != 0 && new_state != POOL_STATE_UNINITIALIZED)) { spa_async_resume(spa); + spa->spa_is_exporting = B_FALSE; mutex_exit(&spa_namespace_lock); return (SET_ERROR(EBUSY)); } @@ -5771,6 +5779,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { spa_async_resume(spa); + spa->spa_is_exporting = B_FALSE; mutex_exit(&spa_namespace_lock); return (SET_ERROR(EXDEV)); } @@ -5822,9 +5831,16 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); + } else { + /* + * If spa_remove() is not called for this spa_t and + * there is any possibility that it can be reused, + * we make sure to reset the exporting flag. + */ + spa->spa_is_exporting = B_FALSE; } - mutex_exit(&spa_namespace_lock); + mutex_exit(&spa_namespace_lock); return (0); } From be068aeea86433481c1bc18cf1a76ed033daea2e Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 19 Jul 2019 11:21:54 -0700 Subject: [PATCH 076/109] Move some tests to cli_user/zpool_status The tests in tests/functional/cli_root/zpool_status should all require root. 
However, linux.run has "user =" specified for those tests, which means they run as a normal user. When I removed that line to run them as root, the following tests did not pass: zpool_status_003_pos zpool_status_-c_disable zpool_status_-c_homedir zpool_status_-c_searchpath These tests need to be run as a normal user. To fix this, move these tests to a new tests/functional/cli_user/zpool_status directory. Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Reviewed-by: Giuseppe Di Natale Signed-off-by: Tony Hutter Closes #9057 --- configure.ac | 1 + tests/runfiles/linux.run | 11 ++++--- .../cli_root/zpool_status/Makefile.am | 6 +--- .../tests/functional/cli_user/Makefile.am | 3 +- .../cli_user/zpool_status/Makefile.am | 8 +++++ .../cli_user/zpool_status/cleanup.ksh | 30 +++++++++++++++++ .../cli_user/zpool_status/setup.ksh | 32 +++++++++++++++++++ .../zpool_status/zpool_status_-c_disable.ksh | 0 .../zpool_status/zpool_status_-c_homedir.ksh | 0 .../zpool_status_-c_searchpath.ksh | 0 .../zpool_status/zpool_status_003_pos.ksh | 0 11 files changed, 81 insertions(+), 10 deletions(-) create mode 100644 tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_status/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_status/setup.ksh rename tests/zfs-tests/tests/functional/{cli_root => cli_user}/zpool_status/zpool_status_-c_disable.ksh (100%) rename tests/zfs-tests/tests/functional/{cli_root => cli_user}/zpool_status/zpool_status_-c_homedir.ksh (100%) rename tests/zfs-tests/tests/functional/{cli_root => cli_user}/zpool_status/zpool_status_-c_searchpath.ksh (100%) rename tests/zfs-tests/tests/functional/{cli_root => cli_user}/zpool_status/zpool_status_003_pos.ksh (100%) diff --git a/configure.ac b/configure.ac index ea2e355c70bf..cf1d8b394adf 100644 --- a/configure.ac +++ b/configure.ac @@ -272,6 +272,7 @@ AC_CONFIG_FILES([ 
tests/zfs-tests/tests/functional/cli_user/zfs_list/Makefile tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile tests/zfs-tests/tests/functional/cli_user/zpool_list/Makefile + tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile tests/zfs-tests/tests/functional/compression/Makefile tests/zfs-tests/tests/functional/cp_files/Makefile tests/zfs-tests/tests/functional/ctime/Makefile diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 27e36b594ab5..c08bc4e31a36 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -462,10 +462,7 @@ tests = ['zpool_split_cliargs', 'zpool_split_devices', tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] -tests = ['zpool_status_001_pos', 'zpool_status_002_pos','zpool_status_003_pos', - 'zpool_status_-c_disable', 'zpool_status_-c_homedir', - 'zpool_status_-c_searchpath'] -user = +tests = ['zpool_status_001_pos', 'zpool_status_002_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] @@ -529,6 +526,12 @@ tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] user = tags = ['functional', 'cli_user', 'zpool_list'] +[tests/functional/cli_user/zpool_status] +tests = ['zpool_status_003_pos', 'zpool_status_-c_disable', + 'zpool_status_-c_homedir', 'zpool_status_-c_searchpath'] +user = +tags = ['functional', 'cli_user', 'zpool_status'] + [tests/functional/compression] tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', 'compress_004_pos'] diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am index aab4de0e7c89..beb59e3d066b 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am @@ -3,8 +3,4 @@ dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ zpool_status_001_pos.ksh \ - zpool_status_002_pos.ksh \ - 
zpool_status_003_pos.ksh \ - zpool_status_-c_disable.ksh \ - zpool_status_-c_homedir.ksh \ - zpool_status_-c_searchpath.ksh + zpool_status_002_pos.ksh diff --git a/tests/zfs-tests/tests/functional/cli_user/Makefile.am b/tests/zfs-tests/tests/functional/cli_user/Makefile.am index f1ff32e8d22d..119f8ee187f6 100644 --- a/tests/zfs-tests/tests/functional/cli_user/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_user/Makefile.am @@ -2,4 +2,5 @@ SUBDIRS = \ misc \ zfs_list \ zpool_iostat \ - zpool_list + zpool_list \ + zpool_status diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile.am b/tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile.am new file mode 100644 index 000000000000..e1b339657749 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile.am @@ -0,0 +1,8 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_user/zpool_status +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + zpool_status_003_pos.ksh \ + zpool_status_-c_disable.ksh \ + zpool_status_-c_homedir.ksh \ + zpool_status_-c_searchpath.ksh diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_status/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/cleanup.ksh new file mode 100755 index 000000000000..79cd6e9f908e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_status/cleanup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_status/setup.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/setup.ksh new file mode 100755 index 000000000000..6a9af3bc28c3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_status/setup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. 
$STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} + +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_disable.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_disable.ksh similarity index 100% rename from tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_disable.ksh rename to tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_disable.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_homedir.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh similarity index 100% rename from tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_homedir.ksh rename to tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_searchpath.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh similarity index 100% rename from tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_searchpath.ksh rename to tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_003_pos.ksh similarity index 100% rename from tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh rename to tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_003_pos.ksh From 65a0b28b42976a23c354f0518e0e1cc02b943b46 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Sat, 20 Jul 2019 03:23:56 +0900 Subject: [PATCH 077/109] Fix module_param() type for zfs_read_chunk_size zfs_read_chunk_size is unsigned long. 
Reviewed-by: Brian Behlendorf Signed-off-by: Tomohiro Kusumi Closes #9051 --- module/zfs/zfs_vnops.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 4f07111f25e3..2a49293c245c 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -5260,9 +5260,11 @@ EXPORT_SYMBOL(zfs_putpage); EXPORT_SYMBOL(zfs_dirty_inode); EXPORT_SYMBOL(zfs_map); -/* CSTYLED */ +/* BEGIN CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); -module_param(zfs_read_chunk_size, long, 0644); +module_param(zfs_read_chunk_size, ulong, 0644); MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk"); +/* END CSTYLED */ + #endif From 4f951b183c645f320ad375bb41b319634370e3ac Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Fri, 26 Jul 2019 03:59:20 +0900 Subject: [PATCH 078/109] Don't directly cast unsigned long to void* Cast to uintptr_t first for portability on integer to/from pointer conversion. 
Reviewed-by: Brian Behlendorf Signed-off-by: Tomohiro Kusumi Closes #9065 --- module/zfs/zfs_ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index c6b55d24f7ef..152433d60790 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -7110,7 +7110,8 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); - error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); + error = ddi_copyin((void *)(uintptr_t)arg, zc, sizeof (zfs_cmd_t), + flag); if (error != 0) { error = SET_ERROR(EFAULT); goto out; @@ -7269,7 +7270,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) out: nvlist_free(innvl); - rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); + rc = ddi_copyout(zc, (void *)(uintptr_t)arg, sizeof (zfs_cmd_t), flag); if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); if (error == 0 && vec->zvec_allow_log) { From 1f5979d23f4b06b3d8ebc58b7d7e3946393fa9ce Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Fri, 26 Jul 2019 12:07:48 -0700 Subject: [PATCH 079/109] zed crashes when devid not present zed core dumps due to a NULL pointer in zfs_agent_iter_vdev(). The gs_devid is NULL, but the nvl has a "devid" entry. zfs_agent_post_event() checks that ZFS_EV_VDEV_GUID or DEV_IDENTIFIER is present in nvl, but then later it and zfs_agent_iter_vdev() assume that DEV_IDENTIFIER is present and thus gs_devid is set. Typically this is not a problem because usually either all vdevs have devid's, or none of them do. Since zfs_agent_iter_vdev() first checks if the vdev has devid before dereferencing gs_devid, the problem isn't typically encountered. However, if some vdevs have devid's and some do not, then the problem is easily reproduced. This can happen if the pool has been moved from a system that has devid's to one that does not. 
The fix is for zfs_agent_iter_vdev() to only try to match the devid's if both nvl and gsp have devid's present. Reviewed-by: Prashanth Sreenivasa Reviewed-by: Don Brady Reviewed-by: Brian Behlendorf Reviewed-by: loli10K Signed-off-by: Matthew Ahrens External-issue: DLPX-65090 Closes #9054 Closes #9060 --- cmd/zed/agents/zfs_agents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c index 6d392604bceb..006e0ab99f47 100644 --- a/cmd/zed/agents/zfs_agents.c +++ b/cmd/zed/agents/zfs_agents.c @@ -116,7 +116,8 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) /* * On a devid match, grab the vdev guid and expansion time, if any. */ - if ((nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) && + if (gsp->gs_devid != NULL && + (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) && (strcmp(gsp->gs_devid, path) == 0)) { (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &gsp->gs_vdev_guid); From 6c68594675ed3fdc1d663da47eaeb27c3db97f29 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Sat, 27 Jul 2019 05:52:30 +0900 Subject: [PATCH 080/109] Implement secpolicy_vnode_setid_retain() Don't unconditionally return 0 (i.e. retain SUID/SGID). Test CAP_FSETID capability. https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t which expects SUID/SGID to be dropped on write(2) by non-owner fails without this. Most filesystems make this decision within VFS by using a generic file write for fops. 
Reviewed-by: Brian Behlendorf Signed-off-by: Tomohiro Kusumi Closes #9035 Closes #9043 --- configure.ac | 1 + module/zfs/policy.c | 2 +- tests/runfiles/linux.run | 5 + tests/zfs-tests/tests/functional/Makefile.am | 1 + .../tests/functional/suid/.gitignore | 1 + .../tests/functional/suid/Makefile.am | 16 +++ .../tests/functional/suid/cleanup.ksh | 34 +++++ .../zfs-tests/tests/functional/suid/setup.ksh | 35 +++++ .../functional/suid/suid_write_to_file.c | 133 ++++++++++++++++++ .../functional/suid/suid_write_to_none.ksh | 52 +++++++ .../functional/suid/suid_write_to_sgid.ksh | 52 +++++++ .../functional/suid/suid_write_to_suid.ksh | 52 +++++++ .../suid/suid_write_to_suid_sgid.ksh | 52 +++++++ 13 files changed, 435 insertions(+), 1 deletion(-) create mode 100644 tests/zfs-tests/tests/functional/suid/.gitignore create mode 100644 tests/zfs-tests/tests/functional/suid/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/suid/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/suid/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/suid/suid_write_to_file.c create mode 100755 tests/zfs-tests/tests/functional/suid/suid_write_to_none.ksh create mode 100755 tests/zfs-tests/tests/functional/suid/suid_write_to_sgid.ksh create mode 100755 tests/zfs-tests/tests/functional/suid/suid_write_to_suid.ksh create mode 100755 tests/zfs-tests/tests/functional/suid/suid_write_to_suid_sgid.ksh diff --git a/configure.ac b/configure.ac index cf1d8b394adf..e8592ffb1d2d 100644 --- a/configure.ac +++ b/configure.ac @@ -328,6 +328,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/snapshot/Makefile tests/zfs-tests/tests/functional/snapused/Makefile tests/zfs-tests/tests/functional/sparse/Makefile + tests/zfs-tests/tests/functional/suid/Makefile tests/zfs-tests/tests/functional/alloc_class/Makefile tests/zfs-tests/tests/functional/threadsappend/Makefile tests/zfs-tests/tests/functional/tmpfile/Makefile diff --git a/module/zfs/policy.c b/module/zfs/policy.c 
index 55c932747915..a723235d3015 100644 --- a/module/zfs/policy.c +++ b/module/zfs/policy.c @@ -209,7 +209,7 @@ secpolicy_vnode_setdac(const cred_t *cr, uid_t owner) int secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot) { - return (0); + return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); } /* diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index c08bc4e31a36..1c368d20c454 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -847,6 +847,11 @@ tags = ['functional', 'snapused'] tests = ['sparse_001_pos'] tags = ['functional', 'sparse'] +[tests/functional/suid] +tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid', + 'suid_write_to_none'] +tags = ['functional', 'suid'] + [tests/functional/threadsappend] tests = ['threadsappend_001_pos'] tags = ['functional', 'threadsappend'] diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am index da27673ec946..ac0ba7cf3d1d 100644 --- a/tests/zfs-tests/tests/functional/Makefile.am +++ b/tests/zfs-tests/tests/functional/Makefile.am @@ -66,6 +66,7 @@ SUBDIRS = \ snapshot \ snapused \ sparse \ + suid \ threadsappend \ tmpfile \ trim \ diff --git a/tests/zfs-tests/tests/functional/suid/.gitignore b/tests/zfs-tests/tests/functional/suid/.gitignore new file mode 100644 index 000000000000..a9a3db79ba44 --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/.gitignore @@ -0,0 +1 @@ +/suid_write_to_file diff --git a/tests/zfs-tests/tests/functional/suid/Makefile.am b/tests/zfs-tests/tests/functional/suid/Makefile.am new file mode 100644 index 000000000000..594d2b77ca8e --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/Makefile.am @@ -0,0 +1,16 @@ +include $(top_srcdir)/config/Rules.am + +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/suid + +dist_pkgdata_SCRIPTS = \ + suid_write_to_suid.ksh \ + suid_write_to_sgid.ksh \ + suid_write_to_suid_sgid.ksh \ + suid_write_to_none.ksh \ + 
cleanup.ksh \ + setup.ksh + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/suid + +pkgexec_PROGRAMS = suid_write_to_file +suid_write_to_file_SOURCES = suid_write_to_file.c diff --git a/tests/zfs-tests/tests/functional/suid/cleanup.ksh b/tests/zfs-tests/tests/functional/suid/cleanup.ksh new file mode 100755 index 000000000000..6e41e02faf58 --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/cleanup.ksh @@ -0,0 +1,34 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/suid/setup.ksh b/tests/zfs-tests/tests/functional/suid/setup.ksh new file mode 100755 index 000000000000..d04d5568c003 --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/setup.ksh @@ -0,0 +1,35 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_file.c b/tests/zfs-tests/tests/functional/suid/suid_write_to_file.c new file mode 100644 index 000000000000..571dc553bec2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_file.c @@ -0,0 +1,133 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +static void +test_stat_mode(mode_t extra) +{ + struct stat st; + int i, fd; + char fpath[1024]; + char *penv[] = {"TESTDIR", "TESTFILE0"}; + char buf[] = "test"; + mode_t res; + mode_t mode = 0777 | extra; + + /* + * Get the environment variable values. + */ + for (i = 0; i < sizeof (penv) / sizeof (char *); i++) { + if ((penv[i] = getenv(penv[i])) == NULL) { + fprintf(stderr, "getenv(penv[%d])\n", i); + exit(1); + } + } + + umask(0); + if (stat(penv[0], &st) == -1 && mkdir(penv[0], mode) == -1) { + perror("mkdir"); + exit(2); + } + + snprintf(fpath, sizeof (fpath), "%s/%s", penv[0], penv[1]); + unlink(fpath); + if (stat(fpath, &st) == 0) { + fprintf(stderr, "%s exists\n", fpath); + exit(3); + } + + fd = creat(fpath, mode); + if (fd == -1) { + perror("creat"); + exit(4); + } + close(fd); + + if (setuid(65534) == -1) { + perror("setuid"); + exit(5); + } + + fd = open(fpath, O_RDWR); + if (fd == -1) { + perror("open"); + exit(6); + } + + if (write(fd, buf, sizeof (buf)) == -1) { + perror("write"); + exit(7); + } + close(fd); + + if (stat(fpath, &st) == -1) { + perror("stat"); + exit(8); + } + unlink(fpath); + + /* Verify SUID/SGID are dropped */ + res = st.st_mode & (0777 | S_ISUID | S_ISGID); + if (res != (mode & 0777)) { + fprintf(stderr, "stat(2) %o\n", res); + exit(9); + } +} + +int +main(int argc, char *argv[]) +{ + const char *name; + mode_t extra; + + if (argc < 2) { + fprintf(stderr, "Invalid argc\n"); + exit(1); + } + + name = argv[1]; + if (strcmp(name, "SUID") == 0) { + extra = S_ISUID; + } else if (strcmp(name, "SGID") == 0) { + extra = S_ISGID; + } else if (strcmp(name, 
"SUID_SGID") == 0) { + extra = S_ISUID | S_ISGID; + } else if (strcmp(name, "NONE") == 0) { + extra = 0; + } else { + fprintf(stderr, "Invalid name %s\n", name); + exit(1); + } + + test_stat_mode(extra); + + return (0); +} diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_none.ksh b/tests/zfs-tests/tests/functional/suid/suid_write_to_none.ksh new file mode 100755 index 000000000000..dd01978619f9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_none.ksh @@ -0,0 +1,52 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify write(2) to regular file by non-owner. +# Also see https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t +# +# STRATEGY: +# 1. creat(2) a file. +# 2. write(2) to the file with uid=65534. +# 3. stat(2) the file and verify .st_mode value. 
+# + +verify_runnable "both" + +function cleanup +{ + rm -f $TESTDIR/$TESTFILE0 +} + +log_onexit cleanup +log_note "Verify write(2) to regular file by non-owner" + +log_must $STF_SUITE/tests/functional/suid/suid_write_to_file "NONE" + +log_pass "Verify write(2) to regular file by non-owner passed" diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_sgid.ksh b/tests/zfs-tests/tests/functional/suid/suid_write_to_sgid.ksh new file mode 100755 index 000000000000..49ae2bd1b31e --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_sgid.ksh @@ -0,0 +1,52 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify write(2) to SGID file by non-owner. +# Also see https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t +# +# STRATEGY: +# 1. creat(2) a file with SGID. +# 2. write(2) to the file with uid=65534. +# 3. stat(2) the file and verify .st_mode value. 
+# + +verify_runnable "both" + +function cleanup +{ + rm -f $TESTDIR/$TESTFILE0 +} + +log_onexit cleanup +log_note "Verify write(2) to SGID file by non-owner" + +log_must $STF_SUITE/tests/functional/suid/suid_write_to_file "SGID" + +log_pass "Verify write(2) to SGID file by non-owner passed" diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_suid.ksh b/tests/zfs-tests/tests/functional/suid/suid_write_to_suid.ksh new file mode 100755 index 000000000000..3983aad2e51d --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_suid.ksh @@ -0,0 +1,52 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify write(2) to SUID file by non-owner. +# Also see https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t +# +# STRATEGY: +# 1. creat(2) a file with SUID. +# 2. write(2) to the file with uid=65534. +# 3. stat(2) the file and verify .st_mode value. 
+# + +verify_runnable "both" + +function cleanup +{ + rm -f $TESTDIR/$TESTFILE0 +} + +log_onexit cleanup +log_note "Verify write(2) to SUID file by non-owner" + +log_must $STF_SUITE/tests/functional/suid/suid_write_to_file "SUID" + +log_pass "Verify write(2) to SUID file by non-owner passed" diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_suid_sgid.ksh b/tests/zfs-tests/tests/functional/suid/suid_write_to_suid_sgid.ksh new file mode 100755 index 000000000000..a058c7e7d4bc --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_suid_sgid.ksh @@ -0,0 +1,52 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify write(2) to SUID/SGID file by non-owner. +# Also see https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t +# +# STRATEGY: +# 1. creat(2) a file with SUID/SGID. +# 2. write(2) to the file with uid=65534. +# 3. stat(2) the file and verify .st_mode value. 
+# + +verify_runnable "both" + +function cleanup +{ + rm -f $TESTDIR/$TESTFILE0 +} + +log_onexit cleanup +log_note "Verify write(2) to SUID/SGID file by non-owner" + +log_must $STF_SUITE/tests/functional/suid/suid_write_to_file "SUID_SGID" + +log_pass "Verify write(2) to SUID/SGID file by non-owner passed" From a8c5bcb5de431a792287fd355b8599513ddf69c5 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Sun, 28 Jul 2019 21:13:56 -0400 Subject: [PATCH 081/109] Race between zfs-share and zfs-mount services When a system boots the zfs-mount.service and the zfs-share.service can start simultaneously. What may be unclear is that sharing a filesystem will first mount the filesystem if it's not already mounted. This means that both services can race to mount the same filesystem. This race can result in SEGFAULT or EBUSY conditions. This change explicitly defines the start ordering between the two services such that the zfs-mount.service is solely responsible for mounting filesystems eliminating the race between "zfs mount -a" and "zfs share -a" commands. 
Reviewed-by: Sebastien Roy Reviewed-by: Brian Behlendorf Signed-off-by: George Wilson Closes #9083 --- etc/systemd/system/zfs-share.service.in | 1 + 1 file changed, 1 insertion(+) diff --git a/etc/systemd/system/zfs-share.service.in b/etc/systemd/system/zfs-share.service.in index 75ff6e946767..5f4ba411b3cd 100644 --- a/etc/systemd/system/zfs-share.service.in +++ b/etc/systemd/system/zfs-share.service.in @@ -5,6 +5,7 @@ After=nfs-server.service nfs-kernel-server.service After=smb.service Before=rpc-statd-notify.service Wants=zfs-mount.service +After=zfs-mount.service PartOf=nfs-server.service nfs-kernel-server.service PartOf=smb.service From 8c00159411ed891b91f8b4f3d4356c038ffa81ca Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Sun, 28 Jul 2019 18:15:26 -0700 Subject: [PATCH 082/109] Fix channel programs on s390x When adapting the original sources for s390x the JMP_BUF_CNT was mistakenly halved due to an incorrect assumption of the size of a unsigned long. They are 8 bytes for the s390x architecture. Increase JMP_BUF_CNT accordingly. Authored-by: Don Brady Reviewed-by: Brian Behlendorf Reported-by: Colin Ian King Tested-by: Colin Ian King Signed-off-by: Brian Behlendorf Closes #8992 Closes #9080 --- module/lua/ldo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/lua/ldo.c b/module/lua/ldo.c index aca02b234770..59d0b6a2c298 100644 --- a/module/lua/ldo.c +++ b/module/lua/ldo.c @@ -61,7 +61,7 @@ #elif defined(__mips__) #define JMP_BUF_CNT 12 #elif defined(__s390x__) -#define JMP_BUF_CNT 9 +#define JMP_BUF_CNT 18 #else #define JMP_BUF_CNT 1 #endif From 6c9882d5dbc6bcaf39ae2ca54860743c083fa940 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Tue, 30 Jul 2019 09:18:30 -0700 Subject: [PATCH 083/109] Improve performance by using dmu_tx_hold_*_by_dnode() In zfs_write() and dmu_tx_hold_sa(), we can use dmu_tx_hold_*_by_dnode() instead of dmu_tx_hold_*(), since we already have a dbuf from the target dnode in hand. 
This eliminates some calls to dnode_hold(), which can be expensive. This is especially impactful if several threads are accessing objects that are in the same block of dnodes, because they will contend for that dbuf's lock. We are seeing 10-20% performance wins for the sequential_writes tests in the performance test suite, when doing >=128K writes to files with recordsize=8K. This also removes some unnecessary casts that are in the area. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Nguyen Signed-off-by: Matthew Ahrens Closes #9081 --- module/zfs/dmu_tx.c | 6 ++++-- module/zfs/sa.c | 10 +++++----- module/zfs/zfs_vnops.c | 8 ++++++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 7d65e842ff03..d6a42f84c751 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1338,7 +1338,10 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) object = sa_handle_object(hdl); - dmu_tx_hold_bonus(tx, object); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + DB_DNODE_ENTER(db); + dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db)); + DB_DNODE_EXIT(db); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; @@ -1360,7 +1363,6 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } else { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; dnode_t *dn; DB_DNODE_ENTER(db); diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 56a606962a7f..4999fef345dc 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -1380,7 +1380,7 @@ sa_handle_destroy(sa_handle_t *hdl) dmu_buf_rele(hdl->sa_bonus, NULL); if (hdl->sa_spill) - dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); + dmu_buf_rele(hdl->sa_spill, NULL); mutex_exit(&hdl->sa_lock); kmem_cache_free(sa_cache, hdl); @@ -2028,7 +2028,7 @@ sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, hdl->sa_spill_tab = NULL; } - dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); 
+ dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; } @@ -2131,13 +2131,13 @@ sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) void sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) { - dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi); + dmu_object_info_from_db(hdl->sa_bonus, doi); } void sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) { - dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus, + dmu_object_size_from_db(hdl->sa_bonus, blksize, nblocks); } @@ -2150,7 +2150,7 @@ sa_set_userp(sa_handle_t *hdl, void *ptr) dmu_buf_t * sa_get_db(sa_handle_t *hdl) { - return ((dmu_buf_t *)hdl->sa_bonus); + return (hdl->sa_bonus); } void * diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 2a49293c245c..7f33aea43d48 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -775,7 +775,11 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) */ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, + MIN(n, max_blksz)); + DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -1048,7 +1052,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) return (SET_ERROR(ENOENT)); } - zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; From 6d1599c1e1d1fabb14eb27f8f28d3c6b539f3fdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Niew=C3=B6hner?= Date: Tue, 30 Jul 2019 18:59:38 +0200 Subject: [PATCH 084/109] Increase default zcmd allocation to 256K MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When creating hundreds of clones (for example using 
containers with LXD) cloning slows down as the number of clones increases over time. The reason for this is that the fetching of the clone information using a small zcmd buffer requires two ioctl calls, one to determine the size and a second to return the data. However, this requires gathering the data twice, once to determine the size and again to populate the zcmd buffer to return it to userspace. These are expensive ioctl() calls, so instead, make the default buffer size much larger: 256K. Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Colin Ian King Signed-off-by: Michael Niewöhner Closes #9084 --- lib/libzfs/libzfs_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index dc2d68ebebbe..eed6282ca357 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -1139,7 +1139,7 @@ int zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) { if (len == 0) - len = 16 * 1024; + len = 256 * 1024; zc->zc_nvlist_dst_size = len; zc->zc_nvlist_dst = (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); From 569f5d5d0543a1f1f4958a65fafc3eb7bf1778d1 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Tue, 13 Aug 2019 20:21:27 -0700 Subject: [PATCH 085/109] Fix out-of-order ZIL txtype lost on hardlinked files We should only call zil_remove_async when an object is removed. However, in current implementation, it is called whenever TX_REMOVE is called. In the case of hardlinked file, every unlink will generate TX_REMOVE and causing operations to be dropped even when the object is not removed. We fix this by only calling zil_remove_async when the file is fully unlinked. 
Reviewed-by: George Wilson Reviewed-by: Brian Behlendorf Reviewed-by: Prakash Surya Signed-off-by: Chunwei Chen Closes #8769 Closes #9061 --- include/sys/zfs_znode.h | 2 +- module/zfs/zfs_log.c | 15 ++++++++++++++- module/zfs/zfs_vnops.c | 5 +++-- module/zfs/zil.c | 12 +----------- .../tests/functional/slog/slog_replay_fs.ksh | 8 ++++++++ 5 files changed, 27 insertions(+), 15 deletions(-) diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index d4a3ea769331..add45a7f46e4 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -371,7 +371,7 @@ extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, vattr_t *vap); extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name, uint64_t foid); + znode_t *dzp, char *name, uint64_t foid, boolean_t unlinked); #define ZFS_NO_OBJECT 0 /* no object id */ extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name); diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 15c396ce0329..5966b7612b35 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -380,12 +380,14 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, zil_itx_assign(zilog, itx, tx); } +void zil_remove_async(zilog_t *zilog, uint64_t oid); + /* * Handles both TX_REMOVE and TX_RMDIR transactions. */ void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name, uint64_t foid) + znode_t *dzp, char *name, uint64_t foid, boolean_t unlinked) { itx_t *itx; lr_remove_t *lr; @@ -401,6 +403,17 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, itx->itx_oid = foid; + /* + * Object ids can be re-instantiated in the next txg so + * remove any async transactions to avoid future leaks. 
+ * This can happen if a fsync occurs on the re-instantiated + * object for a WR_INDIRECT or WR_NEED_COPY write, which gets + * the new file data and flushes a write record for the old object. + */ + if (unlinked) { + ASSERT((txtype & ~TX_CI) == TX_REMOVE); + zil_remove_async(zilog, foid); + } zil_itx_assign(zilog, itx, tx); } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 7f33aea43d48..3c2278164289 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1886,7 +1886,7 @@ zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags) txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name, obj); + zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); dmu_tx_commit(tx); out: @@ -2219,7 +2219,8 @@ zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr, uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); + zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, + B_FALSE); } dmu_tx_commit(tx); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index ff14a98b6b25..5249a0e93666 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1824,7 +1824,7 @@ zil_aitx_compare(const void *x1, const void *x2) /* * Remove all async itx with the given oid. */ -static void +void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; @@ -1876,16 +1876,6 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) itxg_t *itxg; itxs_t *itxs, *clean = NULL; - /* - * Object ids can be re-instantiated in the next txg so - * remove any async transactions to avoid future leaks. - * This can happen if a fsync occurs on the re-instantiated - * object for a WR_INDIRECT or WR_NEED_COPY write, which gets - * the new file data and flushes a write record for the old object. 
- */ - if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) - zil_remove_async(zilog, itx->itx_oid); - /* * Ensure the data of a renamed file is committed before the rename. */ diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh index 5f281a756f15..ea3f8451b9e3 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh @@ -160,6 +160,14 @@ log_must attr -qs fileattr -V HelloWorld /$TESTPOOL/$TESTFS/xattr.file log_must attr -qs tmpattr -V HelloWorld /$TESTPOOL/$TESTFS/xattr.file log_must attr -qr tmpattr /$TESTPOOL/$TESTFS/xattr.file +# TX_WRITE, TX_LINK, TX_REMOVE +# Make sure TX_REMOVE won't affect TX_WRITE if file is not destroyed +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/link_and_unlink bs=128k \ + count=8 +log_must ln /$TESTPOOL/$TESTFS/link_and_unlink \ + /$TESTPOOL/$TESTFS/link_and_unlink.link +log_must rm /$TESTPOOL/$TESTFS/link_and_unlink.link + # # 4. Copy TESTFS to temporary location (TESTDIR/copy) # From 65469f6e302205858b26da93c191ffab5bedbdff Mon Sep 17 00:00:00 2001 From: Dominic Pearson Date: Tue, 20 Aug 2019 00:22:52 +0200 Subject: [PATCH 086/109] Linux 5.3 compat: Makefile subdir-m no longer supported Uses obj-m instead, due to kernel changes. 
See LKML: Masahiro Yamada, Tue, 6 Aug 2019 19:03:23 +0900 Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Signed-off-by: Dominic Pearson Closes #9169 --- .gitignore | 11 +++++++++++ module/Makefile.in | 24 ++++++++++++------------ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 549fa59f3822..ae9e22dfa7bb 100644 --- a/.gitignore +++ b/.gitignore @@ -63,3 +63,14 @@ cscope.* *.log venv +# +# Module leftovers +# +/module/avl/zavl.mod +/module/icp/icp.mod +/module/lua/zlua.mod +/module/nvpair/znvpair.mod +/module/spl/spl.mod +/module/unicode/zunicode.mod +/module/zcommon/zcommon.mod +/module/zfs/zfs.mod diff --git a/module/Makefile.in b/module/Makefile.in index eca7691aedbb..7477dbe56509 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -1,11 +1,11 @@ -subdir-m += avl -subdir-m += icp -subdir-m += lua -subdir-m += nvpair -subdir-m += spl -subdir-m += unicode -subdir-m += zcommon -subdir-m += zfs +obj-m += avl/ +obj-m += icp/ +obj-m += lua/ +obj-m += nvpair/ +obj-m += spl/ +obj-m += unicode/ +obj-m += zcommon/ +obj-m += zfs/ INSTALL_MOD_DIR ?= extra @@ -60,13 +60,13 @@ modules_install: modules_uninstall: @# Uninstall the kernel modules kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@ - list='$(subdir-m)'; for subdir in $$list; do \ - $(RM) -R $$kmoddir/$(INSTALL_MOD_DIR)/$$subdir; \ + list='$(obj-m)'; for objdir in $$list; do \ + $(RM) -R $$kmoddir/$(INSTALL_MOD_DIR)/$$objdir; \ done distdir: - list='$(subdir-m)'; for subdir in $$list; do \ - (cd @top_srcdir@/module && find $$subdir \ + list='$(obj-m)'; for objdir in $$list; do \ + (cd @top_srcdir@/module && find $$objdir \ -name '*.c' -o -name '*.h' -o -name '*.S' | \ xargs cp --parents -t @abs_top_builddir@/module/$$distdir); \ done From 023ab67a64fc297bb5d773406f5b1fc6dd0d957b Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 21 Aug 2019 09:29:23 -0700 Subject: [PATCH 087/109] Linux 5.3: Fix switch() fall though compiler errors Fix 
some switch() fall-through compiler errors: abd.c:1504:9: error: this statement may fall through Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #9170 --- module/lua/llex.c | 9 ++++++--- module/zfs/abd.c | 4 ++++ module/zfs/vdev_raidz_math_scalar.c | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/module/lua/llex.c b/module/lua/llex.c index 8760155d0546..50c301f599f1 100644 --- a/module/lua/llex.c +++ b/module/lua/llex.c @@ -431,9 +431,12 @@ static int llex (LexState *ls, SemInfo *seminfo) { if (sep >= 0) { read_long_string(ls, seminfo, sep); return TK_STRING; - } - else if (sep == -1) return '['; - else lexerror(ls, "invalid long string delimiter", TK_STRING); + } else if (sep == -1) { + return '['; + } else { + lexerror(ls, "invalid long string delimiter", TK_STRING); + break; + } } case '=': { next(ls); diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 9041bd8b1841..32b2c842c0df 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -1370,8 +1370,10 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, switch (parity) { case 3: len = MIN(caiters[2].iter_mapsize, len); + /* falls through */ case 2: len = MIN(caiters[1].iter_mapsize, len); + /* falls through */ case 1: len = MIN(caiters[0].iter_mapsize, len); } @@ -1461,9 +1463,11 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, case 3: len = MIN(xiters[2].iter_mapsize, len); len = MIN(citers[2].iter_mapsize, len); + /* falls through */ case 2: len = MIN(xiters[1].iter_mapsize, len); len = MIN(citers[1].iter_mapsize, len); + /* falls through */ case 1: len = MIN(xiters[0].iter_mapsize, len); len = MIN(citers[0].iter_mapsize, len); diff --git a/module/zfs/vdev_raidz_math_scalar.c b/module/zfs/vdev_raidz_math_scalar.c index a693bff63ffb..cd742e146ca6 100644 --- a/module/zfs/vdev_raidz_math_scalar.c +++ b/module/zfs/vdev_raidz_math_scalar.c @@ -142,6 +142,7 @@ static const struct { a.b[6] = mul_lt[a.b[6]]; \ a.b[5] = mul_lt[a.b[5]]; \ a.b[4] = mul_lt[a.b[4]]; \ + /* falls
through */ \ case 4: \ a.b[3] = mul_lt[a.b[3]]; \ a.b[2] = mul_lt[a.b[2]]; \ From 512a50f38d17f77118af6f297ddf7ba720a48ebc Mon Sep 17 00:00:00 2001 From: yshui Date: Fri, 23 Aug 2019 01:11:17 +0100 Subject: [PATCH 088/109] zfs-mount-genrator: dependencies should be space-separated Reviewed-by: Antonio Russo Reviewed-by: Richard Laager Signed-off-by: Yuxuan Shui Closes #9174 --- etc/systemd/system-generators/zfs-mount-generator.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/systemd/system-generators/zfs-mount-generator.in b/etc/systemd/system-generators/zfs-mount-generator.in index ae208c965f97..3e529cb67bb3 100755 --- a/etc/systemd/system-generators/zfs-mount-generator.in +++ b/etc/systemd/system-generators/zfs-mount-generator.in @@ -215,7 +215,7 @@ EOF fi # Update the dependencies for the mount file to require the # key-loading unit. - wants="${wants},${keyloadunit}" + wants="${wants} ${keyloadunit}" fi # If the mountpoint has already been created, give it precedence. From 33374f21f0f8922baa95796c70edcc4bc17df19f Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Thu, 22 Aug 2019 20:26:51 -0400 Subject: [PATCH 089/109] Make slog test setup more robust The slog tests fail when attempting to create pools using file vdevs that already exist from previous test runs. Remove these files in the setup for the test. 
Reviewed-by: Igor Kozhukhov Reviewed-by: Brian Behlendorf Reviewed-by: John Kennedy Signed-off-by: Ryan Moeller Closes #9194 --- tests/zfs-tests/tests/functional/slog/setup.ksh | 9 --------- tests/zfs-tests/tests/functional/slog/slog.kshlib | 11 ++++++++++- .../zfs-tests/tests/functional/slog/slog_001_pos.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_002_pos.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_003_pos.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_004_pos.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_005_pos.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_006_pos.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_007_pos.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_008_neg.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_009_neg.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_010_neg.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_011_neg.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_012_neg.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_013_pos.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_014_pos.ksh | 1 + .../zfs-tests/tests/functional/slog/slog_015_neg.ksh | 1 + .../tests/functional/slog/slog_replay_fs.ksh | 1 + .../tests/functional/slog/slog_replay_volume.ksh | 1 + 19 files changed, 27 insertions(+), 10 deletions(-) diff --git a/tests/zfs-tests/tests/functional/slog/setup.ksh b/tests/zfs-tests/tests/functional/slog/setup.ksh index f30824d3ee90..8e8d214d823c 100755 --- a/tests/zfs-tests/tests/functional/slog/setup.ksh +++ b/tests/zfs-tests/tests/functional/slog/setup.ksh @@ -38,13 +38,4 @@ if ! 
verify_slog_support ; then log_unsupported "This system doesn't support separate intent logs" fi -if [[ -d $VDEV ]]; then - log_must rm -rf $VDIR -fi -if [[ -d $VDEV2 ]]; then - log_must rm -rf $VDIR2 -fi -log_must mkdir -p $VDIR $VDIR2 -log_must truncate -s $MINVDEVSIZE $VDEV $SDEV $LDEV $VDEV2 $SDEV2 $LDEV2 - log_pass diff --git a/tests/zfs-tests/tests/functional/slog/slog.kshlib b/tests/zfs-tests/tests/functional/slog/slog.kshlib index 6ed7e4e0502f..75cfec2d832d 100644 --- a/tests/zfs-tests/tests/functional/slog/slog.kshlib +++ b/tests/zfs-tests/tests/functional/slog/slog.kshlib @@ -31,11 +31,20 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/slog/slog.cfg +function setup +{ + log_must rm -rf $VDIR $VDIR2 + log_must mkdir -p $VDIR $VDIR2 + log_must truncate -s $MINVDEVSIZE $VDEV $SDEV $LDEV $VDEV2 $SDEV2 $LDEV2 + + return 0 +} + function cleanup { poolexists $TESTPOOL && destroy_pool $TESTPOOL poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 - rm -rf $TESTDIR + rm -rf $TESTDIR $VDIR $VDIR2 } # diff --git a/tests/zfs-tests/tests/functional/slog/slog_001_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_001_pos.ksh index 3d3daf5f9ccc..a4c35ed9e98e 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_001_pos.ksh @@ -45,6 +45,7 @@ verify_runnable "global" log_assert "Creating a pool with a log device succeeds." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_002_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_002_pos.ksh index b056f19cdb80..91904aa612d1 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_002_pos.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Adding a log device to normal pool works." 
log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_003_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_003_pos.ksh index c647b8f54b75..0b4d6ede3e13 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_003_pos.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Adding an extra log device works." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_004_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_004_pos.ksh index 4b0b3439a2e3..10f28dcc000b 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_004_pos.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Attaching a log device passes." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_005_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_005_pos.ksh index cbbb9486913a..4836f6f27937 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_005_pos.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Detaching a log device passes." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_006_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_006_pos.ksh index 53e8c67ca005..24143196fd2e 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_006_pos.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Replacing a log device passes." 
log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_007_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_007_pos.ksh index 4926fb7b3192..27ac38606c29 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_007_pos.ksh @@ -48,6 +48,7 @@ verify_runnable "global" log_assert "Exporting and importing pool with log devices passes." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_008_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_008_neg.ksh index 587e0e321222..54587a0c61a7 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_008_neg.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_008_neg.ksh @@ -44,6 +44,7 @@ verify_runnable "global" log_assert "A raidz/raidz2 log is not supported." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_009_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_009_neg.ksh index e7091f17b759..222f71a99928 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_009_neg.ksh @@ -45,6 +45,7 @@ verify_runnable "global" log_assert "A raidz/raidz2 log can not be added to existed pool." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_010_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_010_neg.ksh index 8fe248ffbcba..edd9abea0930 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_010_neg.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_010_neg.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Slog device can not be replaced with spare device." 
log_onexit cleanup +log_must setup log_must zpool create $TESTPOOL $VDEV spare $SDEV log $LDEV sdev=$(random_get $SDEV) diff --git a/tests/zfs-tests/tests/functional/slog/slog_011_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_011_neg.ksh index 2dad200b31c1..3bebc8201713 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_011_neg.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_011_neg.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Offline and online a log device passes." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh index 45566d427f1d..8d6fb2bffb7f 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh @@ -45,6 +45,7 @@ verify_runnable "global" log_assert "Pool can survive when one of mirror log device get corrupted." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_013_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_013_pos.ksh index bbe5adc24174..d6917065ddbf 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_013_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_013_pos.ksh @@ -60,6 +60,7 @@ log_assert "Verify slog device can be disk, file, lofi device or any device " \ "that presents a block interface." 
verify_disk_count "$DISKS" 2 log_onexit cleanup_testenv +log_must setup dsk1=${DISKS%% *} log_must zpool create $TESTPOOL ${DISKS#$dsk1} diff --git a/tests/zfs-tests/tests/functional/slog/slog_014_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_014_pos.ksh index 0ec96ae1e6f7..e8ea29f1ffa3 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_014_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_014_pos.ksh @@ -44,6 +44,7 @@ verify_runnable "global" log_assert "log device can survive when one of the pool device get corrupted." +log_must setup for type in "mirror" "raidz" "raidz2"; do for spare in "" "spare"; do diff --git a/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh index 37821888ea00..fa6105116574 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh @@ -47,6 +47,7 @@ function cleanup ORIG_TIMEOUT=$(get_tunable zfs_commit_timeout_pct | tail -1 | awk '{print $NF}') log_onexit cleanup +log_must setup for PCT in 0 1 2 4 8 16 32 64 128 256 512 1024; do log_must set_tunable64 zfs_commit_timeout_pct $PCT diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh index ea3f8451b9e3..3e5bccd2ef18 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh @@ -66,6 +66,7 @@ function cleanup_fs log_assert "Replay of intent log succeeds." log_onexit cleanup_fs +log_must setup # # 1. 
Create an empty file system (TESTFS) diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh index c8a3cbbf43c4..a72c83b5bfc6 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh @@ -76,6 +76,7 @@ function cleanup_volume log_assert "Replay of intent log succeeds." log_onexit cleanup_volume +log_must setup # # 1. Create an empty volume (TESTVOL), set sync=always, and format From 95319fc569cf1ab322926f037b92dd4fd15b5630 Mon Sep 17 00:00:00 2001 From: Tom Caputi Date: Tue, 27 Aug 2019 12:55:51 -0400 Subject: [PATCH 090/109] Fix deadlock in 'zfs rollback' Currently, the 'zfs rollback' code can end up deadlocked due to the way the kernel handles unreferenced inodes on a suspended fs. Essentially, the zfs_resume_fs() code path may cause zfs to spawn new threads as it reinstantiates the suspended fs's zil. When a new thread is spawned, the kernel may attempt to free memory for that thread by freeing some unreferenced inodes. If it happens to select inodes that are a part of the suspended fs, a deadlock will occur because freeing inodes requires holding the fs's z_teardown_inactive_lock which is still held from the suspend. This patch corrects this issue by adding an additional reference to all inodes that are still present when a suspend is initiated. This prevents them from being freed by the kernel for any reason.
Reviewed-by: Alek Pinchuk Reviewed-by: Brian Behlendorf Signed-off-by: Tom Caputi Closes #9203 --- include/sys/zfs_znode.h | 1 + module/zfs/zfs_vfsops.c | 16 +++++++++++++++- module/zfs/zfs_znode.c | 1 + 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index add45a7f46e4..01b358cc4da8 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -196,6 +196,7 @@ typedef struct znode { uint8_t z_atime_dirty; /* atime needs to be synced */ uint8_t z_zn_prefetch; /* Prefetch znodes? */ uint8_t z_moved; /* Has this znode been moved? */ + boolean_t z_suspended; /* extra ref from a suspend? */ uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ uint64_t z_mapcnt; /* number of pages mapped to file */ diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index 371c412f6beb..489f12b7fc0f 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -1736,7 +1736,12 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) * will fail with EIO since we have z_teardown_lock for writer (only * relevant for forced unmount). * - * Release all holds on dbufs. + * Release all holds on dbufs. We also grab an extra reference to all + * the remaining inodes so that the kernel does not attempt to free + * any inodes of a suspended fs. This can cause deadlocks since the + * zfs_resume_fs() process may involve starting threads, which might + * attempt to free unreferenced inodes to free up memory for the new + * thread. 
*/ if (!unmounting) { mutex_enter(&zfsvfs->z_znodes_lock); @@ -1744,6 +1749,9 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) zp = list_next(&zfsvfs->z_all_znodes, zp)) { if (zp->z_sa_hdl) zfs_znode_dmu_fini(zp); + if (igrab(ZTOI(zp)) != NULL) + zp->z_suspended = B_TRUE; + } mutex_exit(&zfsvfs->z_znodes_lock); } @@ -2192,6 +2200,12 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) remove_inode_hash(ZTOI(zp)); zp->z_is_stale = B_TRUE; } + + /* see comment in zfs_suspend_fs() */ + if (zp->z_suspended) { + zfs_iput_async(ZTOI(zp)); + zp->z_suspended = B_FALSE; + } } mutex_exit(&zfsvfs->z_znodes_lock); diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 3dd299942202..91162e857d44 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -540,6 +540,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_moved = 0; + zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; zp->z_unlinked = 0; zp->z_atime_dirty = 0; From ea34735203a259f331dc549c25c7ed92b34cd470 Mon Sep 17 00:00:00 2001 From: Richard Allen <33836503+belperite@users.noreply.github.com> Date: Tue, 27 Aug 2019 21:44:02 +0100 Subject: [PATCH 091/109] Fix Plymouth passphrase prompt in initramfs script Entering the ZFS encryption passphrase under Plymouth wasn't working because in the ZFS initrd script, Plymouth was calling zfs via "--command", which wasn't passing through the filesystem argument to zfs load-key properly (it was passing through the single quotes around the filesystem name intended to handle spaces literally, which zfs load-key couldn't understand). 
Reviewed-by: Richard Laager Reviewed-by: Garrett Fields Signed-off-by: Richard Allen Issue #9193 Closes #9202 --- contrib/initramfs/scripts/zfs.in | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/contrib/initramfs/scripts/zfs.in b/contrib/initramfs/scripts/zfs.in index ad604a82ce52..05410ea2bdce 100644 --- a/contrib/initramfs/scripts/zfs.in +++ b/contrib/initramfs/scripts/zfs.in @@ -411,29 +411,29 @@ decrypt_fs() # Determine dataset that holds key for root dataset ENCRYPTIONROOT=$(${ZFS} get -H -o value encryptionroot "${fs}") - DECRYPT_CMD="${ZFS} load-key '${ENCRYPTIONROOT}'" # If root dataset is encrypted... if ! [ "${ENCRYPTIONROOT}" = "-" ]; then - + TRY_COUNT=3 # Prompt with plymouth, if active if [ -e /bin/plymouth ] && /bin/plymouth --ping 2>/dev/null; then - plymouth ask-for-password --prompt "Encrypted ZFS password for ${ENCRYPTIONROOT}" \ - --number-of-tries="3" \ - --command="${DECRYPT_CMD}" + while [ $TRY_COUNT -gt 0 ]; do + plymouth ask-for-password --prompt "Encrypted ZFS password for ${ENCRYPTIONROOT}" | \ + $ZFS load-key "${ENCRYPTIONROOT}" && break + TRY_COUNT=$((TRY_COUNT - 1)) + done # Prompt with systemd, if active elif [ -e /run/systemd/system ]; then - TRY_COUNT=3 while [ $TRY_COUNT -gt 0 ]; do systemd-ask-password "Encrypted ZFS password for ${ENCRYPTIONROOT}" --no-tty | \ - ${DECRYPT_CMD} && break + $ZFS load-key "${ENCRYPTIONROOT}" && break TRY_COUNT=$((TRY_COUNT - 1)) done # Prompt with ZFS tty, otherwise else - eval "${DECRYPT_CMD}" + $ZFS load-key "${ENCRYPTIONROOT}" fi fi fi From 931bef81c8a4bda13e22be770c1dca3721dffc0f Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Tue, 27 Aug 2019 23:45:53 +0300 Subject: [PATCH 092/109] zfs_ioc_snapshot: check user-prop permissions on snapshotted datasets Previously, the permissions were checked on the pool which was obviously incorrect. After this change, zfs_check_userprops() only validates the properties without any permission checks. 
The permissions are checked individually for each snapshotted dataset. Reviewed-by: Brian Behlendorf Reviewed-by: Matt Ahrens Signed-off-by: Andriy Gapon Closes #9179 Closes #9180 --- module/zfs/zfs_ioctl.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 152433d60790..ac573ccbf170 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -2744,10 +2744,9 @@ zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, * Check that all the properties are valid user properties. */ static int -zfs_check_userprops(const char *fsname, nvlist_t *nvl) +zfs_check_userprops(nvlist_t *nvl) { nvpair_t *pair = NULL; - int error = 0; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); @@ -2756,10 +2755,6 @@ zfs_check_userprops(const char *fsname, nvlist_t *nvl) nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); - if ((error = zfs_secpolicy_write_perms(fsname, - ZFS_DELEG_PERM_USERPROP, CRED()))) - return (error); - if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); @@ -3473,19 +3468,18 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); - if ((error = zfs_check_userprops(poolname, props)) != 0) - return (error); - if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); + if ((error = zfs_check_userprops(props)) != 0) + return (error); snaps = fnvlist_lookup_nvlist(innvl, "snaps"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); - const char *cp = strchr(name, '@'); + char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must @@ -3502,6 +3496,18 @@ zfs_ioc_snapshot(const 
char *poolname, nvlist_t *innvl, nvlist_t *outnvl) (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); + /* + * Check for permission to set the properties on the fs. + */ + if (!nvlist_empty(props)) { + *cp = '\0'; + error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_USERPROP, CRED()); + *cp = '@'; + if (error != 0) + return (error); + } + /* This must be the only snap of this fs. */ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { From c7a4255f128cc493df8383cb9f1ed650191b2081 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 28 Aug 2019 10:42:02 -0700 Subject: [PATCH 093/109] Fix zil replay panic when TX_REMOVE followed by TX_CREATE If TX_REMOVE is followed by TX_CREATE on the same object id, we need to make sure the object removal is completely finished before creation. The current implementation relies on dnode_hold_impl with DNODE_MUST_BE_ALLOCATED returning ENOENT. While this check seemed to work fine before, in the current version it does not guarantee that the object removal is complete. We fix this by checking whether dnode_hold_impl with DNODE_MUST_BE_FREE succeeds instead. Also add a test and remove dead code in dnode_hold_impl.
Reviewed-by: Brian Behlendorf Reviewed-by: Tom Caputi Signed-off-by: Chunwei Chen Closes #7151 Closes #8910 Closes #9123 Closes #9145 --- include/sys/dnode.h | 7 +- module/zfs/dnode.c | 49 +++++-- module/zfs/zfs_replay.c | 8 +- tests/runfiles/linux.run | 4 +- .../tests/functional/slog/Makefile.am | 3 +- ...g_replay_fs.ksh => slog_replay_fs_001.ksh} | 0 .../functional/slog/slog_replay_fs_002.ksh | 137 ++++++++++++++++++ 7 files changed, 184 insertions(+), 24 deletions(-) rename tests/zfs-tests/tests/functional/slog/{slog_replay_fs.ksh => slog_replay_fs_001.ksh} (100%) create mode 100755 tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh diff --git a/include/sys/dnode.h b/include/sys/dnode.h index c60258bbc768..e97e40373b4d 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -46,6 +46,7 @@ extern "C" { */ #define DNODE_MUST_BE_ALLOCATED 1 #define DNODE_MUST_BE_FREE 2 +#define DNODE_DRY_RUN 4 /* * dnode_next_offset() flags. @@ -415,6 +416,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots, boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting); +int dnode_try_claim(objset_t *os, uint64_t object, int slots); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, @@ -532,11 +534,6 @@ typedef struct dnode_stats { * a range of dnode slots which would overflow the dnode_phys_t. */ kstat_named_t dnode_hold_free_overflow; - /* - * Number of times a dnode_hold(...) was attempted on a dnode - * which had already been unlinked in an earlier txg. - */ - kstat_named_t dnode_hold_free_txg; /* * Number of times dnode_free_interior_slots() needed to retry * acquiring a slot zrl lock due to contention. 
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 5fd473303d7d..cc7bc5ec82c8 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -55,7 +55,6 @@ dnode_stats_t dnode_stats = { { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 }, { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 }, { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_txg", KSTAT_DATA_UINT64 }, { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 }, { "dnode_allocate", KSTAT_DATA_UINT64 }, { "dnode_reallocate", KSTAT_DATA_UINT64 }, @@ -1255,6 +1254,10 @@ dnode_buf_evict_async(void *dbu) * as an extra dnode slot by an large dnode, in which case it returns * ENOENT. * + * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just + * return whether the hold would succeed or not. tag and dnp should set to + * NULL in this case. + * * errors: * EINVAL - Invalid object number or flags. * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE) @@ -1283,6 +1286,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0)); ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0)); + IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL)); /* * If you are holding the spa config lock as writer, you shouldn't @@ -1312,8 +1316,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) return (SET_ERROR(EEXIST)); DNODE_VERIFY(dn); - (void) zfs_refcount_add(&dn->dn_holds, tag); - *dnp = dn; + /* Don't actually hold if dry run, just return 0 */ + if (!(flag & DNODE_DRY_RUN)) { + (void) zfs_refcount_add(&dn->dn_holds, tag); + *dnp = dn; + } return (0); } @@ -1455,6 +1462,14 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, return (SET_ERROR(ENOENT)); } + /* Don't actually hold if dry run, just return 0 */ + if (flag & DNODE_DRY_RUN) { + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); 
+ dbuf_rele(db, FTAG); + return (0); + } + DNODE_STAT_BUMP(dnode_hold_alloc_hits); } else if (flag & DNODE_MUST_BE_FREE) { @@ -1512,6 +1527,14 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, return (SET_ERROR(EEXIST)); } + /* Don't actually hold if dry run, just return 0 */ + if (flag & DNODE_DRY_RUN) { + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (0); + } + dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR); DNODE_STAT_BUMP(dnode_hold_free_hits); } else { @@ -1519,15 +1542,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, return (SET_ERROR(EINVAL)); } - if (dn->dn_free_txg) { - DNODE_STAT_BUMP(dnode_hold_free_txg); - type = dn->dn_type; - mutex_exit(&dn->dn_mtx); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ? - ENOENT : EEXIST)); - } + ASSERT0(dn->dn_free_txg); if (zfs_refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); @@ -1618,6 +1633,16 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) } } +/* + * Test whether we can create a dnode at the specified location. 
+ */ +int +dnode_try_claim(objset_t *os, uint64_t object, int slots) +{ + return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN, + slots, NULL, NULL)); +} + void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 144381769059..7dea85bb6614 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -337,8 +337,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) xva.xva_vattr.va_nblocks = lr->lr_gen; xva.xva_vattr.va_fsid = dnodesize; - error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); - if (error != ENOENT) + error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); + if (error) goto bail; if (lr->lr_common.lrc_txtype & TX_CI) @@ -473,8 +473,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) xva.xva_vattr.va_nblocks = lr->lr_gen; xva.xva_vattr.va_fsid = dnodesize; - error = dmu_object_info(zfsvfs->z_os, objid, NULL); - if (error != ENOENT) + error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); + if (error) goto out; if (lr->lr_common.lrc_txtype & TX_CI) diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 1c368d20c454..0e157cf0e98e 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -824,8 +824,8 @@ tags = ['functional', 'scrub_mirror'] tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos', 'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg', 'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg', - 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs', - 'slog_replay_volume'] + 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001', + 'slog_replay_fs_002', 'slog_replay_volume'] tags = ['functional', 'slog'] [tests/functional/snapshot] diff --git a/tests/zfs-tests/tests/functional/slog/Makefile.am b/tests/zfs-tests/tests/functional/slog/Makefile.am index 4548ce63b40c..33e3a6d3a496 100644 --- 
a/tests/zfs-tests/tests/functional/slog/Makefile.am +++ b/tests/zfs-tests/tests/functional/slog/Makefile.am @@ -17,7 +17,8 @@ dist_pkgdata_SCRIPTS = \ slog_013_pos.ksh \ slog_014_pos.ksh \ slog_015_neg.ksh \ - slog_replay_fs.ksh \ + slog_replay_fs_001.ksh \ + slog_replay_fs_002.ksh \ slog_replay_volume.ksh dist_pkgdata_DATA = \ diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh similarity index 100% rename from tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh rename to tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh new file mode 100755 index 000000000000..3c3ccdf4ad23 --- /dev/null +++ b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh @@ -0,0 +1,137 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. 
$STF_SUITE/tests/functional/slog/slog.kshlib + +# +# DESCRIPTION: +# Verify slog replay correctly when TX_REMOVEs are followed by +# TX_CREATEs. +# +# STRATEGY: +# 1. Create a file system (TESTFS) with a lot of files +# 2. Freeze TESTFS +# 3. Remove all files then create a lot of files +# 4. Copy TESTFS to temporary location (TESTDIR/copy) +# 5. Unmount filesystem +# +# 6. Remount TESTFS +# 7. Compare TESTFS against the TESTDIR/copy +# + +verify_runnable "global" + +function cleanup_fs +{ + cleanup +} + +log_assert "Replay of intent log succeeds." +log_onexit cleanup_fs +log_must setup + +# +# 1. Create a file system (TESTFS) with a lot of files +# +log_must zpool create $TESTPOOL $VDEV log mirror $LDEV +log_must zfs set compression=on $TESTPOOL +log_must zfs create $TESTPOOL/$TESTFS + +# Prep for the test of TX_REMOVE followed by TX_CREATE +dnsize=(legacy auto 1k 2k 4k 8k 16k) +NFILES=200 +log_must mkdir /$TESTPOOL/$TESTFS/dir0 +log_must eval 'for i in $(seq $NFILES); do zfs set dnodesize=${dnsize[$RANDOM % ${#dnsize[@]}]} $TESTPOOL/$TESTFS; touch /$TESTPOOL/$TESTFS/dir0/file.$i; done' + +# +# Reimport to reset dnode allocation pointer. +# This is to make sure we will have TX_REMOVE and TX_CREATE on same id +# +log_must zpool export $TESTPOOL +log_must zpool import -f -d $VDIR $TESTPOOL + +# +# This dd command works around an issue where ZIL records aren't created +# after freezing the pool unless a ZIL header already exists. Create a file +# synchronously to force ZFS to write one out. +# +log_must dd if=/dev/zero of=/$TESTPOOL/$TESTFS/sync \ + conv=fdatasync,fsync bs=1 count=1 + +# +# 2. Freeze TESTFS +# +log_must zpool freeze $TESTPOOL + +# +# 3. Remove all files then create a lot of files +# +# TX_REMOVE followed by TX_CREATE +log_must eval 'rm -f /$TESTPOOL/$TESTFS/dir0/*' +log_must eval 'for i in $(seq $NFILES); do zfs set dnodesize=${dnsize[$RANDOM % ${#dnsize[@]}]} $TESTPOOL/$TESTFS; touch /$TESTPOOL/$TESTFS/dir0/file.$i; done' + +# +# 4. 
Copy TESTFS to temporary location (TESTDIR/copy) +# +log_must mkdir -p $TESTDIR/copy +log_must cp -a /$TESTPOOL/$TESTFS/* $TESTDIR/copy/ + +# +# 5. Unmount filesystem and export the pool +# +# At this stage TESTFS is empty again and frozen, the intent log contains +# a complete set of deltas to replay. +# +log_must zfs unmount /$TESTPOOL/$TESTFS + +log_note "Verify transactions to replay:" +log_must zdb -iv $TESTPOOL/$TESTFS + +log_must zpool export $TESTPOOL + +# +# 6. Remount TESTFS +# +# Import the pool to unfreeze it and claim log blocks. It has to be +# `zpool import -f` because we can't write a frozen pool's labels! +# +log_must zpool import -f -d $VDIR $TESTPOOL + +# +# 7. Compare TESTFS against the TESTDIR/copy +# +log_note "Verify current block usage:" +log_must zdb -bcv $TESTPOOL + +log_note "Verify number of files" +log_must test "$(ls /$TESTPOOL/$TESTFS/dir0 | wc -l)" -eq $NFILES + +log_note "Verify working set diff:" +log_must diff -r /$TESTPOOL/$TESTFS $TESTDIR/copy + +log_pass "Replay of intent log succeeds." From 0e765c4eb89346a77733037a46b32aec85205a19 Mon Sep 17 00:00:00 2001 From: Pavel Zakharov Date: Wed, 28 Aug 2019 18:02:58 -0400 Subject: [PATCH 094/109] zfs_handle used after being closed/freed in change_one callback This is a typical case of use after free. We would call zfs_close(zhp) which would free the handle, and then call zfs_iter_children() on that handle later. This change ensures that the zfs_handle is only closed when we are ready to return. Running `zfs inherit -r sharenfs pool` was failing with an error code without any error messages. After some debugging I've pinpointed the issue to be memory corruption, which would cause zfs to try to issue an ioctl to the wrong device and receive ENOTTY. 
Reviewed-by: Paul Dagnelie Reviewed-by: George Wilson Reviewed-by: Sebastien Roy Reviewed-by: Brian Behlendorf Reviewed-by: Alek Pinchuk Signed-off-by: Pavel Zakharov Issue #7967 Closes #9165 --- lib/libzfs/libzfs_changelist.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index 3101febc1605..72f641056edc 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -475,9 +475,10 @@ change_one(zfs_handle_t *zhp, void *data) prop_changelist_t *clp = data; char property[ZFS_MAXPROPLEN]; char where[64]; - prop_changenode_t *cn; + prop_changenode_t *cn = NULL; zprop_source_t sourcetype = ZPROP_SRC_NONE; zprop_source_t share_sourcetype = ZPROP_SRC_NONE; + int ret = 0; /* * We only want to unmount/unshare those filesystems that may inherit @@ -493,8 +494,7 @@ change_one(zfs_handle_t *zhp, void *data) zfs_prop_get(zhp, clp->cl_prop, property, sizeof (property), &sourcetype, where, sizeof (where), B_FALSE) != 0) { - zfs_close(zhp); - return (0); + goto out; } /* @@ -506,8 +506,7 @@ change_one(zfs_handle_t *zhp, void *data) zfs_prop_get(zhp, clp->cl_shareprop, property, sizeof (property), &share_sourcetype, where, sizeof (where), B_FALSE) != 0) { - zfs_close(zhp); - return (0); + goto out; } if (clp->cl_alldependents || clp->cl_allchildren || @@ -518,8 +517,8 @@ change_one(zfs_handle_t *zhp, void *data) share_sourcetype == ZPROP_SRC_INHERITED))) { if ((cn = zfs_alloc(zfs_get_handle(zhp), sizeof (prop_changenode_t))) == NULL) { - zfs_close(zhp); - return (-1); + ret = -1; + goto out; } cn->cn_handle = zhp; @@ -541,16 +540,23 @@ change_one(zfs_handle_t *zhp, void *data) uu_avl_insert(clp->cl_tree, cn, idx); } else { free(cn); - zfs_close(zhp); + cn = NULL; } if (!clp->cl_alldependents) - return (zfs_iter_children(zhp, change_one, data)); - } else { - zfs_close(zhp); + ret = zfs_iter_children(zhp, change_one, data); + + /* + * If we added 
the handle to the changelist, we will re-use it + * later so return without closing it. + */ + if (cn != NULL) + return (ret); } - return (0); +out: + zfs_close(zhp); + return (ret); } static int From 3cf4ecb03fecca9d9a326c32e8f1f7573a93a8e3 Mon Sep 17 00:00:00 2001 From: Georgy Yakovlev <168902+gyakovlev@users.noreply.github.com> Date: Thu, 29 Aug 2019 12:14:48 -0800 Subject: [PATCH 095/109] etc/init.d/zfs-functions.in: remove arch warning Remove the x86_64 warning, it's no longer the case that this is the only supported architecture. Reviewed-by: Brian Behlendorf Signed-off-by: Georgy Yakovlev Closes: #9177 --- etc/init.d/zfs-functions.in | 7 ------- 1 file changed, 7 deletions(-) diff --git a/etc/init.d/zfs-functions.in b/etc/init.d/zfs-functions.in index 490503e91391..cbc7fd22a0a0 100644 --- a/etc/init.d/zfs-functions.in +++ b/etc/init.d/zfs-functions.in @@ -294,13 +294,6 @@ checksystem() # Just make sure that /dev/zfs is created. udev_trigger - if ! [ "$(uname -m)" = "x86_64" ]; then - echo "Warning: You're not running 64bit. Currently native zfs in"; - echo " Linux is only supported and tested on 64bit."; - # should we break here? People doing this should know what they - # do, thus i'm not breaking here. - fi - return 0 } From 13e5e396a31df268cba6571a800abe9e54c47db4 Mon Sep 17 00:00:00 2001 From: loli10K Date: Tue, 3 Sep 2019 19:36:33 +0200 Subject: [PATCH 096/109] Fix Intel QAT / ZFS compatibility on v4.7.1+ kernels This change use the compat code introduced in 9cc1844a. 
Reviewed-by: Brian Behlendorf Signed-off-by: loli10K Closes #9268 Closes #9269 --- module/zfs/qat_compress.c | 2 +- module/zfs/qat_crypt.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/module/zfs/qat_compress.c b/module/zfs/qat_compress.c index 1c5c0a4e7256..b3c8c1621675 100644 --- a/module/zfs/qat_compress.c +++ b/module/zfs/qat_compress.c @@ -547,7 +547,7 @@ qat_compress(qat_compress_dir_t dir, char *src, int src_len, } static int -param_set_qat_compress(const char *val, struct kernel_param *kp) +param_set_qat_compress(const char *val, zfs_kernel_param_t *kp) { int ret; int *pvalue = kp->arg; diff --git a/module/zfs/qat_crypt.c b/module/zfs/qat_crypt.c index 34c19b5823a8..2170366df142 100644 --- a/module/zfs/qat_crypt.c +++ b/module/zfs/qat_crypt.c @@ -578,7 +578,7 @@ qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp) } static int -param_set_qat_encrypt(const char *val, struct kernel_param *kp) +param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp) { int ret; int *pvalue = kp->arg; @@ -600,7 +600,7 @@ param_set_qat_encrypt(const char *val, struct kernel_param *kp) } static int -param_set_qat_checksum(const char *val, struct kernel_param *kp) +param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp) { int ret; int *pvalue = kp->arg; From beb21db3c6ac503a43ef7c6532d099c056f89f5b Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Tue, 3 Sep 2019 20:56:55 +0300 Subject: [PATCH 097/109] Always refuse receiving non-resume stream when resume state exists This fixes a hole in the situation where the resume state is left from receiving a new dataset and, so, the state is set on the dataset itself (as opposed to %recv child). Additionally, distinguish incremental and resume streams in error messages.
Reviewed-by: Matt Ahrens Reviewed-by: Tom Caputi Reviewed-by: Brian Behlendorf Signed-off-by: Andriy Gapon Closes #9252 --- lib/libzfs/libzfs_sendrecv.c | 15 +++++++++++---- module/zfs/dmu_recv.c | 10 +++++++--- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 0d3853e0a1c4..d967e043b4e5 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -3992,11 +3992,18 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } } else { /* - * if the fs does not exist, look for it based on the - * fromsnap GUID + * If the fs does not exist, look for it based on the + * fromsnap GUID. */ - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot receive incremental stream")); + if (resuming) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot receive resume stream")); + } else { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot receive incremental stream")); + } (void) strcpy(name, destsnap); *strchr(name, '@') = '\0'; diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 3481feb21dbc..2324e8e87ba2 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -86,21 +86,25 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0; boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0; - /* temporary clone name must not exist */ + /* Temporary clone name must not exist. */ error = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, 8, 1, &val); if (error != ENOENT) return (error == 0 ? EBUSY : error); - /* new snapshot name must not exist */ + /* Resume state must not be set. */ + if (dsl_dataset_has_resume_receive_state(ds)) + return (SET_ERROR(EBUSY)); + + /* New snapshot name must not exist. 
*/ error = zap_lookup(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, 8, 1, &val); if (error != ENOENT) return (error == 0 ? EEXIST : error); - /* must not have children if receiving a ZVOL */ + /* Must not have children if receiving a ZVOL. */ error = zap_count(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children); if (error != 0) From 38528476bf0b64e7462a1141ff73d016a94f3471 Mon Sep 17 00:00:00 2001 From: Pavel Zakharov Date: Wed, 17 Jul 2019 18:33:05 -0400 Subject: [PATCH 098/109] New service that waits on zvol links to be created MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The zfs-volume-wait.service scans existing zvols and waits for their links under /dev to be created. Any service that depends on zvol links to be there should add a dependency on zfs-volumes.target. By default, this target is not enabled. Reviewed-by: Fabian Grünbichler Reviewed-by: Antonio Russo Reviewed-by: Richard Laager Reviewed-by: loli10K Reviewed-by: John Gallagher Reviewed-by: George Wilson Reviewed-by: Brian Behlendorf Signed-off-by: Pavel Zakharov Closes #8975 --- cmd/Makefile.am | 2 +- cmd/zvol_wait/Makefile.am | 1 + cmd/zvol_wait/zvol_wait | 93 +++++++++++++++++++ configure.ac | 1 + etc/systemd/system/50-zfs.preset.in | 1 + etc/systemd/system/Makefile.am | 4 + etc/systemd/system/zfs-volume-wait.service.in | 13 +++ etc/systemd/system/zfs-volumes.target.in | 7 ++ man/man1/Makefile.am | 2 +- man/man1/zvol_wait.1 | 21 +++++ rpm/generic/zfs.spec.in | 3 +- 11 files changed, 145 insertions(+), 3 deletions(-) create mode 100644 cmd/zvol_wait/Makefile.am create mode 100755 cmd/zvol_wait/zvol_wait create mode 100644 etc/systemd/system/zfs-volume-wait.service.in create mode 100644 etc/systemd/system/zfs-volumes.target.in create mode 100644 man/man1/zvol_wait.1 diff --git a/cmd/Makefile.am b/cmd/Makefile.am index 0d990789b0c6..88609e455f2b 100644 --- a/cmd/Makefile.am +++ 
b/cmd/Makefile.am @@ -5,4 +5,4 @@ if USING_PYTHON SUBDIRS += arcstat arc_summary dbufstat endif -SUBDIRS += mount_zfs zed zvol_id +SUBDIRS += mount_zfs zed zvol_id zvol_wait diff --git a/cmd/zvol_wait/Makefile.am b/cmd/zvol_wait/Makefile.am new file mode 100644 index 000000000000..564031c9799d --- /dev/null +++ b/cmd/zvol_wait/Makefile.am @@ -0,0 +1 @@ +dist_bin_SCRIPTS = zvol_wait diff --git a/cmd/zvol_wait/zvol_wait b/cmd/zvol_wait/zvol_wait new file mode 100755 index 000000000000..d512be41bcb5 --- /dev/null +++ b/cmd/zvol_wait/zvol_wait @@ -0,0 +1,93 @@ +#!/bin/sh + +count_zvols() { + if [ -z "$zvols" ]; then + echo 0 + else + echo "$zvols" | wc -l + fi +} + +filter_out_zvols_with_links() { + while read -r zvol; do + if [ ! -L "/dev/zvol/$zvol" ]; then + echo "$zvol" + fi + done +} + +filter_out_deleted_zvols() { + while read -r zvol; do + if zfs list "$zvol" >/dev/null 2>&1; then + echo "$zvol" + fi + done +} + +list_zvols() { + zfs list -t volume -H -o name,volmode | while read -r zvol_line; do + name=$(echo "$zvol_line" | awk '{print $1}') + volmode=$(echo "$zvol_line" | awk '{print $2}') + # /dev links are not created for zvols with volmode = "none". + [ "$volmode" = "none" ] || echo "$name" + done +} + +zvols=$(list_zvols) +zvols_count=$(count_zvols) +if [ "$zvols_count" -eq 0 ]; then + echo "No zvols found, nothing to do." + exit 0 +fi + +echo "Testing $zvols_count zvol links" + +outer_loop=0 +while [ "$outer_loop" -lt 20 ]; do + outer_loop=$((outer_loop + 1)) + + old_zvols_count=$(count_zvols) + + inner_loop=0 + while [ "$inner_loop" -lt 30 ]; do + inner_loop=$((inner_loop + 1)) + + zvols="$(echo "$zvols" | filter_out_zvols_with_links)" + + zvols_count=$(count_zvols) + if [ "$zvols_count" -eq 0 ]; then + echo "All zvol links are now present." + exit 0 + fi + sleep 1 + done + + echo "Still waiting on $zvols_count zvol links ..." 
+ # + # Although zvols should normally not be deleted at boot time, + # if that is the case then their links will be missing and + # we would stall. + # + if [ "$old_zvols_count" -eq "$zvols_count" ]; then + echo "No progress since last loop." + echo "Checking if any zvols were deleted." + + zvols=$(echo "$zvols" | filter_out_deleted_zvols) + zvols_count=$(count_zvols) + + if [ "$old_zvols_count" -ne "$zvols_count" ]; then + echo "$((old_zvols_count - zvols_count)) zvol(s) deleted." + fi + + if [ "$zvols_count" -ne 0 ]; then + echo "Remaining zvols:" + echo "$zvols" + else + echo "All zvol links are now present." + exit 0 + fi + fi +done + +echo "Timed out waiting on zvol links" +exit 1 diff --git a/configure.ac b/configure.ac index e8592ffb1d2d..a3ac134ffccf 100644 --- a/configure.ac +++ b/configure.ac @@ -123,6 +123,7 @@ AC_CONFIG_FILES([ cmd/zed/zed.d/Makefile cmd/raidz_test/Makefile cmd/zgenhostid/Makefile + cmd/zvol_wait/Makefile contrib/Makefile contrib/bash_completion.d/Makefile contrib/dracut/Makefile diff --git a/etc/systemd/system/50-zfs.preset.in b/etc/systemd/system/50-zfs.preset.in index 884a69b5b683..e4056a92cd98 100644 --- a/etc/systemd/system/50-zfs.preset.in +++ b/etc/systemd/system/50-zfs.preset.in @@ -5,4 +5,5 @@ enable zfs-import.target enable zfs-mount.service enable zfs-share.service enable zfs-zed.service +enable zfs-volume-wait.service enable zfs.target diff --git a/etc/systemd/system/Makefile.am b/etc/systemd/system/Makefile.am index 1586209caa6d..9249f15eb455 100644 --- a/etc/systemd/system/Makefile.am +++ b/etc/systemd/system/Makefile.am @@ -7,7 +7,9 @@ systemdunit_DATA = \ zfs-import-scan.service \ zfs-mount.service \ zfs-share.service \ + zfs-volume-wait.service \ zfs-import.target \ + zfs-volumes.target \ zfs.target EXTRA_DIST = \ @@ -17,6 +19,8 @@ EXTRA_DIST = \ $(top_srcdir)/etc/systemd/system/zfs-mount.service.in \ $(top_srcdir)/etc/systemd/system/zfs-share.service.in \ $(top_srcdir)/etc/systemd/system/zfs-import.target.in \ + 
$(top_srcdir)/etc/systemd/system/zfs-volume-wait.service.in \ + $(top_srcdir)/etc/systemd/system/zfs-volumes.target.in \ $(top_srcdir)/etc/systemd/system/zfs.target.in \ $(top_srcdir)/etc/systemd/system/50-zfs.preset.in diff --git a/etc/systemd/system/zfs-volume-wait.service.in b/etc/systemd/system/zfs-volume-wait.service.in new file mode 100644 index 000000000000..75bd9fcdd56c --- /dev/null +++ b/etc/systemd/system/zfs-volume-wait.service.in @@ -0,0 +1,13 @@ +[Unit] +Description=Wait for ZFS Volume (zvol) links in /dev +DefaultDependencies=no +After=systemd-udev-settle.service +After=zfs-import.target + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=@bindir@/zvol_wait + +[Install] +WantedBy=zfs-volumes.target diff --git a/etc/systemd/system/zfs-volumes.target.in b/etc/systemd/system/zfs-volumes.target.in new file mode 100644 index 000000000000..5cb9a10f49c5 --- /dev/null +++ b/etc/systemd/system/zfs-volumes.target.in @@ -0,0 +1,7 @@ +[Unit] +Description=ZFS volumes are ready +After=zfs-volume-wait.service +Requires=zfs-volume-wait.service + +[Install] +WantedBy=zfs.target diff --git a/man/man1/Makefile.am b/man/man1/Makefile.am index bd78be1452a8..2af917fa5c2e 100644 --- a/man/man1/Makefile.am +++ b/man/man1/Makefile.am @@ -1,4 +1,4 @@ -dist_man_MANS = zhack.1 ztest.1 raidz_test.1 +dist_man_MANS = zhack.1 ztest.1 raidz_test.1 zvol_wait.1 EXTRA_DIST = cstyle.1 install-data-local: diff --git a/man/man1/zvol_wait.1 b/man/man1/zvol_wait.1 new file mode 100644 index 000000000000..0366da5376d3 --- /dev/null +++ b/man/man1/zvol_wait.1 @@ -0,0 +1,21 @@ +.Dd July 5, 2019 +.Dt ZVOL_WAIT 1 SMM +.Os Linux +.Sh NAME +.Nm zvol_wait +.Nd Wait for ZFS volume links in +.Em /dev +to be created. +.Sh SYNOPSIS +.Nm +.Sh DESCRIPTION +When a ZFS pool is imported, ZFS will register each ZFS volume +(zvol) as a disk device with the system. As the disks are registered, +.Xr \fBudev 7\fR +will asynchronously create symlinks under +.Em /dev/zvol +using the zvol's name. 
+.Nm +will wait for all those symlinks to be created before returning. +.Sh SEE ALSO +.Xr \fBudev 7\fR diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 0864a72a1155..4fdf7bb69ec7 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -322,7 +322,7 @@ image which is ZFS aware. %if 0%{?_systemd} %define systemd --enable-systemd --with-systemdunitdir=%{_unitdir} --with-systemdpresetdir=%{_presetdir} --with-systemdmodulesloaddir=%{_modulesloaddir} --with-systemdgeneratordir=%{_systemdgeneratordir} --disable-sysvinit - %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-share.service zfs-zed.service zfs.target zfs-import.target + %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-share.service zfs-zed.service zfs.target zfs-import.target zfs-volume-wait.service zfs-volumes.target %else %define systemd --enable-sysvinit --disable-systemd %endif @@ -419,6 +419,7 @@ systemctl --system daemon-reload >/dev/null || true %{_sbindir}/* %{_bindir}/raidz_test %{_bindir}/zgenhostid +%{_bindir}/zvol_wait # Optional Python 2/3 scripts %{_bindir}/arc_summary %{_bindir}/arcstat From 5acba22ec0bd934894d746ca967d451fdc6d3368 Mon Sep 17 00:00:00 2001 From: Pavel Zakharov Date: Tue, 3 Sep 2019 14:29:52 -0400 Subject: [PATCH 099/109] zvol_wait script should ignore partially received zvols Partially received zvols won't have links in /dev/zvol. 
Reviewed-by: Sebastien Roy Reviewed-by: Paul Dagnelie Reviewed-by: Brian Behlendorf Signed-off-by: Pavel Zakharov Closes #9260 --- cmd/zvol_wait/zvol_wait | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/cmd/zvol_wait/zvol_wait b/cmd/zvol_wait/zvol_wait index d512be41bcb5..e5df82dd376a 100755 --- a/cmd/zvol_wait/zvol_wait +++ b/cmd/zvol_wait/zvol_wait @@ -25,11 +25,30 @@ filter_out_deleted_zvols() { } list_zvols() { - zfs list -t volume -H -o name,volmode | while read -r zvol_line; do + zfs list -t volume -H -o name,volmode,receive_resume_token | + while read -r zvol_line; do name=$(echo "$zvol_line" | awk '{print $1}') volmode=$(echo "$zvol_line" | awk '{print $2}') + token=$(echo "$zvol_line" | awk '{print $3}') + # # /dev links are not created for zvols with volmode = "none". - [ "$volmode" = "none" ] || echo "$name" + # + [ "$volmode" = "none" ] && continue + # + # We also ignore partially received zvols if it is + # not an incremental receive, as those won't even have a block + # device minor node created yet. + # + if [ "$token" != "-" ]; then + # + # Incremental receives create an invisible clone that + # is not automatically displayed by zfs list. + # + if ! zfs list "$name/%recv" >/dev/null 2>&1; then + continue + fi + fi + echo "$name" done } From 9f261b1be681e93158d65fa8e5f2a0553af05b20 Mon Sep 17 00:00:00 2001 From: loli10K Date: Wed, 4 Sep 2019 00:20:39 +0200 Subject: [PATCH 100/109] Fix zfs-dkms .deb package warning in prerm script Debian zfs-dkms package generated by alien doesn't call the prerm script (rpm's %preun) with an integer as first parameter, which results in the following warning when the package is uninstalled: "zfs-dkms.prerm: line 3: [: remove: integer expression expected" Modify the if-condition to avoid the warning.
Reviewed-by: Brian Behlendorf Signed-off-by: loli10K Closes #9271 --- rpm/generic/zfs-dkms.spec.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in index 568bef988ca0..d87293686422 100644 --- a/rpm/generic/zfs-dkms.spec.in +++ b/rpm/generic/zfs-dkms.spec.in @@ -73,7 +73,7 @@ exit 1 %preun # Are we doing an upgrade? -if [ $1 -ne 0 ] ; then +if [ "$1" = "1" -o "$1" = "upgrade" ] ; then # Yes we are. Are we upgrading to a new ZFS version? NEWEST_VER=$(dkms status zfs | sed 's/,//g' | sort -r -V | awk '/installed/{print $2; exit}') if [ "$NEWEST_VER" != "%{version}" ] ; then From 146d7d8846d532a0ee66454ec0b14d6a511a6228 Mon Sep 17 00:00:00 2001 From: loli10K Date: Wed, 4 Sep 2019 22:36:25 +0200 Subject: [PATCH 101/109] Fix zpool subcommands error message with some unsupported options Both 'detach' and 'online' zpool subcommands, when provided with an unsupported option, forget to print it in the error message: # zpool online -t rpool vda3 invalid option '' usage: online [-e] ... This change fixes the error message in order to include the actual option that is not supported.
Reviewed-by: Ryan Moeller Reviewed-by: Brian Behlendorf Signed-off-by: loli10K Closes #9270 --- cmd/zpool/zpool_main.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index a3c76030d634..b9c7462b618e 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -6111,9 +6111,8 @@ zpool_do_detach(int argc, char **argv) int ret; /* check options */ - while ((c = getopt(argc, argv, "f")) != -1) { + while ((c = getopt(argc, argv, "")) != -1) { switch (c) { - case 'f': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -6342,12 +6341,11 @@ zpool_do_online(int argc, char **argv) int flags = 0; /* check options */ - while ((c = getopt(argc, argv, "et")) != -1) { + while ((c = getopt(argc, argv, "e")) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; - case 't': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); From 0ae5f0c8d29f2dff2470779cd7e1b4c3cfeaf12b Mon Sep 17 00:00:00 2001 From: Olaf Faaland Date: Fri, 6 Sep 2019 11:30:07 -0700 Subject: [PATCH 102/109] BuildRequires libtirpc-devel needed for RHEL 8 Building against RHEL 8 requires libtirpc-devel, as with fedora 28. Add rhel8 and centos8 options to the test, to account for that. 
BuildRequires Originally added for fedora 28 via commit 1a62a305be01972ef1b81469134faa4937836096 Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Olaf Faaland Closes #9289 --- rpm/generic/zfs.spec.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 4fdf7bb69ec7..b9ca5ed5fb74 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -139,7 +139,7 @@ BuildRequires: libblkid-devel BuildRequires: libudev-devel BuildRequires: libattr-devel BuildRequires: openssl-devel -%if 0%{?fedora} >= 28 +%if 0%{?fedora} >= 28 || 0%{?rhel} >= 8 || 0%{?centos} >= 8 BuildRequires: libtirpc-devel %endif Requires: openssl From 97d4986214e2f1a003f60a931bb6c9dafdead7bf Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 10 Sep 2019 13:42:30 -0700 Subject: [PATCH 103/109] Fix /etc/hostid on root pool deadlock Accidentally introduced by dc04a8c which now takes the SCL_VDEV lock as a reader in zfs_blkptr_verify(). A deadlock can occur if the /etc/hostid file resides on a dataset in the same pool. This is because reading the /etc/hostid file may occur while the caller is holding the SCL_VDEV lock as a writer. For example, to perform a `zpool attach` as shown in the abbreviated stack below. To resolve the issue we cache the system's hostid when initializing the spa_t, or when modifying the multihost property. The cached value is then relied upon for subsequent accesses. Call Trace: spa_config_enter+0x1e8/0x350 [zfs] zfs_blkptr_verify+0x33c/0x4f0 [zfs] <--- trying read lock zio_read+0x6c/0x140 [zfs] ... vfs_read+0xfc/0x1e0 kernel_read+0x50/0x90 ... 
spa_get_hostid+0x1c/0x38 [zfs] spa_config_generate+0x1a0/0x610 [zfs] vdev_label_init+0xa0/0xc80 [zfs] vdev_create+0x98/0xe0 [zfs] spa_vdev_attach+0x14c/0xb40 [zfs] <--- grabbed write lock Reviewed-by: loli10K Signed-off-by: Brian Behlendorf Closes #9256 Closes #9285 --- include/sys/spa.h | 2 +- include/sys/spa_impl.h | 1 + module/zfs/spa.c | 15 ++-- module/zfs/spa_config.c | 2 +- module/zfs/spa_misc.c | 19 +--- tests/runfiles/linux.run | 2 +- .../tests/functional/mmp/Makefile.am | 1 + .../tests/functional/mmp/mmp_hostid.ksh | 90 +++++++++++++++++++ 8 files changed, 109 insertions(+), 23 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_hostid.ksh diff --git a/include/sys/spa.h b/include/sys/spa.h index 23434edbc72e..ca63d3a49058 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1104,7 +1104,7 @@ extern uint64_t spa_missing_tvds_allowed(spa_t *spa); extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); -extern unsigned long spa_get_hostid(void); +extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern int spa_mode(spa_t *spa); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 0de8613d3eb8..9ab107599fd6 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -395,6 +395,7 @@ struct spa { mmp_thread_t spa_mmp; /* multihost mmp thread */ list_t spa_leaf_list; /* list of leaf vdevs */ uint64_t spa_leaf_list_gen; /* track leaf_list changes */ + uint32_t spa_hostid; /* cached system hostid */ /* * spa_refcount & spa_config_lock must be the last elements diff --git a/module/zfs/spa.c b/module/zfs/spa.c index ce622cee88b0..4e322e34b080 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -567,8 +567,13 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) if (!error && intval > 1) error = SET_ERROR(EINVAL); - if (!error && 
!spa_get_hostid()) - error = SET_ERROR(ENOTSUP); + if (!error) { + uint32_t hostid = zone_get_hostid(NULL); + if (hostid) + spa->spa_hostid = hostid; + else + error = SET_ERROR(ENOTSUP); + } break; @@ -2496,7 +2501,7 @@ spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); - if (hostid == spa_get_hostid()) + if (hostid == spa_get_hostid(spa)) return (B_FALSE); /* @@ -3015,7 +3020,7 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) spa->spa_config); if (activity_check) { if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && - spa_get_hostid() == 0) { + spa_get_hostid(spa) == 0) { nvlist_free(label); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); @@ -3695,7 +3700,7 @@ spa_ld_load_vdev_metadata(spa_t *spa) * be imported when the system hostid is zero. The exception to * this rule is zdb which is always allowed to access pools. 
*/ - if (spa_multihost(spa) && spa_get_hostid() == 0 && + if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 6c0894338e25..8c7c14999da6 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -457,7 +457,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); - hostid = spa_get_hostid(); + hostid = spa_get_hostid(spa); if (hostid != 0) fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname()->nodename); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index a111a9e4e611..185b70201483 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -658,6 +658,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; spa->spa_trust_config = B_TRUE; + spa->spa_hostid = zone_get_hostid(NULL); spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); @@ -2540,22 +2541,10 @@ spa_multihost(spa_t *spa) return (spa->spa_multihost ? B_TRUE : B_FALSE); } -unsigned long -spa_get_hostid(void) +uint32_t +spa_get_hostid(spa_t *spa) { - unsigned long myhostid; - -#ifdef _KERNEL - myhostid = zone_get_hostid(NULL); -#else /* _KERNEL */ - /* - * We're emulating the system's hostid in userland, so - * we can't use zone_get_hostid(). 
- */ - (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); -#endif /* _KERNEL */ - - return (myhostid); + return (spa->spa_hostid); } boolean_t diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 0e157cf0e98e..ff98661ec795 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -657,7 +657,7 @@ tags = ['functional', 'mmap'] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', - 'mmp_on_zdb', 'mmp_write_distribution'] + 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] tags = ['functional', 'mmp'] [tests/functional/mount] diff --git a/tests/zfs-tests/tests/functional/mmp/Makefile.am b/tests/zfs-tests/tests/functional/mmp/Makefile.am index e39a0a5aac8e..2848fd4ce692 100644 --- a/tests/zfs-tests/tests/functional/mmp/Makefile.am +++ b/tests/zfs-tests/tests/functional/mmp/Makefile.am @@ -12,6 +12,7 @@ dist_pkgdata_SCRIPTS = \ mmp_reset_interval.ksh \ mmp_on_zdb.ksh \ mmp_write_distribution.ksh \ + mmp_hostid.ksh \ setup.ksh \ cleanup.ksh diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_hostid.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_hostid.ksh new file mode 100755 index 000000000000..b492b1070caf --- /dev/null +++ b/tests/zfs-tests/tests/functional/mmp/mmp_hostid.ksh @@ -0,0 +1,90 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Lawrence Livermore National Security, LLC. 
+# + +# DESCRIPTION: +# Verify the hostid file can reside on a ZFS dataset. +# +# STRATEGY: +# 1. Create a non-redundant pool +# 2. Create an 'etc' dataset containing a valid hostid file +# 3. Create a file so the pool will have some contents +# 4. Verify multihost cannot be enabled until the /etc/hostid is linked +# 5. Verify vdevs may be attached and detached +# 6. Verify normal, cache, log and special vdevs can be added +# 7. Verify normal, cache, and log vdevs can be removed +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/mmp/mmp.cfg +. $STF_SUITE/tests/functional/mmp/mmp.kshlib + +verify_runnable "both" + +function cleanup +{ + default_cleanup_noexit + log_must rm $MMP_DIR/file.{0,1,2,3,4,5} + log_must rmdir $MMP_DIR + log_must mmp_clear_hostid +} + +log_assert "Verify hostid file can reside on a ZFS dataset" +log_onexit cleanup + +log_must mkdir -p $MMP_DIR +log_must truncate -s $MINVDEVSIZE $MMP_DIR/file.{0,1,2,3,4,5} + +# 1. Create a non-redundant pool +log_must zpool create $MMP_POOL $MMP_DIR/file.0 + +# 2. Create an 'etc' dataset containing a valid hostid file; caching is +# disabled on the dataset to force the hostid to be read from disk. +log_must zfs create -o primarycache=none -o secondarycache=none $MMP_POOL/etc +mntpnt_etc=$(get_prop mountpoint $MMP_POOL/etc) +log_must mmp_set_hostid $HOSTID1 +log_must mv $HOSTID_FILE $mntpnt_etc/hostid + +# 3. Create a file so the pool will have some contents +log_must zfs create $MMP_POOL/fs +mntpnt_fs=$(get_prop mountpoint $MMP_POOL/fs) +log_must mkfile 1M $mntpnt_fs/file + +# 4. Verify multihost cannot be enabled until the /etc/hostid is linked +log_mustnot zpool set multihost=on $MMP_POOL +log_must ln -s $mntpnt_etc/hostid $HOSTID_FILE +log_must zpool set multihost=on $MMP_POOL + +# 5. Verify vdevs may be attached and detached +log_must zpool attach $MMP_POOL $MMP_DIR/file.0 $MMP_DIR/file.1 +log_must zpool detach $MMP_POOL $MMP_DIR/file.1 + +# 6.
Verify normal, cache, log and special vdevs can be added +log_must zpool add $MMP_POOL $MMP_DIR/file.1 +log_must zpool add $MMP_POOL $MMP_DIR/file.2 +log_must zpool add $MMP_POOL cache $MMP_DIR/file.3 +log_must zpool add $MMP_POOL log $MMP_DIR/file.4 +log_must zpool add $MMP_POOL special $MMP_DIR/file.5 + +# 7. Verify normal, cache, and log vdevs can be removed +log_must zpool remove $MMP_POOL $MMP_DIR/file.2 +log_must zpool remove $MMP_POOL $MMP_DIR/file.3 +log_must zpool remove $MMP_POOL $MMP_DIR/file.4 + +log_pass "Verify hostid file can reside on a ZFS dataset." From e17445d1f70600c22cd319765c0e403d5f9d5024 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 11 Sep 2019 11:14:50 -0700 Subject: [PATCH 104/109] kmodtool: depmod path Determine the location of depmod on the system, either /sbin/depmod or /usr/sbin/depmod. Then use that path when generating the specfile. Additionally, update the Requires lines to reference the package which provides depmod rather than the binary itself. For CentOS/RHEL 7+8 and all supported Fedora releases this is the kmod package, and for CentOS/RHEL 6 it is the module-init-tools package. Reviewed-by: Minh Diep Signed-off-by: Olaf Faaland Signed-off-by: Brian Behlendorf Closes #8724 Closes #9310 --- scripts/kmodtool | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/scripts/kmodtool b/scripts/kmodtool index a632dd046b5a..9298d6d27dfa 100755 --- a/scripts/kmodtool +++ b/scripts/kmodtool @@ -144,7 +144,13 @@ print_rpmtemplate_per_kmodpkg () local kernel_uname_r=${1} local kernel_variant="${2:+-${2}}" - # first part + # Detect depmod install location + local depmod_path=/sbin/depmod + if [ ! 
-f ${depmod_path} ]; then + depmod_path=/usr/sbin/depmod + fi + + # first part cat <= %{?epoch:%{epoch}:}%{version} -Requires(post): ${prefix}/sbin/depmod -Requires(postun): ${prefix}/sbin/depmod + +%if 0%{?rhel} == 6 || 0%{?centos} == 6 +Requires(post): module-init-tools +Requires(postun): module-init-tools +%else +Requires(post): kmod +Requires(postun): kmod +%endif EOF if [[ ${obsolete_name} ]]; then @@ -170,17 +182,17 @@ BuildRequires: kernel-devel-uname-r = ${kernel_uname_r} %{?KmodsRequires:Requires: %{KmodsRequires}-uname-r = ${kernel_uname_r}} %{?KmodsRequires:BuildRequires: %{KmodsRequires}-uname-r = ${kernel_uname_r}} %post -n kmod-${kmodname}-${kernel_uname_r} -${prefix}/sbin/depmod -aeF /boot/System.map-${kernel_uname_r} ${kernel_uname_r} > /dev/null || : +${prefix}${depmod_path} -aeF /boot/System.map-${kernel_uname_r} ${kernel_uname_r} > /dev/null || : %postun -n kmod-${kmodname}-${kernel_uname_r} -${prefix}/sbin/depmod -aF /boot/System.map-${kernel_uname_r} ${kernel_uname_r} &> /dev/null || : +${prefix}${depmod_path} -aF /boot/System.map-${kernel_uname_r} ${kernel_uname_r} &> /dev/null || : EOF else cat < /dev/null || : +[[ "\$(uname -r)" == "${kernel_uname_r}" ]] && ${prefix}${depmod_path} -a > /dev/null || : %postun -n kmod-${kmodname}-${kernel_uname_r} -[[ "\$(uname -r)" == "${kernel_uname_r}" ]] && ${prefix}/sbin/depmod -a > /dev/null || : +[[ "\$(uname -r)" == "${kernel_uname_r}" ]] && ${prefix}${depmod_path} -a > /dev/null || : EOF fi From 9fa8b5b55b44f1d860b05587bff1dccd896cb77b Mon Sep 17 00:00:00 2001 From: Chengfei ZHu Date: Fri, 13 Sep 2019 04:33:44 +0800 Subject: [PATCH 105/109] QAT related bug fixes 1. Fix issue: Kernel BUG with QAT during decompression #9276. Now it is uninterruptible for a specific given QAT request, but Ctrl-C interrupt still works in user-space process. 2. Copy the digest result to the buffer only when doing encryption, and vise-versa for decryption. 
Reviewed-by: Tom Caputi Reviewed-by: Brian Behlendorf Signed-off-by: Chengfei Zhu Closes #9276 Closes #9303 --- module/zfs/qat.c | 2 +- module/zfs/qat.h | 5 ----- module/zfs/qat_compress.c | 14 +++----------- module/zfs/qat_crypt.c | 29 ++++++++++++++--------------- 4 files changed, 18 insertions(+), 32 deletions(-) diff --git a/module/zfs/qat.c b/module/zfs/qat.c index a6f024cb44d7..08613b3a2042 100644 --- a/module/zfs/qat.c +++ b/module/zfs/qat.c @@ -21,7 +21,7 @@ #if defined(_KERNEL) && defined(HAVE_QAT) #include -#include "qat.h" +#include qat_stats_t qat_stats = { { "comp_requests", KSTAT_DATA_UINT64 }, diff --git a/module/zfs/qat.h b/module/zfs/qat.h index 9014c03148ba..5c1cd15d09d6 100644 --- a/module/zfs/qat.h +++ b/module/zfs/qat.h @@ -40,11 +40,6 @@ typedef enum qat_encrypt_dir { #include "dc/cpa_dc.h" #include "lac/cpa_cy_sym.h" -/* - * Timeout - no response from hardware after 0.5 seconds - */ -#define QAT_TIMEOUT_MS 500 - /* * The minimal and maximal buffer size which are not restricted * in the QAT hardware, but with the input buffer size between 4KB diff --git a/module/zfs/qat_compress.c b/module/zfs/qat_compress.c index b3c8c1621675..46ccb997a3b7 100644 --- a/module/zfs/qat_compress.c +++ b/module/zfs/qat_compress.c @@ -28,7 +28,7 @@ #include #include #include -#include "qat.h" +#include /* * Max instances in a QAT device, each instance is a channel to submit @@ -404,11 +404,7 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, } /* we now wait until the completion of the operation. */ - if (!wait_for_completion_interruptible_timeout(&complete, - QAT_TIMEOUT_MS)) { - status = CPA_STATUS_FAIL; - goto fail; - } + wait_for_completion(&complete); if (dc_results.status != CPA_STATUS_SUCCESS) { status = CPA_STATUS_FAIL; @@ -463,11 +459,7 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, } /* we now wait until the completion of the operation. 
*/ - if (!wait_for_completion_interruptible_timeout(&complete, - QAT_TIMEOUT_MS)) { - status = CPA_STATUS_FAIL; - goto fail; - } + wait_for_completion(&complete); if (dc_results.status != CPA_STATUS_SUCCESS) { status = CPA_STATUS_FAIL; diff --git a/module/zfs/qat_crypt.c b/module/zfs/qat_crypt.c index 2170366df142..1e77f143e3ec 100644 --- a/module/zfs/qat_crypt.c +++ b/module/zfs/qat_crypt.c @@ -36,7 +36,7 @@ #include #include "lac/cpa_cy_im.h" #include "lac/cpa_cy_common.h" -#include "qat.h" +#include /* * Max instances in a QAT device, each instance is a channel to submit @@ -415,6 +415,9 @@ qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf, op_data.messageLenToCipherInBytes = enc_len; op_data.ivLenInBytes = ZIO_DATA_IV_LEN; bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN); + /* if dir is QAT_DECRYPT, copy digest_buf to pDigestResult */ + if (dir == QAT_DECRYPT) + bcopy(digest_buf, op_data.pDigestResult, ZIO_DATA_MAC_LEN); cb.verify_result = CPA_FALSE; init_completion(&cb.complete); @@ -423,23 +426,21 @@ qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf, if (status != CPA_STATUS_SUCCESS) goto fail; - if (!wait_for_completion_interruptible_timeout(&cb.complete, - QAT_TIMEOUT_MS)) { - status = CPA_STATUS_FAIL; - goto fail; - } + /* we now wait until the completion of the operation. 
*/ + wait_for_completion(&cb.complete); if (cb.verify_result == CPA_FALSE) { status = CPA_STATUS_FAIL; goto fail; } - /* save digest result to digest_buf */ - bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN); - if (dir == QAT_ENCRYPT) + if (dir == QAT_ENCRYPT) { + /* if dir is QAT_ENCRYPT, save pDigestResult to digest_buf */ + bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN); QAT_STAT_INCR(encrypt_total_out_bytes, enc_len); - else + } else { QAT_STAT_INCR(decrypt_total_out_bytes, enc_len); + } fail: if (status != CPA_STATUS_SUCCESS) @@ -549,11 +550,9 @@ qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp) if (status != CPA_STATUS_SUCCESS) goto fail; - if (!wait_for_completion_interruptible_timeout(&cb.complete, - QAT_TIMEOUT_MS)) { - status = CPA_STATUS_FAIL; - goto fail; - } + /* we now wait until the completion of the operation. */ + wait_for_completion(&cb.complete); + if (cb.verify_result == CPA_FALSE) { status = CPA_STATUS_FAIL; goto fail; From 63d8f57fe794dadc629c430470545b636665c1b6 Mon Sep 17 00:00:00 2001 From: loli10K Date: Sat, 14 Sep 2019 03:09:59 +0200 Subject: [PATCH 106/109] Scrubbing root pools may deadlock on kernels without elevator_change() (#9321) Originally the zfs_vdev_elevator module option was added as a convenience so the requested elevator would be automatically set on the underlying block devices. At the time this was simple because the kernel provided an API function which did exactly this. This API was then removed in the Linux 4.12 kernel which prompted us to add compatibility code to set the elevator via a usermodehelper. Unfortunately changing the elevator via usermodehelper requires reading some userland binaries, most notably modprobe(8) or sh(1), from a zfs dataset on systems with root-on-zfs.
This can deadlock the system if used during the following call path because it may need to read directly from disk, if the data is not already cached in the ARC, while holding the spa config lock as a writer: zfs_ioc_pool_scan() -> spa_scan() -> spa_scan() -> vdev_reopen() -> vdev_elevator_switch() -> call_usermodehelper() While the usermodehelper waits for sh(1), modprobe(8) is blocked in the ZIO pipeline trying to read from disk: INFO: task modprobe:2650 blocked for more than 10 seconds. Tainted: P OE 5.2.14 modprobe D 0 2650 206 0x00000000 Call Trace: ? __schedule+0x244/0x5f0 schedule+0x2f/0xa0 cv_wait_common+0x156/0x290 [spl] ? do_wait_intr_irq+0xb0/0xb0 spa_config_enter+0x13b/0x1e0 [zfs] zio_vdev_io_start+0x51d/0x590 [zfs] ? tsd_get_by_thread+0x3b/0x80 [spl] zio_nowait+0x142/0x2f0 [zfs] arc_read+0xb2d/0x19d0 [zfs] ... zpl_iter_read+0xfa/0x170 [zfs] new_sync_read+0x124/0x1b0 vfs_read+0x91/0x140 ksys_read+0x59/0xd0 do_syscall_64+0x4f/0x130 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This commit changes how we use the usermodehelper functionality from synchronous (UMH_WAIT_PROC) to asynchronous (UMH_NO_WAIT) which prevents scrubs, and other vdev_elevator_switch() consumers, from triggering the aforementioned issue.
Signed-off-by: Brian Behlendorf Signed-off-by: loli10K Issue #8664 Closes #9321 --- module/zfs/vdev_disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 1686ddfce77d..46437f21fb78 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -220,7 +220,7 @@ vdev_elevator_switch(vdev_t *v, char *elevator) char *envp[] = { NULL }; argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator); - error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + error = call_usermodehelper(argv[0], argv, envp, UMH_NO_WAIT); strfree(argv[2]); #endif /* HAVE_ELEVATOR_CHANGE */ if (error) { From 12a78fbb4fcbba6c4c8d9b0aa34d23e33107b0ae Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Wed, 18 Sep 2019 19:04:45 +0300 Subject: [PATCH 107/109] Fix dsl_scan_ds_clone_swapped logic The logic was incorrect with respect to swapping dataset IDs both in the on-disk ZAP object and the in-memory queue. In both cases, if ds1 was already present, then it would be first replaced with ds2 and then ds2 would be replaced back with ds1. Also, both cases did not properly handle a situation where both ds1 and ds2 are already queued. A duplicate insertion would be attempted and its failure would result in a panic. Reviewed-by: Matt Ahrens Reviewed-by: Tom Caputi Signed-off-by: Andriy Gapon Closes #9140 Closes #9163 --- module/zfs/dsl_scan.c | 100 +++++++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 31 deletions(-) diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 04a439fad5c5..9ccb17b7e141 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2165,16 +2165,17 @@ ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, } /* - * Called when a parent dataset and its clone are swapped. If we were + * Called when an origin dataset and its clone are swapped. If we were * currently traversing the dataset, we need to switch to traversing the - * newly promoted parent.
+ * newly promoted clone. */ void dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) { dsl_pool_t *dp = ds1->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg; + uint64_t mintxg1, mintxg2; + boolean_t ds1_queued, ds2_queued; if (!dsl_scan_is_running(scn)) return; @@ -2182,44 +2183,81 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); - if (scan_ds_queue_contains(scn, ds1->ds_object, &mintxg)) { - scan_ds_queue_remove(scn, ds1->ds_object); - scan_ds_queue_insert(scn, ds2->ds_object, mintxg); + /* + * Handle the in-memory scan queue. + */ + ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1); + ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2); + + /* Sanity checking. */ + if (ds1_queued) { + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + } + if (ds2_queued) { + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } - if (scan_ds_queue_contains(scn, ds2->ds_object, &mintxg)) { + + if (ds1_queued && ds2_queued) { + /* + * If both are queued, we don't need to do anything. + * The swapping code below would not handle this case correctly, + * since we can't insert ds2 if it is already there. That's + * because scan_ds_queue_insert() prohibits a duplicate insert + * and panics. 
+ */ + } else if (ds1_queued) { + scan_ds_queue_remove(scn, ds1->ds_object); + scan_ds_queue_insert(scn, ds2->ds_object, mintxg1); + } else if (ds2_queued) { scan_ds_queue_remove(scn, ds2->ds_object); - scan_ds_queue_insert(scn, ds1->ds_object, mintxg); + scan_ds_queue_insert(scn, ds1->ds_object, mintxg2); } - if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds1->ds_object, &mintxg) == 0) { - int err; - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + /* + * Handle the on-disk scan queue. + * The on-disk state is an out-of-date version of the in-memory state, + * so the in-memory and on-disk values for ds1_queued and ds2_queued may + * be different. Therefore we need to apply the swap logic to the + * on-disk state independently of the in-memory state. + */ + ds1_queued = zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0; + ds2_queued = zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0; + + /* Sanity checking. */ + if (ds1_queued) { + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + } + if (ds2_queued) { + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + } + + if (ds1_queued && ds2_queued) { + /* + * If both are queued, we don't need to do anything. + * Alternatively, we could check for EEXIST from + * zap_add_int_key() and back out to the original state, but + * that would be more work than checking for this case upfront. 
+ */ + } else if (ds1_queued) { + VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); - err = zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); - VERIFY(err == 0 || err == EEXIST); - if (err == EEXIST) { - /* Both were there to begin with */ - VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, - ds1->ds_object, mintxg, tx)); - } + VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx)); zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); - } - if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds2->ds_object, &mintxg) == 0) { - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + } else if (ds2_queued) { + VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); - VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); + VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx)); zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds2->ds_object, From c37fa0d5a86c1ce646fcceecfbb069d3dc1dc36d Mon Sep 17 00:00:00 2001 From: Kody A Kantor Date: Sun, 22 Sep 2019 17:25:39 -0500 Subject: [PATCH 108/109] Disabled resilver_defer feature leads to looping resilvers When a disk is replaced with another on a pool with the resilver_defer feature present, but not enabled the resilver activity restarts during each spa_sync. This patch checks to make sure that the resilver_defer feature is first enabled before requesting a deferred resilver. This was originally fixed in illumos-joyent as OS-7982. 
Reviewed-by: Chris Dunlop Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Reviewed-by: Tom Caputi Reviewed-by: Jerry Jelinek Signed-off-by: Kody A Kantor External-issue: illumos-joyent OS-7982 Closes #9299 Closes #9338 --- module/zfs/dsl_scan.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 9ccb17b7e141..202c6e8d8f3f 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -23,7 +23,7 @@ * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2016 Gary Mills * Copyright (c) 2017 Datto Inc. - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include @@ -952,13 +952,16 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) * will find the drives that need to be resilvered * when the machine reboots and start the resilver then. */ - boolean_t resilver_needed = - dsl_scan_clear_deferred(spa->spa_root_vdev, tx); - if (resilver_needed) { - spa_history_log_internal(spa, - "starting deferred resilver", tx, - "errors=%llu", spa_get_errlog_size(spa)); - spa_async_request(spa, SPA_ASYNC_RESILVER); + if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { + boolean_t resilver_needed = + dsl_scan_clear_deferred(spa->spa_root_vdev, tx); + if (resilver_needed) { + spa_history_log_internal(spa, + "starting deferred resilver", tx, + "errors=%llu", + (u_longlong_t)spa_get_errlog_size(spa)); + spa_async_request(spa, SPA_ASYNC_RESILVER); + } } } From 1222e921c9e3d8f5c693f196435be4604a1187c0 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 23 Aug 2019 15:52:32 -0700 Subject: [PATCH 109/109] Tag zfs-0.8.2 META file and changelog updated.
Signed-off-by: Tony Hutter --- META | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/META b/META index d9285b7732e4..960a2b73ab30 100644 --- a/META +++ b/META @@ -1,10 +1,10 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 0.8.1 +Version: 0.8.2 Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS on Linux -Linux-Maximum: 5.1 +Linux-Maximum: 5.3 Linux-Minimum: 2.6.32