From e0cd6c28a38bb514351eb696e613e0e36755f867 Mon Sep 17 00:00:00 2001
From: Rafael Kitover <rkitover@gmail.com>
Date: Thu, 23 May 2019 14:40:28 -0700
Subject: [PATCH 001/109] kernel timer API rework

In `config/kernel-timer.m4` refactor slightly to check more generally
for the new `timer_setup()` APIs, but also check the callback signature
because some kernels (notably 4.14) have the new `timer_setup()` API but
use the old callback signature. Also add a check for a `flags` member in
`struct timer_list`, which was added in 4.1-rc8.

Add compatibility shims to `include/spl/sys/timer.h` to allow using the
new timer APIs with the only two caveats being that the callback
argument type must be declared as `spl_timer_list_t` and an explicit
assignment is required to get the timer variable for the `from_timer()`
macro. So the callback would look like this:

```c
__cv_wakeup(spl_timer_list_t t)
{
        struct timer_list *tmr = (struct timer_list *)t;
	struct thing *parent = from_timer(parent, tmr,
		parent_timer_field);
	... /* do stuff with parent */
```

Make some minor changes to `spl-condvar.c` and `spl-taskq.c` to use the
new timer APIs instead of conditional code.

Reviewed-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rafael Kitover <rkitover@gmail.com>
Closes #8647
---
 config/kernel-timer.m4   | 63 +++++++++++++++++++++++++++++++++-------
 config/kernel.m4         |  2 +-
 include/spl/sys/timer.h  | 25 ++++++++++++++++
 module/spl/spl-condvar.c | 29 +++++++++++++-----
 module/spl/spl-taskq.c   | 24 +++------------
 5 files changed, 103 insertions(+), 40 deletions(-)

diff --git a/config/kernel-timer.m4 b/config/kernel-timer.m4
index 4dc3f84ed47e..b0e1afa153ab 100644
--- a/config/kernel-timer.m4
+++ b/config/kernel-timer.m4
@@ -1,26 +1,51 @@
+dnl # 4.14-rc3 API change
+dnl # https://lwn.net/Articles/735887/
 dnl #
-dnl # 4.15 API change
-dnl # https://lkml.org/lkml/2017/11/25/90
 dnl # Check if timer_list.func get passed a timer_list or an unsigned long
 dnl # (older kernels).  Also sanity check the from_timer() and timer_setup()
 dnl # macros are available as well, since they will be used in the same newer
 dnl # kernels that support the new timer_list.func signature.
 dnl #
-AC_DEFUN([ZFS_AC_KERNEL_TIMER_FUNCTION_TIMER_LIST], [
-	AC_MSG_CHECKING([whether timer_list.function gets a timer_list])
+dnl # Also check for the existance of flags in struct timer_list, they were
+dnl # added in 4.1-rc8 via 0eeda71bc30d.
+
+AC_DEFUN([ZFS_AC_KERNEL_TIMER_SETUP], [
+	AC_MSG_CHECKING([whether timer_setup() is available])
 	tmp_flags="$EXTRA_KCFLAGS"
 	EXTRA_KCFLAGS="-Werror"
+
 	ZFS_LINUX_TRY_COMPILE([
 		#include <linux/timer.h>
-		void task_expire(struct timer_list *tl) {}
+
+		struct my_task_timer {
+			struct timer_list timer;
+			int data;
+		};
+
+		void task_expire(struct timer_list *tl)
+		{
+			struct my_task_timer *task_timer = from_timer(task_timer, tl, timer);
+			task_timer->data = 42;
+		}
+	],[
+		struct my_task_timer task_timer;
+		timer_setup(&task_timer.timer, task_expire, 0);
 	],[
-		#ifndef from_timer
-		#error "No from_timer() macro"
-		#endif
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_KERNEL_TIMER_SETUP, 1,
+		    [timer_setup() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
 
-		struct timer_list timer;
-		timer.function = task_expire;
-		timer_setup(&timer, NULL, 0);
+	AC_MSG_CHECKING([whether timer function expects timer_list])
+
+	ZFS_LINUX_TRY_COMPILE([
+		#include <linux/timer.h>
+		void task_expire(struct timer_list *tl) {}
+	],[
+		struct timer_list tl;
+		tl.function = task_expire;
 	],[
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST, 1,
@@ -28,5 +53,21 @@ AC_DEFUN([ZFS_AC_KERNEL_TIMER_FUNCTION_TIMER_LIST], [
 	],[
 		AC_MSG_RESULT(no)
 	])
+
+	AC_MSG_CHECKING([whether struct timer_list has flags])
+
+	ZFS_LINUX_TRY_COMPILE([
+		#include <linux/timer.h>
+	],[
+		struct timer_list tl;
+		tl.flags = 2;
+	],[
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_KERNEL_TIMER_LIST_FLAGS, 1,
+		    [struct timer_list has a flags member])
+	],[
+		AC_MSG_RESULT(no)
+	])
+
 	EXTRA_KCFLAGS="$tmp_flags"
 ])
diff --git a/config/kernel.m4 b/config/kernel.m4
index 9a36302c0489..fbc04bdf7d70 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -36,7 +36,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
 	ZFS_AC_KERNEL_GROUP_INFO_GID
 	ZFS_AC_KERNEL_WRITE
 	ZFS_AC_KERNEL_READ
-	ZFS_AC_KERNEL_TIMER_FUNCTION_TIMER_LIST
+	ZFS_AC_KERNEL_TIMER_SETUP
 	ZFS_AC_KERNEL_DECLARE_EVENT_CLASS
 	ZFS_AC_KERNEL_CURRENT_BIO_TAIL
 	ZFS_AC_KERNEL_SUPER_USER_NS
diff --git a/include/spl/sys/timer.h b/include/spl/sys/timer.h
index a6b134570cd8..31d89d3b97d6 100644
--- a/include/spl/sys/timer.h
+++ b/include/spl/sys/timer.h
@@ -72,4 +72,29 @@ usleep_range(unsigned long min, unsigned long max)
 #define	USEC_TO_TICK(us)		usecs_to_jiffies(us)
 #define	NSEC_TO_TICK(ns)		usecs_to_jiffies(ns / NSEC_PER_USEC)
 
+#ifndef from_timer
+#define	from_timer(var, timer, timer_field) \
+	container_of(timer, typeof(*var), timer_field)
+#endif
+
+#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
+typedef struct timer_list *spl_timer_list_t;
+#else
+typedef unsigned long spl_timer_list_t;
+#endif
+
+#ifndef HAVE_KERNEL_TIMER_SETUP
+
+static inline void
+timer_setup(struct timer_list *timer, void (*func)(spl_timer_list_t), u32 fl)
+{
+#ifdef HAVE_KERNEL_TIMER_LIST_FLAGS
+	(timer)->flags = fl;
+#endif
+	init_timer(timer);
+	setup_timer(timer, func, (spl_timer_list_t)(timer));
+}
+
+#endif /* HAVE_KERNEL_TIMER_SETUP */
+
 #endif  /* _SPL_TIMER_H */
diff --git a/module/spl/spl-condvar.c b/module/spl/spl-condvar.c
index 1e6e38b7874b..a7a9d1db9a98 100644
--- a/module/spl/spl-condvar.c
+++ b/module/spl/spl-condvar.c
@@ -154,26 +154,39 @@ EXPORT_SYMBOL(__cv_wait_sig);
 #if defined(HAVE_IO_SCHEDULE_TIMEOUT)
 #define	spl_io_schedule_timeout(t)	io_schedule_timeout(t)
 #else
+
+struct spl_task_timer {
+	struct timer_list timer;
+	struct task_struct *task;
+};
+
 static void
-__cv_wakeup(unsigned long data)
+__cv_wakeup(spl_timer_list_t t)
 {
-	wake_up_process((struct task_struct *)data);
+	struct timer_list *tmr = (struct timer_list *)t;
+	struct spl_task_timer *task_timer = from_timer(task_timer, tmr, timer);
+
+	wake_up_process(task_timer->task);
 }
 
 static long
 spl_io_schedule_timeout(long time_left)
 {
 	long expire_time = jiffies + time_left;
-	struct timer_list timer;
+	struct spl_task_timer task_timer;
+	struct timer_list *timer = &task_timer.timer;
+
+	task_timer.task = current;
 
-	init_timer(&timer);
-	setup_timer(&timer, __cv_wakeup, (unsigned long)current);
-	timer.expires = expire_time;
-	add_timer(&timer);
+	timer_setup(timer, __cv_wakeup, 0);
+
+	timer->expires = expire_time;
+	add_timer(timer);
 
 	io_schedule();
 
-	del_timer_sync(&timer);
+	del_timer_sync(timer);
+
 	time_left = expire_time - jiffies;
 
 	return (time_left < 0 ? 0 : time_left);
diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c
index 7684257be7ad..a39f94e4cc20 100644
--- a/module/spl/spl-taskq.c
+++ b/module/spl/spl-taskq.c
@@ -24,6 +24,7 @@
  *  Solaris Porting Layer (SPL) Task Queue Implementation.
  */
 
+#include <sys/timer.h>
 #include <sys/taskq.h>
 #include <sys/kmem.h>
 #include <sys/tsd.h>
@@ -242,20 +243,13 @@ task_expire_impl(taskq_ent_t *t)
 	wake_up(&tq->tq_work_waitq);
 }
 
-#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
 static void
-task_expire(struct timer_list *tl)
+task_expire(spl_timer_list_t tl)
 {
-	taskq_ent_t *t = from_timer(t, tl, tqent_timer);
+	struct timer_list *tmr = (struct timer_list *)tl;
+	taskq_ent_t *t = from_timer(t, tmr, tqent_timer);
 	task_expire_impl(t);
 }
-#else
-static void
-task_expire(unsigned long data)
-{
-	task_expire_impl((taskq_ent_t *)data);
-}
-#endif
 
 /*
  * Returns the lowest incomplete taskqid_t.  The taskqid_t may
@@ -597,9 +591,6 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 	t->tqent_func = func;
 	t->tqent_arg = arg;
 	t->tqent_taskq = tq;
-#ifndef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
-	t->tqent_timer.data = 0;
-#endif
 	t->tqent_timer.function = NULL;
 	t->tqent_timer.expires = 0;
 	t->tqent_birth = jiffies;
@@ -649,9 +640,6 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
 	t->tqent_func = func;
 	t->tqent_arg = arg;
 	t->tqent_taskq = tq;
-#ifndef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
-	t->tqent_timer.data = (unsigned long)t;
-#endif
 	t->tqent_timer.function = task_expire;
 	t->tqent_timer.expires = (unsigned long)expire_time;
 	add_timer(&t->tqent_timer);
@@ -744,11 +732,7 @@ taskq_init_ent(taskq_ent_t *t)
 {
 	spin_lock_init(&t->tqent_lock);
 	init_waitqueue_head(&t->tqent_waitq);
-#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
 	timer_setup(&t->tqent_timer, NULL, 0);
-#else
-	init_timer(&t->tqent_timer);
-#endif
 	INIT_LIST_HEAD(&t->tqent_list);
 	t->tqent_id = 0;
 	t->tqent_func = NULL;

From 4933b0a25b24fbfe79d1495871cd9ed3eeae97ea Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Sat, 25 May 2019 08:43:23 +0900
Subject: [PATCH 002/109] Drop local definition of MOUNT_BUSY

It's accessible via <sys/mntent.h>.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tom Caputi <tcaputi@datto.com>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@osnexus.com>
Closes #8765
---
 module/zfs/zfs_ctldir.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
index 46e6e19b91d5..c8071a7c215f 100644
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -85,6 +85,7 @@
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_deleg.h>
 #include <sys/zpl.h>
+#include <sys/mntent.h>
 #include "zfs_namecheck.h"
 
 /*
@@ -1047,8 +1048,6 @@ zfsctl_snapshot_unmount(char *snapname, int flags)
 	return (error);
 }
 
-#define	MOUNT_BUSY 0x80		/* Mount failed due to EBUSY (from mntent.h) */
-
 int
 zfsctl_snapshot_mount(struct path *path, int flags)
 {

From e5a877c5d09cd6002cd5375f298570ac38a5b19d Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Sun, 26 May 2019 06:29:10 +0900
Subject: [PATCH 003/109] Update descriptions for vnops

These descriptions are not up to date with the code.

Reviewed-by: Igor Kozhukhov <igor@dilos.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #8767
---
 module/zfs/zfs_vnops.c | 20 +++++++++++---------
 module/zfs/zfs_znode.c |  7 +++----
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 885d9633b01f..9d8a9cbc5419 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -1676,6 +1676,7 @@ zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
  *	IN:	dip	- inode of directory to remove entry from.
  *		name	- name of entry to remove.
  *		cr	- credentials of caller.
+ *		flags	- case flags.
  *
  *	RETURN:	0 if success
  *		error code if failure
@@ -1917,6 +1918,7 @@ zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags)
  *		dirname	- name of new directory.
  *		vap	- attributes of new directory.
  *		cr	- credentials of caller.
+ *		flags	- case flags.
  *		vsecp	- ACL to be set
  *
  *	OUT:	ipp	- inode of created directory.
@@ -2235,13 +2237,12 @@ zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr,
 }
 
 /*
- * Read as many directory entries as will fit into the provided
- * dirent buffer from the given directory cursor position.
+ * Read directory entries from the given directory cursor position and emit
+ * name and position for each entry.
  *
  *	IN:	ip	- inode of directory to read.
- *		dirent	- buffer for directory entries.
- *
- *	OUT:	dirent	- filler buffer of directory entries.
+ *		ctx	- directory entry context.
+ *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		error code if failure
@@ -4006,13 +4007,14 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
  * Insert the indicated symbolic reference entry into the directory.
  *
  *	IN:	dip	- Directory to contain new symbolic link.
- *		link	- Name for new symlink entry.
+ *		name	- Name of directory entry in dip.
  *		vap	- Attributes of new entry.
- *		target	- Target path of new symlink.
- *
+ *		link	- Name for new symlink entry.
  *		cr	- credentials of caller.
  *		flags	- case flags
  *
+ *	OUT:	ipp	- Inode for new symbolic link.
+ *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
@@ -4216,6 +4218,7 @@ zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr)
  *		sip	- inode of new entry.
  *		name	- name of new entry.
  *		cr	- credentials of caller.
+ *		flags	- case flags.
  *
  *	RETURN:	0 if success
  *		error code if failure
@@ -4729,7 +4732,6 @@ zfs_inactive(struct inode *ip)
  *	IN:	ip	- inode seeking within
  *		ooff	- old file offset
  *		noffp	- pointer to new file offset
- *		ct	- caller context
  *
  *	RETURN:	0 if success
  *		EINVAL if new offset invalid
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index d5ed4af7029d..a27129b7992b 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -651,12 +651,11 @@ static zfs_acl_phys_t acl_phys;
  *		cr	- credentials of caller
  *		flag	- flags:
  *			  IS_ROOT_NODE	- new object will be root
+ *			  IS_TMPFILE	- new object is of O_TMPFILE
  *			  IS_XATTR	- new object is an attribute
- *		bonuslen - length of bonus buffer
- *		setaclp  - File/Dir initial ACL
- *		fuidp	 - Tracks fuid allocation.
+ *		acl_ids	- ACL related attributes
  *
- *	OUT:	zpp	- allocated znode
+ *	OUT:	zpp	- allocated znode (set to dzp if IS_ROOT_NODE)
  *
  */
 void

From 90d8067a77977184cbd99d18582984b9a767fb7f Mon Sep 17 00:00:00 2001
From: Ryan Moeller <ryan@freqlabs.com>
Date: Tue, 28 May 2019 15:18:31 -0700
Subject: [PATCH 004/109] Update comments to match code

s/get_vdev_spec/make_root_vdev

The former doesn't exist anymore.

Sponsored by: iXsystems, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tom Caputi <tcaputi@datto.com>
Signed-off-by: Ryan Moeller <ryan@freqlabs.com>
Closes #8759
---
 cmd/zpool/zpool_main.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 2cb6774b9adb..a3c76030d634 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -785,7 +785,7 @@ add_prop_list_default(const char *propname, char *propval, nvlist_t **props,
  *	-P	Display full path for vdev name.
  *
  * Adds the given vdevs to 'pool'.  As with create, the bulk of this work is
- * handled by get_vdev_spec(), which constructs the nvlist needed to pass to
+ * handled by make_root_vdev(), which constructs the nvlist needed to pass to
  * libzfs.
  */
 int
@@ -883,7 +883,7 @@ zpool_do_add(int argc, char **argv)
 		}
 	}
 
-	/* pass off to get_vdev_spec for processing */
+	/* pass off to make_root_vdev for processing */
 	nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun,
 	    argc, argv);
 	if (nvroot == NULL) {
@@ -1232,9 +1232,9 @@ zpool_do_labelclear(int argc, char **argv)
  *	-O	Set fsproperty=value in the pool's root file system
  *
  * Creates the named pool according to the given vdev specification.  The
- * bulk of the vdev processing is done in get_vdev_spec() in zpool_vdev.c.  Once
- * we get the nvlist back from get_vdev_spec(), we either print out the contents
- * (if '-n' was specified), or pass it to libzfs to do the creation.
+ * bulk of the vdev processing is done in make_root_vdev() in zpool_vdev.c.
+ * Once we get the nvlist back from make_root_vdev(), we either print out the
+ * contents (if '-n' was specified), or pass it to libzfs to do the creation.
  */
 int
 zpool_do_create(int argc, char **argv)
@@ -1388,7 +1388,7 @@ zpool_do_create(int argc, char **argv)
 		goto errout;
 	}
 
-	/* pass off to get_vdev_spec for bulk processing */
+	/* pass off to make_root_vdev for bulk processing */
 	nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun,
 	    argc - 1, argv + 1);
 	if (nvroot == NULL)

From e4a11acfac078b21f1b84c95d8ddb7a99306eb34 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Wed, 29 May 2019 07:31:39 +0900
Subject: [PATCH 005/109] Refactor parent dataset handling in libzfs
 zfs_rename()

For recursive renaming, simplify the code by moving `zhrp` and
`parentname` to inner scope. `zhrp` is only used to test existence
of a parent dataset for recursive dataset dir scan since ba6a24026c.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Laager <rlaager@wiktel.com>
Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@osnexus.com>
Closes #8815
---
 lib/libzfs/libzfs_dataset.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index e26b32786db5..93af50b99cdd 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -4470,8 +4470,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive,
 	zfs_cmd_t zc = {"\0"};
 	char *delim;
 	prop_changelist_t *cl = NULL;
-	zfs_handle_t *zhrp = NULL;
-	char *parentname = NULL;
 	char parent[ZFS_MAX_DATASET_NAME_LEN];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char errbuf[1024];
@@ -4566,7 +4564,8 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive,
 	}
 
 	if (recursive) {
-		parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name);
+		zfs_handle_t *zhrp;
+		char *parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name);
 		if (parentname == NULL) {
 			ret = -1;
 			goto error;
@@ -4574,10 +4573,12 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive,
 		delim = strchr(parentname, '@');
 		*delim = '\0';
 		zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET);
+		free(parentname);
 		if (zhrp == NULL) {
 			ret = -1;
 			goto error;
 		}
+		zfs_close(zhrp);
 	} else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) {
 		if ((cl = changelist_gather(zhp, ZFS_PROP_NAME,
 		    CL_GATHER_ITER_MOUNTED,
@@ -4650,12 +4651,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive,
 	}
 
 error:
-	if (parentname != NULL) {
-		free(parentname);
-	}
-	if (zhrp != NULL) {
-		zfs_close(zhrp);
-	}
 	if (cl != NULL) {
 		changelist_free(cl);
 	}

From 6ce10fdabb0c071b1cf5d7c21564c076d9882ec9 Mon Sep 17 00:00:00 2001
From: Josh Soref <jsoref@users.noreply.github.com>
Date: Tue, 28 May 2019 18:58:32 -0400
Subject: [PATCH 006/109] grammar: it is / plural agreement

Reviewed-by: Richard Laager <rlaager@wiktel.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Chris Dunlop <chris@onthe.net.au>
Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>
Closes #8818
---
 cmd/zfs/zfs_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index d75f089acd1f..214a437c5dd1 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -6733,8 +6733,8 @@ unshare_unmount_compare(const void *larg, const void *rarg, void *unused)
 
 /*
  * Convenience routine used by zfs_do_umount() and manual_unmount().  Given an
- * absolute path, find the entry /proc/self/mounts, verify that its a
- * ZFS filesystems, and unmount it appropriately.
+ * absolute path, find the entry /proc/self/mounts, verify that it's a
+ * ZFS filesystem, and unmount it appropriately.
  */
 static int
 unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)

From 328c95e391ed775ab781392ab57cb64200caa928 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Thu, 30 May 2019 08:18:14 +0900
Subject: [PATCH 007/109] Remove vn_set_fs_pwd()/vn_set_pwd() (no need to be at
 / during insmod)

Per suggestion from @behlendorf in #8777, remove vn_set_fs_pwd() and
vn_set_pwd() which are only used in zfs_ioctl.c:_init() while loading
zfs.ko.

The rest of the initialization functions called here after cwd is set
to / don't depend on the cwd of the process, except for
spa_config_load(). spa_config_load() uses a relative path
".//etc/zfs/zpool.cache" when `rootdir` is non-NULL, which resolves to
"/etc/zfs/zpool.cache" given that cwd is /. So just unconditionally use
the absolute path without "./", which allows the `vn_set_pwd("/")` call
as well as both functions to be removed entirely.
This is also what FreeBSD does.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@osnexus.com>
Closes #8826
---
 config/kernel-spinlock.m4 | 24 ----------------
 config/kernel.m4          |  1 -
 include/spl/sys/vnode.h   |  1 -
 module/spl/spl-vnode.c    | 58 ---------------------------------------
 module/zfs/spa_config.c   |  3 +-
 module/zfs/zfs_ioctl.c    |  7 -----
 6 files changed, 1 insertion(+), 93 deletions(-)
 delete mode 100644 config/kernel-spinlock.m4

diff --git a/config/kernel-spinlock.m4 b/config/kernel-spinlock.m4
deleted file mode 100644
index d6d6640070b5..000000000000
--- a/config/kernel-spinlock.m4
+++ /dev/null
@@ -1,24 +0,0 @@
-dnl #
-dnl # 2.6.36 API change,
-dnl # The 'struct fs_struct->lock' was changed from a rwlock_t to
-dnl # a spinlock_t to improve the fastpath performance.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_FS_STRUCT_SPINLOCK], [
-	AC_MSG_CHECKING([whether struct fs_struct uses spinlock_t])
-	tmp_flags="$EXTRA_KCFLAGS"
-	EXTRA_KCFLAGS="-Werror"
-	ZFS_LINUX_TRY_COMPILE([
-		#include <linux/sched.h>
-		#include <linux/fs_struct.h>
-	],[
-		static struct fs_struct fs;
-		spin_lock_init(&fs.lock);
-	],[
-		AC_MSG_RESULT(yes)
-		AC_DEFINE(HAVE_FS_STRUCT_SPINLOCK, 1,
-		          [struct fs_struct uses spinlock_t])
-	],[
-		AC_MSG_RESULT(no)
-	])
-	EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/config/kernel.m4 b/config/kernel.m4
index fbc04bdf7d70..8e89c8014d8a 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -12,7 +12,6 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
 	ZFS_AC_KERNEL_CTL_NAME
 	ZFS_AC_KERNEL_PDE_DATA
 	ZFS_AC_KERNEL_2ARGS_VFS_FSYNC
-	ZFS_AC_KERNEL_FS_STRUCT_SPINLOCK
 	ZFS_AC_KERNEL_KUIDGID_T
 	ZFS_AC_KERNEL_FALLOCATE
 	ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
diff --git a/include/spl/sys/vnode.h b/include/spl/sys/vnode.h
index 71278b08c867..7bd278e4e13b 100644
--- a/include/spl/sys/vnode.h
+++ b/include/spl/sys/vnode.h
@@ -182,7 +182,6 @@ extern int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag,
 extern file_t *vn_getf(int fd);
 extern void vn_releasef(int fd);
 extern void vn_areleasef(int fd, uf_info_t *fip);
-extern int vn_set_pwd(const char *filename);
 
 int spl_vn_init(void);
 void spl_vn_fini(void);
diff --git a/module/spl/spl-vnode.c b/module/spl/spl-vnode.c
index 11b5e4e5a2f2..d9056c964e5a 100644
--- a/module/spl/spl-vnode.c
+++ b/module/spl/spl-vnode.c
@@ -641,64 +641,6 @@ vn_areleasef(int fd, uf_info_t *fip)
 } /* releasef() */
 EXPORT_SYMBOL(areleasef);
 
-
-static void
-vn_set_fs_pwd(struct fs_struct *fs, struct path *path)
-{
-	struct path old_pwd;
-
-#ifdef HAVE_FS_STRUCT_SPINLOCK
-	spin_lock(&fs->lock);
-	old_pwd = fs->pwd;
-	fs->pwd = *path;
-	path_get(path);
-	spin_unlock(&fs->lock);
-#else
-	write_lock(&fs->lock);
-	old_pwd = fs->pwd;
-	fs->pwd = *path;
-	path_get(path);
-	write_unlock(&fs->lock);
-#endif /* HAVE_FS_STRUCT_SPINLOCK */
-
-	if (old_pwd.dentry)
-		path_put(&old_pwd);
-}
-
-int
-vn_set_pwd(const char *filename)
-{
-	struct path path;
-	mm_segment_t saved_fs;
-	int rc;
-
-	/*
-	 * user_path_dir() and __user_walk() both expect 'filename' to be
-	 * a user space address so we must briefly increase the data segment
-	 * size to ensure strncpy_from_user() does not fail with -EFAULT.
-	 */
-	saved_fs = get_fs();
-	set_fs(KERNEL_DS);
-
-	rc = user_path_dir(filename, &path);
-	if (rc)
-		goto out;
-
-	rc = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
-	if (rc)
-		goto dput_and_out;
-
-	vn_set_fs_pwd(current->fs, &path);
-
-dput_and_out:
-	path_put(&path);
-out:
-	set_fs(saved_fs);
-
-	return (-rc);
-} /* vn_set_pwd() */
-EXPORT_SYMBOL(vn_set_pwd);
-
 static int
 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
 {
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index 8616abda37bd..6c0894338e25 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -93,8 +93,7 @@ spa_config_load(void)
 	 */
 	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
-	(void) snprintf(pathname, MAXPATHLEN, "%s%s",
-	    (rootdir != NULL) ? "./" : "", spa_config_path);
+	(void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);
 
 	file = kobj_open_file(pathname);
 
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index f30d0a894414..c6b55d24f7ef 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -7380,13 +7380,6 @@ _init(void)
 {
 	int error;
 
-	error = -vn_set_pwd("/");
-	if (error) {
-		printk(KERN_NOTICE
-		    "ZFS: Warning unable to set pwd to '/': %d\n", error);
-		return (error);
-	}
-
 	if ((error = -zvol_init()) != 0)
 		return (error);
 

From fafe72712afbbedd9bcf6cd4b3d7b2b2f168b054 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Thu, 6 Jun 2019 06:18:46 +0900
Subject: [PATCH 008/109] Drop objid argument in zfs_znode_alloc() (sync with
 OpenZFS)

Since zfs_znode_alloc() already takes a dmu_buf_t *, taking another
uint64_t argument for objid is redundant. The inode's ->i_ino already
matches, and needs to match, the znode's ->z_id.

zfs_znode_alloc() in FreeBSD and illumos doesn't have this argument
since vnode doesn't have vnode# in VFS (hence ->z_id exists).

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@osnexus.com>
Closes #8841
---
 module/zfs/zfs_znode.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index a27129b7992b..3dd299942202 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -515,7 +515,7 @@ zfs_inode_update(znode_t *zp)
  */
 static znode_t *
 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
-    dmu_object_type_t obj_type, uint64_t obj, sa_handle_t *hdl)
+    dmu_object_type_t obj_type, sa_handle_t *hdl)
 {
 	znode_t	*zp;
 	struct inode *ip;
@@ -596,7 +596,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	ZFS_TIME_DECODE(&ip->i_mtime, mtime);
 	ZFS_TIME_DECODE(&ip->i_ctime, ctime);
 
-	ip->i_ino = obj;
+	ip->i_ino = zp->z_id;
 	zfs_inode_update(zp);
 	zfs_inode_set_ops(zfsvfs, ip);
 
@@ -910,8 +910,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 		 * not fail retry until sufficient memory has been reclaimed.
 		 */
 		do {
-			*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, obj,
-			    sa_hdl);
+			*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
 		} while (*zpp == NULL);
 
 		VERIFY(*zpp != NULL);
@@ -1134,7 +1133,7 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
 	 * bonus buffer.
 	 */
 	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
-	    doi.doi_bonus_type, obj_num, NULL);
+	    doi.doi_bonus_type, NULL);
 	if (zp == NULL) {
 		err = SET_ERROR(ENOENT);
 	} else {

From b63ed49c2996d3fe400ddd5e032a521cf05a7d10 Mon Sep 17 00:00:00 2001
From: Serapheim Dimitropoulos <serapheim@delphix.com>
Date: Thu, 6 Jun 2019 13:08:41 -0700
Subject: [PATCH 009/109] Reduced IOPS when all vdevs are in the
 zfs_mg_fragmentation_threshold

Historically while doing performance testing we've noticed that IOPS
can be significantly reduced when all vdevs in the pool are hitting
the zfs_mg_fragmentation_threshold percentage. Specifically in a
hypothetical pool with two vdevs, what can happen is the following:
Vdev A would go above that threshold and only vdev B would be used.
Then vdev B would pass that threshold but vdev A would go below it
(we've been freeing from A to allocate to B). The allocations would
go back and forth utilizing one vdev at a time with IOPS taking a hit.

Empirically, we've seen that our vdev selection for allocations is
good enough that fragmentation increases uniformly across all vdevs
the majority of the time. Thus we set the threshold percentage high
enough to avoid hitting the speed bump on pools that are being pushed
to the edge. We effectively disable its effect in the majority of
cases, but we don't remove it (at least for now) just in case we hit
any weird behavior in the future.

Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #8859
---
 man/man5/zfs-module-parameters.5 |  2 +-
 module/zfs/metaslab.c            | 25 ++++++++++++++++++++-----
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 5bca12e06ea2..282563f13723 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1817,7 +1817,7 @@ this value. If a metaslab group exceeds this threshold then it will be
 skipped unless all metaslab groups within the metaslab class have also
 crossed this threshold.
 .sp
-Default value: \fB85\fR.
+Default value: \fB95\fR.
 .RE
 
 .sp
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index ec89810b48ab..d1d5a243f403 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -103,12 +103,27 @@ int zfs_mg_noalloc_threshold = 0;
 
 /*
  * Metaslab groups are considered eligible for allocations if their
- * fragmenation metric (measured as a percentage) is less than or equal to
- * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
- * then it will be skipped unless all metaslab groups within the metaslab
- * class have also crossed this threshold.
+ * fragmenation metric (measured as a percentage) is less than or
+ * equal to zfs_mg_fragmentation_threshold. If a metaslab group
+ * exceeds this threshold then it will be skipped unless all metaslab
+ * groups within the metaslab class have also crossed this threshold.
+ *
+ * This tunable was introduced to avoid edge cases where we continue
+ * allocating from very fragmented disks in our pool while other, less
+ * fragmented disks, exists. On the other hand, if all disks in the
+ * pool are uniformly approaching the threshold, the threshold can
+ * be a speed bump in performance, where we keep switching the disks
+ * that we allocate from (e.g. we allocate some segments from disk A
+ * making it bypassing the threshold while freeing segments from disk
+ * B getting its fragmentation below the threshold).
+ *
+ * Empirically, we've seen that our vdev selection for allocations is
+ * good enough that fragmentation increases uniformly across all vdevs
+ * the majority of the time. Thus we set the threshold percentage high
+ * enough to avoid hitting the speed bump on pools that are being pushed
+ * to the edge.
  */
-int zfs_mg_fragmentation_threshold = 85;
+int zfs_mg_fragmentation_threshold = 95;
 
 /*
  * Allow metaslabs to keep their active state as long as their fragmentation

From 60cbc18136d8a5c389ec3e6f3da703f30b9687be Mon Sep 17 00:00:00 2001
From: Allan Jude <allanjude@freebsd.org>
Date: Thu, 6 Jun 2019 16:14:48 -0400
Subject: [PATCH 010/109] l2arc_apply_transforms: Fix typo in comment

Reviewed-by: Chris Dunlop <chris@onthe.net.au>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Richard Laager <rlaager@wiktel.com>
Signed-off-by: Allan Jude <allanjude@freebsd.org>
Closes #8822
---
 module/zfs/arc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 3dfa6ca202d1..946ea3415eda 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -8760,7 +8760,7 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
 
 	/*
 	 * If this data simply needs its own buffer, we simply allocate it
-	 * and copy the data. This may be done to elimiate a depedency on a
+	 * and copy the data. This may be done to eliminate a dependency on a
 	 * shared buffer or to reallocate the buffer to match asize.
 	 */
 	if (HDR_HAS_RABD(hdr) && asize != psize) {

From 06900c409ba9dd62ace0fec5aa0558ca4f115f18 Mon Sep 17 00:00:00 2001
From: Jorgen Lundman <lundman@lundman.net>
Date: Fri, 7 Jun 2019 11:01:41 +0900
Subject: [PATCH 011/109] Avoid updating zfs_gitrev.h when rev is unchanged

The build process would always recompile spa_history.c because
zfs_gitrev.h was rewritten on every build. Skip the rewrite when the
git revision is unchanged.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Chris Dunlop <chris@onthe.net.au>
Reviewed-by: Allan Jude <allanjude@freebsd.org>
Signed-off-by: Jorgen Lundman <lundman@lundman.net>
Closes #8860
---
 scripts/make_gitrev.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/make_gitrev.sh b/scripts/make_gitrev.sh
index bab9be88d734..1cf143794b26 100755
--- a/scripts/make_gitrev.sh
+++ b/scripts/make_gitrev.sh
@@ -39,3 +39,7 @@ trap cleanup EXIT
 git rev-parse --git-dir > /dev/null 2>&1
 # Get the git current git revision
 ZFS_GIT_REV=$(git describe --always --long --dirty 2>/dev/null)
+# Check if the header file already contains the exact revision string
+grep -sq "\"${ZFS_GIT_REV}\"" "$(dirname "$0")"/../include/zfs_gitrev.h &&
+	trap - EXIT
+exit 0

From 6f7bc7582539048c2280b7d7892a06e4c7f917f8 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <pcd@delphix.com>
Date: Thu, 6 Jun 2019 19:10:43 -0700
Subject: [PATCH 012/109] Allow metaslab to be unloaded even when not freed
 from

On large systems, the memory used by loaded metaslabs can become
a concern. While range trees are a fairly efficient data structure,
on heavily fragmented pools they can still consume a significant
amount of memory. This problem is amplified when we fail to unload
metaslabs that we aren't using. Currently, we only unload a metaslab
during metaslab_sync_done; in order for that function to be called
on a given metaslab in a given txg, we have to have dirtied that
metaslab in that txg. If the dirtying was the result of an allocation,
we wouldn't be unloading it (since it wouldn't be 8 txgs since it
was selected), so in effect we only unload a metaslab during txgs
where it's being freed from.

We move the unload logic from sync_done to a new function, and
call that function on all metaslabs in a given vdev during
vdev_sync_done().

Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #8837
---
 include/sys/metaslab.h |  1 +
 module/zfs/metaslab.c  | 47 ++++++++++++++++++++++--------------------
 module/zfs/vdev.c      | 14 +++++++++++++
 3 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 2790d06c71d2..330902529664 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -50,6 +50,7 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
 void metaslab_fini(metaslab_t *);
 
 int metaslab_load(metaslab_t *);
+void metaslab_potentially_unload(metaslab_t *, uint64_t);
 void metaslab_unload(metaslab_t *);
 
 uint64_t metaslab_allocated_space(metaslab_t *);
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index d1d5a243f403..41cbaad5f8df 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  */
@@ -2949,6 +2949,30 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 	dmu_tx_commit(tx);
 }
 
+void
+metaslab_potentially_unload(metaslab_t *msp, uint64_t txg)
+{
+	/*
+	 * If the metaslab is loaded and we've not tried to load or allocate
+	 * from it in 'metaslab_unload_delay' txgs, then unload it.
+	 */
+	if (msp->ms_loaded &&
+	    msp->ms_disabled == 0 &&
+	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
+		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+			VERIFY0(range_tree_space(
+			    msp->ms_allocating[(txg + t) & TXG_MASK]));
+		}
+		if (msp->ms_allocator != -1) {
+			metaslab_passivate(msp, msp->ms_weight &
+			    ~METASLAB_ACTIVE_MASK);
+		}
+
+		if (!metaslab_debug_unload)
+			metaslab_unload(msp);
+	}
+}
+
 /*
  * Called after a transaction group has completely synced to mark
  * all of the metaslab's free space as usable.
@@ -3086,27 +3110,6 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	 */
 	metaslab_recalculate_weight_and_sort(msp);
 
-	/*
-	 * If the metaslab is loaded and we've not tried to load or allocate
-	 * from it in 'metaslab_unload_delay' txgs, then unload it.
-	 */
-	if (msp->ms_loaded &&
-	    msp->ms_disabled == 0 &&
-	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
-
-		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
-			VERIFY0(range_tree_space(
-			    msp->ms_allocating[(txg + t) & TXG_MASK]));
-		}
-		if (msp->ms_allocator != -1) {
-			metaslab_passivate(msp, msp->ms_weight &
-			    ~METASLAB_ACTIVE_MASK);
-		}
-
-		if (!metaslab_debug_unload)
-			metaslab_unload(msp);
-	}
-
 	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_freeing));
 	ASSERT0(range_tree_space(msp->ms_freed));
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 1c4812cd86d9..81ef87e254a8 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -3234,6 +3234,20 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
 	    != NULL)
 		metaslab_sync_done(msp, txg);
 
+	/*
+	 * Because this function is only called on dirty vdevs, it's possible
+	 * we won't consider all metaslabs for unloading on every
+	 * txg. However, unless the system is largely idle it is likely that
+	 * we will dirty all vdevs within a few txgs.
+	 */
+	for (int i = 0; i < vd->vdev_ms_count; i++) {
+		msp = vd->vdev_ms[i];
+		mutex_enter(&msp->ms_lock);
+		if (msp->ms_sm != NULL)
+			metaslab_potentially_unload(msp, txg);
+		mutex_exit(&msp->ms_lock);
+	}
+
 	if (reassess)
 		metaslab_sync_reassess(vd->vdev_mg);
 }

From c350e62309edc413f9f2312338e5a0b084ebeb8d Mon Sep 17 00:00:00 2001
From: Richard Elling <Richard.Elling@RichardElling.com>
Date: Wed, 5 Jun 2019 16:13:57 -0700
Subject: [PATCH 013/109] Fix logic error in setpartition function

Reviewed by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Richard Elling <Richard.Elling@RichardElling.com>
Closes #8839
---
 tests/zfs-tests/include/libtest.shlib | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index 57d0880cc9bb..b3893c2c3812 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -861,7 +861,8 @@ function zero_partitions #<whole_disk_name>
 # best to retire this interface and replace it with something more flexible.
 # At the moment a best effort is made.
 #
-function set_partition #<slice_num> <slice_start> <size_plus_units>  <whole_disk_name>
+# arguments: <slice_num> <slice_start> <size_plus_units>  <whole_disk_name>
+function set_partition
 {
 	typeset -i slicenum=$1
 	typeset start=$2
@@ -872,6 +873,7 @@ function set_partition #<slice_num> <slice_start> <size_plus_units>  <whole_disk
 		if [[ -z $size || -z $disk ]]; then
 			log_fail "The size or disk name is unspecified."
 		fi
+		[[ -n $DEV_DSKDIR ]] && disk=$DEV_DSKDIR/$disk
 		typeset size_mb=${size%%[mMgG]}
 
 		size_mb=${size_mb%%[mMgG][bB]}
@@ -881,10 +883,10 @@ function set_partition #<slice_num> <slice_start> <size_plus_units>  <whole_disk
 
 		# Create GPT partition table when setting slice 0 or
 		# when the device doesn't already contain a GPT label.
-		parted $DEV_DSKDIR/$disk -s -- print 1 >/dev/null
+		parted $disk -s -- print 1 >/dev/null
 		typeset ret_val=$?
 		if [[ $slicenum -eq 0 || $ret_val -ne 0 ]]; then
-			parted $DEV_DSKDIR/$disk -s -- mklabel gpt
+			parted $disk -s -- mklabel gpt
 			if [[ $? -ne 0 ]]; then
 				log_note "Failed to create GPT partition table on $disk"
 				return 1
@@ -899,20 +901,21 @@ function set_partition #<slice_num> <slice_start> <size_plus_units>  <whole_disk
 		# Determine the cylinder size for the device and using
 		# that calculate the end offset in cylinders.
 		typeset -i cly_size_kb=0
-		cly_size_kb=$(parted -m $DEV_DSKDIR/$disk -s -- \
+		cly_size_kb=$(parted -m $disk -s -- \
 			unit cyl print | head -3 | tail -1 | \
 			awk -F '[:k.]' '{print $4}')
 		((end = (size_mb * 1024 / cly_size_kb) + start))
 
-		parted $DEV_DSKDIR/$disk -s -- \
+		parted $disk -s -- \
 		    mkpart part$slicenum ${start}cyl ${end}cyl
-		if [[ $? -ne 0 ]]; then
+		typeset ret_val=$?
+		if [[ $ret_val -ne 0 ]]; then
 			log_note "Failed to create partition $slicenum on $disk"
 			return 1
 		fi
 
-		blockdev --rereadpt $DEV_DSKDIR/$disk 2>/dev/null
-		block_device_wait
+		blockdev --rereadpt $disk 2>/dev/null
+		block_device_wait $disk
 	else
 		if [[ -z $slicenum || -z $size || -z $disk ]]; then
 			log_fail "The slice, size or disk name is unspecified."
@@ -932,9 +935,10 @@ function set_partition #<slice_num> <slice_start> <size_plus_units>  <whole_disk
 		echo "q" >> $format_file
 
 		format -e -s -d $disk -f $format_file
+		typeset ret_val=$?
+		rm -f $format_file
 	fi
 
-	typeset ret_val=$?
 	rm -f $format_file
 	if [[ $ret_val -ne 0 ]]; then
 		log_note "Unable to format $disk slice $slicenum to $size"

From a22b00f92480b7341859266176b23c4a801e462b Mon Sep 17 00:00:00 2001
From: Richard Elling <Richard.Elling@RichardElling.com>
Date: Wed, 5 Jun 2019 16:22:04 -0700
Subject: [PATCH 014/109] Remove redundant redundant remove

Reviewed by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Richard Elling <Richard.Elling@RichardElling.com>
Closes #8839
---
 tests/zfs-tests/include/libtest.shlib | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index b3893c2c3812..1b841d7ba02c 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -939,7 +939,6 @@ function set_partition
 		rm -f $format_file
 	fi
 
-	rm -f $format_file
 	if [[ $ret_val -ne 0 ]]; then
 		log_note "Unable to format $disk slice $slicenum to $size"
 		return 1

From fb52bf9b1daf237e23e49a6ba43eb9d3e300f758 Mon Sep 17 00:00:00 2001
From: Richard Elling <Richard.Elling@RichardElling.com>
Date: Fri, 7 Jun 2019 10:12:42 -0700
Subject: [PATCH 015/109] Block_device_wait does not return an error code

Reviewed by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Richard Elling <Richard.Elling@RichardElling.com>
Closes #8839
---
 tests/zfs-tests/include/blkdev.shlib                          | 3 +++
 .../tests/functional/rsend/send-wDR_encrypted_zvol.ksh        | 4 ++--
 tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh  | 4 ++--
 .../zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh  | 2 +-
 .../zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh  | 4 ++--
 5 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/tests/zfs-tests/include/blkdev.shlib b/tests/zfs-tests/include/blkdev.shlib
index 9cac7184f9fc..e9d584af4b6a 100644
--- a/tests/zfs-tests/include/blkdev.shlib
+++ b/tests/zfs-tests/include/blkdev.shlib
@@ -56,6 +56,9 @@ function scan_scsi_hosts
 #
 # Wait for newly created block devices to have their minors created.
 #
+# Note: there is no meaningful return code if udevadm fails. Consumers
+# should not expect a return code (do not call as argument to log_must)
+#
 function block_device_wait
 {
 	if is_linux; then
diff --git a/tests/zfs-tests/tests/functional/rsend/send-wDR_encrypted_zvol.ksh b/tests/zfs-tests/tests/functional/rsend/send-wDR_encrypted_zvol.ksh
index 49b846e9c332..443887bfa238 100755
--- a/tests/zfs-tests/tests/functional/rsend/send-wDR_encrypted_zvol.ksh
+++ b/tests/zfs-tests/tests/functional/rsend/send-wDR_encrypted_zvol.ksh
@@ -62,7 +62,7 @@ log_must eval "echo 'password' > $keyfile"
 
 log_must zfs create -o dedup=on -o encryption=on -o keyformat=passphrase \
 	-o keylocation=file://$keyfile -V 128M $TESTPOOL/$TESTVOL
-log_must block_device_wait
+block_device_wait
 
 log_must eval "echo 'y' | newfs -t ext4 -v $zdev"
 log_must mkdir -p $mntpnt
@@ -82,7 +82,7 @@ done
 log_must eval "zfs send -wDR $TESTPOOL/$TESTVOL@snap$snap_count > $sendfile"
 log_must eval "zfs recv $TESTPOOL/recv < $sendfile"
 log_must zfs load-key $TESTPOOL/recv
-log_must block_device_wait
+block_device_wait
 
 log_must mount $recvdev $recvmnt
 
diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh
index 2cdcb38dc257..c8a3cbbf43c4 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh
@@ -86,7 +86,7 @@ log_must zfs create -V 128M $TESTPOOL/$TESTVOL
 log_must zfs set compression=on $TESTPOOL/$TESTVOL
 log_must zfs set sync=always $TESTPOOL/$TESTVOL
 log_must mkdir -p $TESTDIR
-log_must block_device_wait
+block_device_wait
 echo "y" | newfs -t ext4 -v $VOLUME
 log_must mkdir -p $MNTPNT
 log_must mount -o discard $VOLUME $MNTPNT
@@ -149,7 +149,7 @@ log_must zpool export $TESTPOOL
 # `zpool import -f` because we can't write a frozen pool's labels!
 #
 log_must zpool import -f $TESTPOOL
-log_must block_device_wait
+block_device_wait
 log_must mount $VOLUME $MNTPNT
 
 #
diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh
index 6607d4ca4974..1ee7e33c2ac2 100755
--- a/tests/zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh
+++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh
@@ -88,7 +88,7 @@ else
 fi
 
 log_must zfs snapshot -r $snappool
-log_must block_device_wait
+block_device_wait
 
 #verify the snapshot -r results
 for snap in $snappool $snapfs $snapvol $snapctr $snapctrvol \
diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh
index 0f876ad6d61e..128b443c6fc9 100755
--- a/tests/zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh
+++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh
@@ -83,7 +83,7 @@ else
 fi
 
 log_must zfs snapshot -r $snappool
-log_must block_device_wait
+block_device_wait
 
 #select the $TESTCTR as destroy point, $TESTCTR is a child of $TESTPOOL
 log_must zfs destroy -r $snapctr
@@ -92,7 +92,7 @@ for snap in $snapctr $snapctrvol $snapctrclone $snapctrfs; do
 		log_fail "The snapshot $snap is not destroyed correctly."
 done
 
-for snap in $snappool $snapfs $snapvol $ctrfs@$TESTSNAP1;do
+for snap in $snappool $snapfs $snapvol $ctrfs@$TESTSNAP1; do
 	! snapexists $snap && \
 		log_fail "The snapshot $snap should be not destroyed."
 done

From 4be4dedb9f50edb35b18db4eef5c277bd93d23fa Mon Sep 17 00:00:00 2001
From: Richard Elling <Richard.Elling@RichardElling.com>
Date: Thu, 30 May 2019 16:38:51 -0700
Subject: [PATCH 016/109] Improve ZTS block_device_wait debugging

The udevadm settle timeout can be 120 or 180 seconds by default
for some distributions. If a long delay is experienced, it could
be due to some strangeness in a malfunctioning device that isn't
related to the devices under test. To help debug this condition,
a notice is given if settle takes too long.

Arguments can now be passed to block_device_wait. The expected
arguments are block device pathnames.

Reviewed by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Richard Elling <Richard.Elling@RichardElling.com>
Closes #8839
---
 tests/zfs-tests/include/blkdev.shlib | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tests/zfs-tests/include/blkdev.shlib b/tests/zfs-tests/include/blkdev.shlib
index e9d584af4b6a..ca8807e82c6a 100644
--- a/tests/zfs-tests/include/blkdev.shlib
+++ b/tests/zfs-tests/include/blkdev.shlib
@@ -18,6 +18,7 @@
 # Copyright (c) 2017 Lawrence Livermore National Security, LLC.
 # Copyright (c) 2017 Datto Inc.
 # Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+# Copyright 2019 Richard Elling
 #
 
 #
@@ -55,6 +56,16 @@ function scan_scsi_hosts
 
 #
 # Wait for newly created block devices to have their minors created.
+# Additional arguments are passed through to udevadm trigger; they are
+# typically expected to be block device pathnames. This is useful when
+# waiting on a specific device to settle rather than triggering all
+# devices and waiting for them all to settle.
+#
+# The udevadm settle timeout can be 120 or 180 seconds by default for
+# some distros. If a long delay is experienced, it could be due to some
+# strangeness in a malfunctioning device that isn't related to the devices
+# under test. To help debug this condition, a notice is given if settle takes
+# too long.
 #
 # Note: there is no meaningful return code if udevadm fails. Consumers
 # should not expect a return code (do not call as argument to log_must)
@@ -62,8 +73,12 @@ function scan_scsi_hosts
 function block_device_wait
 {
 	if is_linux; then
-		udevadm trigger
+		udevadm trigger "$@"
+		typeset start=$SECONDS
 		udevadm settle
+		typeset elapsed=$((SECONDS - start))	# compared numerically
+		((elapsed > 60)) && \
+		    log_note udevadm settle time too long: $elapsed
 	fi
 }
 

From fe11968bbfb6bd825790a51228483f51b3d30d1f Mon Sep 17 00:00:00 2001
From: bnjf <bnjf@users.noreply.github.com>
Date: Thu, 13 Jun 2019 06:03:33 +1000
Subject: [PATCH 017/109] Fix typo in vdev_raidz_math.c

Fix typo in vdev_raidz_math.c

Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Brad Forschinger <github@bnjf.id.au>
Closes #8875
Closes #8880
---
 module/zfs/vdev_raidz_math.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c
index e6112bc02137..3ef67768f916 100644
--- a/module/zfs/vdev_raidz_math.c
+++ b/module/zfs/vdev_raidz_math.c
@@ -472,7 +472,7 @@ vdev_raidz_math_init(void)
 	return;
 #endif
 
-	/* Fake an zio and run the benchmark on a warmed up buffer */
+	/* Fake a zio and run the benchmark on a warmed up buffer */
 	bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
 	bench_zio->io_offset = 0;
 	bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */

From 812c36fc711b5f1dc7b41f27761b5e283f16df19 Mon Sep 17 00:00:00 2001
From: Matthew Ahrens <mahrens@delphix.com>
Date: Wed, 12 Jun 2019 13:06:55 -0700
Subject: [PATCH 018/109] Target ARC size can get reduced to arc_c_min

Sometimes the target ARC size is reduced to arc_c_min, which impacts
performance.  We've seen this happen as part of the random_reads
performance regression test, where the ARC size is reduced before the
reads test starts, which impacts how long it takes for the system to reach
good IOPS performance.

We call arc_reduce_target_size when arc_reap_cb_check() returns TRUE,
and arc_available_memory() is less than arc_c>>arc_shrink_shift.

However, arc_available_memory() could easily be low, even when arc_c is
low, because we can have tons of unused bufs in the abd kmem cache. This
would be especially true just after the DMU requests a bunch of stuff be
evicted from the ARC (e.g. due to "zpool export").

To fix this, the ARC should reduce arc_c by the requested amount, not
all the way down to arc_size (or arc_c_min), which can be very small.

Reviewed-by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59431
Closes #8864
---
 module/zfs/arc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 946ea3415eda..a7e7d26996f8 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -4801,8 +4801,6 @@ arc_reduce_target_size(int64_t to_free)
 	if (c > to_free && c - to_free > arc_c_min) {
 		arc_c = c - to_free;
 		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
-		if (asize < arc_c)
-			arc_c = MAX(asize, arc_c_min);
 		if (arc_p > arc_c)
 			arc_p = (arc_c >> 1);
 		ASSERT(arc_c >= arc_c_min);

From 516a08ebb4e24e09fc9ec39a7204d2f9d20d043d Mon Sep 17 00:00:00 2001
From: Matthew Ahrens <mahrens@delphix.com>
Date: Wed, 12 Jun 2019 13:13:09 -0700
Subject: [PATCH 019/109] fat zap should prefetch when iterating

When iterating over a ZAP object, we're almost always certain to iterate
over the entire object. If there are multiple leaf blocks, we can
realize a performance win by issuing reads for all the leaf blocks in
parallel when the iteration begins.

For example, if we have 10,000 snapshots, "zfs destroy -nv
pool/fs@1%9999" can take 30 minutes when the cache is cold. This change
provides a >3x performance improvement, by issuing the reads for all ~64
blocks of each ZAP object in parallel.

Reviewed-by: Andreas Dilger <andreas.dilger@whamcloud.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-58347
Closes #8862
---
 include/sys/zap.h                |  7 ++--
 man/man5/zfs-module-parameters.5 | 25 ++++++++++++++
 module/zfs/ddt_zap.c             | 14 +++++++-
 module/zfs/dmu.c                 | 16 +++++++++
 module/zfs/zap.c                 | 56 +++++++++++++++++++++++++++++++-
 module/zfs/zap_micro.c           | 31 +++++++++++++++---
 6 files changed, 140 insertions(+), 9 deletions(-)

diff --git a/include/sys/zap.h b/include/sys/zap.h
index ab13652d8c07..b19b4643879c 100644
--- a/include/sys/zap.h
+++ b/include/sys/zap.h
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  */
 
@@ -350,6 +350,7 @@ typedef struct zap_cursor {
 	uint64_t zc_serialized;
 	uint64_t zc_hash;
 	uint32_t zc_cd;
+	boolean_t zc_prefetch;
 } zap_cursor_t;
 
 typedef struct {
@@ -375,7 +376,9 @@ typedef struct {
  * Initialize a zap cursor, pointing to the "first" attribute of the
  * zapobj.  You must _fini the cursor when you are done with it.
  */
-void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
+void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);
+void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
+    uint64_t zapobj);
 void zap_cursor_fini(zap_cursor_t *zc);
 
 /*
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 282563f13723..29374a9d3965 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -104,6 +104,18 @@ to a log2 fraction of the target arc size.
 Default value: \fB6\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBdmu_prefetch_max\fR (int)
+.ad
+.RS 12n
+Limit the amount of data that a single prefetch call can request (in bytes).
+This helps to limit the amount of memory that can be used by prefetching.
+.sp
+Default value: \fB134,217,728\fR (128MB).
+.RE
+
 .sp
 .ne 2
 .na
@@ -502,6 +514,19 @@ regular reads (but there's no reason it has to be the same).
 Default value: \fB32,768\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzap_iterate_prefetch\fR (int)
+.ad
+.RS 12n
+If this is set, when we start iterating over a ZAP object, zfs will prefetch
+the entire object (all leaf blocks).  However, this is limited by
+\fBdmu_prefetch_max\fR.
+.sp
+Use \fB1\fR for on (default) and \fB0\fR for off.
+.RE
+
 .sp
 .ne 2
 .na
diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c
index 77c0784cca0b..3489d31d9c9e 100644
--- a/module/zfs/ddt_zap.c
+++ b/module/zfs/ddt_zap.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -117,7 +118,18 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
 	zap_attribute_t za;
 	int error;
 
-	zap_cursor_init_serialized(&zc, os, object, *walk);
+	if (*walk == 0) {
+		/*
+		 * We don't want to prefetch the entire ZAP object, because
+		 * it can be enormous.  Also the primary use of DDT iteration
+		 * is for scrubbing, in which case we will be issuing many
+		 * scrub I/Os for each ZAP block that we read in, so
+		 * reading the ZAP is unlikely to be the bottleneck.
+		 */
+		zap_cursor_init_noprefetch(&zc, os, object);
+	} else {
+		zap_cursor_init_serialized(&zc, os, object, *walk);
+	}
 	if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
 		uchar_t cbuf[sizeof (dde->dde_phys) + 1];
 		uint64_t csize = za.za_num_integers;
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 2d6740576bb6..b4131d91781a 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -81,6 +81,13 @@ int zfs_dmu_offset_next_sync = 0;
  */
 int zfs_object_remap_one_indirect_delay_ms = 0;
 
+/*
+ * Limit the amount we can prefetch with one call to this amount.  This
+ * helps to limit the amount of memory that can be used by prefetching.
+ * Larger objects should be prefetched a bit at a time.
+ */
+int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
+
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "unallocated"		},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "object directory"	},
@@ -667,6 +674,11 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
 		return;
 	}
 
+	/*
+	 * See comment before the definition of dmu_prefetch_max.
+	 */
+	len = MIN(len, dmu_prefetch_max);
+
 	/*
 	 * XXX - Note, if the dnode for the requested object is not
 	 * already cached, we will do a *synchronous* read in the
@@ -2629,6 +2641,10 @@ module_param(zfs_dmu_offset_next_sync, int, 0644);
 MODULE_PARM_DESC(zfs_dmu_offset_next_sync,
 	"Enable forcing txg sync to find holes");
 
+module_param(dmu_prefetch_max, int, 0644);
+MODULE_PARM_DESC(dmu_prefetch_max,
+	"Limit one prefetch call to this size");
+
 /* END CSTYLED */
 
 #endif
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index 6d8c498042c9..30f62ac43b62 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
@@ -49,6 +49,36 @@
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 
+/*
+ * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
+ * (all leaf blocks) when we start iterating over it.
+ *
+ * For zap_cursor_init(), the callers all intend to iterate through all the
+ * entries.  There are a few cases where an error (typically i/o error) could
+ * cause it to bail out early.
+ *
+ * For zap_cursor_init_serialized(), there are callers that do the iteration
+ * outside of ZFS.  Typically they would iterate over everything, but we
+ * don't have control of that.  E.g. zfs_ioc_snapshot_list_next(),
+ * zcp_snapshots_iter(), and other iterators over things in the MOS - these
+ * are called by /sbin/zfs and channel programs.  The other example is
+ * zfs_readdir() which iterates over directory entries for the getdents()
+ * syscall.  /sbin/ls iterates to the end (unless it receives a signal), but
+ * userland doesn't have to.
+ *
+ * Given that the ZAP entries aren't returned in a specific order, the only
+ * legitimate use cases for partial iteration would be:
+ *
+ * 1. Pagination (e.g. you only want to display 100 entries at a time, so you
+ *    get the first 100 and then wait for the user to hit "next page", which
+ *    they may never do).
+ *
+ * 2. You want to know if there are more than X entries, without relying on
+ *    the zfs-specific implementation of the directory's st_size (which is
+ *    the number of entries).
+ */
+int zap_iterate_prefetch = B_TRUE;
+
 int fzap_default_block_shift = 14; /* 16k blocksize */
 
 extern inline zap_phys_t *zap_f_phys(zap_t *zap);
@@ -1189,6 +1219,21 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
 	/* retrieve the next entry at or after zc_hash/zc_cd */
 	/* if no entry, return ENOENT */
 
+	/*
+	 * If we are reading from the beginning, we're almost certain to
+	 * iterate over the entire ZAP object.  If there are multiple leaf
+	 * blocks (freeblk > 2), prefetch the whole object (up to
+	 * dmu_prefetch_max bytes), so that we read the leaf blocks
+	 * concurrently. (Unless noprefetch was requested via
+	 * zap_cursor_init_noprefetch()).
+	 */
+	if (zc->zc_hash == 0 && zap_iterate_prefetch &&
+	    zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
+		dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
+		    zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
+		    ZIO_PRIORITY_ASYNC_READ);
+	}
+
 	if (zc->zc_leaf &&
 	    (ZAP_HASH_IDX(zc->zc_hash,
 	    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
@@ -1333,3 +1378,12 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
 		}
 	}
 }
+
+#if defined(_KERNEL)
+/* BEGIN CSTYLED */
+module_param(zap_iterate_prefetch, int, 0644);
+MODULE_PARM_DESC(zap_iterate_prefetch,
+	"When iterating ZAP object, prefetch it");
+
+/* END CSTYLED */
+#endif
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index fa369f797548..467812ff637c 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  */
@@ -1472,9 +1472,9 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
  * Routines for iterating over the attributes.
  */
 
-void
-zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
-    uint64_t serialized)
+static void
+zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+    uint64_t serialized, boolean_t prefetch)
 {
 	zc->zc_objset = os;
 	zc->zc_zap = NULL;
@@ -1483,12 +1483,33 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
 	zc->zc_serialized = serialized;
 	zc->zc_hash = 0;
 	zc->zc_cd = 0;
+	zc->zc_prefetch = prefetch;
+}
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+    uint64_t serialized)
+{
+	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
 }
 
+/*
+ * Initialize a cursor at the beginning of the ZAP object.  The entire
+ * ZAP object will be prefetched.
+ */
 void
 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
 {
-	zap_cursor_init_serialized(zc, os, zapobj, 0);
+	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
+}
+
+/*
+ * Initialize a cursor at the beginning, but request that we not prefetch
+ * the entire ZAP object.
+ */
+void
+zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
 }
 
 void

From 4f809bddc67b152afd9e9a52a01d1af132151a9f Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Thu, 13 Jun 2019 09:15:06 +0900
Subject: [PATCH 020/109] Fix lockdep warning on insmod

sysfs_attr_init() is required to make lockdep happy for dynamically
allocated sysfs attributes. This fixed #8868 on Fedora 29 running
kernel-debug.

This requirement was introduced in 2.6.34.
See include/linux/sysfs.h for what it actually does.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #8868
Closes #8884
---
 module/zfs/zfs_sysfs.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/module/zfs/zfs_sysfs.c b/module/zfs/zfs_sysfs.c
index 30b5edb01e18..2f5bea9aa996 100644
--- a/module/zfs/zfs_sysfs.c
+++ b/module/zfs/zfs_sysfs.c
@@ -144,6 +144,10 @@ zfs_kobj_release(struct kobject *kobj)
 	zkobj->zko_attr_count = 0;
 }
 
+#ifndef sysfs_attr_init
+#define	sysfs_attr_init(attr) do {} while (0)
+#endif
+
 static void
 zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name)
 {
@@ -154,6 +158,7 @@ zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name)
 	zkobj->zko_attr_list[attr_num].name = attr_name;
 	zkobj->zko_attr_list[attr_num].mode = 0444;
 	zkobj->zko_default_attrs[attr_num] = &zkobj->zko_attr_list[attr_num];
+	sysfs_attr_init(&zkobj->zko_attr_list[attr_num]);
 }
 
 static int

From 77e64c6fffa2af6c3b8aeb8b486873a3fca91e53 Mon Sep 17 00:00:00 2001
From: Matthew Ahrens <mahrens@delphix.com>
Date: Thu, 13 Jun 2019 08:48:43 -0700
Subject: [PATCH 021/109] ztest: dmu_tx_assign() gets ENOSPC in
 spa_vdev_remove_thread()

When running zloop, we occasionally see the following crash:

    dmu_tx_assign(tx, TXG_WAIT) == 0 (0x1c == 0)
    ASSERT at ../../module/zfs/vdev_removal.c:1507:spa_vdev_remove_thread()/sbin/ztest(+0x89c3)[0x55faf567b9c3]

The error value 0x1c is ENOSPC.

The transaction used by spa_vdev_remove_thread() should not be able to
fail due to being out of space, i.e. we should not call
dmu_tx_hold_space().  This will allow the removal thread to schedule its
work even when the pool is low on space.  The "slop space" will provide
enough free space to sync out the txg.

Reviewed-by: Igor Kozhukhov <igor@dilos.org>
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-37853
Closes #8889
---
 module/zfs/vdev_removal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index f2d18d9257bd..536a982eca2b 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -1498,7 +1498,7 @@ spa_vdev_remove_thread(void *arg)
 
 			dmu_tx_t *tx =
 			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
-			dmu_tx_hold_space(tx, SPA_MAXBLOCKSIZE);
+
 			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 			uint64_t txg = dmu_tx_get_txg(tx);
 

From 19cebf05187d60605ae38ddef9cdf7b10a51deba Mon Sep 17 00:00:00 2001
From: Tulsi Jain <TulsiJain@users.noreply.github.com>
Date: Thu, 13 Jun 2019 08:56:15 -0700
Subject: [PATCH 022/109] Restrict filesystem creation if name referred either
 '.' or '..'

This change rejects filesystem creation if any component of the given
name is either '.' or '..'.

Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: TulsiJain <tulsi.jain@delphix.com>
Closes #8842
Closes #8564
---
 include/zfs_namecheck.h                       |  2 ++
 lib/libzfs/libzfs_dataset.c                   | 10 +++++++++
 module/zcommon/zfs_namecheck.c                | 21 +++++++++++++++++++
 .../zfs_create/zfs_create_009_neg.ksh         |  4 +++-
 4 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/include/zfs_namecheck.h b/include/zfs_namecheck.h
index 527db92b0cfa..56d3d36f026e 100644
--- a/include/zfs_namecheck.h
+++ b/include/zfs_namecheck.h
@@ -43,6 +43,8 @@ typedef enum {
 	NAME_ERR_RESERVED,		/* entire name is reserved */
 	NAME_ERR_DISKLIKE,		/* reserved disk name (c[0-9].*) */
 	NAME_ERR_TOOLONG,		/* name is too long */
+	NAME_ERR_SELF_REF,		/* reserved self path name ('.') */
+	NAME_ERR_PARENT_REF,		/* reserved parent path name ('..') */
 	NAME_ERR_NO_AT,			/* permission set is missing '@' */
 } namecheck_err_t;
 
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index 93af50b99cdd..3be205f1f437 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -197,6 +197,16 @@ zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
 				    "reserved disk name"));
 				break;
 
+			case NAME_ERR_SELF_REF:
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "self reference, '.' is found in name"));
+				break;
+
+			case NAME_ERR_PARENT_REF:
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "parent reference, '..' is found in name"));
+				break;
+
 			default:
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "(%d) not defined"), why);
diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c
index 58b23b0e00b0..b1e0de6d8181 100644
--- a/module/zcommon/zfs_namecheck.c
+++ b/module/zcommon/zfs_namecheck.c
@@ -232,6 +232,27 @@ entity_namecheck(const char *path, namecheck_err_t *why, char *what)
 			}
 		}
 
+		if (*end == '\0' || *end == '/') {
+			int component_length = end - start;
+			/* Validate that this component is not '.' */
+			if (component_length == 1) {
+				if (start[0] == '.') {
+					if (why)
+						*why = NAME_ERR_SELF_REF;
+					return (-1);
+				}
+			}
+
+			/* Validate that this component is not '..' */
+			if (component_length == 2) {
+				if (start[0] == '.' && start[1] == '.') {
+					if (why)
+						*why = NAME_ERR_PARENT_REF;
+					return (-1);
+				}
+			}
+		}
+
 		/* Snapshot or bookmark delimiter found */
 		if (*end == '@' || *end == '#') {
 			/* Multiple delimiters are not allowed */
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh
index b8190626c7b3..63f5e595ea38 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh
@@ -90,7 +90,9 @@ set -A args  "$TESTPOOL/" "$TESTPOOL//blah" "$TESTPOOL/@blah" \
 	"$TESTPOOL/blah*blah" "$TESTPOOL/blah blah" \
 	"-s $TESTPOOL/$TESTFS1" "-b 1092 $TESTPOOL/$TESTFS1" \
 	"-b 64k $TESTPOOL/$TESTFS1" "-s -b 32k $TESTPOOL/$TESTFS1" \
-	"$TESTPOOL/$BYND_MAX_NAME" "$TESTPOOL/$BYND_NEST_LIMIT"
+	"$TESTPOOL/$BYND_MAX_NAME" "$TESTPOOL/$BYND_NEST_LIMIT" \
+	"$TESTPOOL/." "$TESTPOOL/.." "$TESTPOOL/../blah" "$TESTPOOL/./blah" \
+	"$TESTPOOL/blah/./blah" "$TESTPOOL/blah/../blah"
 
 log_assert "Verify 'zfs create <filesystem>' fails with bad <filesystem> argument."
 

From cab7d856ea619db0d5d17e0a17fedac273f9945d Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 13 Jun 2019 16:08:24 -0400
Subject: [PATCH 023/109] Move write aggregation memory copy out of vq_lock

Memory copy is too heavy an operation to do under a congested lock.
Moving it out reduces the congestion many times over, making it almost
invisible.  Since the original zio has been removed from the queue, and
the child zio has not been executed yet, I don't see why the copy would
need protection.  My guess is that it just remained like this from the
time when the lock was not dropped here; dropping it was added later to
fix a lock ordering issue.

Multi-threaded sequential write tests with both HDD and SSD pools
with ZVOL block sizes of 4KB, 16KB, 64KB and 128KB all show a major
reduction in lock congestion, saving 15% to 35% of CPU time and
increasing throughput by 10% to 40%.

Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:  Alexander Motin <mav@FreeBSD.org>
Closes #8890
---
 module/zfs/vdev_queue.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index e74df76b7530..86b20f134834 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -709,6 +709,18 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 	do {
 		dio = nio;
 		nio = AVL_NEXT(t, dio);
+		zio_add_child(dio, aio);
+		vdev_queue_io_remove(vq, dio);
+	} while (dio != last);
+
+	/*
+	 * We need to drop the vdev queue's lock during zio_execute() to
+	 * avoid a deadlock that we could encounter due to lock order
+	 * reversal between vq_lock and io_lock in zio_change_priority().
+	 * While the lock is dropped, do the memory copy without congestion.
+	 */
+	mutex_exit(&vq->vq_lock);
+	while ((dio = zio_walk_parents(aio, &zl)) != NULL) {
 		ASSERT3U(dio->io_type, ==, aio->io_type);
 
 		if (dio->io_flags & ZIO_FLAG_NODATA) {
@@ -720,16 +732,6 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 			    dio->io_offset - aio->io_offset, 0, dio->io_size);
 		}
 
-		zio_add_child(dio, aio);
-		vdev_queue_io_remove(vq, dio);
-	} while (dio != last);
-
-	/*
-	 * We need to drop the vdev queue's lock to avoid a deadlock that we
-	 * could encounter since this I/O will complete immediately.
-	 */
-	mutex_exit(&vq->vq_lock);
-	while ((dio = zio_walk_parents(aio, &zl)) != NULL) {
 		zio_vdev_io_bypass(dio);
 		zio_execute(dio);
 	}

From 592ee2e6ddcad339398e825bdb39569167c550ab Mon Sep 17 00:00:00 2001
From: Matthew Ahrens <mahrens@delphix.com>
Date: Thu, 13 Jun 2019 13:10:19 -0700
Subject: [PATCH 024/109] compress metadata in later sync passes

Starting in sync pass 5 (zfs_sync_pass_dont_compress), we disable
compression (including of metadata).  Ostensibly this helps the sync
passes to converge (i.e. for a sync pass to not need to allocate
anything because it is 100% overwrites).

However, in practice it increases the average number of sync passes,
because when we turn compression off, a lot of blocks' sizes will change
and thus we have to re-allocate (not overwrite) them.  It also increases
the number of 128KB allocations (e.g. for indirect blocks and spacemaps)
because these will not be compressed.  The 128K allocations are
especially detrimental to performance on highly fragmented systems,
which may have very few free segments of this size, and may need to load
new metaslabs to satisfy 128K allocations.

We should increase zfs_sync_pass_dont_compress.  In practice on a highly
fragmented system we see a few 5-pass txg's, a tiny number of 6-pass
txg's, and no txg's with more than 6 passes.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-63431
Closes #8892
---
 man/man5/zfs-module-parameters.5 | 16 ++++++++++++++--
 module/zfs/zio.c                 | 18 ++++++++++++++++--
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 29374a9d3965..2d2a79413d97 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -2444,9 +2444,21 @@ Default value: \fB25\fR.
 \fBzfs_sync_pass_dont_compress\fR (int)
 .ad
 .RS 12n
-Don't compress starting in this pass
+Starting in this sync pass, we disable compression (including of metadata).
+With the default setting, in practice, we don't have this many sync passes,
+so this has no effect.
+.sp
+The original intent was that disabling compression would help the sync passes
+to converge. However, in practice disabling compression increases the average
+number of sync passes, because when we turn compression off, a lot of blocks'
+sizes will change and thus we have to re-allocate (not overwrite) them. It
+also increases the number of 128KB allocations (e.g. for indirect blocks and
+spacemaps) because these will not be compressed. The 128K allocations are
+especially detrimental to performance on highly fragmented systems, which may
+have very few free segments of this size, and may need to load new metaslabs
+to satisfy 128K allocations.
 .sp
-Default value: \fB5\fR.
+Default value: \fB8\fR.
 .RE
 
 .sp
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 016ac07eabd9..5bfff37eb3b5 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  */
@@ -96,9 +96,23 @@ int zio_slow_io_ms = (30 * MILLISEC);
  *
  * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  * regular blocks are not deferred.
+ *
+ * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable
+ * compression (including of metadata).  In practice, we don't have this
+ * many sync passes, so this has no effect.
+ *
+ * The original intent was that disabling compression would help the sync
+ * passes to converge. However, in practice disabling compression increases
+ * the average number of sync passes, because when we turn compression off, a
+ * lot of blocks' sizes will change and thus we have to re-allocate (not
+ * overwrite) them. It also increases the number of 128KB allocations (e.g.
+ * for indirect blocks and spacemaps) because these will not be compressed.
+ * The 128K allocations are especially detrimental to performance on highly
+ * fragmented systems, which may have very few free segments of this size,
+ * and may need to load new metaslabs to satisfy 128K allocations.
  */
 int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
-int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
+int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */
 int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
 
 /*

From 6083f403873f5e427ee8d86f903aa08c7b69daab Mon Sep 17 00:00:00 2001
From: Matthew Ahrens <mahrens@delphix.com>
Date: Thu, 13 Jun 2019 13:12:39 -0700
Subject: [PATCH 025/109] panic in removal_remap test on 4K devices

If the zfs_remove_max_segment tunable is changed to be not a multiple of
the sector size, then the device removal code will malfunction and try
to create mappings that are smaller than one sector, leading to a panic.

On debug bits this assertion will fail in spa_vdev_copy_segment():
    ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);

On nondebug, the system panics with a stack like:
    metaslab_free_concrete()
    metaslab_free_impl()
    metaslab_free_impl_cb()
    vdev_indirect_remap()
    free_from_removing_vdev()
    metaslab_free_impl()
    metaslab_free_dva()
    metaslab_free()

Fortunately, the default for zfs_remove_max_segment is 16MB
(SPA_MAXBLOCKSIZE), so this can't occur by default.  We hit it during
this test because removal_remap.ksh changes zfs_remove_max_segment to
1KB. When testing on 4KB-sector disks, we hit the bug.

This change makes the zfs_remove_max_segment tunable more robust,
automatically rounding it up to a multiple of the sector size. We also
turn some key assertions into VERIFY's so that similar bugs would be
caught before they are encoded on disk (and thus avoid a
panic-reboot-loop).

Reviewed-by: Sean Eric Fagan <sef@ixsystems.com>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-61342
Closes #8893
---
 include/sys/vdev_removal.h       |  8 ++++----
 man/man5/zfs-module-parameters.5 | 27 ++++++++++++++++++++++++++
 module/zfs/vdev_label.c          |  5 ++---
 module/zfs/vdev_removal.c        | 33 +++++++++++++++++++++++++-------
 4 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h
index 3962237afdab..e3bab0658d62 100644
--- a/include/sys/vdev_removal.h
+++ b/include/sys/vdev_removal.h
@@ -14,7 +14,7 @@
  */
 
 /*
- * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2019 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_REMOVAL_H
@@ -81,13 +81,13 @@ extern void spa_vdev_condense_suspend(spa_t *);
 extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
 extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t);
 extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
-extern void svr_sync(spa_t *spa, dmu_tx_t *tx);
+extern void svr_sync(spa_t *, dmu_tx_t *);
 extern void spa_vdev_remove_suspend(spa_t *);
 extern int spa_vdev_remove_cancel(spa_t *);
-extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
+extern void spa_vdev_removal_destroy(spa_vdev_removal_t *);
+extern uint64_t spa_remove_max_segment(spa_t *);
 
 extern int vdev_removal_max_span;
-extern int zfs_remove_max_segment;
 
 #ifdef	__cplusplus
 }
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 2d2a79413d97..8ad3ce466ce5 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -2194,6 +2194,33 @@ pool cannot be returned to a healthy state prior to removing the device.
 Default value: \fB0\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_removal_suspend_progress\fR (int)
+.ad
+.RS 12n
+.sp
+This is used by the test suite so that it can ensure that certain actions
+happen while in the middle of a removal.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_remove_max_segment\fR (int)
+.ad
+.RS 12n
+.sp
+The largest contiguous segment that we will attempt to allocate when removing
+a device.  This can be no larger than 16MB.  If there is a performance
+problem with attempting to allocate large blocks, consider decreasing this.
+.sp
+Default value: \fB16,777,216\fR (16MB).
+.RE
+
 .sp
 .ne 2
 .na
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index a0e373b3dfc5..6320732ed6da 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -21,8 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  */
 
@@ -613,7 +612,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 			 * zfs_remove_max_segment, so we need at least one entry
 			 * per zfs_remove_max_segment of allocated data.
 			 */
-			seg_count += to_alloc / zfs_remove_max_segment;
+			seg_count += to_alloc / spa_remove_max_segment(spa);
 
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
 			    seg_count *
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 536a982eca2b..6f64edd8c473 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -100,6 +100,8 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
  * removing a device.  This can be no larger than SPA_MAXBLOCKSIZE.  If
  * there is a performance problem with attempting to allocate large blocks,
  * consider decreasing this.
+ *
+ * See also the accessor function spa_remove_max_segment().
  */
 int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
 
@@ -951,8 +953,10 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
 	vdev_indirect_mapping_entry_t *entry;
 	dva_t dst = {{ 0 }};
 	uint64_t start = range_tree_min(segs);
+	ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift));
 
 	ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
+	ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift));
 
 	uint64_t size = range_tree_span(segs);
 	if (range_tree_span(segs) > maxalloc) {
@@ -983,6 +987,7 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
 		}
 	}
 	ASSERT3U(size, <=, maxalloc);
+	ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift));
 
 	/*
 	 * An allocation class might not have any remaining vdevs or space
@@ -1026,11 +1031,11 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
 
 	/*
 	 * We can't have any padding of the allocated size, otherwise we will
-	 * misunderstand what's allocated, and the size of the mapping.
-	 * The caller ensures this will be true by passing in a size that is
-	 * aligned to the worst (highest) ashift in the pool.
+	 * misunderstand what's allocated, and the size of the mapping. We
+	 * prevent padding by ensuring that all devices in the pool have the
+	 * same ashift, and the allocation size is a multiple of 1 << ashift.
 	 */
-	ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
+	VERIFY3U(DVA_GET_ASIZE(&dst), ==, size);
 
 	entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
 	DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
@@ -1363,6 +1368,20 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
 	range_tree_destroy(segs);
 }
 
+/*
+ * The size of each removal mapping is limited by the tunable
+ * zfs_remove_max_segment, but we must round this up to a multiple of the
+ * pool's sector size (1 << ashift), so that we don't try to split individual
+ * sectors regardless of the tunable value.  (Device removal requires that all
+ * devices have the same ashift, so spa_min_ashift and spa_max_ashift are
+ * equal.)  The raw tunable should not be used elsewhere.
+ */
+uint64_t
+spa_remove_max_segment(spa_t *spa)
+{
+	return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift));
+}
+
 /*
  * The removal thread operates in open context.  It iterates over all
  * allocated space in the vdev, by loading each metaslab's spacemap.
@@ -1385,7 +1404,7 @@ spa_vdev_remove_thread(void *arg)
 	spa_t *spa = arg;
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	vdev_copy_arg_t vca;
-	uint64_t max_alloc = zfs_remove_max_segment;
+	uint64_t max_alloc = spa_remove_max_segment(spa);
 	uint64_t last_txg = 0;
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
@@ -1511,7 +1530,7 @@ spa_vdev_remove_thread(void *arg)
 			vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 
 			if (txg != last_txg)
-				max_alloc = zfs_remove_max_segment;
+				max_alloc = spa_remove_max_segment(spa);
 			last_txg = txg;
 
 			spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);

From b033353b2548a357a7e2bbde2cf68b2ccf8f0054 Mon Sep 17 00:00:00 2001
From: Matthew Ahrens <mahrens@delphix.com>
Date: Thu, 13 Jun 2019 13:14:35 -0700
Subject: [PATCH 026/109] lz4_decompress_abd declared but not defined

`lz4_decompress_abd` is declared in zio_compress.h but it is not defined
anywhere. The declaration should be removed.

Reviewed-by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed-by: Allan Jude <allanjude@freebsd.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-47477
Closes #8894
---
 include/sys/zio_compress.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h
index 1642823d3d42..208117eee4b5 100644
--- a/include/sys/zio_compress.h
+++ b/include/sys/zio_compress.h
@@ -105,8 +105,7 @@ extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
     int level);
 extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
     int level);
-extern int lz4_decompress_abd(abd_t *src, void *dst, size_t s_len, size_t d_len,
-    int level);
+
 /*
  * Compress and decompress data if necessary.
  */

From 9e54b9d930849e2ccb9ae12d729c7f20e54c670f Mon Sep 17 00:00:00 2001
From: Ryan Moeller <ryan@freqlabs.com>
Date: Thu, 13 Jun 2019 13:15:46 -0700
Subject: [PATCH 027/109] Python config cleanup

Don't require Python at configure/build unless building pyzfs.
Move ZFS_AC_PYTHON_MODULE to always-pyzfs.m4 where it is used.
Make test syntax more consistent.

Sponsored by: iXsystems, Inc.
Reviewed-by: Neal Gompa <ngompa@datto.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Ryan Moeller <ryan@ixsystems.com>
Closes #8895
---
 config/always-python.m4 | 87 +++++++++++------------------------------
 config/always-pyzfs.m4  | 45 ++++++++++++++-------
 2 files changed, 53 insertions(+), 79 deletions(-)

diff --git a/config/always-python.m4 b/config/always-python.m4
index 7cfefd9ebcae..c1c07597e688 100644
--- a/config/always-python.m4
+++ b/config/always-python.m4
@@ -1,47 +1,3 @@
-dnl #
-dnl # ZFS_AC_PYTHON_VERSION(version, [action-if-true], [action-if-false])
-dnl #
-dnl # Verify Python version
-dnl #
-AC_DEFUN([ZFS_AC_PYTHON_VERSION], [
-	ver_check=`$PYTHON -c "import sys; print (sys.version.split()[[0]] $1)"`
-	AS_IF([test "$ver_check" = "True"], [
-		m4_ifvaln([$2], [$2])
-	], [
-		m4_ifvaln([$3], [$3])
-	])
-])
-
-dnl #
-dnl # ZFS_AC_PYTHON_VERSION_IS_2
-dnl # ZFS_AC_PYTHON_VERSION_IS_3
-dnl #
-dnl # Tests if the $PYTHON_VERSION matches 2.x or 3.x.
-dnl #
-AC_DEFUN([ZFS_AC_PYTHON_VERSION_IS_2],
-	[test "${PYTHON_VERSION%%\.*}" = "2"])
-AC_DEFUN([ZFS_AC_PYTHON_VERSION_IS_3],
-	[test "${PYTHON_VERSION%%\.*}" = "3"])
-
-dnl #
-dnl # ZFS_AC_PYTHON_MODULE(module_name, [action-if-true], [action-if-false])
-dnl #
-dnl # Checks for Python module. Freely inspired by AX_PYTHON_MODULE
-dnl # https://www.gnu.org/software/autoconf-archive/ax_python_module.html
-dnl # Required by ZFS_AC_CONFIG_ALWAYS_PYZFS.
-dnl #
-AC_DEFUN([ZFS_AC_PYTHON_MODULE], [
-	PYTHON_NAME=`basename $PYTHON`
-	AC_MSG_CHECKING([for $PYTHON_NAME module: $1])
-	AS_IF([$PYTHON -c "import $1" 2>/dev/null], [
-		AC_MSG_RESULT(yes)
-		m4_ifvaln([$2], [$2])
-	], [
-		AC_MSG_RESULT(no)
-		m4_ifvaln([$3], [$3])
-	])
-])
-
 dnl #
 dnl # The majority of the python scripts are written to be compatible
 dnl # with Python 2.6 and Python 3.4.  Therefore, they may be installed
@@ -66,35 +22,38 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYTHON], [
 		[AC_MSG_ERROR([Unknown --with-python value '$with_python'])]
 	)
 
-	AS_IF([test $PYTHON != :], [
-		AS_IF([$PYTHON --version >/dev/null 2>&1],
-			[AM_PATH_PYTHON([2.6], [], [:])],
-			[AC_MSG_ERROR([Cannot find $PYTHON in your system path])]
-		)
-	])
-	AM_CONDITIONAL([USING_PYTHON], [test "$PYTHON" != :])
-	AM_CONDITIONAL([USING_PYTHON_2], [ZFS_AC_PYTHON_VERSION_IS_2])
-	AM_CONDITIONAL([USING_PYTHON_3], [ZFS_AC_PYTHON_VERSION_IS_3])
-
 	dnl #
 	dnl # Minimum supported Python versions for utilities:
-	dnl # Python 2.6.x, or Python 3.4.x
+	dnl # Python 2.6 or Python 3.4
 	dnl #
-	AS_IF([ZFS_AC_PYTHON_VERSION_IS_2], [
-		ZFS_AC_PYTHON_VERSION([>= '2.6'], [ true ],
-			[AC_MSG_ERROR("Python >= 2.6.x is not available")])
+	AM_PATH_PYTHON([], [], [:])
+	AS_IF([test -z "$PYTHON_VERSION"], [
+		PYTHON_VERSION=$(basename $PYTHON | tr -cd 0-9.)
 	])
+	PYTHON_MINOR=${PYTHON_VERSION#*\.}
 
-	AS_IF([ZFS_AC_PYTHON_VERSION_IS_3], [
-		ZFS_AC_PYTHON_VERSION([>= '3.4'], [ true ],
-			[AC_MSG_ERROR("Python >= 3.4.x is not available")])
-	])
+	AS_CASE([$PYTHON_VERSION],
+		[2.*], [
+			AS_IF([test $PYTHON_MINOR -lt 6],
+				[AC_MSG_ERROR("Python >= 2.6 is required")])
+		],
+		[3.*], [
+			AS_IF([test $PYTHON_MINOR -lt 4],
+				[AC_MSG_ERROR("Python >= 3.4 is required")])
+		],
+		[:|2|3], [],
+		[PYTHON_VERSION=3]
+	)
+
+	AM_CONDITIONAL([USING_PYTHON], [test "$PYTHON" != :])
+	AM_CONDITIONAL([USING_PYTHON_2], [test "x${PYTHON_VERSION%%\.*}" = x2])
+	AM_CONDITIONAL([USING_PYTHON_3], [test "x${PYTHON_VERSION%%\.*}" = x3])
 
 	dnl #
 	dnl # Request that packages be built for a specific Python version.
 	dnl #
-	AS_IF([test $with_python != check], [
-		PYTHON_PKG_VERSION=`echo ${PYTHON} | tr -d 'a-zA-Z.'`
+	AS_IF([test "x$with_python" != xcheck], [
+		PYTHON_PKG_VERSION=$(echo $PYTHON_VERSION | tr -d .)
 		DEFINE_PYTHON_PKG_VERSION='--define "__use_python_pkg_version '${PYTHON_PKG_VERSION}'"'
 		DEFINE_PYTHON_VERSION='--define "__use_python '${PYTHON}'"'
 	], [
diff --git a/config/always-pyzfs.m4 b/config/always-pyzfs.m4
index 6f32e98feed2..f620a8f9a18b 100644
--- a/config/always-pyzfs.m4
+++ b/config/always-pyzfs.m4
@@ -1,5 +1,24 @@
 dnl #
-dnl # Determines if pyzfs can be built, requires Python 2.7 or latter.
+dnl # ZFS_AC_PYTHON_MODULE(module_name, [action-if-true], [action-if-false])
+dnl #
+dnl # Checks for Python module. Freely inspired by AX_PYTHON_MODULE
+dnl # https://www.gnu.org/software/autoconf-archive/ax_python_module.html
+dnl # Required by ZFS_AC_CONFIG_ALWAYS_PYZFS.
+dnl #
+AC_DEFUN([ZFS_AC_PYTHON_MODULE], [
+	PYTHON_NAME=$(basename $PYTHON)
+	AC_MSG_CHECKING([for $PYTHON_NAME module: $1])
+	AS_IF([$PYTHON -c "import $1" 2>/dev/null], [
+		AC_MSG_RESULT(yes)
+		m4_ifvaln([$2], [$2])
+	], [
+		AC_MSG_RESULT(no)
+		m4_ifvaln([$3], [$3])
+	])
+])
+
+dnl #
+dnl # Determines if pyzfs can be built, requires Python 2.7 or later.
 dnl #
 AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [
 	AC_ARG_ENABLE([pyzfs],
@@ -18,7 +37,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [
 			DEFINE_PYZFS='--without pyzfs'
 		])
 	], [
-		AS_IF([test $PYTHON != :], [
+		AS_IF([test "$PYTHON" != :], [
 			DEFINE_PYZFS=''
 		], [
 			enable_pyzfs=no
@@ -31,20 +50,16 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [
 	dnl # Require python-devel libraries
 	dnl #
 	AS_IF([test "x$enable_pyzfs" = xcheck  -o "x$enable_pyzfs" = xyes], [
-		AS_IF([ZFS_AC_PYTHON_VERSION_IS_2], [
-			PYTHON_REQUIRED_VERSION=">= '2.7.0'"
-		], [
-			AS_IF([ZFS_AC_PYTHON_VERSION_IS_3], [
-				PYTHON_REQUIRED_VERSION=">= '3.4.0'"
-			], [
-				AC_MSG_ERROR("Python $PYTHON_VERSION unknown")
-			])
-		])
+		AS_CASE([$PYTHON_VERSION],
+			[3.*], [PYTHON_REQUIRED_VERSION=">= '3.4.0'"],
+			[2.*], [PYTHON_REQUIRED_VERSION=">= '2.7.0'"],
+			[AC_MSG_ERROR("Python $PYTHON_VERSION unknown")]
+		)
 
 		AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [
 			AS_IF([test "x$enable_pyzfs" = xyes], [
 				AC_MSG_ERROR("Python $PYTHON_REQUIRED_VERSION development library is not installed")
-			], [test ! "x$enable_pyzfs" = xno], [
+			], [test "x$enable_pyzfs" != xno], [
 				enable_pyzfs=no
 			])
 		])
@@ -57,7 +72,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [
 		ZFS_AC_PYTHON_MODULE([setuptools], [], [
 			AS_IF([test "x$enable_pyzfs" = xyes], [
 				AC_MSG_ERROR("Python $PYTHON_VERSION setuptools is not installed")
-			], [test ! "x$enable_pyzfs" = xno], [
+			], [test "x$enable_pyzfs" != xno], [
 				enable_pyzfs=no
 			])
 		])
@@ -70,7 +85,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [
 		ZFS_AC_PYTHON_MODULE([cffi], [], [
 			AS_IF([test "x$enable_pyzfs" = xyes], [
 				AC_MSG_ERROR("Python $PYTHON_VERSION cffi is not installed")
-			], [test ! "x$enable_pyzfs" = xno], [
+			], [test "x$enable_pyzfs" != xno], [
 				enable_pyzfs=no
 			])
 		])
@@ -81,7 +96,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [
 	dnl #
 	AS_IF([test "x$enable_pyzfs" = xcheck], [enable_pyzfs=yes])
 
-	AM_CONDITIONAL([PYZFS_ENABLED], [test x$enable_pyzfs = xyes])
+	AM_CONDITIONAL([PYZFS_ENABLED], [test "x$enable_pyzfs" = xyes])
 	AC_SUBST([PYZFS_ENABLED], [$enable_pyzfs])
 	AC_SUBST(pythonsitedir, [$PYTHON_SITE_PKG])
 

From ed7b0d357a070d28710abe9a6c6fc22c4fcbe854 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 14 Jun 2019 17:07:34 -0400
Subject: [PATCH 028/109] Minimize aggsum_compare(&arc_size, arc_c) calls.

When the ARC is busy, having arc_size close to arc_c is the desired
state.  But then it is quite likely that aggsum_compare(&arc_size, arc_c)
will need to flush per-CPU buckets to find the exact comparison result.
Doing that often in a hot path defeats the whole idea of using aggsum
there, since it replaces a few simple atomic additions with dozens of
lock acquisitions.

Replacing aggsum_compare() with aggsum_upper_bound() in the code that
increases arc_p while the ARC is growing (arc_size < arc_c) saves,
according to PMC profiles, ~5% of CPU time in aggsum code during
sequential writes to 12 ZVOLs with 16KB block size on a large
dual-socket system.

I suppose there is some minor arc_p behavior change due to the lower
precision of the new code, but I don't think it is a big deal, since it
should affect only a very small window in time (aggsum buckets are
flushed every second) and in ARC size (buckets are limited to 10 average
ARC blocks per CPU).

Reviewed-by: Chris Dunlop <chris@onthe.net.au>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Allan Jude <allanjude@freebsd.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:  Alexander Motin <mav@FreeBSD.org>
Closes #8901
---
 module/zfs/arc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index a7e7d26996f8..720365c4a935 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -5606,7 +5606,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 		 * If we are growing the cache, and we are adding anonymous
 		 * data, and we have outgrown arc_p, update arc_p
 		 */
-		if (aggsum_compare(&arc_size, arc_c) < 0 &&
+		if (aggsum_upper_bound(&arc_size) < arc_c &&
 		    hdr->b_l1hdr.b_state == arc_anon &&
 		    (zfs_refcount_count(&arc_anon->arcs_size) +
 		    zfs_refcount_count(&arc_mru->arcs_size) > arc_p))

From b5e8d14a4b0c25b19c4e148123e5d579add0cfa5 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Wed, 19 Jun 2019 10:39:28 -0700
Subject: [PATCH 029/109] ZTS: Fix mmp_interval failure

The mmp_interval test case was failing on Fedora 30 due to the built-in
'echo' command terminating the script when it was unable to write to
the sysfs module parameter.  This change in behavior was observed with
ksh-2020.0.0-alpha1.  Resolve the issue by using the external cat
command which fails gracefully as expected.

Additionally, remove some incorrect quotes around the $? return values.

Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8906
---
 tests/zfs-tests/include/libtest.shlib | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index 1b841d7ba02c..c7cb36a8d0ee 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -3494,13 +3494,13 @@ function set_tunable_impl
 	Linux)
 		typeset zfs_tunables="/sys/module/$module/parameters"
 		[[ -w "$zfs_tunables/$tunable" ]] || return 1
-		echo -n "$value" > "$zfs_tunables/$tunable"
-		return "$?"
+		cat >"$zfs_tunables/$tunable" <<<"$value"
+		return $?
 		;;
 	SunOS)
 		[[ "$module" -eq "zfs" ]] || return 1
 		echo "${tunable}/${mdb_cmd}0t${value}" | mdb -kw
-		return "$?"
+		return $?
 		;;
 	esac
 }
@@ -3527,7 +3527,7 @@ function get_tunable_impl
 		typeset zfs_tunables="/sys/module/$module/parameters"
 		[[ -f "$zfs_tunables/$tunable" ]] || return 1
 		cat $zfs_tunables/$tunable
-		return "$?"
+		return $?
 		;;
 	SunOS)
 		[[ "$module" -eq "zfs" ]] || return 1

From 5b0327bc5795b5ae8b1926d90a9b6b8b10433f72 Mon Sep 17 00:00:00 2001
From: Olaf Faaland <faaland1@llnl.gov>
Date: Wed, 19 Jun 2019 11:44:44 -0700
Subject: [PATCH 030/109] kmod-zfs-devel rpm should provide kmod-spl-devel

When configure is run with --with-spec=redhat, and rpms are built, the
kmod-zfs-devel package is missing

Provides: kmod-spl-devel = %{version}

which is required by software such as Lustre which builds against zfs
kmods.  Adding it makes it easier for such software to build against
both zfs-0.7 (where SPL is separate and may be missing) and zfs-0.8.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #8930
---
 rpm/redhat/zfs-kmod.spec.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in
index 473f2d032509..f632c4867e63 100644
--- a/rpm/redhat/zfs-kmod.spec.in
+++ b/rpm/redhat/zfs-kmod.spec.in
@@ -41,6 +41,7 @@ This package contains the ZFS kernel modules.
 %package -n kmod-%{kmod_name}-devel
 Summary:        ZFS kernel module(s) devel common
 Group:          System Environment/Kernel
+Provides:       kmod-spl-devel = %{version}
 
 %description -n  kmod-%{kmod_name}-devel
 This package provides the header files and objects to build kernel modules.

From 2087b6cf4941b936583b48471a79b252dc0a9dbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michael=20Niew=C3=B6hner?=
 <c0d3z3r0@users.noreply.github.com>
Date: Wed, 19 Jun 2019 20:53:37 +0200
Subject: [PATCH 031/109] Fix memory leak in check_disk()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Allan Jude <allanjude@freebsd.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Michael Niewöhner <foss@mniewoehner.de>
Closes #8897
Closes #8911
---
 cmd/zpool/zpool_vdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c
index 7ea9d742006d..52c696816f73 100644
--- a/cmd/zpool/zpool_vdev.c
+++ b/cmd/zpool/zpool_vdev.c
@@ -433,6 +433,7 @@ check_disk(const char *path, blkid_cache cache, int force,
 		char *value = blkid_get_tag_value(cache, "TYPE", path);
 		(void) fprintf(stderr, gettext("%s is in use and contains "
 		    "a %s filesystem.\n"), path, value ? value : "unknown");
+		free(value);
 		return (-1);
 	}
 

From fb6f6b47d6f9b63e5768635b74160d94b3fe33f5 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Thu, 20 Jun 2019 04:27:31 +0900
Subject: [PATCH 032/109] Use ZFS_DEV macro instead of literals

The rest of the code/comments use ZFS_DEV, so sync with that.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #8912
---
 lib/libzfs_core/libzfs_core.c | 6 +++---
 lib/libzpool/util.c           | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c
index 99fc84d04614..eb332bc94e8c 100644
--- a/lib/libzfs_core/libzfs_core.c
+++ b/lib/libzfs_core/libzfs_core.c
@@ -52,7 +52,7 @@
  *
  *  - Thin Layer.  libzfs_core is a thin layer, marshaling arguments
  *  to/from the kernel ioctls.  There is generally a 1:1 correspondence
- *  between libzfs_core functions and ioctls to /dev/zfs.
+ *  between libzfs_core functions and ioctls to ZFS_DEV.
  *
  *  - Clear Atomicity.  Because libzfs_core functions are generally 1:1
  *  with kernel ioctls, and kernel ioctls are general atomic, each
@@ -135,7 +135,7 @@ libzfs_core_init(void)
 {
 	(void) pthread_mutex_lock(&g_lock);
 	if (g_refcount == 0) {
-		g_fd = open("/dev/zfs", O_RDWR);
+		g_fd = open(ZFS_DEV, O_RDWR);
 		if (g_fd < 0) {
 			(void) pthread_mutex_unlock(&g_lock);
 			return (errno);
@@ -499,7 +499,7 @@ lzc_sync(const char *pool_name, nvlist_t *innvl, nvlist_t **outnvl)
  * The snapshots must all be in the same pool.
  * The value is the name of the hold (string type).
  *
- * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL).
+ * If cleanup_fd is not -1, it must be the result of open(ZFS_DEV, O_EXCL).
  * In this case, when the cleanup_fd is closed (including on process
  * termination), the holds will be released.  If the system is shut down
  * uncleanly, the holds will be released when the pool is next opened
diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c
index ad05d2239ae0..67bc209ceec9 100644
--- a/lib/libzpool/util.c
+++ b/lib/libzpool/util.c
@@ -223,7 +223,7 @@ pool_active(void *unused, const char *name, uint64_t guid,
 	 * Use ZFS_IOC_POOL_SYNC to confirm if a pool is active
 	 */
 
-	fd = open("/dev/zfs", O_RDWR);
+	fd = open(ZFS_DEV, O_RDWR);
 	if (fd < 0)
 		return (-1);
 

From 01cc94f68d89c71943ecc5bd3dfaff6171dfe157 Mon Sep 17 00:00:00 2001
From: dacianstremtan <35844628+dacianstremtan@users.noreply.github.com>
Date: Thu, 20 Jun 2019 15:27:14 -0400
Subject: [PATCH 033/109] Replace whereis with type in zfs-lib.sh

The whereis command should not be used since it may not exist
in the initramfs.  The dracut plymouth module also uses the type
command instead of whereis.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Garrett Fields <ghfields@gmail.com>
Signed-off-by: Dacian Reece-Stremtan <dacianstremtan@gmail.com>
Closes #8920
Closes #8938
---
 contrib/dracut/90zfs/zfs-lib.sh.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/dracut/90zfs/zfs-lib.sh.in b/contrib/dracut/90zfs/zfs-lib.sh.in
index 23c07af9e86f..44021c6e5fc1 100755
--- a/contrib/dracut/90zfs/zfs-lib.sh.in
+++ b/contrib/dracut/90zfs/zfs-lib.sh.in
@@ -144,7 +144,7 @@ ask_for_password() {
 
     { flock -s 9;
         # Prompt for password with plymouth, if installed and running.
-        if whereis plymouth >/dev/null 2>&1 && plymouth --ping 2>/dev/null; then
+        if type plymouth >/dev/null 2>&1 && plymouth --ping 2>/dev/null; then
             plymouth ask-for-password \
                 --prompt "$ply_prompt" --number-of-tries="$ply_tries" \
                 --command="$ply_cmd"

From b96ceeead2a9c7e0973fcef58356defb10f6df26 Mon Sep 17 00:00:00 2001
From: Tom Caputi <tcaputi@datto.com>
Date: Thu, 20 Jun 2019 15:29:51 -0400
Subject: [PATCH 034/109] Allow unencrypted children of encrypted datasets

When encryption was first added to ZFS, we made a decision to
prevent users from creating unencrypted children of encrypted
datasets. The idea was to prevent users from inadvertently
leaving some of their data unencrypted. However, since the
release of 0.8.0, some legitimate reasons have been brought up
for this behavior to be allowed. This patch simply removes this
limitation from all code paths that had checks for it and updates
the tests accordingly.

Reviewed-by: Jason King <jason.king@joyent.com>
Reviewed-by: Sean Eric Fagan <sef@ixsystems.com>
Reviewed-by: Richard Laager <rlaager@wiktel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #8737
Closes #8870
---
 include/sys/dsl_crypt.h                       |  1 -
 lib/libzfs/libzfs_crypto.c                    | 41 +---------------
 lib/libzfs/libzfs_dataset.c                   | 13 ++---
 lib/libzfs/libzfs_sendrecv.c                  | 48 ++++++++-----------
 module/zfs/dmu_objset.c                       |  7 ---
 module/zfs/dmu_recv.c                         | 24 +++++-----
 module/zfs/dsl_crypt.c                        | 44 +----------------
 .../zfs_create/zfs_create_encrypted.ksh       | 20 ++++----
 .../zfs_receive/zfs_receive_to_encrypted.ksh  | 14 +++---
 .../zfs_rename/zfs_rename_to_encrypted.ksh    | 14 +++---
 10 files changed, 63 insertions(+), 163 deletions(-)

diff --git a/include/sys/dsl_crypt.h b/include/sys/dsl_crypt.h
index c2c0a548a488..0f73ea6c6df8 100644
--- a/include/sys/dsl_crypt.h
+++ b/include/sys/dsl_crypt.h
@@ -209,7 +209,6 @@ void dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd,
     struct dsl_dataset *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx);
 uint64_t dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey,
     dmu_tx_t *tx);
-int dmu_objset_clone_crypt_check(dsl_dir_t *parentdd, dsl_dir_t *origindd);
 uint64_t dsl_crypto_key_clone_sync(dsl_dir_t *origindd, dmu_tx_t *tx);
 void dsl_crypto_key_destroy_sync(uint64_t dckobj, dmu_tx_t *tx);
 
diff --git a/lib/libzfs/libzfs_crypto.c b/lib/libzfs/libzfs_crypto.c
index 3318a6bd2e11..d31f43b1fdf2 100644
--- a/lib/libzfs/libzfs_crypto.c
+++ b/lib/libzfs/libzfs_crypto.c
@@ -740,14 +740,6 @@ zfs_crypto_create(libzfs_handle_t *hdl, char *parent_name, nvlist_t *props,
 		pcrypt = ZIO_CRYPT_OFF;
 	}
 
-	/* Check for encryption being explicitly truned off */
-	if (crypt == ZIO_CRYPT_OFF && pcrypt != ZIO_CRYPT_OFF) {
-		ret = EINVAL;
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "Invalid encryption value. Dataset must be encrypted."));
-		goto out;
-	}
-
 	/* Get the inherited encryption property if we don't have it locally */
 	if (!local_crypt)
 		crypt = pcrypt;
@@ -849,10 +841,7 @@ int
 zfs_crypto_clone_check(libzfs_handle_t *hdl, zfs_handle_t *origin_zhp,
     char *parent_name, nvlist_t *props)
 {
-	int ret;
 	char errbuf[1024];
-	zfs_handle_t *pzhp = NULL;
-	uint64_t pcrypt, ocrypt;
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "Encryption clone error"));
@@ -865,40 +854,12 @@ zfs_crypto_clone_check(libzfs_handle_t *hdl, zfs_handle_t *origin_zhp,
 	    nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)) ||
 	    nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION)) ||
 	    nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS))) {
-		ret = EINVAL;
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "Encryption properties must inherit from origin dataset."));
-		goto out;
-	}
-
-	/* get a reference to parent dataset, should never be NULL */
-	pzhp = make_dataset_handle(hdl, parent_name);
-	if (pzhp == NULL) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "Failed to lookup parent."));
-		return (ENOENT);
+		return (EINVAL);
 	}
 
-	/* Lookup parent's crypt */
-	pcrypt = zfs_prop_get_int(pzhp, ZFS_PROP_ENCRYPTION);
-	ocrypt = zfs_prop_get_int(origin_zhp, ZFS_PROP_ENCRYPTION);
-
-	/* all children of encrypted parents must be encrypted */
-	if (pcrypt != ZIO_CRYPT_OFF && ocrypt == ZIO_CRYPT_OFF) {
-		ret = EINVAL;
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "Cannot create unencrypted clone as a child "
-		    "of encrypted parent."));
-		goto out;
-	}
-
-	zfs_close(pzhp);
 	return (0);
-
-out:
-	if (pzhp != NULL)
-		zfs_close(pzhp);
-	return (ret);
 }
 
 typedef struct loadkeys_cbdata {
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index 3be205f1f437..ee5a6412ead5 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -4632,16 +4632,9 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive,
 			    "with the new name"));
 			(void) zfs_error(hdl, EZFS_EXISTS, errbuf);
 		} else if (errno == EACCES) {
-			if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) ==
-			    ZIO_CRYPT_OFF) {
-				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-				    "cannot rename an unencrypted dataset to "
-				    "be a decendent of an encrypted one"));
-			} else {
-				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-				    "cannot move encryption child outside of "
-				    "its encryption root"));
-			}
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "cannot move encrypted child outside of "
+			    "its encryption root"));
 			(void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf);
 		} else {
 			(void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf);
diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index f69a46430bbe..052b96b9b653 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -2827,7 +2827,7 @@ recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *destname,
 		is_clone = zhp->zfs_dmustats.dds_origin[0] != '\0';
 		(void) zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL);
 
-		/* we don't need to do anything for unencrypted filesystems */
+		/* we don't need to do anything for unencrypted datasets */
 		if (crypt == ZIO_CRYPT_OFF) {
 			zfs_close(zhp);
 			continue;
@@ -4210,34 +4210,6 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 			goto out;
 		}
 
-		/*
-		 * It is invalid to receive a properties stream that was
-		 * unencrypted on the send side as a child of an encrypted
-		 * parent. Technically there is nothing preventing this, but
-		 * it would mean that the encryption=off property which is
-		 * locally set on the send side would not be received correctly.
-		 * We can infer encryption=off if the stream is not raw and
-		 * properties were included since the send side will only ever
-		 * send the encryption property in a raw nvlist header. This
-		 * check will be avoided if the user specifically overrides
-		 * the encryption property on the command line.
-		 */
-		if (!raw && rcvprops != NULL &&
-		    !nvlist_exists(cmdprops,
-		    zfs_prop_to_name(ZFS_PROP_ENCRYPTION))) {
-			uint64_t crypt;
-
-			crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION);
-
-			if (crypt != ZIO_CRYPT_OFF) {
-				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-				    "parent '%s' must not be encrypted to "
-				    "receive unenecrypted property"), name);
-				err = zfs_error(hdl, EZFS_BADPROP, errbuf);
-				zfs_close(zhp);
-				goto out;
-			}
-		}
 		zfs_close(zhp);
 
 		newfs = B_TRUE;
@@ -4274,6 +4246,24 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 	    &oxprops, &wkeydata, &wkeylen, errbuf)) != 0)
 		goto out;
 
+	/*
+	 * When sending with properties (zfs send -p), the encryption property
+	 * is not included because it is a SETONCE property and therefore
+	 * treated as read only. However, we are always able to determine its
+	 * value because raw sends will include it in the DRR_BDEGIN payload
+	 * and non-raw sends with properties are not allowed for encrypted
+	 * datasets. Therefore, if this is a non-raw properties stream, we can
+	 * infer that the value should be ZIO_CRYPT_OFF and manually add that
+	 * to the received properties.
+	 */
+	if (stream_wantsnewfs && !raw && rcvprops != NULL &&
+	    !nvlist_exists(cmdprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION))) {
+		if (oxprops == NULL)
+			oxprops = fnvlist_alloc();
+		fnvlist_add_uint64(oxprops,
+		    zfs_prop_to_name(ZFS_PROP_ENCRYPTION), ZIO_CRYPT_OFF);
+	}
+
 	err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops,
 	    oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable,
 	    raw, infd, drr_noswap, cleanup_fd, &read_bytes, &errflags,
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index f95915b9e253..30436b188fc4 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1348,13 +1348,6 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
 		return (SET_ERROR(EINVAL));
 	}
 
-	error = dmu_objset_clone_crypt_check(pdd, origin->ds_dir);
-	if (error != 0) {
-		dsl_dataset_rele(origin, FTAG);
-		dsl_dir_rele(pdd, FTAG);
-		return (error);
-	}
-
 	dsl_dataset_rele(origin, FTAG);
 	dsl_dir_rele(pdd, FTAG);
 
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 65a031b42cc6..3481feb21dbc 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -327,7 +327,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
 		/* Open the parent of tofs */
 		ASSERT3U(strlen(tofs), <, sizeof (buf));
 		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
-		error = dsl_dataset_hold_flags(dp, buf, dsflags, FTAG, &ds);
+		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
 		if (error != 0)
 			return (error);
 
@@ -345,13 +345,13 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
 			error = dmu_objset_create_crypt_check(ds->ds_dir,
 			    drba->drba_dcp, &will_encrypt);
 			if (error != 0) {
-				dsl_dataset_rele_flags(ds, dsflags, FTAG);
+				dsl_dataset_rele(ds, FTAG);
 				return (error);
 			}
 
 			if (will_encrypt &&
 			    (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
-				dsl_dataset_rele_flags(ds, dsflags, FTAG);
+				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 		}
@@ -364,25 +364,25 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
 		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
 		    ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
 		if (error != 0) {
-			dsl_dataset_rele_flags(ds, dsflags, FTAG);
+			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 
 		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
 		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
 		if (error != 0) {
-			dsl_dataset_rele_flags(ds, dsflags, FTAG);
+			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 
 		/* can't recv below anything but filesystems (eg. no ZVOLs) */
 		error = dmu_objset_from_ds(ds, &os);
 		if (error != 0) {
-			dsl_dataset_rele_flags(ds, dsflags, FTAG);
+			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 		if (dmu_objset_type(os) != DMU_OST_ZFS) {
-			dsl_dataset_rele_flags(ds, dsflags, FTAG);
+			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
 		}
 
@@ -392,31 +392,31 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
 			error = dsl_dataset_hold_flags(dp, drba->drba_origin,
 			    dsflags, FTAG, &origin);
 			if (error != 0) {
-				dsl_dataset_rele_flags(ds, dsflags, FTAG);
+				dsl_dataset_rele(ds, FTAG);
 				return (error);
 			}
 			if (!origin->ds_is_snapshot) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
-				dsl_dataset_rele_flags(ds, dsflags, FTAG);
+				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 			if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
 			    fromguid != 0) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
-				dsl_dataset_rele_flags(ds, dsflags, FTAG);
+				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(ENODEV));
 			}
 			if (origin->ds_dir->dd_crypto_obj != 0 &&
 			    (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
-				dsl_dataset_rele_flags(ds, dsflags, FTAG);
+				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 			dsl_dataset_rele_flags(origin,
 			    dsflags, FTAG);
 		}
 
-		dsl_dataset_rele_flags(ds, dsflags, FTAG);
+		dsl_dataset_rele(ds, FTAG);
 		error = 0;
 	}
 	return (error);
diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c
index 21db8e51ffd0..0c0ffaadd8fb 100644
--- a/module/zfs/dsl_crypt.c
+++ b/module/zfs/dsl_crypt.c
@@ -1610,15 +1610,8 @@ dsl_dir_rename_crypt_check(dsl_dir_t *dd, dsl_dir_t *newparent)
 	int ret;
 	uint64_t curr_rddobj, parent_rddobj;
 
-	if (dd->dd_crypto_obj == 0) {
-		/* children of encrypted parents must be encrypted */
-		if (newparent->dd_crypto_obj != 0) {
-			ret = SET_ERROR(EACCES);
-			goto error;
-		}
-
+	if (dd->dd_crypto_obj == 0)
 		return (0);
-	}
 
 	ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj);
 	if (ret != 0)
@@ -1747,34 +1740,6 @@ dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin,
 	kmem_free(keylocation, ZAP_MAXVALUELEN);
 }
 
-int
-dmu_objset_clone_crypt_check(dsl_dir_t *parentdd, dsl_dir_t *origindd)
-{
-	int ret;
-	uint64_t pcrypt, crypt;
-
-	/*
-	 * Check that we are not making an unencrypted child of an
-	 * encrypted parent.
-	 */
-	ret = dsl_dir_get_crypt(parentdd, &pcrypt);
-	if (ret != 0)
-		return (ret);
-
-	ret = dsl_dir_get_crypt(origindd, &crypt);
-	if (ret != 0)
-		return (ret);
-
-	ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT);
-	ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT);
-
-	if (crypt == ZIO_CRYPT_OFF && pcrypt != ZIO_CRYPT_OFF)
-		return (SET_ERROR(EINVAL));
-
-	return (0);
-}
-
-
 int
 dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp,
     boolean_t *will_encrypt)
@@ -1805,13 +1770,6 @@ dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp,
 	ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT);
 	ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT);
 
-	/*
-	 * We can't create an unencrypted child of an encrypted parent
-	 * under any circumstances.
-	 */
-	if (crypt == ZIO_CRYPT_OFF && pcrypt != ZIO_CRYPT_OFF)
-		return (SET_ERROR(EINVAL));
-
 	/* check for valid dcp with no encryption (inherited or local) */
 	if (crypt == ZIO_CRYPT_OFF) {
 		/* Must not specify encryption params */
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_encrypted.ksh
index 9d5ecab0dfee..7e5072f0d5fd 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_encrypted.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_encrypted.ksh
@@ -51,10 +51,10 @@
 # yes	unspec	0	1	no	no keyformat specified
 # yes	unspec	1	0	yes	new encryption root, crypt inherited
 # yes	unspec	1	1	yes	new encryption root, crypt inherited
-# yes	off	0	0	no	unencrypted child of encrypted parent
-# yes	off	0	1	no	unencrypted child of encrypted parent
-# yes	off	1	0	no	unencrypted child of encrypted parent
-# yes	off	1	1	no	unencrypted child of encrypted parent
+# yes	off	0	0	yes	unencrypted child of encrypted parent
+# yes	off	0	1	no	keylocation given, but crypt off
+# yes	off	1	0	no	keyformat given, but crypt off
+# yes	off	1	1	no	keyformat given, but crypt off
 # yes	on	0	0	yes	inherited encryption, local crypt
 # yes	on	0	1	no	no keyformat specified for new key
 # yes	on	1	0	yes	new encryption root
@@ -113,7 +113,9 @@ log_must eval "echo $PASSPHRASE | zfs create -o keyformat=passphrase" \
 log_must eval "echo $PASSPHRASE | zfs create -o keyformat=passphrase" \
 	"-o keylocation=prompt $TESTPOOL/$TESTFS2/c4"
 
-log_mustnot zfs create -o encryption=off $TESTPOOL/$TESTFS2/c5
+log_must zfs create -o encryption=off $TESTPOOL/$TESTFS2/c5
+log_must test "$(get_prop 'encryption' $TESTPOOL/$TESTFS2/c5)" == "off"
+
 log_mustnot zfs create -o encryption=off -o keylocation=prompt \
 	$TESTPOOL/$TESTFS2/c5
 log_mustnot zfs create -o encryption=off -o keyformat=passphrase \
@@ -122,13 +124,13 @@ log_mustnot zfs create -o encryption=off -o keyformat=passphrase \
 	-o keylocation=prompt $TESTPOOL/$TESTFS2/c5
 
 log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \
-	"$TESTPOOL/$TESTFS2/c5"
+	"$TESTPOOL/$TESTFS2/c6"
 log_mustnot zfs create -o encryption=on -o keylocation=prompt \
-	$TESTPOOL/$TESTFS2/c6
+	$TESTPOOL/$TESTFS2/c7
 log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \
-	"-o keyformat=passphrase $TESTPOOL/$TESTFS2/c6"
+	"-o keyformat=passphrase $TESTPOOL/$TESTFS2/c7"
 log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \
-	"-o keyformat=passphrase -o keylocation=prompt $TESTPOOL/$TESTFS2/c7"
+	"-o keyformat=passphrase -o keylocation=prompt $TESTPOOL/$TESTFS2/c8"
 
 log_pass "ZFS creates datasets only if they have a valid combination of" \
 	"encryption properties set."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh
index 57896c6fd305..f8e53f02c23d 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh
@@ -46,7 +46,7 @@ function cleanup
 
 log_onexit cleanup
 
-log_assert "ZFS should receive to an encrypted child dataset"
+log_assert "ZFS should receive encrypted filesystems into child dataset"
 
 typeset passphrase="password"
 typeset snap="$TESTPOOL/$TESTFS@snap"
@@ -60,11 +60,13 @@ log_must eval "echo $passphrase | zfs create -o encryption=on" \
 log_note "Verifying ZFS will receive to an encrypted child"
 log_must eval "zfs send $snap | zfs receive $TESTPOOL/$TESTFS1/c1"
 
-log_note "Verifying 'send -p' will not receive to an encrypted child"
-log_mustnot eval "zfs send -p $snap | zfs receive $TESTPOOL/$TESTFS1/c2"
+log_note "Verifying 'send -p' will receive to an encrypted child"
+log_must eval "zfs send -p $snap | zfs receive $TESTPOOL/$TESTFS1/c2"
+log_must test "$(get_prop 'encryption' $TESTPOOL/$TESTFS1/c2)" == "off"
 
-log_note "Verifying 'send -R' will not receive to an encrypted child"
-log_mustnot eval "zfs send -R $snap | zfs receive $TESTPOOL/$TESTFS1/c3"
+log_note "Verifying 'send -R' will receive to an encrypted child"
+log_must eval "zfs send -R $snap | zfs receive $TESTPOOL/$TESTFS1/c3"
+log_must test "$(get_prop 'encryption' $TESTPOOL/$TESTFS1/c3)" == "off"
 
 log_note "Verifying ZFS will not receive to an encrypted child when the" \
 	"parent key is unloaded"
@@ -72,4 +74,4 @@ log_must zfs unmount $TESTPOOL/$TESTFS1
 log_must zfs unload-key $TESTPOOL/$TESTFS1
 log_mustnot eval "zfs send $snap | zfs receive $TESTPOOL/$TESTFS1/c4"
 
-log_pass "ZFS can receive to an encrypted child dataset"
+log_pass "ZFS can receive encrypted filesystems into child dataset"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh
index 400592aaca2c..1b9c6e3c704f 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh
@@ -23,12 +23,13 @@
 
 #
 # DESCRIPTION:
-# 'zfs rename' should not rename an unencrypted dataset to a child
+# 'zfs rename' should be able to move an unencrypted dataset to a child
 # of an encrypted dataset
 #
 # STRATEGY:
 # 1. Create an encrypted dataset
-# 2. Attempt to rename the default dataset to a child of the encrypted dataset
+# 2. Rename the default dataset to a child of the encrypted dataset
+# 3. Confirm the child dataset doesn't have any encryption properties
 #
 
 verify_runnable "both"
@@ -36,16 +37,17 @@ verify_runnable "both"
 function cleanup
 {
 	datasetexists $TESTPOOL/$TESTFS2 && \
-		log_must zfs destroy $TESTPOOL/$TESTFS2
+		log_must zfs destroy -r $TESTPOOL/$TESTFS2
 }
 log_onexit cleanup
 
-log_assert "'zfs rename' should not rename an unencrypted dataset to a" \
+log_assert "'zfs rename' should allow renaming an unencrypted dataset to a" \
 	"child of an encrypted dataset"
 
 log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \
 	"-o keyformat=passphrase -o keylocation=prompt $TESTPOOL/$TESTFS2"
-log_mustnot zfs rename $TESTPOOL/$TESTFS $TESTPOOL/$TESTFS2/$TESTFS
+log_must zfs rename $TESTPOOL/$TESTFS $TESTPOOL/$TESTFS2/$TESTFS
+log_must test "$(get_prop 'encryption' $TESTPOOL/$TESTFS2/$TESTFS)" == "off"
 
-log_pass "'zfs rename' does not rename an unencrypted dataset to a child" \
+log_pass "'zfs rename' allows renaming an unencrypted dataset to a child" \
 	"of an encrypted dataset"

From 9af524b0ee26c821cf412b796ef178e108c5cb10 Mon Sep 17 00:00:00 2001
From: Igor K <igor@dilos.org>
Date: Fri, 21 Jun 2019 04:29:02 +0300
Subject: [PATCH 035/109] Update vdev_ops_t from illumos

Align vdev_ops_t with illumos for better compatibility.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Igor Kozhukhov <igor@dilos.org>
Closes #8925
---
 module/zfs/vdev_disk.c     | 26 ++++++-------
 module/zfs/vdev_file.c     | 52 ++++++++++++-------------
 module/zfs/vdev_indirect.c | 26 ++++++-------
 module/zfs/vdev_mirror.c   | 78 +++++++++++++++++++-------------------
 module/zfs/vdev_missing.c  | 52 ++++++++++++-------------
 module/zfs/vdev_raidz.c    | 26 ++++++-------
 module/zfs/vdev_root.c     | 26 ++++++-------
 7 files changed, 143 insertions(+), 143 deletions(-)

diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 1419ae6ad54a..1686ddfce77d 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -935,19 +935,19 @@ param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
 }
 
 vdev_ops_t vdev_disk_ops = {
-	vdev_disk_open,
-	vdev_disk_close,
-	vdev_default_asize,
-	vdev_disk_io_start,
-	vdev_disk_io_done,
-	NULL,
-	NULL,
-	vdev_disk_hold,
-	vdev_disk_rele,
-	NULL,
-	vdev_default_xlate,
-	VDEV_TYPE_DISK,		/* name of this vdev type */
-	B_TRUE			/* leaf vdev */
+	.vdev_op_open = vdev_disk_open,
+	.vdev_op_close = vdev_disk_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_disk_io_start,
+	.vdev_op_io_done = vdev_disk_io_done,
+	.vdev_op_state_change = NULL,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = vdev_disk_hold,
+	.vdev_op_rele = vdev_disk_rele,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
+	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c
index c155057852a3..b79017f3a610 100644
--- a/module/zfs/vdev_file.c
+++ b/module/zfs/vdev_file.c
@@ -277,19 +277,19 @@ vdev_file_io_done(zio_t *zio)
 }
 
 vdev_ops_t vdev_file_ops = {
-	vdev_file_open,
-	vdev_file_close,
-	vdev_default_asize,
-	vdev_file_io_start,
-	vdev_file_io_done,
-	NULL,
-	NULL,
-	vdev_file_hold,
-	vdev_file_rele,
-	NULL,
-	vdev_default_xlate,
-	VDEV_TYPE_FILE,		/* name of this vdev type */
-	B_TRUE			/* leaf vdev */
+	.vdev_op_open = vdev_file_open,
+	.vdev_op_close = vdev_file_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_file_io_start,
+	.vdev_op_io_done = vdev_file_io_done,
+	.vdev_op_state_change = NULL,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = vdev_file_hold,
+	.vdev_op_rele = vdev_file_rele,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_type = VDEV_TYPE_FILE,		/* name of this vdev type */
+	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 void
@@ -313,19 +313,19 @@ vdev_file_fini(void)
 #ifndef _KERNEL
 
 vdev_ops_t vdev_disk_ops = {
-	vdev_file_open,
-	vdev_file_close,
-	vdev_default_asize,
-	vdev_file_io_start,
-	vdev_file_io_done,
-	NULL,
-	NULL,
-	vdev_file_hold,
-	vdev_file_rele,
-	NULL,
-	vdev_default_xlate,
-	VDEV_TYPE_DISK,		/* name of this vdev type */
-	B_TRUE			/* leaf vdev */
+	.vdev_op_open = vdev_file_open,
+	.vdev_op_close = vdev_file_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_file_io_start,
+	.vdev_op_io_done = vdev_file_io_done,
+	.vdev_op_state_change = NULL,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = vdev_file_hold,
+	.vdev_op_rele = vdev_file_rele,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
+	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 #endif
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 4d18e33c0ab7..4539fa638ada 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -1842,19 +1842,19 @@ vdev_indirect_io_done(zio_t *zio)
 }
 
 vdev_ops_t vdev_indirect_ops = {
-	vdev_indirect_open,
-	vdev_indirect_close,
-	vdev_default_asize,
-	vdev_indirect_io_start,
-	vdev_indirect_io_done,
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	vdev_indirect_remap,
-	NULL,
-	VDEV_TYPE_INDIRECT,	/* name of this vdev type */
-	B_FALSE			/* leaf vdev */
+	.vdev_op_open = vdev_indirect_open,
+	.vdev_op_close = vdev_indirect_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_indirect_io_start,
+	.vdev_op_io_done = vdev_indirect_io_done,
+	.vdev_op_state_change = NULL,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = NULL,
+	.vdev_op_rele = NULL,
+	.vdev_op_remap = vdev_indirect_remap,
+	.vdev_op_xlate = NULL,
+	.vdev_op_type = VDEV_TYPE_INDIRECT,	/* name of this vdev type */
+	.vdev_op_leaf = B_FALSE			/* leaf vdev */
 };
 
 #if defined(_KERNEL)
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 59cc2dcdd2ca..23ff75bfc96f 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -786,51 +786,51 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
 }
 
 vdev_ops_t vdev_mirror_ops = {
-	vdev_mirror_open,
-	vdev_mirror_close,
-	vdev_default_asize,
-	vdev_mirror_io_start,
-	vdev_mirror_io_done,
-	vdev_mirror_state_change,
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	vdev_default_xlate,
-	VDEV_TYPE_MIRROR,	/* name of this vdev type */
-	B_FALSE			/* not a leaf vdev */
+	.vdev_op_open = vdev_mirror_open,
+	.vdev_op_close = vdev_mirror_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_mirror_io_start,
+	.vdev_op_io_done = vdev_mirror_io_done,
+	.vdev_op_state_change = vdev_mirror_state_change,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = NULL,
+	.vdev_op_rele = NULL,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_type = VDEV_TYPE_MIRROR,	/* name of this vdev type */
+	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 vdev_ops_t vdev_replacing_ops = {
-	vdev_mirror_open,
-	vdev_mirror_close,
-	vdev_default_asize,
-	vdev_mirror_io_start,
-	vdev_mirror_io_done,
-	vdev_mirror_state_change,
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	vdev_default_xlate,
-	VDEV_TYPE_REPLACING,	/* name of this vdev type */
-	B_FALSE			/* not a leaf vdev */
+	.vdev_op_open = vdev_mirror_open,
+	.vdev_op_close = vdev_mirror_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_mirror_io_start,
+	.vdev_op_io_done = vdev_mirror_io_done,
+	.vdev_op_state_change = vdev_mirror_state_change,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = NULL,
+	.vdev_op_rele = NULL,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_type = VDEV_TYPE_REPLACING,	/* name of this vdev type */
+	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 vdev_ops_t vdev_spare_ops = {
-	vdev_mirror_open,
-	vdev_mirror_close,
-	vdev_default_asize,
-	vdev_mirror_io_start,
-	vdev_mirror_io_done,
-	vdev_mirror_state_change,
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	vdev_default_xlate,
-	VDEV_TYPE_SPARE,	/* name of this vdev type */
-	B_FALSE			/* not a leaf vdev */
+	.vdev_op_open = vdev_mirror_open,
+	.vdev_op_close = vdev_mirror_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_mirror_io_start,
+	.vdev_op_io_done = vdev_mirror_io_done,
+	.vdev_op_state_change = vdev_mirror_state_change,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = NULL,
+	.vdev_op_rele = NULL,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_type = VDEV_TYPE_SPARE,	/* name of this vdev type */
+	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 #if defined(_KERNEL)
diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c
index d85993bff052..205b23eba7f5 100644
--- a/module/zfs/vdev_missing.c
+++ b/module/zfs/vdev_missing.c
@@ -80,33 +80,33 @@ vdev_missing_io_done(zio_t *zio)
 }
 
 vdev_ops_t vdev_missing_ops = {
-	vdev_missing_open,
-	vdev_missing_close,
-	vdev_default_asize,
-	vdev_missing_io_start,
-	vdev_missing_io_done,
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	VDEV_TYPE_MISSING,	/* name of this vdev type */
-	B_TRUE			/* leaf vdev */
+	.vdev_op_open = vdev_missing_open,
+	.vdev_op_close = vdev_missing_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_missing_io_start,
+	.vdev_op_io_done = vdev_missing_io_done,
+	.vdev_op_state_change = NULL,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = NULL,
+	.vdev_op_rele = NULL,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = NULL,
+	.vdev_op_type = VDEV_TYPE_MISSING,	/* name of this vdev type */
+	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 vdev_ops_t vdev_hole_ops = {
-	vdev_missing_open,
-	vdev_missing_close,
-	vdev_default_asize,
-	vdev_missing_io_start,
-	vdev_missing_io_done,
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	VDEV_TYPE_HOLE,		/* name of this vdev type */
-	B_TRUE			/* leaf vdev */
+	.vdev_op_open = vdev_missing_open,
+	.vdev_op_close = vdev_missing_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_missing_io_start,
+	.vdev_op_io_done = vdev_missing_io_done,
+	.vdev_op_state_change = NULL,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = NULL,
+	.vdev_op_rele = NULL,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = NULL,
+	.vdev_op_type = VDEV_TYPE_HOLE,		/* name of this vdev type */
+	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 215cd1c12064..327b186713fa 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -2403,17 +2403,17 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
 }
 
 vdev_ops_t vdev_raidz_ops = {
-	vdev_raidz_open,
-	vdev_raidz_close,
-	vdev_raidz_asize,
-	vdev_raidz_io_start,
-	vdev_raidz_io_done,
-	vdev_raidz_state_change,
-	vdev_raidz_need_resilver,
-	NULL,
-	NULL,
-	NULL,
-	vdev_raidz_xlate,
-	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
-	B_FALSE			/* not a leaf vdev */
+	.vdev_op_open = vdev_raidz_open,
+	.vdev_op_close = vdev_raidz_close,
+	.vdev_op_asize = vdev_raidz_asize,
+	.vdev_op_io_start = vdev_raidz_io_start,
+	.vdev_op_io_done = vdev_raidz_io_done,
+	.vdev_op_state_change = vdev_raidz_state_change,
+	.vdev_op_need_resilver = vdev_raidz_need_resilver,
+	.vdev_op_hold = NULL,
+	.vdev_op_rele = NULL,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_raidz_xlate,
+	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
+	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c
index e40b7ce8e4e8..7170f7013608 100644
--- a/module/zfs/vdev_root.c
+++ b/module/zfs/vdev_root.c
@@ -140,17 +140,17 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
 }
 
 vdev_ops_t vdev_root_ops = {
-	vdev_root_open,
-	vdev_root_close,
-	vdev_default_asize,
-	NULL,			/* io_start - not applicable to the root */
-	NULL,			/* io_done - not applicable to the root */
-	vdev_root_state_change,
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	VDEV_TYPE_ROOT,		/* name of this vdev type */
-	B_FALSE			/* not a leaf vdev */
+	.vdev_op_open = vdev_root_open,
+	.vdev_op_close = vdev_root_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = NULL,	/* not applicable to the root */
+	.vdev_op_io_done = NULL,	/* not applicable to the root */
+	.vdev_op_state_change = vdev_root_state_change,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = NULL,
+	.vdev_op_rele = NULL,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = NULL,
+	.vdev_op_type = VDEV_TYPE_ROOT,	/* name of this vdev type */
+	.vdev_op_leaf = B_FALSE		/* not a leaf vdev */
 };

From 3c2a42fd254917db78484c428bd317ec7189c968 Mon Sep 17 00:00:00 2001
From: Matthew Ahrens <mahrens@delphix.com>
Date: Thu, 20 Jun 2019 18:30:40 -0700
Subject: [PATCH 036/109] dedup=verify doesn't clear the blkptr's dedup flag

The logic to handle strong checksum collisions where the data doesn't
match is incorrect. It is not clearing the dedup bit of the blkptr,
which can cause a panic later in zio_ddt_free() due to the dedup table
not matching what is in the blkptr.

Reviewed-by: Tom Caputi <tcaputi@datto.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-48097
Closes #8936
---
 module/zfs/zio.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 5bfff37eb3b5..f9503bd3ff81 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3192,7 +3192,9 @@ zio_ddt_write(zio_t *zio)
 			BP_ZERO(bp);
 		} else {
 			zp->zp_dedup = B_FALSE;
+			BP_SET_DEDUP(bp, B_FALSE);
 		}
+		ASSERT(!BP_GET_DEDUP(bp));
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 		ddt_exit(ddt);
 		return (zio);

From ab24c9cd4cbba2c4d5cb68f3e1e08dcf2275dc34 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Fri, 21 Jun 2019 10:31:53 +0900
Subject: [PATCH 037/109] Prevent pointer to an out-of-scope local variable

`show_str` could point to a local variable on the stack
that is already out of scope by the time
`return (snprintf(buf, buflen, "%s\n", show_str));`
is executed.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #8924
Closes #8940
---
 module/zfs/zfs_sysfs.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/module/zfs/zfs_sysfs.c b/module/zfs/zfs_sysfs.c
index 2f5bea9aa996..bb7f3b69a662 100644
--- a/module/zfs/zfs_sysfs.c
+++ b/module/zfs/zfs_sysfs.c
@@ -264,6 +264,7 @@ zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property,
     char *buf, size_t buflen)
 {
 	const char *show_str;
+	char number[32];
 
 	/* For dataset properties list the dataset types that apply */
 	if (strcmp(attr_name, "datasets") == 0 &&
@@ -291,8 +292,6 @@ zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property,
 	} else if (strcmp(attr_name, "values") == 0) {
 		show_str = property->pd_values ? property->pd_values : "";
 	} else if (strcmp(attr_name, "default") == 0) {
-		char number[32];
-
 		switch (property->pd_proptype) {
 		case PROP_TYPE_NUMBER:
 			(void) snprintf(number, sizeof (number), "%llu",

From 1fd28bd8d4e102a4ce5e4910427f612c7cf73e68 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <pcd@delphix.com>
Date: Fri, 21 Jun 2019 09:40:56 -0700
Subject: [PATCH 038/109] Add SCSI_PASSTHROUGH to zvols to enable UNMAP support

When exporting ZVOLs as SCSI LUNs, by default Windows will not
issue them UNMAP commands. This reduces storage efficiency in
many cases.

We add the SCSI_PASSTHROUGH flag to the zvol's device queue,
which lets the SCSI target logic know that it can handle SCSI
commands.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: John Gallagher <john.gallagher@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #8933
---
 module/zfs/zvol.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index c29f65f676b9..7c7500dbaaf7 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -1876,6 +1876,10 @@ zvol_create_minor_impl(const char *name)
 #ifdef QUEUE_FLAG_ADD_RANDOM
 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue);
 #endif
+	/* This flag was introduced in kernel version 4.12. */
+#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
+	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_queue);
+#endif
 
 	if (spa_writeable(dmu_objset_spa(os))) {
 		if (zil_replay_disable)

From 7a5f4656ce76dbb2c7f3c6810f670a713da48a9e Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <pcd@delphix.com>
Date: Sat, 22 Jun 2019 16:32:26 -0700
Subject: [PATCH 039/109] Fix comments on zfs_bookmark_phys

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #8945
---
 include/sys/dsl_bookmark.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/sys/dsl_bookmark.h b/include/sys/dsl_bookmark.h
index 3cdad7441407..ea7d70cf3232 100644
--- a/include/sys/dsl_bookmark.h
+++ b/include/sys/dsl_bookmark.h
@@ -37,9 +37,11 @@ typedef struct zfs_bookmark_phys {
 	uint64_t zbm_creation_txg;	/* birth transaction group */
 	uint64_t zbm_creation_time;	/* bookmark creation time */
 
-	/* the following fields are reserved for redacted send / recv */
+	/* fields used for redacted send / recv */
 	uint64_t zbm_redaction_obj;	/* redaction list object */
 	uint64_t zbm_flags;		/* ZBM_FLAG_* */
+
+	/* fields used for bookmark written size */
 	uint64_t zbm_referenced_bytes_refd;
 	uint64_t zbm_compressed_bytes_refd;
 	uint64_t zbm_uncompressed_bytes_refd;

From d053481523b369b7c00f5fd1c1b1ae54876b8f69 Mon Sep 17 00:00:00 2001
From: Allan Jude <allanjude@freebsd.org>
Date: Sat, 22 Jun 2019 19:33:44 -0400
Subject: [PATCH 040/109] zstreamdump: add per-record-type counters and an
 overhead counter

Count the bytes of payload for each replication record type.

Count the bytes of overhead (replication records themselves).

Include these counters in the output summary at the end of the run.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Signed-off-by: Allan Jude <allanjude@freebsd.org>
Sponsored-By: Klara Systems and Catalogic
Closes #8432
---
 cmd/zstreamdump/zstreamdump.c                 | 63 ++++++++++++-------
 .../tests/functional/rsend/rsend.kshlib       |  2 +-
 2 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/cmd/zstreamdump/zstreamdump.c b/cmd/zstreamdump/zstreamdump.c
index a162eceda58f..a65b4cef3d31 100644
--- a/cmd/zstreamdump/zstreamdump.c
+++ b/cmd/zstreamdump/zstreamdump.c
@@ -53,7 +53,6 @@
  */
 #define	DUMP_GROUPING	4
 
-uint64_t total_write_size = 0;
 uint64_t total_stream_len = 0;
 FILE *send_stream = 0;
 boolean_t do_byteswap = B_FALSE;
@@ -219,6 +218,9 @@ main(int argc, char *argv[])
 {
 	char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
 	uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
+	uint64_t total_payload_size = 0;
+	uint64_t total_overhead_size = 0;
+	uint64_t drr_byte_count[DRR_NUMTYPES] = { 0 };
 	char salt[ZIO_DATA_SALT_LEN * 2 + 1];
 	char iv[ZIO_DATA_IV_LEN * 2 + 1];
 	char mac[ZIO_DATA_MAC_LEN * 2 + 1];
@@ -336,7 +338,9 @@ main(int argc, char *argv[])
 		}
 
 		drr_record_count[drr->drr_type]++;
+		total_overhead_size += sizeof (*drr);
 		total_records++;
+		payload_size = 0;
 
 		switch (drr->drr_type) {
 		case DRR_BEGIN:
@@ -390,6 +394,7 @@ main(int argc, char *argv[])
 					nvlist_print(stdout, nv);
 					nvlist_free(nv);
 				}
+				payload_size = sz;
 			}
 			break;
 
@@ -554,7 +559,6 @@ main(int argc, char *argv[])
 			if (dump) {
 				print_block(buf, payload_size);
 			}
-			total_write_size += payload_size;
 			break;
 
 		case DRR_WRITE_BYREF:
@@ -683,6 +687,7 @@ main(int argc, char *argv[])
 				print_block(buf,
 				    P2ROUNDUP(drrwe->drr_psize, 8));
 			}
+			payload_size = P2ROUNDUP(drrwe->drr_psize, 8);
 			break;
 		case DRR_OBJECT_RANGE:
 			if (do_byteswap) {
@@ -723,6 +728,8 @@ main(int argc, char *argv[])
 			    (longlong_t)drrc->drr_checksum.zc_word[3]);
 		}
 		pcksum = zc;
+		drr_byte_count[drr->drr_type] += payload_size;
+		total_payload_size += payload_size;
 	}
 	free(buf);
 	fletcher_4_fini();
@@ -730,28 +737,40 @@ main(int argc, char *argv[])
 	/* Print final summary */
 
 	(void) printf("SUMMARY:\n");
-	(void) printf("\tTotal DRR_BEGIN records = %lld\n",
-	    (u_longlong_t)drr_record_count[DRR_BEGIN]);
-	(void) printf("\tTotal DRR_END records = %lld\n",
-	    (u_longlong_t)drr_record_count[DRR_END]);
-	(void) printf("\tTotal DRR_OBJECT records = %lld\n",
-	    (u_longlong_t)drr_record_count[DRR_OBJECT]);
-	(void) printf("\tTotal DRR_FREEOBJECTS records = %lld\n",
-	    (u_longlong_t)drr_record_count[DRR_FREEOBJECTS]);
-	(void) printf("\tTotal DRR_WRITE records = %lld\n",
-	    (u_longlong_t)drr_record_count[DRR_WRITE]);
-	(void) printf("\tTotal DRR_WRITE_BYREF records = %lld\n",
-	    (u_longlong_t)drr_record_count[DRR_WRITE_BYREF]);
-	(void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld\n",
-	    (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED]);
-	(void) printf("\tTotal DRR_FREE records = %lld\n",
-	    (u_longlong_t)drr_record_count[DRR_FREE]);
-	(void) printf("\tTotal DRR_SPILL records = %lld\n",
-	    (u_longlong_t)drr_record_count[DRR_SPILL]);
+	(void) printf("\tTotal DRR_BEGIN records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_BEGIN],
+	    (u_longlong_t)drr_byte_count[DRR_BEGIN]);
+	(void) printf("\tTotal DRR_END records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_END],
+	    (u_longlong_t)drr_byte_count[DRR_END]);
+	(void) printf("\tTotal DRR_OBJECT records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_OBJECT],
+	    (u_longlong_t)drr_byte_count[DRR_OBJECT]);
+	(void) printf("\tTotal DRR_FREEOBJECTS records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_FREEOBJECTS],
+	    (u_longlong_t)drr_byte_count[DRR_FREEOBJECTS]);
+	(void) printf("\tTotal DRR_WRITE records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_WRITE],
+	    (u_longlong_t)drr_byte_count[DRR_WRITE]);
+	(void) printf("\tTotal DRR_WRITE_BYREF records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_WRITE_BYREF],
+	    (u_longlong_t)drr_byte_count[DRR_WRITE_BYREF]);
+	(void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld (%llu "
+	    "bytes)\n", (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED],
+	    (u_longlong_t)drr_byte_count[DRR_WRITE_EMBEDDED]);
+	(void) printf("\tTotal DRR_FREE records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_FREE],
+	    (u_longlong_t)drr_byte_count[DRR_FREE]);
+	(void) printf("\tTotal DRR_SPILL records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_SPILL],
+	    (u_longlong_t)drr_byte_count[DRR_SPILL]);
 	(void) printf("\tTotal records = %lld\n",
 	    (u_longlong_t)total_records);
-	(void) printf("\tTotal write size = %lld (0x%llx)\n",
-	    (u_longlong_t)total_write_size, (u_longlong_t)total_write_size);
+	(void) printf("\tTotal payload size = %lld (0x%llx)\n",
+	    (u_longlong_t)total_payload_size, (u_longlong_t)total_payload_size);
+	(void) printf("\tTotal header overhead = %lld (0x%llx)\n",
+	    (u_longlong_t)total_overhead_size,
+	    (u_longlong_t)total_overhead_size);
 	(void) printf("\tTotal stream length = %lld (0x%llx)\n",
 	    (u_longlong_t)total_stream_len, (u_longlong_t)total_stream_len);
 	return (0);
diff --git a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib
index 521a1c7eb63c..8737ae55abfa 100644
--- a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib
+++ b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib
@@ -754,7 +754,7 @@ function verify_stream_size
 	datasetexists $ds || log_fail "No such dataset: $ds"
 
 	typeset stream_size=$(cat $stream | zstreamdump | sed -n \
-	    's/	Total write size = \(.*\) (0x.*)/\1/p')
+	    's/	Total payload size = \(.*\) (0x.*)/\1/p')
 
 	typeset inc_size=0
 	if [[ -n $inc_src ]]; then

From 95fcb04215015950b3388ba0a6edad8e1b463415 Mon Sep 17 00:00:00 2001
From: Don Brady <don.brady@delphix.com>
Date: Sat, 22 Jun 2019 16:41:21 -0700
Subject: [PATCH 041/109] Let zfs mount all tolerate in-progress mounts

The zfs-mount service can unexpectedly fail to start when zfs
encounters a mount that is in progress. This service uses
zfs mount -a, which has a window between the time it checks if
the dataset was mounted and when the actual mount (via mount.zfs
binary) occurs.

The reason for the racing mounts is that both zfs-mount.target
and zfs-share.target are allowed to execute concurrently after
the import.  This is more of an issue with the relatively recent
addition of parallel mounting, and we should consider serializing
the mount and share targets.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Allan Jude <allanjude@freebsd.org>
Signed-off-by: Don Brady <don.brady@delphix.com>
Closes #8881
---
 cmd/zfs/zfs_main.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index 214a437c5dd1..074216055227 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -6446,8 +6446,25 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
 			return (1);
 		}
 
-		if (zfs_mount(zhp, options, flags) != 0)
+		if (zfs_mount(zhp, options, flags) != 0) {
+			/*
+			 * Check if a mount sneaked in after we checked
+			 */
+			if (!explicit &&
+			    libzfs_errno(g_zfs) == EZFS_MOUNTFAILED) {
+				usleep(10 * MILLISEC);
+				libzfs_mnttab_cache(g_zfs, B_FALSE);
+
+				if (zfs_is_mounted(zhp, NULL)) {
+					(void) fprintf(stderr, gettext(
+					    "Ignoring previous 'already "
+					    "mounted' error for '%s'\n"),
+					    zfs_get_name(zhp));
+					return (0);
+				}
+			}
 			return (1);
+		}
 		break;
 	}
 

From 2d88230d97d9f9f4f3b89d1081eeab86fe3d9373 Mon Sep 17 00:00:00 2001
From: Harry Mallon <1816667+hjmallon@users.noreply.github.com>
Date: Sun, 23 Jun 2019 00:43:11 +0100
Subject: [PATCH 042/109] Add libnvpair to libzfs pkg-config

Functions such as `fnvlist_lookup_nvlist` need libnvpair to be linked.
The default pkg-config file did not include it.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Harry Mallon <hjmallon@gmail.com>
Closes #8919
---
 lib/libzfs/libzfs.pc.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/libzfs/libzfs.pc.in b/lib/libzfs/libzfs.pc.in
index 0e83f7a64be0..1122401a6eb9 100644
--- a/lib/libzfs/libzfs.pc.in
+++ b/lib/libzfs/libzfs.pc.in
@@ -9,4 +9,4 @@ Version: @VERSION@
 URL: http://zfsonlinux.org
 Requires: libzfs_core
 Cflags: -I${includedir}/libzfs -I${includedir}/libspl
-Libs: -L${libdir} -lzfs
+Libs: -L${libdir} -lzfs -lnvpair

From be4a282a8ddc6bf42ec1ba9c3b99d06052f1d625 Mon Sep 17 00:00:00 2001
From: gordan-bobic <gordan.bobic@gmail.com>
Date: Sun, 23 Jun 2019 00:47:19 +0100
Subject: [PATCH 043/109] Remove arch and relax version dependency

Remove arch and relax version dependency for zfs-dracut
package.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Gordan Bobic <gordan@redsleeve.org>
Issue #8913
Closes #8914
---
 rpm/generic/zfs.spec.in | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
index 9faa3ba771a1..0b16cd0e886b 100644
--- a/rpm/generic/zfs.spec.in
+++ b/rpm/generic/zfs.spec.in
@@ -255,7 +255,8 @@ validating the file system.
 %package dracut
 Summary:        Dracut module
 Group:          System Environment/Kernel
-Requires:       %{name}%{?_isa} = %{version}-%{release}
+BuildArch:	noarch
+Requires:       %{name} >= %{version}
 Requires:       dracut
 Requires:       /usr/bin/awk
 Requires:       grep

From 7d64595c251682f4a38809ecd44e81b4d1af8b74 Mon Sep 17 00:00:00 2001
From: Matthew Ahrens <mahrens@delphix.com>
Date: Sat, 22 Jun 2019 16:48:54 -0700
Subject: [PATCH 044/109] dn_struct_rwlock can not be held in
 dmu_tx_try_assign()

The thread calling dmu_tx_try_assign() can't hold the dn_struct_rwlock
while assigning the tx, because this can lead to deadlock. Specifically,
if this dnode is already assigned to an earlier txg, this thread may
need to wait for that txg to sync (the ERESTART case below).  The other
thread that has assigned this dnode to an earlier txg prevents this txg
from syncing until its tx can complete (calling dmu_tx_commit()), but it
may need to acquire the dn_struct_rwlock to do so (e.g. via
dmu_buf_hold*()).

This commit adds an assertion to dmu_tx_try_assign() to ensure that this
deadlock is not inadvertently introduced.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #8929
---
 module/zfs/dmu_tx.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index cbadcc86fc61..7d65e842ff03 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -925,6 +925,25 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 	    txh = list_next(&tx->tx_holds, txh)) {
 		dnode_t *dn = txh->txh_dnode;
 		if (dn != NULL) {
+			/*
+			 * This thread can't hold the dn_struct_rwlock
+			 * while assigning the tx, because this can lead to
+			 * deadlock. Specifically, if this dnode is already
+			 * assigned to an earlier txg, this thread may need
+			 * to wait for that txg to sync (the ERESTART case
+			 * below).  The other thread that has assigned this
+			 * dnode to an earlier txg prevents this txg from
+			 * syncing until its tx can complete (calling
+			 * dmu_tx_commit()), but it may need to acquire the
+			 * dn_struct_rwlock to do so (e.g. via
+			 * dmu_buf_hold*()).
+			 *
+			 * Note that this thread can't hold the lock for
+			 * read either, but the rwlock doesn't record
+			 * enough information to make that assertion.
+			 */
+			ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
 			mutex_enter(&dn->dn_mtx);
 			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
 				mutex_exit(&dn->dn_mtx);

From cc7fe8a59967092a9b42355794a1859feb30548f Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Mon, 24 Jun 2019 09:32:47 -0700
Subject: [PATCH 045/109] Fix out-of-tree build failures

Resolve the incorrect use of srcdir and builddir references for
various files in the build system.  These have crept in over time
and went unnoticed because when building in the top level directory
srcdir and builddir are identical.

With this change it's again possible to build in a subdirectory.

    $ mkdir obj
    $ cd obj
    $ ../configure
    $ make

Reviewed-by: loli10K <ezomori.nozomu@gmail.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Don Brady <don.brady@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8921
Closes #8943
---
 Makefile.am                                   |  3 +-
 cmd/zed/Makefile.am                           | 57 +------------------
 cmd/zed/zed.d/Makefile.am                     | 57 +++++++++++++++++++
 configure.ac                                  |  1 +
 contrib/initramfs/Makefile.am                 | 21 ++++---
 contrib/pyzfs/Makefile.am                     |  2 +-
 module/Makefile.in                            |  5 +-
 scripts/Makefile.am                           |  5 +-
 tests/runfiles/Makefile.am                    |  5 +-
 .../tests/functional/checksum/Makefile.am     |  2 +-
 .../tests/functional/hkdf/Makefile.am         |  2 +-
 11 files changed, 88 insertions(+), 72 deletions(-)
 create mode 100644 cmd/zed/zed.d/Makefile.am

diff --git a/Makefile.am b/Makefile.am
index 1ec2514922a9..9afe22954101 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -52,7 +52,8 @@ distclean-local::
 		-type f -print | xargs $(RM)
 
 all-local:
-	-${top_srcdir}/scripts/zfs-tests.sh -c
+	-[ -x ${top_builddir}/scripts/zfs-tests.sh ] && \
+	    ${top_builddir}/scripts/zfs-tests.sh -c
 
 dist-hook: gitrev
 	cp ${top_srcdir}/include/zfs_gitrev.h $(distdir)/include; \
diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am
index 9c11315f2a58..fb479f9b5c79 100644
--- a/cmd/zed/Makefile.am
+++ b/cmd/zed/Makefile.am
@@ -1,12 +1,11 @@
+SUBDIRS = zed.d
+
 include $(top_srcdir)/config/Rules.am
 
 DEFAULT_INCLUDES += \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/lib/libspl/include
 
-EXTRA_DIST = zed.d/README \
-	zed.d/history_event-zfs-list-cacher.sh.in
-
 sbin_PROGRAMS = zed
 
 ZED_SRC = \
@@ -47,55 +46,3 @@ zed_LDADD = \
 
 zed_LDADD += -lrt
 zed_LDFLAGS = -pthread
-
-zedconfdir = $(sysconfdir)/zfs/zed.d
-
-dist_zedconf_DATA = \
-	zed.d/zed-functions.sh \
-	zed.d/zed.rc
-
-zedexecdir = $(zfsexecdir)/zed.d
-
-dist_zedexec_SCRIPTS = \
-	zed.d/all-debug.sh \
-	zed.d/all-syslog.sh \
-	zed.d/data-notify.sh \
-	zed.d/generic-notify.sh \
-	zed.d/resilver_finish-notify.sh \
-	zed.d/scrub_finish-notify.sh \
-	zed.d/statechange-led.sh \
-	zed.d/statechange-notify.sh \
-	zed.d/vdev_clear-led.sh \
-	zed.d/vdev_attach-led.sh \
-	zed.d/pool_import-led.sh \
-	zed.d/resilver_finish-start-scrub.sh
-
-nodist_zedexec_SCRIPTS = zed.d/history_event-zfs-list-cacher.sh
-
-$(nodist_zedexec_SCRIPTS): %: %.in
-	-$(SED) -e 's,@bindir\@,$(bindir),g' \
-		-e 's,@runstatedir\@,$(runstatedir),g' \
-		-e 's,@sbindir\@,$(sbindir),g' \
-		-e 's,@sysconfdir\@,$(sysconfdir),g' \
-		$< >'$@'
-
-zedconfdefaults = \
-	all-syslog.sh \
-	data-notify.sh \
-	resilver_finish-notify.sh \
-	scrub_finish-notify.sh \
-	statechange-led.sh \
-	statechange-notify.sh \
-	vdev_clear-led.sh \
-	vdev_attach-led.sh \
-	pool_import-led.sh \
-	resilver_finish-start-scrub.sh
-
-install-data-hook:
-	$(MKDIR_P) "$(DESTDIR)$(zedconfdir)"
-	for f in $(zedconfdefaults); do \
-	  test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \
-	       -L "$(DESTDIR)$(zedconfdir)/$${f}" || \
-	    ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \
-	done
-	chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc"
diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am
new file mode 100644
index 000000000000..716db2b2f215
--- /dev/null
+++ b/cmd/zed/zed.d/Makefile.am
@@ -0,0 +1,57 @@
+include $(top_srcdir)/config/Rules.am
+
+EXTRA_DIST = \
+	README \
+	history_event-zfs-list-cacher.sh.in
+
+zedconfdir = $(sysconfdir)/zfs/zed.d
+
+dist_zedconf_DATA = \
+	zed-functions.sh \
+	zed.rc
+
+zedexecdir = $(zfsexecdir)/zed.d
+
+dist_zedexec_SCRIPTS = \
+	all-debug.sh \
+	all-syslog.sh \
+	data-notify.sh \
+	generic-notify.sh \
+	resilver_finish-notify.sh \
+	scrub_finish-notify.sh \
+	statechange-led.sh \
+	statechange-notify.sh \
+	vdev_clear-led.sh \
+	vdev_attach-led.sh \
+	pool_import-led.sh \
+	resilver_finish-start-scrub.sh
+
+nodist_zedexec_SCRIPTS = history_event-zfs-list-cacher.sh
+
+$(nodist_zedexec_SCRIPTS): %: %.in
+	-$(SED) -e 's,@bindir\@,$(bindir),g' \
+		-e 's,@runstatedir\@,$(runstatedir),g' \
+		-e 's,@sbindir\@,$(sbindir),g' \
+		-e 's,@sysconfdir\@,$(sysconfdir),g' \
+		$< >'$@'
+
+zedconfdefaults = \
+	all-syslog.sh \
+	data-notify.sh \
+	resilver_finish-notify.sh \
+	scrub_finish-notify.sh \
+	statechange-led.sh \
+	statechange-notify.sh \
+	vdev_clear-led.sh \
+	vdev_attach-led.sh \
+	pool_import-led.sh \
+	resilver_finish-start-scrub.sh
+
+install-data-hook:
+	$(MKDIR_P) "$(DESTDIR)$(zedconfdir)"
+	for f in $(zedconfdefaults); do \
+	  test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \
+	       -L "$(DESTDIR)$(zedconfdir)/$${f}" || \
+	    ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \
+	done
+	chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc"
diff --git a/configure.ac b/configure.ac
index db614084e37e..ea2e355c70bf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -120,6 +120,7 @@ AC_CONFIG_FILES([
 	cmd/dbufstat/Makefile
 	cmd/arc_summary/Makefile
 	cmd/zed/Makefile
+	cmd/zed/zed.d/Makefile
 	cmd/raidz_test/Makefile
 	cmd/zgenhostid/Makefile
 	contrib/Makefile
diff --git a/contrib/initramfs/Makefile.am b/contrib/initramfs/Makefile.am
index 87ec7a86f5ac..9f912d946649 100644
--- a/contrib/initramfs/Makefile.am
+++ b/contrib/initramfs/Makefile.am
@@ -11,13 +11,18 @@ EXTRA_DIST = \
 	$(top_srcdir)/contrib/initramfs/README.initramfs.markdown
 
 install-initrdSCRIPTS: $(EXTRA_DIST)
-	for d in conf.d conf-hooks.d hooks scripts scripts/local-top; do \
-	  $(MKDIR_P) $(DESTDIR)$(initrddir)/$$d; \
-	  cp $(top_srcdir)/contrib/initramfs/$$d/zfs \
-	    $(DESTDIR)$(initrddir)/$$d/; \
+	for d in conf.d conf-hooks.d scripts/local-top; do \
+		$(MKDIR_P) $(DESTDIR)$(initrddir)/$$d; \
+		cp $(top_srcdir)/contrib/initramfs/$$d/zfs \
+		    $(DESTDIR)$(initrddir)/$$d/; \
 	done
-	if [ -f etc/init.d/zfs ]; then \
-	  $(MKDIR_P) $(DESTDIR)$(DEFAULT_INITCONF_DIR); \
-	  cp $(top_srcdir)/etc/init.d/zfs \
-	    $(DESTDIR)$(DEFAULT_INITCONF_DIR)/; \
+	for d in hooks scripts; do \
+		$(MKDIR_P) $(DESTDIR)$(initrddir)/$$d; \
+		cp $(top_builddir)/contrib/initramfs/$$d/zfs \
+		    $(DESTDIR)$(initrddir)/$$d/; \
+	done
+	if [ -f $(top_builddir)/etc/init.d/zfs ]; then \
+		$(MKDIR_P) $(DESTDIR)$(DEFAULT_INITCONF_DIR); \
+		cp $(top_builddir)/etc/init.d/zfs \
+		    $(DESTDIR)$(DEFAULT_INITCONF_DIR)/; \
 	fi
diff --git a/contrib/pyzfs/Makefile.am b/contrib/pyzfs/Makefile.am
index 1549bf237932..fa1bb32ce2eb 100644
--- a/contrib/pyzfs/Makefile.am
+++ b/contrib/pyzfs/Makefile.am
@@ -24,7 +24,7 @@ all-local:
 # files are later created by manually loading the Python modules.
 #
 install-exec-local:
-	$(PYTHON) $(srcdir)/setup.py install \
+	$(PYTHON) $(builddir)/setup.py install \
 	--prefix $(prefix) \
 	--root $(DESTDIR)/ \
 	--install-lib $(pythonsitedir) \
diff --git a/module/Makefile.in b/module/Makefile.in
index 935bd2663062..eca7691aedbb 100644
--- a/module/Makefile.in
+++ b/module/Makefile.in
@@ -66,8 +66,9 @@ modules_uninstall:
 
 distdir:
 	list='$(subdir-m)'; for subdir in $$list; do \
-		(cd @top_srcdir@/module && find $$subdir -name '*.c' -o -name '*.h' -o -name '*.S' |\
-		xargs cp --parents -t $$distdir); \
+		(cd @top_srcdir@/module && find $$subdir \
+		-name '*.c' -o -name '*.h' -o -name '*.S' | \
+		xargs cp --parents -t @abs_top_builddir@/module/$$distdir); \
 	done
 
 distclean maintainer-clean: clean
diff --git a/scripts/Makefile.am b/scripts/Makefile.am
index 11e963c527a8..d275a41c4e04 100644
--- a/scripts/Makefile.am
+++ b/scripts/Makefile.am
@@ -60,7 +60,7 @@ all-local:
 		-e '\|^export SBIN_DIR=|s|$$|@abs_top_builddir@/bin|' \
 		-e '\|^export ZTS_DIR=|s|$$|@abs_top_srcdir@/tests|' \
 		-e '\|^export SCRIPT_DIR=|s|$$|@abs_top_srcdir@/scripts|' \
-		common.sh.in >common.sh
+		$(abs_top_srcdir)/scripts/common.sh.in >common.sh
 	-echo "$$EXTRA_ENVIRONMENT" >>common.sh
 
 clean-local:
@@ -71,4 +71,5 @@ install-data-hook:
 		-e '\|^export SBIN_DIR=|s|$$|@sbindir@|' \
 		-e '\|^export ZTS_DIR=|s|$$|@datadir@/@PACKAGE@|' \
 		-e '\|^export SCRIPT_DIR=|s|$$|@datadir@/@PACKAGE@|' \
-		common.sh.in >$(DESTDIR)$(datadir)/@PACKAGE@/common.sh
+		$(abs_top_srcdir)/scripts/common.sh.in \
+		>$(DESTDIR)$(datadir)/@PACKAGE@/common.sh
diff --git a/tests/runfiles/Makefile.am b/tests/runfiles/Makefile.am
index 138d905a5722..4625806ff8ba 100644
--- a/tests/runfiles/Makefile.am
+++ b/tests/runfiles/Makefile.am
@@ -1,2 +1,5 @@
 pkgdatadir = $(datadir)/@PACKAGE@/runfiles
-dist_pkgdata_DATA = *.run
+dist_pkgdata_DATA = \
+	linux.run \
+	longevity.run \
+	perf-regression.run
diff --git a/tests/zfs-tests/tests/functional/checksum/Makefile.am b/tests/zfs-tests/tests/functional/checksum/Makefile.am
index f72546b22590..905d991ed75f 100644
--- a/tests/zfs-tests/tests/functional/checksum/Makefile.am
+++ b/tests/zfs-tests/tests/functional/checksum/Makefile.am
@@ -1,7 +1,7 @@
 include $(top_srcdir)/config/Rules.am
 
 AM_CPPFLAGS += -I$(top_srcdir)/include
-LDADD = $(top_srcdir)/lib/libicp/libicp.la
+LDADD = $(top_builddir)/lib/libicp/libicp.la
 
 AUTOMAKE_OPTIONS = subdir-objects
 
diff --git a/tests/zfs-tests/tests/functional/hkdf/Makefile.am b/tests/zfs-tests/tests/functional/hkdf/Makefile.am
index 3ac26ed21c16..b54e353cd963 100644
--- a/tests/zfs-tests/tests/functional/hkdf/Makefile.am
+++ b/tests/zfs-tests/tests/functional/hkdf/Makefile.am
@@ -2,7 +2,7 @@ include $(top_srcdir)/config/Rules.am
 
 AM_CPPFLAGS += -I$(top_srcdir)/include
 AM_CPPFLAGS += -I$(top_srcdir)/lib/libspl/include
-LDADD = $(top_srcdir)/lib/libzpool/libzpool.la
+LDADD = $(top_builddir)/lib/libzpool/libzpool.la
 
 AUTOMAKE_OPTIONS = subdir-objects
 

From bfe5f029cfb0ae5e246898baf928c944c220ff46 Mon Sep 17 00:00:00 2001
From: Tom Caputi <tcaputi@datto.com>
Date: Mon, 24 Jun 2019 19:42:52 -0400
Subject: [PATCH 046/109] Fix error message on promoting encrypted dataset

This patch corrects the error message reported when attempting
to promote a dataset outside of its encryption root.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #8905
Closes #8935
---
 lib/libzfs/libzfs_dataset.c | 10 ++++++++++
 module/zfs/dsl_crypt.c      |  8 ++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index ee5a6412ead5..0d0194e68453 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -4117,6 +4117,16 @@ zfs_promote(zfs_handle_t *zhp)
 
 	if (ret != 0) {
 		switch (ret) {
+		case EACCES:
+			/*
+			 * Promoting encrypted dataset outside its
+			 * encryption root.
+			 */
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "cannot promote dataset outside its "
+			    "encryption root"));
+			return (zfs_error(hdl, EZFS_EXISTS, errbuf));
+
 		case EEXIST:
 			/* There is a conflicting snapshot name. */
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c
index 0c0ffaadd8fb..568fe7aa3263 100644
--- a/module/zfs/dsl_crypt.c
+++ b/module/zfs/dsl_crypt.c
@@ -1676,11 +1676,15 @@ dsl_dataset_promote_crypt_check(dsl_dir_t *target, dsl_dir_t *origin)
 	 * Check that the parent of the target has the same encryption root.
 	 */
 	ret = dsl_dir_get_encryption_root_ddobj(origin->dd_parent, &op_rddobj);
-	if (ret != 0)
+	if (ret == ENOENT)
+		return (SET_ERROR(EACCES));
+	else if (ret != 0)
 		return (ret);
 
 	ret = dsl_dir_get_encryption_root_ddobj(target->dd_parent, &tp_rddobj);
-	if (ret != 0)
+	if (ret == ENOENT)
+		return (SET_ERROR(EACCES));
+	else if (ret != 0)
 		return (ret);
 
 	if (op_rddobj != tp_rddobj)

From 05006f125ccd97851d5f673483fb4ba606bdf0d3 Mon Sep 17 00:00:00 2001
From: Igor K <igor@dilos.org>
Date: Tue, 25 Jun 2019 03:58:12 +0300
Subject: [PATCH 047/109] -Y option for zdb is valid

The -Y option was added for ztest to test split block reconstruction.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Igor Kozhukhov <igor@dilos.org>
Closes #8926
---
 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh
index a5f827b5642f..e69779bd4b4c 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh
@@ -59,7 +59,7 @@ set -A args "create" "add" "destroy" "import fakepool" \
     "-a" "-f" "-g" "-h" "-j" "-m" "-n" "-o" "-p" \
     "-p /tmp" "-r" "-t" "-w" "-x" "-y" "-z" \
     "-D" "-E" "-G" "-H" "-I" "-J" "-K" "-M" \
-    "-N" "-Q" "-R" "-S" "-T" "-W" "-Y" "-Z"
+    "-N" "-Q" "-R" "-S" "-T" "-W" "-Z"
 
 log_assert "Execute zdb using invalid parameters."
 

From 04d4df89f4526eecd66fa1c380dba5ee3aff261c Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Tue, 25 Jun 2019 15:03:38 -0400
Subject: [PATCH 048/109] Avoid extra taskq_dispatch() calls by DMU

DMU sync code calls taskq_dispatch() for each sublist of os_dirty_dnodes
and os_synced_dnodes.  Since the number of sublists is by default equal
to the number of CPUs, it will dispatch an equal, potentially large,
number of tasks, waking up many CPUs to handle them, even if only one or
a few of the sublists actually have any work to do.

This change adds a check for empty sublists to avoid this.

Reviewed-by: Sean Eric Fagan <sef@ixsystems.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Closes #8909
---
 include/sys/multilist.h |  2 ++
 module/zfs/dmu_objset.c | 19 ++++++++++++++-----
 module/zfs/multilist.c  | 22 ++++++++++++++++++++++
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/include/sys/multilist.h b/include/sys/multilist.h
index 439540685971..0c7b4075d9a3 100644
--- a/include/sys/multilist.h
+++ b/include/sys/multilist.h
@@ -89,6 +89,8 @@ void multilist_sublist_insert_head(multilist_sublist_t *, void *);
 void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
 void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
 void multilist_sublist_remove(multilist_sublist_t *, void *);
+int  multilist_sublist_is_empty(multilist_sublist_t *);
+int  multilist_sublist_is_empty_idx(multilist_t *, unsigned int);
 
 void *multilist_sublist_head(multilist_sublist_t *);
 void *multilist_sublist_tail(multilist_sublist_t *);
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 30436b188fc4..29ed45a55dc7 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1692,6 +1692,8 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 	zio_t *zio;
 	list_t *list;
 	dbuf_dirty_record_t *dr;
+	int num_sublists;
+	multilist_t *ml;
 	blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
 	*blkptr_copy = *os->os_rootbp;
 
@@ -1780,10 +1782,13 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 		}
 	}
 
-	for (int i = 0;
-	    i < multilist_get_num_sublists(os->os_dirty_dnodes[txgoff]); i++) {
+	ml = os->os_dirty_dnodes[txgoff];
+	num_sublists = multilist_get_num_sublists(ml);
+	for (int i = 0; i < num_sublists; i++) {
+		if (multilist_sublist_is_empty_idx(ml, i))
+			continue;
 		sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
-		sda->sda_list = os->os_dirty_dnodes[txgoff];
+		sda->sda_list = ml;
 		sda->sda_sublist_idx = i;
 		sda->sda_tx = tx;
 		(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
@@ -2086,6 +2091,8 @@ userquota_updates_task(void *arg)
 void
 dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
 {
+	int num_sublists;
+
 	if (!dmu_objset_userused_enabled(os))
 		return;
 
@@ -2118,8 +2125,10 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
 		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 	}
 
-	for (int i = 0;
-	    i < multilist_get_num_sublists(os->os_synced_dnodes); i++) {
+	num_sublists = multilist_get_num_sublists(os->os_synced_dnodes);
+	for (int i = 0; i < num_sublists; i++) {
+		if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i))
+			continue;
 		userquota_updates_arg_t *uua =
 		    kmem_alloc(sizeof (*uua), KM_SLEEP);
 		uua->uua_os = os;
diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c
index 2a594c56cbd5..b74ee0f0670a 100644
--- a/module/zfs/multilist.c
+++ b/module/zfs/multilist.c
@@ -363,6 +363,28 @@ multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
 	list_remove(&mls->mls_list, obj);
 }
 
+int
+multilist_sublist_is_empty(multilist_sublist_t *mls)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	return (list_is_empty(&mls->mls_list));
+}
+
+int
+multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx)
+{
+	multilist_sublist_t *mls;
+	int empty;
+
+	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+	mls = &ml->ml_sublists[sublist_idx];
+	ASSERT(!MUTEX_HELD(&mls->mls_lock));
+	mutex_enter(&mls->mls_lock);
+	empty = list_is_empty(&mls->mls_list);
+	mutex_exit(&mls->mls_lock);
+	return (empty);
+}
+
 void *
 multilist_sublist_head(multilist_sublist_t *mls)
 {

From 7d2489cfad1b04c1b22292d0a9a58f85195ce00c Mon Sep 17 00:00:00 2001
From: George Wilson <george.wilson@delphix.com>
Date: Fri, 28 Jun 2019 15:40:24 -0400
Subject: [PATCH 049/109] nopwrites on dmu_sync-ed blocks can result in a panic

After device removal, performing nopwrites on a dmu_sync-ed block
will result in a panic. This panic can show up in two ways:
1. an attempt to issue an IOCTL in vdev_indirect_io_start()
2. a failed comparison of zio->io_bp and zio->io_bp_orig in
   zio_done()
To resolve both of these panics, nopwrites of blocks on indirect
vdevs should be ignored and new allocations should be performed on
concrete vdevs.

Reviewed-by: Igor Kozhukhov <igor@dilos.org>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Don Brady <don.brady@delphix.com>
Signed-off-by: George Wilson <gwilson@delphix.com>
Closes #8957
---
 module/zfs/zio.c                              | 14 +++
 tests/runfiles/linux.run                      |  2 +-
 .../tests/functional/removal/Makefile.am      |  2 +-
 .../functional/removal/removal_nopwrite.ksh   | 87 +++++++++++++++++++
 4 files changed, 103 insertions(+), 2 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/removal/removal_nopwrite.ksh

diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index f9503bd3ff81..94eaa5888a9c 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2862,6 +2862,20 @@ zio_nop_write(zio_t *zio)
 		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
 		    sizeof (uint64_t)) == 0);
 
+		/*
+		 * If we're overwriting a block that is currently on an
+		 * indirect vdev, then ignore the nopwrite request and
+		 * allow a new block to be allocated on a concrete vdev.
+		 */
+		spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER);
+		vdev_t *tvd = vdev_lookup_top(zio->io_spa,
+		    DVA_GET_VDEV(&bp->blk_dva[0]));
+		if (tvd->vdev_ops == &vdev_indirect_ops) {
+			spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
+			return (zio);
+		}
+		spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
+
 		*bp = *bp_orig;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 		zio->io_flags |= ZIO_FLAG_NOPWRITE;
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 22fc26212c0d..3f82676ef218 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -758,7 +758,7 @@ tags = ['functional', 'refreserv']
 pre =
 tests = ['removal_all_vdev', 'removal_check_space',
     'removal_condense_export', 'removal_multiple_indirection',
-    'removal_remap', 'removal_remap_deadlists',
+    'removal_remap', 'removal_nopwrite', 'removal_remap_deadlists',
     'removal_resume_export', 'removal_sanity', 'removal_with_add',
     'removal_with_create_fs', 'removal_with_dedup',
     'removal_with_errors', 'removal_with_export',
diff --git a/tests/zfs-tests/tests/functional/removal/Makefile.am b/tests/zfs-tests/tests/functional/removal/Makefile.am
index ba42b899acac..df92e0b5ed44 100644
--- a/tests/zfs-tests/tests/functional/removal/Makefile.am
+++ b/tests/zfs-tests/tests/functional/removal/Makefile.am
@@ -18,7 +18,7 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/removal
 dist_pkgdata_SCRIPTS = \
 	cleanup.ksh removal_all_vdev.ksh removal_check_space.ksh \
 	removal_condense_export.ksh removal_multiple_indirection.ksh \
-	removal_remap_deadlists.ksh removal_remap.ksh \
+	removal_remap_deadlists.ksh removal_nopwrite.ksh removal_remap.ksh \
 	removal_reservation.ksh removal_resume_export.ksh \
 	removal_sanity.ksh removal_with_add.ksh removal_with_create_fs.ksh \
 	removal_with_dedup.ksh removal_with_errors.ksh \
diff --git a/tests/zfs-tests/tests/functional/removal/removal_nopwrite.ksh b/tests/zfs-tests/tests/functional/removal/removal_nopwrite.ksh
new file mode 100755
index 000000000000..cb8bd6b810c1
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/removal/removal_nopwrite.ksh
@@ -0,0 +1,87 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/removal/removal.kshlib
+. $STF_SUITE/tests/functional/nopwrite/nopwrite.shlib
+
+default_setup_noexit "$DISKS"
+log_onexit default_cleanup_noexit
+BLOCKSIZE=8192
+
+origin="$TESTPOOL/$TESTFS"
+
+log_must zfs set compress=on $origin
+log_must zfs set checksum=edonr $origin
+
+log_must zfs set recordsize=8k $origin
+dd if=/dev/urandom of=$TESTDIR/file_8k bs=1024k count=$MEGS oflag=sync \
+    conv=notrunc >/dev/null 2>&1 || log_fail "dd into $TESTDIR/file failed."
+log_must zfs set recordsize=128k $origin
+dd if=/dev/urandom of=$TESTDIR/file_128k bs=1024k count=$MEGS oflag=sync \
+    conv=notrunc >/dev/null 2>&1 || log_fail "dd into $TESTDIR/file failed."
+
+zfs snapshot $origin@a || log_fail "zfs snap failed"
+log_must zfs clone $origin@a $origin/clone
+
+#
+# Verify that nopwrites work prior to removal
+#
+log_must zfs set recordsize=8k $origin/clone
+dd if=/$TESTDIR/file_8k of=/$TESTDIR/clone/file_8k bs=1024k \
+     oflag=sync conv=notrunc >/dev/null 2>&1 || log_fail "dd failed."
+log_must verify_nopwrite $origin $origin@a $origin/clone
+
+log_must zfs set recordsize=128k $origin/clone
+dd if=/$TESTDIR/file_128k of=/$TESTDIR/clone/file_128k bs=1024k \
+     oflag=sync conv=notrunc >/dev/null 2>&1 || log_fail "dd failed."
+log_must verify_nopwrite $origin $origin@a $origin/clone
+
+#
+# Remove a device before testing nopwrites again
+#
+log_must zpool remove $TESTPOOL $REMOVEDISK
+log_must wait_for_removal $TESTPOOL
+log_mustnot vdevs_in_pool $TESTPOOL $REMOVEDISK
+
+#
+# Normally, we expect nopwrites to avoid allocating new blocks, but
+# after a device has been removed the DVAs will get remapped when
+# a L0's indirect block is written. This will negate the effects
+# of nopwrite and should result in new allocations.
+#
+
+#
+# Perform a direct zil nopwrite test
+#
+log_must zfs set recordsize=8k $origin/clone
+dd if=/$TESTDIR/file_8k of=/$TESTDIR/clone/file_8k bs=1024k \
+     oflag=sync conv=notrunc >/dev/null 2>&1 || log_fail "dd failed."
+log_mustnot verify_nopwrite $origin $origin@a $origin/clone
+
+#
+# Perform an indirect zil nopwrite test
+#
+log_must zfs set recordsize=128k $origin/clone
+dd if=/$TESTDIR/file_128k of=/$TESTDIR/clone/file_128k bs=1024k \
+     oflag=sync conv=notrunc >/dev/null 2>&1 || log_fail "dd failed."
+log_mustnot verify_nopwrite $origin $origin@a $origin/clone
+
+log_pass "Remove works with nopwrite."

From 093bb6446120c50a7109ed7e7a0f2e76730b3160 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Wed, 3 Jul 2019 00:25:23 +0900
Subject: [PATCH 050/109] Don't use d_path() for automount mount point for
 chroot'd process

A chroot'd process fails to automount snapshots due to a realpath(3)
failure in mount.zfs(8).

Construct the mount point path from the sb of the ctldir inode and the
dirent name, instead of from d_path(), so that a chroot'd process isn't
affected by its own view of the filesystem.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #8903
Closes #8966
---
 module/zfs/zfs_ctldir.c | 41 +++++++----------------------------------
 1 file changed, 7 insertions(+), 34 deletions(-)

diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
index c8071a7c215f..aa50646fef83 100644
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -703,37 +703,6 @@ zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len,
 	return (0);
 }
 
-/*
- * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
- */
-static int
-zfsctl_snapshot_path(struct path *path, int len, char *full_path)
-{
-	char *path_buffer, *path_ptr;
-	int path_len, error = 0;
-
-	path_buffer = kmem_alloc(len, KM_SLEEP);
-
-	path_ptr = d_path(path, path_buffer, len);
-	if (IS_ERR(path_ptr)) {
-		error = -PTR_ERR(path_ptr);
-		goto out;
-	}
-
-	path_len = path_buffer + len - 1 - path_ptr;
-	if (path_len > len) {
-		error = SET_ERROR(EFAULT);
-		goto out;
-	}
-
-	memcpy(full_path, path_ptr, path_len);
-	full_path[path_len] = '\0';
-out:
-	kmem_free(path_buffer, len);
-
-	return (error);
-}
-
 /*
  * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
  */
@@ -1077,9 +1046,13 @@ zfsctl_snapshot_mount(struct path *path, int flags)
 	if (error)
 		goto error;
 
-	error = zfsctl_snapshot_path(path, MAXPATHLEN, full_path);
-	if (error)
-		goto error;
+	/*
+	 * Construct a mount point path from sb of the ctldir inode and dirent
+	 * name, instead of from d_path(), so that chroot'd process doesn't fail
+	 * on mount.zfs(8).
+	 */
+	snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s",
+	    zfsvfs->z_vfs->vfs_mntpoint, dname(dentry));
 
 	/*
 	 * Multiple concurrent automounts of a snapshot are never allowed.

From 9e09826b33092bfe41dce14e098b2d2f4931da2f Mon Sep 17 00:00:00 2001
From: Tom Caputi <tcaputi@datto.com>
Date: Tue, 2 Jul 2019 20:30:00 -0400
Subject: [PATCH 051/109] Fix error text for EINVAL in zfs_receive_one()

This small patch fixes the EINVAL case for zfs_receive_one(). A
missing 'else' has been added to the two possible cases, which
will ensure the intended error message is printed.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: loli10K <ezomori.nozomu@gmail.com>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #8977
---
 lib/libzfs/libzfs_sendrecv.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index 052b96b9b653..0d3853e0a1c4 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -4418,14 +4418,15 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 			*cp = '@';
 			break;
 		case EINVAL:
-			if (flags->resumable)
+			if (flags->resumable) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "kernel modules must be upgraded to "
 				    "receive this stream."));
-			if (embedded && !raw)
+			} else if (embedded && !raw) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "incompatible embedded data stream "
 				    "feature with encrypted receive."));
+			}
 			(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 			break;
 		case ECKSUM:

From 7a03d7c73cec63e3c3e771c8cf34d8876a0f0532 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Wed, 3 Jul 2019 13:01:54 -0700
Subject: [PATCH 052/109] Check b_freeze_cksum under ZFS_DEBUG_MODIFY
 conditional

The b_freeze_cksum field can only have data when ZFS_DEBUG_MODIFY
is set.  Therefore, the EQUIV check must be wrapped accordingly.
For the same reason the ASSERT in arc_buf_fill() is unsafe.
However, since it's largely redundant it has simply been removed.

Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Allan Jude <allanjude@freebsd.org>
Reviewed-by: Igor Kozhukhov <igor@dilos.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8979
---
 module/zfs/arc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 720365c4a935..f125ca6a4d14 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -1872,7 +1872,8 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
 	 * There were no decompressed bufs, so there should not be a
 	 * checksum on the hdr either.
 	 */
-	EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
+	if (zfs_flags & ZFS_DEBUG_MODIFY)
+		EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
 
 	return (copied);
 }
@@ -2253,7 +2254,6 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
 		 */
 		if (arc_buf_try_copy_decompressed_data(buf)) {
 			/* Skip byteswapping and checksumming (already done) */
-			ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
 			return (0);
 		} else {
 			error = zio_decompress_data(HDR_GET_COMPRESS(hdr),

From 14a11bf2f6052413cdaa5cf8193d16ce8f2fa388 Mon Sep 17 00:00:00 2001
From: Paul Zuchowski <31706010+PaulZ-98@users.noreply.github.com>
Date: Wed, 3 Jul 2019 16:05:02 -0400
Subject: [PATCH 053/109] Improve "Unable to automount" error message.

Having the mountpoint and dataset name both in the message made it
confusing to read.  Additionally, convert this to a zfs_dbgmsg rather than
sending it to the console.

Reviewed-by: Tom Caputi <tcaputi@datto.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Zuchowski <pzuchowski@datto.com>
Closes #8959
---
 module/zfs/zfs_ctldir.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
index aa50646fef83..52314f4e1bdb 100644
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -30,6 +30,7 @@
  * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
  * Copyright (c) 2018 George Melikov. All Rights Reserved.
+ * Copyright (c) 2019 Datto, Inc. All rights reserved.
  */
 
 /*
@@ -1081,8 +1082,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
 	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
 	if (error) {
 		if (!(error & MOUNT_BUSY << 8)) {
-			cmn_err(CE_WARN, "Unable to automount %s/%s: %d",
-			    full_path, full_name, error);
+			zfs_dbgmsg("Unable to automount %s error=%d",
+			    full_path, error);
 			error = SET_ERROR(EISDIR);
 		} else {
 			/*

From 1f72a18f59d73f6e09ea052fb51cc7e19eaa3250 Mon Sep 17 00:00:00 2001
From: Tom Caputi <tcaputi@datto.com>
Date: Fri, 5 Jul 2019 19:53:14 -0400
Subject: [PATCH 054/109] Remove VERIFY from dsl_dataset_crypt_stats()

This patch fixes an issue where dsl_dataset_crypt_stats() would
VERIFY that it was able to hold the encryption root. This function
should instead silently continue without populating the related
field in the nvlist, as is the convention for this code.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #8976
---
 module/zfs/dsl_crypt.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c
index 568fe7aa3263..24711227ba55 100644
--- a/module/zfs/dsl_crypt.c
+++ b/module/zfs/dsl_crypt.c
@@ -2624,11 +2624,13 @@ dsl_dataset_crypt_stats(dsl_dataset_t *ds, nvlist_t *nv)
 	}
 
 	if (dsl_dir_get_encryption_root_ddobj(dd, &intval) == 0) {
-		VERIFY0(dsl_dir_hold_obj(dd->dd_pool, intval, NULL, FTAG,
-		    &enc_root));
-		dsl_dir_name(enc_root, buf);
-		dsl_dir_rele(enc_root, FTAG);
-		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ENCRYPTION_ROOT, buf);
+		if (dsl_dir_hold_obj(dd->dd_pool, intval, NULL, FTAG,
+		    &enc_root) == 0) {
+			dsl_dir_name(enc_root, buf);
+			dsl_dir_rele(enc_root, FTAG);
+			dsl_prop_nvlist_add_string(nv,
+			    ZFS_PROP_ENCRYPTION_ROOT, buf);
+		}
 	}
 }
 

From 2ac233c633e9bce36df8e7a3d7501cf4a0e227bb Mon Sep 17 00:00:00 2001
From: loli10K <loli10K@users.noreply.github.com>
Date: Tue, 9 Jul 2019 18:28:05 +0200
Subject: [PATCH 055/109] Fix dracut Debian/Ubuntu packaging

This commit ensures make(1) targets that build .deb packages fail if
alien(1) can't convert all .rpm files; additionally it also updates
the zfs-dracut package name which was changed to "noarch" in ca4e5a7.

Reviewed-by: Neal Gompa <ngompa@datto.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
Closes #8990
Closes #8991
---
 config/deb.am | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/config/deb.am b/config/deb.am
index e405547aa949..83059a923493 100644
--- a/config/deb.am
+++ b/config/deb.am
@@ -20,7 +20,7 @@ deb-kmod: deb-local rpm-kmod
 	arch=`$(RPM) -qp $${name}-kmod-$${version}.src.rpm --qf %{arch} | tail -1`; \
 	debarch=`$(DPKG) --print-architecture`; \
 	pkg1=kmod-$${name}*$${version}.$${arch}.rpm; \
-	fakeroot $(ALIEN) --bump=0 --scripts --to-deb --target=$$debarch $$pkg1; \
+	fakeroot $(ALIEN) --bump=0 --scripts --to-deb --target=$$debarch $$pkg1 || exit 1; \
 	$(RM) $$pkg1
 
 
@@ -30,7 +30,7 @@ deb-dkms: deb-local rpm-dkms
 	arch=`$(RPM) -qp $${name}-dkms-$${version}.src.rpm --qf %{arch} | tail -1`; \
 	debarch=`$(DPKG) --print-architecture`; \
 	pkg1=$${name}-dkms-$${version}.$${arch}.rpm; \
-	fakeroot $(ALIEN) --bump=0 --scripts --to-deb --target=$$debarch $$pkg1; \
+	fakeroot $(ALIEN) --bump=0 --scripts --to-deb --target=$$debarch $$pkg1 || exit 1; \
 	$(RM) $$pkg1
 
 deb-utils: deb-local rpm-utils
@@ -45,7 +45,7 @@ deb-utils: deb-local rpm-utils
 	pkg5=libzpool2-$${version}.$${arch}.rpm; \
 	pkg6=libzfs2-devel-$${version}.$${arch}.rpm; \
 	pkg7=$${name}-test-$${version}.$${arch}.rpm; \
-	pkg8=$${name}-dracut-$${version}.$${arch}.rpm; \
+	pkg8=$${name}-dracut-$${version}.noarch.rpm; \
 	pkg9=$${name}-initramfs-$${version}.$${arch}.rpm; \
 	pkg10=`ls python*-pyzfs-$${version}* | tail -1`; \
 ## Arguments need to be passed to dh_shlibdeps. Alien provides no mechanism
@@ -63,7 +63,7 @@ deb-utils: deb-local rpm-utils
 	env PATH=$${path_prepend}:$${PATH} \
 	fakeroot $(ALIEN) --bump=0 --scripts --to-deb --target=$$debarch \
 	    $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 \
-	    $$pkg8 $$pkg9 $$pkg10; \
+	    $$pkg8 $$pkg9 $$pkg10 || exit 1; \
 	$(RM) $${path_prepend}/dh_shlibdeps; \
 	rmdir $${path_prepend}; \
 	$(RM) $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 \

From ccd8125e450c2968b2878fd887da7fac5b9a49f1 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Wed, 10 Jul 2019 01:31:46 +0900
Subject: [PATCH 056/109] Fix race in parallel mount's thread dispatching
 algorithm

Strategy of parallel mount is as follows.

1) Initial thread dispatching is to select sets of mount points that
 don't have dependencies on other sets, hence threads can/should run
 lock-less and shouldn't race with other threads for other sets. Each
 thread dispatched corresponds to top level directory which may or may
 not have datasets to be mounted on sub directories.

2) Subsequent recursive thread dispatching for each thread from 1)
 is to mount datasets for each set of mount points. The mount points
 within each set have dependencies (i.e. child directories), so child
 directories are processed only after parent directory completes.

The problem is that the initial thread dispatching in
zfs_foreach_mountpoint() can be multi-threaded when it needs to be
single-threaded, which makes the threads race with each other. This race
appeared as mount/unmount issues on ZoL because ZoL has different
timing regarding mount(2) execution due to fork(2)/exec(2) of mount(8).
`zfs unmount -a`, which expects proper mount order, can't unmount if the
mounts were reordered by the race condition.

There are currently two known patterns of input list `handles` in
`zfs_foreach_mountpoint(..,handles,..)` which cause the race condition.

1) #8833 case where input is `/a /a /a/b` after sorting.
 The problem is that libzfs_path_contains() can't correctly handle an
 input list with two same top level directories.
 There is a race between two POSIX threads A and B,
  * ThreadA for "/a" for test1 and "/a/b"
  * ThreadB for "/a" for test0/a
 and in case of #8833, ThreadA won the race. Two threads were created
 because "/a" wasn't considered as `"/a" contains "/a"`.

2) #8450 case where input is `/ /var/data /var/data/test` after sorting.
 The problem is that libzfs_path_contains() can't correctly handle an
 input list containing "/".
 There is a race between two POSIX threads A and B,
  * ThreadA for "/" and "/var/data/test"
  * ThreadB for "/var/data"
 and in case of #8450, ThreadA won the race. Two threads were created
 because "/var/data" wasn't considered as `"/" contains "/var/data"`.
 In other words, if there is (at least one) "/" in the input list,
 the initial thread dispatching must be single-threaded since every
 directory is a child of "/", meaning they all directly or indirectly
 depend on "/".

In both cases, the first non_descendant_idx() call fails to correctly
determine "path1-contains-path2", and as a result the initial thread
dispatching creates another thread when it needs to be single-threaded.
Fix a conditional in libzfs_path_contains() to handle the two cases above.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Sebastien Roy <sebastien.roy@delphix.com>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #8450
Closes #8833
Closes #8878
---
 lib/libzfs/libzfs_mount.c                     |   6 +-
 tests/runfiles/linux.run                      |   3 +-
 .../functional/cli_root/zfs_mount/Makefile.am |   1 +
 .../cli_root/zfs_mount/zfs_mount_test_race.sh | 116 ++++++++++++++++++
 4 files changed, 123 insertions(+), 3 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh

diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c
index 649c232aa3e5..d62801cfdaca 100644
--- a/lib/libzfs/libzfs_mount.c
+++ b/lib/libzfs/libzfs_mount.c
@@ -1302,12 +1302,14 @@ mountpoint_cmp(const void *arga, const void *argb)
 }
 
 /*
- * Return true if path2 is a child of path1.
+ * Return true if path2 is a child of path1 or path2 equals path1 or
+ * path1 is "/" (path2 is always a child of "/").
  */
 static boolean_t
 libzfs_path_contains(const char *path1, const char *path2)
 {
-	return (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/');
+	return (strcmp(path1, path2) == 0 || strcmp(path1, "/") == 0 ||
+	    (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/'));
 }
 
 /*
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 3f82676ef218..27e36b594ab5 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -182,7 +182,8 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
     'zfs_mount_007_pos', 'zfs_mount_008_pos', 'zfs_mount_009_neg',
     'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_neg',
     'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount',
-    'zfs_multi_mount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints']
+    'zfs_multi_mount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
+    'zfs_mount_test_race']
 tags = ['functional', 'cli_root', 'zfs_mount']
 
 [tests/functional/cli_root/zfs_program]
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am
index b2de98934b74..c208a1c378dc 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am
@@ -19,6 +19,7 @@ dist_pkgdata_SCRIPTS = \
 	zfs_mount_all_mountpoints.ksh \
 	zfs_mount_encrypted.ksh \
 	zfs_mount_remount.ksh \
+	zfs_mount_test_race.sh \
 	zfs_multi_mount.ksh
 
 dist_pkgdata_DATA = \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh
new file mode 100755
index 000000000000..404770b2727f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh
@@ -0,0 +1,116 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.cfg
+
+#
+# DESCRIPTION:
+# Verify parallel mount ordering is consistent.
+#
+# There was a bug in initial thread dispatching algorithm which put threads
+# under race condition which resulted in undefined mount order.  The purpose
+# of this test is to verify `zfs unmount -a` succeeds (not `zfs mount -a`
+# succeeds, it always does) after `zfs mount -a`, which could fail if threads
+# race.  See github.com/zfsonlinux/zfs/issues/{8450,8833,8878} for details.
+#
+# STRATEGY:
+# 1. Create pools and filesystems.
+# 2. Set same mount point for >1 datasets.
+# 3. Unmount all datasets.
+# 4. Mount all datasets.
+# 5. Unmount all datasets (verify this succeeds).
+#
+
+verify_runnable "both"
+
+TMPDIR=${TMPDIR:-$TEST_BASE_DIR}
+MNTPT=$TMPDIR/zfs_mount_test_race_mntpt
+DISK1="$TMPDIR/zfs_mount_test_race_disk1"
+DISK2="$TMPDIR/zfs_mount_test_race_disk2"
+
+TESTPOOL1=zfs_mount_test_race_tp1
+TESTPOOL2=zfs_mount_test_race_tp2
+
+export __ZFS_POOL_RESTRICT="$TESTPOOL1 $TESTPOOL2"
+log_must zfs $unmountall
+unset __ZFS_POOL_RESTRICT
+
+function cleanup
+{
+	zpool destroy $TESTPOOL1
+	zpool destroy $TESTPOOL2
+	rm -rf $MNTPT
+	rm -rf /$TESTPOOL1
+	rm -rf /$TESTPOOL2
+	rm -f $DISK1
+	rm -f $DISK2
+	export __ZFS_POOL_RESTRICT="$TESTPOOL1 $TESTPOOL2"
+	log_must zfs $mountall
+	unset __ZFS_POOL_RESTRICT
+}
+log_onexit cleanup
+
+log_note "Verify parallel mount ordering is consistent"
+
+log_must truncate -s $MINVDEVSIZE $DISK1
+log_must truncate -s $MINVDEVSIZE $DISK2
+
+log_must zpool create -f $TESTPOOL1 $DISK1
+log_must zpool create -f $TESTPOOL2 $DISK2
+
+log_must zfs create $TESTPOOL1/$TESTFS1
+log_must zfs create $TESTPOOL2/$TESTFS2
+
+log_must zfs set mountpoint=none $TESTPOOL1
+log_must zfs set mountpoint=$MNTPT $TESTPOOL1/$TESTFS1
+
+# Note that unmount can fail (due to race condition on `zfs mount -a`) with or
+# without `canmount=off`.  The race has nothing to do with canmount property,
+# but turn it off for convenience of mount layout used in this test case.
+log_must zfs set canmount=off $TESTPOOL2
+log_must zfs set mountpoint=$MNTPT $TESTPOOL2
+
+# At this point, layout of datasets in two pools will look like below.
+# Previously, on next `zfs mount -a`, pthreads assigned to TESTFS1 and TESTFS2
+# could race, and TESTFS2 usually (actually always) won in ZoL.  Note that the
+# problem is how two or more threads could initially be assigned to the same
+# top level directory, not this specific layout.  This layout is just an example
+# that can reproduce race, and is also the layout reported in #8833.
+#
+# NAME                  MOUNTED  MOUNTPOINT
+# ----------------------------------------------
+# /$TESTPOOL1           no       none
+# /$TESTPOOL1/$TESTFS1  yes      $MNTPT
+# /$TESTPOOL2           no       $MNTPT
+# /$TESTPOOL2/$TESTFS2  yes      $MNTPT/$TESTFS2
+
+# Apparently two datasets must be mounted.
+log_must ismounted $TESTPOOL1/$TESTFS1
+log_must ismounted $TESTPOOL2/$TESTFS2
+# This unmount always succeeds, because potential race hasn't happened yet.
+log_must zfs unmount -a
+# This mount always succeeds, whether threads are under race condition or not.
+log_must zfs mount -a
+
+# Verify datasets are mounted (TESTFS2 fails if the race broke mount order).
+log_must ismounted $TESTPOOL1/$TESTFS1
+log_must ismounted $TESTPOOL2/$TESTFS2
+# Verify unmount succeeds (fails if the race broke mount order).
+log_must zfs unmount -a
+
+log_pass "Verify parallel mount ordering is consistent passed"

From c3a3c5a30fea98f640e23b0f3c2c10d5606ba9fc Mon Sep 17 00:00:00 2001
From: Shaun Tancheff <shaun@aeonazure.com>
Date: Tue, 9 Jul 2019 15:02:40 -0500
Subject: [PATCH 057/109] pkg-utils python sitelib for SLES15

Use python -Esc to set __python_sitelib.

Reviewed-by: Neal Gompa <ngompa@datto.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Shaun Tancheff <stancheff@cray.com>
Closes #8969
---
 rpm/generic/zfs.spec.in | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
index 0b16cd0e886b..0864a72a1155 100644
--- a/rpm/generic/zfs.spec.in
+++ b/rpm/generic/zfs.spec.in
@@ -99,6 +99,7 @@
 %define __python_cffi_pkg         python%{__python_pkg_version}-cffi
 %define __python_setuptools_pkg   python%{__python_pkg_version}-setuptools
 %endif
+%define __python_sitelib          %(%{__python} -Esc "from distutils.sysconfig import get_python_lib; print(get_python_lib())")
 
 # By default python-pyzfs is enabled, with the exception of
 # RHEL 6 which by default uses Python 2.6 which is too old.
@@ -474,8 +475,8 @@ systemctl --system daemon-reload >/dev/null || true
 %doc contrib/pyzfs/README
 %doc contrib/pyzfs/LICENSE
 %defattr(-,root,root,-)
-%{python_sitelib}/libzfs_core/*
-%{python_sitelib}/pyzfs*
+%{__python_sitelib}/libzfs_core/*
+%{__python_sitelib}/pyzfs*
 %endif
 
 %if 0%{?_initramfs}

From 6e19cc77cfd10a8587181f57ef4f9d7a1a7bc5b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Attila=20F=C3=BCl=C3=B6p?= <attila@fueloep.org>
Date: Wed, 10 Jul 2019 20:44:52 +0200
Subject: [PATCH 058/109] Fix ZTS killed processes detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

log_neg_expect was using the wrong exit status to detect if a process
got killed by SIGSEGV or SIGBUS, resulting in false positives.

Reviewed-by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: John Kennedy <john.kennedy@delphix.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Attila Fülöp <attila@fueloep.org>
Closes #9003
---
 tests/test-runner/include/logapi.shlib | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test-runner/include/logapi.shlib b/tests/test-runner/include/logapi.shlib
index 32fc00616180..cd7982a94a0b 100644
--- a/tests/test-runner/include/logapi.shlib
+++ b/tests/test-runner/include/logapi.shlib
@@ -198,12 +198,12 @@ function log_neg_expect
 	elif (( $status == 127 )); then
 		print -u2 $($out)
 		_printerror "$@" "unexpectedly exited $status (File not found)"
-	# bus error - core dump
-	elif (( $status == 138 )); then
+	# bus error - core dump (256+signal, SIGBUS=7)
+	elif (( $status == 263 )); then
 		print -u2 $($out)
 		_printerror "$@" "unexpectedly exited $status (Bus Error)"
-	# segmentation violation - core dump
-	elif (( $status == 139 )); then
+	# segmentation violation - core dump (256+signal, SIGSEGV=11)
+	elif (( $status == 267 )); then
 		print -u2 $($out)
 		_printerror "$@" "unexpectedly exited $status (SEGV)"
 	else

From cf966cb19ae63f65c518678ce57642c716808ef6 Mon Sep 17 00:00:00 2001
From: Nick Mattis <nmattis@users.noreply.github.com>
Date: Wed, 10 Jul 2019 18:54:49 -0400
Subject: [PATCH 059/109] Fixes: #8934 Large kmem_alloc

A large allocation exceeding the spl_kmem_alloc_warn value was being
performed. Switch to the vmem_alloc interface, as specified for large
allocations, and change the corresponding frees to match.

Reviewed-by: Tom Caputi <tcaputi@datto.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: nmattis <nickm970@gmail.com>
Closes #8934
Closes #9011
---
 module/zfs/vdev_indirect_births.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/module/zfs/vdev_indirect_births.c b/module/zfs/vdev_indirect_births.c
index 1c44a64287d3..99b83c392257 100644
--- a/module/zfs/vdev_indirect_births.c
+++ b/module/zfs/vdev_indirect_births.c
@@ -70,7 +70,7 @@ vdev_indirect_births_close(vdev_indirect_births_t *vib)
 	if (vib->vib_phys->vib_count > 0) {
 		uint64_t births_size = vdev_indirect_births_size_impl(vib);
 
-		kmem_free(vib->vib_entries, births_size);
+		vmem_free(vib->vib_entries, births_size);
 		vib->vib_entries = NULL;
 	}
 
@@ -108,7 +108,7 @@ vdev_indirect_births_open(objset_t *os, uint64_t births_object)
 
 	if (vib->vib_phys->vib_count > 0) {
 		uint64_t births_size = vdev_indirect_births_size_impl(vib);
-		vib->vib_entries = kmem_alloc(births_size, KM_SLEEP);
+		vib->vib_entries = vmem_alloc(births_size, KM_SLEEP);
 		VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0,
 		    births_size, vib->vib_entries, DMU_READ_PREFETCH));
 	}
@@ -148,10 +148,10 @@ vdev_indirect_births_add_entry(vdev_indirect_births_t *vib,
 	vib->vib_phys->vib_count++;
 	new_size = vdev_indirect_births_size_impl(vib);
 
-	new_entries = kmem_alloc(new_size, KM_SLEEP);
+	new_entries = vmem_alloc(new_size, KM_SLEEP);
 	if (old_size > 0) {
 		bcopy(vib->vib_entries, new_entries, old_size);
-		kmem_free(vib->vib_entries, old_size);
+		vmem_free(vib->vib_entries, old_size);
 	}
 	new_entries[vib->vib_phys->vib_count - 1] = vibe;
 	vib->vib_entries = new_entries;

From 0a223246e124e68bbd2ee2cd7ddcd0bbcd6fa3a5 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Tue, 16 Jul 2019 05:57:56 +0900
Subject: [PATCH 060/109] Disable unused pathname::pn_path* (unneeded in Linux)

struct pathname originally comes from the Solaris VFS. In ZoL it is
only used to call VOPs from the Linux VFS interface without changing
their API, so pathname::pn_path* are unused and unneeded. Effectively,
struct pathname is just a wrapper for a C string in ZoL.

This saves a bit of stack on lookup and unlink.

(The members are #if 0'd rather than removed because comments still
refer to them.)

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #9025
---
 include/sys/pathname.h |  2 ++
 module/zfs/pathname.c  | 15 +++++++++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/include/sys/pathname.h b/include/sys/pathname.h
index 5db69b1784c9..d79cc5c01afd 100644
--- a/include/sys/pathname.h
+++ b/include/sys/pathname.h
@@ -54,8 +54,10 @@ extern "C" {
  */
 typedef struct pathname {
 	char	*pn_buf;		/* underlying storage */
+#if 0 /* unused in ZoL */
 	char	*pn_path;		/* remaining pathname */
 	size_t	pn_pathlen;		/* remaining length */
+#endif
 	size_t	pn_bufsize;		/* total size of pn_buf */
 } pathname_t;
 
diff --git a/module/zfs/pathname.c b/module/zfs/pathname.c
index e3e97c9bb365..4766762f37d1 100644
--- a/module/zfs/pathname.c
+++ b/module/zfs/pathname.c
@@ -71,9 +71,12 @@ pn_alloc(struct pathname *pnp)
 void
 pn_alloc_sz(struct pathname *pnp, size_t sz)
 {
-	pnp->pn_path = pnp->pn_buf = kmem_alloc(sz, KM_SLEEP);
-	pnp->pn_pathlen = 0;
+	pnp->pn_buf = kmem_alloc(sz, KM_SLEEP);
 	pnp->pn_bufsize = sz;
+#if 0 /* unused in ZoL */
+	pnp->pn_path = pnp->pn_buf;
+	pnp->pn_pathlen = 0;
+#endif
 }
 
 /*
@@ -84,6 +87,10 @@ pn_free(struct pathname *pnp)
 {
 	/* pn_bufsize is usually MAXPATHLEN, but may not be */
 	kmem_free(pnp->pn_buf, pnp->pn_bufsize);
-	pnp->pn_path = pnp->pn_buf = NULL;
-	pnp->pn_pathlen = pnp->pn_bufsize = 0;
+	pnp->pn_buf = NULL;
+	pnp->pn_bufsize = 0;
+#if 0 /* unused in ZoL */
+	pnp->pn_path = NULL;
+	pnp->pn_pathlen = 0;
+#endif
 }

From 78831d42906436c93570a7181548faaf456eb60f Mon Sep 17 00:00:00 2001
From: Tom Caputi <tcaputi@datto.com>
Date: Mon, 15 Jul 2019 16:08:42 -0700
Subject: [PATCH 061/109] Ensure dsl_destroy_head() decrypts objsets

This patch corrects a small issue where the dsl_destroy_head()
code that runs when the async_destroy feature is disabled would
not properly decrypt the dataset before beginning processing.
If the dataset is not able to be decrypted, the optimization
code now simply does not run and the dataset is completely
destroyed in the DSL sync task.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #9021
---
 module/zfs/dsl_destroy.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c
index 465b3dfac890..a01abfa0038d 100644
--- a/module/zfs/dsl_destroy.c
+++ b/module/zfs/dsl_destroy.c
@@ -1059,9 +1059,10 @@ dsl_destroy_head(const char *name)
 		/*
 		 * Head deletion is processed in one txg on old pools;
 		 * remove the objects from open context so that the txg sync
-		 * is not too long.
+		 * is not too long. This optimization can only work for
+		 * encrypted datasets if the wrapping key is loaded.
 		 */
-		error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_FALSE,
+		error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_TRUE,
 		    FTAG, &os);
 		if (error == 0) {
 			uint64_t prev_snap_txg =
@@ -1073,7 +1074,7 @@ dsl_destroy_head(const char *name)
 				(void) dmu_free_long_object(os, obj);
 			/* sync out all frees */
 			txg_wait_synced(dmu_objset_pool(os), 0);
-			dmu_objset_disown(os, B_FALSE, FTAG);
+			dmu_objset_disown(os, B_TRUE, FTAG);
 		}
 	}
 

From d751b12a9d927d71a1c584be25bf705bb8decda2 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Mon, 15 Jul 2019 16:11:55 -0700
Subject: [PATCH 062/109] Export dnode symbols

External consumers such as Lustre require access to the dnode
interfaces in order to correctly manipulate dnodes.

Reviewed-by: James Simmons <uja.ornl@yahoo.com>
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #8994
Closes #9027
---
 module/zfs/dnode.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index c06f614e1993..5fd473303d7d 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -2483,3 +2483,13 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
 
 	return (error);
 }
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dnode_hold);
+EXPORT_SYMBOL(dnode_rele);
+EXPORT_SYMBOL(dnode_set_nlevels);
+EXPORT_SYMBOL(dnode_set_blksz);
+EXPORT_SYMBOL(dnode_free_range);
+EXPORT_SYMBOL(dnode_evict_dbufs);
+EXPORT_SYMBOL(dnode_evict_bonus);
+#endif

From 73e50a7d5ddb20e20fd1eab23f00f26f85bd717a Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Tue, 16 Jul 2019 08:26:52 +0900
Subject: [PATCH 063/109] Drop redundant POSIX ACL check in zpl_init_acl()

ZFS_ACLTYPE_POSIXACL has already been tested in zpl_init_acl(),
so no need to test again on POSIX ACL access.

Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #9009
---
 module/zfs/zpl_xattr.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/module/zfs/zpl_xattr.c b/module/zfs/zpl_xattr.c
index 8ee6e9a97f0a..95523f28e3b4 100644
--- a/module/zfs/zpl_xattr.c
+++ b/module/zfs/zpl_xattr.c
@@ -1130,12 +1130,9 @@ zpl_init_acl(struct inode *ip, struct inode *dir)
 		return (0);
 
 	if (!S_ISLNK(ip->i_mode)) {
-		if (ITOZSB(ip)->z_acl_type == ZFS_ACLTYPE_POSIXACL) {
-			acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT);
-			if (IS_ERR(acl))
-				return (PTR_ERR(acl));
-		}
-
+		acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return (PTR_ERR(acl));
 		if (!acl) {
 			ip->i_mode &= ~current_umask();
 			ip->i_ctime = current_time(ip);
@@ -1144,7 +1141,7 @@ zpl_init_acl(struct inode *ip, struct inode *dir)
 		}
 	}
 
-	if ((ITOZSB(ip)->z_acl_type == ZFS_ACLTYPE_POSIXACL) && acl) {
+	if (acl) {
 		umode_t mode;
 
 		if (S_ISDIR(ip->i_mode)) {

From af7a5672c3d1ef17d352627e64c24d762da919e3 Mon Sep 17 00:00:00 2001
From: Antonio Russo <antonio.e.russo@gmail.com>
Date: Sun, 2 Jun 2019 08:57:10 -0400
Subject: [PATCH 064/109] systemd encryption key support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Modify zfs-mount-generator to produce a dependency on new
zfs-load-key-*.service units, dynamically created at boot to call
zfs load-key for the encryption root, before attempting to mount any
encrypted datasets.

These units are created by zfs-mount-generator. They set
RequiresMountsFor on the keyfile if the keylocation is a file, or call
systemd-ask-password if the keylocation is "prompt".

This patch includes suggestions from @Fabian-Gruenbichler, @ryanjaeb and
@rlaager, as well as an adaptation of @rlaager's script to retry on
incorrect password entry.

Reviewed-by: Richard Laager <rlaager@wiktel.com>
Reviewed-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Antonio Russo <antonio.e.russo@gmail.com>
Closes #8750
Closes #8848
---
 .../zed.d/history_event-zfs-list-cacher.sh.in |  4 +-
 .../system-generators/zfs-mount-generator.in  | 54 ++++++++++++++++++-
 man/man8/zfs-mount-generator.8.in             |  2 +-
 3 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in b/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in
index c1513cf3a01f..6d0f44ab3260 100755
--- a/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in
+++ b/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in
@@ -47,7 +47,7 @@ case "${ZEVENT_HISTORY_INTERNAL_NAME}" in
         # Only act if one of the tracked properties is altered.
         case "${ZEVENT_HISTORY_INTERNAL_STR%%=*}" in
             canmount|mountpoint|atime|relatime|devices|exec| \
-                readonly|setuid|nbmand) ;;
+                readonly|setuid|nbmand|encroot|keylocation) ;;
             *) exit 0 ;;
         esac
       ;;
@@ -62,7 +62,7 @@ zed_lock zfs-list
 trap abort_alter EXIT
 
 PROPS="name,mountpoint,canmount,atime,relatime,devices,exec,readonly"
-PROPS="${PROPS},setuid,nbmand"
+PROPS="${PROPS},setuid,nbmand,encroot,keylocation"
 
 "${ZFS}" list -H -t filesystem -o $PROPS -r "${ZEVENT_POOL}" > "${FSLIST_TMP}"
 
diff --git a/etc/systemd/system-generators/zfs-mount-generator.in b/etc/systemd/system-generators/zfs-mount-generator.in
index 5428eb25d92c..ae208c965f97 100755
--- a/etc/systemd/system-generators/zfs-mount-generator.in
+++ b/etc/systemd/system-generators/zfs-mount-generator.in
@@ -71,6 +71,8 @@ process_line() {
   p_readonly="${8}"
   p_setuid="${9}"
   p_nbmand="${10}"
+  p_encroot="${11}"
+  p_keyloc="${12}"
 
   # Check for canmount=off .
   if [ "${p_canmount}" = "off" ] ; then
@@ -168,6 +170,54 @@ process_line() {
       "${dataset}" >/dev/kmsg
   fi
 
+  # Minimal pre-requisites to mount a ZFS dataset
+  wants="zfs-import.target"
+  if [ -n "${p_encroot}" ] &&
+      [ "${p_encroot}" != "-" ] ; then
+    keyloadunit="zfs-load-key-$(systemd-escape "${p_encroot}").service"
+    if [ "${p_encroot}" = "${dataset}" ] ; then
+        pathdep=""
+      if [ "${p_keyloc%%://*}" = "file" ] ; then
+        pathdep="RequiresMountsFor='${p_keyloc#file://}'"
+        keyloadcmd="@sbindir@/zfs load-key '${dataset}'"
+      elif [ "${p_keyloc}" = "prompt" ] ; then
+        keyloadcmd="sh -c 'set -eu;"\
+"count=0;"\
+"while [ \$\$count -lt 3 ];do"\
+"  systemd-ask-password --id=\"zfs:${dataset}\""\
+"    \"Enter passphrase for ${dataset}:\"|"\
+"    @sbindir@/zfs load-key \"${dataset}\" && exit 0;"\
+"  count=\$\$((count + 1));"\
+"done;"\
+"exit 1'"
+      else
+        printf 'zfs-mount-generator: (%s) invalid keylocation\n' \
+          "${dataset}" >/dev/kmsg
+      fi
+      cat > "${dest_norm}/${keyloadunit}" << EOF
+# Automatically generated by zfs-mount-generator
+
+[Unit]
+Description=Load ZFS key for ${dataset}
+SourcePath=${cachefile}
+Documentation=man:zfs-mount-generator(8)
+DefaultDependencies=no
+Wants=${wants}
+After=${wants}
+${pathdep}
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=${keyloadcmd}
+ExecStop=@sbindir@/zfs unload-key '${dataset}'
+EOF
+    fi
+    # Update the dependencies for the mount file to require the
+    # key-loading unit.
+    wants="${wants},${keyloadunit}"
+  fi
+
   # If the mountpoint has already been created, give it precedence.
   if [ -e "${dest_norm}/${mountfile}" ] ; then
     printf 'zfs-mount-generator: %s already exists\n' "${mountfile}" \
@@ -183,8 +233,8 @@ process_line() {
 SourcePath=${cachefile}
 Documentation=man:zfs-mount-generator(8)
 Before=local-fs.target zfs-mount.service
-After=zfs-import.target
-Wants=zfs-import.target
+After=${wants}
+Wants=${wants}
 
 [Mount]
 Where=${p_mountpoint}
diff --git a/man/man8/zfs-mount-generator.8.in b/man/man8/zfs-mount-generator.8.in
index 79720601d62a..48e4e2dfac29 100644
--- a/man/man8/zfs-mount-generator.8.in
+++ b/man/man8/zfs-mount-generator.8.in
@@ -26,7 +26,7 @@ information on ZFS mountpoints must be stored separately. The output
 of the command
 .PP
 .RS 4
-zfs list -H -o name,mountpoint,canmount,atime,relatime,devices,exec,readonly,setuid,nbmand
+zfs list -H -o name,mountpoint,canmount,atime,relatime,devices,exec,readonly,setuid,nbmand,encroot,keylocation
 .RE
 .PP
 for datasets that should be mounted by systemd, should be kept

From 446d08fba4f2a795a278906167157bb6378176a1 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Tue, 16 Jul 2019 14:14:12 -0700
Subject: [PATCH 065/109] Fix get_special_prop() build failure

The cast of the size_t returned by strlcpy() to a uint64_t by the
VERIFY3U can result in a build failure when CONFIG_FORTIFY_SOURCE
is set.  This is due to the additional hardening.  Since the token
is expected to always fit in strval the VERIFY3U has been removed.
If somehow it doesn't, it will still be safely truncated.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Don Brady <don.brady@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #8999
Closes #9020
---
 module/zfs/zcp_get.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/module/zfs/zcp_get.c b/module/zfs/zcp_get.c
index ed98f0d1025b..0a5f0b8242ab 100644
--- a/module/zfs/zcp_get.c
+++ b/module/zfs/zcp_get.c
@@ -423,13 +423,11 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname,
 	case ZFS_PROP_RECEIVE_RESUME_TOKEN: {
 		char *token = get_receive_resume_stats_impl(ds);
 
-		VERIFY3U(strlcpy(strval, token, ZAP_MAXVALUELEN),
-		    <, ZAP_MAXVALUELEN);
+		(void) strlcpy(strval, token, ZAP_MAXVALUELEN);
 		if (strcmp(strval, "") == 0) {
 			char *childval = get_child_receive_stats(ds);
 
-			VERIFY3U(strlcpy(strval, childval, ZAP_MAXVALUELEN),
-			    <, ZAP_MAXVALUELEN);
+			(void) strlcpy(strval, childval, ZAP_MAXVALUELEN);
 			if (strcmp(strval, "") == 0)
 				error = ENOENT;
 

From 984bfb373fe7816e7c1b3ea0bf3fa937bc34d5d8 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Tue, 16 Jul 2019 17:22:31 -0700
Subject: [PATCH 066/109] Minor style cleanup

Resolve an assortment of style inconsistencies including
use of white space, typos, capitalization, and line wrapping.
There is no functional change.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #9030
---
 config/kernel-fpu.m4                |  9 ++++--
 include/linux/simd_aarch64.h        |  6 ++--
 include/linux/simd_x86.h            | 48 +++++++++++++++--------------
 module/icp/algs/aes/aes_impl.c      | 11 +++++--
 module/icp/algs/modes/gcm.c         | 10 +++---
 module/icp/include/aes/aes_impl.h   |  2 +-
 module/icp/include/modes/gcm_impl.h |  4 +--
 module/spl/spl-thread.c             |  3 +-
 module/zcommon/zfs_fletcher.c       |  6 ++--
 9 files changed, 57 insertions(+), 42 deletions(-)

diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4
index 5fff79a74c70..ebb02fb09a28 100644
--- a/config/kernel-fpu.m4
+++ b/config/kernel-fpu.m4
@@ -18,7 +18,8 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
 		#include <asm/fpu/api.h>
 	],[
 	],[
-		AC_DEFINE(HAVE_KERNEL_FPU_API_HEADER, 1, [kernel has asm/fpu/api.h])
+		AC_DEFINE(HAVE_KERNEL_FPU_API_HEADER, 1,
+		    [kernel has asm/fpu/api.h])
 		AC_MSG_RESULT(asm/fpu/api.h)
 	],[
 		AC_MSG_RESULT(i387.h & xcr.h)
@@ -39,8 +40,10 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
 		kernel_fpu_end();
 	], [kernel_fpu_begin], [arch/x86/kernel/fpu/core.c], [
 		AC_MSG_RESULT(kernel_fpu_*)
-		AC_DEFINE(HAVE_KERNEL_FPU, 1, [kernel has kernel_fpu_* functions])
-		AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions])
+		AC_DEFINE(HAVE_KERNEL_FPU, 1,
+		    [kernel has kernel_fpu_* functions])
+		AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1,
+		    [kernel exports FPU functions])
 	],[
 		ZFS_LINUX_TRY_COMPILE_SYMBOL([
 			#include <linux/module.h>
diff --git a/include/linux/simd_aarch64.h b/include/linux/simd_aarch64.h
index 155ef6205599..56153a16072e 100644
--- a/include/linux/simd_aarch64.h
+++ b/include/linux/simd_aarch64.h
@@ -26,8 +26,10 @@
  * USER API:
  *
  * Kernel fpu methods:
- * 	kfpu_begin()
- * 	kfpu_end()
+ *	kfpu_allowed()
+ *	kfpu_initialize()
+ *	kfpu_begin()
+ *	kfpu_end()
  */
 
 #ifndef _SIMD_AARCH64_H
diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h
index 12cd7467788e..0489bfaa3a70 100644
--- a/include/linux/simd_x86.h
+++ b/include/linux/simd_x86.h
@@ -26,8 +26,10 @@
  * USER API:
  *
  * Kernel fpu methods:
- * 	kfpu_begin()
- * 	kfpu_end()
+ *	kfpu_allowed()
+ *	kfpu_initialize()
+ *	kfpu_begin()
+ *	kfpu_end()
  *
  * SIMD support:
  *
@@ -37,31 +39,31 @@
  * all relevant feature test functions should be called.
  *
  * Supported features:
- * 	zfs_sse_available()
- * 	zfs_sse2_available()
- * 	zfs_sse3_available()
- * 	zfs_ssse3_available()
- * 	zfs_sse4_1_available()
- * 	zfs_sse4_2_available()
+ *	zfs_sse_available()
+ *	zfs_sse2_available()
+ *	zfs_sse3_available()
+ *	zfs_ssse3_available()
+ *	zfs_sse4_1_available()
+ *	zfs_sse4_2_available()
  *
- * 	zfs_avx_available()
- * 	zfs_avx2_available()
+ *	zfs_avx_available()
+ *	zfs_avx2_available()
  *
- * 	zfs_bmi1_available()
- * 	zfs_bmi2_available()
+ *	zfs_bmi1_available()
+ *	zfs_bmi2_available()
  *
- * 	zfs_avx512f_available()
- * 	zfs_avx512cd_available()
- * 	zfs_avx512er_available()
- * 	zfs_avx512pf_available()
- * 	zfs_avx512bw_available()
- * 	zfs_avx512dq_available()
- * 	zfs_avx512vl_available()
- * 	zfs_avx512ifma_available()
- * 	zfs_avx512vbmi_available()
+ *	zfs_avx512f_available()
+ *	zfs_avx512cd_available()
+ *	zfs_avx512er_available()
+ *	zfs_avx512pf_available()
+ *	zfs_avx512bw_available()
+ *	zfs_avx512dq_available()
+ *	zfs_avx512vl_available()
+ *	zfs_avx512ifma_available()
+ *	zfs_avx512vbmi_available()
  *
  * NOTE(AVX-512VL):	If using AVX-512 instructions with 128Bit registers
- * 			also add zfs_avx512vl_available() to feature check.
+ *			also add zfs_avx512vl_available() to feature check.
  */
 
 #ifndef _SIMD_X86_H
@@ -190,7 +192,7 @@ typedef struct cpuid_feature_desc {
  * Descriptions of supported instruction sets
  */
 static const cpuid_feature_desc_t cpuid_features[] = {
-	[SSE]		= {1U, 0U,	1U << 25, 	EDX	},
+	[SSE]		= {1U, 0U,	1U << 25,	EDX	},
 	[SSE2]		= {1U, 0U,	1U << 26,	EDX	},
 	[SSE3]		= {1U, 0U,	1U << 0,	ECX	},
 	[SSSE3]		= {1U, 0U,	1U << 9,	ECX	},
diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c
index e15050635741..36e0686a51c2 100644
--- a/module/icp/algs/aes/aes_impl.c
+++ b/module/icp/algs/aes/aes_impl.c
@@ -303,16 +303,21 @@ aes_impl_init(void)
 	}
 	aes_supp_impl_cnt = c;
 
-	/* set fastest implementation. assume hardware accelerated is fastest */
+	/*
+	 * Set the fastest implementation given the assumption that the
+	 * hardware accelerated version is the fastest.
+	 */
 #if defined(__x86_64)
 #if defined(HAVE_AES)
-	if (aes_aesni_impl.is_supported())
+	if (aes_aesni_impl.is_supported()) {
 		memcpy(&aes_fastest_impl, &aes_aesni_impl,
 		    sizeof (aes_fastest_impl));
-	else
+	} else
 #endif
+	{
 		memcpy(&aes_fastest_impl, &aes_x86_64_impl,
 		    sizeof (aes_fastest_impl));
+	}
 #else
 	memcpy(&aes_fastest_impl, &aes_generic_impl,
 	    sizeof (aes_fastest_impl));
diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c
index 13bceef0f170..0afd957f0cf9 100644
--- a/module/icp/algs/modes/gcm.c
+++ b/module/icp/algs/modes/gcm.c
@@ -646,7 +646,7 @@ const gcm_impl_ops_t *gcm_all_impl[] = {
 /* Indicate that benchmark has been completed */
 static boolean_t gcm_impl_initialized = B_FALSE;
 
-/* Select aes implementation */
+/* Select GCM implementation */
 #define	IMPL_FASTEST	(UINT32_MAX)
 #define	IMPL_CYCLE	(UINT32_MAX-1)
 
@@ -713,13 +713,15 @@ gcm_impl_init(void)
 
 	/* set fastest implementation. assume hardware accelerated is fastest */
 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
-	if (gcm_pclmulqdq_impl.is_supported())
+	if (gcm_pclmulqdq_impl.is_supported()) {
 		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
 		    sizeof (gcm_fastest_impl));
-	else
+	} else
 #endif
+	{
 		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
 		    sizeof (gcm_fastest_impl));
+	}
 
 	strcpy(gcm_fastest_impl.name, "fastest");
 
@@ -742,7 +744,7 @@ static const struct {
  * If we are called before init(), user preference will be saved in
  * user_sel_impl, and applied in later init() call. This occurs when module
  * parameter is specified on module load. Otherwise, directly update
- * icp_aes_impl.
+ * icp_gcm_impl.
  *
  * @val		Name of gcm implementation to use
  * @param	Unused.
diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h
index 95cfddf9e0a4..3a3de91cf6a5 100644
--- a/module/icp/include/aes/aes_impl.h
+++ b/module/icp/include/aes/aes_impl.h
@@ -162,7 +162,7 @@ typedef enum aes_mech_type {
 #endif /* _AES_IMPL */
 
 /*
- * Methods used to define aes implementation
+ * Methods used to define AES implementation
  *
  * @aes_gen_f Key generation
  * @aes_enc_f Function encrypts one block
diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h
index cbb904c059b7..b78cc8aab010 100644
--- a/module/icp/include/modes/gcm_impl.h
+++ b/module/icp/include/modes/gcm_impl.h
@@ -37,12 +37,12 @@ extern "C" {
 #include <sys/crypto/common.h>
 
 /*
- * Methods used to define gcm implementation
+ * Methods used to define GCM implementation
  *
  * @gcm_mul_f Perform carry-less multiplication
  * @gcm_will_work_f Function tests whether implementation will function
  */
-typedef void 		(*gcm_mul_f)(uint64_t *, uint64_t *, uint64_t *);
+typedef void		(*gcm_mul_f)(uint64_t *, uint64_t *, uint64_t *);
 typedef boolean_t	(*gcm_will_work_f)(void);
 
 #define	GCM_IMPL_NAME_MAX (16)
diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c
index d441ad65f317..0352a31ea835 100644
--- a/module/spl/spl-thread.c
+++ b/module/spl/spl-thread.c
@@ -153,8 +153,9 @@ spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...)
 			if (PTR_ERR(tsk) == -ENOMEM)
 				continue;
 			return (NULL);
-		} else
+		} else {
 			return (tsk);
+		}
 	} while (1);
 }
 EXPORT_SYMBOL(spl_kthread_create);
diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
index 5a991ba6073a..f712ce40c6ea 100644
--- a/module/zcommon/zfs_fletcher.c
+++ b/module/zcommon/zfs_fletcher.c
@@ -592,8 +592,9 @@ fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
 }
 
 #if defined(_KERNEL)
-/* Fletcher 4 kstats */
-
+/*
+ * Fletcher 4 kstats
+ */
 static int
 fletcher_4_kstat_headers(char *buf, size_t size)
 {
@@ -669,7 +670,6 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 	zio_cksum_t zc;
 	uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);
 
-
 	fletcher_checksum_func_t *fletcher_4_test = native ?
 	    fletcher_4_native : fletcher_4_byteswap;
 

From 2b9f73e5e6ae6f210b1b316bbd7bcbf8c6c62d61 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Thu, 18 Jul 2019 01:07:53 +0900
Subject: [PATCH 067/109] Use zfsctl_snapshot_hold() wrapper

zfs_refcount_*() are to be wrapped by zfsctl_snapshot_*() in this file.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #9039
---
 module/zfs/zfs_ctldir.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
index 52314f4e1bdb..8acbbb61ca9d 100644
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -192,7 +192,7 @@ static void
 zfsctl_snapshot_add(zfs_snapentry_t *se)
 {
 	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
-	zfs_refcount_add(&se->se_refcount, NULL);
+	zfsctl_snapshot_hold(se);
 	avl_add(&zfs_snapshots_by_name, se);
 	avl_add(&zfs_snapshots_by_objsetid, se);
 }
@@ -269,7 +269,7 @@ zfsctl_snapshot_find_by_name(char *snapname)
 	search.se_name = snapname;
 	se = avl_find(&zfs_snapshots_by_name, &search, NULL);
 	if (se)
-		zfs_refcount_add(&se->se_refcount, NULL);
+		zfsctl_snapshot_hold(se);
 
 	return (se);
 }
@@ -290,7 +290,7 @@ zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid)
 	search.se_objsetid = objsetid;
 	se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL);
 	if (se)
-		zfs_refcount_add(&se->se_refcount, NULL);
+		zfsctl_snapshot_hold(se);
 
 	return (se);
 }

From ceb516ac2f4c2ddffcea8a6d282312dd941d3296 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michael=20Niew=C3=B6hner?=
 <c0d3z3r0@users.noreply.github.com>
Date: Wed, 17 Jul 2019 18:09:22 +0200
Subject: [PATCH 068/109] Add missing __GFP_HIGHMEM flag to vmalloc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make use of __GFP_HIGHMEM flag in vmem_alloc, which is required for
some 32-bit systems to make use of full available memory.
While kernel versions >=4.12-rc1 add this flag implicitly, older
kernels do not.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Signed-off-by: Michael Niewöhner <foss@mniewoehner.de>
Closes #9031
---
 module/spl/spl-kmem.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index 1fdb61e6fce1..824b5e89f507 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -180,7 +180,8 @@ spl_kmem_alloc_impl(size_t size, int flags, int node)
 		 */
 		if ((size > spl_kmem_alloc_max) || use_vmem) {
 			if (flags & KM_VMEM) {
-				ptr = __vmalloc(size, lflags, PAGE_KERNEL);
+				ptr = __vmalloc(size, lflags | __GFP_HIGHMEM,
+				    PAGE_KERNEL);
 			} else {
 				return (NULL);
 			}

From 4c98586dafab4518a7eea8db9a19271e99ae3110 Mon Sep 17 00:00:00 2001
From: jdike <52420226+jdike@users.noreply.github.com>
Date: Wed, 17 Jul 2019 12:18:24 -0400
Subject: [PATCH 069/109] Fix lockdep recursive locking false positive in
 dbuf_destroy

lockdep reports a possible recursive lock in dbuf_destroy.

It is true that dbuf_destroy is acquiring the dn_dbufs_mtx
on one dnode while holding it on another dnode.  However,
it is impossible for these to be the same dnode because,
among other things, dbuf_destroy checks MUTEX_HELD before
acquiring the mutex.

This fix defines a class NESTED_SINGLE == 1 and changes
that lock to call mutex_enter_nested with a subclass of
NESTED_SINGLE.

In order to make the userspace code compile,
include/sys/zfs_context.h now defines mutex_enter_nested and
NESTED_SINGLE.

This is the lockdep report:

[  122.950921] ============================================
[  122.950921] WARNING: possible recursive locking detected
[  122.950921] 4.19.29-4.19.0-debug-d69edad5368c1166 #1 Tainted: G           O
[  122.950921] --------------------------------------------
[  122.950921] dbu_evict/1457 is trying to acquire lock:
[  122.950921] 0000000083e9cbcf (&dn->dn_dbufs_mtx){+.+.}, at: dbuf_destroy+0x3c0/0xdb0 [zfs]
[  122.950921]
               but task is already holding lock:
[  122.950921] 0000000055523987 (&dn->dn_dbufs_mtx){+.+.}, at: dnode_evict_dbufs+0x90/0x740 [zfs]
[  122.950921]
               other info that might help us debug this:
[  122.950921]  Possible unsafe locking scenario:

[  122.950921]        CPU0
[  122.950921]        ----
[  122.950921]   lock(&dn->dn_dbufs_mtx);
[  122.950921]   lock(&dn->dn_dbufs_mtx);
[  122.950921]
                *** DEADLOCK ***

[  122.950921]  May be due to missing lock nesting notation

[  122.950921] 1 lock held by dbu_evict/1457:
[  122.950921]  #0: 0000000055523987 (&dn->dn_dbufs_mtx){+.+.}, at: dnode_evict_dbufs+0x90/0x740 [zfs]
[  122.950921]
               stack backtrace:
[  122.950921] CPU: 0 PID: 1457 Comm: dbu_evict Tainted: G           O      4.19.29-4.19.0-debug-d69edad5368c1166 #1
[  122.950921] Hardware name: Supermicro H8SSL-I2/H8SSL-I2, BIOS 080011  03/13/2009
[  122.950921] Call Trace:
[  122.950921]  dump_stack+0x91/0xeb
[  122.950921]  __lock_acquire+0x2ca7/0x4f10
[  122.950921]  lock_acquire+0x153/0x330
[  122.950921]  dbuf_destroy+0x3c0/0xdb0 [zfs]
[  122.950921]  dbuf_evict_one+0x1cc/0x3d0 [zfs]
[  122.950921]  dbuf_rele_and_unlock+0xb84/0xd60 [zfs]
[  122.950921]  dnode_evict_dbufs+0x3a6/0x740 [zfs]
[  122.950921]  dmu_objset_evict+0x7a/0x500 [zfs]
[  122.950921]  dsl_dataset_evict_async+0x70/0x480 [zfs]
[  122.950921]  taskq_thread+0x979/0x1480 [spl]
[  122.950921]  kthread+0x2e7/0x3e0
[  122.950921]  ret_from_fork+0x27/0x50

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Jeff Dike <jdike@akamai.com>
Closes #8984
---
 include/spl/sys/mutex.h   | 2 ++
 include/sys/zfs_context.h | 2 ++
 module/zfs/dbuf.c         | 3 ++-
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/spl/sys/mutex.h b/include/spl/sys/mutex.h
index ed0cd4932cfa..a61f35c61eb1 100644
--- a/include/spl/sys/mutex.h
+++ b/include/spl/sys/mutex.h
@@ -127,6 +127,8 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp)			\
 })
 /* END CSTYLED */
 
+#define	NESTED_SINGLE 1
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define	mutex_enter_nested(mp, subclass)			\
 {								\
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index e3fa2e61bdc9..598b86a7a659 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -257,6 +257,8 @@ extern void mutex_enter(kmutex_t *mp);
 extern void mutex_exit(kmutex_t *mp);
 extern int mutex_tryenter(kmutex_t *mp);
 
+#define	NESTED_SINGLE 1
+#define	mutex_enter_nested(mp, class) mutex_enter(mp)
 /*
  * RW locks
  */
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 07e616f6f0de..94c49b3ef0a9 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -2591,7 +2591,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	if (db->db_blkid != DMU_BONUS_BLKID) {
 		boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
 		if (needlock)
-			mutex_enter(&dn->dn_dbufs_mtx);
+			mutex_enter_nested(&dn->dn_dbufs_mtx,
+			    NESTED_SINGLE);
 		avl_remove(&dn->dn_dbufs, db);
 		atomic_dec_32(&dn->dn_dbufs_count);
 		membar_producer();

From 54561073e7f6e258f6c9e96be60821d51db2ac34 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 12 Jul 2019 13:27:24 -0700
Subject: [PATCH 070/109] Linux 5.3 compat: rw_semaphore owner

Commit https://github.com/torvalds/linux/commit/94a9717b updated the
rwsem's owner field to contain additional flags describing the rwsem's
state.  Rather than update the wrappers to mask out these bits, the
code no longer relies on the owner stored by the kernel.  This does
increase the size of a krwlock_t but it makes the implementation
less sensitive to future kernel changes.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #9029
---
 include/spl/sys/rwlock.h | 68 +++-------------------------------------
 module/spl/spl-rwlock.c  |  3 --
 2 files changed, 5 insertions(+), 66 deletions(-)

diff --git a/include/spl/sys/rwlock.h b/include/spl/sys/rwlock.h
index 408defac20d3..5e052b532a42 100644
--- a/include/spl/sys/rwlock.h
+++ b/include/spl/sys/rwlock.h
@@ -78,15 +78,9 @@ typedef enum {
 	RW_READER	= 2
 } krw_t;
 
-/*
- * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, rw_semaphore will have an owner
- * field, so we don't need our own.
- */
 typedef struct {
 	struct rw_semaphore rw_rwlock;
-#ifndef CONFIG_RWSEM_SPIN_ON_OWNER
 	kthread_t *rw_owner;
-#endif
 #ifdef CONFIG_LOCKDEP
 	krw_type_t	rw_type;
 #endif /* CONFIG_LOCKDEP */
@@ -97,31 +91,19 @@ typedef struct {
 static inline void
 spl_rw_set_owner(krwlock_t *rwp)
 {
-/*
- * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, down_write, up_write,
- * downgrade_write and __init_rwsem will set/clear owner for us.
- */
-#ifndef CONFIG_RWSEM_SPIN_ON_OWNER
 	rwp->rw_owner = current;
-#endif
 }
 
 static inline void
 spl_rw_clear_owner(krwlock_t *rwp)
 {
-#ifndef CONFIG_RWSEM_SPIN_ON_OWNER
 	rwp->rw_owner = NULL;
-#endif
 }
 
 static inline kthread_t *
 rw_owner(krwlock_t *rwp)
 {
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-	return (SEM(rwp)->owner);
-#else
 	return (rwp->rw_owner);
-#endif
 }
 
 #ifdef CONFIG_LOCKDEP
@@ -148,62 +130,22 @@ spl_rw_lockdep_on_maybe(krwlock_t *rwp)			\
 #define	spl_rw_lockdep_on_maybe(rwp)
 #endif /* CONFIG_LOCKDEP */
 
-
 static inline int
-RW_WRITE_HELD(krwlock_t *rwp)
+RW_LOCK_HELD(krwlock_t *rwp)
 {
-	return (rw_owner(rwp) == current);
+	return (spl_rwsem_is_locked(SEM(rwp)));
 }
 
 static inline int
-RW_LOCK_HELD(krwlock_t *rwp)
+RW_WRITE_HELD(krwlock_t *rwp)
 {
-	return (spl_rwsem_is_locked(SEM(rwp)));
+	return (rw_owner(rwp) == current);
 }
 
 static inline int
 RW_READ_HELD(krwlock_t *rwp)
 {
-	if (!RW_LOCK_HELD(rwp))
-		return (0);
-
-	/*
-	 * rw_semaphore cheat sheet:
-	 *
-	 * < 3.16:
-	 * There's no rw_semaphore.owner, so use rwp.owner instead.
-	 * If rwp.owner == NULL then it's a reader
-	 *
-	 * 3.16 - 4.7:
-	 * rw_semaphore.owner added (https://lwn.net/Articles/596656/)
-	 * and CONFIG_RWSEM_SPIN_ON_OWNER introduced.
-	 * If rw_semaphore.owner == NULL then it's a reader
-	 *
-	 * 4.8 - 4.16.16:
-	 * RWSEM_READER_OWNED added as an internal #define.
-	 * (https://lore.kernel.org/patchwork/patch/678590/)
-	 * If rw_semaphore.owner == 1 then it's a reader
-	 *
-	 * 4.16.17 - 4.19:
-	 * RWSEM_OWNER_UNKNOWN introduced as ((struct task_struct *)-1L)
-	 * (https://do-db2.lkml.org/lkml/2018/5/15/985)
-	 * If rw_semaphore.owner == 1 then it's a reader.
-	 *
-	 * 4.20+:
-	 * RWSEM_OWNER_UNKNOWN changed to ((struct task_struct *)-2L)
-	 * (https://lkml.org/lkml/2018/9/6/986)
-	 * If rw_semaphore.owner & 1 then it's a reader, and also the reader's
-	 * task_struct may be embedded in rw_semaphore->owner.
-	 */
-#if	defined(CONFIG_RWSEM_SPIN_ON_OWNER) && defined(RWSEM_OWNER_UNKNOWN)
-	if (RWSEM_OWNER_UNKNOWN == (struct task_struct *)-2L) {
-		/* 4.20+ kernels with CONFIG_RWSEM_SPIN_ON_OWNER */
-		return ((unsigned long) SEM(rwp)->owner & 1);
-	}
-#endif
-
-	/* < 4.20 kernel or !CONFIG_RWSEM_SPIN_ON_OWNER */
-	return (rw_owner(rwp) == NULL || (unsigned long) rw_owner(rwp) == 1);
+	return (RW_LOCK_HELD(rwp) && rw_owner(rwp) == NULL);
 }
 
 /*
diff --git a/module/spl/spl-rwlock.c b/module/spl/spl-rwlock.c
index 86727ed1957c..886e16924e65 100644
--- a/module/spl/spl-rwlock.c
+++ b/module/spl/spl-rwlock.c
@@ -119,9 +119,6 @@ rwsem_tryupgrade(struct rw_semaphore *rwsem)
 	if (__rwsem_tryupgrade(rwsem)) {
 		rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
 		rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-		rwsem->owner = current;
-#endif
 		return (1);
 	}
 	return (0);

From 3982d959c5b8577993740c03392c4efa750c0479 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 12 Jul 2019 14:06:36 -0700
Subject: [PATCH 071/109] Linux 5.3 compat: retire rw_tryupgrade()

The Linux kernel's rwsem's have never provided an interface to
allow a reader to be upgraded to a writer.  Historically, this
functionality has been implemented by a SPL wrapper function.
However, this approach depends on internal knowledge of the
rw_semaphore and is therefore rather brittle.

Since the ZFS code must always be able to fall back to rw_exit()
and rw_enter() when an rw_tryupgrade() fails; this functionality
isn't critical.  Furthermore, the only potentially performance
sensitive consumer is dmu_zfetch() and no decrease in performance
was observed with this change applied.  See the PR comments for
additional testing details.

Therefore, it is being retired to make the build more robust and
to simplify the rwlock implementation.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #9029
---
 include/spl/sys/rwlock.h |  60 +++--------------------
 module/spl/spl-rwlock.c  | 101 ---------------------------------------
 2 files changed, 7 insertions(+), 154 deletions(-)

diff --git a/include/spl/sys/rwlock.h b/include/spl/sys/rwlock.h
index 5e052b532a42..89e02fa8f044 100644
--- a/include/spl/sys/rwlock.h
+++ b/include/spl/sys/rwlock.h
@@ -29,43 +29,6 @@
 #include <linux/rwsem.h>
 #include <linux/sched.h>
 
-/* Linux kernel compatibility */
-#if defined(CONFIG_PREEMPT_RT_FULL)
-#define	SPL_RWSEM_SINGLE_READER_VALUE	(1)
-#define	SPL_RWSEM_SINGLE_WRITER_VALUE	(0)
-#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
-#define	SPL_RWSEM_SINGLE_READER_VALUE	(1)
-#define	SPL_RWSEM_SINGLE_WRITER_VALUE	(-1)
-#elif defined(RWSEM_ACTIVE_MASK)
-#define	SPL_RWSEM_SINGLE_READER_VALUE	(RWSEM_ACTIVE_READ_BIAS)
-#define	SPL_RWSEM_SINGLE_WRITER_VALUE	(RWSEM_ACTIVE_WRITE_BIAS)
-#endif
-
-/* Linux 3.16 changed activity to count for rwsem-spinlock */
-#if defined(CONFIG_PREEMPT_RT_FULL)
-#define	RWSEM_COUNT(sem)	sem->read_depth
-#elif defined(HAVE_RWSEM_ACTIVITY)
-#define	RWSEM_COUNT(sem)	sem->activity
-/* Linux 4.8 changed count to an atomic_long_t for !rwsem-spinlock */
-#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT)
-#define	RWSEM_COUNT(sem)	atomic_long_read(&(sem)->count)
-#else
-#define	RWSEM_COUNT(sem)	sem->count
-#endif
-
-#if defined(RWSEM_SPINLOCK_IS_RAW)
-#define	spl_rwsem_lock_irqsave(lk, fl)		raw_spin_lock_irqsave(lk, fl)
-#define	spl_rwsem_unlock_irqrestore(lk, fl)	\
-    raw_spin_unlock_irqrestore(lk, fl)
-#define	spl_rwsem_trylock_irqsave(lk, fl)	raw_spin_trylock_irqsave(lk, fl)
-#else
-#define	spl_rwsem_lock_irqsave(lk, fl)		spin_lock_irqsave(lk, fl)
-#define	spl_rwsem_unlock_irqrestore(lk, fl)	spin_unlock_irqrestore(lk, fl)
-#define	spl_rwsem_trylock_irqsave(lk, fl)	spin_trylock_irqsave(lk, fl)
-#endif /* RWSEM_SPINLOCK_IS_RAW */
-
-#define	spl_rwsem_is_locked(rwsem)		rwsem_is_locked(rwsem)
-
 typedef enum {
 	RW_DRIVER	= 2,
 	RW_DEFAULT	= 4,
@@ -133,7 +96,7 @@ spl_rw_lockdep_on_maybe(krwlock_t *rwp)			\
 static inline int
 RW_LOCK_HELD(krwlock_t *rwp)
 {
-	return (spl_rwsem_is_locked(SEM(rwp)));
+	return (rwsem_is_locked(SEM(rwp)));
 }
 
 static inline int
@@ -170,6 +133,12 @@ RW_READ_HELD(krwlock_t *rwp)
  */
 #define	rw_destroy(rwp)		((void) 0)
 
+/*
+ * Upgrading a rwsem from a reader to a writer is not supported by the
+ * Linux kernel.  The lock must be dropped and reacquired as a writer.
+ */
+#define	rw_tryupgrade(rwp)	RW_WRITE_HELD(rwp)
+
 #define	rw_tryenter(rwp, rw)						\
 ({									\
 	int _rc_ = 0;							\
@@ -228,24 +197,9 @@ RW_READ_HELD(krwlock_t *rwp)
 	spl_rw_lockdep_on_maybe(rwp);					\
 })
 
-#define	rw_tryupgrade(rwp)						\
-({									\
-	int _rc_ = 0;							\
-									\
-	if (RW_WRITE_HELD(rwp)) {					\
-		_rc_ = 1;						\
-	} else {							\
-		spl_rw_lockdep_off_maybe(rwp);				\
-		if ((_rc_ = rwsem_tryupgrade(SEM(rwp))))		\
-			spl_rw_set_owner(rwp);				\
-		spl_rw_lockdep_on_maybe(rwp);				\
-	}								\
-	_rc_;								\
-})
 /* END CSTYLED */
 
 int spl_rw_init(void);
 void spl_rw_fini(void);
-int rwsem_tryupgrade(struct rw_semaphore *rwsem);
 
 #endif /* _SPL_RWLOCK_H */
diff --git a/module/spl/spl-rwlock.c b/module/spl/spl-rwlock.c
index 886e16924e65..10f7c38db4eb 100644
--- a/module/spl/spl-rwlock.c
+++ b/module/spl/spl-rwlock.c
@@ -24,106 +24,5 @@
  *  Solaris Porting Layer (SPL) Reader/Writer Lock Implementation.
  */
 
-#include <sys/rwlock.h>
-#include <linux/module.h>
-
-#if defined(CONFIG_PREEMPT_RT_FULL)
-
-#include <linux/rtmutex.h>
-#define	RT_MUTEX_OWNER_MASKALL	1UL
-
-static int
-__rwsem_tryupgrade(struct rw_semaphore *rwsem)
-{
-#if defined(READER_BIAS) && defined(WRITER_BIAS)
-	/*
-	 * After the 4.9.20-rt16 kernel the realtime patch series lifted the
-	 * single reader restriction.  While this could be accommodated by
-	 * adding additional compatibility code assume the rwsem can never
-	 * be upgraded.  All caller must already cleanly handle this case.
-	 */
-	return (0);
-#else
-	ASSERT((struct task_struct *)
-	    ((unsigned long)rwsem->lock.owner & ~RT_MUTEX_OWNER_MASKALL) ==
-	    current);
-
-	/*
-	 * Prior to 4.9.20-rt16 kernel the realtime patch series, rwsem is
-	 * implemented as a single mutex held by readers and writers alike.
-	 * However, this implementation would prevent a thread from taking
-	 * a read lock twice, as the mutex would already be locked on
-	 * the second attempt. Therefore the implementation allows a
-	 * single thread to take a rwsem as read lock multiple times
-	 * tracking that nesting as read_depth counter.
-	 */
-	if (rwsem->read_depth <= 1) {
-		/*
-		 * In case, the current thread has not taken the lock
-		 * more than once as read lock, we can allow an
-		 * upgrade to a write lock. rwsem_rt.h implements
-		 * write locks as read_depth == 0.
-		 */
-		rwsem->read_depth = 0;
-		return (1);
-	}
-	return (0);
-#endif
-}
-#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
-static int
-__rwsem_tryupgrade(struct rw_semaphore *rwsem)
-{
-	int ret = 0;
-	unsigned long flags;
-	spl_rwsem_lock_irqsave(&rwsem->wait_lock, flags);
-	if (RWSEM_COUNT(rwsem) == SPL_RWSEM_SINGLE_READER_VALUE &&
-	    list_empty(&rwsem->wait_list)) {
-		ret = 1;
-		RWSEM_COUNT(rwsem) = SPL_RWSEM_SINGLE_WRITER_VALUE;
-	}
-	spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, flags);
-	return (ret);
-}
-#elif defined(RWSEM_ACTIVE_MASK)
-#if defined(HAVE_RWSEM_ATOMIC_LONG_COUNT)
-static int
-__rwsem_tryupgrade(struct rw_semaphore *rwsem)
-{
-	long val;
-	val = atomic_long_cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE,
-	    SPL_RWSEM_SINGLE_WRITER_VALUE);
-	return (val == SPL_RWSEM_SINGLE_READER_VALUE);
-}
-#else
-static int
-__rwsem_tryupgrade(struct rw_semaphore *rwsem)
-{
-	typeof(rwsem->count) val;
-	val = cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE,
-	    SPL_RWSEM_SINGLE_WRITER_VALUE);
-	return (val == SPL_RWSEM_SINGLE_READER_VALUE);
-}
-#endif
-#else
-static int
-__rwsem_tryupgrade(struct rw_semaphore *rwsem)
-{
-	return (0);
-}
-#endif
-
-int
-rwsem_tryupgrade(struct rw_semaphore *rwsem)
-{
-	if (__rwsem_tryupgrade(rwsem)) {
-		rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
-		rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
-		return (1);
-	}
-	return (0);
-}
-EXPORT_SYMBOL(rwsem_tryupgrade);
-
 int spl_rw_init(void) { return 0; }
 void spl_rw_fini(void) { }

From 428a63cc62c31056b602e80ec072d8093ca049c8 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 12 Jul 2019 14:40:15 -0700
Subject: [PATCH 072/109] Retire unused spl_{mutex,rw}_{init,fini}

These functions are unused and can be removed along
with the spl-mutex.c and spl-rwlock.c source files.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #9029
---
 include/spl/sys/mutex.h  |  3 ---
 include/spl/sys/rwlock.h |  4 ----
 module/spl/Makefile.in   |  2 --
 module/spl/spl-generic.c | 38 +++++++++++++-------------------------
 module/spl/spl-mutex.c   | 30 ------------------------------
 module/spl/spl-rwlock.c  | 28 ----------------------------
 6 files changed, 13 insertions(+), 92 deletions(-)
 delete mode 100644 module/spl/spl-mutex.c
 delete mode 100644 module/spl/spl-rwlock.c

diff --git a/include/spl/sys/mutex.h b/include/spl/sys/mutex.h
index a61f35c61eb1..73da23685590 100644
--- a/include/spl/sys/mutex.h
+++ b/include/spl/sys/mutex.h
@@ -181,7 +181,4 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp)			\
 	/* NOTE: do not dereference mp after this point */	\
 }
 
-int spl_mutex_init(void);
-void spl_mutex_fini(void);
-
 #endif /* _SPL_MUTEX_H */
diff --git a/include/spl/sys/rwlock.h b/include/spl/sys/rwlock.h
index 89e02fa8f044..60f5bfd986b4 100644
--- a/include/spl/sys/rwlock.h
+++ b/include/spl/sys/rwlock.h
@@ -196,10 +196,6 @@ RW_READ_HELD(krwlock_t *rwp)
 	downgrade_write(SEM(rwp));					\
 	spl_rw_lockdep_on_maybe(rwp);					\
 })
-
 /* END CSTYLED */
 
-int spl_rw_init(void);
-void spl_rw_fini(void);
-
 #endif /* _SPL_RWLOCK_H */
diff --git a/module/spl/Makefile.in b/module/spl/Makefile.in
index 3bcbf63cbc63..e16666aa94f3 100644
--- a/module/spl/Makefile.in
+++ b/module/spl/Makefile.in
@@ -16,10 +16,8 @@ $(MODULE)-objs += spl-kmem.o
 $(MODULE)-objs += spl-kmem-cache.o
 $(MODULE)-objs += spl-kobj.o
 $(MODULE)-objs += spl-kstat.o
-$(MODULE)-objs += spl-mutex.o
 $(MODULE)-objs += spl-proc.o
 $(MODULE)-objs += spl-procfs-list.o
-$(MODULE)-objs += spl-rwlock.o
 $(MODULE)-objs += spl-taskq.o
 $(MODULE)-objs += spl-thread.o
 $(MODULE)-objs += spl-tsd.o
diff --git a/module/spl/spl-generic.c b/module/spl/spl-generic.c
index cd2fa2020510..3c5ef60bd1a4 100644
--- a/module/spl/spl-generic.c
+++ b/module/spl/spl-generic.c
@@ -694,51 +694,41 @@ spl_init(void)
 	if ((rc = spl_kvmem_init()))
 		goto out1;
 
-	if ((rc = spl_mutex_init()))
-		goto out2;
-
-	if ((rc = spl_rw_init()))
-		goto out3;
-
 	if ((rc = spl_tsd_init()))
-		goto out4;
+		goto out2;
 
 	if ((rc = spl_taskq_init()))
-		goto out5;
+		goto out3;
 
 	if ((rc = spl_kmem_cache_init()))
-		goto out6;
+		goto out4;
 
 	if ((rc = spl_vn_init()))
-		goto out7;
+		goto out5;
 
 	if ((rc = spl_proc_init()))
-		goto out8;
+		goto out6;
 
 	if ((rc = spl_kstat_init()))
-		goto out9;
+		goto out7;
 
 	if ((rc = spl_zlib_init()))
-		goto out10;
+		goto out8;
 
 	return (rc);
 
-out10:
-	spl_kstat_fini();
-out9:
-	spl_proc_fini();
 out8:
-	spl_vn_fini();
+	spl_kstat_fini();
 out7:
-	spl_kmem_cache_fini();
+	spl_proc_fini();
 out6:
-	spl_taskq_fini();
+	spl_vn_fini();
 out5:
-	spl_tsd_fini();
+	spl_kmem_cache_fini();
 out4:
-	spl_rw_fini();
+	spl_taskq_fini();
 out3:
-	spl_mutex_fini();
+	spl_tsd_fini();
 out2:
 	spl_kvmem_fini();
 out1:
@@ -755,8 +745,6 @@ spl_fini(void)
 	spl_kmem_cache_fini();
 	spl_taskq_fini();
 	spl_tsd_fini();
-	spl_rw_fini();
-	spl_mutex_fini();
 	spl_kvmem_fini();
 }
 
diff --git a/module/spl/spl-mutex.c b/module/spl/spl-mutex.c
deleted file mode 100644
index ba818862b679..000000000000
--- a/module/spl/spl-mutex.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
- *  Copyright (C) 2007 The Regents of the University of California.
- *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
- *  UCRL-CODE-235197
- *
- *  This file is part of the SPL, Solaris Porting Layer.
- *  For details, see <http://zfsonlinux.org/>.
- *
- *  The SPL is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License as published by the
- *  Free Software Foundation; either version 2 of the License, or (at your
- *  option) any later version.
- *
- *  The SPL is distributed in the hope that it will be useful, but WITHOUT
- *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- *  for more details.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
- *
- *  Solaris Porting Layer (SPL) Mutex Implementation.
- */
-
-#include <sys/mutex.h>
-
-int spl_mutex_init(void) { return 0; }
-void spl_mutex_fini(void) { }
diff --git a/module/spl/spl-rwlock.c b/module/spl/spl-rwlock.c
deleted file mode 100644
index 10f7c38db4eb..000000000000
--- a/module/spl/spl-rwlock.c
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
- *  Copyright (C) 2007 The Regents of the University of California.
- *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
- *  UCRL-CODE-235197
- *
- *  This file is part of the SPL, Solaris Porting Layer.
- *  For details, see <http://zfsonlinux.org/>.
- *
- *  The SPL is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License as published by the
- *  Free Software Foundation; either version 2 of the License, or (at your
- *  option) any later version.
- *
- *  The SPL is distributed in the hope that it will be useful, but WITHOUT
- *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- *  for more details.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
- *
- *  Solaris Porting Layer (SPL) Reader/Writer Lock Implementation.
- */
-
-int spl_rw_init(void) { return 0; }
-void spl_rw_fini(void) { }

From 3c144b92671df9c6e9d926e6c19a34893645500e Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Fri, 19 Jul 2019 04:48:46 +0900
Subject: [PATCH 073/109] Fix wrong comment on zcr_blksz_{min,max}

These aren't tunable; illumos has this comment fixed in
"3742 zfs comments need cleaner, more consistent style",
so sync with that.

Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #9052
---
 module/zfs/zfs_vnops.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 9d8a9cbc5419..4f07111f25e3 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -5074,13 +5074,14 @@ zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
 
 #ifdef HAVE_UIO_ZEROCOPY
 /*
- * Tunable, both must be a power of 2.
- *
- * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
- * zcr_blksz_max: if set to less than the file block size, allow loaning out of
- *		an arcbuf for a partial block read
+ * The smallest read we may consider to loan out an arcbuf.
+ * This must be a power of 2.
  */
 int zcr_blksz_min = (1 << 10);	/* 1K */
+/*
+ * If set to less than the file block size, allow loaning out of an
+ * arcbuf for a partial block read.  This must be a power of 2.
+ */
 int zcr_blksz_max = (1 << 17);	/* 128K */
 
 /*ARGSUSED*/

From bbbe4b0a9885fb671186da86b63c09f262852c65 Mon Sep 17 00:00:00 2001
From: Serapheim Dimitropoulos <serapheim@delphix.com>
Date: Thu, 18 Jul 2019 12:55:29 -0700
Subject: [PATCH 074/109] hdr_recl calls zthr_wakeup() on destroyed zthr

There exists a race condition where hdr_recl() calls
zthr_wakeup() on a destroyed zthr. The timeline is the
following:

[1] hdr_recl() runs first and goes into zthr_wakeup()
    because arc_initialized is set.
[2] arc_fini() is called by another thread, zeroes
    that flag, destroying the zthr, and goes into
    buf_init().
[3] hdr_recl() tries to enter the destroyed mutex
    and we blow up.

This patch ensures that the ARC's zthrs are not offloaded
any new work once arc_initialized is cleared and then destroys
them after all of the ARC state has been deleted.

Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #9047
---
 module/zfs/arc.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index f125ca6a4d14..53a44bdaf44c 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2018, Joyent, Inc.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  */
@@ -5079,6 +5079,9 @@ arc_kmem_reap_soon(void)
 static boolean_t
 arc_adjust_cb_check(void *arg, zthr_t *zthr)
 {
+	if (!arc_initialized)
+		return (B_FALSE);
+
 	/*
 	 * This is necessary so that any changes which may have been made to
 	 * many of the zfs_arc_* module parameters will be propagated to
@@ -5166,6 +5169,9 @@ arc_adjust_cb(void *arg, zthr_t *zthr)
 static boolean_t
 arc_reap_cb_check(void *arg, zthr_t *zthr)
 {
+	if (!arc_initialized)
+		return (B_FALSE);
+
 	int64_t free_memory = arc_available_memory();
 
 	/*
@@ -7924,11 +7930,9 @@ arc_fini(void)
 
 	list_destroy(&arc_prune_list);
 	mutex_destroy(&arc_prune_mtx);
-	(void) zthr_cancel(arc_adjust_zthr);
-	zthr_destroy(arc_adjust_zthr);
 
+	(void) zthr_cancel(arc_adjust_zthr);
 	(void) zthr_cancel(arc_reap_zthr);
-	zthr_destroy(arc_reap_zthr);
 
 	mutex_destroy(&arc_adjust_lock);
 	cv_destroy(&arc_adjust_waiters_cv);
@@ -7941,6 +7945,14 @@ arc_fini(void)
 	buf_fini();
 	arc_state_fini();
 
+	/*
+	 * We destroy the zthrs after all the ARC state has been
+	 * torn down to avoid the case of them receiving any
+	 * wakeup() signals after they are destroyed.
+	 */
+	zthr_destroy(arc_adjust_zthr);
+	zthr_destroy(arc_reap_zthr);
+
 	ASSERT0(arc_loaned_bytes);
 }
 

From 1c4b0fc7457d6c6dac801f4a4a694ffe954bb91f Mon Sep 17 00:00:00 2001
From: Serapheim Dimitropoulos <serapheim@delphix.com>
Date: Thu, 18 Jul 2019 13:02:33 -0700
Subject: [PATCH 075/109] Race condition between spa async threads and export

In the past we've seen multiple race conditions that have
to do with open-context threads, async threads, and concurrent
calls to spa_export()/spa_destroy() (including the one
referenced in issue #9015).

This patch ensures that only one thread can execute the
main body of spa_export_common() at a time, with subsequent
threads returning with a new error code created just for
this situation, thereby eliminating any race condition
bugs introduced by concurrent calls to this function.

Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #9015
Closes #9044
---
 cmd/ztest/ztest.c        | 18 +++++++++++++++++-
 include/libzfs.h         |  1 +
 include/sys/fs/zfs.h     |  1 +
 include/sys/spa_impl.h   |  1 +
 lib/libzfs/libzfs_util.c |  5 +++++
 module/zfs/spa.c         | 18 +++++++++++++++++-
 6 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 9c2cf9501831..3bf840d88ed6 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -2745,8 +2745,24 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
 	VERIFY3U(EEXIST, ==,
 	    spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL));
 	nvlist_free(nvroot);
+
+	/*
+	 * We open a reference to the spa and then we try to export it
+	 * expecting one of the following errors:
+	 *
+	 * EBUSY
+	 *	Because of the reference we just opened.
+	 *
+	 * ZFS_ERR_EXPORT_IN_PROGRESS
+	 *	For the case that there is another ztest thread doing
+	 *	an export concurrently.
+	 */
 	VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
-	VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool));
+	int error = spa_destroy(zo->zo_pool);
+	if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) {
+		fatal(0, "spa_destroy(%s) returned unexpected value %d",
+		    spa->spa_name, error);
+	}
 	spa_close(spa, FTAG);
 
 	(void) pthread_rwlock_unlock(&ztest_name_lock);
diff --git a/include/libzfs.h b/include/libzfs.h
index e2ec2d9bce7b..a5b2a8393f43 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -147,6 +147,7 @@ typedef enum zfs_error {
 	EZFS_NO_TRIM,		/* no active trim */
 	EZFS_TRIM_NOTSUP,	/* device does not support trim */
 	EZFS_NO_RESILVER_DEFER,	/* pool doesn't support resilver_defer */
+	EZFS_EXPORT_IN_PROGRESS,	/* currently exporting the pool */
 	EZFS_UNKNOWN
 } zfs_error_t;
 
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 3bcefdbfd775..c167a594a7d4 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -1318,6 +1318,7 @@ typedef enum {
 	ZFS_ERR_FROM_IVSET_GUID_MISSING,
 	ZFS_ERR_FROM_IVSET_GUID_MISMATCH,
 	ZFS_ERR_SPILL_BLOCK_FLAG_MISSING,
+	ZFS_ERR_EXPORT_IN_PROGRESS,
 } zfs_errno_t;
 
 /*
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 66032d9aad7a..0de8613d3eb8 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -219,6 +219,7 @@ struct spa {
 	spa_taskqs_t	spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
 	dsl_pool_t	*spa_dsl_pool;
 	boolean_t	spa_is_initializing;	/* true while opening pool */
+	boolean_t	spa_is_exporting;	/* true while exporting pool */
 	metaslab_class_t *spa_normal_class;	/* normal data class */
 	metaslab_class_t *spa_log_class;	/* intent log data class */
 	metaslab_class_t *spa_special_class;	/* special allocation class */
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index 19bb57ad4378..dc2d68ebebbe 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -303,6 +303,8 @@ libzfs_error_description(libzfs_handle_t *hdl)
 	case EZFS_NO_RESILVER_DEFER:
 		return (dgettext(TEXT_DOMAIN, "this action requires the "
 		    "resilver_defer feature"));
+	case EZFS_EXPORT_IN_PROGRESS:
+		return (dgettext(TEXT_DOMAIN, "pool export in progress"));
 	case EZFS_UNKNOWN:
 		return (dgettext(TEXT_DOMAIN, "unknown error"));
 	default:
@@ -598,6 +600,9 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
 	case ZFS_ERR_VDEV_TOO_BIG:
 		zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap);
 		break;
+	case ZFS_ERR_EXPORT_IN_PROGRESS:
+		zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap);
+		break;
 	case ZFS_ERR_IOC_CMD_UNAVAIL:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs "
 		    "module does not support this operation. A reboot may "
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index eb3ff91a073c..ce622cee88b0 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -5722,6 +5722,13 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
 		return (SET_ERROR(ENOENT));
 	}
 
+	if (spa->spa_is_exporting) {
+		/* the pool is being exported by another thread */
+		mutex_exit(&spa_namespace_lock);
+		return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
+	}
+	spa->spa_is_exporting = B_TRUE;
+
 	/*
 	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
 	 * reacquire the namespace lock, and see if we can export.
@@ -5757,6 +5764,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
 	    (spa->spa_inject_ref != 0 &&
 	    new_state != POOL_STATE_UNINITIALIZED)) {
 		spa_async_resume(spa);
+		spa->spa_is_exporting = B_FALSE;
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EBUSY));
 	}
@@ -5771,6 +5779,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
 		if (!force && new_state == POOL_STATE_EXPORTED &&
 		    spa_has_active_shared_spare(spa)) {
 			spa_async_resume(spa);
+			spa->spa_is_exporting = B_FALSE;
 			mutex_exit(&spa_namespace_lock);
 			return (SET_ERROR(EXDEV));
 		}
@@ -5822,9 +5831,16 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
 		if (!hardforce)
 			spa_write_cachefile(spa, B_TRUE, B_TRUE);
 		spa_remove(spa);
+	} else {
+		/*
+		 * If spa_remove() is not called for this spa_t and
+		 * there is any possibility that it can be reused,
+		 * we make sure to reset the exporting flag.
+		 */
+		spa->spa_is_exporting = B_FALSE;
 	}
-	mutex_exit(&spa_namespace_lock);
 
+	mutex_exit(&spa_namespace_lock);
 	return (0);
 }
 

From be068aeea86433481c1bc18cf1a76ed033daea2e Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Fri, 19 Jul 2019 11:21:54 -0700
Subject: [PATCH 076/109] Move some tests to cli_user/zpool_status

The tests in tests/functional/cli_root/zpool_status should all require
root. However, linux.run has "user =" specified for those tests, which
means they run as a normal user.  When I removed that line to run them
as root, the following tests did not pass:

zpool_status_003_pos
zpool_status_-c_disable
zpool_status_-c_homedir
zpool_status_-c_searchpath

These tests need to be run as a normal user.  To fix this, move these
tests to a new tests/functional/cli_user/zpool_status directory.

Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #9057
---
 configure.ac                                  |  1 +
 tests/runfiles/linux.run                      | 11 ++++---
 .../cli_root/zpool_status/Makefile.am         |  6 +---
 .../tests/functional/cli_user/Makefile.am     |  3 +-
 .../cli_user/zpool_status/Makefile.am         |  8 +++++
 .../cli_user/zpool_status/cleanup.ksh         | 30 +++++++++++++++++
 .../cli_user/zpool_status/setup.ksh           | 32 +++++++++++++++++++
 .../zpool_status/zpool_status_-c_disable.ksh  |  0
 .../zpool_status/zpool_status_-c_homedir.ksh  |  0
 .../zpool_status_-c_searchpath.ksh            |  0
 .../zpool_status/zpool_status_003_pos.ksh     |  0
 11 files changed, 81 insertions(+), 10 deletions(-)
 create mode 100644 tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_status/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_status/setup.ksh
 rename tests/zfs-tests/tests/functional/{cli_root => cli_user}/zpool_status/zpool_status_-c_disable.ksh (100%)
 rename tests/zfs-tests/tests/functional/{cli_root => cli_user}/zpool_status/zpool_status_-c_homedir.ksh (100%)
 rename tests/zfs-tests/tests/functional/{cli_root => cli_user}/zpool_status/zpool_status_-c_searchpath.ksh (100%)
 rename tests/zfs-tests/tests/functional/{cli_root => cli_user}/zpool_status/zpool_status_003_pos.ksh (100%)

diff --git a/configure.ac b/configure.ac
index ea2e355c70bf..cf1d8b394adf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -272,6 +272,7 @@ AC_CONFIG_FILES([
 	tests/zfs-tests/tests/functional/cli_user/zfs_list/Makefile
 	tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile
 	tests/zfs-tests/tests/functional/cli_user/zpool_list/Makefile
+	tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile
 	tests/zfs-tests/tests/functional/compression/Makefile
 	tests/zfs-tests/tests/functional/cp_files/Makefile
 	tests/zfs-tests/tests/functional/ctime/Makefile
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 27e36b594ab5..c08bc4e31a36 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -462,10 +462,7 @@ tests = ['zpool_split_cliargs', 'zpool_split_devices',
 tags = ['functional', 'cli_root', 'zpool_split']
 
 [tests/functional/cli_root/zpool_status]
-tests = ['zpool_status_001_pos', 'zpool_status_002_pos','zpool_status_003_pos',
-    'zpool_status_-c_disable', 'zpool_status_-c_homedir',
-    'zpool_status_-c_searchpath']
-user =
+tests = ['zpool_status_001_pos', 'zpool_status_002_pos']
 tags = ['functional', 'cli_root', 'zpool_status']
 
 [tests/functional/cli_root/zpool_sync]
@@ -529,6 +526,12 @@ tests = ['zpool_list_001_pos', 'zpool_list_002_neg']
 user =
 tags = ['functional', 'cli_user', 'zpool_list']
 
+[tests/functional/cli_user/zpool_status]
+tests = ['zpool_status_003_pos', 'zpool_status_-c_disable',
+    'zpool_status_-c_homedir', 'zpool_status_-c_searchpath']
+user =
+tags = ['functional', 'cli_user', 'zpool_status']
+
 [tests/functional/compression]
 tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
     'compress_004_pos']
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am
index aab4de0e7c89..beb59e3d066b 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am
@@ -3,8 +3,4 @@ dist_pkgdata_SCRIPTS = \
 	setup.ksh \
 	cleanup.ksh \
 	zpool_status_001_pos.ksh \
-	zpool_status_002_pos.ksh \
-	zpool_status_003_pos.ksh \
-	zpool_status_-c_disable.ksh \
-	zpool_status_-c_homedir.ksh \
-	zpool_status_-c_searchpath.ksh
+	zpool_status_002_pos.ksh
diff --git a/tests/zfs-tests/tests/functional/cli_user/Makefile.am b/tests/zfs-tests/tests/functional/cli_user/Makefile.am
index f1ff32e8d22d..119f8ee187f6 100644
--- a/tests/zfs-tests/tests/functional/cli_user/Makefile.am
+++ b/tests/zfs-tests/tests/functional/cli_user/Makefile.am
@@ -2,4 +2,5 @@ SUBDIRS = \
 	misc \
 	zfs_list \
 	zpool_iostat \
-	zpool_list
+	zpool_list \
+	zpool_status
diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile.am b/tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile.am
new file mode 100644
index 000000000000..e1b339657749
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile.am
@@ -0,0 +1,8 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_user/zpool_status
+dist_pkgdata_SCRIPTS = \
+	setup.ksh \
+	cleanup.ksh \
+	zpool_status_003_pos.ksh \
+	zpool_status_-c_disable.ksh \
+	zpool_status_-c_homedir.ksh \
+	zpool_status_-c_searchpath.ksh
diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_status/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/cleanup.ksh
new file mode 100755
index 000000000000..79cd6e9f908e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_user/zpool_status/cleanup.ksh
@@ -0,0 +1,30 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_status/setup.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/setup.ksh
new file mode 100755
index 000000000000..6a9af3bc28c3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_user/zpool_status/setup.ksh
@@ -0,0 +1,32 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+DISK=${DISKS%% *}
+
+default_setup $DISK
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_disable.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_disable.ksh
similarity index 100%
rename from tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_disable.ksh
rename to tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_disable.ksh
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_homedir.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh
similarity index 100%
rename from tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_homedir.ksh
rename to tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_searchpath.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh
similarity index 100%
rename from tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_searchpath.ksh
rename to tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_003_pos.ksh
similarity index 100%
rename from tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh
rename to tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_003_pos.ksh

From 65a0b28b42976a23c354f0518e0e1cc02b943b46 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Sat, 20 Jul 2019 03:23:56 +0900
Subject: [PATCH 077/109] Fix module_param() type for zfs_read_chunk_size

zfs_read_chunk_size is unsigned long.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #9051
---
 module/zfs/zfs_vnops.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 4f07111f25e3..2a49293c245c 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -5260,9 +5260,11 @@ EXPORT_SYMBOL(zfs_putpage);
 EXPORT_SYMBOL(zfs_dirty_inode);
 EXPORT_SYMBOL(zfs_map);
 
-/* CSTYLED */
+/* BEGIN CSTYLED */
 module_param(zfs_delete_blocks, ulong, 0644);
 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
-module_param(zfs_read_chunk_size, long, 0644);
+module_param(zfs_read_chunk_size, ulong, 0644);
 MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk");
+/* END CSTYLED */
+
 #endif

From 4f951b183c645f320ad375bb41b319634370e3ac Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Fri, 26 Jul 2019 03:59:20 +0900
Subject: [PATCH 078/109] Don't directly cast unsigned long to void*

Cast to uintptr_t first for portability on integer to/from pointer
conversion.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #9065
---
 module/zfs/zfs_ioctl.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index c6b55d24f7ef..152433d60790 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -7110,7 +7110,8 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
 
 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 
-	error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
+	error = ddi_copyin((void *)(uintptr_t)arg, zc, sizeof (zfs_cmd_t),
+	    flag);
 	if (error != 0) {
 		error = SET_ERROR(EFAULT);
 		goto out;
@@ -7269,7 +7270,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
 
 out:
 	nvlist_free(innvl);
-	rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
+	rc = ddi_copyout(zc, (void *)(uintptr_t)arg, sizeof (zfs_cmd_t), flag);
 	if (error == 0 && rc != 0)
 		error = SET_ERROR(EFAULT);
 	if (error == 0 && vec->zvec_allow_log) {

From 1f5979d23f4b06b3d8ebc58b7d7e3946393fa9ce Mon Sep 17 00:00:00 2001
From: Matthew Ahrens <mahrens@delphix.com>
Date: Fri, 26 Jul 2019 12:07:48 -0700
Subject: [PATCH 079/109] zed crashes when devid not present

zed core dumps due to a NULL pointer in zfs_agent_iter_vdev(). The
gs_devid is NULL, but the nvl has a "devid" entry.

zfs_agent_post_event() checks that ZFS_EV_VDEV_GUID or DEV_IDENTIFIER is
present in nvl, but then later it and zfs_agent_iter_vdev() assume that
DEV_IDENTIFIER is present and thus gs_devid is set.

Typically this is not a problem because usually either all vdevs have
devid's, or none of them do. Since zfs_agent_iter_vdev() first checks if
the vdev has devid before dereferencing gs_devid, the problem isn't
typically encountered. However, if some vdevs have devid's and some do
not, then the problem is easily reproduced.  This can happen if the pool
has been moved from a system that has devid's to one that does not.

The fix is for zfs_agent_iter_vdev() to only try to match the devid's if
both nvl and gsp have devid's present.

Reviewed-by: Prashanth Sreenivasa <pks@delphix.com>
Reviewed-by: Don Brady <don.brady@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: loli10K <ezomori.nozomu@gmail.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-65090
Closes #9054
Closes #9060
---
 cmd/zed/agents/zfs_agents.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c
index 6d392604bceb..006e0ab99f47 100644
--- a/cmd/zed/agents/zfs_agents.c
+++ b/cmd/zed/agents/zfs_agents.c
@@ -116,7 +116,8 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
 	/*
 	 * On a devid match, grab the vdev guid and expansion time, if any.
 	 */
-	if ((nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) &&
+	if (gsp->gs_devid != NULL &&
+	    (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) &&
 	    (strcmp(gsp->gs_devid, path) == 0)) {
 		(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
 		    &gsp->gs_vdev_guid);

From 6c68594675ed3fdc1d663da47eaeb27c3db97f29 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Sat, 27 Jul 2019 05:52:30 +0900
Subject: [PATCH 080/109] Implement secpolicy_vnode_setid_retain()

Don't unconditionally return 0 (i.e. retain SUID/SGID).
Test CAP_FSETID capability.

https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t
which expects SUID/SGID to be dropped on write(2) by non-owner fails
without this. Most filesystems make this decision within VFS by using
a generic file write for fops.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #9035
Closes #9043
---
 configure.ac                                  |   1 +
 module/zfs/policy.c                           |   2 +-
 tests/runfiles/linux.run                      |   5 +
 tests/zfs-tests/tests/functional/Makefile.am  |   1 +
 .../tests/functional/suid/.gitignore          |   1 +
 .../tests/functional/suid/Makefile.am         |  16 +++
 .../tests/functional/suid/cleanup.ksh         |  34 +++++
 .../zfs-tests/tests/functional/suid/setup.ksh |  35 +++++
 .../functional/suid/suid_write_to_file.c      | 133 ++++++++++++++++++
 .../functional/suid/suid_write_to_none.ksh    |  52 +++++++
 .../functional/suid/suid_write_to_sgid.ksh    |  52 +++++++
 .../functional/suid/suid_write_to_suid.ksh    |  52 +++++++
 .../suid/suid_write_to_suid_sgid.ksh          |  52 +++++++
 13 files changed, 435 insertions(+), 1 deletion(-)
 create mode 100644 tests/zfs-tests/tests/functional/suid/.gitignore
 create mode 100644 tests/zfs-tests/tests/functional/suid/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/suid/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/suid/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/suid/suid_write_to_file.c
 create mode 100755 tests/zfs-tests/tests/functional/suid/suid_write_to_none.ksh
 create mode 100755 tests/zfs-tests/tests/functional/suid/suid_write_to_sgid.ksh
 create mode 100755 tests/zfs-tests/tests/functional/suid/suid_write_to_suid.ksh
 create mode 100755 tests/zfs-tests/tests/functional/suid/suid_write_to_suid_sgid.ksh

diff --git a/configure.ac b/configure.ac
index cf1d8b394adf..e8592ffb1d2d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -328,6 +328,7 @@ AC_CONFIG_FILES([
 	tests/zfs-tests/tests/functional/snapshot/Makefile
 	tests/zfs-tests/tests/functional/snapused/Makefile
 	tests/zfs-tests/tests/functional/sparse/Makefile
+	tests/zfs-tests/tests/functional/suid/Makefile
 	tests/zfs-tests/tests/functional/alloc_class/Makefile
 	tests/zfs-tests/tests/functional/threadsappend/Makefile
 	tests/zfs-tests/tests/functional/tmpfile/Makefile
diff --git a/module/zfs/policy.c b/module/zfs/policy.c
index 55c932747915..a723235d3015 100644
--- a/module/zfs/policy.c
+++ b/module/zfs/policy.c
@@ -209,7 +209,7 @@ secpolicy_vnode_setdac(const cred_t *cr, uid_t owner)
 int
 secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot)
 {
-	return (0);
+	return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM));
 }
 
 /*
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index c08bc4e31a36..1c368d20c454 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -847,6 +847,11 @@ tags = ['functional', 'snapused']
 tests = ['sparse_001_pos']
 tags = ['functional', 'sparse']
 
+[tests/functional/suid]
+tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid',
+    'suid_write_to_none']
+tags = ['functional', 'suid']
+
 [tests/functional/threadsappend]
 tests = ['threadsappend_001_pos']
 tags = ['functional', 'threadsappend']
diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am
index da27673ec946..ac0ba7cf3d1d 100644
--- a/tests/zfs-tests/tests/functional/Makefile.am
+++ b/tests/zfs-tests/tests/functional/Makefile.am
@@ -66,6 +66,7 @@ SUBDIRS = \
 	snapshot \
 	snapused \
 	sparse \
+	suid \
 	threadsappend \
 	tmpfile \
 	trim \
diff --git a/tests/zfs-tests/tests/functional/suid/.gitignore b/tests/zfs-tests/tests/functional/suid/.gitignore
new file mode 100644
index 000000000000..a9a3db79ba44
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/suid/.gitignore
@@ -0,0 +1 @@
+/suid_write_to_file
diff --git a/tests/zfs-tests/tests/functional/suid/Makefile.am b/tests/zfs-tests/tests/functional/suid/Makefile.am
new file mode 100644
index 000000000000..594d2b77ca8e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/suid/Makefile.am
@@ -0,0 +1,16 @@
+include $(top_srcdir)/config/Rules.am
+
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/suid
+
+dist_pkgdata_SCRIPTS = \
+	suid_write_to_suid.ksh \
+	suid_write_to_sgid.ksh \
+	suid_write_to_suid_sgid.ksh \
+	suid_write_to_none.ksh \
+	cleanup.ksh \
+	setup.ksh
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/suid
+
+pkgexec_PROGRAMS = suid_write_to_file
+suid_write_to_file_SOURCES = suid_write_to_file.c
diff --git a/tests/zfs-tests/tests/functional/suid/cleanup.ksh b/tests/zfs-tests/tests/functional/suid/cleanup.ksh
new file mode 100755
index 000000000000..6e41e02faf58
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/suid/cleanup.ksh
@@ -0,0 +1,34 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/suid/setup.ksh b/tests/zfs-tests/tests/functional/suid/setup.ksh
new file mode 100755
index 000000000000..d04d5568c003
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/suid/setup.ksh
@@ -0,0 +1,35 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+DISK=${DISKS%% *}
+default_setup $DISK
diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_file.c b/tests/zfs-tests/tests/functional/suid/suid_write_to_file.c
new file mode 100644
index 000000000000..571dc553bec2
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_file.c
@@ -0,0 +1,133 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+static void
+test_stat_mode(mode_t extra)
+{
+	struct stat st;
+	int i, fd;
+	char fpath[1024];
+	char *penv[] = {"TESTDIR", "TESTFILE0"};
+	char buf[] = "test";
+	mode_t res;
+	mode_t mode = 0777 | extra;
+
+	/*
+	 * Get the environment variable values.
+	 */
+	for (i = 0; i < sizeof (penv) / sizeof (char *); i++) {
+		if ((penv[i] = getenv(penv[i])) == NULL) {
+			fprintf(stderr, "getenv(penv[%d])\n", i);
+			exit(1);
+		}
+	}
+
+	umask(0);
+	if (stat(penv[0], &st) == -1 && mkdir(penv[0], mode) == -1) {
+		perror("mkdir");
+		exit(2);
+	}
+
+	snprintf(fpath, sizeof (fpath), "%s/%s", penv[0], penv[1]);
+	unlink(fpath);
+	if (stat(fpath, &st) == 0) {
+		fprintf(stderr, "%s exists\n", fpath);
+		exit(3);
+	}
+
+	fd = creat(fpath, mode);
+	if (fd == -1) {
+		perror("creat");
+		exit(4);
+	}
+	close(fd);
+
+	if (setuid(65534) == -1) {
+		perror("setuid");
+		exit(5);
+	}
+
+	fd = open(fpath, O_RDWR);
+	if (fd == -1) {
+		perror("open");
+		exit(6);
+	}
+
+	if (write(fd, buf, sizeof (buf)) == -1) {
+		perror("write");
+		exit(7);
+	}
+	close(fd);
+
+	if (stat(fpath, &st) == -1) {
+		perror("stat");
+		exit(8);
+	}
+	unlink(fpath);
+
+	/* Verify SUID/SGID are dropped */
+	res = st.st_mode & (0777 | S_ISUID | S_ISGID);
+	if (res != (mode & 0777)) {
+		fprintf(stderr, "stat(2) %o\n", res);
+		exit(9);
+	}
+}
+
+int
+main(int argc, char *argv[])
+{
+	const char *name;
+	mode_t extra;
+
+	if (argc < 2) {
+		fprintf(stderr, "Invalid argc\n");
+		exit(1);
+	}
+
+	name = argv[1];
+	if (strcmp(name, "SUID") == 0) {
+		extra = S_ISUID;
+	} else if (strcmp(name, "SGID") == 0) {
+		extra = S_ISGID;
+	} else if (strcmp(name, "SUID_SGID") == 0) {
+		extra = S_ISUID | S_ISGID;
+	} else if (strcmp(name, "NONE") == 0) {
+		extra = 0;
+	} else {
+		fprintf(stderr, "Invalid name %s\n", name);
+		exit(1);
+	}
+
+	test_stat_mode(extra);
+
+	return (0);
+}
diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_none.ksh b/tests/zfs-tests/tests/functional/suid/suid_write_to_none.ksh
new file mode 100755
index 000000000000..dd01978619f9
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_none.ksh
@@ -0,0 +1,52 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify write(2) to regular file by non-owner.
+# Also see https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t
+#
+# STRATEGY:
+# 1. creat(2) a file.
+# 2. write(2) to the file with uid=65534.
+# 3. stat(2) the file and verify .st_mode value.
+#
+
+verify_runnable "both"
+
+function cleanup
+{
+	rm -f $TESTDIR/$TESTFILE0
+}
+
+log_onexit cleanup
+log_note "Verify write(2) to regular file by non-owner"
+
+log_must $STF_SUITE/tests/functional/suid/suid_write_to_file "NONE"
+
+log_pass "Verify write(2) to regular file by non-owner passed"
diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_sgid.ksh b/tests/zfs-tests/tests/functional/suid/suid_write_to_sgid.ksh
new file mode 100755
index 000000000000..49ae2bd1b31e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_sgid.ksh
@@ -0,0 +1,52 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify write(2) to SGID file by non-owner.
+# Also see https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t
+#
+# STRATEGY:
+# 1. creat(2) a file with SGID.
+# 2. write(2) to the file with uid=65534.
+# 3. stat(2) the file and verify .st_mode value.
+#
+
+verify_runnable "both"
+
+function cleanup
+{
+	rm -f $TESTDIR/$TESTFILE0
+}
+
+log_onexit cleanup
+log_note "Verify write(2) to SGID file by non-owner"
+
+log_must $STF_SUITE/tests/functional/suid/suid_write_to_file "SGID"
+
+log_pass "Verify write(2) to SGID file by non-owner passed"
diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_suid.ksh b/tests/zfs-tests/tests/functional/suid/suid_write_to_suid.ksh
new file mode 100755
index 000000000000..3983aad2e51d
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_suid.ksh
@@ -0,0 +1,52 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify write(2) to SUID file by non-owner.
+# Also see https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t
+#
+# STRATEGY:
+# 1. creat(2) a file with SUID.
+# 2. write(2) to the file with uid=65534.
+# 3. stat(2) the file and verify .st_mode value.
+#
+
+verify_runnable "both"
+
+function cleanup
+{
+	rm -f $TESTDIR/$TESTFILE0
+}
+
+log_onexit cleanup
+log_note "Verify write(2) to SUID file by non-owner"
+
+log_must $STF_SUITE/tests/functional/suid/suid_write_to_file "SUID"
+
+log_pass "Verify write(2) to SUID file by non-owner passed"
diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_suid_sgid.ksh b/tests/zfs-tests/tests/functional/suid/suid_write_to_suid_sgid.ksh
new file mode 100755
index 000000000000..a058c7e7d4bc
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_suid_sgid.ksh
@@ -0,0 +1,52 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify write(2) to SUID/SGID file by non-owner.
+# Also see https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t
+#
+# STRATEGY:
+# 1. creat(2) a file with SUID/SGID.
+# 2. write(2) to the file with uid=65534.
+# 3. stat(2) the file and verify .st_mode value.
+#
+
+verify_runnable "both"
+
+function cleanup
+{
+	rm -f $TESTDIR/$TESTFILE0
+}
+
+log_onexit cleanup
+log_note "Verify write(2) to SUID/SGID file by non-owner"
+
+log_must $STF_SUITE/tests/functional/suid/suid_write_to_file "SUID_SGID"
+
+log_pass "Verify write(2) to SUID/SGID file by non-owner passed"

From a8c5bcb5de431a792287fd355b8599513ddf69c5 Mon Sep 17 00:00:00 2001
From: George Wilson <george.wilson@delphix.com>
Date: Sun, 28 Jul 2019 21:13:56 -0400
Subject: [PATCH 081/109] Race between zfs-share and zfs-mount services

When a system boots the zfs-mount.service and the
zfs-share.service can start simultaneously. What may be
unclear is that sharing a filesystem will first mount
the filesystem if it's not already mounted. This means
that both services can race to mount the same filesystem.
This race can result in a SEGFAULT or an EBUSY condition.

This change explicitly defines the start ordering between the
two services such that the zfs-mount.service is solely
responsible for mounting filesystems eliminating the race
between "zfs mount -a" and "zfs share -a" commands.

Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: George Wilson <george.wilson@delphix.com>
Closes #9083
---
 etc/systemd/system/zfs-share.service.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/etc/systemd/system/zfs-share.service.in b/etc/systemd/system/zfs-share.service.in
index 75ff6e946767..5f4ba411b3cd 100644
--- a/etc/systemd/system/zfs-share.service.in
+++ b/etc/systemd/system/zfs-share.service.in
@@ -5,6 +5,7 @@ After=nfs-server.service nfs-kernel-server.service
 After=smb.service
 Before=rpc-statd-notify.service
 Wants=zfs-mount.service
+After=zfs-mount.service
 PartOf=nfs-server.service nfs-kernel-server.service
 PartOf=smb.service
 

From 8c00159411ed891b91f8b4f3d4356c038ffa81ca Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Sun, 28 Jul 2019 18:15:26 -0700
Subject: [PATCH 082/109] Fix channel programs on s390x

When adapting the original sources for s390x the JMP_BUF_CNT was
mistakenly halved due to an incorrect assumption of the size of
an unsigned long.  They are 8 bytes on the s390x architecture.
Increase JMP_BUF_CNT accordingly.

Authored-by: Don Brady <don.brady@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reported-by: Colin Ian King <colin.king@canonical.com>
Tested-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8992
Closes #9080
---
 module/lua/ldo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/lua/ldo.c b/module/lua/ldo.c
index aca02b234770..59d0b6a2c298 100644
--- a/module/lua/ldo.c
+++ b/module/lua/ldo.c
@@ -61,7 +61,7 @@
 #elif defined(__mips__)
 #define JMP_BUF_CNT	12
 #elif defined(__s390x__)
-#define JMP_BUF_CNT	9
+#define JMP_BUF_CNT	18
 #else
 #define	JMP_BUF_CNT	1
 #endif

From 6c9882d5dbc6bcaf39ae2ca54860743c083fa940 Mon Sep 17 00:00:00 2001
From: Matthew Ahrens <mahrens@delphix.com>
Date: Tue, 30 Jul 2019 09:18:30 -0700
Subject: [PATCH 083/109] Improve performance by using dmu_tx_hold_*_by_dnode()

In zfs_write() and dmu_tx_hold_sa(), we can use dmu_tx_hold_*_by_dnode()
instead of dmu_tx_hold_*(), since we already have a dbuf from the target
dnode in hand.  This eliminates some calls to dnode_hold(), which can be
expensive.  This is especially impactful if several threads are
accessing objects that are in the same block of dnodes, because they
will contend for that dbuf's lock.

We are seeing 10-20% performance wins for the sequential_writes tests in
the performance test suite, when doing >=128K writes to files with
recordsize=8K.

This also removes some unnecessary casts that are in the area.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #9081
---
 module/zfs/dmu_tx.c    |  6 ++++--
 module/zfs/sa.c        | 10 +++++-----
 module/zfs/zfs_vnops.c |  8 ++++++--
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 7d65e842ff03..d6a42f84c751 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -1338,7 +1338,10 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
 
 	object = sa_handle_object(hdl);
 
-	dmu_tx_hold_bonus(tx, object);
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+	DB_DNODE_ENTER(db);
+	dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db));
+	DB_DNODE_EXIT(db);
 
 	if (tx->tx_objset->os_sa->sa_master_obj == 0)
 		return;
@@ -1360,7 +1363,6 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
 		ASSERT(tx->tx_txg == 0);
 		dmu_tx_hold_spill(tx, object);
 	} else {
-		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(db);
diff --git a/module/zfs/sa.c b/module/zfs/sa.c
index 56a606962a7f..4999fef345dc 100644
--- a/module/zfs/sa.c
+++ b/module/zfs/sa.c
@@ -1380,7 +1380,7 @@ sa_handle_destroy(sa_handle_t *hdl)
 	dmu_buf_rele(hdl->sa_bonus, NULL);
 
 	if (hdl->sa_spill)
-		dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
+		dmu_buf_rele(hdl->sa_spill, NULL);
 	mutex_exit(&hdl->sa_lock);
 
 	kmem_cache_free(sa_cache, hdl);
@@ -2028,7 +2028,7 @@ sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
 			hdl->sa_spill_tab = NULL;
 		}
 
-		dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
+		dmu_buf_rele(hdl->sa_spill, NULL);
 		hdl->sa_spill = NULL;
 	}
 
@@ -2131,13 +2131,13 @@ sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
 void
 sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
 {
-	dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi);
+	dmu_object_info_from_db(hdl->sa_bonus, doi);
 }
 
 void
 sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
 {
-	dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus,
+	dmu_object_size_from_db(hdl->sa_bonus,
 	    blksize, nblocks);
 }
 
@@ -2150,7 +2150,7 @@ sa_set_userp(sa_handle_t *hdl, void *ptr)
 dmu_buf_t *
 sa_get_db(sa_handle_t *hdl)
 {
-	return ((dmu_buf_t *)hdl->sa_bonus);
+	return (hdl->sa_bonus);
 }
 
 void *
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 2a49293c245c..7f33aea43d48 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -775,7 +775,11 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		 */
 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
-		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+		dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+		DB_DNODE_ENTER(db);
+		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
+		    MIN(n, max_blksz));
+		DB_DNODE_EXIT(db);
 		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
@@ -1048,7 +1052,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 		return (SET_ERROR(ENOENT));
 	}
 
-	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 	zgd->zgd_lwb = lwb;
 	zgd->zgd_private = zp;
 

From 6d1599c1e1d1fabb14eb27f8f28d3c6b539f3fdc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michael=20Niew=C3=B6hner?=
 <c0d3z3r0@users.noreply.github.com>
Date: Tue, 30 Jul 2019 18:59:38 +0200
Subject: [PATCH 084/109] Increase default zcmd allocation to 256K
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When creating hundreds of clones (for example using containers with
LXD) cloning slows down as the number of clones increases over time.
The reason for this is that the fetching of the clone information
using a small zcmd buffer requires two ioctl calls, one to determine
the size and a second to return the data. However, this requires
gathering the data twice, once to determine the size and again to
populate the zcmd buffer to return it to userspace.
These are expensive ioctl() calls, so instead, make the default buffer
size much larger: 256K.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Michael Niewöhner <foss@mniewoehner.de>
Closes #9084
---
 lib/libzfs/libzfs_util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index dc2d68ebebbe..eed6282ca357 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -1139,7 +1139,7 @@ int
 zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len)
 {
 	if (len == 0)
-		len = 16 * 1024;
+		len = 256 * 1024;
 	zc->zc_nvlist_dst_size = len;
 	zc->zc_nvlist_dst =
 	    (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size);

From 569f5d5d0543a1f1f4958a65fafc3eb7bf1778d1 Mon Sep 17 00:00:00 2001
From: Chunwei Chen <david.chen@nutanix.com>
Date: Tue, 13 Aug 2019 20:21:27 -0700
Subject: [PATCH 085/109] Fix out-of-order ZIL txtype lost on hardlinked files

We should only call zil_remove_async when an object is removed. However,
in the current implementation, it is called whenever a TX_REMOVE is logged.
In the case of a hardlinked file, every unlink generates a TX_REMOVE,
causing operations to be dropped even when the object is not removed.

We fix this by only calling zil_remove_async when the file is fully
unlinked.

Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #8769
Closes #9061
---
 include/sys/zfs_znode.h                           |  2 +-
 module/zfs/zfs_log.c                              | 15 ++++++++++++++-
 module/zfs/zfs_vnops.c                            |  5 +++--
 module/zfs/zil.c                                  | 12 +-----------
 .../tests/functional/slog/slog_replay_fs.ksh      |  8 ++++++++
 5 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h
index d4a3ea769331..add45a7f46e4 100644
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@@ -371,7 +371,7 @@ extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp,
     vattr_t *vap);
 extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-    znode_t *dzp, char *name, uint64_t foid);
+    znode_t *dzp, char *name, uint64_t foid, boolean_t unlinked);
 #define	ZFS_NO_OBJECT	0	/* no object id */
 extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, znode_t *zp, char *name);
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 15c396ce0329..5966b7612b35 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -380,12 +380,14 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 	zil_itx_assign(zilog, itx, tx);
 }
 
+void zil_remove_async(zilog_t *zilog, uint64_t oid);
+
 /*
  * Handles both TX_REMOVE and TX_RMDIR transactions.
  */
 void
 zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-    znode_t *dzp, char *name, uint64_t foid)
+    znode_t *dzp, char *name, uint64_t foid, boolean_t unlinked)
 {
 	itx_t *itx;
 	lr_remove_t *lr;
@@ -401,6 +403,17 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 
 	itx->itx_oid = foid;
 
+	/*
+	 * Object ids can be re-instantiated in the next txg so
+	 * remove any async transactions to avoid future leaks.
+	 * This can happen if a fsync occurs on the re-instantiated
+	 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
+	 * the new file data and flushes a write record for the old object.
+	 */
+	if (unlinked) {
+		ASSERT((txtype & ~TX_CI) == TX_REMOVE);
+		zil_remove_async(zilog, foid);
+	}
 	zil_itx_assign(zilog, itx, tx);
 }
 
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 7f33aea43d48..3c2278164289 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -1886,7 +1886,7 @@ zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags)
 	txtype = TX_REMOVE;
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
-	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
+	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
 
 	dmu_tx_commit(tx);
 out:
@@ -2219,7 +2219,8 @@ zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr,
 		uint64_t txtype = TX_RMDIR;
 		if (flags & FIGNORECASE)
 			txtype |= TX_CI;
-		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
+		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
+		    B_FALSE);
 	}
 
 	dmu_tx_commit(tx);
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index ff14a98b6b25..5249a0e93666 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1824,7 +1824,7 @@ zil_aitx_compare(const void *x1, const void *x2)
 /*
  * Remove all async itx with the given oid.
  */
-static void
+void
 zil_remove_async(zilog_t *zilog, uint64_t oid)
 {
 	uint64_t otxg, txg;
@@ -1876,16 +1876,6 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 	itxg_t *itxg;
 	itxs_t *itxs, *clean = NULL;
 
-	/*
-	 * Object ids can be re-instantiated in the next txg so
-	 * remove any async transactions to avoid future leaks.
-	 * This can happen if a fsync occurs on the re-instantiated
-	 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
-	 * the new file data and flushes a write record for the old object.
-	 */
-	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
-		zil_remove_async(zilog, itx->itx_oid);
-
 	/*
 	 * Ensure the data of a renamed file is committed before the rename.
 	 */
diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh
index 5f281a756f15..ea3f8451b9e3 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh
@@ -160,6 +160,14 @@ log_must attr -qs fileattr -V HelloWorld /$TESTPOOL/$TESTFS/xattr.file
 log_must attr -qs tmpattr -V HelloWorld /$TESTPOOL/$TESTFS/xattr.file
 log_must attr -qr tmpattr /$TESTPOOL/$TESTFS/xattr.file
 
+# TX_WRITE, TX_LINK, TX_REMOVE
+# Make sure TX_REMOVE won't affect TX_WRITE if file is not destroyed
+log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/link_and_unlink bs=128k \
+   count=8
+log_must ln /$TESTPOOL/$TESTFS/link_and_unlink \
+   /$TESTPOOL/$TESTFS/link_and_unlink.link
+log_must rm /$TESTPOOL/$TESTFS/link_and_unlink.link
+
 #
 # 4. Copy TESTFS to temporary location (TESTDIR/copy)
 #

From 65469f6e302205858b26da93c191ffab5bedbdff Mon Sep 17 00:00:00 2001
From: Dominic Pearson <dsp@technoanimal.net>
Date: Tue, 20 Aug 2019 00:22:52 +0200
Subject: [PATCH 086/109] Linux 5.3 compat: Makefile subdir-m no longer
 supported

Uses obj-m instead, due to kernel changes.

See LKML: Masahiro Yamada, Tue, 6 Aug 2019 19:03:23 +0900

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Dominic Pearson <dsp@technoanimal.net>
Closes #9169
---
 .gitignore         | 11 +++++++++++
 module/Makefile.in | 24 ++++++++++++------------
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index 549fa59f3822..ae9e22dfa7bb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -63,3 +63,14 @@ cscope.*
 *.log
 venv
 
+#
+# Module leftovers
+#
+/module/avl/zavl.mod
+/module/icp/icp.mod
+/module/lua/zlua.mod
+/module/nvpair/znvpair.mod
+/module/spl/spl.mod
+/module/unicode/zunicode.mod
+/module/zcommon/zcommon.mod
+/module/zfs/zfs.mod
diff --git a/module/Makefile.in b/module/Makefile.in
index eca7691aedbb..7477dbe56509 100644
--- a/module/Makefile.in
+++ b/module/Makefile.in
@@ -1,11 +1,11 @@
-subdir-m += avl
-subdir-m += icp
-subdir-m += lua
-subdir-m += nvpair
-subdir-m += spl
-subdir-m += unicode
-subdir-m += zcommon
-subdir-m += zfs
+obj-m += avl/
+obj-m += icp/
+obj-m += lua/
+obj-m += nvpair/
+obj-m += spl/
+obj-m += unicode/
+obj-m += zcommon/
+obj-m += zfs/
 
 INSTALL_MOD_DIR ?= extra
 
@@ -60,13 +60,13 @@ modules_install:
 modules_uninstall:
 	@# Uninstall the kernel modules
 	kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@
-	list='$(subdir-m)'; for subdir in $$list; do \
-		$(RM) -R $$kmoddir/$(INSTALL_MOD_DIR)/$$subdir; \
+	list='$(obj-m)'; for objdir in $$list; do \
+		$(RM) -R $$kmoddir/$(INSTALL_MOD_DIR)/$$objdir; \
 	done
 
 distdir:
-	list='$(subdir-m)'; for subdir in $$list; do \
-		(cd @top_srcdir@/module && find $$subdir \
+	list='$(obj-m)'; for objdir in $$list; do \
+		(cd @top_srcdir@/module && find $$objdir \
 		-name '*.c' -o -name '*.h' -o -name '*.S' | \
 		xargs cp --parents -t @abs_top_builddir@/module/$$distdir); \
 	done

From 023ab67a64fc297bb5d773406f5b1fc6dd0d957b Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Wed, 21 Aug 2019 09:29:23 -0700
Subject: [PATCH 087/109] Linux 5.3: Fix switch() fall through compiler errors

Fix some switch() fall-through compiler errors:

    abd.c:1504:9: error: this statement may fall through

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #9170
---
 module/lua/llex.c                   | 9 ++++++---
 module/zfs/abd.c                    | 4 ++++
 module/zfs/vdev_raidz_math_scalar.c | 1 +
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/module/lua/llex.c b/module/lua/llex.c
index 8760155d0546..50c301f599f1 100644
--- a/module/lua/llex.c
+++ b/module/lua/llex.c
@@ -431,9 +431,12 @@ static int llex (LexState *ls, SemInfo *seminfo) {
         if (sep >= 0) {
           read_long_string(ls, seminfo, sep);
           return TK_STRING;
-        }
-        else if (sep == -1) return '[';
-        else lexerror(ls, "invalid long string delimiter", TK_STRING);
+        } else if (sep == -1) {
+		return '[';
+        } else {
+		lexerror(ls, "invalid long string delimiter", TK_STRING);
+		break;
+	}
       }
       case '=': {
         next(ls);
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index 9041bd8b1841..32b2c842c0df 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -1370,8 +1370,10 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
 		switch (parity) {
 			case 3:
 				len = MIN(caiters[2].iter_mapsize, len);
+				/* falls through */
 			case 2:
 				len = MIN(caiters[1].iter_mapsize, len);
+				/* falls through */
 			case 1:
 				len = MIN(caiters[0].iter_mapsize, len);
 		}
@@ -1461,9 +1463,11 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
 			case 3:
 				len = MIN(xiters[2].iter_mapsize, len);
 				len = MIN(citers[2].iter_mapsize, len);
+				/* falls through */
 			case 2:
 				len = MIN(xiters[1].iter_mapsize, len);
 				len = MIN(citers[1].iter_mapsize, len);
+				/* falls through */
 			case 1:
 				len = MIN(xiters[0].iter_mapsize, len);
 				len = MIN(citers[0].iter_mapsize, len);
diff --git a/module/zfs/vdev_raidz_math_scalar.c b/module/zfs/vdev_raidz_math_scalar.c
index a693bff63ffb..cd742e146ca6 100644
--- a/module/zfs/vdev_raidz_math_scalar.c
+++ b/module/zfs/vdev_raidz_math_scalar.c
@@ -142,6 +142,7 @@ static const struct {
 		a.b[6] = mul_lt[a.b[6]];				\
 		a.b[5] = mul_lt[a.b[5]];				\
 		a.b[4] = mul_lt[a.b[4]];				\
+		/* falls through */					\
 	case 4:								\
 		a.b[3] = mul_lt[a.b[3]];				\
 		a.b[2] = mul_lt[a.b[2]];				\

From 512a50f38d17f77118af6f297ddf7ba720a48ebc Mon Sep 17 00:00:00 2001
From: yshui <yshuiv7@gmail.com>
Date: Fri, 23 Aug 2019 01:11:17 +0100
Subject: [PATCH 088/109] zfs-mount-generator: dependencies should be
 space-separated

Reviewed-by: Antonio Russo <antonio.e.russo@gmail.com>
Reviewed-by: Richard Laager <rlaager@wiktel.com>
Signed-off-by: Yuxuan Shui <yshuiv7@gmail.com>
Closes #9174
---
 etc/systemd/system-generators/zfs-mount-generator.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etc/systemd/system-generators/zfs-mount-generator.in b/etc/systemd/system-generators/zfs-mount-generator.in
index ae208c965f97..3e529cb67bb3 100755
--- a/etc/systemd/system-generators/zfs-mount-generator.in
+++ b/etc/systemd/system-generators/zfs-mount-generator.in
@@ -215,7 +215,7 @@ EOF
     fi
     # Update the dependencies for the mount file to require the
     # key-loading unit.
-    wants="${wants},${keyloadunit}"
+    wants="${wants} ${keyloadunit}"
   fi
 
   # If the mountpoint has already been created, give it precedence.

From 33374f21f0f8922baa95796c70edcc4bc17df19f Mon Sep 17 00:00:00 2001
From: Ryan Moeller <ryan@freqlabs.com>
Date: Thu, 22 Aug 2019 20:26:51 -0400
Subject: [PATCH 089/109] Make slog test setup more robust

The slog tests fail when attempting to create pools using file vdevs
that already exist from previous test runs. Remove these files in the
setup for the test.

Reviewed-by: Igor Kozhukhov <igor@dilos.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Signed-off-by: Ryan Moeller <ryan@ixsystems.com>
Closes #9194
---
 tests/zfs-tests/tests/functional/slog/setup.ksh       |  9 ---------
 tests/zfs-tests/tests/functional/slog/slog.kshlib     | 11 ++++++++++-
 .../zfs-tests/tests/functional/slog/slog_001_pos.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_002_pos.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_003_pos.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_004_pos.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_005_pos.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_006_pos.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_007_pos.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_008_neg.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_009_neg.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_010_neg.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_011_neg.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_012_neg.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_013_pos.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_014_pos.ksh  |  1 +
 .../zfs-tests/tests/functional/slog/slog_015_neg.ksh  |  1 +
 .../tests/functional/slog/slog_replay_fs.ksh          |  1 +
 .../tests/functional/slog/slog_replay_volume.ksh      |  1 +
 19 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/slog/setup.ksh b/tests/zfs-tests/tests/functional/slog/setup.ksh
index f30824d3ee90..8e8d214d823c 100755
--- a/tests/zfs-tests/tests/functional/slog/setup.ksh
+++ b/tests/zfs-tests/tests/functional/slog/setup.ksh
@@ -38,13 +38,4 @@ if ! verify_slog_support ; then
 	log_unsupported "This system doesn't support separate intent logs"
 fi
 
-if [[ -d $VDEV ]]; then
-	log_must rm -rf $VDIR
-fi
-if [[ -d $VDEV2 ]]; then
-	log_must rm -rf $VDIR2
-fi
-log_must mkdir -p $VDIR $VDIR2
-log_must truncate -s $MINVDEVSIZE $VDEV $SDEV $LDEV $VDEV2 $SDEV2 $LDEV2
-
 log_pass
diff --git a/tests/zfs-tests/tests/functional/slog/slog.kshlib b/tests/zfs-tests/tests/functional/slog/slog.kshlib
index 6ed7e4e0502f..75cfec2d832d 100644
--- a/tests/zfs-tests/tests/functional/slog/slog.kshlib
+++ b/tests/zfs-tests/tests/functional/slog/slog.kshlib
@@ -31,11 +31,20 @@
 . $STF_SUITE/include/libtest.shlib
 . $STF_SUITE/tests/functional/slog/slog.cfg
 
+function setup
+{
+	log_must rm -rf $VDIR $VDIR2
+	log_must mkdir -p $VDIR $VDIR2
+	log_must truncate -s $MINVDEVSIZE $VDEV $SDEV $LDEV $VDEV2 $SDEV2 $LDEV2
+
+	return 0
+}
+
 function cleanup
 {
 	poolexists $TESTPOOL && destroy_pool $TESTPOOL
 	poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
-	rm -rf $TESTDIR
+	rm -rf $TESTDIR $VDIR $VDIR2
 }
 
 #
diff --git a/tests/zfs-tests/tests/functional/slog/slog_001_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_001_pos.ksh
index 3d3daf5f9ccc..a4c35ed9e98e 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_001_pos.ksh
@@ -45,6 +45,7 @@ verify_runnable "global"
 
 log_assert "Creating a pool with a log device succeeds."
 log_onexit cleanup
+log_must setup
 
 for type in "" "mirror" "raidz" "raidz2"
 do
diff --git a/tests/zfs-tests/tests/functional/slog/slog_002_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_002_pos.ksh
index b056f19cdb80..91904aa612d1 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_002_pos.ksh
@@ -46,6 +46,7 @@ verify_runnable "global"
 
 log_assert "Adding a log device to normal pool works."
 log_onexit cleanup
+log_must setup
 
 for type in "" "mirror" "raidz" "raidz2"
 do
diff --git a/tests/zfs-tests/tests/functional/slog/slog_003_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_003_pos.ksh
index c647b8f54b75..0b4d6ede3e13 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_003_pos.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_003_pos.ksh
@@ -46,6 +46,7 @@ verify_runnable "global"
 
 log_assert "Adding an extra log device works."
 log_onexit cleanup
+log_must setup
 
 for type in "" "mirror" "raidz" "raidz2"
 do
diff --git a/tests/zfs-tests/tests/functional/slog/slog_004_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_004_pos.ksh
index 4b0b3439a2e3..10f28dcc000b 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_004_pos.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_004_pos.ksh
@@ -46,6 +46,7 @@ verify_runnable "global"
 
 log_assert "Attaching a log device passes."
 log_onexit cleanup
+log_must setup
 
 for type in "" "mirror" "raidz" "raidz2"
 do
diff --git a/tests/zfs-tests/tests/functional/slog/slog_005_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_005_pos.ksh
index cbbb9486913a..4836f6f27937 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_005_pos.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_005_pos.ksh
@@ -46,6 +46,7 @@ verify_runnable "global"
 
 log_assert "Detaching a log device passes."
 log_onexit cleanup
+log_must setup
 
 for type in "" "mirror" "raidz" "raidz2"
 do
diff --git a/tests/zfs-tests/tests/functional/slog/slog_006_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_006_pos.ksh
index 53e8c67ca005..24143196fd2e 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_006_pos.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_006_pos.ksh
@@ -46,6 +46,7 @@ verify_runnable "global"
 
 log_assert "Replacing a log device passes."
 log_onexit cleanup
+log_must setup
 
 for type in "" "mirror" "raidz" "raidz2"
 do
diff --git a/tests/zfs-tests/tests/functional/slog/slog_007_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_007_pos.ksh
index 4926fb7b3192..27ac38606c29 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_007_pos.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_007_pos.ksh
@@ -48,6 +48,7 @@ verify_runnable "global"
 
 log_assert "Exporting and importing pool with log devices passes."
 log_onexit cleanup
+log_must setup
 
 for type in "" "mirror" "raidz" "raidz2"
 do
diff --git a/tests/zfs-tests/tests/functional/slog/slog_008_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_008_neg.ksh
index 587e0e321222..54587a0c61a7 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_008_neg.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_008_neg.ksh
@@ -44,6 +44,7 @@ verify_runnable "global"
 
 log_assert "A raidz/raidz2 log is not supported."
 log_onexit cleanup
+log_must setup
 
 for type in "" "mirror" "raidz" "raidz2"
 do
diff --git a/tests/zfs-tests/tests/functional/slog/slog_009_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_009_neg.ksh
index e7091f17b759..222f71a99928 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_009_neg.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_009_neg.ksh
@@ -45,6 +45,7 @@ verify_runnable "global"
 
 log_assert "A raidz/raidz2 log can not be added to existed pool."
 log_onexit cleanup
+log_must setup
 
 for type in "" "mirror" "raidz" "raidz2"
 do
diff --git a/tests/zfs-tests/tests/functional/slog/slog_010_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_010_neg.ksh
index 8fe248ffbcba..edd9abea0930 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_010_neg.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_010_neg.ksh
@@ -46,6 +46,7 @@ verify_runnable "global"
 
 log_assert "Slog device can not be replaced with spare device."
 log_onexit cleanup
+log_must setup
 
 log_must zpool create $TESTPOOL $VDEV spare $SDEV log $LDEV
 sdev=$(random_get $SDEV)
diff --git a/tests/zfs-tests/tests/functional/slog/slog_011_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_011_neg.ksh
index 2dad200b31c1..3bebc8201713 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_011_neg.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_011_neg.ksh
@@ -46,6 +46,7 @@ verify_runnable "global"
 
 log_assert "Offline and online a log device passes."
 log_onexit cleanup
+log_must setup
 
 for type in "" "mirror" "raidz" "raidz2"
 do
diff --git a/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh
index 45566d427f1d..8d6fb2bffb7f 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh
@@ -45,6 +45,7 @@ verify_runnable "global"
 
 log_assert "Pool can survive when one of mirror log device get corrupted."
 log_onexit cleanup
+log_must setup
 
 for type in "" "mirror" "raidz" "raidz2"
 do
diff --git a/tests/zfs-tests/tests/functional/slog/slog_013_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_013_pos.ksh
index bbe5adc24174..d6917065ddbf 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_013_pos.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_013_pos.ksh
@@ -60,6 +60,7 @@ log_assert "Verify slog device can be disk, file, lofi device or any device " \
 	"that presents a block interface."
 verify_disk_count "$DISKS" 2
 log_onexit cleanup_testenv
+log_must setup
 
 dsk1=${DISKS%% *}
 log_must zpool create $TESTPOOL ${DISKS#$dsk1}
diff --git a/tests/zfs-tests/tests/functional/slog/slog_014_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_014_pos.ksh
index 0ec96ae1e6f7..e8ea29f1ffa3 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_014_pos.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_014_pos.ksh
@@ -44,6 +44,7 @@
 verify_runnable "global"
 
 log_assert "log device can survive when one of the pool device get corrupted."
+log_must setup
 
 for type in "mirror" "raidz" "raidz2"; do
 	for spare in "" "spare"; do
diff --git a/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh
index 37821888ea00..fa6105116574 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh
@@ -47,6 +47,7 @@ function cleanup
 
 ORIG_TIMEOUT=$(get_tunable zfs_commit_timeout_pct | tail -1 | awk '{print $NF}')
 log_onexit cleanup
+log_must setup
 
 for PCT in 0 1 2 4 8 16 32 64 128 256 512 1024; do
 	log_must set_tunable64 zfs_commit_timeout_pct $PCT
diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh
index ea3f8451b9e3..3e5bccd2ef18 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh
@@ -66,6 +66,7 @@ function cleanup_fs
 
 log_assert "Replay of intent log succeeds."
 log_onexit cleanup_fs
+log_must setup
 
 #
 # 1. Create an empty file system (TESTFS)
diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh
index c8a3cbbf43c4..a72c83b5bfc6 100755
--- a/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh
+++ b/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh
@@ -76,6 +76,7 @@ function cleanup_volume
 
 log_assert "Replay of intent log succeeds."
 log_onexit cleanup_volume
+log_must setup
 
 #
 # 1. Create an empty volume (TESTVOL), set sync=always, and format

From 95319fc569cf1ab322926f037b92dd4fd15b5630 Mon Sep 17 00:00:00 2001
From: Tom Caputi <tcaputi@datto.com>
Date: Tue, 27 Aug 2019 12:55:51 -0400
Subject: [PATCH 090/109] Fix deadlock in 'zfs rollback'

Currently, the 'zfs rollback' code can end up deadlocked due to
the way the kernel handles unreferenced inodes on a suspended fs.
Essentially, the zfs_resume_fs() code path may cause zfs to spawn
new threads as it reinstantiates the suspended fs's zil. When a
new thread is spawned, the kernel may attempt to free memory for
that thread by freeing some unreferenced inodes. If it happens to
select inodes that are a part of the suspended fs, a deadlock
will occur because freeing inodes requires holding the fs's
z_teardown_inactive_lock which is still held from the suspend.

This patch corrects this issue by adding an additional reference
to all inodes that are still present when a suspend is initiated.
This prevents them from being freed by the kernel for any reason.

Reviewed-by: Alek Pinchuk <apinchuk@datto.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #9203
---
 include/sys/zfs_znode.h |  1 +
 module/zfs/zfs_vfsops.c | 16 +++++++++++++++-
 module/zfs/zfs_znode.c  |  1 +
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h
index add45a7f46e4..01b358cc4da8 100644
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@@ -196,6 +196,7 @@ typedef struct znode {
 	uint8_t		z_atime_dirty;	/* atime needs to be synced */
 	uint8_t		z_zn_prefetch;	/* Prefetch znodes? */
 	uint8_t		z_moved;	/* Has this znode been moved? */
+	boolean_t   z_suspended;    /* extra ref from a suspend? */
 	uint_t		z_blksz;	/* block size in bytes */
 	uint_t		z_seq;		/* modification sequence number */
 	uint64_t	z_mapcnt;	/* number of pages mapped to file */
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index 371c412f6beb..489f12b7fc0f 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -1736,7 +1736,12 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 	 * will fail with EIO since we have z_teardown_lock for writer (only
 	 * relevant for forced unmount).
 	 *
-	 * Release all holds on dbufs.
+	 * Release all holds on dbufs. We also grab an extra reference to all
+	 * the remaining inodes so that the kernel does not attempt to free
+	 * any inodes of a suspended fs. This can cause deadlocks since the
+	 * zfs_resume_fs() process may involve starting threads, which might
+	 * attempt to free unreferenced inodes to free up memory for the new
+	 * thread.
 	 */
 	if (!unmounting) {
 		mutex_enter(&zfsvfs->z_znodes_lock);
@@ -1744,6 +1749,9 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
 			if (zp->z_sa_hdl)
 				zfs_znode_dmu_fini(zp);
+			if (igrab(ZTOI(zp)) != NULL)
+				zp->z_suspended = B_TRUE;
+
 		}
 		mutex_exit(&zfsvfs->z_znodes_lock);
 	}
@@ -2192,6 +2200,12 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
 			remove_inode_hash(ZTOI(zp));
 			zp->z_is_stale = B_TRUE;
 		}
+
+		/* see comment in zfs_suspend_fs() */
+		if (zp->z_suspended) {
+			zfs_iput_async(ZTOI(zp));
+			zp->z_suspended = B_FALSE;
+		}
 	}
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index 3dd299942202..91162e857d44 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -540,6 +540,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	ASSERT3P(zp->z_acl_cached, ==, NULL);
 	ASSERT3P(zp->z_xattr_cached, ==, NULL);
 	zp->z_moved = 0;
+	zp->z_suspended = B_FALSE;
 	zp->z_sa_hdl = NULL;
 	zp->z_unlinked = 0;
 	zp->z_atime_dirty = 0;

From ea34735203a259f331dc549c25c7ed92b34cd470 Mon Sep 17 00:00:00 2001
From: Richard Allen <33836503+belperite@users.noreply.github.com>
Date: Tue, 27 Aug 2019 21:44:02 +0100
Subject: [PATCH 091/109] Fix Plymouth passphrase prompt in initramfs script

Entering the ZFS encryption passphrase under Plymouth wasn't working
because in the ZFS initrd script, Plymouth was calling zfs via
"--command", which wasn't passing through the filesystem argument to
zfs load-key properly (the single quotes around the filesystem name,
intended to handle spaces, were passed through literally, which
zfs load-key couldn't understand).

Reviewed-by: Richard Laager <rlaager@wiktel.com>
Reviewed-by: Garrett Fields <ghfields@gmail.com>
Signed-off-by: Richard Allen <belperite@gmail.com>
Issue #9193
Closes #9202
---
 contrib/initramfs/scripts/zfs.in | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/contrib/initramfs/scripts/zfs.in b/contrib/initramfs/scripts/zfs.in
index ad604a82ce52..05410ea2bdce 100644
--- a/contrib/initramfs/scripts/zfs.in
+++ b/contrib/initramfs/scripts/zfs.in
@@ -411,29 +411,29 @@ decrypt_fs()
 
 		# Determine dataset that holds key for root dataset
 		ENCRYPTIONROOT=$(${ZFS} get -H -o value encryptionroot "${fs}")
-		DECRYPT_CMD="${ZFS} load-key '${ENCRYPTIONROOT}'"
 
 		# If root dataset is encrypted...
 		if ! [ "${ENCRYPTIONROOT}" = "-" ]; then
-
+			TRY_COUNT=3
 			# Prompt with plymouth, if active
 			if [ -e /bin/plymouth ] && /bin/plymouth --ping 2>/dev/null; then
-				plymouth ask-for-password --prompt "Encrypted ZFS password for ${ENCRYPTIONROOT}" \
-					--number-of-tries="3" \
-					--command="${DECRYPT_CMD}"
+				while [ $TRY_COUNT -gt 0 ]; do
+					plymouth ask-for-password --prompt "Encrypted ZFS password for ${ENCRYPTIONROOT}" | \
+						$ZFS load-key "${ENCRYPTIONROOT}" && break
+					TRY_COUNT=$((TRY_COUNT - 1))
+				done
 
 			# Prompt with systemd, if active 
 			elif [ -e /run/systemd/system ]; then
-				TRY_COUNT=3
 				while [ $TRY_COUNT -gt 0 ]; do
 					systemd-ask-password "Encrypted ZFS password for ${ENCRYPTIONROOT}" --no-tty | \
-						${DECRYPT_CMD} && break
+						$ZFS load-key "${ENCRYPTIONROOT}" && break
 					TRY_COUNT=$((TRY_COUNT - 1))
 				done
 
 			# Prompt with ZFS tty, otherwise
 			else
-				eval "${DECRYPT_CMD}"
+				$ZFS load-key "${ENCRYPTIONROOT}"
 			fi
 		fi
 	fi

From 931bef81c8a4bda13e22be770c1dca3721dffc0f Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Tue, 27 Aug 2019 23:45:53 +0300
Subject: [PATCH 092/109] zfs_ioc_snapshot: check user-prop permissions on
 snapshotted datasets

Previously, the permissions were checked on the pool, which was obviously
incorrect.

After this change, zfs_check_userprops() only validates the properties
without any permission checks.  The permissions are checked individually
for each snapshotted dataset.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Signed-off-by: Andriy Gapon <avg@FreeBSD.org>
Closes #9179
Closes #9180
---
 module/zfs/zfs_ioctl.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 152433d60790..ac573ccbf170 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -2744,10 +2744,9 @@ zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
  * Check that all the properties are valid user properties.
  */
 static int
-zfs_check_userprops(const char *fsname, nvlist_t *nvl)
+zfs_check_userprops(nvlist_t *nvl)
 {
 	nvpair_t *pair = NULL;
-	int error = 0;
 
 	while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
 		const char *propname = nvpair_name(pair);
@@ -2756,10 +2755,6 @@ zfs_check_userprops(const char *fsname, nvlist_t *nvl)
 		    nvpair_type(pair) != DATA_TYPE_STRING)
 			return (SET_ERROR(EINVAL));
 
-		if ((error = zfs_secpolicy_write_perms(fsname,
-		    ZFS_DELEG_PERM_USERPROP, CRED())))
-			return (error);
-
 		if (strlen(propname) >= ZAP_MAXNAMELEN)
 			return (SET_ERROR(ENAMETOOLONG));
 
@@ -3473,19 +3468,18 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 	nvpair_t *pair;
 
 	(void) nvlist_lookup_nvlist(innvl, "props", &props);
-	if ((error = zfs_check_userprops(poolname, props)) != 0)
-		return (error);
-
 	if (!nvlist_empty(props) &&
 	    zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS))
 		return (SET_ERROR(ENOTSUP));
+	if ((error = zfs_check_userprops(props)) != 0)
+		return (error);
 
 	snaps = fnvlist_lookup_nvlist(innvl, "snaps");
 	poollen = strlen(poolname);
 	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(snaps, pair)) {
 		const char *name = nvpair_name(pair);
-		const char *cp = strchr(name, '@');
+		char *cp = strchr(name, '@');
 
 		/*
 		 * The snap name must contain an @, and the part after it must
@@ -3502,6 +3496,18 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 		    (name[poollen] != '/' && name[poollen] != '@'))
 			return (SET_ERROR(EXDEV));
 
+		/*
+		 * Check for permission to set the properties on the fs.
+		 */
+		if (!nvlist_empty(props)) {
+			*cp = '\0';
+			error = zfs_secpolicy_write_perms(name,
+			    ZFS_DELEG_PERM_USERPROP, CRED());
+			*cp = '@';
+			if (error != 0)
+				return (error);
+		}
+
 		/* This must be the only snap of this fs. */
 		for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair);
 		    pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) {

From c7a4255f128cc493df8383cb9f1ed650191b2081 Mon Sep 17 00:00:00 2001
From: Chunwei Chen <david.chen@nutanix.com>
Date: Wed, 28 Aug 2019 10:42:02 -0700
Subject: [PATCH 093/109] Fix zil replay panic when TX_REMOVE followed by
 TX_CREATE

If TX_REMOVE is followed by TX_CREATE on the same object id, we need to
make sure the object removal is completely finished before creation. The
current implementation relies on dnode_hold_impl with
DNODE_MUST_BE_ALLOCATED returning ENOENT. While this check seemed to work
fine before, in the current version it does not guarantee the object removal
is completed.

We fix this by checking whether dnode_hold_impl with DNODE_MUST_BE_FREE
succeeds instead. Also add a test and remove dead code in dnode_hold_impl.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tom Caputi <tcaputi@datto.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7151
Closes #8910
Closes #9123
Closes #9145
---
 include/sys/dnode.h                           |   7 +-
 module/zfs/dnode.c                            |  49 +++++--
 module/zfs/zfs_replay.c                       |   8 +-
 tests/runfiles/linux.run                      |   4 +-
 .../tests/functional/slog/Makefile.am         |   3 +-
 ...g_replay_fs.ksh => slog_replay_fs_001.ksh} |   0
 .../functional/slog/slog_replay_fs_002.ksh    | 137 ++++++++++++++++++
 7 files changed, 184 insertions(+), 24 deletions(-)
 rename tests/zfs-tests/tests/functional/slog/{slog_replay_fs.ksh => slog_replay_fs_001.ksh} (100%)
 create mode 100755 tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh

diff --git a/include/sys/dnode.h b/include/sys/dnode.h
index c60258bbc768..e97e40373b4d 100644
--- a/include/sys/dnode.h
+++ b/include/sys/dnode.h
@@ -46,6 +46,7 @@ extern "C" {
  */
 #define	DNODE_MUST_BE_ALLOCATED	1
 #define	DNODE_MUST_BE_FREE	2
+#define	DNODE_DRY_RUN		4
 
 /*
  * dnode_next_offset() flags.
@@ -415,6 +416,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots,
 boolean_t dnode_add_ref(dnode_t *dn, void *ref);
 void dnode_rele(dnode_t *dn, void *ref);
 void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
+int dnode_try_claim(objset_t *os, uint64_t object, int slots);
 void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
 void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
 void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
@@ -532,11 +534,6 @@ typedef struct dnode_stats {
 	 * a range of dnode slots which would overflow the dnode_phys_t.
 	 */
 	kstat_named_t dnode_hold_free_overflow;
-	/*
-	 * Number of times a dnode_hold(...) was attempted on a dnode
-	 * which had already been unlinked in an earlier txg.
-	 */
-	kstat_named_t dnode_hold_free_txg;
 	/*
 	 * Number of times dnode_free_interior_slots() needed to retry
 	 * acquiring a slot zrl lock due to contention.
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 5fd473303d7d..cc7bc5ec82c8 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -55,7 +55,6 @@ dnode_stats_t dnode_stats = {
 	{ "dnode_hold_free_lock_retry",		KSTAT_DATA_UINT64 },
 	{ "dnode_hold_free_overflow",		KSTAT_DATA_UINT64 },
 	{ "dnode_hold_free_refcount",		KSTAT_DATA_UINT64 },
-	{ "dnode_hold_free_txg",		KSTAT_DATA_UINT64 },
 	{ "dnode_free_interior_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "dnode_allocate",			KSTAT_DATA_UINT64 },
 	{ "dnode_reallocate",			KSTAT_DATA_UINT64 },
@@ -1255,6 +1254,10 @@ dnode_buf_evict_async(void *dbu)
  * as an extra dnode slot by an large dnode, in which case it returns
  * ENOENT.
  *
+ * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
+ * return whether the hold would succeed or not. tag and dnp should set to
+ * NULL in this case.
+ *
  * errors:
  * EINVAL - Invalid object number or flags.
  * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
@@ -1283,6 +1286,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 
 	ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
 	ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+	IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));
 
 	/*
 	 * If you are holding the spa config lock as writer, you shouldn't
@@ -1312,8 +1316,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
 			return (SET_ERROR(EEXIST));
 		DNODE_VERIFY(dn);
-		(void) zfs_refcount_add(&dn->dn_holds, tag);
-		*dnp = dn;
+		/* Don't actually hold if dry run, just return 0 */
+		if (!(flag & DNODE_DRY_RUN)) {
+			(void) zfs_refcount_add(&dn->dn_holds, tag);
+			*dnp = dn;
+		}
 		return (0);
 	}
 
@@ -1455,6 +1462,14 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 			return (SET_ERROR(ENOENT));
 		}
 
+		/* Don't actually hold if dry run, just return 0 */
+		if (flag & DNODE_DRY_RUN) {
+			mutex_exit(&dn->dn_mtx);
+			dnode_slots_rele(dnc, idx, slots);
+			dbuf_rele(db, FTAG);
+			return (0);
+		}
+
 		DNODE_STAT_BUMP(dnode_hold_alloc_hits);
 	} else if (flag & DNODE_MUST_BE_FREE) {
 
@@ -1512,6 +1527,14 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 			return (SET_ERROR(EEXIST));
 		}
 
+		/* Don't actually hold if dry run, just return 0 */
+		if (flag & DNODE_DRY_RUN) {
+			mutex_exit(&dn->dn_mtx);
+			dnode_slots_rele(dnc, idx, slots);
+			dbuf_rele(db, FTAG);
+			return (0);
+		}
+
 		dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
 		DNODE_STAT_BUMP(dnode_hold_free_hits);
 	} else {
@@ -1519,15 +1542,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 		return (SET_ERROR(EINVAL));
 	}
 
-	if (dn->dn_free_txg) {
-		DNODE_STAT_BUMP(dnode_hold_free_txg);
-		type = dn->dn_type;
-		mutex_exit(&dn->dn_mtx);
-		dnode_slots_rele(dnc, idx, slots);
-		dbuf_rele(db, FTAG);
-		return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ?
-		    ENOENT : EEXIST));
-	}
+	ASSERT0(dn->dn_free_txg);
 
 	if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
 		dbuf_add_ref(db, dnh);
@@ -1618,6 +1633,16 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
 	}
 }
 
+/*
+ * Test whether we can create a dnode at the specified location.
+ */
+int
+dnode_try_claim(objset_t *os, uint64_t object, int slots)
+{
+	return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN,
+	    slots, NULL, NULL));
+}
+
 void
 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 {
diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c
index 144381769059..7dea85bb6614 100644
--- a/module/zfs/zfs_replay.c
+++ b/module/zfs/zfs_replay.c
@@ -337,8 +337,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
 	xva.xva_vattr.va_nblocks = lr->lr_gen;
 	xva.xva_vattr.va_fsid = dnodesize;
 
-	error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
-	if (error != ENOENT)
+	error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
+	if (error)
 		goto bail;
 
 	if (lr->lr_common.lrc_txtype & TX_CI)
@@ -473,8 +473,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
 	xva.xva_vattr.va_nblocks = lr->lr_gen;
 	xva.xva_vattr.va_fsid = dnodesize;
 
-	error = dmu_object_info(zfsvfs->z_os, objid, NULL);
-	if (error != ENOENT)
+	error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
+	if (error)
 		goto out;
 
 	if (lr->lr_common.lrc_txtype & TX_CI)
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 1c368d20c454..0e157cf0e98e 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -824,8 +824,8 @@ tags = ['functional', 'scrub_mirror']
 tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos',
     'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg',
     'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg',
-    'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs',
-    'slog_replay_volume']
+    'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001',
+    'slog_replay_fs_002', 'slog_replay_volume']
 tags = ['functional', 'slog']
 
 [tests/functional/snapshot]
diff --git a/tests/zfs-tests/tests/functional/slog/Makefile.am b/tests/zfs-tests/tests/functional/slog/Makefile.am
index 4548ce63b40c..33e3a6d3a496 100644
--- a/tests/zfs-tests/tests/functional/slog/Makefile.am
+++ b/tests/zfs-tests/tests/functional/slog/Makefile.am
@@ -17,7 +17,8 @@ dist_pkgdata_SCRIPTS = \
 	slog_013_pos.ksh \
 	slog_014_pos.ksh \
 	slog_015_neg.ksh \
-	slog_replay_fs.ksh \
+	slog_replay_fs_001.ksh \
+	slog_replay_fs_002.ksh \
 	slog_replay_volume.ksh
 
 dist_pkgdata_DATA = \
diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh
similarity index 100%
rename from tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh
rename to tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh
diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh
new file mode 100755
index 000000000000..3c3ccdf4ad23
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh
@@ -0,0 +1,137 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+. $STF_SUITE/tests/functional/slog/slog.kshlib
+
+#
+# DESCRIPTION:
+#	Verify slog replay correctly when TX_REMOVEs are followed by
+#	TX_CREATEs.
+#
+# STRATEGY:
+#	1. Create a file system (TESTFS) with a lot of files
+#	2. Freeze TESTFS
+#	3. Remove all files then create a lot of files
+#	4. Copy TESTFS to temporary location (TESTDIR/copy)
+#	5. Unmount filesystem
+#	   <at this stage TESTFS is empty again and unfrozen, and the
+#	   intent log contains a complete set of deltas to replay it>
+#	6. Remount TESTFS <which replays the intent log>
+#	7. Compare TESTFS against the TESTDIR/copy
+#
+
+verify_runnable "global"
+
+function cleanup_fs
+{
+	cleanup
+}
+
+log_assert "Replay of intent log succeeds."
+log_onexit cleanup_fs
+log_must setup
+
+#
+# 1. Create a file system (TESTFS) with a lot of files
+#
+log_must zpool create $TESTPOOL $VDEV log mirror $LDEV
+log_must zfs set compression=on $TESTPOOL
+log_must zfs create $TESTPOOL/$TESTFS
+
+# Prep for the test of TX_REMOVE followed by TX_CREATE
+dnsize=(legacy auto 1k 2k 4k 8k 16k)
+NFILES=200
+log_must mkdir /$TESTPOOL/$TESTFS/dir0
+log_must eval 'for i in $(seq $NFILES); do zfs set dnodesize=${dnsize[$RANDOM % ${#dnsize[@]}]} $TESTPOOL/$TESTFS; touch /$TESTPOOL/$TESTFS/dir0/file.$i; done'
+
+#
+# Reimport to reset dnode allocation pointer.
+# This is to make sure we will have TX_REMOVE and TX_CREATE on same id
+#
+log_must zpool export $TESTPOOL
+log_must zpool import -f -d $VDIR $TESTPOOL
+
+#
+# This dd command works around an issue where ZIL records aren't created
+# after freezing the pool unless a ZIL header already exists. Create a file
+# synchronously to force ZFS to write one out.
+#
+log_must dd if=/dev/zero of=/$TESTPOOL/$TESTFS/sync \
+    conv=fdatasync,fsync bs=1 count=1
+
+#
+# 2. Freeze TESTFS
+#
+log_must zpool freeze $TESTPOOL
+
+#
+# 3. Remove all files then create a lot of files
+#
+# TX_REMOVE followed by TX_CREATE
+log_must eval 'rm -f /$TESTPOOL/$TESTFS/dir0/*'
+log_must eval 'for i in $(seq $NFILES); do zfs set dnodesize=${dnsize[$RANDOM % ${#dnsize[@]}]} $TESTPOOL/$TESTFS; touch /$TESTPOOL/$TESTFS/dir0/file.$i; done'
+
+#
+# 4. Copy TESTFS to temporary location (TESTDIR/copy)
+#
+log_must mkdir -p $TESTDIR/copy
+log_must cp -a /$TESTPOOL/$TESTFS/* $TESTDIR/copy/
+
+#
+# 5. Unmount filesystem and export the pool
+#
+# At this stage TESTFS is empty again and frozen, the intent log contains
+# a complete set of deltas to replay.
+#
+log_must zfs unmount /$TESTPOOL/$TESTFS
+
+log_note "Verify transactions to replay:"
+log_must zdb -iv $TESTPOOL/$TESTFS
+
+log_must zpool export $TESTPOOL
+
+#
+# 6. Remount TESTFS <which replays the intent log>
+#
+# Import the pool to unfreeze it and claim log blocks.  It has to be
+# `zpool import -f` because we can't write a frozen pool's labels!
+#
+log_must zpool import -f -d $VDIR $TESTPOOL
+
+#
+# 7. Compare TESTFS against the TESTDIR/copy
+#
+log_note "Verify current block usage:"
+log_must zdb -bcv $TESTPOOL
+
+log_note "Verify number of files"
+log_must test "$(ls /$TESTPOOL/$TESTFS/dir0 | wc -l)" -eq $NFILES
+
+log_note "Verify working set diff:"
+log_must diff -r /$TESTPOOL/$TESTFS $TESTDIR/copy
+
+log_pass "Replay of intent log succeeds."

From 0e765c4eb89346a77733037a46b32aec85205a19 Mon Sep 17 00:00:00 2001
From: Pavel Zakharov <pavel.zakharov@delphix.com>
Date: Wed, 28 Aug 2019 18:02:58 -0400
Subject: [PATCH 094/109] zfs_handle used after being closed/freed in
 change_one callback

This is a typical case of use after free. We would call zfs_close(zhp)
which would free the handle, and then call zfs_iter_children() on that
handle later.  This change ensures that the zfs_handle is only closed
when we are ready to return.

Running `zfs inherit -r sharenfs pool` was failing with an error
code without any error messages. After some debugging I've pinpointed
the issue to be memory corruption, which would cause zfs to try to
issue an ioctl to the wrong device and receive ENOTTY.

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alek Pinchuk <apinchuk@datto.com>
Signed-off-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Issue #7967
Closes #9165
---
 lib/libzfs/libzfs_changelist.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c
index 3101febc1605..72f641056edc 100644
--- a/lib/libzfs/libzfs_changelist.c
+++ b/lib/libzfs/libzfs_changelist.c
@@ -475,9 +475,10 @@ change_one(zfs_handle_t *zhp, void *data)
 	prop_changelist_t *clp = data;
 	char property[ZFS_MAXPROPLEN];
 	char where[64];
-	prop_changenode_t *cn;
+	prop_changenode_t *cn = NULL;
 	zprop_source_t sourcetype = ZPROP_SRC_NONE;
 	zprop_source_t share_sourcetype = ZPROP_SRC_NONE;
+	int ret = 0;
 
 	/*
 	 * We only want to unmount/unshare those filesystems that may inherit
@@ -493,8 +494,7 @@ change_one(zfs_handle_t *zhp, void *data)
 	    zfs_prop_get(zhp, clp->cl_prop, property,
 	    sizeof (property), &sourcetype, where, sizeof (where),
 	    B_FALSE) != 0) {
-		zfs_close(zhp);
-		return (0);
+		goto out;
 	}
 
 	/*
@@ -506,8 +506,7 @@ change_one(zfs_handle_t *zhp, void *data)
 	    zfs_prop_get(zhp, clp->cl_shareprop, property,
 	    sizeof (property), &share_sourcetype, where, sizeof (where),
 	    B_FALSE) != 0) {
-		zfs_close(zhp);
-		return (0);
+		goto out;
 	}
 
 	if (clp->cl_alldependents || clp->cl_allchildren ||
@@ -518,8 +517,8 @@ change_one(zfs_handle_t *zhp, void *data)
 	    share_sourcetype == ZPROP_SRC_INHERITED))) {
 		if ((cn = zfs_alloc(zfs_get_handle(zhp),
 		    sizeof (prop_changenode_t))) == NULL) {
-			zfs_close(zhp);
-			return (-1);
+			ret = -1;
+			goto out;
 		}
 
 		cn->cn_handle = zhp;
@@ -541,16 +540,23 @@ change_one(zfs_handle_t *zhp, void *data)
 			uu_avl_insert(clp->cl_tree, cn, idx);
 		} else {
 			free(cn);
-			zfs_close(zhp);
+			cn = NULL;
 		}
 
 		if (!clp->cl_alldependents)
-			return (zfs_iter_children(zhp, change_one, data));
-	} else {
-		zfs_close(zhp);
+			ret = zfs_iter_children(zhp, change_one, data);
+
+		/*
+		 * If we added the handle to the changelist, we will re-use it
+		 * later so return without closing it.
+		 */
+		if (cn != NULL)
+			return (ret);
 	}
 
-	return (0);
+out:
+	zfs_close(zhp);
+	return (ret);
 }
 
 static int

From 3cf4ecb03fecca9d9a326c32e8f1f7573a93a8e3 Mon Sep 17 00:00:00 2001
From: Georgy Yakovlev <168902+gyakovlev@users.noreply.github.com>
Date: Thu, 29 Aug 2019 12:14:48 -0800
Subject: [PATCH 095/109] etc/init.d/zfs-functions.in: remove arch warning

Remove the x86_64 warning, it's no longer the case that this is the
only supported architecture.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Georgy Yakovlev <gyakovlev@gentoo.org>
Closes: #9177
---
 etc/init.d/zfs-functions.in | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/etc/init.d/zfs-functions.in b/etc/init.d/zfs-functions.in
index 490503e91391..cbc7fd22a0a0 100644
--- a/etc/init.d/zfs-functions.in
+++ b/etc/init.d/zfs-functions.in
@@ -294,13 +294,6 @@ checksystem()
 	# Just make sure that /dev/zfs is created.
 	udev_trigger
 
-	if ! [ "$(uname -m)" = "x86_64" ]; then
-		echo "Warning: You're not running 64bit. Currently native zfs in";
-		echo "         Linux is only supported and tested on 64bit.";
-		# should we break here? People doing this should know what they
-		# do, thus i'm not breaking here.
-	fi
-
 	return 0
 }
 

From 13e5e396a31df268cba6571a800abe9e54c47db4 Mon Sep 17 00:00:00 2001
From: loli10K <loli10K@users.noreply.github.com>
Date: Tue, 3 Sep 2019 19:36:33 +0200
Subject: [PATCH 096/109] Fix Intel QAT / ZFS compatibility on v4.7.1+ kernels

This change uses the compat code introduced in 9cc1844a.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
Closes #9268
Closes #9269
---
 module/zfs/qat_compress.c | 2 +-
 module/zfs/qat_crypt.c    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/module/zfs/qat_compress.c b/module/zfs/qat_compress.c
index 1c5c0a4e7256..b3c8c1621675 100644
--- a/module/zfs/qat_compress.c
+++ b/module/zfs/qat_compress.c
@@ -547,7 +547,7 @@ qat_compress(qat_compress_dir_t dir, char *src, int src_len,
 }
 
 static int
-param_set_qat_compress(const char *val, struct kernel_param *kp)
+param_set_qat_compress(const char *val, zfs_kernel_param_t *kp)
 {
 	int ret;
 	int *pvalue = kp->arg;
diff --git a/module/zfs/qat_crypt.c b/module/zfs/qat_crypt.c
index 34c19b5823a8..2170366df142 100644
--- a/module/zfs/qat_crypt.c
+++ b/module/zfs/qat_crypt.c
@@ -578,7 +578,7 @@ qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp)
 }
 
 static int
-param_set_qat_encrypt(const char *val, struct kernel_param *kp)
+param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp)
 {
 	int ret;
 	int *pvalue = kp->arg;
@@ -600,7 +600,7 @@ param_set_qat_encrypt(const char *val, struct kernel_param *kp)
 }
 
 static int
-param_set_qat_checksum(const char *val, struct kernel_param *kp)
+param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp)
 {
 	int ret;
 	int *pvalue = kp->arg;

From beb21db3c6ac503a43ef7c6532d099c056f89f5b Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Tue, 3 Sep 2019 20:56:55 +0300
Subject: [PATCH 097/109] Always refuse receiving non-resume stream when resume
 state exists

This fixes a hole in the situation where the resume state is left from
receiving a new dataset and, so, the state is set on the dataset itself
(as opposed to %recv child).

Additionally, distinguish incremental and resume streams in error
messages.

Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tom Caputi <tcaputi@datto.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Andriy Gapon <avg@FreeBSD.org>
Closes #9252
---
 lib/libzfs/libzfs_sendrecv.c | 15 +++++++++++----
 module/zfs/dmu_recv.c        | 10 +++++++---
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index 0d3853e0a1c4..d967e043b4e5 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -3992,11 +3992,18 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 		}
 	} else {
 		/*
-		 * if the fs does not exist, look for it based on the
-		 * fromsnap GUID
+		 * If the fs does not exist, look for it based on the
+		 * fromsnap GUID.
 		 */
-		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-		    "cannot receive incremental stream"));
+		if (resuming) {
+			(void) snprintf(errbuf, sizeof (errbuf),
+			    dgettext(TEXT_DOMAIN,
+			    "cannot receive resume stream"));
+		} else {
+			(void) snprintf(errbuf, sizeof (errbuf),
+			    dgettext(TEXT_DOMAIN,
+			    "cannot receive incremental stream"));
+		}
 
 		(void) strcpy(name, destsnap);
 		*strchr(name, '@') = '\0';
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 3481feb21dbc..2324e8e87ba2 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -86,21 +86,25 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
 	boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
 	boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0;
 
-	/* temporary clone name must not exist */
+	/* Temporary clone name must not exist. */
 	error = zap_lookup(dp->dp_meta_objset,
 	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
 	    8, 1, &val);
 	if (error != ENOENT)
 		return (error == 0 ? EBUSY : error);
 
-	/* new snapshot name must not exist */
+	/* Resume state must not be set. */
+	if (dsl_dataset_has_resume_receive_state(ds))
+		return (SET_ERROR(EBUSY));
+
+	/* New snapshot name must not exist. */
 	error = zap_lookup(dp->dp_meta_objset,
 	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
 	    drba->drba_cookie->drc_tosnap, 8, 1, &val);
 	if (error != ENOENT)
 		return (error == 0 ? EEXIST : error);
 
-	/* must not have children if receiving a ZVOL */
+	/* Must not have children if receiving a ZVOL. */
 	error = zap_count(dp->dp_meta_objset,
 	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children);
 	if (error != 0)

From 38528476bf0b64e7462a1141ff73d016a94f3471 Mon Sep 17 00:00:00 2001
From: Pavel Zakharov <pavel.zakharov@delphix.com>
Date: Wed, 17 Jul 2019 18:33:05 -0400
Subject: [PATCH 098/109] New service that waits on zvol links to be created
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The zfs-volume-wait.service scans existing zvols and waits for their
links under /dev to be created. Any service that depends on zvol
links being present should add a dependency on zfs-volumes.target.
By default, this target is not enabled.

Reviewed-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
Reviewed-by: Antonio Russo <antonio.e.russo@gmail.com>
Reviewed-by: Richard Laager <rlaager@wiktel.com>
Reviewed-by: loli10K <ezomori.nozomu@gmail.com>
Reviewed-by: John Gallagher <john.gallagher@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Pavel Zakharov <pzakharov@delphix.com>
Closes #8975
---
 cmd/Makefile.am                               |  2 +-
 cmd/zvol_wait/Makefile.am                     |  1 +
 cmd/zvol_wait/zvol_wait                       | 93 +++++++++++++++++++
 configure.ac                                  |  1 +
 etc/systemd/system/50-zfs.preset.in           |  1 +
 etc/systemd/system/Makefile.am                |  4 +
 etc/systemd/system/zfs-volume-wait.service.in | 13 +++
 etc/systemd/system/zfs-volumes.target.in      |  7 ++
 man/man1/Makefile.am                          |  2 +-
 man/man1/zvol_wait.1                          | 21 +++++
 rpm/generic/zfs.spec.in                       |  3 +-
 11 files changed, 145 insertions(+), 3 deletions(-)
 create mode 100644 cmd/zvol_wait/Makefile.am
 create mode 100755 cmd/zvol_wait/zvol_wait
 create mode 100644 etc/systemd/system/zfs-volume-wait.service.in
 create mode 100644 etc/systemd/system/zfs-volumes.target.in
 create mode 100644 man/man1/zvol_wait.1

diff --git a/cmd/Makefile.am b/cmd/Makefile.am
index 0d990789b0c6..88609e455f2b 100644
--- a/cmd/Makefile.am
+++ b/cmd/Makefile.am
@@ -5,4 +5,4 @@ if USING_PYTHON
 SUBDIRS += arcstat arc_summary dbufstat
 endif
 
-SUBDIRS += mount_zfs zed zvol_id
+SUBDIRS += mount_zfs zed zvol_id zvol_wait
diff --git a/cmd/zvol_wait/Makefile.am b/cmd/zvol_wait/Makefile.am
new file mode 100644
index 000000000000..564031c9799d
--- /dev/null
+++ b/cmd/zvol_wait/Makefile.am
@@ -0,0 +1 @@
+dist_bin_SCRIPTS = zvol_wait
diff --git a/cmd/zvol_wait/zvol_wait b/cmd/zvol_wait/zvol_wait
new file mode 100755
index 000000000000..d512be41bcb5
--- /dev/null
+++ b/cmd/zvol_wait/zvol_wait
@@ -0,0 +1,93 @@
+#!/bin/sh
+
+count_zvols() {
+	if [ -z "$zvols" ]; then
+		echo 0
+	else
+		echo "$zvols" | wc -l
+	fi
+}
+
+filter_out_zvols_with_links() {
+	while read -r zvol; do
+		if [ ! -L "/dev/zvol/$zvol" ]; then
+			echo "$zvol"
+		fi
+	done
+}
+
+filter_out_deleted_zvols() {
+	while read -r zvol; do
+		if zfs list "$zvol" >/dev/null 2>&1; then
+			echo "$zvol"
+		fi
+	done
+}
+
+list_zvols() {
+	zfs list -t volume -H -o name,volmode | while read -r zvol_line; do
+		name=$(echo "$zvol_line" | awk '{print $1}')
+		volmode=$(echo "$zvol_line" | awk '{print $2}')
+		# /dev links are not created for zvols with volmode = "none".
+		[ "$volmode" = "none" ] || echo "$name"
+	done
+}
+
+zvols=$(list_zvols)
+zvols_count=$(count_zvols)
+if [ "$zvols_count" -eq 0 ]; then
+	echo "No zvols found, nothing to do."
+	exit 0
+fi
+
+echo "Testing $zvols_count zvol links"
+
+outer_loop=0
+while [ "$outer_loop" -lt 20 ]; do
+	outer_loop=$((outer_loop + 1))
+
+	old_zvols_count=$(count_zvols)
+
+	inner_loop=0
+	while [ "$inner_loop" -lt 30 ]; do
+		inner_loop=$((inner_loop + 1))
+
+		zvols="$(echo "$zvols" | filter_out_zvols_with_links)"
+
+		zvols_count=$(count_zvols)
+		if [ "$zvols_count" -eq 0 ]; then
+			echo "All zvol links are now present."
+			exit 0
+		fi
+		sleep 1
+	done
+
+	echo "Still waiting on $zvols_count zvol links ..."
+	#
+	# Although zvols should normally not be deleted at boot time,
+	# if that is the case then their links will be missing and
+	# we would stall.
+	#
+	if [ "$old_zvols_count" -eq "$zvols_count" ]; then
+		echo "No progress since last loop."
+		echo "Checking if any zvols were deleted."
+
+		zvols=$(echo "$zvols" | filter_out_deleted_zvols)
+		zvols_count=$(count_zvols)
+
+		if [ "$old_zvols_count" -ne "$zvols_count" ]; then
+			echo "$((old_zvols_count - zvols_count)) zvol(s) deleted."
+		fi
+
+		if [ "$zvols_count" -ne 0 ]; then
+			echo "Remaining zvols:"
+			echo "$zvols"
+		else
+			echo "All zvol links are now present."
+			exit 0
+		fi
+	fi
+done
+
+echo "Timed out waiting on zvol links"
+exit 1
diff --git a/configure.ac b/configure.ac
index e8592ffb1d2d..a3ac134ffccf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -123,6 +123,7 @@ AC_CONFIG_FILES([
 	cmd/zed/zed.d/Makefile
 	cmd/raidz_test/Makefile
 	cmd/zgenhostid/Makefile
+	cmd/zvol_wait/Makefile
 	contrib/Makefile
 	contrib/bash_completion.d/Makefile
 	contrib/dracut/Makefile
diff --git a/etc/systemd/system/50-zfs.preset.in b/etc/systemd/system/50-zfs.preset.in
index 884a69b5b683..e4056a92cd98 100644
--- a/etc/systemd/system/50-zfs.preset.in
+++ b/etc/systemd/system/50-zfs.preset.in
@@ -5,4 +5,5 @@ enable zfs-import.target
 enable zfs-mount.service
 enable zfs-share.service
 enable zfs-zed.service
+enable zfs-volume-wait.service
 enable zfs.target
diff --git a/etc/systemd/system/Makefile.am b/etc/systemd/system/Makefile.am
index 1586209caa6d..9249f15eb455 100644
--- a/etc/systemd/system/Makefile.am
+++ b/etc/systemd/system/Makefile.am
@@ -7,7 +7,9 @@ systemdunit_DATA = \
 	zfs-import-scan.service \
 	zfs-mount.service \
 	zfs-share.service \
+	zfs-volume-wait.service \
 	zfs-import.target \
+	zfs-volumes.target \
 	zfs.target
 
 EXTRA_DIST = \
@@ -17,6 +19,8 @@ EXTRA_DIST = \
 	$(top_srcdir)/etc/systemd/system/zfs-mount.service.in \
 	$(top_srcdir)/etc/systemd/system/zfs-share.service.in \
 	$(top_srcdir)/etc/systemd/system/zfs-import.target.in \
+	$(top_srcdir)/etc/systemd/system/zfs-volume-wait.service.in \
+	$(top_srcdir)/etc/systemd/system/zfs-volumes.target.in \
 	$(top_srcdir)/etc/systemd/system/zfs.target.in \
 	$(top_srcdir)/etc/systemd/system/50-zfs.preset.in
 
diff --git a/etc/systemd/system/zfs-volume-wait.service.in b/etc/systemd/system/zfs-volume-wait.service.in
new file mode 100644
index 000000000000..75bd9fcdd56c
--- /dev/null
+++ b/etc/systemd/system/zfs-volume-wait.service.in
@@ -0,0 +1,13 @@
+[Unit]
+Description=Wait for ZFS Volume (zvol) links in /dev
+DefaultDependencies=no
+After=systemd-udev-settle.service
+After=zfs-import.target
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=@bindir@/zvol_wait
+
+[Install]
+WantedBy=zfs-volumes.target
diff --git a/etc/systemd/system/zfs-volumes.target.in b/etc/systemd/system/zfs-volumes.target.in
new file mode 100644
index 000000000000..5cb9a10f49c5
--- /dev/null
+++ b/etc/systemd/system/zfs-volumes.target.in
@@ -0,0 +1,7 @@
+[Unit]
+Description=ZFS volumes are ready
+After=zfs-volume-wait.service
+Requires=zfs-volume-wait.service
+
+[Install]
+WantedBy=zfs.target
diff --git a/man/man1/Makefile.am b/man/man1/Makefile.am
index bd78be1452a8..2af917fa5c2e 100644
--- a/man/man1/Makefile.am
+++ b/man/man1/Makefile.am
@@ -1,4 +1,4 @@
-dist_man_MANS = zhack.1 ztest.1 raidz_test.1
+dist_man_MANS = zhack.1 ztest.1 raidz_test.1 zvol_wait.1
 EXTRA_DIST = cstyle.1
 
 install-data-local:
diff --git a/man/man1/zvol_wait.1 b/man/man1/zvol_wait.1
new file mode 100644
index 000000000000..0366da5376d3
--- /dev/null
+++ b/man/man1/zvol_wait.1
@@ -0,0 +1,21 @@
+.Dd July 5, 2019
+.Dt ZVOL_WAIT 1 SMM
+.Os Linux
+.Sh NAME
+.Nm zvol_wait
+.Nd Wait for ZFS volume links in
+.Em /dev
+to be created.
+.Sh SYNOPSIS
+.Nm
+.Sh DESCRIPTION
+When a ZFS pool is imported, ZFS will register each ZFS volume
+(zvol) as a disk device with the system. As the disks are registered,
+.Xr \fBudev 7\fR
+will asynchronously create symlinks under
+.Em /dev/zvol
+using the zvol's name.
+.Nm
+will wait for all those symlinks to be created before returning.
+.Sh SEE ALSO
+.Xr \fBudev 7\fR
diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
index 0864a72a1155..4fdf7bb69ec7 100644
--- a/rpm/generic/zfs.spec.in
+++ b/rpm/generic/zfs.spec.in
@@ -322,7 +322,7 @@ image which is ZFS aware.
 
 %if 0%{?_systemd}
     %define systemd --enable-systemd --with-systemdunitdir=%{_unitdir} --with-systemdpresetdir=%{_presetdir} --with-systemdmodulesloaddir=%{_modulesloaddir} --with-systemdgeneratordir=%{_systemdgeneratordir} --disable-sysvinit
-    %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-share.service zfs-zed.service zfs.target zfs-import.target
+    %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-share.service zfs-zed.service zfs.target zfs-import.target zfs-volume-wait.service zfs-volumes.target
 %else
     %define systemd --enable-sysvinit --disable-systemd
 %endif
@@ -419,6 +419,7 @@ systemctl --system daemon-reload >/dev/null || true
 %{_sbindir}/*
 %{_bindir}/raidz_test
 %{_bindir}/zgenhostid
+%{_bindir}/zvol_wait
 # Optional Python 2/3 scripts
 %{_bindir}/arc_summary
 %{_bindir}/arcstat

From 5acba22ec0bd934894d746ca967d451fdc6d3368 Mon Sep 17 00:00:00 2001
From: Pavel Zakharov <pavel.zakharov@delphix.com>
Date: Tue, 3 Sep 2019 14:29:52 -0400
Subject: [PATCH 099/109] zvol_wait script should ignore partially received
 zvols

Partially received zvols won't have links in /dev/zvol.

Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Closes #9260
---
 cmd/zvol_wait/zvol_wait | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/cmd/zvol_wait/zvol_wait b/cmd/zvol_wait/zvol_wait
index d512be41bcb5..e5df82dd376a 100755
--- a/cmd/zvol_wait/zvol_wait
+++ b/cmd/zvol_wait/zvol_wait
@@ -25,11 +25,30 @@ filter_out_deleted_zvols() {
 }
 
 list_zvols() {
-	zfs list -t volume -H -o name,volmode | while read -r zvol_line; do
+	zfs list -t volume -H -o name,volmode,receive_resume_token |
+		while read -r zvol_line; do
 		name=$(echo "$zvol_line" | awk '{print $1}')
 		volmode=$(echo "$zvol_line" | awk '{print $2}')
+		token=$(echo "$zvol_line" | awk '{print $3}')
+		#
 		# /dev links are not created for zvols with volmode = "none".
-		[ "$volmode" = "none" ] || echo "$name"
+		#
+		[ "$volmode" = "none" ] && continue
+		#
+		# We also also ignore partially received zvols if it is
+		# not an incremental receive, as those won't even have a block
+		# device minor node created yet.
+		#
+		if [ "$token" != "-" ]; then
+			#
+			# Incremental receives create an invisible clone that
+			# is not automatically displayed by zfs list.
+			#
+			if ! zfs list "$name/%recv" >/dev/null 2>&1; then
+				continue
+			fi
+		fi
+		echo "$name"
 	done
 }
 

From 9f261b1be681e93158d65fa8e5f2a0553af05b20 Mon Sep 17 00:00:00 2001
From: loli10K <loli10K@users.noreply.github.com>
Date: Wed, 4 Sep 2019 00:20:39 +0200
Subject: [PATCH 100/109] Fix zfs-dkms .deb package warning in prerm script

Debian zfs-dkms package generated by alien doesn't call the prerm script
(rpm's %preun) with an integer as first parameter, which results in the
following warning when the package is uninstalled:

   "zfs-dkms.prerm: line 3: [: remove: integer expression expected"

Modify the if-condition to avoid the warning.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
Closes #9271
---
 rpm/generic/zfs-dkms.spec.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in
index 568bef988ca0..d87293686422 100644
--- a/rpm/generic/zfs-dkms.spec.in
+++ b/rpm/generic/zfs-dkms.spec.in
@@ -73,7 +73,7 @@ exit 1
 
 %preun
 # Are we doing an upgrade?
-if [ $1 -ne 0 ] ; then
+if [ "$1" = "1" -o "$1" = "upgrade" ] ; then
 	# Yes we are.  Are we upgrading to a new ZFS version?
 	NEWEST_VER=$(dkms status zfs | sed 's/,//g' | sort -r -V | awk '/installed/{print $2; exit}')
 	if [ "$NEWEST_VER" != "%{version}" ] ; then

From 146d7d8846d532a0ee66454ec0b14d6a511a6228 Mon Sep 17 00:00:00 2001
From: loli10K <loli10K@users.noreply.github.com>
Date: Wed, 4 Sep 2019 22:36:25 +0200
Subject: [PATCH 101/109] Fix zpool subcommands error message with some
 unsupported options

Both 'detach' and 'online' zpool subcommands, when provided with an
unsupported option, forget to print it in the error message:

   # zpool online -t rpool vda3
   invalid option ''
   usage:
      online [-e] <pool> <device> ...

This change fixes the error message in order to include the actual
option that is not supported.

Reviewed-by: Ryan Moeller <ryan@ixsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
Closes #9270
---
 cmd/zpool/zpool_main.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index a3c76030d634..b9c7462b618e 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -6111,9 +6111,8 @@ zpool_do_detach(int argc, char **argv)
 	int ret;
 
 	/* check options */
-	while ((c = getopt(argc, argv, "f")) != -1) {
+	while ((c = getopt(argc, argv, "")) != -1) {
 		switch (c) {
-		case 'f':
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
@@ -6342,12 +6341,11 @@ zpool_do_online(int argc, char **argv)
 	int flags = 0;
 
 	/* check options */
-	while ((c = getopt(argc, argv, "et")) != -1) {
+	while ((c = getopt(argc, argv, "e")) != -1) {
 		switch (c) {
 		case 'e':
 			flags |= ZFS_ONLINE_EXPAND;
 			break;
-		case 't':
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);

From 0ae5f0c8d29f2dff2470779cd7e1b4c3cfeaf12b Mon Sep 17 00:00:00 2001
From: Olaf Faaland <faaland1@llnl.gov>
Date: Fri, 6 Sep 2019 11:30:07 -0700
Subject: [PATCH 102/109] BuildRequires libtirpc-devel needed for RHEL 8

Building against RHEL 8 requires libtirpc-devel, as with fedora 28.
Add rhel8 and centos8 options to the test, to account for that.

The BuildRequires was originally added for Fedora 28 via commit
1a62a305be01972ef1b81469134faa4937836096

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #9289
---
 rpm/generic/zfs.spec.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
index 4fdf7bb69ec7..b9ca5ed5fb74 100644
--- a/rpm/generic/zfs.spec.in
+++ b/rpm/generic/zfs.spec.in
@@ -139,7 +139,7 @@ BuildRequires:  libblkid-devel
 BuildRequires:  libudev-devel
 BuildRequires:  libattr-devel
 BuildRequires:  openssl-devel
-%if 0%{?fedora} >= 28
+%if 0%{?fedora} >= 28 || 0%{?rhel} >= 8 || 0%{?centos} >= 8
 BuildRequires:  libtirpc-devel
 %endif
 Requires:       openssl

From 97d4986214e2f1a003f60a931bb6c9dafdead7bf Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Tue, 10 Sep 2019 13:42:30 -0700
Subject: [PATCH 103/109] Fix /etc/hostid on root pool deadlock

Accidentally introduced by dc04a8c which now takes the SCL_VDEV lock
as a reader in zfs_blkptr_verify().  A deadlock can occur if the
/etc/hostid file resides on a dataset in the same pool.  This is
because reading the /etc/hostid file may occur while the caller is
holding the SCL_VDEV lock as a writer.  For example, to perform a
`zpool attach` as shown in the abbreviated stack below.

To resolve the issue we cache the system's hostid when initializing
the spa_t, or when modifying the multihost property.  The cached
value is then relied upon for subsequent accesses.

Call Trace:
    spa_config_enter+0x1e8/0x350 [zfs]
    zfs_blkptr_verify+0x33c/0x4f0 [zfs] <--- trying read lock
    zio_read+0x6c/0x140 [zfs]
    ...
    vfs_read+0xfc/0x1e0
    kernel_read+0x50/0x90
    ...
    spa_get_hostid+0x1c/0x38 [zfs]
    spa_config_generate+0x1a0/0x610 [zfs]
    vdev_label_init+0xa0/0xc80 [zfs]
    vdev_create+0x98/0xe0 [zfs]
    spa_vdev_attach+0x14c/0xb40 [zfs] <--- grabbed write lock

Reviewed-by: loli10K <ezomori.nozomu@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #9256
Closes #9285
---
 include/sys/spa.h                             |  2 +-
 include/sys/spa_impl.h                        |  1 +
 module/zfs/spa.c                              | 15 ++--
 module/zfs/spa_config.c                       |  2 +-
 module/zfs/spa_misc.c                         | 19 +---
 tests/runfiles/linux.run                      |  2 +-
 .../tests/functional/mmp/Makefile.am          |  1 +
 .../tests/functional/mmp/mmp_hostid.ksh       | 90 +++++++++++++++++++
 8 files changed, 109 insertions(+), 23 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_hostid.ksh

diff --git a/include/sys/spa.h b/include/sys/spa.h
index 23434edbc72e..ca63d3a49058 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1104,7 +1104,7 @@ extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
 extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
 extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
 extern boolean_t spa_multihost(spa_t *spa);
-extern unsigned long spa_get_hostid(void);
+extern uint32_t spa_get_hostid(spa_t *spa);
 extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
 
 extern int spa_mode(spa_t *spa);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 0de8613d3eb8..9ab107599fd6 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -395,6 +395,7 @@ struct spa {
 	mmp_thread_t	spa_mmp;		/* multihost mmp thread */
 	list_t		spa_leaf_list;		/* list of leaf vdevs */
 	uint64_t	spa_leaf_list_gen;	/* track leaf_list changes */
+	uint32_t	spa_hostid;		/* cached system hostid */
 
 	/*
 	 * spa_refcount & spa_config_lock must be the last elements
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index ce622cee88b0..4e322e34b080 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -567,8 +567,13 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 			if (!error && intval > 1)
 				error = SET_ERROR(EINVAL);
 
-			if (!error && !spa_get_hostid())
-				error = SET_ERROR(ENOTSUP);
+			if (!error) {
+				uint32_t hostid = zone_get_hostid(NULL);
+				if (hostid)
+					spa->spa_hostid = hostid;
+				else
+					error = SET_ERROR(ENOTSUP);
+			}
 
 			break;
 
@@ -2496,7 +2501,7 @@ spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
 	if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
 		hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
 
-	if (hostid == spa_get_hostid())
+	if (hostid == spa_get_hostid(spa))
 		return (B_FALSE);
 
 	/*
@@ -3015,7 +3020,7 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
 	    spa->spa_config);
 	if (activity_check) {
 		if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
-		    spa_get_hostid() == 0) {
+		    spa_get_hostid(spa) == 0) {
 			nvlist_free(label);
 			fnvlist_add_uint64(spa->spa_load_info,
 			    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
@@ -3695,7 +3700,7 @@ spa_ld_load_vdev_metadata(spa_t *spa)
 	 * be imported when the system hostid is zero.  The exception to
 	 * this rule is zdb which is always allowed to access pools.
 	 */
-	if (spa_multihost(spa) && spa_get_hostid() == 0 &&
+	if (spa_multihost(spa) && spa_get_hostid(spa) == 0 &&
 	    (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index 6c0894338e25..8c7c14999da6 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -457,7 +457,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 		fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
 		    spa->spa_comment);
 
-	hostid = spa_get_hostid();
+	hostid = spa_get_hostid(spa);
 	if (hostid != 0)
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid);
 	fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname()->nodename);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index a111a9e4e611..185b70201483 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -658,6 +658,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	spa->spa_proc = &p0;
 	spa->spa_proc_state = SPA_PROC_NONE;
 	spa->spa_trust_config = B_TRUE;
+	spa->spa_hostid = zone_get_hostid(NULL);
 
 	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
 	spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
@@ -2540,22 +2541,10 @@ spa_multihost(spa_t *spa)
 	return (spa->spa_multihost ? B_TRUE : B_FALSE);
 }
 
-unsigned long
-spa_get_hostid(void)
+uint32_t
+spa_get_hostid(spa_t *spa)
 {
-	unsigned long myhostid;
-
-#ifdef	_KERNEL
-	myhostid = zone_get_hostid(NULL);
-#else	/* _KERNEL */
-	/*
-	 * We're emulating the system's hostid in userland, so
-	 * we can't use zone_get_hostid().
-	 */
-	(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
-#endif	/* _KERNEL */
-
-	return (myhostid);
+	return (spa->spa_hostid);
 }
 
 boolean_t
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 0e157cf0e98e..ff98661ec795 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -657,7 +657,7 @@ tags = ['functional', 'mmap']
 tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval',
     'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import',
     'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history',
-    'mmp_on_zdb', 'mmp_write_distribution']
+    'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid']
 tags = ['functional', 'mmp']
 
 [tests/functional/mount]
diff --git a/tests/zfs-tests/tests/functional/mmp/Makefile.am b/tests/zfs-tests/tests/functional/mmp/Makefile.am
index e39a0a5aac8e..2848fd4ce692 100644
--- a/tests/zfs-tests/tests/functional/mmp/Makefile.am
+++ b/tests/zfs-tests/tests/functional/mmp/Makefile.am
@@ -12,6 +12,7 @@ dist_pkgdata_SCRIPTS = \
 	mmp_reset_interval.ksh \
 	mmp_on_zdb.ksh \
 	mmp_write_distribution.ksh \
+	mmp_hostid.ksh \
 	setup.ksh \
 	cleanup.ksh
 
diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_hostid.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_hostid.ksh
new file mode 100755
index 000000000000..b492b1070caf
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/mmp/mmp_hostid.ksh
@@ -0,0 +1,90 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
+#
+
+# DESCRIPTION:
+#	Verify the hostid file can reside on a ZFS dataset.
+#
+# STRATEGY:
+#	1. Create a non-redundant pool
+#	2. Create an 'etc' dataset containing a valid hostid file
+#	3. Create a file so the pool will have some contents
+#	4. Verify multihost cannot be enabled until the /etc/hostid is linked
+#	5. Verify vdevs may be attached and detached
+#	6. Verify normal, cache, log and special vdevs can be added
+#	7. Verify normal, cache, and log vdevs can be removed
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/mmp/mmp.cfg
+. $STF_SUITE/tests/functional/mmp/mmp.kshlib
+
+verify_runnable "both"
+
+function cleanup
+{
+	default_cleanup_noexit
+	log_must rm $MMP_DIR/file.{0,1,2,3,4,5}
+	log_must rmdir $MMP_DIR
+	log_must mmp_clear_hostid
+}
+
+log_assert "Verify hostid file can reside on a ZFS dataset"
+log_onexit cleanup
+
+log_must mkdir -p $MMP_DIR
+log_must truncate -s $MINVDEVSIZE $MMP_DIR/file.{0,1,2,3,4,5}
+
+# 1. Create a non-redundant pool
+log_must zpool create $MMP_POOL $MMP_DIR/file.0
+
+# 2. Create an 'etc' dataset containing a valid hostid file; caching is
+#    disabled on the dataset to force the hostid to be read from disk.
+log_must zfs create -o primarycache=none -o secondarycache=none $MMP_POOL/etc
+mntpnt_etc=$(get_prop mountpoint $MMP_POOL/etc)
+log_must mmp_set_hostid $HOSTID1
+log_must mv $HOSTID_FILE $mntpnt_etc/hostid
+
+# 3. Create a file so the pool will have some contents
+log_must zfs create $MMP_POOL/fs
+mntpnt_fs=$(get_prop mountpoint $MMP_POOL/fs)
+log_must mkfile 1M $mntpnt_fs/file
+
+# 4. Verify multihost cannot be enabled until the /etc/hostid is linked
+log_mustnot zpool set multihost=on $MMP_POOL
+log_must ln -s $mntpnt_etc/hostid $HOSTID_FILE
+log_must zpool set multihost=on $MMP_POOL
+
+# 5. Verify vdevs may be attached and detached
+log_must zpool attach $MMP_POOL $MMP_DIR/file.0 $MMP_DIR/file.1
+log_must zpool detach $MMP_POOL $MMP_DIR/file.1
+
+# 6. Verify normal, cache, log and special vdevs can be added
+log_must zpool add $MMP_POOL $MMP_DIR/file.1
+log_must zpool add $MMP_POOL $MMP_DIR/file.2
+log_must zpool add $MMP_POOL cache $MMP_DIR/file.3
+log_must zpool add $MMP_POOL log $MMP_DIR/file.4
+log_must zpool add $MMP_POOL special $MMP_DIR/file.5
+
+# 7. Verify normal, cache, and log vdevs can be removed
+log_must zpool remove $MMP_POOL $MMP_DIR/file.2
+log_must zpool remove $MMP_POOL $MMP_DIR/file.3
+log_must zpool remove $MMP_POOL $MMP_DIR/file.4
+
+log_pass "Verify hostid file can reside on a ZFS dataset."

From e17445d1f70600c22cd319765c0e403d5f9d5024 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Wed, 11 Sep 2019 11:14:50 -0700
Subject: [PATCH 104/109] kmodtool: depmod path

Determine the location of depmod on the system, either /sbin/depmod or
/usr/sbin/depmod.  Then use that path when generating the specfile.

Additionally, update the Requires lines to reference the package which
provides depmod rather than the binary itself.  For CentOS/RHEL 7+8
and all supported Fedora releases this is the kmod package, and for
CentOS/RHEL 6 it is the module-init-tools package.

Reviewed-by: Minh Diep <mdiep@whamcloud.com>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8724
Closes #9310
---
 scripts/kmodtool | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/scripts/kmodtool b/scripts/kmodtool
index a632dd046b5a..9298d6d27dfa 100755
--- a/scripts/kmodtool
+++ b/scripts/kmodtool
@@ -144,7 +144,13 @@ print_rpmtemplate_per_kmodpkg ()
 	local kernel_uname_r=${1}
 	local kernel_variant="${2:+-${2}}"
 
-    # first part
+	# Detect depmod install location
+	local depmod_path=/sbin/depmod
+	if [ ! -f ${depmod_path} ]; then
+		depmod_path=/usr/sbin/depmod
+	fi
+
+	# first part
 	cat <<EOF
 %package       -n kmod-${kmodname}-${kernel_uname_r}
 Summary:          ${kmodname} kernel module(s) for ${kernel_uname_r}
@@ -153,8 +159,14 @@ Provides:         kernel-modules-for-kernel = ${kernel_uname_r}
 Provides:         kmod-${kmodname}-uname-r = ${kernel_uname_r}
 Provides:         ${kmodname}-kmod = %{?epoch:%{epoch}:}%{version}-%{release}
 Requires:         ${kmodname}-kmod-common >= %{?epoch:%{epoch}:}%{version}
-Requires(post):   ${prefix}/sbin/depmod
-Requires(postun): ${prefix}/sbin/depmod
+
+%if 0%{?rhel} == 6 || 0%{?centos} == 6
+Requires(post):   module-init-tools
+Requires(postun): module-init-tools
+%else
+Requires(post):   kmod
+Requires(postun): kmod
+%endif
 EOF
 
 	if [[ ${obsolete_name} ]]; then
@@ -170,17 +182,17 @@ BuildRequires:	  kernel-devel-uname-r = ${kernel_uname_r}
 %{?KmodsRequires:Requires: %{KmodsRequires}-uname-r = ${kernel_uname_r}}
 %{?KmodsRequires:BuildRequires: %{KmodsRequires}-uname-r = ${kernel_uname_r}}
 %post          -n kmod-${kmodname}-${kernel_uname_r}
-${prefix}/sbin/depmod -aeF /boot/System.map-${kernel_uname_r} ${kernel_uname_r} > /dev/null || :
+${prefix}${depmod_path} -aeF /boot/System.map-${kernel_uname_r} ${kernel_uname_r} > /dev/null || :
 %postun        -n kmod-${kmodname}-${kernel_uname_r}
-${prefix}/sbin/depmod  -aF /boot/System.map-${kernel_uname_r} ${kernel_uname_r} &> /dev/null || :
+${prefix}${depmod_path} -aF /boot/System.map-${kernel_uname_r} ${kernel_uname_r} &> /dev/null || :
 
 EOF
 	else
 	  cat <<EOF
 %post          -n kmod-${kmodname}-${kernel_uname_r}
-[[ "\$(uname -r)" == "${kernel_uname_r}"  ]] && ${prefix}/sbin/depmod -a > /dev/null || :
+[[ "\$(uname -r)" == "${kernel_uname_r}"  ]] && ${prefix}${depmod_path} -a > /dev/null || :
 %postun        -n kmod-${kmodname}-${kernel_uname_r}
-[[ "\$(uname -r)" == "${kernel_uname_r}"  ]] && ${prefix}/sbin/depmod -a > /dev/null || :
+[[ "\$(uname -r)" == "${kernel_uname_r}"  ]] && ${prefix}${depmod_path} -a > /dev/null || :
 
 EOF
 	fi

From 9fa8b5b55b44f1d860b05587bff1dccd896cb77b Mon Sep 17 00:00:00 2001
From: Chengfei ZHu <chengfeix.zhu@intel.com>
Date: Fri, 13 Sep 2019 04:33:44 +0800
Subject: [PATCH 105/109] QAT related bug fixes

1. Fix issue:  Kernel BUG with QAT during decompression  #9276.
   Now it is uninterruptible for a specific given QAT request,
   but Ctrl-C interrupt still works in user-space process.

2. Copy the digest result to the buffer only when doing encryption,
   and vice versa for decryption.

Reviewed-by: Tom Caputi <tcaputi@datto.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Chengfei Zhu <chengfeix.zhu@intel.com>
Closes #9276
Closes #9303
---
 module/zfs/qat.c          |  2 +-
 module/zfs/qat.h          |  5 -----
 module/zfs/qat_compress.c | 14 +++-----------
 module/zfs/qat_crypt.c    | 29 ++++++++++++++---------------
 4 files changed, 18 insertions(+), 32 deletions(-)

diff --git a/module/zfs/qat.c b/module/zfs/qat.c
index a6f024cb44d7..08613b3a2042 100644
--- a/module/zfs/qat.c
+++ b/module/zfs/qat.c
@@ -21,7 +21,7 @@
 
 #if defined(_KERNEL) && defined(HAVE_QAT)
 #include <sys/zfs_context.h>
-#include "qat.h"
+#include <sys/qat.h>
 
 qat_stats_t qat_stats = {
 	{ "comp_requests",			KSTAT_DATA_UINT64 },
diff --git a/module/zfs/qat.h b/module/zfs/qat.h
index 9014c03148ba..5c1cd15d09d6 100644
--- a/module/zfs/qat.h
+++ b/module/zfs/qat.h
@@ -40,11 +40,6 @@ typedef enum qat_encrypt_dir {
 #include "dc/cpa_dc.h"
 #include "lac/cpa_cy_sym.h"
 
-/*
- * Timeout - no response from hardware after 0.5 seconds
- */
-#define	QAT_TIMEOUT_MS		500
-
 /*
  * The minimal and maximal buffer size which are not restricted
  * in the QAT hardware, but with the input buffer size between 4KB
diff --git a/module/zfs/qat_compress.c b/module/zfs/qat_compress.c
index b3c8c1621675..46ccb997a3b7 100644
--- a/module/zfs/qat_compress.c
+++ b/module/zfs/qat_compress.c
@@ -28,7 +28,7 @@
 #include <sys/zfs_context.h>
 #include <sys/byteorder.h>
 #include <sys/zio.h>
-#include "qat.h"
+#include <sys/qat.h>
 
 /*
  * Max instances in a QAT device, each instance is a channel to submit
@@ -404,11 +404,7 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len,
 		}
 
 		/* we now wait until the completion of the operation. */
-		if (!wait_for_completion_interruptible_timeout(&complete,
-		    QAT_TIMEOUT_MS)) {
-			status = CPA_STATUS_FAIL;
-			goto fail;
-		}
+		wait_for_completion(&complete);
 
 		if (dc_results.status != CPA_STATUS_SUCCESS) {
 			status = CPA_STATUS_FAIL;
@@ -463,11 +459,7 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len,
 		}
 
 		/* we now wait until the completion of the operation. */
-		if (!wait_for_completion_interruptible_timeout(&complete,
-		    QAT_TIMEOUT_MS)) {
-			status = CPA_STATUS_FAIL;
-			goto fail;
-		}
+		wait_for_completion(&complete);
 
 		if (dc_results.status != CPA_STATUS_SUCCESS) {
 			status = CPA_STATUS_FAIL;
diff --git a/module/zfs/qat_crypt.c b/module/zfs/qat_crypt.c
index 2170366df142..1e77f143e3ec 100644
--- a/module/zfs/qat_crypt.c
+++ b/module/zfs/qat_crypt.c
@@ -36,7 +36,7 @@
 #include <sys/zio_crypt.h>
 #include "lac/cpa_cy_im.h"
 #include "lac/cpa_cy_common.h"
-#include "qat.h"
+#include <sys/qat.h>
 
 /*
  * Max instances in a QAT device, each instance is a channel to submit
@@ -415,6 +415,9 @@ qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf,
 	op_data.messageLenToCipherInBytes = enc_len;
 	op_data.ivLenInBytes = ZIO_DATA_IV_LEN;
 	bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN);
+	/* if dir is QAT_DECRYPT, copy digest_buf to pDigestResult */
+	if (dir == QAT_DECRYPT)
+		bcopy(digest_buf, op_data.pDigestResult, ZIO_DATA_MAC_LEN);
 
 	cb.verify_result = CPA_FALSE;
 	init_completion(&cb.complete);
@@ -423,23 +426,21 @@ qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf,
 	if (status != CPA_STATUS_SUCCESS)
 		goto fail;
 
-	if (!wait_for_completion_interruptible_timeout(&cb.complete,
-	    QAT_TIMEOUT_MS)) {
-		status = CPA_STATUS_FAIL;
-		goto fail;
-	}
+	/* we now wait until the completion of the operation. */
+	wait_for_completion(&cb.complete);
 
 	if (cb.verify_result == CPA_FALSE) {
 		status = CPA_STATUS_FAIL;
 		goto fail;
 	}
 
-	/* save digest result to digest_buf */
-	bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN);
-	if (dir == QAT_ENCRYPT)
+	if (dir == QAT_ENCRYPT) {
+		/* if dir is QAT_ENCRYPT, save pDigestResult to digest_buf */
+		bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN);
 		QAT_STAT_INCR(encrypt_total_out_bytes, enc_len);
-	else
+	} else {
 		QAT_STAT_INCR(decrypt_total_out_bytes, enc_len);
+	}
 
 fail:
 	if (status != CPA_STATUS_SUCCESS)
@@ -549,11 +550,9 @@ qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp)
 	if (status != CPA_STATUS_SUCCESS)
 		goto fail;
 
-	if (!wait_for_completion_interruptible_timeout(&cb.complete,
-	    QAT_TIMEOUT_MS)) {
-		status = CPA_STATUS_FAIL;
-		goto fail;
-	}
+	/* we now wait until the completion of the operation. */
+	wait_for_completion(&cb.complete);
+
 	if (cb.verify_result == CPA_FALSE) {
 		status = CPA_STATUS_FAIL;
 		goto fail;

From 63d8f57fe794dadc629c430470545b636665c1b6 Mon Sep 17 00:00:00 2001
From: loli10K <loli10K@users.noreply.github.com>
Date: Sat, 14 Sep 2019 03:09:59 +0200
Subject: [PATCH 106/109] Scrubbing root pools may deadlock on kernels without
 elevator_change() (#9321)

Originally the zfs_vdev_elevator module option was added as a
convenience so the requested elevator would be automatically set
on the underlying block devices. At the time this was simple
because the kernel provided an API function which did exactly this.

This API was then removed in the Linux 4.12 kernel, which prompted
us to add compatibility code to set the elevator via a usermodehelper.

Unfortunately changing the elevator via usermodehelper requires reading
some userland binaries, most notably modprobe(8) or sh(1), from a zfs
dataset on systems with root-on-zfs. This can deadlock the system if
used during the following call path because, if the data is not already
cached in the ARC, it may need to read directly from disk while holding
the spa config lock as a writer:

  zfs_ioc_pool_scan()
    -> spa_scan()
      -> dsl_scan()
        -> vdev_reopen()
          -> vdev_elevator_switch()
            -> call_usermodehelper()

While the usermodehelper waits for sh(1), modprobe(8) is blocked in the
ZIO pipeline trying to read from disk:

  INFO: task modprobe:2650 blocked for more than 10 seconds.
       Tainted: P           OE     5.2.14
  modprobe        D    0  2650    206 0x00000000
  Call Trace:
   ? __schedule+0x244/0x5f0
   schedule+0x2f/0xa0
   cv_wait_common+0x156/0x290 [spl]
   ? do_wait_intr_irq+0xb0/0xb0
   spa_config_enter+0x13b/0x1e0 [zfs]
   zio_vdev_io_start+0x51d/0x590 [zfs]
   ? tsd_get_by_thread+0x3b/0x80 [spl]
   zio_nowait+0x142/0x2f0 [zfs]
   arc_read+0xb2d/0x19d0 [zfs]
   ...
   zpl_iter_read+0xfa/0x170 [zfs]
   new_sync_read+0x124/0x1b0
   vfs_read+0x91/0x140
   ksys_read+0x59/0xd0
   do_syscall_64+0x4f/0x130
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

This commit changes how we use the usermodehelper functionality from
synchronous (UMH_WAIT_PROC) to asynchronous (UMH_NO_WAIT) which prevents
scrubs, and other vdev_elevator_switch() consumers, from triggering the
aforementioned issue.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
Issue #8664
Closes #9321
---
 module/zfs/vdev_disk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 1686ddfce77d..46437f21fb78 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -220,7 +220,7 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
 	char *envp[] = { NULL };
 
 	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
-	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	error = call_usermodehelper(argv[0], argv, envp, UMH_NO_WAIT);
 	strfree(argv[2]);
 #endif /* HAVE_ELEVATOR_CHANGE */
 	if (error) {

From 12a78fbb4fcbba6c4c8d9b0aa34d23e33107b0ae Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Wed, 18 Sep 2019 19:04:45 +0300
Subject: [PATCH 107/109] Fix dsl_scan_ds_clone_swapped logic

The logic was incorrect with respect to swapping dataset IDs both in the
on-disk ZAP object and the in-memory queue.

In both cases, if ds1 was already present, then it would be first
replaced with ds2 and then ds2 would be replaced back with ds1.
Also, both cases did not properly handle a situation where both ds1 and
ds2 are already queued.  A duplicate insertion would be attempted and
its failure would result in a panic.

Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tom Caputi <tcaputi@datto.com>
Signed-off-by: Andriy Gapon <avg@FreeBSD.org>
Closes #9140
Closes #9163
---
 module/zfs/dsl_scan.c | 100 +++++++++++++++++++++++++++++-------------
 1 file changed, 69 insertions(+), 31 deletions(-)

diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 04a439fad5c5..9ccb17b7e141 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -2165,16 +2165,17 @@ ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2,
 }
 
 /*
- * Called when a parent dataset and its clone are swapped. If we were
+ * Called when an origin dataset and its clone are swapped.  If we were
  * currently traversing the dataset, we need to switch to traversing the
- * newly promoted parent.
+ * newly promoted clone.
  */
 void
 dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
-	uint64_t mintxg;
+	uint64_t mintxg1, mintxg2;
+	boolean_t ds1_queued, ds2_queued;
 
 	if (!dsl_scan_is_running(scn))
 		return;
@@ -2182,44 +2183,81 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
 	ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark);
 	ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark);
 
-	if (scan_ds_queue_contains(scn, ds1->ds_object, &mintxg)) {
-		scan_ds_queue_remove(scn, ds1->ds_object);
-		scan_ds_queue_insert(scn, ds2->ds_object, mintxg);
+	/*
+	 * Handle the in-memory scan queue.
+	 */
+	ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1);
+	ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2);
+
+	/* Sanity checking. */
+	if (ds1_queued) {
+		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+	}
+	if (ds2_queued) {
+		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 	}
-	if (scan_ds_queue_contains(scn, ds2->ds_object, &mintxg)) {
+
+	if (ds1_queued && ds2_queued) {
+		/*
+		 * If both are queued, we don't need to do anything.
+		 * The swapping code below would not handle this case correctly,
+		 * since we can't insert ds2 if it is already there. That's
+		 * because scan_ds_queue_insert() prohibits a duplicate insert
+		 * and panics.
+		 */
+	} else if (ds1_queued) {
+		scan_ds_queue_remove(scn, ds1->ds_object);
+		scan_ds_queue_insert(scn, ds2->ds_object, mintxg1);
+	} else if (ds2_queued) {
 		scan_ds_queue_remove(scn, ds2->ds_object);
-		scan_ds_queue_insert(scn, ds1->ds_object, mintxg);
+		scan_ds_queue_insert(scn, ds1->ds_object, mintxg2);
 	}
 
-	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
-	    ds1->ds_object, &mintxg) == 0) {
-		int err;
-		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
-		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
-		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+	/*
+	 * Handle the on-disk scan queue.
+	 * The on-disk state is an out-of-date version of the in-memory state,
+	 * so the in-memory and on-disk values for ds1_queued and ds2_queued may
+	 * be different. Therefore we need to apply the swap logic to the
+	 * on-disk state independently of the in-memory state.
+	 */
+	ds1_queued = zap_lookup_int_key(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0;
+	ds2_queued = zap_lookup_int_key(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0;
+
+	/* Sanity checking. */
+	if (ds1_queued) {
+		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+	}
+	if (ds2_queued) {
+		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+	}
+
+	if (ds1_queued && ds2_queued) {
+		/*
+		 * If both are queued, we don't need to do anything.
+		 * Alternatively, we could check for EEXIST from
+		 * zap_add_int_key() and back out to the original state, but
+		 * that would be more work than checking for this case upfront.
+		 */
+	} else if (ds1_queued) {
+		VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
-		err = zap_add_int_key(dp->dp_meta_objset,
-		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
-		VERIFY(err == 0 || err == EEXIST);
-		if (err == EEXIST) {
-			/* Both were there to begin with */
-			VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
-			    scn->scn_phys.scn_queue_obj,
-			    ds1->ds_object, mintxg, tx));
-		}
+		VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx));
 		zfs_dbgmsg("clone_swap ds %llu; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds1->ds_object,
 		    (u_longlong_t)ds2->ds_object);
-	}
-	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
-	    ds2->ds_object, &mintxg) == 0) {
-		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
-		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
-		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+	} else if (ds2_queued) {
+		VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
-		VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
-		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
+		VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx));
 		zfs_dbgmsg("clone_swap ds %llu; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds2->ds_object,

From c37fa0d5a86c1ce646fcceecfbb069d3dc1dc36d Mon Sep 17 00:00:00 2001
From: Kody A Kantor <kody.kantor@gmail.com>
Date: Sun, 22 Sep 2019 17:25:39 -0500
Subject: [PATCH 108/109] Disabled resilver_defer feature leads to looping
 resilvers

When a disk is replaced with another on a pool with the resilver_defer
feature present, but not enabled, the resilver activity restarts during
each spa_sync. This patch checks to make sure that the resilver_defer
feature is first enabled before requesting a deferred resilver.

This was originally fixed in illumos-joyent as OS-7982.

Reviewed-by: Chris Dunlop <chris@onthe.net.au>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tom Caputi <tcaputi@datto.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Signed-off-by: Kody A Kantor <kody@kkantor.com>
External-issue: illumos-joyent OS-7982
Closes #9299
Closes #9338
---
 module/zfs/dsl_scan.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 9ccb17b7e141..202c6e8d8f3f 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -23,7 +23,7 @@
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  * Copyright 2016 Gary Mills
  * Copyright (c) 2017 Datto Inc.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
  */
 
 #include <sys/dsl_scan.h>
@@ -952,13 +952,16 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 		 * will find the drives that need to be resilvered
 		 * when the machine reboots and start the resilver then.
 		 */
-		boolean_t resilver_needed =
-		    dsl_scan_clear_deferred(spa->spa_root_vdev, tx);
-		if (resilver_needed) {
-			spa_history_log_internal(spa,
-			    "starting deferred resilver", tx,
-			    "errors=%llu", spa_get_errlog_size(spa));
-			spa_async_request(spa, SPA_ASYNC_RESILVER);
+		if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
+			boolean_t resilver_needed =
+			    dsl_scan_clear_deferred(spa->spa_root_vdev, tx);
+			if (resilver_needed) {
+				spa_history_log_internal(spa,
+				    "starting deferred resilver", tx,
+				    "errors=%llu",
+				    (u_longlong_t)spa_get_errlog_size(spa));
+				spa_async_request(spa, SPA_ASYNC_RESILVER);
+			}
 		}
 	}
 

From 1222e921c9e3d8f5c693f196435be4604a1187c0 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Fri, 23 Aug 2019 15:52:32 -0700
Subject: [PATCH 109/109] Tag zfs-0.8.2

META file and changelog updated.

Signed-off-by: Tony Hutter <hutter2@llnl.gov>
---
 META | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/META b/META
index d9285b7732e4..960a2b73ab30 100644
--- a/META
+++ b/META
@@ -1,10 +1,10 @@
 Meta:          1
 Name:          zfs
 Branch:        1.0
-Version:       0.8.1
+Version:       0.8.2
 Release:       1
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS on Linux
-Linux-Maximum: 5.1
+Linux-Maximum: 5.3
 Linux-Minimum: 2.6.32