This repository has been archived by the owner on Feb 26, 2020. It is now read-only.

Kmem rework #369

Closed
wants to merge 8 commits
1 change: 1 addition & 0 deletions include/sys/Makefile.am
@@ -94,6 +94,7 @@ KERNEL_H = \
$(top_srcdir)/include/sys/varargs.h \
$(top_srcdir)/include/sys/vfs.h \
$(top_srcdir)/include/sys/vfs_opreg.h \
$(top_srcdir)/include/sys/vmem.h \
$(top_srcdir)/include/sys/vmsystm.h \
$(top_srcdir)/include/sys/vnode.h \
$(top_srcdir)/include/sys/zmod.h \
258 changes: 74 additions & 184 deletions include/sys/kmem.h
@@ -35,164 +35,87 @@
#include <linux/ctype.h>
#include <asm/atomic.h>
#include <sys/types.h>
#include <sys/vmem.h>
#include <sys/vmsystm.h>
#include <sys/kstat.h>
#include <sys/taskq.h>

/*
* Memory allocation interfaces
*/
#define KM_SLEEP GFP_KERNEL /* Can sleep, never fails */
#define KM_NOSLEEP GFP_ATOMIC /* Can not sleep, may fail */
#define KM_PUSHPAGE (GFP_NOIO | __GFP_HIGH) /* Use reserved memory */
#define KM_NODEBUG __GFP_NOWARN /* Suppress warnings */
#define KM_FLAGS __GFP_BITS_MASK
#define KM_VMFLAGS GFP_LEVEL_MASK
#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */
#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */
#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */
#define KM_ZERO 0x1000 /* zero the allocation */
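
/*
 * Illustrative usage (sketch, not part of this change): KM_SLEEP callers
 * may block and can assume the allocation succeeds, KM_NOSLEEP callers
 * must check for NULL, and KM_PUSHPAGE is reserved for writeback and
 * transaction paths which must not recurse into the filesystem.
 *
 *	buf = kmem_zalloc(size, KM_SLEEP);	never returns NULL
 *
 *	buf = kmem_alloc(size, KM_NOSLEEP);	may return NULL
 *	if (buf == NULL)
 *		return (ENOMEM);
 */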

/*
* Used internally, the kernel does not need to support this flag
*/
#ifndef __GFP_ZERO
# define __GFP_ZERO 0x8000
#endif

/*
* PF_NOFS is a per-process debug flag which is set in current->flags to
* detect when a process is performing an unsafe allocation. All tasks
* with PF_NOFS set must strictly use KM_PUSHPAGE for allocations because
if they enter direct reclaim and initiate I/O they may deadlock.
*
* When debugging is disabled, any incorrect usage will be detected and
* a call stack with warning will be printed to the console. The flags
* will then be automatically corrected to allow for safe execution. If
* debugging is enabled this will be treated as a fatal condition.
*
* To avoid any risk of conflicting with the existing PF_ flags, the
* PF_NOFS bit shadows the rarely used PF_MUTEX_TESTER bit. Only when
* CONFIG_RT_MUTEX_TESTER is not set, and we know this bit is unused,
* will the PF_NOFS bit be valid. Happily, most existing distributions
* ship a kernel with CONFIG_RT_MUTEX_TESTER disabled.
*/
#if !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER)
# define PF_NOFS PF_MUTEX_TESTER
#define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE)

static inline void
sanitize_flags(struct task_struct *p, gfp_t *flags)
{
if (unlikely((p->flags & PF_NOFS) && (*flags & (__GFP_IO|__GFP_FS)))) {
# ifdef NDEBUG
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "Fixing allocation for "
"task %s (%d) which used GFP flags 0x%x with PF_NOFS set\n",
p->comm, p->pid, *flags);
spl_debug_dumpstack(p);
*flags &= ~(__GFP_IO|__GFP_FS);
# else
PANIC("FATAL allocation for task %s (%d) which used GFP "
flags 0x%x with PF_NOFS set\n", p->comm, p->pid, *flags);
# endif /* NDEBUG */
}
}
#else
# define PF_NOFS 0x00000000
# define sanitize_flags(p, fl) ((void)0)
#endif /* !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER) */
/* XXX: Modify the code to stop using these */
#define KM_NODEBUG 0x0

/*
* __GFP_NOFAIL looks like it will be removed from the kernel perhaps as
* early as 2.6.32. To avoid this issue when it occurs in upstream kernels
we retry the allocation here as long as __GFP_WAIT is set (i.e. not GFP_ATOMIC).
* I would prefer the caller handle the failure case cleanly but we are
* trying to emulate Solaris and those are not the Solaris semantics.
* We use a special process flag to avoid recursive callbacks into
* the filesystem during transactions. We will also issue our own
* warnings, so we explicitly skip any generic ones (silly of us).
*/
static inline void *
kmalloc_nofail(size_t size, gfp_t flags)
static inline gfp_t
kmem_flags_convert(int flags)
{
void *ptr;
gfp_t lflags = __GFP_NOWARN;

if (flags & KM_NOSLEEP) {
lflags |= GFP_ATOMIC | __GFP_NORETRY;
} else {
lflags |= GFP_KERNEL;
if ((current->flags & PF_FSTRANS))
lflags &= ~(__GFP_IO|__GFP_FS);
}

sanitize_flags(current, &flags);
if (flags & KM_PUSHPAGE)
lflags |= __GFP_HIGH;

do {
ptr = kmalloc(size, flags);
} while (ptr == NULL && (flags & __GFP_WAIT));
if (flags & KM_ZERO)
lflags |= __GFP_ZERO;

return ptr;
return (lflags);
}
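
/*
 * Worked example (illustrative, derived from kmem_flags_convert() above,
 * assuming PF_FSTRANS is clear in current->flags):
 *
 *	kmem_flags_convert(KM_SLEEP)
 *	    == __GFP_NOWARN | GFP_KERNEL
 *	kmem_flags_convert(KM_NOSLEEP)
 *	    == __GFP_NOWARN | GFP_ATOMIC | __GFP_NORETRY
 *	kmem_flags_convert(KM_PUSHPAGE | KM_ZERO)
 *	    == __GFP_NOWARN | GFP_KERNEL | __GFP_HIGH | __GFP_ZERO
 *
 * When PF_FSTRANS is set the sleeping variants additionally drop
 * __GFP_IO and __GFP_FS so reclaim cannot re-enter the filesystem.
 */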

static inline void *
kzalloc_nofail(size_t size, gfp_t flags)
{
void *ptr;

sanitize_flags(current, &flags);

do {
ptr = kzalloc(size, flags);
} while (ptr == NULL && (flags & __GFP_WAIT));

return ptr;
}
typedef struct {
struct task_struct *fstrans_thread;
unsigned int saved_flags;
} fstrans_cookie_t;

static inline void *
kmalloc_node_nofail(size_t size, gfp_t flags, int node)
static inline fstrans_cookie_t
spl_fstrans_mark(void)
{
void *ptr;
fstrans_cookie_t cookie;

sanitize_flags(current, &flags);
VERIFY(cookie.fstrans_thread = current);

do {
ptr = kmalloc_node(size, flags, node);
} while (ptr == NULL && (flags & __GFP_WAIT));
cookie.saved_flags = current->flags & PF_FSTRANS;
current->flags |= PF_FSTRANS;

return ptr;
return (cookie);
}

static inline void *
vmalloc_nofail(size_t size, gfp_t flags)
static inline void
spl_fstrans_unmark(fstrans_cookie_t cookie)
{
void *ptr;

sanitize_flags(current, &flags);

/*
* Retry failed __vmalloc() allocations once every second. The
* rationale for the delay is that the likely failure modes are:
*
* 1) The system has completely exhausted memory, in which case
* delaying 1 second for the memory reclaim to run is reasonable
* to avoid thrashing the system.
* 2) The system has memory but has exhausted the small virtual
* address space available on 32-bit systems. Retrying the
* allocation immediately will only result in spinning on the
* virtual address space lock. It is better to delay a second and
* hope that another process will free some of the address space.
* But the bottom line is there is not much we can actually do
* since we can never safely return a failure and honor the
* Solaris semantics.
*/
while (1) {
ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) {
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ);
} else {
break;
}
}
VERIFY(cookie.fstrans_thread == current);
VERIFY(current->flags & PF_FSTRANS);

return ptr;
current->flags &= ~(PF_FSTRANS);
current->flags |= cookie.saved_flags;
}
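
/*
 * Typical usage (illustrative sketch, not part of this change): bracket
 * any region which must not recurse into the filesystem through direct
 * reclaim, such as code running inside a transaction.
 *
 *	fstrans_cookie_t cookie;
 *
 *	cookie = spl_fstrans_mark();
 *	... allocations made here have __GFP_IO and __GFP_FS stripped
 *	... by kmem_flags_convert() because PF_FSTRANS is now set
 *	spl_fstrans_unmark(cookie);
 */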

static inline void *
vzalloc_nofail(size_t size, gfp_t flags)
{
void *ptr;

ptr = vmalloc_nofail(size, flags);
if (ptr)
memset(ptr, 0, (size));
/*
* This is a version of vmalloc() that hooks into PF_FSTRANS.
*/
extern void *spl_vmalloc(unsigned long size, gfp_t lflags, pgprot_t prot);

return ptr;
}
extern void *spl_kmem_alloc(size_t size, int flags);
extern void *spl_kmem_zalloc(size_t size, int flags);
extern void spl_kmem_free(const void *buf, size_t size);
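
/*
 * Illustrative usage of the new entry points (sketch only): callers free
 * exactly the size they allocated, and spl_vmalloc() takes GFP flags plus
 * a page protection, mirroring the three argument __vmalloc().
 *
 *	buf = spl_kmem_zalloc(size, KM_SLEEP);
 *	...
 *	spl_kmem_free(buf, size);
 *
 *	big = spl_vmalloc(size, kmem_flags_convert(KM_SLEEP), PAGE_KERNEL);
 */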

#ifdef DEBUG_KMEM

@@ -205,31 +128,19 @@ vzalloc_nofail(size_t size, gfp_t flags)
# define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used)
# define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used)
# define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size)
# define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used)
# define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used)
# define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used)
# define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size)

extern atomic64_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max;
extern atomic64_t vmem_alloc_used;
extern unsigned long long vmem_alloc_max;

# else /* HAVE_ATOMIC64_T */

# define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used)
# define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used)
# define kmem_alloc_used_read() atomic_read(&kmem_alloc_used)
# define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size)
# define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used)
# define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used)
# define vmem_alloc_used_read() atomic_read(&vmem_alloc_used)
# define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size)

extern atomic_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max;
extern atomic_t vmem_alloc_used;
extern unsigned long long vmem_alloc_max;

# endif /* HAVE_ATOMIC64_T */
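
/*
 * Sketch of how these counters are typically driven (an assumption about
 * the implementation in the .c file, shown only to clarify the macros;
 * spl_kmem_alloc_impl is a hypothetical helper name):
 *
 *	ptr = spl_kmem_alloc_impl(size, flags, node);
 *	if (ptr != NULL) {
 *		kmem_alloc_used_add(size);
 *		if (kmem_alloc_used_read() > kmem_alloc_max)
 *			kmem_alloc_max = kmem_alloc_used_read();
 *	}
 *
 * and kmem_alloc_used_sub(size) is called from kmem_free().
 */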

@@ -246,23 +157,15 @@ extern unsigned long long vmem_alloc_max;
* --enable-debug-kmem-tracking to configure.
*/
# define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \
__FUNCTION__, __LINE__, 0, 0)
# define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
__FUNCTION__, __LINE__, 0, 0)
# define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \
__FUNCTION__, __LINE__, 1, nd)
__FUNCTION__, __LINE__, \
NUMA_NO_NODE)
# define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|KM_ZERO,\
__FUNCTION__, __LINE__, \
NUMA_NO_NODE)
# define kmem_free(ptr, sz) kmem_free_track((ptr), (sz))

# define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \
__FUNCTION__, __LINE__)
# define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
__FUNCTION__, __LINE__)
# define vmem_free(ptr, sz) vmem_free_track((ptr), (sz))

extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
extern void *kmem_alloc_track(size_t, int, const char *, int, int);
extern void kmem_free_track(const void *, size_t);
extern void *vmem_alloc_track(size_t, int, const char *, int);
extern void vmem_free_track(const void *, size_t);

# else /* DEBUG_KMEM_TRACKING */
/*
@@ -275,23 +178,15 @@ extern void vmem_free_track(const void *, size_t);
* pass the --disable-debug-kmem option to configure.
*/
# define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \
__FUNCTION__, __LINE__, 0, 0)
# define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
__FUNCTION__, __LINE__, 0, 0)
# define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \
__FUNCTION__, __LINE__, 1, nd)
__FUNCTION__, __LINE__, \
NUMA_NO_NODE)
# define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|KM_ZERO,\
__FUNCTION__, __LINE__, \
NUMA_NO_NODE)
# define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz))

# define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \
__FUNCTION__, __LINE__)
# define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
__FUNCTION__, __LINE__)
# define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz))

extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
extern void *kmem_alloc_debug(size_t, int, const char *, int, int);
extern void kmem_free_debug(const void *, size_t);
extern void *vmem_alloc_debug(size_t, int, const char *, int);
extern void vmem_free_debug(const void *, size_t);

# endif /* DEBUG_KMEM_TRACKING */
#else /* DEBUG_KMEM */
@@ -302,14 +197,9 @@ extern void vmem_free_debug(const void *, size_t);
* minimal memory accounting. To enable basic accounting pass the
* --enable-debug-kmem option to configure.
*/
# define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl))
# define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl))
# define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd))
# define kmem_free(ptr, sz) ((void)(sz), kfree(ptr))

# define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl))
# define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl))
# define vmem_free(ptr, sz) ((void)(sz), vfree(ptr))
# define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl))
# define kmem_zalloc(sz, fl) spl_kmem_zalloc((sz), (fl))
# define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz))

#endif /* DEBUG_KMEM */

@@ -340,12 +230,13 @@ enum {
KMC_BIT_OFFSLAB = 8, /* Objects not on slab */
KMC_BIT_NOEMERGENCY = 9, /* Disable emergency objects */
KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */
KMC_BIT_GROWING = 15, /* Growing in progress */
KMC_BIT_REAPING = 16, /* Reaping in progress */
KMC_BIT_DESTROY = 17, /* Destroy in progress */
KMC_BIT_TOTAL = 18, /* Proc handler helper bit */
KMC_BIT_ALLOC = 19, /* Proc handler helper bit */
KMC_BIT_MAX = 20, /* Proc handler helper bit */
KMC_BIT_GROWING = 15, /* Growing in progress for KM_SLEEP */
KMC_BIT_GROWING_HIGH = 16, /* Growing in progress */
KMC_BIT_REAPING = 17, /* Reaping in progress */
KMC_BIT_DESTROY = 18, /* Destroy in progress */
KMC_BIT_TOTAL = 19, /* Proc handler helper bit */
KMC_BIT_ALLOC = 20, /* Proc handler helper bit */
KMC_BIT_MAX = 21, /* Proc handler helper bit */
};
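
/*
 * Illustrative sketch (not part of this change): skc_flags is manipulated
 * with the standard Linux atomic bitops, and the KMC_* masks below are the
 * corresponding values for reading the word as a whole.
 *
 *	set_bit(KMC_BIT_GROWING, &skc->skc_flags);
 *	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
 *		return;
 *	clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
 */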

/* kmem move callback return values */
Expand All @@ -369,6 +260,7 @@ typedef enum kmem_cbrc {
#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY)
#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED)
#define KMC_GROWING (1 << KMC_BIT_GROWING)
#define KMC_GROWING_HIGH (1 << KMC_BIT_GROWING_HIGH)
#define KMC_REAPING (1 << KMC_BIT_REAPING)
#define KMC_DESTROY (1 << KMC_BIT_DESTROY)
#define KMC_TOTAL (1 << KMC_BIT_TOTAL)
@@ -510,8 +402,6 @@ void spl_kmem_fini(void);
#define kmem_cache_reap_now(skc) \
spl_kmem_cache_reap_now(skc, skc->skc_reap)
#define kmem_reap() spl_kmem_reap()
#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \
((ptr) < (void *)VMALLOC_END))
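
/*
 * Note: the removed kmem_virt() macro tested whether a pointer falls in
 * the vmalloc address range; the kernel helper is_vmalloc_addr() from
 * <linux/mm.h> performs the equivalent check, e.g.:
 *
 *	if (is_vmalloc_addr(ptr))
 *		vfree(ptr);
 *	else
 *		kfree(ptr);
 */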

/*
* Allow custom slab allocation flags to be set for KMC_SLAB based caches.