Refactor generic memory allocation interfaces

This patch achieves the following goals: 1. It replaces the preprocessor kmem flag to gfp flag mapping with proper translation logic. This eliminates the potential for surprises that were previously possible where kmem flags were mapped to gfp flags. 2. It maps vmem_alloc() allocations to kmem_alloc() for allocations sized less than or equal to spl_kmem_alloc_max. This ensures that small allocations will not contend on a single global lock, large allocations can still be handled, and potentially limited virtual address space will not be squandered. This behavior is entirely different than under Illumos due to different memory management stratagies employed by the respective kernels. However, this functionally provides the sematics we require. 3. The --disable-debug-kmem, --enable-debug-kmem (default), and --enable-debug-kmem-tracking allocators have been unified in to a single spl_kmem_alloc_impl() allocation function. This was done to simplify the code and make it more maintainable. 4. Improve portability by exposing an implementation of the memory allocations functions that can be safely used in the same way they are used on Illumos. Specifically, callers may safely using KM_SLEEP in contexts which perform filesystem IO. This allows us to eliminate an entire class of Linux specific changes which were previously required to avoid deadlocking the system. This change will be largely transparent to existing callers by there are a few caveats: 1. Because the headers were refactored it extraneous includes removed callers may find they need to explicitly add additional #includes. In particular, kmem_cache.h must now be explicitly includes to access the SPL's kmem cache implementation. This behavior is different from Illumos but it was done to avoid always masking the Linux slab functions when kmem.h is included. 2. Callers, like Lustre, which made assumptions about the definitions of KM_SLEEP, KM_NOSLEEP, and KM_PUSHPAGE will need to be updated. Other callers such as ZFS which did not will not require changes. 3. KM_PUSHPAGE is no longer overloaded to imply GFP_NOIO. It retains its original meaning of allowing allocations to access reserved memory. KM_PUSHPAGE callers can be converted back to KM_SLEEP. 4. The KM_NODEBUG flags has been retired and the default warning threshold increased to 32k. 5. The kmem_virt() functions has been removed. For callers which need to distinguish between a physical and virtual address use is_vmalloc_addr().
openzfs · Dec 17, 2014 · b3db9cb · b3db9cb
1 parent 75356ba
commit b3db9cb
Show file tree

Hide file tree

Showing 10 changed files with 463 additions and 775 deletions.
diff --git a/include/sys/kmem.h b/include/sys/kmem.h
@@ -26,6 +26,7 @@
 #define	_SPL_KMEM_H
 
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 extern int kmem_debugging(void);
 extern char *kmem_vasprintf(const char *fmt, va_list ap);
@@ -36,68 +37,41 @@ extern void strfree(char *str);
 /*
  * Memory allocation interfaces
  */
-#define	KM_SLEEP	GFP_KERNEL	/* Can sleep, never fails */
-#define	KM_NOSLEEP	GFP_ATOMIC	/* Can not sleep, may fail */
-#define	KM_PUSHPAGE	(GFP_NOIO | __GFP_HIGH)	/* Use reserved memory */
-#define	KM_NODEBUG	__GFP_NOWARN	/* Suppress warnings */
-#define	KM_FLAGS	__GFP_BITS_MASK
-#define	KM_VMFLAGS	GFP_LEVEL_MASK
+#define	KM_SLEEP	0x0000	/* can block for memory; success guaranteed */
+#define	KM_NOSLEEP	0x0001	/* cannot block for memory; may fail */
+#define	KM_PUSHPAGE	0x0004	/* can block for memory; may use reserve */
+#define	KM_ZERO		0x1000	/* zero the allocation */
+#define	KM_VMEM		0x2000	/* caller is vmem_* wrapper */
 
-/*
- * Used internally, the kernel does not need to support this flag
- */
-#ifndef __GFP_ZERO
-#define	__GFP_ZERO	0x8000
-#endif
+#define	KM_PUBLIC_MASK	(KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE)
 
 /*
- * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as
- * early as 2.6.32.  To avoid this issue when it occurs in upstream kernels
- * we retry the allocation here as long as it is not __GFP_WAIT (GFP_ATOMIC).
- * I would prefer the caller handle the failure case cleanly but we are
- * trying to emulate Solaris and those are not the Solaris semantics.
+ * Convert a KM_* flags mask to its Linux GFP_* counterpart.  The conversion
+ * function is context aware which means that KM_SLEEP allocations can be
+ * safely used in syncing contexts which have set PF_FSTRANS.
  */
-static inline void *
-kmalloc_nofail(size_t size, gfp_t flags)
-{
-	void *ptr;
-
-	do {
-		ptr = kmalloc(size, flags);
-	} while (ptr == NULL && (flags & __GFP_WAIT));
-
-	return (ptr);
-}
-
-static inline void *
-kzalloc_nofail(size_t size, gfp_t flags)
+static inline gfp_t
+kmem_flags_convert(int flags)
 {
-	void *ptr;
-
-	do {
-		ptr = kzalloc(size, flags);
-	} while (ptr == NULL && (flags & __GFP_WAIT));
+	gfp_t lflags = __GFP_NOWARN | __GFP_COMP;
 
-	return (ptr);
-}
+	if (flags & KM_NOSLEEP) {
+		lflags |= GFP_ATOMIC | __GFP_NORETRY;
+	} else {
+		lflags |= GFP_KERNEL;
+		if ((current->flags & PF_FSTRANS))
+			lflags &= ~(__GFP_IO|__GFP_FS);
+	}
 
-static inline void *
-kmalloc_node_nofail(size_t size, gfp_t flags, int node)
-{
-	void *ptr;
+	if (flags & KM_PUSHPAGE)
+		lflags |= __GFP_HIGH;
 
-	do {
-		ptr = kmalloc_node(size, flags, node);
-	} while (ptr == NULL && (flags & __GFP_WAIT));
+	if (flags & KM_ZERO)
+		lflags |= __GFP_ZERO;
 
-	return (ptr);
+	return (lflags);
 }
 
-#ifdef DEBUG_KMEM
-
-/*
- * Memory accounting functions to be used only when DEBUG_KMEM is set.
- */
 #ifdef HAVE_ATOMIC64_T
 #define	kmem_alloc_used_add(size)	atomic64_add(size, &kmem_alloc_used)
 #define	kmem_alloc_used_sub(size)	atomic64_sub(size, &kmem_alloc_used)
@@ -114,70 +88,29 @@ extern atomic_t kmem_alloc_used;
 extern unsigned long long kmem_alloc_max;
 #endif /* HAVE_ATOMIC64_T */
 
-#ifdef DEBUG_KMEM_TRACKING
-/*
- * DEBUG_KMEM && DEBUG_KMEM_TRACKING
- *
- * The maximum level of memory debugging.  All memory will be accounted
- * for and each allocation will be explicitly tracked.  Any allocation
- * which is leaked will be reported on module unload and the exact location
- * where that memory was allocation will be reported.  This level of memory
- * tracking will have a significant impact on performance and should only
- * be enabled for debugging.  This feature may be enabled by passing
- * --enable-debug-kmem-tracking to configure.
- */
-#define	kmem_alloc(sz, fl)		kmem_alloc_track((sz), (fl),           \
-					__FUNCTION__, __LINE__, 0, 0)
-#define	kmem_zalloc(sz, fl)		kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
-					__FUNCTION__, __LINE__, 0, 0)
-#define	kmem_alloc_node(sz, fl, nd)	kmem_alloc_track((sz), (fl),           \
-					__FUNCTION__, __LINE__, 1, nd)
-#define	kmem_free(ptr, sz)		kmem_free_track((ptr), (sz))
-
-extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
-extern void kmem_free_track(const void *, size_t);
-
-#else /* DEBUG_KMEM_TRACKING */
-/*
- * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
- *
- * The default build will set DEBUG_KEM.  This provides basic memory
- * accounting with little to no impact on performance.  When the module
- * is unloaded in any memory was leaked the total number of leaked bytes
- * will be reported on the console.  To disable this basic accounting
- * pass the --disable-debug-kmem option to configure.
- */
-#define	kmem_alloc(sz, fl)		kmem_alloc_debug((sz), (fl),           \
-					__FUNCTION__, __LINE__, 0, 0)
-#define	kmem_zalloc(sz, fl)		kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
-					__FUNCTION__, __LINE__, 0, 0)
-#define	kmem_alloc_node(sz, fl, nd)	kmem_alloc_debug((sz), (fl),           \
-					__FUNCTION__, __LINE__, 1, nd)
-#define	kmem_free(ptr, sz)		kmem_free_debug((ptr), (sz))
-
-extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
-extern void kmem_free_debug(const void *, size_t);
-
-#endif /* DEBUG_KMEM_TRACKING */
-#else /* DEBUG_KMEM */
-/*
- * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
- *
- * All debugging is disabled.  There will be no overhead even for
- * minimal memory accounting.  To enable basic accounting pass the
- * --enable-debug-kmem option to configure.
- */
-#define	kmem_alloc(sz, fl)		kmalloc_nofail((sz), (fl))
-#define	kmem_zalloc(sz, fl)		kzalloc_nofail((sz), (fl))
-#define	kmem_alloc_node(sz, fl, nd)	kmalloc_node_nofail((sz), (fl), (nd))
-#define	kmem_free(ptr, sz)		((void)(sz), kfree(ptr))
+extern unsigned int spl_kmem_alloc_warn;
+extern unsigned int spl_kmem_alloc_max;
 
-#endif /* DEBUG_KMEM */
+#define	kmem_alloc(sz, fl)	spl_kmem_alloc((sz), (fl), __func__, __LINE__)
+#define	kmem_zalloc(sz, fl)	spl_kmem_zalloc((sz), (fl), __func__, __LINE__)
+#define	kmem_free(ptr, sz)	spl_kmem_free((ptr), (sz))
 
-int spl_kmem_init(void);
-void spl_kmem_fini(void);
+extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_kmem_free(const void *ptr, size_t sz);
 
-#define	kmem_virt(ptr)			(((ptr) >= (void *)VMALLOC_START) && \
-					((ptr) <  (void *)VMALLOC_END))
+/*
+ * The following functions are only available for internal use.
+ */
+extern void *spl_kmem_alloc_impl(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_debug(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_track(size_t size, int flags,
+    const char *func, int line, int node);
+extern void spl_kmem_free_impl(const void *buf, size_t size);
+extern void spl_kmem_free_debug(const void *buf, size_t size);
+extern void spl_kmem_free_track(const void *buf, size_t size);
+
+extern int spl_kmem_init(void);
+extern void spl_kmem_fini(void);
 
 #endif	/* _SPL_KMEM_H */
diff --git a/include/sys/kmem_cache.h b/include/sys/kmem_cache.h
@@ -202,6 +202,7 @@ extern void spl_kmem_cache_set_move(spl_kmem_cache_t *,
 extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);
 extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags);
 extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj);
+extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags);
 extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count);
 extern void spl_kmem_reap(void);
 
@@ -214,29 +215,6 @@ extern void spl_kmem_reap(void);
 #define	kmem_cache_reap_now(skc)	\
     spl_kmem_cache_reap_now(skc, skc->skc_reap)
 #define	kmem_reap()			spl_kmem_reap()
-#define	kmem_virt(ptr)			\
-    (((ptr) >= (void *)VMALLOC_START) && \
-    ((ptr) <  (void *)VMALLOC_END))
-
-/*
- * Allow custom slab allocation flags to be set for KMC_SLAB based caches.
- * One use for this function is to ensure the __GFP_COMP flag is part of
- * the default allocation mask which ensures higher order allocations are
- * properly refcounted.  This flag was added to the default ->allocflags
- * as of Linux 3.11.
- */
-static inline void
-kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags)
-{
-	if (skc->skc_linux_cache == NULL)
-		return;
-
-#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
-	skc->skc_linux_cache->allocflags |= flags;
-#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
-	skc->skc_linux_cache->gfpflags |= flags;
-#endif
-}
 
 /*
  * The following functions are only available for internal use.