From 80367a21fdbd031c2910e7657fe53da67e910a7f Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 17 Apr 2014 10:59:00 -0400 Subject: [PATCH 01/11] Revert "Add PF_NOFS debugging flag" This reverts commit eb0f407a2b9089113ef6f2402ebd887511315b43 in preperation for updating the kmem/vmem infrastructure to use the PF_FSTRANS flag. Signed-off-by: Brian Behlendorf --- include/sys/kmem.h | 49 ---------------------------------------------- 1 file changed, 49 deletions(-) diff --git a/include/sys/kmem.h b/include/sys/kmem.h index 936e49d6..2bfb4d07 100644 --- a/include/sys/kmem.h +++ b/include/sys/kmem.h @@ -56,47 +56,6 @@ # define __GFP_ZERO 0x8000 #endif -/* - * PF_NOFS is a per-process debug flag which is set in current->flags to - * detect when a process is performing an unsafe allocation. All tasks - * with PF_NOFS set must strictly use KM_PUSHPAGE for allocations because - * if they enter direct reclaim and initiate I/O the may deadlock. - * - * When debugging is disabled, any incorrect usage will be detected and - * a call stack with warning will be printed to the console. The flags - * will then be automatically corrected to allow for safe execution. If - * debugging is enabled this will be treated as a fatal condition. - * - * To avoid any risk of conflicting with the existing PF_ flags. The - * PF_NOFS bit shadows the rarely used PF_MUTEX_TESTER bit. Only when - * CONFIG_RT_MUTEX_TESTER is not set, and we know this bit is unused, - * will the PF_NOFS bit be valid. Happily, most existing distributions - * ship a kernel with CONFIG_RT_MUTEX_TESTER disabled. - */ -#if !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER) -#define PF_NOFS PF_MUTEX_TESTER - -static inline void -sanitize_flags(struct task_struct *p, gfp_t *flags) -{ - if (unlikely((p->flags & PF_NOFS) && (*flags & (__GFP_IO|__GFP_FS)))) { -#ifdef NDEBUG - printk(KERN_WARNING "Fixing allocation for task %s (%d) " - "which used GFP flags 0x%x with PF_NOFS set\n", - p->comm, p->pid, *flags); - spl_dumpstack(); - *flags &= ~(__GFP_IO|__GFP_FS); -#else - PANIC("FATAL allocation for task %s (%d) which used GFP " - "flags 0x%x with PF_NOFS set\n", p->comm, p->pid, *flags); -#endif /* NDEBUG */ - } -} -#else -#define PF_NOFS 0x00000000 -#define sanitize_flags(p, fl) ((void)0) -#endif /* !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER) */ - /* * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as * early as 2.6.32. To avoid this issue when it occurs in upstream kernels @@ -109,8 +68,6 @@ kmalloc_nofail(size_t size, gfp_t flags) { void *ptr; - sanitize_flags(current, &flags); - do { ptr = kmalloc(size, flags); } while (ptr == NULL && (flags & __GFP_WAIT)); @@ -123,8 +80,6 @@ kzalloc_nofail(size_t size, gfp_t flags) { void *ptr; - sanitize_flags(current, &flags); - do { ptr = kzalloc(size, flags); } while (ptr == NULL && (flags & __GFP_WAIT)); @@ -137,8 +92,6 @@ kmalloc_node_nofail(size_t size, gfp_t flags, int node) { void *ptr; - sanitize_flags(current, &flags); - do { ptr = kmalloc_node(size, flags, node); } while (ptr == NULL && (flags & __GFP_WAIT)); @@ -151,8 +104,6 @@ vmalloc_nofail(size_t size, gfp_t flags) { void *ptr; - sanitize_flags(current, &flags); - /* * Retry failed __vmalloc() allocations once every second. The * rational for the delay is that the likely failure modes are: From 128ec5ff65c6562b5f360299a5f1841c1efab68c Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 8 Dec 2014 13:04:42 -0500 Subject: [PATCH 02/11] Refactor existing code This change introduces no functional changes to the memory management interfaces. It only restructures the existing codes by separating the kmem, vmem, and kmem cache implementations in the separate source and header files. Splitting this functionality in to separate files required the addition of spl_vmem_{init,fini}() and spl_kmem_cache_{initi,fini}() functions. Additionally, several minor changes to the #include's were required to accommodate the removal of extraneous header from kmem.h. But again, while large this patch introduces no functional changes. Signed-off-by: Brian Behlendorf --- include/linux/proc_compat.h | 6 +- include/sys/Makefile.am | 2 + include/sys/kmem.h | 310 +----- include/sys/kmem_cache.h | 246 +++++ include/sys/types.h | 1 - include/sys/vmem.h | 183 ++++ include/sys/vmsystm.h | 13 - module/spl/Makefile.in | 2 + module/spl/spl-condvar.c | 1 + module/spl/spl-generic.c | 43 +- module/spl/spl-kmem-cache.c | 1648 ++++++++++++++++++++++++++++ module/spl/spl-kmem.c | 1932 ++------------------------------- module/spl/spl-kstat.c | 1 + module/spl/spl-proc.c | 5 + module/spl/spl-tsd.c | 1 + module/spl/spl-vmem.c | 355 ++++++ module/spl/spl-vnode.c | 1 + module/spl/spl-zlib.c | 1 + module/splat/splat-condvar.c | 3 +- module/splat/splat-internal.h | 1 + module/splat/splat-kmem.c | 3 + module/splat/splat-taskq.c | 2 + module/splat/splat-zlib.c | 1 + 23 files changed, 2576 insertions(+), 2185 deletions(-) create mode 100644 include/sys/kmem_cache.h create mode 100644 include/sys/vmem.h create mode 100644 module/spl/spl-kmem-cache.c create mode 100644 module/spl/spl-vmem.c diff --git a/include/linux/proc_compat.h b/include/linux/proc_compat.h index 5bbe8508..2c57f39d 100644 --- a/include/linux/proc_compat.h +++ b/include/linux/proc_compat.h @@ -22,8 +22,8 @@ * with the SPL. If not, see . \*****************************************************************************/ -#ifndef _SPL_PROC_H -#define _SPL_PROC_H +#ifndef _SPL_PROC_COMPAT_H +#define _SPL_PROC_COMPAT_H #include @@ -32,4 +32,4 @@ extern struct proc_dir_entry *proc_spl_kstat; int spl_proc_init(void); void spl_proc_fini(void); -#endif /* SPL_PROC_H */ +#endif /* SPL_PROC_COMPAT_H */ diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 2d21c572..f9e883fd 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -44,6 +44,7 @@ KERNEL_H = \ $(top_srcdir)/include/sys/isa_defs.h \ $(top_srcdir)/include/sys/kidmap.h \ $(top_srcdir)/include/sys/kmem.h \ + $(top_srcdir)/include/sys/kmem_cache.h \ $(top_srcdir)/include/sys/kobj.h \ $(top_srcdir)/include/sys/kstat.h \ $(top_srcdir)/include/sys/list.h \ @@ -94,6 +95,7 @@ KERNEL_H = \ $(top_srcdir)/include/sys/varargs.h \ $(top_srcdir)/include/sys/vfs.h \ $(top_srcdir)/include/sys/vfs_opreg.h \ + $(top_srcdir)/include/sys/vmem.h \ $(top_srcdir)/include/sys/vmsystm.h \ $(top_srcdir)/include/sys/vnode.h \ $(top_srcdir)/include/sys/zmod.h \ diff --git a/include/sys/kmem.h b/include/sys/kmem.h index 2bfb4d07..ee25e4c8 100644 --- a/include/sys/kmem.h +++ b/include/sys/kmem.h @@ -25,19 +25,13 @@ #ifndef _SPL_KMEM_H #define _SPL_KMEM_H -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include + +extern int kmem_debugging(void); +extern char *kmem_vasprintf(const char *fmt, va_list ap); +extern char *kmem_asprintf(const char *fmt, ...); +extern char *strdup(const char *str); +extern void strfree(char *str); /* * Memory allocation interfaces @@ -99,52 +93,6 @@ kmalloc_node_nofail(size_t size, gfp_t flags, int node) return ptr; } -static inline void * -vmalloc_nofail(size_t size, gfp_t flags) -{ - void *ptr; - - /* - * Retry failed __vmalloc() allocations once every second. The - * rational for the delay is that the likely failure modes are: - * - * 1) The system has completely exhausted memory, in which case - * delaying 1 second for the memory reclaim to run is reasonable - * to avoid thrashing the system. - * 2) The system has memory but has exhausted the small virtual - * address space available on 32-bit systems. Retrying the - * allocation immediately will only result in spinning on the - * virtual address space lock. It is better delay a second and - * hope that another process will free some of the address space. - * But the bottom line is there is not much we can actually do - * since we can never safely return a failure and honor the - * Solaris semantics. - */ - while (1) { - ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); - if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - } else { - break; - } - } - - return ptr; -} - -static inline void * -vzalloc_nofail(size_t size, gfp_t flags) -{ - void *ptr; - - ptr = vmalloc_nofail(size, flags); - if (ptr) - memset(ptr, 0, (size)); - - return ptr; -} - #ifdef DEBUG_KMEM /* @@ -156,15 +104,9 @@ vzalloc_nofail(size_t size, gfp_t flags) # define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) # define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used) # define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size) -# define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used) -# define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used) -# define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used) -# define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size) extern atomic64_t kmem_alloc_used; extern unsigned long long kmem_alloc_max; -extern atomic64_t vmem_alloc_used; -extern unsigned long long vmem_alloc_max; # else /* HAVE_ATOMIC64_T */ @@ -172,15 +114,9 @@ extern unsigned long long vmem_alloc_max; # define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used) # define kmem_alloc_used_read() atomic_read(&kmem_alloc_used) # define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size) -# define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used) -# define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used) -# define vmem_alloc_used_read() atomic_read(&vmem_alloc_used) -# define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size) extern atomic_t kmem_alloc_used; extern unsigned long long kmem_alloc_max; -extern atomic_t vmem_alloc_used; -extern unsigned long long vmem_alloc_max; # endif /* HAVE_ATOMIC64_T */ @@ -204,16 +140,8 @@ extern unsigned long long vmem_alloc_max; __FUNCTION__, __LINE__, 1, nd) # define kmem_free(ptr, sz) kmem_free_track((ptr), (sz)) -# define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__) -# define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__) -# define vmem_free(ptr, sz) vmem_free_track((ptr), (sz)) - extern void *kmem_alloc_track(size_t, int, const char *, int, int, int); extern void kmem_free_track(const void *, size_t); -extern void *vmem_alloc_track(size_t, int, const char *, int); -extern void vmem_free_track(const void *, size_t); # else /* DEBUG_KMEM_TRACKING */ /* @@ -233,16 +161,8 @@ extern void vmem_free_track(const void *, size_t); __FUNCTION__, __LINE__, 1, nd) # define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz)) -# define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__) -# define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__) -# define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz)) - extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int); extern void kmem_free_debug(const void *, size_t); -extern void *vmem_alloc_debug(size_t, int, const char *, int); -extern void vmem_free_debug(const void *, size_t); # endif /* DEBUG_KMEM_TRACKING */ #else /* DEBUG_KMEM */ @@ -258,230 +178,12 @@ extern void vmem_free_debug(const void *, size_t); # define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd)) # define kmem_free(ptr, sz) ((void)(sz), kfree(ptr)) -# define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl)) -# define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl)) -# define vmem_free(ptr, sz) ((void)(sz), vfree(ptr)) - #endif /* DEBUG_KMEM */ -extern int kmem_debugging(void); -extern char *kmem_vasprintf(const char *fmt, va_list ap); -extern char *kmem_asprintf(const char *fmt, ...); -extern char *strdup(const char *str); -extern void strfree(char *str); - - -/* - * Slab allocation interfaces. The SPL slab differs from the standard - * Linux SLAB or SLUB primarily in that each cache may be backed by slabs - * allocated from the physical or virtal memory address space. The virtual - * slabs allow for good behavior when allocation large objects of identical - * size. This slab implementation also supports both constructors and - * destructions which the Linux slab does not. - */ -enum { - KMC_BIT_NOTOUCH = 0, /* Don't update ages */ - KMC_BIT_NODEBUG = 1, /* Default behavior */ - KMC_BIT_NOMAGAZINE = 2, /* XXX: Unsupported */ - KMC_BIT_NOHASH = 3, /* XXX: Unsupported */ - KMC_BIT_QCACHE = 4, /* XXX: Unsupported */ - KMC_BIT_KMEM = 5, /* Use kmem cache */ - KMC_BIT_VMEM = 6, /* Use vmem cache */ - KMC_BIT_SLAB = 7, /* Use Linux slab cache */ - KMC_BIT_OFFSLAB = 8, /* Objects not on slab */ - KMC_BIT_NOEMERGENCY = 9, /* Disable emergency objects */ - KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */ - KMC_BIT_GROWING = 15, /* Growing in progress */ - KMC_BIT_REAPING = 16, /* Reaping in progress */ - KMC_BIT_DESTROY = 17, /* Destroy in progress */ - KMC_BIT_TOTAL = 18, /* Proc handler helper bit */ - KMC_BIT_ALLOC = 19, /* Proc handler helper bit */ - KMC_BIT_MAX = 20, /* Proc handler helper bit */ -}; - -/* kmem move callback return values */ -typedef enum kmem_cbrc { - KMEM_CBRC_YES = 0, /* Object moved */ - KMEM_CBRC_NO = 1, /* Object not moved */ - KMEM_CBRC_LATER = 2, /* Object not moved, try again later */ - KMEM_CBRC_DONT_NEED = 3, /* Neither object is needed */ - KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */ -} kmem_cbrc_t; - -#define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH) -#define KMC_NODEBUG (1 << KMC_BIT_NODEBUG) -#define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE) -#define KMC_NOHASH (1 << KMC_BIT_NOHASH) -#define KMC_QCACHE (1 << KMC_BIT_QCACHE) -#define KMC_KMEM (1 << KMC_BIT_KMEM) -#define KMC_VMEM (1 << KMC_BIT_VMEM) -#define KMC_SLAB (1 << KMC_BIT_SLAB) -#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB) -#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY) -#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED) -#define KMC_GROWING (1 << KMC_BIT_GROWING) -#define KMC_REAPING (1 << KMC_BIT_REAPING) -#define KMC_DESTROY (1 << KMC_BIT_DESTROY) -#define KMC_TOTAL (1 << KMC_BIT_TOTAL) -#define KMC_ALLOC (1 << KMC_BIT_ALLOC) -#define KMC_MAX (1 << KMC_BIT_MAX) - -#define KMC_REAP_CHUNK INT_MAX -#define KMC_DEFAULT_SEEKS 1 - -#define KMC_EXPIRE_AGE 0x1 /* Due to age */ -#define KMC_EXPIRE_MEM 0x2 /* Due to low memory */ - -#define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */ - -extern unsigned int spl_kmem_cache_expire; -extern struct list_head spl_kmem_cache_list; -extern struct rw_semaphore spl_kmem_cache_sem; - -#define SKM_MAGIC 0x2e2e2e2e -#define SKO_MAGIC 0x20202020 -#define SKS_MAGIC 0x22222222 -#define SKC_MAGIC 0x2c2c2c2c - -#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */ -#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */ -#define SPL_KMEM_CACHE_OBJ_PER_SLAB 16 /* Target objects per slab */ -#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 1 /* Minimum objects per slab */ -#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */ - -#define POINTER_IS_VALID(p) 0 /* Unimplemented */ -#define POINTER_INVALIDATE(pp) /* Unimplemented */ - -typedef int (*spl_kmem_ctor_t)(void *, void *, int); -typedef void (*spl_kmem_dtor_t)(void *, void *); -typedef void (*spl_kmem_reclaim_t)(void *); - -typedef struct spl_kmem_magazine { - uint32_t skm_magic; /* Sanity magic */ - uint32_t skm_avail; /* Available objects */ - uint32_t skm_size; /* Magazine size */ - uint32_t skm_refill; /* Batch refill size */ - struct spl_kmem_cache *skm_cache; /* Owned by cache */ - unsigned long skm_age; /* Last cache access */ - unsigned int skm_cpu; /* Owned by cpu */ - void *skm_objs[0]; /* Object pointers */ -} spl_kmem_magazine_t; - -typedef struct spl_kmem_obj { - uint32_t sko_magic; /* Sanity magic */ - void *sko_addr; /* Buffer address */ - struct spl_kmem_slab *sko_slab; /* Owned by slab */ - struct list_head sko_list; /* Free object list linkage */ -} spl_kmem_obj_t; - -typedef struct spl_kmem_slab { - uint32_t sks_magic; /* Sanity magic */ - uint32_t sks_objs; /* Objects per slab */ - struct spl_kmem_cache *sks_cache; /* Owned by cache */ - struct list_head sks_list; /* Slab list linkage */ - struct list_head sks_free_list; /* Free object list */ - unsigned long sks_age; /* Last modify jiffie */ - uint32_t sks_ref; /* Ref count used objects */ -} spl_kmem_slab_t; - -typedef struct spl_kmem_alloc { - struct spl_kmem_cache *ska_cache; /* Owned by cache */ - int ska_flags; /* Allocation flags */ - taskq_ent_t ska_tqe; /* Task queue entry */ -} spl_kmem_alloc_t; - -typedef struct spl_kmem_emergency { - struct rb_node ske_node; /* Emergency tree linkage */ - void *ske_obj; /* Buffer address */ -} spl_kmem_emergency_t; - -typedef struct spl_kmem_cache { - uint32_t skc_magic; /* Sanity magic */ - uint32_t skc_name_size; /* Name length */ - char *skc_name; /* Name string */ - spl_kmem_magazine_t *skc_mag[NR_CPUS]; /* Per-CPU warm cache */ - uint32_t skc_mag_size; /* Magazine size */ - uint32_t skc_mag_refill; /* Magazine refill count */ - spl_kmem_ctor_t skc_ctor; /* Constructor */ - spl_kmem_dtor_t skc_dtor; /* Destructor */ - spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */ - void *skc_private; /* Private data */ - void *skc_vmp; /* Unused */ - struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */ - unsigned long skc_flags; /* Flags */ - uint32_t skc_obj_size; /* Object size */ - uint32_t skc_obj_align; /* Object alignment */ - uint32_t skc_slab_objs; /* Objects per slab */ - uint32_t skc_slab_size; /* Slab size */ - uint32_t skc_delay; /* Slab reclaim interval */ - uint32_t skc_reap; /* Slab reclaim count */ - atomic_t skc_ref; /* Ref count callers */ - taskqid_t skc_taskqid; /* Slab reclaim task */ - struct list_head skc_list; /* List of caches linkage */ - struct list_head skc_complete_list;/* Completely alloc'ed */ - struct list_head skc_partial_list; /* Partially alloc'ed */ - struct rb_root skc_emergency_tree; /* Min sized objects */ - spinlock_t skc_lock; /* Cache lock */ - wait_queue_head_t skc_waitq; /* Allocation waiters */ - uint64_t skc_slab_fail; /* Slab alloc failures */ - uint64_t skc_slab_create;/* Slab creates */ - uint64_t skc_slab_destroy;/* Slab destroys */ - uint64_t skc_slab_total; /* Slab total current */ - uint64_t skc_slab_alloc; /* Slab alloc current */ - uint64_t skc_slab_max; /* Slab max historic */ - uint64_t skc_obj_total; /* Obj total current */ - uint64_t skc_obj_alloc; /* Obj alloc current */ - uint64_t skc_obj_max; /* Obj max historic */ - uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */ - uint64_t skc_obj_emergency; /* Obj emergency current */ - uint64_t skc_obj_emergency_max; /* Obj emergency max */ -} spl_kmem_cache_t; -#define kmem_cache_t spl_kmem_cache_t - -extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size, - size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, - spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags); -extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, - kmem_cbrc_t (*)(void *, void *, size_t, void *)); -extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); -extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); -extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); -extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count); -extern void spl_kmem_reap(void); - int spl_kmem_init(void); void spl_kmem_fini(void); -#define kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) \ - spl_kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) -#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) -#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) -#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) -#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) -#define kmem_cache_reap_now(skc) \ - spl_kmem_cache_reap_now(skc, skc->skc_reap) -#define kmem_reap() spl_kmem_reap() #define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \ ((ptr) < (void *)VMALLOC_END)) -/* - * Allow custom slab allocation flags to be set for KMC_SLAB based caches. - * One use for this function is to ensure the __GFP_COMP flag is part of - * the default allocation mask which ensures higher order allocations are - * properly refcounted. This flag was added to the default ->allocflags - * as of Linux 3.11. - */ -static inline void -kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags) -{ - if (skc->skc_linux_cache == NULL) - return; - -#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) - skc->skc_linux_cache->allocflags |= flags; -#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) - skc->skc_linux_cache->gfpflags |= flags; -#endif -} - #endif /* _SPL_KMEM_H */ diff --git a/include/sys/kmem_cache.h b/include/sys/kmem_cache.h new file mode 100644 index 00000000..654a2ea4 --- /dev/null +++ b/include/sys/kmem_cache.h @@ -0,0 +1,246 @@ +/*****************************************************************************\ + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . +\*****************************************************************************/ + +#ifndef _SPL_KMEM_CACHE_H +#define _SPL_KMEM_CACHE_H + +#include + +/* + * Slab allocation interfaces. The SPL slab differs from the standard + * Linux SLAB or SLUB primarily in that each cache may be backed by slabs + * allocated from the physical or virtal memory address space. The virtual + * slabs allow for good behavior when allocation large objects of identical + * size. This slab implementation also supports both constructors and + * destructions which the Linux slab does not. + */ +enum { + KMC_BIT_NOTOUCH = 0, /* Don't update ages */ + KMC_BIT_NODEBUG = 1, /* Default behavior */ + KMC_BIT_NOMAGAZINE = 2, /* XXX: Unsupported */ + KMC_BIT_NOHASH = 3, /* XXX: Unsupported */ + KMC_BIT_QCACHE = 4, /* XXX: Unsupported */ + KMC_BIT_KMEM = 5, /* Use kmem cache */ + KMC_BIT_VMEM = 6, /* Use vmem cache */ + KMC_BIT_SLAB = 7, /* Use Linux slab cache */ + KMC_BIT_OFFSLAB = 8, /* Objects not on slab */ + KMC_BIT_NOEMERGENCY = 9, /* Disable emergency objects */ + KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */ + KMC_BIT_GROWING = 15, /* Growing in progress */ + KMC_BIT_REAPING = 16, /* Reaping in progress */ + KMC_BIT_DESTROY = 17, /* Destroy in progress */ + KMC_BIT_TOTAL = 18, /* Proc handler helper bit */ + KMC_BIT_ALLOC = 19, /* Proc handler helper bit */ + KMC_BIT_MAX = 20, /* Proc handler helper bit */ +}; + +/* kmem move callback return values */ +typedef enum kmem_cbrc { + KMEM_CBRC_YES = 0, /* Object moved */ + KMEM_CBRC_NO = 1, /* Object not moved */ + KMEM_CBRC_LATER = 2, /* Object not moved, try again later */ + KMEM_CBRC_DONT_NEED = 3, /* Neither object is needed */ + KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */ +} kmem_cbrc_t; + +#define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH) +#define KMC_NODEBUG (1 << KMC_BIT_NODEBUG) +#define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE) +#define KMC_NOHASH (1 << KMC_BIT_NOHASH) +#define KMC_QCACHE (1 << KMC_BIT_QCACHE) +#define KMC_KMEM (1 << KMC_BIT_KMEM) +#define KMC_VMEM (1 << KMC_BIT_VMEM) +#define KMC_SLAB (1 << KMC_BIT_SLAB) +#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB) +#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY) +#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED) +#define KMC_GROWING (1 << KMC_BIT_GROWING) +#define KMC_REAPING (1 << KMC_BIT_REAPING) +#define KMC_DESTROY (1 << KMC_BIT_DESTROY) +#define KMC_TOTAL (1 << KMC_BIT_TOTAL) +#define KMC_ALLOC (1 << KMC_BIT_ALLOC) +#define KMC_MAX (1 << KMC_BIT_MAX) + +#define KMC_REAP_CHUNK INT_MAX +#define KMC_DEFAULT_SEEKS 1 + +#define KMC_EXPIRE_AGE 0x1 /* Due to age */ +#define KMC_EXPIRE_MEM 0x2 /* Due to low memory */ + +#define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */ + +extern unsigned int spl_kmem_cache_expire; +extern struct list_head spl_kmem_cache_list; +extern struct rw_semaphore spl_kmem_cache_sem; + +#define SKM_MAGIC 0x2e2e2e2e +#define SKO_MAGIC 0x20202020 +#define SKS_MAGIC 0x22222222 +#define SKC_MAGIC 0x2c2c2c2c + +#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */ +#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */ +#define SPL_KMEM_CACHE_OBJ_PER_SLAB 16 /* Target objects per slab */ +#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 1 /* Minimum objects per slab */ +#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */ + +#define POINTER_IS_VALID(p) 0 /* Unimplemented */ +#define POINTER_INVALIDATE(pp) /* Unimplemented */ + +typedef int (*spl_kmem_ctor_t)(void *, void *, int); +typedef void (*spl_kmem_dtor_t)(void *, void *); +typedef void (*spl_kmem_reclaim_t)(void *); + +typedef struct spl_kmem_magazine { + uint32_t skm_magic; /* Sanity magic */ + uint32_t skm_avail; /* Available objects */ + uint32_t skm_size; /* Magazine size */ + uint32_t skm_refill; /* Batch refill size */ + struct spl_kmem_cache *skm_cache; /* Owned by cache */ + unsigned long skm_age; /* Last cache access */ + unsigned int skm_cpu; /* Owned by cpu */ + void *skm_objs[0]; /* Object pointers */ +} spl_kmem_magazine_t; + +typedef struct spl_kmem_obj { + uint32_t sko_magic; /* Sanity magic */ + void *sko_addr; /* Buffer address */ + struct spl_kmem_slab *sko_slab; /* Owned by slab */ + struct list_head sko_list; /* Free object list linkage */ +} spl_kmem_obj_t; + +typedef struct spl_kmem_slab { + uint32_t sks_magic; /* Sanity magic */ + uint32_t sks_objs; /* Objects per slab */ + struct spl_kmem_cache *sks_cache; /* Owned by cache */ + struct list_head sks_list; /* Slab list linkage */ + struct list_head sks_free_list; /* Free object list */ + unsigned long sks_age; /* Last modify jiffie */ + uint32_t sks_ref; /* Ref count used objects */ +} spl_kmem_slab_t; + +typedef struct spl_kmem_alloc { + struct spl_kmem_cache *ska_cache; /* Owned by cache */ + int ska_flags; /* Allocation flags */ + taskq_ent_t ska_tqe; /* Task queue entry */ +} spl_kmem_alloc_t; + +typedef struct spl_kmem_emergency { + struct rb_node ske_node; /* Emergency tree linkage */ + void *ske_obj; /* Buffer address */ +} spl_kmem_emergency_t; + +typedef struct spl_kmem_cache { + uint32_t skc_magic; /* Sanity magic */ + uint32_t skc_name_size; /* Name length */ + char *skc_name; /* Name string */ + spl_kmem_magazine_t *skc_mag[NR_CPUS]; /* Per-CPU warm cache */ + uint32_t skc_mag_size; /* Magazine size */ + uint32_t skc_mag_refill; /* Magazine refill count */ + spl_kmem_ctor_t skc_ctor; /* Constructor */ + spl_kmem_dtor_t skc_dtor; /* Destructor */ + spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */ + void *skc_private; /* Private data */ + void *skc_vmp; /* Unused */ + struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */ + unsigned long skc_flags; /* Flags */ + uint32_t skc_obj_size; /* Object size */ + uint32_t skc_obj_align; /* Object alignment */ + uint32_t skc_slab_objs; /* Objects per slab */ + uint32_t skc_slab_size; /* Slab size */ + uint32_t skc_delay; /* Slab reclaim interval */ + uint32_t skc_reap; /* Slab reclaim count */ + atomic_t skc_ref; /* Ref count callers */ + taskqid_t skc_taskqid; /* Slab reclaim task */ + struct list_head skc_list; /* List of caches linkage */ + struct list_head skc_complete_list;/* Completely alloc'ed */ + struct list_head skc_partial_list; /* Partially alloc'ed */ + struct rb_root skc_emergency_tree; /* Min sized objects */ + spinlock_t skc_lock; /* Cache lock */ + wait_queue_head_t skc_waitq; /* Allocation waiters */ + uint64_t skc_slab_fail; /* Slab alloc failures */ + uint64_t skc_slab_create;/* Slab creates */ + uint64_t skc_slab_destroy;/* Slab destroys */ + uint64_t skc_slab_total; /* Slab total current */ + uint64_t skc_slab_alloc; /* Slab alloc current */ + uint64_t skc_slab_max; /* Slab max historic */ + uint64_t skc_obj_total; /* Obj total current */ + uint64_t skc_obj_alloc; /* Obj alloc current */ + uint64_t skc_obj_max; /* Obj max historic */ + uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */ + uint64_t skc_obj_emergency; /* Obj emergency current */ + uint64_t skc_obj_emergency_max; /* Obj emergency max */ +} spl_kmem_cache_t; +#define kmem_cache_t spl_kmem_cache_t + +extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size, + size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, + spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags); +extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, + kmem_cbrc_t (*)(void *, void *, size_t, void *)); +extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); +extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); +extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); +extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count); +extern void spl_kmem_reap(void); + +#define kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) \ + spl_kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) +#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) +#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) +#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) +#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) +#define kmem_cache_reap_now(skc) \ + spl_kmem_cache_reap_now(skc, skc->skc_reap) +#define kmem_reap() spl_kmem_reap() +#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \ + ((ptr) < (void *)VMALLOC_END)) + +/* + * Allow custom slab allocation flags to be set for KMC_SLAB based caches. + * One use for this function is to ensure the __GFP_COMP flag is part of + * the default allocation mask which ensures higher order allocations are + * properly refcounted. This flag was added to the default ->allocflags + * as of Linux 3.11. + */ +static inline void +kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags) +{ + if (skc->skc_linux_cache == NULL) + return; + +#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) + skc->skc_linux_cache->allocflags |= flags; +#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) + skc->skc_linux_cache->gfpflags |= flags; +#endif +} + +/* + * The following functions are only available for internal use. + */ +extern int spl_kmem_cache_init(void); +extern void spl_kmem_cache_fini(void); + +#endif /* _SPL_KMEM_CACHE_H */ diff --git a/include/sys/types.h b/include/sys/types.h index 3b3a42ed..ec0455cd 100644 --- a/include/sys/types.h +++ b/include/sys/types.h @@ -48,7 +48,6 @@ typedef long long longlong_t; typedef long long offset_t; typedef struct task_struct kthread_t; typedef struct task_struct proc_t; -typedef struct vmem { } vmem_t; typedef short pri_t; typedef struct timespec timestruc_t; /* definition per SVr4 */ typedef struct timespec timespec_t; diff --git a/include/sys/vmem.h b/include/sys/vmem.h new file mode 100644 index 00000000..e86e89bb --- /dev/null +++ b/include/sys/vmem.h @@ -0,0 +1,183 @@ +/*****************************************************************************\ + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . +\*****************************************************************************/ + +#ifndef _SPL_VMEM_H +#define _SPL_VMEM_H + +#include +#include +#include + +typedef struct vmem { } vmem_t; + +extern vmem_t *heap_arena; +extern vmem_t *zio_alloc_arena; +extern vmem_t *zio_arena; + +extern size_t vmem_size(vmem_t *vmp, int typemask); + +/* + * Memory allocation interfaces + */ +#define VMEM_ALLOC 0x01 +#define VMEM_FREE 0x02 + +#ifndef VMALLOC_TOTAL +#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) +#endif + +static inline void * +vmalloc_nofail(size_t size, gfp_t flags) +{ + void *ptr; + + /* + * Retry failed __vmalloc() allocations once every second. The + * rational for the delay is that the likely failure modes are: + * + * 1) The system has completely exhausted memory, in which case + * delaying 1 second for the memory reclaim to run is reasonable + * to avoid thrashing the system. + * 2) The system has memory but has exhausted the small virtual + * address space available on 32-bit systems. Retrying the + * allocation immediately will only result in spinning on the + * virtual address space lock. It is better delay a second and + * hope that another process will free some of the address space. + * But the bottom line is there is not much we can actually do + * since we can never safely return a failure and honor the + * Solaris semantics. + */ + while (1) { + ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); + if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } else { + break; + } + } + + return ptr; +} + +static inline void * +vzalloc_nofail(size_t size, gfp_t flags) +{ + void *ptr; + + ptr = vmalloc_nofail(size, flags); + if (ptr) + memset(ptr, 0, (size)); + + return ptr; +} + +#ifdef DEBUG_KMEM + +/* + * Memory accounting functions to be used only when DEBUG_KMEM is set. + */ +# ifdef HAVE_ATOMIC64_T + +# define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used) +# define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used) +# define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used) +# define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size) + +extern atomic64_t vmem_alloc_used; +extern unsigned long long vmem_alloc_max; + +# else /* HAVE_ATOMIC64_T */ + +# define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used) +# define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used) +# define vmem_alloc_used_read() atomic_read(&vmem_alloc_used) +# define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size) + +extern atomic_t vmem_alloc_used; +extern unsigned long long vmem_alloc_max; + +# endif /* HAVE_ATOMIC64_T */ + +# ifdef DEBUG_KMEM_TRACKING +/* + * DEBUG_KMEM && DEBUG_KMEM_TRACKING + * + * The maximum level of memory debugging. All memory will be accounted + * for and each allocation will be explicitly tracked. Any allocation + * which is leaked will be reported on module unload and the exact location + * where that memory was allocation will be reported. This level of memory + * tracking will have a significant impact on performance and should only + * be enabled for debugging. This feature may be enabled by passing + * --enable-debug-kmem-tracking to configure. + */ +# define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \ + __FUNCTION__, __LINE__) +# define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\ + __FUNCTION__, __LINE__) +# define vmem_free(ptr, sz) vmem_free_track((ptr), (sz)) + +extern void *kmem_alloc_track(size_t, int, const char *, int, int, int); +extern void kmem_free_track(const void *, size_t); +extern void *vmem_alloc_track(size_t, int, const char *, int); +extern void vmem_free_track(const void *, size_t); + +# else /* DEBUG_KMEM_TRACKING */ +/* + * DEBUG_KMEM && !DEBUG_KMEM_TRACKING + * + * The default build will set DEBUG_KEM. This provides basic memory + * accounting with little to no impact on performance. When the module + * is unloaded in any memory was leaked the total number of leaked bytes + * will be reported on the console. To disable this basic accounting + * pass the --disable-debug-kmem option to configure. + */ +# define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \ + __FUNCTION__, __LINE__) +# define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ + __FUNCTION__, __LINE__) +# define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz)) + +extern void *vmem_alloc_debug(size_t, int, const char *, int); +extern void vmem_free_debug(const void *, size_t); + +# endif /* DEBUG_KMEM_TRACKING */ +#else /* DEBUG_KMEM */ +/* + * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING + * + * All debugging is disabled. There will be no overhead even for + * minimal memory accounting. To enable basic accounting pass the + * --enable-debug-kmem option to configure. + */ +# define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl)) +# define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl)) +# define vmem_free(ptr, sz) ((void)(sz), vfree(ptr)) + +#endif /* DEBUG_KMEM */ + +int spl_vmem_init(void); +void spl_vmem_fini(void); + +#endif /* _SPL_VMEM_H */ diff --git a/include/sys/vmsystm.h b/include/sys/vmsystm.h index c6b86866..2fa16952 100644 --- a/include/sys/vmsystm.h +++ b/include/sys/vmsystm.h @@ -37,19 +37,6 @@ #define physmem totalram_pages #define freemem nr_free_pages() -extern vmem_t *heap_arena; /* primary kernel heap arena */ -extern vmem_t *zio_alloc_arena; /* arena for zio caches */ -extern vmem_t *zio_arena; /* arena for allocating zio memory */ - -extern size_t vmem_size(vmem_t *vmp, int typemask); - -#define VMEM_ALLOC 0x01 -#define VMEM_FREE 0x02 - -#ifndef VMALLOC_TOTAL -#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) -#endif - #define xcopyin(from, to, size) copy_from_user(to, from, size) #define xcopyout(from, to, size) copy_to_user(to, from, size) diff --git a/module/spl/Makefile.in b/module/spl/Makefile.in index 9f67ed64..d1742448 100644 --- a/module/spl/Makefile.in +++ b/module/spl/Makefile.in @@ -8,6 +8,8 @@ obj-$(CONFIG_SPL) := $(MODULE).o $(MODULE)-objs += @top_srcdir@/module/spl/spl-proc.o $(MODULE)-objs += @top_srcdir@/module/spl/spl-kmem.o +$(MODULE)-objs += @top_srcdir@/module/spl/spl-kmem-cache.o +$(MODULE)-objs += @top_srcdir@/module/spl/spl-vmem.o $(MODULE)-objs += @top_srcdir@/module/spl/spl-thread.o $(MODULE)-objs += @top_srcdir@/module/spl/spl-taskq.o $(MODULE)-objs += @top_srcdir@/module/spl/spl-rwlock.o diff --git a/module/spl/spl-condvar.c b/module/spl/spl-condvar.c index 2a0052f5..cebb8f2b 100644 --- a/module/spl/spl-condvar.c +++ b/module/spl/spl-condvar.c @@ -25,6 +25,7 @@ \*****************************************************************************/ #include +#include void __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) diff --git a/module/spl/spl-generic.c b/module/spl/spl-generic.c index 803f03a8..b706ccec 100644 --- a/module/spl/spl-generic.c +++ b/module/spl/spl-generic.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include #include @@ -38,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -479,12 +482,46 @@ zone_get_hostid(void *zone) } EXPORT_SYMBOL(zone_get_hostid); +static int +spl_kvmem_init(void) +{ + int rc = 0; + + rc = spl_kmem_init(); + if (rc) + goto out1; + + rc = spl_vmem_init(); + if (rc) + goto out2; + + rc = spl_kmem_cache_init(); + if (rc) + goto out3; + + return (rc); +out3: + spl_vmem_fini(); +out2: + spl_kmem_fini(); +out1: + return (rc); +} + +static void +spl_kvmem_fini(void) +{ + spl_kmem_cache_fini(); + spl_vmem_fini(); + spl_kmem_fini(); +} + static int __init spl_init(void) { int rc = 0; - if ((rc = spl_kmem_init())) + if ((rc = spl_kvmem_init())) goto out1; if ((rc = spl_mutex_init())) @@ -530,7 +567,7 @@ __init spl_init(void) out3: spl_mutex_fini(); out2: - spl_kmem_fini(); + spl_kvmem_fini(); out1: printk(KERN_NOTICE "SPL: Failed to Load Solaris Porting Layer " "v%s-%s%s, rc = %d\n", SPL_META_VERSION, SPL_META_RELEASE, @@ -552,7 +589,7 @@ spl_fini(void) spl_taskq_fini(); spl_rw_fini(); spl_mutex_fini(); - spl_kmem_fini(); + spl_kvmem_fini(); } /* Called when a dependent module is loaded */ diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c new file mode 100644 index 00000000..d24c4c20 --- /dev/null +++ b/module/spl/spl-kmem-cache.c @@ -0,0 +1,1648 @@ +/*****************************************************************************\ + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + ***************************************************************************** + * Solaris Porting Layer (SPL) Kmem Implementation. +\*****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Within the scope of spl-kmem.c file the kmem_cache_* definitions + * are removed to allow access to the real Linux slab allocator. + */ +#undef kmem_cache_destroy +#undef kmem_cache_create +#undef kmem_cache_alloc +#undef kmem_cache_free + + +/* + * Cache expiration was implemented because it was part of the default Solaris + * kmem_cache behavior. The idea is that per-cpu objects which haven't been + * accessed in several seconds should be returned to the cache. On the other + * hand Linux slabs never move objects back to the slabs unless there is + * memory pressure on the system. By default the Linux method is enabled + * because it has been shown to improve responsiveness on low memory systems. + * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM. + */ +unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM; +EXPORT_SYMBOL(spl_kmem_cache_expire); +module_param(spl_kmem_cache_expire, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)"); + +/* + * The default behavior is to report the number of objects remaining in the + * cache. This allows the Linux VM to repeatedly reclaim objects from the + * cache when memory is low satisfy other memory allocations. Alternately, + * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache + * is reclaimed. This may increase the likelihood of out of memory events. + */ +unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; +module_param(spl_kmem_cache_reclaim, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); + +unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; +module_param(spl_kmem_cache_obj_per_slab, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); + +unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN; +module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min, + "Minimal number of objects per slab"); + +unsigned int spl_kmem_cache_max_size = 32; +module_param(spl_kmem_cache_max_size, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); + +/* + * For small objects the Linux slab allocator should be used to make the most + * efficient use of the memory. However, large objects are not supported by + * the Linux slab and therefore the SPL implementation is preferred. A cutoff + * of 16K was determined to be optimal for architectures using 4K pages. + */ +#if PAGE_SIZE == 4096 +unsigned int spl_kmem_cache_slab_limit = 16384; +#else +unsigned int spl_kmem_cache_slab_limit = 0; +#endif +module_param(spl_kmem_cache_slab_limit, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_slab_limit, + "Objects less than N bytes use the Linux slab"); + +unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4); +module_param(spl_kmem_cache_kmem_limit, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, + "Objects less than N bytes use the kmalloc"); + +/* + * Slab allocation interfaces + * + * While the Linux slab implementation was inspired by the Solaris + * implementation I cannot use it to emulate the Solaris APIs. I + * require two features which are not provided by the Linux slab. + * + * 1) Constructors AND destructors. Recent versions of the Linux + * kernel have removed support for destructors. This is a deal + * breaker for the SPL which contains particularly expensive + * initializers for mutex's, condition variables, etc. We also + * require a minimal level of cleanup for these data types unlike + * many Linux data type which do need to be explicitly destroyed. + * + * 2) Virtual address space backed slab. Callers of the Solaris slab + * expect it to work well for both small are very large allocations. + * Because of memory fragmentation the Linux slab which is backed + * by kmalloc'ed memory performs very badly when confronted with + * large numbers of large allocations. Basing the slab on the + * virtual address space removes the need for contiguous pages + * and greatly improve performance for large allocations. + * + * For these reasons, the SPL has its own slab implementation with + * the needed features. It is not as highly optimized as either the + * Solaris or Linux slabs, but it should get me most of what is + * needed until it can be optimized or obsoleted by another approach. + * + * One serious concern I do have about this method is the relatively + * small virtual address space on 32bit arches. This will seriously + * constrain the size of the slab caches and their performance. + * + * XXX: Improve the partial slab list by carefully maintaining a + * strict ordering of fullest to emptiest slabs based on + * the slab reference count. This guarantees the when freeing + * slabs back to the system we need only linearly traverse the + * last N slabs in the list to discover all the freeable slabs. + * + * XXX: NUMA awareness for optionally allocating memory close to a + * particular core. This can be advantageous if you know the slab + * object will be short lived and primarily accessed from one core. + * + * XXX: Slab coloring may also yield performance improvements and would + * be desirable to implement. + */ + +struct list_head spl_kmem_cache_list; /* List of caches */ +struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ +taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */ + +static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); + +SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker); +SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, + spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS); + +static void * +kv_alloc(spl_kmem_cache_t *skc, int size, int flags) +{ + void *ptr; + + ASSERT(ISP2(size)); + + if (skc->skc_flags & KMC_KMEM) + ptr = (void *)__get_free_pages(flags | __GFP_COMP, + get_order(size)); + else + ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); + + /* Resulting allocated memory will be page aligned */ + ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + + return ptr; +} + +static void +kv_free(spl_kmem_cache_t *skc, void *ptr, int size) +{ + ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + ASSERT(ISP2(size)); + + /* + * The Linux direct reclaim path uses this out of band value to + * determine if forward progress is being made. Normally this is + * incremented by kmem_freepages() which is part of the various + * Linux slab implementations. However, since we are using none + * of that infrastructure we are responsible for incrementing it. + */ + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; + + if (skc->skc_flags & KMC_KMEM) + free_pages((unsigned long)ptr, get_order(size)); + else + vfree(ptr); +} + +/* + * Required space for each aligned sks. + */ +static inline uint32_t +spl_sks_size(spl_kmem_cache_t *skc) +{ + return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t), + skc->skc_obj_align, uint32_t); +} + +/* + * Required space for each aligned object. + */ +static inline uint32_t +spl_obj_size(spl_kmem_cache_t *skc) +{ + uint32_t align = skc->skc_obj_align; + + return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + + P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t); +} + +/* + * Lookup the spl_kmem_object_t for an object given that object. + */ +static inline spl_kmem_obj_t * +spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) +{ + return obj + P2ROUNDUP_TYPED(skc->skc_obj_size, + skc->skc_obj_align, uint32_t); +} + +/* + * Required space for each offslab object taking in to account alignment + * restrictions and the power-of-two requirement of kv_alloc(). + */ +static inline uint32_t +spl_offslab_size(spl_kmem_cache_t *skc) +{ + return 1UL << (fls64(spl_obj_size(skc)) + 1); +} + +/* + * It's important that we pack the spl_kmem_obj_t structure and the + * actual objects in to one large address space to minimize the number + * of calls to the allocator. It is far better to do a few large + * allocations and then subdivide it ourselves. Now which allocator + * we use requires balancing a few trade offs. + * + * For small objects we use kmem_alloc() because as long as you are + * only requesting a small number of pages (ideally just one) its cheap. + * However, when you start requesting multiple pages with kmem_alloc() + * it gets increasingly expensive since it requires contiguous pages. + * For this reason we shift to vmem_alloc() for slabs of large objects + * which removes the need for contiguous pages. We do not use + * vmem_alloc() in all cases because there is significant locking + * overhead in __get_vm_area_node(). This function takes a single + * global lock when acquiring an available virtual address range which + * serializes all vmem_alloc()'s for all slab caches. Using slightly + * different allocation functions for small and large objects should + * give us the best of both worlds. + * + * KMC_ONSLAB KMC_OFFSLAB + * + * +------------------------+ +-----------------+ + * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+ + * | skc_obj_size <-+ | | +-----------------+ | | + * | spl_kmem_obj_t | | | | + * | skc_obj_size <---+ | +-----------------+ | | + * | spl_kmem_obj_t | | | skc_obj_size | <-+ | + * | ... v | | spl_kmem_obj_t | | + * +------------------------+ +-----------------+ v + */ +static spl_kmem_slab_t * +spl_slab_alloc(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_slab_t *sks; + spl_kmem_obj_t *sko, *n; + void *base, *obj; + uint32_t obj_size, offslab_size = 0; + int i, rc = 0; + + base = kv_alloc(skc, skc->skc_slab_size, flags); + if (base == NULL) + return (NULL); + + sks = (spl_kmem_slab_t *)base; + sks->sks_magic = SKS_MAGIC; + sks->sks_objs = skc->skc_slab_objs; + sks->sks_age = jiffies; + sks->sks_cache = skc; + INIT_LIST_HEAD(&sks->sks_list); + INIT_LIST_HEAD(&sks->sks_free_list); + sks->sks_ref = 0; + obj_size = spl_obj_size(skc); + + if (skc->skc_flags & KMC_OFFSLAB) + offslab_size = spl_offslab_size(skc); + + for (i = 0; i < sks->sks_objs; i++) { + if (skc->skc_flags & KMC_OFFSLAB) { + obj = kv_alloc(skc, offslab_size, flags); + if (!obj) { + rc = -ENOMEM; + goto out; + } + } else { + obj = base + spl_sks_size(skc) + (i * obj_size); + } + + ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); + sko = spl_sko_from_obj(skc, obj); + sko->sko_addr = obj; + sko->sko_magic = SKO_MAGIC; + sko->sko_slab = sks; + INIT_LIST_HEAD(&sko->sko_list); + list_add_tail(&sko->sko_list, &sks->sks_free_list); + } + +out: + if (rc) { + if (skc->skc_flags & KMC_OFFSLAB) + list_for_each_entry_safe(sko, n, &sks->sks_free_list, + sko_list) + kv_free(skc, sko->sko_addr, offslab_size); + + kv_free(skc, base, skc->skc_slab_size); + sks = NULL; + } + + return (sks); +} + +/* + * Remove a slab from complete or partial list, it must be called with + * the 'skc->skc_lock' held but the actual free must be performed + * outside the lock to prevent deadlocking on vmem addresses. + */ +static void +spl_slab_free(spl_kmem_slab_t *sks, + struct list_head *sks_list, struct list_head *sko_list) +{ + spl_kmem_cache_t *skc; + + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_ref == 0); + + skc = sks->sks_cache; + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(spin_is_locked(&skc->skc_lock)); + + /* + * Update slab/objects counters in the cache, then remove the + * slab from the skc->skc_partial_list. Finally add the slab + * and all its objects in to the private work lists where the + * destructors will be called and the memory freed to the system. + */ + skc->skc_obj_total -= sks->sks_objs; + skc->skc_slab_total--; + list_del(&sks->sks_list); + list_add(&sks->sks_list, sks_list); + list_splice_init(&sks->sks_free_list, sko_list); +} + +/* + * Traverses all the partial slabs attached to a cache and free those + * which which are currently empty, and have not been touched for + * skc_delay seconds to avoid thrashing. The count argument is + * passed to optionally cap the number of slabs reclaimed, a count + * of zero means try and reclaim everything. When flag is set we + * always free an available slab regardless of age. + */ +static void +spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag) +{ + spl_kmem_slab_t *sks, *m; + spl_kmem_obj_t *sko, *n; + LIST_HEAD(sks_list); + LIST_HEAD(sko_list); + uint32_t size = 0; + int i = 0; + + /* + * Move empty slabs and objects which have not been touched in + * skc_delay seconds on to private lists to be freed outside + * the spin lock. This delay time is important to avoid thrashing + * however when flag is set the delay will not be used. + */ + spin_lock(&skc->skc_lock); + list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){ + /* + * All empty slabs are at the end of skc->skc_partial_list, + * therefore once a non-empty slab is found we can stop + * scanning. Additionally, stop when reaching the target + * reclaim 'count' if a non-zero threshold is given. + */ + if ((sks->sks_ref > 0) || (count && i >= count)) + break; + + if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) { + spl_slab_free(sks, &sks_list, &sko_list); + i++; + } + } + spin_unlock(&skc->skc_lock); + + /* + * The following two loops ensure all the object destructors are + * run, any offslab objects are freed, and the slabs themselves + * are freed. This is all done outside the skc->skc_lock since + * this allows the destructor to sleep, and allows us to perform + * a conditional reschedule when a freeing a large number of + * objects and slabs back to the system. + */ + if (skc->skc_flags & KMC_OFFSLAB) + size = spl_offslab_size(skc); + + list_for_each_entry_safe(sko, n, &sko_list, sko_list) { + ASSERT(sko->sko_magic == SKO_MAGIC); + + if (skc->skc_flags & KMC_OFFSLAB) + kv_free(skc, sko->sko_addr, size); + } + + list_for_each_entry_safe(sks, m, &sks_list, sks_list) { + ASSERT(sks->sks_magic == SKS_MAGIC); + kv_free(skc, sks, skc->skc_slab_size); + } +} + +static spl_kmem_emergency_t * +spl_emergency_search(struct rb_root *root, void *obj) +{ + struct rb_node *node = root->rb_node; + spl_kmem_emergency_t *ske; + unsigned long address = (unsigned long)obj; + + while (node) { + ske = container_of(node, spl_kmem_emergency_t, ske_node); + + if (address < (unsigned long)ske->ske_obj) + node = node->rb_left; + else if (address > (unsigned long)ske->ske_obj) + node = node->rb_right; + else + return ske; + } + + return NULL; +} + +static int +spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + spl_kmem_emergency_t *ske_tmp; + unsigned long address = (unsigned long)ske->ske_obj; + + while (*new) { + ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); + + parent = *new; + if (address < (unsigned long)ske_tmp->ske_obj) + new = &((*new)->rb_left); + else if (address > (unsigned long)ske_tmp->ske_obj) + new = &((*new)->rb_right); + else + return 0; + } + + rb_link_node(&ske->ske_node, parent, new); + rb_insert_color(&ske->ske_node, root); + + return 1; +} + +/* + * Allocate a single emergency object and track it in a red black tree. + */ +static int +spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) +{ + spl_kmem_emergency_t *ske; + int empty; + + /* Last chance use a partial slab if one now exists */ + spin_lock(&skc->skc_lock); + empty = list_empty(&skc->skc_partial_list); + spin_unlock(&skc->skc_lock); + if (!empty) + return (-EEXIST); + + ske = kmalloc(sizeof(*ske), flags); + if (ske == NULL) + return (-ENOMEM); + + ske->ske_obj = kmalloc(skc->skc_obj_size, flags); + if (ske->ske_obj == NULL) { + kfree(ske); + return (-ENOMEM); + } + + spin_lock(&skc->skc_lock); + empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); + if (likely(empty)) { + skc->skc_obj_total++; + skc->skc_obj_emergency++; + if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) + skc->skc_obj_emergency_max = skc->skc_obj_emergency; + } + spin_unlock(&skc->skc_lock); + + if (unlikely(!empty)) { + kfree(ske->ske_obj); + kfree(ske); + return (-EINVAL); + } + + *obj = ske->ske_obj; + + return (0); +} + +/* + * Locate the passed object in the red black tree and free it. + */ +static int +spl_emergency_free(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_emergency_t *ske; + + spin_lock(&skc->skc_lock); + ske = spl_emergency_search(&skc->skc_emergency_tree, obj); + if (likely(ske)) { + rb_erase(&ske->ske_node, &skc->skc_emergency_tree); + skc->skc_obj_emergency--; + skc->skc_obj_total--; + } + spin_unlock(&skc->skc_lock); + + if (unlikely(ske == NULL)) + return (-ENOENT); + + kfree(ske->ske_obj); + kfree(ske); + + return (0); +} + +/* + * Release objects from the per-cpu magazine back to their slab. The flush + * argument contains the max number of entries to remove from the magazine. + */ +static void +__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +{ + int i, count = MIN(flush, skm->skm_avail); + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(spin_is_locked(&skc->skc_lock)); + + for (i = 0; i < count; i++) + spl_cache_shrink(skc, skm->skm_objs[i]); + + skm->skm_avail -= count; + memmove(skm->skm_objs, &(skm->skm_objs[count]), + sizeof(void *) * skm->skm_avail); +} + +static void +spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +{ + spin_lock(&skc->skc_lock); + __spl_cache_flush(skc, skm, flush); + spin_unlock(&skc->skc_lock); +} + +static void +spl_magazine_age(void *data) +{ + spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; + spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; + + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(skm->skm_cpu == smp_processor_id()); + ASSERT(irqs_disabled()); + + /* There are no available objects or they are too young to age out */ + if ((skm->skm_avail == 0) || + time_before(jiffies, skm->skm_age + skc->skc_delay * HZ)) + return; + + /* + * Because we're executing in interrupt context we may have + * interrupted the holder of this lock. To avoid a potential + * deadlock return if the lock is contended. + */ + if (!spin_trylock(&skc->skc_lock)) + return; + + __spl_cache_flush(skc, skm, skm->skm_refill); + spin_unlock(&skc->skc_lock); +} + +/* + * Called regularly to keep a downward pressure on the cache. + * + * Objects older than skc->skc_delay seconds in the per-cpu magazines will + * be returned to the caches. This is done to prevent idle magazines from + * holding memory which could be better used elsewhere. The delay is + * present to prevent thrashing the magazine. + * + * The newly released objects may result in empty partial slabs. Those + * slabs should be released to the system. Otherwise moving the objects + * out of the magazines is just wasted work. + */ +static void +spl_cache_age(void *data) +{ + spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; + taskqid_t id = 0; + + ASSERT(skc->skc_magic == SKC_MAGIC); + + /* Dynamically disabled at run time */ + if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE)) + return; + + atomic_inc(&skc->skc_ref); + + if (!(skc->skc_flags & KMC_NOMAGAZINE)) + on_each_cpu(spl_magazine_age, skc, 1); + + spl_slab_reclaim(skc, skc->skc_reap, 0); + + while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) { + id = taskq_dispatch_delay( + spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP, + ddi_get_lbolt() + skc->skc_delay / 3 * HZ); + + /* Destroy issued after dispatch immediately cancel it */ + if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id) + taskq_cancel_id(spl_kmem_cache_taskq, id); + } + + spin_lock(&skc->skc_lock); + skc->skc_taskqid = id; + spin_unlock(&skc->skc_lock); + + atomic_dec(&skc->skc_ref); +} + +/* + * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. + * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, + * for very small objects we may end up with more than this so as not + * to waste space in the minimal allocation of a single page. Also for + * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min, + * lower than this and we will fail. + */ +static int +spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) +{ + uint32_t sks_size, obj_size, max_size; + + if (skc->skc_flags & KMC_OFFSLAB) { + *objs = spl_kmem_cache_obj_per_slab; + *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), PAGE_SIZE); + return (0); + } else { + sks_size = spl_sks_size(skc); + obj_size = spl_obj_size(skc); + + if (skc->skc_flags & KMC_KMEM) + max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE; + else + max_size = (spl_kmem_cache_max_size * 1024 * 1024); + + /* Power of two sized slab */ + for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) { + *objs = (*size - sks_size) / obj_size; + if (*objs >= spl_kmem_cache_obj_per_slab) + return (0); + } + + /* + * Unable to satisfy target objects per slab, fall back to + * allocating a maximally sized slab and assuming it can + * contain the minimum objects count use it. If not fail. + */ + *size = max_size; + *objs = (*size - sks_size) / obj_size; + if (*objs >= (spl_kmem_cache_obj_per_slab_min)) + return (0); + } + + return (-ENOSPC); +} + +/* + * Make a guess at reasonable per-cpu magazine size based on the size of + * each object and the cost of caching N of them in each magazine. Long + * term this should really adapt based on an observed usage heuristic. + */ +static int +spl_magazine_size(spl_kmem_cache_t *skc) +{ + uint32_t obj_size = spl_obj_size(skc); + int size; + + /* Per-magazine sizes below assume a 4Kib page size */ + if (obj_size > (PAGE_SIZE * 256)) + size = 4; /* Minimum 4Mib per-magazine */ + else if (obj_size > (PAGE_SIZE * 32)) + size = 16; /* Minimum 2Mib per-magazine */ + else if (obj_size > (PAGE_SIZE)) + size = 64; /* Minimum 256Kib per-magazine */ + else if (obj_size > (PAGE_SIZE / 4)) + size = 128; /* Minimum 128Kib per-magazine */ + else + size = 256; + + return (size); +} + +/* + * Allocate a per-cpu magazine to associate with a specific core. + */ +static spl_kmem_magazine_t * +spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) +{ + spl_kmem_magazine_t *skm; + int size = sizeof(spl_kmem_magazine_t) + + sizeof(void *) * skc->skc_mag_size; + + skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu)); + if (skm) { + skm->skm_magic = SKM_MAGIC; + skm->skm_avail = 0; + skm->skm_size = skc->skc_mag_size; + skm->skm_refill = skc->skc_mag_refill; + skm->skm_cache = skc; + skm->skm_age = jiffies; + skm->skm_cpu = cpu; + } + + return (skm); +} + +/* + * Free a per-cpu magazine associated with a specific core. + */ +static void +spl_magazine_free(spl_kmem_magazine_t *skm) +{ + int size = sizeof(spl_kmem_magazine_t) + + sizeof(void *) * skm->skm_size; + + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(skm->skm_avail == 0); + + kmem_free(skm, size); +} + +/* + * Create all pre-cpu magazines of reasonable sizes. + */ +static int +spl_magazine_create(spl_kmem_cache_t *skc) +{ + int i; + + if (skc->skc_flags & KMC_NOMAGAZINE) + return (0); + + skc->skc_mag_size = spl_magazine_size(skc); + skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; + + for_each_online_cpu(i) { + skc->skc_mag[i] = spl_magazine_alloc(skc, i); + if (!skc->skc_mag[i]) { + for (i--; i >= 0; i--) + spl_magazine_free(skc->skc_mag[i]); + + return (-ENOMEM); + } + } + + return (0); +} + +/* + * Destroy all pre-cpu magazines. + */ +static void +spl_magazine_destroy(spl_kmem_cache_t *skc) +{ + spl_kmem_magazine_t *skm; + int i; + + if (skc->skc_flags & KMC_NOMAGAZINE) + return; + + for_each_online_cpu(i) { + skm = skc->skc_mag[i]; + spl_cache_flush(skc, skm, skm->skm_avail); + spl_magazine_free(skm); + } +} + +/* + * Create a object cache based on the following arguments: + * name cache name + * size cache object size + * align cache object alignment + * ctor cache object constructor + * dtor cache object destructor + * reclaim cache object reclaim + * priv cache private data for ctor/dtor/reclaim + * vmp unused must be NULL + * flags + * KMC_NOTOUCH Disable cache object aging (unsupported) + * KMC_NODEBUG Disable debugging (unsupported) + * KMC_NOHASH Disable hashing (unsupported) + * KMC_QCACHE Disable qcache (unsupported) + * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab + * KMC_KMEM Force kmem backed cache + * KMC_VMEM Force vmem backed cache + * KMC_SLAB Force Linux slab backed cache + * KMC_OFFSLAB Locate objects off the slab + */ +spl_kmem_cache_t * +spl_kmem_cache_create(char *name, size_t size, size_t align, + spl_kmem_ctor_t ctor, + spl_kmem_dtor_t dtor, + spl_kmem_reclaim_t reclaim, + void *priv, void *vmp, int flags) +{ + spl_kmem_cache_t *skc; + int rc; + + /* + * Unsupported flags + */ + ASSERT0(flags & KMC_NOMAGAZINE); + ASSERT0(flags & KMC_NOHASH); + ASSERT0(flags & KMC_QCACHE); + ASSERT(vmp == NULL); + + might_sleep(); + + /* + * Allocate memory for a new cache an initialize it. Unfortunately, + * this usually ends up being a large allocation of ~32k because + * we need to allocate enough memory for the worst case number of + * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we + * explicitly pass KM_NODEBUG to suppress the kmem warning + */ + skc = kmem_zalloc(sizeof(*skc), KM_SLEEP| KM_NODEBUG); + if (skc == NULL) + return (NULL); + + skc->skc_magic = SKC_MAGIC; + skc->skc_name_size = strlen(name) + 1; + skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP); + if (skc->skc_name == NULL) { + kmem_free(skc, sizeof(*skc)); + return (NULL); + } + strncpy(skc->skc_name, name, skc->skc_name_size); + + skc->skc_ctor = ctor; + skc->skc_dtor = dtor; + skc->skc_reclaim = reclaim; + skc->skc_private = priv; + skc->skc_vmp = vmp; + skc->skc_linux_cache = NULL; + skc->skc_flags = flags; + skc->skc_obj_size = size; + skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; + skc->skc_delay = SPL_KMEM_CACHE_DELAY; + skc->skc_reap = SPL_KMEM_CACHE_REAP; + atomic_set(&skc->skc_ref, 0); + + INIT_LIST_HEAD(&skc->skc_list); + INIT_LIST_HEAD(&skc->skc_complete_list); + INIT_LIST_HEAD(&skc->skc_partial_list); + skc->skc_emergency_tree = RB_ROOT; + spin_lock_init(&skc->skc_lock); + init_waitqueue_head(&skc->skc_waitq); + skc->skc_slab_fail = 0; + skc->skc_slab_create = 0; + skc->skc_slab_destroy = 0; + skc->skc_slab_total = 0; + skc->skc_slab_alloc = 0; + skc->skc_slab_max = 0; + skc->skc_obj_total = 0; + skc->skc_obj_alloc = 0; + skc->skc_obj_max = 0; + skc->skc_obj_deadlock = 0; + skc->skc_obj_emergency = 0; + skc->skc_obj_emergency_max = 0; + + /* + * Verify the requested alignment restriction is sane. + */ + if (align) { + VERIFY(ISP2(align)); + VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); + VERIFY3U(align, <=, PAGE_SIZE); + skc->skc_obj_align = align; + } + + /* + * When no specific type of slab is requested (kmem, vmem, or + * linuxslab) then select a cache type based on the object size + * and default tunables. + */ + if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) { + + /* + * Objects smaller than spl_kmem_cache_slab_limit can + * use the Linux slab for better space-efficiency. By + * default this functionality is disabled until its + * performance characters are fully understood. + */ + if (spl_kmem_cache_slab_limit && + size <= (size_t)spl_kmem_cache_slab_limit) + skc->skc_flags |= KMC_SLAB; + + /* + * Small objects, less than spl_kmem_cache_kmem_limit per + * object should use kmem because their slabs are small. + */ + else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit) + skc->skc_flags |= KMC_KMEM; + + /* + * All other objects are considered large and are placed + * on vmem backed slabs. + */ + else + skc->skc_flags |= KMC_VMEM; + } + + /* + * Given the type of slab allocate the required resources. + */ + if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { + rc = spl_slab_size(skc, + &skc->skc_slab_objs, &skc->skc_slab_size); + if (rc) + goto out; + + rc = spl_magazine_create(skc); + if (rc) + goto out; + } else { + skc->skc_linux_cache = kmem_cache_create( + skc->skc_name, size, align, 0, NULL); + if (skc->skc_linux_cache == NULL) { + rc = ENOMEM; + goto out; + } + + kmem_cache_set_allocflags(skc, __GFP_COMP); + skc->skc_flags |= KMC_NOMAGAZINE; + } + + if (spl_kmem_cache_expire & KMC_EXPIRE_AGE) + skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq, + spl_cache_age, skc, TQ_SLEEP, + ddi_get_lbolt() + skc->skc_delay / 3 * HZ); + + down_write(&spl_kmem_cache_sem); + list_add_tail(&skc->skc_list, &spl_kmem_cache_list); + up_write(&spl_kmem_cache_sem); + + return (skc); +out: + kmem_free(skc->skc_name, skc->skc_name_size); + kmem_free(skc, sizeof(*skc)); + return (NULL); +} +EXPORT_SYMBOL(spl_kmem_cache_create); + +/* + * Register a move callback to for cache defragmentation. + * XXX: Unimplemented but harmless to stub out for now. + */ +void +spl_kmem_cache_set_move(spl_kmem_cache_t *skc, + kmem_cbrc_t (move)(void *, void *, size_t, void *)) +{ + ASSERT(move != NULL); +} +EXPORT_SYMBOL(spl_kmem_cache_set_move); + +/* + * Destroy a cache and all objects associated with the cache. + */ +void +spl_kmem_cache_destroy(spl_kmem_cache_t *skc) +{ + DECLARE_WAIT_QUEUE_HEAD(wq); + taskqid_t id; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB)); + + down_write(&spl_kmem_cache_sem); + list_del_init(&skc->skc_list); + up_write(&spl_kmem_cache_sem); + + /* Cancel any and wait for any pending delayed tasks */ + VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + spin_lock(&skc->skc_lock); + id = skc->skc_taskqid; + spin_unlock(&skc->skc_lock); + + taskq_cancel_id(spl_kmem_cache_taskq, id); + + /* Wait until all current callers complete, this is mainly + * to catch the case where a low memory situation triggers a + * cache reaping action which races with this destroy. */ + wait_event(wq, atomic_read(&skc->skc_ref) == 0); + + if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { + spl_magazine_destroy(skc); + spl_slab_reclaim(skc, 0, 1); + } else { + ASSERT(skc->skc_flags & KMC_SLAB); + kmem_cache_destroy(skc->skc_linux_cache); + } + + spin_lock(&skc->skc_lock); + + /* Validate there are no objects in use and free all the + * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */ + ASSERT3U(skc->skc_slab_alloc, ==, 0); + ASSERT3U(skc->skc_obj_alloc, ==, 0); + ASSERT3U(skc->skc_slab_total, ==, 0); + ASSERT3U(skc->skc_obj_total, ==, 0); + ASSERT3U(skc->skc_obj_emergency, ==, 0); + ASSERT(list_empty(&skc->skc_complete_list)); + + kmem_free(skc->skc_name, skc->skc_name_size); + spin_unlock(&skc->skc_lock); + + kmem_free(skc, sizeof(*skc)); +} +EXPORT_SYMBOL(spl_kmem_cache_destroy); + +/* + * Allocate an object from a slab attached to the cache. This is used to + * repopulate the per-cpu magazine caches in batches when they run low. + */ +static void * +spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) +{ + spl_kmem_obj_t *sko; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(spin_is_locked(&skc->skc_lock)); + + sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); + ASSERT(sko->sko_magic == SKO_MAGIC); + ASSERT(sko->sko_addr != NULL); + + /* Remove from sks_free_list */ + list_del_init(&sko->sko_list); + + sks->sks_age = jiffies; + sks->sks_ref++; + skc->skc_obj_alloc++; + + /* Track max obj usage statistics */ + if (skc->skc_obj_alloc > skc->skc_obj_max) + skc->skc_obj_max = skc->skc_obj_alloc; + + /* Track max slab usage statistics */ + if (sks->sks_ref == 1) { + skc->skc_slab_alloc++; + + if (skc->skc_slab_alloc > skc->skc_slab_max) + skc->skc_slab_max = skc->skc_slab_alloc; + } + + return sko->sko_addr; +} + +/* + * Generic slab allocation function to run by the global work queues. + * It is responsible for allocating a new slab, linking it in to the list + * of partial slabs, and then waking any waiters. + */ +static void +spl_cache_grow_work(void *data) +{ + spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; + spl_kmem_cache_t *skc = ska->ska_cache; + spl_kmem_slab_t *sks; + + sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG); + spin_lock(&skc->skc_lock); + if (sks) { + skc->skc_slab_total++; + skc->skc_obj_total += sks->sks_objs; + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + } + + atomic_dec(&skc->skc_ref); + clear_bit(KMC_BIT_GROWING, &skc->skc_flags); + clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); + wake_up_all(&skc->skc_waitq); + spin_unlock(&skc->skc_lock); + + kfree(ska); +} + +/* + * Returns non-zero when a new slab should be available. + */ +static int +spl_cache_grow_wait(spl_kmem_cache_t *skc) +{ + return !test_bit(KMC_BIT_GROWING, &skc->skc_flags); +} + +/* + * No available objects on any slabs, create a new slab. Note that this + * functionality is disabled for KMC_SLAB caches which are backed by the + * Linux slab. + */ +static int +spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) +{ + int remaining, rc; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT((skc->skc_flags & KMC_SLAB) == 0); + might_sleep(); + *obj = NULL; + + /* + * Before allocating a new slab wait for any reaping to complete and + * then return so the local magazine can be rechecked for new objects. + */ + if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { + rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING, + TASK_UNINTERRUPTIBLE); + return (rc ? rc : -EAGAIN); + } + + /* + * This is handled by dispatching a work request to the global work + * queue. This allows us to asynchronously allocate a new slab while + * retaining the ability to safely fall back to a smaller synchronous + * allocations to ensure forward progress is always maintained. + */ + if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { + spl_kmem_alloc_t *ska; + + ska = kmalloc(sizeof(*ska), flags); + if (ska == NULL) { + clear_bit(KMC_BIT_GROWING, &skc->skc_flags); + wake_up_all(&skc->skc_waitq); + return (-ENOMEM); + } + + atomic_inc(&skc->skc_ref); + ska->ska_cache = skc; + ska->ska_flags = flags & ~__GFP_FS; + taskq_init_ent(&ska->ska_tqe); + taskq_dispatch_ent(spl_kmem_cache_taskq, + spl_cache_grow_work, ska, 0, &ska->ska_tqe); + } + + /* + * The goal here is to only detect the rare case where a virtual slab + * allocation has deadlocked. We must be careful to minimize the use + * of emergency objects which are more expensive to track. Therefore, + * we set a very long timeout for the asynchronous allocation and if + * the timeout is reached the cache is flagged as deadlocked. From + * this point only new emergency objects will be allocated until the + * asynchronous allocation completes and clears the deadlocked flag. + */ + if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) { + rc = spl_emergency_alloc(skc, flags, obj); + } else { + remaining = wait_event_timeout(skc->skc_waitq, + spl_cache_grow_wait(skc), HZ); + + if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) { + spin_lock(&skc->skc_lock); + if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { + set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); + skc->skc_obj_deadlock++; + } + spin_unlock(&skc->skc_lock); + } + + rc = -ENOMEM; + } + + return (rc); +} + +/* + * Refill a per-cpu magazine with objects from the slabs for this cache. + * Ideally the magazine can be repopulated using existing objects which have + * been released, however if we are unable to locate enough free objects new + * slabs of objects will be created. On success NULL is returned, otherwise + * the address of a single emergency object is returned for use by the caller. + */ +static void * +spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) +{ + spl_kmem_slab_t *sks; + int count = 0, rc, refill; + void *obj = NULL; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skm->skm_magic == SKM_MAGIC); + + refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); + spin_lock(&skc->skc_lock); + + while (refill > 0) { + /* No slabs available we may need to grow the cache */ + if (list_empty(&skc->skc_partial_list)) { + spin_unlock(&skc->skc_lock); + + local_irq_enable(); + rc = spl_cache_grow(skc, flags, &obj); + local_irq_disable(); + + /* Emergency object for immediate use by caller */ + if (rc == 0 && obj != NULL) + return (obj); + + if (rc) + goto out; + + /* Rescheduled to different CPU skm is not local */ + if (skm != skc->skc_mag[smp_processor_id()]) + goto out; + + /* Potentially rescheduled to the same CPU but + * allocations may have occurred from this CPU while + * we were sleeping so recalculate max refill. */ + refill = MIN(refill, skm->skm_size - skm->skm_avail); + + spin_lock(&skc->skc_lock); + continue; + } + + /* Grab the next available slab */ + sks = list_entry((&skc->skc_partial_list)->next, + spl_kmem_slab_t, sks_list); + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_ref < sks->sks_objs); + ASSERT(!list_empty(&sks->sks_free_list)); + + /* Consume as many objects as needed to refill the requested + * cache. We must also be careful not to overfill it. */ + while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) { + ASSERT(skm->skm_avail < skm->skm_size); + ASSERT(count < skm->skm_size); + skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks); + } + + /* Move slab to skc_complete_list when full */ + if (sks->sks_ref == sks->sks_objs) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_complete_list); + } + } + + spin_unlock(&skc->skc_lock); +out: + return (NULL); +} + +/* + * Release an object back to the slab from which it came. + */ +static void +spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_slab_t *sks = NULL; + spl_kmem_obj_t *sko = NULL; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(spin_is_locked(&skc->skc_lock)); + + sko = spl_sko_from_obj(skc, obj); + ASSERT(sko->sko_magic == SKO_MAGIC); + sks = sko->sko_slab; + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_cache == skc); + list_add(&sko->sko_list, &sks->sks_free_list); + + sks->sks_age = jiffies; + sks->sks_ref--; + skc->skc_obj_alloc--; + + /* Move slab to skc_partial_list when no longer full. Slabs + * are added to the head to keep the partial list is quasi-full + * sorted order. Fuller at the head, emptier at the tail. */ + if (sks->sks_ref == (sks->sks_objs - 1)) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_partial_list); + } + + /* Move empty slabs to the end of the partial list so + * they can be easily found and freed during reclamation. */ + if (sks->sks_ref == 0) { + list_del(&sks->sks_list); + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + skc->skc_slab_alloc--; + } +} + +/* + * Allocate an object from the per-cpu magazine, or if the magazine + * is empty directly allocate from a slab and repopulate the magazine. + */ +void * +spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_magazine_t *skm; + void *obj = NULL; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + ASSERT(flags & KM_SLEEP); + + atomic_inc(&skc->skc_ref); + + /* + * Allocate directly from a Linux slab. All optimizations are left + * to the underlying cache we only need to guarantee that KM_SLEEP + * callers will never fail. + */ + if (skc->skc_flags & KMC_SLAB) { + struct kmem_cache *slc = skc->skc_linux_cache; + + do { + obj = kmem_cache_alloc(slc, flags | __GFP_COMP); + } while ((obj == NULL) && !(flags & KM_NOSLEEP)); + + goto ret; + } + + local_irq_disable(); + +restart: + /* Safe to update per-cpu structure without lock, but + * in the restart case we must be careful to reacquire + * the local magazine since this may have changed + * when we need to grow the cache. */ + skm = skc->skc_mag[smp_processor_id()]; + ASSERT(skm->skm_magic == SKM_MAGIC); + + if (likely(skm->skm_avail)) { + /* Object available in CPU cache, use it */ + obj = skm->skm_objs[--skm->skm_avail]; + skm->skm_age = jiffies; + } else { + obj = spl_cache_refill(skc, skm, flags); + if (obj == NULL) + goto restart; + } + + local_irq_enable(); + ASSERT(obj); + ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); + +ret: + /* Pre-emptively migrate object to CPU L1 cache */ + if (obj) { + if (obj && skc->skc_ctor) + skc->skc_ctor(obj, skc->skc_private, flags); + else + prefetchw(obj); + } + + atomic_dec(&skc->skc_ref); + + return (obj); +} + +EXPORT_SYMBOL(spl_kmem_cache_alloc); + +/* + * Free an object back to the local per-cpu magazine, there is no + * guarantee that this is the same magazine the object was originally + * allocated from. We may need to flush entire from the magazine + * back to the slabs to make space. + */ +void +spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_magazine_t *skm; + unsigned long flags; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + atomic_inc(&skc->skc_ref); + + /* + * Run the destructor + */ + if (skc->skc_dtor) + skc->skc_dtor(obj, skc->skc_private); + + /* + * Free the object from the Linux underlying Linux slab. + */ + if (skc->skc_flags & KMC_SLAB) { + kmem_cache_free(skc->skc_linux_cache, obj); + goto out; + } + + /* + * Only virtual slabs may have emergency objects and these objects + * are guaranteed to have physical addresses. They must be removed + * from the tree of emergency objects and the freed. + */ + if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) { + spl_emergency_free(skc, obj); + goto out; + } + + local_irq_save(flags); + + /* Safe to update per-cpu structure without lock, but + * no remote memory allocation tracking is being performed + * it is entirely possible to allocate an object from one + * CPU cache and return it to another. */ + skm = skc->skc_mag[smp_processor_id()]; + ASSERT(skm->skm_magic == SKM_MAGIC); + + /* Per-CPU cache full, flush it to make space */ + if (unlikely(skm->skm_avail >= skm->skm_size)) + spl_cache_flush(skc, skm, skm->skm_refill); + + /* Available space in cache, use it */ + skm->skm_objs[skm->skm_avail++] = obj; + + local_irq_restore(flags); +out: + atomic_dec(&skc->skc_ref); +} +EXPORT_SYMBOL(spl_kmem_cache_free); + +/* + * The generic shrinker function for all caches. Under Linux a shrinker + * may not be tightly coupled with a slab cache. In fact Linux always + * systematically tries calling all registered shrinker callbacks which + * report that they contain unused objects. Because of this we only + * register one shrinker function in the shim layer for all slab caches. + * We always attempt to shrink all caches when this generic shrinker + * is called. + * + * If sc->nr_to_scan is zero, the caller is requesting a query of the + * number of objects which can potentially be freed. If it is nonzero, + * the request is to free that many objects. + * + * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks + * in struct shrinker and also require the shrinker to return the number + * of objects freed. + * + * Older kernels require the shrinker to return the number of freeable + * objects following the freeing of nr_to_free. + * + * Linux semantics differ from those under Solaris, which are to + * free all available objects which may (and probably will) be more + * objects than the requested nr_to_scan. + */ +static spl_shrinker_t +__spl_kmem_cache_generic_shrinker(struct shrinker *shrink, + struct shrink_control *sc) +{ + spl_kmem_cache_t *skc; + int alloc = 0; + + down_read(&spl_kmem_cache_sem); + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + if (sc->nr_to_scan) { +#ifdef HAVE_SPLIT_SHRINKER_CALLBACK + uint64_t oldalloc = skc->skc_obj_alloc; + spl_kmem_cache_reap_now(skc, + MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1)); + if (oldalloc > skc->skc_obj_alloc) + alloc += oldalloc - skc->skc_obj_alloc; +#else + spl_kmem_cache_reap_now(skc, + MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1)); + alloc += skc->skc_obj_alloc; +#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */ + } else { + /* Request to query number of freeable objects */ + alloc += skc->skc_obj_alloc; + } + } + up_read(&spl_kmem_cache_sem); + + /* + * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass. + * This functionality only exists to work around a rare issue where + * shrink_slabs() is repeatedly invoked by many cores causing the + * system to thrash. + */ + if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan) + return (SHRINK_STOP); + + return (MAX(alloc, 0)); +} + +SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker); + +/* + * Call the registered reclaim function for a cache. Depending on how + * many and which objects are released it may simply repopulate the + * local magazine which will then need to age-out. Objects which cannot + * fit in the magazine we will be released back to their slabs which will + * also need to age out before being release. This is all just best + * effort and we do not want to thrash creating and destroying slabs. + */ +void +spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) +{ + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + atomic_inc(&skc->skc_ref); + + /* + * Execute the registered reclaim callback if it exists. The + * per-cpu caches will be drained when is set KMC_EXPIRE_MEM. + */ + if (skc->skc_flags & KMC_SLAB) { + if (skc->skc_reclaim) + skc->skc_reclaim(skc->skc_private); + + if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) + kmem_cache_shrink(skc->skc_linux_cache); + + goto out; + } + + /* + * Prevent concurrent cache reaping when contended. + */ + if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) + goto out; + + /* + * When a reclaim function is available it may be invoked repeatedly + * until at least a single slab can be freed. This ensures that we + * do free memory back to the system. This helps minimize the chance + * of an OOM event when the bulk of memory is used by the slab. + * + * When free slabs are already available the reclaim callback will be + * skipped. Additionally, if no forward progress is detected despite + * a reclaim function the cache will be skipped to avoid deadlock. + * + * Longer term this would be the correct place to add the code which + * repacks the slabs in order minimize fragmentation. + */ + if (skc->skc_reclaim) { + uint64_t objects = UINT64_MAX; + int do_reclaim; + + do { + spin_lock(&skc->skc_lock); + do_reclaim = + (skc->skc_slab_total > 0) && + ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) && + (skc->skc_obj_alloc < objects); + + objects = skc->skc_obj_alloc; + spin_unlock(&skc->skc_lock); + + if (do_reclaim) + skc->skc_reclaim(skc->skc_private); + + } while (do_reclaim); + } + + /* Reclaim from the magazine then the slabs ignoring age and delay. */ + if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) { + spl_kmem_magazine_t *skm; + unsigned long irq_flags; + + local_irq_save(irq_flags); + skm = skc->skc_mag[smp_processor_id()]; + spl_cache_flush(skc, skm, skm->skm_avail); + local_irq_restore(irq_flags); + } + + spl_slab_reclaim(skc, count, 1); + clear_bit(KMC_BIT_REAPING, &skc->skc_flags); + smp_wmb(); + wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); +out: + atomic_dec(&skc->skc_ref); +} +EXPORT_SYMBOL(spl_kmem_cache_reap_now); + +/* + * Reap all free slabs from all registered caches. + */ +void +spl_kmem_reap(void) +{ + struct shrink_control sc; + + sc.nr_to_scan = KMC_REAP_CHUNK; + sc.gfp_mask = GFP_KERNEL; + + (void) __spl_kmem_cache_generic_shrinker(NULL, &sc); +} +EXPORT_SYMBOL(spl_kmem_reap); + +int +spl_kmem_cache_init(void) +{ + init_rwsem(&spl_kmem_cache_sem); + INIT_LIST_HEAD(&spl_kmem_cache_list); + spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", + 1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE); + spl_register_shrinker(&spl_kmem_cache_shrinker); + + return (0); +} + +void +spl_kmem_cache_fini(void) +{ + spl_unregister_shrinker(&spl_kmem_cache_shrinker); + taskq_destroy(spl_kmem_cache_taskq); +} diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c index 502f5365..075bf258 100644 --- a/module/spl/spl-kmem.c +++ b/module/spl/spl-kmem.c @@ -24,97 +24,9 @@ * Solaris Porting Layer (SPL) Kmem Implementation. \*****************************************************************************/ +#include #include -#include -#include - -/* - * Within the scope of spl-kmem.c file the kmem_cache_* definitions - * are removed to allow access to the real Linux slab allocator. - */ -#undef kmem_cache_destroy -#undef kmem_cache_create -#undef kmem_cache_alloc -#undef kmem_cache_free - - -/* - * Cache expiration was implemented because it was part of the default Solaris - * kmem_cache behavior. The idea is that per-cpu objects which haven't been - * accessed in several seconds should be returned to the cache. On the other - * hand Linux slabs never move objects back to the slabs unless there is - * memory pressure on the system. By default the Linux method is enabled - * because it has been shown to improve responsiveness on low memory systems. - * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM. - */ -unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM; -EXPORT_SYMBOL(spl_kmem_cache_expire); -module_param(spl_kmem_cache_expire, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)"); - -/* - * The default behavior is to report the number of objects remaining in the - * cache. This allows the Linux VM to repeatedly reclaim objects from the - * cache when memory is low satisfy other memory allocations. Alternately, - * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache - * is reclaimed. This may increase the likelihood of out of memory events. - */ -unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; -module_param(spl_kmem_cache_reclaim, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); - -unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; -module_param(spl_kmem_cache_obj_per_slab, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); - -unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN; -module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min, - "Minimal number of objects per slab"); - -unsigned int spl_kmem_cache_max_size = 32; -module_param(spl_kmem_cache_max_size, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); - -/* - * For small objects the Linux slab allocator should be used to make the most - * efficient use of the memory. However, large objects are not supported by - * the Linux slab and therefore the SPL implementation is preferred. A cutoff - * of 16K was determined to be optimal for architectures using 4K pages. - */ -#if PAGE_SIZE == 4096 -unsigned int spl_kmem_cache_slab_limit = 16384; -#else -unsigned int spl_kmem_cache_slab_limit = 0; -#endif -module_param(spl_kmem_cache_slab_limit, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_slab_limit, - "Objects less than N bytes use the Linux slab"); - -unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4); -module_param(spl_kmem_cache_kmem_limit, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, - "Objects less than N bytes use the kmalloc"); - -vmem_t *heap_arena = NULL; -EXPORT_SYMBOL(heap_arena); - -vmem_t *zio_alloc_arena = NULL; -EXPORT_SYMBOL(zio_alloc_arena); - -vmem_t *zio_arena = NULL; -EXPORT_SYMBOL(zio_arena); - -size_t -vmem_size(vmem_t *vmp, int typemask) -{ - ASSERT3P(vmp, ==, NULL); - ASSERT3S(typemask & VMEM_ALLOC, ==, VMEM_ALLOC); - ASSERT3S(typemask & VMEM_FREE, ==, VMEM_FREE); - - return (VMALLOC_TOTAL); -} -EXPORT_SYMBOL(vmem_size); +#include int kmem_debugging(void) @@ -195,19 +107,13 @@ EXPORT_SYMBOL(strfree); # ifdef HAVE_ATOMIC64_T atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); unsigned long long kmem_alloc_max = 0; -atomic64_t vmem_alloc_used = ATOMIC64_INIT(0); -unsigned long long vmem_alloc_max = 0; # else /* HAVE_ATOMIC64_T */ atomic_t kmem_alloc_used = ATOMIC_INIT(0); unsigned long long kmem_alloc_max = 0; -atomic_t vmem_alloc_used = ATOMIC_INIT(0); -unsigned long long vmem_alloc_max = 0; # endif /* HAVE_ATOMIC64_T */ EXPORT_SYMBOL(kmem_alloc_used); EXPORT_SYMBOL(kmem_alloc_max); -EXPORT_SYMBOL(vmem_alloc_used); -EXPORT_SYMBOL(vmem_alloc_max); /* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked * but also the location of every alloc and free. When the SPL module is @@ -225,9 +131,6 @@ EXPORT_SYMBOL(vmem_alloc_max); # define KMEM_HASH_BITS 10 # define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) -# define VMEM_HASH_BITS 10 -# define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS) - typedef struct kmem_debug { struct hlist_node kd_hlist; /* Hash node linkage */ struct list_head kd_list; /* List of all allocations */ @@ -241,18 +144,10 @@ spinlock_t kmem_lock; struct hlist_head kmem_table[KMEM_TABLE_SIZE]; struct list_head kmem_list; -spinlock_t vmem_lock; -struct hlist_head vmem_table[VMEM_TABLE_SIZE]; -struct list_head vmem_list; - EXPORT_SYMBOL(kmem_lock); EXPORT_SYMBOL(kmem_table); EXPORT_SYMBOL(kmem_list); -EXPORT_SYMBOL(vmem_lock); -EXPORT_SYMBOL(vmem_table); -EXPORT_SYMBOL(vmem_list); - static kmem_debug_t * kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr) { @@ -393,107 +288,6 @@ kmem_free_track(const void *ptr, size_t size) } EXPORT_SYMBOL(kmem_free_track); -void * -vmem_alloc_track(size_t size, int flags, const char *func, int line) -{ - void *ptr = NULL; - kmem_debug_t *dptr; - unsigned long irq_flags; - - ASSERT(flags & KM_SLEEP); - - /* Function may be called with KM_NOSLEEP so failure is possible */ - dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t), - flags & ~__GFP_ZERO); - if (unlikely(dptr == NULL)) { - printk(KERN_WARNING "debug vmem_alloc(%ld, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - sizeof(kmem_debug_t), flags, func, line, - vmem_alloc_used_read(), vmem_alloc_max); - } else { - /* - * We use __strdup() below because the string pointed to by - * __FUNCTION__ might not be available by the time we want - * to print it, since the module might have been unloaded. - * This can never fail because we have already asserted - * that flags is KM_SLEEP. - */ - dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO); - if (unlikely(dptr->kd_func == NULL)) { - kfree(dptr); - printk(KERN_WARNING "debug __strdup() at %s:%d " - "failed (%lld/%llu)\n", func, line, - vmem_alloc_used_read(), vmem_alloc_max); - goto out; - } - - /* Use the correct allocator */ - if (flags & __GFP_ZERO) { - ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO); - } else { - ptr = vmalloc_nofail(size, flags); - } - - if (unlikely(ptr == NULL)) { - kfree(dptr->kd_func); - kfree(dptr); - printk(KERN_WARNING "vmem_alloc (%llu, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - (unsigned long long) size, flags, func, line, - vmem_alloc_used_read(), vmem_alloc_max); - goto out; - } - - vmem_alloc_used_add(size); - if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) - vmem_alloc_max = vmem_alloc_used_read(); - - INIT_HLIST_NODE(&dptr->kd_hlist); - INIT_LIST_HEAD(&dptr->kd_list); - - dptr->kd_addr = ptr; - dptr->kd_size = size; - dptr->kd_line = line; - - spin_lock_irqsave(&vmem_lock, irq_flags); - hlist_add_head(&dptr->kd_hlist, - &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]); - list_add_tail(&dptr->kd_list, &vmem_list); - spin_unlock_irqrestore(&vmem_lock, irq_flags); - } -out: - return (ptr); -} -EXPORT_SYMBOL(vmem_alloc_track); - -void -vmem_free_track(const void *ptr, size_t size) -{ - kmem_debug_t *dptr; - - ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, - (unsigned long long) size); - - /* Must exist in hash due to vmem_alloc() */ - dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr); - ASSERT(dptr); - - /* Size must match */ - ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " - "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size, - (unsigned long long) size, dptr->kd_func, dptr->kd_line); - - vmem_alloc_used_sub(size); - kfree(dptr->kd_func); - - memset((void *)dptr, 0x5a, sizeof(kmem_debug_t)); - kfree(dptr); - - memset((void *)ptr, 0x5a, size); - vfree(ptr); -} -EXPORT_SYMBOL(vmem_free_track); - # else /* DEBUG_KMEM_TRACKING */ void * @@ -548,1682 +342,105 @@ kmem_free_debug(const void *ptr, size_t size) } EXPORT_SYMBOL(kmem_free_debug); -void * -vmem_alloc_debug(size_t size, int flags, const char *func, int line) -{ - void *ptr; - - ASSERT(flags & KM_SLEEP); - - /* Use the correct allocator */ - if (flags & __GFP_ZERO) { - ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO)); - } else { - ptr = vmalloc_nofail(size, flags); - } - - if (unlikely(ptr == NULL)) { - printk(KERN_WARNING - "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - (unsigned long long)vmem_alloc_used_read(), vmem_alloc_max); - } else { - vmem_alloc_used_add(size); - if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) - vmem_alloc_max = vmem_alloc_used_read(); - } - - return (ptr); -} -EXPORT_SYMBOL(vmem_alloc_debug); - -void -vmem_free_debug(const void *ptr, size_t size) -{ - ASSERT(ptr || size > 0); - vmem_alloc_used_sub(size); - vfree(ptr); -} -EXPORT_SYMBOL(vmem_free_debug); - # endif /* DEBUG_KMEM_TRACKING */ #endif /* DEBUG_KMEM */ -/* - * Slab allocation interfaces - * - * While the Linux slab implementation was inspired by the Solaris - * implementation I cannot use it to emulate the Solaris APIs. I - * require two features which are not provided by the Linux slab. - * - * 1) Constructors AND destructors. Recent versions of the Linux - * kernel have removed support for destructors. This is a deal - * breaker for the SPL which contains particularly expensive - * initializers for mutex's, condition variables, etc. We also - * require a minimal level of cleanup for these data types unlike - * many Linux data type which do need to be explicitly destroyed. - * - * 2) Virtual address space backed slab. Callers of the Solaris slab - * expect it to work well for both small are very large allocations. - * Because of memory fragmentation the Linux slab which is backed - * by kmalloc'ed memory performs very badly when confronted with - * large numbers of large allocations. Basing the slab on the - * virtual address space removes the need for contiguous pages - * and greatly improve performance for large allocations. - * - * For these reasons, the SPL has its own slab implementation with - * the needed features. It is not as highly optimized as either the - * Solaris or Linux slabs, but it should get me most of what is - * needed until it can be optimized or obsoleted by another approach. - * - * One serious concern I do have about this method is the relatively - * small virtual address space on 32bit arches. This will seriously - * constrain the size of the slab caches and their performance. - * - * XXX: Improve the partial slab list by carefully maintaining a - * strict ordering of fullest to emptiest slabs based on - * the slab reference count. This guarantees the when freeing - * slabs back to the system we need only linearly traverse the - * last N slabs in the list to discover all the freeable slabs. - * - * XXX: NUMA awareness for optionally allocating memory close to a - * particular core. This can be advantageous if you know the slab - * object will be short lived and primarily accessed from one core. - * - * XXX: Slab coloring may also yield performance improvements and would - * be desirable to implement. - */ - -struct list_head spl_kmem_cache_list; /* List of caches */ -struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ -taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */ - -static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); - -SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker); -SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, - spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS); - -static void * -kv_alloc(spl_kmem_cache_t *skc, int size, int flags) +#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) +static char * +spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) { - void *ptr; - - ASSERT(ISP2(size)); - - if (skc->skc_flags & KMC_KMEM) - ptr = (void *)__get_free_pages(flags | __GFP_COMP, - get_order(size)); - else - ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); - - /* Resulting allocated memory will be page aligned */ - ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; + int i, flag = 1; - return ptr; -} + ASSERT(str != NULL && len >= 17); + memset(str, 0, len); -static void -kv_free(spl_kmem_cache_t *skc, void *ptr, int size) -{ - ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); - ASSERT(ISP2(size)); + /* Check for a fully printable string, and while we are at + * it place the printable characters in the passed buffer. */ + for (i = 0; i < size; i++) { + str[i] = ((char *)(kd->kd_addr))[i]; + if (isprint(str[i])) { + continue; + } else { + /* Minimum number of printable characters found + * to make it worthwhile to print this as ascii. */ + if (i > min) + break; - /* - * The Linux direct reclaim path uses this out of band value to - * determine if forward progress is being made. Normally this is - * incremented by kmem_freepages() which is part of the various - * Linux slab implementations. However, since we are using none - * of that infrastructure we are responsible for incrementing it. - */ - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; + flag = 0; + break; + } + } - if (skc->skc_flags & KMC_KMEM) - free_pages((unsigned long)ptr, get_order(size)); - else - vfree(ptr); -} + if (!flag) { + sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", + *((uint8_t *)kd->kd_addr), + *((uint8_t *)kd->kd_addr + 2), + *((uint8_t *)kd->kd_addr + 4), + *((uint8_t *)kd->kd_addr + 6), + *((uint8_t *)kd->kd_addr + 8), + *((uint8_t *)kd->kd_addr + 10), + *((uint8_t *)kd->kd_addr + 12), + *((uint8_t *)kd->kd_addr + 14)); + } -/* - * Required space for each aligned sks. - */ -static inline uint32_t -spl_sks_size(spl_kmem_cache_t *skc) -{ - return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t), - skc->skc_obj_align, uint32_t); + return str; } -/* - * Required space for each aligned object. - */ -static inline uint32_t -spl_obj_size(spl_kmem_cache_t *skc) +static int +spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) { - uint32_t align = skc->skc_obj_align; + int i; - return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + - P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t); -} + spin_lock_init(lock); + INIT_LIST_HEAD(list); -/* - * Lookup the spl_kmem_object_t for an object given that object. - */ -static inline spl_kmem_obj_t * -spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) -{ - return obj + P2ROUNDUP_TYPED(skc->skc_obj_size, - skc->skc_obj_align, uint32_t); -} + for (i = 0; i < size; i++) + INIT_HLIST_HEAD(&kmem_table[i]); -/* - * Required space for each offslab object taking in to account alignment - * restrictions and the power-of-two requirement of kv_alloc(). - */ -static inline uint32_t -spl_offslab_size(spl_kmem_cache_t *skc) -{ - return 1UL << (fls64(spl_obj_size(skc)) + 1); + return (0); } -/* - * It's important that we pack the spl_kmem_obj_t structure and the - * actual objects in to one large address space to minimize the number - * of calls to the allocator. It is far better to do a few large - * allocations and then subdivide it ourselves. Now which allocator - * we use requires balancing a few trade offs. - * - * For small objects we use kmem_alloc() because as long as you are - * only requesting a small number of pages (ideally just one) its cheap. - * However, when you start requesting multiple pages with kmem_alloc() - * it gets increasingly expensive since it requires contiguous pages. - * For this reason we shift to vmem_alloc() for slabs of large objects - * which removes the need for contiguous pages. We do not use - * vmem_alloc() in all cases because there is significant locking - * overhead in __get_vm_area_node(). This function takes a single - * global lock when acquiring an available virtual address range which - * serializes all vmem_alloc()'s for all slab caches. Using slightly - * different allocation functions for small and large objects should - * give us the best of both worlds. - * - * KMC_ONSLAB KMC_OFFSLAB - * - * +------------------------+ +-----------------+ - * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+ - * | skc_obj_size <-+ | | +-----------------+ | | - * | spl_kmem_obj_t | | | | - * | skc_obj_size <---+ | +-----------------+ | | - * | spl_kmem_obj_t | | | skc_obj_size | <-+ | - * | ... v | | spl_kmem_obj_t | | - * +------------------------+ +-----------------+ v - */ -static spl_kmem_slab_t * -spl_slab_alloc(spl_kmem_cache_t *skc, int flags) +static void +spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) { - spl_kmem_slab_t *sks; - spl_kmem_obj_t *sko, *n; - void *base, *obj; - uint32_t obj_size, offslab_size = 0; - int i, rc = 0; - - base = kv_alloc(skc, skc->skc_slab_size, flags); - if (base == NULL) - return (NULL); - - sks = (spl_kmem_slab_t *)base; - sks->sks_magic = SKS_MAGIC; - sks->sks_objs = skc->skc_slab_objs; - sks->sks_age = jiffies; - sks->sks_cache = skc; - INIT_LIST_HEAD(&sks->sks_list); - INIT_LIST_HEAD(&sks->sks_free_list); - sks->sks_ref = 0; - obj_size = spl_obj_size(skc); - - if (skc->skc_flags & KMC_OFFSLAB) - offslab_size = spl_offslab_size(skc); - - for (i = 0; i < sks->sks_objs; i++) { - if (skc->skc_flags & KMC_OFFSLAB) { - obj = kv_alloc(skc, offslab_size, flags); - if (!obj) { - rc = -ENOMEM; - goto out; - } - } else { - obj = base + spl_sks_size(skc) + (i * obj_size); - } + unsigned long flags; + kmem_debug_t *kd; + char str[17]; - ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); - sko = spl_sko_from_obj(skc, obj); - sko->sko_addr = obj; - sko->sko_magic = SKO_MAGIC; - sko->sko_slab = sks; - INIT_LIST_HEAD(&sko->sko_list); - list_add_tail(&sko->sko_list, &sks->sks_free_list); - } + spin_lock_irqsave(lock, flags); + if (!list_empty(list)) + printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", + "size", "data", "func", "line"); -out: - if (rc) { - if (skc->skc_flags & KMC_OFFSLAB) - list_for_each_entry_safe(sko, n, &sks->sks_free_list, - sko_list) - kv_free(skc, sko->sko_addr, offslab_size); - - kv_free(skc, base, skc->skc_slab_size); - sks = NULL; - } + list_for_each_entry(kd, list, kd_list) + printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, + (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), + kd->kd_func, kd->kd_line); - return (sks); + spin_unlock_irqrestore(lock, flags); } +#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ +#define spl_kmem_init_tracking(list, lock, size) +#define spl_kmem_fini_tracking(list, lock) +#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ -/* - * Remove a slab from complete or partial list, it must be called with - * the 'skc->skc_lock' held but the actual free must be performed - * outside the lock to prevent deadlocking on vmem addresses. - */ -static void -spl_slab_free(spl_kmem_slab_t *sks, - struct list_head *sks_list, struct list_head *sko_list) +int +spl_kmem_init(void) { - spl_kmem_cache_t *skc; - - ASSERT(sks->sks_magic == SKS_MAGIC); - ASSERT(sks->sks_ref == 0); + int rc = 0; - skc = sks->sks_cache; - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(spin_is_locked(&skc->skc_lock)); +#ifdef DEBUG_KMEM + kmem_alloc_used_set(0); + spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); +#endif - /* - * Update slab/objects counters in the cache, then remove the - * slab from the skc->skc_partial_list. Finally add the slab - * and all its objects in to the private work lists where the - * destructors will be called and the memory freed to the system. - */ - skc->skc_obj_total -= sks->sks_objs; - skc->skc_slab_total--; - list_del(&sks->sks_list); - list_add(&sks->sks_list, sks_list); - list_splice_init(&sks->sks_free_list, sko_list); + return (rc); } -/* - * Traverses all the partial slabs attached to a cache and free those - * which which are currently empty, and have not been touched for - * skc_delay seconds to avoid thrashing. The count argument is - * passed to optionally cap the number of slabs reclaimed, a count - * of zero means try and reclaim everything. When flag is set we - * always free an available slab regardless of age. - */ -static void -spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag) +void +spl_kmem_fini(void) { - spl_kmem_slab_t *sks, *m; - spl_kmem_obj_t *sko, *n; - LIST_HEAD(sks_list); - LIST_HEAD(sko_list); - uint32_t size = 0; - int i = 0; - - /* - * Move empty slabs and objects which have not been touched in - * skc_delay seconds on to private lists to be freed outside - * the spin lock. This delay time is important to avoid thrashing - * however when flag is set the delay will not be used. - */ - spin_lock(&skc->skc_lock); - list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){ - /* - * All empty slabs are at the end of skc->skc_partial_list, - * therefore once a non-empty slab is found we can stop - * scanning. Additionally, stop when reaching the target - * reclaim 'count' if a non-zero threshold is given. - */ - if ((sks->sks_ref > 0) || (count && i >= count)) - break; - - if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) { - spl_slab_free(sks, &sks_list, &sko_list); - i++; - } - } - spin_unlock(&skc->skc_lock); - - /* - * The following two loops ensure all the object destructors are - * run, any offslab objects are freed, and the slabs themselves - * are freed. This is all done outside the skc->skc_lock since - * this allows the destructor to sleep, and allows us to perform - * a conditional reschedule when a freeing a large number of - * objects and slabs back to the system. - */ - if (skc->skc_flags & KMC_OFFSLAB) - size = spl_offslab_size(skc); - - list_for_each_entry_safe(sko, n, &sko_list, sko_list) { - ASSERT(sko->sko_magic == SKO_MAGIC); - - if (skc->skc_flags & KMC_OFFSLAB) - kv_free(skc, sko->sko_addr, size); - } - - list_for_each_entry_safe(sks, m, &sks_list, sks_list) { - ASSERT(sks->sks_magic == SKS_MAGIC); - kv_free(skc, sks, skc->skc_slab_size); - } -} - -static spl_kmem_emergency_t * -spl_emergency_search(struct rb_root *root, void *obj) -{ - struct rb_node *node = root->rb_node; - spl_kmem_emergency_t *ske; - unsigned long address = (unsigned long)obj; - - while (node) { - ske = container_of(node, spl_kmem_emergency_t, ske_node); - - if (address < (unsigned long)ske->ske_obj) - node = node->rb_left; - else if (address > (unsigned long)ske->ske_obj) - node = node->rb_right; - else - return ske; - } - - return NULL; -} - -static int -spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) -{ - struct rb_node **new = &(root->rb_node), *parent = NULL; - spl_kmem_emergency_t *ske_tmp; - unsigned long address = (unsigned long)ske->ske_obj; - - while (*new) { - ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); - - parent = *new; - if (address < (unsigned long)ske_tmp->ske_obj) - new = &((*new)->rb_left); - else if (address > (unsigned long)ske_tmp->ske_obj) - new = &((*new)->rb_right); - else - return 0; - } - - rb_link_node(&ske->ske_node, parent, new); - rb_insert_color(&ske->ske_node, root); - - return 1; -} - -/* - * Allocate a single emergency object and track it in a red black tree. - */ -static int -spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) -{ - spl_kmem_emergency_t *ske; - int empty; - - /* Last chance use a partial slab if one now exists */ - spin_lock(&skc->skc_lock); - empty = list_empty(&skc->skc_partial_list); - spin_unlock(&skc->skc_lock); - if (!empty) - return (-EEXIST); - - ske = kmalloc(sizeof(*ske), flags); - if (ske == NULL) - return (-ENOMEM); - - ske->ske_obj = kmalloc(skc->skc_obj_size, flags); - if (ske->ske_obj == NULL) { - kfree(ske); - return (-ENOMEM); - } - - spin_lock(&skc->skc_lock); - empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); - if (likely(empty)) { - skc->skc_obj_total++; - skc->skc_obj_emergency++; - if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) - skc->skc_obj_emergency_max = skc->skc_obj_emergency; - } - spin_unlock(&skc->skc_lock); - - if (unlikely(!empty)) { - kfree(ske->ske_obj); - kfree(ske); - return (-EINVAL); - } - - *obj = ske->ske_obj; - - return (0); -} - -/* - * Locate the passed object in the red black tree and free it. - */ -static int -spl_emergency_free(spl_kmem_cache_t *skc, void *obj) -{ - spl_kmem_emergency_t *ske; - - spin_lock(&skc->skc_lock); - ske = spl_emergency_search(&skc->skc_emergency_tree, obj); - if (likely(ske)) { - rb_erase(&ske->ske_node, &skc->skc_emergency_tree); - skc->skc_obj_emergency--; - skc->skc_obj_total--; - } - spin_unlock(&skc->skc_lock); - - if (unlikely(ske == NULL)) - return (-ENOENT); - - kfree(ske->ske_obj); - kfree(ske); - - return (0); -} - -/* - * Release objects from the per-cpu magazine back to their slab. The flush - * argument contains the max number of entries to remove from the magazine. - */ -static void -__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) -{ - int i, count = MIN(flush, skm->skm_avail); - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(skm->skm_magic == SKM_MAGIC); - ASSERT(spin_is_locked(&skc->skc_lock)); - - for (i = 0; i < count; i++) - spl_cache_shrink(skc, skm->skm_objs[i]); - - skm->skm_avail -= count; - memmove(skm->skm_objs, &(skm->skm_objs[count]), - sizeof(void *) * skm->skm_avail); -} - -static void -spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) -{ - spin_lock(&skc->skc_lock); - __spl_cache_flush(skc, skm, flush); - spin_unlock(&skc->skc_lock); -} - -static void -spl_magazine_age(void *data) -{ - spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; - spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; - - ASSERT(skm->skm_magic == SKM_MAGIC); - ASSERT(skm->skm_cpu == smp_processor_id()); - ASSERT(irqs_disabled()); - - /* There are no available objects or they are too young to age out */ - if ((skm->skm_avail == 0) || - time_before(jiffies, skm->skm_age + skc->skc_delay * HZ)) - return; - - /* - * Because we're executing in interrupt context we may have - * interrupted the holder of this lock. To avoid a potential - * deadlock return if the lock is contended. - */ - if (!spin_trylock(&skc->skc_lock)) - return; - - __spl_cache_flush(skc, skm, skm->skm_refill); - spin_unlock(&skc->skc_lock); -} - -/* - * Called regularly to keep a downward pressure on the cache. - * - * Objects older than skc->skc_delay seconds in the per-cpu magazines will - * be returned to the caches. This is done to prevent idle magazines from - * holding memory which could be better used elsewhere. The delay is - * present to prevent thrashing the magazine. - * - * The newly released objects may result in empty partial slabs. Those - * slabs should be released to the system. Otherwise moving the objects - * out of the magazines is just wasted work. - */ -static void -spl_cache_age(void *data) -{ - spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; - taskqid_t id = 0; - - ASSERT(skc->skc_magic == SKC_MAGIC); - - /* Dynamically disabled at run time */ - if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE)) - return; - - atomic_inc(&skc->skc_ref); - - if (!(skc->skc_flags & KMC_NOMAGAZINE)) - on_each_cpu(spl_magazine_age, skc, 1); - - spl_slab_reclaim(skc, skc->skc_reap, 0); - - while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) { - id = taskq_dispatch_delay( - spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP, - ddi_get_lbolt() + skc->skc_delay / 3 * HZ); - - /* Destroy issued after dispatch immediately cancel it */ - if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id) - taskq_cancel_id(spl_kmem_cache_taskq, id); - } - - spin_lock(&skc->skc_lock); - skc->skc_taskqid = id; - spin_unlock(&skc->skc_lock); - - atomic_dec(&skc->skc_ref); -} - -/* - * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. - * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, - * for very small objects we may end up with more than this so as not - * to waste space in the minimal allocation of a single page. Also for - * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min, - * lower than this and we will fail. - */ -static int -spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) -{ - uint32_t sks_size, obj_size, max_size; - - if (skc->skc_flags & KMC_OFFSLAB) { - *objs = spl_kmem_cache_obj_per_slab; - *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), PAGE_SIZE); - return (0); - } else { - sks_size = spl_sks_size(skc); - obj_size = spl_obj_size(skc); - - if (skc->skc_flags & KMC_KMEM) - max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE; - else - max_size = (spl_kmem_cache_max_size * 1024 * 1024); - - /* Power of two sized slab */ - for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) { - *objs = (*size - sks_size) / obj_size; - if (*objs >= spl_kmem_cache_obj_per_slab) - return (0); - } - - /* - * Unable to satisfy target objects per slab, fall back to - * allocating a maximally sized slab and assuming it can - * contain the minimum objects count use it. If not fail. - */ - *size = max_size; - *objs = (*size - sks_size) / obj_size; - if (*objs >= (spl_kmem_cache_obj_per_slab_min)) - return (0); - } - - return (-ENOSPC); -} - -/* - * Make a guess at reasonable per-cpu magazine size based on the size of - * each object and the cost of caching N of them in each magazine. Long - * term this should really adapt based on an observed usage heuristic. - */ -static int -spl_magazine_size(spl_kmem_cache_t *skc) -{ - uint32_t obj_size = spl_obj_size(skc); - int size; - - /* Per-magazine sizes below assume a 4Kib page size */ - if (obj_size > (PAGE_SIZE * 256)) - size = 4; /* Minimum 4Mib per-magazine */ - else if (obj_size > (PAGE_SIZE * 32)) - size = 16; /* Minimum 2Mib per-magazine */ - else if (obj_size > (PAGE_SIZE)) - size = 64; /* Minimum 256Kib per-magazine */ - else if (obj_size > (PAGE_SIZE / 4)) - size = 128; /* Minimum 128Kib per-magazine */ - else - size = 256; - - return (size); -} - -/* - * Allocate a per-cpu magazine to associate with a specific core. - */ -static spl_kmem_magazine_t * -spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) -{ - spl_kmem_magazine_t *skm; - int size = sizeof(spl_kmem_magazine_t) + - sizeof(void *) * skc->skc_mag_size; - - skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu)); - if (skm) { - skm->skm_magic = SKM_MAGIC; - skm->skm_avail = 0; - skm->skm_size = skc->skc_mag_size; - skm->skm_refill = skc->skc_mag_refill; - skm->skm_cache = skc; - skm->skm_age = jiffies; - skm->skm_cpu = cpu; - } - - return (skm); -} - -/* - * Free a per-cpu magazine associated with a specific core. - */ -static void -spl_magazine_free(spl_kmem_magazine_t *skm) -{ - int size = sizeof(spl_kmem_magazine_t) + - sizeof(void *) * skm->skm_size; - - ASSERT(skm->skm_magic == SKM_MAGIC); - ASSERT(skm->skm_avail == 0); - - kmem_free(skm, size); -} - -/* - * Create all pre-cpu magazines of reasonable sizes. - */ -static int -spl_magazine_create(spl_kmem_cache_t *skc) -{ - int i; - - if (skc->skc_flags & KMC_NOMAGAZINE) - return (0); - - skc->skc_mag_size = spl_magazine_size(skc); - skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; - - for_each_online_cpu(i) { - skc->skc_mag[i] = spl_magazine_alloc(skc, i); - if (!skc->skc_mag[i]) { - for (i--; i >= 0; i--) - spl_magazine_free(skc->skc_mag[i]); - - return (-ENOMEM); - } - } - - return (0); -} - -/* - * Destroy all pre-cpu magazines. - */ -static void -spl_magazine_destroy(spl_kmem_cache_t *skc) -{ - spl_kmem_magazine_t *skm; - int i; - - if (skc->skc_flags & KMC_NOMAGAZINE) - return; - - for_each_online_cpu(i) { - skm = skc->skc_mag[i]; - spl_cache_flush(skc, skm, skm->skm_avail); - spl_magazine_free(skm); - } -} - -/* - * Create a object cache based on the following arguments: - * name cache name - * size cache object size - * align cache object alignment - * ctor cache object constructor - * dtor cache object destructor - * reclaim cache object reclaim - * priv cache private data for ctor/dtor/reclaim - * vmp unused must be NULL - * flags - * KMC_NOTOUCH Disable cache object aging (unsupported) - * KMC_NODEBUG Disable debugging (unsupported) - * KMC_NOHASH Disable hashing (unsupported) - * KMC_QCACHE Disable qcache (unsupported) - * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab - * KMC_KMEM Force kmem backed cache - * KMC_VMEM Force vmem backed cache - * KMC_SLAB Force Linux slab backed cache - * KMC_OFFSLAB Locate objects off the slab - */ -spl_kmem_cache_t * -spl_kmem_cache_create(char *name, size_t size, size_t align, - spl_kmem_ctor_t ctor, - spl_kmem_dtor_t dtor, - spl_kmem_reclaim_t reclaim, - void *priv, void *vmp, int flags) -{ - spl_kmem_cache_t *skc; - int rc; - - /* - * Unsupported flags - */ - ASSERT0(flags & KMC_NOMAGAZINE); - ASSERT0(flags & KMC_NOHASH); - ASSERT0(flags & KMC_QCACHE); - ASSERT(vmp == NULL); - - might_sleep(); - - /* - * Allocate memory for a new cache an initialize it. Unfortunately, - * this usually ends up being a large allocation of ~32k because - * we need to allocate enough memory for the worst case number of - * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we - * explicitly pass KM_NODEBUG to suppress the kmem warning - */ - skc = kmem_zalloc(sizeof(*skc), KM_SLEEP| KM_NODEBUG); - if (skc == NULL) - return (NULL); - - skc->skc_magic = SKC_MAGIC; - skc->skc_name_size = strlen(name) + 1; - skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP); - if (skc->skc_name == NULL) { - kmem_free(skc, sizeof(*skc)); - return (NULL); - } - strncpy(skc->skc_name, name, skc->skc_name_size); - - skc->skc_ctor = ctor; - skc->skc_dtor = dtor; - skc->skc_reclaim = reclaim; - skc->skc_private = priv; - skc->skc_vmp = vmp; - skc->skc_linux_cache = NULL; - skc->skc_flags = flags; - skc->skc_obj_size = size; - skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; - skc->skc_delay = SPL_KMEM_CACHE_DELAY; - skc->skc_reap = SPL_KMEM_CACHE_REAP; - atomic_set(&skc->skc_ref, 0); - - INIT_LIST_HEAD(&skc->skc_list); - INIT_LIST_HEAD(&skc->skc_complete_list); - INIT_LIST_HEAD(&skc->skc_partial_list); - skc->skc_emergency_tree = RB_ROOT; - spin_lock_init(&skc->skc_lock); - init_waitqueue_head(&skc->skc_waitq); - skc->skc_slab_fail = 0; - skc->skc_slab_create = 0; - skc->skc_slab_destroy = 0; - skc->skc_slab_total = 0; - skc->skc_slab_alloc = 0; - skc->skc_slab_max = 0; - skc->skc_obj_total = 0; - skc->skc_obj_alloc = 0; - skc->skc_obj_max = 0; - skc->skc_obj_deadlock = 0; - skc->skc_obj_emergency = 0; - skc->skc_obj_emergency_max = 0; - - /* - * Verify the requested alignment restriction is sane. - */ - if (align) { - VERIFY(ISP2(align)); - VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); - VERIFY3U(align, <=, PAGE_SIZE); - skc->skc_obj_align = align; - } - - /* - * When no specific type of slab is requested (kmem, vmem, or - * linuxslab) then select a cache type based on the object size - * and default tunables. - */ - if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) { - - /* - * Objects smaller than spl_kmem_cache_slab_limit can - * use the Linux slab for better space-efficiency. By - * default this functionality is disabled until its - * performance characters are fully understood. - */ - if (spl_kmem_cache_slab_limit && - size <= (size_t)spl_kmem_cache_slab_limit) - skc->skc_flags |= KMC_SLAB; - - /* - * Small objects, less than spl_kmem_cache_kmem_limit per - * object should use kmem because their slabs are small. - */ - else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit) - skc->skc_flags |= KMC_KMEM; - - /* - * All other objects are considered large and are placed - * on vmem backed slabs. - */ - else - skc->skc_flags |= KMC_VMEM; - } - - /* - * Given the type of slab allocate the required resources. - */ - if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { - rc = spl_slab_size(skc, - &skc->skc_slab_objs, &skc->skc_slab_size); - if (rc) - goto out; - - rc = spl_magazine_create(skc); - if (rc) - goto out; - } else { - skc->skc_linux_cache = kmem_cache_create( - skc->skc_name, size, align, 0, NULL); - if (skc->skc_linux_cache == NULL) { - rc = ENOMEM; - goto out; - } - - kmem_cache_set_allocflags(skc, __GFP_COMP); - skc->skc_flags |= KMC_NOMAGAZINE; - } - - if (spl_kmem_cache_expire & KMC_EXPIRE_AGE) - skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq, - spl_cache_age, skc, TQ_SLEEP, - ddi_get_lbolt() + skc->skc_delay / 3 * HZ); - - down_write(&spl_kmem_cache_sem); - list_add_tail(&skc->skc_list, &spl_kmem_cache_list); - up_write(&spl_kmem_cache_sem); - - return (skc); -out: - kmem_free(skc->skc_name, skc->skc_name_size); - kmem_free(skc, sizeof(*skc)); - return (NULL); -} -EXPORT_SYMBOL(spl_kmem_cache_create); - -/* - * Register a move callback to for cache defragmentation. - * XXX: Unimplemented but harmless to stub out for now. - */ -void -spl_kmem_cache_set_move(spl_kmem_cache_t *skc, - kmem_cbrc_t (move)(void *, void *, size_t, void *)) -{ - ASSERT(move != NULL); -} -EXPORT_SYMBOL(spl_kmem_cache_set_move); - -/* - * Destroy a cache and all objects associated with the cache. - */ -void -spl_kmem_cache_destroy(spl_kmem_cache_t *skc) -{ - DECLARE_WAIT_QUEUE_HEAD(wq); - taskqid_t id; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB)); - - down_write(&spl_kmem_cache_sem); - list_del_init(&skc->skc_list); - up_write(&spl_kmem_cache_sem); - - /* Cancel any and wait for any pending delayed tasks */ - VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - - spin_lock(&skc->skc_lock); - id = skc->skc_taskqid; - spin_unlock(&skc->skc_lock); - - taskq_cancel_id(spl_kmem_cache_taskq, id); - - /* Wait until all current callers complete, this is mainly - * to catch the case where a low memory situation triggers a - * cache reaping action which races with this destroy. */ - wait_event(wq, atomic_read(&skc->skc_ref) == 0); - - if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { - spl_magazine_destroy(skc); - spl_slab_reclaim(skc, 0, 1); - } else { - ASSERT(skc->skc_flags & KMC_SLAB); - kmem_cache_destroy(skc->skc_linux_cache); - } - - spin_lock(&skc->skc_lock); - - /* Validate there are no objects in use and free all the - * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */ - ASSERT3U(skc->skc_slab_alloc, ==, 0); - ASSERT3U(skc->skc_obj_alloc, ==, 0); - ASSERT3U(skc->skc_slab_total, ==, 0); - ASSERT3U(skc->skc_obj_total, ==, 0); - ASSERT3U(skc->skc_obj_emergency, ==, 0); - ASSERT(list_empty(&skc->skc_complete_list)); - - kmem_free(skc->skc_name, skc->skc_name_size); - spin_unlock(&skc->skc_lock); - - kmem_free(skc, sizeof(*skc)); -} -EXPORT_SYMBOL(spl_kmem_cache_destroy); - -/* - * Allocate an object from a slab attached to the cache. This is used to - * repopulate the per-cpu magazine caches in batches when they run low. - */ -static void * -spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) -{ - spl_kmem_obj_t *sko; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(sks->sks_magic == SKS_MAGIC); - ASSERT(spin_is_locked(&skc->skc_lock)); - - sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); - ASSERT(sko->sko_magic == SKO_MAGIC); - ASSERT(sko->sko_addr != NULL); - - /* Remove from sks_free_list */ - list_del_init(&sko->sko_list); - - sks->sks_age = jiffies; - sks->sks_ref++; - skc->skc_obj_alloc++; - - /* Track max obj usage statistics */ - if (skc->skc_obj_alloc > skc->skc_obj_max) - skc->skc_obj_max = skc->skc_obj_alloc; - - /* Track max slab usage statistics */ - if (sks->sks_ref == 1) { - skc->skc_slab_alloc++; - - if (skc->skc_slab_alloc > skc->skc_slab_max) - skc->skc_slab_max = skc->skc_slab_alloc; - } - - return sko->sko_addr; -} - -/* - * Generic slab allocation function to run by the global work queues. - * It is responsible for allocating a new slab, linking it in to the list - * of partial slabs, and then waking any waiters. - */ -static void -spl_cache_grow_work(void *data) -{ - spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; - spl_kmem_cache_t *skc = ska->ska_cache; - spl_kmem_slab_t *sks; - - sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG); - spin_lock(&skc->skc_lock); - if (sks) { - skc->skc_slab_total++; - skc->skc_obj_total += sks->sks_objs; - list_add_tail(&sks->sks_list, &skc->skc_partial_list); - } - - atomic_dec(&skc->skc_ref); - clear_bit(KMC_BIT_GROWING, &skc->skc_flags); - clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); - wake_up_all(&skc->skc_waitq); - spin_unlock(&skc->skc_lock); - - kfree(ska); -} - -/* - * Returns non-zero when a new slab should be available. - */ -static int -spl_cache_grow_wait(spl_kmem_cache_t *skc) -{ - return !test_bit(KMC_BIT_GROWING, &skc->skc_flags); -} - -/* - * No available objects on any slabs, create a new slab. Note that this - * functionality is disabled for KMC_SLAB caches which are backed by the - * Linux slab. - */ -static int -spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) -{ - int remaining, rc; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT((skc->skc_flags & KMC_SLAB) == 0); - might_sleep(); - *obj = NULL; - - /* - * Before allocating a new slab wait for any reaping to complete and - * then return so the local magazine can be rechecked for new objects. - */ - if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { - rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING, - TASK_UNINTERRUPTIBLE); - return (rc ? rc : -EAGAIN); - } - - /* - * This is handled by dispatching a work request to the global work - * queue. This allows us to asynchronously allocate a new slab while - * retaining the ability to safely fall back to a smaller synchronous - * allocations to ensure forward progress is always maintained. - */ - if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { - spl_kmem_alloc_t *ska; - - ska = kmalloc(sizeof(*ska), flags); - if (ska == NULL) { - clear_bit(KMC_BIT_GROWING, &skc->skc_flags); - wake_up_all(&skc->skc_waitq); - return (-ENOMEM); - } - - atomic_inc(&skc->skc_ref); - ska->ska_cache = skc; - ska->ska_flags = flags & ~__GFP_FS; - taskq_init_ent(&ska->ska_tqe); - taskq_dispatch_ent(spl_kmem_cache_taskq, - spl_cache_grow_work, ska, 0, &ska->ska_tqe); - } - - /* - * The goal here is to only detect the rare case where a virtual slab - * allocation has deadlocked. We must be careful to minimize the use - * of emergency objects which are more expensive to track. Therefore, - * we set a very long timeout for the asynchronous allocation and if - * the timeout is reached the cache is flagged as deadlocked. From - * this point only new emergency objects will be allocated until the - * asynchronous allocation completes and clears the deadlocked flag. - */ - if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) { - rc = spl_emergency_alloc(skc, flags, obj); - } else { - remaining = wait_event_timeout(skc->skc_waitq, - spl_cache_grow_wait(skc), HZ); - - if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) { - spin_lock(&skc->skc_lock); - if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { - set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); - skc->skc_obj_deadlock++; - } - spin_unlock(&skc->skc_lock); - } - - rc = -ENOMEM; - } - - return (rc); -} - -/* - * Refill a per-cpu magazine with objects from the slabs for this cache. - * Ideally the magazine can be repopulated using existing objects which have - * been released, however if we are unable to locate enough free objects new - * slabs of objects will be created. On success NULL is returned, otherwise - * the address of a single emergency object is returned for use by the caller. - */ -static void * -spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) -{ - spl_kmem_slab_t *sks; - int count = 0, rc, refill; - void *obj = NULL; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(skm->skm_magic == SKM_MAGIC); - - refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); - spin_lock(&skc->skc_lock); - - while (refill > 0) { - /* No slabs available we may need to grow the cache */ - if (list_empty(&skc->skc_partial_list)) { - spin_unlock(&skc->skc_lock); - - local_irq_enable(); - rc = spl_cache_grow(skc, flags, &obj); - local_irq_disable(); - - /* Emergency object for immediate use by caller */ - if (rc == 0 && obj != NULL) - return (obj); - - if (rc) - goto out; - - /* Rescheduled to different CPU skm is not local */ - if (skm != skc->skc_mag[smp_processor_id()]) - goto out; - - /* Potentially rescheduled to the same CPU but - * allocations may have occurred from this CPU while - * we were sleeping so recalculate max refill. */ - refill = MIN(refill, skm->skm_size - skm->skm_avail); - - spin_lock(&skc->skc_lock); - continue; - } - - /* Grab the next available slab */ - sks = list_entry((&skc->skc_partial_list)->next, - spl_kmem_slab_t, sks_list); - ASSERT(sks->sks_magic == SKS_MAGIC); - ASSERT(sks->sks_ref < sks->sks_objs); - ASSERT(!list_empty(&sks->sks_free_list)); - - /* Consume as many objects as needed to refill the requested - * cache. We must also be careful not to overfill it. */ - while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) { - ASSERT(skm->skm_avail < skm->skm_size); - ASSERT(count < skm->skm_size); - skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks); - } - - /* Move slab to skc_complete_list when full */ - if (sks->sks_ref == sks->sks_objs) { - list_del(&sks->sks_list); - list_add(&sks->sks_list, &skc->skc_complete_list); - } - } - - spin_unlock(&skc->skc_lock); -out: - return (NULL); -} - -/* - * Release an object back to the slab from which it came. - */ -static void -spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) -{ - spl_kmem_slab_t *sks = NULL; - spl_kmem_obj_t *sko = NULL; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(spin_is_locked(&skc->skc_lock)); - - sko = spl_sko_from_obj(skc, obj); - ASSERT(sko->sko_magic == SKO_MAGIC); - sks = sko->sko_slab; - ASSERT(sks->sks_magic == SKS_MAGIC); - ASSERT(sks->sks_cache == skc); - list_add(&sko->sko_list, &sks->sks_free_list); - - sks->sks_age = jiffies; - sks->sks_ref--; - skc->skc_obj_alloc--; - - /* Move slab to skc_partial_list when no longer full. Slabs - * are added to the head to keep the partial list is quasi-full - * sorted order. Fuller at the head, emptier at the tail. */ - if (sks->sks_ref == (sks->sks_objs - 1)) { - list_del(&sks->sks_list); - list_add(&sks->sks_list, &skc->skc_partial_list); - } - - /* Move empty slabs to the end of the partial list so - * they can be easily found and freed during reclamation. */ - if (sks->sks_ref == 0) { - list_del(&sks->sks_list); - list_add_tail(&sks->sks_list, &skc->skc_partial_list); - skc->skc_slab_alloc--; - } -} - -/* - * Allocate an object from the per-cpu magazine, or if the magazine - * is empty directly allocate from a slab and repopulate the magazine. - */ -void * -spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) -{ - spl_kmem_magazine_t *skm; - void *obj = NULL; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - ASSERT(flags & KM_SLEEP); - - atomic_inc(&skc->skc_ref); - - /* - * Allocate directly from a Linux slab. All optimizations are left - * to the underlying cache we only need to guarantee that KM_SLEEP - * callers will never fail. - */ - if (skc->skc_flags & KMC_SLAB) { - struct kmem_cache *slc = skc->skc_linux_cache; - - do { - obj = kmem_cache_alloc(slc, flags | __GFP_COMP); - } while ((obj == NULL) && !(flags & KM_NOSLEEP)); - - goto ret; - } - - local_irq_disable(); - -restart: - /* Safe to update per-cpu structure without lock, but - * in the restart case we must be careful to reacquire - * the local magazine since this may have changed - * when we need to grow the cache. */ - skm = skc->skc_mag[smp_processor_id()]; - ASSERT(skm->skm_magic == SKM_MAGIC); - - if (likely(skm->skm_avail)) { - /* Object available in CPU cache, use it */ - obj = skm->skm_objs[--skm->skm_avail]; - skm->skm_age = jiffies; - } else { - obj = spl_cache_refill(skc, skm, flags); - if (obj == NULL) - goto restart; - } - - local_irq_enable(); - ASSERT(obj); - ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); - -ret: - /* Pre-emptively migrate object to CPU L1 cache */ - if (obj) { - if (obj && skc->skc_ctor) - skc->skc_ctor(obj, skc->skc_private, flags); - else - prefetchw(obj); - } - - atomic_dec(&skc->skc_ref); - - return (obj); -} - -EXPORT_SYMBOL(spl_kmem_cache_alloc); - -/* - * Free an object back to the local per-cpu magazine, there is no - * guarantee that this is the same magazine the object was originally - * allocated from. We may need to flush entire from the magazine - * back to the slabs to make space. - */ -void -spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) -{ - spl_kmem_magazine_t *skm; - unsigned long flags; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - atomic_inc(&skc->skc_ref); - - /* - * Run the destructor - */ - if (skc->skc_dtor) - skc->skc_dtor(obj, skc->skc_private); - - /* - * Free the object from the Linux underlying Linux slab. - */ - if (skc->skc_flags & KMC_SLAB) { - kmem_cache_free(skc->skc_linux_cache, obj); - goto out; - } - - /* - * Only virtual slabs may have emergency objects and these objects - * are guaranteed to have physical addresses. They must be removed - * from the tree of emergency objects and the freed. - */ - if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) { - spl_emergency_free(skc, obj); - goto out; - } - - local_irq_save(flags); - - /* Safe to update per-cpu structure without lock, but - * no remote memory allocation tracking is being performed - * it is entirely possible to allocate an object from one - * CPU cache and return it to another. */ - skm = skc->skc_mag[smp_processor_id()]; - ASSERT(skm->skm_magic == SKM_MAGIC); - - /* Per-CPU cache full, flush it to make space */ - if (unlikely(skm->skm_avail >= skm->skm_size)) - spl_cache_flush(skc, skm, skm->skm_refill); - - /* Available space in cache, use it */ - skm->skm_objs[skm->skm_avail++] = obj; - - local_irq_restore(flags); -out: - atomic_dec(&skc->skc_ref); -} -EXPORT_SYMBOL(spl_kmem_cache_free); - -/* - * The generic shrinker function for all caches. Under Linux a shrinker - * may not be tightly coupled with a slab cache. In fact Linux always - * systematically tries calling all registered shrinker callbacks which - * report that they contain unused objects. Because of this we only - * register one shrinker function in the shim layer for all slab caches. - * We always attempt to shrink all caches when this generic shrinker - * is called. - * - * If sc->nr_to_scan is zero, the caller is requesting a query of the - * number of objects which can potentially be freed. If it is nonzero, - * the request is to free that many objects. - * - * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks - * in struct shrinker and also require the shrinker to return the number - * of objects freed. - * - * Older kernels require the shrinker to return the number of freeable - * objects following the freeing of nr_to_free. - * - * Linux semantics differ from those under Solaris, which are to - * free all available objects which may (and probably will) be more - * objects than the requested nr_to_scan. - */ -static spl_shrinker_t -__spl_kmem_cache_generic_shrinker(struct shrinker *shrink, - struct shrink_control *sc) -{ - spl_kmem_cache_t *skc; - int alloc = 0; - - down_read(&spl_kmem_cache_sem); - list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { - if (sc->nr_to_scan) { -#ifdef HAVE_SPLIT_SHRINKER_CALLBACK - uint64_t oldalloc = skc->skc_obj_alloc; - spl_kmem_cache_reap_now(skc, - MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1)); - if (oldalloc > skc->skc_obj_alloc) - alloc += oldalloc - skc->skc_obj_alloc; -#else - spl_kmem_cache_reap_now(skc, - MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1)); - alloc += skc->skc_obj_alloc; -#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */ - } else { - /* Request to query number of freeable objects */ - alloc += skc->skc_obj_alloc; - } - } - up_read(&spl_kmem_cache_sem); - - /* - * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass. - * This functionality only exists to work around a rare issue where - * shrink_slabs() is repeatedly invoked by many cores causing the - * system to thrash. - */ - if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan) - return (SHRINK_STOP); - - return (MAX(alloc, 0)); -} - -SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker); - -/* - * Call the registered reclaim function for a cache. Depending on how - * many and which objects are released it may simply repopulate the - * local magazine which will then need to age-out. Objects which cannot - * fit in the magazine we will be released back to their slabs which will - * also need to age out before being release. This is all just best - * effort and we do not want to thrash creating and destroying slabs. - */ -void -spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) -{ - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - - atomic_inc(&skc->skc_ref); - - /* - * Execute the registered reclaim callback if it exists. The - * per-cpu caches will be drained when is set KMC_EXPIRE_MEM. - */ - if (skc->skc_flags & KMC_SLAB) { - if (skc->skc_reclaim) - skc->skc_reclaim(skc->skc_private); - - if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) - kmem_cache_shrink(skc->skc_linux_cache); - - goto out; - } - - /* - * Prevent concurrent cache reaping when contended. - */ - if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) - goto out; - - /* - * When a reclaim function is available it may be invoked repeatedly - * until at least a single slab can be freed. This ensures that we - * do free memory back to the system. This helps minimize the chance - * of an OOM event when the bulk of memory is used by the slab. - * - * When free slabs are already available the reclaim callback will be - * skipped. Additionally, if no forward progress is detected despite - * a reclaim function the cache will be skipped to avoid deadlock. - * - * Longer term this would be the correct place to add the code which - * repacks the slabs in order minimize fragmentation. - */ - if (skc->skc_reclaim) { - uint64_t objects = UINT64_MAX; - int do_reclaim; - - do { - spin_lock(&skc->skc_lock); - do_reclaim = - (skc->skc_slab_total > 0) && - ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) && - (skc->skc_obj_alloc < objects); - - objects = skc->skc_obj_alloc; - spin_unlock(&skc->skc_lock); - - if (do_reclaim) - skc->skc_reclaim(skc->skc_private); - - } while (do_reclaim); - } - - /* Reclaim from the magazine then the slabs ignoring age and delay. */ - if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) { - spl_kmem_magazine_t *skm; - unsigned long irq_flags; - - local_irq_save(irq_flags); - skm = skc->skc_mag[smp_processor_id()]; - spl_cache_flush(skc, skm, skm->skm_avail); - local_irq_restore(irq_flags); - } - - spl_slab_reclaim(skc, count, 1); - clear_bit(KMC_BIT_REAPING, &skc->skc_flags); - smp_wmb(); - wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); -out: - atomic_dec(&skc->skc_ref); -} -EXPORT_SYMBOL(spl_kmem_cache_reap_now); - -/* - * Reap all free slabs from all registered caches. - */ -void -spl_kmem_reap(void) -{ - struct shrink_control sc; - - sc.nr_to_scan = KMC_REAP_CHUNK; - sc.gfp_mask = GFP_KERNEL; - - (void) __spl_kmem_cache_generic_shrinker(NULL, &sc); -} -EXPORT_SYMBOL(spl_kmem_reap); - -#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) -static char * -spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) -{ - int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; - int i, flag = 1; - - ASSERT(str != NULL && len >= 17); - memset(str, 0, len); - - /* Check for a fully printable string, and while we are at - * it place the printable characters in the passed buffer. */ - for (i = 0; i < size; i++) { - str[i] = ((char *)(kd->kd_addr))[i]; - if (isprint(str[i])) { - continue; - } else { - /* Minimum number of printable characters found - * to make it worthwhile to print this as ascii. */ - if (i > min) - break; - - flag = 0; - break; - } - } - - if (!flag) { - sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", - *((uint8_t *)kd->kd_addr), - *((uint8_t *)kd->kd_addr + 2), - *((uint8_t *)kd->kd_addr + 4), - *((uint8_t *)kd->kd_addr + 6), - *((uint8_t *)kd->kd_addr + 8), - *((uint8_t *)kd->kd_addr + 10), - *((uint8_t *)kd->kd_addr + 12), - *((uint8_t *)kd->kd_addr + 14)); - } - - return str; -} - -static int -spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) -{ - int i; - - spin_lock_init(lock); - INIT_LIST_HEAD(list); - - for (i = 0; i < size; i++) - INIT_HLIST_HEAD(&kmem_table[i]); - - return (0); -} - -static void -spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) -{ - unsigned long flags; - kmem_debug_t *kd; - char str[17]; - - spin_lock_irqsave(lock, flags); - if (!list_empty(list)) - printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", - "size", "data", "func", "line"); - - list_for_each_entry(kd, list, kd_list) - printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, - (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), - kd->kd_func, kd->kd_line); - - spin_unlock_irqrestore(lock, flags); -} -#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ -#define spl_kmem_init_tracking(list, lock, size) -#define spl_kmem_fini_tracking(list, lock) -#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ - -int -spl_kmem_init(void) -{ - int rc = 0; - -#ifdef DEBUG_KMEM - kmem_alloc_used_set(0); - vmem_alloc_used_set(0); - - spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); - spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE); -#endif - - init_rwsem(&spl_kmem_cache_sem); - INIT_LIST_HEAD(&spl_kmem_cache_list); - spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", - 1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE); - - spl_register_shrinker(&spl_kmem_cache_shrinker); - - return (rc); -} - -void -spl_kmem_fini(void) -{ - spl_unregister_shrinker(&spl_kmem_cache_shrinker); - taskq_destroy(spl_kmem_cache_taskq); - #ifdef DEBUG_KMEM /* Display all unreclaimed memory addresses, including the * allocation size and the first few bytes of what's located @@ -2233,11 +450,6 @@ spl_kmem_fini(void) printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", kmem_alloc_used_read(), kmem_alloc_max); - if (vmem_alloc_used_read() != 0) - printk(KERN_WARNING "vmem leaked %ld/%llu bytes\n", - vmem_alloc_used_read(), vmem_alloc_max); - spl_kmem_fini_tracking(&kmem_list, &kmem_lock); - spl_kmem_fini_tracking(&vmem_list, &vmem_lock); #endif /* DEBUG_KMEM */ } diff --git a/module/spl/spl-kstat.c b/module/spl/spl-kstat.c index cb27ed3d..e8917a3e 100644 --- a/module/spl/spl-kstat.c +++ b/module/spl/spl-kstat.c @@ -26,6 +26,7 @@ #include #include +#include #ifndef HAVE_PDE_DATA #define PDE_DATA(x) (PDE(x)->data) diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c index 137af718..e5712aee 100644 --- a/module/spl/spl-proc.c +++ b/module/spl/spl-proc.c @@ -26,9 +26,14 @@ #include #include +#include +#include +#include +#include #include #include #include +#include #include #if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) diff --git a/module/spl/spl-tsd.c b/module/spl/spl-tsd.c index c9d532f4..f4f81048 100644 --- a/module/spl/spl-tsd.c +++ b/module/spl/spl-tsd.c @@ -61,6 +61,7 @@ #include #include #include +#include typedef struct tsd_hash_bin { spinlock_t hb_lock; diff --git a/module/spl/spl-vmem.c b/module/spl/spl-vmem.c new file mode 100644 index 00000000..4c140eb8 --- /dev/null +++ b/module/spl/spl-vmem.c @@ -0,0 +1,355 @@ +/*****************************************************************************\ + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + ***************************************************************************** + * Solaris Porting Layer (SPL) Kmem Implementation. +\*****************************************************************************/ + +#include +#include +#include + +vmem_t *heap_arena = NULL; +EXPORT_SYMBOL(heap_arena); + +vmem_t *zio_alloc_arena = NULL; +EXPORT_SYMBOL(zio_alloc_arena); + +vmem_t *zio_arena = NULL; +EXPORT_SYMBOL(zio_arena); + +size_t +vmem_size(vmem_t *vmp, int typemask) +{ + ASSERT3P(vmp, ==, NULL); + ASSERT3S(typemask & VMEM_ALLOC, ==, VMEM_ALLOC); + ASSERT3S(typemask & VMEM_FREE, ==, VMEM_FREE); + + return (VMALLOC_TOTAL); +} +EXPORT_SYMBOL(vmem_size); + +/* + * Memory allocation interfaces and debugging for basic kmem_* + * and vmem_* style memory allocation. When DEBUG_KMEM is enabled + * the SPL will keep track of the total memory allocated, and + * report any memory leaked when the module is unloaded. + */ +#ifdef DEBUG_KMEM + +/* Shim layer memory accounting */ +# ifdef HAVE_ATOMIC64_T +atomic64_t vmem_alloc_used = ATOMIC64_INIT(0); +unsigned long long vmem_alloc_max = 0; +# else /* HAVE_ATOMIC64_T */ +atomic_t vmem_alloc_used = ATOMIC_INIT(0); +unsigned long long vmem_alloc_max = 0; +# endif /* HAVE_ATOMIC64_T */ + +EXPORT_SYMBOL(vmem_alloc_used); +EXPORT_SYMBOL(vmem_alloc_max); + +/* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked + * but also the location of every alloc and free. When the SPL module is + * unloaded a list of all leaked addresses and where they were allocated + * will be dumped to the console. Enabling this feature has a significant + * impact on performance but it makes finding memory leaks straight forward. + * + * Not surprisingly with debugging enabled the xmem_locks are very highly + * contended particularly on xfree(). If we want to run with this detailed + * debugging enabled for anything other than debugging we need to minimize + * the contention by moving to a lock per xmem_table entry model. + */ +# ifdef DEBUG_KMEM_TRACKING + +# define VMEM_HASH_BITS 10 +# define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS) + +typedef struct kmem_debug { + struct hlist_node kd_hlist; /* Hash node linkage */ + struct list_head kd_list; /* List of all allocations */ + void *kd_addr; /* Allocation pointer */ + size_t kd_size; /* Allocation size */ + const char *kd_func; /* Allocation function */ + int kd_line; /* Allocation line */ +} kmem_debug_t; + +spinlock_t vmem_lock; +struct hlist_head vmem_table[VMEM_TABLE_SIZE]; +struct list_head vmem_list; + +EXPORT_SYMBOL(vmem_lock); +EXPORT_SYMBOL(vmem_table); +EXPORT_SYMBOL(vmem_list); + +void * +vmem_alloc_track(size_t size, int flags, const char *func, int line) +{ + void *ptr = NULL; + kmem_debug_t *dptr; + unsigned long irq_flags; + + ASSERT(flags & KM_SLEEP); + + /* Function may be called with KM_NOSLEEP so failure is possible */ + dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t), + flags & ~__GFP_ZERO); + if (unlikely(dptr == NULL)) { + printk(KERN_WARNING "debug vmem_alloc(%ld, 0x%x) " + "at %s:%d failed (%lld/%llu)\n", + sizeof(kmem_debug_t), flags, func, line, + vmem_alloc_used_read(), vmem_alloc_max); + } else { + /* + * We use __strdup() below because the string pointed to by + * __FUNCTION__ might not be available by the time we want + * to print it, since the module might have been unloaded. + * This can never fail because we have already asserted + * that flags is KM_SLEEP. + */ + dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO); + if (unlikely(dptr->kd_func == NULL)) { + kfree(dptr); + printk(KERN_WARNING "debug __strdup() at %s:%d " + "failed (%lld/%llu)\n", func, line, + vmem_alloc_used_read(), vmem_alloc_max); + goto out; + } + + /* Use the correct allocator */ + if (flags & __GFP_ZERO) { + ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO); + } else { + ptr = vmalloc_nofail(size, flags); + } + + if (unlikely(ptr == NULL)) { + kfree(dptr->kd_func); + kfree(dptr); + printk(KERN_WARNING "vmem_alloc (%llu, 0x%x) " + "at %s:%d failed (%lld/%llu)\n", + (unsigned long long) size, flags, func, line, + vmem_alloc_used_read(), vmem_alloc_max); + goto out; + } + + vmem_alloc_used_add(size); + if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) + vmem_alloc_max = vmem_alloc_used_read(); + + INIT_HLIST_NODE(&dptr->kd_hlist); + INIT_LIST_HEAD(&dptr->kd_list); + + dptr->kd_addr = ptr; + dptr->kd_size = size; + dptr->kd_line = line; + + spin_lock_irqsave(&vmem_lock, irq_flags); + hlist_add_head(&dptr->kd_hlist, + &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]); + list_add_tail(&dptr->kd_list, &vmem_list); + spin_unlock_irqrestore(&vmem_lock, irq_flags); + } +out: + return (ptr); +} +EXPORT_SYMBOL(vmem_alloc_track); + +void +vmem_free_track(const void *ptr, size_t size) +{ + kmem_debug_t *dptr; + + ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, + (unsigned long long) size); + + /* Must exist in hash due to vmem_alloc() */ + dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr); + ASSERT(dptr); + + /* Size must match */ + ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " + "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size, + (unsigned long long) size, dptr->kd_func, dptr->kd_line); + + vmem_alloc_used_sub(size); + kfree(dptr->kd_func); + + memset((void *)dptr, 0x5a, sizeof(kmem_debug_t)); + kfree(dptr); + + memset((void *)ptr, 0x5a, size); + vfree(ptr); +} +EXPORT_SYMBOL(vmem_free_track); + +# else /* DEBUG_KMEM_TRACKING */ + +void * +vmem_alloc_debug(size_t size, int flags, const char *func, int line) +{ + void *ptr; + + ASSERT(flags & KM_SLEEP); + + /* Use the correct allocator */ + if (flags & __GFP_ZERO) { + ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO)); + } else { + ptr = vmalloc_nofail(size, flags); + } + + if (unlikely(ptr == NULL)) { + printk(KERN_WARNING + "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", + (unsigned long long)size, flags, func, line, + (unsigned long long)vmem_alloc_used_read(), vmem_alloc_max); + } else { + vmem_alloc_used_add(size); + if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) + vmem_alloc_max = vmem_alloc_used_read(); + } + + return (ptr); +} +EXPORT_SYMBOL(vmem_alloc_debug); + +void +vmem_free_debug(const void *ptr, size_t size) +{ + ASSERT(ptr || size > 0); + vmem_alloc_used_sub(size); + vfree(ptr); +} +EXPORT_SYMBOL(vmem_free_debug); + +# endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ + +#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) +static char * +spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) +{ + int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; + int i, flag = 1; + + ASSERT(str != NULL && len >= 17); + memset(str, 0, len); + + /* Check for a fully printable string, and while we are at + * it place the printable characters in the passed buffer. */ + for (i = 0; i < size; i++) { + str[i] = ((char *)(kd->kd_addr))[i]; + if (isprint(str[i])) { + continue; + } else { + /* Minimum number of printable characters found + * to make it worthwhile to print this as ascii. */ + if (i > min) + break; + + flag = 0; + break; + } + } + + if (!flag) { + sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", + *((uint8_t *)kd->kd_addr), + *((uint8_t *)kd->kd_addr + 2), + *((uint8_t *)kd->kd_addr + 4), + *((uint8_t *)kd->kd_addr + 6), + *((uint8_t *)kd->kd_addr + 8), + *((uint8_t *)kd->kd_addr + 10), + *((uint8_t *)kd->kd_addr + 12), + *((uint8_t *)kd->kd_addr + 14)); + } + + return str; +} + +static int +spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) +{ + int i; + + spin_lock_init(lock); + INIT_LIST_HEAD(list); + + for (i = 0; i < size; i++) + INIT_HLIST_HEAD(&kmem_table[i]); + + return (0); +} + +static void +spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) +{ + unsigned long flags; + kmem_debug_t *kd; + char str[17]; + + spin_lock_irqsave(lock, flags); + if (!list_empty(list)) + printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", + "size", "data", "func", "line"); + + list_for_each_entry(kd, list, kd_list) + printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, + (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), + kd->kd_func, kd->kd_line); + + spin_unlock_irqrestore(lock, flags); +} +#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ +#define spl_kmem_init_tracking(list, lock, size) +#define spl_kmem_fini_tracking(list, lock) +#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ + +int +spl_vmem_init(void) +{ + int rc = 0; + +#ifdef DEBUG_KMEM + vmem_alloc_used_set(0); + spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE); +#endif + + return (rc); +} + +void +spl_vmem_fini(void) +{ +#ifdef DEBUG_KMEM + /* Display all unreclaimed memory addresses, including the + * allocation size and the first few bytes of what's located + * at that address to aid in debugging. Performance is not + * a serious concern here since it is module unload time. */ + if (vmem_alloc_used_read() != 0) + printk(KERN_WARNING "vmem leaked %ld/%llu bytes\n", + vmem_alloc_used_read(), vmem_alloc_max); + + spl_kmem_fini_tracking(&vmem_list, &vmem_lock); +#endif /* DEBUG_KMEM */ +} diff --git a/module/spl/spl-vnode.c b/module/spl/spl-vnode.c index e5db0ec2..97eb4ef7 100644 --- a/module/spl/spl-vnode.c +++ b/module/spl/spl-vnode.c @@ -26,6 +26,7 @@ #include #include +#include #include #include diff --git a/module/spl/spl-zlib.c b/module/spl/spl-zlib.c index 2967b03c..77c2a1dd 100644 --- a/module/spl/spl-zlib.c +++ b/module/spl/spl-zlib.c @@ -54,6 +54,7 @@ #include +#include #include #include diff --git a/module/splat/splat-condvar.c b/module/splat/splat-condvar.c index 3ee2ffc9..ed633acd 100644 --- a/module/splat/splat-condvar.c +++ b/module/splat/splat-condvar.c @@ -24,8 +24,9 @@ * Solaris Porting LAyer Tests (SPLAT) Condition Variable Tests. \*****************************************************************************/ -#include #include +#include +#include #include "splat-internal.h" #define SPLAT_CONDVAR_NAME "condvar" diff --git a/module/splat/splat-internal.h b/module/splat/splat-internal.h index eff8a9e7..83213269 100644 --- a/module/splat/splat-internal.h +++ b/module/splat/splat-internal.h @@ -27,6 +27,7 @@ #include "splat-ctl.h" #include +#include #define SPLAT_SUBSYSTEM_INIT(type) \ ({ splat_subsystem_t *_sub_; \ diff --git a/module/splat/splat-kmem.c b/module/splat/splat-kmem.c index cf47ce65..7edc8599 100644 --- a/module/splat/splat-kmem.c +++ b/module/splat/splat-kmem.c @@ -25,7 +25,10 @@ \*****************************************************************************/ #include +#include +#include #include +#include #include "splat-internal.h" #define SPLAT_KMEM_NAME "kmem" diff --git a/module/splat/splat-taskq.c b/module/splat/splat-taskq.c index d8406f15..8229fed3 100644 --- a/module/splat/splat-taskq.c +++ b/module/splat/splat-taskq.c @@ -25,8 +25,10 @@ \*****************************************************************************/ #include +#include #include #include +#include #include #include "splat-internal.h" diff --git a/module/splat/splat-zlib.c b/module/splat/splat-zlib.c index c614c5e6..eaa48369 100644 --- a/module/splat/splat-zlib.c +++ b/module/splat/splat-zlib.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "splat-internal.h" #define SPLAT_ZLIB_NAME "zlib" From 5994aef86837a5705ba48fed7caf6ab801386a78 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 8 Dec 2014 13:35:51 -0500 Subject: [PATCH 03/11] Fix kmem cstyle issues Address all cstyle issues in the kmem, vmem, and kmem_cache source and headers. This will done to make it easier to review subsequent changes which will rework the kmem/vmem implementation. Signed-off-by: Brian Behlendorf --- include/sys/kmem.h | 98 ++++++++++----------- include/sys/kmem_cache.h | 121 +++++++++++++------------- include/sys/vmem.h | 68 +++++++-------- module/spl/spl-kmem-cache.c | 168 ++++++++++++++++++++---------------- module/spl/spl-kmem.c | 104 +++++++++++----------- module/spl/spl-vmem.c | 89 ++++++++++--------- 6 files changed, 335 insertions(+), 313 deletions(-) diff --git a/include/sys/kmem.h b/include/sys/kmem.h index ee25e4c8..a9d94c90 100644 --- a/include/sys/kmem.h +++ b/include/sys/kmem.h @@ -1,4 +1,4 @@ -/*****************************************************************************\ +/* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -20,7 +20,7 @@ * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . -\*****************************************************************************/ + */ #ifndef _SPL_KMEM_H #define _SPL_KMEM_H @@ -36,18 +36,18 @@ extern void strfree(char *str); /* * Memory allocation interfaces */ -#define KM_SLEEP GFP_KERNEL /* Can sleep, never fails */ -#define KM_NOSLEEP GFP_ATOMIC /* Can not sleep, may fail */ -#define KM_PUSHPAGE (GFP_NOIO | __GFP_HIGH) /* Use reserved memory */ -#define KM_NODEBUG __GFP_NOWARN /* Suppress warnings */ -#define KM_FLAGS __GFP_BITS_MASK -#define KM_VMFLAGS GFP_LEVEL_MASK +#define KM_SLEEP GFP_KERNEL /* Can sleep, never fails */ +#define KM_NOSLEEP GFP_ATOMIC /* Can not sleep, may fail */ +#define KM_PUSHPAGE (GFP_NOIO | __GFP_HIGH) /* Use reserved memory */ +#define KM_NODEBUG __GFP_NOWARN /* Suppress warnings */ +#define KM_FLAGS __GFP_BITS_MASK +#define KM_VMFLAGS GFP_LEVEL_MASK /* * Used internally, the kernel does not need to support this flag */ #ifndef __GFP_ZERO -# define __GFP_ZERO 0x8000 +#define __GFP_ZERO 0x8000 #endif /* @@ -66,7 +66,7 @@ kmalloc_nofail(size_t size, gfp_t flags) ptr = kmalloc(size, flags); } while (ptr == NULL && (flags & __GFP_WAIT)); - return ptr; + return (ptr); } static inline void * @@ -78,7 +78,7 @@ kzalloc_nofail(size_t size, gfp_t flags) ptr = kzalloc(size, flags); } while (ptr == NULL && (flags & __GFP_WAIT)); - return ptr; + return (ptr); } static inline void * @@ -90,7 +90,7 @@ kmalloc_node_nofail(size_t size, gfp_t flags, int node) ptr = kmalloc_node(size, flags, node); } while (ptr == NULL && (flags & __GFP_WAIT)); - return ptr; + return (ptr); } #ifdef DEBUG_KMEM @@ -98,29 +98,23 @@ kmalloc_node_nofail(size_t size, gfp_t flags, int node) /* * Memory accounting functions to be used only when DEBUG_KMEM is set. */ -# ifdef HAVE_ATOMIC64_T - -# define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) -# define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) -# define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used) -# define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size) - +#ifdef HAVE_ATOMIC64_T +#define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) +#define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) +#define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used) +#define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size) extern atomic64_t kmem_alloc_used; extern unsigned long long kmem_alloc_max; - -# else /* HAVE_ATOMIC64_T */ - -# define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used) -# define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used) -# define kmem_alloc_used_read() atomic_read(&kmem_alloc_used) -# define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size) - +#else /* HAVE_ATOMIC64_T */ +#define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used) +#define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used) +#define kmem_alloc_used_read() atomic_read(&kmem_alloc_used) +#define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size) extern atomic_t kmem_alloc_used; extern unsigned long long kmem_alloc_max; +#endif /* HAVE_ATOMIC64_T */ -# endif /* HAVE_ATOMIC64_T */ - -# ifdef DEBUG_KMEM_TRACKING +#ifdef DEBUG_KMEM_TRACKING /* * DEBUG_KMEM && DEBUG_KMEM_TRACKING * @@ -132,18 +126,18 @@ extern unsigned long long kmem_alloc_max; * be enabled for debugging. This feature may be enabled by passing * --enable-debug-kmem-tracking to configure. */ -# define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__, 0, 0) -# define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__, 0, 0) -# define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__, 1, nd) -# define kmem_free(ptr, sz) kmem_free_track((ptr), (sz)) +#define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \ + __FUNCTION__, __LINE__, 0, 0) +#define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\ + __FUNCTION__, __LINE__, 0, 0) +#define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \ + __FUNCTION__, __LINE__, 1, nd) +#define kmem_free(ptr, sz) kmem_free_track((ptr), (sz)) extern void *kmem_alloc_track(size_t, int, const char *, int, int, int); extern void kmem_free_track(const void *, size_t); -# else /* DEBUG_KMEM_TRACKING */ +#else /* DEBUG_KMEM_TRACKING */ /* * DEBUG_KMEM && !DEBUG_KMEM_TRACKING * @@ -153,18 +147,18 @@ extern void kmem_free_track(const void *, size_t); * will be reported on the console. To disable this basic accounting * pass the --disable-debug-kmem option to configure. */ -# define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__, 0, 0) -# define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__, 0, 0) -# define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__, 1, nd) -# define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz)) +#define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \ + __FUNCTION__, __LINE__, 0, 0) +#define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ + __FUNCTION__, __LINE__, 0, 0) +#define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \ + __FUNCTION__, __LINE__, 1, nd) +#define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz)) extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int); extern void kmem_free_debug(const void *, size_t); -# endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM_TRACKING */ #else /* DEBUG_KMEM */ /* * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING @@ -173,17 +167,17 @@ extern void kmem_free_debug(const void *, size_t); * minimal memory accounting. To enable basic accounting pass the * --enable-debug-kmem option to configure. */ -# define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl)) -# define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl)) -# define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd)) -# define kmem_free(ptr, sz) ((void)(sz), kfree(ptr)) +#define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl)) +#define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl)) +#define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd)) +#define kmem_free(ptr, sz) ((void)(sz), kfree(ptr)) #endif /* DEBUG_KMEM */ int spl_kmem_init(void); void spl_kmem_fini(void); -#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \ - ((ptr) < (void *)VMALLOC_END)) +#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \ + ((ptr) < (void *)VMALLOC_END)) #endif /* _SPL_KMEM_H */ diff --git a/include/sys/kmem_cache.h b/include/sys/kmem_cache.h index 654a2ea4..a5bc0322 100644 --- a/include/sys/kmem_cache.h +++ b/include/sys/kmem_cache.h @@ -1,4 +1,4 @@ -/*****************************************************************************\ +/* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -20,7 +20,7 @@ * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . -\*****************************************************************************/ + */ #ifndef _SPL_KMEM_CACHE_H #define _SPL_KMEM_CACHE_H @@ -33,7 +33,7 @@ * allocated from the physical or virtal memory address space. The virtual * slabs allow for good behavior when allocation large objects of identical * size. This slab implementation also supports both constructors and - * destructions which the Linux slab does not. + * destructors which the Linux slab does not. */ enum { KMC_BIT_NOTOUCH = 0, /* Don't update ages */ @@ -46,8 +46,8 @@ enum { KMC_BIT_SLAB = 7, /* Use Linux slab cache */ KMC_BIT_OFFSLAB = 8, /* Objects not on slab */ KMC_BIT_NOEMERGENCY = 9, /* Disable emergency objects */ - KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */ - KMC_BIT_GROWING = 15, /* Growing in progress */ + KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */ + KMC_BIT_GROWING = 15, /* Growing in progress */ KMC_BIT_REAPING = 16, /* Reaping in progress */ KMC_BIT_DESTROY = 17, /* Destroy in progress */ KMC_BIT_TOTAL = 18, /* Proc handler helper bit */ @@ -64,29 +64,29 @@ typedef enum kmem_cbrc { KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */ } kmem_cbrc_t; -#define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH) -#define KMC_NODEBUG (1 << KMC_BIT_NODEBUG) -#define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE) -#define KMC_NOHASH (1 << KMC_BIT_NOHASH) -#define KMC_QCACHE (1 << KMC_BIT_QCACHE) -#define KMC_KMEM (1 << KMC_BIT_KMEM) -#define KMC_VMEM (1 << KMC_BIT_VMEM) -#define KMC_SLAB (1 << KMC_BIT_SLAB) -#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB) -#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY) -#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED) -#define KMC_GROWING (1 << KMC_BIT_GROWING) -#define KMC_REAPING (1 << KMC_BIT_REAPING) -#define KMC_DESTROY (1 << KMC_BIT_DESTROY) -#define KMC_TOTAL (1 << KMC_BIT_TOTAL) -#define KMC_ALLOC (1 << KMC_BIT_ALLOC) -#define KMC_MAX (1 << KMC_BIT_MAX) - -#define KMC_REAP_CHUNK INT_MAX -#define KMC_DEFAULT_SEEKS 1 - -#define KMC_EXPIRE_AGE 0x1 /* Due to age */ -#define KMC_EXPIRE_MEM 0x2 /* Due to low memory */ +#define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH) +#define KMC_NODEBUG (1 << KMC_BIT_NODEBUG) +#define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE) +#define KMC_NOHASH (1 << KMC_BIT_NOHASH) +#define KMC_QCACHE (1 << KMC_BIT_QCACHE) +#define KMC_KMEM (1 << KMC_BIT_KMEM) +#define KMC_VMEM (1 << KMC_BIT_VMEM) +#define KMC_SLAB (1 << KMC_BIT_SLAB) +#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB) +#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY) +#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED) +#define KMC_GROWING (1 << KMC_BIT_GROWING) +#define KMC_REAPING (1 << KMC_BIT_REAPING) +#define KMC_DESTROY (1 << KMC_BIT_DESTROY) +#define KMC_TOTAL (1 << KMC_BIT_TOTAL) +#define KMC_ALLOC (1 << KMC_BIT_ALLOC) +#define KMC_MAX (1 << KMC_BIT_MAX) + +#define KMC_REAP_CHUNK INT_MAX +#define KMC_DEFAULT_SEEKS 1 + +#define KMC_EXPIRE_AGE 0x1 /* Due to age */ +#define KMC_EXPIRE_MEM 0x2 /* Due to low memory */ #define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */ @@ -94,19 +94,19 @@ extern unsigned int spl_kmem_cache_expire; extern struct list_head spl_kmem_cache_list; extern struct rw_semaphore spl_kmem_cache_sem; -#define SKM_MAGIC 0x2e2e2e2e -#define SKO_MAGIC 0x20202020 -#define SKS_MAGIC 0x22222222 -#define SKC_MAGIC 0x2c2c2c2c +#define SKM_MAGIC 0x2e2e2e2e +#define SKO_MAGIC 0x20202020 +#define SKS_MAGIC 0x22222222 +#define SKC_MAGIC 0x2c2c2c2c -#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */ -#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */ -#define SPL_KMEM_CACHE_OBJ_PER_SLAB 16 /* Target objects per slab */ -#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 1 /* Minimum objects per slab */ -#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */ +#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */ +#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */ +#define SPL_KMEM_CACHE_OBJ_PER_SLAB 16 /* Target objects per slab */ +#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 1 /* Minimum objects per slab */ +#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */ -#define POINTER_IS_VALID(p) 0 /* Unimplemented */ -#define POINTER_INVALIDATE(pp) /* Unimplemented */ +#define POINTER_IS_VALID(p) 0 /* Unimplemented */ +#define POINTER_INVALIDATE(pp) /* Unimplemented */ typedef int (*spl_kmem_ctor_t)(void *, void *, int); typedef void (*spl_kmem_dtor_t)(void *, void *); @@ -124,14 +124,14 @@ typedef struct spl_kmem_magazine { } spl_kmem_magazine_t; typedef struct spl_kmem_obj { - uint32_t sko_magic; /* Sanity magic */ + uint32_t sko_magic; /* Sanity magic */ void *sko_addr; /* Buffer address */ struct spl_kmem_slab *sko_slab; /* Owned by slab */ struct list_head sko_list; /* Free object list linkage */ } spl_kmem_obj_t; typedef struct spl_kmem_slab { - uint32_t sks_magic; /* Sanity magic */ + uint32_t sks_magic; /* Sanity magic */ uint32_t sks_objs; /* Objects per slab */ struct spl_kmem_cache *sks_cache; /* Owned by cache */ struct list_head sks_list; /* Slab list linkage */ @@ -174,14 +174,14 @@ typedef struct spl_kmem_cache { atomic_t skc_ref; /* Ref count callers */ taskqid_t skc_taskqid; /* Slab reclaim task */ struct list_head skc_list; /* List of caches linkage */ - struct list_head skc_complete_list;/* Completely alloc'ed */ - struct list_head skc_partial_list; /* Partially alloc'ed */ + struct list_head skc_complete_list; /* Completely alloc'ed */ + struct list_head skc_partial_list; /* Partially alloc'ed */ struct rb_root skc_emergency_tree; /* Min sized objects */ spinlock_t skc_lock; /* Cache lock */ wait_queue_head_t skc_waitq; /* Allocation waiters */ uint64_t skc_slab_fail; /* Slab alloc failures */ - uint64_t skc_slab_create;/* Slab creates */ - uint64_t skc_slab_destroy;/* Slab destroys */ + uint64_t skc_slab_create; /* Slab creates */ + uint64_t skc_slab_destroy; /* Slab destroys */ uint64_t skc_slab_total; /* Slab total current */ uint64_t skc_slab_alloc; /* Slab alloc current */ uint64_t skc_slab_max; /* Slab max historic */ @@ -192,30 +192,31 @@ typedef struct spl_kmem_cache { uint64_t skc_obj_emergency; /* Obj emergency current */ uint64_t skc_obj_emergency_max; /* Obj emergency max */ } spl_kmem_cache_t; -#define kmem_cache_t spl_kmem_cache_t +#define kmem_cache_t spl_kmem_cache_t extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size, - size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, - spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags); + size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, + spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags); extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, - kmem_cbrc_t (*)(void *, void *, size_t, void *)); + kmem_cbrc_t (*)(void *, void *, size_t, void *)); extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count); extern void spl_kmem_reap(void); -#define kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) \ - spl_kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) -#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) -#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) -#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) -#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) -#define kmem_cache_reap_now(skc) \ - spl_kmem_cache_reap_now(skc, skc->skc_reap) -#define kmem_reap() spl_kmem_reap() -#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \ - ((ptr) < (void *)VMALLOC_END)) +#define kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) \ + spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) +#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) +#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) +#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) +#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) +#define kmem_cache_reap_now(skc) \ + spl_kmem_cache_reap_now(skc, skc->skc_reap) +#define kmem_reap() spl_kmem_reap() +#define kmem_virt(ptr) \ + (((ptr) >= (void *)VMALLOC_START) && \ + ((ptr) < (void *)VMALLOC_END)) /* * Allow custom slab allocation flags to be set for KMC_SLAB based caches. diff --git a/include/sys/vmem.h b/include/sys/vmem.h index e86e89bb..f59ac5e8 100644 --- a/include/sys/vmem.h +++ b/include/sys/vmem.h @@ -1,4 +1,4 @@ -/*****************************************************************************\ +/* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -20,7 +20,7 @@ * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . -\*****************************************************************************/ + */ #ifndef _SPL_VMEM_H #define _SPL_VMEM_H @@ -40,11 +40,11 @@ extern size_t vmem_size(vmem_t *vmp, int typemask); /* * Memory allocation interfaces */ -#define VMEM_ALLOC 0x01 -#define VMEM_FREE 0x02 +#define VMEM_ALLOC 0x01 +#define VMEM_FREE 0x02 #ifndef VMALLOC_TOTAL -#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) +#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) #endif static inline void * @@ -78,7 +78,7 @@ vmalloc_nofail(size_t size, gfp_t flags) } } - return ptr; + return (ptr); } static inline void * @@ -90,7 +90,7 @@ vzalloc_nofail(size_t size, gfp_t flags) if (ptr) memset(ptr, 0, (size)); - return ptr; + return (ptr); } #ifdef DEBUG_KMEM @@ -98,29 +98,29 @@ vzalloc_nofail(size_t size, gfp_t flags) /* * Memory accounting functions to be used only when DEBUG_KMEM is set. */ -# ifdef HAVE_ATOMIC64_T +#ifdef HAVE_ATOMIC64_T -# define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used) -# define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used) -# define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used) -# define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size) +#define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used) +#define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used) +#define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used) +#define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size) extern atomic64_t vmem_alloc_used; extern unsigned long long vmem_alloc_max; -# else /* HAVE_ATOMIC64_T */ +#else /* HAVE_ATOMIC64_T */ -# define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used) -# define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used) -# define vmem_alloc_used_read() atomic_read(&vmem_alloc_used) -# define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size) +#define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used) +#define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used) +#define vmem_alloc_used_read() atomic_read(&vmem_alloc_used) +#define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size) extern atomic_t vmem_alloc_used; extern unsigned long long vmem_alloc_max; -# endif /* HAVE_ATOMIC64_T */ +#endif /* HAVE_ATOMIC64_T */ -# ifdef DEBUG_KMEM_TRACKING +#ifdef DEBUG_KMEM_TRACKING /* * DEBUG_KMEM && DEBUG_KMEM_TRACKING * @@ -132,18 +132,18 @@ extern unsigned long long vmem_alloc_max; * be enabled for debugging. This feature may be enabled by passing * --enable-debug-kmem-tracking to configure. */ -# define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__) -# define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__) -# define vmem_free(ptr, sz) vmem_free_track((ptr), (sz)) +#define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \ + __FUNCTION__, __LINE__) +#define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\ + __FUNCTION__, __LINE__) +#define vmem_free(ptr, sz) vmem_free_track((ptr), (sz)) extern void *kmem_alloc_track(size_t, int, const char *, int, int, int); extern void kmem_free_track(const void *, size_t); extern void *vmem_alloc_track(size_t, int, const char *, int); extern void vmem_free_track(const void *, size_t); -# else /* DEBUG_KMEM_TRACKING */ +#else /* DEBUG_KMEM_TRACKING */ /* * DEBUG_KMEM && !DEBUG_KMEM_TRACKING * @@ -153,16 +153,16 @@ extern void vmem_free_track(const void *, size_t); * will be reported on the console. To disable this basic accounting * pass the --disable-debug-kmem option to configure. */ -# define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__) -# define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__) -# define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz)) +#define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \ + __FUNCTION__, __LINE__) +#define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ + __FUNCTION__, __LINE__) +#define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz)) extern void *vmem_alloc_debug(size_t, int, const char *, int); extern void vmem_free_debug(const void *, size_t); -# endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM_TRACKING */ #else /* DEBUG_KMEM */ /* * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING @@ -171,9 +171,9 @@ extern void vmem_free_debug(const void *, size_t); * minimal memory accounting. To enable basic accounting pass the * --enable-debug-kmem option to configure. */ -# define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl)) -# define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl)) -# define vmem_free(ptr, sz) ((void)(sz), vfree(ptr)) +#define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl)) +#define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl)) +#define vmem_free(ptr, sz) ((void)(sz), vfree(ptr)) #endif /* DEBUG_KMEM */ diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c index d24c4c20..3aa65a9b 100644 --- a/module/spl/spl-kmem-cache.c +++ b/module/spl/spl-kmem-cache.c @@ -1,4 +1,4 @@ -/*****************************************************************************\ +/* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -20,9 +20,7 @@ * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . - ***************************************************************************** - * Solaris Porting Layer (SPL) Kmem Implementation. -\*****************************************************************************/ + */ #include #include @@ -76,7 +74,7 @@ MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN; module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min, - "Minimal number of objects per slab"); + "Minimal number of objects per slab"); unsigned int spl_kmem_cache_max_size = 32; module_param(spl_kmem_cache_max_size, uint, 0644); @@ -95,12 +93,12 @@ unsigned int spl_kmem_cache_slab_limit = 0; #endif module_param(spl_kmem_cache_slab_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_slab_limit, - "Objects less than N bytes use the Linux slab"); + "Objects less than N bytes use the Linux slab"); unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4); module_param(spl_kmem_cache_kmem_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, - "Objects less than N bytes use the kmalloc"); + "Objects less than N bytes use the kmalloc"); /* * Slab allocation interfaces @@ -114,7 +112,7 @@ MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, * breaker for the SPL which contains particularly expensive * initializers for mutex's, condition variables, etc. We also * require a minimal level of cleanup for these data types unlike - * many Linux data type which do need to be explicitly destroyed. + * many Linux data types which do need to be explicitly destroyed. * * 2) Virtual address space backed slab. Callers of the Solaris slab * expect it to work well for both small are very large allocations. @@ -135,7 +133,7 @@ MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, * * XXX: Improve the partial slab list by carefully maintaining a * strict ordering of fullest to emptiest slabs based on - * the slab reference count. This guarantees the when freeing + * the slab reference count. This guarantees that when freeing * slabs back to the system we need only linearly traverse the * last N slabs in the list to discover all the freeable slabs. * @@ -149,7 +147,7 @@ MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, struct list_head spl_kmem_cache_list; /* List of caches */ struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ -taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */ +taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); @@ -173,7 +171,7 @@ kv_alloc(spl_kmem_cache_t *skc, int size, int flags) /* Resulting allocated memory will be page aligned */ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); - return ptr; + return (ptr); } static void @@ -204,8 +202,8 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size) static inline uint32_t spl_sks_size(spl_kmem_cache_t *skc) { - return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t), - skc->skc_obj_align, uint32_t); + return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t), + skc->skc_obj_align, uint32_t)); } /* @@ -216,8 +214,8 @@ spl_obj_size(spl_kmem_cache_t *skc) { uint32_t align = skc->skc_obj_align; - return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + - P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t); + return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + + P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t)); } /* @@ -226,8 +224,8 @@ spl_obj_size(spl_kmem_cache_t *skc) static inline spl_kmem_obj_t * spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) { - return obj + P2ROUNDUP_TYPED(skc->skc_obj_size, - skc->skc_obj_align, uint32_t); + return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size, + skc->skc_obj_align, uint32_t)); } /* @@ -237,7 +235,7 @@ spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) static inline uint32_t spl_offslab_size(spl_kmem_cache_t *skc) { - return 1UL << (fls64(spl_obj_size(skc)) + 1); + return (1UL << (fls64(spl_obj_size(skc)) + 1)); } /* @@ -320,8 +318,8 @@ spl_slab_alloc(spl_kmem_cache_t *skc, int flags) out: if (rc) { if (skc->skc_flags & KMC_OFFSLAB) - list_for_each_entry_safe(sko, n, &sks->sks_free_list, - sko_list) + list_for_each_entry_safe(sko, + n, &sks->sks_free_list, sko_list) kv_free(skc, sko->sko_addr, offslab_size); kv_free(skc, base, skc->skc_slab_size); @@ -338,7 +336,7 @@ spl_slab_alloc(spl_kmem_cache_t *skc, int flags) */ static void spl_slab_free(spl_kmem_slab_t *sks, - struct list_head *sks_list, struct list_head *sko_list) + struct list_head *sks_list, struct list_head *sko_list) { spl_kmem_cache_t *skc; @@ -363,7 +361,7 @@ spl_slab_free(spl_kmem_slab_t *sks, } /* - * Traverses all the partial slabs attached to a cache and free those + * Traverse all the partial slabs attached to a cache and free those * which which are currently empty, and have not been touched for * skc_delay seconds to avoid thrashing. The count argument is * passed to optionally cap the number of slabs reclaimed, a count @@ -387,7 +385,8 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag) * however when flag is set the delay will not be used. */ spin_lock(&skc->skc_lock); - list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){ + list_for_each_entry_safe_reverse(sks, m, + &skc->skc_partial_list, sks_list) { /* * All empty slabs are at the end of skc->skc_partial_list, * therefore once a non-empty slab is found we can stop @@ -397,7 +396,8 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag) if ((sks->sks_ref > 0) || (count && i >= count)) break; - if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) { + if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ) || + flag) { spl_slab_free(sks, &sks_list, &sko_list); i++; } @@ -443,10 +443,10 @@ spl_emergency_search(struct rb_root *root, void *obj) else if (address > (unsigned long)ske->ske_obj) node = node->rb_right; else - return ske; + return (ske); } - return NULL; + return (NULL); } static int @@ -465,13 +465,13 @@ spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) else if (address > (unsigned long)ske_tmp->ske_obj) new = &((*new)->rb_right); else - return 0; + return (0); } rb_link_node(&ske->ske_node, parent, new); rb_insert_color(&ske->ske_node, root); - return 1; + return (1); } /* @@ -490,7 +490,7 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) if (!empty) return (-EEXIST); - ske = kmalloc(sizeof(*ske), flags); + ske = kmalloc(sizeof (*ske), flags); if (ske == NULL) return (-ENOMEM); @@ -565,7 +565,7 @@ __spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) skm->skm_avail -= count; memmove(skm->skm_objs, &(skm->skm_objs[count]), - sizeof(void *) * skm->skm_avail); + sizeof (void *) * skm->skm_avail); } static void @@ -666,7 +666,7 @@ spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) if (skc->skc_flags & KMC_OFFSLAB) { *objs = spl_kmem_cache_obj_per_slab; - *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), PAGE_SIZE); + *size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE); return (0); } else { sks_size = spl_sks_size(skc); @@ -731,8 +731,8 @@ static spl_kmem_magazine_t * spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) { spl_kmem_magazine_t *skm; - int size = sizeof(spl_kmem_magazine_t) + - sizeof(void *) * skc->skc_mag_size; + int size = sizeof (spl_kmem_magazine_t) + + sizeof (void *) * skc->skc_mag_size; skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu)); if (skm) { @@ -754,8 +754,8 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) static void spl_magazine_free(spl_kmem_magazine_t *skm) { - int size = sizeof(spl_kmem_magazine_t) + - sizeof(void *) * skm->skm_size; + int size = sizeof (spl_kmem_magazine_t) + + sizeof (void *) * skm->skm_size; ASSERT(skm->skm_magic == SKM_MAGIC); ASSERT(skm->skm_avail == 0); @@ -802,11 +802,11 @@ spl_magazine_destroy(spl_kmem_cache_t *skc) if (skc->skc_flags & KMC_NOMAGAZINE) return; - for_each_online_cpu(i) { + for_each_online_cpu(i) { skm = skc->skc_mag[i]; spl_cache_flush(skc, skm, skm->skm_avail); spl_magazine_free(skm); - } + } } /* @@ -832,12 +832,10 @@ spl_magazine_destroy(spl_kmem_cache_t *skc) */ spl_kmem_cache_t * spl_kmem_cache_create(char *name, size_t size, size_t align, - spl_kmem_ctor_t ctor, - spl_kmem_dtor_t dtor, - spl_kmem_reclaim_t reclaim, - void *priv, void *vmp, int flags) + spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim, + void *priv, void *vmp, int flags) { - spl_kmem_cache_t *skc; + spl_kmem_cache_t *skc; int rc; /* @@ -851,13 +849,13 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, might_sleep(); /* - * Allocate memory for a new cache an initialize it. Unfortunately, + * Allocate memory for a new cache and initialize it. Unfortunately, * this usually ends up being a large allocation of ~32k because * we need to allocate enough memory for the worst case number of * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we * explicitly pass KM_NODEBUG to suppress the kmem warning */ - skc = kmem_zalloc(sizeof(*skc), KM_SLEEP| KM_NODEBUG); + skc = kmem_zalloc(sizeof (*skc), KM_SLEEP| KM_NODEBUG); if (skc == NULL) return (NULL); @@ -865,7 +863,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, skc->skc_name_size = strlen(name) + 1; skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP); if (skc->skc_name == NULL) { - kmem_free(skc, sizeof(*skc)); + kmem_free(skc, sizeof (*skc)); return (NULL); } strncpy(skc->skc_name, name, skc->skc_name_size); @@ -923,7 +921,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, * Objects smaller than spl_kmem_cache_slab_limit can * use the Linux slab for better space-efficiency. By * default this functionality is disabled until its - * performance characters are fully understood. + * performance characteristics are fully understood. */ if (spl_kmem_cache_slab_limit && size <= (size_t)spl_kmem_cache_slab_limit) @@ -980,20 +978,20 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, return (skc); out: kmem_free(skc->skc_name, skc->skc_name_size); - kmem_free(skc, sizeof(*skc)); + kmem_free(skc, sizeof (*skc)); return (NULL); } EXPORT_SYMBOL(spl_kmem_cache_create); /* - * Register a move callback to for cache defragmentation. + * Register a move callback for cache defragmentation. * XXX: Unimplemented but harmless to stub out for now. */ void spl_kmem_cache_set_move(spl_kmem_cache_t *skc, kmem_cbrc_t (move)(void *, void *, size_t, void *)) { - ASSERT(move != NULL); + ASSERT(move != NULL); } EXPORT_SYMBOL(spl_kmem_cache_set_move); @@ -1022,9 +1020,11 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) taskq_cancel_id(spl_kmem_cache_taskq, id); - /* Wait until all current callers complete, this is mainly + /* + * Wait until all current callers complete, this is mainly * to catch the case where a low memory situation triggers a - * cache reaping action which races with this destroy. */ + * cache reaping action which races with this destroy. + */ wait_event(wq, atomic_read(&skc->skc_ref) == 0); if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { @@ -1037,8 +1037,10 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) spin_lock(&skc->skc_lock); - /* Validate there are no objects in use and free all the - * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */ + /* + * Validate there are no objects in use and free all the + * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. + */ ASSERT3U(skc->skc_slab_alloc, ==, 0); ASSERT3U(skc->skc_obj_alloc, ==, 0); ASSERT3U(skc->skc_slab_total, ==, 0); @@ -1049,7 +1051,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) kmem_free(skc->skc_name, skc->skc_name_size); spin_unlock(&skc->skc_lock); - kmem_free(skc, sizeof(*skc)); + kmem_free(skc, sizeof (*skc)); } EXPORT_SYMBOL(spl_kmem_cache_destroy); @@ -1089,7 +1091,7 @@ spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) skc->skc_slab_max = skc->skc_slab_alloc; } - return sko->sko_addr; + return (sko->sko_addr); } /* @@ -1127,7 +1129,7 @@ spl_cache_grow_work(void *data) static int spl_cache_grow_wait(spl_kmem_cache_t *skc) { - return !test_bit(KMC_BIT_GROWING, &skc->skc_flags); + return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags)); } /* @@ -1164,7 +1166,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { spl_kmem_alloc_t *ska; - ska = kmalloc(sizeof(*ska), flags); + ska = kmalloc(sizeof (*ska), flags); if (ska == NULL) { clear_bit(KMC_BIT_GROWING, &skc->skc_flags); wake_up_all(&skc->skc_waitq); @@ -1192,7 +1194,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) rc = spl_emergency_alloc(skc, flags, obj); } else { remaining = wait_event_timeout(skc->skc_waitq, - spl_cache_grow_wait(skc), HZ); + spl_cache_grow_wait(skc), HZ); if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) { spin_lock(&skc->skc_lock); @@ -1249,9 +1251,11 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) if (skm != skc->skc_mag[smp_processor_id()]) goto out; - /* Potentially rescheduled to the same CPU but + /* + * Potentially rescheduled to the same CPU but * allocations may have occurred from this CPU while - * we were sleeping so recalculate max refill. */ + * we were sleeping so recalculate max refill. + */ refill = MIN(refill, skm->skm_size - skm->skm_avail); spin_lock(&skc->skc_lock); @@ -1260,17 +1264,21 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) /* Grab the next available slab */ sks = list_entry((&skc->skc_partial_list)->next, - spl_kmem_slab_t, sks_list); + spl_kmem_slab_t, sks_list); ASSERT(sks->sks_magic == SKS_MAGIC); ASSERT(sks->sks_ref < sks->sks_objs); ASSERT(!list_empty(&sks->sks_free_list)); - /* Consume as many objects as needed to refill the requested - * cache. We must also be careful not to overfill it. */ - while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) { + /* + * Consume as many objects as needed to refill the requested + * cache. We must also be careful not to overfill it. + */ + while (sks->sks_ref < sks->sks_objs && refill-- > 0 && + ++count) { ASSERT(skm->skm_avail < skm->skm_size); ASSERT(count < skm->skm_size); - skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks); + skm->skm_objs[skm->skm_avail++] = + spl_cache_obj(skc, sks); } /* Move slab to skc_complete_list when full */ @@ -1308,16 +1316,20 @@ spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) sks->sks_ref--; skc->skc_obj_alloc--; - /* Move slab to skc_partial_list when no longer full. Slabs + /* + * Move slab to skc_partial_list when no longer full. Slabs * are added to the head to keep the partial list is quasi-full - * sorted order. Fuller at the head, emptier at the tail. */ + * sorted order. Fuller at the head, emptier at the tail. + */ if (sks->sks_ref == (sks->sks_objs - 1)) { list_del(&sks->sks_list); list_add(&sks->sks_list, &skc->skc_partial_list); } - /* Move empty slabs to the end of the partial list so - * they can be easily found and freed during reclamation. */ + /* + * Move empty slabs to the end of the partial list so + * they can be easily found and freed during reclamation. + */ if (sks->sks_ref == 0) { list_del(&sks->sks_list); list_add_tail(&sks->sks_list, &skc->skc_partial_list); @@ -1359,10 +1371,12 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) local_irq_disable(); restart: - /* Safe to update per-cpu structure without lock, but + /* + * Safe to update per-cpu structure without lock, but * in the restart case we must be careful to reacquire * the local magazine since this may have changed - * when we need to grow the cache. */ + * when we need to grow the cache. + */ skm = skc->skc_mag[smp_processor_id()]; ASSERT(skm->skm_magic == SKM_MAGIC); @@ -1438,10 +1452,12 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) local_irq_save(flags); - /* Safe to update per-cpu structure without lock, but + /* + * Safe to update per-cpu structure without lock, but * no remote memory allocation tracking is being performed * it is entirely possible to allocate an object from one - * CPU cache and return it to another. */ + * CPU cache and return it to another. + */ skm = skc->skc_mag[smp_processor_id()]; ASSERT(skm->skm_magic == SKM_MAGIC); @@ -1495,12 +1511,12 @@ __spl_kmem_cache_generic_shrinker(struct shrinker *shrink, #ifdef HAVE_SPLIT_SHRINKER_CALLBACK uint64_t oldalloc = skc->skc_obj_alloc; spl_kmem_cache_reap_now(skc, - MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1)); + MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1)); if (oldalloc > skc->skc_obj_alloc) alloc += oldalloc - skc->skc_obj_alloc; #else spl_kmem_cache_reap_now(skc, - MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1)); + MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1)); alloc += skc->skc_obj_alloc; #endif /* HAVE_SPLIT_SHRINKER_CALLBACK */ } else { @@ -1581,7 +1597,7 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) spin_lock(&skc->skc_lock); do_reclaim = (skc->skc_slab_total > 0) && - ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) && + ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) && (skc->skc_obj_alloc < objects); objects = skc->skc_obj_alloc; diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c index 075bf258..96ad2b04 100644 --- a/module/spl/spl-kmem.c +++ b/module/spl/spl-kmem.c @@ -1,4 +1,4 @@ -/*****************************************************************************\ +/* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -20,9 +20,7 @@ * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . - ***************************************************************************** - * Solaris Porting Layer (SPL) Kmem Implementation. -\*****************************************************************************/ + */ #include #include @@ -31,7 +29,7 @@ int kmem_debugging(void) { - return 0; + return (0); } EXPORT_SYMBOL(kmem_debugging); @@ -47,7 +45,7 @@ kmem_vasprintf(const char *fmt, va_list ap) va_end(aq); } while (ptr == NULL); - return ptr; + return (ptr); } EXPORT_SYMBOL(kmem_vasprintf); @@ -63,7 +61,7 @@ kmem_asprintf(const char *fmt, ...) va_end(ap); } while (ptr == NULL); - return ptr; + return (ptr); } EXPORT_SYMBOL(kmem_asprintf); @@ -78,13 +76,13 @@ __strdup(const char *str, int flags) if (ptr) memcpy(ptr, str, n + 1); - return ptr; + return (ptr); } char * strdup(const char *str) { - return __strdup(str, KM_SLEEP); + return (__strdup(str, KM_SLEEP)); } EXPORT_SYMBOL(strdup); @@ -104,18 +102,19 @@ EXPORT_SYMBOL(strfree); #ifdef DEBUG_KMEM /* Shim layer memory accounting */ -# ifdef HAVE_ATOMIC64_T +#ifdef HAVE_ATOMIC64_T atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); unsigned long long kmem_alloc_max = 0; -# else /* HAVE_ATOMIC64_T */ +#else /* HAVE_ATOMIC64_T */ atomic_t kmem_alloc_used = ATOMIC_INIT(0); unsigned long long kmem_alloc_max = 0; -# endif /* HAVE_ATOMIC64_T */ +#endif /* HAVE_ATOMIC64_T */ EXPORT_SYMBOL(kmem_alloc_used); EXPORT_SYMBOL(kmem_alloc_max); -/* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked +/* + * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked * but also the location of every alloc and free. When the SPL module is * unloaded a list of all leaked addresses and where they were allocated * will be dumped to the console. Enabling this feature has a significant @@ -126,18 +125,18 @@ EXPORT_SYMBOL(kmem_alloc_max); * debugging enabled for anything other than debugging we need to minimize * the contention by moving to a lock per xmem_table entry model. */ -# ifdef DEBUG_KMEM_TRACKING +#ifdef DEBUG_KMEM_TRACKING -# define KMEM_HASH_BITS 10 -# define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) +#define KMEM_HASH_BITS 10 +#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) typedef struct kmem_debug { - struct hlist_node kd_hlist; /* Hash node linkage */ - struct list_head kd_list; /* List of all allocations */ - void *kd_addr; /* Allocation pointer */ - size_t kd_size; /* Allocation size */ - const char *kd_func; /* Allocation function */ - int kd_line; /* Allocation line */ + struct hlist_node kd_hlist; /* Hash node linkage */ + struct list_head kd_list; /* List of all allocations */ + void *kd_addr; /* Allocation pointer */ + size_t kd_size; /* Allocation size */ + const char *kd_func; /* Allocation function */ + int kd_line; /* Allocation line */ } kmem_debug_t; spinlock_t kmem_lock; @@ -149,7 +148,8 @@ EXPORT_SYMBOL(kmem_table); EXPORT_SYMBOL(kmem_list); static kmem_debug_t * -kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr) +kmem_del_init(spinlock_t *lock, struct hlist_head *table, + int bits, const void *addr) { struct hlist_head *head; struct hlist_node *node; @@ -165,7 +165,7 @@ kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void * hlist_del_init(&p->kd_hlist); list_del_init(&p->kd_list); spin_unlock_irqrestore(lock, flags); - return p; + return (p); } } @@ -183,12 +183,12 @@ kmem_alloc_track(size_t size, int flags, const char *func, int line, unsigned long irq_flags; /* Function may be called with KM_NOSLEEP so failure is possible */ - dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t), + dptr = (kmem_debug_t *) kmalloc_nofail(sizeof (kmem_debug_t), flags & ~__GFP_ZERO); if (unlikely(dptr == NULL)) { printk(KERN_WARNING "debug kmem_alloc(%ld, 0x%x) at %s:%d " - "failed (%lld/%llu)\n", sizeof(kmem_debug_t), flags, + "failed (%lld/%llu)\n", sizeof (kmem_debug_t), flags, func, line, kmem_alloc_used_read(), kmem_alloc_max); } else { /* @@ -280,7 +280,7 @@ kmem_free_track(const void *ptr, size_t size) kmem_alloc_used_sub(size); kfree(dptr->kd_func); - memset((void *)dptr, 0x5a, sizeof(kmem_debug_t)); + memset((void *)dptr, 0x5a, sizeof (kmem_debug_t)); kfree(dptr); memset((void *)ptr, 0x5a, size); @@ -288,7 +288,7 @@ kmem_free_track(const void *ptr, size_t size) } EXPORT_SYMBOL(kmem_free_track); -# else /* DEBUG_KMEM_TRACKING */ +#else /* DEBUG_KMEM_TRACKING */ void * kmem_alloc_debug(size_t size, int flags, const char *func, int line, @@ -342,7 +342,7 @@ kmem_free_debug(const void *ptr, size_t size) } EXPORT_SYMBOL(kmem_free_debug); -# endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM_TRACKING */ #endif /* DEBUG_KMEM */ #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) @@ -355,15 +355,19 @@ spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) ASSERT(str != NULL && len >= 17); memset(str, 0, len); - /* Check for a fully printable string, and while we are at - * it place the printable characters in the passed buffer. */ + /* + * Check for a fully printable string, and while we are at + * it place the printable characters in the passed buffer. + */ for (i = 0; i < size; i++) { str[i] = ((char *)(kd->kd_addr))[i]; if (isprint(str[i])) { continue; } else { - /* Minimum number of printable characters found - * to make it worthwhile to print this as ascii. */ + /* + * Minimum number of printable characters found + * to make it worthwhile to print this as ascii. + */ if (i > min) break; @@ -374,17 +378,17 @@ spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) if (!flag) { sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", - *((uint8_t *)kd->kd_addr), - *((uint8_t *)kd->kd_addr + 2), - *((uint8_t *)kd->kd_addr + 4), - *((uint8_t *)kd->kd_addr + 6), - *((uint8_t *)kd->kd_addr + 8), - *((uint8_t *)kd->kd_addr + 10), - *((uint8_t *)kd->kd_addr + 12), - *((uint8_t *)kd->kd_addr + 14)); + *((uint8_t *)kd->kd_addr), + *((uint8_t *)kd->kd_addr + 2), + *((uint8_t *)kd->kd_addr + 4), + *((uint8_t *)kd->kd_addr + 6), + *((uint8_t *)kd->kd_addr + 8), + *((uint8_t *)kd->kd_addr + 10), + *((uint8_t *)kd->kd_addr + 12), + *((uint8_t *)kd->kd_addr + 14)); } - return str; + return (str); } static int @@ -411,18 +415,18 @@ spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) spin_lock_irqsave(lock, flags); if (!list_empty(list)) printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", - "size", "data", "func", "line"); + "size", "data", "func", "line"); list_for_each_entry(kd, list, kd_list) printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, - (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), - kd->kd_func, kd->kd_line); + (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), + kd->kd_func, kd->kd_line); spin_unlock_irqrestore(lock, flags); } #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ -#define spl_kmem_init_tracking(list, lock, size) -#define spl_kmem_fini_tracking(list, lock) +#define spl_kmem_init_tracking(list, lock, size) +#define spl_kmem_fini_tracking(list, lock) #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ int @@ -442,10 +446,12 @@ void spl_kmem_fini(void) { #ifdef DEBUG_KMEM - /* Display all unreclaimed memory addresses, including the + /* + * Display all unreclaimed memory addresses, including the * allocation size and the first few bytes of what's located * at that address to aid in debugging. Performance is not - * a serious concern here since it is module unload time. */ + * a serious concern here since it is module unload time. + */ if (kmem_alloc_used_read() != 0) printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", kmem_alloc_used_read(), kmem_alloc_max); diff --git a/module/spl/spl-vmem.c b/module/spl/spl-vmem.c index 4c140eb8..51aef941 100644 --- a/module/spl/spl-vmem.c +++ b/module/spl/spl-vmem.c @@ -1,4 +1,4 @@ -/*****************************************************************************\ +/* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -20,9 +20,7 @@ * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . - ***************************************************************************** - * Solaris Porting Layer (SPL) Kmem Implementation. -\*****************************************************************************/ + */ #include #include @@ -57,18 +55,19 @@ EXPORT_SYMBOL(vmem_size); #ifdef DEBUG_KMEM /* Shim layer memory accounting */ -# ifdef HAVE_ATOMIC64_T +#ifdef HAVE_ATOMIC64_T atomic64_t vmem_alloc_used = ATOMIC64_INIT(0); unsigned long long vmem_alloc_max = 0; -# else /* HAVE_ATOMIC64_T */ +#else /* HAVE_ATOMIC64_T */ atomic_t vmem_alloc_used = ATOMIC_INIT(0); unsigned long long vmem_alloc_max = 0; -# endif /* HAVE_ATOMIC64_T */ +#endif /* HAVE_ATOMIC64_T */ EXPORT_SYMBOL(vmem_alloc_used); EXPORT_SYMBOL(vmem_alloc_max); -/* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked +/* + * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked * but also the location of every alloc and free. When the SPL module is * unloaded a list of all leaked addresses and where they were allocated * will be dumped to the console. Enabling this feature has a significant @@ -79,18 +78,18 @@ EXPORT_SYMBOL(vmem_alloc_max); * debugging enabled for anything other than debugging we need to minimize * the contention by moving to a lock per xmem_table entry model. */ -# ifdef DEBUG_KMEM_TRACKING +#ifdef DEBUG_KMEM_TRACKING -# define VMEM_HASH_BITS 10 -# define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS) +#define VMEM_HASH_BITS 10 +#define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS) typedef struct kmem_debug { - struct hlist_node kd_hlist; /* Hash node linkage */ - struct list_head kd_list; /* List of all allocations */ - void *kd_addr; /* Allocation pointer */ - size_t kd_size; /* Allocation size */ - const char *kd_func; /* Allocation function */ - int kd_line; /* Allocation line */ + struct hlist_node kd_hlist; /* Hash node linkage */ + struct list_head kd_list; /* List of all allocations */ + void *kd_addr; /* Allocation pointer */ + size_t kd_size; /* Allocation size */ + const char *kd_func; /* Allocation function */ + int kd_line; /* Allocation line */ } kmem_debug_t; spinlock_t vmem_lock; @@ -111,12 +110,12 @@ vmem_alloc_track(size_t size, int flags, const char *func, int line) ASSERT(flags & KM_SLEEP); /* Function may be called with KM_NOSLEEP so failure is possible */ - dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t), + dptr = (kmem_debug_t *) kmalloc_nofail(sizeof (kmem_debug_t), flags & ~__GFP_ZERO); if (unlikely(dptr == NULL)) { printk(KERN_WARNING "debug vmem_alloc(%ld, 0x%x) " "at %s:%d failed (%lld/%llu)\n", - sizeof(kmem_debug_t), flags, func, line, + sizeof (kmem_debug_t), flags, func, line, vmem_alloc_used_read(), vmem_alloc_max); } else { /* @@ -194,7 +193,7 @@ vmem_free_track(const void *ptr, size_t size) vmem_alloc_used_sub(size); kfree(dptr->kd_func); - memset((void *)dptr, 0x5a, sizeof(kmem_debug_t)); + memset((void *)dptr, 0x5a, sizeof (kmem_debug_t)); kfree(dptr); memset((void *)ptr, 0x5a, size); @@ -202,7 +201,7 @@ vmem_free_track(const void *ptr, size_t size) } EXPORT_SYMBOL(vmem_free_track); -# else /* DEBUG_KMEM_TRACKING */ +#else /* DEBUG_KMEM_TRACKING */ void * vmem_alloc_debug(size_t size, int flags, const char *func, int line) @@ -242,7 +241,7 @@ vmem_free_debug(const void *ptr, size_t size) } EXPORT_SYMBOL(vmem_free_debug); -# endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM_TRACKING */ #endif /* DEBUG_KMEM */ #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) @@ -255,15 +254,19 @@ spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) ASSERT(str != NULL && len >= 17); memset(str, 0, len); - /* Check for a fully printable string, and while we are at - * it place the printable characters in the passed buffer. */ + /* + * Check for a fully printable string, and while we are at + * it place the printable characters in the passed buffer. + */ for (i = 0; i < size; i++) { str[i] = ((char *)(kd->kd_addr))[i]; if (isprint(str[i])) { continue; } else { - /* Minimum number of printable characters found - * to make it worthwhile to print this as ascii. */ + /* + * Minimum number of printable characters found + * to make it worthwhile to print this as ascii. + */ if (i > min) break; @@ -274,17 +277,17 @@ spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) if (!flag) { sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", - *((uint8_t *)kd->kd_addr), - *((uint8_t *)kd->kd_addr + 2), - *((uint8_t *)kd->kd_addr + 4), - *((uint8_t *)kd->kd_addr + 6), - *((uint8_t *)kd->kd_addr + 8), - *((uint8_t *)kd->kd_addr + 10), - *((uint8_t *)kd->kd_addr + 12), - *((uint8_t *)kd->kd_addr + 14)); + *((uint8_t *)kd->kd_addr), + *((uint8_t *)kd->kd_addr + 2), + *((uint8_t *)kd->kd_addr + 4), + *((uint8_t *)kd->kd_addr + 6), + *((uint8_t *)kd->kd_addr + 8), + *((uint8_t *)kd->kd_addr + 10), + *((uint8_t *)kd->kd_addr + 12), + *((uint8_t *)kd->kd_addr + 14)); } - return str; + return (str); } static int @@ -311,18 +314,18 @@ spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) spin_lock_irqsave(lock, flags); if (!list_empty(list)) printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", - "size", "data", "func", "line"); + "size", "data", "func", "line"); list_for_each_entry(kd, list, kd_list) printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, - (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), - kd->kd_func, kd->kd_line); + (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), + kd->kd_func, kd->kd_line); spin_unlock_irqrestore(lock, flags); } #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ -#define spl_kmem_init_tracking(list, lock, size) -#define spl_kmem_fini_tracking(list, lock) +#define spl_kmem_init_tracking(list, lock, size) +#define spl_kmem_fini_tracking(list, lock) #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ int @@ -342,10 +345,12 @@ void spl_vmem_fini(void) { #ifdef DEBUG_KMEM - /* Display all unreclaimed memory addresses, including the + /* + * Display all unreclaimed memory addresses, including the * allocation size and the first few bytes of what's located * at that address to aid in debugging. Performance is not - * a serious concern here since it is module unload time. */ + * a serious concern here since it is module unload time. + */ if (vmem_alloc_used_read() != 0) printk(KERN_WARNING "vmem leaked %ld/%llu bytes\n", vmem_alloc_used_read(), vmem_alloc_max); From 16740f547051b6fa39a16a6e2c53f831bef8c1ad Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 8 Dec 2014 15:37:14 -0500 Subject: [PATCH 04/11] Refactor generic memory allocation interfaces This patch achieves the following goals: 1. It replaces the preprocessor kmem flag to gfp flag mapping with proper translation logic. This eliminates the potential for surprises that were previously possible where kmem flags were mapped to gfp flags. 2. It maps vmem_alloc() allocations to kmem_alloc() for allocations sized less than or equal to the newly-added spl_kmem_alloc_max parameter. This ensures that small allocations will not contend on a single global lock, large allocations can still be handled, and potentially limited virtual address space will not be squandered. This behavior is entirely different than under Illumos due to different memory management strategies employed by the respective kernels. However, this functionally provides the semantics required. 3. The --disable-debug-kmem, --enable-debug-kmem (default), and --enable-debug-kmem-tracking allocators have been unified in to a single spl_kmem_alloc_impl() allocation function. This was done to simplify the code and make it more maintainable. 4. Improve portability by exposing an implementation of the memory allocations functions that can be safely used in the same way they are used on Illumos. Specifically, callers may safely use KM_SLEEP in contexts which perform filesystem IO. This allows us to eliminate an entire class of Linux specific changes which were previously required to avoid deadlocking the system. This change will be largely transparent to existing callers but there are a few caveats: 1. Because the headers were refactored and extraneous includes removed callers may find they need to explicitly add additional #includes. In particular, kmem_cache.h must now be explicitly includes to access the SPL's kmem cache implementation. This behavior is different from Illumos but it was done to avoid always masking the Linux slab functions when kmem.h is included. 2. Callers, like Lustre, which made assumptions about the definitions of KM_SLEEP, KM_NOSLEEP, and KM_PUSHPAGE will need to be updated. Other callers such as ZFS which did not will not require changes. 3. KM_PUSHPAGE is no longer overloaded to imply GFP_NOIO. It retains its original meaning of allowing allocations to access reserved memory. KM_PUSHPAGE callers can be converted back to KM_SLEEP. 4. The KM_NODEBUG flags has been retired and the default warning threshold increased to 32k. 5. The kmem_virt() functions has been removed. For callers which need to distinguish between a physical and virtual address use is_vmalloc_addr(). Signed-off-by: Brian Behlendorf --- include/sys/kmem.h | 159 ++++-------- include/sys/kmem_cache.h | 24 +- include/sys/vmem.h | 171 ++++--------- man/man5/spl-module-parameters.5 | 40 ++++ module/spl/spl-kmem-cache.c | 89 ++++--- module/spl/spl-kmem.c | 399 +++++++++++++++++++------------ module/spl/spl-proc.c | 20 -- module/spl/spl-tsd.c | 3 +- module/spl/spl-vmem.c | 319 +++--------------------- module/splat/splat-kmem.c | 20 +- 10 files changed, 469 insertions(+), 775 deletions(-) diff --git a/include/sys/kmem.h b/include/sys/kmem.h index a9d94c90..045d07c2 100644 --- a/include/sys/kmem.h +++ b/include/sys/kmem.h @@ -26,6 +26,7 @@ #define _SPL_KMEM_H #include +#include extern int kmem_debugging(void); extern char *kmem_vasprintf(const char *fmt, va_list ap); @@ -36,68 +37,41 @@ extern void strfree(char *str); /* * Memory allocation interfaces */ -#define KM_SLEEP GFP_KERNEL /* Can sleep, never fails */ -#define KM_NOSLEEP GFP_ATOMIC /* Can not sleep, may fail */ -#define KM_PUSHPAGE (GFP_NOIO | __GFP_HIGH) /* Use reserved memory */ -#define KM_NODEBUG __GFP_NOWARN /* Suppress warnings */ -#define KM_FLAGS __GFP_BITS_MASK -#define KM_VMFLAGS GFP_LEVEL_MASK +#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */ +#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */ +#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */ +#define KM_ZERO 0x1000 /* zero the allocation */ +#define KM_VMEM 0x2000 /* caller is vmem_* wrapper */ -/* - * Used internally, the kernel does not need to support this flag - */ -#ifndef __GFP_ZERO -#define __GFP_ZERO 0x8000 -#endif +#define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE) /* - * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as - * early as 2.6.32. To avoid this issue when it occurs in upstream kernels - * we retry the allocation here as long as it is not __GFP_WAIT (GFP_ATOMIC). - * I would prefer the caller handle the failure case cleanly but we are - * trying to emulate Solaris and those are not the Solaris semantics. + * Convert a KM_* flags mask to its Linux GFP_* counterpart. The conversion + * function is context aware which means that KM_SLEEP allocations can be + * safely used in syncing contexts which have set PF_FSTRANS. */ -static inline void * -kmalloc_nofail(size_t size, gfp_t flags) -{ - void *ptr; - - do { - ptr = kmalloc(size, flags); - } while (ptr == NULL && (flags & __GFP_WAIT)); - - return (ptr); -} - -static inline void * -kzalloc_nofail(size_t size, gfp_t flags) +static inline gfp_t +kmem_flags_convert(int flags) { - void *ptr; - - do { - ptr = kzalloc(size, flags); - } while (ptr == NULL && (flags & __GFP_WAIT)); + gfp_t lflags = __GFP_NOWARN | __GFP_COMP; - return (ptr); -} + if (flags & KM_NOSLEEP) { + lflags |= GFP_ATOMIC | __GFP_NORETRY; + } else { + lflags |= GFP_KERNEL; + if ((current->flags & PF_FSTRANS)) + lflags &= ~(__GFP_IO|__GFP_FS); + } -static inline void * -kmalloc_node_nofail(size_t size, gfp_t flags, int node) -{ - void *ptr; + if (flags & KM_PUSHPAGE) + lflags |= __GFP_HIGH; - do { - ptr = kmalloc_node(size, flags, node); - } while (ptr == NULL && (flags & __GFP_WAIT)); + if (flags & KM_ZERO) + lflags |= __GFP_ZERO; - return (ptr); + return (lflags); } -#ifdef DEBUG_KMEM - -/* - * Memory accounting functions to be used only when DEBUG_KMEM is set. - */ #ifdef HAVE_ATOMIC64_T #define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) #define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) @@ -114,70 +88,29 @@ extern atomic_t kmem_alloc_used; extern unsigned long long kmem_alloc_max; #endif /* HAVE_ATOMIC64_T */ -#ifdef DEBUG_KMEM_TRACKING -/* - * DEBUG_KMEM && DEBUG_KMEM_TRACKING - * - * The maximum level of memory debugging. All memory will be accounted - * for and each allocation will be explicitly tracked. Any allocation - * which is leaked will be reported on module unload and the exact location - * where that memory was allocation will be reported. This level of memory - * tracking will have a significant impact on performance and should only - * be enabled for debugging. This feature may be enabled by passing - * --enable-debug-kmem-tracking to configure. - */ -#define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__, 0, 0) -#define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__, 0, 0) -#define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__, 1, nd) -#define kmem_free(ptr, sz) kmem_free_track((ptr), (sz)) - -extern void *kmem_alloc_track(size_t, int, const char *, int, int, int); -extern void kmem_free_track(const void *, size_t); - -#else /* DEBUG_KMEM_TRACKING */ -/* - * DEBUG_KMEM && !DEBUG_KMEM_TRACKING - * - * The default build will set DEBUG_KEM. This provides basic memory - * accounting with little to no impact on performance. When the module - * is unloaded in any memory was leaked the total number of leaked bytes - * will be reported on the console. To disable this basic accounting - * pass the --disable-debug-kmem option to configure. - */ -#define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__, 0, 0) -#define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__, 0, 0) -#define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__, 1, nd) -#define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz)) - -extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int); -extern void kmem_free_debug(const void *, size_t); - -#endif /* DEBUG_KMEM_TRACKING */ -#else /* DEBUG_KMEM */ -/* - * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING - * - * All debugging is disabled. There will be no overhead even for - * minimal memory accounting. To enable basic accounting pass the - * --enable-debug-kmem option to configure. - */ -#define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl)) -#define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl)) -#define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd)) -#define kmem_free(ptr, sz) ((void)(sz), kfree(ptr)) +extern unsigned int spl_kmem_alloc_warn; +extern unsigned int spl_kmem_alloc_max; -#endif /* DEBUG_KMEM */ +#define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl), __func__, __LINE__) +#define kmem_zalloc(sz, fl) spl_kmem_zalloc((sz), (fl), __func__, __LINE__) +#define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz)) -int spl_kmem_init(void); -void spl_kmem_fini(void); +extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line); +extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void spl_kmem_free(const void *ptr, size_t sz); -#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \ - ((ptr) < (void *)VMALLOC_END)) +/* + * The following functions are only available for internal use. + */ +extern void *spl_kmem_alloc_impl(size_t size, int flags, int node); +extern void *spl_kmem_alloc_debug(size_t size, int flags, int node); +extern void *spl_kmem_alloc_track(size_t size, int flags, + const char *func, int line, int node); +extern void spl_kmem_free_impl(const void *buf, size_t size); +extern void spl_kmem_free_debug(const void *buf, size_t size); +extern void spl_kmem_free_track(const void *buf, size_t size); + +extern int spl_kmem_init(void); +extern void spl_kmem_fini(void); #endif /* _SPL_KMEM_H */ diff --git a/include/sys/kmem_cache.h b/include/sys/kmem_cache.h index a5bc0322..a9b5bdd2 100644 --- a/include/sys/kmem_cache.h +++ b/include/sys/kmem_cache.h @@ -202,6 +202,7 @@ extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); +extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags); extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count); extern void spl_kmem_reap(void); @@ -214,29 +215,6 @@ extern void spl_kmem_reap(void); #define kmem_cache_reap_now(skc) \ spl_kmem_cache_reap_now(skc, skc->skc_reap) #define kmem_reap() spl_kmem_reap() -#define kmem_virt(ptr) \ - (((ptr) >= (void *)VMALLOC_START) && \ - ((ptr) < (void *)VMALLOC_END)) - -/* - * Allow custom slab allocation flags to be set for KMC_SLAB based caches. - * One use for this function is to ensure the __GFP_COMP flag is part of - * the default allocation mask which ensures higher order allocations are - * properly refcounted. This flag was added to the default ->allocflags - * as of Linux 3.11. - */ -static inline void -kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags) -{ - if (skc->skc_linux_cache == NULL) - return; - -#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) - skc->skc_linux_cache->allocflags |= flags; -#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) - skc->skc_linux_cache->gfpflags |= flags; -#endif -} /* * The following functions are only available for internal use. diff --git a/include/sys/vmem.h b/include/sys/vmem.h index f59ac5e8..6eb2c676 100644 --- a/include/sys/vmem.h +++ b/include/sys/vmem.h @@ -47,135 +47,60 @@ extern size_t vmem_size(vmem_t *vmp, int typemask); #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) #endif -static inline void * -vmalloc_nofail(size_t size, gfp_t flags) -{ - void *ptr; - - /* - * Retry failed __vmalloc() allocations once every second. The - * rational for the delay is that the likely failure modes are: - * - * 1) The system has completely exhausted memory, in which case - * delaying 1 second for the memory reclaim to run is reasonable - * to avoid thrashing the system. - * 2) The system has memory but has exhausted the small virtual - * address space available on 32-bit systems. Retrying the - * allocation immediately will only result in spinning on the - * virtual address space lock. It is better delay a second and - * hope that another process will free some of the address space. - * But the bottom line is there is not much we can actually do - * since we can never safely return a failure and honor the - * Solaris semantics. - */ - while (1) { - ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); - if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - } else { - break; - } - } - - return (ptr); -} - -static inline void * -vzalloc_nofail(size_t size, gfp_t flags) -{ - void *ptr; - - ptr = vmalloc_nofail(size, flags); - if (ptr) - memset(ptr, 0, (size)); - - return (ptr); -} - -#ifdef DEBUG_KMEM - -/* - * Memory accounting functions to be used only when DEBUG_KMEM is set. - */ -#ifdef HAVE_ATOMIC64_T - -#define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used) -#define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used) -#define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used) -#define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size) - -extern atomic64_t vmem_alloc_used; -extern unsigned long long vmem_alloc_max; - -#else /* HAVE_ATOMIC64_T */ - -#define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used) -#define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used) -#define vmem_alloc_used_read() atomic_read(&vmem_alloc_used) -#define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size) - -extern atomic_t vmem_alloc_used; -extern unsigned long long vmem_alloc_max; - -#endif /* HAVE_ATOMIC64_T */ - -#ifdef DEBUG_KMEM_TRACKING /* - * DEBUG_KMEM && DEBUG_KMEM_TRACKING + * vmem_* is an interface to a low level arena-based memory allocator on + * Illumos that is used to allocate virtual address space. The kmem SLAB + * allocator allocates slabs from it. Then the generic allocation functions + * kmem_{alloc,zalloc,free}() are layered on top of SLAB allocators. * - * The maximum level of memory debugging. All memory will be accounted - * for and each allocation will be explicitly tracked. Any allocation - * which is leaked will be reported on module unload and the exact location - * where that memory was allocation will be reported. This level of memory - * tracking will have a significant impact on performance and should only - * be enabled for debugging. This feature may be enabled by passing - * --enable-debug-kmem-tracking to configure. - */ -#define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__) -#define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__) -#define vmem_free(ptr, sz) vmem_free_track((ptr), (sz)) - -extern void *kmem_alloc_track(size_t, int, const char *, int, int, int); -extern void kmem_free_track(const void *, size_t); -extern void *vmem_alloc_track(size_t, int, const char *, int); -extern void vmem_free_track(const void *, size_t); - -#else /* DEBUG_KMEM_TRACKING */ -/* - * DEBUG_KMEM && !DEBUG_KMEM_TRACKING + * On Linux, the primary means of doing allocations is via kmalloc(), which + * is similarly layered on top of something called the buddy allocator. The + * buddy allocator is not available to kernel modules, it uses physical + * memory addresses rather than virtual memory addresses and is prone to + * fragmentation. * - * The default build will set DEBUG_KEM. This provides basic memory - * accounting with little to no impact on performance. When the module - * is unloaded in any memory was leaked the total number of leaked bytes - * will be reported on the console. To disable this basic accounting - * pass the --disable-debug-kmem option to configure. - */ -#define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__) -#define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__) -#define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz)) - -extern void *vmem_alloc_debug(size_t, int, const char *, int); -extern void vmem_free_debug(const void *, size_t); - -#endif /* DEBUG_KMEM_TRACKING */ -#else /* DEBUG_KMEM */ -/* - * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING + * Linux sets aside a relatively small address space for in-kernel virtual + * memory from which allocations can be done using vmalloc(). It might seem + * like a good idea to use vmalloc() to implement something similar to + * Illumos' allocator. However, this has the following problems: + * + * 1. Page directory table allocations are hard coded to use GFP_KERNEL. + * Consequently, any KM_PUSHPAGE or KM_NOSLEEP allocations done using + * vmalloc() will not have proper semantics. + * + * 2. Address space exhaustion is a real issue on 32-bit platforms where + * only a few 100MB are available. The kernel will handle it by spinning + * when it runs out of address space. + * + * 3. All vmalloc() allocations and frees are protected by a single global + * lock which serializes all allocations. * - * All debugging is disabled. There will be no overhead even for - * minimal memory accounting. To enable basic accounting pass the - * --enable-debug-kmem option to configure. + * 4. Accessing /proc/meminfo and /proc/vmallocinfo will iterate the entire + * list. The former will sum the allocations while the latter will print + * them to user space in a way that user space can keep the lock held + * indefinitely. When the total number of mapped allocations is large + * (several 100,000) a large amount of time will be spent waiting on locks. + * + * 5. Linux has a wait_on_bit() locking primitive that assumes physical + * memory is used, it simply does not work on virtual memory. Certain + * Linux structures (e.g. the superblock) use them and might be embedded + * into a structure from Illumos. This makes using Linux virtual memory + * unsafe in certain situations. + * + * It follows that we cannot obtain identical semantics to those on Illumos. + * Consequently, we implement the kmem_{alloc,zalloc,free}() functions in + * such a way that they can be used as drop-in replacements for small vmem_* + * allocations (8MB in size or smaller) and map vmem_{alloc,zalloc,free}() + * to them. */ -#define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl)) -#define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl)) -#define vmem_free(ptr, sz) ((void)(sz), vfree(ptr)) -#endif /* DEBUG_KMEM */ +#define vmem_alloc(sz, fl) spl_vmem_alloc((sz), (fl), __func__, __LINE__) +#define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__) +#define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz)) + +extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line); +extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void spl_vmem_free(const void *ptr, size_t sz); int spl_vmem_init(void); void spl_vmem_fini(void); diff --git a/man/man5/spl-module-parameters.5 b/man/man5/spl-module-parameters.5 index 33e10b53..2ec5b668 100644 --- a/man/man5/spl-module-parameters.5 +++ b/man/man5/spl-module-parameters.5 @@ -80,6 +80,46 @@ By age (0x1) or low memory (0x2) Default value: \fB0\fR. .RE +.sp +.ne 2 +.na +\fBspl_kmem_alloc_warn\fR (uint) +.ad +.RS 12n +As a general rule kmem_alloc() allocations should be small, preferably +just a few pages since they must by physically contiguous. Therefore, a +rate limited warning will be printed to the console for any kmem_alloc() +which exceeds a reasonable threshold. + +The default warning threshold is set to eight pages but capped at 32K to +accommodate systems using large pages. This value was selected to be small +enough to ensure the largest allocations are quickly noticed and fixed. +But large enough to avoid logging any warnings when a allocation size is +larger than optimal but not a serious concern. Since this value is tunable, +developers are encouraged to set it lower when testing so any new largish +allocations are quickly caught. These warnings may be disabled by setting +the threshold to zero. +.sp +Default value: \fB32K\fR. +.RE + +.sp +.ne 2 +.na +\fBspl_kmem_alloc_max\fR (uint) +.ad +.RS 12n +Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. +Allocations which are marginally smaller than this limit may succeed but +should still be avoided due to the expense of locating a contiguous range +of free pages. Therefore, a maximum kmem size with reasonable safely +margin of 4x is set. Kmem_alloc() allocations larger than this maximum +will quickly fail. Vmem_alloc() allocations less than or equal to this +value will use kmalloc(), but shift to vmalloc() when exceeding this value. +.sp +Default value: \fBKMALLOC_MAX_SIZE/4\fR. +.RE + .sp .ne 2 .na diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c index 3aa65a9b..9a8ccfe4 100644 --- a/module/spl/spl-kmem-cache.c +++ b/module/spl/spl-kmem-cache.c @@ -130,19 +130,6 @@ MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, * One serious concern I do have about this method is the relatively * small virtual address space on 32bit arches. This will seriously * constrain the size of the slab caches and their performance. - * - * XXX: Improve the partial slab list by carefully maintaining a - * strict ordering of fullest to emptiest slabs based on - * the slab reference count. This guarantees that when freeing - * slabs back to the system we need only linearly traverse the - * last N slabs in the list to discover all the freeable slabs. - * - * XXX: NUMA awareness for optionally allocating memory close to a - * particular core. This can be advantageous if you know the slab - * object will be short lived and primarily accessed from one core. - * - * XXX: Slab coloring may also yield performance improvements and would - * be desirable to implement. */ struct list_head spl_kmem_cache_list; /* List of caches */ @@ -158,15 +145,15 @@ SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, static void * kv_alloc(spl_kmem_cache_t *skc, int size, int flags) { + gfp_t lflags = kmem_flags_convert(flags); void *ptr; ASSERT(ISP2(size)); if (skc->skc_flags & KMC_KMEM) - ptr = (void *)__get_free_pages(flags | __GFP_COMP, - get_order(size)); + ptr = (void *)__get_free_pages(lflags, get_order(size)); else - ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL); /* Resulting allocated memory will be page aligned */ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); @@ -361,12 +348,11 @@ spl_slab_free(spl_kmem_slab_t *sks, } /* - * Traverse all the partial slabs attached to a cache and free those - * which which are currently empty, and have not been touched for - * skc_delay seconds to avoid thrashing. The count argument is - * passed to optionally cap the number of slabs reclaimed, a count - * of zero means try and reclaim everything. When flag is set we - * always free an available slab regardless of age. + * Traverse all the partial slabs attached to a cache and free those which + * are currently empty, and have not been touched for skc_delay seconds to + * avoid thrashing. The count argument is passed to optionally cap the + * number of slabs reclaimed, a count of zero means try and reclaim + * everything. When flag the is set available slabs freed regardless of age. */ static void spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag) @@ -480,6 +466,7 @@ spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) static int spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) { + gfp_t lflags = kmem_flags_convert(flags); spl_kmem_emergency_t *ske; int empty; @@ -490,11 +477,11 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) if (!empty) return (-EEXIST); - ske = kmalloc(sizeof (*ske), flags); + ske = kmalloc(sizeof (*ske), lflags); if (ske == NULL) return (-ENOMEM); - ske->ske_obj = kmalloc(skc->skc_obj_size, flags); + ske->ske_obj = kmalloc(skc->skc_obj_size, lflags); if (ske->ske_obj == NULL) { kfree(ske); return (-ENOMEM); @@ -734,7 +721,7 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) int size = sizeof (spl_kmem_magazine_t) + sizeof (void *) * skc->skc_mag_size; - skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu)); + skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (skm) { skm->skm_magic = SKM_MAGIC; skm->skm_avail = 0; @@ -754,13 +741,9 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) static void spl_magazine_free(spl_kmem_magazine_t *skm) { - int size = sizeof (spl_kmem_magazine_t) + - sizeof (void *) * skm->skm_size; - ASSERT(skm->skm_magic == SKM_MAGIC); ASSERT(skm->skm_avail == 0); - - kmem_free(skm, size); + kfree(skm); } /* @@ -835,6 +818,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags) { + gfp_t lflags = kmem_flags_convert(KM_SLEEP); spl_kmem_cache_t *skc; int rc; @@ -852,18 +836,17 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, * Allocate memory for a new cache and initialize it. Unfortunately, * this usually ends up being a large allocation of ~32k because * we need to allocate enough memory for the worst case number of - * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we - * explicitly pass KM_NODEBUG to suppress the kmem warning + * cpus in the magazine, skc_mag[NR_CPUS]. */ - skc = kmem_zalloc(sizeof (*skc), KM_SLEEP| KM_NODEBUG); + skc = kzalloc(sizeof (*skc), lflags); if (skc == NULL) return (NULL); skc->skc_magic = SKC_MAGIC; skc->skc_name_size = strlen(name) + 1; - skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP); + skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags); if (skc->skc_name == NULL) { - kmem_free(skc, sizeof (*skc)); + kfree(skc); return (NULL); } strncpy(skc->skc_name, name, skc->skc_name_size); @@ -962,7 +945,11 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, goto out; } - kmem_cache_set_allocflags(skc, __GFP_COMP); +#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) + skc->skc_linux_cache->allocflags |= __GFP_COMP; +#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) + skc->skc_linux_cache->gfpflags |= __GFP_COMP; +#endif skc->skc_flags |= KMC_NOMAGAZINE; } @@ -977,8 +964,8 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, return (skc); out: - kmem_free(skc->skc_name, skc->skc_name_size); - kmem_free(skc, sizeof (*skc)); + kfree(skc->skc_name); + kfree(skc); return (NULL); } EXPORT_SYMBOL(spl_kmem_cache_create); @@ -1048,10 +1035,10 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) ASSERT3U(skc->skc_obj_emergency, ==, 0); ASSERT(list_empty(&skc->skc_complete_list)); - kmem_free(skc->skc_name, skc->skc_name_size); spin_unlock(&skc->skc_lock); - kmem_free(skc, sizeof (*skc)); + kfree(skc->skc_name); + kfree(skc); } EXPORT_SYMBOL(spl_kmem_cache_destroy); @@ -1106,7 +1093,13 @@ spl_cache_grow_work(void *data) spl_kmem_cache_t *skc = ska->ska_cache; spl_kmem_slab_t *sks; - sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG); +#if defined(PF_MEMALLOC_NOIO) + unsigned noio_flag = memalloc_noio_save(); + sks = spl_slab_alloc(skc, ska->ska_flags); + memalloc_noio_restore(noio_flag); +#else + sks = spl_slab_alloc(skc, ska->ska_flags); +#endif spin_lock(&skc->skc_lock); if (sks) { skc->skc_slab_total++; @@ -1140,8 +1133,9 @@ spl_cache_grow_wait(spl_kmem_cache_t *skc) static int spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) { - int remaining, rc; + int remaining, rc = 0; + ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT((skc->skc_flags & KMC_SLAB) == 0); might_sleep(); @@ -1166,7 +1160,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { spl_kmem_alloc_t *ska; - ska = kmalloc(sizeof (*ska), flags); + ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags)); if (ska == NULL) { clear_bit(KMC_BIT_GROWING, &skc->skc_flags); wake_up_all(&skc->skc_waitq); @@ -1175,7 +1169,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) atomic_inc(&skc->skc_ref); ska->ska_cache = skc; - ska->ska_flags = flags & ~__GFP_FS; + ska->ska_flags = flags; taskq_init_ent(&ska->ska_tqe); taskq_dispatch_ent(spl_kmem_cache_taskq, spl_cache_grow_work, ska, 0, &ska->ska_tqe); @@ -1347,9 +1341,9 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) spl_kmem_magazine_t *skm; void *obj = NULL; + ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - ASSERT(flags & KM_SLEEP); atomic_inc(&skc->skc_ref); @@ -1360,9 +1354,8 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) */ if (skc->skc_flags & KMC_SLAB) { struct kmem_cache *slc = skc->skc_linux_cache; - do { - obj = kmem_cache_alloc(slc, flags | __GFP_COMP); + obj = kmem_cache_alloc(slc, kmem_flags_convert(flags)); } while ((obj == NULL) && !(flags & KM_NOSLEEP)); goto ret; @@ -1445,7 +1438,7 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) * are guaranteed to have physical addresses. They must be removed * from the tree of emergency objects and the freed. */ - if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) { + if ((skc->skc_flags & KMC_VMEM) && !is_vmalloc_addr(obj)) { spl_emergency_free(skc, obj); goto out; } diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c index 96ad2b04..4cd7cdbe 100644 --- a/module/spl/spl-kmem.c +++ b/module/spl/spl-kmem.c @@ -23,8 +23,47 @@ */ #include +#include #include #include +#include +#include + +/* + * As a general rule kmem_alloc() allocations should be small, preferably + * just a few pages since they must by physically contiguous. Therefore, a + * rate limited warning will be printed to the console for any kmem_alloc() + * which exceeds a reasonable threshold. + * + * The default warning threshold is set to eight pages but capped at 32K to + * accommodate systems using large pages. This value was selected to be small + * enough to ensure the largest allocations are quickly noticed and fixed. + * But large enough to avoid logging any warnings when a allocation size is + * larger than optimal but not a serious concern. Since this value is tunable, + * developers are encouraged to set it lower when testing so any new largish + * allocations are quickly caught. These warnings may be disabled by setting + * the threshold to zero. + */ +unsigned int spl_kmem_alloc_warn = MAX(8 * PAGE_SIZE, 32 * 1024); +module_param(spl_kmem_alloc_warn, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_warn, + "Warning threshold in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_warn); + +/* + * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. + * Allocations which are marginally smaller than this limit may succeed but + * should still be avoided due to the expense of locating a contiguous range + * of free pages. Therefore, a maximum kmem size with reasonable safely + * margin of 4x is set. Kmem_alloc() allocations larger than this maximum + * will quickly fail. Vmem_alloc() allocations less than or equal to this + * value will use kmalloc(), but shift to vmalloc() when exceeding this value. + */ +unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2); +module_param(spl_kmem_alloc_max, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_max, + "Maximum size in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_max); int kmem_debugging(void) @@ -72,7 +111,7 @@ __strdup(const char *str, int flags) int n; n = strlen(str); - ptr = kmalloc_nofail(n + 1, flags); + ptr = kmalloc(n + 1, kmem_flags_convert(flags)); if (ptr) memcpy(ptr, str, n + 1); @@ -94,10 +133,101 @@ strfree(char *str) EXPORT_SYMBOL(strfree); /* - * Memory allocation interfaces and debugging for basic kmem_* - * and vmem_* style memory allocation. When DEBUG_KMEM is enabled - * the SPL will keep track of the total memory allocated, and - * report any memory leaked when the module is unloaded. + * Limit the number of large allocation stack traces dumped to not more than + * 5 every 60 seconds to prevent denial-of-service attacks from debug code. + */ +DEFINE_RATELIMIT_STATE(kmem_alloc_ratelimit_state, 60 * HZ, 5); + +/* + * General purpose unified implementation of kmem_alloc(). It is an + * amalgamation of Linux and Illumos allocator design. It should never be + * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains + * relatively portable. Consumers may only access this function through + * wrappers that enforce the common flags to ensure portability. + */ +inline void * +spl_kmem_alloc_impl(size_t size, int flags, int node) +{ + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + /* + * Log abnormally large allocations and rate limit the console output. + * Allocations larger than spl_kmem_alloc_warn should be performed + * through the vmem_alloc()/vmem_zalloc() interfaces. + */ + if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) && + !(flags & KM_VMEM) && __ratelimit(&kmem_alloc_ratelimit_state)) { + printk(KERN_WARNING + "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n" + "https://github.com/zfsonlinux/zfs/issues/new\n", + (unsigned long)size, flags); + dump_stack(); + } + + /* + * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used + * unlike kmem_alloc() with KM_SLEEP on Illumos. + */ + do { + /* + * Calling kmalloc_node() when the size >= spl_kmem_alloc_max + * is unsafe. This must fail for all for kmem_alloc() and + * kmem_zalloc() callers. + * + * For vmem_alloc() and vmem_zalloc() callers it is permissible + * to use __vmalloc(). However, in general use of __vmalloc() + * is strongly discouraged because a global lock must be + * acquired. Contention on this lock can significantly + * impact performance so frequently manipulating the virtual + * address space is strongly discouraged. + */ + if (unlikely(size > spl_kmem_alloc_max)) { + if (flags & KM_VMEM) { + ptr = __vmalloc(size, lflags, PAGE_KERNEL); + } else { + return (NULL); + } + } else { + ptr = kmalloc_node(size, lflags, node); + } + + if (likely(ptr) || (flags & KM_NOSLEEP)) + return (ptr); + + if (unlikely(__ratelimit(&kmem_alloc_ratelimit_state))) { + printk(KERN_WARNING + "Possible memory allocation deadlock: " + "size=%lu lflags=0x%x", + (unsigned long)size, lflags); + dump_stack(); + } + + /* + * Use cond_resched() instead of congestion_wait() to avoid + * deadlocking systems where there are no block devices. + */ + cond_resched(); + } while (1); + + return (NULL); +} + +inline void +spl_kmem_free_impl(const void *buf, size_t size) +{ + if (is_vmalloc_addr(buf)) + vfree(buf); + else + kfree(buf); +} + +/* + * Memory allocation and accounting for kmem_* * style allocations. When + * DEBUG_KMEM is enabled the total memory allocated will be tracked and + * any memory leaked will be reported during module unload. + * + * ./configure --enable-debug-kmem */ #ifdef DEBUG_KMEM @@ -113,6 +243,28 @@ unsigned long long kmem_alloc_max = 0; EXPORT_SYMBOL(kmem_alloc_used); EXPORT_SYMBOL(kmem_alloc_max); +inline void * +spl_kmem_alloc_debug(size_t size, int flags, int node) +{ + void *ptr; + + ptr = spl_kmem_alloc_impl(size, flags, node); + if (ptr) { + kmem_alloc_used_add(size); + if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) + kmem_alloc_max = kmem_alloc_used_read(); + } + + return (ptr); +} + +inline void +spl_kmem_free_debug(const void *ptr, size_t size) +{ + kmem_alloc_used_sub(size); + spl_kmem_free_impl(ptr, size); +} + /* * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked * but also the location of every alloc and free. When the SPL module is @@ -124,9 +276,14 @@ EXPORT_SYMBOL(kmem_alloc_max); * contended particularly on xfree(). If we want to run with this detailed * debugging enabled for anything other than debugging we need to minimize * the contention by moving to a lock per xmem_table entry model. + * + * ./configure --enable-debug-kmem-tracking */ #ifdef DEBUG_KMEM_TRACKING +#include +#include + #define KMEM_HASH_BITS 10 #define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) @@ -139,13 +296,9 @@ typedef struct kmem_debug { int kd_line; /* Allocation line */ } kmem_debug_t; -spinlock_t kmem_lock; -struct hlist_head kmem_table[KMEM_TABLE_SIZE]; -struct list_head kmem_list; - -EXPORT_SYMBOL(kmem_lock); -EXPORT_SYMBOL(kmem_table); -EXPORT_SYMBOL(kmem_list); +static spinlock_t kmem_lock; +static struct hlist_head kmem_table[KMEM_TABLE_SIZE]; +static struct list_head kmem_list; static kmem_debug_t * kmem_del_init(spinlock_t *lock, struct hlist_head *table, @@ -174,176 +327,112 @@ kmem_del_init(spinlock_t *lock, struct hlist_head *table, return (NULL); } -void * -kmem_alloc_track(size_t size, int flags, const char *func, int line, - int node_alloc, int node) +inline void * +spl_kmem_alloc_track(size_t size, int flags, + const char *func, int line, int node) { void *ptr = NULL; kmem_debug_t *dptr; unsigned long irq_flags; - /* Function may be called with KM_NOSLEEP so failure is possible */ - dptr = (kmem_debug_t *) kmalloc_nofail(sizeof (kmem_debug_t), - flags & ~__GFP_ZERO); + dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags)); + if (dptr == NULL) + return (NULL); - if (unlikely(dptr == NULL)) { - printk(KERN_WARNING "debug kmem_alloc(%ld, 0x%x) at %s:%d " - "failed (%lld/%llu)\n", sizeof (kmem_debug_t), flags, - func, line, kmem_alloc_used_read(), kmem_alloc_max); - } else { - /* - * Marked unlikely because we should never be doing this, - * we tolerate to up 2 pages but a single page is best. - */ - if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) { - printk(KERN_WARNING "large kmem_alloc(%llu, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - kmem_alloc_used_read(), kmem_alloc_max); - spl_dumpstack(); - } - - /* - * We use __strdup() below because the string pointed to by - * __FUNCTION__ might not be available by the time we want - * to print it since the module might have been unloaded. - * This can only fail in the KM_NOSLEEP case. - */ - dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO); - if (unlikely(dptr->kd_func == NULL)) { - kfree(dptr); - printk(KERN_WARNING "debug __strdup() at %s:%d " - "failed (%lld/%llu)\n", func, line, - kmem_alloc_used_read(), kmem_alloc_max); - goto out; - } - - /* Use the correct allocator */ - if (node_alloc) { - ASSERT(!(flags & __GFP_ZERO)); - ptr = kmalloc_node_nofail(size, flags, node); - } else if (flags & __GFP_ZERO) { - ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO); - } else { - ptr = kmalloc_nofail(size, flags); - } + dptr->kd_func = __strdup(func, flags); + if (dptr->kd_func == NULL) { + kfree(dptr); + return (NULL); + } - if (unlikely(ptr == NULL)) { - kfree(dptr->kd_func); - kfree(dptr); - printk(KERN_WARNING "kmem_alloc(%llu, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - (unsigned long long) size, flags, func, line, - kmem_alloc_used_read(), kmem_alloc_max); - goto out; - } + ptr = spl_kmem_alloc_debug(size, flags, node); + if (ptr == NULL) { + kfree(dptr->kd_func); + kfree(dptr); + return (NULL); + } - kmem_alloc_used_add(size); - if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) - kmem_alloc_max = kmem_alloc_used_read(); + INIT_HLIST_NODE(&dptr->kd_hlist); + INIT_LIST_HEAD(&dptr->kd_list); - INIT_HLIST_NODE(&dptr->kd_hlist); - INIT_LIST_HEAD(&dptr->kd_list); + dptr->kd_addr = ptr; + dptr->kd_size = size; + dptr->kd_line = line; - dptr->kd_addr = ptr; - dptr->kd_size = size; - dptr->kd_line = line; + spin_lock_irqsave(&kmem_lock, irq_flags); + hlist_add_head(&dptr->kd_hlist, + &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); + list_add_tail(&dptr->kd_list, &kmem_list); + spin_unlock_irqrestore(&kmem_lock, irq_flags); - spin_lock_irqsave(&kmem_lock, irq_flags); - hlist_add_head(&dptr->kd_hlist, - &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); - list_add_tail(&dptr->kd_list, &kmem_list); - spin_unlock_irqrestore(&kmem_lock, irq_flags); - } -out: return (ptr); } -EXPORT_SYMBOL(kmem_alloc_track); -void -kmem_free_track(const void *ptr, size_t size) +inline void +spl_kmem_free_track(const void *ptr, size_t size) { kmem_debug_t *dptr; - ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, - (unsigned long long) size); - /* Must exist in hash due to kmem_alloc() */ dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr); - ASSERT(dptr); + ASSERT3P(dptr, !=, NULL); + ASSERT3S(dptr->kd_size, ==, size); - /* Size must match */ - ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " - "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size, - (unsigned long long) size, dptr->kd_func, dptr->kd_line); - - kmem_alloc_used_sub(size); kfree(dptr->kd_func); - - memset((void *)dptr, 0x5a, sizeof (kmem_debug_t)); kfree(dptr); - memset((void *)ptr, 0x5a, size); - kfree(ptr); + spl_kmem_free_debug(ptr, size); } -EXPORT_SYMBOL(kmem_free_track); - -#else /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ +/* + * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces. + */ void * -kmem_alloc_debug(size_t size, int flags, const char *func, int line, - int node_alloc, int node) +spl_kmem_alloc(size_t size, int flags, const char *func, int line) { - void *ptr; - - /* - * Marked unlikely because we should never be doing this, - * we tolerate to up 2 pages but a single page is best. - */ - if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) { - printk(KERN_WARNING - "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max); - spl_dumpstack(); - } + ASSERT0(flags & ~KM_PUBLIC_MASK); + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_kmem_alloc); - /* Use the correct allocator */ - if (node_alloc) { - ASSERT(!(flags & __GFP_ZERO)); - ptr = kmalloc_node_nofail(size, flags, node); - } else if (flags & __GFP_ZERO) { - ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO)); - } else { - ptr = kmalloc_nofail(size, flags); - } +void * +spl_kmem_zalloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); - if (unlikely(ptr == NULL)) { - printk(KERN_WARNING - "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max); - } else { - kmem_alloc_used_add(size); - if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) - kmem_alloc_max = kmem_alloc_used_read(); - } + flags |= KM_ZERO; - return (ptr); +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif } -EXPORT_SYMBOL(kmem_alloc_debug); +EXPORT_SYMBOL(spl_kmem_zalloc); void -kmem_free_debug(const void *ptr, size_t size) +spl_kmem_free(const void *buf, size_t size) { - ASSERT(ptr || size > 0); - kmem_alloc_used_sub(size); - kfree(ptr); +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif } -EXPORT_SYMBOL(kmem_free_debug); - -#endif /* DEBUG_KMEM_TRACKING */ -#endif /* DEBUG_KMEM */ +EXPORT_SYMBOL(spl_kmem_free); #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) static char * @@ -424,22 +513,20 @@ spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) spin_unlock_irqrestore(lock, flags); } -#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ -#define spl_kmem_init_tracking(list, lock, size) -#define spl_kmem_fini_tracking(list, lock) #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ int spl_kmem_init(void) { - int rc = 0; - #ifdef DEBUG_KMEM kmem_alloc_used_set(0); + +#ifdef DEBUG_KMEM_TRACKING spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); -#endif +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ - return (rc); + return (0); } void @@ -454,8 +541,10 @@ spl_kmem_fini(void) */ if (kmem_alloc_used_read() != 0) printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", - kmem_alloc_used_read(), kmem_alloc_max); + (unsigned long)kmem_alloc_used_read(), kmem_alloc_max); +#ifdef DEBUG_KMEM_TRACKING spl_kmem_fini_tracking(&kmem_list, &kmem_lock); +#endif /* DEBUG_KMEM_TRACKING */ #endif /* DEBUG_KMEM */ } diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c index e5712aee..a434ef54 100644 --- a/module/spl/spl-proc.c +++ b/module/spl/spl-proc.c @@ -353,26 +353,6 @@ static struct ctl_table spl_kmem_table[] = { .mode = 0444, .proc_handler = &proc_doulongvec_minmax, }, - { - .procname = "vmem_used", - .data = &vmem_alloc_used, -# ifdef HAVE_ATOMIC64_T - .maxlen = sizeof(atomic64_t), -# else - .maxlen = sizeof(atomic_t), -# endif /* HAVE_ATOMIC64_T */ - .mode = 0444, - .proc_handler = &proc_domemused, - }, - { - .procname = "vmem_max", - .data = &vmem_alloc_max, - .maxlen = sizeof(unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = &proc_doulongvec_minmax, - }, { .procname = "slab_kmem_total", .data = (void *)(KMC_KMEM | KMC_TOTAL), diff --git a/module/spl/spl-tsd.c b/module/spl/spl-tsd.c index f4f81048..9a098752 100644 --- a/module/spl/spl-tsd.c +++ b/module/spl/spl-tsd.c @@ -337,8 +337,7 @@ tsd_hash_table_init(uint_t bits) if (table == NULL) return (NULL); - table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size, - KM_SLEEP | KM_NODEBUG); + table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size, KM_SLEEP); if (table->ht_bins == NULL) { kmem_free(table, sizeof(tsd_hash_table_t)); return (NULL); diff --git a/module/spl/spl-vmem.c b/module/spl/spl-vmem.c index 51aef941..e177988a 100644 --- a/module/spl/spl-vmem.c +++ b/module/spl/spl-vmem.c @@ -24,6 +24,7 @@ #include #include +#include #include vmem_t *heap_arena = NULL; @@ -47,314 +48,62 @@ vmem_size(vmem_t *vmp, int typemask) EXPORT_SYMBOL(vmem_size); /* - * Memory allocation interfaces and debugging for basic kmem_* - * and vmem_* style memory allocation. When DEBUG_KMEM is enabled - * the SPL will keep track of the total memory allocated, and - * report any memory leaked when the module is unloaded. + * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces. */ -#ifdef DEBUG_KMEM - -/* Shim layer memory accounting */ -#ifdef HAVE_ATOMIC64_T -atomic64_t vmem_alloc_used = ATOMIC64_INIT(0); -unsigned long long vmem_alloc_max = 0; -#else /* HAVE_ATOMIC64_T */ -atomic_t vmem_alloc_used = ATOMIC_INIT(0); -unsigned long long vmem_alloc_max = 0; -#endif /* HAVE_ATOMIC64_T */ - -EXPORT_SYMBOL(vmem_alloc_used); -EXPORT_SYMBOL(vmem_alloc_max); - -/* - * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked - * but also the location of every alloc and free. When the SPL module is - * unloaded a list of all leaked addresses and where they were allocated - * will be dumped to the console. Enabling this feature has a significant - * impact on performance but it makes finding memory leaks straight forward. - * - * Not surprisingly with debugging enabled the xmem_locks are very highly - * contended particularly on xfree(). If we want to run with this detailed - * debugging enabled for anything other than debugging we need to minimize - * the contention by moving to a lock per xmem_table entry model. - */ -#ifdef DEBUG_KMEM_TRACKING - -#define VMEM_HASH_BITS 10 -#define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS) - -typedef struct kmem_debug { - struct hlist_node kd_hlist; /* Hash node linkage */ - struct list_head kd_list; /* List of all allocations */ - void *kd_addr; /* Allocation pointer */ - size_t kd_size; /* Allocation size */ - const char *kd_func; /* Allocation function */ - int kd_line; /* Allocation line */ -} kmem_debug_t; - -spinlock_t vmem_lock; -struct hlist_head vmem_table[VMEM_TABLE_SIZE]; -struct list_head vmem_list; - -EXPORT_SYMBOL(vmem_lock); -EXPORT_SYMBOL(vmem_table); -EXPORT_SYMBOL(vmem_list); - void * -vmem_alloc_track(size_t size, int flags, const char *func, int line) -{ - void *ptr = NULL; - kmem_debug_t *dptr; - unsigned long irq_flags; - - ASSERT(flags & KM_SLEEP); - - /* Function may be called with KM_NOSLEEP so failure is possible */ - dptr = (kmem_debug_t *) kmalloc_nofail(sizeof (kmem_debug_t), - flags & ~__GFP_ZERO); - if (unlikely(dptr == NULL)) { - printk(KERN_WARNING "debug vmem_alloc(%ld, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - sizeof (kmem_debug_t), flags, func, line, - vmem_alloc_used_read(), vmem_alloc_max); - } else { - /* - * We use __strdup() below because the string pointed to by - * __FUNCTION__ might not be available by the time we want - * to print it, since the module might have been unloaded. - * This can never fail because we have already asserted - * that flags is KM_SLEEP. - */ - dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO); - if (unlikely(dptr->kd_func == NULL)) { - kfree(dptr); - printk(KERN_WARNING "debug __strdup() at %s:%d " - "failed (%lld/%llu)\n", func, line, - vmem_alloc_used_read(), vmem_alloc_max); - goto out; - } - - /* Use the correct allocator */ - if (flags & __GFP_ZERO) { - ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO); - } else { - ptr = vmalloc_nofail(size, flags); - } - - if (unlikely(ptr == NULL)) { - kfree(dptr->kd_func); - kfree(dptr); - printk(KERN_WARNING "vmem_alloc (%llu, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - (unsigned long long) size, flags, func, line, - vmem_alloc_used_read(), vmem_alloc_max); - goto out; - } - - vmem_alloc_used_add(size); - if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) - vmem_alloc_max = vmem_alloc_used_read(); - - INIT_HLIST_NODE(&dptr->kd_hlist); - INIT_LIST_HEAD(&dptr->kd_list); - - dptr->kd_addr = ptr; - dptr->kd_size = size; - dptr->kd_line = line; - - spin_lock_irqsave(&vmem_lock, irq_flags); - hlist_add_head(&dptr->kd_hlist, - &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]); - list_add_tail(&dptr->kd_list, &vmem_list); - spin_unlock_irqrestore(&vmem_lock, irq_flags); - } -out: - return (ptr); -} -EXPORT_SYMBOL(vmem_alloc_track); - -void -vmem_free_track(const void *ptr, size_t size) +spl_vmem_alloc(size_t size, int flags, const char *func, int line) { - kmem_debug_t *dptr; + ASSERT0(flags & ~KM_PUBLIC_MASK); - ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, - (unsigned long long) size); + flags |= KM_VMEM; - /* Must exist in hash due to vmem_alloc() */ - dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr); - ASSERT(dptr); - - /* Size must match */ - ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " - "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size, - (unsigned long long) size, dptr->kd_func, dptr->kd_line); - - vmem_alloc_used_sub(size); - kfree(dptr->kd_func); - - memset((void *)dptr, 0x5a, sizeof (kmem_debug_t)); - kfree(dptr); - - memset((void *)ptr, 0x5a, size); - vfree(ptr); +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif } -EXPORT_SYMBOL(vmem_free_track); - -#else /* DEBUG_KMEM_TRACKING */ +EXPORT_SYMBOL(spl_vmem_alloc); void * -vmem_alloc_debug(size_t size, int flags, const char *func, int line) +spl_vmem_zalloc(size_t size, int flags, const char *func, int line) { - void *ptr; - - ASSERT(flags & KM_SLEEP); + ASSERT0(flags & ~KM_PUBLIC_MASK); - /* Use the correct allocator */ - if (flags & __GFP_ZERO) { - ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO)); - } else { - ptr = vmalloc_nofail(size, flags); - } + flags |= (KM_VMEM | KM_ZERO); - if (unlikely(ptr == NULL)) { - printk(KERN_WARNING - "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - (unsigned long long)vmem_alloc_used_read(), vmem_alloc_max); - } else { - vmem_alloc_used_add(size); - if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) - vmem_alloc_max = vmem_alloc_used_read(); - } - - return (ptr); +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif } -EXPORT_SYMBOL(vmem_alloc_debug); +EXPORT_SYMBOL(spl_vmem_zalloc); void -vmem_free_debug(const void *ptr, size_t size) -{ - ASSERT(ptr || size > 0); - vmem_alloc_used_sub(size); - vfree(ptr); -} -EXPORT_SYMBOL(vmem_free_debug); - -#endif /* DEBUG_KMEM_TRACKING */ -#endif /* DEBUG_KMEM */ - -#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) -static char * -spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) -{ - int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; - int i, flag = 1; - - ASSERT(str != NULL && len >= 17); - memset(str, 0, len); - - /* - * Check for a fully printable string, and while we are at - * it place the printable characters in the passed buffer. - */ - for (i = 0; i < size; i++) { - str[i] = ((char *)(kd->kd_addr))[i]; - if (isprint(str[i])) { - continue; - } else { - /* - * Minimum number of printable characters found - * to make it worthwhile to print this as ascii. - */ - if (i > min) - break; - - flag = 0; - break; - } - } - - if (!flag) { - sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", - *((uint8_t *)kd->kd_addr), - *((uint8_t *)kd->kd_addr + 2), - *((uint8_t *)kd->kd_addr + 4), - *((uint8_t *)kd->kd_addr + 6), - *((uint8_t *)kd->kd_addr + 8), - *((uint8_t *)kd->kd_addr + 10), - *((uint8_t *)kd->kd_addr + 12), - *((uint8_t *)kd->kd_addr + 14)); - } - - return (str); -} - -static int -spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) -{ - int i; - - spin_lock_init(lock); - INIT_LIST_HEAD(list); - - for (i = 0; i < size; i++) - INIT_HLIST_HEAD(&kmem_table[i]); - - return (0); -} - -static void -spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) +spl_vmem_free(const void *buf, size_t size) { - unsigned long flags; - kmem_debug_t *kd; - char str[17]; - - spin_lock_irqsave(lock, flags); - if (!list_empty(list)) - printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", - "size", "data", "func", "line"); - - list_for_each_entry(kd, list, kd_list) - printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, - (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), - kd->kd_func, kd->kd_line); - - spin_unlock_irqrestore(lock, flags); +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif } -#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ -#define spl_kmem_init_tracking(list, lock, size) -#define spl_kmem_fini_tracking(list, lock) -#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ +EXPORT_SYMBOL(spl_vmem_free); int spl_vmem_init(void) { - int rc = 0; - -#ifdef DEBUG_KMEM - vmem_alloc_used_set(0); - spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE); -#endif - - return (rc); + return (0); } void spl_vmem_fini(void) { -#ifdef DEBUG_KMEM - /* - * Display all unreclaimed memory addresses, including the - * allocation size and the first few bytes of what's located - * at that address to aid in debugging. Performance is not - * a serious concern here since it is module unload time. - */ - if (vmem_alloc_used_read() != 0) - printk(KERN_WARNING "vmem leaked %ld/%llu bytes\n", - vmem_alloc_used_read(), vmem_alloc_max); - - spl_kmem_fini_tracking(&vmem_list, &vmem_lock); -#endif /* DEBUG_KMEM */ } diff --git a/module/splat/splat-kmem.c b/module/splat/splat-kmem.c index 7edc8599..81f748bb 100644 --- a/module/splat/splat-kmem.c +++ b/module/splat/splat-kmem.c @@ -95,11 +95,11 @@ splat_kmem_test1(struct file *file, void *arg) int size = PAGE_SIZE; int i, count, rc = 0; - while ((!rc) && (size <= (PAGE_SIZE * 32))) { + while ((!rc) && (size <= spl_kmem_alloc_warn)) { count = 0; for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) { - ptr[i] = kmem_alloc(size, KM_SLEEP | KM_NODEBUG); + ptr[i] = kmem_alloc(size, KM_SLEEP); if (ptr[i]) count++; } @@ -127,11 +127,11 @@ splat_kmem_test2(struct file *file, void *arg) int size = PAGE_SIZE; int i, j, count, rc = 0; - while ((!rc) && (size <= (PAGE_SIZE * 32))) { + while ((!rc) && (size <= spl_kmem_alloc_warn)) { count = 0; for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) { - ptr[i] = kmem_zalloc(size, KM_SLEEP | KM_NODEBUG); + ptr[i] = kmem_zalloc(size, KM_SLEEP); if (ptr[i]) count++; } @@ -171,7 +171,11 @@ splat_kmem_test3(struct file *file, void *arg) int size = PAGE_SIZE; int i, count, rc = 0; - while ((!rc) && (size <= (PAGE_SIZE * 1024))) { + /* + * Test up to 4x the maximum kmem_alloc() size to ensure both + * the kmem_alloc() and vmem_alloc() call paths are used. + */ + while ((!rc) && (size <= (4 * spl_kmem_alloc_max))) { count = 0; for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { @@ -203,7 +207,11 @@ splat_kmem_test4(struct file *file, void *arg) int size = PAGE_SIZE; int i, j, count, rc = 0; - while ((!rc) && (size <= (PAGE_SIZE * 1024))) { + /* + * Test up to 4x the maximum kmem_zalloc() size to ensure both + * the kmem_zalloc() and vmem_zalloc() call paths are used. + */ + while ((!rc) && (size <= (4 * spl_kmem_alloc_max))) { count = 0; for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { From caae89bdff58c04b90db1b0eddaf4cf6d3f984cc Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Sun, 13 Jul 2014 14:45:20 -0400 Subject: [PATCH 05/11] Add hooks for disabling direct reclaim The port of XFS to Linux introduced a thread-specific PF_FSTRANS bit that is used to mark contexts which are processing transactions. When set, allocations in this context can dip into kernel memory reserves to avoid deadlocks during writeback. Linux 3.9 provided the additional PF_MEMALLOC_NOIO for disabling __GFP_IO in page allocations, which XFS began using in 3.15. This patch implements hooks for marking transactions via PF_FSTRANS. When an allocation is performed in the context of PF_FSTRANS, any KM_SLEEP allocation is transparently converted to a GFP_NOIO allocation. Additionally, when using a Linux 3.9 or newer kernel, it will set PF_MEMALLOC_NOIO to prevent direct reclaim from entering pageout() on on any KM_PUSHPAGE or KM_NOSLEEP allocation. This effectively allows the spl_vmalloc() helper function to be used safely in a thread which is responsible for IO. Signed-off-by: Brian Behlendorf --- include/sys/kmem.h | 34 ++++++++++++++++++++++++++++++++++ include/sys/vmem.h | 1 + module/spl/spl-kmem-cache.c | 4 +++- module/spl/spl-kmem.c | 2 +- module/spl/spl-vmem.c | 25 +++++++++++++++++++++++++ 5 files changed, 64 insertions(+), 2 deletions(-) diff --git a/include/sys/kmem.h b/include/sys/kmem.h index 045d07c2..8d5e7293 100644 --- a/include/sys/kmem.h +++ b/include/sys/kmem.h @@ -25,6 +25,7 @@ #ifndef _SPL_KMEM_H #define _SPL_KMEM_H +#include #include #include @@ -72,6 +73,39 @@ kmem_flags_convert(int flags) return (lflags); } +typedef struct { + struct task_struct *fstrans_thread; + unsigned int saved_flags; +} fstrans_cookie_t; + +static inline fstrans_cookie_t +spl_fstrans_mark(void) +{ + fstrans_cookie_t cookie; + + cookie.fstrans_thread = current; + cookie.saved_flags = current->flags & PF_FSTRANS; + current->flags |= PF_FSTRANS; + + return (cookie); +} + +static inline void +spl_fstrans_unmark(fstrans_cookie_t cookie) +{ + ASSERT3P(cookie.fstrans_thread, ==, current); + ASSERT(current->flags & PF_FSTRANS); + + current->flags &= ~(PF_FSTRANS); + current->flags |= cookie.saved_flags; +} + +static inline int +spl_fstrans_check(void) +{ + return (current->flags & PF_FSTRANS); +} + #ifdef HAVE_ATOMIC64_T #define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) #define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) diff --git a/include/sys/vmem.h b/include/sys/vmem.h index 6eb2c676..8aadc9d0 100644 --- a/include/sys/vmem.h +++ b/include/sys/vmem.h @@ -36,6 +36,7 @@ extern vmem_t *zio_alloc_arena; extern vmem_t *zio_arena; extern size_t vmem_size(vmem_t *vmp, int typemask); +extern void *spl_vmalloc(unsigned long size, gfp_t lflags, pgprot_t prot); /* * Memory allocation interfaces diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c index 9a8ccfe4..f8edb44a 100644 --- a/module/spl/spl-kmem-cache.c +++ b/module/spl/spl-kmem-cache.c @@ -153,7 +153,7 @@ kv_alloc(spl_kmem_cache_t *skc, int size, int flags) if (skc->skc_flags & KMC_KMEM) ptr = (void *)__get_free_pages(lflags, get_order(size)); else - ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL); + ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL); /* Resulting allocated memory will be page aligned */ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); @@ -1098,7 +1098,9 @@ spl_cache_grow_work(void *data) sks = spl_slab_alloc(skc, ska->ska_flags); memalloc_noio_restore(noio_flag); #else + fstrans_cookie_t cookie = spl_fstrans_mark(); sks = spl_slab_alloc(skc, ska->ska_flags); + spl_fstrans_unmark(cookie); #endif spin_lock(&skc->skc_lock); if (sks) { diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c index 4cd7cdbe..914f0fbf 100644 --- a/module/spl/spl-kmem.c +++ b/module/spl/spl-kmem.c @@ -184,7 +184,7 @@ spl_kmem_alloc_impl(size_t size, int flags, int node) */ if (unlikely(size > spl_kmem_alloc_max)) { if (flags & KM_VMEM) { - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = spl_vmalloc(size, lflags, PAGE_KERNEL); } else { return (NULL); } diff --git a/module/spl/spl-vmem.c b/module/spl/spl-vmem.c index e177988a..bca27f26 100644 --- a/module/spl/spl-vmem.c +++ b/module/spl/spl-vmem.c @@ -97,6 +97,31 @@ spl_vmem_free(const void *buf, size_t size) } EXPORT_SYMBOL(spl_vmem_free); +/* + * Public vmalloc() interface designed to be safe to be called during I/O. + */ +void * +spl_vmalloc(unsigned long size, gfp_t lflags, pgprot_t prot) +{ +#if defined(PF_MEMALLOC_NOIO) + void *ptr; + unsigned noio_flag = 0; + + if (spl_fstrans_check()) + noio_flag = memalloc_noio_save(); + + ptr = __vmalloc(size, lflags, prot); + + if (spl_fstrans_check()) + memalloc_noio_restore(noio_flag); + + return (ptr); +#else + return (__vmalloc(size, lflags, prot)); +#endif +} +EXPORT_SYMBOL(spl_vmalloc); + int spl_vmem_init(void) { From 3aa345673a6d8f2116b060926c038e3d45e20069 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 4 Dec 2014 18:47:51 -0500 Subject: [PATCH 06/11] Enforce architecture-specific barriers around clear_bit() The comment above the Linux 3.16 kernel's clear_bit() states: /** * clear_bit - Clears a bit in memory * @nr: Bit to clear * @addr: Address to start counting from * * clear_bit() is atomic and may not be reordered. However, it does * not contain a memory barrier, so if it is used for locking purposes, * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic() * in order to ensure changes are visible on other processors. */ This comment does not make sense in the context of x86 because x86 maps the operations to barrier(), which is a compiler barrier. However, it does make sense to me when I consider architectures that reorder around atomic instructions. In such situations, a processor is allowed to execute the wake_up_bit() before clear_bit() and we have a race. There are a few architectures that suffer from this issue. In such situations, the other processor would wake-up, see the bit is still taken and go to sleep, while the one responsible for waking it up will assume that it did its job and continue. This patch implements a wrapper that maps smp_mb__{before,after}_atomic() to smp_mb__{before,after}_clear_bit() on older kernels and changes our code to leverage it in a manner consistent with the mainline kernel. Signed-off-by: Brian Behlendorf --- module/spl/spl-kmem-cache.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c index f8edb44a..22e4548c 100644 --- a/module/spl/spl-kmem-cache.c +++ b/module/spl/spl-kmem-cache.c @@ -42,6 +42,20 @@ #undef kmem_cache_free +/* + * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() + * with smp_mb__{before,after}_atomic() because they were redundant. This is + * only used inside our SLAB allocator, so we implement an internal wrapper + * here to give us smp_mb__{before,after}_atomic() on older kernels. + */ +#ifndef smp_mb__before_atomic +#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x) +#endif + +#ifndef smp_mb__after_atomic +#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) +#endif + /* * Cache expiration was implemented because it was part of the default Solaris * kmem_cache behavior. The idea is that per-cpu objects which haven't been @@ -1110,8 +1124,10 @@ spl_cache_grow_work(void *data) } atomic_dec(&skc->skc_ref); + smp_mb__before_atomic(); clear_bit(KMC_BIT_GROWING, &skc->skc_flags); clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); + smp_mb__after_atomic(); wake_up_all(&skc->skc_waitq); spin_unlock(&skc->skc_lock); @@ -1164,7 +1180,8 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags)); if (ska == NULL) { - clear_bit(KMC_BIT_GROWING, &skc->skc_flags); + clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags); + smp_mb__after_atomic(); wake_up_all(&skc->skc_waitq); return (-ENOMEM); } @@ -1616,8 +1633,8 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) } spl_slab_reclaim(skc, count, 1); - clear_bit(KMC_BIT_REAPING, &skc->skc_flags); - smp_wmb(); + clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); + smp_mb__after_atomic(); wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); out: atomic_dec(&skc->skc_ref); From d767c7bca9fce93a011c283446fc5dbd7d49b9aa Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 5 Dec 2014 17:11:18 -0500 Subject: [PATCH 07/11] Make slab reclaim more aggressive Many people have noticed that the kmem cache implementation is slow to release its memory. This patch makes the reclaim behavior more aggressive by immediately freeing a slab once it is empty. Unused objects which are cached in the magazines will still prevent a slab from being freed. Signed-off-by: Brian Behlendorf --- man/man5/spl-module-parameters.5 | 22 +++++++++ module/spl/spl-kmem-cache.c | 76 ++++++++++++++++++++------------ 2 files changed, 69 insertions(+), 29 deletions(-) diff --git a/man/man5/spl-module-parameters.5 b/man/man5/spl-module-parameters.5 index 2ec5b668..5b7ee832 100644 --- a/man/man5/spl-module-parameters.5 +++ b/man/man5/spl-module-parameters.5 @@ -120,6 +120,28 @@ value will use kmalloc(), but shift to vmalloc() when exceeding this value. Default value: \fBKMALLOC_MAX_SIZE/4\fR. .RE +.sp +.ne 2 +.na +\fBspl_kmem_cache_magazine_size\fR (uint) +.ad +.RS 12n +Cache magazines are an optimization designed to minimize the cost of +allocating memory. They do this by keeping a per-cpu cache of recently +freed objects, which can then be reallocated without taking a lock. This +can improve performance on highly contended caches. However, because +objects in magazines will prevent otherwise empty slabs from being +immediately released this may not be ideal for low memory machines. +.sp +For this reason \fBspl_kmem_cache_magazine_size\fR can be used to set a +maximum magazine size. When this value is set to 0 the magazine size will +be automatically determined based on the object size. Otherwise magazines +will be limited to 2-256 objects per magazine (i.e per cpu). Magazines +may never be entirely disabled in this implementation. +.sp +Default value: \fB0\fR. +.RE + .sp .ne 2 .na diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c index 22e4548c..a68852ed 100644 --- a/module/spl/spl-kmem-cache.c +++ b/module/spl/spl-kmem-cache.c @@ -70,6 +70,25 @@ EXPORT_SYMBOL(spl_kmem_cache_expire); module_param(spl_kmem_cache_expire, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)"); +/* + * Cache magazines are an optimization designed to minimize the cost of + * allocating memory. They do this by keeping a per-cpu cache of recently + * freed objects, which can then be reallocated without taking a lock. This + * can improve performance on highly contended caches. However, because + * objects in magazines will prevent otherwise empty slabs from being + * immediately released this may not be ideal for low memory machines. + * + * For this reason spl_kmem_cache_magazine_size can be used to set a maximum + * magazine size. When this value is set to 0 the magazine size will be + * automatically determined based on the object size. Otherwise magazines + * will be limited to 2-256 objects per magazine (i.e per cpu). Magazines + * may never be entirely disabled in this implementation. + */ +unsigned int spl_kmem_cache_magazine_size = 0; +module_param(spl_kmem_cache_magazine_size, uint, 0444); +MODULE_PARM_DESC(spl_kmem_cache_magazine_size, + "Default magazine size (2-256), set automatically (0)\n"); + /* * The default behavior is to report the number of objects remaining in the * cache. This allows the Linux VM to repeatedly reclaim objects from the @@ -362,45 +381,31 @@ spl_slab_free(spl_kmem_slab_t *sks, } /* - * Traverse all the partial slabs attached to a cache and free those which - * are currently empty, and have not been touched for skc_delay seconds to - * avoid thrashing. The count argument is passed to optionally cap the - * number of slabs reclaimed, a count of zero means try and reclaim - * everything. When flag the is set available slabs freed regardless of age. + * Reclaim empty slabs at the end of the partial list. */ static void -spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag) +spl_slab_reclaim(spl_kmem_cache_t *skc) { spl_kmem_slab_t *sks, *m; spl_kmem_obj_t *sko, *n; LIST_HEAD(sks_list); LIST_HEAD(sko_list); uint32_t size = 0; - int i = 0; /* - * Move empty slabs and objects which have not been touched in - * skc_delay seconds on to private lists to be freed outside - * the spin lock. This delay time is important to avoid thrashing - * however when flag is set the delay will not be used. + * Empty slabs and objects must be moved to a private list so they + * can be safely freed outside the spin lock. All empty slabs are + * at the end of skc->skc_partial_list, therefore once a non-empty + * slab is found we can stop scanning. */ spin_lock(&skc->skc_lock); list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list, sks_list) { - /* - * All empty slabs are at the end of skc->skc_partial_list, - * therefore once a non-empty slab is found we can stop - * scanning. Additionally, stop when reaching the target - * reclaim 'count' if a non-zero threshold is given. - */ - if ((sks->sks_ref > 0) || (count && i >= count)) + + if (sks->sks_ref > 0) break; - if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ) || - flag) { - spl_slab_free(sks, &sks_list, &sko_list); - i++; - } + spl_slab_free(sks, &sks_list, &sko_list); } spin_unlock(&skc->skc_lock); @@ -633,7 +638,7 @@ spl_cache_age(void *data) if (!(skc->skc_flags & KMC_NOMAGAZINE)) on_each_cpu(spl_magazine_age, skc, 1); - spl_slab_reclaim(skc, skc->skc_reap, 0); + spl_slab_reclaim(skc); while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) { id = taskq_dispatch_delay( @@ -710,6 +715,9 @@ spl_magazine_size(spl_kmem_cache_t *skc) uint32_t obj_size = spl_obj_size(skc); int size; + if (spl_kmem_cache_magazine_size > 0) + return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2)); + /* Per-magazine sizes below assume a 4Kib page size */ if (obj_size > (PAGE_SIZE * 256)) size = 4; /* Minimum 4Mib per-magazine */ @@ -1030,7 +1038,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { spl_magazine_destroy(skc); - spl_slab_reclaim(skc, 0, 1); + spl_slab_reclaim(skc); } else { ASSERT(skc->skc_flags & KMC_SLAB); kmem_cache_destroy(skc->skc_linux_cache); @@ -1433,6 +1441,7 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) { spl_kmem_magazine_t *skm; unsigned long flags; + int do_reclaim = 0; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); @@ -1473,14 +1482,23 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) skm = skc->skc_mag[smp_processor_id()]; ASSERT(skm->skm_magic == SKM_MAGIC); - /* Per-CPU cache full, flush it to make space */ - if (unlikely(skm->skm_avail >= skm->skm_size)) + /* + * Per-CPU cache full, flush it to make space for this object, + * this may result in an empty slab which can be reclaimed once + * interrupts are re-enabled. + */ + if (unlikely(skm->skm_avail >= skm->skm_size)) { spl_cache_flush(skc, skm, skm->skm_refill); + do_reclaim = 1; + } /* Available space in cache, use it */ skm->skm_objs[skm->skm_avail++] = obj; local_irq_restore(flags); + + if (do_reclaim) + spl_slab_reclaim(skc); out: atomic_dec(&skc->skc_ref); } @@ -1621,7 +1639,7 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) } while (do_reclaim); } - /* Reclaim from the magazine then the slabs ignoring age and delay. */ + /* Reclaim from the magazine and free all now empty slabs. */ if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) { spl_kmem_magazine_t *skm; unsigned long irq_flags; @@ -1632,7 +1650,7 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) local_irq_restore(irq_flags); } - spl_slab_reclaim(skc, count, 1); + spl_slab_reclaim(skc); clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); smp_mb__after_atomic(); wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); From a32e1dd8d0058d9964696b92cfd9715242f0a3c7 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 5 Dec 2014 18:31:24 -0500 Subject: [PATCH 08/11] Update spl-module-parameters(5) man page The spl-module-parameters(5) was not kept up to date. Refresh the man page so that it lists all the possible module options, describes what the do, and justify why the default values are set they way the are. Signed-off-by: Brian Behlendorf --- man/man5/spl-module-parameters.5 | 131 ++++++++++++++++++++++++------- 1 file changed, 103 insertions(+), 28 deletions(-) diff --git a/man/man5/spl-module-parameters.5 b/man/man5/spl-module-parameters.5 index 5b7ee832..27c7bfd4 100644 --- a/man/man5/spl-module-parameters.5 +++ b/man/man5/spl-module-parameters.5 @@ -17,67 +17,135 @@ Description of the different parameters to the SPL module. .sp .ne 2 .na -\fBspl_debug_subsys\fR (ulong) +\fBspl_kmem_cache_expire\fR (uint) .ad .RS 12n -Subsystem debugging level mask. +Cache expiration is part of default Illumos cache behavior. The idea is +that objects in magazines which have not been recently accessed should be +returned to the slabs periodically. This is known as cache aging and +when enabled objects will be typically returned after 15 seconds. +.sp +On the other hand Linux slabs are designed to never move objects back to +the slabs unless there is memory pressure. This is possible because under +Linux the cache will be notified when memory is low and objects can be +released. +.sp +By default only the Linux method is enabled. It has been shown to improve +responsiveness on low memory systems and not negatively impact the performance +of systems with more memory. This policy may be changed by setting the +\fBspl_kmem_cache_expire\fR bit mask as follows, both policies may be enabled +concurrently. +.sp +0x01 - Aging (Illumos), 0x02 - Low memory (Linux) .sp -Default value: \fB~0\fR. +Default value: \fB0x02\fR .RE .sp .ne 2 .na -\fBspl_debug_mask\fR (ulong) +\fBspl_kmem_cache_reclaim\fR (uint) .ad .RS 12n -Debugging level mask. +When this is set it prevents Linux from being able to rapidly reclaim all the +memory held by the kmem caches. This may be useful in circumstances where +it's preferable that Linux reclaim memory from some other subsystem first. +Setting this will increase the likelihood out of memory events on a memory +constrained system. .sp -Default value: \fB8 | 10 | 4 | 20\fR (SD_ERROR | SD_EMERG | SD_WARNING | SD_CONSOLE). +Default value: \fB0\fR .RE .sp .ne 2 .na -\fBspl_debug_printk\fR (ulong) +\fBspl_kmem_cache_obj_per_slab\fR (uint) .ad .RS 12n -Console printk level mask. +The preferred number of objects per slab in the cache. In general, a larger +value will increase the caches memory footprint while decreasing the time +required to perform an allocation. Conversely, a smaller value will minimize +the footprint and improve cache reclaim time but individual allocations may +take longer. .sp -Default value: \fB8 | 10 | 4 | 20\fR (SD_ERROR | SD_EMERG | SD_WARNING | SD_CONSOLE). +Default value: \fB16\fR .RE .sp .ne 2 .na -\fBspl_debug_mb\fR (int) +\fBspl_kmem_cache_obj_per_slab_min\fR (uint) .ad .RS 12n -Total debug buffer size. +The minimum number of objects allowed per slab. Normally slabs will contain +\fBspl_kmem_cache_obj_per_slab\fR objects but for caches that contain very +large objects it's desirable to only have a few, or even just one, object per +slab. .sp -Default value: \fB-1\fR. +Default value: \fB1\fR .RE .sp .ne 2 .na -\fBspl_debug_panic_on_bug\fR (int) +\fBspl_kmem_cache_max_size\fR (uint) .ad .RS 12n -Panic on BUG +The maximum size of a kmem cache slab in MiB. This effectively limits +the maximum cache object size to \fBspl_kmem_cache_max_size\fR / +\fBspl_kmem_cache_obj_per_slab\fR. Caches may not be created with +object sized larger than this limit. .sp -Use \fB1\fR for yes and \fB0\fR for no (default). +Default value: \fB32\fR .RE .sp .ne 2 .na -\fBspl_kmem_cache_expire\fR (uint) +\fBspl_kmem_cache_slab_limit\fR (uint) .ad .RS 12n -By age (0x1) or low memory (0x2) +For small objects the Linux slab allocator should be used to make the most +efficient use of the memory. However, large objects are not supported by +the Linux slab and therefore the SPL implementation is preferred. This +value is used to determine the cutoff between a small and large object. +.sp +Objects of \fBspl_kmem_cache_slab_limit\fR or smaller will be allocated +using the Linux slab allocator, large objects use the SPL allocator. A +cutoff of 16K was determined to be optimal for architectures using 4K pages. .sp -Default value: \fB0\fR. +Default value: \fB16,384\fR +.RE + +.sp +.ne 2 +.na +\fBspl_kmem_cache_kmem_limit\fR (uint) +.ad +.RS 12n +Depending on the size of a cache object it may be backed by kmalloc()'d +or vmalloc()'d memory. This is because the size of the required allocation +greatly impacts the best way to allocate the memory. +.sp +When objects are small and only a small number of memory pages need to be +allocated, ideally just one, then kmalloc() is very efficient. However, +when allocating multiple pages with kmalloc() it gets increasingly expensive +because the pages must be physically contiguous. +.sp +For this reason we shift to vmalloc() for slabs of large objects which +which removes the need for contiguous pages. We cannot use vmalloc() in +all cases because there is significant locking overhead involved. This +function takes a single global lock over the entire virtual address range +which serializes all allocations. Using slightly different allocation +functions for small and large objects allows us to handle a wide range of +object sizes. +.sh +The \fBspl_kmem_cache_kmem_limit\fR value is used to determine this cutoff +size. One quarter the PAGE_SIZE is used as the default value because +\fBspl_kmem_cache_obj_per_slab\fR defaults to 16. This means that at +most we will need to allocate four contiguous pages. +.sp +Default value: \fBPAGE_SIZE/4\fR .RE .sp @@ -90,7 +158,7 @@ As a general rule kmem_alloc() allocations should be small, preferably just a few pages since they must by physically contiguous. Therefore, a rate limited warning will be printed to the console for any kmem_alloc() which exceeds a reasonable threshold. - +.sp The default warning threshold is set to eight pages but capped at 32K to accommodate systems using large pages. This value was selected to be small enough to ensure the largest allocations are quickly noticed and fixed. @@ -100,7 +168,7 @@ developers are encouraged to set it lower when testing so any new largish allocations are quickly caught. These warnings may be disabled by setting the threshold to zero. .sp -Default value: \fB32K\fR. +Default value: \fB32,768\fR .RE .sp @@ -117,7 +185,7 @@ margin of 4x is set. Kmem_alloc() allocations larger than this maximum will quickly fail. Vmem_alloc() allocations less than or equal to this value will use kmalloc(), but shift to vmalloc() when exceeding this value. .sp -Default value: \fBKMALLOC_MAX_SIZE/4\fR. +Default value: \fBKMALLOC_MAX_SIZE/4\fR .RE .sp @@ -139,7 +207,7 @@ be automatically determined based on the object size. Otherwise magazines will be limited to 2-256 objects per magazine (i.e per cpu). Magazines may never be entirely disabled in this implementation. .sp -Default value: \fB0\fR. +Default value: \fB0\fR .RE .sp @@ -148,9 +216,12 @@ Default value: \fB0\fR. \fBspl_hostid\fR (ulong) .ad .RS 12n -The system hostid. +The system hostid, when set this can be used to uniquely identify a system. +By default this value is set to zero which indicates the hostid is disabled. +It can be explicitly enabled by placing a unique non-zero value in +\fB/etc/hostid/\fR. .sp -Default value: \fB0xFFFFFFFF\fR (an invalid hostid!) +Default value: \fB0\fR .RE .sp @@ -159,9 +230,10 @@ Default value: \fB0xFFFFFFFF\fR (an invalid hostid!) \fBspl_hostid_path\fR (charp) .ad .RS 12n -The system hostid file +The expected path to locate the system hostid when specified. This value +may be overridden for non-standard configurations. .sp -Default value: \fB/etc/hostid\fR. +Default value: \fB/etc/hostid\fR .RE .sp @@ -170,7 +242,10 @@ Default value: \fB/etc/hostid\fR. \fBspl_taskq_thread_bind\fR (int) .ad .RS 12n -Bind taskq thread to CPU +Bind taskq threads to specific CPUs. When enabled all taskq threads will +be distributed evenly over the available CPUs. By default, this behavior +is disabled to allow the Linux scheduler the maximum flexibility to determine +where a thread should run. .sp -Default value: \fB0\fR. +Default value: \fB0\fR .RE From 783bc15d19f504d7abc5fed51cb1a47fe86abc32 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 15 Dec 2014 16:02:48 -0800 Subject: [PATCH 09/11] Reduce kmem cache deadlock threshold Reduce the threshold for detecting a kmem cache deadlock by 10x from HZ to HZ/10. The reduced value is still several orders of magnitude large enough to avoid being triggered incorrectly. By reducing it we allow the system to resolve the issue more quickly. Signed-off-by: Brian Behlendorf --- module/spl/spl-kmem-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c index a68852ed..809ac5cc 100644 --- a/module/spl/spl-kmem-cache.c +++ b/module/spl/spl-kmem-cache.c @@ -1215,7 +1215,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) rc = spl_emergency_alloc(skc, flags, obj); } else { remaining = wait_event_timeout(skc->skc_waitq, - spl_cache_grow_wait(skc), HZ); + spl_cache_grow_wait(skc), HZ / 10); if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) { spin_lock(&skc->skc_lock); From 82cd609ffe9291f79026e9645c4bea0ab1626f8d Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 15 Dec 2014 14:06:18 -0800 Subject: [PATCH 10/11] Refine slab cache sizing This change is designed to improve the memory utilization of slabs by more carefully setting their size. The way the code currently works is problematic for slabs which contain large objects (>1MB). This is due to slabs being unconditionally rounded up to a power of two which may result in unused space at the end of the slab. The reason the existing code rounds up every slab is because it assumes it will backed by the buddy allocator. Since the buddy allocator can only performs power of two allocations this is desirable because it avoids wasting any space. However, this logic breaks down if slab is backed by vmalloc() which operates at a page level granularity. In this case, the optimal thing to do is calculate the minimum required slab size given certain constraints (object size, alignment, objects/slab, etc). Therefore, this patch reworks the spl_slab_size() function so that it sizes KMC_KMEM slabs differently than KMC_VMEM slabs. KMC_KMEM slabs are rounded up to the nearest power of two, and KMC_VMEM slabs are allowed to be the minimum required size. This change also reduces the default number of objects per slab. This reduces how much memory a single cache object can pin, which can result in significant memory saving for highly fragmented caches. But depending on the workload it may result in slabs being allocated and freed more frequently. In practice, this has been shown to be a better default for most workloads. Also the maximum slab size has been reduced to 4MB on 32-bit systems. Due to the limited virtual address space it's critical the we be as frugal as possible. A limit of 4M still lets us reasonably comfortably allocate a limited number of 1MB objects. Finally, the kmem:slab_small and kmem:slab_large SPLAT tests were extended to provide better test coverage of various object sizes and alignments. Caches are created with random parameters and their basic functionality is verified by allocating several slabs worth of objects. Signed-off-by: Brian Behlendorf --- include/sys/kmem_cache.h | 17 +- man/man5/spl-module-parameters.5 | 4 +- module/spl/spl-kmem-cache.c | 92 ++++++---- module/splat/splat-kmem.c | 277 +++++++++++++++++++------------ 4 files changed, 247 insertions(+), 143 deletions(-) diff --git a/include/sys/kmem_cache.h b/include/sys/kmem_cache.h index a9b5bdd2..9ac41e6e 100644 --- a/include/sys/kmem_cache.h +++ b/include/sys/kmem_cache.h @@ -101,9 +101,24 @@ extern struct rw_semaphore spl_kmem_cache_sem; #define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */ #define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */ -#define SPL_KMEM_CACHE_OBJ_PER_SLAB 16 /* Target objects per slab */ +#define SPL_KMEM_CACHE_OBJ_PER_SLAB 8 /* Target objects per slab */ #define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 1 /* Minimum objects per slab */ #define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */ +#ifdef _LP64 +#define SPL_KMEM_CACHE_MAX_SIZE 32 /* Max slab size in MB */ +#else +#define SPL_KMEM_CACHE_MAX_SIZE 4 /* Max slab size in MB */ +#endif + +#define SPL_MAX_ORDER (MAX_ORDER - 3) +#define SPL_MAX_ORDER_NR_PAGES (1 << (SPL_MAX_ORDER - 1)) + +#ifdef CONFIG_SLUB +#define SPL_MAX_KMEM_CACHE_ORDER PAGE_ALLOC_COSTLY_ORDER +#define SPL_MAX_KMEM_ORDER_NR_PAGES (1 << (SPL_MAX_KMEM_CACHE_ORDER - 1)) +#else +#define SPL_MAX_KMEM_ORDER_NR_PAGES (KMALLOC_MAX_SIZE >> PAGE_SHIFT) +#endif #define POINTER_IS_VALID(p) 0 /* Unimplemented */ #define POINTER_INVALIDATE(pp) /* Unimplemented */ diff --git a/man/man5/spl-module-parameters.5 b/man/man5/spl-module-parameters.5 index 27c7bfd4..3e7e877f 100644 --- a/man/man5/spl-module-parameters.5 +++ b/man/man5/spl-module-parameters.5 @@ -68,7 +68,7 @@ required to perform an allocation. Conversely, a smaller value will minimize the footprint and improve cache reclaim time but individual allocations may take longer. .sp -Default value: \fB16\fR +Default value: \fB8\fR .RE .sp @@ -96,7 +96,7 @@ the maximum cache object size to \fBspl_kmem_cache_max_size\fR / \fBspl_kmem_cache_obj_per_slab\fR. Caches may not be created with object sized larger than this limit. .sp -Default value: \fB32\fR +Default value: \fB32 (64-bit) or 4 (32-bit)\fR .RE .sp diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c index 809ac5cc..5160aa1c 100644 --- a/module/spl/spl-kmem-cache.c +++ b/module/spl/spl-kmem-cache.c @@ -109,7 +109,7 @@ module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min, "Minimal number of objects per slab"); -unsigned int spl_kmem_cache_max_size = 32; +unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE; module_param(spl_kmem_cache_max_size, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); @@ -128,7 +128,13 @@ module_param(spl_kmem_cache_slab_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_slab_limit, "Objects less than N bytes use the Linux slab"); -unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4); +/* + * This value defaults to a threshold designed to avoid allocations which + * have been deemed costly by the kernel. + */ +unsigned int spl_kmem_cache_kmem_limit = + ((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) / + SPL_KMEM_CACHE_OBJ_PER_SLAB; module_param(spl_kmem_cache_kmem_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, "Objects less than N bytes use the kmalloc"); @@ -181,12 +187,12 @@ kv_alloc(spl_kmem_cache_t *skc, int size, int flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; - ASSERT(ISP2(size)); - - if (skc->skc_flags & KMC_KMEM) + if (skc->skc_flags & KMC_KMEM) { + ASSERT(ISP2(size)); ptr = (void *)__get_free_pages(lflags, get_order(size)); - else + } else { ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL); + } /* Resulting allocated memory will be page aligned */ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); @@ -198,7 +204,6 @@ static void kv_free(spl_kmem_cache_t *skc, void *ptr, int size) { ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); - ASSERT(ISP2(size)); /* * The Linux direct reclaim path uses this out of band value to @@ -210,10 +215,12 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; - if (skc->skc_flags & KMC_KMEM) + if (skc->skc_flags & KMC_KMEM) { + ASSERT(ISP2(size)); free_pages((unsigned long)ptr, get_order(size)); - else + } else { vfree(ptr); + } } /* @@ -668,40 +675,48 @@ spl_cache_age(void *data) static int spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) { - uint32_t sks_size, obj_size, max_size; + uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs; if (skc->skc_flags & KMC_OFFSLAB) { - *objs = spl_kmem_cache_obj_per_slab; - *size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE); - return (0); + tgt_objs = spl_kmem_cache_obj_per_slab; + tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE); + + if ((skc->skc_flags & KMC_KMEM) && + (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE))) + return (-ENOSPC); } else { sks_size = spl_sks_size(skc); obj_size = spl_obj_size(skc); - - if (skc->skc_flags & KMC_KMEM) - max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE; - else - max_size = (spl_kmem_cache_max_size * 1024 * 1024); - - /* Power of two sized slab */ - for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) { - *objs = (*size - sks_size) / obj_size; - if (*objs >= spl_kmem_cache_obj_per_slab) - return (0); - } + max_size = (spl_kmem_cache_max_size * 1024 * 1024); + tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); /* - * Unable to satisfy target objects per slab, fall back to - * allocating a maximally sized slab and assuming it can - * contain the minimum objects count use it. If not fail. + * KMC_KMEM slabs are allocated by __get_free_pages() which + * rounds up to the nearest order. Knowing this the size + * should be rounded up to the next power of two with a hard + * maximum defined by the maximum allowed allocation order. */ - *size = max_size; - *objs = (*size - sks_size) / obj_size; - if (*objs >= (spl_kmem_cache_obj_per_slab_min)) - return (0); + if (skc->skc_flags & KMC_KMEM) { + max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE; + tgt_size = MIN(max_size, + PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1))); + } + + if (tgt_size <= max_size) { + tgt_objs = (tgt_size - sks_size) / obj_size; + } else { + tgt_objs = (max_size - sks_size) / obj_size; + tgt_size = (tgt_objs * obj_size) + sks_size; + } } - return (-ENOSPC); + if (tgt_objs == 0) + return (-ENOSPC); + + *objs = tgt_objs; + *size = tgt_size; + + return (0); } /* @@ -960,6 +975,11 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, if (rc) goto out; } else { + if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) { + rc = EINVAL; + goto out; + } + skc->skc_linux_cache = kmem_cache_create( skc->skc_name, size, align, 0, NULL); if (skc->skc_linux_cache == NULL) { @@ -1406,8 +1426,11 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) skm->skm_age = jiffies; } else { obj = spl_cache_refill(skc, skm, flags); - if (obj == NULL) + if ((obj == NULL) && !(flags & KM_NOSLEEP)) goto restart; + + local_irq_enable(); + goto ret; } local_irq_enable(); @@ -1427,7 +1450,6 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) return (obj); } - EXPORT_SYMBOL(spl_kmem_cache_alloc); /* diff --git a/module/splat/splat-kmem.c b/module/splat/splat-kmem.c index 81f748bb..cd0000ba 100644 --- a/module/splat/splat-kmem.c +++ b/module/splat/splat-kmem.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include "splat-internal.h" @@ -583,87 +584,124 @@ splat_kmem_cache_test_thread(void *arg) static int splat_kmem_cache_test(struct file *file, void *arg, char *name, - int size, int align, int flags) + int size, int align, int flags) { - kmem_cache_priv_t *kcp; - kmem_cache_data_t *kcd = NULL; - int rc = 0, max; + kmem_cache_priv_t *kcp = NULL; + kmem_cache_data_t **kcd = NULL; + int i, rc = 0, objs = 0; + + splat_vprint(file, name, + "Testing size=%d, align=%d, flags=0x%04x\n", + size, align, flags); kcp = splat_kmem_cache_test_kcp_alloc(file, name, size, align, 0); if (!kcp) { splat_vprint(file, name, "Unable to create '%s'\n", "kcp"); - return -ENOMEM; + return (-ENOMEM); } - kcp->kcp_cache = - kmem_cache_create(SPLAT_KMEM_CACHE_NAME, - kcp->kcp_size, kcp->kcp_align, - splat_kmem_cache_test_constructor, - splat_kmem_cache_test_destructor, - NULL, kcp, NULL, flags); - if (!kcp->kcp_cache) { - splat_vprint(file, name, - "Unable to create '%s'\n", - SPLAT_KMEM_CACHE_NAME); + kcp->kcp_cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME, + kcp->kcp_size, kcp->kcp_align, + splat_kmem_cache_test_constructor, + splat_kmem_cache_test_destructor, + NULL, kcp, NULL, flags); + if (kcp->kcp_cache == NULL) { + splat_vprint(file, name, "Unable to create " + "name='%s', size=%d, align=%d, flags=0x%x\n", + SPLAT_KMEM_CACHE_NAME, size, align, flags); rc = -ENOMEM; goto out_free; } - kcd = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP); - if (!kcd) { - splat_vprint(file, name, - "Unable to allocate from '%s'\n", - SPLAT_KMEM_CACHE_NAME); - rc = -EINVAL; + /* + * Allocate several slabs worth of objects to verify functionality. + * However, on 32-bit systems with limited address space constrain + * it to a single slab for the purposes of this test. + */ +#ifdef _LP64 + objs = SPL_KMEM_CACHE_OBJ_PER_SLAB * 4; +#else + objs = 1; +#endif + kcd = kmem_zalloc(sizeof (kmem_cache_data_t *) * objs, KM_SLEEP); + if (kcd == NULL) { + splat_vprint(file, name, "Unable to allocate pointers " + "for %d objects\n", objs); + rc = -ENOMEM; goto out_free; } - if (!kcd->kcd_flag) { - splat_vprint(file, name, - "Failed to run contructor for '%s'\n", - SPLAT_KMEM_CACHE_NAME); - rc = -EINVAL; - goto out_free; - } + for (i = 0; i < objs; i++) { + kcd[i] = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP); + if (kcd[i] == NULL) { + splat_vprint(file, name, "Unable to allocate " + "from '%s'\n", SPLAT_KMEM_CACHE_NAME); + rc = -EINVAL; + goto out_free; + } - if (kcd->kcd_magic != kcp->kcp_magic) { - splat_vprint(file, name, - "Failed to pass private data to constructor " - "for '%s'\n", SPLAT_KMEM_CACHE_NAME); - rc = -EINVAL; - goto out_free; + if (!kcd[i]->kcd_flag) { + splat_vprint(file, name, "Failed to run constructor " + "for '%s'\n", SPLAT_KMEM_CACHE_NAME); + rc = -EINVAL; + goto out_free; + } + + if (kcd[i]->kcd_magic != kcp->kcp_magic) { + splat_vprint(file, name, + "Failed to pass private data to constructor " + "for '%s'\n", SPLAT_KMEM_CACHE_NAME); + rc = -EINVAL; + goto out_free; + } } - max = kcp->kcp_count; - kmem_cache_free(kcp->kcp_cache, kcd); + for (i = 0; i < objs; i++) { + kmem_cache_free(kcp->kcp_cache, kcd[i]); + + /* Destructors are run for every kmem_cache_free() */ + if (kcd[i]->kcd_flag) { + splat_vprint(file, name, + "Failed to run destructor for '%s'\n", + SPLAT_KMEM_CACHE_NAME); + rc = -EINVAL; + goto out_free; + } + } - /* Destroy the entire cache which will force destructors to - * run and we can verify one was called for every object */ - kmem_cache_destroy(kcp->kcp_cache); if (kcp->kcp_count) { splat_vprint(file, name, - "Failed to run destructor on all slab objects " - "for '%s'\n", SPLAT_KMEM_CACHE_NAME); + "Failed to run destructor on all slab objects for '%s'\n", + SPLAT_KMEM_CACHE_NAME); rc = -EINVAL; } + kmem_free(kcd, sizeof (kmem_cache_data_t *) * objs); + kmem_cache_destroy(kcp->kcp_cache); + splat_kmem_cache_test_kcp_free(kcp); splat_vprint(file, name, - "Successfully ran ctors/dtors for %d elements in '%s'\n", - max, SPLAT_KMEM_CACHE_NAME); + "Success ran alloc'd/free'd %d objects of size %d\n", + objs, size); - return rc; + return (rc); out_free: - if (kcd) - kmem_cache_free(kcp->kcp_cache, kcd); + if (kcd) { + for (i = 0; i < objs; i++) { + if (kcd[i] != NULL) + kmem_cache_free(kcp->kcp_cache, kcd[i]); + } + + kmem_free(kcd, sizeof (kmem_cache_data_t *) * objs); + } if (kcp->kcp_cache) kmem_cache_destroy(kcp->kcp_cache); splat_kmem_cache_test_kcp_free(kcp); - return rc; + return (rc); } static int @@ -757,35 +795,49 @@ static int splat_kmem_test5(struct file *file, void *arg) { char *name = SPLAT_KMEM_TEST5_NAME; - int rc; - - /* On slab (default + kmem + vmem) */ - rc = splat_kmem_cache_test(file, arg, name, 128, 0, 0); - if (rc) - return rc; + int i, rc = 0; - rc = splat_kmem_cache_test(file, arg, name, 128, 0, KMC_KMEM); - if (rc) - return rc; + /* Randomly pick small object sizes and alignments. */ + for (i = 0; i < 100; i++) { + int size, align, flags = 0; + uint32_t rnd; + + /* Evenly distribute tests over all value cache types */ + get_random_bytes((void *)&rnd, sizeof (uint32_t)); + switch (rnd & 0x03) { + default: + case 0x00: + flags = 0; + break; + case 0x01: + flags = KMC_KMEM; + break; + case 0x02: + flags = KMC_VMEM; + break; + case 0x03: + flags = KMC_SLAB; + break; + } - rc = splat_kmem_cache_test(file, arg, name, 128, 0, KMC_VMEM); - if (rc) - return rc; + /* The following flags are set with a 1/10 chance */ + flags |= ((((rnd >> 8) % 10) == 0) ? KMC_OFFSLAB : 0); + flags |= ((((rnd >> 16) % 10) == 0) ? KMC_NOEMERGENCY : 0); - /* Off slab (default + kmem + vmem) */ - rc = splat_kmem_cache_test(file, arg, name, 128, 0, KMC_OFFSLAB); - if (rc) - return rc; + /* 32b - PAGE_SIZE */ + get_random_bytes((void *)&rnd, sizeof (uint32_t)); + size = MAX(rnd % (PAGE_SIZE + 1), 32); - rc = splat_kmem_cache_test(file, arg, name, 128, 0, - KMC_KMEM | KMC_OFFSLAB); - if (rc) - return rc; + /* 2^N where (3 <= N <= PAGE_SHIFT) */ + get_random_bytes((void *)&rnd, sizeof (uint32_t)); + align = (1 << MAX(3, rnd % (PAGE_SHIFT + 1))); - rc = splat_kmem_cache_test(file, arg, name, 128, 0, - KMC_VMEM | KMC_OFFSLAB); + rc = splat_kmem_cache_test(file, arg, name, size, align, flags); + if (rc) + return (rc); + } - return rc; + return (rc); } /* @@ -795,44 +847,53 @@ static int splat_kmem_test6(struct file *file, void *arg) { char *name = SPLAT_KMEM_TEST6_NAME; - int rc; - - /* On slab (default + kmem + vmem) */ - rc = splat_kmem_cache_test(file, arg, name, 256*1024, 0, 0); - if (rc) - return rc; - - rc = splat_kmem_cache_test(file, arg, name, 64*1024, 0, KMC_KMEM); - if (rc) - return rc; - - rc = splat_kmem_cache_test(file, arg, name, 1024*1024, 0, KMC_VMEM); - if (rc) - return rc; - - rc = splat_kmem_cache_test(file, arg, name, 16*1024*1024, 0, KMC_VMEM); - if (rc) - return rc; + int i, max_size, rc = 0; + + /* Randomly pick large object sizes and alignments. */ + for (i = 0; i < 100; i++) { + int size, align, flags = 0; + uint32_t rnd; + + /* Evenly distribute tests over all value cache types */ + get_random_bytes((void *)&rnd, sizeof (uint32_t)); + switch (rnd & 0x03) { + default: + case 0x00: + flags = 0; + max_size = (SPL_KMEM_CACHE_MAX_SIZE * 1024 * 1024) / 2; + break; + case 0x01: + flags = KMC_KMEM; + max_size = (SPL_MAX_ORDER_NR_PAGES - 2) * PAGE_SIZE; + break; + case 0x02: + flags = KMC_VMEM; + max_size = (SPL_KMEM_CACHE_MAX_SIZE * 1024 * 1024) / 2; + break; + case 0x03: + flags = KMC_SLAB; + max_size = SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE; + break; + } - /* Off slab (default + kmem + vmem) */ - rc = splat_kmem_cache_test(file, arg, name, 256*1024, 0, KMC_OFFSLAB); - if (rc) - return rc; + /* The following flags are set with a 1/10 chance */ + flags |= ((((rnd >> 8) % 10) == 0) ? KMC_OFFSLAB : 0); + flags |= ((((rnd >> 16) % 10) == 0) ? KMC_NOEMERGENCY : 0); - rc = splat_kmem_cache_test(file, arg, name, 64*1024, 0, - KMC_KMEM | KMC_OFFSLAB); - if (rc) - return rc; + /* PAGE_SIZE - max_size */ + get_random_bytes((void *)&rnd, sizeof (uint32_t)); + size = MAX(rnd % (max_size + 1), PAGE_SIZE), - rc = splat_kmem_cache_test(file, arg, name, 1024*1024, 0, - KMC_VMEM | KMC_OFFSLAB); - if (rc) - return rc; + /* 2^N where (3 <= N <= PAGE_SHIFT) */ + get_random_bytes((void *)&rnd, sizeof (uint32_t)); + align = (1 << MAX(3, rnd % (PAGE_SHIFT + 1))); - rc = splat_kmem_cache_test(file, arg, name, 16*1024*1024, 0, - KMC_VMEM | KMC_OFFSLAB); + rc = splat_kmem_cache_test(file, arg, name, size, align, flags); + if (rc) + return (rc); + } - return rc; + return (rc); } /* @@ -842,14 +903,20 @@ static int splat_kmem_test7(struct file *file, void *arg) { char *name = SPLAT_KMEM_TEST7_NAME; + int max_size = (SPL_KMEM_CACHE_MAX_SIZE * 1024 * 1024) / 2; int i, rc; for (i = SPL_KMEM_CACHE_ALIGN; i <= PAGE_SIZE; i *= 2) { - rc = splat_kmem_cache_test(file, arg, name, 157, i, 0); + uint32_t size; + + get_random_bytes((void *)&size, sizeof (uint32_t)); + size = MAX(size % (max_size + 1), 32); + + rc = splat_kmem_cache_test(file, arg, name, size, i, 0); if (rc) return rc; - rc = splat_kmem_cache_test(file, arg, name, 157, i, + rc = splat_kmem_cache_test(file, arg, name, size, i, KMC_OFFSLAB); if (rc) return rc; From c65cc486956d90c3f391e4e32473b5759884ddd4 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 9 Jan 2015 14:00:34 -0800 Subject: [PATCH 11/11] Fix kmem cache deadlock logic The kmem cache implementation always adds new slabs by dispatching a task to the spl_kmem_cache taskq to perform the allocation. This is done because large slabs must be allocated using vmalloc(). It is possible these allocations will block on IO because the GFP_NOIO flag is not honored. This can result in a deadlock. Therefore, a deadlock detection strategy was implemented to deal with this case. When it is determined, by timeout, that the spl_kmem_cache thread has deadlocked attempting to add a new slab. Then all callers attempting to allocate from the cache fall back to using kmalloc() which does honor all passed flags. This logic was correct but an optimization in the code allowed for a deadlock. Because only slabs backed by vmalloc() can deadlock in the way described above. An optimization was made to only invoke this deadlock detection code for vmalloc() backed caches. This had the advantage of making it easy to distinguish these objects when they were freed. But this isn't strictly safe. If all the spl_kmem_cache threads end up deadlocked then we can't grow any of the other caches either. This can once again result in a deadlock if memory needs to be allocated from one of these other caches to ensure forward progress. The fix here is to remove the optimization which limits this fall back allocation strategy to vmalloc() backed caches. Doing this means we may need to take the cache lock in spl_kmem_cache_free() call path. This small cost is somewhat mitigated by ignoring objects with virtual addresses which could not have been allocated by kmalloc(). For good measure the default number of spl_kmem_cache threads has been increased from 1 to 4 (and made tunable). This alone wouldn't resolve the original issue since it's still possible for all the threads to be deadlocked. However, it does help responsiveness by ensuring that a single deadlocked spl_kmem_cache thread can't block allocations from all other caches until the timeout is reached. Signed-off-by: Brian Behlendorf --- module/spl/spl-kmem-cache.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c index 5160aa1c..38fee703 100644 --- a/module/spl/spl-kmem-cache.c +++ b/module/spl/spl-kmem-cache.c @@ -139,6 +139,15 @@ module_param(spl_kmem_cache_kmem_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, "Objects less than N bytes use the kmalloc"); +/* + * The number of threads available to allocate new slabs for caches. This + * should not need to be tuned but it is available for performance analysis. + */ +unsigned int spl_kmem_cache_kmem_threads = 4; +module_param(spl_kmem_cache_kmem_threads, uint, 0444); +MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, + "Number of spl_kmem_cache threads"); + /* * Slab allocation interfaces * @@ -544,14 +553,14 @@ spl_emergency_free(spl_kmem_cache_t *skc, void *obj) spin_lock(&skc->skc_lock); ske = spl_emergency_search(&skc->skc_emergency_tree, obj); - if (likely(ske)) { + if (ske) { rb_erase(&ske->ske_node, &skc->skc_emergency_tree); skc->skc_obj_emergency--; skc->skc_obj_total--; } spin_unlock(&skc->skc_lock); - if (unlikely(ske == NULL)) + if (ske == NULL) return (-ENOENT); kfree(ske->ske_obj); @@ -1237,7 +1246,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) remaining = wait_event_timeout(skc->skc_waitq, spl_cache_grow_wait(skc), HZ / 10); - if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) { + if (!remaining) { spin_lock(&skc->skc_lock); if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); @@ -1464,6 +1473,7 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) spl_kmem_magazine_t *skm; unsigned long flags; int do_reclaim = 0; + int do_emergency = 0; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); @@ -1484,13 +1494,18 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) } /* - * Only virtual slabs may have emergency objects and these objects - * are guaranteed to have physical addresses. They must be removed - * from the tree of emergency objects and the freed. + * While a cache has outstanding emergency objects all freed objects + * must be checked. However, since emergency objects will never use + * a virtual address these objects can be safely excluded as an + * optimization. */ - if ((skc->skc_flags & KMC_VMEM) && !is_vmalloc_addr(obj)) { - spl_emergency_free(skc, obj); - goto out; + if (!is_vmalloc_addr(obj)) { + spin_lock(&skc->skc_lock); + do_emergency = (skc->skc_obj_emergency > 0); + spin_unlock(&skc->skc_lock); + + if (do_emergency && (spl_emergency_free(skc, obj) == 0)) + goto out; } local_irq_save(flags); @@ -1702,7 +1717,7 @@ spl_kmem_cache_init(void) init_rwsem(&spl_kmem_cache_sem); INIT_LIST_HEAD(&spl_kmem_cache_list); spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", - 1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE); + spl_kmem_cache_kmem_threads, maxclsyspri, 1, 32, TASKQ_PREPOPULATE); spl_register_shrinker(&spl_kmem_cache_shrinker); return (0);