This repository has been archived by the owner on Feb 26, 2020. It is now read-only.

Kmem rework #369

Closed
wants to merge 8 commits
1 change: 1 addition & 0 deletions include/sys/Makefile.am
@@ -94,6 +94,7 @@ KERNEL_H = \
$(top_srcdir)/include/sys/varargs.h \
$(top_srcdir)/include/sys/vfs.h \
$(top_srcdir)/include/sys/vfs_opreg.h \
$(top_srcdir)/include/sys/vmem.h \
$(top_srcdir)/include/sys/vmsystm.h \
$(top_srcdir)/include/sys/vnode.h \
$(top_srcdir)/include/sys/zmod.h \
258 changes: 74 additions & 184 deletions include/sys/kmem.h
@@ -35,164 +35,87 @@
#include <linux/ctype.h>
#include <asm/atomic.h>
#include <sys/types.h>
#include <sys/vmem.h>
#include <sys/vmsystm.h>
#include <sys/kstat.h>
#include <sys/taskq.h>

/*
* Memory allocation interfaces
*/
#define KM_SLEEP GFP_KERNEL /* Can sleep, never fails */
#define KM_NOSLEEP GFP_ATOMIC /* Can not sleep, may fail */
#define KM_PUSHPAGE (GFP_NOIO | __GFP_HIGH) /* Use reserved memory */
#define KM_NODEBUG __GFP_NOWARN /* Suppress warnings */
#define KM_FLAGS __GFP_BITS_MASK
#define KM_VMFLAGS GFP_LEVEL_MASK
#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */
#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */
#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */
#define KM_ZERO 0x1000 /* zero the allocation */
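
/*
 * Illustrative usage (sketch, not part of this change): KM_SLEEP callers
 * may block and can assume the allocation succeeds, KM_NOSLEEP callers
 * must check for NULL, and KM_PUSHPAGE is reserved for writeback and
 * transaction paths which must not recurse into the filesystem.
 *
 *	buf = kmem_zalloc(size, KM_SLEEP);	never returns NULL
 *
 *	buf = kmem_alloc(size, KM_NOSLEEP);	may return NULL
 *	if (buf == NULL)
 *		return (ENOMEM);
 */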

/*
* Used internally, the kernel does not need to support this flag
*/
#ifndef __GFP_ZERO
# define __GFP_ZERO 0x8000
#endif

/*
* PF_NOFS is a per-process debug flag which is set in current->flags to
* detect when a process is performing an unsafe allocation. All tasks
* with PF_NOFS set must strictly use KM_PUSHPAGE for allocations because
if they enter direct reclaim and initiate I/O they may deadlock.
*
* When debugging is disabled, any incorrect usage will be detected and
* a call stack with warning will be printed to the console. The flags
* will then be automatically corrected to allow for safe execution. If
* debugging is enabled this will be treated as a fatal condition.
*
* To avoid any risk of conflicting with the existing PF_ flags, the
* PF_NOFS bit shadows the rarely used PF_MUTEX_TESTER bit. Only when
* CONFIG_RT_MUTEX_TESTER is not set, and we know this bit is unused,
* will the PF_NOFS bit be valid. Happily, most existing distributions
* ship a kernel with CONFIG_RT_MUTEX_TESTER disabled.
*/
#if !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER)
# define PF_NOFS PF_MUTEX_TESTER
#define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE)

static inline void
sanitize_flags(struct task_struct *p, gfp_t *flags)
{
if (unlikely((p->flags & PF_NOFS) && (*flags & (__GFP_IO|__GFP_FS)))) {
# ifdef NDEBUG
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "Fixing allocation for "
"task %s (%d) which used GFP flags 0x%x with PF_NOFS set\n",
p->comm, p->pid, *flags);
spl_debug_dumpstack(p);
*flags &= ~(__GFP_IO|__GFP_FS);
# else
PANIC("FATAL allocation for task %s (%d) which used GFP "
flags 0x%x with PF_NOFS set\n", p->comm, p->pid, *flags);
# endif /* NDEBUG */
}
}
#else
# define PF_NOFS 0x00000000
# define sanitize_flags(p, fl) ((void)0)
#endif /* !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER) */
/* XXX: Modify the code to stop using these */
#define KM_NODEBUG 0x0

/*
* __GFP_NOFAIL looks like it will be removed from the kernel perhaps as
* early as 2.6.32. To avoid this issue when it occurs in upstream kernels
we retry the allocation here as long as __GFP_WAIT is set (i.e. not GFP_ATOMIC).
* I would prefer the caller handle the failure case cleanly but we are
* trying to emulate Solaris and those are not the Solaris semantics.
* We use a special process flag to avoid recursive callbacks into
* the filesystem during transactions. We will also issue our own
* warnings, so we explicitly skip any generic ones (silly of us).
*/
static inline void *
kmalloc_nofail(size_t size, gfp_t flags)
static inline gfp_t
kmem_flags_convert(int flags)
{
void *ptr;
gfp_t lflags = __GFP_NOWARN;

if (flags & KM_NOSLEEP) {
lflags |= GFP_ATOMIC | __GFP_NORETRY;
} else {
lflags |= GFP_KERNEL;
if ((current->flags & PF_FSTRANS))
lflags &= ~(__GFP_IO|__GFP_FS);
}

sanitize_flags(current, &flags);
if (flags & KM_PUSHPAGE)
lflags |= __GFP_HIGH;

do {
ptr = kmalloc(size, flags);
} while (ptr == NULL && (flags & __GFP_WAIT));
if (flags & KM_ZERO)
lflags |= __GFP_ZERO;

return ptr;
return (lflags);
}
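
/*
 * Worked example (illustrative, derived from kmem_flags_convert() above,
 * assuming PF_FSTRANS is clear in current->flags):
 *
 *	kmem_flags_convert(KM_SLEEP)
 *	    == __GFP_NOWARN | GFP_KERNEL
 *	kmem_flags_convert(KM_NOSLEEP)
 *	    == __GFP_NOWARN | GFP_ATOMIC | __GFP_NORETRY
 *	kmem_flags_convert(KM_PUSHPAGE | KM_ZERO)
 *	    == __GFP_NOWARN | GFP_KERNEL | __GFP_HIGH | __GFP_ZERO
 *
 * When PF_FSTRANS is set the sleeping variants additionally drop
 * __GFP_IO and __GFP_FS so reclaim cannot re-enter the filesystem.
 */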

static inline void *
kzalloc_nofail(size_t size, gfp_t flags)
{
void *ptr;

sanitize_flags(current, &flags);

do {
ptr = kzalloc(size, flags);
} while (ptr == NULL && (flags & __GFP_WAIT));

return ptr;
}
typedef struct {
struct task_struct *fstrans_thread;
unsigned int saved_flags;
} fstrans_cookie_t;

static inline void *
kmalloc_node_nofail(size_t size, gfp_t flags, int node)
static inline fstrans_cookie_t
spl_fstrans_mark(void)
{
void *ptr;
fstrans_cookie_t cookie;

sanitize_flags(current, &flags);
VERIFY(cookie.fstrans_thread = current);

do {
ptr = kmalloc_node(size, flags, node);
} while (ptr == NULL && (flags & __GFP_WAIT));
cookie.saved_flags = current->flags & PF_FSTRANS;
current->flags |= PF_FSTRANS;

return ptr;
return (cookie);
}

static inline void *
vmalloc_nofail(size_t size, gfp_t flags)
static inline void
spl_fstrans_unmark(fstrans_cookie_t cookie)
{
void *ptr;

sanitize_flags(current, &flags);

/*
* Retry failed __vmalloc() allocations once every second. The
* rationale for the delay is that the likely failure modes are:
*
* 1) The system has completely exhausted memory, in which case
* delaying 1 second for the memory reclaim to run is reasonable
* to avoid thrashing the system.
* 2) The system has memory but has exhausted the small virtual
* address space available on 32-bit systems. Retrying the
* allocation immediately will only result in spinning on the
* virtual address space lock. It is better to delay a second and
* hope that another process will free some of the address space.
* But the bottom line is there is not much we can actually do
* since we can never safely return a failure and honor the
* Solaris semantics.
*/
while (1) {
ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) {
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ);
} else {
break;
}
}
VERIFY(cookie.fstrans_thread == current);
VERIFY(current->flags & PF_FSTRANS);

return ptr;
current->flags &= ~(PF_FSTRANS);
current->flags |= cookie.saved_flags;
}
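
/*
 * Typical usage (illustrative sketch, not part of this change): bracket
 * any region which must not recurse into the filesystem through direct
 * reclaim, such as code running inside a transaction.
 *
 *	fstrans_cookie_t cookie;
 *
 *	cookie = spl_fstrans_mark();
 *	... allocations made here have __GFP_IO and __GFP_FS stripped
 *	... by kmem_flags_convert() because PF_FSTRANS is now set
 *	spl_fstrans_unmark(cookie);
 */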

static inline void *
vzalloc_nofail(size_t size, gfp_t flags)
{
void *ptr;

ptr = vmalloc_nofail(size, flags);
if (ptr)
memset(ptr, 0, (size));
/*
* This is a version of vmalloc() that hooks into PF_FSTRANS.
*/
extern void *spl_vmalloc(unsigned long size, gfp_t lflags, pgprot_t prot);

return ptr;
}
extern void *spl_kmem_alloc(size_t size, int flags);
extern void *spl_kmem_zalloc(size_t size, int flags);
extern void spl_kmem_free(const void *buf, size_t size);
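
/*
 * Illustrative usage of the new entry points (sketch only): callers free
 * exactly the size they allocated, and spl_vmalloc() takes GFP flags plus
 * a page protection, mirroring the three argument __vmalloc().
 *
 *	buf = spl_kmem_zalloc(size, KM_SLEEP);
 *	...
 *	spl_kmem_free(buf, size);
 *
 *	big = spl_vmalloc(size, kmem_flags_convert(KM_SLEEP), PAGE_KERNEL);
 */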

#ifdef DEBUG_KMEM

@@ -205,31 +128,19 @@ vzalloc_nofail(size_t size, gfp_t flags)
# define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used)
# define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used)
# define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size)
# define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used)
# define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used)
# define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used)
# define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size)

extern atomic64_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max;
extern atomic64_t vmem_alloc_used;
extern unsigned long long vmem_alloc_max;

# else /* HAVE_ATOMIC64_T */

# define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used)
# define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used)
# define kmem_alloc_used_read() atomic_read(&kmem_alloc_used)
# define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size)
# define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used)
# define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used)
# define vmem_alloc_used_read() atomic_read(&vmem_alloc_used)
# define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size)

extern atomic_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max;
extern atomic_t vmem_alloc_used;
extern unsigned long long vmem_alloc_max;

# endif /* HAVE_ATOMIC64_T */
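
/*
 * Sketch of how these counters are typically driven (an assumption about
 * the implementation in the .c file, shown only to clarify the macros;
 * spl_kmem_alloc_impl is a hypothetical helper name):
 *
 *	ptr = spl_kmem_alloc_impl(size, flags, node);
 *	if (ptr != NULL) {
 *		kmem_alloc_used_add(size);
 *		if (kmem_alloc_used_read() > kmem_alloc_max)
 *			kmem_alloc_max = kmem_alloc_used_read();
 *	}
 *
 * and kmem_alloc_used_sub(size) is called from kmem_free().
 */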

@@ -246,23 +157,15 @@ extern unsigned long long vmem_alloc_max;
* --enable-debug-kmem-tracking to configure.
*/
# define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \
__FUNCTION__, __LINE__, 0, 0)
# define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
__FUNCTION__, __LINE__, 0, 0)
# define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \
__FUNCTION__, __LINE__, 1, nd)
__FUNCTION__, __LINE__, \
NUMA_NO_NODE)
# define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|KM_ZERO,\
__FUNCTION__, __LINE__, \
NUMA_NO_NODE)
# define kmem_free(ptr, sz) kmem_free_track((ptr), (sz))

# define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \
__FUNCTION__, __LINE__)
# define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
__FUNCTION__, __LINE__)
# define vmem_free(ptr, sz) vmem_free_track((ptr), (sz))

extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
extern void *kmem_alloc_track(size_t, int, const char *, int, int);
extern void kmem_free_track(const void *, size_t);
extern void *vmem_alloc_track(size_t, int, const char *, int);
extern void vmem_free_track(const void *, size_t);

# else /* DEBUG_KMEM_TRACKING */
/*
@@ -275,23 +178,15 @@ extern void vmem_free_track(const void *, size_t);
* pass the --disable-debug-kmem option to configure.
*/
# define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \
__FUNCTION__, __LINE__, 0, 0)
# define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
__FUNCTION__, __LINE__, 0, 0)
# define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \
__FUNCTION__, __LINE__, 1, nd)
__FUNCTION__, __LINE__, \
NUMA_NO_NODE)
# define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|KM_ZERO,\
__FUNCTION__, __LINE__, \
NUMA_NO_NODE)
# define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz))

# define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \
__FUNCTION__, __LINE__)
# define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
__FUNCTION__, __LINE__)
# define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz))

extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
extern void *kmem_alloc_debug(size_t, int, const char *, int, int);
extern void kmem_free_debug(const void *, size_t);
extern void *vmem_alloc_debug(size_t, int, const char *, int);
extern void vmem_free_debug(const void *, size_t);

# endif /* DEBUG_KMEM_TRACKING */
#else /* DEBUG_KMEM */
@@ -302,14 +197,9 @@ extern void vmem_free_debug(const void *, size_t);
* minimal memory accounting. To enable basic accounting pass the
* --enable-debug-kmem option to configure.
*/
# define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl))
# define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl))
# define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd))
# define kmem_free(ptr, sz) ((void)(sz), kfree(ptr))

# define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl))
# define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl))
# define vmem_free(ptr, sz) ((void)(sz), vfree(ptr))
# define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl))
# define kmem_zalloc(sz, fl) spl_kmem_zalloc((sz), (fl))
# define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz))

#endif /* DEBUG_KMEM */

@@ -340,12 +230,13 @@ enum {
KMC_BIT_OFFSLAB = 8, /* Objects not on slab */
KMC_BIT_NOEMERGENCY = 9, /* Disable emergency objects */
KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */
KMC_BIT_GROWING = 15, /* Growing in progress */
KMC_BIT_REAPING = 16, /* Reaping in progress */
KMC_BIT_DESTROY = 17, /* Destroy in progress */
KMC_BIT_TOTAL = 18, /* Proc handler helper bit */
KMC_BIT_ALLOC = 19, /* Proc handler helper bit */
KMC_BIT_MAX = 20, /* Proc handler helper bit */
KMC_BIT_GROWING = 15, /* Growing in progress for KM_SLEEP */
KMC_BIT_GROWING_HIGH = 16, /* Growing in progress */
KMC_BIT_REAPING = 17, /* Reaping in progress */
KMC_BIT_DESTROY = 18, /* Destroy in progress */
KMC_BIT_TOTAL = 19, /* Proc handler helper bit */
KMC_BIT_ALLOC = 20, /* Proc handler helper bit */
KMC_BIT_MAX = 21, /* Proc handler helper bit */
};
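
/*
 * Illustrative sketch (not part of this change): skc_flags is manipulated
 * with the standard Linux atomic bitops, and the KMC_* masks below are the
 * corresponding values for reading the word as a whole.
 *
 *	set_bit(KMC_BIT_GROWING, &skc->skc_flags);
 *	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
 *		return;
 *	clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
 */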

/* kmem move callback return values */
Expand All @@ -369,6 +260,7 @@ typedef enum kmem_cbrc {
#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY)
#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED)
#define KMC_GROWING (1 << KMC_BIT_GROWING)
#define KMC_GROWING_HIGH (1 << KMC_BIT_GROWING_HIGH)
#define KMC_REAPING (1 << KMC_BIT_REAPING)
#define KMC_DESTROY (1 << KMC_BIT_DESTROY)
#define KMC_TOTAL (1 << KMC_BIT_TOTAL)
@@ -510,8 +402,6 @@ void spl_kmem_fini(void);
#define kmem_cache_reap_now(skc) \
spl_kmem_cache_reap_now(skc, skc->skc_reap)
#define kmem_reap() spl_kmem_reap()
#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \
((ptr) < (void *)VMALLOC_END))
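
/*
 * Note: the removed kmem_virt() macro tested whether a pointer falls in
 * the vmalloc address range; the kernel helper is_vmalloc_addr() from
 * <linux/mm.h> performs the equivalent check, e.g.:
 *
 *	if (is_vmalloc_addr(ptr))
 *		vfree(ptr);
 *	else
 *		kfree(ptr);
 */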

/*
* Allow custom slab allocation flags to be set for KMC_SLAB based caches.