diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-02-21-14-19-35.bpo-37448.btl7vO.rst b/Misc/NEWS.d/next/Core and Builtins/2021-02-21-14-19-35.bpo-37448.btl7vO.rst
new file mode 100644
index 00000000000000..fe771a50122f69
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-02-21-14-19-35.bpo-37448.btl7vO.rst
@@ -0,0 +1,15 @@
+Add a radix tree based memory map to track in-use obmalloc arenas.  It is
+used to replace the old implementation of address_in_range().  The radix tree
+approach makes it easy to increase pool sizes beyond the OS page size.
+Boosting the pool and arena size allows obmalloc to handle a significantly
+higher percentage of requests from its ultra-fast paths.
+
+It also has the advantage of eliminating the sanitizer-unsafe memory reads
+performed by the previous address_in_range().  The old address_in_range()
+was marked with the annotations _Py_NO_SANITIZE_ADDRESS, _Py_NO_SANITIZE_THREAD,
+and _Py_NO_SANITIZE_MEMORY.  Those annotations are no longer needed.
+
+To disable the radix tree map, set a preprocessor flag as follows:
+``-DWITH_PYMALLOC_RADIX_TREE=0``.
+
+Co-authored-by: Tim Peters
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 03d0e8e51264c6..76ff6f9c99bc9c 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -894,6 +894,22 @@ static int running_on_valgrind = -1;
 #endif
 #endif
 
+#if !defined(WITH_PYMALLOC_RADIX_TREE)
+/* Use radix-tree to track arena memory regions, for address_in_range().
+ * Enabled by default since it allows larger pool sizes.  Can be disabled
+ * using -DWITH_PYMALLOC_RADIX_TREE=0 */
+#define WITH_PYMALLOC_RADIX_TREE 1
+#endif
+
+#if SIZEOF_VOID_P > 4
+/* on 64-bit platforms use larger pools and arenas if we can */
+#define USE_LARGE_ARENAS
+#if WITH_PYMALLOC_RADIX_TREE
+/* large pools only supported if radix-tree is enabled */
+#define USE_LARGE_POOLS
+#endif
+#endif
+
 /*
  * The allocator sub-allocates blocks of memory (called arenas) aligned
  * on a page boundary.  This is a reserved virtual address space for the
@@ -907,18 +923,34 @@ static int running_on_valgrind = -1;
  * Arenas are allocated with mmap() on systems supporting anonymous memory
  * mappings to reduce heap fragmentation.
  */
-#define ARENA_SIZE              (256 << 10)     /* 256KB */
+#ifdef USE_LARGE_ARENAS
+#define ARENA_BITS              20                    /* 1 MiB */
+#else
+#define ARENA_BITS              18                    /* 256 KiB */
+#endif
+#define ARENA_SIZE              (1 << ARENA_BITS)
+#define ARENA_SIZE_MASK         (ARENA_SIZE - 1)
 
 #ifdef WITH_MEMORY_LIMITS
 #define MAX_ARENAS              (SMALL_MEMORY_LIMIT / ARENA_SIZE)
 #endif
 
 /*
- * Size of the pools used for small blocks. Should be a power of 2,
- * between 1K and SYSTEM_PAGE_SIZE, that is: 1k, 2k, 4k.
+ * Size of the pools used for small blocks. Must be a power of 2.
  */
-#define POOL_SIZE               SYSTEM_PAGE_SIZE        /* must be 2^N */
-#define POOL_SIZE_MASK          SYSTEM_PAGE_SIZE_MASK
+#ifdef USE_LARGE_POOLS
+#define POOL_BITS               14                  /* 16 KiB */
+#else
+#define POOL_BITS               12                  /* 4 KiB */
+#endif
+#define POOL_SIZE               (1 << POOL_BITS)
+#define POOL_SIZE_MASK          (POOL_SIZE - 1)
+
+#if !WITH_PYMALLOC_RADIX_TREE
+#if POOL_SIZE != SYSTEM_PAGE_SIZE
+# error "pool size must be equal to system page size"
+#endif
+#endif
 
 #define MAX_POOLS_IN_ARENA  (ARENA_SIZE / POOL_SIZE)
 #if MAX_POOLS_IN_ARENA * POOL_SIZE != ARENA_SIZE
@@ -1233,6 +1265,264 @@ _Py_GetAllocatedBlocks(void)
     return n;
 }
 
+#if WITH_PYMALLOC_RADIX_TREE
+/*==========================================================================*/
+/* radix tree for tracking arena usage
+
+   bit allocation for keys
+
+   64-bit pointers and 2^20 arena size:
+     16 -> ignored (POINTER_BITS - ADDRESS_BITS)
+     10 -> MAP_TOP
+     10 -> MAP_MID
+      8 -> MAP_BOT
+     20 -> ideal aligned arena
+   ----
+     64
+
+   32-bit pointers and 2^18 arena size:
+     14 -> MAP_BOT
+     18 -> ideal aligned arena
+   ----
+     32
+
+*/
+
+#if SIZEOF_VOID_P == 8
+
+/* number of bits in a pointer */
+#define POINTER_BITS 64
+
+/* Current 64-bit processors are limited to 48-bit physical addresses.  For
+ * now, the top 17 bits of addresses will all be equal to bit 2**47.  If that
+ * changes in the future, this must be adjusted upwards.
+ */
+#define ADDRESS_BITS 48
+
+/* use the top and mid layers of the radix tree */
+#define USE_INTERIOR_NODES
+
+#elif SIZEOF_VOID_P == 4
+
+#define POINTER_BITS 32
+#define ADDRESS_BITS 32
+
+#else
+
+ /* Currently this code works for 64-bit or 32-bit pointers only. */
+#error "obmalloc radix tree requires 64-bit or 32-bit pointers."
+
+#endif /* SIZEOF_VOID_P */
+
+/* arena_coverage_t members require this to be true */
+#if ARENA_BITS >= 32
+# error "arena size must be < 2^32"
+#endif
+
+#ifdef USE_INTERIOR_NODES
+/* number of bits used for MAP_TOP and MAP_MID nodes */
+#define INTERIOR_BITS ((ADDRESS_BITS - ARENA_BITS + 2) / 3)
+#else
+#define INTERIOR_BITS 0
+#endif
+
+#define MAP_TOP_BITS  INTERIOR_BITS
+#define MAP_TOP_LENGTH  (1 << MAP_TOP_BITS)
+#define MAP_TOP_MASK  (MAP_TOP_LENGTH - 1)
+
+#define MAP_MID_BITS  INTERIOR_BITS
+#define MAP_MID_LENGTH  (1 << MAP_MID_BITS)
+#define MAP_MID_MASK  (MAP_MID_LENGTH - 1)
+
+#define MAP_BOT_BITS  (ADDRESS_BITS - ARENA_BITS - 2*INTERIOR_BITS)
+#define MAP_BOT_LENGTH  (1 << MAP_BOT_BITS)
+#define MAP_BOT_MASK  (MAP_BOT_LENGTH - 1)
+
+#define MAP_BOT_SHIFT ARENA_BITS
+#define MAP_MID_SHIFT (MAP_BOT_BITS + MAP_BOT_SHIFT)
+#define MAP_TOP_SHIFT (MAP_MID_BITS + MAP_MID_SHIFT)
+
+#define AS_UINT(p) ((uintptr_t)(p))
+#define MAP_BOT_INDEX(p) ((AS_UINT(p) >> MAP_BOT_SHIFT) & MAP_BOT_MASK)
+#define MAP_MID_INDEX(p) ((AS_UINT(p) >> MAP_MID_SHIFT) & MAP_MID_MASK)
+#define MAP_TOP_INDEX(p) ((AS_UINT(p) >> MAP_TOP_SHIFT) & MAP_TOP_MASK)
+
+#if POINTER_BITS > ADDRESS_BITS
+/* Return the non-physical address bits of a pointer.  Those bits should be
+ * the same for all valid pointers if ADDRESS_BITS is set correctly.  Linux
+ * has support for 57-bit address space (Intel 5-level paging) but will not
+ * currently give those addresses to user space.
+ */
+#define HIGH_BITS(p) (AS_UINT(p) >> ADDRESS_BITS)
+#else
+#define HIGH_BITS(p) 0
+#endif
+
+
+/* This is the leaf of the radix tree.  See arena_map_mark_used() for the
+ * meaning of these members. */
+typedef struct {
+    int32_t tail_hi;
+    int32_t tail_lo;
+} arena_coverage_t;
+
+typedef struct arena_map_bot {
+    /* The members tail_hi and tail_lo are accessed together.
+     * So, it's better to have them as an array of structs, rather than
+     * two arrays.
+     */
+    arena_coverage_t arenas[MAP_BOT_LENGTH];
+} arena_map_bot_t;
+
+#ifdef USE_INTERIOR_NODES
+typedef struct arena_map_mid {
+    struct arena_map_bot *ptrs[MAP_MID_LENGTH];
+} arena_map_mid_t;
+
+typedef struct arena_map_top {
+    struct arena_map_mid *ptrs[MAP_TOP_LENGTH];
+} arena_map_top_t;
+#endif
+
+/* The root of the radix tree.  Note that by initializing like this, the
+ * memory should be in the BSS.  The OS will only memory map pages as the
+ * MAP_MID nodes get used (OS pages are demand loaded as needed).
+ */
+#ifdef USE_INTERIOR_NODES
+static arena_map_top_t arena_map_root;
+/* accounting for number of used interior nodes */
+static int arena_map_mid_count;
+static int arena_map_bot_count;
+#else
+static arena_map_bot_t arena_map_root;
+#endif
+
+/* Return a pointer to a bottom tree node; return NULL if it doesn't exist
+ * or cannot be created */
+static arena_map_bot_t *
+arena_map_get(block *p, int create)
+{
+#ifdef USE_INTERIOR_NODES
+    /* sanity check that ADDRESS_BITS is correct */
+    assert(HIGH_BITS(p) == HIGH_BITS(&arena_map_root));
+    int i1 = MAP_TOP_INDEX(p);
+    if (arena_map_root.ptrs[i1] == NULL) {
+        if (!create) {
+            return NULL;
+        }
+        arena_map_mid_t *n = PyMem_RawCalloc(1, sizeof(arena_map_mid_t));
+        if (n == NULL) {
+            return NULL;
+        }
+        arena_map_root.ptrs[i1] = n;
+        arena_map_mid_count++;
+    }
+    int i2 = MAP_MID_INDEX(p);
+    if (arena_map_root.ptrs[i1]->ptrs[i2] == NULL) {
+        if (!create) {
+            return NULL;
+        }
+        arena_map_bot_t *n = PyMem_RawCalloc(1, sizeof(arena_map_bot_t));
+        if (n == NULL) {
+            return NULL;
+        }
+        arena_map_root.ptrs[i1]->ptrs[i2] = n;
+        arena_map_bot_count++;
+    }
+    return arena_map_root.ptrs[i1]->ptrs[i2];
+#else
+    return &arena_map_root;
+#endif
+}
+
+
+/* The radix tree only tracks arenas.  So, for 1 MiB arenas, we throw
+ * away the low 20 bits of the address.  That reduces the space
+ * requirement of the tree compared to similar radix tree page-map
+ * schemes.  In exchange for slashing the space requirement, it needs
+ * more computation to check an address.
+ *
+ * Tracking coverage is done by "ideal" arena address.  It is easier to
+ * explain in decimal so let's say that the arena size is 100 bytes.
+ * Then, ideal addresses are 100, 200, 300, etc.  For checking if a
+ * pointer address is inside an actual arena, we have to check two ideal
+ * arena addresses.  E.g. if the pointer is 357, we need to check 200 and
+ * 300.  In the rare case that an arena is aligned in the ideal way
+ * (e.g. the base address of the arena is 200) then we only have to
+ * check one ideal address.
+ *
+ * The tree nodes for 200 and 300 both store the address of the arena.
+ * There are two cases: the arena starts at a lower ideal arena and
+ * extends to this one, or the arena starts in this arena and extends to
+ * the next ideal arena.  The tail_lo and tail_hi members correspond to
+ * these two cases.
+ */
+
+
+/* mark or unmark addresses covered by arena */
+static int
+arena_map_mark_used(uintptr_t arena_base, int is_used)
+{
+    /* sanity check that ADDRESS_BITS is correct */
+    assert(HIGH_BITS(arena_base) == HIGH_BITS(&arena_map_root));
+    arena_map_bot_t *n_hi = arena_map_get((block *)arena_base, is_used);
+    if (n_hi == NULL) {
+        assert(is_used); /* otherwise node should already exist */
+        return 0; /* failed to allocate space for node */
+    }
+    int i3 = MAP_BOT_INDEX((block *)arena_base);
+    int32_t tail = (int32_t)(arena_base & ARENA_SIZE_MASK);
+    if (tail == 0) {
+        /* is ideal arena address */
+        n_hi->arenas[i3].tail_hi = is_used ? -1 : 0;
+    }
+    else {
+        /* arena_base address is not ideal (aligned to arena size) and
+         * so it potentially covers two MAP_BOT nodes.  Get the MAP_BOT node
+         * for the next arena.  Note that it might be in different MAP_TOP
+         * and MAP_MID nodes as well so we need to call arena_map_get()
+         * again (do the full tree traversal).
+         */
+        n_hi->arenas[i3].tail_hi = is_used ? tail : 0;
+        uintptr_t arena_base_next = arena_base + ARENA_SIZE;
+        /* If arena_base is a legit arena address, so is arena_base_next - 1
+         * (last address in arena).  If arena_base_next overflows then it
+         * must overflow to 0.  However, that would mean arena_base was
+         * "ideal" and we should not be in this case. */
+        assert(arena_base < arena_base_next);
+        arena_map_bot_t *n_lo = arena_map_get((block *)arena_base_next, is_used);
+        if (n_lo == NULL) {
+            assert(is_used); /* otherwise should already exist */
+            n_hi->arenas[i3].tail_hi = 0;
+            return 0; /* failed to allocate space for node */
+        }
+        int i3_next = MAP_BOT_INDEX(arena_base_next);
+        n_lo->arenas[i3_next].tail_lo = is_used ? tail : 0;
+    }
+    return 1;
+}
+
+/* Return true if 'p' is a pointer inside an obmalloc arena.
+ * _PyObject_Free() calls this so it needs to be very fast. */
+static int
+arena_map_is_used(block *p)
+{
+    arena_map_bot_t *n = arena_map_get(p, 0);
+    if (n == NULL) {
+        return 0;
+    }
+    int i3 = MAP_BOT_INDEX(p);
+    /* ARENA_BITS must be < 32 so that the tail is a non-negative int32_t. */
+    int32_t hi = n->arenas[i3].tail_hi;
+    int32_t lo = n->arenas[i3].tail_lo;
+    int32_t tail = (int32_t)(AS_UINT(p) & ARENA_SIZE_MASK);
+    return (tail < lo) || (tail >= hi && hi != 0);
+}
+
+/* end of radix tree logic */
+/*==========================================================================*/
+#endif /* WITH_PYMALLOC_RADIX_TREE */
+
 
 /* Allocate a new arena.  If we run out of memory, return NULL.  Else
  * allocate a new arena, and return the address of an arena_object
@@ -1302,6 +1592,15 @@ new_arena(void)
         unused_arena_objects = arenaobj->nextarena;
         assert(arenaobj->address == 0);
         address = _PyObject_Arena.alloc(_PyObject_Arena.ctx, ARENA_SIZE);
+#if WITH_PYMALLOC_RADIX_TREE
+    if (address != NULL) {
+        if (!arena_map_mark_used((uintptr_t)address, 1)) {
+            /* marking arena in radix tree failed, abort */
+            _PyObject_Arena.free(_PyObject_Arena.ctx, address, ARENA_SIZE);
+            address = NULL;
+        }
+    }
+#endif
     if (address == NULL) {
         /* The allocation failed: return NULL after putting the
          * arenaobj back.
@@ -1332,6 +1631,17 @@
 }
 
 
+
+#if WITH_PYMALLOC_RADIX_TREE
+/* Return true if and only if P is an address that was allocated by
+   pymalloc.  When the radix tree is used, 'poolp' is unused.
+ */
+static bool
+address_in_range(void *p, poolp pool)
+{
+    return arena_map_is_used(p);
+}
+#else
 /*
 address_in_range(P, POOL)
 
@@ -1423,6 +1733,7 @@ address_in_range(void *p, poolp pool)
            arenas[arenaindex].address != 0;
 }
+#endif /* !WITH_PYMALLOC_RADIX_TREE */
 
 /*==========================================================================*/
 
@@ -1768,6 +2079,11 @@ insert_to_freepool(poolp pool)
         ao->nextarena = unused_arena_objects;
         unused_arena_objects = ao;
 
+#if WITH_PYMALLOC_RADIX_TREE
+        /* mark arena region as not under control of obmalloc */
+        arena_map_mark_used(ao->address, 0);
+#endif
+
         /* Free the entire arena. */
         _PyObject_Arena.free(_PyObject_Arena.ctx,
                              (void *)ao->address, ARENA_SIZE);
@@ -2711,6 +3027,12 @@ _PyObject_DebugMallocStats(FILE *out)
     (void)printone(out, "# arenas reclaimed", ntimes_arena_allocated - narenas);
     (void)printone(out, "# arenas highwater mark", narenas_highwater);
     (void)printone(out, "# arenas allocated current", narenas);
+#ifdef USE_INTERIOR_NODES
+    (void)printone(out, "# arena map mid nodes", arena_map_mid_count);
+    (void)printone(out, "# arena map bot nodes", arena_map_bot_count);
+    fputc('\n', out);
+#endif
+
 
     PyOS_snprintf(buf, sizeof(buf),
                   "%zu arenas * %d bytes/arena",
@@ -2729,6 +3051,15 @@ _PyObject_DebugMallocStats(FILE *out)
     total += printone(out, "# bytes lost to pool headers", pool_header_bytes);
     total += printone(out, "# bytes lost to quantization", quantization);
     total += printone(out, "# bytes lost to arena alignment", arena_alignment);
+#if WITH_PYMALLOC_RADIX_TREE
+    total += printone(out, "# bytes lost to arena map root", sizeof(arena_map_root));
+#endif
+#ifdef USE_INTERIOR_NODES
+    total += printone(out, "# bytes lost to arena map mid",
+                      sizeof(arena_map_mid_t) * arena_map_mid_count);
+    total += printone(out, "# bytes lost to arena map bot",
+                      sizeof(arena_map_bot_t) * arena_map_bot_count);
+#endif
     (void)printone(out, "Total", total);
     return 1;
 }
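
To make the tail_lo/tail_hi bookkeeping described in the comments above concrete, here is a minimal standalone C sketch, not part of the patch, that models the same check using the decimal example from the comments (100-byte arenas and a single flat leaf array; all TOY_* names are invented for illustration):

/* toy_radix_check.c -- illustrative sketch only, not CPython code.
 * Models the tail_lo/tail_hi bookkeeping from the patch above with a
 * toy arena size of 100 "bytes" so the decimal example in the comments
 * can be traced directly. */
#include <assert.h>
#include <stdio.h>
#include <stdint.h>

#define TOY_ARENA_SIZE 100          /* decimal stand-in for ARENA_SIZE */
#define TOY_NODES      64           /* enough leaf slots for the demo  */

typedef struct {
    int32_t tail_hi;   /* arena starts inside this ideal block */
    int32_t tail_lo;   /* arena started in the previous ideal block */
} toy_coverage;

static toy_coverage toy_map[TOY_NODES];

/* Mark (or unmark) the one or two ideal-arena slots covered by an arena
 * whose base address is 'base'.  Mirrors arena_map_mark_used(). */
static void
toy_mark_used(uintptr_t base, int is_used)
{
    int32_t tail = (int32_t)(base % TOY_ARENA_SIZE);
    size_t i = base / TOY_ARENA_SIZE;
    if (tail == 0) {
        /* ideally aligned arena: only one slot needed */
        toy_map[i].tail_hi = is_used ? -1 : 0;
    }
    else {
        toy_map[i].tail_hi = is_used ? tail : 0;
        toy_map[i + 1].tail_lo = is_used ? tail : 0;
    }
}

/* Return 1 if 'p' falls inside a marked arena.  Mirrors arena_map_is_used(). */
static int
toy_is_used(uintptr_t p)
{
    size_t i = p / TOY_ARENA_SIZE;
    int32_t hi = toy_map[i].tail_hi;
    int32_t lo = toy_map[i].tail_lo;
    int32_t tail = (int32_t)(p % TOY_ARENA_SIZE);
    return (tail < lo) || (tail >= hi && hi != 0);
}

int
main(void)
{
    /* Arena of 100 bytes based at 250: covers [250, 350). */
    toy_mark_used(250, 1);
    assert(toy_is_used(250));    /* first byte of the arena */
    assert(toy_is_used(299));    /* still in the "200" ideal block */
    assert(toy_is_used(300));    /* crossed into the "300" ideal block */
    assert(toy_is_used(349));    /* last byte of the arena */
    assert(!toy_is_used(249));   /* just below the arena */
    assert(!toy_is_used(350));   /* just past the arena */
    toy_mark_used(250, 0);
    assert(!toy_is_used(300));   /* unmarking clears both slots */
    printf("toy radix-tree checks passed\n");
    return 0;
}

As in the real patch, marking an unaligned arena touches two leaf slots (tail_hi in its own ideal block, tail_lo in the next one), which is why arena_map_mark_used() may call arena_map_get() a second time, while the hot-path check in arena_map_is_used() reads only a single leaf.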