From 4450445f51236c3270ec540ee9a62bff11b22b94 Mon Sep 17 00:00:00 2001
From: Sam Gross <colesbury@gmail.com>
Date: Mon, 6 Mar 2023 10:47:26 -0800
Subject: [PATCH] pymem: add _PyMem_FreeQsbr

---
 Include/Python.h                 |   1 +
 Include/cpython/pystate.h        |   4 +
 Include/internal/pycore_interp.h |   2 +
 Include/internal/pycore_pymem.h  |  13 +++
 Include/internal/pycore_qsbr.h   |   5 ++
 Objects/obmalloc.c               | 147 ++++++++++++++++++++++++++++++-
 Python/pystate.c                 |   7 ++
 Python/qsbr.c                    |  11 +++
 8 files changed, 189 insertions(+), 1 deletion(-)

diff --git a/Include/Python.h b/Include/Python.h
index 0e981239c1..7d583e91d1 100644
--- a/Include/Python.h
+++ b/Include/Python.h
@@ -76,6 +76,7 @@
 #include "cpython/cellobject.h"
 #include "iterobject.h"
 #include "cpython/initconfig.h"
+#include "cpython/pyqueue.h"
 #include "pystate.h"
 #include "cpython/genobject.h"
 #include "descrobject.h"
diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h
index 8af62f67c5..1fe84d5fc7 100644
--- a/Include/cpython/pystate.h
+++ b/Include/cpython/pystate.h
@@ -211,6 +211,10 @@ struct _ts {
     _PyStackChunk *datastack_chunk;
     PyObject **datastack_top;
     PyObject **datastack_limit;
+
+    /* Queue of data pointers to be freed */
+    struct _Py_queue_head/*<_PyMemWork>*/ mem_work;
+
     /* XXX signal handlers should also be here */
 
     /* The following fields are here to avoid allocation during init.
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index f96d2538fd..0eb95c0781 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -24,6 +24,7 @@ extern "C" {
 #include "pycore_list.h"          // struct _Py_list_state
 #include "pycore_llist.h"         // struct llist_node
 #include "pycore_global_objects.h"  // struct _Py_interp_static_objects
+#include "pycore_pymem.h"         // struct _mem_work
 #include "pycore_tuple.h"         // struct _Py_tuple_state
 #include "pycore_typeobject.h"    // struct type_cache
 #include "pycore_unicodeobject.h" // struct _Py_unicode_state
@@ -118,6 +119,7 @@ struct _is {
 
     struct _ceval_state ceval;
     struct _gc_runtime_state gc;
+    struct _mem_state mem;
 
     // sys.modules dictionary
     PyObject *modules;
diff --git a/Include/internal/pycore_pymem.h b/Include/internal/pycore_pymem.h
index 4135cc8a65..e5fb53e328 100644
--- a/Include/internal/pycore_pymem.h
+++ b/Include/internal/pycore_pymem.h
@@ -33,6 +33,13 @@ struct _pymem_allocators {
     PyObjectArenaAllocator obj_arena;
 };
 
+struct _mem_state {
+    _PyMutex mutex;
+    /* Queue of data pointers to be freed from dead threads */
+    struct _Py_queue_head/*<_PyMemWork>*/ work;
+    int nonempty;
+};
+
 /* Set the memory allocator of the specified domain to the default.
 
    Save the old allocator into *old_alloc if it's non-NULL.
@@ -92,6 +99,12 @@ PyAPI_FUNC(int) _PyMem_GetAllocatorName(
    PYMEM_ALLOCATOR_NOT_SET does nothing. */
 PyAPI_FUNC(int) _PyMem_SetupAllocators(PyMemAllocatorName allocator);
 
+/* Free the pointer after all threads are quiescent. */
+extern void _PyMem_FreeQsbr(void *ptr);
+extern void _PyMem_QsbrPoll(PyThreadState *tstate);
+extern void _PyMem_AbandonQsbr(PyThreadState *tstate);
+extern void _PyMem_QsbrFini(PyInterpreterState *interp);
+
 extern void * _PyMem_DefaultRawMalloc(size_t);
 extern void * _PyMem_DefaultRawCalloc(size_t, size_t);
 extern void * _PyMem_DefaultRawRealloc(void *, size_t);
diff --git a/Include/internal/pycore_qsbr.h b/Include/internal/pycore_qsbr.h
index 8c76eb1acb..6888ef1aea 100644
--- a/Include/internal/pycore_qsbr.h
+++ b/Include/internal/pycore_qsbr.h
@@ -13,6 +13,8 @@ struct qsbr {
     uint64_t t_seq;
     struct qsbr_shared *t_shared;
     struct qsbr *t_next;
+    int t_deferred;
+    int t_limit;
     PyThreadState *tstate;
 };
 
@@ -46,6 +48,9 @@ _Py_qsbr_init(struct qsbr_shared *shared);
 uint64_t
 _Py_qsbr_advance(struct qsbr_shared *shared);
 
+uint64_t
+_Py_qsbr_deferred_advance(struct qsbr *qsbr);
+
 bool
 _Py_qsbr_poll(struct qsbr *qsbr, uint64_t goal);
 
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 177cfcbecb..3b3bcf93d1 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -1,10 +1,11 @@
 #include "Python.h"
 #include "pycore_code.h"          // stats
 #include "pycore_pystate.h"       // _PyInterpreterState_GET
-
 #include "pycore_obmalloc.h"
 #include "pycore_pymem.h"
 #include "pycore_pymem_init.h"
+#include "pycore_pyqueue.h"
+#include "pycore_qsbr.h"
 
 #include <stdlib.h>               // malloc()
 #include <stdbool.h>
@@ -675,6 +676,150 @@ PyMem_Free(void *ptr)
 }
 
+
+typedef struct {
+    void *ptr;
+    uint64_t seq;
+} _PyMem_WorkItem;
+
+#define PY_MEM_WORK_ITEMS 127
+
+typedef struct _PyMemWork {
+    struct _Py_queue_node node;
+    unsigned int first;
+    unsigned int size;
+    _PyMem_WorkItem items[PY_MEM_WORK_ITEMS];
+} _PyMem_WorkBuf;
+
+void
+_PyMem_FreeQsbr(void *ptr)
+{
+    PyThreadState *tstate = _PyThreadState_GET();
+
+    // Try to get a non-full workbuf
+    _PyMem_WorkBuf *work = NULL;
+    if (!_Py_queue_is_empty(&tstate->mem_work)) {
+        work = _Py_queue_last(&tstate->mem_work, _PyMem_WorkBuf, node);
+        if (work->size == PY_MEM_WORK_ITEMS) {
+            work = NULL;
+        }
+    }
+
+    if (work == NULL) {
+        work = PyMem_RawMalloc(sizeof(_PyMem_WorkBuf));
+        if (work == NULL) {
+            Py_FatalError("out of memory (in _PyMem_FreeQsbr)");
+        }
+        work->first = work->size = 0;
+        _Py_queue_enqeue(&tstate->mem_work, &work->node);
+    }
+
+    PyThreadStateImpl *tstate_impl = (PyThreadStateImpl *)tstate;
+    work->items[work->size].ptr = ptr;
+    work->items[work->size].seq = _Py_qsbr_deferred_advance(tstate_impl->qsbr);
+    work->size++;
+
+    if (work->size == PY_MEM_WORK_ITEMS) {
+        // Now seems like a good time to check for any memory that can be freed.
+        _PyMem_QsbrPoll(tstate);
+    }
+}
+
+static int
+_PyMem_ProcessQueue(struct _Py_queue_head *queue, struct qsbr *qsbr, bool keep_empty)
+{
+    while (!_Py_queue_is_empty(queue)) {
+        _PyMem_WorkBuf *work = _Py_queue_first(queue, _PyMem_WorkBuf, node);
+        if (work->size == 0 && keep_empty) {
+            return 0;
+        }
+        while (work->first < work->size) {
+            _PyMem_WorkItem *item = &work->items[work->first];
+            if (!_Py_qsbr_poll(qsbr, item->seq)) {
+                return 1;
+            }
+            PyMem_Free(item->ptr);
+            work->first++;
+        }
+
+        // Remove the empty work buffer
+        _Py_queue_dequeue(queue);
+
+        // If the queue doesn't have an empty work buffer, stick this
+        // one at the end of the queue. Otherwise, free it.
+        if (keep_empty && _Py_queue_is_empty(queue)) {
+            work->first = work->size = 0;
+            _Py_queue_enqeue(queue, &work->node);
+            return 0;
+        }
+        else if (keep_empty && _Py_queue_last(queue, _PyMem_WorkBuf, node)->size == 0) {
+            work->first = work->size = 0;
+            _Py_queue_enqeue(queue, &work->node);
+        }
+        else {
+            PyMem_RawFree(work);
+        }
+    }
+    return 0;
+}
+
+void
+_PyMem_QsbrPoll(PyThreadState *tstate)
+{
+    struct qsbr *qsbr = ((PyThreadStateImpl *)tstate)->qsbr;
+
+    // Process any work on the thread-local queue.
+    _PyMem_ProcessQueue(&tstate->mem_work, qsbr, true);
+
+    // Process any work on the interpreter queue if we can get the lock.
+    PyInterpreterState *interp = tstate->interp;
+    if (_Py_atomic_load_int_relaxed(&interp->mem.nonempty) &&
+        _PyMutex_TryLock(&interp->mem.mutex)) {
+        int more = _PyMem_ProcessQueue(&interp->mem.work, qsbr, false);
+        _Py_atomic_store_int_relaxed(&interp->mem.nonempty, more);
+        _PyMutex_unlock(&interp->mem.mutex);
+    }
+}
+
+void
+_PyMem_QsbrFini(PyInterpreterState *interp)
+{
+    struct _Py_queue_head *queue = &interp->mem.work;
+    while (!_Py_queue_is_empty(queue)) {
+        _PyMem_WorkBuf *work = _Py_queue_first(queue, _PyMem_WorkBuf, node);
+        while (work->first < work->size) {
+            _PyMem_WorkItem *item = &work->items[work->first];
+            PyMem_Free(item->ptr);
+            work->first++;
+        }
+        _Py_queue_dequeue(queue);
+        PyMem_RawFree(work);
+    }
+    interp->mem.nonempty = 0;
+}
+
+void
+_PyMem_AbandonQsbr(PyThreadState *tstate)
+{
+    PyInterpreterState *interp = tstate->interp;
+
+    while (!_Py_queue_is_empty(&tstate->mem_work)) {
+        struct _Py_queue_node *node = _Py_queue_dequeue(&tstate->mem_work);
+        if (node == NULL) {
+            break;
+        }
+        _PyMem_WorkBuf *work = _Py_queue_data(node, _PyMem_WorkBuf, node);
+        if (work->first == work->size) {
+            PyMem_RawFree(work);
+        }
+        else {
+            _PyMutex_lock(&interp->mem.mutex);
+            _Py_queue_enqeue(&interp->mem.work, node);
+            _Py_atomic_store_int_relaxed(&interp->mem.nonempty, 1);
+            _PyMutex_unlock(&interp->mem.mutex);
+        }
+    }
+}
+
 wchar_t*
 _PyMem_RawWcsdup(const wchar_t *str)
 {
diff --git a/Python/pystate.c b/Python/pystate.c
index 80904ed317..023315c39d 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -12,6 +12,7 @@
 #include "pycore_pyerrors.h"
 #include "pycore_pylifecycle.h"
 #include "pycore_pymem.h"         // _PyMem_DefaultRawFree()
+#include "pycore_pyqueue.h"       // _Py_queue_init
 #include "pycore_pystate.h"       // _PyThreadState_GET()
 #include "pycore_qsbr.h"
 #include "pycore_runtime_init.h"  // _PyRuntimeState_INIT
@@ -464,6 +465,7 @@ init_interpreter(PyInterpreterState *interp,
     _PyGC_InitState(&interp->gc);
     PyConfig_InitPythonConfig(&interp->config);
     _PyType_InitCache(interp);
+    _Py_queue_init(&interp->mem.work);
 
     interp->_initialized = 1;
 }
@@ -598,6 +600,9 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
     _PyGC_CollectNoFail(tstate);
     _PyGC_Fini(interp);
 
+    /* Perform any delayed PyMem_Free calls */
+    _PyMem_QsbrFini(interp);
+
     /* We don't clear sysdict and builtins until the end of this function.
        Because clearing other attributes can execute arbitrary Python
        code which requires sysdict and builtins. */
@@ -1051,6 +1056,7 @@ init_threadstate(PyThreadState *tstate,
     tstate->daemon = (id > 1);
     tstate->done_event = done_event;
     _PyEventRc_Incref(done_event);
+    _Py_queue_init(&tstate->mem_work);
 
     if (_PyRuntime.stop_the_world_requested) {
         tstate->status = _Py_THREAD_GC;
@@ -1339,6 +1345,7 @@ PyThreadState_Clear(PyThreadState *tstate)
     }
 
     _Py_queue_destroy(tstate);
+    _PyMem_AbandonQsbr(tstate);
 
     /* Don't clear tstate->pyframe: it is a borrowed reference */
 
diff --git a/Python/qsbr.c b/Python/qsbr.c
index cf5f803d92..0d8f5773d1 100644
--- a/Python/qsbr.c
+++ b/Python/qsbr.c
@@ -58,6 +58,7 @@ _Py_qsbr_alloc(struct qsbr_shared *shared)
     }
     memset(qsbr, 0, sizeof(*qsbr));
     qsbr->t_shared = shared;
+    qsbr->t_limit = 32;
     return qsbr;
 }
 
@@ -95,6 +96,16 @@ _Py_qsbr_advance(struct qsbr_shared *shared)
     return _Py_atomic_add_uint64(&shared->s_wr, QSBR_INCR) + QSBR_INCR;
 }
 
+uint64_t
+_Py_qsbr_deferred_advance(struct qsbr *qsbr)
+{
+    if (++qsbr->t_deferred < qsbr->t_limit) {
+        return _Py_qsbr_shared_current(qsbr->t_shared) + QSBR_INCR;
+    }
+    qsbr->t_deferred = 0;
+    return _Py_qsbr_advance(qsbr->t_shared);
+}
+
 uint64_t
 _Py_qsbr_poll_scan(struct qsbr_shared *shared)
 {