From d1bb94d7c7d0cb16b9dbbb0295ba5ec081a3c461 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Mon, 23 Oct 2023 14:49:09 +0100 Subject: [PATCH] GH-109369: Add machinery for deoptimizing tier2 executors, both individually and globally. (GH-110384) --- Include/cpython/optimizer.h | 26 ++++ Include/internal/pycore_interp.h | 1 + Lib/test/test_capi/test_misc.py | 61 ++++++++ Modules/_testinternalcapi.c | 28 ++++ Python/instrumentation.c | 3 + Python/optimizer.c | 235 ++++++++++++++++++++++++++++++- Python/pystate.c | 1 + 7 files changed, 353 insertions(+), 2 deletions(-) diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index 47536108a9665e1..2a5251b3ecb02a6 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -6,9 +6,27 @@ extern "C" { #endif +typedef struct _PyExecutorLinkListNode { + struct _PyExecutorObject *next; + struct _PyExecutorObject *previous; +} _PyExecutorLinkListNode; + + +/* Bloom filter with m = 256 + * https://en.wikipedia.org/wiki/Bloom_filter */ +#define BLOOM_FILTER_WORDS 8 + +typedef struct _bloom_filter { + uint32_t bits[BLOOM_FILTER_WORDS]; +} _PyBloomFilter; + typedef struct { uint8_t opcode; uint8_t oparg; + uint8_t valid; + uint8_t linked; + _PyBloomFilter bloom; + _PyExecutorLinkListNode links; } _PyVMData; typedef struct _PyExecutorObject { @@ -45,6 +63,14 @@ _PyOptimizer_BackEdge(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_ extern _PyOptimizerObject _PyOptimizer_Default; +void _Py_ExecutorInit(_PyExecutorObject *, _PyBloomFilter *); +void _Py_ExecutorClear(_PyExecutorObject *); +void _Py_BloomFilter_Init(_PyBloomFilter *); +void _Py_BloomFilter_Add(_PyBloomFilter *bloom, void *obj); +PyAPI_FUNC(void) _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj); +PyAPI_FUNC(void) _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj); +extern void _Py_Executors_InvalidateAll(PyInterpreterState *interp); + /* For testing */ PyAPI_FUNC(PyObject 
*)PyUnstable_Optimizer_NewCounter(void); PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewUOpOptimizer(void); diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 60d333ad7baa2eb..fc27aad48b5831e 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -215,6 +215,7 @@ struct _is { struct types_state types; struct callable_cache callable_cache; _PyOptimizerObject *optimizer; + _PyExecutorObject *executor_list_head; uint16_t optimizer_resume_threshold; uint16_t optimizer_backedge_threshold; uint32_t next_func_version; diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index b3f32d860eee035..917a8dabec0d5ac 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -2489,6 +2489,67 @@ def get_first_executor(func): return None +class TestExecutorInvalidation(unittest.TestCase): + + def setUp(self): + self.old = _testinternalcapi.get_optimizer() + self.opt = _testinternalcapi.get_counter_optimizer() + _testinternalcapi.set_optimizer(self.opt) + + def tearDown(self): + _testinternalcapi.set_optimizer(self.old) + + def test_invalidate_object(self): + # Generate a new set of functions at each call + ns = {} + func_src = "\n".join( + f""" + def f{n}(): + for _ in range(1000): + pass + """ for n in range(5) + ) + exec(textwrap.dedent(func_src), ns, ns) + funcs = [ ns[f'f{n}'] for n in range(5)] + objects = [object() for _ in range(5)] + + for f in funcs: + f() + executors = [get_first_executor(f) for f in funcs] + # Set things up so each executor depends on the objects + # with an equal or lower index. + for i, exe in enumerate(executors): + self.assertTrue(exe.valid) + for obj in objects[:i+1]: + _testinternalcapi.add_executor_dependency(exe, obj) + self.assertTrue(exe.valid) + # Assert that the correct executors are invalidated + # and check that nothing crashes when we invalidate + # an executor mutliple times. 
+ for i in (4,3,2,1,0): + _testinternalcapi.invalidate_executors(objects[i]) + for exe in executors[i:]: + self.assertFalse(exe.valid) + for exe in executors[:i]: + self.assertTrue(exe.valid) + + def test_uop_optimizer_invalidation(self): + # Generate a new function at each call + ns = {} + exec(textwrap.dedent(""" + def f(): + for i in range(1000): + pass + """), ns, ns) + f = ns['f'] + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + f() + exe = get_first_executor(f) + self.assertTrue(exe.valid) + _testinternalcapi.invalidate_executors(f.__code__) + self.assertFalse(exe.valid) + class TestUops(unittest.TestCase): def test_basic_loop(self): diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index ddeb38938a331f5..4ead1b6bea7fae3 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -1002,6 +1002,32 @@ get_executor(PyObject *self, PyObject *const *args, Py_ssize_t nargs) return (PyObject *)PyUnstable_GetExecutor((PyCodeObject *)code, ioffset); } +static PyObject * +add_executor_dependency(PyObject *self, PyObject *args) +{ + PyObject *exec; + PyObject *obj; + if (!PyArg_ParseTuple(args, "OO", &exec, &obj)) { + return NULL; + } + /* No way to tell in general if exec is an executor, so we only accept + * counting_executor */ + if (strcmp(Py_TYPE(exec)->tp_name, "counting_executor")) { + PyErr_SetString(PyExc_TypeError, "argument must be a counting_executor"); + return NULL; + } + _Py_Executor_DependsOn((_PyExecutorObject *)exec, obj); + Py_RETURN_NONE; +} + +static PyObject * +invalidate_executors(PyObject *self, PyObject *obj) +{ + PyInterpreterState *interp = PyInterpreterState_Get(); + _Py_Executors_InvalidateDependency(interp, obj); + Py_RETURN_NONE; +} + static int _pending_callback(void *arg) { /* we assume the argument is callable object to which we own a reference */ @@ -1565,6 +1591,8 @@ static PyMethodDef module_functions[] = { {"get_executor", _PyCFunction_CAST(get_executor), 
METH_FASTCALL, NULL}, {"get_counter_optimizer", get_counter_optimizer, METH_NOARGS, NULL}, {"get_uop_optimizer", get_uop_optimizer, METH_NOARGS, NULL}, + {"add_executor_dependency", add_executor_dependency, METH_VARARGS, NULL}, + {"invalidate_executors", invalidate_executors, METH_O, NULL}, {"pending_threadfunc", _PyCFunction_CAST(pending_threadfunc), METH_VARARGS | METH_KEYWORDS}, {"pending_identify", pending_identify, METH_VARARGS, NULL}, diff --git a/Python/instrumentation.c b/Python/instrumentation.c index eee1908e503e43c..4bb57a621e37e8d 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -1582,6 +1582,7 @@ _Py_Instrument(PyCodeObject *code, PyInterpreterState *interp) if (code->co_executors != NULL) { _PyCode_Clear_Executors(code); } + _Py_Executors_InvalidateDependency(interp, code); int code_len = (int)Py_SIZE(code); /* code->_co_firsttraceable >= code_len indicates * that no instrumentation can be inserted. @@ -1803,6 +1804,7 @@ _PyMonitoring_SetEvents(int tool_id, _PyMonitoringEventSet events) return -1; } set_global_version(interp, new_version); + _Py_Executors_InvalidateAll(interp); return instrument_all_executing_code_objects(interp); } @@ -1832,6 +1834,7 @@ _PyMonitoring_SetLocalEvents(PyCodeObject *code, int tool_id, _PyMonitoringEvent /* Force instrumentation update */ code->_co_instrumentation_version -= MONITORING_VERSION_INCREMENT; } + _Py_Executors_InvalidateDependency(interp, code); if (_Py_Instrument(code, interp)) { return -1; } diff --git a/Python/optimizer.c b/Python/optimizer.c index 955ac812177ac44..8d19de220d3d3d5 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -220,10 +220,16 @@ typedef struct { static void counter_dealloc(_PyCounterExecutorObject *self) { + _Py_ExecutorClear((_PyExecutorObject *)self); Py_DECREF(self->optimizer); PyObject_Free(self); } +static PyMemberDef counter_members[] = { + { "valid", Py_T_UBYTE, offsetof(_PyCounterExecutorObject, executor.vm_data.valid), Py_READONLY, "is valid?" 
}, + { NULL } +}; + static PyTypeObject CounterExecutor_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) .tp_name = "counting_executor", @@ -231,6 +237,7 @@ static PyTypeObject CounterExecutor_Type = { .tp_itemsize = 0, .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION, .tp_dealloc = (destructor)counter_dealloc, + .tp_members = counter_members, }; static _PyInterpreterFrame * @@ -261,6 +268,9 @@ counter_optimize( executor->optimizer = (_PyCounterOptimizerObject *)self; executor->next_instr = instr; *exec_ptr = (_PyExecutorObject *)executor; + _PyBloomFilter empty; + _Py_BloomFilter_Init(&empty); + _Py_ExecutorInit((_PyExecutorObject *)executor, &empty); return 1; } @@ -288,6 +298,8 @@ static PyTypeObject CounterOptimizer_Type = { PyObject * PyUnstable_Optimizer_NewCounter(void) { + PyType_Ready(&CounterExecutor_Type); + PyType_Ready(&CounterOptimizer_Type); _PyCounterOptimizerObject *opt = (_PyCounterOptimizerObject *)_PyObject_New(&CounterOptimizer_Type); if (opt == NULL) { return NULL; @@ -303,6 +315,7 @@ PyUnstable_Optimizer_NewCounter(void) static void uop_dealloc(_PyUOpExecutorObject *self) { + _Py_ExecutorClear((_PyExecutorObject *)self); PyObject_Free(self); } @@ -356,6 +369,12 @@ PySequenceMethods uop_as_sequence = { .sq_item = (ssizeargfunc)uop_item, }; + +static PyMemberDef uop_members[] = { + { "valid", Py_T_UBYTE, offsetof(_PyUOpExecutorObject, base.vm_data.valid), Py_READONLY, "is valid?" 
}, + { NULL } +}; + static PyTypeObject UOpExecutor_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) .tp_name = "uop_executor", @@ -364,6 +383,7 @@ static PyTypeObject UOpExecutor_Type = { .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION, .tp_dealloc = (destructor)uop_dealloc, .tp_as_sequence = &uop_as_sequence, + .tp_members = uop_members, }; static int @@ -399,9 +419,11 @@ translate_bytecode_to_trace( PyCodeObject *code, _Py_CODEUNIT *instr, _PyUOpInstruction *trace, - int buffer_size) + int buffer_size, + _PyBloomFilter *dependencies) { PyCodeObject *initial_code = code; + _Py_BloomFilter_Add(dependencies, initial_code); _Py_CODEUNIT *initial_instr = instr; int trace_length = 0; int max_length = buffer_size; @@ -735,6 +757,7 @@ translate_bytecode_to_trace( // Increment IP to the return address instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]] + 1; TRACE_STACK_PUSH(); + _Py_BloomFilter_Add(dependencies, new_code); code = new_code; instr = _PyCode_CODE(code); DPRINTF(2, @@ -895,8 +918,10 @@ uop_optimize( _PyExecutorObject **exec_ptr, int curr_stackentries) { + _PyBloomFilter dependencies; + _Py_BloomFilter_Init(&dependencies); _PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH]; - int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH); + int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH, &dependencies); if (trace_length <= 0) { // Error or nothing translated return trace_length; @@ -915,6 +940,7 @@ uop_optimize( OPT_HIST(trace_length, optimized_trace_length_hist); executor->base.execute = _PyUopExecute; memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction)); + _Py_ExecutorInit((_PyExecutorObject *)executor, &dependencies); *exec_ptr = (_PyExecutorObject *)executor; return 1; } @@ -936,6 +962,8 @@ static PyTypeObject UOpOptimizer_Type = { PyObject * PyUnstable_Optimizer_NewUOpOptimizer(void) { + PyType_Ready(&UOpExecutor_Type); + 
PyType_Ready(&UOpOptimizer_Type); _PyOptimizerObject *opt = PyObject_New(_PyOptimizerObject, &UOpOptimizer_Type); if (opt == NULL) { return NULL; @@ -947,3 +975,206 @@ PyUnstable_Optimizer_NewUOpOptimizer(void) opt->backedge_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER; return (PyObject *)opt; } + + +/***************************************** + * Executor management + ****************************************/ + +/* We use a Bloom filter with k = 6, m = 256 + * The choice of k and the following constants + * could do with a more rigorous analysis, + * but here is a simple analysis: + * + * We want to keep the false positive rate low. + * For n = 5 (a trace depends on 5 objects), + * we expect 30 bits set, giving a false positive + * rate of (30/256)**6 == 2.6e-6 which is plenty + * good enough. + * + * However with n = 10 we expect 60 bits set (worst case), + * giving a false positive of (60/256)**6 == 1.6e-4 + * + * We choose k = 6, rather than a higher number as + * it means the false positive rate grows slower for high n. + * + * n = 5, k = 6 => fp = 2.6e-6 + * n = 5, k = 8 => fp = 3.5e-7 + * n = 10, k = 6 => fp = 1.6e-4 + * n = 10, k = 8 => fp = 0.9e-4 + * n = 15, k = 6 => fp = 0.18% + * n = 15, k = 8 => fp = 0.23% + * n = 20, k = 6 => fp = 1.1% + * n = 20, k = 8 => fp = 2.3% + * + * The above analysis assumes perfect hash functions, + * but those don't exist, so the real false positive + * rates may be worse.
+ */
+
+#define K 6  /* number of hash functions, i.e. bits set per added pointer */
+
+#define SEED 20221211  /* initial value of the address hash accumulator */
+
+/* Hash a non-NULL address. TO DO -- Use more modern hash functions with better distribution of bits */
+static uint64_t
+address_to_hash(void *ptr) {
+    assert(ptr != NULL);
+    uint64_t uhash = SEED;
+    uintptr_t addr = (uintptr_t)ptr;
+    for (int i = 0; i < SIZEOF_VOID_P; i++) {
+        uhash ^= addr & 255;  /* fold in one byte of the address, low byte first */
+        uhash *= (uint64_t)_PyHASH_MULTIPLIER;
+        addr >>= 8;
+    }
+    return uhash;
+}
+
+void
+_Py_BloomFilter_Init(_PyBloomFilter *bloom)
+{
+    for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
+        bloom->bits[i] = 0;  /* empty filter: no bit set in any word */
+    }
+}
+
+/* Add `ptr` to `bloom`. We want K hash functions that each set 1 bit.
+ * A hash function that sets 1 bit in M bits can be trivially
+ * derived from a log2(M) bit hash function.
+ * So we extract 8 (log2(256)) bits at a time from
+ * the 64bit hash. */
+void
+_Py_BloomFilter_Add(_PyBloomFilter *bloom, void *ptr)
+{
+    uint64_t hash = address_to_hash(ptr);
+    assert(K <= 8);  /* a 64-bit hash only provides 8 bytes */
+    for (int i = 0; i < K; i++) {
+        uint8_t bits = hash & 255;  /* bit index: high 3 bits pick the word, low 5 the bit */
+        bloom->bits[bits >> 5] |= ((uint32_t)1 << (bits & 31));  /* unsigned: 1 << 31 overflows int */
+        hash >>= 8;
+    }
+}
+
+static bool
+bloom_filter_may_contain(_PyBloomFilter *bloom, _PyBloomFilter *hashes)
+{
+    for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
+        if ((bloom->bits[i] & hashes->bits[i]) != hashes->bits[i]) {
+            return false;  /* a bit set in `hashes` is clear in `bloom` */
+        }
+    }
+    return true;  /* every bit of `hashes` is set in `bloom` */
+}
+
+static void
+link_executor(_PyExecutorObject *executor)
+{
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    _PyExecutorLinkListNode *links = &executor->vm_data.links;
+    _PyExecutorObject *head = interp->executor_list_head;
+    if (head == NULL) {
+        interp->executor_list_head = executor;  /* empty list: executor becomes the head */
+        links->previous = NULL;
+        links->next = NULL;
+    }
+    else {
+        _PyExecutorObject *next = head->vm_data.links.next;  /* insert right after the head */
+        links->previous = head;
+        links->next = next;
+        if (next != NULL) {
+            next->vm_data.links.previous = executor;
+        }
+        head->vm_data.links.next = executor;
+    }
+    executor->vm_data.linked = true;
+    /* executor_list_head must be first in list */
+    assert(interp->executor_list_head->vm_data.links.previous == NULL);
+}
+
+static void
+unlink_executor(_PyExecutorObject *executor)
+{
+    if (!executor->vm_data.linked) {
+        return;  /* not in the list (e.g. already invalidated): nothing to do */
+    }
+    _PyExecutorLinkListNode *links = &executor->vm_data.links;
+    _PyExecutorObject *next = links->next;
+    _PyExecutorObject *prev = links->previous;
+    if (next != NULL) {
+        next->vm_data.links.previous = prev;
+    }
+    if (prev != NULL) {
+        prev->vm_data.links.next = next;
+    }
+    else {
+        // prev == NULL implies that executor is the list head
+        PyInterpreterState *interp = PyInterpreterState_Get();
+        assert(interp->executor_list_head == executor);
+        interp->executor_list_head = next;
+    }
+    executor->vm_data.linked = false;
+}
+
+/* This must be called by optimizers before using the executor: marks it valid, copies the dependencies, links it */
+void
+_Py_ExecutorInit(_PyExecutorObject *executor, _PyBloomFilter *dependency_set)
+{
+    executor->vm_data.valid = true;
+    for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
+        executor->vm_data.bloom.bits[i] = dependency_set->bits[i];
+    }
+    link_executor(executor);
+}
+
+/* This must be called by executors during dealloc; removes the executor from the executor list */
+void
+_Py_ExecutorClear(_PyExecutorObject *executor)
+{
+    unlink_executor(executor);
+}
+
+void
+_Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj)
+{
+    assert(executor->vm_data.valid);  /* check only: an assert must not modify the executor */
+    _Py_BloomFilter_Add(&executor->vm_data.bloom, obj);
+}
+
+/* Invalidate (and unlink) all executors that depend on `obj`.
+ * May cause other executors to be invalidated as well,
+ * because bloom filter matches can be false positives. */
+void
+_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj)
+{
+    _PyBloomFilter obj_filter;
+    _Py_BloomFilter_Init(&obj_filter);
+    _Py_BloomFilter_Add(&obj_filter, obj);
+    /* Walk the list of executors */
+    /* TO DO -- Use a tree to avoid traversing as many objects */
+    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
+        assert(exec->vm_data.valid);
+        _PyExecutorObject *next = exec->vm_data.links.next;  /* read before unlinking exec */
+        if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter)) {
+            exec->vm_data.valid = false;
+            unlink_executor(exec);
+        }
+        exec = next;
+    }
+}
+
+/* Invalidate all executors of `interp` and empty its executor list */
+void
+_Py_Executors_InvalidateAll(PyInterpreterState *interp)
+{
+    /* Walk the list of executors */
+    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
+        assert(exec->vm_data.valid);
+        _PyExecutorObject *next = exec->vm_data.links.next;  /* read before clearing the links */
+        exec->vm_data.links.next = NULL;
+        exec->vm_data.links.previous = NULL;
+        exec->vm_data.valid = false;
+        exec->vm_data.linked = false;
+        exec = next;
+    }
+    interp->executor_list_head = NULL;
+}
diff --git a/Python/pystate.c b/Python/pystate.c
index 2e6f07e60033edb..c44a28ca6d3ac83 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -713,6 +713,7 @@ init_interpreter(PyInterpreterState *interp,
     interp->optimizer_backedge_threshold = _PyOptimizer_Default.backedge_threshold;
     interp->optimizer_resume_threshold = _PyOptimizer_Default.backedge_threshold;
     interp->next_func_version = 1;
+    interp->executor_list_head = NULL;
     if (interp != &runtime->_main_interpreter) {
         /* Fix the self-referential, statically initialized fields. */
         interp->dtoa = (struct _dtoa_state)_dtoa_state_INIT(interp);