From 3941620c1e2311303169b89a0c2b658516d3d27a Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Thu, 18 Jan 2024 15:57:23 +0000 Subject: [PATCH 01/13] Change optimizer API to provide more information --- Include/cpython/optimizer.h | 2 +- Include/internal/pycore_optimizer.h | 2 +- Python/optimizer.c | 16 +++++++++------- Python/optimizer_analysis.c | 7 ++++--- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index 96e829f8fbe97d..66a841713cce85 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -45,7 +45,7 @@ typedef struct _PyExecutorObject { typedef struct _PyOptimizerObject _PyOptimizerObject; /* Should return > 0 if a new executor is created. O if no executor is produced and < 0 if an error occurred. */ -typedef int (*optimize_func)(_PyOptimizerObject* self, PyCodeObject *code, _Py_CODEUNIT *instr, _PyExecutorObject **, int curr_stackentries); +typedef int (*optimize_func)(_PyOptimizerObject* self, struct _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **, int curr_stackentries); typedef struct _PyOptimizerObject { PyObject_HEAD diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 31f30c673f207a..1105adb59d3e48 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -8,7 +8,7 @@ extern "C" { # error "this header requires Py_BUILD_CORE define" #endif -int _Py_uop_analyze_and_optimize(PyCodeObject *code, +int _Py_uop_analyze_and_optimize(_PyInterpreterFrame *frame, _PyUOpInstruction *trace, int trace_len, int curr_stackentries); extern PyTypeObject _PyCounterExecutor_Type; diff --git a/Python/optimizer.c b/Python/optimizer.c index 4b6ed1781b5b78..b1689e19e79f13 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -109,7 +109,7 @@ PyUnstable_Replace_Executor(PyCodeObject *code, _Py_CODEUNIT *instr, _PyExecutor static int error_optimize( _PyOptimizerObject* self, - PyCodeObject *code, + _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec, int Py_UNUSED(stack_entries)) @@ -173,7 +173,7 @@ _PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject } _PyOptimizerObject *opt = interp->optimizer; _PyExecutorObject *executor = NULL; - int err = opt->optimize(opt, code, start, &executor, (int)(stack_pointer - _PyFrame_Stackbase(frame))); + int err = opt->optimize(opt, frame, start, &executor, (int)(stack_pointer - _PyFrame_Stackbase(frame))); if (err <= 0) { assert(executor == NULL); return err; @@ -376,13 +376,14 @@ BRANCH_TO_GUARD[4][2] = { */ static int translate_bytecode_to_trace( - PyCodeObject *code, + _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyUOpInstruction *trace, int buffer_size, _PyBloomFilter *dependencies) { bool progress_needed = true; + PyCodeObject *code = (PyCodeObject *)frame->f_executable; PyCodeObject *initial_code = code; _Py_BloomFilter_Add(dependencies, initial_code); _Py_CODEUNIT *initial_instr = instr; @@ -795,7 +796,7 @@ make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies) static int uop_optimize( _PyOptimizerObject *self, - PyCodeObject *code, + _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr, int curr_stackentries) @@ -803,7 +804,7 @@ uop_optimize( _PyBloomFilter dependencies; _Py_BloomFilter_Init(&dependencies); _PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH]; - int err = translate_bytecode_to_trace(code, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies); + int err = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies); if (err <= 0) { // Error or nothing translated return err; @@ -811,7 +812,7 @@ uop_optimize( OPT_STAT_INC(traces_created); char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE"); if (uop_optimize == NULL || *uop_optimize > '0') { - err = _Py_uop_analyze_and_optimize(code, buffer, UOP_MAX_TRACE_LENGTH, curr_stackentries); + err = _Py_uop_analyze_and_optimize(frame, buffer, UOP_MAX_TRACE_LENGTH, curr_stackentries); if (err < 0) { return -1; } @@ -874,12 +875,13 @@ PyTypeObject _PyCounterExecutor_Type = { static int counter_optimize( _PyOptimizerObject* self, - PyCodeObject *code, + _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr, int Py_UNUSED(curr_stackentries) ) { + PyCodeObject *code = (PyCodeObject *)frame->f_executable; int oparg = instr->op.arg; while (instr->op.code == EXTENDED_ARG) { instr++; diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index d1225997e10be2..70fefb7ae89b31 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -13,8 +13,9 @@ #include "pycore_optimizer.h" static void -peephole_opt(PyCodeObject *co, _PyUOpInstruction *buffer, int buffer_size) +peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size) { + PyCodeObject *co = (PyCodeObject *)frame->f_executable; for (int pc = 0; pc < buffer_size; pc++) { int opcode = buffer[pc].opcode; switch(opcode) { @@ -86,13 +87,13 @@ remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size) int _Py_uop_analyze_and_optimize( - PyCodeObject *co, + _PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size, int curr_stacklen ) { - peephole_opt(co, buffer, buffer_size); + peephole_opt(frame, buffer, buffer_size); remove_unneeded_uops(buffer, buffer_size); return 0; } From 665cc58bd8a9cccfe734b1e095ac7cb41ba59f29 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Thu, 18 Jan 2024 17:30:37 +0000 Subject: [PATCH 02/13] Add new uops for optimizing globals --- Include/internal/pycore_uop_ids.h | 7 +++-- Include/internal/pycore_uop_metadata.h | 6 ++++ Python/bytecodes.c | 21 +++++++++++++ Python/executor_cases.c.h | 42 ++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 2 deletions(-) diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index a7056586ff04c0..311328bd6e6198 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -232,8 +232,11 @@ extern "C" { #define _CHECK_VALIDITY 379 #define _LOAD_CONST_INLINE 380 #define _LOAD_CONST_INLINE_BORROW 381 -#define _INTERNAL_INCREMENT_OPT_COUNTER 382 -#define MAX_UOP_ID 382 +#define _LOAD_CONST_INLINE_WITH_NULL 382 +#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 383 +#define _CHECK_FUNCTION_VERSION 384 +#define _INTERNAL_INCREMENT_OPT_COUNTER 385 +#define MAX_UOP_ID 385 #ifdef __cplusplus } diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 14d3382e895cdf..8a20991e544e87 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -204,6 +204,9 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_CHECK_VALIDITY] = HAS_DEOPT_FLAG, [_LOAD_CONST_INLINE] = 0, [_LOAD_CONST_INLINE_BORROW] = 0, + [_LOAD_CONST_INLINE_WITH_NULL] = 0, + [_LOAD_CONST_INLINE_BORROW_WITH_NULL] = 0, + [_CHECK_FUNCTION_VERSION] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_PASSTHROUGH_FLAG, [_INTERNAL_INCREMENT_OPT_COUNTER] = 0, }; @@ -254,6 +257,7 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = { [_CHECK_EG_MATCH] = "_CHECK_EG_MATCH", [_CHECK_EXC_MATCH] = "_CHECK_EXC_MATCH", [_CHECK_FUNCTION_EXACT_ARGS] = "_CHECK_FUNCTION_EXACT_ARGS", + [_CHECK_FUNCTION_VERSION] = "_CHECK_FUNCTION_VERSION", [_CHECK_MANAGED_OBJECT_HAS_VALUES] = "_CHECK_MANAGED_OBJECT_HAS_VALUES", [_CHECK_PEP_523] = "_CHECK_PEP_523", [_CHECK_STACK_SPACE] = "_CHECK_STACK_SPACE", @@ -332,6 +336,8 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = { [_LOAD_CONST] = "_LOAD_CONST", [_LOAD_CONST_INLINE] = "_LOAD_CONST_INLINE", [_LOAD_CONST_INLINE_BORROW] = "_LOAD_CONST_INLINE_BORROW", + [_LOAD_CONST_INLINE_BORROW_WITH_NULL] = "_LOAD_CONST_INLINE_BORROW_WITH_NULL", + [_LOAD_CONST_INLINE_WITH_NULL] = "_LOAD_CONST_INLINE_WITH_NULL", [_LOAD_DEREF] = "_LOAD_DEREF", [_LOAD_FAST] = "_LOAD_FAST", [_LOAD_FAST_AND_CLEAR] = "_LOAD_FAST_AND_CLEAR", diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 18749ce60ecd45..638d2dab2eeb42 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -4071,11 +4071,32 @@ dummy_func( } op(_LOAD_CONST_INLINE, (ptr/4 -- value)) { + TIER_TWO_ONLY value = Py_NewRef(ptr); } op(_LOAD_CONST_INLINE_BORROW, (ptr/4 -- value)) { + TIER_TWO_ONLY + value = ptr; + } + + op(_LOAD_CONST_INLINE_WITH_NULL, (ptr/4 -- value, null)) { + TIER_TWO_ONLY + value = Py_NewRef(ptr); + null = NULL; + } + + op(_LOAD_CONST_INLINE_BORROW_WITH_NULL, (ptr/4 -- value, null)) { + TIER_TWO_ONLY value = ptr; + null = NULL; + } + + op(_CHECK_FUNCTION_VERSION, (func_version/2, callable, self_or_null, unused[oparg] -- callable, self_or_null, unused[oparg])) { + TIER_TWO_ONLY + DEOPT_IF(!PyFunction_Check(callable)); + PyFunctionObject *func = (PyFunctionObject *)callable; + DEOPT_IF(func->func_version != func_version); } /* Internal -- for testing executors */ diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 241b9056207715..667df44cd37f82 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3393,6 +3393,7 @@ case _LOAD_CONST_INLINE: { PyObject *value; PyObject *ptr = (PyObject *)CURRENT_OPERAND(); + TIER_TWO_ONLY value = Py_NewRef(ptr); stack_pointer[0] = value; stack_pointer += 1; @@ -3402,12 +3403,53 @@ case _LOAD_CONST_INLINE_BORROW: { PyObject *value; PyObject *ptr = (PyObject *)CURRENT_OPERAND(); + TIER_TWO_ONLY value = ptr; stack_pointer[0] = value; stack_pointer += 1; break; } + case _LOAD_CONST_INLINE_WITH_NULL: { + PyObject *value; + PyObject *null; + PyObject *ptr = (PyObject *)CURRENT_OPERAND(); + TIER_TWO_ONLY + value = Py_NewRef(ptr); + null = NULL; + stack_pointer[0] = value; + stack_pointer[1] = null; + stack_pointer += 2; + break; + } + + case _LOAD_CONST_INLINE_BORROW_WITH_NULL: { + PyObject *value; + PyObject *null; + PyObject *ptr = (PyObject *)CURRENT_OPERAND(); + TIER_TWO_ONLY + value = ptr; + null = NULL; + stack_pointer[0] = value; + stack_pointer[1] = null; + stack_pointer += 2; + break; + } + + case _CHECK_FUNCTION_VERSION: { + PyObject *self_or_null; + PyObject *callable; + oparg = CURRENT_OPARG(); + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; + uint32_t func_version = (uint32_t)CURRENT_OPERAND(); + TIER_TWO_ONLY + if (!PyFunction_Check(callable)) goto deoptimize; + PyFunctionObject *func = (PyFunctionObject *)callable; + if (func->func_version != func_version) goto deoptimize; + break; + } + case _INTERNAL_INCREMENT_OPT_COUNTER: { PyObject *opt; opt = stack_pointer[-1]; From 9d9bc24eeb8b0f6d3def786521fec1a2177ca1aa Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Thu, 18 Jan 2024 20:12:25 +0000 Subject: [PATCH 03/13] Optimize LOAD_GLOBAL specializations in tier 2 --- Include/internal/pycore_uop_ids.h | 7 +- Include/internal/pycore_uop_metadata.h | 6 +- Python/bytecodes.c | 11 +- Python/executor_cases.c.h | 20 +-- Python/optimizer_analysis.c | 167 +++++++++++++++++++++++++ 5 files changed, 192 insertions(+), 19 deletions(-) diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index 311328bd6e6198..b2476e1c6e5c4b 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -234,9 +234,10 @@ extern "C" { #define _LOAD_CONST_INLINE_BORROW 381 #define _LOAD_CONST_INLINE_WITH_NULL 382 #define _LOAD_CONST_INLINE_BORROW_WITH_NULL 383 -#define _CHECK_FUNCTION_VERSION 384 -#define _INTERNAL_INCREMENT_OPT_COUNTER 385 -#define MAX_UOP_ID 385 +#define _CHECK_GLOBALS 384 +#define _CHECK_BUILTINS 385 +#define _INTERNAL_INCREMENT_OPT_COUNTER 386 +#define MAX_UOP_ID 386 #ifdef __cplusplus } diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 8a20991e544e87..2b5b37e6b8d6a4 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -206,7 +206,8 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_LOAD_CONST_INLINE_BORROW] = 0, [_LOAD_CONST_INLINE_WITH_NULL] = 0, [_LOAD_CONST_INLINE_BORROW_WITH_NULL] = 0, - [_CHECK_FUNCTION_VERSION] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_PASSTHROUGH_FLAG, + [_CHECK_GLOBALS] = HAS_DEOPT_FLAG, + [_CHECK_BUILTINS] = HAS_DEOPT_FLAG, [_INTERNAL_INCREMENT_OPT_COUNTER] = 0, }; @@ -253,11 +254,12 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = { [_CHECK_ATTR_METHOD_LAZY_DICT] = "_CHECK_ATTR_METHOD_LAZY_DICT", [_CHECK_ATTR_MODULE] = "_CHECK_ATTR_MODULE", [_CHECK_ATTR_WITH_HINT] = "_CHECK_ATTR_WITH_HINT", + [_CHECK_BUILTINS] = "_CHECK_BUILTINS", [_CHECK_CALL_BOUND_METHOD_EXACT_ARGS] = "_CHECK_CALL_BOUND_METHOD_EXACT_ARGS", [_CHECK_EG_MATCH] = "_CHECK_EG_MATCH", [_CHECK_EXC_MATCH] = "_CHECK_EXC_MATCH", [_CHECK_FUNCTION_EXACT_ARGS] = "_CHECK_FUNCTION_EXACT_ARGS", - [_CHECK_FUNCTION_VERSION] = "_CHECK_FUNCTION_VERSION", + [_CHECK_GLOBALS] = "_CHECK_GLOBALS", [_CHECK_MANAGED_OBJECT_HAS_VALUES] = "_CHECK_MANAGED_OBJECT_HAS_VALUES", [_CHECK_PEP_523] = "_CHECK_PEP_523", [_CHECK_STACK_SPACE] = "_CHECK_STACK_SPACE", diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 638d2dab2eeb42..376209d59b0828 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -4092,11 +4092,14 @@ dummy_func( null = NULL; } - op(_CHECK_FUNCTION_VERSION, (func_version/2, callable, self_or_null, unused[oparg] -- callable, self_or_null, unused[oparg])) { + op(_CHECK_GLOBALS, (dict/4 -- )) { TIER_TWO_ONLY - DEOPT_IF(!PyFunction_Check(callable)); - PyFunctionObject *func = (PyFunctionObject *)callable; - DEOPT_IF(func->func_version != func_version); + DEOPT_IF(GLOBALS() != dict); + } + + op(_CHECK_BUILTINS, (dict/4 -- )) { + TIER_TWO_ONLY + DEOPT_IF(BUILTINS() != dict); } /* Internal -- for testing executors */ diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 667df44cd37f82..2d914b82dbf88f 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3436,17 +3436,17 @@ break; } - case _CHECK_FUNCTION_VERSION: { - PyObject *self_or_null; - PyObject *callable; - oparg = CURRENT_OPARG(); - self_or_null = stack_pointer[-1 - oparg]; - callable = stack_pointer[-2 - oparg]; - uint32_t func_version = (uint32_t)CURRENT_OPERAND(); + case _CHECK_GLOBALS: { + PyObject *dict = (PyObject *)CURRENT_OPERAND(); TIER_TWO_ONLY - if (!PyFunction_Check(callable)) goto deoptimize; - PyFunctionObject *func = (PyFunctionObject *)callable; - if (func->func_version != func_version) goto deoptimize; + if (GLOBALS() != dict) goto deoptimize; + break; + } + + case _CHECK_BUILTINS: { + PyObject *dict = (PyObject *)CURRENT_OPERAND(); + TIER_TWO_ONLY + if (BUILTINS() != dict) goto deoptimize; break; } diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 70fefb7ae89b31..5c9a886fd2bbfe 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -5,6 +5,7 @@ #include "pycore_opcode_utils.h" #include "pycore_pystate.h" // _PyInterpreterState_GET() #include "pycore_uop_metadata.h" +#include "pycore_dict.h" #include "pycore_long.h" #include "cpython/optimizer.h" #include @@ -12,6 +13,172 @@ #include #include "pycore_optimizer.h" +static int +builtins_watcher_callback(PyDict_WatchEvent event, PyObject* dict, + PyObject* key, PyObject* new_value) +{ + if (event != PyDict_EVENT_CLONED) { + _Py_Executors_InvalidateAll(_PyInterpreterState_GET()); + } + return 0; +} + +static int +globals_watcher_callback(PyDict_WatchEvent event, PyObject* dict, + PyObject* key, PyObject* new_value) +{ + if (event != PyDict_EVENT_CLONED) { + _Py_Executors_InvalidateDependency(_PyInterpreterState_GET(), dict); + } + /* TO DO -- Mark the dict, as a warning for future optimization attempts */ + return 0; +} + + +static void +global_to_const(_PyUOpInstruction *inst, PyObject *obj) +{ + assert(inst->opcode == _LOAD_GLOBAL_MODULE || inst->opcode == _LOAD_GLOBAL_BUILTINS); + assert(PyDict_CheckExact(obj)); + PyDictObject *dict = (PyDictObject *)obj; + assert(dict->ma_keys->dk_kind == DICT_KEYS_UNICODE); + PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(dict->ma_keys); + assert(inst->operand < dict->ma_used); + assert(inst->operand <= UINT16_MAX); + PyObject *res = entries[inst->operand].me_value; + if (res == NULL) { + return; + } + if (_Py_IsImmortal(res)) { + inst->opcode = (inst->oparg & 1) ? _LOAD_CONST_INLINE_BORROW_WITH_NULL : _LOAD_CONST_INLINE_BORROW; + } + else { + inst->opcode = (inst->oparg & 1) ? _LOAD_CONST_INLINE_WITH_NULL : _LOAD_CONST_INLINE; + } + inst->operand = (uint64_t)res; +} + +static int +check_globals_version(_PyUOpInstruction *inst, PyObject *obj) +{ + if (!PyDict_CheckExact(obj)) { + return -1; + } + PyDictObject *dict = (PyDictObject *)obj; + if (dict->ma_keys->dk_version != inst->operand) { + return -1; + } + return 0; +} + +static void +remove_globals(_PyUOpInstruction *buffer, int buffer_size, + _PyInterpreterFrame *frame, _PyBloomFilter *dependencies) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj; + assert(PyFunction_Check(func)); + PyObject *builtins = frame->f_builtins; + PyObject *globals = frame->f_globals; + assert(func->func_builtins == builtins); + assert(func->func_globals == globals); + /* In order to treat globals as constants, we need to + * know that the globals dict is the one we expected, and + * that it hasn't changed + * In order to treat builtins as constants, we need to + * know that the builtins dict is the one we expected, and + * that it hasn't changed and that the global dictionary's + * keys have not changed */ + uint32_t builtins_checked = 0; + uint32_t builtins_watched = 0; + uint32_t globals_checked = 0; + uint32_t globals_watched = 0; + if (interp->dict_state.watchers[0] == NULL) { + interp->dict_state.watchers[0] = builtins_watcher_callback; + } + if (interp->dict_state.watchers[1] == NULL) { + interp->dict_state.watchers[1] = globals_watcher_callback; + } + for (int pc = 0; pc < buffer_size; pc++) { + _PyUOpInstruction *inst = &buffer[pc]; + int opcode = inst->opcode; + switch(opcode) { + case _GUARD_BUILTINS_VERSION: + if (check_globals_version(inst, builtins)) { + continue; + } + if ((builtins_watched & 1) == 0) { + PyDict_Watch(0, builtins); + builtins_watched = 1; + } + if (builtins_checked & 1) { + buffer[pc].opcode = NOP; + } + else { + buffer[pc].opcode = _CHECK_BUILTINS; + buffer[pc].operand = (uintptr_t)builtins; + builtins_checked |= 1; + } + break; + case _GUARD_GLOBALS_VERSION: + if (check_globals_version(inst, globals)) { + continue; + } + if ((globals_watched & 1) == 0) { + PyDict_Watch(1, globals); + _Py_BloomFilter_Add(dependencies, globals); + globals_watched |= 1; + } + if (globals_checked & 1) { + buffer[pc].opcode = NOP; + } + else { + buffer[pc].opcode = _CHECK_GLOBALS; + buffer[pc].operand = (uintptr_t)globals; + globals_checked |= 1; + } + break; + case _LOAD_GLOBAL_BUILTINS: + if (globals_checked & builtins_checked & globals_watched & builtins_watched & 1) { + global_to_const(inst, builtins); + } + break; + case _LOAD_GLOBAL_MODULE: + if (globals_checked & globals_watched & 1) { + global_to_const(inst, globals); + } + break; + case _JUMP_TO_TOP: + case _EXIT_TRACE: + goto done; + case _PUSH_FRAME: + { + globals_checked <<= 1; + globals_watched <<= 1; + builtins_checked <<= 1; + builtins_watched <<= 1; + PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand; + assert(PyFunction_Check(func)); + globals = func->func_globals; + builtins = func->func_builtins; + break; + } + case _POP_FRAME: + globals_checked >>= 1; + globals_watched >>= 1; + builtins_checked >>= 1; + builtins_watched >>= 1; + PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand; + assert(PyFunction_Check(func)); + globals = func->func_globals; + builtins = func->func_builtins; + break; + } + } + done: + return; +} + static void peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size) { From 24b3533cbd9d9ed9e1a8b43ab933c0c5ec158bea Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Thu, 18 Jan 2024 21:17:31 +0000 Subject: [PATCH 04/13] Fix handling of call stack in trace generator --- Python/optimizer.c | 21 +++++++++++++-------- Python/optimizer_analysis.c | 16 +++++++++++++--- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index b1689e19e79f13..aefd9bcaa9f988 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -359,7 +359,8 @@ BRANCH_TO_GUARD[4][2] = { ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0); \ goto done; \ } \ - trace_stack[trace_stack_depth].code = code; \ + assert(func->func_code == (PyObject *)code); \ + trace_stack[trace_stack_depth].func = func; \ trace_stack[trace_stack_depth].instr = instr; \ trace_stack_depth++; #define TRACE_STACK_POP() \ @@ -367,7 +368,8 @@ BRANCH_TO_GUARD[4][2] = { Py_FatalError("Trace stack underflow\n"); \ } \ trace_stack_depth--; \ - code = trace_stack[trace_stack_depth].code; \ + func = trace_stack[trace_stack_depth].func; \ + code = (PyCodeObject *)trace_stack[trace_stack_depth].func->func_code; \ instr = trace_stack[trace_stack_depth].instr; /* Returns 1 on success, @@ -384,13 +386,15 @@ translate_bytecode_to_trace( { bool progress_needed = true; PyCodeObject *code = (PyCodeObject *)frame->f_executable; + PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj; + assert(PyFunction_Check(func)); PyCodeObject *initial_code = code; _Py_BloomFilter_Add(dependencies, initial_code); _Py_CODEUNIT *initial_instr = instr; int trace_length = 0; int max_length = buffer_size; struct { - PyCodeObject *code; + PyFunctionObject *func; _Py_CODEUNIT *instr; } trace_stack[TRACE_STACK_SIZE]; int trace_stack_depth = 0; @@ -591,7 +595,7 @@ translate_bytecode_to_trace( TRACE_STACK_POP(); /* Set the operand to the code object returned to, * to assist optimization passes */ - trace[trace_length-1].operand = (uintptr_t)code; + trace[trace_length-1].operand = (uintptr_t)func; DPRINTF(2, "Returning to %s (%s:%d) at byte offset %d\n", PyUnicode_AsUTF8(code->co_qualname), @@ -607,10 +611,10 @@ translate_bytecode_to_trace( // Add one to account for the actual opcode/oparg pair: + 1; uint32_t func_version = read_u32(&instr[func_version_offset].cache); - PyFunctionObject *func = _PyFunction_LookupByVersion(func_version); + PyFunctionObject *new_func = _PyFunction_LookupByVersion(func_version); DPRINTF(3, "Function object: %p\n", func); - if (func != NULL) { - PyCodeObject *new_code = (PyCodeObject *)PyFunction_GET_CODE(func); + if (new_func != NULL) { + PyCodeObject *new_code = (PyCodeObject *)PyFunction_GET_CODE(new_func); if (new_code == code) { // Recursive call, bail (we could be here forever). DPRINTF(2, "Bailing on recursive call to %s (%s:%d)\n", @@ -635,8 +639,9 @@ translate_bytecode_to_trace( _Py_BloomFilter_Add(dependencies, new_code); /* Set the operand to the callee's code object, * to assist optimization passes */ - trace[trace_length-1].operand = (uintptr_t)new_code; + trace[trace_length-1].operand = (uintptr_t)new_func; code = new_code; + func = new_func; instr = _PyCode_CODE(code); DPRINTF(2, "Continuing in %s (%s:%d) at byte offset %d\n", diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 5c9a886fd2bbfe..f74b076734cfef 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -72,8 +72,8 @@ check_globals_version(_PyUOpInstruction *inst, PyObject *obj) } static void -remove_globals(_PyUOpInstruction *buffer, int buffer_size, - _PyInterpreterFrame *frame, _PyBloomFilter *dependencies) +remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, + int buffer_size, _PyBloomFilter *dependencies) { PyInterpreterState *interp = _PyInterpreterState_GET(); PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj; @@ -204,8 +204,17 @@ peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_s } case _PUSH_FRAME: case _POP_FRAME: - co = (PyCodeObject *)buffer[pc].operand; + { + PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand; + if (func == NULL) { + co = NULL; + } + else { + assert(PyFunction_Check(func)); + co = (PyCodeObject *)func->func_code; + } break; + } case _JUMP_TO_TOP: case _EXIT_TRACE: return; @@ -260,6 +269,7 @@ _Py_uop_analyze_and_optimize( int curr_stacklen ) { + remove_globals(frame, buffer, buffer_size, dependencies); peephole_opt(frame, buffer, buffer_size); remove_unneeded_uops(buffer, buffer_size); return 0; From 35a92cccd3aaabf0e86734bbcdc6e0c3ff7eb8c3 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Thu, 18 Jan 2024 22:56:31 +0000 Subject: [PATCH 05/13] Get globals to const pass working --- Include/internal/pycore_optimizer.h | 3 +- Lib/test/test_capi/test_watchers.py | 28 ++++++++--------- Modules/_testcapi/watchers.c | 4 +-- Objects/dictobject.c | 3 +- Python/optimizer.c | 7 +++-- Python/optimizer_analysis.c | 47 +++++++++++++++-------------- 6 files changed, 49 insertions(+), 43 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 1105adb59d3e48..e21412fc815540 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -9,7 +9,8 @@ extern "C" { #endif int _Py_uop_analyze_and_optimize(_PyInterpreterFrame *frame, - _PyUOpInstruction *trace, int trace_len, int curr_stackentries); + _PyUOpInstruction *trace, int trace_len, int curr_stackentries, + _PyBloomFilter *dependencies); extern PyTypeObject _PyCounterExecutor_Type; extern PyTypeObject _PyCounterOptimizer_Type; diff --git a/Lib/test/test_capi/test_watchers.py b/Lib/test/test_capi/test_watchers.py index 5981712c80c3a9..dc9a8f6440ee47 100644 --- a/Lib/test/test_capi/test_watchers.py +++ b/Lib/test/test_capi/test_watchers.py @@ -52,14 +52,6 @@ def test_set_existing_item(self): d["foo"] = "baz" self.assert_events(["mod:foo:baz"]) - def test_clone(self): - d = {} - d2 = {"foo": "bar"} - with self.watcher() as wid: - self.watch(wid, d) - d.update(d2) - self.assert_events(["clone"]) - def test_no_event_if_not_watched(self): d = {} with self.watcher() as wid: @@ -80,6 +72,14 @@ def test_pop(self): d.pop("foo") self.assert_events(["del:foo"]) + def test_clone(self): + d = {} + d2 = {"foo": "bar"} + with self.watcher() as wid: + self.watch(wid, d) + d.update(d2) + self.assert_events(["clone"]) + def test_clear(self): d = {"foo": "bar"} with self.watcher() as wid: @@ -151,8 +151,8 @@ def test_watch_out_of_range_watcher_id(self): def test_watch_unassigned_watcher_id(self): d = {} - with self.assertRaisesRegex(ValueError, r"No dict watcher set for ID 1"): - self.watch(1, d) + with self.assertRaisesRegex(ValueError, r"No dict watcher set for ID 3"): + self.watch(3, d) def test_unwatch_non_dict(self): with self.watcher() as wid: @@ -168,8 +168,8 @@ def test_unwatch_out_of_range_watcher_id(self): def test_unwatch_unassigned_watcher_id(self): d = {} - with self.assertRaisesRegex(ValueError, r"No dict watcher set for ID 1"): - self.unwatch(1, d) + with self.assertRaisesRegex(ValueError, r"No dict watcher set for ID 3"): + self.unwatch(3, d) def test_clear_out_of_range_watcher_id(self): with self.assertRaisesRegex(ValueError, r"Invalid dict watcher ID -1"): @@ -178,8 +178,8 @@ def test_clear_out_of_range_watcher_id(self): self.clear_watcher(8) # DICT_MAX_WATCHERS = 8 def test_clear_unassigned_watcher_id(self): - with self.assertRaisesRegex(ValueError, r"No dict watcher set for ID 1"): - self.clear_watcher(1) + with self.assertRaisesRegex(ValueError, r"No dict watcher set for ID 3"): + self.clear_watcher(3) class TestTypeWatchers(unittest.TestCase): diff --git a/Modules/_testcapi/watchers.c b/Modules/_testcapi/watchers.c index a763ff46a3c290..1eb0db2c2e6576 100644 --- a/Modules/_testcapi/watchers.c +++ b/Modules/_testcapi/watchers.c @@ -15,8 +15,8 @@ module _testcapi /*[clinic end generated code: output=da39a3ee5e6b4b0d input=6361033e795369fc]*/ // Test dict watching -static PyObject *g_dict_watch_events; -static int g_dict_watchers_installed; +static PyObject *g_dict_watch_events = NULL; +static int g_dict_watchers_installed = 0; static int dict_watch_callback(PyDict_WatchEvent event, diff --git a/Objects/dictobject.c b/Objects/dictobject.c index e608b91679b568..e4e0e7702efe6a 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -5975,7 +5975,8 @@ PyDict_AddWatcher(PyDict_WatchCallback callback) { PyInterpreterState *interp = _PyInterpreterState_GET(); - for (int i = 0; i < DICT_MAX_WATCHERS; i++) { + /* Start at 2, as 0 and 1 as reserved for CPython */ + for (int i = 2; i < DICT_MAX_WATCHERS; i++) { if (!interp->dict_state.watchers[i]) { interp->dict_state.watchers[i] = callback; return i; diff --git a/Python/optimizer.c b/Python/optimizer.c index aefd9bcaa9f988..c80fb641c3b8a0 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -817,9 +817,10 @@ uop_optimize( OPT_STAT_INC(traces_created); char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE"); if (uop_optimize == NULL || *uop_optimize > '0') { - err = _Py_uop_analyze_and_optimize(frame, buffer, UOP_MAX_TRACE_LENGTH, curr_stackentries); - if (err < 0) { - return -1; + err = _Py_uop_analyze_and_optimize(frame, buffer, + UOP_MAX_TRACE_LENGTH, curr_stackentries, &dependencies); + if (err <= 0) { + return err; } } _PyExecutorObject *executor = make_executor_from_uops(buffer, &dependencies); diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index f74b076734cfef..f9175cabd909e0 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -43,7 +43,6 @@ global_to_const(_PyUOpInstruction *inst, PyObject *obj) PyDictObject *dict = (PyDictObject *)obj; assert(dict->ma_keys->dk_kind == DICT_KEYS_UNICODE); PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(dict->ma_keys); - assert(inst->operand < dict->ma_used); assert(inst->operand <= UINT16_MAX); PyObject *res = entries[inst->operand].me_value; if (res == NULL) { @@ -59,19 +58,19 @@ global_to_const(_PyUOpInstruction *inst, PyObject *obj) } static int -check_globals_version(_PyUOpInstruction *inst, PyObject *obj) +incorrect_keys(_PyUOpInstruction *inst, PyObject *obj) { if (!PyDict_CheckExact(obj)) { - return -1; + return 1; } PyDictObject *dict = (PyDictObject *)obj; if (dict->ma_keys->dk_version != inst->operand) { - return -1; + return 1; } return 0; } -static void +static int remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size, _PyBloomFilter *dependencies) { @@ -104,12 +103,12 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int opcode = inst->opcode; switch(opcode) { case _GUARD_BUILTINS_VERSION: - if (check_globals_version(inst, builtins)) { - continue; + if (incorrect_keys(inst, builtins)) { + return 0; } if ((builtins_watched & 1) == 0) { PyDict_Watch(0, builtins); - builtins_watched = 1; + builtins_watched |= 1; } if (builtins_checked & 1) { buffer[pc].opcode = NOP; @@ -121,8 +120,8 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, } break; case _GUARD_GLOBALS_VERSION: - if (check_globals_version(inst, globals)) { - continue; + if (incorrect_keys(inst, globals)) { + return 0; } if ((globals_watched & 1) == 0) { PyDict_Watch(1, globals); @@ -139,18 +138,16 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, } break; case _LOAD_GLOBAL_BUILTINS: - if (globals_checked & builtins_checked & globals_watched & builtins_watched & 1) { - global_to_const(inst, builtins); - } + assert(globals_checked & builtins_checked & globals_watched & builtins_watched & 1); + global_to_const(inst, builtins); break; case _LOAD_GLOBAL_MODULE: - if (globals_checked & globals_watched & 1) { - global_to_const(inst, globals); - } + assert(globals_checked & globals_watched & 1); + global_to_const(inst, globals); break; case _JUMP_TO_TOP: case _EXIT_TRACE: - goto done; + return 1; case _PUSH_FRAME: { globals_checked <<= 1; @@ -158,6 +155,9 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, builtins_checked <<= 1; builtins_watched <<= 1; PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand; + if (func == NULL) { + return 1; + } assert(PyFunction_Check(func)); globals = func->func_globals; builtins = func->func_builtins; @@ -175,8 +175,7 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, break; } } - done: - return; + return 0; } static void @@ -266,11 +265,15 @@ _Py_uop_analyze_and_optimize( _PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size, - int curr_stacklen + int curr_stacklen, + _PyBloomFilter *dependencies ) { - remove_globals(frame, buffer, buffer_size, dependencies); + int err = remove_globals(frame, buffer, buffer_size, dependencies); + if (err <= 0) { + return err; + } peephole_opt(frame, buffer, buffer_size); remove_unneeded_uops(buffer, buffer_size); - return 0; + return 1; } From 8c699a324dd9b6b93d037b17226703deba7c4871 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Thu, 18 Jan 2024 23:30:18 +0000 Subject: [PATCH 06/13] Minor tidying --- Lib/test/test_capi/test_watchers.py | 16 ++++++++-------- Objects/dictobject.c | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Lib/test/test_capi/test_watchers.py b/Lib/test/test_capi/test_watchers.py index dc9a8f6440ee47..ae062b1bda26b7 100644 --- a/Lib/test/test_capi/test_watchers.py +++ b/Lib/test/test_capi/test_watchers.py @@ -52,6 +52,14 @@ def test_set_existing_item(self): d["foo"] = "baz" self.assert_events(["mod:foo:baz"]) + def test_clone(self): + d = {} + d2 = {"foo": "bar"} + with self.watcher() as wid: + self.watch(wid, d) + d.update(d2) + self.assert_events(["clone"]) + def test_no_event_if_not_watched(self): d = {} with self.watcher() as wid: @@ -72,14 +80,6 @@ def test_pop(self): d.pop("foo") self.assert_events(["del:foo"]) - def test_clone(self): - d = {} - d2 = {"foo": "bar"} - with self.watcher() as wid: - self.watch(wid, d) - d.update(d2) - self.assert_events(["clone"]) - def test_clear(self): d = {"foo": "bar"} with self.watcher() as wid: diff --git a/Objects/dictobject.c b/Objects/dictobject.c index e4e0e7702efe6a..96e0e9fae99f0f 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -5975,7 +5975,7 @@ PyDict_AddWatcher(PyDict_WatchCallback callback) { PyInterpreterState *interp = _PyInterpreterState_GET(); - /* Start at 2, as 0 and 1 as reserved for CPython */ + /* Start at 2, as 0 and 1 are reserved for CPython */ for (int i = 2; i < DICT_MAX_WATCHERS; i++) { if (!interp->dict_state.watchers[i]) { interp->dict_state.watchers[i] = callback; From 99e38cad5058d16689bbad6d312ce65b394653ad Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Fri, 19 Jan 2024 00:58:02 +0000 Subject: [PATCH 07/13] Merge builtin watchers and limit re-optimizations --- Include/cpython/optimizer.h | 2 + Include/internal/pycore_interp.h | 2 +- Python/optimizer_analysis.c | 83 ++++++++++++++++---------------- Python/pylifecycle.c | 21 +++++--- 4 files changed, 58 insertions(+), 50 deletions(-) diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index 66a841713cce85..e0c4f9b511d434 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -92,6 +92,8 @@ PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewUOpOptimizer(void); /* Minimum of 16 additional executions before retry */ #define MINIMUM_TIER2_BACKOFF 4 +#define _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS 4 + #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 662a18d93f329d..7512fe15dea851 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -71,7 +71,6 @@ typedef struct _rare_events { uint8_t set_eval_frame_func; /* Modifying the builtins, __builtins__.__dict__[var] = ... */ uint8_t builtin_dict; - int builtins_dict_watcher_id; /* Modifying a function, e.g. func.__defaults__ = ..., etc. */ uint8_t func_modification; } _rare_events; @@ -233,6 +232,7 @@ struct _is { uint16_t optimizer_backedge_threshold; uint32_t next_func_version; _rare_events rare_events; + PyDict_WatchCallback builtins_dict_watcher; _Py_GlobalMonitors monitors; bool sys_profile_initialized; diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index f9175cabd909e0..1340beab8dd4cc 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -1,5 +1,6 @@ #include "Python.h" #include "opcode.h" +#include "pycore_dict.h" #include "pycore_interp.h" #include "pycore_opcode_metadata.h" #include "pycore_opcode_utils.h" @@ -13,16 +14,6 @@ #include #include "pycore_optimizer.h" -static int -builtins_watcher_callback(PyDict_WatchEvent event, PyObject* dict, - PyObject* key, PyObject* new_value) -{ - if (event != PyDict_EVENT_CLONED) { - _Py_Executors_InvalidateAll(_PyInterpreterState_GET()); - } - return 0; -} - static int globals_watcher_callback(PyDict_WatchEvent event, PyObject* dict, PyObject* key, PyObject* new_value) @@ -78,6 +69,9 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj; assert(PyFunction_Check(func)); PyObject *builtins = frame->f_builtins; + if (builtins != interp->builtins) { + return 1; + } PyObject *globals = frame->f_globals; assert(func->func_builtins == builtins); assert(func->func_globals == globals); @@ -92,9 +86,6 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, uint32_t builtins_watched = 0; uint32_t globals_checked = 0; uint32_t globals_watched = 0; - if (interp->dict_state.watchers[0] == NULL) { - interp->dict_state.watchers[0] = builtins_watcher_callback; - } if (interp->dict_state.watchers[1] == NULL) { interp->dict_state.watchers[1] = globals_watcher_callback; } @@ -106,6 +97,9 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, if (incorrect_keys(inst, builtins)) { return 0; } + if (interp->rare_events.builtin_dict >= _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS) { + continue; + } if ((builtins_watched & 1) == 0) { PyDict_Watch(0, builtins); builtins_watched |= 1; @@ -138,41 +132,46 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, } break; case _LOAD_GLOBAL_BUILTINS: - assert(globals_checked & builtins_checked & globals_watched & builtins_watched & 1); - global_to_const(inst, builtins); + if (globals_checked & builtins_checked & globals_watched & builtins_watched & 1) { + global_to_const(inst, builtins); + } break; case _LOAD_GLOBAL_MODULE: - assert(globals_checked & globals_watched & 1); - global_to_const(inst, globals); + if (globals_checked & globals_watched & 1) { + global_to_const(inst, globals); + } break; case _JUMP_TO_TOP: case _EXIT_TRACE: return 1; - case _PUSH_FRAME: - { - globals_checked <<= 1; - globals_watched <<= 1; - builtins_checked <<= 1; - builtins_watched <<= 1; - PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand; - if (func == NULL) { - return 1; - } - assert(PyFunction_Check(func)); - globals = func->func_globals; - builtins = func->func_builtins; - break; - } - case _POP_FRAME: - globals_checked >>= 1; - globals_watched >>= 1; - builtins_checked >>= 1; - builtins_watched >>= 1; - PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand; - assert(PyFunction_Check(func)); - globals = func->func_globals; - builtins = func->func_builtins; - break; + case _PUSH_FRAME: + { + globals_checked <<= 1; + globals_watched <<= 1; + builtins_checked <<= 1; + builtins_watched <<= 1; + PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand; + if (func == NULL) { + return 1; + } + assert(PyFunction_Check(func)); + globals = func->func_globals; + builtins = func->func_builtins; + if (builtins != interp->builtins) { + return 1; + } + break; + } + case _POP_FRAME: + globals_checked >>= 1; + globals_watched >>= 1; + builtins_checked >>= 1; + builtins_watched >>= 1; + PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand; + assert(PyFunction_Check(func)); + globals = func->func_globals; + builtins = func->func_builtins; + break; } } return 0; diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 261622adc4cc77..cd5fb32d287db4 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -32,6 +32,7 @@ #include "pycore_typevarobject.h" // _Py_clear_generic_types() #include "pycore_unicodeobject.h" // _PyUnicode_InitTypes() #include "pycore_weakref.h" // _PyWeakref_GET_REF() +#include "cpython/optimizer.h" // _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS #include "opcode.h" @@ -608,7 +609,11 @@ init_interp_create_gil(PyThreadState *tstate, int gil) static int builtins_dict_watcher(PyDict_WatchEvent event, PyObject *dict, PyObject *key, PyObject *new_value) { - RARE_EVENT_INC(builtin_dict); + PyInterpreterState *interp = _PyInterpreterState_GET(); + if (event != PyDict_EVENT_CLONED && interp->rare_events.builtin_dict < _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS) { + _Py_Executors_InvalidateAll(interp); + } + RARE_EVENT_INTERP_INC(interp, builtin_dict); return 0; } @@ -1272,11 +1277,9 @@ init_interp_main(PyThreadState *tstate) } } - if ((interp->rare_events.builtins_dict_watcher_id = PyDict_AddWatcher(&builtins_dict_watcher)) == -1) { - return _PyStatus_ERR("failed to add builtin dict watcher"); - } - if (PyDict_Watch(interp->rare_events.builtins_dict_watcher_id, interp->builtins) != 0) { + interp->dict_state.watchers[0] = &builtins_dict_watcher; + if (PyDict_Watch(0, interp->builtins) != 0) { return _PyStatus_ERR("failed to set builtin dict watcher"); } @@ -1607,8 +1610,12 @@ finalize_modules(PyThreadState *tstate) { PyInterpreterState *interp = tstate->interp; - // Stop collecting stats on __builtin__ modifications during teardown - PyDict_Unwatch(interp->rare_events.builtins_dict_watcher_id, interp->builtins); + // Invalidate all executors and turn off tier 2 optimizer + _Py_Executors_InvalidateAll(interp); + interp->optimizer = &_PyOptimizer_Default; + + // Stop watching __builtin__ modifications + PyDict_Unwatch(0, interp->builtins); PyObject *modules = _PyImport_GetModules(interp); if (modules == NULL) { From e0eaea9eb207ff0db3885b819edcd9bf61a4d73e Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Fri, 19 Jan 2024 01:00:20 +0000 Subject: [PATCH 08/13] Add braces --- Python/optimizer_analysis.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 1340beab8dd4cc..16f52e2df22e1f 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -163,6 +163,7 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, break; } case _POP_FRAME: + { globals_checked >>= 1; globals_watched >>= 1; builtins_checked >>= 1; @@ -172,6 +173,7 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, globals = func->func_globals; builtins = func->func_builtins; break; + } } } return 0; From d3a259092a41817c86d1c3e4f9135a1147119302 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Fri, 19 Jan 2024 08:42:37 +0000 Subject: [PATCH 09/13] Use a few bits in the dict version tag to track deoptimizations --- Include/cpython/optimizer.h | 1 + Include/internal/pycore_dict.h | 6 +++--- Include/internal/pycore_dict_state.h | 1 + Python/optimizer_analysis.c | 27 +++++++++++++++++++++++++-- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index e0c4f9b511d434..7b0d7d2e87260a 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -93,6 +93,7 @@ PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewUOpOptimizer(void); #define MINIMUM_TIER2_BACKOFF 4 #define _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS 4 +#define _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS 8 #ifdef __cplusplus } diff --git a/Include/internal/pycore_dict.h b/Include/internal/pycore_dict.h index d96870e9197bbf..b9fe5cd1ed0584 100644 --- a/Include/internal/pycore_dict.h +++ b/Include/internal/pycore_dict.h @@ -206,8 +206,8 @@ static inline PyDictUnicodeEntry* DK_UNICODE_ENTRIES(PyDictKeysObject *dk) { #define DK_IS_UNICODE(dk) ((dk)->dk_kind != DICT_KEYS_GENERAL) -#define DICT_VERSION_INCREMENT (1 << DICT_MAX_WATCHERS) -#define DICT_VERSION_MASK (DICT_VERSION_INCREMENT - 1) +#define DICT_VERSION_INCREMENT (1 << (DICT_MAX_WATCHERS + DICT_WATCHED_MUTATION_BITS)) +#define DICT_WATCHER_MASK ((1 << DICT_MAX_WATCHERS) - 1) #define DICT_NEXT_VERSION(INTERP) \ ((INTERP)->dict_state.global_version += DICT_VERSION_INCREMENT) @@ -227,7 +227,7 @@ _PyDict_NotifyEvent(PyInterpreterState *interp, PyObject *value) { assert(Py_REFCNT((PyObject*)mp) > 0); - int watcher_bits = mp->ma_version_tag & DICT_VERSION_MASK; + int watcher_bits = mp->ma_version_tag & DICT_WATCHER_MASK; if (watcher_bits) { _PyDict_SendEvent(watcher_bits, event, mp, key, value); return DICT_NEXT_VERSION(interp) | watcher_bits; diff --git a/Include/internal/pycore_dict_state.h b/Include/internal/pycore_dict_state.h index ece0f10ca25170..0ec2116503a4c3 100644 --- a/Include/internal/pycore_dict_state.h +++ b/Include/internal/pycore_dict_state.h @@ -19,6 +19,7 @@ extern "C" { #endif #define DICT_MAX_WATCHERS 8 +#define DICT_WATCHED_MUTATION_BITS 4 struct _Py_dict_state { /*Global counter used to set ma_version_tag field of dictionary. diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 16f52e2df22e1f..083e43649db14a 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -14,14 +14,33 @@ #include #include "pycore_optimizer.h" +static int get_mutations(PyObject* dict) { + assert(PyDict_CheckExact(dict)); + PyDictObject *d = (PyDictObject *)dict; + return (d->ma_version_tag >> DICT_MAX_WATCHERS) & ((1 << DICT_WATCHED_MUTATION_BITS)-1); +} + +static int increment_mutations(PyObject* dict) { + assert(PyDict_CheckExact(dict)); + PyDictObject *d = (PyDictObject *)dict; + return d->ma_version_tag += (1 << DICT_MAX_WATCHERS); +} + static int globals_watcher_callback(PyDict_WatchEvent event, PyObject* dict, PyObject* key, PyObject* new_value) { - if (event != PyDict_EVENT_CLONED) { + if (event == PyDict_EVENT_CLONED) { + return 0; + } + uint64_t watched_mutations = get_mutations(dict); + if (watched_mutations < _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS) { _Py_Executors_InvalidateDependency(_PyInterpreterState_GET(), dict); + increment_mutations(dict); + } + else { + PyDict_Unwatch(1, dict); } - /* TO DO -- Mark the dict, as a warning for future optimization attempts */ return 0; } @@ -117,6 +136,10 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, if (incorrect_keys(inst, globals)) { return 0; } + uint64_t watched_mutations = get_mutations(globals); + if (watched_mutations >= _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS) { + continue; + } if ((globals_watched & 1) == 0) { PyDict_Watch(1, globals); _Py_BloomFilter_Add(dependencies, globals); From b3f24187037734b93e6e7225ed71e26712d829b4 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Thu, 1 Feb 2024 16:36:24 +0000 Subject: [PATCH 10/13] Address review comments --- Include/cpython/dictobject.h | 3 +++ Include/cpython/optimizer.h | 5 ++++- Python/optimizer.c | 2 +- Python/optimizer_analysis.c | 30 +++++++++++++++++++++--------- Python/pylifecycle.c | 1 + 5 files changed, 30 insertions(+), 11 deletions(-) diff --git a/Include/cpython/dictobject.h b/Include/cpython/dictobject.h index 944965fb9e5351..1720fe6f01ea37 100644 --- a/Include/cpython/dictobject.h +++ b/Include/cpython/dictobject.h @@ -17,6 +17,9 @@ typedef struct { /* Dictionary version: globally unique, value change each time the dictionary is modified */ #ifdef Py_BUILD_CORE + /* Bits 0-7 are for dict watchers. + * Bits 8-11 are for the watched mutation counter (used by tier2 optimization) + * The remaining bits (12-63) are the actual version tag. */ uint64_t ma_version_tag; #else Py_DEPRECATED(3.12) uint64_t ma_version_tag; diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index 7b0d7d2e87260a..e4fbdee8e29823 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -45,7 +45,10 @@ typedef struct _PyExecutorObject { typedef struct _PyOptimizerObject _PyOptimizerObject; /* Should return > 0 if a new executor is created. O if no executor is produced and < 0 if an error occurred. */ -typedef int (*optimize_func)(_PyOptimizerObject* self, struct _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **, int curr_stackentries); +typedef int (*optimize_func)( + _PyOptimizerObject* self, struct _PyInterpreterFrame *frame, + _Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr, + int curr_stackentries); typedef struct _PyOptimizerObject { PyObject_HEAD diff --git a/Python/optimizer.c b/Python/optimizer.c index 19aa008915c5a6..0ef42d0b134f05 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -594,7 +594,7 @@ translate_bytecode_to_trace( ADD_TO_TRACE(uop, oparg, operand, target); if (uop == _POP_FRAME) { TRACE_STACK_POP(); - /* Set the operand to the code object returned to, + /* Set the operand to the function object returned to, * to assist optimization passes */ trace[trace_length-1].operand = (uintptr_t)func; DPRINTF(2, diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 083e43649db14a..bf3538edcbcc1b 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -14,16 +14,18 @@ #include #include "pycore_optimizer.h" -static int get_mutations(PyObject* dict) { +static int +get_mutations(PyObject* dict) { assert(PyDict_CheckExact(dict)); PyDictObject *d = (PyDictObject *)dict; return (d->ma_version_tag >> DICT_MAX_WATCHERS) & ((1 << DICT_WATCHED_MUTATION_BITS)-1); } -static int increment_mutations(PyObject* dict) { +static void +increment_mutations(PyObject* dict) { assert(PyDict_CheckExact(dict)); PyDictObject *d = (PyDictObject *)dict; - return d->ma_version_tag += (1 << DICT_MAX_WATCHERS); + d->ma_version_tag += (1 << DICT_MAX_WATCHERS); } static int @@ -80,6 +82,14 @@ incorrect_keys(_PyUOpInstruction *inst, PyObject *obj) return 0; } +/* The first two dict watcher IDs are reserved for CPython, + * so we don't need to check that they haven't been used */ +#define BUILTINS_WATCHER_ID 0 +#define GLOBALS_WATCHER_ID 1 + +/* Returns 1 if successfully optimized + * 0 if the trace is not suitable for optimization (yet) + * -1 if there was an error. */ static int remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size, _PyBloomFilter *dependencies) @@ -101,6 +111,9 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, * know that the builtins dict is the one we expected, and * that it hasn't changed and that the global dictionary's * keys have not changed */ + + /* These values represent stacks of booleans (one bool per bit). + * Pushing a frame shifts left, popping a frame shifts right. */ uint32_t builtins_checked = 0; uint32_t builtins_watched = 0; uint32_t globals_checked = 0; @@ -120,7 +133,7 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, continue; } if ((builtins_watched & 1) == 0) { - PyDict_Watch(0, builtins); + PyDict_Watch(BUILTINS_WATCHER_ID, builtins); builtins_watched |= 1; } if (builtins_checked & 1) { @@ -141,7 +154,7 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, continue; } if ((globals_watched & 1) == 0) { - PyDict_Watch(1, globals); + PyDict_Watch(GLOBALS_WATCHER_ID, globals); _Py_BloomFilter_Add(dependencies, globals); globals_watched |= 1; } @@ -164,9 +177,6 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, global_to_const(inst, globals); } break; - case _JUMP_TO_TOP: - case _EXIT_TRACE: - return 1; case _PUSH_FRAME: { globals_checked <<= 1; @@ -197,6 +207,9 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, builtins = func->func_builtins; break; } + case _JUMP_TO_TOP: + case _EXIT_TRACE: + return 1; } } return 0; @@ -283,7 +296,6 @@ remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size) } } - int _Py_uop_analyze_and_optimize( _PyInterpreterFrame *frame, diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index cd5fb32d287db4..9a8d99fa15f2ee 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1612,6 +1612,7 @@ finalize_modules(PyThreadState *tstate) // Invalidate all executors and turn off tier 2 optimizer _Py_Executors_InvalidateAll(interp); + Py_XDECREF(interp->optimizer); interp->optimizer = &_PyOptimizer_Default; // Stop watching __builtin__ modifications From e65dcf21f8987320ab6b9b13f0d4f68cac6b0c4e Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Fri, 2 Feb 2024 02:44:48 +0000 Subject: [PATCH 11/13] Tolerate default optimizer being called --- Python/optimizer.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index 0ee33538f80c4a..d71ca0aef0e11a 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -108,16 +108,14 @@ PyUnstable_Replace_Executor(PyCodeObject *code, _Py_CODEUNIT *instr, _PyExecutor } static int -error_optimize( +never_optimize( _PyOptimizerObject* self, _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec, int Py_UNUSED(stack_entries)) { - assert(0); - PyErr_Format(PyExc_SystemError, "Should never call error_optimize"); - return -1; + return 0; } PyTypeObject _PyDefaultOptimizer_Type = { @@ -130,7 +128,7 @@ PyTypeObject _PyDefaultOptimizer_Type = { _PyOptimizerObject _PyOptimizer_Default = { PyObject_HEAD_INIT(&_PyDefaultOptimizer_Type) - .optimize = error_optimize, + .optimize = never_optimize, .resume_threshold = INT16_MAX, .backedge_threshold = INT16_MAX, }; From f763e3729106f4a3b3f7aa28dbab278f615c46a2 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Fri, 2 Feb 2024 03:56:04 +0000 Subject: [PATCH 12/13] Fix compiler warning --- Python/optimizer_analysis.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index bf3538edcbcc1b..2cfbf4b349d0f5 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -95,15 +95,14 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size, _PyBloomFilter *dependencies) { PyInterpreterState *interp = _PyInterpreterState_GET(); - PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj; - assert(PyFunction_Check(func)); PyObject *builtins = frame->f_builtins; if (builtins != interp->builtins) { return 1; } PyObject *globals = frame->f_globals; - assert(func->func_builtins == builtins); - assert(func->func_globals == globals); + assert(PyFunction_Check(((PyFunctionObject *)frame->f_funcobj))); + assert(((PyFunctionObject *)frame->f_funcobj)->func_builtins == builtins); + assert(((PyFunctionObject *)frame->f_funcobj)->func_globals == globals); /* In order to treat globals as constants, we need to * know that the globals dict is the one we expected, and * that it hasn't changed From 5bdac7aa8601f95ad56017753c4041c41caa5059 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Fri, 2 Feb 2024 04:26:52 +0000 Subject: [PATCH 13/13] Reduce modification limits a bit. --- Include/cpython/optimizer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index 3fba6e5179ecda..5a9ccaea3b2209 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -97,8 +97,8 @@ PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewUOpOptimizer(void); /* Minimum of 16 additional executions before retry */ #define MINIMUM_TIER2_BACKOFF 4 -#define _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS 4 -#define _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS 8 +#define _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS 3 +#define _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS 6 #ifdef __cplusplus }