Skip to content

GH-133136: Revise QSBR to reduce excess memory held #135473

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Include/internal/pycore_pymem.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ extern wchar_t *_PyMem_DefaultRawWcsdup(const wchar_t *str);
extern int _PyMem_DebugEnabled(void);

// Enqueue a pointer to be freed possibly after some delay.
extern void _PyMem_FreeDelayed(void *ptr);
extern void _PyMem_FreeDelayed(void *ptr, size_t size);

// Enqueue an object to be freed possibly after some delay
#ifdef Py_GIL_DISABLED
Expand Down
28 changes: 22 additions & 6 deletions Include/internal/pycore_qsbr.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,15 @@ struct _qsbr_thread_state {
// Thread state (or NULL)
PyThreadState *tstate;

// Used to defer advancing write sequence a fixed number of times
int deferrals;
// Number of held items added by this thread since last process
int deferred_count;

// Estimate for the amount of memory that is held by this thread since
// the last process
size_t deferred_memory;

// Sequence number at time of last "should process" check.
uint64_t seq_last_check;

// Is this thread state allocated?
bool allocated;
Expand Down Expand Up @@ -109,11 +116,20 @@ _Py_qbsr_goal_reached(struct _qsbr_thread_state *qsbr, uint64_t goal)
extern uint64_t
_Py_qsbr_advance(struct _qsbr_shared *shared);

// Batches requests to advance the write sequence. This advances the write
// sequence every N calls, which reduces overhead but increases time to
// reclamation. Returns the new goal.
// Advance the write sequence as required and return the sequence goal to use
// for memory to be freed. If the sequence is advanced, this goal is the new
// sequence value, otherwise it is the next sequence value. In either case,
// the goal is higher that any write sequence value already observed by readers.
//
// The 'size' argument is the size in bytes of the memory scheduled to be
// freed. If that size is not available, pass zero as the value.
extern uint64_t
_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr);
_Py_qsbr_advance_with_size(struct _qsbr_thread_state *qsbr, size_t size);

// Return true if memory held by QSBR should be processed to determine if it
// can be safely freed.
extern bool
_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr);

// Have the read sequences advanced to the given goal? If this returns true,
// it safe to reclaim any memory tagged with the goal (or earlier goal).
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Limit excess memory usage in the :term:`free threading` build when a
large dictionary or list is resized and accessed by multiple threads.
2 changes: 1 addition & 1 deletion Objects/codeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -3350,7 +3350,7 @@ create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx)
}
memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void *));
_Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc);
_PyMem_FreeDelayed(tlbc);
_PyMem_FreeDelayed(tlbc, tlbc->size * sizeof(void *));
tlbc = new_tlbc;
}
char *bc = PyMem_Calloc(1, _PyCode_NBYTES(co));
Expand Down
4 changes: 2 additions & 2 deletions Objects/dictobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -813,7 +813,7 @@ free_keys_object(PyDictKeysObject *keys, bool use_qsbr)
{
#ifdef Py_GIL_DISABLED
if (use_qsbr) {
_PyMem_FreeDelayed(keys);
_PyMem_FreeDelayed(keys, _PyDict_KeysSize(keys));
return;
}
#endif
Expand Down Expand Up @@ -858,7 +858,7 @@ free_values(PyDictValues *values, bool use_qsbr)
assert(values->embedded == 0);
#ifdef Py_GIL_DISABLED
if (use_qsbr) {
_PyMem_FreeDelayed(values);
_PyMem_FreeDelayed(values, values_size_from_count(values->capacity));
return;
}
#endif
Expand Down
3 changes: 2 additions & 1 deletion Objects/listobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ free_list_items(PyObject** items, bool use_qsbr)
#ifdef Py_GIL_DISABLED
_PyListArray *array = _Py_CONTAINER_OF(items, _PyListArray, ob_item);
if (use_qsbr) {
_PyMem_FreeDelayed(array);
size_t size = sizeof(_PyListArray) + array->allocated * sizeof(PyObject *);
_PyMem_FreeDelayed(array, size);
}
else {
PyMem_Free(array);
Expand Down
24 changes: 14 additions & 10 deletions Objects/obmalloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,10 @@ _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)

_PyMem_mi_page_clear_qsbr(page);
page->retire_expire = 0;
page->qsbr_goal = _Py_qsbr_deferred_advance(tstate->qsbr);

size_t bsize = mi_page_block_size(page);
page->qsbr_goal = _Py_qsbr_advance_with_size(tstate->qsbr, page->capacity*bsize);

llist_insert_tail(&tstate->mimalloc.page_list, &page->qsbr_node);
return false;
}
Expand Down Expand Up @@ -1142,7 +1145,7 @@ free_work_item(uintptr_t ptr, delayed_dealloc_cb cb, void *state)
}

static void
free_delayed(uintptr_t ptr)
free_delayed(uintptr_t ptr, size_t size)
{
#ifndef Py_GIL_DISABLED
free_work_item(ptr, NULL, NULL);
Expand Down Expand Up @@ -1200,23 +1203,21 @@ free_delayed(uintptr_t ptr)
}

assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
uint64_t seq = _Py_qsbr_deferred_advance(tstate->qsbr);
uint64_t seq = _Py_qsbr_advance_with_size(tstate->qsbr, size);
buf->array[buf->wr_idx].ptr = ptr;
buf->array[buf->wr_idx].qsbr_goal = seq;
buf->wr_idx++;

if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
_PyMem_ProcessDelayed((PyThreadState *)tstate);
}
_PyMem_ProcessDelayed((PyThreadState *)tstate);
#endif
}

void
_PyMem_FreeDelayed(void *ptr)
_PyMem_FreeDelayed(void *ptr, size_t size)
{
assert(!((uintptr_t)ptr & 0x01));
if (ptr != NULL) {
free_delayed((uintptr_t)ptr);
free_delayed((uintptr_t)ptr, size);
}
}

Expand All @@ -1226,7 +1227,7 @@ _PyObject_XDecRefDelayed(PyObject *ptr)
{
assert(!((uintptr_t)ptr & 0x01));
if (ptr != NULL) {
free_delayed(((uintptr_t)ptr)|0x01);
free_delayed(((uintptr_t)ptr)|0x01, 64);
}
}
#endif
Expand Down Expand Up @@ -1299,8 +1300,11 @@ maybe_process_interp_queue(struct _Py_mem_interp_free_queue *queue,
void
_PyMem_ProcessDelayed(PyThreadState *tstate)
{
PyInterpreterState *interp = tstate->interp;
_PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;
if (!_Py_qsbr_should_process(tstate_impl->qsbr)) {
return;
}
PyInterpreterState *interp = tstate->interp;

// Process thread-local work
process_queue(&tstate_impl->mem_free_queue, tstate_impl, true, NULL, NULL);
Expand Down
51 changes: 43 additions & 8 deletions Python/qsbr.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,14 @@
// Starting size of the array of qsbr thread states
#define MIN_ARRAY_SIZE 8

// For _Py_qsbr_deferred_advance(): the number of deferrals before advancing
// the write sequence.
#define QSBR_DEFERRED_LIMIT 10
// For should_advance_qsbr(): the number of deferred items before advancing
// the write sequence. This is based on WORK_ITEMS_PER_CHUNK. We ideally
// want to process a chunk before it overflows.
#define QSBR_DEFERRED_LIMIT 127

// If the deferred memory exceeds 1 MiB, we force an advance in the
// shared QSBR sequence number to limit excess memory usage.
#define QSBR_MEM_LIMIT 1024*1024

// Allocate a QSBR thread state from the freelist
static struct _qsbr_thread_state *
Expand Down Expand Up @@ -113,17 +118,47 @@ _Py_qsbr_advance(struct _qsbr_shared *shared)
// NOTE: with 64-bit sequence numbers, we don't have to worry too much
// about the wr_seq getting too far ahead of rd_seq, but if we ever use
// 32-bit sequence numbers, we'll need to be more careful.
return _Py_atomic_add_uint64(&shared->wr_seq, QSBR_INCR) + QSBR_INCR;
return _Py_atomic_add_uint64(&shared->wr_seq, QSBR_INCR);
}

static int
should_advance_qsbr(struct _qsbr_thread_state *qsbr, size_t size)
{
qsbr->deferred_count++;
qsbr->deferred_memory += size;
if (qsbr->deferred_count >= QSBR_DEFERRED_LIMIT ||
qsbr->deferred_memory > QSBR_MEM_LIMIT) {
qsbr->deferred_count = 0;
qsbr->deferred_memory = 0;
return 1;
}
return 0;
}

uint64_t
_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr)
_Py_qsbr_advance_with_size(struct _qsbr_thread_state *qsbr, size_t size)
{
if (++qsbr->deferrals < QSBR_DEFERRED_LIMIT) {
if (should_advance_qsbr(qsbr, size)) {
// Advance the write sequence and return the updated value as the goal.
return _Py_qsbr_advance(qsbr->shared);
}
else {
// Don't advance, return the next sequence value as the goal.
return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
}
qsbr->deferrals = 0;
return _Py_qsbr_advance(qsbr->shared);
}

bool
_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr)
{
if (qsbr->seq_last_check == qsbr->seq) {
// If 'seq' for this thread hasn't advanced, it is unlikely that any
// deferred memory is ready to be freed. Wait longer before trying
// to process.
return false;
}
qsbr->seq_last_check = qsbr->seq;
return true;
}

static uint64_t
Expand Down
Loading