diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index c01b9c7e6a4d..2722b0bb4437 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -370,6 +370,16 @@ apw_load_buffers(void) apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0; apw_state->prewarmed_blocks = 0; + + /* Don't prewarm more than we can fit. */ + if (num_elements > NBuffers) + { + num_elements = NBuffers; + ereport(LOG, + (errmsg("autoprewarm: capping prewarmed blocks to %d (shared_buffers size)", + NBuffers))); + } + /* Get the info position of the first block of the next database. */ while (apw_state->prewarm_start_idx < num_elements) { @@ -410,10 +420,6 @@ apw_load_buffers(void) apw_state->database = current_db; Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx); - /* If we've run out of free buffers, don't launch another worker. */ - if (!have_free_buffer()) - break; - /* * Likewise, don't launch if we've already been told to shut down. * (The launch would fail anyway, but we might as well skip it.) @@ -462,12 +468,6 @@ apw_read_stream_next_block(ReadStream *stream, { BlockInfoRecord blk = p->block_info[p->pos]; - if (!have_free_buffer()) - { - p->pos = apw_state->prewarm_stop_idx; - return InvalidBlockNumber; - } - if (blk.tablespace != p->tablespace) return InvalidBlockNumber; @@ -523,10 +523,10 @@ autoprewarm_database_main(Datum main_arg) blk = block_info[i]; /* - * Loop until we run out of blocks to prewarm or until we run out of free + * Loop until we run out of blocks to prewarm or until we run out of * buffers. */ - while (i < apw_state->prewarm_stop_idx && have_free_buffer()) + while (i < apw_state->prewarm_stop_idx) { Oid tablespace = blk.tablespace; RelFileNumber filenumber = blk.filenumber; @@ -568,14 +568,13 @@ autoprewarm_database_main(Datum main_arg) /* * We have a relation; now let's loop until we find a valid fork of - * the relation or we run out of free buffers. Once we've read from - * all valid forks or run out of options, we'll close the relation and + * the relation or we run out of buffers. Once we've read from all + * valid forks or run out of options, we'll close the relation and * move on. */ while (i < apw_state->prewarm_stop_idx && blk.tablespace == tablespace && - blk.filenumber == filenumber && - have_free_buffer()) + blk.filenumber == filenumber) { ForkNumber forknum = blk.forknum; BlockNumber nblocks; diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 72f5acceec78..3d44374f5ab0 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -77,6 +77,283 @@ int BgWriterDelay = 200; static TimestampTz last_snapshot_ts; static XLogRecPtr last_snapshot_lsn = InvalidXLogRecPtr; +/* + * Collected buffer usage information. 
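+ * One entry per active backend: backend_id is that backend's index in + * ProcGlobal->allProcs, usage_sum is a snapshot of its bufferUsageSum counter, + * and usage_ratio is usage_sum divided by NBuffers (the sort key used to pick + * the top decile of consumers).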
+ */ +typedef struct BackendBufferStats +{ + int backend_id; + uint64 usage_sum; + double usage_ratio; +} BackendBufferStats; + +static int +compare_backend_usage(const void *a, const void *b) +{ + const BackendBufferStats *stat_a = (const BackendBufferStats *) a; + const BackendBufferStats *stat_b = (const BackendBufferStats *) b; + + if (stat_a->usage_ratio < stat_b->usage_ratio) + return -1; + if (stat_a->usage_ratio > stat_b->usage_ratio) + return 1; + return 0; +} + +static uint64 +CalculateSystemBufferPressure(BackendBufferStats *backend_stats[], int *num_backends) +{ + uint64 total_usage = 0; + int active_backends = 0; + BackendBufferStats *stats; + + /* Count active backends first */ + for (int i = 0; i < ProcGlobal->allProcCount; i++) + { + PGPROC *proc = &ProcGlobal->allProcs[i]; + + if (proc->pid != 0 && proc->databaseId != InvalidOid) + active_backends++; + } + + if (active_backends == 0) + { + *backend_stats = NULL; + *num_backends = 0; + return 0; + } + + /* Allocate stats array */ + stats = palloc(sizeof(BackendBufferStats) * active_backends); + *backend_stats = stats; + *num_backends = active_backends; + + /* Collect stats from all active backends */ + for (int i = 0, j = 0; i < ProcGlobal->allProcCount; i++) + { + PGPROC *proc = &ProcGlobal->allProcs[i]; + + if (proc->pid != 0 && proc->databaseId != InvalidOid) + { + uint64 usage_sum = pg_atomic_read_u32(&proc->bufferUsageSum); + + stats[j].backend_id = i; + stats[j].usage_sum = usage_sum; + stats[j].usage_ratio = (double) usage_sum / NBuffers; + total_usage += usage_sum; + j++; + } + } + + /* Sort by usage ratio for percentile calculation */ + qsort(stats, active_backends, sizeof(BackendBufferStats), + compare_backend_usage); + + return total_usage; +} + +static void +GetHighUsageBackends(BackendBufferStats *stats, int num_backends, + int **high_usage_backends, int *num_high_usage) +{ + int percentile_90_idx = (int) (num_backends * 0.9); + + *num_high_usage = num_backends - percentile_90_idx; + + if (*num_high_usage > 0) + { + *high_usage_backends = palloc(sizeof(int) * (*num_high_usage)); + for (int i = 0; i < *num_high_usage; i++) + (*high_usage_backends)[i] = stats[percentile_90_idx + i].backend_id; + } + else + { + *high_usage_backends = NULL; + *num_high_usage = 0; + } +} + +/* + * Shared buffer sync function used by both main loop and aggressive writing + */ +static int +SyncTargetedBuffers(WritebackContext *wb_context, int *target_backends, + int num_targets, int max_buffers) +{ + int buffers_written = 0; + int buffer_id; + BufferDesc *bufHdr; + uint32 buf_state; + + /* If no specific targets, sync any dirty buffers */ + if (target_backends == NULL || num_targets == 0) + return BgBufferSync(wb_context); + + /* Scan through all buffers looking for dirty ones from target backends */ + for (buffer_id = 0; buffer_id < NBuffers && buffers_written < max_buffers; buffer_id++) + { + uint32 dirty_backend; + bool is_target; + + bufHdr = GetBufferDescriptor(buffer_id); + + /* Quick check if buffer is dirty */ + buf_state = pg_atomic_read_u32(&bufHdr->state); + if (!(buf_state & BM_DIRTY)) + continue; + + /* Check if this buffer is from one of our target backends */ + dirty_backend = pg_atomic_read_u32(&bufHdr->dirty_backend_id); + is_target = false; + + for (int i = 0; i < num_targets; i++) + if (dirty_backend == target_backends[i]) + { + is_target = true; + break; + } + + if (!is_target) + continue; + + /* Skip if buffer is pinned */ + if (BUF_STATE_GET_REFCOUNT(buf_state) > 0) + continue; + + /* Try to write this buffer 
using the writeback context */ + ScheduleBufferTagForWriteback(wb_context, + IOContextForStrategy(NULL), + &bufHdr->tag); + buffers_written++; + } + + /* Issue the actual writes */ + if (buffers_written > 0) + IssuePendingWritebacks(wb_context, IOContextForStrategy(NULL)); + + return buffers_written; +} + +static void +AggressiveBufferWrite(WritebackContext *wb_context, int *high_usage_backends, + int num_high_usage, bool critical) +{ + int write_target = critical ? bgwriter_lru_maxpages * 3 : bgwriter_lru_maxpages * 2; + int buffers_written = 0; + + /* Focus on buffers from high-usage backends first */ + buffers_written = SyncTargetedBuffers(wb_context, high_usage_backends, + num_high_usage, write_target); + + /* If still under target, write additional dirty buffers */ + if (buffers_written < write_target) + BgBufferSync(wb_context); +} + +/* In src/backend/postmaster/bgwriter.c - Enhanced UpdateBackendDecayRates */ +static void +UpdateBackendDecayRates(BackendBufferStats *backend_stats, int num_backends, + double pressure_ratio, int *high_usage_backends, int num_high_usage) +{ + uint32 base_decay_rate; + uint64 total_usage = 0; + uint64 avg_usage; + int i, + j; + + /* Calculate base decay rate from system pressure */ + if (pressure_ratio > 0.90) + /* Critical pressure - aggressive decay */ + base_decay_rate = 3; + else if (pressure_ratio > 0.75) + /* High pressure */ + base_decay_rate = 2; + else + /* Normal decay rate */ + base_decay_rate = 1; + + /* Calculate total usage for relative comparisons */ + for (i = 0; i < num_backends; i++) + total_usage += backend_stats[i].usage_sum; + avg_usage = num_backends > 0 ? total_usage / num_backends : 0; + + if (base_decay_rate > 1) + elog(DEBUG2, "Buffer pressure: %.2f%%, base decay rate: %u, avg usage: %lu", + pressure_ratio * 100, base_decay_rate, avg_usage); + + /* Update each backend's personalized decay rate */ + for (i = 0; i < ProcGlobal->allProcCount; i++) + { + PGPROC *proc = &ProcGlobal->allProcs[i]; + + /* Only update active user backends */ + if (proc->pid != 0 && proc->databaseId != InvalidOid) + { + uint32 backend_usage = pg_atomic_read_u32(&proc->bufferUsageSum); + uint32 personalized_rate = base_decay_rate; + + /* Find this backend in the stats array */ + BackendBufferStats *backend_stat = NULL; + + for (j = 0; j < num_backends; j++) + { + if (backend_stats[j].backend_id == i) + { + backend_stat = &backend_stats[j]; + break; + } + } + + /* + * Calculate personalized decay rate based on usage and + * clock-sweep performance. + */ + if (backend_stat != NULL && avg_usage > 0) + { + double usage_ratio = (double) backend_usage / avg_usage; + + /* Get clock-sweep performance metrics */ + uint32 search_count = pg_atomic_read_u32(&proc->bufferSearchCount); + uint64 total_distance = pg_atomic_read_u64(&proc->clockSweepDistance); + uint32 total_passes = pg_atomic_read_u32(&proc->clockSweepPasses); + uint64 total_time = pg_atomic_read_u64(&proc->clockSweepTimeMicros); + + /* Calculate average search metrics */ + double avg_distance = search_count > 0 ? (double) total_distance / search_count : 0; + double avg_passes = search_count > 0 ? (double) total_passes / search_count : 0; + double avg_time = search_count > 0 ? 
(double) total_time / search_count : 0; + + /* Adjust decay rate based on usage relative to average */ + if (usage_ratio > 2.0) + /* High usage backends get more aggressive decay */ + personalized_rate = Min(4, base_decay_rate + 2); + else if (usage_ratio > 1.5) + personalized_rate = Min(4, base_decay_rate + 1); + else if (usage_ratio < 0.5) + /* Low usage backends get less aggressive decay */ + personalized_rate = Max(1, base_decay_rate > 1 ? base_decay_rate - 1 : 1); + + /* Further adjust based on clock-sweep performance */ + if (avg_distance > NBuffers * 0.5) + /* Searching more than half the buffer pool */ + personalized_rate = Min(4, personalized_rate + 1); + if (avg_passes > 1.0) + /* Making multiple complete passes */ + personalized_rate = Min(4, personalized_rate + 1); + if (avg_time > 1000.0) + /* Taking more than 1ms per search */ + personalized_rate = Min(4, personalized_rate + 1); + + elog(DEBUG2, "Backend %d: usage_ratio=%.2f, avg_distance=%.1f, avg_passes=%.2f, " + "avg_time=%.1fμs, decay_rate=%u", + i, usage_ratio, avg_distance, avg_passes, avg_time, personalized_rate); + } + + /* Update the backend's decay rate */ + pg_atomic_write_u32(&proc->bufferDecayRate, personalized_rate); + } + } +} /* * Main entry point for bgwriter process @@ -222,6 +499,15 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) */ for (;;) { + BackendBufferStats *backend_stats = NULL; + int num_backends; + int *high_usage_backends = NULL; + int num_high_usage; + uint64 max_possible; + uint64 total_usage; + double pressure_ratio; + bool high_pressure; + bool critical_pressure; bool can_hibernate; int rc; @@ -230,6 +516,35 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) ProcessMainLoopInterrupts(); + /* Calculate current buffer pressure */ + total_usage = CalculateSystemBufferPressure(&backend_stats, &num_backends); + max_possible = (uint64) NBuffers * BM_MAX_USAGE_COUNT; + total_usage = total_usage > max_possible ? max_possible : total_usage; + pressure_ratio = (double) total_usage / max_possible; + + /* Get high-usage backends (90th percentile) */ + if (backend_stats != NULL) + GetHighUsageBackends(backend_stats, num_backends, + &high_usage_backends, &num_high_usage); + + /* Update global decay rate based on current pressure */ + UpdateBackendDecayRates(backend_stats, num_backends, pressure_ratio, + high_usage_backends, num_high_usage); + + /* Determine if proactive action is needed */ + high_pressure = pressure_ratio > 0.75; /* 75% threshold */ + critical_pressure = pressure_ratio > 0.90; /* 90% threshold */ + + if (high_pressure) + { + elog(LOG, "%s buffer pressure detected: %.2f%% (%d high-usage backends)", + critical_pressure ? "Critical" : "High", + pressure_ratio * 100, num_high_usage); + + /* Aggressive writing of dirty buffers */ + AggressiveBufferWrite(&wb_context, high_usage_backends, num_high_usage, critical_pressure); + } + /* * Do one cycle of dirty-buffer writing. */ @@ -294,6 +609,11 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) } } + if (backend_stats != NULL) + pfree(backend_stats); + if (high_usage_backends != NULL) + pfree(high_usage_backends); + /* * Sleep until we are signaled or BgWriterDelay has elapsed. * diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index a182fcd660cc..119f31b5d658 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -128,11 +128,11 @@ independently. 
If it is necessary to lock more than one partition at a time, they must be locked in partition-number order to avoid risk of deadlock. * A separate system-wide spinlock, buffer_strategy_lock, provides mutual -exclusion for operations that access the buffer free list or select -buffers for replacement. A spinlock is used here rather than a lightweight -lock for efficiency; no other locks of any sort should be acquired while -buffer_strategy_lock is held. This is essential to allow buffer replacement -to happen in multiple backends with reasonable concurrency. +exclusion for operations that select buffers for replacement. A spinlock is +used here rather than a lightweight lock for efficiency; no other locks of any +sort should be acquired while buffer_strategy_lock is held. This is essential +to allow buffer replacement to happen in multiple backends with reasonable +concurrency. * Each buffer header contains a spinlock that must be taken when examining or changing fields of that buffer header. This allows operations such as @@ -158,18 +158,8 @@ unset by sleeping on the buffer's condition variable. Normal Buffer Replacement Strategy ---------------------------------- -There is a "free list" of buffers that are prime candidates for replacement. -In particular, buffers that are completely free (contain no valid page) are -always in this list. We could also throw buffers into this list if we -consider their pages unlikely to be needed soon; however, the current -algorithm never does that. The list is singly-linked using fields in the -buffer headers; we maintain head and tail pointers in global variables. -(Note: although the list links are in the buffer headers, they are -considered to be protected by the buffer_strategy_lock, not the buffer-header -spinlocks.) To choose a victim buffer to recycle when there are no free -buffers available, we use a simple clock-sweep algorithm, which avoids the -need to take system-wide locks during common operations. It works like -this: +To choose a victim buffer to recycle we use a simple clock-sweep algorithm. It +works like this: Each buffer header contains a usage counter, which is incremented (up to a small limit value) whenever the buffer is pinned. (This requires only the @@ -184,20 +174,14 @@ The algorithm for a process that needs to obtain a victim buffer is: 1. Obtain buffer_strategy_lock. -2. If buffer free list is nonempty, remove its head buffer. Release -buffer_strategy_lock. If the buffer is pinned or has a nonzero usage count, -it cannot be used; ignore it go back to step 1. Otherwise, pin the buffer, -and return it. +2. Select the buffer pointed to by nextVictimBuffer, and circularly advance +nextVictimBuffer for next time. Release buffer_strategy_lock. -3. Otherwise, the buffer free list is empty. Select the buffer pointed to by -nextVictimBuffer, and circularly advance nextVictimBuffer for next time. -Release buffer_strategy_lock. - -4. If the selected buffer is pinned or has a nonzero usage count, it cannot +3. If the selected buffer is pinned or has a nonzero usage count, it cannot be used. Decrement its usage count (if nonzero), reacquire -buffer_strategy_lock, and return to step 3 to examine the next buffer. +buffer_strategy_lock, and return to step 2 to examine the next buffer. -5. Pin the selected buffer, and return. +4. Pin the selected buffer, and return. 
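As an illustration of the renumbered steps (not part of the patch), the victim search reduces to a loop of the following shape. The helper name sweep_for_victim is invented here; the sketch omits the bgwriter wakeup, the strategy-ring path, and the trycounter that reports "no unpinned buffers available", and it decays the usage count by a fixed one per visit, whereas the patched StrategyGetBuffer() decays by the backend's bufferDecayRate:

    static BufferDesc *
    sweep_for_victim(uint32 *buf_state)
    {
        for (;;)
        {
            /* steps 1-2: advance the clock hand and take the buffer under it */
            BufferDesc *buf = GetBufferDescriptor(ClockSweepTick());
            uint32      state = LockBufHdr(buf);

            if (BUF_STATE_GET_REFCOUNT(state) == 0)
            {
                if (BUF_STATE_GET_USAGECOUNT(state) != 0)
                    state -= BUF_USAGECOUNT_ONE;   /* step 3: decay, keep sweeping */
                else
                {
                    *buf_state = state;            /* step 4: victim found */
                    return buf;                    /* header spinlock left held,
                                                    * matching StrategyGetBuffer() */
                }
            }
            UnlockBufHdr(buf, state);
        }
    }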
(Note that if the selected buffer is dirty, we will have to write it out before we can recycle it; if someone else pins the buffer meanwhile we will @@ -211,9 +195,9 @@ Buffer Ring Replacement Strategy When running a query that needs to access a large number of pages just once, such as VACUUM or a large sequential scan, a different strategy is used. A page that has been touched only by such a scan is unlikely to be needed -again soon, so instead of running the normal clock sweep algorithm and +again soon, so instead of running the normal clock-sweep algorithm and blowing out the entire buffer cache, a small ring of buffers is allocated -using the normal clock sweep algorithm and those buffers are reused for the +using the normal clock-sweep algorithm and those buffers are reused for the whole scan. This also implies that much of the write traffic caused by such a statement will be done by the backend itself and not pushed off onto other processes. @@ -234,7 +218,7 @@ the ring strategy effectively degrades to the normal strategy. VACUUM uses a ring like sequential scans, however, the size of this ring is controlled by the vacuum_buffer_usage_limit GUC. Dirty pages are not removed -from the ring. Instead, WAL is flushed if needed to allow reuse of the +from the ring. Instead, the WAL is flushed if needed to allow reuse of the buffers. Before introducing the buffer ring strategy in 8.3, VACUUM's buffers were sent to the freelist, which was effectively a buffer ring of 1 buffer, resulting in excessive WAL flushing. diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index ed1dc488a42b..dfc5e1f5696c 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -124,24 +124,18 @@ BufferManagerShmemInit(void) pg_atomic_init_u32(&buf->state, 0); buf->wait_backend_pgprocno = INVALID_PROC_NUMBER; + /* Initialize dirty backend tracking */ + pg_atomic_init_u32(&buf->dirty_backend_id, INVALID_PROC_NUMBER); + buf->buf_id = i; pgaio_wref_clear(&buf->io_wref); - /* - * Initially link all the buffers together as unused. Subsequent - * management of this list is done by freelist.c. - */ - buf->freeNext = i + 1; - LWLockInitialize(BufferDescriptorGetContentLock(buf), LWTRANCHE_BUFFER_CONTENT); ConditionVariableInit(BufferDescriptorGetIOCV(buf)); } - - /* Correct last entry of linked list */ - GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST; } /* Init other shared buffer-management stuff */ diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 350cc0402aa8..95bbd62bb6ed 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2094,12 +2094,6 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ UnpinBuffer(victim_buf_hdr); - /* - * The victim buffer we acquired previously is clean and unused, let - * it be found again quickly - */ - StrategyFreeBuffer(victim_buf_hdr); - /* remaining code should match code at top of routine */ existing_buf_hdr = GetBufferDescriptor(existing_buf_id); @@ -2142,6 +2136,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * just like permanent relations. 
*/ victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE; + if (MyProc != NULL) + pg_atomic_add_fetch_u32(&MyProc->bufferUsageSum, 1); if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM) victim_buf_state |= BM_PERMANENT; @@ -2158,8 +2154,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } /* - * InvalidateBuffer -- mark a shared buffer invalid and return it to the - * freelist. + * InvalidateBuffer -- mark a shared buffer invalid. * * The buffer header spinlock must be held at entry. We drop it before * returning. (This is sane because the caller must have locked the @@ -2257,11 +2252,6 @@ InvalidateBuffer(BufferDesc *buf) * Done with mapping lock. */ LWLockRelease(oldPartitionLock); - - /* - * Insert the buffer at the head of the list of free buffers. - */ - StrategyFreeBuffer(buf); } /* @@ -2679,11 +2669,6 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, { BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1); - /* - * The victim buffer we acquired previously is clean and unused, - * let it be found again quickly - */ - StrategyFreeBuffer(buf_hdr); UnpinBuffer(buf_hdr); } @@ -2756,12 +2741,6 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, valid = PinBuffer(existing_hdr, strategy); LWLockRelease(partition_lock); - - /* - * The victim buffer we acquired previously is clean and unused, - * let it be found again quickly - */ - StrategyFreeBuffer(victim_buf_hdr); UnpinBuffer(victim_buf_hdr); buffers[i] = BufferDescriptorGetBuffer(existing_hdr); @@ -2804,6 +2783,8 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, victim_buf_hdr->tag = tag; buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE; + if (MyProc != NULL) + pg_atomic_add_fetch_u32(&MyProc->bufferUsageSum, 1); if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM) buf_state |= BM_PERMANENT; @@ -2973,6 +2954,11 @@ MarkBufferDirty(Buffer buffer) Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); buf_state |= BM_DIRTY | BM_JUST_DIRTIED; + /* Track which backend dirtied this buffer */ + if (MyProc != NULL) + pg_atomic_write_u32(&bufHdr->dirty_backend_id, + MyProc - ProcGlobal->allProcs); + if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state, buf_state)) break; @@ -3608,7 +3594,7 @@ BufferSync(int flags) * This is called periodically by the background writer process. * * Returns true if it's appropriate for the bgwriter process to go into - * low-power hibernation mode. (This happens if the strategy clock sweep + * low-power hibernation mode. (This happens if the strategy clock-sweep * has been "lapped" and no buffer allocations have occurred recently, * or if the bgwriter has been effectively disabled by setting * bgwriter_lru_maxpages to 0.) @@ -3658,8 +3644,8 @@ BgBufferSync(WritebackContext *wb_context) uint32 new_recent_alloc; /* - * Find out where the freelist clock sweep currently is, and how many - * buffer allocations have happened since our last call. + * Find out where the clock-sweep currently is, and how many buffer + * allocations have happened since our last call. */ strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc); @@ -3679,8 +3665,8 @@ BgBufferSync(WritebackContext *wb_context) /* * Compute strategy_delta = how many buffers have been scanned by the - * clock sweep since last time. If first time through, assume none. Then - * see if we are still ahead of the clock sweep, and if so, how many + * clock-sweep since last time. If first time through, assume none. 
Then + * see if we are still ahead of the clock-sweep, and if so, how many * buffers we could scan before we'd catch up with it and "lap" it. Note: * weird-looking coding of xxx_passes comparisons are to avoid bogus * behavior when the passes counts wrap around. diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 01909be02725..7a7b8b1ab4e9 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -33,25 +33,17 @@ typedef struct slock_t buffer_strategy_lock; /* - * Clock sweep hand: index of next buffer to consider grabbing. Note that + * clock-sweep hand: index of next buffer to consider grabbing. Note that * this isn't a concrete buffer - we only ever increase the value. So, to * get an actual buffer, it needs to be used modulo NBuffers. */ pg_atomic_uint32 nextVictimBuffer; - int firstFreeBuffer; /* Head of list of unused buffers */ - int lastFreeBuffer; /* Tail of list of unused buffers */ - - /* - * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is, - * when the list is empty) - */ - /* * Statistics. These counters should be wide enough that they can't * overflow during a single bgwriter cycle. */ - uint32 completePasses; /* Complete cycles of the clock sweep */ + uint32 completePasses; /* Complete cycles of the clock-sweep */ pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */ /* @@ -89,7 +81,7 @@ typedef struct BufferAccessStrategyData * struct. */ Buffer buffers[FLEXIBLE_ARRAY_MEMBER]; -} BufferAccessStrategyData; +} BufferAccessStrategyData; /* Prototypes for internal functions */ @@ -163,23 +155,6 @@ ClockSweepTick(void) return victim; } -/* - * have_free_buffer -- a lockless check to see if there is a free buffer in - * buffer pool. - * - * If the result is true that will become stale once free buffers are moved out - * by other operations, so the caller who strictly want to use a free buffer - * should not call this. - */ -bool -have_free_buffer(void) -{ - if (StrategyControl->firstFreeBuffer >= 0) - return true; - else - return false; -} - /* * StrategyGetBuffer * @@ -199,6 +174,14 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r int bgwprocno; int trycounter; uint32 local_buf_state; /* to avoid repeated (de-)referencing */ + uint32 backend_decay_rate; + + /* Clock-sweep performance tracking */ + instr_time start_time, + end_time; + uint64 buffers_examined = 0; + uint32 complete_passes = 0; + uint32 initial_clock_hand; *from_ring = false; @@ -216,6 +199,18 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r } } + initial_clock_hand = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer); + if (initial_clock_hand >= NBuffers) + initial_clock_hand %= NBuffers; + + /* Start timing the buffer search */ + INSTR_TIME_SET_CURRENT(start_time); + + /* Get this backend's personalized decay rate */ + backend_decay_rate = pg_atomic_read_u32(&MyProc->bufferDecayRate); + if (backend_decay_rate == 0) + backend_decay_rate = 1; + /* * If asked, we need to waken the bgwriter. Since we don't want to rely on * a spinlock for this we force a read from shared memory once, and then @@ -249,73 +244,14 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r */ pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1); - /* - * First check, without acquiring the lock, whether there's buffers in the - * freelist. 
Since we otherwise don't require the spinlock in every - * StrategyGetBuffer() invocation, it'd be sad to acquire it here - - * uselessly in most cases. That obviously leaves a race where a buffer is - * put on the freelist but we don't see the store yet - but that's pretty - * harmless, it'll just get used during the next buffer acquisition. - * - * If there's buffers on the freelist, acquire the spinlock to pop one - * buffer of the freelist. Then check whether that buffer is usable and - * repeat if not. - * - * Note that the freeNext fields are considered to be protected by the - * buffer_strategy_lock not the individual buffer spinlocks, so it's OK to - * manipulate them without holding the spinlock. - */ - if (StrategyControl->firstFreeBuffer >= 0) - { - while (true) - { - /* Acquire the spinlock to remove element from the freelist */ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - - if (StrategyControl->firstFreeBuffer < 0) - { - SpinLockRelease(&StrategyControl->buffer_strategy_lock); - break; - } - - buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer); - Assert(buf->freeNext != FREENEXT_NOT_IN_LIST); - - /* Unconditionally remove buffer from freelist */ - StrategyControl->firstFreeBuffer = buf->freeNext; - buf->freeNext = FREENEXT_NOT_IN_LIST; - - /* - * Release the lock so someone else can access the freelist while - * we check out this buffer. - */ - SpinLockRelease(&StrategyControl->buffer_strategy_lock); - - /* - * If the buffer is pinned or has a nonzero usage_count, we cannot - * use it; discard it and retry. (This can only happen if VACUUM - * put a valid buffer in the freelist and then someone else used - * it before we got to it. It's probably impossible altogether as - * of 8.3, but we'd better check anyway.) - */ - local_buf_state = LockBufHdr(buf); - if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 - && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0) - { - if (strategy != NULL) - AddBufferToRing(strategy, buf); - *buf_state = local_buf_state; - return buf; - } - UnlockBufHdr(buf, local_buf_state); - } - } - - /* Nothing on the freelist, so run the "clock sweep" algorithm */ + /* Use the "clock sweep" algorithm to find a free buffer */ trycounter = NBuffers; for (;;) { - buf = GetBufferDescriptor(ClockSweepTick()); + uint32 hand = ClockSweepTick(); + + buf = GetBufferDescriptor(hand); + buffers_examined++; /* * If the buffer is pinned or has a nonzero usage_count, we cannot use @@ -325,18 +261,53 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0) { - if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0) + uint32 current_usage = BUF_STATE_GET_USAGECOUNT(local_buf_state); + + if (current_usage != 0) { - local_buf_state -= BUF_USAGECOUNT_ONE; + uint32 current_sum; + uint32 new_sum; + uint32 decay_amount = Min(current_usage, backend_decay_rate); + + local_buf_state -= decay_amount * BUF_USAGECOUNT_ONE; + + do + { + current_sum = pg_atomic_read_u32(&MyProc->bufferUsageSum); + if (current_sum < decay_amount) + new_sum = 0; + else + new_sum = current_sum - decay_amount; + } while (!pg_atomic_compare_exchange_u32(&MyProc->bufferUsageSum, + ¤t_sum, new_sum)); trycounter = NBuffers; } else { + uint64 search_time_micros; + + INSTR_TIME_SET_CURRENT(end_time); + INSTR_TIME_SUBTRACT(end_time, start_time); + + search_time_micros = INSTR_TIME_GET_MICROSEC(end_time); + + /* Update this backend's clock-sweep performance metrics */ + pg_atomic_add_fetch_u64(&MyProc->clockSweepDistance, 
buffers_examined); + pg_atomic_add_fetch_u32(&MyProc->clockSweepPasses, complete_passes); + pg_atomic_add_fetch_u64(&MyProc->clockSweepTimeMicros, search_time_micros); + pg_atomic_add_fetch_u32(&MyProc->bufferSearchCount, 1); + + elog(DEBUG2, "Buffer search completed: examined=%lu, passes=%u, time=%luμs, decay_rate=%u", + buffers_examined, complete_passes, search_time_micros, backend_decay_rate); + /* Found a usable buffer */ if (strategy != NULL) AddBufferToRing(strategy, buf); *buf_state = local_buf_state; + + pg_atomic_add_fetch_u32(&MyProc->bufferUsageSum, 1); + return buf; } } @@ -353,30 +324,10 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r elog(ERROR, "no unpinned buffers available"); } UnlockBufHdr(buf, local_buf_state); - } -} - -/* - * StrategyFreeBuffer: put a buffer on the freelist - */ -void -StrategyFreeBuffer(BufferDesc *buf) -{ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - /* - * It is possible that we are told to put something in the freelist that - * is already in it; don't screw up the list if so. - */ - if (buf->freeNext == FREENEXT_NOT_IN_LIST) - { - buf->freeNext = StrategyControl->firstFreeBuffer; - if (buf->freeNext < 0) - StrategyControl->lastFreeBuffer = buf->buf_id; - StrategyControl->firstFreeBuffer = buf->buf_id; + if (buffers_examined > 1 && hand == initial_clock_hand) + complete_passes++; } - - SpinLockRelease(&StrategyControl->buffer_strategy_lock); } /* @@ -415,6 +366,7 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc) { *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0); } + SpinLockRelease(&StrategyControl->buffer_strategy_lock); return result; } @@ -504,14 +456,7 @@ StrategyInitialize(bool init) SpinLockInit(&StrategyControl->buffer_strategy_lock); - /* - * Grab the whole linked list of free buffers for our strategy. We - * assume it was previously set up by BufferManagerShmemInit(). - */ - StrategyControl->firstFreeBuffer = 0; - StrategyControl->lastFreeBuffer = NBuffers - 1; - - /* Initialize the clock sweep pointer */ + /* Initialize the clock-sweep pointer */ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); /* Clear statistics */ @@ -759,7 +704,7 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) * * If usage_count is 0 or 1 then the buffer is fair game (we expect 1, * since our own previous usage of the ring element would have left it - * there, but it might've been decremented by clock sweep since then). A + * there, but it might've been decremented by clock-sweep since then). A * higher usage_count indicates someone else has touched the buffer, so we * shouldn't re-use it. */ diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3c0d20f4659d..04fef13409b0 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -229,7 +229,7 @@ GetLocalVictimBuffer(void) ResourceOwnerEnlarge(CurrentResourceOwner); /* - * Need to get a new buffer. We use a clock sweep algorithm (essentially + * Need to get a new buffer. We use a clock-sweep algorithm (essentially * the same as what freelist.c does now...) 
*/ trycounter = NLocBuffer; diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index e9ef0fbfe32c..fdb2554e3f5b 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -528,6 +528,14 @@ InitProcess(void) MyProc->clogGroupMemberLsn = InvalidXLogRecPtr; Assert(pg_atomic_read_u32(&MyProc->clogGroupNext) == INVALID_PROC_NUMBER); + /* Initialize buffer usage tracking */ + pg_atomic_init_u32(&MyProc->bufferUsageSum, 0); + pg_atomic_init_u32(&MyProc->bufferDecayRate, 1); + pg_atomic_init_u64(&MyProc->clockSweepDistance, 0); + pg_atomic_init_u32(&MyProc->clockSweepPasses, 0); + pg_atomic_init_u64(&MyProc->clockSweepTimeMicros, 0); + pg_atomic_init_u32(&MyProc->bufferSearchCount, 0); + /* * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch * on it. That allows us to repoint the process latch, which so far diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 52a71b138f73..ac87bd90afd1 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -80,8 +80,8 @@ StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32, * The maximum allowed value of usage_count represents a tradeoff between * accuracy and speed of the clock-sweep buffer management algorithm. A * large value (comparable to NBuffers) would approximate LRU semantics. - * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of - * clock sweeps to find a free buffer, so in practice we don't want the + * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of the + * clock-sweep hand to find a free buffer, so in practice we don't want the * value to be very large. */ #define BM_MAX_USAGE_COUNT 5 @@ -217,8 +217,7 @@ BufMappingPartitionLockByIndex(uint32 index) * single atomic variable. This layout allow us to do some operations in a * single atomic operation, without actually acquiring and releasing spinlock; * for instance, increase or decrease refcount. buf_id field never changes - * after initialization, so does not need locking. freeNext is protected by - * the buffer_strategy_lock not buffer header lock. The LWLock can take care + * after initialization, so does not need locking. The LWLock can take care * of itself. The buffer header lock is *not* used to control access to the * data in the buffer! * @@ -264,10 +263,11 @@ typedef struct BufferDesc pg_atomic_uint32 state; int wait_backend_pgprocno; /* backend of pin-count waiter */ - int freeNext; /* link in freelist chain */ PgAioWaitRef io_wref; /* set iff AIO is in progress */ LWLock content_lock; /* to lock access to buffer contents */ + + pg_atomic_uint32 dirty_backend_id; /* backend ID that last dirtied this buffer */ } BufferDesc; /* @@ -360,13 +360,6 @@ BufferDescriptorGetContentLock(const BufferDesc *bdesc) return (LWLock *) (&bdesc->content_lock); } -/* - * The freeNext field is either the index of the next freelist entry, - * or one of these special values: - */ -#define FREENEXT_END_OF_LIST (-1) -#define FREENEXT_NOT_IN_LIST (-2) - /* * Functions for acquiring/releasing a shared buffer header's spinlock. Do * not apply these to local buffers! 
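For orientation (not part of the patch): the per-backend counters initialized above, and declared in the proc.h hunk below, are read back by the bgwriter. A hypothetical helper, mirroring how UpdateBackendDecayRates() turns them into per-search averages, might look like this:

    static double
    avg_sweep_distance(PGPROC *proc)
    {
        /* buffers examined per victim search, averaged over all searches */
        uint32      searches = pg_atomic_read_u32(&proc->bufferSearchCount);
        uint64      distance = pg_atomic_read_u64(&proc->clockSweepDistance);

        return searches > 0 ? (double) distance / searches : 0.0;
    }

An average above NBuffers / 2 is what the patch treats as a struggling backend and answers with a higher personalized decay rate.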
@@ -453,7 +446,6 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); -extern bool have_free_buffer(void); /* buf_table.c */ extern Size BufTableShmemSize(int size); diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index c6f5ebceefdd..e5daaf992764 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -247,6 +247,16 @@ struct PGPROC uint8 lwWaitMode; /* lwlock mode being waited for */ proclist_node lwWaitLink; /* position in LW lock wait list */ + /* Per-backend buffer usage tracking */ + pg_atomic_uint32 bufferUsageSum; /* Running total of buffer usage */ + pg_atomic_uint32 bufferDecayRate; /* Per-tick usage decay rate */ + + /* Clock-sweep performance metrics */ + pg_atomic_uint64 clockSweepDistance; /* Total buffers examined */ + pg_atomic_uint32 clockSweepPasses; /* Complete clock passes */ + pg_atomic_uint64 clockSweepTimeMicros; /* Total time in microseconds */ + pg_atomic_uint32 bufferSearchCount; /* Number of buffer searches */ + /* Support for condition variables. */ proclist_node cvWaitLink; /* position in CV wait list */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index a13e81628902..518f7aa3a92b 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -234,6 +234,7 @@ BTWriteState BUF_MEM BYTE BY_HANDLE_FILE_INFORMATION +BackendBufferStats BackendParameters BackendStartupData BackendState @@ -336,6 +337,7 @@ Bucket BufFile Buffer BufferAccessStrategy +BufferAccessStrategyData BufferAccessStrategyType BufferCacheNumaContext BufferCacheNumaRec
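Worked example of the pressure calculation added in bgwriter.c (hypothetical numbers): with the default shared_buffers of 16384 buffers and BM_MAX_USAGE_COUNT of 5, max_possible is 16384 * 5 = 81920. If the backends' bufferUsageSum values add up to 70000, pressure_ratio is 70000 / 81920, roughly 0.85, so the 75% "high pressure" branch runs and AggressiveBufferWrite() targets bgwriter_lru_maxpages * 2 buffers, while the 90% "critical" branch (three times the target) stays idle.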