From e67432a656372d347e6d025700f4c17b1ca859f8 Mon Sep 17 00:00:00 2001
From: Ashutosh Bapat
Date: Mon, 25 Aug 2025 19:23:50 +0530
Subject: [PATCH 01/16] Add system view for shared buffer lookup table

The view exposes the contents of the shared buffer lookup table for
debugging, testing and investigation.

TODO: It is better to place this view in pg_buffercache. But it's added
as a system view since BufHashTable is not exposed outside buf_table.c.
To move it to pg_buffercache, we should move the function
pg_get_buffer_lookup_table() to pg_buffercache, which would invoke
BufTableGetContent() by passing it the tuple store and tuple
descriptor. BufTableGetContent fills the tuple store. The partitions
are locked by pg_get_buffer_lookup_table().

Author: Ashutosh Bapat
---
 doc/src/sgml/system-views.sgml          | 89 ++++++++++++++++++++++++++
 src/backend/catalog/system_views.sql    |  7 ++
 src/backend/storage/buffer/buf_table.c  | 61 ++++++++++++++++++
 src/include/catalog/pg_proc.dat         | 11 ++++
 src/test/regress/expected/rules.out     |  7 ++
 5 files changed, 175 insertions(+)

diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml
index 4187191ea741..89be9bc333fe 100644
--- a/doc/src/sgml/system-views.sgml
+++ b/doc/src/sgml/system-views.sgml
@@ -71,6 +71,11 @@
       <entry>backend memory contexts</entry>
      </row>

+     <row>
+      <entry><link linkend="view-pg-buffer-lookup-table"><structname>pg_buffer_lookup_table</structname></link></entry>
+      <entry>shared buffer lookup table</entry>
+     </row>
+
      <row>
       <entry><link linkend="view-pg-config"><structname>pg_config</structname></link></entry>
       <entry>compile-time configuration parameters</entry>
@@ -896,6 +901,90 @@ AND c1.path[c2.level] = c2.path[c2.level];
  </sect1>

+ <sect1 id="view-pg-buffer-lookup-table">
+  <title><structname>pg_buffer_lookup_table</structname></title>
+
+  <indexterm zone="view-pg-buffer-lookup-table">
+   <primary>pg_buffer_lookup_table</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_buffer_lookup_table</structname> view exposes the
+   current contents of the shared buffer lookup table. Each row represents
+   an entry in the lookup table, mapping a relation page to the ID of the
+   buffer in which it is cached. The shared buffer lookup table is locked
+   for a short duration while reading, so as to ensure consistency. This may
+   affect performance if this view is queried very frequently.
+  </para>
+
+  <table>
+   <title><structname>pg_buffer_lookup_table</structname> View</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>tablespace</structfield> <type>oid</type>
+      </para>
+      <para>
+       OID of the tablespace containing the relation
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>database</structfield> <type>oid</type>
+      </para>
+      <para>
+       OID of the database containing the relation (zero for shared
+       relations)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>relfilenode</structfield> <type>oid</type>
+      </para>
+      <para>
+       relfilenode identifying the relation
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>forknum</structfield> <type>int2</type>
+      </para>
+      <para>
+       Fork number within the relation
+       (see <xref linkend="storage-file-layout"/>)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>blocknum</structfield> <type>int8</type>
+      </para>
+      <para>
+       Block number within the relation
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>bufferid</structfield> <type>int4</type>
+      </para>
+      <para>
+       ID of the buffer caching the page
+      </para></entry>
+     </row>
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   Access to this view is restricted to members of the
+   <literal>pg_read_all_stats</literal> role by default.
+  </para>
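For context, each row of the view corresponds to one entry on the probe path that the buffer manager itself follows. Below is a minimal C sketch of that probe, using the existing buf_table.c API; it assumes a RelFileLocator rlocator and a BlockNumber blkno are already at hand, and is an editorial illustration rather than part of this patch:

	/* Sketch: how core code consults the lookup table this view exposes. */
	BufferTag	tag;
	uint32		hash;
	LWLock	   *partition_lock;
	int			buf_id;

	InitBufferTag(&tag, &rlocator, MAIN_FORKNUM, blkno);
	hash = BufTableHashCode(&tag);
	partition_lock = BufMappingPartitionLock(hash);

	/* Only the one partition covering this tag needs to be locked. */
	LWLockAcquire(partition_lock, LW_SHARED);
	buf_id = BufTableLookup(&tag, hash);	/* -1 if the page is not cached */
	LWLockRelease(partition_lock);

The view differs from this single-tag probe in that it must lock all partitions at once to obtain a consistent snapshot of the whole table, which is why frequent querying can affect performance.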
+ </sect1>
+
  <sect1 id="view-pg-config">
   <title><structname>pg_config</structname></title>

diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index c77fa0234bb7..46fc28396de9 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1420,3 +1420,10 @@ REVOKE ALL ON pg_aios FROM PUBLIC;
 GRANT SELECT ON pg_aios TO pg_read_all_stats;
 REVOKE EXECUTE ON FUNCTION pg_get_aios() FROM PUBLIC;
 GRANT EXECUTE ON FUNCTION pg_get_aios() TO pg_read_all_stats;
+
+CREATE VIEW pg_buffer_lookup_table AS
+    SELECT * FROM pg_get_buffer_lookup_table();
+REVOKE ALL ON pg_buffer_lookup_table FROM PUBLIC;
+GRANT SELECT ON pg_buffer_lookup_table TO pg_read_all_stats;
+REVOKE EXECUTE ON FUNCTION pg_get_buffer_lookup_table() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_buffer_lookup_table() TO pg_read_all_stats;
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
index 9d256559bab9..1f6e215a2ca3 100644
--- a/src/backend/storage/buffer/buf_table.c
+++ b/src/backend/storage/buffer/buf_table.c
@@ -21,7 +21,12 @@
  */
 #include "postgres.h"

+#include "fmgr.h"
+#include "funcapi.h"
 #include "storage/buf_internals.h"
+#include "storage/lwlock.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"

 /* entry for buffer lookup hashtable */
 typedef struct
@@ -159,3 +164,59 @@ BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
 	if (!result)				/* shouldn't happen */
 		elog(ERROR, "shared buffer hash table corrupted");
 }
+
+/*
+ * SQL-callable function to report the contents of the shared buffer lookup
+ * table.
+ */
+Datum
+pg_get_buffer_lookup_table(PG_FUNCTION_ARGS)
+{
+#define PG_GET_BUFFER_LOOKUP_TABLE_COLS 6
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	HASH_SEQ_STATUS hstat;
+	BufferLookupEnt *ent;
+	Datum		values[PG_GET_BUFFER_LOOKUP_TABLE_COLS];
+	bool		nulls[PG_GET_BUFFER_LOOKUP_TABLE_COLS];
+	int			i;
+
+	memset(nulls, 0, sizeof(nulls));
+
+	/*
+	 * We put all the tuples into a tuplestore in one scan of the hashtable.
+	 * This avoids any issue of the hashtable possibly changing between calls.
+	 */
+	InitMaterializedSRF(fcinfo, 0);
+
+	Assert(rsinfo->setDesc->natts == PG_GET_BUFFER_LOOKUP_TABLE_COLS);
+
+	/*
+	 * Lock all buffer mapping partitions to ensure a consistent view of the
+	 * hash table during the scan. Must grab LWLocks in partition-number
+	 * order to avoid LWLock deadlock.
+	 */
+	for (i = 0; i < NUM_BUFFER_PARTITIONS; i++)
+		LWLockAcquire(BufMappingPartitionLockByIndex(i), LW_SHARED);
+
+	hash_seq_init(&hstat, SharedBufHash);
+	while ((ent = (BufferLookupEnt *) hash_seq_search(&hstat)) != NULL)
+	{
+		values[0] = ObjectIdGetDatum(ent->key.spcOid);
+		values[1] = ObjectIdGetDatum(ent->key.dbOid);
+		values[2] = ObjectIdGetDatum(ent->key.relNumber);
+		/* declared output types are int2 and int8, respectively */
+		values[3] = Int16GetDatum((int16) ent->key.forkNum);
+		values[4] = Int64GetDatum((int64) ent->key.blockNum);
+		values[5] = Int32GetDatum(ent->id);
+
+		tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+							 values, nulls);
+	}
+
+	/*
+	 * Release all buffer mapping partition locks in the reverse order so as
+	 * to avoid LWLock deadlock.
+	 */
+	for (i = NUM_BUFFER_PARTITIONS - 1; i >= 0; i--)
+		LWLockRelease(BufMappingPartitionLockByIndex(i));
+
+	return (Datum) 0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 01eba3b5a190..229999ff2623 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -8592,6 +8592,17 @@
   proargmodes => '{o,o,o}',
   proargnames => '{name,type,size}',
   prosrc => 'pg_get_dsm_registry_allocations' },

+# buffer lookup table
+{ oid => '5102',
+  descr => 'shared buffer lookup table',
+  proname => 'pg_get_buffer_lookup_table', prorows => '6', proretset => 't',
+  provolatile => 'v', prorettype => 'record',
+  proargtypes => '', proallargtypes => '{oid,oid,oid,int2,int8,int4}',
+  proargmodes => '{o,o,o,o,o,o}',
+  proargnames => '{tablespace,database,relfilenode,forknum,blocknum,bufferid}',
+  prosrc => 'pg_get_buffer_lookup_table' },
+
 # memory context of local backend
 { oid => '2282',
   descr => 'information about all memory contexts of local backend',
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 35e8aad7701b..760bb13fe95b 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1330,6 +1330,13 @@ pg_backend_memory_contexts| SELECT name,
     free_chunks,
     used_bytes
    FROM pg_get_backend_memory_contexts() pg_get_backend_memory_contexts(name, ident, type, level, path, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes);
+pg_buffer_lookup_table| SELECT tablespace,
+    database,
+    relfilenode,
+    forknum,
+    blocknum,
+    bufferid
+   FROM pg_get_buffer_lookup_table() pg_get_buffer_lookup_table(tablespace, database, relfilenode, forknum, blocknum, bufferid);
 pg_config| SELECT name,
     setting
    FROM pg_config() pg_config(name, setting);

From c2818633e00fac29c3afb7c6bd1d9565eb33df56 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Tue, 17 Jun 2025 15:14:33 +0200
Subject: [PATCH 02/16] Process config reload in AIO workers

Currently AIO workers process interrupts only via CHECK_FOR_INTERRUPTS,
which does not include ConfigReloadPending. Thus we need to check for
it explicitly.
---
 src/backend/storage/aio/method_worker.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c
index b5ac073a910d..d1c6da89c4b3 100644
--- a/src/backend/storage/aio/method_worker.c
+++ b/src/backend/storage/aio/method_worker.c
@@ -80,6 +80,7 @@ static void pgaio_worker_shmem_init(bool first_time);
 static bool pgaio_worker_needs_synchronous_execution(PgAioHandle *ioh);
 static int	pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios);
+static void pgaio_worker_process_interrupts(void);

 const IoMethodOps pgaio_worker_ops = {
 	.shmem_size = pgaio_worker_shmem_size,
@@ -463,6 +464,8 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
 		int			nwakeups = 0;
 		int			worker;

+		pgaio_worker_process_interrupts();
+
 		/*
 		 * Try to get a job to do.
 		 *
@@ -592,3 +595,25 @@ pgaio_workers_enabled(void)
 {
 	return io_method == IOMETHOD_WORKER;
 }
+
+/*
+ * Process any new interrupts.
+ */
+static void
+pgaio_worker_process_interrupts(void)
+{
+	/*
+	 * Reloading the config can trigger further signals, complicating
+	 * interrupt processing -- so let it run first.
+	 *
+	 * XXX: Is there any need for a memory barrier after ProcessConfigFile?
+	 */
+	if (ConfigReloadPending)
+	{
+		ConfigReloadPending = false;
+		ProcessConfigFile(PGC_SIGHUP);
+	}
+
+	if (ProcSignalBarrierPending)
+		ProcessProcSignalBarrier();
+}

From 6a3a9b476c44cbb66c01f0c88956e7af89583e9f Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Sun, 6 Apr 2025 16:40:32 +0200
Subject: [PATCH 03/16] Introduce pending flag for GUC assign hooks

Currently an assign hook can perform some preprocessing of a new value,
but it cannot change the behavior, which dictates that the new value
will be applied immediately after the hook. Certain GUC options (like
shared_buffers, coming in subsequent patches) may need coordinated work
between backends to change, meaning we cannot apply the value right
away.

Add a new flag "pending" to allow an assign hook to indicate exactly
that. If the pending flag is set after the hook, the new value will not
be applied, and its handling becomes the responsibility of the hook's
implementation.

Note that this also requires changes in how GUCs are reported, but this
patch does not cover that yet.
---
 src/backend/access/transam/xlog.c    |  2 +-
 src/backend/commands/variable.c      |  6 +--
 src/backend/libpq/pqcomm.c           |  8 ++--
 src/backend/tcop/postgres.c          |  2 +-
 src/backend/utils/misc/guc.c         | 59 +++++++++++++++++++---------
 src/backend/utils/misc/stack_depth.c |  2 +-
 src/include/utils/guc.h              |  2 +-
 src/include/utils/guc_hooks.h        | 20 +++++-----
 8 files changed, 61 insertions(+), 40 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index eceab3412558..cc48b253bc86 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2197,7 +2197,7 @@ CalculateCheckpointSegments(void)
 }

 void
-assign_max_wal_size(int newval, void *extra)
+assign_max_wal_size(int newval, void *extra, bool *pending)
 {
 	max_wal_size_mb = newval;
 	CalculateCheckpointSegments();
diff --git a/src/backend/commands/variable.c b/src/backend/commands/variable.c
index 608f10d9412d..e40dae2ddf2d 100644
--- a/src/backend/commands/variable.c
+++ b/src/backend/commands/variable.c
@@ -1143,7 +1143,7 @@ check_cluster_name(char **newval, void **extra, GucSource source)
  * GUC assign_hook for maintenance_io_concurrency
  */
 void
-assign_maintenance_io_concurrency(int newval, void *extra)
+assign_maintenance_io_concurrency(int newval, void *extra, bool *pending)
 {
 	/*
 	 * Reconfigure recovery prefetching, because a setting it depends on
@@ -1161,12 +1161,12 @@ assign_maintenance_io_concurrency(int newval, void *extra)
  * they may be assigned in either order.
*/ void -assign_io_max_combine_limit(int newval, void *extra) +assign_io_max_combine_limit(int newval, void *extra, bool *pending) { io_combine_limit = Min(newval, io_combine_limit_guc); } void -assign_io_combine_limit(int newval, void *extra) +assign_io_combine_limit(int newval, void *extra, bool *pending) { io_combine_limit = Min(io_max_combine_limit, newval); } diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index 25f739a6a17d..1726a7c0993f 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -1951,7 +1951,7 @@ pq_settcpusertimeout(int timeout, Port *port) * GUC assign_hook for tcp_keepalives_idle */ void -assign_tcp_keepalives_idle(int newval, void *extra) +assign_tcp_keepalives_idle(int newval, void *extra, bool *pending) { /* * The kernel API provides no way to test a value without setting it; and @@ -1984,7 +1984,7 @@ show_tcp_keepalives_idle(void) * GUC assign_hook for tcp_keepalives_interval */ void -assign_tcp_keepalives_interval(int newval, void *extra) +assign_tcp_keepalives_interval(int newval, void *extra, bool *pending) { /* See comments in assign_tcp_keepalives_idle */ (void) pq_setkeepalivesinterval(newval, MyProcPort); @@ -2007,7 +2007,7 @@ show_tcp_keepalives_interval(void) * GUC assign_hook for tcp_keepalives_count */ void -assign_tcp_keepalives_count(int newval, void *extra) +assign_tcp_keepalives_count(int newval, void *extra, bool *pending) { /* See comments in assign_tcp_keepalives_idle */ (void) pq_setkeepalivescount(newval, MyProcPort); @@ -2030,7 +2030,7 @@ show_tcp_keepalives_count(void) * GUC assign_hook for tcp_user_timeout */ void -assign_tcp_user_timeout(int newval, void *extra) +assign_tcp_user_timeout(int newval, void *extra, bool *pending) { /* See comments in assign_tcp_keepalives_idle */ (void) pq_settcpusertimeout(newval, MyProcPort); diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index d356830f756b..8d4d6cc3f333 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3596,7 +3596,7 @@ check_log_stats(bool *newval, void **extra, GucSource source) /* GUC assign hook for transaction_timeout */ void -assign_transaction_timeout(int newval, void *extra) +assign_transaction_timeout(int newval, void *extra, bool *pending) { if (IsTransactionState()) { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 46fdefebe353..0d5e523aaf00 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1680,6 +1680,7 @@ InitializeOneGUCOption(struct config_generic *gconf) struct config_int *conf = (struct config_int *) gconf; int newval = conf->boot_val; void *extra = NULL; + bool pending = false; Assert(newval >= conf->min); Assert(newval <= conf->max); @@ -1688,9 +1689,13 @@ InitializeOneGUCOption(struct config_generic *gconf) elog(FATAL, "failed to initialize %s to %d", conf->gen.name, newval); if (conf->assign_hook) - conf->assign_hook(newval, extra); - *conf->variable = conf->reset_val = newval; - conf->gen.extra = conf->reset_extra = extra; + conf->assign_hook(newval, extra, &pending); + + if (!pending) + { + *conf->variable = conf->reset_val = newval; + conf->gen.extra = conf->reset_extra = extra; + } break; } case PGC_REAL: @@ -2046,13 +2051,18 @@ ResetAllOptions(void) case PGC_INT: { struct config_int *conf = (struct config_int *) gconf; + bool pending = false; if (conf->assign_hook) conf->assign_hook(conf->reset_val, - conf->reset_extra); - *conf->variable = conf->reset_val; - set_extra_field(&conf->gen, &conf->gen.extra, - 
conf->reset_extra); + conf->reset_extra, + &pending); + if (!pending) + { + *conf->variable = conf->reset_val; + set_extra_field(&conf->gen, &conf->gen.extra, + conf->reset_extra); + } break; } case PGC_REAL: @@ -2429,16 +2439,21 @@ AtEOXact_GUC(bool isCommit, int nestLevel) struct config_int *conf = (struct config_int *) gconf; int newval = newvalue.val.intval; void *newextra = newvalue.extra; + bool pending = false; if (*conf->variable != newval || conf->gen.extra != newextra) { if (conf->assign_hook) - conf->assign_hook(newval, newextra); - *conf->variable = newval; - set_extra_field(&conf->gen, &conf->gen.extra, - newextra); - changed = true; + conf->assign_hook(newval, newextra, &pending); + + if (!pending) + { + *conf->variable = newval; + set_extra_field(&conf->gen, &conf->gen.extra, + newextra); + changed = true; + } } break; } @@ -3855,18 +3870,24 @@ set_config_with_handle(const char *name, config_handle *handle, if (changeVal) { + bool pending = false; + /* Save old value to support transaction abort */ if (!makeDefault) push_old_value(&conf->gen, action); if (conf->assign_hook) - conf->assign_hook(newval, newextra); - *conf->variable = newval; - set_extra_field(&conf->gen, &conf->gen.extra, - newextra); - set_guc_source(&conf->gen, source); - conf->gen.scontext = context; - conf->gen.srole = srole; + conf->assign_hook(newval, newextra, &pending); + + if (!pending) + { + *conf->variable = newval; + set_extra_field(&conf->gen, &conf->gen.extra, + newextra); + set_guc_source(&conf->gen, source); + conf->gen.scontext = context; + conf->gen.srole = srole; + } } if (makeDefault) { diff --git a/src/backend/utils/misc/stack_depth.c b/src/backend/utils/misc/stack_depth.c index 8f7cf531fbc5..ef59ae62008d 100644 --- a/src/backend/utils/misc/stack_depth.c +++ b/src/backend/utils/misc/stack_depth.c @@ -156,7 +156,7 @@ check_max_stack_depth(int *newval, void **extra, GucSource source) /* GUC assign hook for max_stack_depth */ void -assign_max_stack_depth(int newval, void *extra) +assign_max_stack_depth(int newval, void *extra, bool *pending) { ssize_t newval_bytes = newval * (ssize_t) 1024; diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index f21ec37da893..c3056cd2da81 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -187,7 +187,7 @@ typedef bool (*GucStringCheckHook) (char **newval, void **extra, GucSource sourc typedef bool (*GucEnumCheckHook) (int *newval, void **extra, GucSource source); typedef void (*GucBoolAssignHook) (bool newval, void *extra); -typedef void (*GucIntAssignHook) (int newval, void *extra); +typedef void (*GucIntAssignHook) (int newval, void *extra, bool *pending); typedef void (*GucRealAssignHook) (double newval, void *extra); typedef void (*GucStringAssignHook) (const char *newval, void *extra); typedef void (*GucEnumAssignHook) (int newval, void *extra); diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index 82ac8646a8d4..658c799419e9 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -81,12 +81,12 @@ extern bool check_log_stats(bool *newval, void **extra, GucSource source); extern bool check_log_timezone(char **newval, void **extra, GucSource source); extern void assign_log_timezone(const char *newval, void *extra); extern const char *show_log_timezone(void); -extern void assign_maintenance_io_concurrency(int newval, void *extra); -extern void assign_io_max_combine_limit(int newval, void *extra); -extern void assign_io_combine_limit(int newval, void *extra); -extern void 
assign_max_wal_size(int newval, void *extra);
+extern void assign_maintenance_io_concurrency(int newval, void *extra, bool *pending);
+extern void assign_io_max_combine_limit(int newval, void *extra, bool *pending);
+extern void assign_io_combine_limit(int newval, void *extra, bool *pending);
+extern void assign_max_wal_size(int newval, void *extra, bool *pending);
 extern bool check_max_stack_depth(int *newval, void **extra, GucSource source);
-extern void assign_max_stack_depth(int newval, void *extra);
+extern void assign_max_stack_depth(int newval, void *extra, bool *pending);
 extern bool check_multixact_member_buffers(int *newval, void **extra,
 											GucSource source);
 extern bool check_multixact_offset_buffers(int *newval, void **extra,
@@ -141,13 +141,13 @@ extern void assign_synchronous_standby_names(const char *newval, void *extra);
 extern void assign_synchronous_commit(int newval, void *extra);
 extern void assign_syslog_facility(int newval, void *extra);
 extern void assign_syslog_ident(const char *newval, void *extra);
-extern void assign_tcp_keepalives_count(int newval, void *extra);
+extern void assign_tcp_keepalives_count(int newval, void *extra, bool *pending);
 extern const char *show_tcp_keepalives_count(void);
-extern void assign_tcp_keepalives_idle(int newval, void *extra);
+extern void assign_tcp_keepalives_idle(int newval, void *extra, bool *pending);
 extern const char *show_tcp_keepalives_idle(void);
-extern void assign_tcp_keepalives_interval(int newval, void *extra);
+extern void assign_tcp_keepalives_interval(int newval, void *extra, bool *pending);
 extern const char *show_tcp_keepalives_interval(void);
-extern void assign_tcp_user_timeout(int newval, void *extra);
+extern void assign_tcp_user_timeout(int newval, void *extra, bool *pending);
 extern const char *show_tcp_user_timeout(void);
 extern bool check_temp_buffers(int *newval, void **extra, GucSource source);
 extern bool check_temp_tablespaces(char **newval, void **extra,
@@ -163,7 +163,7 @@ extern bool check_transaction_buffers(int *newval, void **extra, GucSource sourc
 extern bool check_transaction_deferrable(bool *newval, void **extra, GucSource source);
 extern bool check_transaction_isolation(int *newval, void **extra, GucSource source);
 extern bool check_transaction_read_only(bool *newval, void **extra, GucSource source);
-extern void assign_transaction_timeout(int newval, void *extra);
+extern void assign_transaction_timeout(int newval, void *extra, bool *pending);
 extern const char *show_unix_socket_permissions(void);
 extern bool check_wal_buffers(int *newval, void **extra, GucSource source);
 extern bool check_wal_consistency_checking(char **newval, void **extra,

From 11274f14b56354ac31956d0bd475112a302179d9 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Fri, 4 Apr 2025 21:46:14 +0200
Subject: [PATCH 04/16] Introduce pss_barrierReceivedGeneration

Currently WaitForProcSignalBarrier makes it possible to ensure that the
message sent via EmitProcSignalBarrier has been processed by all
ProcSignal mechanism participants.

Add pss_barrierReceivedGeneration alongside pss_barrierGeneration; it
is updated when a process has received the message, but has not yet
processed it. This makes it possible to support a new mode of waiting,
in which ProcSignal participants want to synchronize message
processing. To do that, a participant can wait via
WaitForProcSignalBarrierReceived when processing a message, effectively
ensuring that all processes start processing the ProcSignalBarrier
simultaneously.
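As an illustration of the intended use, here is a sketch of a barrier handler that synchronizes its start across backends. ProcessBarrierExample, ExampleCtl, and the shared example_generation field are hypothetical names invented for this sketch; the emitter is assumed to have published the generation it got from EmitProcSignalBarrier() in shared memory, since the slot generations are private to procsignal.c:

	/*
	 * Sketch only: a barrier handler that waits for all backends to have at
	 * least received the barrier before doing the coordinated work.
	 */
	static bool
	ProcessBarrierExample(void)
	{
		/* hypothetical shared state published by the emitter */
		uint64		generation = pg_atomic_read_u64(&ExampleCtl->example_generation);

		/*
		 * Block until every backend has observed this barrier, so that all
		 * participants enter the coordinated phase together.
		 */
		WaitForProcSignalBarrierReceived(generation);

		/* ... perform the coordinated work here ... */

		return true;
	}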
---
 src/backend/storage/ipc/procsignal.c | 67 ++++++++++++++++++++++------
 src/include/storage/procsignal.h     |  1 +
 2 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
index 087821311cce..eb3ceaae8095 100644
--- a/src/backend/storage/ipc/procsignal.c
+++ b/src/backend/storage/ipc/procsignal.c
@@ -58,7 +58,10 @@
 * of it. For such use cases, we set a bit in pss_barrierCheckMask and then
 * increment the current "barrier generation"; when the new barrier generation
 * (or greater) appears in the pss_barrierGeneration flag of every process,
- * we know that the message has been received everywhere.
+ * we know that the message has been received and processed everywhere. If we
+ * only need to know that the message has been received everywhere (e.g.
+ * because the receiving processes need to handle it in a coordinated
+ * fashion), use pss_barrierReceivedGeneration in the same way.
 */
 typedef struct
 {
@@ -70,6 +73,7 @@ typedef struct

 	/* Barrier-related fields (not protected by pss_mutex) */
 	pg_atomic_uint64 pss_barrierGeneration;
+	pg_atomic_uint64 pss_barrierReceivedGeneration;
 	pg_atomic_uint32 pss_barrierCheckMask;
 	ConditionVariable pss_barrierCV;
 } ProcSignalSlot;
@@ -152,6 +156,8 @@ ProcSignalShmemInit(void)
 			slot->pss_cancel_key_len = 0;
 			MemSet(slot->pss_signalFlags, 0, sizeof(slot->pss_signalFlags));
 			pg_atomic_init_u64(&slot->pss_barrierGeneration, PG_UINT64_MAX);
+			pg_atomic_init_u64(&slot->pss_barrierReceivedGeneration,
+							   PG_UINT64_MAX);
 			pg_atomic_init_u32(&slot->pss_barrierCheckMask, 0);
 			ConditionVariableInit(&slot->pss_barrierCV);
 		}
@@ -199,6 +205,8 @@ ProcSignalInit(const uint8 *cancel_key, int cancel_key_len)
 	barrier_generation =
 		pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration);
 	pg_atomic_write_u64(&slot->pss_barrierGeneration, barrier_generation);
+	pg_atomic_write_u64(&slot->pss_barrierReceivedGeneration,
+						barrier_generation);

 	if (cancel_key_len > 0)
 		memcpy(slot->pss_cancel_key, cancel_key, cancel_key_len);
@@ -263,6 +271,7 @@ CleanupProcSignalState(int status, Datum arg)
 	 * no barrier waits block on it.
 	 */
 	pg_atomic_write_u64(&slot->pss_barrierGeneration, PG_UINT64_MAX);
+	pg_atomic_write_u64(&slot->pss_barrierReceivedGeneration, PG_UINT64_MAX);

 	SpinLockRelease(&slot->pss_mutex);

@@ -416,12 +425,8 @@ EmitProcSignalBarrier(ProcSignalBarrierType type)
 	return generation;
 }

-/*
- * WaitForProcSignalBarrier - wait until it is guaranteed that all changes
- * requested by a specific call to EmitProcSignalBarrier() have taken effect.
- */
-void
-WaitForProcSignalBarrier(uint64 generation)
+static void
+WaitForProcSignalBarrierInternal(uint64 generation, bool receivedOnly)
 {
 	Assert(generation <= pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration));

@@ -436,12 +441,17 @@ WaitForProcSignalBarrier(uint64 generation)
 		uint64		oldval;

 		/*
-		 * It's important that we check only pss_barrierGeneration here and
-		 * not pss_barrierCheckMask. Bits in pss_barrierCheckMask get cleared
-		 * before the barrier is actually absorbed, but pss_barrierGeneration
-		 * is updated only afterward.
+		 * It's important that we check only pss_barrierGeneration and
+		 * pss_barrierReceivedGeneration here and not pss_barrierCheckMask.
+		 * Bits in pss_barrierCheckMask get cleared before the barrier is
+		 * actually absorbed, but pss_barrierGeneration and
+		 * pss_barrierReceivedGeneration are updated only afterward.
		 */
-		oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
+		if (receivedOnly)
+			oldval = pg_atomic_read_u64(&slot->pss_barrierReceivedGeneration);
+		else
+			oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
+
 		while (oldval < generation)
 		{
 			if (ConditionVariableTimedSleep(&slot->pss_barrierCV,
@@ -450,7 +460,11 @@ WaitForProcSignalBarrier(uint64 generation)
 				ereport(LOG,
 						(errmsg("still waiting for backend with PID %d to accept ProcSignalBarrier",
 								(int) pg_atomic_read_u32(&slot->pss_pid))));
-			oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
+
+			if (receivedOnly)
+				oldval = pg_atomic_read_u64(&slot->pss_barrierReceivedGeneration);
+			else
+				oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
 		}
 		ConditionVariableCancelSleep();
 	}
@@ -464,12 +478,33 @@ WaitForProcSignalBarrier(uint64 generation)
 	 * The caller is probably calling this function because it wants to read
 	 * the shared state or perform further writes to shared state once all
 	 * backends are known to have absorbed the barrier. However, the read of
-	 * pss_barrierGeneration was performed unlocked; insert a memory barrier
-	 * to separate it from whatever follows.
+	 * pss_barrierGeneration / pss_barrierReceivedGeneration was performed
+	 * unlocked; insert a memory barrier to separate it from whatever follows.
 	 */
 	pg_memory_barrier();
 }

+/*
+ * WaitForProcSignalBarrier - wait until it is guaranteed that all changes
+ * requested by a specific call to EmitProcSignalBarrier() have taken effect.
+ */
+void
+WaitForProcSignalBarrier(uint64 generation)
+{
+	WaitForProcSignalBarrierInternal(generation, false);
+}
+
+/*
+ * WaitForProcSignalBarrierReceived - wait until it is guaranteed that all
+ * backends have observed the message sent by a specific call to
+ * EmitProcSignalBarrier().
+ */
+void
+WaitForProcSignalBarrierReceived(uint64 generation)
+{
+	WaitForProcSignalBarrierInternal(generation, true);
+}
+
 /*
 * Handle receipt of an interrupt indicating a global barrier event.
 *
@@ -523,6 +558,10 @@ ProcessProcSignalBarrier(void)
 	if (local_gen == shared_gen)
 		return;

+	/* The message has been observed; record that. */
+	pg_atomic_write_u64(&MyProcSignalSlot->pss_barrierReceivedGeneration,
+						shared_gen);
+
 	/*
 	 * Get and clear the flags that are set for this backend. Note that
 	 * pg_atomic_exchange_u32 is a full barrier, so we're guaranteed that the
diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h
index afeeb1ca019f..2733bbb8c5b8 100644
--- a/src/include/storage/procsignal.h
+++ b/src/include/storage/procsignal.h
@@ -79,6 +79,7 @@ extern void SendCancelRequest(int backendPID, const uint8 *cancel_key, int cance

 extern uint64 EmitProcSignalBarrier(ProcSignalBarrierType type);
 extern void WaitForProcSignalBarrier(uint64 generation);
+extern void WaitForProcSignalBarrierReceived(uint64 generation);
 extern void ProcessProcSignalBarrier(void);
 extern void procsignal_sigusr1_handler(SIGNAL_ARGS);

From 6066aa5ead1992f36d0b2cb8b729190ba8b4b6ef Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Fri, 28 Feb 2025 19:54:47 +0100
Subject: [PATCH 05/16] Allow using multiple shared memory mappings

Currently all the work with shared memory is done via a single
anonymous memory mapping, which limits the ways in which the shared
memory could be organized. Introduce the possibility of allocating
multiple shared memory mappings, where a single mapping is associated
with a specified shared memory segment.
There is only a fixed number of available segments; currently only the
main shared memory segment is allocated. A new shared memory API is
introduced, extended with a segment as a new parameter. As the path of
least resistance, the original API is kept in place and uses the main
shared memory segment.
---
 src/backend/port/posix_sema.c     |   4 +-
 src/backend/port/sysv_sema.c      |   4 +-
 src/backend/port/sysv_shmem.c     | 138 +++++++++++++++++++---------
 src/backend/port/win32_sema.c     |   2 +-
 src/backend/storage/ipc/ipc.c     |   4 +-
 src/backend/storage/ipc/ipci.c    |  63 +++++++------
 src/backend/storage/ipc/shmem.c   | 148 +++++++++++++++++++++---------
 src/backend/storage/lmgr/lwlock.c |  15 ++-
 src/include/storage/ipc.h         |   2 +-
 src/include/storage/pg_sema.h     |   2 +-
 src/include/storage/pg_shmem.h    |  18 ++++
 src/include/storage/shmem.h       |  11 +++
 12 files changed, 283 insertions(+), 128 deletions(-)

diff --git a/src/backend/port/posix_sema.c b/src/backend/port/posix_sema.c
index 269c7460817e..401e1113fa10 100644
--- a/src/backend/port/posix_sema.c
+++ b/src/backend/port/posix_sema.c
@@ -193,7 +193,7 @@ PGSemaphoreShmemSize(int maxSemas)
 * we don't have to expose the counters to other processes.)
 */
 void
-PGReserveSemaphores(int maxSemas)
+PGReserveSemaphores(int maxSemas, int shmem_segment)
 {
 	struct stat statbuf;

@@ -220,7 +220,7 @@ PGReserveSemaphores(int maxSemas)
 	 * ShmemAlloc() won't be ready yet.
 	 */
 	sharedSemas = (PGSemaphore)
-		ShmemAllocUnlocked(PGSemaphoreShmemSize(maxSemas));
+		ShmemAllocUnlockedInSegment(PGSemaphoreShmemSize(maxSemas), shmem_segment);
 #endif

 	numSems = 0;
diff --git a/src/backend/port/sysv_sema.c b/src/backend/port/sysv_sema.c
index 6ac83ea1a821..7bb363989c49 100644
--- a/src/backend/port/sysv_sema.c
+++ b/src/backend/port/sysv_sema.c
@@ -327,7 +327,7 @@ PGSemaphoreShmemSize(int maxSemas)
 * have clobbered.)
 */
 void
-PGReserveSemaphores(int maxSemas)
+PGReserveSemaphores(int maxSemas, int shmem_segment)
 {
 	struct stat statbuf;

@@ -348,7 +348,7 @@ PGReserveSemaphores(int maxSemas)
 	 * ShmemAlloc() won't be ready yet.
*/ sharedSemas = (PGSemaphore) - ShmemAllocUnlocked(PGSemaphoreShmemSize(maxSemas)); + ShmemAllocUnlockedInSegment(PGSemaphoreShmemSize(maxSemas), shmem_segment); numSharedSemas = 0; maxSharedSemas = maxSemas; diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 197926d44f6b..56af0231d242 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -94,8 +94,19 @@ typedef enum unsigned long UsedShmemSegID = 0; void *UsedShmemSegAddr = NULL; -static Size AnonymousShmemSize; -static void *AnonymousShmem = NULL; +typedef struct AnonymousMapping +{ + int shmem_segment; + Size shmem_size; /* Size of the mapping */ + Pointer shmem; /* Pointer to the start of the mapped memory */ + Pointer seg_addr; /* SysV shared memory for the header */ + unsigned long seg_id; /* IPC key */ +} AnonymousMapping; + +static AnonymousMapping Mappings[ANON_MAPPINGS]; + +/* Keeps track of used mapping segments */ +static int next_free_segment = 0; static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size); static void IpcMemoryDetach(int status, Datum shmaddr); @@ -104,6 +115,28 @@ static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId, void *attachAt, PGShmemHeader **addr); +static const char* +MappingName(int shmem_segment) +{ + switch (shmem_segment) + { + case MAIN_SHMEM_SEGMENT: + return "main"; + default: + return "unknown"; + } +} + +static void +DebugMappings() +{ + for(int i = 0; i < next_free_segment; i++) + { + AnonymousMapping m = Mappings[i]; + elog(DEBUG1, "Mapping[%s]: addr %p, size %zu", + MappingName(i), m.shmem, m.shmem_size); + } +} /* * InternalIpcMemoryCreate(memKey, size) @@ -591,14 +624,13 @@ check_huge_page_size(int *newval, void **extra, GucSource source) /* * Creates an anonymous mmap()ed shared memory segment. * - * Pass the requested size in *size. This function will modify *size to the - * actual size of the allocation, if it ends up allocating a segment that is - * larger than requested. + * This function will modify mapping size to the actual size of the allocation, + * if it ends up allocating a segment that is larger than requested. */ -static void * -CreateAnonymousSegment(Size *size) +static void +CreateAnonymousSegment(AnonymousMapping *mapping) { - Size allocsize = *size; + Size allocsize = mapping->shmem_size; void *ptr = MAP_FAILED; int mmap_errno = 0; @@ -623,8 +655,11 @@ CreateAnonymousSegment(Size *size) PG_MMAP_FLAGS | mmap_flags, -1, 0); mmap_errno = errno; if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED) - elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m", - allocsize); + { + DebugMappings(); + elog(DEBUG1, "segment[%s]: mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m", + MappingName(mapping->shmem_segment), allocsize); + } } #endif @@ -642,7 +677,7 @@ CreateAnonymousSegment(Size *size) * Use the original size, not the rounded-up value, when falling back * to non-huge pages. */ - allocsize = *size; + allocsize = mapping->shmem_size; ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, -1, 0); mmap_errno = errno; @@ -651,8 +686,10 @@ CreateAnonymousSegment(Size *size) if (ptr == MAP_FAILED) { errno = mmap_errno; + DebugMappings(); ereport(FATAL, - (errmsg("could not map anonymous shared memory: %m"), + (errmsg("segment[%s]: could not map anonymous shared memory: %m", + MappingName(mapping->shmem_segment)), (mmap_errno == ENOMEM) ? 
errhint("This error usually means that PostgreSQL's request "
 								 "for a shared memory segment exceeded available memory, "
 								 "swap space, or huge pages. To reduce the request size "
 								 "(currently %zu bytes), reduce PostgreSQL's shared "
 								 "memory usage, perhaps by reducing \"shared_buffers\" or "
 								 "\"max_connections\".",
 								 allocsize) : 0));
 	}

-	*size = allocsize;
-	return ptr;
+	mapping->shmem = ptr;
+	mapping->shmem_size = allocsize;
 }

 /*
@@ -674,13 +711,18 @@
 static void
 AnonymousShmemDetach(int status, Datum arg)
 {
-	/* Release anonymous shared memory block, if any. */
-	if (AnonymousShmem != NULL)
+	for (int i = 0; i < next_free_segment; i++)
 	{
-		if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
-			elog(LOG, "munmap(%p, %zu) failed: %m",
-				 AnonymousShmem, AnonymousShmemSize);
-		AnonymousShmem = NULL;
+		/* Use a pointer so that resetting the fields below sticks. */
+		AnonymousMapping *m = &Mappings[i];
+
+		/* Release anonymous shared memory block, if any. */
+		if (m->shmem != NULL)
+		{
+			if (munmap(m->shmem, m->shmem_size) < 0)
+				elog(LOG, "munmap(%p, %zu) failed: %m",
+					 m->shmem, m->shmem_size);
+			m->shmem = NULL;
+		}
 	}
 }

@@ -705,6 +747,7 @@ PGSharedMemoryCreate(Size size,
 	PGShmemHeader *hdr;
 	struct stat statbuf;
 	Size		sysvsize;
+	AnonymousMapping *mapping = &Mappings[next_free_segment];

 	/*
 	 * We use the data directory's ID info (inode and device numbers) to
@@ -733,11 +776,15 @@ PGSharedMemoryCreate(Size size,
 	/* Room for a header? */
 	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));

+	mapping->shmem_size = size;
+	mapping->shmem_segment = next_free_segment;
+
 	if (shared_memory_type == SHMEM_TYPE_MMAP)
 	{
-		AnonymousShmem = CreateAnonymousSegment(&size);
-		AnonymousShmemSize = size;
+		/* On success, mapping data will be modified. */
+		CreateAnonymousSegment(mapping);
+
+		next_free_segment++;

 		/* Register on-exit routine to unmap the anonymous segment */
 		on_shmem_exit(AnonymousShmemDetach, (Datum) 0);
@@ -760,7 +807,7 @@ PGSharedMemoryCreate(Size size,
 	 * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure
 	 * that, but prefer fixing it over coping here.)
 	 */
-	NextShmemSegID = statbuf.st_ino;
+	NextShmemSegID = statbuf.st_ino + next_free_segment;

 	for (;;)
 	{
@@ -852,13 +899,13 @@ PGSharedMemoryCreate(Size size,
 	/*
 	 * Initialize space allocation status for segment.
 	 */
-	hdr->totalsize = size;
+	hdr->totalsize = mapping->shmem_size;
 	hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
 	*shim = hdr;

 	/* Save info for possible future use */
-	UsedShmemSegAddr = memAddress;
-	UsedShmemSegID = (unsigned long) NextShmemSegID;
+	mapping->seg_addr = memAddress;
+	mapping->seg_id = (unsigned long) NextShmemSegID;

 	/*
 	 * If AnonymousShmem is NULL here, then we're not using anonymous shared
 	 * memory, and we must return a pointer to the real block.
	 */
-	if (AnonymousShmem == NULL)
+	if (mapping->shmem == NULL)
 		return hdr;
-	memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
-	return (PGShmemHeader *) AnonymousShmem;
+	memcpy(mapping->shmem, hdr, sizeof(PGShmemHeader));
+	return (PGShmemHeader *) mapping->shmem;
 }

 #ifdef EXEC_BACKEND
@@ -969,23 +1016,28 @@ PGSharedMemoryNoReAttach(void)
 void
 PGSharedMemoryDetach(void)
 {
-	if (UsedShmemSegAddr != NULL)
+	for (int i = 0; i < next_free_segment; i++)
 	{
-		if ((shmdt(UsedShmemSegAddr) < 0)
+		/* Use a pointer so that resetting the fields below sticks. */
+		AnonymousMapping *m = &Mappings[i];
+
+		if (m->seg_addr != NULL)
+		{
+			if ((shmdt(m->seg_addr) < 0)
 #if defined(EXEC_BACKEND) && defined(__CYGWIN__)
-		/* Work-around for cygipc exec bug */
-			&& shmdt(NULL) < 0
+			/* Work-around for cygipc exec bug */
+				&& shmdt(NULL) < 0
 #endif
-			)
-			elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
-		UsedShmemSegAddr = NULL;
-	}
+				)
+				elog(LOG, "shmdt(%p) failed: %m", m->seg_addr);
+			m->seg_addr = NULL;
+		}

-	if (AnonymousShmem != NULL)
-	{
-		if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
-			elog(LOG, "munmap(%p, %zu) failed: %m",
-				 AnonymousShmem, AnonymousShmemSize);
-		AnonymousShmem = NULL;
+		if (m->shmem != NULL)
+		{
+			if (munmap(m->shmem, m->shmem_size) < 0)
+				elog(LOG, "munmap(%p, %zu) failed: %m",
+					 m->shmem, m->shmem_size);
+			m->shmem = NULL;
+		}
 	}
 }
diff --git a/src/backend/port/win32_sema.c b/src/backend/port/win32_sema.c
index 5854ad1f54d3..e7365ff8060d 100644
--- a/src/backend/port/win32_sema.c
+++ b/src/backend/port/win32_sema.c
@@ -44,7 +44,7 @@ PGSemaphoreShmemSize(int maxSemas)
 * process exits.
 */
 void
-PGReserveSemaphores(int maxSemas)
+PGReserveSemaphores(int maxSemas, int shmem_segment)
 {
 	mySemSet = (HANDLE *) malloc(maxSemas * sizeof(HANDLE));
 	if (mySemSet == NULL)
diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c
index 2704e80b3a7d..1965b2d3eb4d 100644
--- a/src/backend/storage/ipc/ipc.c
+++ b/src/backend/storage/ipc/ipc.c
@@ -61,6 +61,8 @@ static void proc_exit_prepare(int code);
 * but provide some additional features we need --- in particular,
 * we want to register callbacks to invoke when we are disconnecting
 * from a broken shared-memory context but not exiting the postmaster.
+ * The maximum number of such exit callbacks depends on the number of
+ * shared segments.
 *
 * Callback functions can take zero, one, or two args: the first passed
 * arg is the integer exitcode, the second is the Datum supplied when
 * ----------------------------------------------------------------
 */

-#define MAX_ON_EXITS 20
+#define MAX_ON_EXITS 40

 struct ONEXIT
 {
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 2fa045e6b0f6..8b38e9853276 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -86,7 +86,7 @@ RequestAddinShmemSpace(Size size)
 * required.
 */
 Size
-CalculateShmemSize(int *num_semaphores)
+CalculateShmemSize(int *num_semaphores, int shmem_segment)
 {
 	Size		size;
 	int			numSemas;
@@ -206,33 +206,38 @@ CreateSharedMemoryAndSemaphores(void)

 	Assert(!IsUnderPostmaster);

-	/* Compute the size of the shared-memory block */
-	size = CalculateShmemSize(&numSemas);
-	elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size);
-
-	/*
-	 * Create the shmem segment
-	 */
-	seghdr = PGSharedMemoryCreate(size, &shim);
-
-	/*
-	 * Make sure that huge pages are never reported as "unknown" while the
-	 * server is running.
-	 */
-	Assert(strcmp("unknown",
-				  GetConfigOption("huge_pages_status", false, false)) != 0);
-
-	InitShmemAccess(seghdr);
-
-	/*
-	 * Create semaphores
-	 */
-	PGReserveSemaphores(numSemas);
-
-	/*
-	 * Set up shared memory allocation mechanism
-	 */
-	InitShmemAllocation();
+	for (int segment = 0; segment < ANON_MAPPINGS; segment++)
+	{
+		/* Compute the size of the shared-memory block */
+		size = CalculateShmemSize(&numSemas, segment);
+		elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size);
+
+		/*
+		 * Create the shmem segment.
+		 *
+		 * XXX: Are multiple shims needed, one per segment?
+		 */
+		seghdr = PGSharedMemoryCreate(size, &shim);
+
+		/*
+		 * Make sure that huge pages are never reported as "unknown" while the
+		 * server is running.
- */ - Assert(strcmp("unknown", - GetConfigOption("huge_pages_status", false, false)) != 0); - - InitShmemAccess(seghdr); - - /* - * Create semaphores - */ - PGReserveSemaphores(numSemas); - - /* - * Set up shared memory allocation mechanism - */ - InitShmemAllocation(); + for(int segment = 0; segment < ANON_MAPPINGS; segment++) + { + /* Compute the size of the shared-memory block */ + size = CalculateShmemSize(&numSemas, segment); + elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size); + + /* + * Create the shmem segment. + * + * XXX: Do multiple shims are needed, one per segment? + */ + seghdr = PGSharedMemoryCreate(size, &shim); + + /* + * Make sure that huge pages are never reported as "unknown" while the + * server is running. + */ + Assert(strcmp("unknown", + GetConfigOption("huge_pages_status", false, false)) != 0); + + InitShmemAccessInSegment(seghdr, segment); + + /* + * Create semaphores + */ + PGReserveSemaphores(numSemas, segment); + + /* + * Set up shared memory allocation mechanism + */ + InitShmemAllocationInSegment(segment); + } /* Initialize subsystems */ CreateOrAttachShmemStructs(); @@ -363,7 +368,7 @@ InitializeShmemGUCs(void) /* * Calculate the shared memory size and round up to the nearest megabyte. */ - size_b = CalculateShmemSize(&num_semas); + size_b = CalculateShmemSize(&num_semas, MAIN_SHMEM_SEGMENT); size_mb = add_size(size_b, (1024 * 1024) - 1) / (1024 * 1024); sprintf(buf, "%zu", size_mb); SetConfigOption("shared_memory_size", buf, diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index a0770e867968..f185ed28f95f 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -76,19 +76,19 @@ #include "utils/builtins.h" static void *ShmemAllocRaw(Size size, Size *allocated_size); +static void *ShmemAllocRawInSegment(Size size, Size *allocated_size, + int shmem_segment); /* shared memory global variables */ -static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */ +ShmemSegment Segments[ANON_MAPPINGS]; -static void *ShmemBase; /* start address of shared memory */ - -static void *ShmemEnd; /* end+1 address of shared memory */ - -slock_t *ShmemLock; /* spinlock for shared memory and LWLock - * allocation */ - -static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ +/* + * Primary index hashtable for shmem, for simplicity we use a single for all + * shared memory segments. There can be performance consequences of that, and + * an alternative option would be to have one index per shared memory segments. 
+ */ +static HTAB *ShmemIndex = NULL; /* To get reliable results for NUMA inquiry we need to "touch pages" once */ static bool firstNumaTouch = true; @@ -101,9 +101,17 @@ Datum pg_numa_available(PG_FUNCTION_ARGS); void InitShmemAccess(PGShmemHeader *seghdr) { - ShmemSegHdr = seghdr; - ShmemBase = seghdr; - ShmemEnd = (char *) ShmemBase + seghdr->totalsize; + InitShmemAccessInSegment(seghdr, MAIN_SHMEM_SEGMENT); +} + +void +InitShmemAccessInSegment(PGShmemHeader *seghdr, int shmem_segment) +{ + PGShmemHeader *shmhdr = (PGShmemHeader *) seghdr; + ShmemSegment *seg = &Segments[shmem_segment]; + seg->ShmemSegHdr = shmhdr; + seg->ShmemBase = (void *) shmhdr; + seg->ShmemEnd = (char *) seg->ShmemBase + shmhdr->totalsize; } /* @@ -114,7 +122,13 @@ InitShmemAccess(PGShmemHeader *seghdr) void InitShmemAllocation(void) { - PGShmemHeader *shmhdr = ShmemSegHdr; + InitShmemAllocationInSegment(MAIN_SHMEM_SEGMENT); +} + +void +InitShmemAllocationInSegment(int shmem_segment) +{ + PGShmemHeader *shmhdr = Segments[shmem_segment].ShmemSegHdr; char *aligned; Assert(shmhdr != NULL); @@ -123,9 +137,9 @@ InitShmemAllocation(void) * Initialize the spinlock used by ShmemAlloc. We must use * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet. */ - ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t)); + Segments[shmem_segment].ShmemLock = (slock_t *) ShmemAllocUnlockedInSegment(sizeof(slock_t), shmem_segment); - SpinLockInit(ShmemLock); + SpinLockInit(Segments[shmem_segment].ShmemLock); /* * Allocations after this point should go through ShmemAlloc, which @@ -150,11 +164,17 @@ InitShmemAllocation(void) */ void * ShmemAlloc(Size size) +{ + return ShmemAllocInSegment(size, MAIN_SHMEM_SEGMENT); +} + +void * +ShmemAllocInSegment(Size size, int shmem_segment) { void *newSpace; Size allocated_size; - newSpace = ShmemAllocRaw(size, &allocated_size); + newSpace = ShmemAllocRawInSegment(size, &allocated_size, shmem_segment); if (!newSpace) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), @@ -184,6 +204,12 @@ ShmemAllocNoError(Size size) */ static void * ShmemAllocRaw(Size size, Size *allocated_size) +{ + return ShmemAllocRawInSegment(size, allocated_size, MAIN_SHMEM_SEGMENT); +} + +static void * +ShmemAllocRawInSegment(Size size, Size *allocated_size, int shmem_segment) { Size newStart; Size newFree; @@ -203,22 +229,22 @@ ShmemAllocRaw(Size size, Size *allocated_size) size = CACHELINEALIGN(size); *allocated_size = size; - Assert(ShmemSegHdr != NULL); + Assert(Segments[shmem_segment].ShmemSegHdr != NULL); - SpinLockAcquire(ShmemLock); + SpinLockAcquire(Segments[shmem_segment].ShmemLock); - newStart = ShmemSegHdr->freeoffset; + newStart = Segments[shmem_segment].ShmemSegHdr->freeoffset; newFree = newStart + size; - if (newFree <= ShmemSegHdr->totalsize) + if (newFree <= Segments[shmem_segment].ShmemSegHdr->totalsize) { - newSpace = (char *) ShmemBase + newStart; - ShmemSegHdr->freeoffset = newFree; + newSpace = (char *) Segments[shmem_segment].ShmemBase + newStart; + Segments[shmem_segment].ShmemSegHdr->freeoffset = newFree; } else newSpace = NULL; - SpinLockRelease(ShmemLock); + SpinLockRelease(Segments[shmem_segment].ShmemLock); /* note this assert is okay with newSpace == NULL */ Assert(newSpace == (void *) CACHELINEALIGN(newSpace)); @@ -236,6 +262,12 @@ ShmemAllocRaw(Size size, Size *allocated_size) */ void * ShmemAllocUnlocked(Size size) +{ + return ShmemAllocUnlockedInSegment(size, MAIN_SHMEM_SEGMENT); +} + +void * +ShmemAllocUnlockedInSegment(Size size, int shmem_segment) { Size newStart; Size 
newFree; @@ -246,19 +278,19 @@ ShmemAllocUnlocked(Size size) */ size = MAXALIGN(size); - Assert(ShmemSegHdr != NULL); + Assert(Segments[shmem_segment].ShmemSegHdr != NULL); - newStart = ShmemSegHdr->freeoffset; + newStart = Segments[shmem_segment].ShmemSegHdr->freeoffset; newFree = newStart + size; - if (newFree > ShmemSegHdr->totalsize) + if (newFree > Segments[shmem_segment].ShmemSegHdr->totalsize) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of shared memory (%zu bytes requested)", size))); - ShmemSegHdr->freeoffset = newFree; + Segments[shmem_segment].ShmemSegHdr->freeoffset = newFree; - newSpace = (char *) ShmemBase + newStart; + newSpace = (char *) Segments[shmem_segment].ShmemBase + newStart; Assert(newSpace == (void *) MAXALIGN(newSpace)); @@ -273,7 +305,13 @@ ShmemAllocUnlocked(Size size) bool ShmemAddrIsValid(const void *addr) { - return (addr >= ShmemBase) && (addr < ShmemEnd); + return ShmemAddrIsValidInSegment(addr, MAIN_SHMEM_SEGMENT); +} + +bool +ShmemAddrIsValidInSegment(const void *addr, int shmem_segment) +{ + return (addr >= Segments[shmem_segment].ShmemBase) && (addr < Segments[shmem_segment].ShmemEnd); } /* @@ -334,6 +372,18 @@ ShmemInitHash(const char *name, /* table string name for shmem index */ int64 max_size, /* max size of the table */ HASHCTL *infoP, /* info about key and bucket size */ int hash_flags) /* info about infoP */ +{ + return ShmemInitHashInSegment(name, init_size, max_size, infoP, hash_flags, + MAIN_SHMEM_SEGMENT); +} + +HTAB * +ShmemInitHashInSegment(const char *name, /* table string name for shmem index */ + long init_size, /* initial table size */ + long max_size, /* max size of the table */ + HASHCTL *infoP, /* info about key and bucket size */ + int hash_flags, /* info about infoP */ + int shmem_segment) /* in which segment to keep the table */ { bool found; void *location; @@ -350,9 +400,9 @@ ShmemInitHash(const char *name, /* table string name for shmem index */ hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE; /* look it up in the shmem index */ - location = ShmemInitStruct(name, + location = ShmemInitStructInSegment(name, hash_get_shared_size(infoP, hash_flags), - &found); + &found, shmem_segment); /* * if it already exists, attach to it rather than allocate and initialize @@ -385,6 +435,13 @@ ShmemInitHash(const char *name, /* table string name for shmem index */ */ void * ShmemInitStruct(const char *name, Size size, bool *foundPtr) +{ + return ShmemInitStructInSegment(name, size, foundPtr, MAIN_SHMEM_SEGMENT); +} + +void * +ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr, + int shmem_segment) { ShmemIndexEnt *result; void *structPtr; @@ -393,7 +450,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) if (!ShmemIndex) { - PGShmemHeader *shmemseghdr = ShmemSegHdr; + PGShmemHeader *shmemseghdr = Segments[shmem_segment].ShmemSegHdr; /* Must be trying to create/attach to ShmemIndex itself */ Assert(strcmp(name, "ShmemIndex") == 0); @@ -416,7 +473,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) * process can be accessing shared memory yet. 
*/ Assert(shmemseghdr->index == NULL); - structPtr = ShmemAlloc(size); + structPtr = ShmemAllocInSegment(size, shmem_segment); shmemseghdr->index = structPtr; *foundPtr = false; } @@ -433,8 +490,8 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) LWLockRelease(ShmemIndexLock); ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("could not create ShmemIndex entry for data structure \"%s\"", - name))); + errmsg("could not create ShmemIndex entry for data structure \"%s\" in segment %d", + name, shmem_segment))); } if (*foundPtr) @@ -459,7 +516,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) Size allocated_size; /* It isn't in the table yet. allocate and initialize it */ - structPtr = ShmemAllocRaw(size, &allocated_size); + structPtr = ShmemAllocRawInSegment(size, &allocated_size, shmem_segment); if (structPtr == NULL) { /* out of memory; remove the failed ShmemIndex entry */ @@ -478,14 +535,13 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) LWLockRelease(ShmemIndexLock); - Assert(ShmemAddrIsValid(structPtr)); + Assert(ShmemAddrIsValidInSegment(structPtr, shmem_segment)); Assert(structPtr == (void *) CACHELINEALIGN(structPtr)); return structPtr; } - /* * Add two Size values, checking for overflow */ @@ -542,10 +598,11 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS) /* output all allocated entries */ memset(nulls, 0, sizeof(nulls)); + /* XXX: take all shared memory segments into account. */ while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL) { values[0] = CStringGetTextDatum(ent->key); - values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr); + values[1] = Int64GetDatum((char *) ent->location - (char *) Segments[MAIN_SHMEM_SEGMENT].ShmemSegHdr); values[2] = Int64GetDatum(ent->size); values[3] = Int64GetDatum(ent->allocated_size); named_allocated += ent->allocated_size; @@ -557,15 +614,15 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS) /* output shared memory allocated but not counted via the shmem index */ values[0] = CStringGetTextDatum(""); nulls[1] = true; - values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated); + values[2] = Int64GetDatum(Segments[MAIN_SHMEM_SEGMENT].ShmemSegHdr->freeoffset - named_allocated); values[3] = values[2]; tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); /* output as-of-yet unused shared memory */ nulls[0] = true; - values[1] = Int64GetDatum(ShmemSegHdr->freeoffset); + values[1] = Int64GetDatum(Segments[MAIN_SHMEM_SEGMENT].ShmemSegHdr->freeoffset); nulls[1] = false; - values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset); + values[2] = Int64GetDatum(Segments[MAIN_SHMEM_SEGMENT].ShmemSegHdr->totalsize - Segments[MAIN_SHMEM_SEGMENT].ShmemSegHdr->freeoffset); values[3] = values[2]; tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); @@ -630,7 +687,12 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) * this is not very likely, and moreover we have more entries, each of * them using only fraction of the total pages. 
*/ - shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1; + for(int segment = 0; segment < ANON_MAPPINGS; segment++) + { + PGShmemHeader *shmhdr = Segments[segment].ShmemSegHdr; + shm_total_page_count += (shmhdr->totalsize / os_page_size) + 1; + } + page_ptrs = palloc0(sizeof(void *) * shm_total_page_count); pages_status = palloc(sizeof(int) * shm_total_page_count); diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index b017880f5e45..c25dd13b63af 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -80,6 +80,8 @@ #include "pg_trace.h" #include "pgstat.h" #include "port/pg_bitutils.h" +#include "postmaster/postmaster.h" +#include "storage/pg_shmem.h" #include "storage/proc.h" #include "storage/proclist.h" #include "storage/procnumber.h" @@ -612,12 +614,15 @@ LWLockNewTrancheId(const char *name) /* * We use the ShmemLock spinlock to protect LWLockCounter and * LWLockTrancheNames. + * + * XXX: Looks like this is the only use of Segments outside of shmem.c, + * it's maybe worth it to reshape this part to hide Segments structure. */ - SpinLockAcquire(ShmemLock); + SpinLockAcquire(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); if (*LWLockCounter - LWTRANCHE_FIRST_USER_DEFINED >= MAX_NAMED_TRANCHES) { - SpinLockRelease(ShmemLock); + SpinLockRelease(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); ereport(ERROR, (errmsg("maximum number of tranches already registered"), errdetail("No more than %d tranches may be registered.", @@ -628,7 +633,7 @@ LWLockNewTrancheId(const char *name) LocalLWLockCounter = *LWLockCounter; strlcpy(LWLockTrancheNames[result - LWTRANCHE_FIRST_USER_DEFINED], name, NAMEDATALEN); - SpinLockRelease(ShmemLock); + SpinLockRelease(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); return result; } @@ -750,9 +755,9 @@ GetLWTrancheName(uint16 trancheId) */ if (trancheId >= LocalLWLockCounter) { - SpinLockAcquire(ShmemLock); + SpinLockAcquire(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); LocalLWLockCounter = *LWLockCounter; - SpinLockRelease(ShmemLock); + SpinLockRelease(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); if (trancheId >= LocalLWLockCounter) elog(ERROR, "tranche %d is not registered", trancheId); diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h index 3baf418b3d1e..6ebda479ced7 100644 --- a/src/include/storage/ipc.h +++ b/src/include/storage/ipc.h @@ -77,7 +77,7 @@ extern void check_on_shmem_exit_lists_are_empty(void); /* ipci.c */ extern PGDLLIMPORT shmem_startup_hook_type shmem_startup_hook; -extern Size CalculateShmemSize(int *num_semaphores); +extern Size CalculateShmemSize(int *num_semaphores, int shmem_segment); extern void CreateSharedMemoryAndSemaphores(void); #ifdef EXEC_BACKEND extern void AttachSharedMemoryStructs(void); diff --git a/src/include/storage/pg_sema.h b/src/include/storage/pg_sema.h index fa6ca35a51f5..8ae9637fcd0a 100644 --- a/src/include/storage/pg_sema.h +++ b/src/include/storage/pg_sema.h @@ -41,7 +41,7 @@ typedef HANDLE PGSemaphore; extern Size PGSemaphoreShmemSize(int maxSemas); /* Module initialization (called during postmaster start or shmem reinit) */ -extern void PGReserveSemaphores(int maxSemas); +extern void PGReserveSemaphores(int maxSemas, int shmem_segment); /* Allocate a PGSemaphore structure with initial count 1 */ extern PGSemaphore PGSemaphoreCreate(void); diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index 5f7d4b83a60e..2348c59b5a03 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -25,6 +25,7 @@ 
#define PG_SHMEM_H

 #include "storage/dsm_impl.h"
+#include "storage/spin.h"

 typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 {
@@ -41,6 +42,20 @@ typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 #endif
 } PGShmemHeader;

+typedef struct ShmemSegment
+{
+	PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
+	void	   *ShmemBase;		/* start address of shared memory */
+	void	   *ShmemEnd;		/* end+1 address of shared memory */
+	slock_t    *ShmemLock;		/* spinlock for shared memory and LWLock
+								 * allocation */
+} ShmemSegment;
+
+/* Number of available segments for anonymous memory mappings */
+#define ANON_MAPPINGS 1
+
+extern PGDLLIMPORT ShmemSegment Segments[ANON_MAPPINGS];
+
 /* GUC variables */
 extern PGDLLIMPORT int shared_memory_type;
 extern PGDLLIMPORT int huge_pages;
@@ -91,4 +106,7 @@ extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2);
 extern void PGSharedMemoryDetach(void);
 extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags);

+/* The main segment, contains everything except buffer blocks and related data. */
+#define MAIN_SHMEM_SEGMENT 0
+
 #endif							/* PG_SHMEM_H */
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
index cd683a9d2d93..910c43f54f4f 100644
--- a/src/include/storage/shmem.h
+++ b/src/include/storage/shmem.h
@@ -30,15 +30,26 @@ extern PGDLLIMPORT slock_t *ShmemLock;
 typedef struct PGShmemHeader PGShmemHeader; /* avoid including
											 * storage/pg_shmem.h here */
 extern void InitShmemAccess(PGShmemHeader *seghdr);
+extern void InitShmemAccessInSegment(struct PGShmemHeader *seghdr,
+									 int shmem_segment);
 extern void InitShmemAllocation(void);
+extern void InitShmemAllocationInSegment(int shmem_segment);
 extern void *ShmemAlloc(Size size);
+extern void *ShmemAllocInSegment(Size size, int shmem_segment);
 extern void *ShmemAllocNoError(Size size);
 extern void *ShmemAllocUnlocked(Size size);
+extern void *ShmemAllocUnlockedInSegment(Size size, int shmem_segment);
 extern bool ShmemAddrIsValid(const void *addr);
+extern bool ShmemAddrIsValidInSegment(const void *addr, int shmem_segment);
 extern void InitShmemIndex(void);
 extern HTAB *ShmemInitHash(const char *name, int64 init_size,
						   int64 max_size, HASHCTL *infoP, int hash_flags);
+extern HTAB *ShmemInitHashInSegment(const char *name, long init_size,
+									long max_size, HASHCTL *infoP,
+									int hash_flags, int shmem_segment);
 extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr);
+extern void *ShmemInitStructInSegment(const char *name, Size size,
+									  bool *foundPtr, int shmem_segment);
 extern Size add_size(Size s1, Size s2);
 extern Size mul_size(Size s1, Size s2);

From 1a9817c12566acbecd4196a77502a52261d7a464 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Tue, 17 Jun 2025 11:47:04 +0200
Subject: [PATCH 06/16] Address space reservation for shared memory

Currently the shared memory layout is designed to pack everything
tightly together, leaving no space between mappings for resizing. Here
is how it looks for one mapping in /proc/$PID/maps, where /dev/zero
represents the anonymous shared memory in question:

    00400000-00490000             /path/bin/postgres
    ...
    012d9000-0133e000             [heap]
    7f443a800000-7f470a800000     /dev/zero (deleted)
    7f470a800000-7f471831d000     /usr/lib/locale/locale-archive
    7f4718400000-7f4718401000     /usr/lib64/libstdc++.so.6.0.34
    ...

Make the layout more dynamic by splitting every shared memory segment
into two parts:

* An anonymous file, which actually contains the shared memory content.
Such an anonymous file is created via memfd_create. It lives in memory,
behaves like a regular file, and is semantically equivalent to anonymous
memory allocated via mmap with MAP_ANONYMOUS.

* A reservation mapping, whose size is much larger than the required
shared segment size. This mapping is created with the flags PROT_NONE
(which makes sure the reserved space is not used) and MAP_NORESERVE (so
that the reserved space does not count against memory limits). The
anonymous file is mapped into this reservation mapping.

The resulting layout looks like this:

00400000-00490000         /path/bin/postgres
...
3f526000-3f590000 rw-p    [heap]
7fbd827fe000-7fbd8bdde000 rw-s /memfd:main (deleted) -- anon file
7fbd8bdde000-7fbe82800000 ---s /memfd:main (deleted) -- reservation
7fbe82800000-7fbe90670000 r--p /usr/lib/locale/locale-archive
7fbe90800000-7fbe90941000 r-xp /usr/lib64/libstdc++.so.6.0.34

To resize a shared memory segment in this layout it's possible to use
ftruncate on the anonymous file, adjusting access permissions on the
reserved space as needed. This approach also does not impact the actual
memory usage as reported by the kernel. Here is the output of
/proc/$PID/status for the master version with shared_buffers = 128 MB:

// Peak virtual memory size, which is described as total pages
// mapped in mm_struct. It corresponds to the mapped reserved space
// and is the only number that grows with it.
VmPeak:  2043192 kB

// Size of memory portions. It contains RssAnon + RssFile + RssShmem
VmRSS:     22908 kB

// Size of resident anonymous memory
RssAnon:     768 kB

// Size of resident file mappings
RssFile:   10364 kB

// Size of resident shmem memory (includes SysV shm, mapping of tmpfs and
// shared anonymous mappings)
RssShmem:  11776 kB

Here is the same for the patch when reserving 20 GB of space:

VmPeak: 21255824 kB
VmRSS:     25020 kB
RssAnon:     768 kB
RssFile:   10812 kB
RssShmem:  13440 kB

Cgroup v2 doesn't have any problems with that either. To verify, a new
cgroup was created with a memory limit of 256 MB, then PostgreSQL was
launched within this cgroup with shared_buffers = 128 MB:

$ cd /sys/fs/cgroup
$ mkdir postgres
$ cd postgres
$ echo 268435456 > memory.max
$ echo $MASTER_PID_SHELL > cgroup.procs
# postgres from the master branch has been successfully launched
# from that shell
$ cat memory.current
17465344 (~16.6 MB)
# stop postgres
$ echo $PATCH_PID_SHELL > cgroup.procs
# postgres from the patch has been successfully launched from that shell
$ cat memory.current
20770816 (~19.8 MB)

To control the amount of space reserved, a new GUC max_available_memory
is introduced. Ideally it should be based on the maximum available
memory, hence the name.

There are also a few unrelated advantages of using anon files:

* We get a file descriptor, which can be used for regular file
operations (modification, truncation, you name it).

* The file can be given a name, which improves readability when it
comes to process maps.

* By default, Linux will not include file-backed shared mappings in a
core dump, making it more convenient to work with them in PostgreSQL:
no more huge dumps to process.

The downside is that memfd_create is Linux-specific.
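To make the mechanism above concrete, here is a minimal standalone
sketch of the same reserve-then-grow scheme (illustration only, not part
of the patch; names and sizes are made up, and error handling is
omitted; requires Linux and a libc providing memfd_create):

/* reserve_demo.c: cc -D_GNU_SOURCE -o reserve_demo reserve_demo.c */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	size_t		reserved = (size_t) 1 << 30;	/* 1 GB of address space */
	size_t		size = 16 * 1024 * 1024;		/* 16 MB used initially */
	int			fd = memfd_create("demo", 0);
	char	   *base;

	/* Reserve the whole range, backed by the file but inaccessible. */
	base = mmap(NULL, reserved, PROT_NONE,
				MAP_SHARED | MAP_NORESERVE, fd, 0);

	/* Give the file its initial size and open up that part. */
	ftruncate(fd, size);
	mprotect(base, size, PROT_READ | PROT_WRITE);
	memset(base, 0, size);

	/* Resize: grow the file, then widen the accessible window. */
	size *= 2;
	ftruncate(fd, size);
	mprotect(base, size, PROT_READ | PROT_WRITE);
	memset(base, 0, size);

	printf("using %zu bytes at %p inside a %zu byte reservation\n",
		   size, (void *) base, reserved);
	return 0;
}

Only the ftruncate'd part of the reservation is ever backed by pages;
the rest of the range stays PROT_NONE and MAP_NORESERVE until needed.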
---
 src/backend/port/sysv_shmem.c             | 290 ++++++++++++++++++----
 src/backend/port/win32_shmem.c            |   2 +-
 src/backend/storage/ipc/ipci.c            |   5 +-
 src/backend/storage/ipc/shmem.c           |   2 +-
 src/backend/utils/init/globals.c          |   1 +
 src/backend/utils/misc/guc_parameters.dat |  12 +
 src/include/miscadmin.h                   |   1 +
 src/include/portability/mem.h             |   2 +-
 src/include/storage/pg_shmem.h            |   5 +-
 9 files changed, 260 insertions(+), 60 deletions(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 56af0231d242..363ddfd1fca1 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -97,10 +97,12 @@ void	   *UsedShmemSegAddr = NULL;
 typedef struct AnonymousMapping
 {
 	int			shmem_segment;
-	Size		shmem_size;		/* Size of the mapping */
+	Size		shmem_size;		/* Size of the actually used memory */
+	Size		shmem_reserved;	/* Size of the reserved mapping */
 	Pointer		shmem;			/* Pointer to the start of the mapped memory */
 	Pointer		seg_addr;		/* SysV shared memory for the header */
 	unsigned long seg_id;		/* IPC key */
+	int			segment_fd;		/* fd for the backing anon file */
 } AnonymousMapping;
 
 static AnonymousMapping Mappings[ANON_MAPPINGS];
@@ -108,6 +110,49 @@ static AnonymousMapping Mappings[ANON_MAPPINGS];
 /* Keeps track of used mapping segments */
 static int	next_free_segment = 0;
 
+/*
+ * The anonymous mapping layout we use looks like this:
+ *
+ * 00400000-00c2a000 r-xp /bin/postgres
+ * ...
+ * 3f526000-3f590000 rw-p [heap]
+ * 7fbd827fe000-7fbd8bdde000 rw-s /memfd:main (deleted)
+ * 7fbd8bdde000-7fbe82800000 ---s /memfd:main (deleted)
+ * 7fbe82800000-7fbe90670000 r--p /usr/lib/locale/locale-archive
+ * 7fbe90800000-7fbe90941000 r-xp /usr/lib64/libstdc++.so.6.0.34
+ * ...
+ *
+ * We need to place shared memory mappings in such a way that there will be
+ * gaps between them in the address space. Those gaps have to be large enough
+ * to resize the mapping up to a certain size, without counting towards the
+ * total memory consumption.
+ *
+ * To achieve this, for each shared memory segment we first create an anonymous
+ * file of the specified size using memfd_create, which will accommodate the
+ * actual shared memory content. It is represented by the first /memfd:main
+ * with rw permissions. Then we create a mapping for this file using mmap, with
+ * a size much larger than required and the flags PROT_NONE (makes sure the
+ * reserved space will not be used) and MAP_NORESERVE (prevents the space from
+ * being counted against memory limits). The mapping serves as an address space
+ * reservation, into which the shared memory segment can be extended, and is
+ * represented by the second /memfd:main with no permissions.
+ *
+ * The reserved space for each segment is calculated as a fraction of the total
+ * reserved space (MaxAvailableMemory), as specified in the SHMEM_RESIZE_RATIO
+ * array.
+ */
+static double SHMEM_RESIZE_RATIO[1] = {
+	1.0,						/* MAIN_SHMEM_SLOT */
+};
+
+/*
+ * Flag telling that we have decided to use huge pages.
+ *
+ * XXX: It's possible to use GetConfigOption("huge_pages_status", false, false)
+ * instead, but it feels like overkill.
+ */
+static bool huge_pages_on = false;
+
 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
 static void IpcMemoryDetach(int status, Datum shmaddr);
 static void IpcMemoryDelete(int status, Datum shmId);
@@ -503,19 +548,20 @@ PGSharedMemoryAttach(IpcMemoryId shmId,
  * hugepage sizes, we might want to think about more invasive strategies,
  * such as increasing shared_buffers to absorb the extra space.
* - * Returns the (real, assumed or config provided) page size into - * *hugepagesize, and the hugepage-related mmap flags to use into - * *mmap_flags if requested by the caller. If huge pages are not supported, - * *hugepagesize and *mmap_flags are set to 0. + * Returns the (real, assumed or config provided) page size into *hugepagesize, + * the hugepage-related mmap and memfd flags to use into *mmap_flags and + * *memfd_flags if requested by the caller. If huge pages are not supported, + * *hugepagesize, *mmap_flags and *memfd_flags are set to 0. */ void -GetHugePageSize(Size *hugepagesize, int *mmap_flags) +GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags) { #ifdef MAP_HUGETLB Size default_hugepagesize = 0; Size hugepagesize_local = 0; int mmap_flags_local = 0; + int memfd_flags_local = 0; /* * System-dependent code to find out the default huge page size. @@ -574,6 +620,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) } mmap_flags_local = MAP_HUGETLB; + memfd_flags_local = MFD_HUGETLB; /* * On recent enough Linux, also include the explicit page size, if @@ -584,7 +631,16 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) { int shift = pg_ceil_log2_64(hugepagesize_local); - mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + memfd_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } +#endif + +#if defined(MFD_HUGE_MASK) && defined(MFD_HUGE_SHIFT) + if (hugepagesize_local != default_hugepagesize) + { + int shift = pg_ceil_log2_64(hugepagesize_local); + + memfd_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; } #endif @@ -593,6 +649,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) *mmap_flags = mmap_flags_local; if (hugepagesize) *hugepagesize = hugepagesize_local; + if (memfd_flags) + *memfd_flags = memfd_flags_local; #else @@ -600,6 +658,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) *hugepagesize = 0; if (mmap_flags) *mmap_flags = 0; + if (memfd_flags) + *memfd_flags = 0; #endif /* MAP_HUGETLB */ } @@ -625,72 +685,90 @@ check_huge_page_size(int *newval, void **extra, GucSource source) * Creates an anonymous mmap()ed shared memory segment. * * This function will modify mapping size to the actual size of the allocation, - * if it ends up allocating a segment that is larger than requested. + * if it ends up allocating a segment that is larger than requested. If needed, + * it also rounds up the mapping reserved size to be a multiple of huge page + * size. + * + * Note that we do not fallback from huge pages to regular pages in this + * function, this decision was already made in ReserveAnonymousMemory and we + * stick to it. */ static void CreateAnonymousSegment(AnonymousMapping *mapping) { Size allocsize = mapping->shmem_size; void *ptr = MAP_FAILED; - int mmap_errno = 0; + int save_errno = 0; + int mmap_flags = PG_MMAP_FLAGS, memfd_flags = 0; + + elog(DEBUG1, "segment[%s]: size %zu, reserved %zu", + MappingName(mapping->shmem_segment), mapping->shmem_size, + mapping->shmem_reserved); #ifndef MAP_HUGETLB - /* PGSharedMemoryCreate should have dealt with this case */ - Assert(huge_pages != HUGE_PAGES_ON); + /* PrepareHugePages should have dealt with this case */ + Assert(huge_pages != HUGE_PAGES_ON && !huge_pages_on); #else - if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY) + if (huge_pages_on) { - /* - * Round up the request size to a suitable large value. 
-		 */
 		Size		hugepagesize;
-		int			mmap_flags;
 
-		GetHugePageSize(&hugepagesize, &mmap_flags);
+		/* Make sure nothing is messed up */
+		Assert(huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY);
+
+		/* Round up the request size to a suitable large value */
+		GetHugePageSize(&hugepagesize, &mmap_flags, &memfd_flags);
 
 		if (allocsize % hugepagesize != 0)
 			allocsize += hugepagesize - (allocsize % hugepagesize);
 
-		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
-				   PG_MMAP_FLAGS | mmap_flags, -1, 0);
-		mmap_errno = errno;
-		if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
-		{
-			DebugMappings();
-			elog(DEBUG1, "segment[%s]: mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
-				 MappingName(mapping->shmem_segment), allocsize);
-		}
+		/*
+		 * The reserved space is a multiple of BLCKSZ. We know the huge page
+		 * size, so round up the reserved space to it.
+		 */
+		mapping->shmem_reserved = mapping->shmem_reserved + hugepagesize -
+			(mapping->shmem_reserved % hugepagesize);
+
+		/* Verify that the new size is within the reserved boundaries */
+		if (mapping->shmem_reserved < mapping->shmem_size)
+			ereport(ERROR,
+					(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+					 errmsg("not enough shared memory is reserved"),
+					 errhint("You may need to increase \"max_available_memory\".")));
+
+		mmap_flags = PG_MMAP_FLAGS | mmap_flags;
 	}
 #endif
 
 	/*
-	 * Report whether huge pages are in use. This needs to be tracked before
-	 * the second mmap() call if attempting to use huge pages failed
-	 * previously.
+	 * Prepare an anonymous file backing the segment. Its size will be
+	 * specified later via ftruncate.
+	 *
+	 * The file behaves like a regular file, but lives in memory. Once all
+	 * references to the file are dropped, it is automatically released.
+	 * Anonymous memory is used for all backing pages of the file, thus it has
+	 * the same semantics as anonymous memory allocations using mmap with the
+	 * MAP_ANONYMOUS flag.
 	 */
-	SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on",
-					PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
+	mapping->segment_fd = memfd_create(MappingName(mapping->shmem_segment),
+									   memfd_flags);
 
-	if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
+	/*
+	 * Specify the segment file size using allocsize, which contains the
+	 * potentially modified value.
+	 */
+	if (ftruncate(mapping->segment_fd, allocsize) == -1)
 	{
-		/*
-		 * Use the original size, not the rounded-up value, when falling back
-		 * to non-huge pages.
-		 */
-		allocsize = mapping->shmem_size;
-		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
-				   PG_MMAP_FLAGS, -1, 0);
-		mmap_errno = errno;
-	}
+		save_errno = errno;
 
-	if (ptr == MAP_FAILED)
-	{
-		errno = mmap_errno;
 		DebugMappings();
+		close(mapping->segment_fd);
+
+		errno = save_errno;
 		ereport(FATAL,
-				(errmsg("segment[%s]: could not map anonymous shared memory: %m",
+				(errmsg("segment[%s]: could not truncate anonymous file: %m",
 						MappingName(mapping->shmem_segment)),
-				 (mmap_errno == ENOMEM) ?
+				 (save_errno == ENOMEM) ?
 				 errhint("This error usually means that PostgreSQL's request "
 						 "for a shared memory segment exceeded available memory, "
 						 "swap space, or huge pages. To reduce the request size "
@@ -700,10 +778,112 @@ CreateAnonymousSegment(AnonymousMapping *mapping)
 						 allocsize) : 0));
 	}
 
+	elog(DEBUG1, "segment[%s]: mmap(%zu)",
+		 MappingName(mapping->shmem_segment), allocsize);
+
+	/*
+	 * Create a reservation mapping.
+ */ + ptr = mmap(NULL, mapping->shmem_reserved, PROT_NONE, + mmap_flags | MAP_NORESERVE, mapping->segment_fd, 0); + save_errno = errno; + + if (ptr == MAP_FAILED) + { + DebugMappings(); + + errno = save_errno; + ereport(FATAL, + (errmsg("segment[%s]: could not map anonymous shared memory: %m", + MappingName(mapping->shmem_segment)))); + } + + /* Make the memory accessible */ + if(mprotect(ptr, allocsize, PROT_READ | PROT_WRITE) == -1) + { + save_errno = errno; + DebugMappings(); + + errno = save_errno; + ereport(FATAL, + (errmsg("segment[%s]: could not mprotect anonymous shared memory: %m", + MappingName(mapping->shmem_segment)))); + } + mapping->shmem = ptr; mapping->shmem_size = allocsize; } +/* + * PrepareHugePages + * + * Figure out if there are enough huge pages to allocate all shared memory + * segments, and report that information via huge_pages_status and + * huge_pages_on. It needs to be called before creating shared memory segments. + * + * It is necessary to maintain the same semantic (simple on/off) for + * huge_pages_status, even if there are multiple shared memory segments: all + * segments either use huge pages or not, there is no mix of segments with + * different page size. The latter might be actually beneficial, in particular + * because only some segments may require large amount of memory, but for now + * we go with a simple solution. + */ +void +PrepareHugePages() +{ + void *ptr = MAP_FAILED; + + /* Reset to handle reinitialization */ + next_free_segment = 0; + + /* Complain if hugepages demanded but we can't possibly support them */ +#if !defined(MAP_HUGETLB) + if (huge_pages == HUGE_PAGES_ON) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("huge pages not supported on this platform"))); +#else + if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY) + { + Size hugepagesize, total_size = 0; + int mmap_flags; + + GetHugePageSize(&hugepagesize, &mmap_flags, NULL); + + /* + * Figure out how much memory is needed for all segments, keeping in + * mind that for every segment this value will be rounding up by the + * huge page size. The resulting value will be used to probe memory and + * decide whether we will allocate huge pages or not. + */ + for(int segment = 0; segment < ANON_MAPPINGS; segment++) + { + int numSemas; + Size segment_size = CalculateShmemSize(&numSemas, segment); + + if (segment_size % hugepagesize != 0) + segment_size += hugepagesize - (segment_size % hugepagesize); + + total_size += segment_size; + } + + /* Map total amount of memory to test its availability. */ + elog(DEBUG1, "reserving space: probe mmap(%zu) with MAP_HUGETLB", + total_size); + ptr = mmap(NULL, total_size, PROT_NONE, + PG_MMAP_FLAGS | MAP_ANONYMOUS | mmap_flags, -1, 0); + } +#endif + + /* + * Report whether huge pages are in use. This needs to be tracked before + * creating shared memory segments. + */ + SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? 
"off" : "on", + PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); + huge_pages_on = ptr != MAP_FAILED; +} + /* * AnonymousShmemDetach --- detach from an anonymous mmap'd block * (called as an on_shmem_exit callback, hence funny argument list) @@ -746,7 +926,7 @@ PGSharedMemoryCreate(Size size, void *memAddress; PGShmemHeader *hdr; struct stat statbuf; - Size sysvsize; + Size sysvsize, total_reserved; AnonymousMapping *mapping = &Mappings[next_free_segment]; /* @@ -760,14 +940,6 @@ PGSharedMemoryCreate(Size size, errmsg("could not stat data directory \"%s\": %m", DataDir))); - /* Complain if hugepages demanded but we can't possibly support them */ -#if !defined(MAP_HUGETLB) - if (huge_pages == HUGE_PAGES_ON) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("huge pages not supported on this platform"))); -#endif - /* For now, we don't support huge pages in SysV memory */ if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP) ereport(ERROR, @@ -776,8 +948,16 @@ PGSharedMemoryCreate(Size size, /* Room for a header? */ Assert(size > MAXALIGN(sizeof(PGShmemHeader))); + + /* Prepare the mapping information */ mapping->shmem_size = size; mapping->shmem_segment = next_free_segment; + total_reserved = (Size) MaxAvailableMemory * BLCKSZ; + mapping->shmem_reserved = total_reserved * SHMEM_RESIZE_RATIO[next_free_segment]; + + /* Round up to be a multiple of BLCKSZ */ + mapping->shmem_reserved = mapping->shmem_reserved + BLCKSZ - + (mapping->shmem_reserved % BLCKSZ); if (shared_memory_type == SHMEM_TYPE_MMAP) { diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c index 4dee856d6bd6..732fedee87e0 100644 --- a/src/backend/port/win32_shmem.c +++ b/src/backend/port/win32_shmem.c @@ -627,7 +627,7 @@ pgwin32_ReserveSharedMemoryRegion(HANDLE hChild) * use GetLargePageMinimum() instead. */ void -GetHugePageSize(Size *hugepagesize, int *mmap_flags) +GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags) { if (hugepagesize) *hugepagesize = 0; diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 8b38e9853276..b60f7ef9ce2a 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -206,6 +206,9 @@ CreateSharedMemoryAndSemaphores(void) Assert(!IsUnderPostmaster); + /* Decide if we use huge pages or regular size pages */ + PrepareHugePages(); + for(int segment = 0; segment < ANON_MAPPINGS; segment++) { /* Compute the size of the shared-memory block */ @@ -377,7 +380,7 @@ InitializeShmemGUCs(void) /* * Calculate the number of huge pages required. */ - GetHugePageSize(&hp_size, NULL); + GetHugePageSize(&hp_size, NULL, NULL); if (hp_size != 0) { Size hp_required; diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index f185ed28f95f..9bb73f310521 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -815,7 +815,7 @@ pg_get_shmem_pagesize(void) Assert(huge_pages_status != HUGE_PAGES_UNKNOWN); if (huge_pages_status == HUGE_PAGES_ON) - GetHugePageSize(&os_page_size, NULL); + GetHugePageSize(&os_page_size, NULL, NULL); return os_page_size; } diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index d31cb45a0588..90d3feb547c5 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -140,6 +140,7 @@ int max_parallel_maintenance_workers = 2; * register background workers. 
 */
 int			NBuffers = 16384;
+int			MaxAvailableMemory = 524288;
 int			MaxConnections = 100;
 int			max_worker_processes = 8;
 int			max_parallel_workers = 8;
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index 6bc6be13d2ad..c94f3fc3c80d 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -1107,6 +1107,18 @@
   max => 'INT_MAX / 2',
 },
 
+# TODO: should this be PGC_POSTMASTER?
+{ name => "max_available_memory", type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_MEM',
+  short_desc => 'Sets the upper limit for the shared_buffers value.',
+  long_desc => 'Shared memory can be resized at runtime; this parameter sets the upper limit for it, beyond which resizing is not supported. Normally this value would be the same as the total available memory.',
+  flags => 'GUC_UNIT_BLOCKS',
+  variable => 'MaxAvailableMemory',
+  boot_val => '524288',
+  min => '16',
+  max => 'INT_MAX / 2',
+},
+
+
 { name => 'vacuum_buffer_usage_limit', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_MEM',
   short_desc => 'Sets the buffer pool size for VACUUM, ANALYZE, and autovacuum.',
   flags => 'GUC_UNIT_KB',
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 1bef98471c36..a0c37a7749ef 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -173,6 +173,7 @@ extern PGDLLIMPORT char *DataDir;
 extern PGDLLIMPORT int data_directory_mode;
 
 extern PGDLLIMPORT int NBuffers;
+extern PGDLLIMPORT int MaxAvailableMemory;
 extern PGDLLIMPORT int MaxBackends;
 extern PGDLLIMPORT int MaxConnections;
 extern PGDLLIMPORT int max_worker_processes;
diff --git a/src/include/portability/mem.h b/src/include/portability/mem.h
index ef9800732d90..40588ff69683 100644
--- a/src/include/portability/mem.h
+++ b/src/include/portability/mem.h
@@ -38,7 +38,7 @@
 #define MAP_NOSYNC			0
 #endif
 
-#define PG_MMAP_FLAGS			(MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE)
+#define PG_MMAP_FLAGS			(MAP_SHARED|MAP_HASSEMAPHORE)
 
 /* Some really old systems don't define MAP_FAILED. */
 #ifndef MAP_FAILED
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 2348c59b5a03..79b0b1ef9eb8 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -61,6 +61,7 @@ extern PGDLLIMPORT int shared_memory_type;
 extern PGDLLIMPORT int huge_pages;
 extern PGDLLIMPORT int huge_page_size;
 extern PGDLLIMPORT int huge_pages_status;
+extern PGDLLIMPORT int MaxAvailableMemory;
 
 /* Possible values for huge_pages and huge_pages_status */
 typedef enum
@@ -104,7 +105,9 @@ extern PGShmemHeader *PGSharedMemoryCreate(Size size,
 										   PGShmemHeader **shim);
 extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2);
 extern void PGSharedMemoryDetach(void);
-extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags);
+extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags,
+							int *memfd_flags);
+void PrepareHugePages(void);
 
 /* The main segment, contains everything except buffer blocks and related data. */
 #define MAIN_SHMEM_SEGMENT 0

From a5fc36011600828336bfc8f20a564d3ff7dfa6de Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Tue, 17 Jun 2025 11:22:02 +0200
Subject: [PATCH 07/16] Introduce multiple shmem segments for shared buffers

Add more shmem segments to split shared buffers into the following
chunks:

* BUFFERS_SHMEM_SEGMENT: contains buffer blocks
* BUFFER_DESCRIPTORS_SHMEM_SEGMENT: contains buffer descriptors
* BUFFER_IOCV_SHMEM_SEGMENT: contains condition variables for buffers
* CHECKPOINT_BUFFERS_SHMEM_SEGMENT: contains checkpoint buffer ids
* STRATEGY_SHMEM_SEGMENT: contains buffer strategy status

The size of the corresponding shared data directly depends on NBuffers,
meaning that if we want to change NBuffers, these structures have to be
resized correspondingly. Placing each of them in a separate shmem
segment makes that possible.

There are some assumptions made about each shmem segment's upper size
limit. The buffer blocks have the largest, while the rest claim less
extra room for resizing. Ideally those limits should be deduced from the
maximum allowed shared memory.
---
 src/backend/port/sysv_shmem.c          | 24 +++++++-
 src/backend/storage/buffer/buf_init.c  | 79 +++++++++++++++++---------
 src/backend/storage/buffer/buf_table.c |  6 +-
 src/backend/storage/buffer/freelist.c  |  5 +-
 src/backend/storage/ipc/ipci.c         |  2 +-
 src/include/storage/bufmgr.h           |  2 +-
 src/include/storage/pg_shmem.h         | 24 +++++++-
 7 files changed, 105 insertions(+), 37 deletions(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 363ddfd1fca1..dac011b766bc 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -139,10 +139,18 @@ static int next_free_segment = 0;
  *
  * The reserved space for each segment is calculated as a fraction of the total
  * reserved space (MaxAvailableMemory), as specified in the SHMEM_RESIZE_RATIO
- * array.
+ * array. E.g. we allow BUFFERS_SHMEM_SEGMENT to take up to 60% of the whole
+ * space when resizing, based on the fact that it most likely will be the main
+ * consumer of this memory. Those numbers are pulled out of thin air for now;
+ * it makes sense to evaluate them more precisely.
 */
-static double SHMEM_RESIZE_RATIO[1] = {
-	1.0,						/* MAIN_SHMEM_SLOT */
+static double SHMEM_RESIZE_RATIO[6] = {
+	0.1,						/* MAIN_SHMEM_SEGMENT */
+	0.6,						/* BUFFERS_SHMEM_SEGMENT */
+	0.1,						/* BUFFER_DESCRIPTORS_SHMEM_SEGMENT */
+	0.1,						/* BUFFER_IOCV_SHMEM_SEGMENT */
+	0.05,						/* CHECKPOINT_BUFFERS_SHMEM_SEGMENT */
+	0.05,						/* STRATEGY_SHMEM_SEGMENT */
 };
 
 /*
@@ -167,6 +175,16 @@ MappingName(int shmem_segment)
 	{
 		case MAIN_SHMEM_SEGMENT:
 			return "main";
+		case BUFFERS_SHMEM_SEGMENT:
+			return "buffers";
+		case BUFFER_DESCRIPTORS_SHMEM_SEGMENT:
+			return "descriptors";
+		case BUFFER_IOCV_SHMEM_SEGMENT:
+			return "iocv";
+		case CHECKPOINT_BUFFERS_SHMEM_SEGMENT:
+			return "checkpoint";
+		case STRATEGY_SHMEM_SEGMENT:
+			return "strategy";
 		default:
 			return "unknown";
 	}
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 6fd3a6bbac5e..5383442e2134 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -62,7 +62,10 @@ CkptSortItem *CkptBufferIds;
  * Initialize shared buffer pool
  *
  * This is called once during shared-memory initialization (either in the
- * postmaster, or in a standalone backend).
Size of data structures initialized + * here depends on NBuffers, and to be able to change NBuffers without a + * restart we store each structure into a separate shared memory segment, which + * could be resized on demand. */ void BufferManagerShmemInit(void) @@ -74,22 +77,22 @@ BufferManagerShmemInit(void) /* Align descriptors to a cacheline boundary. */ BufferDescriptors = (BufferDescPadded *) - ShmemInitStruct("Buffer Descriptors", + ShmemInitStructInSegment("Buffer Descriptors", NBuffers * sizeof(BufferDescPadded), - &foundDescs); + &foundDescs, BUFFER_DESCRIPTORS_SHMEM_SEGMENT); /* Align buffer pool on IO page size boundary. */ BufferBlocks = (char *) TYPEALIGN(PG_IO_ALIGN_SIZE, - ShmemInitStruct("Buffer Blocks", + ShmemInitStructInSegment("Buffer Blocks", NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, - &foundBufs)); + &foundBufs, BUFFERS_SHMEM_SEGMENT)); /* Align condition variables to cacheline boundary. */ BufferIOCVArray = (ConditionVariableMinimallyPadded *) - ShmemInitStruct("Buffer IO Condition Variables", + ShmemInitStructInSegment("Buffer IO Condition Variables", NBuffers * sizeof(ConditionVariableMinimallyPadded), - &foundIOCV); + &foundIOCV, BUFFER_IOCV_SHMEM_SEGMENT); /* * The array used to sort to-be-checkpointed buffer ids is located in @@ -99,8 +102,9 @@ BufferManagerShmemInit(void) * painful. */ CkptBufferIds = (CkptSortItem *) - ShmemInitStruct("Checkpoint BufferIds", - NBuffers * sizeof(CkptSortItem), &foundBufCkpt); + ShmemInitStructInSegment("Checkpoint BufferIds", + NBuffers * sizeof(CkptSortItem), &foundBufCkpt, + CHECKPOINT_BUFFERS_SHMEM_SEGMENT); if (foundDescs || foundBufs || foundIOCV || foundBufCkpt) { @@ -147,33 +151,54 @@ BufferManagerShmemInit(void) * BufferManagerShmemSize * * compute the size of shared memory for the buffer pool including - * data pages, buffer descriptors, hash tables, etc. + * data pages, buffer descriptors, hash tables, etc. based on the + * shared memory segment. The main segment must not allocate anything + * related to buffers, every other segment will receive part of the + * data. 
*/ Size -BufferManagerShmemSize(void) +BufferManagerShmemSize(int shmem_segment) { Size size = 0; - /* size of buffer descriptors */ - size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded))); - /* to allow aligning buffer descriptors */ - size = add_size(size, PG_CACHE_LINE_SIZE); + if (shmem_segment == MAIN_SHMEM_SEGMENT) + return size; - /* size of data pages, plus alignment padding */ - size = add_size(size, PG_IO_ALIGN_SIZE); - size = add_size(size, mul_size(NBuffers, BLCKSZ)); + if (shmem_segment == BUFFER_DESCRIPTORS_SHMEM_SEGMENT) + { + /* size of buffer descriptors */ + size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded))); + /* to allow aligning buffer descriptors */ + size = add_size(size, PG_CACHE_LINE_SIZE); + } - /* size of stuff controlled by freelist.c */ - size = add_size(size, StrategyShmemSize()); + if (shmem_segment == BUFFERS_SHMEM_SEGMENT) + { + /* size of data pages, plus alignment padding */ + size = add_size(size, PG_IO_ALIGN_SIZE); + size = add_size(size, mul_size(NBuffers, BLCKSZ)); + } - /* size of I/O condition variables */ - size = add_size(size, mul_size(NBuffers, - sizeof(ConditionVariableMinimallyPadded))); - /* to allow aligning the above */ - size = add_size(size, PG_CACHE_LINE_SIZE); + if (shmem_segment == STRATEGY_SHMEM_SEGMENT) + { + /* size of stuff controlled by freelist.c */ + size = add_size(size, StrategyShmemSize()); + } - /* size of checkpoint sort array in bufmgr.c */ - size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem))); + if (shmem_segment == BUFFER_IOCV_SHMEM_SEGMENT) + { + /* size of I/O condition variables */ + size = add_size(size, mul_size(NBuffers, + sizeof(ConditionVariableMinimallyPadded))); + /* to allow aligning the above */ + size = add_size(size, PG_CACHE_LINE_SIZE); + } + + if (shmem_segment == CHECKPOINT_BUFFERS_SHMEM_SEGMENT) + { + /* size of checkpoint sort array in bufmgr.c */ + size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem))); + } return size; } diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c index 1f6e215a2ca3..18a789671386 100644 --- a/src/backend/storage/buffer/buf_table.c +++ b/src/backend/storage/buffer/buf_table.c @@ -25,6 +25,7 @@ #include "funcapi.h" #include "storage/buf_internals.h" #include "storage/lwlock.h" +#include "storage/pg_shmem.h" #include "utils/rel.h" #include "utils/builtins.h" @@ -64,10 +65,11 @@ InitBufTable(int size) info.entrysize = sizeof(BufferLookupEnt); info.num_partitions = NUM_BUFFER_PARTITIONS; - SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table", + SharedBufHash = ShmemInitHashInSegment("Shared Buffer Lookup Table", size, size, &info, - HASH_ELEM | HASH_BLOBS | HASH_PARTITION | HASH_FIXED_SIZE); + HASH_ELEM | HASH_BLOBS | HASH_PARTITION | HASH_FIXED_SIZE, + STRATEGY_SHMEM_SEGMENT); } /* diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 7d59a92bd1a8..0bfbbb096d6a 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -19,6 +19,7 @@ #include "port/atomics.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/pg_shmem.h" #include "storage/proc.h" #define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var)))) @@ -381,9 +382,9 @@ StrategyInitialize(bool init) * Get or create the shared strategy control block */ StrategyControl = (BufferStrategyControl *) - ShmemInitStruct("Buffer Strategy Status", + ShmemInitStructInSegment("Buffer Strategy Status", 
										  sizeof(BufferStrategyControl),
-										  &found);
+										  &found, STRATEGY_SHMEM_SEGMENT);
 
 	if (!found)
 	{
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index b60f7ef9ce2a..2dbd81afc873 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -113,7 +113,7 @@ CalculateShmemSize(int *num_semaphores, int shmem_segment)
 										 sizeof(ShmemIndexEnt)));
 	size = add_size(size, dsm_estimate_size());
 	size = add_size(size, DSMRegistryShmemSize());
-	size = add_size(size, BufferManagerShmemSize());
+	size = add_size(size, BufferManagerShmemSize(shmem_segment));
 	size = add_size(size, LockManagerShmemSize());
 	size = add_size(size, PredicateLockShmemSize());
 	size = add_size(size, ProcGlobalShmemSize());
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 47360a3d3d85..f8d34513c7f5 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -318,7 +318,7 @@ extern void EvictRelUnpinnedBuffers(Relation rel,
 
 /* in buf_init.c */
 extern void BufferManagerShmemInit(void);
-extern Size BufferManagerShmemSize(void);
+extern Size BufferManagerShmemSize(int);
 
 /* in localbuf.c */
 extern void AtProcExit_LocalBuffers(void);
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 79b0b1ef9eb8..a7b275b4db97 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -52,7 +52,7 @@ typedef struct ShmemSegment
 } ShmemSegment;
 
 /* Number of available segments for anonymous memory mappings */
-#define ANON_MAPPINGS 1
+#define ANON_MAPPINGS 6
 
 extern PGDLLIMPORT ShmemSegment Segments[ANON_MAPPINGS];
 
@@ -109,7 +109,29 @@ extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags,
 							int *memfd_flags);
 void PrepareHugePages(void);
 
+/*
+ * To be able to dynamically resize the largest parts of the data stored in
+ * shared memory, we split it into multiple shared memory segments. Each
+ * segment contains only a certain part of the data, whose size depends on
+ * NBuffers.
+ */
+
 /* The main segment, contains everything except buffer blocks and related data. */
 #define MAIN_SHMEM_SEGMENT 0
 
+/* Buffer blocks */
+#define BUFFERS_SHMEM_SEGMENT 1
+
+/* Buffer descriptors */
+#define BUFFER_DESCRIPTORS_SHMEM_SEGMENT 2
+
+/* Condition variables for buffers */
+#define BUFFER_IOCV_SHMEM_SEGMENT 3
+
+/* Checkpoint BufferIds */
+#define CHECKPOINT_BUFFERS_SHMEM_SEGMENT 4
+
+/* Buffer strategy status */
+#define STRATEGY_SHMEM_SEGMENT 5
+
 #endif							/* PG_SHMEM_H */

From 811571266cae162a1392f499739cd6b268c396c9 Mon Sep 17 00:00:00 2001
From: Ashutosh Bapat
Date: Wed, 20 Aug 2025 11:35:20 +0530
Subject: [PATCH 08/16] Fix compilation failures from previous commits

shm_total_page_count is used uninitialized. If this variable has a
random value to start with, the final sum will be wrong.

Also include pg_shmem.h where shared memory segment macros are used.
Author: Ashutosh Bapat
---
 src/backend/storage/buffer/buf_init.c | 1 +
 src/backend/storage/ipc/shmem.c       | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 5383442e2134..6d703e18f8b9 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -16,6 +16,7 @@
 
 #include "storage/aio.h"
 #include "storage/buf_internals.h"
+#include "storage/pg_shmem.h"
 #include "storage/bufmgr.h"
 
 BufferDescPadded *BufferDescriptors;
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 9bb73f310521..e6cb919f0fc0 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -649,7 +649,7 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 	Size		os_page_size;
 	void	  **page_ptrs;
 	int		   *pages_status;
-	uint64		shm_total_page_count,
+	uint64		shm_total_page_count = 0,
 				shm_ent_page_count,
 				max_nodes;
 	Size	   *nodes;

From bb9c37f81cbbaab942f544d703a834eadd83e2b0 Mon Sep 17 00:00:00 2001
From: Ashutosh Bapat
Date: Thu, 21 Aug 2025 11:56:09 +0530
Subject: [PATCH 09/16] Refactor CalculateShmemSize()

This function calls many functions which return the amount of shared
memory required for different shared memory data structures. Up until
now, the returned total of these sizes was used to create a single
shared memory segment. But starting with the previous patch, we create
multiple shared memory segments, each of which contains one shared
memory structure related to shared buffers, plus one main memory segment
containing the rest of the structures.

Since CalculateShmemSize() is called for every shared memory segment,
and its return value is added to the memory required for all the shared
memory segments, we end up allocating more memory than required.
Instead, CalculateShmemSize() is called only once. Each of its callees
is expected to
a. return the size required from the main segment
b. add sizes to the AnonymousMappings corresponding to the other memory
   segments.

For individual modules to add memory to their respective
AnonymousMappings, we need to know the different mappings upfront. Hence
ANON_MAPPINGS replaces next_free_segment.

TODOs:
1. This change requires that the AnonymousMappings array and the macros
   defining the identifiers of each of the segments be
   platform-independent. This patch doesn't achieve that goal for all
   platforms, for example Windows. We need to fix that.
2. If postgres is invoked with -C shared_memory_size, it reports 0.
   That's because it reports the GUC values before the shared memory
   sizes are set in the AnonymousMappings. Fix that too.
3. Eliminate the asymmetry in CalculateShmemSize(). See the TODO in the
   prologue of CalculateShmemSize().
4. This is one way to avoid requesting more memory in each segment, but
   there may be other ways to design CalculateShmemSize(). This needs
   more thought and a better implementation.
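As an illustration of the new convention (hypothetical sketch, not part
of the patch: FooControl, FooPerBuffer, and FOO_SHMEM_SEGMENT are
invented names standing in for a module's own state and segment id), a
callee owning both fixed-size state and an NBuffers-dependent array
would now report its sizes roughly like this:

typedef struct FooControl { int dummy; } FooControl;
typedef struct FooPerBuffer { int dummy; } FooPerBuffer;

static Size
FooShmemSize(void)
{
	Size		size = 0;

	/* Fixed-size state is accounted in the main segment's total ... */
	size = add_size(size, MAXALIGN(sizeof(FooControl)));

	/* ... while NBuffers-dependent state goes to its own mapping. */
	Mappings[FOO_SHMEM_SEGMENT].shmem_req_size =
		add_size(Mappings[FOO_SHMEM_SEGMENT].shmem_req_size,
				 mul_size(NBuffers, sizeof(FooPerBuffer)));

	/* Only the main-segment portion is returned to the caller. */
	return size;
}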
Author: Ashutosh Bapat --- src/backend/port/sysv_shmem.c | 48 ++++++-------------- src/backend/port/win32_shmem.c | 7 +-- src/backend/postmaster/postmaster.c | 14 +++--- src/backend/storage/buffer/buf_init.c | 55 ++++++++--------------- src/backend/storage/ipc/ipci.c | 65 ++++++++++++++++++++++----- src/backend/storage/ipc/shmem.c | 8 ++-- src/backend/tcop/postgres.c | 14 +++--- src/include/storage/bufmgr.h | 2 +- src/include/storage/ipc.h | 2 +- src/include/storage/pg_shmem.h | 17 ++++++- 10 files changed, 125 insertions(+), 107 deletions(-) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index dac011b766bc..b85911bdfc4e 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -94,21 +94,7 @@ typedef enum unsigned long UsedShmemSegID = 0; void *UsedShmemSegAddr = NULL; -typedef struct AnonymousMapping -{ - int shmem_segment; - Size shmem_size; /* Size of the actually used memory */ - Size shmem_reserved; /* Size of the reserved mapping */ - Pointer shmem; /* Pointer to the start of the mapped memory */ - Pointer seg_addr; /* SysV shared memory for the header */ - unsigned long seg_id; /* IPC key */ - int segment_fd; /* fd for the backing anon file */ -} AnonymousMapping; - -static AnonymousMapping Mappings[ANON_MAPPINGS]; - -/* Keeps track of used mapping segments */ -static int next_free_segment = 0; +AnonymousMapping Mappings[ANON_MAPPINGS]; /* * Anonymous mapping layout we use looks like this: @@ -168,7 +154,7 @@ static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId, void *attachAt, PGShmemHeader **addr); -static const char* +const char* MappingName(int shmem_segment) { switch (shmem_segment) @@ -193,7 +179,7 @@ MappingName(int shmem_segment) static void DebugMappings() { - for(int i = 0; i < next_free_segment; i++) + for(int i = 0; i < ANON_MAPPINGS; i++) { AnonymousMapping m = Mappings[i]; elog(DEBUG1, "Mapping[%s]: addr %p, size %zu", @@ -851,9 +837,6 @@ PrepareHugePages() { void *ptr = MAP_FAILED; - /* Reset to handle reinitialization */ - next_free_segment = 0; - /* Complain if hugepages demanded but we can't possibly support them */ #if !defined(MAP_HUGETLB) if (huge_pages == HUGE_PAGES_ON) @@ -876,8 +859,7 @@ PrepareHugePages() */ for(int segment = 0; segment < ANON_MAPPINGS; segment++) { - int numSemas; - Size segment_size = CalculateShmemSize(&numSemas, segment); + Size segment_size = Mappings[segment].shmem_req_size; if (segment_size % hugepagesize != 0) segment_size += hugepagesize - (segment_size % hugepagesize); @@ -909,7 +891,7 @@ PrepareHugePages() static void AnonymousShmemDetach(int status, Datum arg) { - for(int i = 0; i < next_free_segment; i++) + for(int i = 0; i < ANON_MAPPINGS; i++) { AnonymousMapping m = Mappings[i]; @@ -927,7 +909,7 @@ AnonymousShmemDetach(int status, Datum arg) /* * PGSharedMemoryCreate * - * Create a shared memory segment of the given size and initialize its + * Create a shared memory segment for the given mapping and initialize its * standard header. Also, register an on_shmem_exit callback to release * the storage. * @@ -937,7 +919,7 @@ AnonymousShmemDetach(int status, Datum arg) * postmaster or backend. 
*/ PGShmemHeader * -PGSharedMemoryCreate(Size size, +PGSharedMemoryCreate(AnonymousMapping *mapping, PGShmemHeader **shim) { IpcMemoryKey NextShmemSegID; @@ -945,7 +927,6 @@ PGSharedMemoryCreate(Size size, PGShmemHeader *hdr; struct stat statbuf; Size sysvsize, total_reserved; - AnonymousMapping *mapping = &Mappings[next_free_segment]; /* * We use the data directory's ID info (inode and device numbers) to @@ -965,13 +946,12 @@ PGSharedMemoryCreate(Size size, errmsg("huge pages not supported with the current \"shared_memory_type\" setting"))); /* Room for a header? */ - Assert(size > MAXALIGN(sizeof(PGShmemHeader))); + Assert(mapping->shmem_req_size > MAXALIGN(sizeof(PGShmemHeader))); /* Prepare the mapping information */ - mapping->shmem_size = size; - mapping->shmem_segment = next_free_segment; + mapping->shmem_size = mapping->shmem_req_size; total_reserved = (Size) MaxAvailableMemory * BLCKSZ; - mapping->shmem_reserved = total_reserved * SHMEM_RESIZE_RATIO[next_free_segment]; + mapping->shmem_reserved = total_reserved * SHMEM_RESIZE_RATIO[mapping->shmem_segment]; /* Round up to be a multiple of BLCKSZ */ mapping->shmem_reserved = mapping->shmem_reserved + BLCKSZ - @@ -982,8 +962,6 @@ PGSharedMemoryCreate(Size size, /* On success, mapping data will be modified. */ CreateAnonymousSegment(mapping); - next_free_segment++; - /* Register on-exit routine to unmap the anonymous segment */ on_shmem_exit(AnonymousShmemDetach, (Datum) 0); @@ -992,7 +970,7 @@ PGSharedMemoryCreate(Size size, } else { - sysvsize = size; + sysvsize = mapping->shmem_req_size; /* huge pages are only available with mmap */ SetConfigOption("huge_pages_status", "off", @@ -1005,7 +983,7 @@ PGSharedMemoryCreate(Size size, * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure * that, but prefer fixing it over coping here.) */ - NextShmemSegID = statbuf.st_ino + next_free_segment; + NextShmemSegID = statbuf.st_ino + mapping->shmem_segment; for (;;) { @@ -1214,7 +1192,7 @@ PGSharedMemoryNoReAttach(void) void PGSharedMemoryDetach(void) { - for(int i = 0; i < next_free_segment; i++) + for(int i = 0; i < ANON_MAPPINGS; i++) { AnonymousMapping m = Mappings[i]; diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c index 732fedee87e0..1db07ff65d34 100644 --- a/src/backend/port/win32_shmem.c +++ b/src/backend/port/win32_shmem.c @@ -204,7 +204,7 @@ EnableLockPagesPrivilege(int elevel) * standard header. */ PGShmemHeader * -PGSharedMemoryCreate(Size size, +PGSharedMemoryCreate(AnonymousMapping *mapping, PGShmemHeader **shim) { void *memAddress; @@ -216,7 +216,7 @@ PGSharedMemoryCreate(Size size, DWORD size_high; DWORD size_low; SIZE_T largePageSize = 0; - Size orig_size = size; + Size size = mapping->shmem_req_size; DWORD flProtect = PAGE_READWRITE; DWORD desiredAccess; @@ -304,7 +304,7 @@ PGSharedMemoryCreate(Size size, * Use the original size, not the rounded-up value, when * falling back to non-huge pages. 
*/ - size = orig_size; + size = mapping->shmem_req_size; flProtect = PAGE_READWRITE; goto retry; } @@ -391,6 +391,7 @@ PGSharedMemoryCreate(Size size, hdr->totalsize = size; hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); hdr->dsm_control = 0; + mapping->shmem_size = size; /* Save info for possible future use */ UsedShmemSegAddr = memAddress; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index e1d643b013d7..b59d20b4ac22 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -963,13 +963,6 @@ PostmasterMain(int argc, char *argv[]) */ process_shmem_requests(); - /* - * Now that loadable modules have had their chance to request additional - * shared memory, determine the value of any runtime-computed GUCs that - * depend on the amount of shared memory required. - */ - InitializeShmemGUCs(); - /* * Now that modules have been loaded, we can process any custom resource * managers specified in the wal_consistency_checking GUC. @@ -1005,6 +998,13 @@ PostmasterMain(int argc, char *argv[]) */ CreateSharedMemoryAndSemaphores(); + /* + * Now that loadable modules have had their chance to request additional + * shared memory, determine the value of any runtime-computed GUCs that + * depend on the amount of shared memory required. + */ + InitializeShmemGUCs(); + /* * Estimate number of openable files. This must happen after setting up * semaphores, because on some platforms semaphores count as open files. diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 6d703e18f8b9..6f148d1d80b2 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -158,48 +158,31 @@ BufferManagerShmemInit(void) * data. */ Size -BufferManagerShmemSize(int shmem_segment) +BufferManagerShmemSize(void) { - Size size = 0; + size_t size; - if (shmem_segment == MAIN_SHMEM_SEGMENT) - return size; + /* size of buffer descriptors, plus alignment padding */ + size = add_size(0, mul_size(NBuffers, sizeof(BufferDescPadded))); + size = add_size(size, PG_CACHE_LINE_SIZE); + Mappings[BUFFER_DESCRIPTORS_SHMEM_SEGMENT].shmem_req_size = size; - if (shmem_segment == BUFFER_DESCRIPTORS_SHMEM_SEGMENT) - { - /* size of buffer descriptors */ - size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded))); - /* to allow aligning buffer descriptors */ - size = add_size(size, PG_CACHE_LINE_SIZE); - } + /* size of data pages, plus alignment padding */ + size = add_size(0, PG_IO_ALIGN_SIZE); + size = add_size(size, mul_size(NBuffers, BLCKSZ)); + Mappings[BUFFERS_SHMEM_SEGMENT].shmem_req_size = size; - if (shmem_segment == BUFFERS_SHMEM_SEGMENT) - { - /* size of data pages, plus alignment padding */ - size = add_size(size, PG_IO_ALIGN_SIZE); - size = add_size(size, mul_size(NBuffers, BLCKSZ)); - } + /* size of stuff controlled by freelist.c */ + Mappings[STRATEGY_SHMEM_SEGMENT].shmem_req_size = StrategyShmemSize(); - if (shmem_segment == STRATEGY_SHMEM_SEGMENT) - { - /* size of stuff controlled by freelist.c */ - size = add_size(size, StrategyShmemSize()); - } + /* size of I/O condition variables, plus alignment padding */ + size = add_size(0, mul_size(NBuffers, + sizeof(ConditionVariableMinimallyPadded))); + size = add_size(size, PG_CACHE_LINE_SIZE); + Mappings[BUFFER_IOCV_SHMEM_SEGMENT].shmem_req_size = size; - if (shmem_segment == BUFFER_IOCV_SHMEM_SEGMENT) - { - /* size of I/O condition variables */ - size = add_size(size, mul_size(NBuffers, - 
sizeof(ConditionVariableMinimallyPadded)));
-		/* to allow aligning the above */
-		size = add_size(size, PG_CACHE_LINE_SIZE);
-	}
-
-	if (shmem_segment == CHECKPOINT_BUFFERS_SHMEM_SEGMENT)
-	{
-		/* size of checkpoint sort array in bufmgr.c */
-		size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem)));
-	}
+	/* size of checkpoint sort array in bufmgr.c */
+	Mappings[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_req_size = mul_size(NBuffers, sizeof(CkptSortItem));
 
 	return size;
 }
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 2dbd81afc873..2cd278449f0d 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -84,9 +84,23 @@ RequestAddinShmemSpace(Size size)
  *
  * If num_semaphores is not NULL, it will be set to the number of semaphores
  * required.
+ *
+ * TODO: Right now the callees of this function return the size of shared
+ * memory required in the main shared memory segment, but add the sizes
+ * required from other segments to the respective mappings. I think we should
+ * change this asymmetry. It's only the buffer manager which adds sizes for
+ * other segments, but in the future there may be others. Further, the other
+ * buffer-manager-related segments are expected to hold only one resizable
+ * structure each, so their size should be set only once, when changing the
+ * shared buffer pool size (i.e. when changing the shared_buffers GUC). We
+ * shouldn't allow adding more structures to these segments, and should thus
+ * restrict adding sizes to the corresponding mappings after the initial size
+ * is set.
+ *
+ * TODO: Also we should do something about numSemas, which is not required
+ * everywhere CalculateShmemSize is called.
 */
 Size
-CalculateShmemSize(int *num_semaphores, int shmem_segment)
+CalculateShmemSize(int *num_semaphores)
 {
 	Size		size;
 	int			numSemas;
@@ -113,7 +127,13 @@ CalculateShmemSize(int *num_semaphores, int shmem_segment)
 										 sizeof(ShmemIndexEnt)));
 	size = add_size(size, dsm_estimate_size());
 	size = add_size(size, DSMRegistryShmemSize());
-	size = add_size(size, BufferManagerShmemSize(shmem_segment));
+
+	/*
+	 * The buffer manager adds its estimates for every other shared memory
+	 * segment that it uses to the corresponding AnonymousMappings. Only the
+	 * size required from the main shared memory segment is considered here.
+	 */
+	size = add_size(size, BufferManagerShmemSize());
 	size = add_size(size, LockManagerShmemSize());
 	size = add_size(size, PredicateLockShmemSize());
 	size = add_size(size, ProcGlobalShmemSize());
@@ -154,8 +174,15 @@ CalculateShmemSize(int *num_semaphores, int shmem_segment)
 	/* include additional requested shmem from preload libraries */
 	size = add_size(size, total_addin_request);
 
+	/*
+	 * All the shared memory allocations considered so far happen in the main
+	 * shared memory segment.
+ */ + Mappings[MAIN_SHMEM_SEGMENT].shmem_req_size = size; + /* might as well round it off to a multiple of a typical page size */ - size = add_size(size, 8192 - (size % 8192)); + for (int segment = 0; segment < ANON_MAPPINGS; segment++) + Mappings[segment].shmem_req_size = add_size(Mappings[segment].shmem_req_size, 8192 - (Mappings[segment].shmem_req_size % 8192)); return size; } @@ -201,26 +228,30 @@ CreateSharedMemoryAndSemaphores(void) { PGShmemHeader *shim; PGShmemHeader *seghdr; - Size size; int numSemas; Assert(!IsUnderPostmaster); + CalculateShmemSize(&numSemas); + /* Decide if we use huge pages or regular size pages */ PrepareHugePages(); for(int segment = 0; segment < ANON_MAPPINGS; segment++) { + AnonymousMapping *mapping = &Mappings[segment]; + + mapping->shmem_segment = segment; + /* Compute the size of the shared-memory block */ - size = CalculateShmemSize(&numSemas, segment); - elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size); + elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", mapping->shmem_req_size); /* * Create the shmem segment. * * XXX: Do multiple shims are needed, one per segment? */ - seghdr = PGSharedMemoryCreate(size, &shim); + seghdr = PGSharedMemoryCreate(mapping, &shim); /* * Make sure that huge pages are never reported as "unknown" while the @@ -232,9 +263,13 @@ CreateSharedMemoryAndSemaphores(void) InitShmemAccessInSegment(seghdr, segment); /* - * Create semaphores + * Shared memory for semaphores is allocated in the main shared memory. + * Hence they are allocated after the main segment is created. Patch + * proposed at https://commitfest.postgresql.org/patch/5997/ simplifies + * this. */ - PGReserveSemaphores(numSemas, segment); + if (segment == MAIN_SHMEM_SEGMENT) + PGReserveSemaphores(numSemas, segment); /* * Set up shared memory allocation mechanism @@ -357,7 +392,9 @@ CreateOrAttachShmemStructs(void) * InitializeShmemGUCs * * This function initializes runtime-computed GUCs related to the amount of - * shared memory required for the current configuration. + * shared memory required for the current configuration. It assumes that the + * memory required by the shared memory segments is already calculated and is + * available in AnonymousMappings. */ void InitializeShmemGUCs(void) @@ -366,12 +403,16 @@ InitializeShmemGUCs(void) Size size_b; Size size_mb; Size hp_size; - int num_semas; + int num_semas = ProcGlobalSemas(); + int i; /* * Calculate the shared memory size and round up to the nearest megabyte. 
*/ - size_b = CalculateShmemSize(&num_semas, MAIN_SHMEM_SEGMENT); + size_b = 0; + for (i = 0; i < ANON_MAPPINGS; i++) + size_b = add_size(size_b, Mappings[i].shmem_req_size); + size_mb = add_size(size_b, (1024 * 1024) - 1) / (1024 * 1024); sprintf(buf, "%zu", size_mb); SetConfigOption("shared_memory_size", buf, diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index e6cb919f0fc0..90c21a972259 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -178,8 +178,8 @@ ShmemAllocInSegment(Size size, int shmem_segment) if (!newSpace) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of shared memory (%zu bytes requested)", - size))); + errmsg("out of shared memory in segment %s (%zu bytes requested)", + MappingName(shmem_segment), size))); return newSpace; } @@ -286,8 +286,8 @@ ShmemAllocUnlockedInSegment(Size size, int shmem_segment) if (newFree > Segments[shmem_segment].ShmemSegHdr->totalsize) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of shared memory (%zu bytes requested)", - size))); + errmsg("out of shared memory in segment %s (%zu bytes requested)", + MappingName(shmem_segment), size))); Segments[shmem_segment].ShmemSegHdr->freeoffset = newFree; newSpace = (char *) Segments[shmem_segment].ShmemBase + newStart; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 8d4d6cc3f333..c819608fff63 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -4132,13 +4132,6 @@ PostgresSingleUserMain(int argc, char *argv[], */ process_shmem_requests(); - /* - * Now that loadable modules have had their chance to request additional - * shared memory, determine the value of any runtime-computed GUCs that - * depend on the amount of shared memory required. - */ - InitializeShmemGUCs(); - /* * Now that modules have been loaded, we can process any custom resource * managers specified in the wal_consistency_checking GUC. @@ -4151,6 +4144,13 @@ PostgresSingleUserMain(int argc, char *argv[], */ CreateSharedMemoryAndSemaphores(); + /* + * Now that loadable modules have had their chance to request additional + * shared memory, determine the value of any runtime-computed GUCs that + * depend on the amount of shared memory required. + */ + InitializeShmemGUCs(); + /* * Estimate number of openable files. This must happen after setting up * semaphores, because on some platforms semaphores count as open files. 
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index f8d34513c7f5..47360a3d3d85 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -318,7 +318,7 @@ extern void EvictRelUnpinnedBuffers(Relation rel, /* in buf_init.c */ extern void BufferManagerShmemInit(void); -extern Size BufferManagerShmemSize(int); +extern Size BufferManagerShmemSize(void); /* in localbuf.c */ extern void AtProcExit_LocalBuffers(void); diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h index 6ebda479ced7..3baf418b3d1e 100644 --- a/src/include/storage/ipc.h +++ b/src/include/storage/ipc.h @@ -77,7 +77,7 @@ extern void check_on_shmem_exit_lists_are_empty(void); /* ipci.c */ extern PGDLLIMPORT shmem_startup_hook_type shmem_startup_hook; -extern Size CalculateShmemSize(int *num_semaphores, int shmem_segment); +extern Size CalculateShmemSize(int *num_semaphores); extern void CreateSharedMemoryAndSemaphores(void); #ifdef EXEC_BACKEND extern void AttachSharedMemoryStructs(void); diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index a7b275b4db97..a1fa6b43fe38 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -27,6 +27,18 @@ #include "storage/dsm_impl.h" #include "storage/spin.h" +typedef struct AnonymousMapping +{ + int shmem_segment; /* TODO: Do we really need it? */ + Size shmem_req_size; /* Required size of the segment */ + Size shmem_size; /* Size of the actually used memory */ + Size shmem_reserved; /* Size of the reserved mapping */ + Pointer shmem; /* Pointer to the start of the mapped memory */ + Pointer seg_addr; /* SysV shared memory for the header */ + unsigned long seg_id; /* IPC key */ + int segment_fd; /* fd for the backing anon file */ +} AnonymousMapping; + typedef struct PGShmemHeader /* standard header for all Postgres shmem */ { int32 magic; /* magic # to identify Postgres segments */ @@ -55,6 +67,8 @@ typedef struct ShmemSegment #define ANON_MAPPINGS 6 extern PGDLLIMPORT ShmemSegment Segments[ANON_MAPPINGS]; +extern PGDLLIMPORT AnonymousMapping Mappings[ANON_MAPPINGS]; + /* GUC variables */ extern PGDLLIMPORT int shared_memory_type; @@ -101,10 +115,11 @@ extern void PGSharedMemoryReAttach(void); extern void PGSharedMemoryNoReAttach(void); #endif -extern PGShmemHeader *PGSharedMemoryCreate(Size size, +extern PGShmemHeader *PGSharedMemoryCreate(AnonymousMapping *mapping, PGShmemHeader **shim); extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2); extern void PGSharedMemoryDetach(void); +extern const char *MappingName(int shmem_segment); extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags); void PrepareHugePages(void); From 70b4164c9d63dbc24aef6180af722e480258b1b9 Mon Sep 17 00:00:00 2001 From: Ashutosh Bapat Date: Wed, 20 Aug 2025 10:55:27 +0530 Subject: [PATCH 10/16] WIP: Monitoring views Modifies pg_shmem_allocations to report shared memory segment as well. Adds pg_shmem_segments to report shared memory segment information. TODO: This commit should be merged with the earlier commit introducing multiple shared memory segments. 
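As a usage sketch (hypothetical queries; the output columns of
pg_shmem_segments are whatever pg_get_shmem_segments() below returns),
the new views could be exercised like this:

-- Named allocations broken down by the segment they live in.
SELECT segment, count(*) AS entries, sum(allocated_size) AS bytes
  FROM pg_shmem_allocations
 GROUP BY segment
 ORDER BY bytes DESC;

-- One row per shared memory segment, including used and reserved sizes.
SELECT * FROM pg_shmem_segments;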
Author: Ashutosh Bapat --- doc/src/sgml/system-views.sgml | 9 +++ src/backend/catalog/system_views.sql | 7 +++ src/backend/storage/ipc/shmem.c | 90 ++++++++++++++++++++++------ src/include/catalog/pg_proc.dat | 12 +++- src/include/storage/pg_shmem.h | 1 - src/include/storage/shmem.h | 1 + src/test/regress/expected/rules.out | 10 +++- 7 files changed, 108 insertions(+), 22 deletions(-) diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 89be9bc333fe..7d14a6eca24e 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -4167,6 +4167,15 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>segment</structfield> <type>text</type>
+      </para>
+      <para>
+       The name of the shared memory segment containing the allocation.
+      </para></entry>
+     </row>
+
     <row>
      <entry role="catalog_table_entry"><para role="column_definition">
       <structfield>off</structfield> <type>int8</type>
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 46fc28396de9..f659dbb2f862 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -658,6 +658,13 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats;
 REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC;
 GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats;
 
+CREATE VIEW pg_shmem_segments AS
+    SELECT * FROM pg_get_shmem_segments();
+
+REVOKE ALL ON pg_shmem_segments FROM PUBLIC;
+GRANT SELECT ON pg_shmem_segments TO pg_read_all_stats;
+REVOKE EXECUTE ON FUNCTION pg_get_shmem_segments() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_shmem_segments() TO pg_read_all_stats;
+
 CREATE VIEW pg_shmem_allocations_numa AS
     SELECT * FROM pg_get_shmem_allocations_numa();
 
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 90c21a972259..9499f332e77f 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -531,6 +531,7 @@ ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr,
 		result->size = size;
 		result->allocated_size = allocated_size;
 		result->location = structPtr;
+		result->shmem_segment = shmem_segment;
 	}
 
 	LWLockRelease(ShmemIndexLock);
@@ -582,13 +583,14 @@ mul_size(Size s1, Size s2)
 Datum
 pg_get_shmem_allocations(PG_FUNCTION_ARGS)
 {
-#define PG_GET_SHMEM_SIZES_COLS 4
+#define PG_GET_SHMEM_SIZES_COLS 5
 	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
 	HASH_SEQ_STATUS hstat;
 	ShmemIndexEnt *ent;
-	Size		named_allocated = 0;
+	Size		named_allocated[ANON_MAPPINGS] = {0};
 	Datum		values[PG_GET_SHMEM_SIZES_COLS];
 	bool		nulls[PG_GET_SHMEM_SIZES_COLS];
+	int			i;
 
 	InitMaterializedSRF(fcinfo, 0);
 
@@ -598,33 +600,42 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
 	/* output all allocated entries */
 	memset(nulls, 0, sizeof(nulls));
 
-	/* XXX: take all shared memory segments into account.
*/ while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL) { values[0] = CStringGetTextDatum(ent->key); - values[1] = Int64GetDatum((char *) ent->location - (char *) Segments[MAIN_SHMEM_SEGMENT].ShmemSegHdr); - values[2] = Int64GetDatum(ent->size); - values[3] = Int64GetDatum(ent->allocated_size); - named_allocated += ent->allocated_size; + values[1] = CStringGetTextDatum(MappingName(ent->shmem_segment)); + values[2] = Int64GetDatum((char *) ent->location - (char *) Segments[ent->shmem_segment].ShmemSegHdr); + values[3] = Int64GetDatum(ent->size); + values[4] = Int64GetDatum(ent->allocated_size); + named_allocated[ent->shmem_segment] += ent->allocated_size; tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); } /* output shared memory allocated but not counted via the shmem index */ - values[0] = CStringGetTextDatum(""); - nulls[1] = true; - values[2] = Int64GetDatum(Segments[MAIN_SHMEM_SEGMENT].ShmemSegHdr->freeoffset - named_allocated); - values[3] = values[2]; - tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + for (i = 0; i < ANON_MAPPINGS; i++) + { + values[0] = CStringGetTextDatum(""); + values[1] = CStringGetTextDatum(MappingName(i)); + nulls[2] = true; + values[3] = Int64GetDatum(Segments[i].ShmemSegHdr->freeoffset - named_allocated[i]); + values[4] = values[3]; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } /* output as-of-yet unused shared memory */ - nulls[0] = true; - values[1] = Int64GetDatum(Segments[MAIN_SHMEM_SEGMENT].ShmemSegHdr->freeoffset); - nulls[1] = false; - values[2] = Int64GetDatum(Segments[MAIN_SHMEM_SEGMENT].ShmemSegHdr->totalsize - Segments[MAIN_SHMEM_SEGMENT].ShmemSegHdr->freeoffset); - values[3] = values[2]; - tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + memset(nulls, 0, sizeof(nulls)); + + for (i = 0; i < ANON_MAPPINGS; i++) + { + nulls[0] = true; + values[1] = CStringGetTextDatum(MappingName(i)); + values[2] = Int64GetDatum(Segments[i].ShmemSegHdr->freeoffset); + values[3] = Int64GetDatum(Segments[i].ShmemSegHdr->totalsize - Segments[i].ShmemSegHdr->freeoffset); + values[4] = values[3]; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } LWLockRelease(ShmemIndexLock); @@ -825,3 +836,46 @@ pg_numa_available(PG_FUNCTION_ARGS) { PG_RETURN_BOOL(pg_numa_init() != -1); } + +/* SQL SRF showing shared memory segments */ +Datum +pg_get_shmem_segments(PG_FUNCTION_ARGS) +{ +#define PG_GET_SHMEM_SEGS_COLS 6 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Datum values[PG_GET_SHMEM_SEGS_COLS]; + bool nulls[PG_GET_SHMEM_SEGS_COLS]; + int i; + + InitMaterializedSRF(fcinfo, 0); + + /* output all allocated entries */ + for (i = 0; i < ANON_MAPPINGS; i++) + { + PGShmemHeader *shmhdr = Segments[i].ShmemSegHdr; + AnonymousMapping *segmapping = &Mappings[i]; + int j; + + if (shmhdr == NULL) + { + for (j = 0; j < PG_GET_SHMEM_SEGS_COLS; j++) + nulls[j] = true; + } + else + { + memset(nulls, 0, sizeof(nulls)); + values[0] = Int32GetDatum(i); + values[1] = CStringGetTextDatum(MappingName(i)); + values[2] = Int64GetDatum(shmhdr->totalsize); + values[3] = Int64GetDatum(shmhdr->freeoffset); + values[4] = Int64GetDatum(segmapping->shmem_size); + values[5] = Int64GetDatum(segmapping->shmem_reserved); + } + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + return (Datum) 0; +} + diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 229999ff2623..cbafefe20c49 100644 
--- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -8568,8 +8568,8 @@ { oid => '5052', descr => 'allocations from the main shared memory segment', proname => 'pg_get_shmem_allocations', prorows => '50', proretset => 't', provolatile => 'v', prorettype => 'record', proargtypes => '', - proallargtypes => '{text,int8,int8,int8}', proargmodes => '{o,o,o,o}', - proargnames => '{name,off,size,allocated_size}', + proallargtypes => '{text,text,int8,int8,int8}', proargmodes => '{o,o,o,o,o}', + proargnames => '{name,segment,off,size,allocated_size}', prosrc => 'pg_get_shmem_allocations' }, { oid => '4099', descr => 'Is NUMA support available?', @@ -8592,6 +8592,14 @@ proargmodes => '{o,o,o}', proargnames => '{name,type,size}', prosrc => 'pg_get_dsm_registry_allocations' }, +# shared memory segments +{ oid => '5101', descr => 'shared memory segments', + proname => 'pg_get_shmem_segments', prorows => '6', proretset => 't', + provolatile => 'v', prorettype => 'record', proargtypes => '', + proallargtypes => '{int4,text,int8,int8,int8,int8}', proargmodes => '{o,o,o,o,o,o}', + proargnames => '{id,name,size,freeoffset,mapping_size,mapping_reserved_size}', + prosrc => 'pg_get_shmem_segments' }, + # buffer lookup table { oid => '5102', descr => 'shared buffer lookup table', diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index a1fa6b43fe38..715f6acb5dd5 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -69,7 +69,6 @@ typedef struct ShmemSegment extern PGDLLIMPORT ShmemSegment Segments[ANON_MAPPINGS]; extern PGDLLIMPORT AnonymousMapping Mappings[ANON_MAPPINGS]; - /* GUC variables */ extern PGDLLIMPORT int shared_memory_type; extern PGDLLIMPORT int huge_pages; diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h index 910c43f54f4f..64ff5a286ba8 100644 --- a/src/include/storage/shmem.h +++ b/src/include/storage/shmem.h @@ -71,6 +71,7 @@ typedef struct void *location; /* location in shared mem */ Size size; /* # bytes requested for the structure */ Size allocated_size; /* # bytes actually allocated */ + int shmem_segment; /* segment in which the structure is allocated */ } ShmemIndexEnt; #endif /* SHMEM_H */ diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 760bb13fe95b..e73314b5ef0e 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1764,14 +1764,22 @@ pg_shadow| SELECT pg_authid.rolname AS usename, LEFT JOIN pg_db_role_setting s ON (((pg_authid.oid = s.setrole) AND (s.setdatabase = (0)::oid)))) WHERE pg_authid.rolcanlogin; pg_shmem_allocations| SELECT name, + segment, off, size, allocated_size - FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size); + FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, segment, off, size, allocated_size); pg_shmem_allocations_numa| SELECT name, numa_node, size FROM pg_get_shmem_allocations_numa() pg_get_shmem_allocations_numa(name, numa_node, size); +pg_shmem_segments| SELECT id, + name, + size, + freeoffset, + mapping_size, + mapping_reserved_size + FROM pg_get_shmem_segments() pg_get_shmem_segments(id, name, size, freeoffset, mapping_size, mapping_reserved_size); pg_stat_activity| SELECT s.datid, d.datname, s.pid, From 76d28dbf960598cf47b28a0f89baa9c50946d73e Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Tue, 17 Jun 2025 14:16:55 +0200 Subject: [PATCH 11/16] Allow to resize shared memory without restart 
Add an assign hook for shared_buffers to resize shared memory using the
space introduced in the previous commits, without requiring a PostgreSQL
restart. Essentially the implementation is based on two mechanisms: a
ProcSignalBarrier is used to make sure all processes start the resize
procedure simultaneously, and a global Barrier is used to coordinate
after that and make sure all finished processes wait for others that are
still in progress.

The resize process looks like this:

* The GUC assign hook sets a flag to let the postmaster know that a
  resize was requested.

* The postmaster verifies the flag in the event loop, and starts the
  resize by emitting a ProcSignal barrier.

* All processes that participate in the ProcSignal mechanism begin to
  process the ProcSignal barrier. First a process waits until all
  processes have confirmed they received the message and can start
  simultaneously.

* Every process recalculates the shared memory size based on the new
  NBuffers, adjusts the segment sizes using ftruncate and adjusts the
  reservation permissions with mprotect. One elected process signals the
  postmaster to do the same.

* When finished, every process waits on a global ShmemControl barrier
  until all others are finished as well. This way we ensure three stages
  with clear boundaries: before the resize, when all processes use the
  old NBuffers; during the resize, when processes have a mix of old and
  new NBuffers and wait until it's done; and after the resize, when all
  processes use the new NBuffers.

* After all processes are using the new value, one of them initializes
  the new shared structures (buffer blocks, descriptors, etc.) as needed
  and broadcasts the new value of NBuffers via ShmemControl in shared
  memory. Other backends wait for this operation to finish as well. Then
  the barrier is lifted and everything goes as usual.

Since resizing takes time, we need to take into account that during that
time:

- New backends can be spawned. They will check the status of the barrier
  early during bootstrap, and wait until everything is over before
  working with the new NBuffers value.

- Old backends can exit before attempting to resize. The synchronization
  used between backends relies on ProcSignalBarrier and waits at the
  beginning until all participants have received the message, to gather
  all existing backends.

- Some backends might be blocked and not responding, either before or
  after receiving the message. In the first case such a backend still
  has a ProcSignalSlot and should be waited for; in the second case the
  shared barrier will make sure we are still waiting for those backends.
  In either case there is an unbounded wait.

- Backends might join the barrier in disjoint groups with some time in
  between. That means that relying only on the shared dynamic barrier is
  not enough -- it would only synchronize the resize procedure within
  those groups. That's why we first wait for all participants of the
  ProcSignal mechanism who received the message.
Here is how it looks after raising shared_buffers from 128 MB to 512 MB
and calling pg_reload_conf():

-- 128 MB
7f87909fc000-7f8798248000 rw-s /memfd:strategy (deleted)
7f8798248000-7f879d6ca000 ---s /memfd:strategy (deleted)
7f879d6ca000-7f87a4e84000 rw-s /memfd:checkpoint (deleted)
7f87a4e84000-7f87aa398000 ---s /memfd:checkpoint (deleted)
7f87aa398000-7f87b1b42000 rw-s /memfd:iocv (deleted)
7f87b1b42000-7f87c3d32000 ---s /memfd:iocv (deleted)
7f87c3d32000-7f87cb59c000 rw-s /memfd:descriptors (deleted)
7f87cb59c000-7f87dd6cc000 ---s /memfd:descriptors (deleted)
7f87dd6cc000-7f87ece38000 rw-s /memfd:buffers (deleted)
^ buffers content, ~247 MB
7f87ece38000-7f8877066000 ---s /memfd:buffers (deleted)
^ reserved space, ~2210 MB
7f8877066000-7f887e7d0000 rw-s /memfd:main (deleted)
7f887e7d0000-7f8890a00000 ---s /memfd:main (deleted)

-- 512 MB
7f87909fc000-7f879866a000 rw-s /memfd:strategy (deleted)
7f879866a000-7f879d6ca000 ---s /memfd:strategy (deleted)
7f879d6ca000-7f87a50f4000 rw-s /memfd:checkpoint (deleted)
7f87a50f4000-7f87aa398000 ---s /memfd:checkpoint (deleted)
7f87aa398000-7f87b1d82000 rw-s /memfd:iocv (deleted)
7f87b1d82000-7f87c3d32000 ---s /memfd:iocv (deleted)
7f87c3d32000-7f87cba1c000 rw-s /memfd:descriptors (deleted)
7f87cba1c000-7f87dd6cc000 ---s /memfd:descriptors (deleted)
7f87dd6cc000-7f8804fb8000 rw-s /memfd:buffers (deleted)
^ buffers content, ~632 MB
7f8804fb8000-7f8877066000 ---s /memfd:buffers (deleted)
^ reserved space, ~1824 MB
7f8877066000-7f887e950000 rw-s /memfd:main (deleted)
7f887e950000-7f8890a00000 ---s /memfd:main (deleted)

The implementation supports only increasing shared_buffers. Decreasing
the value needs a similar procedure, but the buffer blocks containing
data have to be drained first, so that the actual data set fits into the
new, smaller space. Experiments show that shared mappings have to be
extended separately in each process that uses them. Another rough edge
is that a backend blocked on ReadCommand will not apply a shared_buffers
change until it receives something.
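For reference, since this patch changes shared_buffers from
PGC_POSTMASTER to PGC_SIGHUP (see the guc_parameters.dat hunk below),
the resize shown above can be triggered with the usual configuration
facilities, e.g.:

    ALTER SYSTEM SET shared_buffers = '512MB';
    SELECT pg_reload_conf();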
Authors: Dmitrii Dolgov, Ashutosh Bapat
---
 src/backend/port/sysv_shmem.c                 | 443 ++++++++++++++++++
 src/backend/postmaster/checkpointer.c         |  12 +-
 src/backend/postmaster/postmaster.c           |  18 +
 src/backend/storage/buffer/buf_init.c         |  60 ++-
 src/backend/storage/ipc/ipci.c                |  15 +-
 src/backend/storage/ipc/procsignal.c          |  46 ++
 src/backend/storage/ipc/shmem.c               |  23 +-
 src/backend/tcop/postgres.c                   |  10 +
 .../utils/activity/wait_event_names.txt       |   3 +
 src/backend/utils/misc/guc_parameters.dat     |   3 +-
 src/include/storage/bufmgr.h                  |   2 +-
 src/include/storage/ipc.h                     |   3 +
 src/include/storage/lwlocklist.h              |   1 +
 src/include/storage/pg_shmem.h                |  26 +
 src/include/storage/pmsignal.h                |   1 +
 src/include/storage/procsignal.h              |   1 +
 src/tools/pgindent/typedefs.list              |   1 +
 17 files changed, 631 insertions(+), 37 deletions(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index b85911bdfc4e..dc4eeeee56a2 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -30,13 +30,19 @@
 #include "miscadmin.h"
 #include "port/pg_bitutils.h"
 #include "portability/mem.h"
+#include "storage/bufmgr.h"
 #include "storage/dsm.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
+#include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
+#include "storage/pmsignal.h"
+#include "storage/procsignal.h"
+#include "storage/shmem.h"
 #include "utils/guc.h"
 #include "utils/guc_hooks.h"
 #include "utils/pidfile.h"
+#include "utils/wait_event.h"
 
 
 /*
@@ -96,6 +102,13 @@ void	   *UsedShmemSegAddr = NULL;
 
 AnonymousMapping Mappings[ANON_MAPPINGS];
 
+/* Flag telling the postmaster that a resize is needed */
+volatile bool pending_pm_shmem_resize = false;
+
+/* Keeps track of the previous NBuffers value */
+static int	NBuffersOld = -1;
+static int	NBuffersPending = -1;
+
 /*
  * Anonymous mapping layout we use looks like this:
  *
@@ -147,6 +160,49 @@ static double SHMEM_RESIZE_RATIO[6] = {
  */
 static bool huge_pages_on = false;
 
+/*
+ * Flag telling that we have prepared the memory layout to be resizable. If
+ * it is false after all shared memory segments are created, it means we
+ * failed to set up the needed layout and fell back to the regular
+ * non-resizable approach.
+ */
+static bool shmem_resizable = false;
+
+/*
+ * Currently broadcasted value of NBuffers in shared memory.
+ *
+ * Most of the time this value is going to be equal to NBuffers. But if
+ * the postmaster is resizing shared memory and a new backend was created
+ * at the same time, there is a possibility for the new backend to inherit the
+ * old NBuffers value, but miss the resize signal if the ProcSignal
+ * infrastructure was not initialized yet. Consider this situation:
+ *
+ *     Postmaster ------> New Backend
+ *         |                  |
+ *         |                Launch
+ *         |                  |
+ *         |           Inherit NBuffers
+ *         |                  |
+ *  Resize NBuffers           |
+ *         |                  |
+ *  Emit Barrier              |
+ *         |           Init ProcSignal
+ *         |                  |
+ *  Finish resize             |
+ *         |                  |
+ *  New NBuffers        Old NBuffers
+ *
+ * In this case the backend is not yet ready to receive a signal from
+ * EmitProcSignalBarrier, and will be ignored. The same happens if ProcSignal
+ * is initialized even later, after the resizing has finished.
+ *
+ * To address the resulting inconsistency, the postmaster broadcasts the
+ * current NBuffers value via shared memory. Every new backend has to verify
+ * this value before it accesses the buffer pool: if it differs from its own
+ * value, this indicates a shared memory resize has happened and the backend
+ * has to first synchronize with the rest of the pack.
+ */
+ShmemControl *ShmemCtrl = NULL;
+
 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
 static void IpcMemoryDetach(int status, Datum shmaddr);
 static void IpcMemoryDelete(int status, Datum shmId);
@@ -906,6 +962,346 @@ AnonymousShmemDetach(int status, Datum arg)
 	}
 }
 
+/*
+ * Resize all shared memory segments based on the current NBuffers value,
+ * which is applied from NBuffersPending. The actual segment resizing is done
+ * via ftruncate, which will fail if there is not sufficient space to expand
+ * the anon file. When finished, based on the new and old values, initialize
+ * new buffer blocks if any.
+ *
+ * If reinitializing took place, as the last step this function does the
+ * buffer reinitialization as well and broadcasts the new value of
+ * NSharedBuffers. All of that needs to be done only by one backend, the
+ * first one that managed to grab the ShmemResizeLock.
+ */
+bool
+AnonymousShmemResize(void)
+{
+	int			numSemas;
+	bool		reinit = false;
+	int			mmap_flags = PG_MMAP_FLAGS;
+	Size		hugepagesize;
+
+	NBuffers = NBuffersPending;
+
+	elog(DEBUG1, "Resize shmem from %d to %d", NBuffersOld, NBuffers);
+
+	/*
+	 * XXX: Where to reset the flag is still an open question. E.g. do we
+	 * consider a no-op when NBuffers is equal to NBuffersOld a genuine resize
+	 * and reset the flag?
+	 */
+	pending_pm_shmem_resize = false;
+
+	/*
+	 * XXX: Currently only increasing of shared_buffers is supported. For
+	 * decreasing something similar has to be done, but buffer blocks with
+	 * data have to be drained first.
+	 */
+	if (NBuffersOld > NBuffers)
+		return false;
+
+#ifndef MAP_HUGETLB
+	/* PrepareHugePages should have dealt with this case */
+	Assert(huge_pages != HUGE_PAGES_ON && !huge_pages_on);
+#else
+	if (huge_pages_on)
+	{
+		/* Make sure nothing is messed up */
+		Assert(huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY);
+
+		/* Round up the new size to a suitable large value */
+		GetHugePageSize(&hugepagesize, &mmap_flags, NULL);
+	}
+#endif
+
+	/* Note that CalculateShmemSize indirectly depends on NBuffers */
+	CalculateShmemSize(&numSemas);
+
+	for (int i = 0; i < ANON_MAPPINGS; i++)
+	{
+		AnonymousMapping *m = &Mappings[i];
+
+#ifdef MAP_HUGETLB
+		if (huge_pages_on && (m->shmem_req_size % hugepagesize != 0))
+			m->shmem_req_size += hugepagesize - (m->shmem_req_size % hugepagesize);
+#endif
+
+		if (m->shmem == NULL)
+			continue;
+
+		if (m->shmem_size == m->shmem_req_size)
+			continue;
+
+		if (m->shmem_reserved < m->shmem_req_size)
+			ereport(ERROR,
+					(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+					 errmsg("not enough shared memory is reserved"),
+					 errhint("You may need to increase \"max_available_memory\".")));
+
+		elog(DEBUG1, "segment[%s]: resize from %zu to %zu at address %p",
+			 MappingName(m->shmem_segment), m->shmem_size,
+			 m->shmem_req_size, m->shmem);
+
+		/* Resize the backing anon file. */
+		if (ftruncate(m->segment_fd, m->shmem_req_size) == -1)
+			ereport(FATAL,
+					(errcode(ERRCODE_SYSTEM_ERROR),
+					 errmsg("could not truncate anonymous file for \"%s\": %m",
+							MappingName(m->shmem_segment))));
+
+		/* Adjust memory accessibility */
+		if (mprotect(m->shmem, m->shmem_req_size, PROT_READ | PROT_WRITE) == -1)
+			ereport(FATAL,
+					(errcode(ERRCODE_SYSTEM_ERROR),
+					 errmsg("could not mprotect anonymous shared memory for \"%s\": %m",
+							MappingName(m->shmem_segment))));
+
+		/* If shrinking, make reserved space unavailable again */
+		if (m->shmem_req_size < m->shmem_size &&
+			mprotect(m->shmem + m->shmem_req_size, m->shmem_size - m->shmem_req_size, PROT_NONE) == -1)
+			ereport(FATAL,
+					(errcode(ERRCODE_SYSTEM_ERROR),
+					 errmsg("could not mprotect reserved shared memory for \"%s\": %m",
+							MappingName(m->shmem_segment))));
+
+		reinit = true;
+		m->shmem_size = m->shmem_req_size;
+	}
+
+	if (reinit)
+	{
+		if (IsUnderPostmaster &&
+			LWLockConditionalAcquire(ShmemResizeLock, LW_EXCLUSIVE))
+		{
+			/*
+			 * If the new NBuffers was already broadcasted, the buffer pool
+			 * was already initialized before.
+			 *
+			 * Since we're not on a hot path, we use lwlocks and do not need
+			 * to involve a memory barrier.
+			 */
+			if (pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) != NBuffers)
+			{
+				/*
+				 * Allow the first backend that managed to get the lock to
+				 * reinitialize the new portion of the buffer pool. Every
+				 * other process will wait on the shared barrier for that to
+				 * finish, since it's a part of the SHMEM_RESIZE_DONE phase.
+				 *
+				 * Note that it's enough when only one backend does that,
+				 * even the ShmemInitStruct part. The reason is that resized
+				 * shared memory will maintain the same addresses, meaning
+				 * that all the pointers are still valid, and we only need to
+				 * update structure sizes in the ShmemIndex once -- any other
+				 * backend will pick up this shared structure from the index.
+				 *
+				 * XXX: This is the right place for buffer eviction as well.
+				 */
+				BufferManagerShmemInit(NBuffersOld);
+
+				/* If all fine, broadcast the new value */
+				pg_atomic_write_u32(&ShmemCtrl->NSharedBuffers, NBuffers);
+			}
+
+			LWLockRelease(ShmemResizeLock);
+		}
+	}
+
+	return true;
+}
+
+/*
+ * We are asked to resize shared memory. Wait for all ProcSignal participants
+ * to join the barrier, then do the resize and wait on the barrier until all
+ * participating processes finish resizing as well -- otherwise we face the
+ * danger of inconsistency between backends.
+ *
+ * XXX: If a backend is blocked on ReadCommand in PostgresMain, it will not
+ * proceed with AnonymousShmemResize after receiving SIGHUP, until something
+ * is sent.
+ */
+bool
+ProcessBarrierShmemResize(Barrier *barrier)
+{
+	Assert(IsUnderPostmaster);
+
+	elog(DEBUG1, "Handle a barrier for shmem resizing from %d to %d, %d",
+		 NBuffersOld, NBuffersPending, pending_pm_shmem_resize);
+
+	/* Wait until we have seen the new NBuffers value */
+	if (!pending_pm_shmem_resize)
+		return false;
+
+	/*
+	 * First thing to do after attaching to the barrier is to wait for
+	 * others. We can't simply use BarrierArriveAndWait, because backends
+	 * might arrive here in disjoint groups, e.g. first two backends, a
+	 * pause, then another two backends. If the resize is quick enough, that
+	 * can lead to a situation when the first group is already finished
+	 * before the second has appeared, and the barrier will only synchronize
+	 * within those groups.
+	 */
+	if (BarrierAttach(barrier) == SHMEM_RESIZE_REQUESTED)
+		WaitForProcSignalBarrierReceived(
+			pg_atomic_read_u64(&ShmemCtrl->Generation));
+
+	/*
+	 * Now start the procedure, and elect one backend to ping the postmaster
+	 * to do the same.
+	 *
+	 * XXX: If we need to be able to abort resizing, this has to be done
+	 * later, after the SHMEM_RESIZE_DONE.
+	 */
+	if (BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_START))
+	{
+		Assert(IsUnderPostmaster);
+		SendPostmasterSignal(PMSIGNAL_SHMEM_RESIZE);
+	}
+
+	AnonymousShmemResize();
+
+	/* The second phase means the resize has finished, SHMEM_RESIZE_DONE */
+	BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_DONE);
+
+	BarrierDetach(barrier);
+	return true;
+}
+
+/*
+ * GUC assign hook for shared_buffers. It's recommended for an assign hook to
+ * be as minimal as possible, thus we just request a shared memory resize and
+ * remember the previous value.
+ */
+void
+assign_shared_buffers(int newval, void *extra, bool *pending)
+{
+	elog(DEBUG1, "Received SIGHUP for shmem resizing");
+
+	pending_pm_shmem_resize = true;
+	*pending = true;
+	NBuffersPending = newval;
+
+	NBuffersOld = NBuffers;
+}
+
+/*
+ * Test if we have somehow missed a shmem resize signal and the NBuffers
+ * value differs from NSharedBuffers. If yes, catch up and do the resize.
+ */
+void
+AdjustShmemSize(void)
+{
+	uint32		NSharedBuffers = pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers);
+
+	if (NSharedBuffers != NBuffers)
+	{
+		/*
+		 * If the broadcasted shared_buffers is different from the one we
+		 * see, it could be that the backend has missed a resize signal. To
+		 * avoid any inconsistency, adjust the shared mappings before having
+		 * a chance to access the buffer pool.
+		 */
+		ereport(LOG,
+				(errmsg("shared_buffers has been changed from %d to %d, "
+						"resize shared memory",
+						NBuffers, NSharedBuffers)));
+		NBuffers = NSharedBuffers;
+		AnonymousShmemResize();
+	}
+}
+
+/*
+ * Start the resizing procedure, making sure all existing processes will have
+ * a consistent view of the shared memory size. Must be called only in the
+ * postmaster.
+ */
+void
+CoordinateShmemResize(void)
+{
+	elog(DEBUG1, "Coordinating shmem resize from %d to %d",
+		 NBuffersOld, NBuffers);
+	Assert(!IsUnderPostmaster);
+
+	/*
+	 * We use a dynamic barrier to help dealing with backends that were
+	 * spawned during the resize.
+	 */
+	BarrierInit(&ShmemCtrl->Barrier, 0);
+
+	/*
+	 * If the value did not change, or shared memory segments are not
+	 * initialized yet, skip the resize.
+	 */
+	if (NBuffersPending == NBuffersOld)
+	{
+		elog(DEBUG1, "Skip resizing, new %d, old %d",
+			 NBuffers, NBuffersOld);
+		return;
+	}
+
+	/*
+	 * Shared memory resize requires some coordination done by the
+	 * postmaster, and consists of three phases:
+	 *
+	 * - Before the resize, all existing backends have the same old NBuffers.
+	 * - When the resize is in progress, backends are expected to have a
+	 *   mixture of old and new values. They're not allowed to touch the
+	 *   buffer pool during this time frame.
+	 * - After the resize has finished, all existing backends that can access
+	 *   the buffer pool are expected to have the same new value of NBuffers.
+	 *
+	 * Those phases are ensured by joining the shared barrier associated with
+	 * the procedure. Since resizing takes time, we need to take into account
+	 * that during that time:
+	 *
+	 * - New backends can be spawned. They will check the status of the
+	 *   barrier early during bootstrap, and wait until everything is over to
+	 *   work with the new NBuffers value.
+	 *
+	 * - Old backends can exit before attempting to resize. The
+	 *   synchronization used between backends relies on ProcSignalBarrier
+	 *   and waits at the beginning until all participants have received the
+	 *   message, to gather all existing backends.
+	 *
+	 * - Some backends might be blocked and not responding, either before or
+	 *   after receiving the message. In the first case such a backend still
+	 *   has a ProcSignalSlot and should be waited for; in the second case
+	 *   the shared barrier will make sure we are still waiting for those
+	 *   backends. In either case there is an unbounded wait.
+	 *
+	 * - Backends might join the barrier in disjoint groups with some time in
+	 *   between. That means that relying only on the shared dynamic barrier
+	 *   is not enough -- it would only synchronize the resize procedure
+	 *   within those groups. That's why we first wait for all participants
+	 *   of the ProcSignal mechanism who received the message.
+	 */
+	elog(DEBUG1, "Emit a barrier for shmem resizing");
+	pg_atomic_init_u64(&ShmemCtrl->Generation,
+					   EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SHMEM_RESIZE));
+
+	/* To order everything after setting the Generation value */
+	pg_memory_barrier();
+
+	/*
+	 * After that the postmaster waits for PMSIGNAL_SHMEM_RESIZE as a sign
+	 * that all the rest of the pack has started the procedure and it can
+	 * resize shared memory as well.
+	 *
+	 * Normally we would call WaitForProcSignalBarrier here to wait until
+	 * every backend has reported on the ProcSignalBarrier. But for shared
+	 * memory resize we don't need this, as every participating backend will
+	 * synchronize on the ProcSignal barrier. In fact even if we would like
+	 * to wait here, it wouldn't be possible -- we're in the postmaster,
+	 * without any waiting infrastructure available.
+	 *
+	 * If at some point it turns out that waiting is essential, we would need
+	 * to consider some alternatives. E.g. it could be a designated
+	 * coordination process, which is not a postmaster. Another option would
+	 * be to introduce a CoordinateShmemResize lock and allow only one
+	 * process to take it (this probably would have to be something different
+	 * than LWLocks, since they block interrupts, and coordination relies on
+	 * them).
+	 */
+}
+
 /*
  * PGSharedMemoryCreate
  *
@@ -1217,3 +1613,50 @@ PGSharedMemoryDetach(void)
 		}
 	}
 }
+
+void
+WaitOnShmemBarrier(void)
+{
+	Barrier    *barrier = &ShmemCtrl->Barrier;
+
+	/* Nothing to do if resizing is not started */
+	if (BarrierPhase(barrier) < SHMEM_RESIZE_START)
+		return;
+
+	BarrierAttach(barrier);
+
+	/* Otherwise wait through all available phases */
+	while (BarrierPhase(barrier) < SHMEM_RESIZE_DONE)
+	{
+		ereport(LOG, (errmsg("ProcSignal barrier is in phase %d, waiting",
+							 BarrierPhase(barrier))));
+
+		BarrierArriveAndWait(barrier, 0);
+	}
+
+	BarrierDetach(barrier);
+}
+
+void
+ShmemControlInit(void)
+{
+	bool		foundShmemCtrl;
+
+	ShmemCtrl = (ShmemControl *)
+		ShmemInitStruct("Shmem Control", sizeof(ShmemControl),
+						&foundShmemCtrl);
+
+	if (!foundShmemCtrl)
+	{
+		/*
+		 * The barrier is missing here; it will be initialized right before
+		 * starting the resizing process, as a convenient way to reset it.
+		 */
+
+		/* Initialize with the currently known value */
+		pg_atomic_init_u32(&ShmemCtrl->NSharedBuffers, NBuffers);
+
+		/* shmem_resizable should be initialized by now */
+		ShmemCtrl->Resizable = shmem_resizable;
+	}
+}
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index e84e8663e966..ef3f84a55f57 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -654,9 +654,12 @@ CheckpointerMain(const void *startup_data, size_t startup_data_len)
 static void
 ProcessCheckpointerInterrupts(void)
 {
-	if (ProcSignalBarrierPending)
-		ProcessProcSignalBarrier();
-
+	/*
+	 * Reloading the config can trigger further signals, complicating
+	 * interrupt processing -- so let it run first.
+	 *
+	 * XXX: Is there any need for a memory barrier after ProcessConfigFile?
+	 */
 	if (ConfigReloadPending)
 	{
 		ConfigReloadPending = false;
@@ -676,6 +679,9 @@ ProcessCheckpointerInterrupts(void)
 		UpdateSharedMemoryConfig();
 	}
 
+	if (ProcSignalBarrierPending)
+		ProcessProcSignalBarrier();
+
 	/* Perform logging of memory contexts of this process */
 	if (LogMemoryContextPending)
 		ProcessLogMemoryContextInterrupt();
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index b59d20b4ac22..ba9528d5dfa3 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -426,6 +426,7 @@ static void process_pm_pmsignal(void);
 static void process_pm_child_exit(void);
 static void process_pm_reload_request(void);
 static void process_pm_shutdown_request(void);
+static void process_pm_shmem_resize(void);
 static void dummy_handler(SIGNAL_ARGS);
 static void CleanupBackend(PMChild *bp, int exitstatus);
 static void HandleChildCrash(int pid, int exitstatus, const char *procname);
@@ -1697,6 +1698,9 @@ ServerLoop(void)
 			if (pending_pm_pmsignal)
 				process_pm_pmsignal();
 
+			if (pending_pm_shmem_resize)
+				process_pm_shmem_resize();
+
 			if (events[i].events & WL_SOCKET_ACCEPT)
 			{
 				ClientSocket s;
@@ -2042,6 +2046,17 @@ process_pm_reload_request(void)
 	}
 }
 
+static void
+process_pm_shmem_resize(void)
+{
+	/*
+	 * Failure to resize is considered to be fatal and will not be retried,
+	 * which means we can disable the pending flag right here.
+	 */
+	pending_pm_shmem_resize = false;
+	CoordinateShmemResize();
+}
+
 /*
  * pg_ctl uses SIGTERM, SIGINT and SIGQUIT to request different types of
  * shutdown.
@@ -3862,6 +3877,9 @@ process_pm_pmsignal(void)
 		request_state_update = true;
 	}
 
+	if (CheckPostmasterSignal(PMSIGNAL_SHMEM_RESIZE))
+		AnonymousShmemResize();
+
 	/*
 	 * Try to advance postmaster's state machine, if a child requests it.
 	 */
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 6f148d1d80b2..0e72e373193a 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -18,6 +18,7 @@
 #include "storage/buf_internals.h"
 #include "storage/pg_shmem.h"
 #include "storage/bufmgr.h"
+#include "storage/pg_shmem.h"
 
 BufferDescPadded *BufferDescriptors;
 char	   *BufferBlocks;
@@ -63,18 +64,28 @@ CkptSortItem *CkptBufferIds;
 /*
  * Initialize shared buffer pool
  *
  * This is called once during shared-memory initialization (either in the
- * postmaster, or in a standalone backend). Size of data structures initialized
- * here depends on NBuffers, and to be able to change NBuffers without a
- * restart we store each structure into a separate shared memory segment, which
- * could be resized on demand.
+ * postmaster, or in a standalone backend) or during shared-memory resize.
+ * Size of data structures initialized here depends on NBuffers, and to be
+ * able to change NBuffers without a restart we store each structure into a
+ * separate shared memory segment, which could be resized on demand.
+ *
+ * FirstBufferToInit tells where to start initializing buffers. For the
+ * initial initialization it will always be zero, but when resizing shared
+ * memory it indicates the number of already initialized buffers.
+ *
+ * No locks are taken in this function; it is the caller's responsibility to
+ * make sure only one backend can work with new buffers.
  */
 void
-BufferManagerShmemInit(void)
+BufferManagerShmemInit(int FirstBufferToInit)
 {
 	bool		foundBufs,
 				foundDescs,
 				foundIOCV,
 				foundBufCkpt;
+	int			i;
 
+	elog(DEBUG1, "BufferManagerShmemInit from %d to %d",
+		 FirstBufferToInit, NBuffers);
 
 	/* Align descriptors to a cacheline boundary. */
 	BufferDescriptors = (BufferDescPadded *)
@@ -111,34 +122,35 @@ BufferManagerShmemInit(void)
 	{
 		/* should find all of these, or none of them */
 		Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt);
-		/* note: this path is only taken in EXEC_BACKEND case */
-	}
-	else
-	{
-		int			i;
 
 		/*
-		 * Initialize all the buffer headers.
+		 * note: this path is only taken in the EXEC_BACKEND case when
+		 * initializing shared memory, or in all cases when resizing shared
+		 * memory.
 		 */
-		for (i = 0; i < NBuffers; i++)
-		{
-			BufferDesc *buf = GetBufferDescriptor(i);
+	}
+
+#ifndef EXEC_BACKEND
+	/*
+	 * Initialize all the buffer headers.
+	 */
+	for (i = FirstBufferToInit; i < NBuffers; i++)
+	{
+		BufferDesc *buf = GetBufferDescriptor(i);
 
-			ClearBufferTag(&buf->tag);
+		ClearBufferTag(&buf->tag);
 
-			pg_atomic_init_u32(&buf->state, 0);
-			buf->wait_backend_pgprocno = INVALID_PROC_NUMBER;
+		pg_atomic_init_u32(&buf->state, 0);
+		buf->wait_backend_pgprocno = INVALID_PROC_NUMBER;
 
-			buf->buf_id = i;
+		buf->buf_id = i;
 
-			pgaio_wref_clear(&buf->io_wref);
+		pgaio_wref_clear(&buf->io_wref);
 
-			LWLockInitialize(BufferDescriptorGetContentLock(buf),
-							 LWTRANCHE_BUFFER_CONTENT);
+		LWLockInitialize(BufferDescriptorGetContentLock(buf),
+						 LWTRANCHE_BUFFER_CONTENT);
 
-			ConditionVariableInit(BufferDescriptorGetIOCV(buf));
-		}
+		ConditionVariableInit(BufferDescriptorGetIOCV(buf));
 	}
+#endif
 
 	/* Init other shared buffer-management stuff */
 	StrategyInitialize(!foundDescs);
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 2cd278449f0d..bd75f06047e6 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -171,6 +171,14 @@ CalculateShmemSize(int *num_semaphores)
 	size = add_size(size, SlotSyncShmemSize());
 	size = add_size(size, AioShmemSize());
 
+	/*
+	 * XXX: For some reason slightly more memory is needed for larger
+	 * shared_buffers, but this size is enough for any large value I've
+	 * tested with. Is it a mistake in how slots are split, or was there a
+	 * hidden inconsistency in the shmem calculation?
+	 */
+	size = add_size(size, 1024 * 1024 * 100);
+
 	/* include additional requested shmem from preload libraries */
 	size = add_size(size, total_addin_request);
 
@@ -333,7 +341,7 @@ CreateOrAttachShmemStructs(void)
 	CommitTsShmemInit();
 	SUBTRANSShmemInit();
 	MultiXactShmemInit();
-	BufferManagerShmemInit();
+	BufferManagerShmemInit(0);
 
 	/*
 	 * Set up lock manager
@@ -345,6 +353,11 @@ CreateOrAttachShmemStructs(void)
 	 */
 	PredicateLockShmemInit();
 
+	/*
+	 * Set up shared memory resize manager
+	 */
+	ShmemControlInit();
+
 	/*
 	 * Set up process table
 	 */
diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
index eb3ceaae8095..2160d258fa73 100644
--- a/src/backend/storage/ipc/procsignal.c
+++ b/src/backend/storage/ipc/procsignal.c
@@ -27,6 +27,7 @@
 #include "storage/condition_variable.h"
 #include "storage/ipc.h"
 #include "storage/latch.h"
+#include "storage/pg_shmem.h"
 #include "storage/shmem.h"
 #include "storage/sinval.h"
 #include "storage/smgr.h"
@@ -113,6 +114,10 @@ static bool CheckProcSignal(ProcSignalReason reason);
 static void CleanupProcSignalState(int status, Datum arg);
 static void ResetProcSignalBarrierBits(uint32 flags);
 
+#ifdef DEBUG_SHMEM_RESIZE
+bool		delay_proc_signal_init = false;
+#endif
+
 /*
  * ProcSignalShmemSize
  *		Compute space needed for ProcSignal's shared memory
@@ -176,6 +181,43 @@ ProcSignalInit(const uint8 *cancel_key, int cancel_key_len)
 	uint32		old_pss_pid;
 
 	Assert(cancel_key_len >= 0 && cancel_key_len <= MAX_CANCEL_KEY_LENGTH);
+
+#ifdef DEBUG_SHMEM_RESIZE
+	/*
+	 * Introduced for debugging purposes. You can change the variable at
+	 * runtime using gdb, then start new backends with delayed ProcSignal
+	 * initialization. A simple pg_usleep won't work here due to the SIGHUP
+	 * interrupt needed for testing. Taken from pg_sleep.
+	 */
+	if (delay_proc_signal_init)
+	{
+#define GetNowFloat() ((float8) GetCurrentTimestamp() / 1000000.0)
+		float8		endtime = GetNowFloat() + 5;
+
+		for (;;)
+		{
+			float8		delay;
+			long		delay_ms;
+
+			CHECK_FOR_INTERRUPTS();
+
+			delay = endtime - GetNowFloat();
+			if (delay >= 600.0)
+				delay_ms = 600000;
+			else if (delay > 0.0)
+				delay_ms = (long) (delay * 1000.0);
+			else
+				break;
+
+			(void) WaitLatch(MyLatch,
+							 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+							 delay_ms,
+							 WAIT_EVENT_PG_SLEEP);
+			ResetLatch(MyLatch);
+		}
+	}
+#endif
+
 	if (MyProcNumber < 0)
 		elog(ERROR, "MyProcNumber not set");
 	if (MyProcNumber >= NumProcSignalSlots)
@@ -615,6 +657,10 @@ ProcessProcSignalBarrier(void)
 				case PROCSIGNAL_BARRIER_SMGRRELEASE:
 					processed = ProcessBarrierSmgrRelease();
 					break;
+				case PROCSIGNAL_BARRIER_SHMEM_RESIZE:
+					processed = ProcessBarrierShmemResize(
+						&ShmemCtrl->Barrier);
+					break;
 			}
 
 			/*
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 9499f332e77f..2a1975403003 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -498,17 +498,26 @@ ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr,
 	{
 		/*
 		 * Structure is in the shmem index so someone else has allocated it
-		 * already. The size better be the same as the size we are trying to
-		 * initialize to, or there is a name conflict (or worse).
+		 * already. Verify the structure's size:
+		 * - If it's the same, we've found the expected structure.
+		 * - If it's different, we're resizing the expected structure.
+		 *
+		 * XXX: There is an implicit assumption this can only happen in
+		 * "resizable" segments, where only one shared structure is allowed.
+		 * This has to be implemented more cleanly.
		 */
		if (result->size != size)
		{
-			LWLockRelease(ShmemIndexLock);
-			ereport(ERROR,
-					(errmsg("ShmemIndex entry size is wrong for data structure"
-							" \"%s\": expected %zu, actual %zu",
-							name, size, result->size)));
+			Size		delta = size - result->size;
+
+			result->size = size;
+
+			/* Reflect size change in the shared segment */
+			SpinLockAcquire(Segments[shmem_segment].ShmemLock);
+			Segments[shmem_segment].ShmemSegHdr->freeoffset += delta;
+			SpinLockRelease(Segments[shmem_segment].ShmemLock);
 		}
+
 		structPtr = result->location;
 	}
 	else
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index c819608fff63..15e9dde41d12 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -62,6 +62,7 @@
 #include "rewrite/rewriteHandler.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
+#include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
 #include "storage/proc.h"
 #include "storage/procsignal.h"
@@ -4317,6 +4318,15 @@ PostgresMain(const char *dbname, const char *username)
 	 */
 	BeginReportingGUCOptions();
 
+	/* Verify the shared barrier, if it's still active: join and wait. */
+	WaitOnShmemBarrier();
+
+	/*
+	 * After waiting on the barrier above we are guaranteed to have
+	 * NSharedBuffers broadcasted, so we can use it in the function below.
+	 */
+	AdjustShmemSize();
+
 	/*
 	 * Also set up handler to log session end; we have to wait till now to be
 	 * sure Log_disconnections has its final value.
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 7553f6eacef7..82cee6b88772 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -155,6 +155,8 @@ REPLICATION_ORIGIN_DROP	"Waiting for a replication origin to become inactive so
 REPLICATION_SLOT_DROP	"Waiting for a replication slot to become inactive so it can be dropped."
 RESTORE_COMMAND	"Waiting for <xref linkend="guc-restore-command"/> to complete."
 SAFE_SNAPSHOT	"Waiting to obtain a valid snapshot for a <literal>READ ONLY DEFERRABLE</literal> transaction."
+SHMEM_RESIZE_START	"Waiting for other backends to start resizing shared memory."
+SHMEM_RESIZE_DONE	"Waiting for other backends to finish resizing shared memory."
 SYNC_REP	"Waiting for confirmation from a remote server during synchronous replication."
 WAL_RECEIVER_EXIT	"Waiting for the WAL receiver to exit."
 WAL_RECEIVER_WAIT_START	"Waiting for startup process to send initial data for streaming replication."
@@ -355,6 +357,7 @@ DSMRegistry	"Waiting to read or update the dynamic shared memory registry."
 InjectionPoint	"Waiting to read or update information related to injection points."
 SerialControl	"Waiting to read or update shared pg_serial state."
 AioWorkerSubmissionQueue	"Waiting to access AIO worker submission queue."
+ShmemResize	"Waiting to resize shared memory."
 
 #
 # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE)
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index c94f3fc3c80d..5c534cee2ac2 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -1098,13 +1098,14 @@
 
 # We sometimes multiply the number of shared buffers by two without
 # checking for overflow, so we mustn't allow more than INT_MAX / 2.
-{ name => 'shared_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM',
+{ name => 'shared_buffers', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_MEM',
   short_desc => 'Sets the number of shared memory buffers used by the server.',
   flags => 'GUC_UNIT_BLOCKS',
   variable => 'NBuffers',
   boot_val => '16384', min => '16', max => 'INT_MAX / 2',
+  assign_hook => 'assign_shared_buffers'
 },
 
 # TODO: should this be PGC_POSTMASTER?
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 47360a3d3d85..51ce6ebcf6cb 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -317,7 +317,7 @@ extern void EvictRelUnpinnedBuffers(Relation rel,
 									int32 *buffers_skipped);
 
 /* in buf_init.c */
-extern void BufferManagerShmemInit(void);
+extern void BufferManagerShmemInit(int);
 extern Size BufferManagerShmemSize(void);
 
 /* in localbuf.c */
diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h
index 3baf418b3d1e..847f56a36dcb 100644
--- a/src/include/storage/ipc.h
+++ b/src/include/storage/ipc.h
@@ -64,6 +64,7 @@ typedef void (*shmem_startup_hook_type) (void);
 /* ipc.c */
 extern PGDLLIMPORT bool proc_exit_inprogress;
 extern PGDLLIMPORT bool shmem_exit_inprogress;
+extern PGDLLIMPORT volatile bool pending_pm_shmem_resize;
 
 pg_noreturn extern void proc_exit(int code);
 extern void shmem_exit(int code);
@@ -83,5 +84,7 @@ extern void CreateSharedMemoryAndSemaphores(void);
 extern void AttachSharedMemoryStructs(void);
 #endif
 extern void InitializeShmemGUCs(void);
+extern void CoordinateShmemResize(void);
+extern bool AnonymousShmemResize(void);
 
 #endif							/* IPC_H */
diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h
index 06a1ffd4b08b..cba586027a7a 100644
--- a/src/include/storage/lwlocklist.h
+++ b/src/include/storage/lwlocklist.h
@@ -85,6 +85,7 @@ PG_LWLOCK(50, DSMRegistry)
 PG_LWLOCK(51, InjectionPoint)
 PG_LWLOCK(52, SerialControl)
 PG_LWLOCK(53, AioWorkerSubmissionQueue)
+PG_LWLOCK(54, ShmemResize)
 
 /*
  * There also exist several built-in LWLock tranches. As with the predefined
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 715f6acb5dd5..eba28ce8a5cc 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -24,6 +24,7 @@
 #ifndef PG_SHMEM_H
 #define PG_SHMEM_H
 
+#include "storage/barrier.h"
 #include "storage/dsm_impl.h"
 #include "storage/spin.h"
 
@@ -69,6 +70,25 @@ typedef struct ShmemSegment
 extern PGDLLIMPORT ShmemSegment Segments[ANON_MAPPINGS];
 extern PGDLLIMPORT AnonymousMapping Mappings[ANON_MAPPINGS];
 
+/*
+ * ShmemControl is shared between backends and helps to coordinate the shared
+ * memory resize.
+ */
+typedef struct
+{
+	pg_atomic_uint32 NSharedBuffers;
+	Barrier		Barrier;
+	pg_atomic_uint64 Generation;
+	bool		Resizable;
+} ShmemControl;
+
+extern PGDLLIMPORT ShmemControl *ShmemCtrl;
+
+/* The phases of shared memory resizing, used by the ProcSignal barrier. */
+#define SHMEM_RESIZE_REQUESTED	0
+#define SHMEM_RESIZE_START		1
+#define SHMEM_RESIZE_DONE		2
+
 /* GUC variables */
 extern PGDLLIMPORT int shared_memory_type;
 extern PGDLLIMPORT int huge_pages;
@@ -123,6 +143,12 @@ extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags,
 							int *memfd_flags);
 void		PrepareHugePages(void);
 
+bool		ProcessBarrierShmemResize(Barrier *barrier);
+void		assign_shared_buffers(int newval, void *extra, bool *pending);
+void		AdjustShmemSize(void);
+extern void WaitOnShmemBarrier(void);
+extern void ShmemControlInit(void);
+
 /*
  * To be able to dynamically resize largest parts of the data stored in shared
  * memory, we split it into multiple shared memory mappings segments. Each
diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h
index 428aa3fd68a0..1a55bf57a70e 100644
--- a/src/include/storage/pmsignal.h
+++ b/src/include/storage/pmsignal.h
@@ -42,6 +42,7 @@ typedef enum
 	PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */
 	PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */
 	PMSIGNAL_XLOG_IS_SHUTDOWN,	/* ShutdownXLOG() completed */
+	PMSIGNAL_SHMEM_RESIZE,		/* resize shared memory */
 } PMSignalReason;
 
 #define NUM_PMSIGNALS (PMSIGNAL_XLOG_IS_SHUTDOWN+1)
diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h
index 2733bbb8c5b8..97033f84dced 100644
--- a/src/include/storage/procsignal.h
+++ b/src/include/storage/procsignal.h
@@ -54,6 +54,7 @@ typedef enum
 typedef enum
 {
 	PROCSIGNAL_BARRIER_SMGRRELEASE, /* ask smgr to close files */
+	PROCSIGNAL_BARRIER_SHMEM_RESIZE,	/* ask backends to resize shared memory */
 } ProcSignalBarrierType;
 
 /*
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 37f26f6c6b75..1e6098723673 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2765,6 +2765,7 @@ ShellTypeInfo
 ShippableCacheEntry
 ShippableCacheKey
 ShmemIndexEnt
+ShmemControl
 ShutdownForeignScan_function
 ShutdownInformation
 ShutdownMode

From b3c3f2f38adec906a5b455fbb65c7af8a3c4e042 Mon Sep 17 00:00:00 2001
From: Ashutosh Bapat
Date: Mon, 1 Sep 2025 15:40:41 +0530
Subject: [PATCH 12/16] Initial value of shared_buffers (or NBuffers)

The assign_hook for shared_buffers (assign_shared_buffers()) is called
twice during server startup. The first time it sets the default value of
shared_buffers, followed by a second time when it sets the value
specified in the configuration file or on the command line. At those
times the shared buffer pool is yet to be initialized. Hence there is no
need to keep the GUC change pending or to go through the entire process
of resizing the memory maps, reinitializing the shared memory and
synchronizing processes. Instead the given value should be assigned
directly to NBuffers, which will be used when creating the shared memory
and also when initializing the buffer pool the first time. Any changes
to shared_buffers after that will need remapping the shared memory
segments and synchronizing the buffer pool reinitialization across the
backends.

If BufferBlocks is not initialized, assign_shared_buffers() sets the
given value to NBuffers directly. Otherwise it marks the change as
pending and sets the flag pending_pm_shmem_resize so that the postmaster
can start the buffer pool reinitialization.

TODO:
1. The change depends upon the C convention that global pointer
   variables are initialized to NULL. Maybe initialize BufferBlocks to
   NULL explicitly.
2. We might think of a better way to check whether the buffer pool has
   been initialized or not;
   see the comment in assign_shared_buffers().

Author: Ashutosh Bapat
---
 src/backend/port/sysv_shmem.c | 42 ++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index dc4eeeee56a2..ba8613678f6e 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -1168,20 +1168,42 @@ ProcessBarrierShmemResize(Barrier *barrier)
 }
 
 /*
- * GUC assign hook for shared_buffers. It's recommended for an assign hook to
- * be as minimal as possible, thus we just request a shared memory resize and
- * remember the previous value.
+ * GUC assign hook for shared_buffers.
+ *
+ * When setting the GUC for the first time after starting the server, the GUC
+ * value is changed immediately, since there is no shared memory set up yet.
+ *
+ * After the shared memory is set up, changing the GUC value requires resizing
+ * and reinitializing (at least parts of) the shared memory structures related
+ * to shared buffers. That's a long and complicated process. It's recommended
+ * for an assign hook to be as minimal as possible, thus we just request a
+ * shared memory resize and remember the previous value.
  */
 void
 assign_shared_buffers(int newval, void *extra, bool *pending)
 {
-	elog(DEBUG1, "Received SIGHUP for shmem resizing");
-
-	pending_pm_shmem_resize = true;
-	*pending = true;
-	NBuffersPending = newval;
-
-	NBuffersOld = NBuffers;
+	/*
+	 * TODO: If a backend joins while the buffer resizing is in progress, or
+	 * it reads a value of shared_buffers from the configuration which is
+	 * different from the value being used by existing backends, this method
+	 * may not work. Need to think of a better solution.
+	 */
+	if (BufferBlocks)
+	{
+		elog(DEBUG1, "buffer pool is already initialized with size = %d, reinitializing it with size = %d",
+			 NBuffers, newval);
+		pending_pm_shmem_resize = true;
+		*pending = true;
+		NBuffersPending = newval;
+		NBuffersOld = NBuffers;
+	}
+	else
+	{
+		elog(DEBUG1, "initializing buffer pool with size = %d", newval);
+		NBuffers = newval;
+		*pending = false;
+		pending_pm_shmem_resize = false;
+	}
 }
 
 /*

From 42113e324015ec9fd7f73e52983ffd64cc05f805 Mon Sep 17 00:00:00 2001
From: Ashutosh Bapat
Date: Thu, 21 Aug 2025 15:44:24 +0530
Subject: [PATCH 13/16] Update sizes and addresses of shared memory mapping
 and shared memory structures

Update totalsize and end address in segment and mapping: Once a shared
memory segment has been resized, its total size and end address need to
be updated in the corresponding AnonymousMapping and Segment structures.

Update allocated_size for resized shared memory structures: Reallocating
a shared memory structure after resizing needs a bit more work, but at
least update the allocated_size along with the size of the shared memory
structure.
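Put differently, after a successful resize of segment i the following
invariants are expected to hold again (a sketch in terms of the fields
touched by the diff below, not code added by the patch):

    Assert(Segments[i].ShmemSegHdr->totalsize == Mappings[i].shmem_size);
    Assert(Segments[i].ShmemEnd == Mappings[i].shmem + Mappings[i].shmem_size);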
Author: Ashutosh Bapat
---
 src/backend/port/sysv_shmem.c   | 4 ++++
 src/backend/storage/ipc/shmem.c | 6 +++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index ba8613678f6e..54d335b2e5db 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -1021,6 +1021,8 @@ AnonymousShmemResize(void)
 	for (int i = 0; i < ANON_MAPPINGS; i++)
 	{
 		AnonymousMapping *m = &Mappings[i];
+		ShmemSegment *segment = &Segments[i];
+		PGShmemHeader *shmem_hdr = segment->ShmemSegHdr;
 
 #ifdef MAP_HUGETLB
 		if (huge_pages_on && (m->shmem_req_size % hugepagesize != 0))
@@ -1067,6 +1069,8 @@ AnonymousShmemResize(void)
 
 		reinit = true;
 		m->shmem_size = m->shmem_req_size;
+		shmem_hdr->totalsize = m->shmem_size;
+		segment->ShmemEnd = m->shmem + m->shmem_size;
 	}
 
 	if (reinit)
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 2a1975403003..0f9abf69fd5e 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -504,13 +504,17 @@ ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr,
 		 *
 		 * XXX: There is an implicit assumption this can only happen in
 		 * "resizable" segments, where only one shared structure is allowed.
-		 * This has to be implemented more cleanly.
+		 * This has to be implemented more cleanly. Probably we should
+		 * implement ShmemReallocRawInSegment functionality just to adjust
+		 * the size according to alignment, return the allocated size and
+		 * update the mapping offset.
 		 */
 		if (result->size != size)
 		{
 			Size		delta = size - result->size;
 
 			result->size = size;
+			result->allocated_size = size;
 
 			/* Reflect size change in the shared segment */
 			SpinLockAcquire(Segments[shmem_segment].ShmemLock);

From 45e64d9eb984ff474f940b8a228612a10c4f4269 Mon Sep 17 00:00:00 2001
From: Ashutosh Bapat
Date: Thu, 19 Jun 2025 17:38:29 +0200
Subject: [PATCH 14/16] Support shrinking shared buffers

Buffer eviction
===============

When shrinking the shared buffers pool, each buffer in the area being
shrunk needs to be flushed if it's dirty, so as not to lose the changes
to that buffer after shrinking. Also, each such buffer needs to be
removed from the buffer mapping table so that backends do not access it
after shrinking.

Buffer eviction requires a separate barrier phase for two reasons:

1. No other backend should map a new page to any of the buffers being
   evicted while eviction is in progress. So they wait while eviction is
   in progress.

2. Since a pinned buffer has the pin recorded in the backend's local
   memory as well as in the buffer descriptor (which is in shared
   memory), eviction should not coincide with remapping the shared
   memory of a backend. Otherwise we might lose consistency between the
   local and shared pinning records.

Hence it needs to be carried out in ProcessBarrierShmemResize() and not
in AnonymousShmemResize(), as indicated by the now-removed comment.

If a buffer being evicted is pinned, we raise a FATAL error, but this
should be improved. There are multiple options: 1. wait for the pinned
buffer to get unpinned, 2. the backend is killed or it itself cancels
the query, or 3. roll back the operation. Note that options 1 and 2
would require the pinning-related local and shared records to be
accessed, but we lack the infrastructure to do either of those right
now.

Removing the evicted buffers from buffer ring
=============================================

If the buffer pool has been shrunk, the buffers in the buffer ring may
not be valid anymore.
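As described next, the fix is to make GetBufferFromRing() reject such
entries. A rough sketch of the kind of check involved (illustrative
only; the variable names are assumptions, and the real change is in the
freelist.c hunk of this patch):

    /* Sketch: treat ring entries beyond the shrunk pool as invalid. */
    bufnum = strategy->buffers[strategy->current];
    if (bufnum == InvalidBuffer || bufnum > NBuffers)
        return NULL;    /* caller falls back to normal buffer allocation */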
Modify GetBufferFromRing() to check whether the buffer is still valid
before using it. This makes GetBufferFromRing() a bit more expensive
because of the additional boolean condition, and it masks any bug that
introduces an invalid buffer into the ring. The alternative fix is more
complex, as explained below.

The strategy object is created in CurrentMemoryContext and is not
referenced by any global structure, so it is not accessible when
processing buffer-resizing barriers. We could modify GetAccessStrategy()
to register the strategy in a global linked list and then arrange to
deregister it once it's no longer in use. Looking at the places which
use GetAccessStrategy(), fixing all of those would be some work.

Author: Ashutosh Bapat
Reviewed-by: Tomas Vondra
---
 src/backend/port/sysv_shmem.c                 | 42 ++++++---
 src/backend/storage/buffer/bufmgr.c           | 93 +++++++++++++++++++
 src/backend/storage/buffer/freelist.c         | 18 +++-
 .../utils/activity/wait_event_names.txt       |  1 +
 src/include/storage/bufmgr.h                  |  1 +
 src/include/storage/pg_shmem.h                |  1 +
 6 files changed, 139 insertions(+), 17 deletions(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 54d335b2e5db..9e1b2c3201f1 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -993,14 +993,6 @@ AnonymousShmemResize(void)
 	 */
 	pending_pm_shmem_resize = false;
 
-	/*
-	 * XXX: Currently only increasing of shared_buffers is supported. For
-	 * decreasing something similar has to be done, but buffer blocks with
-	 * data have to be drained first.
-	 */
-	if(NBuffersOld > NBuffers)
-		return false;
-
 #ifndef MAP_HUGETLB
 	/* PrepareHugePages should have dealt with this case */
 	Assert(huge_pages != HUGE_PAGES_ON && !huge_pages_on);
@@ -1099,11 +1091,14 @@ AnonymousShmemResize(void)
 			 * all the pointers are still valid, and we only need to update
 			 * structures size in the ShmemIndex once -- any other backend
 			 * will pick up this shared structure from the index.
-			 *
-			 * XXX: This is the right place for buffer eviction as well.
 			 */
 			BufferManagerShmemInit(NBuffersOld);
 
+			/*
+			 * Wipe out the evictor PID so that it can be used for the next
+			 * buffer resizing operation.
+			 */
+			ShmemCtrl->evictor_pid = 0;
 			/* If all fine, broadcast the new value */
 			pg_atomic_write_u32(&ShmemCtrl->NSharedBuffers, NBuffers);
 		}
@@ -1156,11 +1151,31 @@ ProcessBarrierShmemResize(Barrier *barrier)
 	 * XXX: If we need to be able to abort resizing, this has to be done later,
 	 * after the SHMEM_RESIZE_DONE.
 	 */
-	if (BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_START))
+
+	/*
+	 * Evict extra buffers when shrinking shared buffers. We need to do this
+	 * while the memory for the extra buffers is still mapped, i.e. before
+	 * remapping the shared memory segments to a smaller memory area.
+	 */
+	if (NBuffersOld > NBuffersPending)
 	{
-		Assert(IsUnderPostmaster);
-		SendPostmasterSignal(PMSIGNAL_SHMEM_RESIZE);
+		BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_START);
+
+		/*
+		 * TODO: If the buffer eviction fails for any reason, we should
+		 * gracefully roll back the shared buffer resizing and try again. But
+		 * the infrastructure to do so is not available right now. Hence just
+		 * raise a FATAL error so that the system restarts.
+		 */
+		if (!EvictExtraBuffers(NBuffersPending, NBuffersOld))
+			elog(FATAL, "buffer eviction failed");
+
+		if (BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_EVICT))
+			SendPostmasterSignal(PMSIGNAL_SHMEM_RESIZE);
 	}
+	else
+		if (BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_START))
+			SendPostmasterSignal(PMSIGNAL_SHMEM_RESIZE);
 
 	AnonymousShmemResize();
 
@@ -1684,5 +1699,6 @@ ShmemControlInit(void)
 
 		/* shmem_resizable should be initialized by now */
 		ShmemCtrl->Resizable = shmem_resizable;
+		ShmemCtrl->evictor_pid = 0;
 	}
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index fe470de63f20..5424c405b44e 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -57,6 +57,7 @@
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
+#include "storage/pg_shmem.h"
 #include "storage/proc.h"
 #include "storage/read_stream.h"
 #include "storage/smgr.h"
@@ -7422,3 +7423,95 @@ const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
 	.complete_local = local_buffer_readv_complete,
 	.report = buffer_readv_report,
 };
+
+/*
+ * When shrinking the shared buffer pool, evict the buffers which will not be
+ * part of the shrunk buffer pool.
+ */
+bool
+EvictExtraBuffers(int newBufSize, int oldBufSize)
+{
+	bool		result = true;
+
+	/*
+	 * If a buffer being evicted is locked, this function will need to wait.
+	 * It should not be called from the postmaster, since the postmaster
+	 * cannot wait on a lock.
+	 */
+	Assert(IsUnderPostmaster);
+
+	/*
+	 * Let only one backend perform eviction. We could split the work across
+	 * all the backends but that doesn't seem necessary.
+	 *
+	 * The first backend to acquire ShmemResizeLock sets its own PID as the
+	 * evictor PID, letting other backends know that eviction is in progress
+	 * or has already been performed. The evictor backend releases the lock
+	 * when it finishes eviction. While eviction is in progress, backends
+	 * other than the evictor won't be able to take the lock, so they won't
+	 * perform eviction. A backend may acquire the lock after eviction has
+	 * completed, but it will not perform eviction since the evictor PID is
+	 * already set. The evictor PID is reset only when the buffer resizing
+	 * finishes, thus only one backend performs eviction in a given instance
+	 * of shared buffers resizing.
+	 *
+	 * Any backend which acquires this lock will release it before the
+	 * eviction phase finishes, hence the same lock can be reused for the
+	 * next phase of resizing buffers.
+	 */
+	if (LWLockConditionalAcquire(ShmemResizeLock, LW_EXCLUSIVE))
+	{
+		if (ShmemCtrl->evictor_pid == 0)
+		{
+			ShmemCtrl->evictor_pid = MyProcPid;
+
+			/*
+			 * TODO: Before evicting any buffer, we should check whether any
+			 * of the buffers are pinned. If we find that a buffer is pinned
+			 * only after evicting most of them, that will impact performance
+			 * since all those evicted buffers might need to be read again.
+			 */
+			for (Buffer buf = newBufSize + 1; buf <= oldBufSize; buf++)
+			{
+				BufferDesc *desc = GetBufferDescriptor(buf - 1);
+				uint32		buf_state;
+				bool		buffer_flushed;
+
+				buf_state = pg_atomic_read_u32(&desc->state);
+
+				/*
+				 * Nobody is expected to touch the buffers while resizing is
+				 * going on, hence the unlocked precheck should be safe and
+				 * saves some cycles.
+				 */
+				if (!(buf_state & BM_VALID))
+					continue;
+
+				/*
+				 * XXX: Looks like CurrentResourceOwner can be NULL here; find
+				 * another one in that case?
+				 */
+				if (CurrentResourceOwner)
+					ResourceOwnerEnlarge(CurrentResourceOwner);
+
+				ReservePrivateRefCountEntry();
+
+				LockBufHdr(desc);
+
+				/*
+				 * Now that we have locked the buffer descriptor, make sure
+				 * that a buffer without valid data has been skipped above.
+				 */
+				Assert(buf_state & BM_VALID);
+
+				if (!EvictUnpinnedBufferInternal(desc, &buffer_flushed))
+				{
+					elog(WARNING, "could not remove buffer %u, it is pinned", buf);
+					result = false;
+					break;
+				}
+			}
+		}
+		LWLockRelease(ShmemResizeLock);
+	}
+
+	return result;
+}
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 0bfbbb096d6a..db8aafdaf8c9 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -630,12 +630,22 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
 		strategy->current = 0;
 
 	/*
-	 * If the slot hasn't been filled yet, tell the caller to allocate a new
-	 * buffer with the normal allocation strategy. He will then fill this
-	 * slot by calling AddBufferToRing with the new buffer.
+	 * If the slot hasn't been filled yet, or the buffer in the slot has been
+	 * invalidated when the buffer pool was shrunk, tell the caller to
+	 * allocate a new buffer with the normal allocation strategy. He will
+	 * then fill this slot by calling AddBufferToRing with the new buffer.
+	 *
+	 * TODO: Ideally we would want to check for bufnum > NBuffers only once
+	 * after every time the buffer pool is shrunk, so as to catch any runtime
+	 * bugs that introduce invalid buffers into the ring. But that is
+	 * complicated. The BufferAccessStrategy objects are not accessible
+	 * outside the ScanState, hence we cannot purge the invalidated buffers
+	 * from the rings while evicting buffers. And after the resizing has
+	 * finished, it's not possible to notice when we touch the first or the
+	 * last of those objects. See if this can be fixed.
 	 */
 	bufnum = strategy->buffers[strategy->current];
-	if (bufnum == InvalidBuffer)
+	if (bufnum == InvalidBuffer || bufnum > NBuffers)
 		return NULL;
 
 	/*
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 82cee6b88772..9a6a62753056 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -156,6 +156,7 @@ REPLICATION_SLOT_DROP	"Waiting for a replication slot to become inactive so it c
 RESTORE_COMMAND	"Waiting for <xref linkend="guc-restore-command"/> to complete."
 SAFE_SNAPSHOT	"Waiting to obtain a valid snapshot for a <literal>READ ONLY DEFERRABLE</literal> transaction."
 SHMEM_RESIZE_START	"Waiting for other backends to start resizing shared memory."
+SHMEM_RESIZE_EVICT	"Waiting for other backends to finish the buffer eviction phase."
 SHMEM_RESIZE_DONE	"Waiting for other backends to finish resizing shared memory."
 SYNC_REP	"Waiting for confirmation from a remote server during synchronous replication."
 WAL_RECEIVER_EXIT	"Waiting for the WAL receiver to exit."
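As an aside, once these wait events exist, the eviction phase should be
observable from another session via pg_stat_activity. A sketch of such a
monitoring query (assuming the usual CamelCase display names generated
from the entries above):

    SELECT pid, backend_type, wait_event
      FROM pg_stat_activity
     WHERE wait_event IN ('ShmemResizeStart', 'ShmemResizeEvict',
                          'ShmemResizeDone');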
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 51ce6ebcf6cb..c91a42fc5987 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -315,6 +315,7 @@ extern void EvictRelUnpinnedBuffers(Relation rel,
 									int32 *buffers_evicted,
 									int32 *buffers_flushed,
 									int32 *buffers_skipped);
+extern bool EvictExtraBuffers(int newBufSize, int oldBufSize);
 
 /* in buf_init.c */
 extern void BufferManagerShmemInit(int);
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index eba28ce8a5cc..0a59746b4724 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -77,6 +77,7 @@ extern PGDLLIMPORT AnonymousMapping Mappings[ANON_MAPPINGS];
 typedef struct
 {
 	pg_atomic_uint32 NSharedBuffers;
+	pid_t		evictor_pid;
 	Barrier		Barrier;
 	pg_atomic_uint64 Generation;
 	bool		Resizable;

From 10cc269efdc5fa5bc65e5620882372b84ee33df6 Mon Sep 17 00:00:00 2001
From: Ashutosh Bapat
Date: Thu, 19 Jun 2025 17:38:51 +0200
Subject: [PATCH 15/16] Reinitialize StrategyControl after resizing buffers

... and BgBufferSync and ClockSweepTick adjustments

Reinitializing the strategy control area
========================================

The commit introduces a separate function StrategyReInitialize() instead
of reusing StrategyInitialize(), since some of what the latter does is
not required by the former. Here's a list of what StrategyReInitialize()
does and how it differs from StrategyInitialize():

1. The StrategyControl pointer needn't be fetched again since it should
not change. But an Assert is added to make sure the pointer is valid.

2. &StrategyControl->buffer_strategy_lock need not be initialized again.

3. nextVictimBuffer, completePasses and numBufferAllocs are meaningful
only in the context of NBuffers. Now that NBuffers itself has changed,
those three do not make sense anymore; reset them as if the server had
restarted.

Ability to delay resizing operation
===================================

This commit introduces a flag delay_shmem_resize, which PostgreSQL
backends and workers can use to signal the coordinator to delay the
resizing operation. The background writer sets this flag when it's
scanning buffers.

Background writer operation
===========================

The background writer is blocked while the actual resizing is in
progress. It stops a scan in progress when it sees that the resizing has
begun or is about to begin. Once the buffer resizing is finished, before
resuming regular operation, bgwriter resets the information saved so
far. This information is viewed in the context of NBuffers and hence
does not make sense after resizing, which changes NBuffers.

Buffer lookup table
===================

Right now there is no way to free shared memory. Even if we shrink the
buffer lookup table when shrinking the buffer pool, the unused hash
table entries cannot be freed. When we expand the buffer pool, more
entries can be allocated, but we cannot resize the hash table directory
without rehashing all the entries. Just allocating more entries will
lead to more contention. Hence we set up the buffer lookup table only
once, at the beginning, sized for the maximum possible size of the
buffer pool, which is MaxAvailableMemory.

The shared buffer lookup table and StrategyControl are not resized even
if the buffer pool is resized, hence they are allocated in the main
shared memory segment.

TODO:
=====

1.
The way BgBufferSync() is written today, it packs four functionalities:
setting up the buffer sync state, performing the buffer sync, resetting
the buffer sync state when bgwriter_lru_maxpages <= 0, and setting it up
again once bgwriter_lru_maxpages > 0. That makes the code hard to read.
It would be good to divide this function into three or four functions,
each performing one functionality. Then pack all the state (the local
variables from that function converted to static globals) into a
structure, which is passed to these functions. Once that happens,
BgBufferSyncReset() will call one of those functions to reset the state
when the buffer pool is resized.

2. The condition (pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) ==
NBuffers) checked in BgBufferSync() to determine whether buffer resizing
is "about to begin" is wrong. NBuffers is not changed until
AnonymousShmemResize() is called, and that won't be called until
BgBufferSync() finishes if it has already begun. We need a better
condition to check whether buffer resizing is about to begin.

Author: Ashutosh Bapat
Reviewed-by: Tomas Vondra
---
 src/backend/port/sysv_shmem.c          | 23 ++++++--
 src/backend/storage/buffer/buf_init.c  | 19 +++++--
 src/backend/storage/buffer/buf_table.c |  9 ++-
 src/backend/storage/buffer/bufmgr.c    | 72 ++++++++++++++++------
 src/backend/storage/buffer/freelist.c  | 77 ++++++++++++++++++++++++--
 src/include/storage/buf_internals.h    |  1 +
 src/include/storage/bufmgr.h           |  1 +
 src/include/storage/ipc.h              |  1 +
 src/include/storage/pg_shmem.h         |  5 +-
 9 files changed, 170 insertions(+), 38 deletions(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 9e1b2c3201f1..3be28e228aeb 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -104,6 +104,7 @@ AnonymousMapping Mappings[ANON_MAPPINGS];
 
 /* Flag telling postmaster that resize is needed */
 volatile bool pending_pm_shmem_resize = false;
+volatile bool delay_shmem_resize = false;
 
 /* Keeps track of the previous NBuffers value */
 static int	NBuffersOld = -1;
@@ -144,12 +145,11 @@ static int	NBuffersPending = -1;
 * makes sense to evaluate them more precise.
 */
 static double SHMEM_RESIZE_RATIO[6] = {
-	0.1,						/* MAIN_SHMEM_SEGMENT */
+	0.15,						/* MAIN_SHMEM_SEGMENT */
 	0.6,						/* BUFFERS_SHMEM_SEGMENT */
 	0.1,						/* BUFFER_DESCRIPTORS_SHMEM_SEGMENT */
 	0.1,						/* BUFFER_IOCV_SHMEM_SEGMENT */
 	0.05,						/* CHECKPOINT_BUFFERS_SHMEM_SEGMENT */
-	0.05,						/* STRATEGY_SHMEM_SEGMENT */
 };
 
 /*
@@ -225,8 +225,6 @@ MappingName(int shmem_segment)
 			return "iocv";
 		case CHECKPOINT_BUFFERS_SHMEM_SEGMENT:
 			return "checkpoint";
-		case STRATEGY_SHMEM_SEGMENT:
-			return "strategy";
 		default:
 			return "unknown";
 	}
@@ -1125,13 +1123,17 @@ ProcessBarrierShmemResize(Barrier *barrier)
 {
 	Assert(IsUnderPostmaster);
 
-	elog(DEBUG1, "Handle a barrier for shmem resizing from %d to %d, %d",
-		 NBuffersOld, NBuffersPending, pending_pm_shmem_resize);
+	elog(DEBUG1, "Handle a barrier for shmem resizing from %d to %d, %d, %d",
+		 NBuffersOld, NBuffersPending, pending_pm_shmem_resize, delay_shmem_resize);
 
 	/* Wait until we have seen the new NBuffers value */
 	if (!pending_pm_shmem_resize)
 		return false;
 
+	/* Wait till this process becomes ready to resize buffers. */
+	if (delay_shmem_resize)
+		return false;
+
 	/*
 	 * First thing to do after attaching to the barrier is to wait for others.
	 * We can't simply use BarrierArriveAndWait, because backends might arrive
@@ -1182,6 +1184,15 @@
 	/* The second phase means the resize has finished, SHMEM_RESIZE_DONE */
 	BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_DONE);
 
+	if (MyBackendType == B_BG_WRITER)
+	{
+		/*
+		 * Before resuming regular background writer activity, adjust the
+		 * statistics collected so far.
+		 */
+		BgBufferSyncReset(NBuffersOld, NBuffers);
+	}
+
 	BarrierDetach(barrier);
 	return true;
 }
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 0e72e373193a..be64fa5a136e 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -152,8 +152,15 @@ BufferManagerShmemInit(int FirstBufferToInit)
 	}
 #endif
 
-	/* Init other shared buffer-management stuff */
-	StrategyInitialize(!foundDescs);
+	/*
+	 * Init other shared buffer-management stuff from scratch when
+	 * configuring the buffer pool for the first time. If we are just
+	 * resizing the buffer pool, adjust only the required structures.
+	 */
+	if (FirstBufferToInit == 0)
+		StrategyInitialize(!foundDescs);
+	else
+		StrategyReInitialize(FirstBufferToInit);
 
 	/* Initialize per-backend file flush context */
 	WritebackContextInit(&BackendWritebackContext,
@@ -184,9 +191,6 @@ BufferManagerShmemSize(void)
 	size = add_size(size, mul_size(NBuffers, BLCKSZ));
 	Mappings[BUFFERS_SHMEM_SEGMENT].shmem_req_size = size;
 
-	/* size of stuff controlled by freelist.c */
-	Mappings[STRATEGY_SHMEM_SEGMENT].shmem_req_size = StrategyShmemSize();
-
 	/* size of I/O condition variables, plus alignment padding */
 	size = add_size(0, mul_size(NBuffers,
 								sizeof(ConditionVariableMinimallyPadded)));
@@ -196,5 +200,10 @@ BufferManagerShmemSize(void)
 	/* size of checkpoint sort array in bufmgr.c */
 	Mappings[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_req_size = mul_size(NBuffers, sizeof(CkptSortItem));
 
+	/* Allocations in the main memory segment, at the end. */
+
+	/* size of stuff controlled by freelist.c */
+	size = add_size(0, StrategyShmemSize());
+
 	return size;
 }
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
index 18a789671386..e5a97e557d9c 100644
--- a/src/backend/storage/buffer/buf_table.c
+++ b/src/backend/storage/buffer/buf_table.c
@@ -65,11 +65,18 @@ InitBufTable(int size)
 	info.entrysize = sizeof(BufferLookupEnt);
 	info.num_partitions = NUM_BUFFER_PARTITIONS;
 
+	/*
+	 * The shared buffer lookup table is set up only once, with the maximum
+	 * possible number of entries given the maximum size of the buffer pool.
+	 * It is not resized after that even if the buffer pool is resized. Hence
+	 * it is allocated in the main shared memory segment and not in a
+	 * resizable shared memory segment.
+	 */
 	SharedBufHash = ShmemInitHashInSegment("Shared Buffer Lookup Table",
 										   size, size,
 										   &info,
 										   HASH_ELEM | HASH_BLOBS | HASH_PARTITION | HASH_FIXED_SIZE,
-										   STRATEGY_SHMEM_SEGMENT);
+										   MAIN_SHMEM_SEGMENT);
 }
 
 /*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 5424c405b44e..48c46d5b9637 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3580,6 +3580,32 @@ BufferSync(int flags)
 	TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
 }
 
+/*
+ * Information saved between BgBufferSync() calls so we can determine the
+ * strategy point's advance rate and avoid scanning already-cleaned buffers.
+ * The variables are global instead of static local so that
+ * BgBufferSyncReset() can adjust them when resizing shared buffers.
+ */
+static bool saved_info_valid = false;
+static int	prev_strategy_buf_id;
+static uint32 prev_strategy_passes;
+static int	next_to_clean;
+static uint32 next_passes;
+
+/* Moving averages of allocation rate and clean-buffer density */
+static float smoothed_alloc = 0;
+static float smoothed_density = 10.0;
+
+void
+BgBufferSyncReset(int NBuffersOld, int NBuffersNew)
+{
+	saved_info_valid = false;
+#ifdef BGW_DEBUG
+	elog(DEBUG2, "invalidated background writer status after resizing buffers from %d to %d",
+		 NBuffersOld, NBuffersNew);
+#endif
+}
+
 /*
  * BgBufferSync -- Write out some dirty buffers in the pool.
  *
@@ -3599,20 +3625,6 @@ BgBufferSync(WritebackContext *wb_context)
 	uint32		strategy_passes;
 	uint32		recent_alloc;
 
-	/*
-	 * Information saved between calls so we can determine the strategy
-	 * point's advance rate and avoid scanning already-cleaned buffers.
-	 */
-	static bool saved_info_valid = false;
-	static int	prev_strategy_buf_id;
-	static uint32 prev_strategy_passes;
-	static int	next_to_clean;
-	static uint32 next_passes;
-
-	/* Moving averages of allocation rate and clean-buffer density */
-	static float smoothed_alloc = 0;
-	static float smoothed_density = 10.0;
-
 	/* Potentially these could be tunables, but for now, not */
 	float		smoothing_samples = 16;
 	float		scan_whole_pool_milliseconds = 120000.0;
@@ -3635,6 +3647,22 @@ BgBufferSync(WritebackContext *wb_context)
 	long		new_strategy_delta;
 	uint32		new_recent_alloc;
 
+	/*
+	 * If the buffer pool is being shrunk, the buffer being written out may
+	 * not remain valid. If the buffer pool is being expanded, more buffers
+	 * will become available without this function writing any out. Hence
+	 * wait until buffer resizing finishes, i.e. go into hibernation mode.
+	 */
+	if (pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) != NBuffers)
+		return true;
+
+	/*
+	 * Resizing shared buffers while this function is performing an LRU scan
+	 * on them may lead to wrong results. Indicate that the resizing should
+	 * wait for the LRU scan to complete.
+	 */
+	delay_shmem_resize = true;
+
 	/*
 	 * Find out where the clock-sweep currently is, and how many buffer
 	 * allocations have happened since our last call.
@@ -3811,8 +3839,17 @@ BgBufferSync(WritebackContext *wb_context)
 	num_written = 0;
 	reusable_buffers = reusable_buffers_est;
 
-	/* Execute the LRU scan */
-	while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
+	/*
+	 * Execute the LRU scan.
+	 *
+	 * If the buffer pool is being shrunk, the buffer being written may not
+	 * remain valid. If the buffer pool is being expanded, more buffers will
+	 * become available without this function writing any. Hence stop what we
+	 * are doing. This also unblocks other processes that are waiting for
+	 * buffer resizing to finish.
+	 */
+	while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est &&
+		   pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) == NBuffers)
 	{
 		int			sync_state = SyncOneBuffer(next_to_clean, true,
 											   wb_context);
@@ -3871,6 +3908,9 @@ BgBufferSync(WritebackContext *wb_context)
 #endif
 	}
 
+	/* Let the resizing commence. */
+	delay_shmem_resize = false;
+
 	/* Return true if OK to hibernate */
 	return (bufs_to_lap == 0 && recent_alloc == 0);
 }
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index db8aafdaf8c9..89269087034a 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -371,12 +371,21 @@ StrategyInitialize(bool init)
 	 *
 	 * Since we can't tolerate running out of lookup table entries, we must be
 	 * sure to specify an adequate table size here. The maximum steady-state
-	 * usage is of course NBuffers entries, but BufferAlloc() tries to insert
-	 * a new entry before deleting the old. In principle this could be
-	 * happening in each partition concurrently, so we could need as many as
-	 * NBuffers + NUM_BUFFER_PARTITIONS entries.
+	 * usage is of course as many entries as there are buffers in the buffer
+	 * pool. Right now there is no way to free shared memory; even if we
+	 * shrink the buffer lookup table when shrinking the buffer pool, the
+	 * unused hash table entries cannot be freed. When we expand the buffer
+	 * pool, more entries can be allocated, but we cannot resize the hash
+	 * table directory without rehashing all the entries. Just allocating
+	 * more entries will lead to more contention. Hence we set up the buffer
+	 * lookup table considering the maximum possible size of the buffer pool,
+	 * which is MaxAvailableMemory.
+	 *
+	 * Additionally, BufferAlloc() tries to insert a new entry before deleting
+	 * the old. In principle this could be happening in each partition
+	 * concurrently, so we need NUM_BUFFER_PARTITIONS extra entries.
 	 */
-	InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
+	InitBufTable(MaxAvailableMemory + NUM_BUFFER_PARTITIONS);
 
 	/*
 	 * Get or create the shared strategy control block
@@ -384,7 +393,7 @@ StrategyInitialize(bool init)
 	StrategyControl = (BufferStrategyControl *)
 		ShmemInitStructInSegment("Buffer Strategy Status",
 								 sizeof(BufferStrategyControl),
-								 &found, STRATEGY_SHMEM_SEGMENT);
+								 &found, MAIN_SHMEM_SEGMENT);
 
 	if (!found)
 	{
@@ -409,6 +418,62 @@ StrategyInitialize(bool init)
 		Assert(!init);
 }
 
+/*
+ * StrategyReInitialize -- re-initialize the buffer cache replacement
+ *		strategy.
+ *
+ * To be called when resizing the buffer pool, and only from the coordinator.
+ * TODO: Assess the differences between this function and StrategyInitialize().
+ */
+void
+StrategyReInitialize(int FirstBufferIdToInit)
+{
+	bool		found;
+
+	/*
+	 * Resizing memory for buffer pools should not affect the address of
+	 * StrategyControl.
+	 */
+	if (StrategyControl != (BufferStrategyControl *)
+		ShmemInitStructInSegment("Buffer Strategy Status",
+								 sizeof(BufferStrategyControl),
+								 &found, MAIN_SHMEM_SEGMENT))
+		elog(FATAL, "something went wrong while re-initializing the buffer strategy");
+
+	Assert(found);
+
+	/*
+	 * TODO: Buffer lookup table adjustment. There are two options:
+	 *
+	 * 1. Resize the buffer lookup table to match the new number of buffers.
+	 * But this requires rehashing all the entries in the buffer lookup table
+	 * with the new table size.
+	 *
+	 * 2. Allocate the maximum size of the buffer lookup table at the
+	 * beginning and never resize it. This leaves a sparse buffer lookup
+	 * table, which is inefficient from both a memory and a time perspective.
+	 * According to David Rowley, the sparse entries in the buffer lookup
+	 * table cause frequent cacheline reloads, which affects performance. If
+	 * the impact of that inefficiency in a benchmark is significant, we will
+	 * need to consider the first option.
+	 */
+
+	/*
+	 * The clock-sweep tick pointer might have been invalidated. Reset it as
+	 * if starting a fresh server.
+	 */
+	pg_atomic_write_u32(&StrategyControl->nextVictimBuffer, 0);
+
+	/*
+	 * The old statistics are viewed in the context of the number of shared
+	 * buffers. They do not make sense now that the number of shared buffers
+	 * itself has changed.
+	 */
+	StrategyControl->completePasses = 0;
+	pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
+
+	/* No pending notification */
+	StrategyControl->bgwprocno = -1;
+}
+
 /* ----------------------------------------------------------------
  *				Backend-private buffer ring management
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index dfd614f7ca44..551479649ca2 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -443,6 +443,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
 
 extern Size StrategyShmemSize(void);
 extern void StrategyInitialize(bool init);
+extern void StrategyReInitialize(int FirstBufferIdToInit);
 
 /* buf_table.c */
 extern Size BufTableShmemSize(int size);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index c91a42fc5987..2fe3202168b5 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -299,6 +299,7 @@ extern bool IsBufferCleanupOK(Buffer buffer);
 extern bool HoldingBufferPinThatDelaysRecovery(void);
 
 extern bool BgBufferSync(WritebackContext *wb_context);
+extern void BgBufferSyncReset(int NBuffersOld, int NBuffersNew);
 
 extern uint32 GetPinLimit(void);
 extern uint32 GetLocalPinLimit(void);
diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h
index 847f56a36dcb..6e7b0abb6257 100644
--- a/src/include/storage/ipc.h
+++ b/src/include/storage/ipc.h
@@ -65,6 +65,7 @@ typedef void (*shmem_startup_hook_type) (void);
 extern PGDLLIMPORT bool proc_exit_inprogress;
 extern PGDLLIMPORT bool shmem_exit_inprogress;
 extern PGDLLIMPORT volatile bool pending_pm_shmem_resize;
+extern PGDLLIMPORT volatile bool delay_shmem_resize;
 
 pg_noreturn extern void proc_exit(int code);
 extern void shmem_exit(int code);
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 0a59746b4724..704b065f9e95 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -65,7 +65,7 @@ typedef struct ShmemSegment
 } ShmemSegment;
 
 /* Number of available segments for anonymous memory mappings */
-#define ANON_MAPPINGS 6
+#define ANON_MAPPINGS 5
 
 extern PGDLLIMPORT ShmemSegment Segments[ANON_MAPPINGS];
 extern PGDLLIMPORT AnonymousMapping Mappings[ANON_MAPPINGS];
@@ -172,7 +172,4 @@ extern void ShmemControlInit(void);
 /* Checkpoint BufferIds */
 #define CHECKPOINT_BUFFERS_SHMEM_SEGMENT 4
 
-/* Buffer strategy status */
-#define STRATEGY_SHMEM_SEGMENT 5
-
 #endif							/* PG_SHMEM_H */

From adf52ae3d6da7bb3d71a790c3142b5c1afbb9649 Mon Sep 17 00:00:00 2001
From: Ashutosh Bapat
Date: Wed, 3 Sep 2025 10:59:20 +0530
Subject: [PATCH 16/16] Tests for dynamic shared_buffers resizing

The commit adds two tests:
1. A TAP test to stress-test buffer pool resizing under concurrent load.
2. A SQL test to check the sanity of shared memory allocations and
   mappings after a buffer pool resizing operation.
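The user-visible flow that both tests exercise boils down to the
following (sizes arbitrary; the sleep is a crude stand-in, since there is
currently no way to wait for the resize to complete, as noted in the TAP
test below):

    ALTER SYSTEM SET shared_buffers = '256MB';
    SELECT pg_reload_conf();
    SELECT pg_sleep(1);                   -- wait for the resize to take effect
    SHOW shared_buffers;                  -- now reports 256MB
    SELECT count(*) FROM pg_buffercache;  -- 32768 buffers of 8kB each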
Author: Palak Chaturvedi
Author: Ashutosh Bapat
---
 src/test/Makefile                             |   2 +-
 src/test/README                               |   3 +
 src/test/buffermgr/Makefile                   |  27 ++
 src/test/buffermgr/README                     |  26 ++
 src/test/buffermgr/expected/buffer_resize.out | 237 ++++++++++++++++++
 src/test/buffermgr/meson.build                |  17 ++
 src/test/buffermgr/sql/buffer_resize.sql      |  73 ++++++
 src/test/buffermgr/t/001_resize_buffer.pl     | 126 ++++++++++
 src/test/meson.build                          |   1 +
 9 files changed, 511 insertions(+), 1 deletion(-)
 create mode 100644 src/test/buffermgr/Makefile
 create mode 100644 src/test/buffermgr/README
 create mode 100644 src/test/buffermgr/expected/buffer_resize.out
 create mode 100644 src/test/buffermgr/meson.build
 create mode 100644 src/test/buffermgr/sql/buffer_resize.sql
 create mode 100644 src/test/buffermgr/t/001_resize_buffer.pl

diff --git a/src/test/Makefile b/src/test/Makefile
index 511a72e6238a..95f8858a8183 100644
--- a/src/test/Makefile
+++ b/src/test/Makefile
@@ -12,7 +12,7 @@ subdir = src/test
 top_builddir = ../..
 include $(top_builddir)/src/Makefile.global
 
-SUBDIRS = perl postmaster regress isolation modules authentication recovery subscription
+SUBDIRS = perl postmaster regress isolation modules authentication recovery subscription buffermgr
 
 ifeq ($(with_icu),yes)
 SUBDIRS += icu
diff --git a/src/test/README b/src/test/README
index afdc76765190..77f11607ff76 100644
--- a/src/test/README
+++ b/src/test/README
@@ -15,6 +15,9 @@ examples/
   Demonstration programs for libpq that double as regression tests via
   "make check"
 
+buffermgr/
+  Tests for resizing the buffer pool without restarting the server
+
 isolation/
   Tests for concurrent behavior at the SQL level
 
diff --git a/src/test/buffermgr/Makefile b/src/test/buffermgr/Makefile
new file mode 100644
index 000000000000..97c3da9e20a3
--- /dev/null
+++ b/src/test/buffermgr/Makefile
@@ -0,0 +1,27 @@
+#-------------------------------------------------------------------------
+#
+# Makefile for src/test/buffermgr
+#
+# Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+# Portions Copyright (c) 1994, Regents of the University of California
+#
+# src/test/buffermgr/Makefile
+#
+#-------------------------------------------------------------------------
+
+EXTRA_INSTALL = contrib/pg_buffercache
+
+REGRESS = buffer_resize
+
+subdir = src/test/buffermgr
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+check:
+	$(prove_check)
+
+installcheck:
+	$(prove_installcheck)
+
+clean distclean:
+	rm -rf tmp_check
diff --git a/src/test/buffermgr/README b/src/test/buffermgr/README
new file mode 100644
index 000000000000..c375ad809892
--- /dev/null
+++ b/src/test/buffermgr/README
@@ -0,0 +1,26 @@
+src/test/buffermgr/README
+
+Regression tests for buffer manager
+===================================
+
+This directory contains a test suite for resizing the buffer pool without
+restarting the server.
+
+
+Running the tests
+=================
+
+NOTE: You must have given the --enable-tap-tests argument to configure.
+
+Run
+    make check
+or
+    make installcheck
+You can use "make installcheck" if you previously did "make install".
+In that case, the code in the installation tree is tested. With
+"make check", a temporary installation tree is built from the current
+sources and then tested.
+
+Either way, this test initializes, starts, and stops a test Postgres
+cluster.
+
+See src/test/perl/README for more info about running these tests.
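For example, with a configured make-based build tree, the suite can be
invoked in the usual way from the top of the source tree (per the
Makefile above, `make check` drives the TAP test against a temporary
installation):

    make -C src/test/buffermgr check

The equivalent meson target is registered in meson.build below.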
diff --git a/src/test/buffermgr/expected/buffer_resize.out b/src/test/buffermgr/expected/buffer_resize.out new file mode 100644 index 000000000000..a986be9a5dac --- /dev/null +++ b/src/test/buffermgr/expected/buffer_resize.out @@ -0,0 +1,237 @@ +-- Test buffer pool resizing and shared memory allocation tracking +-- This test resizes the buffer pool multiple times and monitors +-- shared memory allocations related to buffer management +-- Create a separate schema for this test +CREATE SCHEMA buffer_resize_test; +SET search_path TO buffer_resize_test, public; +-- Create a view for buffer-related shared memory allocations +CREATE VIEW buffer_allocations AS +SELECT name, segment, size, allocated_size +FROM pg_shmem_allocations +WHERE name IN ('Buffer Blocks', 'Buffer Descriptors', 'Buffer IO Condition Variables', + 'Checkpoint BufferIds') +ORDER BY name; +-- Note: We exclude the 'main' segment even if it contains the shared buffer +-- lookup table because it contains other shared structures whose total sizes +-- may vary as the code changes. +CREATE VIEW buffer_segments AS +SELECT name, size, mapping_size, mapping_reserved_size +FROM pg_shmem_segments +WHERE name <> 'main' +ORDER BY name; +-- Enable pg_buffercache for buffer count verification +CREATE EXTENSION IF NOT EXISTS pg_buffercache; +-- Test 1: Default shared_buffers +SHOW shared_buffers; + shared_buffers +---------------- + 128MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+-----------+---------------- + Buffer Blocks | buffers | 134221824 | 134221824 + Buffer Descriptors | descriptors | 1048576 | 1048576 + Buffer IO Condition Variables | iocv | 262144 | 262144 + Checkpoint BufferIds | checkpoint | 327680 | 327680 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+-----------+--------------+----------------------- + buffers | 134225920 | 134225920 | 2576982016 + checkpoint | 335872 | 335872 | 214753280 + descriptors | 1056768 | 1056768 | 429498368 + iocv | 270336 | 270336 | 429498368 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 16384 +(1 row) + +-- Test 2: Set to 64MB +ALTER SYSTEM SET shared_buffers = '64MB'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(1); + pg_sleep +---------- + +(1 row) + +SHOW shared_buffers; + shared_buffers +---------------- + 64MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+----------+---------------- + Buffer Blocks | buffers | 67112960 | 67112960 + Buffer Descriptors | descriptors | 524288 | 524288 + Buffer IO Condition Variables | iocv | 131072 | 131072 + Checkpoint BufferIds | checkpoint | 163840 | 163840 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+----------+--------------+----------------------- + buffers | 67117056 | 67117056 | 2576982016 + checkpoint | 172032 | 172032 | 214753280 + descriptors | 532480 | 532480 | 429498368 + iocv | 139264 | 139264 | 429498368 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 8192 +(1 row) + +-- Test 3: Set to 256MB +ALTER SYSTEM SET shared_buffers = '256MB'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(1); + pg_sleep +---------- + +(1 row) + +SHOW shared_buffers; + 
shared_buffers +---------------- + 256MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+-----------+---------------- + Buffer Blocks | buffers | 268439552 | 268439552 + Buffer Descriptors | descriptors | 2097152 | 2097152 + Buffer IO Condition Variables | iocv | 524288 | 524288 + Checkpoint BufferIds | checkpoint | 655360 | 655360 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+-----------+--------------+----------------------- + buffers | 268443648 | 268443648 | 2576982016 + checkpoint | 663552 | 663552 | 214753280 + descriptors | 2105344 | 2105344 | 429498368 + iocv | 532480 | 532480 | 429498368 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 32768 +(1 row) + +-- Test 4: Set to 100MB (non-power-of-two) +ALTER SYSTEM SET shared_buffers = '100MB'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(1); + pg_sleep +---------- + +(1 row) + +SHOW shared_buffers; + shared_buffers +---------------- + 100MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+-----------+---------------- + Buffer Blocks | buffers | 104861696 | 104861696 + Buffer Descriptors | descriptors | 819200 | 819200 + Buffer IO Condition Variables | iocv | 204800 | 204800 + Checkpoint BufferIds | checkpoint | 256000 | 256000 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+-----------+--------------+----------------------- + buffers | 104865792 | 104865792 | 2576982016 + checkpoint | 262144 | 262144 | 214753280 + descriptors | 827392 | 827392 | 429498368 + iocv | 212992 | 212992 | 429498368 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 12800 +(1 row) + +-- Test 5: Set to minimum 128kB +ALTER SYSTEM SET shared_buffers = '128kB'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(1); + pg_sleep +---------- + +(1 row) + +SHOW shared_buffers; + shared_buffers +---------------- + 128kB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+--------+---------------- + Buffer Blocks | buffers | 135168 | 135168 + Buffer Descriptors | descriptors | 1024 | 1024 + Buffer IO Condition Variables | iocv | 256 | 256 + Checkpoint BufferIds | checkpoint | 320 | 320 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+--------+--------------+----------------------- + buffers | 139264 | 139264 | 2576982016 + checkpoint | 8192 | 8192 | 214753280 + descriptors | 8192 | 8192 | 429498368 + iocv | 8192 | 8192 | 429498368 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 16 +(1 row) + +-- Clean up the schema and all its objects +RESET search_path; +DROP SCHEMA buffer_resize_test CASCADE; +NOTICE: drop cascades to 3 other objects +DETAIL: drop cascades to view buffer_resize_test.buffer_allocations +drop cascades to view buffer_resize_test.buffer_segments +drop cascades to extension pg_buffercache diff --git a/src/test/buffermgr/meson.build b/src/test/buffermgr/meson.build new file mode 100644 index 000000000000..e71dcdea685f --- /dev/null +++ b/src/test/buffermgr/meson.build @@ -0,0 +1,17 @@ 
+# Copyright (c) 2022-2025, PostgreSQL Global Development Group
+
+tests += {
+  'name': 'buffermgr',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'regress': {
+    'sql': [
+      'buffer_resize',
+    ],
+  },
+  'tap': {
+    'tests': [
+      't/001_resize_buffer.pl',
+    ],
+  },
+}
diff --git a/src/test/buffermgr/sql/buffer_resize.sql b/src/test/buffermgr/sql/buffer_resize.sql
new file mode 100644
index 000000000000..45f5bb6d78be
--- /dev/null
+++ b/src/test/buffermgr/sql/buffer_resize.sql
@@ -0,0 +1,73 @@
+-- Test buffer pool resizing and shared memory allocation tracking
+-- This test resizes the buffer pool multiple times and monitors
+-- shared memory allocations related to buffer management
+
+-- Create a separate schema for this test
+CREATE SCHEMA buffer_resize_test;
+SET search_path TO buffer_resize_test, public;
+
+-- Create a view for buffer-related shared memory allocations
+CREATE VIEW buffer_allocations AS
+SELECT name, segment, size, allocated_size
+FROM pg_shmem_allocations
+WHERE name IN ('Buffer Blocks', 'Buffer Descriptors', 'Buffer IO Condition Variables',
+               'Checkpoint BufferIds')
+ORDER BY name;
+
+-- Note: We exclude the 'main' segment even if it contains the shared buffer
+-- lookup table because it contains other shared structures whose total sizes
+-- may vary as the code changes.
+CREATE VIEW buffer_segments AS
+SELECT name, size, mapping_size, mapping_reserved_size
+FROM pg_shmem_segments
+WHERE name <> 'main'
+ORDER BY name;
+
+-- Enable pg_buffercache for buffer count verification
+CREATE EXTENSION IF NOT EXISTS pg_buffercache;
+
+-- Test 1: Default shared_buffers
+SHOW shared_buffers;
+SELECT * FROM buffer_allocations;
+SELECT * FROM buffer_segments;
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+
+-- Test 2: Set to 64MB
+ALTER SYSTEM SET shared_buffers = '64MB';
+SELECT pg_reload_conf();
+SELECT pg_sleep(1);
+SHOW shared_buffers;
+SELECT * FROM buffer_allocations;
+SELECT * FROM buffer_segments;
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+
+-- Test 3: Set to 256MB
+ALTER SYSTEM SET shared_buffers = '256MB';
+SELECT pg_reload_conf();
+SELECT pg_sleep(1);
+SHOW shared_buffers;
+SELECT * FROM buffer_allocations;
+SELECT * FROM buffer_segments;
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+
+-- Test 4: Set to 100MB (non-power-of-two)
+ALTER SYSTEM SET shared_buffers = '100MB';
+SELECT pg_reload_conf();
+SELECT pg_sleep(1);
+SHOW shared_buffers;
+SELECT * FROM buffer_allocations;
+SELECT * FROM buffer_segments;
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+
+-- Test 5: Set to minimum 128kB
+ALTER SYSTEM SET shared_buffers = '128kB';
+SELECT pg_reload_conf();
+SELECT pg_sleep(1);
+SHOW shared_buffers;
+SELECT * FROM buffer_allocations;
+SELECT * FROM buffer_segments;
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+
+-- Clean up the schema and all its objects
+RESET search_path;
+DROP SCHEMA buffer_resize_test CASCADE;
diff --git a/src/test/buffermgr/t/001_resize_buffer.pl b/src/test/buffermgr/t/001_resize_buffer.pl
new file mode 100644
index 000000000000..8cf9e4539ab5
--- /dev/null
+++ b/src/test/buffermgr/t/001_resize_buffer.pl
@@ -0,0 +1,126 @@
+# Copyright (c) 2025-2025, PostgreSQL Global Development Group
+#
+# Minimal test exercising shared_buffers resizing under load
+
+use strict;
+use warnings;
+use IPC::Run;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Function to resize the buffer pool and verify the change.
+sub apply_and_verify_buffer_change
+{
+	my ($node, $new_size) = @_;
+
+	# Use a single background_psql session for consistency
+	my $psql_session = $node->background_psql('postgres');
+	$psql_session->query_safe("ALTER SYSTEM SET shared_buffers = '$new_size'");
+	$psql_session->query_safe("SELECT pg_reload_conf()");
+
+	# Wait till the resizing finishes, using the same session.
+	#
+	# TODO: Right now there is no way to know when the resize has finished and
+	# all the backends are using the new value of shared_buffers. Hence we
+	# poll manually until we get the expected value in the same session.
+	my $current_size;
+	my $attempts = 0;
+	my $max_attempts = 60;    # 60 seconds timeout
+	do {
+		$current_size = $psql_session->query_safe("SHOW shared_buffers");
+		$attempts++;
+
+		# Only sleep if we didn't get the expected result and haven't timed out yet
+		if ($current_size ne $new_size && $attempts < $max_attempts) {
+			sleep(1);
+		}
+	} while ($current_size ne $new_size && $attempts < $max_attempts);
+
+	$psql_session->quit;
+
+	# Check if we succeeded or timed out
+	if ($current_size ne $new_size) {
+		die "Timeout waiting for shared_buffers to change to $new_size (got $current_size after ${attempts}s)";
+	}
+}
+
+# Initialize a cluster and start pgbench in the background for concurrent load.
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+$node->start;
+$node->safe_psql('postgres', "CREATE EXTENSION pg_buffercache");
+my $pgb_scale = 10;
+my $pgb_duration = 120;
+my $pgb_num_clients = 10;
+$node->pgbench(
+	"--initialize --init-steps=dtpvg --scale=$pgb_scale --quiet",
+	0,
+	[qr{^$}],
+	[    # stderr patterns to verify initialization stages
+		qr{dropping old tables},
+		qr{creating tables},
+		qr{done in \d+\.\d\d s }
+	],
+	"pgbench initialization (scale=$pgb_scale)"
+);
+my ($pgbench_stdin, $pgbench_stdout, $pgbench_stderr) = ('', '', '');
+my $pgbench_process = IPC::Run::start(
+	[
+		'pgbench',
+		'-p', $node->port,
+		'-T', $pgb_duration,
+		'-c', $pgb_num_clients,
+		'postgres'
+	],
+	'<' => \$pgbench_stdin,
+	'>' => \$pgbench_stdout,
+	'2>' => \$pgbench_stderr
+);
+
+ok($pgbench_process, "pgbench started successfully");
+
+# Allow pgbench to establish connections and start generating load.
+#
+# TODO: When creating new backends is known to work well with buffer pool
+# resizing, this wait should be removed.
+sleep(1);
+
+# Resize the buffer pool to various sizes while pgbench is running in the
+# background.
+#
+# TODO: These are pseudo-randomly picked sizes, but we can do better.
+my $tests_completed = 0;
+my @buffer_sizes = ('900MB', '500MB', '250MB', '400MB', '120MB', '600MB');
+for my $target_size (@buffer_sizes)
+{
+	# Verify the workload generator is still running
+	if (!$pgbench_process->pumpable) {
+		ok(0, "pgbench is still running");
+		last;
+	}
+
+	apply_and_verify_buffer_change($node, $target_size);
+	$tests_completed++;
+
+	# Wait for the resized buffer pool to stabilize. If the resized buffer
+	# pool is utilized fully, it might hit any wrongly initialized areas of
+	# shared memory.
+	sleep(2);
+}
+is($tests_completed, scalar(@buffer_sizes), "All buffer sizes were tested");
+
+# Make sure that pgbench can end normally.
+$pgbench_process->signal('TERM');
+IPC::Run::finish $pgbench_process;
+ok(grep { $pgbench_process->result == $_ } (0, 15), "pgbench exited gracefully");
+
+# Log any error output from pgbench for debugging
+diag("pgbench stderr:\n$pgbench_stderr");
+diag("pgbench stdout:\n$pgbench_stdout");
+
+# Ensure the database is still functional after all the buffer changes
+$node->connect_ok("dbname=postgres",
+	"Database remains accessible after $tests_completed buffer resize operations");
+
+done_testing();
\ No newline at end of file
diff --git a/src/test/meson.build b/src/test/meson.build
index ccc31d6a86a1..2a5ba1dec398 100644
--- a/src/test/meson.build
+++ b/src/test/meson.build
@@ -4,6 +4,7 @@ subdir('regress')
 subdir('isolation')
 subdir('authentication')
+subdir('buffermgr')
 subdir('postmaster')
 subdir('recovery')
 subdir('subscription')