From 1a80b7d80519468b8f7d82218c5f021742c34dd6 Mon Sep 17 00:00:00 2001 From: Daniel Gustafsson Date: Fri, 15 Aug 2025 14:48:02 +0200 Subject: [PATCH 1/2] Online enabling and disabling of data checksums This allows data checksums to be enabled, or disabled, in a running cluster without restricting access to the cluster during processing. Data checksums could prior to this only be enabled during initdb or when the cluster is offline using the pg_checksums app. This commit introduces functionality to enable, or disable, data checksums while the cluster is running regardless of how it was initialized. A background worker launcher process is responsible for launching a dynamic per-database background worker which will mark all buffers dirty for all relations with storage in order for them to have data checksums calculated on write. Once all relations in all databases have been processed, the data_checksums state will be set to on and the cluster will at that point be identical to one which had data checksums enabled during initialization or via offline processing. When data checksums are being enabled, concurrent I/O operations from backends other than the data checksums worker will write the checksums but not verify them on reading. Only when all backends have absorbed the procsignalbarrier for setting data_checksums to on will they also start verifying checksums on reading. The same process is repeated during disabling; all backends write checksums but do not verify them until the barrier for setting the state to off has been absorbed by all. This in-progress state is used to ensure there are no false negatives (or positives) due to reading a checksum which is not in sync with the page. A new test module, test_checksums, is introduced with an extensive set of tests covering both online and offline data checksum mode changes. The tests for online processing are gated behind the PG_TEST_EXTRA flag to some degree due to being very time consuming to run. 
This work is based on an earlier version of this patch which was reviewed by among others Heikki Linnakangas, Robert Haas, Andres Freund, Tomas Vondra, Michael Banck and Andrey Borodin. During the work on this new version, Tomas Vondra has given invaluable assistance with not only coding and reviewing but very in-depth testing. Author: Daniel Gustafsson Author: Magnus Hagander Co-authored-by: Tomas Vondra Reviewed-by: Tomas Vondra Discussion: https://postgr.es/m/CABUevExz9hUUOLnJVr2kpw9Cx=o4MCr1SVKwbupzuxP7ckNutA@mail.gmail.com Discussion: https://postgr.es/m/20181030051643.elbxjww5jjgnjaxg@alap3.anarazel.de Discussion: https://postgr.es/m/CABUevEwE3urLtwxxqdgd5O2oQz9J717ZzMbh+ziCSa5YLLU_BA@mail.gmail.com --- doc/src/sgml/func/func-admin.sgml | 71 + doc/src/sgml/glossary.sgml | 23 + doc/src/sgml/monitoring.sgml | 208 ++- doc/src/sgml/ref/pg_checksums.sgml | 6 + doc/src/sgml/regress.sgml | 12 + doc/src/sgml/wal.sgml | 59 +- src/backend/access/rmgrdesc/xlogdesc.c | 24 + src/backend/access/transam/xlog.c | 678 +++++++- src/backend/access/transam/xlogfuncs.c | 57 + src/backend/access/transam/xlogrecovery.c | 13 + src/backend/backup/basebackup.c | 6 +- src/backend/catalog/system_functions.sql | 20 + src/backend/catalog/system_views.sql | 20 + src/backend/postmaster/Makefile | 1 + src/backend/postmaster/auxprocess.c | 19 + src/backend/postmaster/bgworker.c | 7 + src/backend/postmaster/datachecksumsworker.c | 1471 +++++++++++++++++ src/backend/postmaster/meson.build | 1 + src/backend/postmaster/postmaster.c | 5 + src/backend/replication/logical/decode.c | 1 + src/backend/storage/ipc/ipci.c | 4 + src/backend/storage/ipc/procsignal.c | 14 + src/backend/storage/page/README | 4 +- src/backend/storage/page/bufpage.c | 10 +- src/backend/utils/activity/pgstat_backend.c | 2 + src/backend/utils/activity/pgstat_io.c | 2 + .../utils/activity/wait_event_names.txt | 4 + src/backend/utils/adt/pgstatfuncs.c | 8 +- src/backend/utils/init/miscinit.c | 3 +- 
src/backend/utils/init/postinit.c | 20 +- src/backend/utils/misc/guc_parameters.dat | 5 +- src/backend/utils/misc/guc_tables.c | 9 +- src/bin/pg_checksums/pg_checksums.c | 4 +- src/bin/pg_controldata/pg_controldata.c | 2 + src/bin/pg_upgrade/controldata.c | 9 + src/include/access/xlog.h | 14 +- src/include/access/xlog_internal.h | 7 + src/include/catalog/pg_control.h | 6 +- src/include/catalog/pg_proc.dat | 19 + src/include/commands/progress.h | 17 + src/include/miscadmin.h | 6 + src/include/postmaster/datachecksumsworker.h | 51 + src/include/postmaster/proctypelist.h | 2 + src/include/storage/bufpage.h | 2 +- src/include/storage/checksum.h | 15 + src/include/storage/lwlocklist.h | 1 + src/include/storage/proc.h | 6 +- src/include/storage/procsignal.h | 5 + src/include/utils/backend_progress.h | 1 + src/test/modules/Makefile | 1 + src/test/modules/meson.build | 1 + src/test/modules/test_checksums/.gitignore | 2 + src/test/modules/test_checksums/Makefile | 40 + src/test/modules/test_checksums/README | 22 + src/test/modules/test_checksums/meson.build | 36 + .../modules/test_checksums/t/001_basic.pl | 63 + .../modules/test_checksums/t/002_restarts.pl | 110 ++ .../test_checksums/t/003_standby_restarts.pl | 114 ++ .../modules/test_checksums/t/004_offline.pl | 82 + .../modules/test_checksums/t/005_injection.pl | 126 ++ .../test_checksums/t/006_pgbench_single.pl | 268 +++ .../test_checksums/t/007_pgbench_standby.pl | 398 +++++ .../test_checksums/t/DataChecksums/Utils.pm | 283 ++++ .../test_checksums/test_checksums--1.0.sql | 28 + .../modules/test_checksums/test_checksums.c | 225 +++ .../test_checksums/test_checksums.control | 4 + src/test/perl/PostgreSQL/Test/Cluster.pm | 45 + src/test/regress/expected/rules.out | 36 + src/test/regress/expected/stats.out | 18 +- src/tools/pgindent/typedefs.list | 6 + 70 files changed, 4815 insertions(+), 47 deletions(-) create mode 100644 src/backend/postmaster/datachecksumsworker.c create mode 100644 
src/include/postmaster/datachecksumsworker.h create mode 100644 src/test/modules/test_checksums/.gitignore create mode 100644 src/test/modules/test_checksums/Makefile create mode 100644 src/test/modules/test_checksums/README create mode 100644 src/test/modules/test_checksums/meson.build create mode 100644 src/test/modules/test_checksums/t/001_basic.pl create mode 100644 src/test/modules/test_checksums/t/002_restarts.pl create mode 100644 src/test/modules/test_checksums/t/003_standby_restarts.pl create mode 100644 src/test/modules/test_checksums/t/004_offline.pl create mode 100644 src/test/modules/test_checksums/t/005_injection.pl create mode 100644 src/test/modules/test_checksums/t/006_pgbench_single.pl create mode 100644 src/test/modules/test_checksums/t/007_pgbench_standby.pl create mode 100644 src/test/modules/test_checksums/t/DataChecksums/Utils.pm create mode 100644 src/test/modules/test_checksums/test_checksums--1.0.sql create mode 100644 src/test/modules/test_checksums/test_checksums.c create mode 100644 src/test/modules/test_checksums/test_checksums.control diff --git a/doc/src/sgml/func/func-admin.sgml b/doc/src/sgml/func/func-admin.sgml index 1b465bc8ba71..f3a8782ede01 100644 --- a/doc/src/sgml/func/func-admin.sgml +++ b/doc/src/sgml/func/func-admin.sgml @@ -2979,4 +2979,75 @@ SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8'); + + Data Checksum Functions + + + The functions shown in can + be used to enable or disable data checksums in a running cluster. + See for details. + + + + Data Checksum Functions + + + + + Function + + + Description + + + + + + + + + pg_enable_data_checksums + + pg_enable_data_checksums ( cost_delay int, cost_limit int ) + void + + + Initiates data checksums for the cluster. This will switch the data + checksums mode to inprogress-on as well as start a + background worker that will process all pages in the database and + enable checksums on them. 
When all data pages have had checksums + enabled, the cluster will automatically switch data checksums mode to + on. + + + If cost_delay and cost_limit are + specified, the speed of the process is throttled using the same principles as + Cost-based Vacuum Delay. + + + + + + + pg_disable_data_checksums + + pg_disable_data_checksums () + void + + + Disables data checksum validation and calculation for the cluster. This + will switch the data checksum mode to inprogress-off + while data checksums are being disabled. When all active backends have + stopped validating data checksums, the data checksum mode will be + changed to off. At this point the data pages will + still have checksums recorded but they are not updated when pages are + modified. + + + + +
+ +
+ diff --git a/doc/src/sgml/glossary.sgml b/doc/src/sgml/glossary.sgml index 8651f0cdb919..9bac0c963489 100644 --- a/doc/src/sgml/glossary.sgml +++ b/doc/src/sgml/glossary.sgml @@ -184,6 +184,8 @@ (but not the autovacuum workers), the background writer, the checkpointer, + the data checksums worker, + the data checksums worker launcher, the logger, the startup process, the WAL archiver, @@ -573,6 +575,27 @@ + + Data Checksums Worker + + + An auxiliary process + which enables or disables data checksums in a specific database. + + + + + + Data Checksums Worker Launcher + + + An auxiliary process + which starts processes + for each database. + + + + Database cluster diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index f3bf527d5b4b..b56e220f3d8c 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -3551,8 +3551,9 @@ description | Waiting for a newly initialized WAL file to reach durable storage Number of data page checksum failures detected in this - database (or on a shared object), or NULL if data checksums are - disabled. + database (or on a shared object). + Detected failures are reported regardless of the + setting. @@ -3562,8 +3563,8 @@ description | Waiting for a newly initialized WAL file to reach durable storage Time at which the last data page checksum failure was detected in - this database (or on a shared object), or NULL if data checksums are - disabled. + this database (or on a shared object). Last failure is reported + regardless of the setting. @@ -6946,6 +6947,205 @@ FROM pg_stat_get_backend_idset() AS backendid; + + Data Checksum Progress Reporting + + + pg_stat_progress_data_checksums + + + + When data checksums are being enabled on a running cluster, the + pg_stat_progress_data_checksums view will contain + a row for the launcher process, and one row for each worker process which + is currently calculating checksums for the data pages in one database. 
+ + + + <structname>pg_stat_progress_data_checksums</structname> View + + + + + + Column Type + + + Description> + + + + + + + + + + pid integer + + + Process ID of a datachecksumworker process. + + + + + + + datid oid + + + OID of this database, or 0 for the launcher process + relation + + + + + + datname name + + + Name of this database, or NULL for the + launcher process. + + + + + + + phase text + + + Current processing phase, see + for description of the phases. + + + + + + + + databases_total integer + + + The total number of databases which will be processed. Only the + launcher worker has this value set, the other worker processes + have this set to NULL. + + + + + + + + databases_done integer + + + The number of databases which have been processed. Only the + launcher worker has this value set, the other worker processes + have this set to NULL. + + + + + + + + relations_total integer + + + The total number of relations which will be processed, or + NULL if the data checksums worker process hasn't + calculated the number of relations yet. The launcher process has + this NULL. + + + + + + + + relations_done integer + + + The number of relations which have been processed. The launcher + process has this NULL. + + + + + + + + blocks_total integer + + + The number of blocks in the current relation which will be processed, + or NULL if the data checksums worker process hasn't + calculated the number of blocks yet. The launcher process has + this NULL. + + + + + + + + blocks_done integer + + + The number of blocks in the current relation which have been processed. + The launcher process has this NULL. + + + + + + +
+ + + Data Checksum Phases + + + + + + Phase + Description + + + + + enabling + + The command is currently enabling data checksums on the cluster. + + + + disabling + + The command is currently disabling data checksums on the cluster. + + + + waiting on temporary tables + + The command is currently waiting for all temporary tables which existed + at the time the command was started to be removed. + + + + waiting on checkpoint + + The command is currently waiting for a checkpoint to update the checksum + state before finishing. + + + + +
+
+ diff --git a/doc/src/sgml/ref/pg_checksums.sgml b/doc/src/sgml/ref/pg_checksums.sgml index e9e393495dfc..e764b8be04d5 100644 --- a/doc/src/sgml/ref/pg_checksums.sgml +++ b/doc/src/sgml/ref/pg_checksums.sgml @@ -45,6 +45,12 @@ PostgreSQL documentation exit status is nonzero if the operation failed. + + When enabling checksums, if checksums were in the process of being enabled + when the cluster was shut down, pg_checksums + will still process all relations regardless of the online processing. + + When verifying checksums, every file in the cluster is scanned. When enabling checksums, each relation file block with a changed checksum is diff --git a/doc/src/sgml/regress.sgml b/doc/src/sgml/regress.sgml index 8838fe7f0225..7074751834ea 100644 --- a/doc/src/sgml/regress.sgml +++ b/doc/src/sgml/regress.sgml @@ -263,6 +263,18 @@ make check-world PG_TEST_EXTRA='kerberos ldap ssl load_balance libpq_encryption' The following values are currently supported: + + checksum_extended + + + Runs additional tests for enabling data checksums which inject delays + and re-tries in the processing, as well as tests that run pgbench + concurrently and randomly restarts the cluster. Some of these test + suites requires injection points enabled in the installation. + + + + kerberos diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index f3b86b26be90..0ada90ca0b16 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -246,9 +246,10 @@ Checksums can be disabled when the cluster is initialized using initdb. - They can also be enabled or disabled at a later time as an offline - operation. Data checksums are enabled or disabled at the full cluster - level, and cannot be specified individually for databases or tables. + They can also be enabled or disabled at a later time either as an offline + operation or online in a running cluster allowing concurrent access. 
Data + checksums are enabled or disabled at the full cluster level, and cannot be + specified individually for databases or tables. @@ -265,7 +266,7 @@ - Off-line Enabling of Checksums + Offline Enabling of Checksums The pg_checksums @@ -274,6 +275,56 @@ + + + Online Enabling of Checksums + + + Checksums can be enabled or disabled online, by calling the appropriate + functions. + + + + Enabling checksums will put the cluster checksum mode in + inprogress-on mode. During this time, checksums will be + written but not verified. In addition to this, a background worker process + is started that enables checksums on all existing data in the cluster. Once + this worker has completed processing all databases in the cluster, the + checksum mode will automatically switch to on. The + processing will consume two background worker processes, make sure that + max_worker_processes allows for at least two more + additional processes. + + + + The process will initially wait for all open transactions to finish before + it starts, so that it can be certain that there are no tables that have been + created inside a transaction that has not committed yet and thus would not + be visible to the process enabling checksums. It will also, for each database, + wait for all pre-existing temporary tables to get removed before it finishes. + If long-lived temporary tables are used in the application it may be necessary + to terminate these application connections to allow the process to complete. + + + + If the cluster is stopped while in inprogress-on mode, for + any reason, then this process must be restarted manually. To do this, + re-execute the function pg_enable_data_checksums() + once the cluster has been restarted. The process will start over, there is + no support for resuming work from where it was interrupted. 
+ + + + + Enabling checksums can cause significant I/O to the system, as most of the + database pages will need to be rewritten, and will be written both to the + data files and the WAL. The impact may be limited by throttling using the + cost_delay and cost_limit + parameters of the pg_enable_data_checksums function. + + + + diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index cd6c2a2f650a..c50d654db30e 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -18,6 +18,7 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/pg_control.h" +#include "storage/bufpage.h" #include "utils/guc.h" #include "utils/timestamp.h" @@ -167,6 +168,26 @@ xlog_desc(StringInfo buf, XLogReaderState *record) memcpy(&wal_level, rec, sizeof(int)); appendStringInfo(buf, "wal_level %s", get_wal_level_string(wal_level)); } + else if (info == XLOG_CHECKSUMS) + { + xl_checksum_state xlrec; + + memcpy(&xlrec, rec, sizeof(xl_checksum_state)); + switch (xlrec.new_checksumtype) + { + case PG_DATA_CHECKSUM_VERSION: + appendStringInfoString(buf, "on"); + break; + case PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION: + appendStringInfoString(buf, "inprogress-off"); + break; + case PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION: + appendStringInfoString(buf, "inprogress-on"); + break; + default: + appendStringInfoString(buf, "off"); + } + } } const char * @@ -218,6 +239,9 @@ xlog_identify(uint8 info) case XLOG_CHECKPOINT_REDO: id = "CHECKPOINT_REDO"; break; + case XLOG_CHECKSUMS: + id = "CHECKSUMS"; + break; } return id; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 9900e3e0179f..d70d0493dcbe 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -286,6 +286,11 @@ static XLogRecPtr RedoRecPtr; */ static bool doPageWrites; +/* + * Force creating a restart point on the next CHECKPOINT after XLOG_CHECKSUMS. 
+ */ +static bool checksumRestartPoint = false; + /*---------- * Shared-memory data structures for XLOG control * @@ -550,6 +555,9 @@ typedef struct XLogCtlData */ XLogRecPtr lastFpwDisableRecPtr; + /* last data_checksum_version we've seen */ + uint32 data_checksum_version; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -573,6 +581,44 @@ static WALInsertLockPadded *WALInsertLocks = NULL; */ static ControlFileData *ControlFile = NULL; +/* + * This must match largest number of sets in barrier_eq and barrier_ne in the + * below checksum_barriers definition. + */ +#define MAX_BARRIER_CONDITIONS 2 + +/* + * Configuration of conditions which must match when absorbing a procsignal + * barrier during data checksum enable/disable operations. A single function + * is used for absorbing all barriers, and the set of conditions to use is + * looked up in the checksum_barriers struct. The struct member for the target + * state defines which state the backend must currently be in, and which it + * must not be in. 
+ */ +typedef struct ChecksumBarrierCondition +{ + /* The target state of the barrier */ + int target; + /* A set of states in which at least one MUST match the current state */ + int barrier_eq[MAX_BARRIER_CONDITIONS]; + /* The number of elements in the barrier_eq set */ + int barrier_eq_sz; + /* A set of states which all MUST NOT match the current state */ + int barrier_ne[MAX_BARRIER_CONDITIONS]; + /* The number of elements in the barrier_ne set */ + int barrier_ne_sz; +} ChecksumBarrierCondition; + +static const ChecksumBarrierCondition checksum_barriers[] = +{ + {PG_DATA_CHECKSUM_OFF, {PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION, PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION}, 2, {PG_DATA_CHECKSUM_VERSION}, 1}, + {PG_DATA_CHECKSUM_VERSION, {PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION}, 1, {0}, 0}, + {PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION, {PG_DATA_CHECKSUM_ANY_VERSION}, 1, {PG_DATA_CHECKSUM_VERSION}, 1}, + {PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION, {PG_DATA_CHECKSUM_VERSION}, 1, {0}, 0}, + {-1} +}; + + /* * Calculate the amount of space left on the page after 'endptr'. Beware * multiple evaluation! @@ -647,6 +693,36 @@ static XLogRecPtr LocalMinRecoveryPoint; static TimeLineID LocalMinRecoveryPointTLI; static bool updateMinRecoveryPoint = true; +/* + * Local state fror Controlfile data_checksum_version. After initialization + * this is only updated when absorbing a procsignal barrier during interrupt + * processing. The reason for keeping a copy in backend-private memory is to + * avoid locking for interrogating checksum state. Possible values are the + * checksum versions defined in storage/bufpage.h as well as zero when data + * checksums are disabled. + */ +static uint32 LocalDataChecksumVersion = 0; + +/* + * Flag to remember if the procsignalbarrier being absorbed for checksums is + * the first one. The first procsignalbarrier can in rare cases be for the + * state we've initialized, i.e. a duplicate. 
This may happen for any + * data_checksum_version value, but for PG_DATA_CHECKSUM_ON_VERSION this would + * trigger an assert failure (this is the only transition with an assert) when + * processing the barrier. This may happen if the process is spawned between + * the update of XLogCtl->data_checksum_version and the barrier being emitted. + * This can only happen on the very first barrier so mark that with this flag. + */ +static bool InitialDataChecksumTransition = true; + +/* + * Variable backing the GUC, keep it in sync with LocalDataChecksumVersion. + * See SetLocalDataChecksumVersion(). + */ +int data_checksums = 0; + +static void SetLocalDataChecksumVersion(uint32 data_checksum_version); + /* For WALInsertLockAcquire/Release functions */ static int MyLockNo = 0; static bool holdingAllLocks = false; @@ -715,6 +791,8 @@ static void WALInsertLockAcquireExclusive(void); static void WALInsertLockRelease(void); static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); +static void XLogChecksums(uint32 new_type); + /* * Insert an XLOG record represented by an already-constructed chain of data * chunks. This is a low-level routine; to construct the WAL record header @@ -829,9 +907,10 @@ XLogInsertRecord(XLogRecData *rdata, * only happen just after a checkpoint, so it's better to be slow in * this case and fast otherwise. * - * Also check to see if fullPageWrites was just turned on or there's a - * running backup (which forces full-page writes); if we weren't - * already doing full-page writes then go back and recompute. + * Also check to see if fullPageWrites was just turned on, there's a + * running backup or if checksums are enabled (all of which forces + * full-page writes); if we weren't already doing full-page writes + * then go back and recompute. 
* * If we aren't doing full-page writes then RedoRecPtr doesn't * actually affect the contents of the XLOG record, so we'll update @@ -844,7 +923,9 @@ XLogInsertRecord(XLogRecData *rdata, Assert(RedoRecPtr < Insert->RedoRecPtr); RedoRecPtr = Insert->RedoRecPtr; } - doPageWrites = (Insert->fullPageWrites || Insert->runningBackups > 0); + doPageWrites = (Insert->fullPageWrites || + Insert->runningBackups > 0 || + DataChecksumsNeedWrite()); if (doPageWrites && (!prevDoPageWrites || @@ -4251,6 +4332,12 @@ InitControlFile(uint64 sysidentifier, uint32 data_checksum_version) ControlFile->wal_log_hints = wal_log_hints; ControlFile->track_commit_timestamp = track_commit_timestamp; ControlFile->data_checksum_version = data_checksum_version; + + /* + * Set the data_checksum_version value into XLogCtl, which is where all + * processes get the current value from. (Maybe it should go just there?) + */ + XLogCtl->data_checksum_version = data_checksum_version; } static void @@ -4575,9 +4662,9 @@ ReadControlFile(void) CalculateCheckpointSegments(); - /* Make the initdb settings visible as GUC variables, too */ - SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no", - PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); + elog(LOG, "ReadControlFile checkpoint %X/%08X redo %X/%08X", + LSN_FORMAT_ARGS(ControlFile->checkPoint), + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo)); } /* @@ -4611,13 +4698,430 @@ GetMockAuthenticationNonce(void) } /* - * Are checksums enabled for data pages? + * DataChecksumsNeedWrite + * Returns whether data checksums must be written or not + * + * Returns true iff data checksums are enabled or are in the process of being + * enabled. During "inprogress-on" and "inprogress-off" states checksums must + * be written even though they are not verified (see datachecksumsworker.c for + * a longer discussion). 
+ * + * This function is intended for callsites which are about to write a data page + * to storage, and need to know whether to re-calculate the checksum for the + * page header. Calling this function must be performed as close to the write + * operation as possible to keep the critical section short. */ bool -DataChecksumsEnabled(void) +DataChecksumsNeedWrite(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION || + LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION || + LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); +} + +/* + * DataChecksumsNeedVerify + * Returns whether data checksums must be verified or not + * + * Data checksums are only verified if they are fully enabled in the cluster. + * During the "inprogress-on" and "inprogress-off" states they are only + * updated, not verified (see datachecksumsworker.c for a longer discussion). + * + * This function is intended for callsites which have read data and are about + * to perform checksum validation based on the result of this. Calling this + * function must be performed as close to the validation call as possible to + * keep the critical section short. This is in order to protect against time of + * check/time of use situations around data checksum validation. + */ +bool +DataChecksumsNeedVerify(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION); +} + +/* + * DataChecksumsOnInProgress + * Returns whether data checksums are being enabled + * + * Most operations don't need to worry about the "inprogress" states, and + * should use DataChecksumsNeedVerify() or DataChecksumsNeedWrite(). The + * "inprogress-on" state for enabling checksums is used when the checksum + * worker is setting checksums on all pages, it can thus be used to check for + * aborted checksum processing which need to be restarted. 
+ */ +inline bool +DataChecksumsOnInProgress(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); +} + +/* + * DataChecksumsOffInProgress + * Returns whether data checksums are being disabled + * + * The "inprogress-off" state for disabling checksums is used for when the + * worker resets the catalog state. DataChecksumsNeedVerify() or + * DataChecksumsNeedWrite() should be used for deciding whether to read/write + * checksums. + */ +bool +DataChecksumsOffInProgress(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); +} + +/* + * SetDataChecksumsOnInProgress + * Sets the data checksum state to "inprogress-on" to enable checksums + * + * To start the process of enabling data checksums in a running cluster the + * data_checksum_version state must be changed to "inprogress-on". See + * SetDataChecksumsOn below for a description on how this state change works. + * This function blocks until all backends in the cluster have acknowledged the + * state transition. + */ +void +SetDataChecksumsOnInProgress(bool immediate_checkpoint) +{ + uint64 barrier; + int flags; + + Assert(ControlFile != NULL); + + /* + * The state transition is performed in a critical section with + * checkpoints held off to provide crash safety. + */ + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + START_CRIT_SECTION(); + + XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON); + + END_CRIT_SECTION(); + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + /* + * Await state change in all backends to ensure that all backends are in + * "inprogress-on". Once done we know that all backends are writing data + * checksums. 
+ */ + WaitForProcSignalBarrier(barrier); + + /* + * force checkpoint to persist the current checksum state in control file + * etc. + * + * XXX is this needed? There's already a checkpoint at the end of + * ProcessAllDatabases, maybe this is redundant? + */ + flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT; + if (immediate_checkpoint) + flags = flags | CHECKPOINT_FAST; + RequestCheckpoint(flags); +} + +/* + * SetDataChecksumsOn + * Enables data checksums cluster-wide + * + * Enabling data checksums is performed using two barriers, the first one to + * set the state to "inprogress-on" (done by SetDataChecksumsOnInProgress()) + * and the second one to set the state to "on" (done here). Below is a short + * description of the processing, a more detailed write-up can be found in + * datachecksumsworker.c. + * + * To start the process of enabling data checksums in a running cluster the + * data_checksum_version state must be changed to "inprogress-on". This state + * requires data checksums to be written but not verified. This ensures that + * all data pages can be checksummed without the risk of false negatives in + * validation during the process. When all existing pages are guaranteed to + * have checksums, and all new pages will be initiated with checksums, the + * state can be changed to "on". Once the state is "on" checksums will be both + * written and verified. See datachecksumsworker.c for a longer discussion on + * how data checksums can be enabled in a running cluster. + * + * This function blocks until all backends in the cluster have acknowledged the + * state transition. + */ +void +SetDataChecksumsOn(bool immediate_checkpoint) { + uint64 barrier; + int flags; + Assert(ControlFile != NULL); - return (ControlFile->data_checksum_version > 0); + + SpinLockAcquire(&XLogCtl->info_lck); + + /* + * The only allowed state transition to "on" is from "inprogress-on" since + * that state ensures that all pages will have data checksums written. 
+ */ + if (XLogCtl->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + { + SpinLockRelease(&XLogCtl->info_lck); + elog(ERROR, "checksums not in \"inprogress-on\" mode"); + } + + SpinLockRelease(&XLogCtl->info_lck); + + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + INJECTION_POINT("datachecksums-enable-checksums-delay", NULL); + START_CRIT_SECTION(); + + XLogChecksums(PG_DATA_CHECKSUM_VERSION); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_VERSION; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON); + + END_CRIT_SECTION(); + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + /* + * Await state transition of "on" in all backends. When done we know that + * data checksums are enabled in all backends and data checksums are both + * written and verified. + */ + WaitForProcSignalBarrier(barrier); + + INJECTION_POINT("datachecksums-enable-checksums-pre-checkpoint", NULL); + + /* XXX is this needed? */ + flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT; + if (immediate_checkpoint) + flags = flags | CHECKPOINT_FAST; + RequestCheckpoint(flags); +} + +/* + * SetDataChecksumsOff + * Disables data checksums cluster-wide + * + * Disabling data checksums must be performed with two sets of barriers, each + * carrying a different state. The state is first set to "inprogress-off" + * during which checksums are still written but not verified. This ensures that + * backends which have yet to observe the state change from "on" won't get + * validation errors on concurrently modified pages. Once all backends have + * changed to "inprogress-off", the barrier for moving to "off" can be emitted. + * This function blocks until all backends in the cluster have acknowledged the + * state transition. 
+ */ +void +SetDataChecksumsOff(bool immediate_checkpoint) +{ + uint64 barrier; + int flags; + + Assert(ControlFile); + + SpinLockAcquire(&XLogCtl->info_lck); + + /* If data checksums are already disabled there is nothing to do */ + if (XLogCtl->data_checksum_version == 0) + { + SpinLockRelease(&XLogCtl->info_lck); + return; + } + + /* + * If data checksums are currently enabled we first transition to the + * "inprogress-off" state during which backends continue to write + * checksums without verifying them. When all backends are in + * "inprogress-off" the next transition to "off" can be performed, after + * which all data checksum processing is disabled. + */ + if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_VERSION) + { + SpinLockRelease(&XLogCtl->info_lck); + + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + START_CRIT_SECTION(); + + XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF); + + END_CRIT_SECTION(); + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + /* + * Update local state in all backends to ensure that any backend in + * "on" state is changed to "inprogress-off". + */ + WaitForProcSignalBarrier(barrier); + + /* + * force checkpoint to persist the current checksum state in control + * file etc. + * + * XXX is this safe? What if the crash/shutdown happens while waiting + * for the checkpoint? Also, should we persist the checksum first and + * only then flip the flag in XLogCtl? + */ + INJECTION_POINT("datachecksums-disable-checksums-pre-checkpoint", NULL); + flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT; + if (immediate_checkpoint) + flags = flags | CHECKPOINT_FAST; + RequestCheckpoint(flags); + + /* + * At this point we know that no backends are verifying data checksums + * during reading. 
Next, we can safely move to state "off" to also + * stop writing checksums. + */ + } + else + { + /* + * Ending up here implies that the checksums state is "inprogress-on" + * or "inprogress-off" and we can transition directly to "off" from + * there. + */ + SpinLockRelease(&XLogCtl->info_lck); + } + + /* + * Ensure that we don't incur a checkpoint during disabling checksums. + */ + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + START_CRIT_SECTION(); + + XLogChecksums(0); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = 0; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF); + + END_CRIT_SECTION(); + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + WaitForProcSignalBarrier(barrier); + + flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT; + if (immediate_checkpoint) + flags = flags | CHECKPOINT_FAST; + RequestCheckpoint(flags); +} + +/* + * AbsorbDataChecksumsBarrier + * Generic function for absorbing data checksum state changes + * + * All procsignalbarriers regarding data checksum state changes are absorbed + * with this function. The set of conditions required for the state change to + * be accepted are listed in the checksum_barriers struct, target_state is + * used to look up the relevant entry. + */ +bool +AbsorbDataChecksumsBarrier(int target_state) +{ + const ChecksumBarrierCondition *condition = checksum_barriers; + int current = LocalDataChecksumVersion; + bool found = false; + + /* + * Find the barrier condition definition for the target state. Not finding + * a condition would be a grave programmer error as the states are a + * discrete set. 
+ */ + while (condition->target != target_state && condition->target != -1) + condition++; + if (unlikely(condition->target == -1)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid target state %i for data checksum barrier", + target_state)); + + /* + * The current state MUST be equal to one of the EQ states defined in this + * barrier condition, or equal to the target_state if - and only if - + * InitialDataChecksumTransition is true. + */ + for (int i = 0; i < condition->barrier_eq_sz; i++) + { + if (current == condition->barrier_eq[i] || + condition->barrier_eq[i] == PG_DATA_CHECKSUM_ANY_VERSION) + found = true; + } + if (InitialDataChecksumTransition && current == target_state) + found = true; + + /* + * The current state MUST NOT be equal to any of the NE states defined in + * this barrier condition. + */ + for (int i = 0; i < condition->barrier_ne_sz; i++) + { + if (current == condition->barrier_ne[i]) + found = false; + } + + /* + * If the relevant state criteria aren't satisfied, throw an error which + * will be caught by the procsignal machinery for a later retry. + */ + if (!found) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("incorrect data checksum state %i for target state %i", + current, target_state)); + + SetLocalDataChecksumVersion(target_state); + InitialDataChecksumTransition = false; + return true; +} + +/* + * InitLocalDataChecksumVersion + * + * Set up backend local caches of controldata variables which may change at + * any point during runtime and thus require special cased locking. So far + * this only applies to data_checksum_version, but it's intended to be general + * purpose enough to handle future cases.
+ */ +void +InitLocalDataChecksumVersion(void) +{ + SpinLockAcquire(&XLogCtl->info_lck); + SetLocalDataChecksumVersion(XLogCtl->data_checksum_version); + SpinLockRelease(&XLogCtl->info_lck); +} + +void +SetLocalDataChecksumVersion(uint32 data_checksum_version) +{ + LocalDataChecksumVersion = data_checksum_version; + + data_checksums = data_checksum_version; +} + +/* guc hook */ +const char * +show_data_checksums(void) +{ + if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION) + return "on"; + else if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + return "inprogress-on"; + else if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION) + return "inprogress-off"; + else + return "off"; } /* @@ -4892,6 +5396,7 @@ LocalProcessControlFile(bool reset) Assert(reset || ControlFile == NULL); ControlFile = palloc(sizeof(ControlFileData)); ReadControlFile(); + SetLocalDataChecksumVersion(ControlFile->data_checksum_version); } /* @@ -5061,6 +5566,11 @@ XLOGShmemInit(void) XLogCtl->InstallXLogFileSegmentActive = false; XLogCtl->WalWriterSleeping = false; + /* Use the checksum info from control file */ + XLogCtl->data_checksum_version = ControlFile->data_checksum_version; + + SetLocalDataChecksumVersion(XLogCtl->data_checksum_version); + SpinLockInit(&XLogCtl->Insert.insertpos_lck); SpinLockInit(&XLogCtl->info_lck); pg_atomic_init_u64(&XLogCtl->logInsertResult, InvalidXLogRecPtr); @@ -6202,6 +6712,47 @@ StartupXLOG(void) pfree(endOfRecoveryInfo->recoveryStopReason); pfree(endOfRecoveryInfo); + /* + * If we reach this point with checksums in the state inprogress-on, it + * means that data checksums were in the process of being enabled when the + * cluster shut down. Since processing didn't finish, the operation will + * have to be restarted from scratch since there is no capability to + * continue where it was when the cluster shut down. Thus, revert the + * state back to off, and inform the user with a warning message. 
Being + * able to restart processing is a TODO, but it wouldn't be possible to + * restart here since we cannot launch a dynamic background worker + * directly from here (it has to be from a regular backend). + */ + if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + { + XLogChecksums(0); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = 0; + SetLocalDataChecksumVersion(XLogCtl->data_checksum_version); + SpinLockRelease(&XLogCtl->info_lck); + + ereport(WARNING, + (errmsg("data checksums state has been set to off"), + errhint("If checksums were being enabled during shutdown then processing must be manually restarted."))); + } + + /* + * If data checksums were being disabled when the cluster was shut down, + * we know that we have a state where all backends have stopped validating + * checksums and we can move to off instead of prompting the user to + * perform any action. + */ + if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION) + { + XLogChecksums(0); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = 0; + SetLocalDataChecksumVersion(XLogCtl->data_checksum_version); + SpinLockRelease(&XLogCtl->info_lck); + } + /* * All done with end-of-recovery actions. * @@ -6493,7 +7044,7 @@ GetRedoRecPtr(void) XLogRecPtr ptr; /* - * The possibly not up-to-date copy in XlogCtl is enough. Even if we + * The possibly not up-to-date copy in XLogCtl is enough. Even if we * grabbed a WAL insertion lock to read the authoritative value in * Insert->RedoRecPtr, someone might update it just after we've released * the lock. @@ -7057,6 +7608,12 @@ CreateCheckPoint(int flags) checkPoint.fullPageWrites = Insert->fullPageWrites; checkPoint.wal_level = wal_level; + /* + * Get the current data_checksum_version value from xlogctl, valid at the + * time of the checkpoint.
+ */ + checkPoint.data_checksum_version = XLogCtl->data_checksum_version; + if (shutdown) { XLogRecPtr curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos); @@ -7312,6 +7869,9 @@ CreateCheckPoint(int flags) ControlFile->minRecoveryPoint = InvalidXLogRecPtr; ControlFile->minRecoveryPointTLI = 0; + /* make sure we start with the checksum version as of the checkpoint */ + ControlFile->data_checksum_version = checkPoint.data_checksum_version; + /* * Persist unloggedLSN value. It's reset on crash recovery, so this goes * unused on non-shutdown checkpoints, but seems useful to store it always @@ -7455,6 +8015,10 @@ CreateEndOfRecoveryRecord(void) LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->minRecoveryPoint = recptr; ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID; + + /* start with the latest checksum version (as of the end of recovery) */ + ControlFile->data_checksum_version = XLogCtl->data_checksum_version; + UpdateControlFile(); LWLockRelease(ControlFileLock); @@ -7796,6 +8360,10 @@ CreateRestartPoint(int flags) if (flags & CHECKPOINT_IS_SHUTDOWN) ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; } + + /* we shall start with the latest checksum version */ + ControlFile->data_checksum_version = lastCheckPoint.data_checksum_version; + UpdateControlFile(); } LWLockRelease(ControlFileLock); @@ -8207,6 +8775,26 @@ XLogReportParameters(void) } } +/* + * Log the new state of checksums + */ +static void +XLogChecksums(uint32 new_type) +{ + xl_checksum_state xlrec; + XLogRecPtr recptr; + + xlrec.new_checksumtype = new_type; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_checksum_state)); + + INJECTION_POINT("datachecksums-xlogchecksums-pre-xloginsert", &new_type); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_CHECKSUMS); + XLogFlush(recptr); +} + /* * Update full_page_writes in shared memory, and write an * XLOG_FPW_CHANGE record if necessary. 
@@ -8641,6 +9229,74 @@ xlog_redo(XLogReaderState *record) { /* nothing to do here, just for informational purposes */ } + else if (info == XLOG_CHECKSUMS) + { + xl_checksum_state state; + uint64 barrier; + + memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state)); + + /* + * XXX Could this end up written to the control file prematurely? IIRC + * that happens during checkpoint, so what if that gets triggered e.g. + * because someone runs CHECKPOINT? If we then crash (or something + * like that), could that confuse the instance? + */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = state.new_checksumtype; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Block on a procsignalbarrier to await all processes having seen the + * change to checksum status. Once the barrier has been passed we can + * initiate the corresponding processing. + */ + switch (state.new_checksumtype) + { + case PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON); + WaitForProcSignalBarrier(barrier); + break; + + case PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF); + WaitForProcSignalBarrier(barrier); + break; + + case PG_DATA_CHECKSUM_VERSION: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON); + WaitForProcSignalBarrier(barrier); + break; + + default: + Assert(state.new_checksumtype == 0); + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF); + WaitForProcSignalBarrier(barrier); + break; + } + + /* + * force creating a restart point for the first CHECKPOINT after + * seeing XLOG_CHECKSUMS in WAL + */ + checksumRestartPoint = true; + } + + if (checksumRestartPoint && + (info == XLOG_CHECKPOINT_ONLINE || + info == XLOG_CHECKPOINT_REDO || + info == XLOG_CHECKPOINT_SHUTDOWN)) + { + int flags; + + elog(LOG, "forcing creation of a restart point after XLOG_CHECKSUMS"); + + /* We explicitly want an 
immediate checkpoint here */ + flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST; + RequestCheckpoint(flags); + + checksumRestartPoint = false; + } } /* diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 8c3090165f00..d786374209f2 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -26,6 +26,7 @@ #include "funcapi.h" #include "miscadmin.h" #include "pgstat.h" +#include "postmaster/datachecksumsworker.h" #include "replication/walreceiver.h" #include "storage/fd.h" #include "storage/latch.h" @@ -748,3 +749,59 @@ pg_promote(PG_FUNCTION_ARGS) wait_seconds))); PG_RETURN_BOOL(false); } + +/* + * Disables data checksums for the cluster, if applicable. Starts a background + * worker which turns off the data checksums. + */ +Datum +disable_data_checksums(PG_FUNCTION_ARGS) +{ + bool fast = PG_GETARG_BOOL(0); + + ereport(LOG, + errmsg("disable_data_checksums, fast: %d", fast)); + + if (!superuser()) + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to change data checksum state")); + + StartDataChecksumsWorkerLauncher(DISABLE_DATACHECKSUMS, 0, 0, fast); + PG_RETURN_VOID(); +} + +/* + * Enables data checksums for the cluster, if applicable. Supports vacuum- + * like cost based throttling to limit system load. Starts a background worker + * which updates data checksums on existing data. 
+ */ +Datum +enable_data_checksums(PG_FUNCTION_ARGS) +{ + int cost_delay = PG_GETARG_INT32(0); + int cost_limit = PG_GETARG_INT32(1); + bool fast = PG_GETARG_BOOL(2); + + ereport(LOG, + errmsg("enable_data_checksums, cost_delay: %d cost_limit: %d fast: %d", cost_delay, cost_limit, fast)); + + if (!superuser()) + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to change data checksum state")); + + if (cost_delay < 0) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cost delay cannot be a negative value")); + + if (cost_limit <= 0) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cost limit must be greater than zero")); + + StartDataChecksumsWorkerLauncher(ENABLE_DATACHECKSUMS, cost_delay, cost_limit, fast); + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 93c50831b260..e8c394716072 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -782,6 +782,10 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; RedoStartLSN = ControlFile->checkPointCopy.redo; RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID; + + elog(LOG, "InitWalRecovery checkpoint %X/%08X redo %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc), LSN_FORMAT_ARGS(RedoStartLSN)); + record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, CheckPointTLI); if (record != NULL) @@ -1665,6 +1669,9 @@ PerformWalRecovery(void) bool reachedRecoveryTarget = false; TimeLineID replayTLI; + elog(LOG, "PerformWalRecovery checkpoint %X/%08X redo %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc), LSN_FORMAT_ARGS(RedoStartLSN)); + /* * Initialize shared variables for tracking progress of WAL replay, as if * we had just replayed the record before the REDO location (or the @@ -1673,12 +1680,14 @@ PerformWalRecovery(void) 
SpinLockAcquire(&XLogRecoveryCtl->info_lck); if (RedoStartLSN < CheckPointLoc) { + elog(LOG, "(RedoStartLSN < CheckPointLoc)"); XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr; XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN; XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI; } else { + elog(LOG, "(RedoStartLSN >= CheckPointLoc)"); XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr; XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr; XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI; @@ -1690,6 +1699,10 @@ PerformWalRecovery(void) XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED; SpinLockRelease(&XLogRecoveryCtl->info_lck); + elog(LOG, "PerformWalRecovery lastReplayedReadRecPtr %X/%08X lastReplayedEndRecPtr %X/%08X", + LSN_FORMAT_ARGS(XLogRecoveryCtl->lastReplayedReadRecPtr), + LSN_FORMAT_ARGS(XLogRecoveryCtl->lastReplayedEndRecPtr)); + /* Also ensure XLogReceiptTime has a sane value */ XLogReceiptTime = GetCurrentTimestamp(); diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index bb7d90aa5d96..54dcfbcb3334 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -1613,7 +1613,8 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, * enabled for this cluster, and if this is a relation file, then verify * the checksum. */ - if (!noverify_checksums && DataChecksumsEnabled() && + if (!noverify_checksums && + DataChecksumsNeedWrite() && RelFileNumberIsValid(relfilenumber)) verify_checksum = true; @@ -2007,6 +2008,9 @@ verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno, if (PageIsNew(page) || PageGetLSN(page) >= start_lsn) return true; + if (!DataChecksumsNeedVerify()) + return true; + /* Perform the actual checksum calculation. 
*/ checksum = pg_checksum_page(page, blkno); diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql index 2d946d6d9e9b..0bded82b84c8 100644 --- a/src/backend/catalog/system_functions.sql +++ b/src/backend/catalog/system_functions.sql @@ -657,6 +657,22 @@ LANGUAGE INTERNAL STRICT VOLATILE PARALLEL UNSAFE AS 'pg_replication_origin_session_setup'; +CREATE OR REPLACE FUNCTION + pg_enable_data_checksums(cost_delay integer DEFAULT 0, + cost_limit integer DEFAULT 100, + fast boolean DEFAULT false) +RETURNS void +STRICT VOLATILE LANGUAGE internal +PARALLEL RESTRICTED +AS 'enable_data_checksums'; + +CREATE OR REPLACE FUNCTION + pg_disable_data_checksums(fast boolean DEFAULT false) +RETURNS void +STRICT VOLATILE LANGUAGE internal +PARALLEL RESTRICTED +AS 'disable_data_checksums'; + -- -- The default permissions for functions mean that anyone can execute them. -- A number of functions shouldn't be executable by just anyone, but rather @@ -782,6 +798,10 @@ REVOKE EXECUTE ON FUNCTION pg_ls_logicalmapdir() FROM PUBLIC; REVOKE EXECUTE ON FUNCTION pg_ls_replslotdir(text) FROM PUBLIC; +REVOKE EXECUTE ON FUNCTION pg_enable_data_checksums(integer, integer, boolean) FROM public; + +REVOKE EXECUTE ON FUNCTION pg_disable_data_checksums(boolean) FROM public; + -- -- We also set up some things as accessible to standard roles. 
-- diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index dec8df4f8ee6..fe149aabdbe3 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1371,6 +1371,26 @@ CREATE VIEW pg_stat_progress_copy AS FROM pg_stat_get_progress_info('COPY') AS S LEFT JOIN pg_database D ON S.datid = D.oid; +CREATE VIEW pg_stat_progress_data_checksums AS + SELECT + S.pid AS pid, S.datid, D.datname AS datname, + CASE S.param1 WHEN 0 THEN 'enabling' + WHEN 1 THEN 'disabling' + WHEN 2 THEN 'waiting' + WHEN 3 THEN 'waiting on temporary tables' + WHEN 4 THEN 'waiting on checkpoint' + WHEN 5 THEN 'done' + END AS phase, + CASE S.param2 WHEN -1 THEN NULL ELSE S.param2 END AS databases_total, + S.param3 AS databases_done, + CASE S.param4 WHEN -1 THEN NULL ELSE S.param4 END AS relations_total, + CASE S.param5 WHEN -1 THEN NULL ELSE S.param5 END AS relations_done, + CASE S.param6 WHEN -1 THEN NULL ELSE S.param6 END AS blocks_total, + CASE S.param7 WHEN -1 THEN NULL ELSE S.param7 END AS blocks_done + FROM pg_stat_get_progress_info('DATACHECKSUMS') AS S + LEFT JOIN pg_database D ON S.datid = D.oid + ORDER BY S.datid; -- return the launcher process first + CREATE VIEW pg_user_mappings AS SELECT U.oid AS umid, diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index 0f4435d2d97c..0c36765acfe1 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -18,6 +18,7 @@ OBJS = \ bgworker.o \ bgwriter.o \ checkpointer.o \ + datachecksumsworker.o \ fork_process.o \ interrupt.o \ launch_backend.o \ diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c index a6d3630398f4..5742a1dd724e 100644 --- a/src/backend/postmaster/auxprocess.c +++ b/src/backend/postmaster/auxprocess.c @@ -15,6 +15,7 @@ #include #include +#include "access/xlog.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/auxprocess.h" @@ -68,6 +69,24 @@ 
AuxiliaryProcessMainCommon(void) ProcSignalInit(NULL, 0); + /* + * Initialize a local cache of the data_checksum_version, to be updated by + * the procsignal-based barriers. + * + * This intentionally happens after initializing the procsignal, otherwise + * we might miss a state change. This means we can get a barrier for the + * state we've just initialized - but it can happen only once. + * + * The postmaster (which is what gets forked into the new child process) + * does not handle barriers, therefore it may not have the current value + * of LocalDataChecksumVersion value (it'll have the value read from the + * control file, which may be arbitrarily old). + * + * NB: Even if the postmaster handled barriers, the value might still be + * stale, as it might have changed after this process forked. + */ + InitLocalDataChecksumVersion(); + /* * Auxiliary processes don't run transactions, but they may need a * resource owner anyway to manage buffer pins acquired outside diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 142a02eb5e95..ed3dc05406c3 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -18,6 +18,7 @@ #include "pgstat.h" #include "port/atomics.h" #include "postmaster/bgworker_internals.h" +#include "postmaster/datachecksumsworker.h" #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" @@ -135,6 +136,12 @@ static const struct }, { "SequenceSyncWorkerMain", SequenceSyncWorkerMain + }, + { + "DataChecksumsWorkerLauncherMain", DataChecksumsWorkerLauncherMain + }, + { + "DataChecksumsWorkerMain", DataChecksumsWorkerMain } }; diff --git a/src/backend/postmaster/datachecksumsworker.c b/src/backend/postmaster/datachecksumsworker.c new file mode 100644 index 000000000000..3deb57a96de6 --- /dev/null +++ b/src/backend/postmaster/datachecksumsworker.c @@ -0,0 +1,1471 @@ 
+/*------------------------------------------------------------------------- + * + * datachecksumsworker.c + * Background worker for enabling or disabling data checksums online + * + * When enabling data checksums on a database at initdb time or when shut down + * with pg_checksums, no extra process is required as each page is checksummed, + * and verified, when accessed. When enabling checksums on an already running + * cluster, this worker will ensure that all pages are checksummed before + * verification of the checksums is turned on. In the case of disabling + * checksums, the state transition is performed only in the control file, no + * changes are performed on the data pages. + * + * Checksums can be either enabled or disabled cluster-wide, with on/off being + * the end state for data_checksums. + * + * Enabling checksums + * ------------------ + * When enabling checksums in an online cluster, data_checksums will be set to + * "inprogress-on" which signals that write operations MUST compute and write + * the checksum on the data page, but during reading the checksum SHALL NOT be + * verified. This ensures that all objects created during checksumming will + * have checksums set, but no reads will fail due to incorrect checksum. The + * DataChecksumsWorker will compile a list of databases which exist at the + * start of checksumming, and all of these which haven't been dropped during + * the processing MUST have been processed successfully in order for checksums + * to be enabled. Any new relation created during processing will see the + * in-progress state and will automatically be checksummed. + * + * For each database, all relations which have storage are read and every data + * page is marked dirty to force a write with the checksum. This will generate + * a lot of WAL as the entire database is read and written. + * + * If the processing is interrupted by a cluster restart, it will be restarted + * from the beginning again as state isn't persisted. 
+ * + * Disabling checksums + * ------------------- + * When disabling checksums, data_checksums will be set to "inprogress-off" + * which signals that checksums are written but no longer verified. This ensures + * that backends which have yet to move from the "on" state will still be able + * to process data checksum validation. + * + * Synchronization and Correctness + * ------------------------------- + * The processes involved in enabling, or disabling, data checksums in an + * online cluster must be properly synchronized with the normal backends + * serving concurrent queries to ensure correctness. Correctness is defined + * as the following: + * + * - Backends SHALL NOT violate local data_checksums state + * - Data checksums SHALL NOT be considered enabled cluster-wide until all + * currently connected backends have the local state "enabled" + * + * There are two levels of synchronization required for enabling data checksums + * in an online cluster: (i) changing state in the active backends ("on", + * "off", "inprogress-on" and "inprogress-off"), and (ii) ensuring no + * incompatible objects and processes are left in a database when workers end. + * The former deals with cluster-wide agreement on data checksum state and the + * latter with ensuring that any concurrent activity cannot break the data + * checksum contract during processing. + * + * Synchronizing the state change is done with procsignal barriers, where the + * WAL logging backend updating the global state in the controlfile will wait + * for all other backends to absorb the barrier. Barrier absorption will happen + * during interrupt processing, which means that connected backends will change + * state at different times. To prevent data checksum state changes when + * writing and verifying checksums, interrupts shall be held off before + * interrogating state and resumed when the IO operation has been performed.
+ * + * When Enabling Data Checksums + * ---------------------------- + * A process which fails to observe data checksums being enabled can induce + * two types of errors: failing to write the checksum when modifying the page + * and failing to validate the data checksum on the page when reading it. + * + * When processing starts all backends belong to one of the below sets, with + * one set being empty: + * + * Bd: Backends in "off" state + * Bi: Backends in "inprogress-on" state + * + * If processing is started in an online cluster then all backends are in Bd. + * If processing was halted by the cluster shutting down, the controlfile + * state "inprogress-on" will be observed on system startup and all backends + * will be in Bd. Backends transition Bd -> Bi via a procsignalbarrier. When + * the DataChecksumsWorker has finished writing checksums on all pages and + * enables data checksums cluster-wide, there are four sets of backends where + * Bd shall be an empty set: + * + * Bg: Backend updating the global state and emitting the procsignalbarrier + * Bd: Backends in "off" state + * Be: Backends in "on" state + * Bi: Backends in "inprogress-on" state + * + * Backends in Bi and Be will write checksums when modifying a page, but only + * backends in Be will verify the checksum during reading. The Bg backend is + * blocked waiting for all backends in Bi to process interrupts and move to + * Be. Any backend starting while Bg is waiting on the procsignalbarrier will + * observe the global state being "on" and will thus automatically belong to + * Be. Checksums are enabled cluster-wide when Bi is an empty set. Bi and Be + * are compatible sets while still operating based on their local state as + * both write data checksums. 
+ * + * When Disabling Data Checksums + * ----------------------------- + * A process which fails to observe that data checksums have been disabled + * can induce two types of errors: writing the checksum when modifying the + * page and validating a data checksum which is no longer correct due to + * modifications to the page. + * + * Bg: Backend updating the global state and emitting the procsignalbarrier + * Bd: Backends in "off" state + * Be: Backends in "on" state + * Bo: Backends in "inprogress-off" state + * + * Backends transition from the Be state to Bd like so: Be -> Bo -> Bd + * + * The goal is to transition all backends to Bd making the others empty sets. + * Backends in Bo write data checksums, but don't validate them, such that + * backends still in Be can continue to validate pages until the barrier has + * been absorbed such that they are in Bo. Once all backends are in Bo, the + * barrier to transition to "off" can be raised and all backends can safely + * stop writing data checksums as no backend is enforcing data checksum + * validation any longer. + * + * + * Potential optimizations + * ----------------------- + * Below are some potential optimizations and improvements which were brought + * up during reviews of this feature, but which weren't implemented in the + * initial version. These are ideas listed without any validation on their + * feasibility or potential payoff. More discussion on these can be found on + * the -hackers threads linked to in the commit message of this feature. + * + * * Launching datachecksumsworker for resuming operation from the startup + * process: Currently users have to restart processing manually after a + * restart since dynamic background worker cannot be started from the + * postmaster. Changing the startup process could make restarting the + * processing automatic on cluster restart. 
+ * * Avoid dirtying the page when checksums already match: Iff the checksum + * on the page happens to already match we still dirty the page. It should + * be enough to only do the log_newpage_buffer() call in that case. + * * Invent a lightweight WAL record that doesn't contain the full-page + * image but just the block number: On replay, the redo routine would read + * the page from disk. + * * Teach pg_checksums to avoid checksummed pages when pg_checksums is used + * to enable checksums on a cluster which is in inprogress-on state and + * may have checksummed pages (make pg_checksums be able to resume an + * online operation). + * + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/postmaster/datachecksumsworker.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/xact.h" +#include "access/xloginsert.h" +#include "catalog/indexing.h" +#include "catalog/pg_class.h" +#include "catalog/pg_database.h" +#include "commands/progress.h" +#include "commands/vacuum.h" +#include "common/relpath.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "postmaster/bgwriter.h" +#include "postmaster/datachecksumsworker.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "storage/lwlock.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/fmgroids.h" +#include "utils/injection_point.h" +#include "utils/lsyscache.h" +#include "utils/ps_status.h" +#include "utils/syscache.h" + +/* + * Number of times we retry to open a database before giving up and consider + * it to have failed processing. 
+ */ + + /* result, set by worker before exiting */ + DataChecksumsWorkerResult success; + + /* + * tells the worker process whether it should also process the shared + * catalogs + */ + bool process_shared_catalogs; +} DataChecksumsWorkerShmemStruct; + +/* Shared memory segment for datachecksumsworker */ +static DataChecksumsWorkerShmemStruct *DataChecksumsWorkerShmem; + +typedef struct DataChecksumsWorkerDatabase +{ + Oid dboid; + char *dbname; +} DataChecksumsWorkerDatabase; + +typedef struct DataChecksumsWorkerResultEntry +{ + Oid dboid; + DataChecksumsWorkerResult result; + int retries; +} DataChecksumsWorkerResultEntry; + + +/* + * Flag set by the interrupt handler + */ +static volatile sig_atomic_t abort_requested = false; + +/* + * Have we set the DataChecksumsWorkerShmemStruct->launcher_running flag? + * If we have, we need to clear it before exiting! + */ +static volatile sig_atomic_t launcher_running = false; + +/* + * Are we enabling data checksums, or disabling them? + */ +static DataChecksumsWorkerOperation operation; + +/* Prototypes */ +static List *BuildDatabaseList(void); +static List *BuildRelationList(bool temp_relations, bool include_shared); +static void FreeDatabaseList(List *dblist); +static DataChecksumsWorkerResult ProcessDatabase(DataChecksumsWorkerDatabase *db); +static bool ProcessAllDatabases(bool immediate_checkpoint); +static bool ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy); +static void launcher_cancel_handler(SIGNAL_ARGS); +static void WaitForAllTransactionsToFinish(void); + +/* + * StartDataChecksumsWorkerLauncher + * Main entry point for datachecksumsworker launcher process + * + * The main entrypoint for starting data checksums processing for enabling as + * well as disabling. 
+ */
+void
+StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op,
+								 int cost_delay,
+								 int cost_limit,
+								 bool fast)
+{
+	BackgroundWorker bgw;
+	BackgroundWorkerHandle *bgw_handle;
+	bool		launcher_running;
+
+#ifdef USE_ASSERT_CHECKING
+	/* The cost delay settings have no effect when disabling */
+	if (op == DISABLE_DATACHECKSUMS)
+		Assert(cost_delay == 0 && cost_limit == 0);
+#endif
+
+	/* Store the desired state in shared memory */
+	LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+
+	DataChecksumsWorkerShmem->launch_operation = op;
+	DataChecksumsWorkerShmem->launch_cost_delay = cost_delay;
+	DataChecksumsWorkerShmem->launch_cost_limit = cost_limit;
+	DataChecksumsWorkerShmem->launch_fast = fast;
+
+	/* is the launcher already running? */
+	launcher_running = DataChecksumsWorkerShmem->launcher_running;
+
+	LWLockRelease(DataChecksumsWorkerLock);
+
+	/*
+	 * Launch a new launcher process, if it's not running already.
+	 *
+	 * If the launcher is currently busy enabling the checksums, and we want
+	 * them disabled (or vice versa), the launcher will notice that at latest
+	 * when it's about to exit, and will loop back to process the new request.
+	 * So if the launcher is already running, we don't need to do anything
+	 * more here to abort it.
+	 *
+	 * If you call pg_enable/disable_data_checksums() twice in a row, before
+	 * the launcher has had a chance to start up, we still end up launching it
+	 * twice. That's OK, the second invocation will see that a launcher is
+	 * already running and exit quickly.
+	 *
+	 * TODO: We could optimize here and skip launching the launcher, if we are
+	 * already in the desired state, i.e. if the checksums are already enabled
+	 * and you call pg_enable_data_checksums().
+	 */
+	if (!launcher_running)
+	{
+		/*
+		 * Prepare the BackgroundWorker and launch it.
+ */ + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "DataChecksumsWorkerLauncherMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksum launcher"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksum launcher"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = (Datum) 0; + + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("failed to start background worker to process data checksums")); + } +} + +/* + * ProcessSingleRelationFork + * Enable data checksums in a single relation/fork. + * + * Returns true if successful, and false if *aborted*. On error, an actual + * error is raised in the lower levels. + */ +static bool +ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy) +{ + BlockNumber numblocks = RelationGetNumberOfBlocksInFork(reln, forkNum); + char activity[NAMEDATALEN * 2 + 128]; + char *relns; + + relns = get_namespace_name(RelationGetNamespace(reln)); + + if (!relns) + return false; + + /* Report the current relation to pgstat_activity */ + snprintf(activity, sizeof(activity) - 1, "processing: %s.%s (%s, %dblocks)", + relns, RelationGetRelationName(reln), forkNames[forkNum], numblocks); + pgstat_report_activity(STATE_RUNNING, activity); + + /* + * As of now we only update the block counter for main forks in order to + * not cause too frequent calls. TODO: investigate whether we should do it + * more frequent? 
+ */
+	if (forkNum == MAIN_FORKNUM)
+		pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL,
+									 numblocks);
+
+	/*
+	 * We are looping over the blocks which existed at the time of process
+	 * start, which is safe since new blocks are created with checksums set
+	 * already due to the state being "inprogress-on".
+	 */
+	for (BlockNumber blknum = 0; blknum < numblocks; blknum++)
+	{
+		Buffer		buf = ReadBufferExtended(reln, forkNum, blknum, RBM_NORMAL, strategy);
+
+		/* Need to get an exclusive lock before we can flag as dirty */
+		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+		/*
+		 * Mark the buffer as dirty and force a full page write. We have to
+		 * re-write the page to WAL even if the checksum hasn't changed,
+		 * because if there is a replica it might have a slightly different
+		 * version of the page with an invalid checksum, caused by unlogged
+		 * changes (e.g. hintbits) on the master happening while checksums
+		 * were off. This can happen if there was a valid checksum on the page
+		 * at one point in the past, so only when checksums are first on, then
+		 * off, and then turned on again. TODO: investigate if this could be
+		 * avoided if the checksum is calculated to be correct and wal_level
+		 * is set to "minimal".
+		 */
+		START_CRIT_SECTION();
+		MarkBufferDirty(buf);
+		log_newpage_buffer(buf, false);
+		END_CRIT_SECTION();
+
+		UnlockReleaseBuffer(buf);
+
+		/*
+		 * This is the only place where we check if we are asked to abort, the
+		 * abort will bubble up from here. It's safe to check this without a
+		 * lock, because if we miss it being set, we will try again soon.
+		 */
+		Assert(operation == ENABLE_DATACHECKSUMS);
+		if (DataChecksumsWorkerShmem->launch_operation == DISABLE_DATACHECKSUMS)
+			abort_requested = true;
+
+		if (abort_requested)
+			return false;
+
+		/*
+		 * As of now we only update the block counter for main forks in order
+		 * to not cause too frequent calls. TODO: investigate whether we
+		 * should do it more frequently?
+ */ + if (forkNum == MAIN_FORKNUM) + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_DONE, + (blknum + 1)); + + vacuum_delay_point(false); + } + + pfree(relns); + return true; +} + +/* + * ProcessSingleRelationByOid + * Process a single relation based on oid. + * + * Returns true if successful, and false if *aborted*. On error, an actual + * error is raised in the lower levels. + */ +static bool +ProcessSingleRelationByOid(Oid relationId, BufferAccessStrategy strategy) +{ + Relation rel; + bool aborted = false; + + StartTransactionCommand(); + + rel = try_relation_open(relationId, AccessShareLock); + if (rel == NULL) + { + /* + * Relation no longer exists. We don't consider this an error since + * there are no pages in it that need data checksums, and thus return + * true. The worker operates off a list of relations generated at the + * start of processing, so relations being dropped in the meantime is + * to be expected. + */ + CommitTransactionCommand(); + pgstat_report_activity(STATE_IDLE, NULL); + return true; + } + RelationGetSmgr(rel); + + for (ForkNumber fnum = 0; fnum <= MAX_FORKNUM; fnum++) + { + if (smgrexists(rel->rd_smgr, fnum)) + { + if (!ProcessSingleRelationFork(rel, fnum, strategy)) + { + aborted = true; + break; + } + } + } + relation_close(rel, AccessShareLock); + elog(DEBUG2, + "data checksum processing done for relation with OID %u: %s", + relationId, (aborted ? "aborted" : "finished")); + + CommitTransactionCommand(); + + pgstat_report_activity(STATE_IDLE, NULL); + + return !aborted; +} + +/* + * ProcessDatabase + * Enable data checksums in a single database. + * + * We do this by launching a dynamic background worker into this database, and + * waiting for it to finish. We have to do this in a separate worker, since + * each process can only be connected to one database during its lifetime. 
+ */ +static DataChecksumsWorkerResult +ProcessDatabase(DataChecksumsWorkerDatabase *db) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + BgwHandleStatus status; + pid_t pid; + char activity[NAMEDATALEN + 64]; + + DataChecksumsWorkerShmem->success = DATACHECKSUMSWORKER_FAILED; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "%s", "DataChecksumsWorkerMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksum worker"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksum worker"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid); + + /* + * If there are no worker slots available, make sure we retry processing + * this database. This will make the datachecksumsworker move on to the + * next database and quite likely fail with the same problem. TODO: Maybe + * we need a backoff to avoid running through all the databases here in + * short order. + */ + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + { + ereport(WARNING, + errmsg("failed to start worker for enabling data checksums in database \"%s\", retrying", + db->dbname), + errhint("The max_worker_processes setting might be too low.")); + return DATACHECKSUMSWORKER_RETRYDB; + } + + status = WaitForBackgroundWorkerStartup(bgw_handle, &pid); + if (status == BGWH_STOPPED) + { + ereport(WARNING, + errmsg("could not start background worker for enabling data checksums in database \"%s\"", + db->dbname), + errhint("More details on the error might be found in the server log.")); + return DATACHECKSUMSWORKER_FAILED; + } + + /* + * If the postmaster crashed we cannot end up with a processed database so + * we have no alternative other than exiting. 
When enabling checksums we + * won't at this time have changed the pg_control version to enabled so + * when the cluster comes back up processing will have to be restarted. + * When disabling, the pg_control version will be set to off before this + * so when the cluster comes up checksums will be off as expected. + */ + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("cannot enable data checksums without the postmaster process"), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums().")); + + Assert(status == BGWH_STARTED); + ereport(DEBUG1, + errmsg("initiating data checksum processing in database \"%s\"", + db->dbname)); + + snprintf(activity, sizeof(activity) - 1, + "Waiting for worker in database %s (pid %ld)", db->dbname, (long) pid); + pgstat_report_activity(STATE_RUNNING, activity); + + status = WaitForBackgroundWorkerShutdown(bgw_handle); + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("postmaster exited during data checksum processing in \"%s\"", + db->dbname), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums().")); + + if (DataChecksumsWorkerShmem->success == DATACHECKSUMSWORKER_ABORTED) + ereport(LOG, + errmsg("data checksums processing was aborted in database \"%s\"", + db->dbname)); + + pgstat_report_activity(STATE_IDLE, NULL); + + return DataChecksumsWorkerShmem->success; +} + +/* + * launcher_exit + * + * Internal routine for cleaning up state when the launcher process exits. We + * need to clean up the abort flag to ensure that processing can be restarted + * again after it was previously aborted. 
+ */ +static void +launcher_exit(int code, Datum arg) +{ + if (launcher_running) + { + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + launcher_running = false; + DataChecksumsWorkerShmem->launcher_running = false; + LWLockRelease(DataChecksumsWorkerLock); + } +} + +/* + * launcher_cancel_handler + * + * Internal routine for reacting to SIGINT and flagging the worker to abort. + * The worker won't be interrupted immediately but will check for abort flag + * between each block in a relation. + */ +static void +launcher_cancel_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + abort_requested = true; + + /* + * There is no sleeping in the main loop, the flag will be checked + * periodically in ProcessSingleRelationFork. The worker does however + * sleep when waiting for concurrent transactions to end so we still need + * to set the latch. + */ + SetLatch(MyLatch); + + errno = save_errno; +} + +/* + * WaitForAllTransactionsToFinish + * Blocks awaiting all current transactions to finish + * + * Returns when all transactions which are active at the call of the function + * have ended, or if the postmaster dies while waiting. If the postmaster dies + * the abort flag will be set to indicate that the caller of this shouldn't + * proceed. + * + * NB: this will return early, if aborted by SIGINT or if the target state + * is changed while we're running. 
+ */ +static void +WaitForAllTransactionsToFinish(void) +{ + TransactionId waitforxid; + + LWLockAcquire(XidGenLock, LW_SHARED); + waitforxid = XidFromFullTransactionId(TransamVariables->nextXid); + LWLockRelease(XidGenLock); + + while (TransactionIdPrecedes(GetOldestActiveTransactionId(false, true), waitforxid)) + { + char activity[64]; + int rc; + + /* Oldest running xid is older than us, so wait */ + snprintf(activity, + sizeof(activity), + "Waiting for current transactions to finish (waiting for %u)", + waitforxid); + pgstat_report_activity(STATE_RUNNING, activity); + + /* Retry every 3 seconds */ + ResetLatch(MyLatch); + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + 3000, + WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION); + + /* + * If the postmaster died we won't be able to enable checksums + * cluster-wide so abort and hope to continue when restarted. + */ + if (rc & WL_POSTMASTER_DEATH) + ereport(FATAL, + errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("postmaster exited during data checksum processing"), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums().")); + + LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED); + if (DataChecksumsWorkerShmem->launch_operation != operation) + abort_requested = true; + LWLockRelease(DataChecksumsWorkerLock); + if (abort_requested) + break; + } + + pgstat_report_activity(STATE_IDLE, NULL); + return; +} + +/* + * DataChecksumsWorkerLauncherMain + * + * Main function for launching dynamic background workers for processing data + * checksums in databases. This function has the bgworker management, with + * ProcessAllDatabases being responsible for looping over the databases and + * initiating processing. 
+ */ +void +DataChecksumsWorkerLauncherMain(Datum arg) +{ + on_shmem_exit(launcher_exit, 0); + + ereport(DEBUG1, + errmsg("background worker \"datachecksum launcher\" started")); + + pqsignal(SIGTERM, die); + pqsignal(SIGINT, launcher_cancel_handler); + + BackgroundWorkerUnblockSignals(); + + MyBackendType = B_DATACHECKSUMSWORKER_LAUNCHER; + init_ps_display(NULL); + + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + + if (DataChecksumsWorkerShmem->launcher_running) + { + /* Launcher was already running, let it finish */ + LWLockRelease(DataChecksumsWorkerLock); + return; + } + + launcher_running = true; + + /* + * Initialize a connection to shared catalogs only. + */ + BackgroundWorkerInitializeConnectionByOid(InvalidOid, InvalidOid, 0); + + operation = DataChecksumsWorkerShmem->launch_operation; + DataChecksumsWorkerShmem->launcher_running = true; + DataChecksumsWorkerShmem->operation = operation; + DataChecksumsWorkerShmem->cost_delay = DataChecksumsWorkerShmem->launch_cost_delay; + DataChecksumsWorkerShmem->cost_limit = DataChecksumsWorkerShmem->launch_cost_limit; + DataChecksumsWorkerShmem->immediate_checkpoint = DataChecksumsWorkerShmem->launch_fast; + LWLockRelease(DataChecksumsWorkerLock); + + /* + * The target state can change while we are busy enabling/disabling + * checksums, if the user calls pg_disable/enable_data_checksums() before + * we are finished with the previous request. In that case, we will loop + * back here, to process the new request. + */ +again: + + pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS, + InvalidOid); + + if (operation == ENABLE_DATACHECKSUMS) + { + /* + * If we are asked to enable checksums in a cluster which already has + * checksums enabled, exit immediately as there is nothing more to do. + * Hold interrupts to make sure state doesn't change during checking. 
+ */ + HOLD_INTERRUPTS(); + if (DataChecksumsNeedVerify()) + { + RESUME_INTERRUPTS(); + goto done; + } + RESUME_INTERRUPTS(); + + /* + * Set the state to inprogress-on and wait on the procsignal barrier. + */ + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_ENABLING); + SetDataChecksumsOnInProgress(DataChecksumsWorkerShmem->immediate_checkpoint); + + /* + * All backends are now in inprogress-on state and are writing data + * checksums. Start processing all data at rest. + */ + if (!ProcessAllDatabases(DataChecksumsWorkerShmem->immediate_checkpoint)) + { + /* + * If the target state changed during processing then it's not a + * failure, so restart processing instead. + */ + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + if (DataChecksumsWorkerShmem->launch_operation != operation) + { + LWLockRelease(DataChecksumsWorkerLock); + goto done; + } + LWLockRelease(DataChecksumsWorkerLock); + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("unable to enable data checksums in cluster")); + } + + /* + * Data checksums have been set on all pages, set the state to on in + * order to instruct backends to validate checksums on reading. + */ + SetDataChecksumsOn(DataChecksumsWorkerShmem->immediate_checkpoint); + } + else + { + int flags; + + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_DISABLING); + SetDataChecksumsOff(DataChecksumsWorkerShmem->immediate_checkpoint); + + flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT; + if (DataChecksumsWorkerShmem->immediate_checkpoint) + flags = flags | CHECKPOINT_FAST; + RequestCheckpoint(flags); + } + +done: + + /* + * All done. But before we exit, check if the target state was changed + * while we were running. In that case we will have to start all over + * again. 
+ */ + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + if (DataChecksumsWorkerShmem->launch_operation != operation) + { + DataChecksumsWorkerShmem->operation = DataChecksumsWorkerShmem->launch_operation; + operation = DataChecksumsWorkerShmem->launch_operation; + DataChecksumsWorkerShmem->cost_delay = DataChecksumsWorkerShmem->launch_cost_delay; + DataChecksumsWorkerShmem->cost_limit = DataChecksumsWorkerShmem->launch_cost_limit; + LWLockRelease(DataChecksumsWorkerLock); + goto again; + } + + /* Shut down progress reporting as we are done */ + pgstat_progress_end_command(); + + launcher_running = false; + DataChecksumsWorkerShmem->launcher_running = false; + LWLockRelease(DataChecksumsWorkerLock); +} + +/* + * ProcessAllDatabases + * Compute the list of all databases and process checksums in each + * + * This will repeatedly generate a list of databases to process for enabling + * checksums. Until no new databases are found, this will loop around computing + * a new list and comparing it to the already seen ones. + * + * If immediate_checkpoint is set to true then a CHECKPOINT_FAST will be + * issued. This is useful for testing but should be avoided in production use + * as it may affect cluster performance drastically. + */ +static bool +ProcessAllDatabases(bool immediate_checkpoint) +{ + List *DatabaseList; + HTAB *ProcessedDatabases = NULL; + HASHCTL hash_ctl; + bool found_failed = false; + int flags; + + /* Initialize a hash tracking all processed databases */ + memset(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(DataChecksumsWorkerResultEntry); + ProcessedDatabases = hash_create("Processed databases", + 64, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); + + /* + * Set up so first run processes shared catalogs, but not once in every + * db. + */ + DataChecksumsWorkerShmem->process_shared_catalogs = true; + + /* + * Get a list of all databases to process. 
This may include databases that + * were created during our runtime. Since a database can be created as a + * copy of any other database (which may not have existed in our last + * run), we have to repeat this loop until no new databases show up in the + * list. + */ + DatabaseList = BuildDatabaseList(); + + /* Allow a test case to modify the initial list of databases */ + INJECTION_POINT("datachecksumsworker-initial-dblist", DatabaseList); + + /* + * Update progress reporting with the total number of databases we need to + * process. This number should not be changed during processing, the + * columns for processed databases is instead increased such that it can + * be compared against the total. + */ + { + const int index[] = { + PROGRESS_DATACHECKSUMS_DBS_TOTAL, + PROGRESS_DATACHECKSUMS_DBS_DONE, + PROGRESS_DATACHECKSUMS_RELS_TOTAL, + PROGRESS_DATACHECKSUMS_RELS_DONE, + PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL, + PROGRESS_DATACHECKSUMS_BLOCKS_DONE, + }; + + int64 vals[6]; + + vals[0] = list_length(DatabaseList); + vals[1] = 0; + + /* translated to NULL */ + vals[2] = -1; + vals[3] = -1; + vals[4] = -1; + vals[5] = -1; + + pgstat_progress_update_multi_param(6, index, vals); + } + + while (true) + { + int processed_databases = 0; + + foreach_ptr(DataChecksumsWorkerDatabase, db, DatabaseList) + { + DataChecksumsWorkerResult result; + DataChecksumsWorkerResultEntry *entry; + bool found; + + /* + * Check if this database has been processed already, and if so + * whether it should be retried or skipped. + */ + entry = (DataChecksumsWorkerResultEntry *) hash_search(ProcessedDatabases, &db->dboid, + HASH_FIND, NULL); + + if (entry) + { + if (entry->result == DATACHECKSUMSWORKER_RETRYDB) + { + /* + * Limit the number of retries to avoid infinite looping + * in case there simply won't be enough workers in the + * cluster to finish this operation. 
+ */ + if (entry->retries > DATACHECKSUMSWORKER_MAX_DB_RETRIES) + entry->result = DATACHECKSUMSWORKER_FAILED; + } + + /* Skip if this database has been processed already */ + if (entry->result != DATACHECKSUMSWORKER_RETRYDB) + continue; + } + + result = ProcessDatabase(db); + processed_databases++; + + /* + * Update the number of processed databases in the progress + * report. + */ + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_DBS_DONE, + processed_databases); + + /* Allow a test process to alter the result of the operation */ + INJECTION_POINT("datachecksumsworker-fail-db", &result); + + if (result == DATACHECKSUMSWORKER_SUCCESSFUL) + { + /* + * If one database has completed shared catalogs, we don't + * have to process them again. + */ + if (DataChecksumsWorkerShmem->process_shared_catalogs) + DataChecksumsWorkerShmem->process_shared_catalogs = false; + } + else if (result == DATACHECKSUMSWORKER_ABORTED) + { + /* Abort flag set, so exit the whole process */ + return false; + } + + entry = hash_search(ProcessedDatabases, &db->dboid, HASH_ENTER, &found); + entry->dboid = db->dboid; + entry->result = result; + if (!found) + entry->retries = 0; + else + entry->retries++; + } + + elog(DEBUG1, + "%i databases processed for data checksum enabling, %s", + processed_databases, + (processed_databases ? "process with restart" : "process completed")); + + FreeDatabaseList(DatabaseList); + + /* + * If no databases were processed in this run of the loop, we have now + * finished all databases and no concurrently created ones can exist. + */ + if (processed_databases == 0) + break; + + /* + * Re-generate the list of databases for another pass. Since we wait + * for all pre-existing transactions finish, this way we can be + * certain that there are no databases left without checksums. + */ + WaitForAllTransactionsToFinish(); + DatabaseList = BuildDatabaseList(); + } + + /* + * ProcessedDatabases now has all databases and the results of their + * processing. 
Failure to enable checksums for a database can be because + * they actually failed for some reason, or because the database was + * dropped between us getting the database list and trying to process it. + * Get a fresh list of databases to detect the second case where the + * database was dropped before we had started processing it. If a database + * still exists, but enabling checksums failed then we fail the entire + * checksumming process and exit with an error. + */ + WaitForAllTransactionsToFinish(); + DatabaseList = BuildDatabaseList(); + + foreach_ptr(DataChecksumsWorkerDatabase, db, DatabaseList) + { + DataChecksumsWorkerResultEntry *entry; + bool found; + + entry = hash_search(ProcessedDatabases, (void *) &db->dboid, + HASH_FIND, &found); + + /* + * We are only interested in the processed databases which failed, and + * where the failed database still exists. This indicates that + * enabling checksums actually failed, and not that the failure was + * due to the db being concurrently dropped. + */ + if (found && entry->result == DATACHECKSUMSWORKER_FAILED) + { + ereport(WARNING, + errmsg("failed to enable data checksums in \"%s\"", db->dbname)); + found_failed = found; + continue; + } + } + + FreeDatabaseList(DatabaseList); + + if (found_failed) + { + /* Disable checksums on cluster, because we failed */ + SetDataChecksumsOff(DataChecksumsWorkerShmem->immediate_checkpoint); + /* Force a checkpoint to make everything consistent */ + flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT; + if (immediate_checkpoint) + flags = flags | CHECKPOINT_FAST; + RequestCheckpoint(flags); + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("data checksums failed to get enabled in all databases, aborting"), + errhint("The server log might have more information on the cause of the error.")); + } + + /* + * When enabling checksums, we have to wait for a checkpoint for the + * checksums to change from in-progress to on. 
+ */ + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_WAITING_CHECKPOINT); + + /* + * Force a checkpoint to get everything out to disk. The use of immediate + * checkpoints is for running tests, as they would otherwise not execute + * in such a way that they can reliably be placed under timeout control. + */ + flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT; + if (immediate_checkpoint) + flags = flags | CHECKPOINT_FAST; + RequestCheckpoint(flags); + + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_WAITING_BARRIER); + return true; +} + +/* + * DataChecksumsWorkerShmemSize + * Compute required space for datachecksumsworker-related shared memory + */ +Size +DataChecksumsWorkerShmemSize(void) +{ + Size size; + + size = sizeof(DataChecksumsWorkerShmemStruct); + size = MAXALIGN(size); + + return size; +} + +/* + * DataChecksumsWorkerShmemInit + * Allocate and initialize datachecksumsworker-related shared memory + */ +void +DataChecksumsWorkerShmemInit(void) +{ + bool found; + + DataChecksumsWorkerShmem = (DataChecksumsWorkerShmemStruct *) + ShmemInitStruct("DataChecksumsWorker Data", + DataChecksumsWorkerShmemSize(), + &found); + + if (!found) + { + MemSet(DataChecksumsWorkerShmem, 0, DataChecksumsWorkerShmemSize()); + + /* + * Even if this is a redundant assignment, we want to be explicit + * about our intent for readability, since we want to be able to query + * this state in case of restartability. + */ + DataChecksumsWorkerShmem->launch_operation = false; + DataChecksumsWorkerShmem->launcher_running = false; + DataChecksumsWorkerShmem->launch_fast = false; + } +} + +/* + * BuildDatabaseList + * Compile a list of all currently available databases in the cluster + * + * This creates the list of databases for the datachecksumsworker workers to + * add checksums to. 
If the caller wants to ensure that no concurrently + * running CREATE DATABASE calls exist, this needs to be preceded by a call + * to WaitForAllTransactionsToFinish(). + */ +static List * +BuildDatabaseList(void) +{ + List *DatabaseList = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = table_open(DatabaseRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_database pgdb = (Form_pg_database) GETSTRUCT(tup); + DataChecksumsWorkerDatabase *db; + + oldctx = MemoryContextSwitchTo(ctx); + + db = (DataChecksumsWorkerDatabase *) palloc0(sizeof(DataChecksumsWorkerDatabase)); + + db->dboid = pgdb->oid; + db->dbname = pstrdup(NameStr(pgdb->datname)); + + DatabaseList = lappend(DatabaseList, db); + + MemoryContextSwitchTo(oldctx); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return DatabaseList; +} + +static void +FreeDatabaseList(List *dblist) +{ + if (!dblist) + return; + + foreach_ptr(DataChecksumsWorkerDatabase, db, dblist) + { + if (db->dbname != NULL) + pfree(db->dbname); + } + + list_free_deep(dblist); +} + +/* + * BuildRelationList + * Compile a list of relations in the database + * + * Returns a list of OIDs for the request relation types. If temp_relations + * is True then only temporary relations are returned. If temp_relations is + * False then non-temporary relations which have data checksums are returned. + * If include_shared is True then shared relations are included as well in a + * non-temporary list. include_shared has no relevance when building a list of + * temporary relations. 
+ */ +static List * +BuildRelationList(bool temp_relations, bool include_shared) +{ + List *RelationList = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = table_open(RelationRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup); + + /* + * Only include temporary relations when asked for a temp relation + * list. + */ + if (pgc->relpersistence == RELPERSISTENCE_TEMP) + { + if (!temp_relations) + continue; + } + else + { + /* + * If we are only interested in temp relations then continue + * immediately as the current relation isn't a temp relation. + */ + if (temp_relations) + continue; + + if (!RELKIND_HAS_STORAGE(pgc->relkind)) + continue; + + if (pgc->relisshared && !include_shared) + continue; + } + + oldctx = MemoryContextSwitchTo(ctx); + RelationList = lappend_oid(RelationList, pgc->oid); + MemoryContextSwitchTo(oldctx); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return RelationList; +} + +/* + * DataChecksumsWorkerMain + * + * Main function for enabling checksums in a single database. This is the + * function set as the bgw_function_name in the dynamic background worker + * process initiated for each database by the worker launcher. After enabling + * data checksums in each applicable relation in the database, it will wait for + * all temporary relations that were present when the function started to + * disappear before returning. This is required since we cannot rewrite + * existing temporary relations with data checksums.
+ */ +void +DataChecksumsWorkerMain(Datum arg) +{ + Oid dboid = DatumGetObjectId(arg); + List *RelationList = NIL; + List *InitialTempTableList = NIL; + BufferAccessStrategy strategy; + bool aborted = false; + int64 rels_done; + + operation = ENABLE_DATACHECKSUMS; + + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + MyBackendType = B_DATACHECKSUMSWORKER_WORKER; + init_ps_display(NULL); + + BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid, + BGWORKER_BYPASS_ALLOWCONN); + + /* worker will have a separate entry in pg_stat_progress_data_checksums */ + pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS, + InvalidOid); + + /* + * Get a list of all temp tables present as we start in this database. We + * need to wait until they are all gone until we are done, since we cannot + * access these relations and modify them. + */ + InitialTempTableList = BuildRelationList(true, false); + + /* + * Enable vacuum cost delay, if any. + */ + Assert(DataChecksumsWorkerShmem->operation == ENABLE_DATACHECKSUMS); + VacuumCostDelay = DataChecksumsWorkerShmem->cost_delay; + VacuumCostLimit = DataChecksumsWorkerShmem->cost_limit; + VacuumCostActive = (VacuumCostDelay > 0); + VacuumCostBalance = 0; + VacuumCostPageHit = 0; + VacuumCostPageMiss = 0; + VacuumCostPageDirty = 0; + + /* + * Create and set the vacuum strategy as our buffer strategy. + */ + strategy = GetAccessStrategy(BAS_VACUUM); + + RelationList = BuildRelationList(false, + DataChecksumsWorkerShmem->process_shared_catalogs); + + /* Update the total number of relations to be processed in this DB. 
*/ + { + const int index[] = { + PROGRESS_DATACHECKSUMS_RELS_TOTAL, + PROGRESS_DATACHECKSUMS_RELS_DONE + }; + + int64 vals[2]; + + vals[0] = list_length(RelationList); + vals[1] = 0; + + pgstat_progress_update_multi_param(2, index, vals); + } + + /* Process the relations */ + rels_done = 0; + foreach_oid(reloid, RelationList) + { + if (!ProcessSingleRelationByOid(reloid, strategy)) + { + aborted = true; + break; + } + + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_RELS_DONE, + ++rels_done); + } + list_free(RelationList); + + if (aborted) + { + DataChecksumsWorkerShmem->success = DATACHECKSUMSWORKER_ABORTED; + ereport(DEBUG1, + errmsg("data checksum processing aborted in database OID %u", + dboid)); + return; + } + + /* The worker is about to wait for temporary tables to go away. */ + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL); + + /* + * Wait for all temp tables that existed when we started to go away. This + * is necessary since we cannot "reach" them to enable checksums. Any temp + * tables created after we started will already have checksums in them + * (due to the "inprogress-on" state), so no need to wait for those. + */ + for (;;) + { + List *CurrentTempTables; + int numleft; + char activity[64]; + + CurrentTempTables = BuildRelationList(true, false); + numleft = 0; + foreach_oid(tmptbloid, InitialTempTableList) + { + if (list_member_oid(CurrentTempTables, tmptbloid)) + numleft++; + } + list_free(CurrentTempTables); + + INJECTION_POINT("datachecksumsworker-fake-temptable-wait", &numleft); + + if (numleft == 0) + break; + + /* + * At least one temp table is left to wait for, indicate in pgstat + * activity and progress reporting. 
+ */ + snprintf(activity, + sizeof(activity), + "Waiting for %d temp tables to be removed", numleft); + pgstat_report_activity(STATE_RUNNING, activity); + + /* Retry every 3 seconds */ + ResetLatch(MyLatch); + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 3000, + WAIT_EVENT_CHECKSUM_ENABLE_TEMPTABLE_WAIT); + + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + aborted = DataChecksumsWorkerShmem->launch_operation != operation; + LWLockRelease(DataChecksumsWorkerLock); + + if (aborted || abort_requested) + { + DataChecksumsWorkerShmem->success = DATACHECKSUMSWORKER_ABORTED; + ereport(DEBUG1, + errmsg("data checksum processing aborted in database OID %u", + dboid)); + return; + } + } + + list_free(InitialTempTableList); + + /* worker done */ + pgstat_progress_end_command(); + + DataChecksumsWorkerShmem->success = DATACHECKSUMSWORKER_SUCCESSFUL; +} diff --git a/src/backend/postmaster/meson.build b/src/backend/postmaster/meson.build index 0008603cfee9..ce10ef1059a8 100644 --- a/src/backend/postmaster/meson.build +++ b/src/backend/postmaster/meson.build @@ -6,6 +6,7 @@ backend_sources += files( 'bgworker.c', 'bgwriter.c', 'checkpointer.c', + 'datachecksumsworker.c', 'fork_process.c', 'interrupt.c', 'launch_backend.c', diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 00de559ba8f4..8910f0990185 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -2991,6 +2991,11 @@ PostmasterStateMachine(void) B_INVALID, B_STANDALONE_BACKEND); + /* also add checksumming processes */ + remainMask = btmask_add(remainMask, + B_DATACHECKSUMSWORKER_LAUNCHER, + B_DATACHECKSUMSWORKER_WORKER); + /* All types should be included in targetMask or remainMask */ Assert((remainMask.mask | targetMask.mask) == BTYPE_MASK_ALL.mask); } diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index cc03f0706e9c..f9f06821a8f9 100644 --- 
a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -186,6 +186,7 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_FPW_CHANGE: case XLOG_FPI_FOR_HINT: case XLOG_FPI: + case XLOG_CHECKSUMS: case XLOG_OVERWRITE_CONTRECORD: case XLOG_CHECKPOINT_REDO: break; diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 2fa045e6b0f6..44213d140aee 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -30,6 +30,8 @@ #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" +#include "postmaster/datachecksumsworker.h" +#include "postmaster/postmaster.h" #include "postmaster/walsummarizer.h" #include "replication/logicallauncher.h" #include "replication/origin.h" @@ -150,6 +152,7 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, InjectionPointShmemSize()); size = add_size(size, SlotSyncShmemSize()); size = add_size(size, AioShmemSize()); + size = add_size(size, DataChecksumsWorkerShmemSize()); /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); @@ -332,6 +335,7 @@ CreateOrAttachShmemStructs(void) PgArchShmemInit(); ApplyLauncherShmemInit(); SlotSyncShmemInit(); + DataChecksumsWorkerShmemInit(); /* * Set up other modules that need some shared memory space diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 087821311cce..2f6ccdfb32f0 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -18,12 +18,14 @@ #include #include "access/parallel.h" +#include "access/xlog.h" #include "commands/async.h" #include "miscadmin.h" #include "pgstat.h" #include "port/pg_bitutils.h" #include "replication/logicalworker.h" #include "replication/walsender.h" +#include "storage/checksum.h" #include "storage/condition_variable.h" #include "storage/ipc.h" #include 
"storage/latch.h" @@ -576,6 +578,18 @@ ProcessProcSignalBarrier(void) case PROCSIGNAL_BARRIER_SMGRRELEASE: processed = ProcessBarrierSmgrRelease(); break; + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON: + processed = AbsorbDataChecksumsBarrier(PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); + break; + case PROCSIGNAL_BARRIER_CHECKSUM_ON: + processed = AbsorbDataChecksumsBarrier(PG_DATA_CHECKSUM_VERSION); + break; + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF: + processed = AbsorbDataChecksumsBarrier(PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); + break; + case PROCSIGNAL_BARRIER_CHECKSUM_OFF: + processed = AbsorbDataChecksumsBarrier(PG_DATA_CHECKSUM_OFF); + break; } /* diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README index e30d7ac59adc..73c36a639086 100644 --- a/src/backend/storage/page/README +++ b/src/backend/storage/page/README @@ -10,7 +10,9 @@ http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed 2010/12/22 on -hackers list. Current implementation requires this be enabled system-wide at initdb time, or -by using the pg_checksums tool on an offline cluster. +by using the pg_checksums tool on an offline cluster. Checksums can also be +enabled at runtime using pg_enable_data_checksums(), and disabled by using +pg_disable_data_checksums(). The checksum is not valid at all times on a data page!! 
The checksum is valid when the page leaves the shared pool and is checked diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index aac6e6959546..cfb1753ffba4 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -107,7 +107,7 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail */ if (!PageIsNew(page)) { - if (DataChecksumsEnabled()) + if (DataChecksumsNeedVerify()) { checksum = pg_checksum_page(page, blkno); @@ -151,8 +151,8 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail if ((flags & (PIV_LOG_WARNING | PIV_LOG_LOG)) != 0) ereport(flags & PIV_LOG_WARNING ? WARNING : LOG, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("page verification failed, calculated checksum %u but expected %u", - checksum, p->pd_checksum))); + errmsg("page verification failed, calculated checksum %u but expected %u (page LSN %X/%08X)", + checksum, p->pd_checksum, LSN_FORMAT_ARGS(PageXLogRecPtrGet(p->pd_lsn))))); if (header_sane && (flags & PIV_IGNORE_CHECKSUM_FAILURE)) return true; @@ -1511,7 +1511,7 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) static char *pageCopy = NULL; /* If we don't need a checksum, just return the passed-in data */ - if (PageIsNew(page) || !DataChecksumsEnabled()) + if (PageIsNew(page) || !DataChecksumsNeedWrite()) return page; /* @@ -1541,7 +1541,7 @@ void PageSetChecksumInplace(Page page, BlockNumber blkno) { /* If we don't need a checksum, just return */ - if (PageIsNew(page) || !DataChecksumsEnabled()) + if (PageIsNew(page) || !DataChecksumsNeedWrite()) return; ((PageHeader) page)->pd_checksum = pg_checksum_page(page, blkno); diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index 199ba2cc17a7..7afe00982678 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -380,6 +380,8 @@ pgstat_tracks_backend_bktype(BackendType 
bktype) case B_CHECKPOINTER: case B_IO_WORKER: case B_STARTUP: + case B_DATACHECKSUMSWORKER_LAUNCHER: + case B_DATACHECKSUMSWORKER_WORKER: return false; case B_AUTOVAC_WORKER: diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 13ae57ed6498..a290d56f4096 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -362,6 +362,8 @@ pgstat_tracks_io_bktype(BackendType bktype) case B_LOGGER: return false; + case B_DATACHECKSUMSWORKER_LAUNCHER: + case B_DATACHECKSUMSWORKER_WORKER: case B_AUTOVAC_LAUNCHER: case B_AUTOVAC_WORKER: case B_BACKEND: diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 7553f6eacef7..430178c699ce 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -116,6 +116,9 @@ CHECKPOINT_DELAY_COMPLETE "Waiting for a backend that blocks a checkpoint from c CHECKPOINT_DELAY_START "Waiting for a backend that blocks a checkpoint from starting." CHECKPOINT_DONE "Waiting for a checkpoint to complete." CHECKPOINT_START "Waiting for a checkpoint to start." +CHECKSUM_ENABLE_STARTCONDITION "Waiting for data checksums enabling to start." +CHECKSUM_ENABLE_FINISHCONDITION "Waiting for data checksums to be enabled." +CHECKSUM_ENABLE_TEMPTABLE_WAIT "Waiting for temporary tables to be dropped for data checksums to be enabled." EXECUTE_GATHER "Waiting for activity from a child process while executing a Gather plan node." HASH_BATCH_ALLOCATE "Waiting for an elected Parallel Hash participant to allocate a hash table." HASH_BATCH_ELECT "Waiting to elect a Parallel Hash participant to allocate a hash table." @@ -355,6 +358,7 @@ DSMRegistry "Waiting to read or update the dynamic shared memory registry." InjectionPoint "Waiting to read or update information related to injection points." SerialControl "Waiting to read or update shared pg_serial state." 
AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue." +DataChecksumsWorker "Waiting for data checksums worker." # # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index a710508979e4..5df447b6788e 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -295,6 +295,8 @@ pg_stat_get_progress_info(PG_FUNCTION_ARGS) cmdtype = PROGRESS_COMMAND_BASEBACKUP; else if (pg_strcasecmp(cmd, "COPY") == 0) cmdtype = PROGRESS_COMMAND_COPY; + else if (pg_strcasecmp(cmd, "DATACHECKSUMS") == 0) + cmdtype = PROGRESS_COMMAND_DATACHECKSUMS; else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -1167,9 +1169,6 @@ pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS) int64 result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) - PG_RETURN_NULL(); - if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) result = 0; else @@ -1185,9+1184,6 @@ pg_stat_get_db_checksum_last_failure(PG_FUNCTION_ARGS) TimestampTz result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) - PG_RETURN_NULL(); - if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) result = 0; else diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index fec79992c8de..9b78e0012efc 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -844,7 +844,8 @@ InitializeSessionUserIdStandalone(void) * workers, in slot sync worker and in background workers.
*/ Assert(!IsUnderPostmaster || AmAutoVacuumWorkerProcess() || - AmLogicalSlotSyncWorkerProcess() || AmBackgroundWorkerProcess()); + AmLogicalSlotSyncWorkerProcess() || AmBackgroundWorkerProcess() || + AmDataChecksumsWorkerProcess()); /* call only once */ Assert(!OidIsValid(AuthenticatedUserId)); diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 98f9598cd789..b598deb56485 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -746,6 +746,24 @@ InitPostgres(const char *in_dbname, Oid dboid, ProcSignalInit(MyCancelKey, MyCancelKeyLength); + /* + * Initialize a local cache of the data_checksum_version, to be updated by + * the procsignal-based barriers. + * + * This intentionally happens after initializing the procsignal, otherwise + * we might miss a state change. This means we can get a barrier for the + * state we've just initialized - but it can happen only once. + * + * The postmaster (which is what gets forked into the new child process) + * does not handle barriers, therefore it may not have the current value + * of LocalDataChecksumVersion value (it'll have the value read from the + * control file, which may be arbitrarily old). + * + * NB: Even if the postmaster handled barriers, the value might still be + * stale, as it might have changed after this process forked. + */ + InitLocalDataChecksumVersion(); + /* * Also set up timeout handlers needed for backend operation. We need * these in every case except bootstrap. @@ -874,7 +892,7 @@ InitPostgres(const char *in_dbname, Oid dboid, errhint("You should immediately run CREATE USER \"%s\" SUPERUSER;.", username != NULL ? 
username : "postgres"))); } - else if (AmBackgroundWorkerProcess()) + else if (AmBackgroundWorkerProcess() || AmDataChecksumsWorkerProcess()) { if (username == NULL && !OidIsValid(useroid)) { diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 1128167c0251..1330840e9c3b 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -531,11 +531,12 @@ max => '1.0', }, -{ name => 'data_checksums', type => 'bool', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', +{ name => 'data_checksums', type => 'enum', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', short_desc => 'Shows whether data checksums are turned on for this cluster.', flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED', variable => 'data_checksums', - boot_val => 'false', + boot_val => 'PG_DATA_CHECKSUM_OFF', + options => 'data_checksums_options', }, # Can't be set by ALTER SYSTEM as it can lead to recursive definition diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 00c8376cf4de..ae1506d87f5c 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -491,6 +491,14 @@ static const struct config_enum_entry file_copy_method_options[] = { {NULL, 0, false} }; +static const struct config_enum_entry data_checksums_options[] = { + {"on", PG_DATA_CHECKSUM_VERSION, true}, + {"off", PG_DATA_CHECKSUM_OFF, true}, + {"inprogress-on", PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION, true}, + {"inprogress-off", PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION, true}, + {NULL, 0, false} +}; + /* * Options for enum values stored in other modules */ @@ -617,7 +625,6 @@ static int shared_memory_size_mb; static int shared_memory_size_in_huge_pages; static int wal_block_size; static int num_os_semaphores; -static bool data_checksums; static bool integer_datetimes; #ifdef USE_ASSERT_CHECKING diff --git a/src/bin/pg_checksums/pg_checksums.c 
b/src/bin/pg_checksums/pg_checksums.c index 46cb2f36efaa..327a677cb81e 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -585,7 +585,7 @@ main(int argc, char *argv[]) ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) pg_fatal("cluster must be shut down"); - if (ControlFile->data_checksum_version == 0 && + if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_VERSION && mode == PG_MODE_CHECK) pg_fatal("data checksums are not enabled in cluster"); @@ -593,7 +593,7 @@ main(int argc, char *argv[]) mode == PG_MODE_DISABLE) pg_fatal("data checksums are already disabled in cluster"); - if (ControlFile->data_checksum_version > 0 && + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION && mode == PG_MODE_ENABLE) pg_fatal("data checksums are already enabled in cluster"); diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 10de058ce91f..acf5c7b026e7 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -280,6 +280,8 @@ main(int argc, char *argv[]) ControlFile->checkPointCopy.oldestCommitTsXid); printf(_("Latest checkpoint's newestCommitTsXid:%u\n"), ControlFile->checkPointCopy.newestCommitTsXid); + printf(_("Latest checkpoint's data_checksum_version:%u\n"), + ControlFile->checkPointCopy.data_checksum_version); printf(_("Time of latest checkpoint: %s\n"), ckpttime_str); printf(_("Fake LSN counter for unlogged rels: %X/%08X\n"), diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index 90cef0864de7..29684e824401 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -15,6 +15,7 @@ #include "access/xlog_internal.h" #include "common/string.h" #include "pg_upgrade.h" +#include "storage/bufpage.h" /* @@ -736,6 +737,14 @@ check_control_data(ControlData *oldctrl, * check_for_isn_and_int8_passing_mismatch(). 
*/ + /* + * If data checksums are in any in-progress state then disallow the + * upgrade. The user should either let the process finish, or turn off + * data checksums, before retrying. + */ + if (oldctrl->data_checksum_version > PG_DATA_CHECKSUM_VERSION) + pg_fatal("checksums are being enabled in the old cluster"); + /* * We might eventually allow upgrades from checksum to no-checksum * clusters. diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 605280ed8fb6..100df16384f9 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -56,6 +56,7 @@ extern PGDLLIMPORT int CommitDelay; extern PGDLLIMPORT int CommitSiblings; extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; +extern PGDLLIMPORT int data_checksums; extern PGDLLIMPORT int CheckPointSegments; @@ -117,7 +118,7 @@ extern PGDLLIMPORT int wal_level; * of the bits make it to disk, but the checksum wouldn't match. Also WAL-log * them if forced by wal_log_hints=on. */ -#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints) +#define XLogHintBitIsNeeded() (wal_log_hints || DataChecksumsNeedWrite()) /* Do we need to WAL-log information required only for Hot Standby and logical replication? 
*/ #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA) @@ -230,7 +231,16 @@ extern XLogRecPtr GetXLogWriteRecPtr(void); extern uint64 GetSystemIdentifier(void); extern char *GetMockAuthenticationNonce(void); -extern bool DataChecksumsEnabled(void); +extern bool DataChecksumsNeedWrite(void); +extern bool DataChecksumsNeedVerify(void); +extern bool DataChecksumsOnInProgress(void); +extern bool DataChecksumsOffInProgress(void); +extern void SetDataChecksumsOnInProgress(bool immediate_checkpoint); +extern void SetDataChecksumsOn(bool immediate_checkpoint); +extern void SetDataChecksumsOff(bool immediate_checkpoint); +extern bool AbsorbDataChecksumsBarrier(int target_state); +extern const char *show_data_checksums(void); +extern void InitLocalDataChecksumVersion(void); extern bool GetDefaultCharSignedness(void); extern XLogRecPtr GetFakeLSNForUnloggedRel(void); extern Size XLOGShmemSize(void); diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 34deb2fe5f04..faaa0e62d385 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -25,6 +25,7 @@ #include "lib/stringinfo.h" #include "pgtime.h" #include "storage/block.h" +#include "storage/checksum.h" #include "storage/relfilelocator.h" @@ -289,6 +290,12 @@ typedef struct xl_restore_point char rp_name[MAXFNAMELEN]; } xl_restore_point; +/* Information logged when data checksum level is changed */ +typedef struct xl_checksum_state +{ + uint32 new_checksumtype; +} xl_checksum_state; + /* Overwrite of prior contrecord */ typedef struct xl_overwrite_contrecord { diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 63e834a6ce47..a8877fb87d1a 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -62,6 +62,9 @@ typedef struct CheckPoint * set to InvalidTransactionId. 
*/ TransactionId oldestActiveXid; + + /* data checksums at the time of the checkpoint */ + uint32 data_checksum_version; } CheckPoint; /* XLOG info values for XLOG rmgr */ @@ -80,6 +83,7 @@ typedef struct CheckPoint /* 0xC0 is used in Postgres 9.5-11 */ #define XLOG_OVERWRITE_CONTRECORD 0xD0 #define XLOG_CHECKPOINT_REDO 0xE0 +#define XLOG_CHECKSUMS 0xF0 /* @@ -219,7 +223,7 @@ typedef struct ControlFileData bool float8ByVal; /* float8, int8, etc pass-by-value? */ /* Are data pages protected by checksums? Zero if no checksum version */ - uint32 data_checksum_version; + uint32 data_checksum_version; /* persistent */ /* * True if the default signedness of char is "signed" on a platform where diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 34b7fddb0e7a..faf7df6ead6a 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12381,6 +12381,25 @@ proname => 'jsonb_subscript_handler', prorettype => 'internal', proargtypes => 'internal', prosrc => 'jsonb_subscript_handler' }, +# data checksum management functions +{ oid => '9258', + descr => 'disable data checksums', + proname => 'pg_disable_data_checksums', provolatile => 'v', prorettype => 'void', + proparallel => 'r', + proargtypes => 'bool', proallargtypes => '{bool}', + proargmodes => '{i}', + proargnames => '{fast}', + prosrc => 'disable_data_checksums' }, + +{ oid => '9257', + descr => 'enable data checksums', + proname => 'pg_enable_data_checksums', provolatile => 'v', prorettype => 'void', + proparallel => 'r', + proargtypes => 'int4 int4 bool', proallargtypes => '{int4,int4,bool}', + proargmodes => '{i,i,i}', + proargnames => '{cost_delay,cost_limit,fast}', + prosrc => 'enable_data_checksums' }, + # collation management functions { oid => '3445', descr => 'import collations from operating system', proname => 'pg_import_system_collations', procost => '100', diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h index 
1cde4bd9bcf1..d2aa148533b5 100644 --- a/src/include/commands/progress.h +++ b/src/include/commands/progress.h @@ -162,4 +162,21 @@ #define PROGRESS_COPY_TYPE_PIPE 3 #define PROGRESS_COPY_TYPE_CALLBACK 4 +/* Progress parameters for PROGRESS_DATACHECKSUMS */ +#define PROGRESS_DATACHECKSUMS_PHASE 0 +#define PROGRESS_DATACHECKSUMS_DBS_TOTAL 1 +#define PROGRESS_DATACHECKSUMS_DBS_DONE 2 +#define PROGRESS_DATACHECKSUMS_RELS_TOTAL 3 +#define PROGRESS_DATACHECKSUMS_RELS_DONE 4 +#define PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL 5 +#define PROGRESS_DATACHECKSUMS_BLOCKS_DONE 6 + +/* Phases of datachecksumsworker operation */ +#define PROGRESS_DATACHECKSUMS_PHASE_ENABLING 0 +#define PROGRESS_DATACHECKSUMS_PHASE_DISABLING 1 +#define PROGRESS_DATACHECKSUMS_PHASE_WAITING 2 +#define PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL 3 +#define PROGRESS_DATACHECKSUMS_PHASE_WAITING_CHECKPOINT 4 +#define PROGRESS_DATACHECKSUMS_PHASE_WAITING_BARRIER 5 + #endif diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 9a7d733ddeff..581fbae2ee00 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -367,6 +367,9 @@ typedef enum BackendType B_WAL_SUMMARIZER, B_WAL_WRITER, + B_DATACHECKSUMSWORKER_LAUNCHER, + B_DATACHECKSUMSWORKER_WORKER, + /* * Logger is not connected to shared memory and does not have a PGPROC * entry. 
@@ -392,6 +395,9 @@ extern PGDLLIMPORT BackendType MyBackendType; #define AmWalSummarizerProcess() (MyBackendType == B_WAL_SUMMARIZER) #define AmWalWriterProcess() (MyBackendType == B_WAL_WRITER) #define AmIoWorkerProcess() (MyBackendType == B_IO_WORKER) +#define AmDataChecksumsWorkerProcess() \ + (MyBackendType == B_DATACHECKSUMSWORKER_LAUNCHER || \ + MyBackendType == B_DATACHECKSUMSWORKER_WORKER) #define AmSpecialWorkerProcess() \ (AmAutoVacuumLauncherProcess() || \ diff --git a/src/include/postmaster/datachecksumsworker.h b/src/include/postmaster/datachecksumsworker.h new file mode 100644 index 000000000000..2cd066fd0feb --- /dev/null +++ b/src/include/postmaster/datachecksumsworker.h @@ -0,0 +1,51 @@ +/*------------------------------------------------------------------------- + * + * datachecksumsworker.h + * header file for data checksum helper background worker + * + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/postmaster/datachecksumsworker.h + * + *------------------------------------------------------------------------- + */ +#ifndef DATACHECKSUMSWORKER_H +#define DATACHECKSUMSWORKER_H + +/* Shared memory */ +extern Size DataChecksumsWorkerShmemSize(void); +extern void DataChecksumsWorkerShmemInit(void); + +/* Possible operations the Datachecksumsworker can perform */ +typedef enum DataChecksumsWorkerOperation +{ + ENABLE_DATACHECKSUMS, + DISABLE_DATACHECKSUMS, + /* TODO: VERIFY_DATACHECKSUMS, */ +} DataChecksumsWorkerOperation; + +/* + * Possible states for a database entry which has been processed. Exported + * here since we want to be able to reference this from injection point tests. 
+ */ +typedef enum +{ + DATACHECKSUMSWORKER_SUCCESSFUL = 0, + DATACHECKSUMSWORKER_ABORTED, + DATACHECKSUMSWORKER_FAILED, + DATACHECKSUMSWORKER_RETRYDB, +} DataChecksumsWorkerResult; + +/* Start the background processes for enabling or disabling checksums */ +void StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op, + int cost_delay, + int cost_limit, + bool fast); + +/* Background worker entrypoints */ +void DataChecksumsWorkerLauncherMain(Datum arg); +void DataChecksumsWorkerMain(Datum arg); + +#endif /* DATACHECKSUMSWORKER_H */ diff --git a/src/include/postmaster/proctypelist.h b/src/include/postmaster/proctypelist.h index 242862451d8d..3dc93b176d90 100644 --- a/src/include/postmaster/proctypelist.h +++ b/src/include/postmaster/proctypelist.h @@ -38,6 +38,8 @@ PG_PROCTYPE(B_BACKEND, gettext_noop("client backend"), BackendMain, true) PG_PROCTYPE(B_BG_WORKER, gettext_noop("background worker"), BackgroundWorkerMain, true) PG_PROCTYPE(B_BG_WRITER, gettext_noop("background writer"), BackgroundWriterMain, true) PG_PROCTYPE(B_CHECKPOINTER, gettext_noop("checkpointer"), CheckpointerMain, true) +PG_PROCTYPE(B_DATACHECKSUMSWORKER_LAUNCHER, gettext_noop("datachecksum launcher"), NULL, false) +PG_PROCTYPE(B_DATACHECKSUMSWORKER_WORKER, gettext_noop("datachecksum worker"), NULL, false) PG_PROCTYPE(B_DEAD_END_BACKEND, gettext_noop("dead-end client backend"), BackendMain, true) PG_PROCTYPE(B_INVALID, gettext_noop("unrecognized"), NULL, false) PG_PROCTYPE(B_IO_WORKER, gettext_noop("io worker"), IoWorkerMain, true) diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index abc2cf2a020b..2fb242f029d3 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -16,6 +16,7 @@ #include "access/xlogdefs.h" #include "storage/block.h" +#include "storage/checksum.h" #include "storage/off.h" /* GUC variable */ @@ -204,7 +205,6 @@ typedef PageHeaderData *PageHeader; * handling pages. 
*/ #define PG_PAGE_LAYOUT_VERSION 4 -#define PG_DATA_CHECKSUM_VERSION 1 /* ---------------------------------------------------------------- * page support functions diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h index 25d13a798d10..0faaac14b1bc 100644 --- a/src/include/storage/checksum.h +++ b/src/include/storage/checksum.h @@ -15,6 +15,21 @@ #include "storage/block.h" +/* + * Checksum version 0 is used for when data checksums are disabled (OFF). + * PG_DATA_CHECKSUM_VERSION defines that data checksums are enabled in the + * cluster and PG_DATA_CHECKSUM_INPROGRESS_{ON|OFF}_VERSION defines that data + * checksums are either currently being enabled or disabled. + */ +typedef enum ChecksumType +{ + PG_DATA_CHECKSUM_OFF = 0, + PG_DATA_CHECKSUM_VERSION, + PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION, + PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION, + PG_DATA_CHECKSUM_ANY_VERSION +} ChecksumType; + /* * Compute the checksum for a Postgres page. The page must be aligned on a * 4-byte boundary. diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h index 06a1ffd4b08b..b8f7ba0be517 100644 --- a/src/include/storage/lwlocklist.h +++ b/src/include/storage/lwlocklist.h @@ -85,6 +85,7 @@ PG_LWLOCK(50, DSMRegistry) PG_LWLOCK(51, InjectionPoint) PG_LWLOCK(52, SerialControl) PG_LWLOCK(53, AioWorkerSubmissionQueue) +PG_LWLOCK(54, DataChecksumsWorker) /* * There also exist several built-in LWLock tranches. As with the predefined diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index c6f5ebceefdd..d90d35b1d6fa 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -463,11 +463,11 @@ extern PGDLLIMPORT PGPROC *PreparedXactProcs; * Background writer, checkpointer, WAL writer, WAL summarizer, and archiver * run during normal operation. Startup process and WAL receiver also consume * 2 slots, but WAL writer is launched only after startup has exited, so we - * only need 6 slots. 
+ * only need 6 slots to cover these. The DataChecksums worker and launcher + * can consume 2 slots when data checksums are enabled or disabled. */ #define MAX_IO_WORKERS 32 -#define NUM_AUXILIARY_PROCS (6 + MAX_IO_WORKERS) - +#define NUM_AUXILIARY_PROCS (8 + MAX_IO_WORKERS) /* configurable options */ extern PGDLLIMPORT int DeadlockTimeout; diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index afeeb1ca019f..c54c61e2cd8a 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -54,6 +54,11 @@ typedef enum typedef enum { PROCSIGNAL_BARRIER_SMGRRELEASE, /* ask smgr to close files */ + + PROCSIGNAL_BARRIER_CHECKSUM_OFF, + PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON, + PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF, + PROCSIGNAL_BARRIER_CHECKSUM_ON, } ProcSignalBarrierType; /* diff --git a/src/include/utils/backend_progress.h b/src/include/utils/backend_progress.h index dda813ab4076..c664e92dbfe7 100644 --- a/src/include/utils/backend_progress.h +++ b/src/include/utils/backend_progress.h @@ -28,6 +28,7 @@ typedef enum ProgressCommandType PROGRESS_COMMAND_CREATE_INDEX, PROGRESS_COMMAND_BASEBACKUP, PROGRESS_COMMAND_COPY, + PROGRESS_COMMAND_DATACHECKSUMS, } ProgressCommandType; #define PGSTAT_NUM_PROGRESS_PARAM 20 diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 902a79541010..28c8d0bd3cf2 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -18,6 +18,7 @@ SUBDIRS = \ test_binaryheap \ test_bitmapset \ test_bloomfilter \ + test_checksums \ test_copy_callbacks \ test_custom_rmgrs \ test_ddl_deparse \ diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 14fc761c4cfa..88b8b3695342 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -17,6 +17,7 @@ subdir('test_aio') subdir('test_binaryheap') subdir('test_bitmapset') subdir('test_bloomfilter') +subdir('test_checksums') subdir('test_copy_callbacks') 
subdir('test_custom_rmgrs') subdir('test_ddl_deparse') diff --git a/src/test/modules/test_checksums/.gitignore b/src/test/modules/test_checksums/.gitignore new file mode 100644 index 000000000000..871e943d50e1 --- /dev/null +++ b/src/test/modules/test_checksums/.gitignore @@ -0,0 +1,2 @@ +# Generated by test suite +/tmp_check/ diff --git a/src/test/modules/test_checksums/Makefile b/src/test/modules/test_checksums/Makefile new file mode 100644 index 000000000000..a5b6259a7288 --- /dev/null +++ b/src/test/modules/test_checksums/Makefile @@ -0,0 +1,40 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/test/checksum +# +# Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/test/checksum/Makefile +# +#------------------------------------------------------------------------- + +EXTRA_INSTALL = src/test/modules/injection_points + +export enable_injection_points + +MODULE_big = test_checksums +OBJS = \ + $(WIN32RES) \ + test_checksums.o +PGFILEDESC = "test_checksums - test code for data checksums" + +EXTENSION = test_checksums +DATA = test_checksums--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_checksums +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) diff --git a/src/test/modules/test_checksums/README b/src/test/modules/test_checksums/README new file mode 100644 index 000000000000..0f0317060b38 --- /dev/null +++ b/src/test/modules/test_checksums/README @@ -0,0 +1,22 @@ +src/test/checksum/README + +Regression tests for data checksums +=================================== + +This directory contains a test suite for enabling data checksums +in a running cluster. 
+ +Running the tests +================= + + make check + +or + + make installcheck + +NOTE: This creates a temporary installation (in the case of "check"), +with multiple nodes, be they master or standby(s) for the purpose of +the tests. + +NOTE: This requires the --enable-tap-tests argument to configure. diff --git a/src/test/modules/test_checksums/meson.build b/src/test/modules/test_checksums/meson.build new file mode 100644 index 000000000000..ffc737ca87af --- /dev/null +++ b/src/test/modules/test_checksums/meson.build @@ -0,0 +1,36 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +test_checksums_sources = files( + 'test_checksums.c', +) + +test_checksums = shared_module('test_checksums', + test_checksums_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += test_checksums + +test_install_data += files( + 'test_checksums.control', + 'test_checksums--1.0.sql', +) + +tests += { + 'name': 'test_checksums', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'env': { + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, + 'tests': [ + 't/001_basic.pl', + 't/002_restarts.pl', + 't/003_standby_restarts.pl', + 't/004_offline.pl', + 't/005_injection.pl', + 't/006_pgbench_single.pl', + 't/007_pgbench_standby.pl', + ], + }, +} diff --git a/src/test/modules/test_checksums/t/001_basic.pl b/src/test/modules/test_checksums/t/001_basic.pl new file mode 100644 index 000000000000..728a5c4510c3 --- /dev/null +++ b/src/test/modules/test_checksums/t/001_basic.pl @@ -0,0 +1,63 @@ + +# Copyright (c) 2025, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize node with checksums disabled. 
+my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init(no_data_checksums => 1); +$node->start; + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are turned off +test_checksum_state($node, 'off'); + +# Enable data checksums and wait for the state transition to 'on' +enable_data_checksums($node, wait => 'on'); + +# Run a dummy query just to make sure we can read back data +my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1 "); +is($result, '9999', 'ensure checksummed pages can be read back'); + +# Enable data checksums again which should be a no-op so we explicitly don't +# wait for any state transition as none should happen here +enable_data_checksums($node); +test_checksum_state($node, 'on'); +# ..and make sure we can still read/write data +$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +# Disable checksums again and wait for the state transition +disable_data_checksums($node, wait => 'on'); + +# Test reading data again +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '10000', 'ensure previously checksummed pages can be read back'); + +# Re-enable checksums and make sure that the underlying data has changed to +# ensure that checksums will be different. 
+$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +enable_data_checksums($node, wait => 'on'); + +# Run a dummy query just to make sure we can read back the data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/002_restarts.pl b/src/test/modules/test_checksums/t/002_restarts.pl new file mode 100644 index 000000000000..6c17f304eac4 --- /dev/null +++ b/src/test/modules/test_checksums/t/002_restarts.pl @@ -0,0 +1,110 @@ + +# Copyright (c) 2024, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with a +# restart which breaks processing. +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize node with checksums disabled. +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init(no_data_checksums => 1); +$node->start; + +# Initialize result storage for queries +my $result; + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are disabled +test_checksum_state($node, 'off'); + +SKIP: +{ + skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 6 + if (!$ENV{PG_TEST_EXTRA} + || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/); + + # Create a barrier for checksumming to block on, in this case a pre- + # existing temporary table which is kept open while processing is started. + # We can accomplish this by setting up an interactive psql process which + # keeps the temporary table created as we enable checksums in another psql + # process. 
+ # + # This is a similar test to the synthetic variant in 005_injection.pl + # which fakes this scenario. + my $bsession = $node->background_psql('postgres'); + $bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);'); + + # In another session, make sure we can see the blocking temp table but + # start processing anyways and check that we are blocked with a proper + # wait event. + $result = $node->safe_psql('postgres', + "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';" + ); + is($result, 't', 'ensure we can see the temporary table'); + + # Enabling data checksums shouldn't work as the process is blocked on the + # temporary table held open by $bsession. Ensure that we reach inprogress- + # on before we do more tests. + enable_data_checksums($node, wait => 'inprogress-on'); + + # Wait for processing to finish and the worker waiting for leftover temp + # relations to be able to actually finish + $result = $node->poll_query_until( + 'postgres', + "SELECT wait_event FROM pg_catalog.pg_stat_activity " + . "WHERE backend_type = 'datachecksum worker';", + 'ChecksumEnableTemptableWait'); + + # The datachecksumsworker waits for temporary tables to disappear for 3 + # seconds before retrying, so sleep for 4 seconds to be guaranteed to see + # a retry cycle + sleep(4); + + # Re-check the wait event to ensure we are blocked on the right thing. + $result = $node->safe_psql('postgres', + "SELECT wait_event FROM pg_catalog.pg_stat_activity " + . "WHERE backend_type = 'datachecksum worker';"); + is($result, 'ChecksumEnableTemptableWait', + 'ensure the correct wait condition is set'); + test_checksum_state($node, 'inprogress-on'); + + # Stop the cluster while bsession is still attached. We can't close the + # session first since the brief period between closing and stopping might + # be enough for checksums to get enabled. + $node->stop; + $bsession->quit; + $node->start; + + # Ensure the checksums aren't enabled across the restart. 
This leaves the + # cluster in the same state as before we entered the SKIP block. + test_checksum_state($node, 'off'); +} + +enable_data_checksums($node, wait => 'on'); + +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '9999', 'ensure checksummed pages can be read back'); + +$result = $node->poll_query_until( + 'postgres', + "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksumsworker%';", + '0'); +is($result, 1, 'await datachecksums worker/launcher termination'); + +disable_data_checksums($node, wait => 1); + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/003_standby_restarts.pl b/src/test/modules/test_checksums/t/003_standby_restarts.pl new file mode 100644 index 000000000000..f724d4ea74c3 --- /dev/null +++ b/src/test/modules/test_checksums/t/003_standby_restarts.pl @@ -0,0 +1,114 @@ + +# Copyright (c) 2024, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with +# streaming replication +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize primary node +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1, no_data_checksums => 1); +$node_primary->start; + +my $slotname = 'physical_slot'; +$node_primary->safe_psql('postgres', + "SELECT pg_create_physical_replication_slot('$slotname')"); + +# Take backup +my $backup_name = 'my_backup'; +$node_primary->backup($backup_name); + +# Create streaming standby linking to primary +my $node_standby_1 = PostgreSQL::Test::Cluster->new('standby_1'); +$node_standby_1->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby_1->append_conf( + 'postgresql.conf', qq[ +primary_slot_name = '$slotname' +]); +$node_standby_1->start; + +# Create some 
content on the primary to have un-checksummed data in the cluster +$node_primary->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Wait for standbys to catch up +$node_primary->wait_for_catchup($node_standby_1, 'replay', + $node_primary->lsn('insert')); + +# Check that checksums are turned off on all nodes +test_checksum_state($node_primary, 'off'); +test_checksum_state($node_standby_1, 'off'); + +# --------------------------------------------------------------------------- +# Enable checksums for the cluster, and make sure that both the primary and +# standby change state. +# + +# Ensure that the primary switches to "inprogress-on" +enable_data_checksums($node_primary, wait => 'inprogress-on'); +# Wait for checksum enable to be replayed +$node_primary->wait_for_catchup($node_standby_1, 'replay'); + +# Ensure that the standby has switched to "inprogress-on" or "on". Normally it +# would be "inprogress-on", but it is theoretically possible for the primary to +# complete the checksum enabling *and* have the standby replay that record +# before we reach the check below. 
+my $result = $node_standby_1->poll_query_until(
+ 'postgres',
+ "SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';",
+ 'f');
+is($result, 1, 'ensure standby has absorbed the inprogress-on barrier');
+$result = $node_standby_1->safe_psql('postgres',
+ "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"
+);
+
+is(($result eq 'inprogress-on' || $result eq 'on'),
+ 1, 'ensure checksums are on, or in progress, on standby_1');
+
+# Insert some more data which should be checksummed on INSERT
+$node_primary->safe_psql('postgres',
+ "INSERT INTO t VALUES (generate_series(1, 10000));");
+
+# Wait for checksums enabled on the primary and standby
+wait_for_checksum_state($node_primary, 'on');
+wait_for_checksum_state($node_standby_1, 'on');
+
+$result =
+ $node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1");
+is($result, '19998', 'ensure we can safely read all data with checksums');
+
+$result = $node_primary->poll_query_until(
+ 'postgres',
+ "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksumsworker%';",
+ '0');
+is($result, 1, 'await datachecksums worker/launcher termination');
+
+#
+# Disable checksums and ensure it's propagated to standby and that we can
+# still read all data
+#
+
+# Disable checksums and wait for the operation to be replayed
+disable_data_checksums($node_primary);
+$node_primary->wait_for_catchup($node_standby_1, 'replay');
+# Ensure that the primary and standby have switched to off
+wait_for_checksum_state($node_primary, 'off');
+wait_for_checksum_state($node_standby_1, 'off');
+# Doublecheck reading data without errors
+$result =
+ $node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1");
+is($result, "19998", 'ensure we can safely read all data without checksums');
+
+$node_standby_1->stop;
+$node_primary->stop;
+done_testing(); diff --git a/src/test/modules/test_checksums/t/004_offline.pl
b/src/test/modules/test_checksums/t/004_offline.pl new file mode 100644 index 000000000000..e9fbcf77eab5 --- /dev/null +++ b/src/test/modules/test_checksums/t/004_offline.pl @@ -0,0 +1,82 @@ + +# Copyright (c) 2024, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums offline from various states +# of checksum processing +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize node with checksums disabled. +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init(no_data_checksums => 1); +$node->start; + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are disabled +test_checksum_state($node, 'off'); + +# Enable checksums offline using pg_checksums +$node->stop; +$node->checksum_enable_offline; +$node->start; + +# Ensure that checksums are enabled +test_checksum_state($node, 'on'); + +# Run a dummy query just to make sure we can read back some data +my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '9999', 'ensure checksummed pages can be read back'); + +# Disable checksums offline again using pg_checksums +$node->stop; +$node->checksum_disable_offline; +$node->start; + +# Ensure that checksums are disabled +test_checksum_state($node, 'off'); + +# Create a barrier for checksumming to block on, in this case a pre-existing +# temporary table which is kept open while processing is started. We can +# accomplish this by setting up an interactive psql process which keeps the +# temporary table created as we enable checksums in another psql process. 
+ +my $bsession = $node->background_psql('postgres'); +$bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);'); + +# In another session, make sure we can see the blocking temp table but start +# processing anyways and check that we are blocked with a proper wait event. +$result = $node->safe_psql('postgres', + "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';"); +is($result, 't', 'ensure we can see the temporary table'); + +enable_data_checksums($node, wait => 'inprogress-on'); + +# Turn the cluster off and enable checksums offline, then start back up +$bsession->quit; +$node->stop; +$node->checksum_enable_offline; +$node->start; + +# Ensure that checksums are now enabled even though processing wasn't +# restarted +test_checksum_state($node, 'on'); + +# Run a dummy query just to make sure we can read back some data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '9999', 'ensure checksummed pages can be read back'); + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/005_injection.pl b/src/test/modules/test_checksums/t/005_injection.pl new file mode 100644 index 000000000000..ae801cd336f7 --- /dev/null +++ b/src/test/modules/test_checksums/t/005_injection.pl @@ -0,0 +1,126 @@ + +# Copyright (c) 2025, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with +# injection point tests injecting failures into the processing + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +# --------------------------------------------------------------------------- +# Test cluster setup +# + +# Initiate testcluster +my $node = PostgreSQL::Test::Cluster->new('main'); 
+$node->init(no_data_checksums => 1); +$node->start; + +# Set up test environment +$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); + +# --------------------------------------------------------------------------- +# Inducing failures and crashes in processing + +# Force enabling checksums to fail by marking one of the databases as having +# failed in processing. +disable_data_checksums($node, wait => 1); +$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(true);'); +enable_data_checksums($node, wait => 'off'); +$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(false);'); + +# Force the server to crash after enabling data checksums but before issuing +# the checkpoint. Since the switch has been WAL logged the server should come +# up with checksums enabled after replay. +test_checksum_state($node, 'off'); +$node->safe_psql('postgres', 'SELECT dc_crash_before_checkpoint();'); +enable_data_checksums($node, fast => 'true'); +my $ret = wait_for_cluster_crash($node); +ok($ret == 1, "Cluster crash detection timeout"); +ok(!$node->is_alive, "Cluster crashed due to abort() before checkpointing"); +$node->_update_pid(-1); +$node->start; +test_checksum_state($node, 'on'); + +# Another test just like the previous, but for disabling data checksums (and +# crashing just before checkpointing). The previous injection points were all +# detached from through the crash so they need to be reattached. 
+$node->safe_psql('postgres', 'SELECT dc_crash_before_checkpoint();'); +disable_data_checksums($node); +$ret = wait_for_cluster_crash($node); +ok($ret == 1, "Cluster crash detection timeout"); +ok(!$node->is_alive, "Cluster crashed due to abort() before checkpointing"); +$node->_update_pid(-1); +$node->start; +test_checksum_state($node, 'off'); + +# Now inject a crash before inserting the WAL record for data checksum state +# change, when the server comes back up again the state should not have been +# set to the new value since the process didn't succeed. +$node->safe_psql('postgres', 'SELECT dc_crash_before_xlog();'); +enable_data_checksums($node); +$ret = wait_for_cluster_crash($node); +ok($ret == 1, "Cluster crash detection timeout"); +ok(!$node->is_alive, "Cluster crashed"); +$node->_update_pid(-1); +$node->start; +test_checksum_state($node, 'off'); + +# This re-runs the same test again but with first disabling data checksums and +# then enabling again, crashing right before inserting the WAL record. When +# it comes back up the checksums must not be enabled. +$node->safe_psql('postgres', 'SELECT dc_crash_before_xlog();'); +enable_data_checksums($node); +$ret = wait_for_cluster_crash($node); +ok($ret == 1, "Cluster crash detection timeout"); +ok(!$node->is_alive, "Cluster crashed"); +$node->_update_pid(-1); +$node->start; +test_checksum_state($node, 'off'); + +# --------------------------------------------------------------------------- +# Timing and retry related tests +# + +# Force the enable checksums processing to make multiple passes by removing +# one database from the list in the first pass. This will simulate a CREATE +# DATABASE during processing. Doing this via fault injection makes the test +# not be subject to exact timing. 
+$node->safe_psql('postgres', 'SELECT dcw_prune_dblist(true);'); +enable_data_checksums($node, wait => 'on'); + +SKIP: +{ + skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 4 + if (!$ENV{PG_TEST_EXTRA} + || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/); + + # Inject a delay in the barrier for enabling checksums + disable_data_checksums($node, wait => 1); + $node->safe_psql('postgres', 'SELECT dcw_inject_delay_barrier();'); + enable_data_checksums($node, wait => 'on'); + + # Fake the existence of a temporary table at the start of processing, which + # will force the processing to wait and retry in order to wait for it to + # disappear. + disable_data_checksums($node, wait => 1); + $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(true);'); + enable_data_checksums($node, wait => 'on'); +} + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/006_pgbench_single.pl b/src/test/modules/test_checksums/t/006_pgbench_single.pl new file mode 100644 index 000000000000..96f3b2cd8a6d --- /dev/null +++ b/src/test/modules/test_checksums/t/006_pgbench_single.pl @@ -0,0 +1,268 @@ + +# Copyright (c) 2025, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with +# concurrent activity via pgbench runs + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +if (!$ENV{PG_TEST_EXTRA} || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/) +{ + plan skip_all => 'Extended tests not enabled'; +} + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my $node; +my $node_loglocation = 0; + +# The number of full test iterations which will be performed. 
The exact number +# of tests performed and the wall time taken is non-deterministic as the test +# performs a lot of randomized actions, but 10 iterations will be a long test +# run regardless. +my $TEST_ITERATIONS = 10; + +# Variables which record the current state of the cluster +my $data_checksum_state = 'off'; +my $pgbench = undef; + +# determines whether enable_data_checksums/disable_data_checksums forces an +# immediate checkpoint +my @flip_modes = ('true', 'false'); + +# Start a pgbench run in the background against the server specified via the +# port passed as parameter. +sub background_rw_pgbench +{ + my $port = shift; + + # If a previous pgbench is still running, start by shutting it down. + if ($pgbench) + { + $pgbench->finish; + } + + # Randomize the number of pgbench clients a bit (range 1-16) + my $clients = 1 + int(rand(15)); + + my @cmd = ('pgbench', '-p', $port, '-T', '600', '-c', $clients); + + # Randomize whether we spawn connections or not + push(@cmd, '-C') if (cointoss); + # Finally add the database name to use + push(@cmd, 'postgres'); + + $pgbench = IPC::Run::start( + \@cmd, + '<' => '/dev/null', + '>' => '/dev/null', + '2>' => '/dev/null', + IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default)); +} + +# Invert the state of data checksums in the cluster, if data checksums are on +# then disable them and vice versa. Also performs proper validation of the +# before and after state. +sub flip_data_checksums +{ + # First, make sure the cluster is in the state we expect it to be + test_checksum_state($node, $data_checksum_state); + + if ($data_checksum_state eq 'off') + { + # Coin-toss to see if we are injecting a retry due to a temptable + $node->safe_psql('postgres', 'SELECT dcw_fake_temptable();') + if cointoss(); + + # log LSN right before we start changing checksums + my $result = + $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before enabling: " . $result . 
"\n"); + + my $mode = $flip_modes[ int(rand(@flip_modes)) ]; + + # Ensure that the primary switches to "inprogress-on" + enable_data_checksums( + $node, + wait => 'inprogress-on', + 'fast' => $mode); + + random_sleep(); + + # Wait for checksums enabled on the primary + wait_for_checksum_state($node, 'on'); + + # log LSN right after the primary flips checksums to "on" + $result = $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after enabling: " . $result . "\n"); + + random_sleep(); + + $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(false);'); + $data_checksum_state = 'on'; + } + elsif ($data_checksum_state eq 'on') + { + random_sleep(); + + # log LSN right before we start changing checksums + my $result = + $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before disabling: " . $result . "\n"); + + my $mode = $flip_modes[ int(rand(@flip_modes)) ]; + + disable_data_checksums($node, 'fast' => $mode); + + # Wait for checksums disabled on the primary + wait_for_checksum_state($node, 'off'); + + # log LSN right after the primary flips checksums to "off" + $result = $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after disabling: " . $result . "\n"); + + random_sleep(); + + $data_checksum_state = 'off'; + } + else + { + # This should only happen due to programmer error when hacking on the + # test code, but since that might pass subtly by let's ensure it gets + # caught with a test error if so. + bail('data_checksum_state variable has invalid state:' + . $data_checksum_state); + } +} + +# Create and start a cluster with one node +$node = PostgreSQL::Test::Cluster->new('main'); +$node->init(allows_streaming => 1, no_data_checksums => 1); +# max_connections need to be bumped in order to accommodate for pgbench clients +# and log_statement is dialled down since it otherwise will generate enormous +# amounts of logging. Page verification failures are still logged. 
+$node->append_conf(
+	'postgresql.conf',
+	qq[
+max_connections = 100
+log_statement = none
+]);
+$node->start;
+$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
+# Create some content to have un-checksummed data in the cluster
+$node->safe_psql('postgres',
+	"CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;");
+# Initialize pgbench
+$node->command_ok([ 'pgbench', '-i', '-s', '100', '-q', 'postgres' ]);
+# Start the test suite with pgbench running.
+background_rw_pgbench($node->port);
+
+# Main test suite. This loop will start a pgbench run on the cluster and while
+# that's running flip the state of data checksums concurrently. It will then
+# randomly restart the cluster (in fast or immediate) mode and then check for
+# the desired state. The idea behind doing things randomly is to stress out
+# any timing related issues by subjecting the cluster to varied workloads.
+# A TODO is to generate a trace such that any test failure can be traced to
+# its order of operations for debugging.
+for (my $i = 0; $i < $TEST_ITERATIONS; $i++)
+{
+	note("iteration ", ($i + 1), " of ", $TEST_ITERATIONS);
+
+	if (!$node->is_alive)
+	{
+		random_sleep();
+
+		# Start, to do recovery, and stop
+		$node->start;
+		$node->stop('fast');
+
+		# Since the log isn't being written to now, parse the log and check
+		# for instances of checksum verification failures.
+ my $log = PostgreSQL::Test::Utils::slurp_file($node->logfile, + $node_loglocation); + unlike( + $log, + qr/page verification failed/, + "no checksum validation errors in primary log (during WAL recovery)" + ); + $node_loglocation = -s $node->logfile; + + # Randomize the WAL size, to trigger checkpoints less/more often + my $sb = 64 + int(rand(1024)); + $node->append_conf('postgresql.conf', qq[max_wal_size = $sb]); + + $node->start; + + # Start a pgbench in the background against the primary + background_rw_pgbench($node->port); + } + + $node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); + + flip_data_checksums(); + random_sleep(); + my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); + is($result, '100000', 'ensure data pages can be read back on primary'); + + random_sleep(); + + # Potentially powercycle the node + if (cointoss()) + { + $node->stop(stopmode()); + + PostgreSQL::Test::Utils::system_log("pg_controldata", + $node->data_dir); + + my $log = PostgreSQL::Test::Utils::slurp_file($node->logfile, + $node_loglocation); + unlike( + $log, + qr/page verification failed/, + "no checksum validation errors in primary log (outside WAL recovery)" + ); + $node_loglocation = -s $node->logfile; + } + + random_sleep(); +} + +# Make sure the node is running +if (!$node->is_alive) +{ + $node->start; +} + +# Testrun is over, ensure that data reads back as expected and perform a final +# verification of the data checksum state. 
+my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '100000', 'ensure data pages can be read back on primary'); +test_checksum_state($node, $data_checksum_state); + +# Perform one final pass over the logs and hunt for unexpected errors +my $log = + PostgreSQL::Test::Utils::slurp_file($node->logfile, $node_loglocation); +unlike( + $log, + qr/page verification failed/, + "no checksum validation errors in primary log"); +$node_loglocation = -s $node->logfile; + +$node->teardown_node; + +done_testing(); diff --git a/src/test/modules/test_checksums/t/007_pgbench_standby.pl b/src/test/modules/test_checksums/t/007_pgbench_standby.pl new file mode 100644 index 000000000000..8b8e031cbf68 --- /dev/null +++ b/src/test/modules/test_checksums/t/007_pgbench_standby.pl @@ -0,0 +1,398 @@ + +# Copyright (c) 2025, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster, +# comprising of a primary and a replicated standby, with concurrent activity +# via pgbench runs + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +my $node_primary_slot = 'physical_slot'; +my $node_primary_backup = 'primary_backup'; +my $node_primary; +my $node_primary_loglocation = 0; +my $node_standby_1; +my $node_standby_1_loglocation = 0; + +# The number of full test iterations which will be performed. The exact number +# of tests performed and the wall time taken is non-deterministic as the test +# performs a lot of randomized actions, but 5 iterations will be a long test +# run regardless. 
+my $TEST_ITERATIONS = 5;
+
+# Variables which record the current state of the cluster
+my $data_checksum_state = 'off';
+
+my $pgbench_primary = undef;
+my $pgbench_standby = undef;
+
+# Variables holding state for managing the cluster and aux processes in
+# various ways
+my ($pgb_primary_stdin, $pgb_primary_stdout, $pgb_primary_stderr) =
+  ('', '', '');
+my ($pgb_standby_1_stdin, $pgb_standby_1_stdout, $pgb_standby_1_stderr) =
+  ('', '', '');
+
+if (!$ENV{PG_TEST_EXTRA} || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/)
+{
+	plan skip_all => 'Extended tests not enabled';
+}
+
+if ($ENV{enable_injection_points} ne 'yes')
+{
+	plan skip_all => 'Injection points not supported by this build';
+}
+
+# determines whether enable_data_checksums/disable_data_checksums forces an
+# immediate checkpoint
+my @flip_modes = ('true', 'false');
+
+# Start a pgbench run in the background against the server specified via the
+# port passed as parameter.  When $standby is true the benchmark is made
+# select-only (pgbench -S), since a standby cannot accept writes, and the
+# process handle is tracked separately from the primary's so that pgbench
+# runs against both nodes can coexist.
+sub background_pgbench
+{
+	my ($port, $standby) = @_;
+
+	# Terminate any previous pgbench run against this node before continuing
+	if ($standby)
+	{
+		$pgbench_standby->finish if $pgbench_standby;
+	}
+	else
+	{
+		$pgbench_primary->finish if $pgbench_primary;
+	}
+
+	# Randomize the number of pgbench clients a bit (range 1-16)
+	my $clients = 1 + int(rand(15));
+
+	my @cmd = ('pgbench', '-p', $port, '-T', '600', '-c', $clients);
+	# Randomize whether we spawn connections or not
+	push(@cmd, '-C') if (cointoss());
+	# If we run on a standby it needs to be a read-only benchmark
+	push(@cmd, '-S') if ($standby);
+	# Finally add the database name to use
+	push(@cmd, 'postgres');
+
+	# Pass the assembled @cmd (with -C/-S as selected above); previously a
+	# hard-coded command line was used here, which silently dropped those
+	# options and made the standby run a read-write benchmark.
+	my $handle = IPC::Run::start(
+		\@cmd,
+		'<' => '/dev/null',
+		'>' => '/dev/null',
+		'2>' => '/dev/null',
+		IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default));
+
+	if ($standby)
+	{
+		$pgbench_standby = $handle;
+	}
+	else
+	{
+		$pgbench_primary = $handle;
+	}
+}
+
+# Invert the state of data checksums in the cluster, if data checksums are on
+# then disable them and vice versa. Also performs proper validation of the
+# before and after state.
+sub flip_data_checksums +{ + test_checksum_state($node_primary, $data_checksum_state); + test_checksum_state($node_standby_1, $data_checksum_state); + + if ($data_checksum_state eq 'off') + { + # Coin-toss to see if we are injecting a retry due to a temptable + $node_primary->safe_psql('postgres', + 'SELECT dcw_fake_temptable(true);') + if cointoss(); + + # log LSN right before we start changing checksums + my $result = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before enabling: " . $result . "\n"); + + my $mode = $flip_modes[ int(rand(@flip_modes)) ]; + + # Ensure that the primary switches to "inprogress-on" + enable_data_checksums( + $node_primary, + wait => 'inprogress-on', + 'fast' => $mode); + random_sleep(); + # Wait for checksum enable to be replayed + $node_primary->wait_for_catchup($node_standby_1, 'replay'); + + # Ensure that the standby has switched to "inprogress-on" or "on". + # Normally it would be "inprogress-on", but it is theoretically + # possible for the primary to complete the checksum enabling *and* have + # the standby replay that record before we reach the check below. + $result = $node_standby_1->poll_query_until( + 'postgres', + "SELECT setting = 'off' " + . "FROM pg_catalog.pg_settings " + . "WHERE name = 'data_checksums';", + 'f'); + is($result, 1, + 'ensure standby has absorbed the inprogress-on barrier'); + random_sleep(); + $result = $node_standby_1->safe_psql('postgres', + "SELECT setting " + . "FROM pg_catalog.pg_settings " + . "WHERE name = 'data_checksums';"); + + is(($result eq 'inprogress-on' || $result eq 'on'), + 1, 'ensure checksums are on, or in progress, on standby_1'); + + # Wait for checksums enabled on the primary and standby + wait_for_checksum_state($node_primary, 'on'); + + # log LSN right after the primary flips checksums to "on" + $result = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after enabling: " . $result . 
"\n"); + + random_sleep(); + wait_for_checksum_state($node_standby_1, 'on'); + + $node_primary->safe_psql('postgres', + 'SELECT dcw_fake_temptable(false);'); + $data_checksum_state = 'on'; + } + elsif ($data_checksum_state eq 'on') + { + random_sleep(); + + # log LSN right before we start changing checksums + my $result = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before disabling: " . $result . "\n"); + + my $mode = $flip_modes[ int(rand(@flip_modes)) ]; + + disable_data_checksums($node_primary, 'fast' => $mode); + $node_primary->wait_for_catchup($node_standby_1, 'replay'); + + # Wait for checksums disabled on the primary and standby + wait_for_checksum_state($node_primary, 'off'); + wait_for_checksum_state($node_standby_1, 'off'); + + # log LSN right after the primary flips checksums to "off" + $result = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after disabling: " . $result . "\n"); + + random_sleep(); + wait_for_checksum_state($node_standby_1, 'off'); + + $data_checksum_state = 'off'; + } + else + { + # This should only happen due to programmer error when hacking on the + # test code, but since that might pass subtly by let's ensure it gets + # caught with a test error if so. + is(1, 0, 'data_checksum_state variable has invalid state'); + } +} + +# Create and start a cluster with one primary and one standby node, and ensure +# they are caught up and in sync. +$node_primary = PostgreSQL::Test::Cluster->new('main'); +$node_primary->init(allows_streaming => 1, no_data_checksums => 1); +# max_connections need to be bumped in order to accommodate for pgbench clients +# and log_statement is dialled down since it otherwise will generate enormous +# amounts of logging. Page verification failures are still logged. 
+$node_primary->append_conf( + 'postgresql.conf', + qq[ +max_connections = 30 +log_statement = none +]); +$node_primary->start; +$node_primary->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); +# Create some content to have un-checksummed data in the cluster +$node_primary->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;"); +$node_primary->safe_psql('postgres', + "SELECT pg_create_physical_replication_slot('$node_primary_slot');"); +$node_primary->backup($node_primary_backup); + +$node_standby_1 = PostgreSQL::Test::Cluster->new('standby_1'); +$node_standby_1->init_from_backup($node_primary, $node_primary_backup, + has_streaming => 1); +$node_standby_1->append_conf( + 'postgresql.conf', qq[ +primary_slot_name = '$node_primary_slot' +]); +$node_standby_1->start; + +# Initialize pgbench and wait for the objects to be created on the standby +$node_primary->command_ok([ 'pgbench', '-i', '-s', '100', '-q', 'postgres' ]); +$node_primary->wait_for_catchup($node_standby_1, 'replay'); + +# Start the test suite with pgbench running on all nodes +background_pgbench($node_standby_1->port, 1); +background_pgbench($node_primary->port, 0); + +# Main test suite. This loop will start a pgbench run on the cluster and while +# that's running flip the state of data checksums concurrently. It will then +# randomly restart the cluster (in fast or immediate) mode and then check for +# the desired state. The idea behind doing things randomly is to stress out +# any timing related issues by subjecting the cluster for varied workloads. +# A TODO is to generate a trace such that any test failure can be traced to +# its order of operations for debugging. 
+for (my $i = 0; $i < $TEST_ITERATIONS; $i++) +{ + note("iteration ", ($i + 1), " of ", $TEST_ITERATIONS); + + if (!$node_primary->is_alive) + { + random_sleep(); + + # start, to do recovery, and stop + $node_primary->start; + $node_primary->stop('fast'); + + # Since the log isn't being written to now, parse the log and check + # for instances of checksum verification failures. + my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile, + $node_primary_loglocation); + unlike( + $log, + qr/page verification failed/, + "no checksum validation errors in primary log (during WAL recovery)" + ); + $node_primary_loglocation = -s $node_primary->logfile; + + # randomize the WAL size, to trigger checkpoints less/more often + my $sb = 64 + int(rand(960)); + $node_primary->append_conf('postgresql.conf', qq[max_wal_size = $sb]); + + note("changing primary max_wal_size to " . $sb); + + $node_primary->start; + + # Start a pgbench in the background against the primary + background_pgbench($node_primary->port, 0); + } + + if (!$node_standby_1->is_alive) + { + random_sleep(); + + $node_standby_1->start; + $node_standby_1->stop('fast'); + + # Since the log isn't being written to now, parse the log and check + # for instances of checksum verification failures. + my $log = + PostgreSQL::Test::Utils::slurp_file($node_standby_1->logfile, + $node_standby_1_loglocation); + unlike( + $log, + qr/page verification failed/, + "no checksum validation errors in standby_1 log (during WAL recovery)" + ); + $node_standby_1_loglocation = -s $node_standby_1->logfile; + + # randomize the WAL size, to trigger checkpoints less/more often + my $sb = 64 + int(rand(960)); + $node_standby_1->append_conf('postgresql.conf', + qq[max_wal_size = $sb]); + + note("changing standby max_wal_size to " . 
$sb); + + $node_standby_1->start; + + # Start a select-only pgbench in the background on the standby + background_pgbench($node_standby_1->port, 1); + } + + $node_primary->safe_psql('postgres', "UPDATE t SET a = a + 1;"); + + flip_data_checksums(); + random_sleep(); + my $result = $node_primary->safe_psql('postgres', + "SELECT count(*) FROM t WHERE a > 1"); + is($result, '100000', 'ensure data pages can be read back on primary'); + random_sleep(); + $node_primary->wait_for_catchup($node_standby_1, 'write'); + + random_sleep(); + + # Potentially powercycle the cluster (the nodes independently) + # XXX should maybe try stopping nodes in the opposite order too? + if (cointoss()) + { + $node_primary->stop(stopmode()); + + # print the contents of the control file on the primary + PostgreSQL::Test::Utils::system_log("pg_controldata", + $node_primary->data_dir); + + # slurp the file after shutdown, so that it doesn't interfere with the recovery + my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile, + $node_primary_loglocation); + unlike( + $log, + qr/page verification failed/, + "no checksum validation errors in primary log (outside WAL recovery)" + ); + $node_primary_loglocation = -s $node_primary->logfile; + } + + random_sleep(); + + if (cointoss()) + { + $node_standby_1->stop(stopmode()); + + # print the contents of the control file on the standby + PostgreSQL::Test::Utils::system_log("pg_controldata", + $node_standby_1->data_dir); + + # slurp the file after shutdown, so that it doesn't interfere with the recovery + my $log = + PostgreSQL::Test::Utils::slurp_file($node_standby_1->logfile, + $node_standby_1_loglocation); + unlike( + $log, + qr/page verification failed/, + "no checksum validation errors in standby_1 log (outside WAL recovery)" + ); + $node_standby_1_loglocation = -s $node_standby_1->logfile; + } +} + +# make sure the nodes are running +if (!$node_primary->is_alive) +{ + $node_primary->start; +} + +if (!$node_standby_1->is_alive) +{ + 
$node_standby_1->start; +} + +# Testrun is over, ensure that data reads back as expected and perform a final +# verification of the data checksum state. +my $result = + $node_primary->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '100000', 'ensure data pages can be read back on primary'); +test_checksum_state($node_primary, $data_checksum_state); +test_checksum_state($node_standby_1, $data_checksum_state); + +# Perform one final pass over the logs and hunt for unexpected errors +my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile, + $node_primary_loglocation); +unlike( + $log, + qr/page verification failed/, + "no checksum validation errors in primary log"); +$node_primary_loglocation = -s $node_primary->logfile; +$log = PostgreSQL::Test::Utils::slurp_file($node_standby_1->logfile, + $node_standby_1_loglocation); +unlike( + $log, + qr/page verification failed/, + "no checksum validation errors in standby_1 log"); +$node_standby_1_loglocation = -s $node_standby_1->logfile; + +$node_standby_1->teardown_node; +$node_primary->teardown_node; + +done_testing(); diff --git a/src/test/modules/test_checksums/t/DataChecksums/Utils.pm b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm new file mode 100644 index 000000000000..cf670be944ca --- /dev/null +++ b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm @@ -0,0 +1,283 @@ + +# Copyright (c) 2025, PostgreSQL Global Development Group + +=pod + +=head1 NAME + +DataChecksums::Utils - Utility functions for testing data checksums in a running cluster + +=head1 SYNOPSIS + + use PostgreSQL::Test::Cluster; + use DataChecksums::Utils qw( .. 
); + + # Create, and start, a new cluster + my $node = PostgreSQL::Test::Cluster->new('primary'); + $node->init; + $node->start; + + test_checksum_state($node, 'off'); + + enable_data_checksums($node); + + wait_for_checksum_state($node, 'on'); + + +=cut + +package DataChecksums::Utils; + +use strict; +use warnings FATAL => 'all'; +use Exporter 'import'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +our @EXPORT = qw( + cointoss + disable_data_checksums + enable_data_checksums + random_sleep + stopmode + test_checksum_state + wait_for_checksum_state + wait_for_cluster_crash +); + +=pod + +=head1 METHODS + +=over + +=item test_checksum_state(node, state) + +Test that the current value of the data checksum GUC in the server running +at B matches B. If the values differ, a test failure is logged. +Returns True if the values match, otherwise False. + +=cut + +sub test_checksum_state +{ + my ($postgresnode, $state) = @_; + + my $result = $postgresnode->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" + ); + is($result, $state, 'ensure checksums are set to ' . $state); + return $result eq $state; +} + +=item wait_for_checksum_state(node, state) + +Test the value of the data checksum GUC in the server running at B +repeatedly until it matches B or times out. Processing will run for +$PostgreSQL::Test::Utils::timeout_default seconds before timing out. If the +values differ when the process times out, False is returned and a test failure +is logged, otherwise True. + +=cut + +sub wait_for_checksum_state +{ + my ($postgresnode, $state) = @_; + + my $res = $postgresnode->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + $state); + is($res, 1, 'ensure data checksums are transitioned to ' . 
$state);
+	return $res == 1;
+}
+
+=item wait_for_cluster_crash(node, params)
+
+Repeatedly test whether the cluster running at B<node> responds to connections
+and return when it no longer does so, or when it times out. Processing will
+run for $PostgreSQL::Test::Utils::timeout_default seconds unless a timeout
+value is specified as a parameter. Returns True if the cluster crashed, else
+False if the process timed out.
+
+=over
+
+=item timeout
+
+Approximate number of seconds to wait for the cluster to crash, default is
+$PostgreSQL::Test::Utils::timeout_default. There is no real-time guarantee
+that the total process time won't exceed the timeout.
+
+=back
+
+=cut
+
+sub wait_for_cluster_crash
+{
+	my $postgresnode = shift;
+	my %params = @_;
+	my $crash = 0;
+
+	$params{timeout} = $PostgreSQL::Test::Utils::timeout_default
+	  unless (defined($params{timeout}));
+
+	for (my $naps = 0; $naps < $params{timeout}; $naps++)
+	{
+		if (!$postgresnode->is_alive)
+		{
+			$crash = 1;
+			last;
+		}
+		sleep(1);
+	}
+
+	return $crash == 1;
+}
+
+=item enable_data_checksums($node, %params)
+
+Function for enabling data checksums in the cluster running at B<node>.
+
+=over
+
+=item cost_delay
+
+The B<cost_delay> to use when enabling data checksums, default is 0.
+
+=item cost_limit
+
+The B<cost_limit> to use when enabling data checksums, default is 100.
+
+=item fast
+
+If set to C<true> an immediate checkpoint will be issued after data
+checksums are enabled. Setting this to false will lead to slower tests.
+The default is C<true>.
+
+=item wait
+
+If defined, the function will wait for the state defined in this parameter,
+or until the wait times out, before returning. The function will wait for
+$PostgreSQL::Test::Utils::timeout_default seconds before timing out.
+
+=back
+
+=cut
+
+sub enable_data_checksums
+{
+	my $postgresnode = shift;
+	my %params = @_;
+
+	# Set sane defaults for the parameters
+	$params{cost_delay} = 0 unless (defined($params{cost_delay}));
+	$params{cost_limit} = 100 unless (defined($params{cost_limit}));
+	$params{fast} = 'true' unless (defined($params{fast}));
+
+	my $query = <<'EOQ';
+SELECT pg_enable_data_checksums(%s, %s, %s);
+EOQ
+
+	$postgresnode->safe_psql(
+		'postgres',
+		sprintf($query,
+			$params{cost_delay}, $params{cost_limit}, $params{fast}));
+
+	wait_for_checksum_state($postgresnode, $params{wait})
+	  if (defined($params{wait}));
+}
+
+=item disable_data_checksums($node, %params)
+
+Function for disabling data checksums in the cluster running at B<node>.
+
+=over
+
+=item wait
+
+If defined, the function will wait for the state to turn to B<off>, or until
+the wait times out, before returning. The function will wait for
+$PostgreSQL::Test::Utils::timeout_default seconds before timing out.
+Unlike in C<enable_data_checksums> the value of the parameter is discarded.
+
+=item fast
+
+If set to C<true> the checkpoint after disabling will be set to immediate, else
+it will be deferred. The default if no value is set is B<true>.
+
+=back
+
+=cut
+
+sub disable_data_checksums
+{
+	my $postgresnode = shift;
+	my %params = @_;
+
+	# Set sane defaults for the parameters
+	$params{fast} = 'true' unless (defined($params{fast}));
+
+	my $query = <<'EOQ';
+SELECT pg_disable_data_checksums(%s);
+EOQ
+
+	$postgresnode->safe_psql('postgres', sprintf($query, $params{fast}));
+
+	wait_for_checksum_state($postgresnode, 'off') if (defined($params{wait}));
+}
+
+=item cointoss
+
+Helper for retrieving a binary value with random distribution for deciding
+whether to turn things off during testing.
+
+=back
+
+=cut
+
+sub cointoss
+{
+	return int(rand() < 0.5);
+}
+
+=item random_sleep(max)
+
+Helper for injecting random sleeps here and there in the testrun.
The sleep +duration will be in the range (0,B), but won't be predictable in order to +avoid sleep patterns that manage to avoid race conditions and timing bugs. +The default B is 3 seconds. + +=back + +=cut + +sub random_sleep +{ + my $max = shift; + sleep(int(rand(defined($max) ? $max : 3))) if cointoss; +} + +=item stopmode + +Small helper function for randomly selecting a valid stopmode. + +=back + +=cut + +sub stopmode +{ + return 'immediate' if (cointoss); + return 'fast'; +} + +=pod + +=back + +=cut + +1; diff --git a/src/test/modules/test_checksums/test_checksums--1.0.sql b/src/test/modules/test_checksums/test_checksums--1.0.sql new file mode 100644 index 000000000000..aa086d5c4302 --- /dev/null +++ b/src/test/modules/test_checksums/test_checksums--1.0.sql @@ -0,0 +1,28 @@ +/* src/test/modules/test_checksums/test_checksums--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_checksums" to load this file. \quit + +CREATE FUNCTION dcw_inject_delay_barrier(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_inject_fail_database(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_prune_dblist(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_fake_temptable(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dc_crash_before_checkpoint(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dc_crash_before_xlog(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_checksums/test_checksums.c b/src/test/modules/test_checksums/test_checksums.c new file mode 100644 index 000000000000..c182f2c868b4 --- /dev/null +++ 
b/src/test/modules/test_checksums/test_checksums.c @@ -0,0 +1,225 @@ +/*-------------------------------------------------------------------------- + * + * test_checksums.c + * Test data checksums + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_checksums/test_checksums.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "funcapi.h" +#include "miscadmin.h" +#include "postmaster/datachecksumsworker.h" +#include "storage/latch.h" +#include "utils/injection_point.h" +#include "utils/wait_event.h" + +#define USEC_PER_SEC 1000000 + +PG_MODULE_MAGIC; + +extern PGDLLEXPORT void dc_delay_barrier(const char *name, const void *private_data, void *arg); +extern PGDLLEXPORT void dc_fail_database(const char *name, const void *private_data, void *arg); +extern PGDLLEXPORT void dc_dblist(const char *name, const void *private_data, void *arg); +extern PGDLLEXPORT void dc_fake_temptable(const char *name, const void *private_data, void *arg); + +extern PGDLLEXPORT void crash(const char *name, const void *private_data, void *arg); + +/* + * Test for delaying emission of procsignalbarriers. 
+ */ +void +dc_delay_barrier(const char *name, const void *private_data, void *arg) +{ + (void) name; + (void) private_data; + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + (3 * 1000), + WAIT_EVENT_PG_SLEEP); +} + +PG_FUNCTION_INFO_V1(dcw_inject_delay_barrier); +Datum +dcw_inject_delay_barrier(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksums-enable-checksums-delay", + "test_checksums", + "dc_delay_barrier", + NULL, + 0); + else + InjectionPointDetach("datachecksums-enable-checksums-delay"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +void +dc_fail_database(const char *name, const void *private_data, void *arg) +{ + static bool first_pass = true; + DataChecksumsWorkerResult *res = (DataChecksumsWorkerResult *) arg; + + if (first_pass) + *res = DATACHECKSUMSWORKER_FAILED; + first_pass = false; +} + +PG_FUNCTION_INFO_V1(dcw_inject_fail_database); +Datum +dcw_inject_fail_database(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksumsworker-fail-db", + "test_checksums", + "dc_fail_database", + NULL, + 0); + else + InjectionPointDetach("datachecksumsworker-fail-db"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +/* + * Test to remove an entry from the Databaselist to force re-processing since + * not all databases could be processed in the first iteration of the loop. 
+ */ +void +dc_dblist(const char *name, const void *private_data, void *arg) +{ + static bool first_pass = true; + List *DatabaseList = (List *) arg; + + if (first_pass) + DatabaseList = list_delete_last(DatabaseList); + first_pass = false; +} + +PG_FUNCTION_INFO_V1(dcw_prune_dblist); +Datum +dcw_prune_dblist(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksumsworker-initial-dblist", + "test_checksums", + "dc_dblist", + NULL, + 0); + else + InjectionPointDetach("datachecksumsworker-initial-dblist"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +/* + * Test to force waiting for existing temptables. + */ +void +dc_fake_temptable(const char *name, const void *private_data, void *arg) +{ + static bool first_pass = true; + int *numleft = (int *) arg; + + if (first_pass) + *numleft = 1; + first_pass = false; +} + +PG_FUNCTION_INFO_V1(dcw_fake_temptable); +Datum +dcw_fake_temptable(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksumsworker-fake-temptable-wait", + "test_checksums", + "dc_fake_temptable", + NULL, + 0); + else + InjectionPointDetach("datachecksumsworker-fake-temptable-wait"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +void +crash(const char *name, const void *private_data, void *arg) +{ + abort(); +} + +/* + * dc_crash_before_checkpoint + * + * Ensure that the server crashes just before the checkpoint is issued after + * enabling or disabling checksums. 
+ */ +PG_FUNCTION_INFO_V1(dc_crash_before_checkpoint); +Datum +dc_crash_before_checkpoint(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + InjectionPointAttach("datachecksums-enable-checksums-pre-checkpoint", + "test_checksums", "crash", NULL, 0); + InjectionPointAttach("datachecksums-disable-checksums-pre-checkpoint", + "test_checksums", "crash", NULL, 0); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +/* + * dc_crash_before_xlog + * + * Ensure that the server crashes right before it is about insert the xlog + * record XLOG_CHECKSUMS. + */ +PG_FUNCTION_INFO_V1(dc_crash_before_xlog); +Datum +dc_crash_before_xlog(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksums-xlogchecksums-pre-xloginsert", + "test_checksums", "crash", NULL, 0); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_checksums/test_checksums.control b/src/test/modules/test_checksums/test_checksums.control new file mode 100644 index 000000000000..84b4cc035a78 --- /dev/null +++ b/src/test/modules/test_checksums/test_checksums.control @@ -0,0 +1,4 @@ +comment = 'Test code for data checksums' +default_version = '1.0' +module_pathname = '$libdir/test_checksums' +relocatable = true diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index 35413f140198..3af7944aceac 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -3872,6 +3872,51 @@ sub advance_wal } } +=item $node->checksum_enable_offline() + +Enable data page checksums in an offline cluster with B. The +caller is responsible for ensuring that the cluster is in the right state for +this operation. 
+ +=cut + +sub checksum_enable_offline +{ + my ($self) = @_; + + print "# Enabling checksums in \"$self->data_dir\"\n"; + PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D', + $self->data_dir, '-e'); + return; +} + +=item checksum_disable_offline + +Disable data page checksums in an offline cluster with B<pg_checksums>. The +caller is responsible for ensuring that the cluster is in the right state for +this operation. + +=cut + +sub checksum_disable_offline +{ + my ($self) = @_; + + print "# Disabling checksums in \"$self->data_dir\"\n"; + PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D', + $self->data_dir, '-d'); + return; +} + +sub checksum_verify_offline +{ + my ($self) = @_; + + PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D', + $self->data_dir, '-c'); + return; +} + =pod =back diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 2bf968ae3d37..9c4409a12a1b 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2081,6 +2081,42 @@ pg_stat_progress_create_index| SELECT s.pid, s.param15 AS partitions_done FROM (pg_stat_get_progress_info('CREATE INDEX'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) LEFT JOIN pg_database d ON ((s.datid = d.oid))); +pg_stat_progress_data_checksums| SELECT s.pid, + s.datid, + d.datname, + CASE s.param1 + WHEN 0 THEN 'enabling'::text + WHEN 1 THEN 'disabling'::text + WHEN 2 THEN 'waiting'::text + WHEN 3 THEN 'waiting on temporary tables'::text + WHEN 4 THEN 'waiting on checkpoint'::text + WHEN 5 THEN 'done'::text + ELSE NULL::text + END AS phase, + CASE s.param2 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param2 + END AS databases_total, + s.param3 AS databases_done, + CASE s.param4 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param4 + END AS relations_total, + CASE s.param5 + WHEN
'-1'::integer THEN NULL::bigint + ELSE s.param5 + END AS relations_done, + CASE s.param6 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param6 + END AS blocks_total, + CASE s.param7 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param7 + END AS blocks_done + FROM (pg_stat_get_progress_info('DATACHECKSUMS'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) + LEFT JOIN pg_database d ON ((s.datid = d.oid))) + ORDER BY s.datid; pg_stat_progress_vacuum| SELECT s.pid, s.datid, d.datname, diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index 67e1860e984f..c9feff8331e2 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -51,6 +51,22 @@ client backend|relation|vacuum client backend|temp relation|normal client backend|wal|init client backend|wal|normal +datachecksum launcher|relation|bulkread +datachecksum launcher|relation|bulkwrite +datachecksum launcher|relation|init +datachecksum launcher|relation|normal +datachecksum launcher|relation|vacuum +datachecksum launcher|temp relation|normal +datachecksum launcher|wal|init +datachecksum launcher|wal|normal +datachecksum worker|relation|bulkread +datachecksum worker|relation|bulkwrite +datachecksum worker|relation|init +datachecksum worker|relation|normal +datachecksum worker|relation|vacuum +datachecksum worker|temp relation|normal +datachecksum worker|wal|init +datachecksum worker|wal|normal io worker|relation|bulkread io worker|relation|bulkwrite io worker|relation|init @@ -95,7 +111,7 @@ walsummarizer|wal|init walsummarizer|wal|normal walwriter|wal|init walwriter|wal|normal -(79 rows) +(95 rows) \a -- ensure that both seqscan and indexscan plans are allowed SET enable_seqscan TO on; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 2ca7b75af579..7328a685df4f 
100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -417,6 +417,7 @@ CheckPointStmt CheckpointStatsData CheckpointerRequest CheckpointerShmemStruct +ChecksumType Chromosome CkptSortItem CkptTsStatus @@ -610,6 +611,10 @@ DataPageDeleteStack DataTypesUsageChecks DataTypesUsageVersionCheck DatabaseInfo +DataChecksumsWorkerDatabase +DataChecksumsWorkerResult +DataChecksumsWorkerResultEntry +DataChecksumsWorkerShmemStruct DateADT DateTimeErrorExtra Datum @@ -4252,6 +4257,7 @@ xl_btree_split xl_btree_unlink_page xl_btree_update xl_btree_vacuum +xl_checksum_state xl_clog_truncate xl_commit_ts_truncate xl_dbase_create_file_copy_rec From aeb97123bb353396b048fcef629189fb91cd7c12 Mon Sep 17 00:00:00 2001 From: tomas Date: Sat, 30 Aug 2025 15:57:21 +0200 Subject: [PATCH 2/2] Log checksum version during checkpoints etc. log data_checksum_version, when: - reading/writing the control file - on every checkpoint - setting ControlFile->data_checksum_version --- src/backend/access/transam/xlog.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index d70d0493dcbe..1893396af854 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4447,6 +4447,9 @@ WriteControlFile(void) (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", XLOG_CONTROL_FILE))); + + elog(LOG, "WriteControlFile ControlFile->data_checksum_version = %d ControlFile->checkPointCopy.data_checksum_version = %d", + ControlFile->data_checksum_version, ControlFile->checkPointCopy.data_checksum_version); } static void @@ -4665,6 +4668,9 @@ ReadControlFile(void) elog(LOG, "ReadControlFile checkpoint %X/%08X redo %X/%08X", LSN_FORMAT_ARGS(ControlFile->checkPoint), LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo)); + + elog(LOG, "ReadControlFile ControlFile->data_checksum_version = %d 
ControlFile->checkPointCopy.data_checksum_version = %d", + ControlFile->data_checksum_version, ControlFile->checkPointCopy.data_checksum_version); } /* @@ -5569,6 +5575,9 @@ XLOGShmemInit(void) /* Use the checksum info from control file */ XLogCtl->data_checksum_version = ControlFile->data_checksum_version; + elog(LOG, "XLOGShmemInit ControlFile->data_checksum_version = %d ControlFile->checkPointCopy.data_checksum_version = %d", + ControlFile->data_checksum_version, ControlFile->checkPointCopy.data_checksum_version); + SetLocalDataChecksumVersion(XLogCtl->data_checksum_version); SpinLockInit(&XLogCtl->Insert.insertpos_lck); @@ -7338,7 +7347,7 @@ LogCheckpointEnd(bool restartpoint) "%d removed, %d recycled; write=%ld.%03d s, " "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, " "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, " - "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X", + "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X, checksums=%d (%d)", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_slru_written, @@ -7354,7 +7363,9 @@ LogCheckpointEnd(bool restartpoint) (int) (PrevCheckPointDistance / 1024.0), (int) (CheckPointDistanceEstimate / 1024.0), LSN_FORMAT_ARGS(ControlFile->checkPoint), - LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo)))); + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), + ControlFile->data_checksum_version, + ControlFile->checkPointCopy.data_checksum_version))); else ereport(LOG, (errmsg("checkpoint complete: wrote %d buffers (%.1f%%), " @@ -7362,7 +7373,7 @@ LogCheckpointEnd(bool restartpoint) "%d removed, %d recycled; write=%ld.%03d s, " "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, " "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, " - "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X", + "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X, checksums=%d (%d)", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, 
CheckpointStats.ckpt_slru_written, @@ -7378,7 +7389,9 @@ LogCheckpointEnd(bool restartpoint) (int) (PrevCheckPointDistance / 1024.0), (int) (CheckPointDistanceEstimate / 1024.0), LSN_FORMAT_ARGS(ControlFile->checkPoint), - LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo)))); + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), + ControlFile->data_checksum_version, + ControlFile->checkPointCopy.data_checksum_version))); } /* @@ -7872,6 +7885,9 @@ CreateCheckPoint(int flags) /* make sure we start with the checksum version as of the checkpoint */ ControlFile->data_checksum_version = checkPoint.data_checksum_version; + elog(LOG, "CreateCheckPoint ControlFile->data_checksum_version = %d ControlFile->checkPointCopy.data_checksum_version = %d", + ControlFile->data_checksum_version, ControlFile->checkPointCopy.data_checksum_version); + /* * Persist unloggedLSN value. It's reset on crash recovery, so this goes * unused on non-shutdown checkpoints, but seems useful to store it always @@ -8019,6 +8035,9 @@ CreateEndOfRecoveryRecord(void) /* start with the latest checksum version (as of the end of recovery) */ ControlFile->data_checksum_version = XLogCtl->data_checksum_version; + elog(LOG, "CreateEndOfRecoveryRecord ControlFile->data_checksum_version = %d ControlFile->checkPointCopy.data_checksum_version = %d", + ControlFile->data_checksum_version, ControlFile->checkPointCopy.data_checksum_version); + UpdateControlFile(); LWLockRelease(ControlFileLock); @@ -8364,6 +8383,9 @@ CreateRestartPoint(int flags) /* we shall start with the latest checksum version */ ControlFile->data_checksum_version = lastCheckPoint.data_checksum_version; + elog(LOG, "CreateRestartPoint ControlFile->data_checksum_version = %d ControlFile->checkPointCopy.data_checksum_version = %d", + ControlFile->data_checksum_version, ControlFile->checkPointCopy.data_checksum_version); + UpdateControlFile(); } LWLockRelease(ControlFileLock);