diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile
index 1b7a63cbaa40..1f2fec95de53 100644
--- a/contrib/amcheck/Makefile
+++ b/contrib/amcheck/Makefile
@@ -4,16 +4,17 @@ MODULE_big = amcheck
 OBJS = \
 	$(WIN32RES) \
 	verify_common.o \
 	verify_gin.o \
+	verify_gist.o \
 	verify_heapam.o \
 	verify_nbtree.o
 
 EXTENSION = amcheck
 DATA = amcheck--1.2--1.3.sql amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql \
-	amcheck--1.3--1.4.sql amcheck--1.4--1.5.sql
+	amcheck--1.3--1.4.sql amcheck--1.4--1.5.sql amcheck--1.5--1.6.sql
 PGFILEDESC = "amcheck - function for verifying relation integrity"
 
-REGRESS = check check_btree check_gin check_heap
+REGRESS = check check_btree check_gin check_gist check_heap
 
 EXTRA_INSTALL = contrib/pg_walinspect
 TAP_TESTS = 1
diff --git a/contrib/amcheck/amcheck--1.5--1.6.sql b/contrib/amcheck/amcheck--1.5--1.6.sql
new file mode 100644
index 000000000000..a6a1debff12c
--- /dev/null
+++ b/contrib/amcheck/amcheck--1.5--1.6.sql
@@ -0,0 +1,14 @@
+/* contrib/amcheck/amcheck--1.5--1.6.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.6'" to load this file. \quit
+
+
+-- gist_index_check()
+--
+CREATE FUNCTION gist_index_check(index regclass, heapallindexed boolean)
+RETURNS VOID
+AS 'MODULE_PATHNAME', 'gist_index_check'
+LANGUAGE C STRICT;
+
+REVOKE ALL ON FUNCTION gist_index_check(regclass,boolean) FROM PUBLIC;
diff --git a/contrib/amcheck/amcheck.control b/contrib/amcheck/amcheck.control
index c8ba6d7c9bc3..2f329ef2cf49 100644
--- a/contrib/amcheck/amcheck.control
+++ b/contrib/amcheck/amcheck.control
@@ -1,5 +1,5 @@
 # amcheck extension
 comment = 'functions for verifying relation integrity'
-default_version = '1.5'
+default_version = '1.6'
 module_pathname = '$libdir/amcheck'
 relocatable = true
diff --git a/contrib/amcheck/expected/check_gist.out b/contrib/amcheck/expected/check_gist.out
new file mode 100644
index 000000000000..cbc3e27e6793
--- /dev/null
+++ b/contrib/amcheck/expected/check_gist.out
@@ -0,0 +1,145 @@
+SELECT setseed(1);
+ setseed 
+---------
+ 
+(1 row)
+
+-- Test that index built with bulk load is correct
+CREATE TABLE gist_check AS SELECT point(random(),s) c, random() p FROM generate_series(1,10000) s;
+CREATE INDEX gist_check_idx1 ON gist_check USING gist(c);
+CREATE INDEX gist_check_idx2 ON gist_check USING gist(c) INCLUDE(p);
+SELECT gist_index_check('gist_check_idx1', false);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+SELECT gist_index_check('gist_check_idx2', false);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+SELECT gist_index_check('gist_check_idx1', true);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+SELECT gist_index_check('gist_check_idx2', true);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+-- Test that index is correct after inserts
+INSERT INTO gist_check SELECT point(random(),s) c, random() p FROM generate_series(1,10000) s;
+SELECT gist_index_check('gist_check_idx1', false);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+SELECT gist_index_check('gist_check_idx2', false);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+SELECT gist_index_check('gist_check_idx1', true);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+SELECT gist_index_check('gist_check_idx2', true);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+-- Test that index is correct after vacuuming
+DELETE FROM gist_check WHERE c[1] < 5000; -- delete clustered data
+DELETE FROM gist_check WHERE c[1]::int % 2 = 0; -- delete scattered data
+-- We need two passes through the index and one global vacuum to actually
+-- reuse pages
+VACUUM gist_check;
+VACUUM;
+SELECT gist_index_check('gist_check_idx1', false);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+SELECT gist_index_check('gist_check_idx2', false);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+SELECT gist_index_check('gist_check_idx1', true);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+SELECT gist_index_check('gist_check_idx2', true);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+-- Test that index is correct after reusing pages
+INSERT INTO gist_check SELECT point(random(),s) c, random() p FROM generate_series(1,10000) s;
+SELECT gist_index_check('gist_check_idx1', false);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+SELECT gist_index_check('gist_check_idx2', false);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+SELECT gist_index_check('gist_check_idx1', true);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+SELECT gist_index_check('gist_check_idx2', true);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
+-- cleanup
+DROP TABLE gist_check;
+--
+-- Similar to BUG #15597
+--
+CREATE TABLE toast_bug(c point,buggy text);
+ALTER TABLE toast_bug ALTER COLUMN buggy SET STORAGE extended;
+CREATE INDEX toasty ON toast_bug USING gist(c) INCLUDE(buggy);
+-- pg_attribute entry for toasty.buggy (the index) will have plain storage:
+UPDATE pg_attribute SET attstorage = 'p'
+WHERE attrelid = 'toasty'::regclass AND attname = 'buggy';
+-- Whereas pg_attribute entry for toast_bug.buggy (the table) still has extended storage:
+SELECT attstorage FROM pg_attribute
+WHERE attrelid = 'toast_bug'::regclass AND attname = 'buggy';
+ attstorage 
+------------
+ x
+(1 row)
+
+-- Insert compressible heap tuple (comfortably exceeds TOAST_TUPLE_THRESHOLD):
+INSERT INTO toast_bug SELECT point(0,0), repeat('a', 2200);
+-- Should not get false positive report of corruption:
+SELECT gist_index_check('toasty', true);
+ gist_index_check 
+------------------
+ 
+(1 row)
+
diff --git a/contrib/amcheck/meson.build b/contrib/amcheck/meson.build
index 1f0c347ed541..13b36b495ed9 100644
--- a/contrib/amcheck/meson.build
+++ b/contrib/amcheck/meson.build
@@ -5,6 +5,7 @@ amcheck_sources = files(
   'verify_gin.c',
+  'verify_gist.c',
   'verify_heapam.c',
   'verify_nbtree.c',
 )
 
 if host_system == 'windows'
@@ -27,6 +28,7 @@ install_data(
   'amcheck--1.2--1.3.sql',
   'amcheck--1.3--1.4.sql',
   'amcheck--1.4--1.5.sql',
+  'amcheck--1.5--1.6.sql',
   kwargs: contrib_data_args,
 )
 
@@ -39,6 +41,7 @@ tests += {
       'check',
       'check_btree',
       'check_gin',
+      'check_gist',
       'check_heap',
     ],
   },
diff --git a/contrib/amcheck/sql/check_gist.sql b/contrib/amcheck/sql/check_gist.sql
new file mode 100644
index 000000000000..37966423b8b8
--- /dev/null
+++ b/contrib/amcheck/sql/check_gist.sql
@@ -0,0 +1,62 @@
+
+SELECT setseed(1);
+
+-- Test that index built with bulk load is correct
+CREATE TABLE gist_check AS SELECT point(random(),s) c, random() p FROM generate_series(1,10000) s;
+CREATE INDEX gist_check_idx1 ON gist_check USING gist(c);
+CREATE INDEX gist_check_idx2 ON gist_check USING gist(c) INCLUDE(p);
+SELECT gist_index_check('gist_check_idx1', false);
+SELECT gist_index_check('gist_check_idx2', false);
+SELECT gist_index_check('gist_check_idx1', true);
+SELECT gist_index_check('gist_check_idx2', true);
+
+-- Test that index is correct after inserts
+INSERT INTO gist_check SELECT point(random(),s) c, random() p FROM generate_series(1,10000) s;
+SELECT gist_index_check('gist_check_idx1', false);
+SELECT gist_index_check('gist_check_idx2', false);
+SELECT gist_index_check('gist_check_idx1', true);
+SELECT gist_index_check('gist_check_idx2', true);
+
+-- Test that index is correct after vacuuming
+DELETE FROM gist_check WHERE c[1] < 5000; -- delete clustered data
+DELETE FROM gist_check WHERE c[1]::int % 2 = 0; -- delete scattered data
+
+-- We need two passes through the index and one global vacuum to actually
+-- reuse pages
+VACUUM gist_check;
+VACUUM;
+
+SELECT gist_index_check('gist_check_idx1', false);
+SELECT gist_index_check('gist_check_idx2', false);
+SELECT gist_index_check('gist_check_idx1', true);
+SELECT gist_index_check('gist_check_idx2', true);
+
+
+-- Test that index is correct after reusing pages
+INSERT INTO gist_check SELECT point(random(),s) c, random() p FROM generate_series(1,10000) s;
+SELECT gist_index_check('gist_check_idx1', false);
+SELECT gist_index_check('gist_check_idx2', false);
+SELECT gist_index_check('gist_check_idx1', true);
+SELECT gist_index_check('gist_check_idx2', true);
+-- cleanup
+DROP TABLE gist_check;
+
+--
+-- Similar to BUG #15597
+--
+CREATE TABLE toast_bug(c point,buggy text);
+ALTER TABLE toast_bug ALTER COLUMN buggy SET STORAGE extended;
+CREATE INDEX toasty ON toast_bug USING gist(c) INCLUDE(buggy);
+
+-- pg_attribute entry for toasty.buggy (the index) will have plain storage:
+UPDATE pg_attribute SET attstorage = 'p'
+WHERE attrelid = 'toasty'::regclass AND attname = 'buggy';
+
+-- Whereas pg_attribute entry for toast_bug.buggy (the table) still has extended storage:
+SELECT attstorage FROM pg_attribute
+WHERE attrelid = 'toast_bug'::regclass AND attname = 'buggy';
+
+-- Insert compressible heap tuple (comfortably exceeds TOAST_TUPLE_THRESHOLD):
+INSERT INTO toast_bug SELECT point(0,0), repeat('a', 2200);
+-- Should not get false positive report of corruption:
+SELECT gist_index_check('toasty', true);
diff --git a/contrib/amcheck/verify_common.c b/contrib/amcheck/verify_common.c
index a31ce06ed99a..e9b4887f65e4 100644
--- a/contrib/amcheck/verify_common.c
+++ b/contrib/amcheck/verify_common.c
@@ -13,6 +13,7 @@
 #include "postgres.h"
 
 #include "access/genam.h"
+#include "access/heaptoast.h"
 #include "access/table.h"
 #include "access/tableam.h"
 #include "verify_common.h"
@@ -189,3 +190,121 @@ index_checkable(Relation rel, Oid am_id)
 
 	return amcheck_index_mainfork_expected(rel);
 }
+
+/*
+ * amcheck_normalize_tuple - produce a version of the given index tuple that
+ * is suitable for fingerprinting, by bringing varlena attributes to a
+ * canonical form: compressed datums are decompressed, datums that would be
+ * compressed under the current storage settings are reformed, and short
+ * varlenas with a 4B header are converted to 1B-header form. Returns the
+ * input tuple itself when no normalization is needed.
+ */
+IndexTuple
+amcheck_normalize_tuple(Relation irel, IndexTuple itup)
+{
+	TupleDesc	tupleDescriptor = RelationGetDescr(irel);
+	Datum		normalized[INDEX_MAX_KEYS];
+	bool		isnull[INDEX_MAX_KEYS];
+	bool		need_free[INDEX_MAX_KEYS];
+	bool		formnewtup = false;
+	IndexTuple	reformed;
+	int			i;
+
+	/* Easy case: It's immediately clear that tuple has no varlena datums */
+	if (!IndexTupleHasVarwidths(itup))
+		return itup;
+
+	for (i = 0; i < tupleDescriptor->natts; i++)
+	{
+		Form_pg_attribute att;
+
+		att = TupleDescAttr(tupleDescriptor, i);
+
+		/* Assume untoasted/already normalized datum initially */
+		need_free[i] = false;
+		normalized[i] = index_getattr(itup, att->attnum,
+									  tupleDescriptor,
+									  &isnull[i]);
+		if (att->attbyval || att->attlen != -1 || isnull[i])
+			continue;
+
+		/*
+		 * Callers always pass a tuple that could safely be inserted into the
+		 * index without further processing, so an external varlena header
+		 * should never be encountered here
+		 */
+		if (VARATT_IS_EXTERNAL(DatumGetPointer(normalized[i])))
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("external varlena datum in tuple that references heap row (%u,%u) in index \"%s\"",
+							ItemPointerGetBlockNumber(&(itup->t_tid)),
+							ItemPointerGetOffsetNumber(&(itup->t_tid)),
+							RelationGetRelationName(irel))));
+		else if (!VARATT_IS_COMPRESSED(DatumGetPointer(normalized[i])) &&
+				 VARSIZE(DatumGetPointer(normalized[i])) > TOAST_INDEX_TARGET &&
+				 (att->attstorage == TYPSTORAGE_EXTENDED ||
+				  att->attstorage == TYPSTORAGE_MAIN))
+		{
+			/*
+			 * This value will be compressed by index_form_tuple() with the
+			 * current storage settings. We may be here because this tuple
+			 * was formed with different storage settings. So, force forming.
+			 */
+			formnewtup = true;
+		}
+		else if (VARATT_IS_COMPRESSED(DatumGetPointer(normalized[i])))
+		{
+			formnewtup = true;
+			normalized[i] = PointerGetDatum(PG_DETOAST_DATUM(normalized[i]));
+			need_free[i] = true;
+		}
+
+		/*
+		 * Short tuples may have 1B or 4B header. Convert 4B header of short
+		 * tuples to 1B
+		 */
+		else if (VARATT_CAN_MAKE_SHORT(DatumGetPointer(normalized[i])))
+		{
+			/* convert to short varlena */
+			Size		len = VARATT_CONVERTED_SHORT_SIZE(DatumGetPointer(normalized[i]));
+			char	   *data = palloc(len);
+
+			SET_VARSIZE_SHORT(data, len);
+			memcpy(data + 1, VARDATA(DatumGetPointer(normalized[i])), len - 1);
+
+			formnewtup = true;
+			normalized[i] = PointerGetDatum(data);
+			need_free[i] = true;
+		}
+	}
+
+	/*
+	 * Easier case: Tuple has varlena datums, none of which are compressed or
+	 * short with 4B header
+	 */
+	if (!formnewtup)
+		return itup;
+
+	/*
+	 * Hard case: Tuple had compressed varlena datums that necessitate
+	 * creating a normalized version of the tuple from uncompressed input
+	 * datums. This is rather naive, but shouldn't be necessary too often.
+	 *
+	 * In the heap, tuples may contain short varlena datums with both 1B
+	 * headers and 4B headers. But the corresponding index tuple should always
+	 * have such varlenas with 1B headers. So, if there is a short varlena
+	 * with a 4B header, we need to convert it for fingerprinting.
+	 *
+	 * Note that we rely on deterministic index_form_tuple() TOAST compression
+	 * of normalized input.
+	 */
+	reformed = index_form_tuple(tupleDescriptor, normalized, isnull);
+	reformed->t_tid = itup->t_tid;
+
+	/* Cannot leak memory here */
+	for (i = 0; i < tupleDescriptor->natts; i++)
+		if (need_free[i])
+			pfree(DatumGetPointer(normalized[i]));
+
+	return reformed;
+}
diff --git a/contrib/amcheck/verify_common.h b/contrib/amcheck/verify_common.h
index 3fa63d2121ab..ffe0d30beb36 100644
--- a/contrib/amcheck/verify_common.h
+++ b/contrib/amcheck/verify_common.h
@@ -26,3 +26,5 @@ extern void amcheck_lock_relation_and_check(Oid indrelid,
 											Oid am_id,
 											IndexDoCheckCallback check,
 											LOCKMODE lockmode, void *state);
+
+extern IndexTuple amcheck_normalize_tuple(Relation irel, IndexTuple itup);
diff --git a/contrib/amcheck/verify_gist.c b/contrib/amcheck/verify_gist.c
new file mode 100644
index 000000000000..c15cd6ab5564
--- /dev/null
+++ b/contrib/amcheck/verify_gist.c
@@ -0,0 +1,680 @@
+/*-------------------------------------------------------------------------
+ *
+ * verify_gist.c
+ *		Verifies the integrity of GiST indexes based on invariants.
+ *
+ * Verification checks that all paths in the GiST graph contain
+ * consistent keys: tuples on parent pages consistently include tuples
+ * from child pages. Verification also checks graph invariants: an
+ * internal page must have at least one downlink, and an internal page
+ * can reference either only leaf pages or only internal pages.
+ *
+ * Copyright (c) 2017-2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		contrib/amcheck/verify_gist.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/gist_private.h"
+#include "access/heaptoast.h"
+#include "access/tableam.h"
+#include "catalog/index.h"
+#include "catalog/pg_am.h"
+#include "common/pg_prng.h"
+#include "lib/bloomfilter.h"
+#include "utils/memutils.h"
+#include "verify_common.h"
+
+
+/*
+ * GistScanItem represents one item of the depth-first scan of a GiST index.
+ */
+typedef struct GistScanItem
+{
+	int			depth;
+
+	/* Referenced block number to check next */
+	BlockNumber blkno;
+
+	/*
+	 * Correctness of this parent tuple will be checked against the contents
+	 * of the referenced page. This tuple will be NULL for the root block.
+	 */
+	IndexTuple	parenttup;
+
+	/*
+	 * LSN to handle concurrent scans of the page. It's necessary to avoid
+	 * missing some subtrees from a page that was split just before we read
+	 * it.
+	 */
+	XLogRecPtr	parentlsn;
+
+	/*
+	 * Reference to the parent page, for re-locking in case parent-child
+	 * tuple discrepancies are found.
+	 */
+	BlockNumber parentblk;
+
+	/* Pointer to the next stack item. */
+	struct GistScanItem *next;
+} GistScanItem;
+
+typedef struct GistCheckState
+{
+	/* GiST state */
+	GISTSTATE  *state;
+	/* Bloom filter fingerprints index tuples */
+	bloom_filter *filter;
+
+	Snapshot	snapshot;
+	Relation	rel;
+	Relation	heaprel;
+
+	/* Debug counter for reporting percentage of work already done */
+	int64		heaptuplespresent;
+
+	/* progress reporting stuff */
+	BlockNumber totalblocks;
+	BlockNumber reportedblocks;
+	BlockNumber scannedblocks;
+	BlockNumber deltablocks;
+
+	int			leafdepth;
+} GistCheckState;
+
+PG_FUNCTION_INFO_V1(gist_index_check);
+
+static void giststate_init_heapallindexed(Relation rel, GistCheckState * result);
+static void gist_check_parent_keys_consistency(Relation rel, Relation heaprel,
+											   void *callback_state, bool readonly);
+static void gist_check_page(GistCheckState * check_state, GistScanItem * stack,
+							Page page, bool heapallindexed,
+							BufferAccessStrategy strategy);
+static void check_index_page(Relation rel, Buffer buffer, BlockNumber blockNo);
+static IndexTuple gist_refind_parent(Relation rel, BlockNumber parentblkno,
+									 BlockNumber childblkno,
+									 BufferAccessStrategy strategy);
+static ItemId PageGetItemIdCareful(Relation rel, BlockNumber block,
+								   Page page, OffsetNumber offset);
+static void gist_tuple_present_callback(Relation index, ItemPointer tid,
+										Datum *values, bool *isnull,
+										bool tupleIsAlive, void *checkstate);
+static IndexTuple gistFormNormalizedTuple(GistCheckState *giststate,
+										  IndexTuple itup);
+
+/*
+ * gist_index_check(index regclass, heapallindexed boolean)
+ *
+ * Verify integrity of GiST index.
+ *
+ * Acquires AccessShareLock on heap & index relations.
+ */
+Datum
+gist_index_check(PG_FUNCTION_ARGS)
+{
+	Oid			indrelid = PG_GETARG_OID(0);
+	bool		heapallindexed = PG_GETARG_BOOL(1);
+
+	amcheck_lock_relation_and_check(indrelid,
+									GIST_AM_OID,
+									gist_check_parent_keys_consistency,
+									AccessShareLock,
+									&heapallindexed);
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * Initialize the parts of GistCheckState needed for the heapallindexed
+ * check: the Bloom filter and the MVCC snapshot.
+ */
+static void
+giststate_init_heapallindexed(Relation rel, GistCheckState * result)
+{
+	int64		total_pages;
+	int64		total_elems;
+	uint64		seed;
+
+	/*
+	 * Size Bloom filter based on estimated number of tuples in index. This
+	 * logic is similar to B-tree, see verify_nbtree.c.
+	 */
+	total_pages = result->totalblocks;
+	total_elems = Max(total_pages * (MaxOffsetNumber / 5),
+					  (int64) rel->rd_rel->reltuples);
+	seed = pg_prng_uint64(&pg_global_prng_state);
+	result->filter = bloom_create(total_elems, maintenance_work_mem, seed);
+
+	result->snapshot = RegisterSnapshot(GetTransactionSnapshot());
+
+	/*
+	 * GetTransactionSnapshot() always acquires a new MVCC snapshot in READ
+	 * COMMITTED mode. A new snapshot is guaranteed to have all the entries
+	 * it requires in the index.
+	 *
+	 * We must defend against the possibility that an old xact snapshot was
+	 * returned at higher isolation levels when that snapshot is not safe for
+	 * index scans of the target index. This is possible when the snapshot
+	 * sees tuples that are before the index's indcheckxmin horizon. Throwing
+	 * an error here should be very rare. It doesn't seem worth using a
+	 * secondary snapshot to avoid this.
+	 */
+	if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin &&
+		!TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data),
+							   result->snapshot->xmin))
+		ereport(ERROR,
+				(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+				 errmsg("index \"%s\" cannot be verified using transaction snapshot",
+						RelationGetRelationName(rel))));
+}
+
+/*
+ * Main entry point for GiST check.
+ *
+ * This function verifies that tuples of internal pages cover all
+ * the key space of the tuples on leaf pages. To do this we invoke
+ * gist_check_page() for every page in the index.
+ *
+ * This check allocates a memory context and scans through the GiST
+ * graph. The scan is a depth-first search using a stack of
+ * GistScanItems. Initially this stack contains only the root block
+ * number. On each iteration the top block number is replaced by the
+ * referenced block numbers.
+ *
+ * gist_check_page() in turn takes every tuple and tries to adjust it
+ * using the tuples on the referenced child page. A parent GiST tuple
+ * should never require any adjustment.
+ */
+static void
+gist_check_parent_keys_consistency(Relation rel, Relation heaprel,
+								   void *callback_state, bool readonly)
+{
+	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
+	GistScanItem *stack;
+	MemoryContext mctx;
+	MemoryContext oldcontext;
+	GISTSTATE  *state;
+	bool		heapallindexed = *((bool *) callback_state);
+	GistCheckState *check_state = palloc0(sizeof(GistCheckState));
+
+	mctx = AllocSetContextCreate(CurrentMemoryContext,
+								 "amcheck context",
+								 ALLOCSET_DEFAULT_SIZES);
+	oldcontext = MemoryContextSwitchTo(mctx);
+
+	state = initGISTstate(rel);
+
+	check_state->state = state;
+	check_state->rel = rel;
+	check_state->heaprel = heaprel;
+
+	/*
+	 * We don't know the height of the tree yet, but as soon as we encounter
+	 * a leaf page, we will set 'leafdepth' to its depth.
+	 */
+	check_state->leafdepth = -1;
+
+	check_state->totalblocks = RelationGetNumberOfBlocks(rel);
+	/* report every 100 blocks or 5%, whichever is bigger */
+	check_state->deltablocks = Max(check_state->totalblocks / 20, 100);
+
+	if (heapallindexed)
+		giststate_init_heapallindexed(rel, check_state);
+
+	/* Start the scan at the root page */
+	stack = (GistScanItem *) palloc0(sizeof(GistScanItem));
+	stack->depth = 0;
+	stack->parenttup = NULL;
+	stack->parentblk = InvalidBlockNumber;
+	stack->parentlsn = InvalidXLogRecPtr;
+	stack->blkno = GIST_ROOT_BLKNO;
+
+	/*
+	 * This GiST scan is effectively an "old" VACUUM version before commit
+	 * fe280694d, which introduced physical-order scanning.
+	 */
+	while (stack)
+	{
+		GistScanItem *stack_next;
+		Buffer		buffer;
+		Page		page;
+		XLogRecPtr	lsn;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* Report progress */
+		if (check_state->scannedblocks > check_state->reportedblocks +
+			check_state->deltablocks)
+		{
+			elog(DEBUG1, "verified %u blocks of approximately %u total",
+				 check_state->scannedblocks, check_state->totalblocks);
+			check_state->reportedblocks = check_state->scannedblocks;
+		}
+		check_state->scannedblocks++;
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno,
+									RBM_NORMAL, strategy);
+		LockBuffer(buffer, GIST_SHARE);
+		page = (Page) BufferGetPage(buffer);
+		lsn = BufferGetLSNAtomic(buffer);
+
+		/* Do basic sanity checks on the page headers */
+		check_index_page(rel, buffer, stack->blkno);
+
+		/*
+		 * It's possible that the page was split since we looked at the
+		 * parent, in which case we would have missed the downlink of the
+		 * right sibling when we scanned the parent. If so, add the right
+		 * sibling to the stack now.
+		 */
+		if (GistFollowRight(page) || stack->parentlsn < GistPageGetNSN(page))
+		{
+			/* split page detected, install right link to the stack */
+			GistScanItem *ptr = (GistScanItem *) palloc(sizeof(GistScanItem));
+
+			ptr->depth = stack->depth;
+			ptr->parenttup = CopyIndexTuple(stack->parenttup);
+			ptr->parentblk = stack->parentblk;
+			ptr->parentlsn = stack->parentlsn;
+			ptr->blkno = GistPageGetOpaque(page)->rightlink;
+			ptr->next = stack->next;
+			stack->next = ptr;
+		}
+
+		gist_check_page(check_state, stack, page, heapallindexed, strategy);
+
+		if (!GistPageIsLeaf(page))
+		{
+			OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+
+			for (OffsetNumber i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+			{
+				/* Internal page, so recurse to the child */
+				GistScanItem *ptr;
+				ItemId		iid = PageGetItemIdCareful(rel, stack->blkno, page, i);
+				IndexTuple	idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+				ptr = (GistScanItem *) palloc(sizeof(GistScanItem));
+				ptr->depth = stack->depth + 1;
+				ptr->parenttup = CopyIndexTuple(idxtuple);
+				ptr->parentblk = stack->blkno;
+				ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+				ptr->parentlsn = lsn;
+				ptr->next = stack->next;
+				stack->next = ptr;
+			}
+		}
+
+		LockBuffer(buffer, GIST_UNLOCK);
+		ReleaseBuffer(buffer);
+
+		/* Step to the next item on the stack */
+		stack_next = stack->next;
+		if (stack->parenttup)
+			pfree(stack->parenttup);
+		pfree(stack);
+		stack = stack_next;
+	}
+
+	if (heapallindexed)
+	{
+		IndexInfo  *indexinfo = BuildIndexInfo(rel);
+		TableScanDesc scan;
+
+		scan = table_beginscan_strat(heaprel,	/* relation */
+									 check_state->snapshot, /* snapshot */
+									 0, /* number of keys */
+									 NULL,	/* scan key */
+									 true,	/* buffer access strategy OK */
+									 true); /* syncscan OK? */
+
+		/*
+		 * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY.
+		 */
+		indexinfo->ii_Concurrent = true;
+
+		indexinfo->ii_Unique = false;
+		indexinfo->ii_ExclusionOps = NULL;
+		indexinfo->ii_ExclusionProcs = NULL;
+		indexinfo->ii_ExclusionStrats = NULL;
+
+		elog(DEBUG1, "verifying that tuples from index \"%s\" are present in \"%s\"",
+			 RelationGetRelationName(rel),
+			 RelationGetRelationName(heaprel));
+
+		table_index_build_scan(heaprel, rel, indexinfo, true, false,
+							   gist_tuple_present_callback, (void *) check_state, scan);
+
+		ereport(DEBUG1,
+				(errmsg_internal("finished verifying presence of " INT64_FORMAT " tuples from table \"%s\" with bitset %.2f%% set",
+								 check_state->heaptuplespresent,
+								 RelationGetRelationName(heaprel),
+								 100.0 * bloom_prop_bits_set(check_state->filter))));
+
+		UnregisterSnapshot(check_state->snapshot);
+		bloom_free(check_state->filter);
+	}
+
+	MemoryContextSwitchTo(oldcontext);
+	MemoryContextDelete(mctx);
+	pfree(check_state);
+}
+
+/*
+ * Check a single index page: verify that the tree has the same height in
+ * all branches, that each tuple looks valid, and that each parent tuple
+ * covers the keyspace of the referenced child page. For leaf pages,
+ * optionally fingerprint the tuples for the heapallindexed check.
+ */
+static void
+gist_check_page(GistCheckState * check_state, GistScanItem * stack,
+				Page page, bool heapallindexed, BufferAccessStrategy strategy)
+{
+	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+
+	/* Check that the tree has the same height in all branches */
+	if (GistPageIsLeaf(page))
+	{
+		if (check_state->leafdepth == -1)
+			check_state->leafdepth = stack->depth;
+		else if (stack->depth != check_state->leafdepth)
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("index \"%s\": internal pages traversal encountered leaf page unexpectedly on block %u",
+							RelationGetRelationName(check_state->rel), stack->blkno)));
+	}
+
+	/*
+	 * Check that each tuple looks valid, and is consistent with the downlink
+	 * we followed when we stepped on this page.
+	 */
+	for (OffsetNumber i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+	{
+		ItemId		iid = PageGetItemIdCareful(check_state->rel, stack->blkno, page, i);
+		IndexTuple	idxtuple = (IndexTuple) PageGetItem(page, iid);
+		IndexTuple	tmpTuple = NULL;
+
+		/*
+		 * Check that it's not a leftover invalid tuple from pre-9.1. See
+		 * also gistdoinsert() and gistbulkdelete() handling of such tuples.
+		 * We do consider it an error here.
+		 */
+		if (GistTupleIsInvalid(idxtuple))
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("index \"%s\" contains an inner tuple marked as invalid, block %u, offset %u",
+							RelationGetRelationName(check_state->rel), stack->blkno, i),
+					 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
+					 errhint("Please REINDEX it.")));
+
+		if (MAXALIGN(ItemIdGetLength(iid)) != MAXALIGN(IndexTupleSize(idxtuple)))
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("index \"%s\" has inconsistent tuple sizes, block %u, offset %u",
+							RelationGetRelationName(check_state->rel), stack->blkno, i)));
+
+		/*
+		 * Check if this tuple is consistent with the downlink in the parent.
+		 */
+		if (stack->parenttup)
+			tmpTuple = gistgetadjusted(check_state->rel, stack->parenttup, idxtuple, check_state->state);
+
+		if (tmpTuple)
+		{
+			/*
+			 * There was a discrepancy between parent and child tuples. We
+			 * need to verify that it is not a result of a concurrent call of
+			 * gistplacetopage(). So, lock the parent and try to find the
+			 * downlink for the current page. It may be missing due to a
+			 * concurrent page split; that is OK.
+			 *
+			 * Note that while we acquire the parent tuple now, we hold locks
+			 * on both parent and child buffers. Thus the parent tuple must
+			 * include the keyspace of the child.
+			 */
+			pfree(tmpTuple);
+			pfree(stack->parenttup);
+			stack->parenttup = gist_refind_parent(check_state->rel, stack->parentblk,
+												  stack->blkno, strategy);
+
+			/* Re-check the discrepancy against the re-found parent tuple */
+			if (!stack->parenttup)
+				elog(NOTICE, "unable to find parent tuple for block %u on block %u due to concurrent split",
+					 stack->blkno, stack->parentblk);
+			else if (gistgetadjusted(check_state->rel, stack->parenttup, idxtuple, check_state->state))
+				ereport(ERROR,
+						(errcode(ERRCODE_INDEX_CORRUPTED),
+						 errmsg("index \"%s\" has inconsistent records on page %u offset %u",
+								RelationGetRelationName(check_state->rel), stack->blkno, i)));
+			else
+			{
+				/*
+				 * But now it is properly adjusted - nothing to do here.
+				 */
+			}
+		}
+
+		if (GistPageIsLeaf(page))
+		{
+			if (heapallindexed)
+			{
+				IndexTuple	norm;
+
+				norm = gistFormNormalizedTuple(check_state, idxtuple);
+
+				bloom_add_element(check_state->filter,
+								  (unsigned char *) norm,
+								  IndexTupleSize(norm));
+
+				/* Be tidy */
+				if (norm != idxtuple)
+					pfree(norm);
+			}
+		}
+		else
+		{
+			OffsetNumber off = ItemPointerGetOffsetNumber(&(idxtuple->t_tid));
+
+			if (off != TUPLE_IS_VALID)
+				ereport(ERROR,
+						(errcode(ERRCODE_INDEX_CORRUPTED),
+						 errmsg("index \"%s\" on page %u offset %u has item id not pointing to 0xffff, but %hu",
+								RelationGetRelationName(check_state->rel), stack->blkno, i, off)));
+		}
+	}
+}
+
+/*
+ * gistFormNormalizedTuple - analogue of gistFormTuple, but performs
+ * deTOASTing of all included data (for covering indexes). While we do not
+ * expect toasted attributes in normal indexes, this can happen as a result
+ * of manual intervention in the system catalogs. Detoasting of key
+ * attributes is expected to be done by opclass decompression methods, if
+ * the indexed type might be toasted.
+ */
+static IndexTuple
+gistFormNormalizedTuple(GistCheckState *giststate,
+						IndexTuple itup)
+{
+	return amcheck_normalize_tuple(giststate->rel, itup);
+}
+
+static void
+gist_tuple_present_callback(Relation index, ItemPointer tid, Datum *values,
+							bool *isnull, bool tupleIsAlive, void *checkstate)
+{
+	GistCheckState *state = (GistCheckState *) checkstate;
+	IndexTuple	itup,
+				norm;
+	Datum		compatt[INDEX_MAX_KEYS];
+
+	/* Generate a normalized index tuple for fingerprinting */
+	gistCompressValues(state->state, index, values, isnull, true, compatt);
+	itup = index_form_tuple(RelationGetDescr(index), compatt, isnull);
+	itup->t_tid = *tid;
+
+	norm = gistFormNormalizedTuple(state, itup);
+
+	/* Probe Bloom filter -- tuple should be present */
+	if (bloom_lacks_element(state->filter, (unsigned char *) norm,
+							IndexTupleSize(norm)))
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",
+						ItemPointerGetBlockNumber(&(norm->t_tid)),
+						ItemPointerGetOffsetNumber(&(norm->t_tid)),
+						RelationGetRelationName(state->heaprel),
+						RelationGetRelationName(state->rel))));
+
+	state->heaptuplespresent++;
+
+	pfree(itup);
+	/* Be tidy */
+	if (norm != itup)
+		pfree(norm);
+}
+
+/*
+ * check_index_page - verification of basic invariants about GiST page data.
+ * This function does not do any tuple analysis.
+ */
+static void
+check_index_page(Relation rel, Buffer buffer, BlockNumber blockNo)
+{
+	Page		page = BufferGetPage(buffer);
+
+	gistcheckpage(rel, buffer);
+
+	if (GistPageGetOpaque(page)->gist_page_id != GIST_PAGE_ID)
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg("index \"%s\" has corrupted page %u",
+						RelationGetRelationName(rel), blockNo)));
+
+	if (GistPageIsDeleted(page))
+	{
+		if (!GistPageIsLeaf(page))
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("index \"%s\" has deleted internal page %u",
+							RelationGetRelationName(rel), blockNo)));
+		if (PageGetMaxOffsetNumber(page) > InvalidOffsetNumber)
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("index \"%s\" has deleted page %u with tuples",
+							RelationGetRelationName(rel), blockNo)));
+	}
+	else if (PageGetMaxOffsetNumber(page) > MaxIndexTuplesPerPage)
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg("index \"%s\" has page %u whose tuple count exceeds the maximum",
+						RelationGetRelationName(rel), blockNo)));
+}
+
+/*
+ * Try to re-find the downlink pointing to 'childblkno', in 'parentblkno'.
+ *
+ * If found, returns a palloc'd copy of the downlink tuple. Otherwise,
+ * returns NULL.
+ */
+static IndexTuple
+gist_refind_parent(Relation rel,
+				   BlockNumber parentblkno, BlockNumber childblkno,
+				   BufferAccessStrategy strategy)
+{
+	Buffer		parentbuf;
+	Page		parentpage;
+	OffsetNumber parent_maxoff;
+	IndexTuple	result = NULL;
+
+	parentbuf = ReadBufferExtended(rel, MAIN_FORKNUM, parentblkno, RBM_NORMAL,
+								   strategy);
+
+	LockBuffer(parentbuf, GIST_SHARE);
+	parentpage = BufferGetPage(parentbuf);
+
+	if (GistPageIsLeaf(parentpage))
+	{
+		/*
+		 * Currently GiST never deletes internal pages, thus they can never
+		 * become leaf pages.
+		 */
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg("index \"%s\" internal page %u became leaf",
+						RelationGetRelationName(rel), parentblkno)));
+	}
+
+	parent_maxoff = PageGetMaxOffsetNumber(parentpage);
+	for (OffsetNumber o = FirstOffsetNumber; o <= parent_maxoff; o = OffsetNumberNext(o))
+	{
+		ItemId		p_iid = PageGetItemIdCareful(rel, parentblkno, parentpage, o);
+		IndexTuple	itup = (IndexTuple) PageGetItem(parentpage, p_iid);
+
+		if (ItemPointerGetBlockNumber(&(itup->t_tid)) == childblkno)
+		{
+			/*
+			 * Found it! Make a copy and return it while both parent and
+			 * child pages are locked. This guarantees that at this
+			 * particular moment the tuples must be coherent with each other.
+			 */
+			result = CopyIndexTuple(itup);
+			break;
+		}
+	}
+
+	UnlockReleaseBuffer(parentbuf);
+
+	return result;
+}
+
+/*
+ * PageGetItemIdCareful - a version of PageGetItemId() that sanity-checks
+ * the line pointer before it is used, so that corrupted line pointers do
+ * not crash the checker itself.
+ */
+static ItemId
+PageGetItemIdCareful(Relation rel, BlockNumber block, Page page,
+					 OffsetNumber offset)
+{
+	ItemId		itemid = PageGetItemId(page, offset);
+
+	if (ItemIdGetOffset(itemid) + ItemIdGetLength(itemid) >
+		BLCKSZ - MAXALIGN(sizeof(GISTPageOpaqueData)))
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg("line pointer points past end of tuple space in index \"%s\"",
+						RelationGetRelationName(rel)),
+				 errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.",
+									block, offset, ItemIdGetOffset(itemid),
+									ItemIdGetLength(itemid),
+									ItemIdGetFlags(itemid))));
+
+	/*
+	 * Verify that the line pointer isn't LP_REDIRECT or LP_UNUSED, since
+	 * gist never uses either. Verify that the line pointer has storage, too,
+	 * since even LP_DEAD items should.
+	 */
+	if (ItemIdIsRedirected(itemid) || !ItemIdIsUsed(itemid) ||
+		ItemIdGetLength(itemid) == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg("invalid line pointer storage in index \"%s\"",
+						RelationGetRelationName(rel)),
+				 errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.",
+									block, offset, ItemIdGetOffset(itemid),
+									ItemIdGetLength(itemid),
+									ItemIdGetFlags(itemid))));
+
+	return itemid;
+}
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 0949c88983ac..678528f2fd50 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -2859,115 +2859,10 @@ bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values,
 static IndexTuple
 bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
 {
-	TupleDesc	tupleDescriptor = RelationGetDescr(state->rel);
-	Datum		normalized[INDEX_MAX_KEYS];
-	bool		isnull[INDEX_MAX_KEYS];
-	bool		need_free[INDEX_MAX_KEYS];
-	bool		formnewtup = false;
-	IndexTuple	reformed;
-	int			i;
-
 	/* Caller should only pass "logical" non-pivot tuples here */
 	Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup));
 
-	/* Easy case: It's immediately clear that tuple has no varlena datums */
-	if (!IndexTupleHasVarwidths(itup))
-		return itup;
-
-	for (i = 0; i < tupleDescriptor->natts; i++)
-	{
-		Form_pg_attribute att;
-
-		att = TupleDescAttr(tupleDescriptor, i);
-
-		/* Assume untoasted/already normalized datum initially */
-		need_free[i] = false;
-		normalized[i] = index_getattr(itup, att->attnum,
-									  tupleDescriptor,
-									  &isnull[i]);
-		if (att->attbyval || att->attlen != -1 || isnull[i])
-			continue;
-
-		/*
-		 * Callers always pass a tuple that could safely be inserted into the
-		 * index without further processing, so an external varlena header
-		 * should never be encountered here
-		 */
-		if (VARATT_IS_EXTERNAL(DatumGetPointer(normalized[i])))
-			ereport(ERROR,
-					(errcode(ERRCODE_INDEX_CORRUPTED),
-					 errmsg("external varlena datum in tuple that references heap row (%u,%u) in index \"%s\"",
-							ItemPointerGetBlockNumber(&(itup->t_tid)),
-							ItemPointerGetOffsetNumber(&(itup->t_tid)),
-							RelationGetRelationName(state->rel))));
-		else if (!VARATT_IS_COMPRESSED(DatumGetPointer(normalized[i])) &&
-				 VARSIZE(DatumGetPointer(normalized[i])) > TOAST_INDEX_TARGET &&
-				 (att->attstorage == TYPSTORAGE_EXTENDED ||
-				  att->attstorage == TYPSTORAGE_MAIN))
-		{
-			/*
-			 * This value will be compressed by index_form_tuple() with the
-			 * current storage settings. We may be here because this tuple
-			 * was formed with different storage settings. So, force forming.
-			 */
-			formnewtup = true;
-		}
-		else if (VARATT_IS_COMPRESSED(DatumGetPointer(normalized[i])))
-		{
-			formnewtup = true;
-			normalized[i] = PointerGetDatum(PG_DETOAST_DATUM(normalized[i]));
-			need_free[i] = true;
-		}
-
-		/*
-		 * Short tuples may have 1B or 4B header. Convert 4B header of short
-		 * tuples to 1B
-		 */
-		else if (VARATT_CAN_MAKE_SHORT(DatumGetPointer(normalized[i])))
-		{
-			/* convert to short varlena */
-			Size		len = VARATT_CONVERTED_SHORT_SIZE(DatumGetPointer(normalized[i]));
-			char	   *data = palloc(len);
-
-			SET_VARSIZE_SHORT(data, len);
-			memcpy(data + 1, VARDATA(DatumGetPointer(normalized[i])), len - 1);
-
-			formnewtup = true;
-			normalized[i] = PointerGetDatum(data);
-			need_free[i] = true;
-		}
-	}
-
-	/*
-	 * Easier case: Tuple has varlena datums, none of which are compressed or
-	 * short with 4B header
-	 */
-	if (!formnewtup)
-		return itup;
-
-	/*
-	 * Hard case: Tuple had compressed varlena datums that necessitate
-	 * creating normalized version of the tuple from uncompressed input datums
-	 * (normalized input datums). This is rather naive, but shouldn't be
-	 * necessary too often.
-	 *
-	 * In the heap, tuples may contain short varlena datums with both 1B
-	 * header and 4B headers. But the corresponding index tuple should always
-	 * have such varlena's with 1B headers. So, if there is a short varlena
-	 * with 4B header, we need to convert it for fingerprinting.
-	 *
-	 * Note that we rely on deterministic index_form_tuple() TOAST compression
-	 * of normalized input.
-	 */
-	reformed = index_form_tuple(tupleDescriptor, normalized, isnull);
-	reformed->t_tid = itup->t_tid;
-
-	/* Cannot leak memory here */
-	for (i = 0; i < tupleDescriptor->natts; i++)
-		if (need_free[i])
-			pfree(DatumGetPointer(normalized[i]));
-
-	return reformed;
+	return amcheck_normalize_tuple(state->rel, itup);
 }
 
 /*
diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml
index 0aff0a6c8c6f..7e4b6c6f6927 100644
--- a/doc/src/sgml/amcheck.sgml
+++ b/doc/src/sgml/amcheck.sgml
@@ -208,6 +208,27 @@ ORDER BY c.relpages DESC LIMIT 10;
     </para>
    </listitem>
   </varlistentry>
+
+  <varlistentry>
+   <term>
+    <function>gist_index_check(index regclass, heapallindexed boolean) returns void</function>
+    <indexterm>
+     <primary>gist_index_check</primary>
+    </indexterm>
+   </term>
+
+   <listitem>
+    <para>
+     <function>gist_index_check</function> tests that its target GiST index
+     has consistent parent-child tuple relations (no parent tuples require
+     adjustment) and that the page graph respects balanced-tree invariants
+     (internal pages reference either only leaf pages or only internal
+     pages).  When <parameter>heapallindexed</parameter> is
+     <literal>true</literal>, the function also verifies that every heap
+     tuple that should be indexed has a matching index tuple.
+    </para>
+   </listitem>
+  </varlistentry>
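
For illustration, assuming a server built with this patch and a database where the extension has been updated to 1.6, the new check can be exercised as below (table and index names are hypothetical, not part of the patch):

    CREATE EXTENSION IF NOT EXISTS amcheck;
    ALTER EXTENSION amcheck UPDATE TO '1.6';

    CREATE TABLE demo AS
        SELECT point(random(), s) AS c FROM generate_series(1, 1000) s;
    CREATE INDEX demo_gist_idx ON demo USING gist (c);

    -- structural checks only: graph and parent-child invariants
    SELECT gist_index_check('demo_gist_idx', false);
    -- additionally verify that every heap tuple is represented in the index
    SELECT gist_index_check('demo_gist_idx', true);

Passing heapallindexed = true fingerprints every leaf tuple into a Bloom filter and rechecks the fingerprints against a full table scan, so it is correspondingly more expensive than the structural checks alone.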