PostgreSQL Source Code git master
nbtinsert.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "common/int.h"
#include "common/pg_prng.h"
#include "lib/qunique.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
Include dependency graph for nbtinsert.c:

Go to the source code of this file.

Macros

#define BTREE_FASTPATH_MIN_LEVEL   2
 

Functions

static BTStack _bt_search_insert (Relation rel, Relation heaprel, BTInsertState insertstate)
 
static TransactionId _bt_check_unique (Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
 
static OffsetNumber _bt_findinsertloc (Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
 
static void _bt_stepright (Relation rel, Relation heaprel, BTInsertState insertstate, BTStack stack)
 
static void _bt_insertonpg (Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
 
static Buffer _bt_split (Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
 
static void _bt_insert_parent (Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
 
static Buffer _bt_newlevel (Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
 
static bool _bt_pgaddtup (Page page, Size itemsize, const IndexTupleData *itup, OffsetNumber itup_off, bool newfirstdataitem)
 
static void _bt_delete_or_dedup_one_page (Relation rel, Relation heapRel, BTInsertState insertstate, bool simpleonly, bool checkingunique, bool uniquedup, bool indexUnchanged)
 
static void _bt_simpledel_pass (Relation rel, Buffer buffer, Relation heapRel, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, OffsetNumber minoff, OffsetNumber maxoff)
 
static BlockNumber * _bt_deadblocks (Page page, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, int *nblocks)
 
static int _bt_blk_cmp (const void *arg1, const void *arg2)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
 
void _bt_finish_split (Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
 

Macro Definition Documentation

◆ BTREE_FASTPATH_MIN_LEVEL

#define BTREE_FASTPATH_MIN_LEVEL   2

Definition at line 31 of file nbtinsert.c.
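
In the nbtinsert.c source this constant is commented as the minimum tree height for applying the fastpath optimization: _bt_search_insert() only tries the cached rightmost-leaf shortcut when the index has at least this many levels, since a full descent of a shallower tree is already cheap.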

Function Documentation

◆ _bt_blk_cmp()

static int _bt_blk_cmp ( const void *  arg1,
const void *  arg2 
)
inline static

Definition at line 3015 of file nbtinsert.c.

3016{
3017 BlockNumber b1 = *((BlockNumber *) arg1);
3018 BlockNumber b2 = *((BlockNumber *) arg2);
3019
3020 return pg_cmp_u32(b1, b2);
3021}
uint32 BlockNumber
Definition: block.h:31
static int pg_cmp_u32(uint32 a, uint32 b)
Definition: int.h:652

References pg_cmp_u32().

Referenced by _bt_deadblocks(), and _bt_simpledel_pass().
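
The comparator delegates to pg_cmp_u32() rather than returning a plain subtraction because b1 - b2 can wrap around for unsigned block numbers and report the wrong order. A minimal standalone sketch of the same pattern (ordinary C, not PostgreSQL code):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int
cmp_u32(const void *arg1, const void *arg2)
{
    uint32_t a = *(const uint32_t *) arg1;
    uint32_t b = *(const uint32_t *) arg2;

    /* (a > b) - (a < b) yields -1/0/+1 with no wraparound risk */
    return (a > b) - (a < b);
}

int
main(void)
{
    uint32_t blocks[] = {42, 7, 4000000000u, 7};

    qsort(blocks, 4, sizeof(uint32_t), cmp_u32);
    for (int i = 0; i < 4; i++)
        printf("%u\n", (unsigned) blocks[i]);
    return 0;
}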

◆ _bt_check_unique()

static TransactionId _bt_check_unique ( Relation  rel,
BTInsertState  insertstate,
Relation  heapRel,
IndexUniqueCheck  checkUnique,
bool *  is_unique,
uint32 *  speculativeToken 
)
static

Definition at line 409 of file nbtinsert.c.

412{
413 IndexTuple itup = insertstate->itup;
414 IndexTuple curitup = NULL;
415 ItemId curitemid = NULL;
416 BTScanInsert itup_key = insertstate->itup_key;
417 SnapshotData SnapshotDirty;
418 OffsetNumber offset;
419 OffsetNumber maxoff;
420 Page page;
421 BTPageOpaque opaque;
422 Buffer nbuf = InvalidBuffer;
423 bool found = false;
424 bool inposting = false;
425 bool prevalldead = true;
426 int curposti = 0;
427
428 /* Assume unique until we find a duplicate */
429 *is_unique = true;
430
431 InitDirtySnapshot(SnapshotDirty);
432
433 page = BufferGetPage(insertstate->buf);
434 opaque = BTPageGetOpaque(page);
435 maxoff = PageGetMaxOffsetNumber(page);
436
437 /*
438 * Find the first tuple with the same key.
439 *
440 * This also saves the binary search bounds in insertstate. We use them
441 * in the fastpath below, but also in the _bt_findinsertloc() call later.
442 */
443 Assert(!insertstate->bounds_valid);
444 offset = _bt_binsrch_insert(rel, insertstate);
445
446 /*
447 * Scan over all equal tuples, looking for live conflicts.
448 */
449 Assert(!insertstate->bounds_valid || insertstate->low == offset);
450 Assert(!itup_key->anynullkeys);
451 Assert(itup_key->scantid == NULL);
452 for (;;)
453 {
454 /*
455 * Each iteration of the loop processes one heap TID, not one index
456 * tuple. Current offset number for page isn't usually advanced on
457 * iterations that process heap TIDs from posting list tuples.
458 *
459 * "inposting" state is set when _inside_ a posting list --- not when
460 * we're at the start (or end) of a posting list. We advance curposti
461 * at the end of the iteration when inside a posting list tuple. In
462 * general, every loop iteration either advances the page offset or
463 * advances curposti --- an iteration that handles the rightmost/max
464 * heap TID in a posting list finally advances the page offset (and
465 * unsets "inposting").
466 *
467 * Make sure the offset points to an actual index tuple before trying
468 * to examine it...
469 */
470 if (offset <= maxoff)
471 {
472 /*
473 * Fastpath: In most cases, we can use cached search bounds to
474 * limit our consideration to items that are definitely
475 * duplicates. This fastpath doesn't apply when the original page
476 * is empty, or when initial offset is past the end of the
477 * original page, which may indicate that we need to examine a
478 * second or subsequent page.
479 *
480 * Note that this optimization allows us to avoid calling
481 * _bt_compare() directly when there are no duplicates, as long as
482 * the offset where the key will go is not at the end of the page.
483 */
484 if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
485 {
486 Assert(insertstate->bounds_valid);
487 Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
488 Assert(insertstate->low <= insertstate->stricthigh);
489 Assert(_bt_compare(rel, itup_key, page, offset) < 0);
490 break;
491 }
492
493 /*
494 * We can skip items that are already marked killed.
495 *
496 * In the presence of heavy update activity an index may contain
497 * many killed items with the same key; running _bt_compare() on
498 * each killed item gets expensive. Just advance over killed
499 * items as quickly as we can. We only apply _bt_compare() when
500 * we get to a non-killed item. We could reuse the bounds to
501 * avoid _bt_compare() calls for known equal tuples, but it
502 * doesn't seem worth it.
503 */
504 if (!inposting)
505 curitemid = PageGetItemId(page, offset);
506 if (inposting || !ItemIdIsDead(curitemid))
507 {
508 ItemPointerData htid;
509 bool all_dead = false;
510
511 if (!inposting)
512 {
513 /* Plain tuple, or first TID in posting list tuple */
514 if (_bt_compare(rel, itup_key, page, offset) != 0)
515 break; /* we're past all the equal tuples */
516
517 /* Advance curitup */
518 curitup = (IndexTuple) PageGetItem(page, curitemid);
519 Assert(!BTreeTupleIsPivot(curitup));
520 }
521
522 /* okay, we gotta fetch the heap tuple using htid ... */
523 if (!BTreeTupleIsPosting(curitup))
524 {
525 /* ... htid is from simple non-pivot tuple */
526 Assert(!inposting);
527 htid = curitup->t_tid;
528 }
529 else if (!inposting)
530 {
531 /* ... htid is first TID in new posting list */
532 inposting = true;
533 prevalldead = true;
534 curposti = 0;
535 htid = *BTreeTupleGetPostingN(curitup, 0);
536 }
537 else
538 {
539 /* ... htid is second or subsequent TID in posting list */
540 Assert(curposti > 0);
541 htid = *BTreeTupleGetPostingN(curitup, curposti);
542 }
543
544 /*
545 * If we are doing a recheck, we expect to find the tuple we
546 * are rechecking. It's not a duplicate, but we have to keep
547 * scanning.
548 */
549 if (checkUnique == UNIQUE_CHECK_EXISTING &&
550 ItemPointerCompare(&htid, &itup->t_tid) == 0)
551 {
552 found = true;
553 }
554
555 /*
556 * Check if there's any table tuples for this index entry
557 * satisfying SnapshotDirty. This is necessary because for AMs
558 * with optimizations like heap's HOT, we have just a single
559 * index entry for the entire chain.
560 */
561 else if (table_index_fetch_tuple_check(heapRel, &htid,
562 &SnapshotDirty,
563 &all_dead))
564 {
565 TransactionId xwait;
566
567 /*
568 * It is a duplicate. If we are only doing a partial
569 * check, then don't bother checking if the tuple is being
570 * updated in another transaction. Just return the fact
571 * that it is a potential conflict and leave the full
572 * check till later. Don't invalidate binary search
573 * bounds.
574 */
575 if (checkUnique == UNIQUE_CHECK_PARTIAL)
576 {
577 if (nbuf != InvalidBuffer)
578 _bt_relbuf(rel, nbuf);
579 *is_unique = false;
580 return InvalidTransactionId;
581 }
582
583 /*
584 * If this tuple is being updated by other transaction
585 * then we have to wait for its commit/abort.
586 */
587 xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
588 SnapshotDirty.xmin : SnapshotDirty.xmax;
589
590 if (TransactionIdIsValid(xwait))
591 {
592 if (nbuf != InvalidBuffer)
593 _bt_relbuf(rel, nbuf);
594 /* Tell _bt_doinsert to wait... */
595 *speculativeToken = SnapshotDirty.speculativeToken;
596 /* Caller releases lock on buf immediately */
597 insertstate->bounds_valid = false;
598 return xwait;
599 }
600
601 /*
602 * Otherwise we have a definite conflict. But before
603 * complaining, look to see if the tuple we want to insert
604 * is itself now committed dead --- if so, don't complain.
605 * This is a waste of time in normal scenarios but we must
606 * do it to support CREATE INDEX CONCURRENTLY.
607 *
608 * We must follow HOT-chains here because during
609 * concurrent index build, we insert the root TID though
610 * the actual tuple may be somewhere in the HOT-chain.
611 * While following the chain we might not stop at the
612 * exact tuple which triggered the insert, but that's OK
613 * because if we find a live tuple anywhere in this chain,
614 * we have a unique key conflict. The other live tuple is
615 * not part of this chain because it had a different index
616 * entry.
617 */
618 htid = itup->t_tid;
619 if (table_index_fetch_tuple_check(heapRel, &htid,
620 SnapshotSelf, NULL))
621 {
622 /* Normal case --- it's still live */
623 }
624 else
625 {
626 /*
627 * It's been deleted, so no error, and no need to
628 * continue searching
629 */
630 break;
631 }
632
633 /*
634 * Check for a conflict-in as we would if we were going to
635 * write to this page. We aren't actually going to write,
636 * but we want a chance to report SSI conflicts that would
637 * otherwise be masked by this unique constraint
638 * violation.
639 */
640 CheckForSerializableConflictIn(rel, &itup->t_tid,
641 BufferGetBlockNumber(insertstate->buf));
642 /*
643 * This is a definite conflict. Break the tuple down into
644 * datums and report the error. But first, make sure we
645 * release the buffer locks we're holding ---
646 * BuildIndexValueDescription could make catalog accesses,
647 * which in the worst case might touch this same index and
648 * cause deadlocks.
649 */
650 if (nbuf != InvalidBuffer)
651 _bt_relbuf(rel, nbuf);
652 _bt_relbuf(rel, insertstate->buf);
653 insertstate->buf = InvalidBuffer;
654 insertstate->bounds_valid = false;
655
656 {
657 Datum values[INDEX_MAX_KEYS];
658 bool isnull[INDEX_MAX_KEYS];
659 char *key_desc;
660
661 index_deform_tuple(itup, RelationGetDescr(rel),
662 values, isnull);
663
664 key_desc = BuildIndexValueDescription(rel, values,
665 isnull);
666
667 ereport(ERROR,
668 (errcode(ERRCODE_UNIQUE_VIOLATION),
669 errmsg("duplicate key value violates unique constraint \"%s\"",
670 RelationGetRelationName(rel)),
671 key_desc ? errdetail("Key %s already exists.",
672 key_desc) : 0,
673 errtableconstraint(heapRel,
674 RelationGetRelationName(rel))));
675 }
676 }
677 else if (all_dead && (!inposting ||
678 (prevalldead &&
679 curposti == BTreeTupleGetNPosting(curitup) - 1)))
680 {
681 /*
682 * The conflicting tuple (or all HOT chains pointed to by
683 * all posting list TIDs) is dead to everyone, so mark the
684 * index entry killed.
685 */
686 ItemIdMarkDead(curitemid);
687 opaque->btpo_flags |= BTP_HAS_GARBAGE;
688
689 /*
690 * Mark buffer with a dirty hint, since state is not
691 * crucial. Be sure to mark the proper buffer dirty.
692 */
693 if (nbuf != InvalidBuffer)
694 MarkBufferDirtyHint(nbuf, true);
695 else
696 MarkBufferDirtyHint(insertstate->buf, true);
697 }
698
699 /*
700 * Remember if posting list tuple has even a single HOT chain
701 * whose members are not all dead
702 */
703 if (!all_dead && inposting)
704 prevalldead = false;
705 }
706 }
707
708 if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
709 {
710 /* Advance to next TID in same posting list */
711 curposti++;
712 continue;
713 }
714 else if (offset < maxoff)
715 {
716 /* Advance to next tuple */
717 curposti = 0;
718 inposting = false;
719 offset = OffsetNumberNext(offset);
720 }
721 else
722 {
723 int highkeycmp;
724
725 /* If scankey == hikey we gotta check the next page too */
726 if (P_RIGHTMOST(opaque))
727 break;
728 highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
729 Assert(highkeycmp <= 0);
730 if (highkeycmp != 0)
731 break;
732 /* Advance to next non-dead page --- there must be one */
733 for (;;)
734 {
735 BlockNumber nblkno = opaque->btpo_next;
736
737 nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
738 page = BufferGetPage(nbuf);
739 opaque = BTPageGetOpaque(page);
740 if (!P_IGNORE(opaque))
741 break;
742 if (P_RIGHTMOST(opaque))
743 elog(ERROR, "fell off the end of index \"%s\"",
744 RelationGetRelationName(rel));
745 }
746 /* Will also advance to next tuple */
747 curposti = 0;
748 inposting = false;
749 maxoff = PageGetMaxOffsetNumber(page);
750 offset = P_FIRSTDATAKEY(opaque);
751 /* Don't invalidate binary search bounds */
752 }
753 }
754
755 /*
756 * If we are doing a recheck then we should have found the tuple we are
757 * checking. Otherwise there's something very wrong --- probably, the
758 * index is on a non-immutable expression.
759 */
760 if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
761 ereport(ERROR,
762 (errcode(ERRCODE_INTERNAL_ERROR),
763 errmsg("failed to re-find tuple within index \"%s\"",
764 RelationGetRelationName(rel)),
765 errhint("This may be because of a non-immutable index expression."),
766 errtableconstraint(heapRel,
767 RelationGetRelationName(rel))));
768
769 if (nbuf != InvalidBuffer)
770 _bt_relbuf(rel, nbuf);
771
772 return InvalidTransactionId;
773}
static Datum values[MAXATTR]
Definition: bootstrap.c:153
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4224
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5435
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:425
static void * PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:353
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:243
PageData * Page
Definition: bufpage.h:81
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:371
uint32 TransactionId
Definition: c.h:661
int errdetail(const char *fmt,...)
Definition: elog.c:1216
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
char * BuildIndexValueDescription(Relation indexRelation, const Datum *values, const bool *isnull)
Definition: genam.c:178
@ UNIQUE_CHECK_EXISTING
Definition: genam.h:147
@ UNIQUE_CHECK_PARTIAL
Definition: genam.h:146
Assert(PointerIsAligned(start, uint64))
void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:456
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
#define ItemIdMarkDead(itemId)
Definition: itemid.h:179
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition: itemptr.c:51
IndexTupleData * IndexTuple
Definition: itup.h:53
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:481
#define P_HIKEY
Definition: nbtree.h:368
#define BTP_HAS_GARBAGE
Definition: nbtree.h:83
#define BTPageGetOpaque(page)
Definition: nbtree.h:74
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:545
#define BT_READ
Definition: nbtree.h:730
#define P_IGNORE(opaque)
Definition: nbtree.h:226
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:479
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:693
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define INDEX_MAX_KEYS
uint64_t Datum
Definition: postgres.h:70
void CheckForSerializableConflictIn(Relation relation, const ItemPointerData *tid, BlockNumber blkno)
Definition: predicate.c:4336
#define RelationGetDescr(relation)
Definition: rel.h:541
#define RelationGetRelationName(relation)
Definition: rel.h:549
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:6103
#define SnapshotSelf
Definition: snapmgr.h:32
#define InitDirtySnapshot(snapshotdata)
Definition: snapmgr.h:42
OffsetNumber stricthigh
Definition: nbtree.h:836
bool bounds_valid
Definition: nbtree.h:834
OffsetNumber low
Definition: nbtree.h:835
IndexTuple itup
Definition: nbtree.h:822
BTScanInsert itup_key
Definition: nbtree.h:824
BlockNumber btpo_next
Definition: nbtree.h:66
uint16 btpo_flags
Definition: nbtree.h:68
ItemPointer scantid
Definition: nbtree.h:802
bool anynullkeys
Definition: nbtree.h:799
ItemPointerData t_tid
Definition: itup.h:37
TransactionId xmin
Definition: snapshot.h:153
TransactionId xmax
Definition: snapshot.h:154
uint32 speculativeToken
Definition: snapshot.h:189
bool table_index_fetch_tuple_check(Relation rel, ItemPointer tid, Snapshot snapshot, bool *all_dead)
Definition: tableam.c:209
#define InvalidTransactionId
Definition: transam.h:31
#define TransactionIdIsValid(xid)
Definition: transam.h:41

References _bt_binsrch_insert(), _bt_compare(), _bt_relandgetbuf(), _bt_relbuf(), BTScanInsertData::anynullkeys, Assert(), BTInsertStateData::bounds_valid, BT_READ, BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage(), BuildIndexValueDescription(), CheckForSerializableConflictIn(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), if(), index_deform_tuple(), INDEX_MAX_KEYS, InitDirtySnapshot, InvalidBuffer, InvalidTransactionId, ItemIdIsDead, ItemIdMarkDead, ItemPointerCompare(), BTInsertStateData::itup, BTInsertStateData::itup_key, BTInsertStateData::low, MarkBufferDirtyHint(), OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), RelationGetDescr, RelationGetRelationName, BTScanInsertData::scantid, SnapshotSelf, SnapshotData::speculativeToken, BTInsertStateData::stricthigh, IndexTupleData::t_tid, table_index_fetch_tuple_check(), TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_PARTIAL, values, SnapshotData::xmax, and SnapshotData::xmin.

Referenced by _bt_doinsert().
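
The loop invariant described in the comments at lines 454-469 — each iteration consumes exactly one heap TID, advancing either curposti within a posting list tuple or the page offset, never both at once — can be sketched in isolation. A standalone toy version (assumed simplification; real posting lists are sorted arrays of ItemPointerData inside one index tuple):

#include <stdio.h>

typedef struct { int ntids; int tids[4]; } Item;

int
main(void)
{
    Item items[] = {{1, {10}}, {3, {20, 21, 22}}, {1, {30}}};
    int maxoff = 3;             /* number of items on the "page" */
    int offset = 1;             /* page offsets are 1-based */
    int curposti = 0;           /* position within a posting list */

    while (offset <= maxoff)
    {
        Item *cur = &items[offset - 1];

        printf("offset %d, posting position %d, TID %d\n",
               offset, curposti, cur->tids[curposti]);

        if (curposti < cur->ntids - 1)
            curposti++;         /* advance within the posting list */
        else
        {
            curposti = 0;       /* rightmost TID: advance page offset */
            offset++;
        }
    }
    return 0;
}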

◆ _bt_deadblocks()

static BlockNumber * _bt_deadblocks ( Page  page,
OffsetNumber *  deletable,
int  ndeletable,
IndexTuple  newitem,
int *  nblocks 
)
static

Definition at line 2942 of file nbtinsert.c.

2944{
2945 int spacentids,
2946 ntids;
2947 BlockNumber *tidblocks;
2948
2949 /*
2950 * Accumulate each TID's block in array whose initial size has space for
2951 * one table block per LP_DEAD-set tuple (plus space for the newitem table
2952 * block). Array will only need to grow when there are LP_DEAD-marked
2953 * posting list tuples (which is not that common).
2954 */
2955 spacentids = ndeletable + 1;
2956 ntids = 0;
2957 tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
2958
2959 /*
2960 * First add the table block for the incoming newitem. This is the one
2961 * case where simple deletion can visit a table block that doesn't have
2962 * any known deletable items.
2963 */
2964 Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
2965 tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
2966
2967 for (int i = 0; i < ndeletable; i++)
2968 {
2969 ItemId itemid = PageGetItemId(page, deletable[i]);
2970 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2971
2972 Assert(ItemIdIsDead(itemid));
2973
2974 if (!BTreeTupleIsPosting(itup))
2975 {
2976 if (ntids + 1 > spacentids)
2977 {
2978 spacentids *= 2;
2979 tidblocks = (BlockNumber *)
2980 repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2981 }
2982
2983 tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
2984 }
2985 else
2986 {
2987 int nposting = BTreeTupleGetNPosting(itup);
2988
2989 if (ntids + nposting > spacentids)
2990 {
2991 spacentids = Max(spacentids * 2, ntids + nposting);
2992 tidblocks = (BlockNumber *)
2993 repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2994 }
2995
2996 for (int j = 0; j < nposting; j++)
2997 {
2998 ItemPointer tid = BTreeTupleGetPostingN(itup, j);
2999
3000 tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
3001 }
3002 }
3003 }
3004
3005 qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3006 *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3007
3008 return tidblocks;
3009}
#define Max(x, y)
Definition: c.h:1001
int j
Definition: isn.c:78
int i
Definition: isn.c:77
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1610
void * palloc(Size size)
Definition: mcxt.c:1365
static int _bt_blk_cmp(const void *arg1, const void *arg2)
Definition: nbtinsert.c:3015
#define qsort(a, b, c, d)
Definition: port.h:479
static size_t qunique(void *array, size_t elements, size_t width, int(*compare)(const void *, const void *))
Definition: qunique.h:21

References _bt_blk_cmp(), Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), i, ItemIdIsDead, ItemPointerGetBlockNumber(), j, Max, PageGetItem(), PageGetItemId(), palloc(), qsort, qunique(), repalloc(), and IndexTupleData::t_tid.

Referenced by _bt_simpledel_pass().
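
The closing qsort()/qunique() step is a generic sort-then-deduplicate idiom. A standalone sketch with plain ints (qunique() itself lives in lib/qunique.h; unique_ints() below is a hand-rolled stand-in with the same contract, returning the number of distinct elements left at the front of the array):

#include <stdio.h>
#include <stdlib.h>

static int
cmp_int(const void *a, const void *b)
{
    int x = *(const int *) a, y = *(const int *) b;

    return (x > y) - (x < y);
}

static size_t
unique_ints(int *array, size_t n)
{
    size_t i, j;

    if (n <= 1)
        return n;
    for (i = 1, j = 0; i < n; i++)
        if (array[i] != array[j])   /* keep first copy of each run */
            array[++j] = array[i];
    return j + 1;
}

int
main(void)
{
    int blocks[] = {9, 3, 9, 1, 3, 3};
    size_t nblocks;

    qsort(blocks, 6, sizeof(int), cmp_int);
    nblocks = unique_ints(blocks, 6);   /* prints: 1 3 9 */
    for (size_t i = 0; i < nblocks; i++)
        printf("%d ", blocks[i]);
    printf("\n");
    return 0;
}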

◆ _bt_delete_or_dedup_one_page()

static void _bt_delete_or_dedup_one_page ( Relation  rel,
Relation  heapRel,
BTInsertState  insertstate,
bool  simpleonly,
bool  checkingunique,
bool  uniquedup,
bool  indexUnchanged 
)
static

Definition at line 2687 of file nbtinsert.c.

2691{
2692 OffsetNumber deletable[MaxIndexTuplesPerPage];
2693 int ndeletable = 0;
2694 OffsetNumber offnum,
2695 minoff,
2696 maxoff;
2697 Buffer buffer = insertstate->buf;
2698 BTScanInsert itup_key = insertstate->itup_key;
2699 Page page = BufferGetPage(buffer);
2700 BTPageOpaque opaque = BTPageGetOpaque(page);
2701
2702 Assert(P_ISLEAF(opaque));
2703 Assert(simpleonly || itup_key->heapkeyspace);
2704 Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
2705
2706 /*
2707 * Scan over all items to see which ones need to be deleted according to
2708 * LP_DEAD flags. We'll usually manage to delete a few extra items that
2709 * are not marked LP_DEAD in passing. Often the extra items that actually
2710 * end up getting deleted are items that would have had their LP_DEAD bit
2711 * set before long anyway (if we opted not to include them as extras).
2712 */
2713 minoff = P_FIRSTDATAKEY(opaque);
2714 maxoff = PageGetMaxOffsetNumber(page);
2715 for (offnum = minoff;
2716 offnum <= maxoff;
2717 offnum = OffsetNumberNext(offnum))
2718 {
2719 ItemId itemId = PageGetItemId(page, offnum);
2720
2721 if (ItemIdIsDead(itemId))
2722 deletable[ndeletable++] = offnum;
2723 }
2724
2725 if (ndeletable > 0)
2726 {
2727 _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2728 insertstate->itup, minoff, maxoff);
2729 insertstate->bounds_valid = false;
2730
2731 /* Return when a page split has already been avoided */
2732 if (PageGetFreeSpace(page) >= insertstate->itemsz)
2733 return;
2734
2735 /* Might as well assume duplicates (if checkingunique) */
2736 uniquedup = true;
2737 }
2738
2739 /*
2740 * We're done with simple deletion. Return early with callers that only
2741 * call here so that simple deletion can be considered. This includes
2742 * callers that explicitly ask for this and checkingunique callers that
2743 * probably don't have any version churn duplicates on the page.
2744 *
2745 * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2746 * return at this point (or when we go on to try either or both of our
2747 * other strategies and they also fail). We do not bother expending a
2748 * separate write to clear it, however. Caller will definitely clear it
2749 * when it goes on to split the page (note also that the deduplication
2750 * process will clear the flag in passing, just to keep things tidy).
2751 */
2752 if (simpleonly || (checkingunique && !uniquedup))
2753 {
2754 Assert(!indexUnchanged);
2755 return;
2756 }
2757
2758 /* Assume bounds about to be invalidated (this is almost certain now) */
2759 insertstate->bounds_valid = false;
2760
2761 /*
2762 * Perform bottom-up index deletion pass when executor hint indicated that
2763 * incoming item is logically unchanged, or for a unique index that is
2764 * known to have physical duplicates for some other reason. (There is a
2765 * large overlap between these two cases for a unique index. It's worth
2766 * having both triggering conditions in order to apply the optimization in
2767 * the event of successive related INSERT and DELETE statements.)
2768 *
2769 * We'll go on to do a deduplication pass when a bottom-up pass fails to
2770 * delete an acceptable amount of free space (a significant fraction of
2771 * the page, or space for the new item, whichever is greater).
2772 *
2773 * Note: Bottom-up index deletion uses the same equality/equivalence
2774 * routines as deduplication internally. However, it does not merge
2775 * together index tuples, so the same correctness considerations do not
2776 * apply. We deliberately omit an index-is-allequalimage test here.
2777 */
2778 if ((indexUnchanged || uniquedup) &&
2779 _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2780 return;
2781
2782 /* Perform deduplication pass (when enabled and index-is-allequalimage) */
2783 if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
2784 _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
2785 (indexUnchanged || uniquedup));
2786}
Size PageGetFreeSpace(const PageData *page)
Definition: bufpage.c:906
#define MaxIndexTuplesPerPage
Definition: itup.h:181
void _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
Definition: nbtdedup.c:59
bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
Definition: nbtdedup.c:307
static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, OffsetNumber minoff, OffsetNumber maxoff)
Definition: nbtinsert.c:2816
#define BTGetDeduplicateItems(relation)
Definition: nbtree.h:1166
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
bool allequalimage
Definition: nbtree.h:798
bool heapkeyspace
Definition: nbtree.h:797

References _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_simpledel_pass(), BTScanInsertData::allequalimage, Assert(), BTInsertStateData::bounds_valid, BTGetDeduplicateItems, BTPageGetOpaque, BTInsertStateData::buf, BufferGetPage(), BTScanInsertData::heapkeyspace, ItemIdIsDead, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MaxIndexTuplesPerPage, OffsetNumberNext, P_FIRSTDATAKEY, P_ISLEAF, PageGetFreeSpace(), PageGetItemId(), and PageGetMaxOffsetNumber().

Referenced by _bt_findinsertloc().
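
The opening scan that fills deletable[] is a simple collect-then-batch pattern: gather the offsets of every LP_DEAD item first, then hand the whole array to a single deletion pass. A standalone sketch (a toy flags array stands in for real ItemIds):

#include <stdio.h>

#define MAX_ITEMS 8

int
main(void)
{
    int dead[MAX_ITEMS + 1] = {0, 0, 1, 0, 1, 1, 0, 0, 0}; /* offsets 1..8 */
    int deletable[MAX_ITEMS];
    int ndeletable = 0;

    /* collect every dead-flagged offset */
    for (int offnum = 1; offnum <= MAX_ITEMS; offnum++)
        if (dead[offnum])
            deletable[ndeletable++] = offnum;

    /* batch step: one pass over the collected offsets */
    if (ndeletable > 0)
    {
        printf("deleting %d items at offsets:", ndeletable);
        for (int i = 0; i < ndeletable; i++)
            printf(" %d", deletable[i]);
        printf("\n");
    }
    return 0;
}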

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
Relation  heapRel 
)

Definition at line 103 of file nbtinsert.c.

106{
107 bool is_unique = false;
108 BTInsertStateData insertstate;
109 BTScanInsert itup_key;
110 BTStack stack;
111 bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
112
113 /* we need an insertion scan key to do our search, so build one */
114 itup_key = _bt_mkscankey(rel, itup);
115
116 if (checkingunique)
117 {
118 if (!itup_key->anynullkeys)
119 {
120 /* No (heapkeyspace) scantid until uniqueness established */
121 itup_key->scantid = NULL;
122 }
123 else
124 {
125 /*
126 * Scan key for new tuple contains NULL key values. Bypass
127 * checkingunique steps. They are unnecessary because core code
128 * considers NULL unequal to every value, including NULL.
129 *
130 * This optimization avoids O(N^2) behavior within the
131 * _bt_findinsertloc() heapkeyspace path when a unique index has a
132 * large number of "duplicates" with NULL key values.
133 */
134 checkingunique = false;
135 /* Tuple is unique in the sense that core code cares about */
136 Assert(checkUnique != UNIQUE_CHECK_EXISTING);
137 is_unique = true;
138 }
139 }
140
141 /*
142 * Fill in the BTInsertState working area, to track the current page and
143 * position within the page to insert on.
144 *
145 * Note that itemsz is passed down to lower level code that deals with
146 * inserting the item. It must be MAXALIGN()'d. This ensures that space
147 * accounting code consistently considers the alignment overhead that we
148 * expect PageAddItem() will add later. (Actually, index_form_tuple() is
149 * already conservative about alignment, but we don't rely on that from
150 * this distance. Besides, preserving the "true" tuple size in index
151 * tuple headers for the benefit of nbtsplitloc.c might happen someday.
152 * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
153 */
154 insertstate.itup = itup;
155 insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
156 insertstate.itup_key = itup_key;
157 insertstate.bounds_valid = false;
158 insertstate.buf = InvalidBuffer;
159 insertstate.postingoff = 0;
160
161search:
162
163 /*
164 * Find and lock the leaf page that the tuple should be added to by
165 * searching from the root page. insertstate.buf will hold a buffer that
166 * is locked in exclusive mode afterwards.
167 */
168 stack = _bt_search_insert(rel, heapRel, &insertstate);
169
170 /*
171 * checkingunique inserts are not allowed to go ahead when two tuples with
172 * equal key attribute values would be visible to new MVCC snapshots once
173 * the xact commits. Check for conflicts in the locked page/buffer (if
174 * needed) here.
175 *
176 * It might be necessary to check a page to the right in _bt_check_unique,
177 * though that should be very rare. In practice the first page the value
178 * could be on (with scantid omitted) is almost always also the only page
179 * that a matching tuple might be found on. This is due to the behavior
180 * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
181 * only be allowed to cross a page boundary when there is no candidate
182 * leaf page split point that avoids it. Also, _bt_check_unique can use
183 * the leaf page high key to determine that there will be no duplicates on
184 * the right sibling without actually visiting it (it uses the high key in
185 * cases where the new item happens to belong at the far right of the leaf
186 * page).
187 *
188 * NOTE: obviously, _bt_check_unique can only detect keys that are already
189 * in the index; so it cannot defend against concurrent insertions of the
190 * same key. We protect against that by means of holding a write lock on
191 * the first page the value could be on, with omitted/-inf value for the
192 * implicit heap TID tiebreaker attribute. Any other would-be inserter of
193 * the same key must acquire a write lock on the same page, so only one
194 * would-be inserter can be making the check at one time. Furthermore,
195 * once we are past the check we hold write locks continuously until we
196 * have performed our insertion, so no later inserter can fail to see our
197 * insertion. (This requires some care in _bt_findinsertloc.)
198 *
199 * If we must wait for another xact, we release the lock while waiting,
200 * and then must perform a new search.
201 *
202 * For a partial uniqueness check, we don't wait for the other xact. Just
203 * let the tuple in and return false for possibly non-unique, or true for
204 * definitely unique.
205 */
206 if (checkingunique)
207 {
208 TransactionId xwait;
209 uint32 speculativeToken;
210
211 xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
212 &is_unique, &speculativeToken);
213
214 if (unlikely(TransactionIdIsValid(xwait)))
215 {
216 /* Have to wait for the other guy ... */
217 _bt_relbuf(rel, insertstate.buf);
218 insertstate.buf = InvalidBuffer;
219
220 /*
221 * If it's a speculative insertion, wait for it to finish (ie. to
222 * go ahead with the insertion, or kill the tuple). Otherwise
223 * wait for the transaction to finish as usual.
224 */
225 if (speculativeToken)
226 SpeculativeInsertionWait(xwait, speculativeToken);
227 else
228 XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
229
230 /* start over... */
231 if (stack)
232 _bt_freestack(stack);
233 goto search;
234 }
235
236 /* Uniqueness is established -- restore heap tid as scantid */
237 if (itup_key->heapkeyspace)
238 itup_key->scantid = &itup->t_tid;
239 }
240
241 if (checkUnique != UNIQUE_CHECK_EXISTING)
242 {
243 OffsetNumber newitemoff;
244
245 /*
246 * The only conflict predicate locking cares about for indexes is when
247 * an index tuple insert conflicts with an existing lock. We don't
248 * know the actual page we're going to insert on for sure just yet in
249 * checkingunique and !heapkeyspace cases, but it's okay to use the
250 * first page the value could be on (with scantid omitted) instead.
251 */
252 CheckForSerializableConflictIn(rel, NULL,
253 BufferGetBlockNumber(insertstate.buf));
254 /*
255 * Do the insertion. Note that insertstate contains cached binary
256 * search bounds established within _bt_check_unique when insertion is
257 * checkingunique.
258 */
259 newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
260 indexUnchanged, stack, heapRel);
261 _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
262 stack, itup, insertstate.itemsz, newitemoff,
263 insertstate.postingoff, false);
264 }
265 else
266 {
267 /* just release the buffer */
268 _bt_relbuf(rel, insertstate.buf);
269 }
270
271 /* be tidy */
272 if (stack)
273 _bt_freestack(stack);
274 pfree(itup_key);
275
276 return is_unique;
277}
#define MAXALIGN(LEN)
Definition: c.h:814
#define unlikely(x)
Definition: c.h:406
uint32_t uint32
Definition: c.h:542
@ UNIQUE_CHECK_NO
Definition: genam.h:144
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:828
void XactLockTableWait(TransactionId xid, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper)
Definition: lmgr.c:663
@ XLTW_InsertIndex
Definition: lmgr.h:31
void pfree(void *pointer)
Definition: mcxt.c:1594
static BTStack _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
Definition: nbtinsert.c:318
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:816
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1106
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:409
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:189
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:97

References _bt_check_unique(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search_insert(), BTScanInsertData::anynullkeys, Assert(), BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetBlockNumber(), CheckForSerializableConflictIn(), BTScanInsertData::heapkeyspace, IndexTupleSize(), InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, pfree(), BTInsertStateData::postingoff, BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, unlikely, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().
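
The itemsz comment above hinges on MAXALIGN() rounding the tuple size up to the platform's maximum alignment boundary, so that free-space accounting matches what PageAddItem() will actually consume. A standalone sketch of that rounding (MY_MAXALIGN is a local stand-in; PostgreSQL's MAXALIGN in c.h uses the build-time MAXIMUM_ALIGNOF, typically 8):

#include <stdint.h>
#include <stdio.h>

#define MY_MAXIMUM_ALIGNOF 8    /* assumption; probed at build time in PostgreSQL */
#define MY_MAXALIGN(LEN) \
    (((uintptr_t) (LEN) + (MY_MAXIMUM_ALIGNOF - 1)) & \
     ~((uintptr_t) (MY_MAXIMUM_ALIGNOF - 1)))

int
main(void)
{
    /* 13..16 all round up to 16; 17 rounds up to 24 */
    for (uintptr_t sz = 13; sz <= 17; sz++)
        printf("tuple size %2lu -> charged %2lu bytes\n",
               (unsigned long) sz, (unsigned long) MY_MAXALIGN(sz));
    return 0;
}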

◆ _bt_findinsertloc()

static OffsetNumber _bt_findinsertloc ( Relation  rel,
BTInsertState  insertstate,
bool  checkingunique,
bool  indexUnchanged,
BTStack  stack,
Relation  heapRel 
)
static

Definition at line 816 of file nbtinsert.c.

822{
823 BTScanInsert itup_key = insertstate->itup_key;
824 Page page = BufferGetPage(insertstate->buf);
825 BTPageOpaque opaque;
826 OffsetNumber newitemoff;
827
828 opaque = BTPageGetOpaque(page);
829
830 /* Check 1/3 of a page restriction */
831 if (unlikely(insertstate->itemsz > BTMaxItemSize))
832 _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
833 insertstate->itup);
834
835 Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
836 Assert(!insertstate->bounds_valid || checkingunique);
837 Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
838 Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
839 Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
840
841 if (itup_key->heapkeyspace)
842 {
843 /* Keep track of whether checkingunique duplicate seen */
844 bool uniquedup = indexUnchanged;
845
846 /*
847 * If we're inserting into a unique index, we may have to walk right
848 * through leaf pages to find the one leaf page that we must insert on
849 * to.
850 *
851 * This is needed for checkingunique callers because a scantid was not
852 * used when we called _bt_search(). scantid can only be set after
853 * _bt_check_unique() has checked for duplicates. The buffer
854 * initially stored in insertstate->buf has the page where the first
855 * duplicate key might be found, which isn't always the page that new
856 * tuple belongs on. The heap TID attribute for new tuple (scantid)
857 * could force us to insert on a sibling page, though that should be
858 * very rare in practice.
859 */
860 if (checkingunique)
861 {
862 if (insertstate->low < insertstate->stricthigh)
863 {
864 /* Encountered a duplicate in _bt_check_unique() */
865 Assert(insertstate->bounds_valid);
866 uniquedup = true;
867 }
868
869 for (;;)
870 {
871 /*
872 * Does the new tuple belong on this page?
873 *
874 * The earlier _bt_check_unique() call may well have
875 * established a strict upper bound on the offset for the new
876 * item. If it's not the last item of the page (i.e. if there
877 * is at least one tuple on the page that goes after the tuple
878 * we're inserting) then we know that the tuple belongs on
879 * this page. We can skip the high key check.
880 */
881 if (insertstate->bounds_valid &&
882 insertstate->low <= insertstate->stricthigh &&
883 insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
884 break;
885
886 /* Test '<=', not '!=', since scantid is set now */
887 if (P_RIGHTMOST(opaque) ||
888 _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
889 break;
890
891 _bt_stepright(rel, heapRel, insertstate, stack);
892 /* Update local state after stepping right */
893 page = BufferGetPage(insertstate->buf);
894 opaque = BTPageGetOpaque(page);
895 /* Assume duplicates (if checkingunique) */
896 uniquedup = true;
897 }
898 }
899
900 /*
901 * If the target page cannot fit newitem, try to avoid splitting the
902 * page on insert by performing deletion or deduplication now
903 */
904 if (PageGetFreeSpace(page) < insertstate->itemsz)
905 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
906 checkingunique, uniquedup,
907 indexUnchanged);
908 }
909 else
910 {
911 /*----------
912 * This is a !heapkeyspace (version 2 or 3) index. The current page
913 * is the first page that we could insert the new tuple to, but there
914 * may be other pages to the right that we could opt to use instead.
915 *
916 * If the new key is equal to one or more existing keys, we can
917 * legitimately place it anywhere in the series of equal keys. In
918 * fact, if the new key is equal to the page's "high key" we can place
919 * it on the next page. If it is equal to the high key, and there's
920 * not room to insert the new tuple on the current page without
921 * splitting, then we move right hoping to find more free space and
922 * avoid a split.
923 *
924 * Keep scanning right until we
925 * (a) find a page with enough free space,
926 * (b) reach the last page where the tuple can legally go, or
927 * (c) get tired of searching.
928 * (c) is not flippant; it is important because if there are many
929 * pages' worth of equal keys, it's better to split one of the early
930 * pages than to scan all the way to the end of the run of equal keys
931 * on every insert. We implement "get tired" as a random choice,
932 * since stopping after scanning a fixed number of pages wouldn't work
933 * well (we'd never reach the right-hand side of previously split
934 * pages). The probability of moving right is set at 0.99, which may
935 * seem too high to change the behavior much, but it does an excellent
936 * job of preventing O(N^2) behavior with many equal keys.
937 *----------
938 */
939 while (PageGetFreeSpace(page) < insertstate->itemsz)
940 {
941 /*
942 * Before considering moving right, see if we can obtain enough
943 * space by erasing LP_DEAD items
944 */
945 if (P_HAS_GARBAGE(opaque))
946 {
947 /* Perform simple deletion */
948 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
949 false, false, false);
950
951 if (PageGetFreeSpace(page) >= insertstate->itemsz)
952 break; /* OK, now we have enough space */
953 }
954
955 /*
956 * Nope, so check conditions (b) and (c) enumerated above
957 *
958 * The earlier _bt_check_unique() call may well have established a
959 * strict upper bound on the offset for the new item. If it's not
960 * the last item of the page (i.e. if there is at least one tuple
961 * on the page that's greater than the tuple we're inserting to)
962 * then we know that the tuple belongs on this page. We can skip
963 * the high key check.
964 */
965 if (insertstate->bounds_valid &&
966 insertstate->low <= insertstate->stricthigh &&
967 insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
968 break;
969
970 if (P_RIGHTMOST(opaque) ||
971 _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
972 pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
973 break;
974
975 _bt_stepright(rel, heapRel, insertstate, stack);
976 /* Update local state after stepping right */
977 page = BufferGetPage(insertstate->buf);
978 opaque = BTPageGetOpaque(page);
979 }
980 }
981
982 /*
983 * We should now be on the correct page. Find the offset within the page
984 * for the new tuple. (Possibly reusing earlier search bounds.)
985 */
986 Assert(P_RIGHTMOST(opaque) ||
987 _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
988
989 newitemoff = _bt_binsrch_insert(rel, insertstate);
990
991 if (insertstate->postingoff == -1)
992 {
993 /*
994 * There is an overlapping posting list tuple with its LP_DEAD bit
995 * set. We don't want to unnecessarily unset its LP_DEAD bit while
996 * performing a posting list split, so perform simple index tuple
997 * deletion early.
998 */
999 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
1000 false, false, false);
1001
1002 /*
1003 * Do new binary search. New insert location cannot overlap with any
1004 * posting list now.
1005 */
1006 Assert(!insertstate->bounds_valid);
1007 insertstate->postingoff = 0;
1008 newitemoff = _bt_binsrch_insert(rel, insertstate);
1009 Assert(insertstate->postingoff == 0);
1010 }
1011
1012 return newitemoff;
1013}
#define PG_UINT32_MAX
Definition: c.h:599
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, BTInsertState insertstate, bool simpleonly, bool checkingunique, bool uniquedup, bool indexUnchanged)
Definition: nbtinsert.c:2687
static void _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate, BTStack stack)
Definition: nbtinsert.c:1028
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:227
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:228
#define BTMaxItemSize
Definition: nbtree.h:165
void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
Definition: nbtutils.c:4309
uint32 pg_prng_uint32(pg_prng_state *state)
Definition: pg_prng.c:227
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34

References _bt_binsrch_insert(), _bt_check_third_page(), _bt_compare(), _bt_delete_or_dedup_one_page(), _bt_stepright(), BTScanInsertData::allequalimage, Assert(), BTInsertStateData::bounds_valid, BTMaxItemSize, BTPageGetOpaque, BTInsertStateData::buf, BufferGetPage(), BTScanInsertData::heapkeyspace, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, BTInsertStateData::low, P_HAS_GARBAGE, P_HIKEY, P_INCOMPLETE_SPLIT, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetMaxOffsetNumber(), pg_global_prng_state, pg_prng_uint32(), PG_UINT32_MAX, BTInsertStateData::postingoff, BTScanInsertData::scantid, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_doinsert().
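
For !heapkeyspace indexes, the "get tired" test restored above compares a fresh pg_prng_uint32() draw against PG_UINT32_MAX / 100, i.e. it stops moving right with roughly 1% probability per page visited. A standalone sketch of the same idea using the C library PRNG (rand() is only a stand-in for pg_prng_uint32() here):

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    int pages_moved = 0;

    srand(12345);                       /* stand-in for pg_global_prng_state */
    while (rand() >= RAND_MAX / 100)    /* keep moving right ~99% of the time */
        pages_moved++;                  /* step to the next equal-key page */
    printf("got tired after moving right %d times\n", pages_moved);
    return 0;
}

On average this gives up after about 100 pages, which bounds the scan over a long run of equal keys without ever letting it degrade to O(N^2).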

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Relation  heaprel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 2248 of file nbtinsert.c.

2249{
2250 Page lpage = BufferGetPage(lbuf);
2251 BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2252 Buffer rbuf;
2253 Page rpage;
2254 BTPageOpaque rpageop;
2255 bool wasroot;
2256 bool wasonly;
2257
2258 Assert(P_INCOMPLETE_SPLIT(lpageop));
2259 Assert(heaprel != NULL);
2260
2261 /* Lock right sibling, the one missing the downlink */
2262 rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2263 rpage = BufferGetPage(rbuf);
2264 rpageop = BTPageGetOpaque(rpage);
2265
2266 /* Could this be a root split? */
2267 if (!stack)
2268 {
2269 Buffer metabuf;
2270 Page metapg;
2271 BTMetaPageData *metad;
2272
2273 /* acquire lock on the metapage */
2274 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2275 metapg = BufferGetPage(metabuf);
2276 metad = BTPageGetMeta(metapg);
2277
2278 wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2279
2280 _bt_relbuf(rel, metabuf);
2281 }
2282 else
2283 wasroot = false;
2284
2285 /* Was this the only page on the level before split? */
2286 wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2287
2288 elog(DEBUG1, "finishing incomplete split of %u/%u",
2289 BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2290
2291 _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2292}
#define DEBUG1
Definition: elog.h:30
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition: nbtinsert.c:2106
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
#define BTPageGetMeta(p)
Definition: nbtree.h:122
#define P_LEFTMOST(opaque)
Definition: nbtree.h:219
#define BTREE_METAPAGE
Definition: nbtree.h:149
#define BT_WRITE
Definition: nbtree.h:731
BlockNumber btm_root
Definition: nbtree.h:108

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert(), BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage(), DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, and P_RIGHTMOST.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
Relation  heaprel,
BTStack  stack,
BlockNumber  child 
)

Definition at line 2326 of file nbtinsert.c.

2327{
2328 BlockNumber blkno;
2329 OffsetNumber start;
2330
2331 blkno = stack->bts_blkno;
2332 start = stack->bts_offset;
2333
2334 for (;;)
2335 {
2336 Buffer buf;
2337 Page page;
2338 BTPageOpaque opaque;
2339
2340 buf = _bt_getbuf(rel, blkno, BT_WRITE);
2341 page = BufferGetPage(buf);
2342 opaque = BTPageGetOpaque(page);
2343
2344 Assert(heaprel != NULL);
2345 if (P_INCOMPLETE_SPLIT(opaque))
2346 {
2347 _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
2348 continue;
2349 }
2350
2351 if (!P_IGNORE(opaque))
2352 {
2353 OffsetNumber offnum,
2354 minoff,
2355 maxoff;
2356 ItemId itemid;
2357 IndexTuple item;
2358
2359 minoff = P_FIRSTDATAKEY(opaque);
2360 maxoff = PageGetMaxOffsetNumber(page);
2361
2362 /*
2363 * start = InvalidOffsetNumber means "search the whole page". We
2364 * need this test anyway due to possibility that page has a high
2365 * key now when it didn't before.
2366 */
2367 if (start < minoff)
2368 start = minoff;
2369
2370 /*
2371 * Need this check too, to guard against possibility that page
2372 * split since we visited it originally.
2373 */
2374 if (start > maxoff)
2375 start = OffsetNumberNext(maxoff);
2376
2377 /*
2378 * These loops will check every item on the page --- but in an
2379 * order that's attuned to the probability of where it actually
2380 * is. Scan to the right first, then to the left.
2381 */
2382 for (offnum = start;
2383 offnum <= maxoff;
2384 offnum = OffsetNumberNext(offnum))
2385 {
2386 itemid = PageGetItemId(page, offnum);
2387 item = (IndexTuple) PageGetItem(page, itemid);
2388
2389 if (BTreeTupleGetDownLink(item) == child)
2390 {
2391 /* Return accurate pointer to where link is now */
2392 stack->bts_blkno = blkno;
2393 stack->bts_offset = offnum;
2394 return buf;
2395 }
2396 }
2397
2398 for (offnum = OffsetNumberPrev(start);
2399 offnum >= minoff;
2400 offnum = OffsetNumberPrev(offnum))
2401 {
2402 itemid = PageGetItemId(page, offnum);
2403 item = (IndexTuple) PageGetItem(page, itemid);
2404
2405 if (BTreeTupleGetDownLink(item) == child)
2406 {
2407 /* Return accurate pointer to where link is now */
2408 stack->bts_blkno = blkno;
2409 stack->bts_offset = offnum;
2410 return buf;
2411 }
2412 }
2413 }
2414
2415 /*
2416 * The item we're looking for moved right at least one page.
2417 *
2418 * Lehman and Yao couple/chain locks when moving right here, which we
2419 * can avoid. See nbtree/README.
2420 */
2421 if (P_RIGHTMOST(opaque))
2422 {
2423 _bt_relbuf(rel, buf);
2424 return InvalidBuffer;
2425 }
2426 blkno = opaque->btpo_next;
2427 start = InvalidOffsetNumber;
2428 _bt_relbuf(rel, buf);
2429 }
2430}
return str start
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2248
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:557
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
static char * buf
Definition: pg_test_fsync.c:72
BlockNumber bts_blkno
Definition: nbtree.h:745
struct BTStackData * bts_parent
Definition: nbtree.h:747
OffsetNumber bts_offset
Definition: nbtree.h:746

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), Assert(), BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage(), InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and start.

Referenced by _bt_insert_parent(), and _bt_lock_subtree_parent().
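
The two scan loops implement a search that radiates out from the remembered offset: right to the end of the page first, then left, on the theory that the downlink is usually at or near where it was last seen. A standalone sketch over a plain array (find_from_start() is a hypothetical helper, not a PostgreSQL function):

#include <stdio.h>

static int
find_from_start(const int *items, int minoff, int maxoff, int start, int target)
{
    if (start < minoff)
        start = minoff;                 /* "search the whole page" case */
    if (start > maxoff)
        start = maxoff + 1;             /* page shrank since last visit */
    for (int off = start; off <= maxoff; off++)     /* scan right first */
        if (items[off] == target)
            return off;
    for (int off = start - 1; off >= minoff; off--) /* then scan left */
        if (items[off] == target)
            return off;
    return -1;                          /* item moved right off this page */
}

int
main(void)
{
    int downlinks[] = {0, 11, 22, 33, 44, 55};      /* offsets 1..5 used */

    /* target 22 moved left of the remembered offset 4; found at 2 */
    printf("found at offset %d\n", find_from_start(downlinks, 1, 5, 4, 22));
    return 0;
}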

◆ _bt_insert_parent()

static void _bt_insert_parent ( Relation  rel,
Relation  heaprel,
Buffer  buf,
Buffer  rbuf,
BTStack  stack,
bool  isroot,
bool  isonly 
)
static

Definition at line 2106 of file nbtinsert.c.

2113{
2114 Assert(heaprel != NULL);
2115
2116 /*
2117 * Here we have to do something Lehman and Yao don't talk about: deal with
2118 * a root split and construction of a new root. If our stack is empty
2119 * then we have just split a node on what had been the root level when we
2120 * descended the tree. If it was still the root then we perform a
2121 * new-root construction. If it *wasn't* the root anymore, search to find
2122 * the next higher level that someone constructed meanwhile, and find the
2123 * right place to insert as for the normal case.
2124 *
2125 * If we have to search for the parent level, we do so by re-descending
2126 * from the root. This is not super-efficient, but it's rare enough not
2127 * to matter.
2128 */
2129 if (isroot)
2130 {
2131 Buffer rootbuf;
2132
2133 Assert(stack == NULL);
2134 Assert(isonly);
2135 /* create a new root node one level up and update the metapage */
2136 rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
2137 /* release the split buffers */
2138 _bt_relbuf(rel, rootbuf);
2139 _bt_relbuf(rel, rbuf);
2140 _bt_relbuf(rel, buf);
2141 }
2142 else
2143 {
2144 BlockNumber bknum = BufferGetBlockNumber(buf);
2145 BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2146 Page page = BufferGetPage(buf);
2147 IndexTuple new_item;
2148 BTStackData fakestack;
2149 IndexTuple ritem;
2150 Buffer pbuf;
2151
2152 if (stack == NULL)
2153 {
2154 BTPageOpaque opaque;
2155
2156 elog(DEBUG2, "concurrent ROOT page split");
2157 opaque = BTPageGetOpaque(page);
2158
2159 /*
2160 * We should never reach here when a leaf page split takes place
2161 * despite the insert of newitem being able to apply the fastpath
2162 * optimization. Make sure of that with an assertion.
2163 *
2164 * This is more of a performance issue than a correctness issue.
2165 * The fastpath won't have a descent stack. Using a phony stack
2166 * here works, but never rely on that. The fastpath should be
2167 * rejected within _bt_search_insert() when the rightmost leaf
2168 * page will split, since it's faster to go through _bt_search()
2169 * and get a stack in the usual way.
2170 */
2171 Assert(!(P_ISLEAF(opaque) &&
2172 BlockNumberIsValid(RelationGetTargetBlock(rel))));
2173
2174 /* Find the leftmost page at the next level up */
2175 pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
2176 /* Set up a phony stack entry pointing there */
2177 stack = &fakestack;
2178 stack->bts_blkno = BufferGetBlockNumber(pbuf);
2179 stack->bts_offset = InvalidOffsetNumber;
2180 stack->bts_parent = NULL;
2181 _bt_relbuf(rel, pbuf);
2182 }
2183
2184 /* get high key from left, a strict lower bound for new right page */
2185 ritem = (IndexTuple) PageGetItem(page,
2186 PageGetItemId(page, P_HIKEY));
2187
2188 /* form an index tuple that points at the new right page */
2189 new_item = CopyIndexTuple(ritem);
2190 BTreeTupleSetDownLink(new_item, rbknum);
2191
2192 /*
2193 * Re-find and write lock the parent of buf.
2194 *
2195 * It's possible that the location of buf's downlink has changed since
2196 * our initial _bt_search() descent. _bt_getstackbuf() will detect
2197 * and recover from this, updating the stack, which ensures that the
2198 * new downlink will be inserted at the correct offset. Even buf's
2199 * parent may have changed.
2200 */
2201 pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
2202
2203 /*
2204 * Unlock the right child. The left child will be unlocked in
2205 * _bt_insertonpg().
2206 *
2207 * Unlocking the right child must be delayed until here to ensure that
2208 * no concurrent VACUUM operation can become confused. Page deletion
2209 * cannot be allowed to fail to re-find a downlink for the rbuf page.
2210 * (Actually, this is just a vestige of how things used to work. The
2211 * page deletion code is expected to check for the INCOMPLETE_SPLIT
2212 * flag on the left child. It won't attempt deletion of the right
2213 * child until the split is complete. Despite all this, we opt to
2214 * conservatively delay unlocking the right child until here.)
2215 */
2216 _bt_relbuf(rel, rbuf);
2217
2218 if (pbuf == InvalidBuffer)
2219 ereport(ERROR,
2220 (errcode(ERRCODE_INDEX_CORRUPTED),
2221 errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2222 RelationGetRelationName(rel), bknum, rbknum)));
2223
2224 /* Recursively insert into the parent */
2225 _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
2226 new_item, MAXALIGN(IndexTupleSize(new_item)),
2227 stack->bts_offset + 1, 0, isonly);
2228
2229 /* be tidy */
2230 pfree(new_item);
2231 }
2232}
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
#define DEBUG2
Definition: elog.h:29
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:547
Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
Definition: nbtinsert.c:2326
static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
Definition: nbtinsert.c:2451
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:563
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition: nbtsearch.c:2612
#define RelationGetTargetBlock(relation)
Definition: rel.h:611
uint32 btpo_level
Definition: nbtree.h:67

References _bt_get_endpoint(), _bt_getstackbuf(), _bt_insertonpg(), _bt_newlevel(), _bt_relbuf(), Assert(), BlockNumberIsValid(), BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleSetDownLink(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetBlockNumber(), BufferGetPage(), CopyIndexTuple(), DEBUG2, elog, ereport, errcode(), errmsg_internal(), ERROR, IndexTupleSize(), InvalidBuffer, InvalidOffsetNumber, MAXALIGN, P_HIKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), pfree(), RelationGetRelationName, and RelationGetTargetBlock.

Referenced by _bt_finish_split(), and _bt_insertonpg().
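
The recursion above is driven by the descent stack that _bt_search() builds on the way down: a singly linked list of BTStackData entries, consumed bottom-up, one entry per _bt_insert_parent() level. A minimal sketch of traversing such a stack (walk_descent_stack_sketch is a hypothetical helper, not part of nbtinsert.c):

static void
walk_descent_stack_sketch(BTStack stack)
{
    /* entries run bottom-up: the leaf's parent first, the root's child last */
    for (BTStack entry = stack; entry != NULL; entry = entry->bts_parent)
        elog(DEBUG2, "descent stack entry: block %u, downlink offset %u",
             entry->bts_blkno, (unsigned) entry->bts_offset);
}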

◆ _bt_insertonpg()

static void _bt_insertonpg ( Relation  rel,
Relation  heaprel,
BTScanInsert  itup_key,
Buffer  buf,
Buffer  cbuf,
BTStack  stack,
IndexTuple  itup,
Size  itemsz,
OffsetNumber  newitemoff,
int  postingoff,
bool  split_only_page 
)
static

Definition at line 1106 of file nbtinsert.c.

1117{
1118 Page page;
1119 BTPageOpaque opaque;
1120 bool isleaf,
1121 isroot,
1122 isrightmost,
1123 isonly;
1124 IndexTuple oposting = NULL;
1125 IndexTuple origitup = NULL;
1126 IndexTuple nposting = NULL;
1127
1128 page = BufferGetPage(buf);
1129 opaque = BTPageGetOpaque(page);
1130 isleaf = P_ISLEAF(opaque);
1131 isroot = P_ISROOT(opaque);
1132 isrightmost = P_RIGHTMOST(opaque);
1133 isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
1134
1135 /* child buffer must be given iff inserting on an internal page */
1136 Assert(isleaf == !BufferIsValid(cbuf));
1137 /* tuple must have appropriate number of attributes */
1138 Assert(!isleaf ||
1139 BTreeTupleGetNAtts(itup, rel) ==
1140 IndexRelationGetNumberOfAttributes(rel));
1141 Assert(isleaf ||
1142 BTreeTupleGetNAtts(itup, rel) <=
1143 IndexRelationGetNumberOfKeyAttributes(rel));
1144
1145 Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1146 /* Caller must always finish incomplete split for us */
1147 Assert(!P_INCOMPLETE_SPLIT(opaque));
1148
1149 /*
1150 * Every internal page should have exactly one negative infinity item at
1151 * all times. Only _bt_split() and _bt_newlevel() should add items that
1152 * become negative infinity items through truncation, since they're the
1153 * only routines that allocate new internal pages.
1154 */
1155 Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
1156
1157 /*
1158 * Do we need to split an existing posting list item?
1159 */
1160 if (postingoff != 0)
1161 {
1162 ItemId itemid = PageGetItemId(page, newitemoff);
1163
1164 /*
1165 * The new tuple is a duplicate with a heap TID that falls inside the
1166 * range of an existing posting list tuple on a leaf page. Prepare to
1167 * split an existing posting list. Overwriting the posting list with
1168 * its post-split version is treated as an extra step in either the
1169 * insert or page split critical section.
1170 */
1171 Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
1172 oposting = (IndexTuple) PageGetItem(page, itemid);
1173
1174 /*
1175 * postingoff value comes from earlier call to _bt_binsrch_posting().
1176 * Its binary search might think that a plain tuple must be a posting
1177 * list tuple that needs to be split. This can happen with corruption
1178 * involving an existing plain tuple that is a duplicate of the new
1179 * item, up to and including its table TID. Check for that here in
1180 * passing.
1181 *
1182 * Also verify that our caller has made sure that the existing posting
1183 * list tuple does not have its LP_DEAD bit set.
1184 */
1185 if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
1186 ereport(ERROR,
1187 (errcode(ERRCODE_INDEX_CORRUPTED),
1188 errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
1189 ItemPointerGetBlockNumber(&itup->t_tid),
1190 ItemPointerGetOffsetNumber(&itup->t_tid),
1191 newitemoff, BufferGetBlockNumber(buf),
1192 RelationGetRelationName(rel))));
1193
1194 /* use a mutable copy of itup as our itup from here on */
1195 origitup = itup;
1196 itup = CopyIndexTuple(origitup);
1197 nposting = _bt_swap_posting(itup, oposting, postingoff);
1198 /* itup now contains rightmost/max TID from oposting */
1199
1200 /* Alter offset so that newitem goes after posting list */
1201 newitemoff = OffsetNumberNext(newitemoff);
1202 }
1203
1204 /*
1205 * Do we need to split the page to fit the item on it?
1206 *
1207 * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1208 * so this comparison is correct even though we appear to be accounting
1209 * only for the item and not for its line pointer.
1210 */
1211 if (PageGetFreeSpace(page) < itemsz)
1212 {
1213 Buffer rbuf;
1214
1215 Assert(!split_only_page);
1216
1217 /* split the buffer into left and right halves */
1218 rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
1219 itup, origitup, nposting, postingoff);
1220 PredicateLockPageSplit(rel,
1221 BufferGetBlockNumber(buf),
1222 BufferGetBlockNumber(rbuf));
1223
1224 /*----------
1225 * By here,
1226 *
1227 * + our target page has been split;
1228 * + the original tuple has been inserted;
1229 * + we have write locks on both the old (left half)
1230 * and new (right half) buffers, after the split; and
1231 * + we know the key we want to insert into the parent
1232 * (it's the "high key" on the left child page).
1233 *
1234 * We're ready to do the parent insertion. We need to hold onto the
1235 * locks for the child pages until we locate the parent, but we can
1236 * at least release the lock on the right child before doing the
1237 * actual insertion. The lock on the left child will be released
1238 * last of all by parent insertion, where it is the 'cbuf' of parent
1239 * page.
1240 *----------
1241 */
1242 _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
1243 }
1244 else
1245 {
1246 Buffer metabuf = InvalidBuffer;
1247 Page metapg = NULL;
1248 BTMetaPageData *metad = NULL;
1249 BlockNumber blockcache;
1250
1251 /*
1252 * If we are doing this insert because we split a page that was the
1253 * only one on its tree level, but was not the root, it may have been
1254 * the "fast root". We need to ensure that the fast root link points
1255 * at or above the current page. We can safely acquire a lock on the
1256 * metapage here --- see comments for _bt_newlevel().
1257 */
1258 if (unlikely(split_only_page))
1259 {
1260 Assert(!isleaf);
1261 Assert(BufferIsValid(cbuf));
1262
1263 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1264 metapg = BufferGetPage(metabuf);
1265 metad = BTPageGetMeta(metapg);
1266
1267 if (metad->btm_fastlevel >= opaque->btpo_level)
1268 {
1269 /* no update wanted */
1270 _bt_relbuf(rel, metabuf);
1271 metabuf = InvalidBuffer;
1272 }
1273 }
1274
1275 /* Do the update. No ereport(ERROR) until changes are logged */
1277 START_CRIT_SECTION();
1278 if (postingoff != 0)
1279 memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1280
1281 if (PageAddItem(page, itup, itemsz, newitemoff, false, false) == InvalidOffsetNumber)
1282 elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1283 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1284
1285 MarkBufferDirty(buf);
1286
1287 if (BufferIsValid(metabuf))
1288 {
1289 /* upgrade meta-page if needed */
1290 if (metad->btm_version < BTREE_NOVAC_VERSION)
1291 _bt_upgrademetapage(metapg);
1292 metad->btm_fastroot = BufferGetBlockNumber(buf);
1293 metad->btm_fastlevel = opaque->btpo_level;
1294 MarkBufferDirty(metabuf);
1295 }
1296
1297 /*
1298 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1299 * finishes a split
1300 */
1301 if (!isleaf)
1302 {
1303 Page cpage = BufferGetPage(cbuf);
1304 BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1305
1306 Assert(P_INCOMPLETE_SPLIT(cpageop));
1307 cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1308 MarkBufferDirty(cbuf);
1309 }
1310
1311 /* XLOG stuff */
1312 if (RelationNeedsWAL(rel))
1313 {
1314 xl_btree_insert xlrec;
1315 xl_btree_metadata xlmeta;
1316 uint8 xlinfo;
1317 XLogRecPtr recptr;
1318 uint16 upostingoff;
1319
1320 xlrec.offnum = newitemoff;
1321
1322 XLogBeginInsert();
1323 XLogRegisterData(&xlrec, SizeOfBtreeInsert);
1324
1325 if (isleaf && postingoff == 0)
1326 {
1327 /* Simple leaf insert */
1328 xlinfo = XLOG_BTREE_INSERT_LEAF;
1329 }
1330 else if (postingoff != 0)
1331 {
1332 /*
1333 * Leaf insert with posting list split. Must include
1334 * postingoff field before newitem/orignewitem.
1335 */
1336 Assert(isleaf);
1337 xlinfo = XLOG_BTREE_INSERT_POST;
1338 }
1339 else
1340 {
1341 /* Internal page insert, which finishes a split on cbuf */
1342 xlinfo = XLOG_BTREE_INSERT_UPPER;
1343 XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1344
1345 if (BufferIsValid(metabuf))
1346 {
1347 /* Actually, it's an internal page insert + meta update */
1348 xlinfo = XLOG_BTREE_INSERT_META;
1349
1350 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1351 xlmeta.version = metad->btm_version;
1352 xlmeta.root = metad->btm_root;
1353 xlmeta.level = metad->btm_level;
1354 xlmeta.fastroot = metad->btm_fastroot;
1355 xlmeta.fastlevel = metad->btm_fastlevel;
1356 xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
1357 xlmeta.allequalimage = metad->btm_allequalimage;
1358
1359 XLogRegisterBuffer(2, metabuf,
1360 REGBUF_WILL_INIT | REGBUF_STANDARD);
1361 XLogRegisterBufData(2, &xlmeta,
1362 sizeof(xl_btree_metadata));
1363 }
1364 }
1365
1366 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1367 if (postingoff == 0)
1368 {
1369 /* Just log itup from caller */
1370 XLogRegisterBufData(0, itup, IndexTupleSize(itup));
1371 }
1372 else
1373 {
1374 /*
1375 * Insert with posting list split (XLOG_BTREE_INSERT_POST
1376 * record) case.
1377 *
1378 * Log postingoff. Also log origitup, not itup. REDO routine
1379 * must reconstruct final itup (as well as nposting) using
1380 * _bt_swap_posting().
1381 */
1382 upostingoff = postingoff;
1383
1384 XLogRegisterBufData(0, &upostingoff, sizeof(uint16));
1385 XLogRegisterBufData(0, origitup,
1386 IndexTupleSize(origitup));
1387 }
1388
1389 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1390
1391 if (BufferIsValid(metabuf))
1392 PageSetLSN(metapg, recptr);
1393 if (!isleaf)
1394 PageSetLSN(BufferGetPage(cbuf), recptr);
1395
1396 PageSetLSN(page, recptr);
1397 }
1398
1399 END_CRIT_SECTION();
1400
1401 /* Release subsidiary buffers */
1402 if (BufferIsValid(metabuf))
1403 _bt_relbuf(rel, metabuf);
1404 if (!isleaf)
1405 _bt_relbuf(rel, cbuf);
1406
1407 /*
1408 * Cache the block number if this is the rightmost leaf page. Cache
1409 * may be used by a future inserter within _bt_search_insert().
1410 */
1411 blockcache = InvalidBlockNumber;
1412 if (isrightmost && isleaf && !isroot)
1413 blockcache = BufferGetBlockNumber(buf);
1414
1415 /* Release buffer for insertion target block */
1416 _bt_relbuf(rel, buf);
1417
1418 /*
1419 * If we decided to cache the insertion target block before releasing
1420 * its buffer lock, then cache it now. Check the height of the tree
1421 * first, though. We don't go for the optimization with small
1422 * indexes. Defer final check to this point to ensure that we don't
1423 * call _bt_getrootheight while holding a buffer lock.
1424 */
1425 if (BlockNumberIsValid(blockcache) &&
1426 _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1427 RelationSetTargetBlock(rel, blockcache);
1428 }
1429
1430 /* be tidy */
1431 if (postingoff != 0)
1432 {
1433 /* itup is actually a modified copy of caller's original */
1434 pfree(nposting);
1435 pfree(itup);
1436 }
1437}
#define InvalidBlockNumber
Definition: block.h:33
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2937
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:376
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:390
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:471
uint8_t uint8
Definition: c.h:540
uint16_t uint16
Definition: c.h:541
#define PANIC
Definition: elog.h:42
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
Definition: nbtdedup.c:1020
#define BTREE_FASTPATH_MIN_LEVEL
Definition: nbtinsert.c:31
static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
Definition: nbtinsert.c:1467
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:107
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:675
#define P_ISROOT(opaque)
Definition: nbtree.h:222
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:153
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:578
#define XLOG_BTREE_INSERT_POST
Definition: nbtxlog.h:32
#define SizeOfBtreeInsert
Definition: nbtxlog.h:84
#define XLOG_BTREE_INSERT_LEAF
Definition: nbtxlog.h:27
#define XLOG_BTREE_INSERT_UPPER
Definition: nbtxlog.h:28
#define XLOG_BTREE_INSERT_META
Definition: nbtxlog.h:29
void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3144
#define RelationNeedsWAL(relation)
Definition: rel.h:638
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:618
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:527
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:534
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:115
uint32 btm_level
Definition: nbtree.h:109
BlockNumber btm_fastroot
Definition: nbtree.h:110
uint32 btm_version
Definition: nbtree.h:107
bool btm_allequalimage
Definition: nbtree.h:119
uint32 btm_fastlevel
Definition: nbtree.h:111
OffsetNumber offnum
Definition: nbtxlog.h:78
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:478
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition: xloginsert.c:409
void XLogRegisterData(const void *data, uint32 len)
Definition: xloginsert.c:368
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:245
void XLogBeginInsert(void)
Definition: xloginsert.c:152
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define REGBUF_WILL_INIT
Definition: xloginsert.h:34

References _bt_getbuf(), _bt_getrootheight(), _bt_insert_parent(), _bt_relbuf(), _bt_split(), _bt_swap_posting(), _bt_upgrademetapage(), BTScanInsertData::allequalimage, xl_btree_metadata::allequalimage, Assert(), BlockNumberIsValid(), BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTREE_FASTPATH_MIN_LEVEL, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BTreeTupleGetNAtts, BTreeTupleIsPosting(), buf, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), CopyIndexTuple(), elog, END_CRIT_SECTION, ereport, errcode(), errmsg_internal(), ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, BTScanInsertData::heapkeyspace, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize(), InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, ItemIdIsDead, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, MarkBufferDirty(), MAXALIGN, xl_btree_insert::offnum, OffsetNumberNext, P_FIRSTDATAKEY, P_INCOMPLETE_SPLIT, P_ISLEAF, P_ISROOT, P_LEFTMOST, P_RIGHTMOST, PageAddItem, PageGetFreeSpace(), PageGetItem(), PageGetItemId(), PageSetLSN(), PANIC, pfree(), PredicateLockPageSplit(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, RelationSetTargetBlock, xl_btree_metadata::root, SizeOfBtreeInsert, START_CRIT_SECTION, IndexTupleData::t_tid, unlikely, xl_btree_metadata::version, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_INSERT_META, XLOG_BTREE_INSERT_POST, XLOG_BTREE_INSERT_UPPER, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_doinsert(), and _bt_insert_parent().
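
A detail worth restating outside the listing: the no-split path tests PageGetFreeSpace(page) < itemsz without adding sizeof(ItemIdData) for the new line pointer, because PageGetFreeSpace() already reserves that much. A simplified sketch of its behavior (the authoritative definition lives in src/backend/storage/page/bufpage.c; page_free_space_sketch is only an illustration):

static Size
page_free_space_sketch(Page page)
{
    PageHeader phdr = (PageHeader) page;
    int        space = (int) phdr->pd_upper - (int) phdr->pd_lower;

    /* reserve room for one new line pointer up front */
    if (space < (int) sizeof(ItemIdData))
        return 0;
    return (Size) space - sizeof(ItemIdData);
}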

◆ _bt_newlevel()

static Buffer _bt_newlevel ( Relation  rel,
Relation  heaprel,
Buffer  lbuf,
Buffer  rbuf 
)
static

Definition at line 2451 of file nbtinsert.c.

2452{
2453 Buffer rootbuf;
2454 Page lpage,
2455 rootpage;
2456 BlockNumber lbkno,
2457 rbkno;
2458 BlockNumber rootblknum;
2459 BTPageOpaque rootopaque;
2460 BTPageOpaque lopaque;
2461 ItemId itemid;
2462 IndexTuple item;
2463 IndexTuple left_item;
2464 Size left_item_sz;
2465 IndexTuple right_item;
2466 Size right_item_sz;
2467 Buffer metabuf;
2468 Page metapg;
2469 BTMetaPageData *metad;
2470
2471 lbkno = BufferGetBlockNumber(lbuf);
2472 rbkno = BufferGetBlockNumber(rbuf);
2473 lpage = BufferGetPage(lbuf);
2474 lopaque = BTPageGetOpaque(lpage);
2475
2476 /* get a new root page */
2477 rootbuf = _bt_allocbuf(rel, heaprel);
2478 rootpage = BufferGetPage(rootbuf);
2479 rootblknum = BufferGetBlockNumber(rootbuf);
2480
2481 /* acquire lock on the metapage */
2482 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2483 metapg = BufferGetPage(metabuf);
2484 metad = BTPageGetMeta(metapg);
2485
2486 /*
2487 * Create downlink item for left page (old root). The key value used is
2488 * "minus infinity", a sentinel value that's reliably less than any real
2489 * key value that could appear in the left page.
2490 */
2491 left_item_sz = sizeof(IndexTupleData);
2492 left_item = (IndexTuple) palloc(left_item_sz);
2493 left_item->t_info = left_item_sz;
2494 BTreeTupleSetDownLink(left_item, lbkno);
2495 BTreeTupleSetNAtts(left_item, 0, false);
2496
2497 /*
2498 * Create downlink item for right page. The key for it is obtained from
2499 * the "high key" position in the left page.
2500 */
2501 itemid = PageGetItemId(lpage, P_HIKEY);
2502 right_item_sz = ItemIdGetLength(itemid);
2503 item = (IndexTuple) PageGetItem(lpage, itemid);
2504 right_item = CopyIndexTuple(item);
2505 BTreeTupleSetDownLink(right_item, rbkno);
2506
2507 /* NO EREPORT(ERROR) from here till newroot op is logged */
2508 START_CRIT_SECTION();
2509
2510 /* upgrade metapage if needed */
2511 if (metad->btm_version < BTREE_NOVAC_VERSION)
2512 _bt_upgrademetapage(metapg);
2513
2514 /* set btree special data */
2515 rootopaque = BTPageGetOpaque(rootpage);
2516 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2517 rootopaque->btpo_flags = BTP_ROOT;
2518 rootopaque->btpo_level =
2519 (BTPageGetOpaque(lpage))->btpo_level + 1;
2520 rootopaque->btpo_cycleid = 0;
2521
2522 /* update metapage data */
2523 metad->btm_root = rootblknum;
2524 metad->btm_level = rootopaque->btpo_level;
2525 metad->btm_fastroot = rootblknum;
2526 metad->btm_fastlevel = rootopaque->btpo_level;
2527
2528 /*
2529 * Insert the left page pointer into the new root page. The root page is
2530 * the rightmost page on its level so there is no "high key" in it; the
2531 * two items will go into positions P_HIKEY and P_FIRSTKEY.
2532 *
2533 * Note: we *must* insert the two items in item-number order, for the
2534 * benefit of _bt_restore_page().
2535 */
2536 Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2537 if (PageAddItem(rootpage, left_item, left_item_sz, P_HIKEY, false, false) == InvalidOffsetNumber)
2538 elog(PANIC, "failed to add leftkey to new root page"
2539 " while splitting block %u of index \"%s\"",
2540 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2541
2542 /*
2543 * insert the right page pointer into the new root page.
2544 */
2545 Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2546 Assert(BTreeTupleGetNAtts(right_item, rel) <=
2547 IndexRelationGetNumberOfKeyAttributes(rel));
2548 if (PageAddItem(rootpage, right_item, right_item_sz, P_FIRSTKEY, false, false) == InvalidOffsetNumber)
2549 elog(PANIC, "failed to add rightkey to new root page"
2550 " while splitting block %u of index \"%s\"",
2551 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2552
2553 /* Clear the incomplete-split flag in the left child */
2554 Assert(P_INCOMPLETE_SPLIT(lopaque));
2555 lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2556 MarkBufferDirty(lbuf);
2557
2558 MarkBufferDirty(rootbuf);
2559 MarkBufferDirty(metabuf);
2560
2561 /* XLOG stuff */
2562 if (RelationNeedsWAL(rel))
2563 {
2564 xl_btree_newroot xlrec;
2565 XLogRecPtr recptr;
2566 xl_btree_metadata md;
2567
2568 xlrec.rootblk = rootblknum;
2569 xlrec.level = metad->btm_level;
2570
2571 XLogBeginInsert();
2572 XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
2573
2574 XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2575 XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2576 XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2577
2578 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2579 md.version = metad->btm_version;
2580 md.root = rootblknum;
2581 md.level = metad->btm_level;
2582 md.fastroot = rootblknum;
2583 md.fastlevel = metad->btm_level;
2584 md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2585 md.allequalimage = metad->btm_allequalimage;
2586
2587 XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
2588
2589 /*
2590 * Direct access to page is not good but faster - we should implement
2591 * some new func in page API.
2592 */
2593 XLogRegisterBufData(0,
2594 (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2595 ((PageHeader) rootpage)->pd_special -
2596 ((PageHeader) rootpage)->pd_upper);
2597
2598 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2599
2600 PageSetLSN(lpage, recptr);
2601 PageSetLSN(rootpage, recptr);
2602 PageSetLSN(metapg, recptr);
2603 }
2604
2605 END_CRIT_SECTION();
2606
2607 /* done with metapage */
2608 _bt_relbuf(rel, metabuf);
2609
2610 pfree(left_item);
2611 pfree(right_item);
2612
2613 return rootbuf;
2614}
size_t Size
Definition: c.h:614
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
struct IndexTupleData IndexTupleData
Buffer _bt_allocbuf(Relation rel, Relation heaprel)
Definition: nbtpage.c:869
#define BTP_ROOT
Definition: nbtree.h:78
#define P_NONE
Definition: nbtree.h:213
#define P_FIRSTKEY
Definition: nbtree.h:369
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition: nbtree.h:596
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:347
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
BlockNumber btpo_prev
Definition: nbtree.h:65
BTCycleId btpo_cycleid
Definition: nbtree.h:69
unsigned short t_info
Definition: itup.h:49
uint32 level
Definition: nbtxlog.h:344
BlockNumber rootblk
Definition: nbtxlog.h:343

References _bt_allocbuf(), _bt_getbuf(), _bt_relbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_ROOT, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BTreeTupleGetNAtts, BTreeTupleSetDownLink(), BTreeTupleSetNAtts(), BufferGetBlockNumber(), BufferGetPage(), CopyIndexTuple(), elog, END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, xl_btree_newroot::level, MarkBufferDirty(), P_FIRSTKEY, P_HIKEY, P_INCOMPLETE_SPLIT, P_NONE, PageAddItem, PageGetItem(), PageGetItemId(), PageSetLSN(), palloc(), PANIC, pfree(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, IndexTupleData::t_info, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insert_parent().
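
The "minus infinity" downlink built near the top of this function is just a pivot tuple with zero key attributes: only the tuple header and the child block number survive. A condensed sketch of that construction (make_minusinf_downlink is a hypothetical name; the real code builds left_item inline, as shown above):

static IndexTuple
make_minusinf_downlink(BlockNumber leftchild)
{
    IndexTuple pivot = (IndexTuple) palloc(sizeof(IndexTupleData));

    pivot->t_info = sizeof(IndexTupleData);  /* header-only tuple size */
    BTreeTupleSetDownLink(pivot, leftchild); /* child block stored in t_tid */
    BTreeTupleSetNAtts(pivot, 0, false);     /* zero attributes == -infinity */
    return pivot;
}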

◆ _bt_pgaddtup()

static bool _bt_pgaddtup ( Page  page,
Size  itemsize,
const IndexTupleData itup,
OffsetNumber  itup_off,
bool  newfirstdataitem 
)
inlinestatic

Definition at line 2635 of file nbtinsert.c.

2640{
2641 IndexTupleData trunctuple;
2642
2643 if (newfirstdataitem)
2644 {
2645 trunctuple = *itup;
2646 trunctuple.t_info = sizeof(IndexTupleData);
2647 BTreeTupleSetNAtts(&trunctuple, 0, false);
2648 itup = &trunctuple;
2649 itemsize = sizeof(IndexTupleData);
2650 }
2651
2652 if (unlikely(PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber))
2653 return false;
2654
2655 return true;
2656}

References BTreeTupleSetNAtts(), InvalidOffsetNumber, PageAddItem, IndexTupleData::t_info, and unlikely.

Referenced by _bt_split().
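
For context, this is how _bt_split() (documented below) drives the newfirstdataitem flag: it is true only for the first data item placed on a new right page during an internal-page split, the item that must become that page's "minus infinity" entry. Simplified from the _bt_split() listing:

/* minusinfoff is the right page's first data offset on internal-page
 * splits, and InvalidOffsetNumber on leaf-page splits */
if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
                  afterrightoff == minusinfoff))
    elog(ERROR, "failed to add old item to the right sibling"
         " while splitting block %u of index \"%s\"",
         origpagenumber, RelationGetRelationName(rel));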

◆ _bt_search_insert()

static BTStack _bt_search_insert ( Relation  rel,
Relation  heaprel,
BTInsertState  insertstate 
)
static

Definition at line 318 of file nbtinsert.c.

319{
320 Assert(insertstate->buf == InvalidBuffer);
321 Assert(!insertstate->bounds_valid);
322 Assert(insertstate->postingoff == 0);
323
324 if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
325 {
326 /* Simulate a _bt_getbuf() call with conditional locking */
327 insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
328 if (_bt_conditionallockbuf(rel, insertstate->buf))
329 {
330 Page page;
331 BTPageOpaque opaque;
332
333 _bt_checkpage(rel, insertstate->buf);
334 page = BufferGetPage(insertstate->buf);
335 opaque = BTPageGetOpaque(page);
336
337 /*
338 * Check if the page is still the rightmost leaf page and has
339 * enough free space to accommodate the new tuple. Also check
340 * that the insertion scan key is strictly greater than the first
341 * non-pivot tuple on the page. (Note that we expect itup_key's
342 * scantid to be unset when our caller is a checkingunique
343 * inserter.)
344 */
345 if (P_RIGHTMOST(opaque) &&
346 P_ISLEAF(opaque) &&
347 !P_IGNORE(opaque) &&
348 PageGetFreeSpace(page) > insertstate->itemsz &&
349 PageGetMaxOffsetNumber(page) >= P_HIKEY &&
350 _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
351 {
352 /*
353 * Caller can use the fastpath optimization because cached
354 * block is still rightmost leaf page, which can fit caller's
355 * new tuple without splitting. Keep block in local cache for
356 * next insert, and have caller use NULL stack.
357 *
358 * Note that _bt_insert_parent() has an assertion that catches
359 * leaf page splits that somehow follow from a fastpath insert
360 * (it should only be passed a NULL stack when it must deal
361 * with a concurrent root page split, and never because a NULL
362 * stack was returned here).
363 */
364 return NULL;
365 }
366
367 /* Page unsuitable for caller, drop lock and pin */
368 _bt_relbuf(rel, insertstate->buf);
369 }
370 else
371 {
372 /* Lock unavailable, drop pin */
373 ReleaseBuffer(insertstate->buf);
374 }
375
376 /* Forget block, since cache doesn't appear to be useful */
377 RelationSetTargetBlock(rel, InvalidBlockNumber);
378 }
379
380 /* Cannot use optimization -- descend tree, return proper descent stack */
381 return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
382 BT_WRITE);
383}
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5371
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:745
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:797
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1093
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:107

References _bt_checkpage(), _bt_compare(), _bt_conditionallockbuf(), _bt_relbuf(), _bt_search(), Assert(), BTInsertStateData::bounds_valid, BT_WRITE, BTPageGetOpaque, BTInsertStateData::buf, BufferGetPage(), InvalidBlockNumber, InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup_key, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetMaxOffsetNumber(), BTInsertStateData::postingoff, ReadBuffer(), RelationGetTargetBlock, RelationSetTargetBlock, and ReleaseBuffer().

Referenced by _bt_doinsert().
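
The defining property of this fastpath is that it never waits for a lock. A reduced control-flow sketch (not the verbatim function body; the elided conditions are the rightmost/leaf/free-space/high-key checks shown above):

if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
{
    insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
    if (_bt_conditionallockbuf(rel, insertstate->buf))
    {
        /* ... revalidate the cached rightmost leaf page here ... */
        /* on success: return NULL so the caller skips the full descent */
        _bt_relbuf(rel, insertstate->buf);   /* revalidation failed */
    }
    else
        ReleaseBuffer(insertstate->buf);     /* never wait: drop pin only */
    RelationSetTargetBlock(rel, InvalidBlockNumber); /* forget stale hint */
}
return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
                  BT_WRITE);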

◆ _bt_simpledel_pass()

static void _bt_simpledel_pass ( Relation  rel,
Buffer  buffer,
Relation  heapRel,
OffsetNumber deletable,
int  ndeletable,
IndexTuple  newitem,
OffsetNumber  minoff,
OffsetNumber  maxoff 
)
static

Definition at line 2816 of file nbtinsert.c.

2819{
2820 Page page = BufferGetPage(buffer);
2821 BlockNumber *deadblocks;
2822 int ndeadblocks;
2823 TM_IndexDeleteOp delstate;
2824 OffsetNumber offnum;
2825
2826 /* Get array of table blocks pointed to by LP_DEAD-set tuples */
2827 deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
2828 &ndeadblocks);
2829
2830 /* Initialize tableam state that describes index deletion operation */
2831 delstate.irel = rel;
2832 delstate.iblknum = BufferGetBlockNumber(buffer);
2833 delstate.bottomup = false;
2834 delstate.bottomupfreespace = 0;
2835 delstate.ndeltids = 0;
2836 delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
2837 delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
2838
2839 for (offnum = minoff;
2840 offnum <= maxoff;
2841 offnum = OffsetNumberNext(offnum))
2842 {
2843 ItemId itemid = PageGetItemId(page, offnum);
2844 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2845 TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
2846 TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
2847 BlockNumber tidblock;
2848 void *match;
2849
2850 if (!BTreeTupleIsPosting(itup))
2851 {
2852 tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
2853 match = bsearch(&tidblock, deadblocks, ndeadblocks,
2854 sizeof(BlockNumber), _bt_blk_cmp);
2855
2856 if (!match)
2857 {
2858 Assert(!ItemIdIsDead(itemid));
2859 continue;
2860 }
2861
2862 /*
2863 * TID's table block is among those pointed to by the TIDs from
2864 * LP_DEAD-bit set tuples on page -- add TID to deltids
2865 */
2866 odeltid->tid = itup->t_tid;
2867 odeltid->id = delstate.ndeltids;
2868 ostatus->idxoffnum = offnum;
2869 ostatus->knowndeletable = ItemIdIsDead(itemid);
2870 ostatus->promising = false; /* unused */
2871 ostatus->freespace = 0; /* unused */
2872
2873 delstate.ndeltids++;
2874 }
2875 else
2876 {
2877 int nitem = BTreeTupleGetNPosting(itup);
2878
2879 for (int p = 0; p < nitem; p++)
2880 {
2881 ItemPointer tid = BTreeTupleGetPostingN(itup, p);
2882
2883 tidblock = ItemPointerGetBlockNumber(tid);
2884 match = bsearch(&tidblock, deadblocks, ndeadblocks,
2885 sizeof(BlockNumber), _bt_blk_cmp);
2886
2887 if (!match)
2888 {
2889 Assert(!ItemIdIsDead(itemid));
2890 continue;
2891 }
2892
2893 /*
2894 * TID's table block is among those pointed to by the TIDs
2895 * from LP_DEAD-bit set tuples on page -- add TID to deltids
2896 */
2897 odeltid->tid = *tid;
2898 odeltid->id = delstate.ndeltids;
2899 ostatus->idxoffnum = offnum;
2900 ostatus->knowndeletable = ItemIdIsDead(itemid);
2901 ostatus->promising = false; /* unused */
2902 ostatus->freespace = 0; /* unused */
2903
2904 odeltid++;
2905 ostatus++;
2906 delstate.ndeltids++;
2907 }
2908 }
2909 }
2910
2911 pfree(deadblocks);
2912
2913 Assert(delstate.ndeltids >= ndeletable);
2914
2915 /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
2916 _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
2917
2918 pfree(delstate.deltids);
2919 pfree(delstate.status);
2920}
static BlockNumber * _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, int *nblocks)
Definition: nbtinsert.c:2942
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition: nbtpage.c:1511
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186
TM_IndexStatus * status
Definition: tableam.h:254
int bottomupfreespace
Definition: tableam.h:249
Relation irel
Definition: tableam.h:246
TM_IndexDelete * deltids
Definition: tableam.h:253
BlockNumber iblknum
Definition: tableam.h:247
ItemPointerData tid
Definition: tableam.h:212
bool knowndeletable
Definition: tableam.h:219
bool promising
Definition: tableam.h:222
int16 freespace
Definition: tableam.h:223
OffsetNumber idxoffnum
Definition: tableam.h:218

References _bt_blk_cmp(), _bt_deadblocks(), _bt_delitems_delete_check(), Assert(), TM_IndexDeleteOp::bottomup, TM_IndexDeleteOp::bottomupfreespace, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), BufferGetBlockNumber(), BufferGetPage(), TM_IndexDeleteOp::deltids, TM_IndexStatus::freespace, TM_IndexDeleteOp::iblknum, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, TM_IndexDeleteOp::irel, ItemIdIsDead, ItemPointerGetBlockNumber(), TM_IndexStatus::knowndeletable, MaxTIDsPerBTreePage, TM_IndexDeleteOp::ndeltids, OffsetNumberNext, PageGetItem(), PageGetItemId(), palloc(), pfree(), TM_IndexStatus::promising, TM_IndexDeleteOp::status, IndexTupleData::t_tid, and TM_IndexDelete::tid.

Referenced by _bt_delete_or_dedup_one_page().
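
The bsearch() lookups above assume that deadblocks is sorted by _bt_blk_cmp() and contains no duplicates; _bt_deadblocks() is responsible for that preparation. A sketch of the expected sort-and-deduplicate step (qunique() comes from lib/qunique.h, one of this file's includes; tidblocks and ntids are illustrative names):

/* tidblocks[0..ntids-1] holds the candidate table block numbers */
qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
ntids = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
/* tidblocks is now sorted and duplicate-free, ready for bsearch() */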

◆ _bt_split()

static Buffer _bt_split ( Relation  rel,
Relation  heaprel,
BTScanInsert  itup_key,
Buffer  buf,
Buffer  cbuf,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
IndexTuple  orignewitem,
IndexTuple  nposting,
uint16  postingoff 
)
static

Definition at line 1467 of file nbtinsert.c.

1470{
1471 Buffer rbuf;
1472 Page origpage;
1473 Page leftpage,
1474 rightpage;
1475 PGAlignedBlock leftpage_buf,
1476 rightpage_buf;
1477 BlockNumber origpagenumber,
1478 rightpagenumber;
1479 BTPageOpaque ropaque,
1480 lopaque,
1481 oopaque;
1482 Buffer sbuf = InvalidBuffer;
1483 Page spage = NULL;
1484 BTPageOpaque sopaque = NULL;
1485 Size itemsz;
1486 ItemId itemid;
1487 IndexTuple firstright,
1488 lefthighkey;
1489 OffsetNumber firstrightoff;
1490 OffsetNumber afterleftoff,
1491 afterrightoff,
1492 minusinfoff;
1493 OffsetNumber origpagepostingoff;
1494 OffsetNumber maxoff;
1495 OffsetNumber i;
1496 bool newitemonleft,
1497 isleaf,
1498 isrightmost;
1499
1500 /*
1501 * origpage is the original page to be split. leftpage is a temporary
1502 * buffer that receives the left-sibling data, which will be copied back
1503 * into origpage on success. rightpage is the new page that will receive
1504 * the right-sibling data.
1505 *
1506 * leftpage is allocated after choosing a split point. rightpage's new
1507 * buffer isn't acquired until after leftpage is initialized and has new
1508 * high key, the last point where splitting the page may fail (barring
1509 * corruption). Failing before acquiring new buffer won't have lasting
1510 * consequences, since origpage won't have been modified and leftpage is
1511 * only workspace.
1512 */
1513 origpage = BufferGetPage(buf);
1514 oopaque = BTPageGetOpaque(origpage);
1515 isleaf = P_ISLEAF(oopaque);
1516 isrightmost = P_RIGHTMOST(oopaque);
1517 maxoff = PageGetMaxOffsetNumber(origpage);
1518 origpagenumber = BufferGetBlockNumber(buf);
1519
1520 /*
1521 * Choose a point to split origpage at.
1522 *
1523 * A split point can be thought of as a point _between_ two existing data
1524 * items on origpage (the lastleft and firstright tuples), provided you
1525 * pretend that the new item that didn't fit is already on origpage.
1526 *
1527 * Since origpage does not actually contain newitem, the representation of
1528 * split points needs to work with two boundary cases: splits where
1529 * newitem is lastleft, and splits where newitem is firstright.
1530 * newitemonleft resolves the ambiguity that would otherwise exist when
1531 * newitemoff == firstrightoff. In all other cases it's clear which side
1532 * of the split every tuple goes on from context. newitemonleft is
1533 * usually (but not always) redundant information.
1534 *
1535 * firstrightoff is supposed to be an origpage offset number, but it's
1536 * possible that its value will be maxoff+1, which is "past the end" of
1537 * origpage. This happens in the rare case where newitem goes after all
1538 * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1539 * origpage at the point that leaves newitem alone on new right page. Any
1540 * "!newitemonleft && newitemoff == firstrightoff" split point makes
1541 * newitem the firstright tuple, though, so this case isn't a special
1542 * case.
1543 */
1544 firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1545 newitem, &newitemonleft);
1546
1547 /* Use temporary buffer for leftpage */
1548 leftpage = leftpage_buf.data;
1549 _bt_pageinit(leftpage, BufferGetPageSize(buf));
1550 lopaque = BTPageGetOpaque(leftpage);
1551
1552 /*
1553 * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1554 * and HAS_GARBAGE flags.
1555 */
1556 lopaque->btpo_flags = oopaque->btpo_flags;
1557 lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1558 /* set flag in leftpage indicating that rightpage has no downlink yet */
1559 lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1560 lopaque->btpo_prev = oopaque->btpo_prev;
1561 /* handle btpo_next after rightpage buffer acquired */
1562 lopaque->btpo_level = oopaque->btpo_level;
1563 /* handle btpo_cycleid after rightpage buffer acquired */
1564
1565 /*
1566 * Copy the original page's LSN into leftpage, which will become the
1567 * updated version of the page. We need this because XLogInsert will
1568 * examine the LSN and possibly dump it in a page image.
1569 */
1570 PageSetLSN(leftpage, PageGetLSN(origpage));
1571
1572 /*
1573 * Determine page offset number of existing overlapped-with-orignewitem
1574 * posting list when it is necessary to perform a posting list split in
1575 * passing. Note that newitem was already changed by caller (newitem no
1576 * longer has the orignewitem TID).
1577 *
1578 * This page offset number (origpagepostingoff) will be used to pretend
1579 * that the posting split has already taken place, even though the
1580 * required modifications to origpage won't occur until we reach the
1581 * critical section. The lastleft and firstright tuples of our page split
1582 * point should, in effect, come from an imaginary version of origpage
1583 * that has the nposting tuple instead of the original posting list tuple.
1584 *
1585 * Note: _bt_findsplitloc() should have compensated for coinciding posting
1586 * list splits in just the same way, at least in theory. It doesn't
1587 * bother with that, though. In practice it won't affect its choice of
1588 * split point.
1589 */
1590 origpagepostingoff = InvalidOffsetNumber;
1591 if (postingoff != 0)
1592 {
1593 Assert(isleaf);
1594 Assert(ItemPointerCompare(&orignewitem->t_tid,
1595 &newitem->t_tid) < 0);
1596 Assert(BTreeTupleIsPosting(nposting));
1597 origpagepostingoff = OffsetNumberPrev(newitemoff);
1598 }
1599
1600 /*
1601 * The high key for the new left page is a possibly-truncated copy of
1602 * firstright on the leaf level (it's "firstright itself" on internal
1603 * pages; see !isleaf comments below). This may seem to be contrary to
1604 * Lehman & Yao's approach of using a copy of lastleft as the new high key
1605 * when splitting on the leaf level. It isn't, though.
1606 *
1607 * Suffix truncation will leave the left page's high key fully equal to
1608 * lastleft when lastleft and firstright are equal prior to heap TID (that
1609 * is, the tiebreaker TID value comes from lastleft). It isn't actually
1610 * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1611 * "subtree" invariant to hold. It's sufficient to make sure that the new
1612 * leaf high key is strictly less than firstright, and greater than or
1613 * equal to (not necessarily equal to) lastleft. In other words, when
1614 * suffix truncation isn't possible during a leaf page split, we take
1615 * L&Y's exact approach to generating a new high key for the left page.
1616 * (Actually, that is slightly inaccurate. We don't just use a copy of
1617 * lastleft. A tuple with all the keys from firstright but the max heap
1618 * TID from lastleft is used, to avoid introducing a special case.)
1619 */
1620 if (!newitemonleft && newitemoff == firstrightoff)
1621 {
1622 /* incoming tuple becomes firstright */
1623 itemsz = newitemsz;
1624 firstright = newitem;
1625 }
1626 else
1627 {
1628 /* existing item at firstrightoff becomes firstright */
1629 itemid = PageGetItemId(origpage, firstrightoff);
1630 itemsz = ItemIdGetLength(itemid);
1631 firstright = (IndexTuple) PageGetItem(origpage, itemid);
1632 if (firstrightoff == origpagepostingoff)
1633 firstright = nposting;
1634 }
1635
1636 if (isleaf)
1637 {
1638 IndexTuple lastleft;
1639
1640 /* Attempt suffix truncation for leaf page splits */
1641 if (newitemonleft && newitemoff == firstrightoff)
1642 {
1643 /* incoming tuple becomes lastleft */
1644 lastleft = newitem;
1645 }
1646 else
1647 {
1648 OffsetNumber lastleftoff;
1649
1650 /* existing item before firstrightoff becomes lastleft */
1651 lastleftoff = OffsetNumberPrev(firstrightoff);
1652 Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1653 itemid = PageGetItemId(origpage, lastleftoff);
1654 lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1655 if (lastleftoff == origpagepostingoff)
1656 lastleft = nposting;
1657 }
1658
1659 lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1660 itemsz = IndexTupleSize(lefthighkey);
1661 }
1662 else
1663 {
1664 /*
1665 * Don't perform suffix truncation on a copy of firstright to make
1666 * left page high key for internal page splits. Must use firstright
1667 * as new high key directly.
1668 *
1669 * Each distinct separator key value originates as a leaf level high
1670 * key; all other separator keys/pivot tuples are copied from one
1671 * level down. A separator key in a grandparent page must be
1672 * identical to high key in rightmost parent page of the subtree to
1673 * its left, which must itself be identical to high key in rightmost
1674 * child page of that same subtree (this even applies to separator
1675 * from grandparent's high key). There must always be an unbroken
1676 * "seam" of identical separator keys that guide index scans at every
1677 * level, starting from the grandparent. That's why suffix truncation
1678 * is unsafe here.
1679 *
1680 * Internal page splits will truncate firstright into a "negative
1681 * infinity" data item when it gets inserted on the new right page
1682 * below, though. This happens during the call to _bt_pgaddtup() for
1683 * the new first data item for right page. Do not confuse this
1684 * mechanism with suffix truncation. It is just a convenient way of
1685 * implementing page splits that split the internal page "inside"
1686 * firstright. The lefthighkey separator key cannot appear a second
1687 * time in the right page (only firstright's downlink goes in right
1688 * page).
1689 */
1690 lefthighkey = firstright;
1691 }
1692
1693 /*
1694 * Add new high key to leftpage
1695 */
1696 afterleftoff = P_HIKEY;
1697
1698 Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1699 Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
1700 IndexRelationGetNumberOfKeyAttributes(rel));
1701 Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1702 if (PageAddItem(leftpage, lefthighkey, itemsz, afterleftoff, false, false) == InvalidOffsetNumber)
1703 elog(ERROR, "failed to add high key to the left sibling"
1704 " while splitting block %u of index \"%s\"",
1705 origpagenumber, RelationGetRelationName(rel));
1706 afterleftoff = OffsetNumberNext(afterleftoff);
1707
1708 /*
1709 * Acquire a new right page to split into, now that left page has a new
1710 * high key.
1711 *
1712 * To not confuse future VACUUM operations, we zero the right page and
1713 * work on an in-memory copy of it before writing WAL, then copy its
1714 * contents back to the actual page once we start the critical section
1715 * work. This simplifies the split work, since there is no need to zero
1716 * the right page before throwing an error.
1717 */
1718 rbuf = _bt_allocbuf(rel, heaprel);
1719 rightpage = rightpage_buf.data;
1720
1721 /*
1722 * Copy the contents of the right page into its temporary location, and
1723 * zero the original space.
1724 */
1725 memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ);
1726 memset(BufferGetPage(rbuf), 0, BLCKSZ);
1727 rightpagenumber = BufferGetBlockNumber(rbuf);
1728 /* rightpage was initialized by _bt_allocbuf */
1729 ropaque = BTPageGetOpaque(rightpage);
1730
1731 /*
1732 * Finish off remaining leftpage special area fields. They cannot be set
1733 * before both origpage (leftpage) and rightpage buffers are acquired and
1734 * locked.
1735 *
1736 * btpo_cycleid is only used with leaf pages, though we set it here in all
1737 * cases just to be consistent.
1738 */
1739 lopaque->btpo_next = rightpagenumber;
1740 lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1741
1742 /*
1743 * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1744 * and HAS_GARBAGE flags.
1745 */
1746 ropaque->btpo_flags = oopaque->btpo_flags;
1747 ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1748 ropaque->btpo_prev = origpagenumber;
1749 ropaque->btpo_next = oopaque->btpo_next;
1750 ropaque->btpo_level = oopaque->btpo_level;
1751 ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1752
1753 /*
1754 * Add new high key to rightpage where necessary.
1755 *
1756 * If the page we're splitting is not the rightmost page at its level in
1757 * the tree, then the first entry on the page is the high key from
1758 * origpage.
1759 */
1760 afterrightoff = P_HIKEY;
1761
1762 if (!isrightmost)
1763 {
1764 IndexTuple righthighkey;
1765
1766 itemid = PageGetItemId(origpage, P_HIKEY);
1767 itemsz = ItemIdGetLength(itemid);
1768 righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1769 Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1770 Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
1771 IndexRelationGetNumberOfKeyAttributes(rel));
1772 if (PageAddItem(rightpage, righthighkey, itemsz, afterrightoff, false, false) == InvalidOffsetNumber)
1773 {
1774 elog(ERROR, "failed to add high key to the right sibling"
1775 " while splitting block %u of index \"%s\"",
1776 origpagenumber, RelationGetRelationName(rel));
1777 }
1778 afterrightoff = OffsetNumberNext(afterrightoff);
1779 }
1780
1781 /*
1782 * Internal page splits truncate first data item on right page -- it
1783 * becomes "minus infinity" item for the page. Set this up here.
1784 */
1785 minusinfoff = InvalidOffsetNumber;
1786 if (!isleaf)
1787 minusinfoff = afterrightoff;
1788
1789 /*
1790 * Now transfer all the data items (non-pivot tuples in isleaf case, or
1791 * additional pivot tuples in !isleaf case) to the appropriate page.
1792 *
1793 * Note: we *must* insert at least the right page's items in item-number
1794 * order, for the benefit of _bt_restore_page().
1795 */
1796 for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1797 {
1798 IndexTuple dataitem;
1799
1800 itemid = PageGetItemId(origpage, i);
1801 itemsz = ItemIdGetLength(itemid);
1802 dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1803
1804 /* replace original item with nposting due to posting split? */
1805 if (i == origpagepostingoff)
1806 {
1807 Assert(BTreeTupleIsPosting(dataitem));
1808 Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1809 dataitem = nposting;
1810 }
1811
1812 /* does new item belong before this one? */
1813 else if (i == newitemoff)
1814 {
1815 if (newitemonleft)
1816 {
1817 Assert(newitemoff <= firstrightoff);
1818 if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1819 false))
1820 {
1821 elog(ERROR, "failed to add new item to the left sibling"
1822 " while splitting block %u of index \"%s\"",
1823 origpagenumber, RelationGetRelationName(rel));
1824 }
1825 afterleftoff = OffsetNumberNext(afterleftoff);
1826 }
1827 else
1828 {
1829 Assert(newitemoff >= firstrightoff);
1830 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1831 afterrightoff == minusinfoff))
1832 {
1833 elog(ERROR, "failed to add new item to the right sibling"
1834 " while splitting block %u of index \"%s\"",
1835 origpagenumber, RelationGetRelationName(rel));
1836 }
1837 afterrightoff = OffsetNumberNext(afterrightoff);
1838 }
1839 }
1840
1841 /* decide which page to put it on */
1842 if (i < firstrightoff)
1843 {
1844 if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1845 {
1846 elog(ERROR, "failed to add old item to the left sibling"
1847 " while splitting block %u of index \"%s\"",
1848 origpagenumber, RelationGetRelationName(rel));
1849 }
1850 afterleftoff = OffsetNumberNext(afterleftoff);
1851 }
1852 else
1853 {
1854 if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1855 afterrightoff == minusinfoff))
1856 {
1857 elog(ERROR, "failed to add old item to the right sibling"
1858 " while splitting block %u of index \"%s\"",
1859 origpagenumber, RelationGetRelationName(rel));
1860 }
1861 afterrightoff = OffsetNumberNext(afterrightoff);
1862 }
1863 }
1864
1865 /* Handle case where newitem goes at the end of rightpage */
1866 if (i <= newitemoff)
1867 {
1868 /*
1869 * Can't have newitemonleft here; that would imply we were told to put
1870 * *everything* on the left page, which cannot fit (if it could, we'd
1871 * not be splitting the page).
1872 */
1873 Assert(!newitemonleft && newitemoff == maxoff + 1);
1874 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1875 afterrightoff == minusinfoff))
1876 {
1877 elog(ERROR, "failed to add new item to the right sibling"
1878 " while splitting block %u of index \"%s\"",
1879 origpagenumber, RelationGetRelationName(rel));
1880 }
1881 afterrightoff = OffsetNumberNext(afterrightoff);
1882 }
1883
1884 /*
1885 * We have to grab the original right sibling (if any) and update its prev
1886 * link. We are guaranteed that this is deadlock-free, since we couple
1887 * the locks in the standard order: left to right.
1888 */
1889 if (!isrightmost)
1890 {
1891 sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1892 spage = BufferGetPage(sbuf);
1893 sopaque = BTPageGetOpaque(spage);
1894 if (sopaque->btpo_prev != origpagenumber)
1895 {
1896 ereport(ERROR,
1897 (errcode(ERRCODE_INDEX_CORRUPTED),
1898 errmsg_internal("right sibling's left-link doesn't match: "
1899 "block %u links to %u instead of expected %u in index \"%s\"",
1900 oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1901 RelationGetRelationName(rel))));
1902 }
1903
1904 /*
1905 * Check to see if we can set the SPLIT_END flag in the right-hand
1906 * split page; this can save some I/O for vacuum since it need not
1907 * proceed to the right sibling. We can set the flag if the right
1908 * sibling has a different cycleid: that means it could not be part of
1909 * a group of pages that were all split off from the same ancestor
1910 * page. If you're confused, imagine that page A splits to A B and
1911 * then again, yielding A C B, while vacuum is in progress. Tuples
1912 * originally in A could now be in either B or C, hence vacuum must
1913 * examine both pages. But if D, our right sibling, has a different
1914 * cycleid then it could not contain any tuples that were in A when
1915 * the vacuum started.
1916 */
1917 if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1918 ropaque->btpo_flags |= BTP_SPLIT_END;
1919 }
1920
1921 /*
1922 * Right sibling is locked, new siblings are prepared, but original page
1923 * is not updated yet.
1924 *
1925 * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1926 * not starting the critical section till here because we haven't been
1927 * scribbling on the original page yet; see comments above.
1928 */
1929 START_CRIT_SECTION();
1930
1931 /*
1932 * By here, the original data page has been split into two new halves, and
1933 * these are correct. The algorithm requires that the left page never
1934 * move during a split, so we copy the new left page back on top of the
1935 * original. We need to do this before writing the WAL record, so that
1936 * XLogInsert can WAL log an image of the page if necessary.
1937 */
1938 memcpy(origpage, leftpage, BLCKSZ);
1939 /* leftpage, lopaque must not be used below here */
1940
1941 /*
1942 * Move the contents of the right page from its temporary location to the
1943 * destination buffer, before writing the WAL record. Unlike the left
1944 * page, the right page and its opaque area are still needed to complete
1945 * the update of the page, so reinitialize them.
1946 */
1947 rightpage = BufferGetPage(rbuf);
1948 memcpy(rightpage, rightpage_buf.data, BLCKSZ);
1949 ropaque = BTPageGetOpaque(rightpage);
1950
1951 MarkBufferDirty(buf);
1952 MarkBufferDirty(rbuf);
1953
1954 if (!isrightmost)
1955 {
1956 sopaque->btpo_prev = rightpagenumber;
1957 MarkBufferDirty(sbuf);
1958 }
1959
1960 /*
1961 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1962 * a split
1963 */
1964 if (!isleaf)
1965 {
1966 Page cpage = BufferGetPage(cbuf);
1967 BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1968
1969 cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1970 MarkBufferDirty(cbuf);
1971 }
1972
1973 /* XLOG stuff */
1974 if (RelationNeedsWAL(rel))
1975 {
1976 xl_btree_split xlrec;
1977 uint8 xlinfo;
1978 XLogRecPtr recptr;
1979
1980 xlrec.level = ropaque->btpo_level;
1981 /* See comments below on newitem, orignewitem, and posting lists */
1982 xlrec.firstrightoff = firstrightoff;
1983 xlrec.newitemoff = newitemoff;
1984 xlrec.postingoff = 0;
1985 if (postingoff != 0 && origpagepostingoff < firstrightoff)
1986 xlrec.postingoff = postingoff;
1987
1988 XLogBeginInsert();
1989 XLogRegisterData(&xlrec, SizeOfBtreeSplit);
1990
1991 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1992 XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
1993 /* Log original right sibling, since we've changed its prev-pointer */
1994 if (!isrightmost)
1995 XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
1996 if (!isleaf)
1997 XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
1998
1999 /*
2000 * Log the new item, if it was inserted on the left page. (If it was
2001 * put on the right page, we don't need to explicitly WAL log it
2002 * because it's included with all the other items on the right page.)
2003 * Show the new item as belonging to the left page buffer, so that it
2004 * is not stored if XLogInsert decides it needs a full-page image of
2005 * the left page. We always store newitemoff in the record, though.
2006 *
2007 * The details are sometimes slightly different for page splits that
2008 * coincide with a posting list split. If both the replacement
2009 * posting list and newitem go on the right page, then we don't need
2010 * to log anything extra, just like the simple !newitemonleft
2011 * no-posting-split case (postingoff is set to zero in the WAL record,
2012 * so recovery doesn't need to process a posting list split at all).
2013 * Otherwise, we set postingoff and log orignewitem instead of
2014 * newitem, despite having actually inserted newitem. REDO routine
2015 * must reconstruct nposting and newitem using _bt_swap_posting().
2016 *
2017 * Note: It's possible that our page split point is the point that
2018 * makes the posting list lastleft and newitem firstright. This is
2019 * the only case where we log orignewitem/newitem despite newitem
2020 * going on the right page. If XLogInsert decides that it can omit
2021 * orignewitem due to logging a full-page image of the left page,
2022 * everything still works out, since recovery only needs to log
2023 * orignewitem for items on the left page (just like the regular
2024 * newitem-logged case).
2025 */
2026 if (newitemonleft && xlrec.postingoff == 0)
2027 XLogRegisterBufData(0, newitem, newitemsz);
2028 else if (xlrec.postingoff != 0)
2029 {
2030 Assert(isleaf);
2031 Assert(newitemonleft || firstrightoff == newitemoff);
2032 Assert(newitemsz == IndexTupleSize(orignewitem));
2033 XLogRegisterBufData(0, orignewitem, newitemsz);
2034 }
2035
2036 /* Log the left page's new high key */
2037 if (!isleaf)
2038 {
2039 /* lefthighkey isn't local copy, get current pointer */
2040 itemid = PageGetItemId(origpage, P_HIKEY);
2041 lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2042 }
2043 XLogRegisterBufData(0, lefthighkey,
2044 MAXALIGN(IndexTupleSize(lefthighkey)));
2045
2046 /*
2047 * Log the contents of the right page in the format understood by
2048 * _bt_restore_page(). The whole right page will be recreated.
2049 *
2050 * Direct access to page is not good but faster - we should implement
2051 * some new func in page API. Note we only store the tuples
2052 * themselves, knowing that they were inserted in item-number order
2053 * and so the line pointers can be reconstructed. See comments for
2054 * _bt_restore_page().
2055 */
2056 XLogRegisterBufData(1,
2057 (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
2058 ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2059
2060 xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
2061 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2062
2063 PageSetLSN(origpage, recptr);
2064 PageSetLSN(rightpage, recptr);
2065 if (!isrightmost)
2066 PageSetLSN(spage, recptr);
2067 if (!isleaf)
2068 PageSetLSN(BufferGetPage(cbuf), recptr);
2069 }
2070
2071 END_CRIT_SECTION();
2072
2073 /* release the old right sibling */
2074 if (!isrightmost)
2075 _bt_relbuf(rel, sbuf);
2076
2077 /* release the child */
2078 if (!isleaf)
2079 _bt_relbuf(rel, cbuf);
2080
2081 /* be tidy */
2082 if (isleaf)
2083 pfree(lefthighkey);
2084
2085 /* split's done */
2086 return rbuf;
2087}
static Size BufferGetPageSize(Buffer buffer)
Definition: bufmgr.h:414
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:385
static bool _bt_pgaddtup(Page page, Size itemsize, const IndexTupleData *itup, OffsetNumber itup_off, bool newfirstdataitem)
Definition: nbtinsert.c:2635
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1129
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:84
#define BTP_SPLIT_END
Definition: nbtree.h:82
OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
Definition: nbtsplitloc.c:130
BTCycleId _bt_vacuum_cycleid(Relation rel)
Definition: nbtutils.c:3620
IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition: nbtutils.c:3883
#define XLOG_BTREE_SPLIT_R
Definition: nbtxlog.h:31
#define SizeOfBtreeSplit
Definition: nbtxlog.h:158
#define XLOG_BTREE_SPLIT_L
Definition: nbtxlog.h:30
uint16 postingoff
Definition: nbtxlog.h:155
OffsetNumber firstrightoff
Definition: nbtxlog.h:153
uint32 level
Definition: nbtxlog.h:152
OffsetNumber newitemoff
Definition: nbtxlog.h:154
char data[BLCKSZ]
Definition: c.h:1122

References _bt_allocbuf(), _bt_findsplitloc(), _bt_getbuf(), _bt_pageinit(), _bt_pgaddtup(), _bt_relbuf(), _bt_truncate(), _bt_vacuum_cycleid(), Assert(), BT_WRITE, BTP_HAS_GARBAGE, BTP_INCOMPLETE_SPLIT, BTP_ROOT, BTP_SPLIT_END, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetNAtts, BTreeTupleIsPosting(), buf, BufferGetBlockNumber(), BufferGetPage(), BufferGetPageSize(), PGAlignedBlock::data, elog, END_CRIT_SECTION, ereport, errcode(), errmsg_internal(), ERROR, xl_btree_split::firstrightoff, i, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize(), InvalidBuffer, InvalidOffsetNumber, ItemIdGetLength, ItemPointerCompare(), xl_btree_split::level, MarkBufferDirty(), MAXALIGN, xl_btree_split::newitemoff, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageAddItem, PageGetItem(), PageGetItemId(), PageGetLSN(), PageGetMaxOffsetNumber(), PageSetLSN(), pfree(), xl_btree_split::postingoff, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeSplit, START_CRIT_SECTION, IndexTupleData::t_tid, XLOG_BTREE_SPLIT_L, XLOG_BTREE_SPLIT_R, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insertonpg().
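
The logging rules described in the comments above boil down to a three-way choice about which extra tuple image, if any, is registered with the left page's WAL data. The following standalone sketch models that decision; it is illustrative only — the SplitLogChoice enum, the split_log_choice() helper, and main() are invented for this page and are not part of the nbtree code:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Which extra tuple image gets registered with the left page's data */
typedef enum SplitLogChoice
{
	LOG_NOTHING_EXTRA,			/* newitem on right page, no posting split */
	LOG_NEWITEM,				/* newitem on left page, no posting split */
	LOG_ORIGNEWITEM				/* posting split; REDO calls _bt_swap_posting() */
} SplitLogChoice;

static SplitLogChoice
split_log_choice(bool newitemonleft, uint16_t postingoff)
{
	if (postingoff != 0)
		return LOG_ORIGNEWITEM; /* recovery reconstructs nposting and newitem */
	if (newitemonleft)
		return LOG_NEWITEM;		/* left page image must carry newitem */
	return LOG_NOTHING_EXTRA;	/* right page image already carries it */
}

int
main(void)
{
	/* plain split, newitem went left: log newitem */
	printf("%d\n", split_log_choice(true, 0));	/* prints 1 */
	/* posting list split: log orignewitem instead */
	printf("%d\n", split_log_choice(false, 7)); /* prints 2 */
	/* plain split, newitem went right: nothing extra */
	printf("%d\n", split_log_choice(false, 0)); /* prints 0 */
	return 0;
}

The right page needs no such choice: its entire tuple area, the byte range from pd_upper up to pd_special, is registered wholesale and recreated at REDO time by _bt_restore_page().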

◆ _bt_stepright()

static void _bt_stepright ( Relation  rel,
Relation  heaprel,
BTInsertState  insertstate,
BTStack  stack 
)
static

Definition at line 1028 of file nbtinsert.c.

1030{
1031 Page page;
1032 BTPageOpaque opaque;
1033 Buffer rbuf;
1034 BlockNumber rblkno;
1035
1036 Assert(heaprel != NULL);
1037 page = BufferGetPage(insertstate->buf);
1038 opaque = BTPageGetOpaque(page);
1039
1040 rbuf = InvalidBuffer;
1041 rblkno = opaque->btpo_next;
1042 for (;;)
1043 {
1044 rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1045 page = BufferGetPage(rbuf);
1046 opaque = BTPageGetOpaque(page);
1047
1048 /*
1049 * If this page was incompletely split, finish the split now. We do
1050 * this while holding a lock on the left sibling, which is not good
1051 * because finishing the split could be a fairly lengthy operation.
1052 * But this should happen very seldom.
1053 */
1054 if (P_INCOMPLETE_SPLIT(opaque))
1055 {
1056 _bt_finish_split(rel, heaprel, rbuf, stack);
1057 rbuf = InvalidBuffer;
1058 continue;
1059 }
1060
1061 if (!P_IGNORE(opaque))
1062 break;
1063 if (P_RIGHTMOST(opaque))
1064 elog(ERROR, "fell off the end of index \"%s\"",
1065 RelationGetRelationName(rel));
1066
1067 rblkno = opaque->btpo_next;
1068 }
1069 /* rbuf locked; unlock buf, update state for caller */
1070 _bt_relbuf(rel, insertstate->buf);
1071 insertstate->buf = rbuf;
1072 insertstate->bounds_valid = false;
1073}

References _bt_finish_split(), _bt_relandgetbuf(), _bt_relbuf(), Assert(), BTInsertStateData::bounds_valid, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTInsertStateData::buf, BufferGetPage(), elog, ERROR, InvalidBuffer, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_findinsertloc().
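
The loop above is an instance of the nbtree move-right protocol: lock the right sibling before releasing the current page, finish any incomplete split before stepping past it, and skip ignorable (half-dead or deleted) pages. Below is a simplified standalone model of the same control flow; buffer locking is omitted, and every type and helper is an invented stand-in for the real buffer-manager and nbtree calls, not PostgreSQL API:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Invented stand-in for a btree page and its status flags */
typedef struct FakePage
{
	bool		incomplete_split;	/* models P_INCOMPLETE_SPLIT() */
	bool		ignore;				/* models P_IGNORE(): half-dead/deleted */
	bool		rightmost;			/* models P_RIGHTMOST() */
	struct FakePage *next;			/* models btpo_next */
} FakePage;

/* Invented stand-in for _bt_finish_split() */
static void
fake_finish_split(FakePage *page)
{
	page->incomplete_split = false;
}

/*
 * Same control flow as _bt_stepright()'s loop: finish incomplete
 * splits before stepping past them, skip ignorable pages, and fail
 * hard if we run off the rightmost page.
 */
static FakePage *
step_right(FakePage *start)
{
	FakePage   *page = start->next;

	for (;;)
	{
		if (page->incomplete_split)
		{
			fake_finish_split(page);
			continue;			/* re-examine the same page */
		}
		if (!page->ignore)
			break;				/* found a usable page */
		if (page->rightmost)
		{
			fprintf(stderr, "fell off the end of index\n");
			exit(EXIT_FAILURE);
		}
		page = page->next;
	}
	return page;
}

int
main(void)
{
	FakePage	live = {false, false, true, NULL};	/* usable target */
	FakePage	dead = {false, true, false, &live}; /* skipped: half-dead */
	FakePage	cur = {false, false, false, &dead}; /* insertion start */

	printf("reached target: %s\n",
		   step_right(&cur) == &live ? "yes" : "no");
	return 0;
}

In the real code the step from one page to the next additionally swaps buffer locks via _bt_relandgetbuf(), so that at most one write lock is held at a time while still never losing track of the insertion position.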