PostgreSQL Source Code git master
nbtdedup.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "utils/rel.h"

Functions

static void _bt_bottomupdel_finish_pending (Page page, BTDedupState state, TM_IndexDeleteOp *delstate)
 
static bool _bt_do_singleval (Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
 
static void _bt_singleval_fillfactor (Page page, BTDedupState state, Size newitemsz)
 
void _bt_dedup_pass (Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
 
bool _bt_bottomupdel_pass (Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
 
void _bt_dedup_start_pending (BTDedupState state, IndexTuple base, OffsetNumber baseoff)
 
bool _bt_dedup_save_htid (BTDedupState state, IndexTuple itup)
 
Size _bt_dedup_finish_pending (Page newpage, BTDedupState state)
 
IndexTuple _bt_form_posting (IndexTuple base, const ItemPointerData *htids, int nhtids)
 
void _bt_update_posting (BTVacuumPosting vacposting)
 
IndexTuple _bt_swap_posting (IndexTuple newitem, IndexTuple oposting, int postingoff)
 

Function Documentation

◆ _bt_bottomupdel_finish_pending()

static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, TM_IndexDeleteOp *delstate) [static]

Definition at line 646 of file nbtdedup.c.

648{
649 bool dupinterval = (state->nitems > 1);
650
651 Assert(state->nitems > 0);
652 Assert(state->nitems <= state->nhtids);
653 Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
654
655 for (int i = 0; i < state->nitems; i++)
656 {
657 OffsetNumber offnum = state->baseoff + i;
658 ItemId itemid = PageGetItemId(page, offnum);
659 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
660 TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
661 TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
662
663 if (!BTreeTupleIsPosting(itup))
664 {
665 /* Simple case: A plain non-pivot tuple */
666 ideltid->tid = itup->t_tid;
667 ideltid->id = delstate->ndeltids;
668 istatus->idxoffnum = offnum;
669 istatus->knowndeletable = false; /* for now */
670 istatus->promising = dupinterval; /* simple rule */
671 istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
672
673 delstate->ndeltids++;
674 }
675 else
676 {
677 /*
678 * Complicated case: A posting list tuple.
679 *
680 * We make the conservative assumption that there can only be at
681 * most one affected logical row per posting list tuple. There
682 * will be at most one promising entry in deltids to represent
683 * this presumed lone logical row. Note that this isn't even
684 * considered unless the posting list tuple is also in an interval
685 * of duplicates -- this complicated rule is just a variant of the
686 * simple rule used to decide if plain index tuples are promising.
687 */
688 int nitem = BTreeTupleGetNPosting(itup);
689 bool firstpromising = false;
690 bool lastpromising = false;
691
692 Assert(_bt_posting_valid(itup));
693
694 if (dupinterval)
695 {
696 /*
697 * Complicated rule: either the first or last TID in the
698 * posting list gets marked promising (if any at all)
699 */
700 BlockNumber minblocklist,
701 midblocklist,
702 maxblocklist;
703 ItemPointer mintid,
704 midtid,
705 maxtid;
706
707 mintid = BTreeTupleGetHeapTID(itup);
708 midtid = BTreeTupleGetPostingN(itup, nitem / 2);
709 maxtid = BTreeTupleGetMaxHeapTID(itup);
710 minblocklist = ItemPointerGetBlockNumber(mintid);
711 midblocklist = ItemPointerGetBlockNumber(midtid);
712 maxblocklist = ItemPointerGetBlockNumber(maxtid);
713
714 /* Only entry with predominant table block can be promising */
715 firstpromising = (minblocklist == midblocklist);
716 lastpromising = (!firstpromising &&
717 midblocklist == maxblocklist);
718 }
719
720 for (int p = 0; p < nitem; p++)
721 {
722 ItemPointer htid = BTreeTupleGetPostingN(itup, p);
723
724 ideltid->tid = *htid;
725 ideltid->id = delstate->ndeltids;
726 istatus->idxoffnum = offnum;
727 istatus->knowndeletable = false; /* for now */
728 istatus->promising = false;
729 if ((firstpromising && p == 0) ||
730 (lastpromising && p == nitem - 1))
731 istatus->promising = true;
732 istatus->freespace = sizeof(ItemPointerData); /* at worst */
733
734 ideltid++;
735 istatus++;
736 delstate->ndeltids++;
737 }
738 }
739 }
740
741 if (dupinterval)
742 {
743 state->intervals[state->nintervals].nitems = state->nitems;
744 state->nintervals++;
745 }
746
747 /* Reset state for next interval */
748 state->nhtids = 0;
749 state->nitems = 0;
750 state->phystupsize = 0;
751}
uint32 BlockNumber
Definition: block.h:31
static void * PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:353
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:243
Assert(PointerIsAligned(start, uint64))
int i
Definition: isn.c:77
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
struct ItemIdData ItemIdData
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
struct ItemPointerData ItemPointerData
IndexTupleData * IndexTuple
Definition: itup.h:53
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:545
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:665
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:639
uint16 OffsetNumber
Definition: off.h:24
ItemPointerData t_tid
Definition: itup.h:37
TM_IndexStatus * status
Definition: tableam.h:254
TM_IndexDelete * deltids
Definition: tableam.h:253
ItemPointerData tid
Definition: tableam.h:212
bool knowndeletable
Definition: tableam.h:219
bool promising
Definition: tableam.h:222
int16 freespace
Definition: tableam.h:223
OffsetNumber idxoffnum
Definition: tableam.h:218
Definition: regguts.h:323

References Assert(), BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), TM_IndexDeleteOp::deltids, TM_IndexStatus::freespace, i, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, ItemIdGetLength, ItemPointerGetBlockNumber(), TM_IndexStatus::knowndeletable, TM_IndexDeleteOp::ndeltids, PageGetItem(), PageGetItemId(), TM_IndexStatus::promising, TM_IndexDeleteOp::status, IndexTupleData::t_tid, and TM_IndexDelete::tid.

Referenced by _bt_bottomupdel_pass().
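
The "predominant table block" rule above is easier to see with concrete numbers. A standalone illustration, not backend code (promising_entries is a made-up helper, and each TID is modeled by its table block number only):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint32_t BlockNumber;

static void
promising_entries(const BlockNumber *blks, int nitem,
                  bool *firstpromising, bool *lastpromising)
{
    BlockNumber minblk = blks[0];           /* first TID's table block */
    BlockNumber midblk = blks[nitem / 2];   /* middle TID's table block */
    BlockNumber maxblk = blks[nitem - 1];   /* last TID's table block */

    /* Only an entry in the predominant table block can be promising */
    *firstpromising = (minblk == midblk);
    *lastpromising = (!*firstpromising && midblk == maxblk);
}

int
main(void)
{
    /* 5 sorted TIDs, 3 of which point into table block 7: block 7
     * predominates, so only the first TID is marked promising */
    BlockNumber blks[] = {7, 7, 7, 9, 12};
    bool first, last;

    promising_entries(blks, 5, &first, &last);
    printf("first=%d last=%d\n", first, last);      /* first=1 last=0 */
    return 0;
}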

◆ _bt_bottomupdel_pass()

bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, Size newitemsz)

Definition at line 307 of file nbtdedup.c.

309{
310 OffsetNumber offnum,
311 minoff,
312 maxoff;
313 Page page = BufferGetPage(buf);
314 BTPageOpaque opaque = BTPageGetOpaque(page);
315 BTDedupState state;
316 TM_IndexDeleteOp delstate;
317 bool neverdedup;
318 int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
319
320 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
321 newitemsz += sizeof(ItemIdData);
322
323 /* Initialize deduplication state */
324 state = (BTDedupState) palloc(sizeof(BTDedupStateData));
325 state->deduplicate = true;
326 state->nmaxitems = 0;
327 state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
328 state->base = NULL;
329 state->baseoff = InvalidOffsetNumber;
330 state->basetupsize = 0;
331 state->htids = palloc(state->maxpostingsize);
332 state->nhtids = 0;
333 state->nitems = 0;
334 state->phystupsize = 0;
335 state->nintervals = 0;
336
337 /*
338 * Initialize tableam state that describes bottom-up index deletion
339 * operation.
340 *
341 * We'll go on to ask the tableam to search for TIDs whose index tuples we
342 * can safely delete. The tableam will search until our leaf page space
343 * target is satisfied, or until the cost of continuing with the tableam
344 * operation seems too high. It focuses its efforts on TIDs associated
345 * with duplicate index tuples that we mark "promising".
346 *
347 * This space target is a little arbitrary. The tableam must be able to
348 * keep the costs and benefits in balance. We provide the tableam with
349 * exhaustive information about what might work, without directly
350 * concerning ourselves with avoiding work during the tableam call. Our
351 * role in costing the bottom-up deletion process is strictly advisory.
352 */
353 delstate.irel = rel;
354 delstate.iblknum = BufferGetBlockNumber(buf);
355 delstate.bottomup = true;
356 delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
357 delstate.ndeltids = 0;
358 delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
359 delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
360
361 minoff = P_FIRSTDATAKEY(opaque);
362 maxoff = PageGetMaxOffsetNumber(page);
363 for (offnum = minoff;
364 offnum <= maxoff;
365 offnum = OffsetNumberNext(offnum))
366 {
367 ItemId itemid = PageGetItemId(page, offnum);
368 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
369
370 Assert(!ItemIdIsDead(itemid));
371
372 if (offnum == minoff)
373 {
374 /* itup starts first pending interval */
375 _bt_dedup_start_pending(state, itup, offnum);
376 }
377 else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
378 _bt_dedup_save_htid(state, itup))
379 {
380 /* Tuple is equal; just added its TIDs to pending interval */
381 }
382 else
383 {
384 /* Finalize interval -- move its TIDs to delete state */
385 _bt_bottomupdel_finish_pending(page, state, &delstate);
386
387 /* itup starts new pending interval */
388 _bt_dedup_start_pending(state, itup, offnum);
389 }
390 }
391 /* Finalize final interval -- move its TIDs to delete state */
392 _bt_bottomupdel_finish_pending(page, state, &delstate);
393
394 /*
395 * We don't give up now in the event of having few (or even zero)
396 * promising tuples for the tableam because it's not up to us as the index
397 * AM to manage costs (note that the tableam might have heuristics of its
398 * own that work out what to do). We should at least avoid having our
399 * caller do a useless deduplication pass after we return in the event of
400 * zero promising tuples, though.
401 */
402 neverdedup = false;
403 if (state->nintervals == 0)
404 neverdedup = true;
405
406 pfree(state->htids);
407 pfree(state);
408
409 /* Ask tableam which TIDs are deletable, then physically delete them */
410 _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
411
412 pfree(delstate.deltids);
413 pfree(delstate.status);
414
415 /* Report "success" to caller unconditionally to avoid deduplication */
416 if (neverdedup)
417 return true;
418
419 /* Don't dedup when we won't end up back here any time soon anyway */
420 return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
421}
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4224
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:425
Size PageGetExactFreeSpace(const PageData *page)
Definition: bufpage.c:957
PageData * Page
Definition: bufpage.h:81
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:371
#define Max(x, y)
Definition: c.h:1001
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition: nbtdedup.c:484
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition: nbtdedup.c:433
static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, TM_IndexDeleteOp *delstate)
Definition: nbtdedup.c:646
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition: nbtpage.c:1511
#define BTPageGetOpaque(page)
Definition: nbtree.h:74
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
BTDedupStateData * BTDedupState
Definition: nbtree.h:904
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:4102
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
static char * buf
Definition: pg_test_fsync.c:72
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:534
int bottomupfreespace
Definition: tableam.h:249
Relation irel
Definition: tableam.h:246
BlockNumber iblknum
Definition: tableam.h:247

References _bt_bottomupdel_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_delitems_delete_check(), _bt_keep_natts_fast(), Assert(), TM_IndexDeleteOp::bottomup, TM_IndexDeleteOp::bottomupfreespace, BTPageGetOpaque, buf, BufferGetBlockNumber(), BufferGetPage(), TM_IndexDeleteOp::deltids, TM_IndexDeleteOp::iblknum, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, TM_IndexDeleteOp::irel, ItemIdIsDead, Max, MaxTIDsPerBTreePage, TM_IndexDeleteOp::ndeltids, OffsetNumberNext, P_FIRSTDATAKEY, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), palloc(), pfree(), and TM_IndexDeleteOp::status.

Referenced by _bt_delete_or_dedup_one_page().
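
For a sense of scale, a standalone sketch of the two thresholds used above, assuming an 8KB BLCKSZ and a 100-byte incoming item (both values are assumptions for the example):

#include <stdio.h>

#define Max(x, y)   ((x) > (y) ? (x) : (y))

int
main(void)
{
    int blcksz = 8192;          /* BLCKSZ */
    int newitemsz = 100 + 4;    /* incoming item plus its line pointer */

    /* the tableam is asked to free this much leaf page space... */
    printf("space target:  %d bytes\n", Max(blcksz / 16, newitemsz)); /* 512 */
    /* ...and the pass reports success only if this much ends up free */
    printf("success bound: %d bytes\n", Max(blcksz / 24, newitemsz)); /* 341 */
    return 0;
}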

◆ _bt_dedup_finish_pending()

Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)

Definition at line 555 of file nbtdedup.c.

556{
557 OffsetNumber tupoff;
558 Size tuplesz;
559 Size spacesaving;
560
561 Assert(state->nitems > 0);
562 Assert(state->nitems <= state->nhtids);
563 Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
564
565 tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
566 if (state->nitems == 1)
567 {
568 /* Use original, unchanged base tuple */
569 tuplesz = IndexTupleSize(state->base);
570 Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
571 Assert(tuplesz <= BTMaxItemSize);
572 if (PageAddItem(newpage, state->base, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
573 elog(ERROR, "deduplication failed to add tuple to page");
574
575 spacesaving = 0;
576 }
577 else
578 {
579 IndexTuple final;
580
581 /* Form a tuple with a posting list */
582 final = _bt_form_posting(state->base, state->htids, state->nhtids);
583 tuplesz = IndexTupleSize(final);
584 Assert(tuplesz <= state->maxpostingsize);
585
586 /* Save final number of items for posting list */
587 state->intervals[state->nintervals].nitems = state->nitems;
588
589 Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
590 Assert(tuplesz <= BTMaxItemSize);
591 if (PageAddItem(newpage, final, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
592 elog(ERROR, "deduplication failed to add tuple to page");
593
594 pfree(final);
595 spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
596 /* Increment nintervals, since we wrote a new posting list tuple */
597 state->nintervals++;
598 Assert(spacesaving > 0 && spacesaving < BLCKSZ);
599 }
600
601 /* Reset state for next pending posting list */
602 state->nhtids = 0;
603 state->nitems = 0;
604 state->phystupsize = 0;
605
606 return spacesaving;
607}
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:471
#define MAXALIGN(LEN)
Definition: c.h:814
size_t Size
Definition: c.h:614
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71
IndexTuple _bt_form_posting(IndexTuple base, const ItemPointerData *htids, int nhtids)
Definition: nbtdedup.c:862
#define BTMaxItemSize
Definition: nbtree.h:165

References _bt_form_posting(), Assert(), BTMaxItemSize, elog, ERROR, IndexTupleSize(), InvalidOffsetNumber, MAXALIGN, OffsetNumberNext, PageAddItem, PageGetMaxOffsetNumber(), and pfree().

Referenced by _bt_dedup_pass(), and btree_xlog_dedup().
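
A worked example of the space accounting above, as a standalone program. The 8-byte MAXALIGN, 4-byte line pointer, and 6-byte heap TID sizes are typical values, assumed here for illustration:

#include <stdio.h>
#include <stddef.h>

#define MAXALIGN(len)      (((size_t) (len) + 7) & ~(size_t) 7)
#define ITEMID_SIZE        4    /* sizeof(ItemIdData) */
#define ITEMPOINTER_SIZE   6    /* sizeof(ItemPointerData) */

int
main(void)
{
    size_t basetupsize = 16;    /* key portion of the base tuple */
    int    nhtids = 5;          /* five equal tuples, one TID each */

    /* physical footprint of the five originals, line pointers included */
    size_t phystupsize = nhtids * (MAXALIGN(basetupsize) + ITEMID_SIZE);

    /* the replacement posting list tuple, sized as in _bt_form_posting() */
    size_t tuplesz = MAXALIGN(basetupsize + nhtids * ITEMPOINTER_SIZE);

    /* 100 - (48 + 4) = 48 bytes returned to the page */
    printf("spacesaving = %zu bytes\n", phystupsize - (tuplesz + ITEMID_SIZE));
    return 0;
}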

◆ _bt_dedup_pass()

void _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, bool bottomupdedup)

Definition at line 59 of file nbtdedup.c.

61{
62 OffsetNumber offnum,
63 minoff,
64 maxoff;
65 Page page = BufferGetPage(buf);
66 BTPageOpaque opaque = BTPageGetOpaque(page);
67 Page newpage;
68 BTDedupState state;
69 Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
70 bool singlevalstrat = false;
71 int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
72
73 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
74 newitemsz += sizeof(ItemIdData);
75
76 /*
77 * Initialize deduplication state.
78 *
79 * It would be possible for maxpostingsize (limit on posting list tuple
80 * size) to be set to one third of the page. However, it seems like a
81 * good idea to limit the size of posting lists to one sixth of a page.
82 * That ought to leave us with a good split point when pages full of
83 * duplicates can be split several times.
84 */
85 state = (BTDedupState) palloc(sizeof(BTDedupStateData));
86 state->deduplicate = true;
87 state->nmaxitems = 0;
88 state->maxpostingsize = Min(BTMaxItemSize / 2, INDEX_SIZE_MASK);
89 /* Metadata about base tuple of current pending posting list */
90 state->base = NULL;
91 state->baseoff = InvalidOffsetNumber;
92 state->basetupsize = 0;
93 /* Metadata about current pending posting list TIDs */
94 state->htids = palloc(state->maxpostingsize);
95 state->nhtids = 0;
96 state->nitems = 0;
97 /* Size of all physical tuples to be replaced by pending posting list */
98 state->phystupsize = 0;
99 /* nintervals should be initialized to zero */
100 state->nintervals = 0;
101
102 minoff = P_FIRSTDATAKEY(opaque);
103 maxoff = PageGetMaxOffsetNumber(page);
104
105 /*
106 * Consider applying "single value" strategy, though only if the page
107 * seems likely to be split in the near future
108 */
109 if (!bottomupdedup)
110 singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
111
112 /*
113 * Deduplicate items from page, and write them to newpage.
114 *
115 * Copy the original page's LSN into newpage copy. This will become the
116 * updated version of the page. We need this because XLogInsert will
117 * examine the LSN and possibly dump it in a page image.
118 */
119 newpage = PageGetTempPageCopySpecial(page);
120 PageSetLSN(newpage, PageGetLSN(page));
121
122 /* Copy high key, if any */
123 if (!P_RIGHTMOST(opaque))
124 {
125 ItemId hitemid = PageGetItemId(page, P_HIKEY);
126 Size hitemsz = ItemIdGetLength(hitemid);
127 IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
128
129 if (PageAddItem(newpage, hitem, hitemsz, P_HIKEY, false, false) == InvalidOffsetNumber)
130 elog(ERROR, "deduplication failed to add highkey");
131 }
132
133 for (offnum = minoff;
134 offnum <= maxoff;
135 offnum = OffsetNumberNext(offnum))
136 {
137 ItemId itemid = PageGetItemId(page, offnum);
138 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
139
140 Assert(!ItemIdIsDead(itemid));
141
142 if (offnum == minoff)
143 {
144 /*
145 * No previous/base tuple for the data item -- use the data item
146 * as base tuple of pending posting list
147 */
148 _bt_dedup_start_pending(state, itup, offnum);
149 }
150 else if (state->deduplicate &&
151 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
152 _bt_dedup_save_htid(state, itup))
153 {
154 /*
155 * Tuple is equal to base tuple of pending posting list. Heap
156 * TID(s) for itup have been saved in state.
157 */
158 }
159 else
160 {
161 /*
162 * Tuple is not equal to pending posting list tuple, or
163 * _bt_dedup_save_htid() opted to not merge current item into
164 * pending posting list for some other reason (e.g., adding more
165 * TIDs would have caused posting list to exceed current
166 * maxpostingsize).
167 *
168 * If state contains pending posting list with more than one item,
169 * form new posting tuple and add it to our temp page (newpage).
170 * Else add pending interval's base tuple to the temp page as-is.
171 */
172 pagesaving += _bt_dedup_finish_pending(newpage, state);
173
174 if (singlevalstrat)
175 {
176 /*
177 * Single value strategy's extra steps.
178 *
179 * Lower maxpostingsize for sixth and final large posting list
180 * tuple at the point where 5 maxpostingsize-capped tuples
181 * have either been formed or observed.
182 *
183 * When a sixth maxpostingsize-capped item is formed/observed,
184 * stop merging together tuples altogether. The few tuples
185 * that remain at the end of the page won't be merged together
186 * at all (at least not until after a future page split takes
187 * place, when this page's newly allocated right sibling page
188 * gets its first deduplication pass).
189 */
190 if (state->nmaxitems == 5)
191 _bt_singleval_fillfactor(page, state, newitemsz);
192 else if (state->nmaxitems == 6)
193 {
194 state->deduplicate = false;
195 singlevalstrat = false; /* won't be back here */
196 }
197 }
198
199 /* itup starts new pending posting list */
200 _bt_dedup_start_pending(state, itup, offnum);
201 }
202 }
203
204 /* Handle the last item */
205 pagesaving += _bt_dedup_finish_pending(newpage, state);
206
207 /*
208 * If no items suitable for deduplication were found, newpage must be
209 * exactly the same as the original page, so just return from function.
210 *
211 * We could determine whether or not to proceed on the basis the space
212 * savings being sufficient to avoid an immediate page split instead. We
213 * don't do that because there is some small value in nbtsplitloc.c always
214 * operating against a page that is fully deduplicated (apart from
215 * newitem). Besides, most of the cost has already been paid.
216 */
217 if (state->nintervals == 0)
218 {
219 /* cannot leak memory here */
220 pfree(newpage);
221 pfree(state->htids);
222 pfree(state);
223 return;
224 }
225
226 /*
227 * By here, it's clear that deduplication will definitely go ahead.
228 *
229 * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
230 * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
231 * But keep things tidy.
232 */
233 if (P_HAS_GARBAGE(opaque))
234 {
235 BTPageOpaque nopaque = BTPageGetOpaque(newpage);
236
237 nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
238 }
239
240 START_CRIT_SECTION();
241
242 PageRestoreTempPage(newpage, page);
243 MarkBufferDirty(buf);
244
245 /* XLOG stuff */
246 if (RelationNeedsWAL(rel))
247 {
248 XLogRecPtr recptr;
249 xl_btree_dedup xlrec_dedup;
250
251 xlrec_dedup.nintervals = state->nintervals;
252
253 XLogBeginInsert();
254 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
255 XLogRegisterData(&xlrec_dedup, SizeOfBtreeDedup);
256
257 /*
258 * The intervals array is not in the buffer, but pretend that it is.
259 * When XLogInsert stores the whole buffer, the array need not be
260 * stored too.
261 */
262 XLogRegisterBufData(0, state->intervals,
263 state->nintervals * sizeof(BTDedupInterval));
264
265 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
266
267 PageSetLSN(page, recptr);
268 }
269
270 END_CRIT_SECTION();
271
272 /* Local space accounting should agree with page accounting */
273 Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
274
275 /* cannot leak memory here */
276 pfree(state->htids);
277 pfree(state);
278}
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2937
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:423
Page PageGetTempPageCopySpecial(const PageData *page)
Definition: bufpage.c:401
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:390
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:385
#define Min(x, y)
Definition: c.h:1007
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:227
#define INDEX_SIZE_MASK
Definition: itup.h:65
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
Definition: nbtdedup.c:780
Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)
Definition: nbtdedup.c:555
static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
Definition: nbtdedup.c:820
#define P_HIKEY
Definition: nbtree.h:368
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:227
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define XLOG_BTREE_DEDUP
Definition: nbtxlog.h:33
#define SizeOfBtreeDedup
Definition: nbtxlog.h:174
#define RelationNeedsWAL(relation)
Definition: rel.h:638
uint16 btpo_flags
Definition: nbtree.h:68
uint16 nintervals
Definition: nbtxlog.h:169
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:478
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition: xloginsert.c:409
void XLogRegisterData(const void *data, uint32 len)
Definition: xloginsert.c:368
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:245
void XLogBeginInsert(void)
Definition: xloginsert.c:152
#define REGBUF_STANDARD
Definition: xloginsert.h:35

References _bt_dedup_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_do_singleval(), _bt_keep_natts_fast(), _bt_singleval_fillfactor(), Assert(), BTMaxItemSize, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, buf, BufferGetPage(), elog, END_CRIT_SECTION, ERROR, INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, ItemIdIsDead, MarkBufferDirty(), Min, xl_btree_dedup::nintervals, OffsetNumberNext, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_RIGHTMOST, PageAddItem, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetLSN(), PageGetMaxOffsetNumber(), PageGetTempPageCopySpecial(), PageRestoreTempPage(), PageSetLSN(), palloc(), pfree(), PG_USED_FOR_ASSERTS_ONLY, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDedup, START_CRIT_SECTION, XLOG_BTREE_DEDUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_dedup_save_htid()

bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)

Definition at line 484 of file nbtdedup.c.

485{
486 int nhtids;
487 ItemPointer htids;
488 Size mergedtupsz;
489
490 Assert(!BTreeTupleIsPivot(itup));
491
492 if (!BTreeTupleIsPosting(itup))
493 {
494 nhtids = 1;
495 htids = &itup->t_tid;
496 }
497 else
498 {
499 nhtids = BTreeTupleGetNPosting(itup);
500 htids = BTreeTupleGetPosting(itup);
501 }
502
503 /*
504 * Don't append (have caller finish pending posting list as-is) if
505 * appending heap TID(s) from itup would put us over maxpostingsize limit.
506 *
507 * This calculation needs to match the code used within _bt_form_posting()
508 * for new posting list tuples.
509 */
510 mergedtupsz = MAXALIGN(state->basetupsize +
511 (state->nhtids + nhtids) * sizeof(ItemPointerData));
512
513 if (mergedtupsz > state->maxpostingsize)
514 {
515 /*
516 * Count this as an oversized item for single value strategy, though
517 * only when there are 50 TIDs in the final posting list tuple. This
518 * limit (which is fairly arbitrary) avoids confusion about how many
519 * 1/6 of a page tuples have been encountered/created by the current
520 * deduplication pass.
521 *
522 * Note: We deliberately don't consider which deduplication pass
523 * merged together tuples to create this item (could be a previous
524 * deduplication pass, or current pass). See _bt_do_singleval()
525 * comments.
526 */
527 if (state->nhtids > 50)
528 state->nmaxitems++;
529
530 return false;
531 }
532
533 /*
534 * Save heap TIDs to pending posting list tuple -- itup can be merged into
535 * pending posting list
536 */
537 state->nitems++;
538 memcpy(state->htids + state->nhtids, htids,
539 sizeof(ItemPointerData) * nhtids);
540 state->nhtids += nhtids;
541 state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
542
543 return true;
544}
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:481
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:538

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize(), MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().
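
A standalone sketch of the rejection test above. would_fit is a made-up helper, and the 8-byte MAXALIGN, 6-byte TIDs, and a maxpostingsize of 1352 bytes (about one sixth of an 8KB page, per the comment in _bt_dedup_pass()) are assumptions:

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

#define MAXALIGN(len)    (((size_t) (len) + 7) & ~(size_t) 7)

static bool
would_fit(size_t basetupsize, int pending_nhtids, int new_nhtids,
          size_t maxpostingsize)
{
    /* same shape as the mergedtupsz calculation above (6-byte TIDs) */
    size_t mergedtupsz = MAXALIGN(basetupsize +
                                  (pending_nhtids + new_nhtids) * 6);

    return mergedtupsz <= maxpostingsize;
}

int
main(void)
{
    /* 16-byte key, maxpostingsize 1352: a 201st TID still fits
     * (MAXALIGN(16 + 201 * 6) = 1224), but a 223rd does not
     * (MAXALIGN(16 + 223 * 6) = 1360) */
    printf("%d\n", would_fit(16, 200, 1, 1352));    /* 1 */
    printf("%d\n", would_fit(16, 222, 1, 1352));    /* 0 */
    return 0;
}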

◆ _bt_dedup_start_pending()

void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)

Definition at line 433 of file nbtdedup.c.

435{
436 Assert(state->nhtids == 0);
437 Assert(state->nitems == 0);
438 Assert(!BTreeTupleIsPivot(base));
439
440 /*
441 * Copy heap TID(s) from new base tuple for new candidate posting list
442 * into working state's array
443 */
444 if (!BTreeTupleIsPosting(base))
445 {
446 memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
447 state->nhtids = 1;
448 state->basetupsize = IndexTupleSize(base);
449 }
450 else
451 {
452 int nposting;
453
454 nposting = BTreeTupleGetNPosting(base);
455 memcpy(state->htids, BTreeTupleGetPosting(base),
456 sizeof(ItemPointerData) * nposting);
457 state->nhtids = nposting;
458 /* basetupsize should not include existing posting list */
459 state->basetupsize = BTreeTupleGetPostingOffset(base);
460 }
461
462 /*
463 * Save new base tuple itself -- it'll be needed if we actually create a
464 * new posting list from new pending posting list.
465 *
466 * Must maintain physical size of all existing tuples (including line
467 * pointer overhead) so that we can calculate space savings on page.
468 */
469 state->nitems = 1;
470 state->base = base;
471 state->baseoff = baseoff;
472 state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
473 /* Also save baseoff in pending state for interval */
474 state->intervals[state->nintervals].baseoff = state->baseoff;
475}
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:530

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize(), MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().
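
Together with _bt_dedup_save_htid() and _bt_dedup_finish_pending(), this function forms a small state machine that all of the callers above drive in the same way. A condensed sketch of that caller pattern, distilled from _bt_dedup_pass(); backend context is assumed (rel, page, newpage, minoff, maxoff and nkeyatts stand in for the caller's locals, so this is not a standalone program):

BTDedupState state = (BTDedupState) palloc(sizeof(BTDedupStateData));

state->deduplicate = true;
state->nmaxitems = 0;
state->maxpostingsize = Min(BTMaxItemSize / 2, INDEX_SIZE_MASK);
state->base = NULL;
state->baseoff = InvalidOffsetNumber;
state->basetupsize = 0;
state->htids = palloc(state->maxpostingsize);
state->nhtids = 0;
state->nitems = 0;
state->phystupsize = 0;
state->nintervals = 0;

for (OffsetNumber off = minoff; off <= maxoff; off = OffsetNumberNext(off))
{
    IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));

    if (off == minoff)
        _bt_dedup_start_pending(state, itup, off);  /* first base tuple */
    else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
             _bt_dedup_save_htid(state, itup))
    {
        /* itup is a duplicate; its TIDs were absorbed into pending list */
    }
    else
    {
        /* flush the pending interval, then restart with itup as base */
        _bt_dedup_finish_pending(newpage, state);
        _bt_dedup_start_pending(state, itup, off);
    }
}
_bt_dedup_finish_pending(newpage, state);           /* final interval */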

◆ _bt_do_singleval()

static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem) [static]

Definition at line 780 of file nbtdedup.c.

782{
783 int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
784 ItemId itemid;
785 IndexTuple itup;
786
787 itemid = PageGetItemId(page, minoff);
788 itup = (IndexTuple) PageGetItem(page, itemid);
789
790 if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
791 {
792 itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
793 itup = (IndexTuple) PageGetItem(page, itemid);
794
795 if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
796 return true;
797 }
798
799 return false;
800}

References _bt_keep_natts_fast(), IndexRelationGetNumberOfKeyAttributes, PageGetItem(), PageGetItemId(), and PageGetMaxOffsetNumber().

Referenced by _bt_dedup_pass().

◆ _bt_form_posting()

IndexTuple _bt_form_posting(IndexTuple base, const ItemPointerData *htids, int nhtids)

Definition at line 862 of file nbtdedup.c.

863{
864 uint32 keysize,
865 newsize;
866 IndexTuple itup;
867
868 if (BTreeTupleIsPosting(base))
869 keysize = BTreeTupleGetPostingOffset(base);
870 else
871 keysize = IndexTupleSize(base);
872
873 Assert(!BTreeTupleIsPivot(base));
874 Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
875 Assert(keysize == MAXALIGN(keysize));
876
877 /* Determine final size of new tuple */
878 if (nhtids > 1)
879 newsize = MAXALIGN(keysize +
880 nhtids * sizeof(ItemPointerData));
881 else
882 newsize = keysize;
883
884 Assert(newsize <= INDEX_SIZE_MASK);
885 Assert(newsize == MAXALIGN(newsize));
886
887 /* Allocate memory using palloc0() (matches index_form_tuple()) */
888 itup = palloc0(newsize);
889 memcpy(itup, base, keysize);
890 itup->t_info &= ~INDEX_SIZE_MASK;
891 itup->t_info |= newsize;
892 if (nhtids > 1)
893 {
894 /* Form posting list tuple */
895 BTreeTupleSetPosting(itup, nhtids, keysize);
896 memcpy(BTreeTupleGetPosting(itup), htids,
897 sizeof(ItemPointerData) * nhtids);
898 Assert(_bt_posting_valid(itup));
899 }
900 else
901 {
902 /* Form standard non-pivot tuple */
903 itup->t_info &= ~INDEX_ALT_TID_MASK;
904 ItemPointerCopy(htids, &itup->t_tid);
905 Assert(ItemPointerIsValid(&itup->t_tid));
906 }
907
908 return itup;
909}
uint32_t uint32
Definition: c.h:542
#define PG_UINT16_MAX
Definition: c.h:596
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition: itemptr.h:172
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition: itemptr.h:83
void * palloc0(Size size)
Definition: mcxt.c:1395
static void BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
Definition: nbtree.h:505
unsigned short t_info
Definition: itup.h:49

References Assert(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetPosting(), INDEX_SIZE_MASK, IndexTupleSize(), ItemPointerCopy(), ItemPointerIsValid(), MAXALIGN, palloc0(), PG_UINT16_MAX, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_dedup_finish_pending(), _bt_sort_dedup_finish_pending(), and bt_posting_plain_tuple().
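
The sizing rule above in isolation, as a standalone sketch (8-byte MAXALIGN and 6-byte heap TIDs assumed):

#include <stdio.h>
#include <stddef.h>

#define MAXALIGN(len)    (((size_t) (len) + 7) & ~(size_t) 7)

int
main(void)
{
    size_t keysize = 16;                  /* MAXALIGNed key portion */
    int    nhtids = 3;

    size_t newsize = (nhtids > 1)
        ? MAXALIGN(keysize + nhtids * 6)  /* key plus packed 6-byte TIDs */
        : keysize;                        /* plain tuple: key only */

    printf("newsize=%zu\n", newsize);     /* MAXALIGN(34) = 40 */
    return 0;
}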

◆ _bt_singleval_fillfactor()

static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz) [static]

Definition at line 820 of file nbtdedup.c.

821{
822 Size leftfree;
823 int reduction;
824
825 /* This calculation needs to match nbtsplitloc.c */
826 leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
827 MAXALIGN(sizeof(BTPageOpaqueData));
828 /* Subtract size of new high key (includes pivot heap TID space) */
829 leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
830
831 /*
832 * Reduce maxpostingsize by an amount equal to target free space on left
833 * half of page
834 */
835 reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
836 if (state->maxpostingsize > reduction)
837 state->maxpostingsize -= reduction;
838 else
839 state->maxpostingsize = 0;
840}
static Size PageGetPageSize(const PageData *page)
Definition: bufpage.h:276
#define SizeOfPageHeaderData
Definition: bufpage.h:216
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:203

References BTREE_SINGLEVAL_FILLFACTOR, MAXALIGN, PageGetPageSize(), and SizeOfPageHeaderData.

Referenced by _bt_dedup_pass().
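
With an 8KB page the arithmetic works out as follows. A standalone sketch; the 24-byte page header, 16-byte special space, BTREE_SINGLEVAL_FILLFACTOR of 96, and the 28-byte incoming item are assumptions matching common builds:

#include <stdio.h>

int
main(void)
{
    int pagesize = 8192;        /* BLCKSZ */
    int headersz = 24;          /* SizeOfPageHeaderData */
    int specialsz = 16;         /* MAXALIGN(sizeof(BTPageOpaqueData)) */
    int newitemsz = 28;         /* new item including its line pointer */
    int pivottidsz = 8;         /* MAXALIGN(sizeof(ItemPointerData)) */
    int fillfactor = 96;        /* BTREE_SINGLEVAL_FILLFACTOR */

    int leftfree = pagesize - headersz - specialsz
        - (newitemsz + pivottidsz);
    int reduction = (int) (leftfree * ((100 - fillfactor) / 100.0));

    /* leftfree=8116, reduction=324: maxpostingsize shrinks by ~4% of leftfree */
    printf("leftfree=%d reduction=%d\n", leftfree, reduction);
    return 0;
}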

◆ _bt_swap_posting()

IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)

Definition at line 1020 of file nbtdedup.c.

1021{
1022 int nhtids;
1023 char *replacepos;
1024 char *replaceposright;
1025 Size nmovebytes;
1026 IndexTuple nposting;
1027
1028 nhtids = BTreeTupleGetNPosting(oposting);
1029 Assert(_bt_posting_valid(oposting));
1030
1031 /*
1032 * The postingoff argument originated as a _bt_binsrch_posting() return
1033 * value. It will be 0 in the event of corruption that makes a leaf page
1034 * contain a non-pivot tuple that's somehow identical to newitem (no two
1035 * non-pivot tuples should ever have the same TID). This has been known
1036 * to happen in the field from time to time.
1037 *
1038 * Perform a basic sanity check to catch this case now.
1039 */
1040 if (!(postingoff > 0 && postingoff < nhtids))
1041 elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
1042 nhtids, postingoff);
1043
1044 /*
1045 * Move item pointers in posting list to make a gap for the new item's
1046 * heap TID. We shift TIDs one place to the right, losing original
1047 * rightmost TID. (nmovebytes must not include TIDs to the left of
1048 * postingoff, nor the existing rightmost/max TID that gets overwritten.)
1049 */
1050 nposting = CopyIndexTuple(oposting);
1051 replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
1052 replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
1053 nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
1054 memmove(replaceposright, replacepos, nmovebytes);
1055
1056 /* Fill the gap at postingoff with TID of new item (original new TID) */
1057 Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
1058 ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
1059
1060 /* Now copy oposting's rightmost/max TID into new item (final new TID) */
1061 ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
1062
1063 Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
1064 BTreeTupleGetHeapTID(newitem)) < 0);
1065 Assert(_bt_posting_valid(nposting));
1066
1067 return nposting;
1068}
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:547
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition: itemptr.c:51

References Assert(), BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), CopyIndexTuple(), elog, ERROR, ItemPointerCompare(), ItemPointerCopy(), and IndexTupleData::t_tid.

Referenced by _bt_insertonpg(), btree_xlog_insert(), and btree_xlog_split().
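
The TID shuffle is easier to follow with small numbers. A standalone model in which plain (block, offset) int pairs stand in for the 6-byte ItemPointerData entries:

#include <stdio.h>
#include <string.h>

int
main(void)
{
    /* heap TIDs as (block, offset) pairs; posting lists keep them sorted */
    int posting[4][2] = {{1, 1}, {1, 2}, {1, 4}, {1, 5}};
    int newtid[2] = {1, 3};     /* falls between (1,2) and (1,4) */
    int postingoff = 2;         /* as returned by _bt_binsrch_posting() */
    int nhtids = 4;

    /* shift TIDs at postingoff.. one slot right, dropping the rightmost
     * (saved first, since newitem will take it over) */
    int savedmax[2];
    memcpy(savedmax, posting[nhtids - 1], sizeof(savedmax));
    memmove(&posting[postingoff + 1], &posting[postingoff],
            (nhtids - postingoff - 1) * sizeof(posting[0]));

    /* newitem's TID fills the gap; newitem inherits the old max TID */
    memcpy(posting[postingoff], newtid, sizeof(newtid));
    memcpy(newtid, savedmax, sizeof(savedmax));

    for (int i = 0; i < nhtids; i++)
        printf("(%d,%d) ", posting[i][0], posting[i][1]);
    printf("\nnewitem: (%d,%d)\n", newtid[0], newtid[1]);
    /* prints: (1,1) (1,2) (1,3) (1,4)  and  newitem: (1,5) */
    return 0;
}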

◆ _bt_update_posting()

void _bt_update_posting(BTVacuumPosting vacposting)

Definition at line 922 of file nbtdedup.c.

923{
924 IndexTuple origtuple = vacposting->itup;
925 uint32 keysize,
926 newsize;
927 IndexTuple itup;
928 int nhtids;
929 int ui,
930 d;
931 ItemPointer htids;
932
933 nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
934
935 Assert(_bt_posting_valid(origtuple));
936 Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
937
938 /*
939 * Determine final size of new tuple.
940 *
941 * This calculation needs to match the code used within _bt_form_posting()
942 * for new posting list tuples. We avoid calling _bt_form_posting() here
943 * to save ourselves a second memory allocation for a htids workspace.
944 */
945 keysize = BTreeTupleGetPostingOffset(origtuple);
946 if (nhtids > 1)
947 newsize = MAXALIGN(keysize +
948 nhtids * sizeof(ItemPointerData));
949 else
950 newsize = keysize;
951
952 Assert(newsize <= INDEX_SIZE_MASK);
953 Assert(newsize == MAXALIGN(newsize));
954
955 /* Allocate memory using palloc0() (matches index_form_tuple()) */
956 itup = palloc0(newsize);
957 memcpy(itup, origtuple, keysize);
958 itup->t_info &= ~INDEX_SIZE_MASK;
959 itup->t_info |= newsize;
960
961 if (nhtids > 1)
962 {
963 /* Form posting list tuple */
964 BTreeTupleSetPosting(itup, nhtids, keysize);
965 htids = BTreeTupleGetPosting(itup);
966 }
967 else
968 {
969 /* Form standard non-pivot tuple */
970 itup->t_info &= ~INDEX_ALT_TID_MASK;
971 htids = &itup->t_tid;
972 }
973
974 ui = 0;
975 d = 0;
976 for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
977 {
978 if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
979 {
980 d++;
981 continue;
982 }
983 htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
984 }
985 Assert(ui == nhtids);
986 Assert(d == vacposting->ndeletedtids);
987 Assert(nhtids == 1 || _bt_posting_valid(itup));
988 Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
989
990 /* vacposting arg's itup will now point to updated version */
991 vacposting->itup = itup;
992}
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:922
uint16 ndeletedtids
Definition: nbtree.h:921
IndexTuple itup
Definition: nbtree.h:917

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingN(), BTreeTupleGetPostingOffset(), BTreeTupleSetPosting(), BTVacuumPostingData::deletetids, i, INDEX_SIZE_MASK, ItemPointerIsValid(), BTVacuumPostingData::itup, MAXALIGN, BTVacuumPostingData::ndeletedtids, palloc0(), IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_delitems_update(), and btree_xlog_updates().
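
The filter loop above, modeled as a standalone program with integers standing in for heap TIDs:

#include <stdio.h>

int
main(void)
{
    int htids[5] = {101, 102, 103, 104, 105};   /* original posting list */
    int deletetids[2] = {1, 3};                 /* ascending positions to drop */
    int ndeletedtids = 2;
    int newhtids[5];
    int ui = 0, d = 0;

    for (int i = 0; i < 5; i++)
    {
        /* skip TIDs whose position appears in deletetids[] */
        if (d < ndeletedtids && deletetids[d] == i)
        {
            d++;
            continue;
        }
        newhtids[ui++] = htids[i];
    }

    for (int i = 0; i < ui; i++)
        printf("%d ", newhtids[i]);             /* prints: 101 103 105 */
    printf("\n");
    return 0;
}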