diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index fdff960c1302..c110567b0fdc 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -387,16 +387,6 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* we aren't holding any read locks, but gotta drop the pins */ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - BTScanPosInvalidate(so->currPos); - } - /* * We prefer to eagerly drop leaf page pins before btgettuple returns. * This avoids making VACUUM wait to acquire a cleanup lock on the page. @@ -425,12 +415,27 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, RelationNeedsWAL(scan->indexRelation) && scan->heapRelation != NULL); + /* + * Before leaving the current position, perform final steps, since we'll + * never call _bt_steppage for its page + */ + if (BTScanPosIsValid(so->currPos)) + { + if (so->numKilled > 0) + _bt_killitems(scan); + if (!so->dropPin) + BTScanPosUnpin(so->currPos); + BTScanPosInvalidate(so->currPos); + } + + /* Always invalidate any existing mark */ so->markItemIndex = -1; - so->needPrimScan = false; - so->scanBehind = false; - so->oppositeDirCheck = false; - BTScanPosUnpinIfPinned(so->markPos); - BTScanPosInvalidate(so->markPos); + if (BTScanPosIsValid(so->markPos)) + { + if (!so->dropPin) + BTScanPosUnpin(so->markPos); + BTScanPosInvalidate(so->markPos); + } /* * Allocate tuple workspace arrays, if needed for an index-only scan and @@ -459,6 +464,9 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, */ if (scankey && scan->numberOfKeys > 0) memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); + so->needPrimScan = false; + so->scanBehind = false; + so->oppositeDirCheck = false; so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ so->numArrayKeys = 0; /* ditto */ } @@ -471,19 +479,27 @@ btendscan(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* we aren't holding any read locks, but gotta drop the pins */ + /* + * Before leaving the current position, perform final steps, since we'll + * never call _bt_steppage for its page + */ if (BTScanPosIsValid(so->currPos)) { - /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); + if (!so->dropPin) + BTScanPosUnpin(so->currPos); + BTScanPosInvalidate(so->currPos); /* unnecessary, but be consistent */ } - so->markItemIndex = -1; - BTScanPosUnpinIfPinned(so->markPos); - - /* No need to invalidate positions, the RAM is about to be freed. */ + /* Always invalidate any existing mark */ + so->markItemIndex = -1; /* unnecessary, but be consistent */ + if (BTScanPosIsValid(so->markPos)) + { + if (!so->dropPin) + BTScanPosUnpin(so->markPos); + BTScanPosInvalidate(so->markPos); /* unnecessary, but be consistent */ + } /* Release storage */ if (so->keyData != NULL) @@ -507,8 +523,14 @@ btmarkpos(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* There may be an old mark with a pin (but no lock). */ - BTScanPosUnpinIfPinned(so->markPos); + /* Always invalidate any existing mark */ + so->markItemIndex = -1; + if (BTScanPosIsValid(so->markPos)) + { + if (!so->dropPin) + BTScanPosUnpin(so->markPos); + BTScanPosInvalidate(so->markPos); + } /* * Just record the current itemIndex. If we later step to next page @@ -518,11 +540,6 @@ btmarkpos(IndexScanDesc scan) */ if (BTScanPosIsValid(so->currPos)) so->markItemIndex = so->currPos.itemIndex; - else - { - BTScanPosInvalidate(so->markPos); - so->markItemIndex = -1; - } } /* @@ -536,41 +553,62 @@ btrestrpos(IndexScanDesc scan) if (so->markItemIndex >= 0) { /* - * The scan has never moved to a new page since the last mark. Just + * The mark position is on the same page we are currently on. Just * restore the itemIndex. * - * NB: In this case we can't count on anything in so->markPos to be - * accurate. + * We generally only need to keep around a separate BTScanPosData + * (which will have its own buffer pin when !so->dropPin) for a mark + * whose page doesn't match so->currPos.currPage. That requires care + * later on in this function, and in _bt_steppage. */ + Assert(!BTScanPosIsValid(so->markPos)); /* can't also be valid */ so->currPos.itemIndex = so->markItemIndex; } else { /* - * The scan moved to a new page after last mark or restore, and we are - * now restoring to the marked page. We aren't holding any read - * locks, but if we're still holding the pin for the current position, - * we must drop it. + * The scan moved to a page beyond that of the last mark we took. + * + * Before leaving the current position, perform final steps, since + * we'll never call _bt_steppage for its page. */ if (BTScanPosIsValid(so->currPos)) { - /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); + if (!so->dropPin) + BTScanPosUnpin(so->currPos); + BTScanPosInvalidate(so->currPos); } + /* + * We're completely done with our old so->currPos. Copy so->markPos + * into so->currPos to actually restore the mark. + */ if (BTScanPosIsValid(so->markPos)) { - /* bump pin on mark buffer for assignment to current buffer */ - if (BTScanPosIsPinned(so->markPos)) - IncrBufferRefCount(so->markPos.buf); memcpy(&so->currPos, &so->markPos, offsetof(BTScanPosData, items[1]) + so->markPos.lastItem * sizeof(BTScanPosItem)); if (so->currTuples) memcpy(so->currTuples, so->markTuples, so->markPos.nextTupleOffset); + + /* + * We need to keep the mark around, in case it gets restored + * again. We use the so->markItemIndex representation for this. + * Converting to that representation involves invalidating the + * original so->markPos representation. (This is the opposite of + * what happened back when _bt_steppage created this so->markPos + * from the scan's then-obsolescent so->currPos.) + * + * Note: We deliberately avoid a "BTScanPosUnpin(so->markPos)" + * when invalidating so->markPos, since so->markPos's original pin + * is "transferred" to the new so->currPos. + */ + BTScanPosInvalidate(so->markPos); + so->markItemIndex = so->currPos.itemIndex; /* keep the mark */ + /* Reset the scan's array keys (see _bt_steppage for why) */ if (so->numArrayKeys) { @@ -578,8 +616,6 @@ btrestrpos(IndexScanDesc scan) so->needPrimScan = false; } } - else - BTScanPosInvalidate(so->currPos); } } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index d69798795b43..54f1c05eff4a 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1670,6 +1670,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight : so->currPos.moreLeft); Assert(!P_IGNORE(opaque)); + Assert(so->numKilled == 0); + Assert(so->markItemIndex < 0); Assert(BTScanPosIsPinned(so->currPos)); Assert(!so->needPrimScan); @@ -2143,6 +2145,7 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so) /* Most recent _bt_readpage must have succeeded */ Assert(BTScanPosIsValid(so->currPos)); + Assert(BTScanPosIsPinned(so->currPos) == !so->dropPin); Assert(so->currPos.itemIndex >= so->currPos.firstItem); Assert(so->currPos.itemIndex <= so->currPos.lastItem); @@ -2156,6 +2159,8 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so) * _bt_steppage() -- Step to next page containing valid data for scan * * Wrapper on _bt_readnextpage that performs final steps for the current page. + * Call here when leaving a so->currPos that _bt_readpage returned 'true' for + * (otherwise just call _bt_readnextpage directly). * * On entry, so->currPos must be valid. Its buffer will be pinned, though * never locked. (Actually, when so->dropPin there won't even be a pin held, @@ -2168,7 +2173,12 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) BlockNumber blkno, lastcurrblkno; + /* Assert that _bt_readpage returned true for so->currPos */ + Assert(so->currPos.firstItem <= so->currPos.lastItem); + + /* Assert that _bt_drop_lock_and_maybe_pin left so->currPos as expected */ Assert(BTScanPosIsValid(so->currPos)); + Assert(BTScanPosIsPinned(so->currPos) == !so->dropPin); /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) @@ -2180,9 +2190,9 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) */ if (so->markItemIndex >= 0) { - /* bump pin on current buffer for assignment to mark buffer */ - if (BTScanPosIsPinned(so->currPos)) - IncrBufferRefCount(so->currPos.buf); + /* mark/restore not supported by parallel scans */ + Assert(!scan->parallel_scan); + memcpy(&so->markPos, &so->currPos, offsetof(BTScanPosData, items[1]) + so->currPos.lastItem * sizeof(BTScanPosItem)); @@ -2216,18 +2226,23 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) so->markPos.moreLeft = true; } - /* mark/restore not supported by parallel scans */ - Assert(!scan->parallel_scan); + /* + * Saving so->currPos into so->markPos, so just unset its 'buf' field + * (avoid unpinning the buffer, since it is now owned by so->markPos). + * _bt_readnextpage expects so->currPos to always look like this. + */ + so->currPos.buf = InvalidBuffer; /* transferred to so->markPos */ } - - BTScanPosUnpinIfPinned(so->currPos); - - /* Walk to the next page with data */ - if (ScanDirectionIsForward(dir)) - blkno = so->currPos.nextPage; else - blkno = so->currPos.prevPage; - lastcurrblkno = so->currPos.currPage; + { + /* + * Not saving so->currPos into so->markPos, so release its buffer pin + * (except in so->dropPin case, where _bt_drop_lock_and_maybe_pin must + * have done so earlier) + */ + if (!so->dropPin) + BTScanPosUnpin(so->currPos); + } /* * Cancel primitive index scans that were scheduled when the call to @@ -2238,6 +2253,13 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) if (so->currPos.dir != dir) so->needPrimScan = false; + /* Walk to the next page with data */ + if (ScanDirectionIsForward(dir)) + blkno = so->currPos.nextPage; + else + blkno = so->currPos.prevPage; + lastcurrblkno = so->currPos.currPage; + return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false); } @@ -2267,10 +2289,10 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; - - so->numKilled = 0; /* just paranoia */ - so->markItemIndex = -1; /* ditto */ + BlockNumber blkno, + lastcurrblkno; /* Initialize so->currPos for the first page (page in so->currPos.buf) */ if (so->needPrimScan) @@ -2300,8 +2322,6 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) */ if (_bt_readpage(scan, dir, offnum, true)) { - Relation rel = scan->indexRelation; - /* * _bt_readpage succeeded. Drop the lock (and maybe the pin) on * so->currPos.buf in preparation for btgettuple returning tuples. @@ -2312,14 +2332,17 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) } /* There's no actually-matching data on the page in so->currPos.buf */ - _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + _bt_relbuf(rel, so->currPos.buf); + so->currPos.buf = InvalidBuffer; - /* Call _bt_readnextpage using its _bt_steppage wrapper function */ - if (!_bt_steppage(scan, dir)) - return false; + /* Walk to the next page with data */ + if (ScanDirectionIsForward(dir)) + blkno = so->currPos.nextPage; + else + blkno = so->currPos.prevPage; + lastcurrblkno = so->currPos.currPage; - /* _bt_readpage for a later page (now in so->currPos) succeeded */ - return true; + return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false); } /* @@ -2458,6 +2481,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, /* no matching tuples on this page */ _bt_relbuf(rel, so->currPos.buf); + so->currPos.buf = InvalidBuffer; seized = false; /* released by _bt_readpage (or by us) */ } diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index e709d2e0afe9..a448c2c9f945 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1007,23 +1007,18 @@ typedef BTScanPosData *BTScanPos; !BufferIsValid((scanpos).buf)), \ BufferIsValid((scanpos).buf) \ ) -#define BTScanPosUnpin(scanpos) \ - do { \ - ReleaseBuffer((scanpos).buf); \ - (scanpos).buf = InvalidBuffer; \ - } while (0) -#define BTScanPosUnpinIfPinned(scanpos) \ - do { \ - if (BTScanPosIsPinned(scanpos)) \ - BTScanPosUnpin(scanpos); \ - } while (0) - #define BTScanPosIsValid(scanpos) \ ( \ AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ !BufferIsValid((scanpos).buf)), \ BlockNumberIsValid((scanpos).currPage) \ ) + +#define BTScanPosUnpin(scanpos) \ + do { \ + ReleaseBuffer((scanpos).buf); \ + (scanpos).buf = InvalidBuffer; \ + } while (0) #define BTScanPosInvalidate(scanpos) \ do { \ (scanpos).buf = InvalidBuffer; \