Skip to content

Commit 882792c

Browse files
anarazel authored and Commitfest Bot committed
bufmgr: aio: Prototype for not waiting for already-in-progress IO
Author:
Reviewed-by:
Discussion: https://postgr.es/m/zljergweqti7x67lg5ije2rzjusie37nslsnkjkkby4laqqbfw@3p3zu522yykv
Backpatch:
1 parent b8f1c62 commit 882792c

File tree

2 files changed

+142
-9
lines changed

2 files changed

+142
-9
lines changed

src/backend/storage/buffer/bufmgr.c

Lines changed: 141 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1538,6 +1538,46 @@ ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
15381538
return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
15391539
}
15401540

1541+
/*
1542+
* Check if the buffer is already undergoing read AIO. If it is, assign the
1543+
* IO's wait reference to operation->io_wref, thereby allowing the caller to
1544+
* wait for that IO.
1545+
*/
1546+
static inline bool
1547+
ReadBuffersIOAlreadyInProgress(ReadBuffersOperation *operation, Buffer buffer)
1548+
{
1549+
BufferDesc *desc;
1550+
uint32 buf_state;
1551+
PgAioWaitRef iow;
1552+
1553+
pgaio_wref_clear(&iow);
1554+
1555+
if (BufferIsLocal(buffer))
1556+
{
1557+
desc = GetLocalBufferDescriptor(-buffer - 1);
1558+
buf_state = pg_atomic_read_u32(&desc->state);
1559+
if ((buf_state & BM_IO_IN_PROGRESS) && !(buf_state & BM_VALID))
1560+
iow = desc->io_wref;
1561+
}
1562+
else
1563+
{
1564+
desc = GetBufferDescriptor(buffer - 1);
1565+
buf_state = LockBufHdr(desc);
1566+
1567+
if ((buf_state & BM_IO_IN_PROGRESS) && !(buf_state & BM_VALID))
1568+
iow = desc->io_wref;
1569+
UnlockBufHdr(desc, buf_state);
1570+
}
1571+
1572+
if (pgaio_wref_valid(&iow))
1573+
{
1574+
operation->io_wref = iow;
1575+
return true;
1576+
}
1577+
1578+
return false;
1579+
}
1580+
15411581
/*
15421582
* Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
15431583
*/
@@ -1670,7 +1710,7 @@ WaitReadBuffers(ReadBuffersOperation *operation)
16701710
*
16711711
* we first check if we already know the IO is complete.
16721712
*/
1673-
if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1713+
if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) &&
16741714
!pgaio_wref_check_done(&operation->io_wref))
16751715
{
16761716
instr_time io_start = pgstat_prepare_io_time(track_io_timing);
@@ -1689,11 +1729,66 @@ WaitReadBuffers(ReadBuffersOperation *operation)
16891729
Assert(pgaio_wref_check_done(&operation->io_wref));
16901730
}
16911731

1692-
/*
1693-
* We now are sure the IO completed. Check the results. This
1694-
* includes reporting on errors if there were any.
1695-
*/
1696-
ProcessReadBuffersResult(operation);
1732+
if (unlikely(operation->foreign_io))
1733+
{
1734+
Buffer buffer = operation->buffers[operation->nblocks_done];
1735+
BufferDesc *desc;
1736+
uint32 buf_state;
1737+
1738+
if (BufferIsLocal(buffer))
1739+
{
1740+
desc = GetLocalBufferDescriptor(-buffer - 1);
1741+
buf_state = pg_atomic_read_u32(&desc->state);
1742+
}
1743+
else
1744+
{
1745+
desc = GetBufferDescriptor(buffer - 1);
1746+
buf_state = LockBufHdr(desc);
1747+
UnlockBufHdr(desc, buf_state);
1748+
}
1749+
1750+
if (buf_state & BM_VALID)
1751+
{
1752+
operation->nblocks_done += 1;
1753+
Assert(operation->nblocks_done <= operation->nblocks);
1754+
1755+
/*
1756+
* Report and track this as a 'hit' for this backend, even
1757+
* though it must have started out as a miss in
1758+
* PinBufferForBlock(). The other backend (or ourselves,
1759+
* as part of a read started earlier) will track this as a
1760+
* 'read'.
1761+
*/
1762+
TRACE_POSTGRESQL_BUFFER_READ_DONE(operation->forknum,
1763+
operation->blocknum + operation->nblocks_done,
1764+
operation->smgr->smgr_rlocator.locator.spcOid,
1765+
operation->smgr->smgr_rlocator.locator.dbOid,
1766+
operation->smgr->smgr_rlocator.locator.relNumber,
1767+
operation->smgr->smgr_rlocator.backend,
1768+
true);
1769+
1770+
if (BufferIsLocal(buffer))
1771+
pgBufferUsage.local_blks_hit += 1;
1772+
else
1773+
pgBufferUsage.shared_blks_hit += 1;
1774+
1775+
if (operation->rel)
1776+
pgstat_count_buffer_hit(operation->rel);
1777+
1778+
pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1779+
1780+
if (VacuumCostActive)
1781+
VacuumCostBalance += VacuumCostPageHit;
1782+
}
1783+
}
1784+
else
1785+
{
1786+
/*
1787+
* We now are sure the IO completed. Check the results. This
1788+
* includes reporting on errors if there were any.
1789+
*/
1790+
ProcessReadBuffersResult(operation);
1791+
}
16971792
}
16981793

16991794
/*
@@ -1779,6 +1874,43 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
17791874
io_object = IOOBJECT_RELATION;
17801875
}
17811876

1877+
/*
1878+
* If AIO is in progress, be it in this backend or another backend, we
1879+
* just associate the wait reference with the operation and wait in
1880+
* WaitReadBuffers(). This turns out to be important for performance in
1881+
* two workloads:
1882+
*
1883+
* 1) A read stream that has to read the same block multiple times within
1884+
* the readahead distance. This can happen e.g. for the table accesses of
1885+
* an index scan.
1886+
*
1887+
* 2) Concurrent scans by multiple backends on the same relation.
1888+
*
1889+
* If we were to synchronously wait for the in-progress IO, we'd not be
1890+
* able to keep enough I/O in flight.
1891+
*
1892+
* If we do find there is ongoing I/O for the buffer, we set up a 1-block
1893+
* ReadBuffersOperation that WaitReadBuffers then can wait on.
1894+
*
1895+
* It's possible that another backend starts IO on the buffer between this
1896+
* check and the ReadBuffersCanStartIO(nowait = false) below. In that case
1897+
* we will synchronously wait for the IO below, but the window for that is
1898+
* small enough that it won't happen often enough to have a significant
1899+
* performance impact.
1900+
*/
1901+
if (ReadBuffersIOAlreadyInProgress(operation, buffers[nblocks_done]))
1902+
{
1903+
*nblocks_progress = 1;
1904+
operation->foreign_io = true;
1905+
1906+
CheckReadBuffersOperation(operation, false);
1907+
1908+
1909+
return true;
1910+
}
1911+
1912+
operation->foreign_io = false;
1913+
17821914
/*
17831915
* If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
17841916
* flag. The reason for that is that, hopefully, zero_damaged_pages isn't
@@ -1836,9 +1968,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
18361968
/*
18371969
* Check if we can start IO on the first to-be-read buffer.
18381970
*
1839-
* If an I/O is already in progress in another backend, we want to wait
1840-
* for the outcome: either done, or something went wrong and we will
1841-
* retry.
1971+
* If a synchronous I/O is in progress in another backend (it can't be
1972+
* this backend), we want to wait for the outcome: either done, or
1973+
* something went wrong and we will retry.
18421974
*/
18431975
if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
18441976
{

src/include/storage/bufmgr.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ struct ReadBuffersOperation
147147
int flags;
148148
int16 nblocks;
149149
int16 nblocks_done;
150+
bool foreign_io;
150151
PgAioWaitRef io_wref;
151152
PgAioReturn io_return;
152153
};

0 commit comments

Comments
 (0)