@@ -1538,6 +1538,46 @@ ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
15381538 return StartBufferIO (GetBufferDescriptor (buffer - 1 ), true, nowait );
15391539}
15401540
1541+ /*
1542+ * Check if the buffer is already undergoing read AIO. If it is, assign the
1543+ * IO's wait reference to operation->io_wref, thereby allowing the caller to
1544+ * wait for that IO.
1545+ */
1546+ static inline bool
1547+ ReadBuffersIOAlreadyInProgress (ReadBuffersOperation * operation , Buffer buffer )
1548+ {
1549+ BufferDesc * desc ;
1550+ uint32 buf_state ;
1551+ PgAioWaitRef iow ;
1552+
1553+ pgaio_wref_clear (& iow );
1554+
1555+ if (BufferIsLocal (buffer ))
1556+ {
1557+ desc = GetLocalBufferDescriptor (- buffer - 1 );
1558+ buf_state = pg_atomic_read_u32 (& desc -> state );
1559+ if ((buf_state & BM_IO_IN_PROGRESS ) && !(buf_state & BM_VALID ))
1560+ iow = desc -> io_wref ;
1561+ }
1562+ else
1563+ {
1564+ desc = GetBufferDescriptor (buffer - 1 );
1565+ buf_state = LockBufHdr (desc );
1566+
1567+ if ((buf_state & BM_IO_IN_PROGRESS ) && !(buf_state & BM_VALID ))
1568+ iow = desc -> io_wref ;
1569+ UnlockBufHdr (desc , buf_state );
1570+ }
1571+
1572+ if (pgaio_wref_valid (& iow ))
1573+ {
1574+ operation -> io_wref = iow ;
1575+ return true;
1576+ }
1577+
1578+ return false;
1579+ }
1580+
15411581/*
15421582 * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
15431583 */
@@ -1670,7 +1710,7 @@ WaitReadBuffers(ReadBuffersOperation *operation)
16701710 *
16711711 * we first check if we already know the IO is complete.
16721712 */
1673- if (aio_ret -> result .status == PGAIO_RS_UNKNOWN &&
1713+ if (( operation -> foreign_io || aio_ret -> result .status == PGAIO_RS_UNKNOWN ) &&
16741714 !pgaio_wref_check_done (& operation -> io_wref ))
16751715 {
16761716 instr_time io_start = pgstat_prepare_io_time (track_io_timing );
@@ -1689,11 +1729,66 @@ WaitReadBuffers(ReadBuffersOperation *operation)
16891729 Assert (pgaio_wref_check_done (& operation -> io_wref ));
16901730 }
16911731
1692- /*
1693- * We now are sure the IO completed. Check the results. This
1694- * includes reporting on errors if there were any.
1695- */
1696- ProcessReadBuffersResult (operation );
1732+ if (unlikely (operation -> foreign_io ))
1733+ {
1734+ Buffer buffer = operation -> buffers [operation -> nblocks_done ];
1735+ BufferDesc * desc ;
1736+ uint32 buf_state ;
1737+
1738+ if (BufferIsLocal (buffer ))
1739+ {
1740+ desc = GetLocalBufferDescriptor (- buffer - 1 );
1741+ buf_state = pg_atomic_read_u32 (& desc -> state );
1742+ }
1743+ else
1744+ {
1745+ desc = GetBufferDescriptor (buffer - 1 );
1746+ buf_state = LockBufHdr (desc );
1747+ UnlockBufHdr (desc , buf_state );
1748+ }
1749+
1750+ if (buf_state & BM_VALID )
1751+ {
1752+ operation -> nblocks_done += 1 ;
1753+ Assert (operation -> nblocks_done <= operation -> nblocks );
1754+
1755+ /*
1756+ * Report and track this as a 'hit' for this backend, even
1757+ * though it must have started out as a miss in
1758+ * PinBufferForBlock(). The other backend (or ourselves,
1759+ * as part of a read started earlier) will track this as a
1760+ * 'read'.
1761+ */
1762+ TRACE_POSTGRESQL_BUFFER_READ_DONE (operation -> forknum ,
1763+ operation -> blocknum + operation -> nblocks_done ,
1764+ operation -> smgr -> smgr_rlocator .locator .spcOid ,
1765+ operation -> smgr -> smgr_rlocator .locator .dbOid ,
1766+ operation -> smgr -> smgr_rlocator .locator .relNumber ,
1767+ operation -> smgr -> smgr_rlocator .backend ,
1768+ true);
1769+
1770+ if (BufferIsLocal (buffer ))
1771+ pgBufferUsage .local_blks_hit += 1 ;
1772+ else
1773+ pgBufferUsage .shared_blks_hit += 1 ;
1774+
1775+ if (operation -> rel )
1776+ pgstat_count_buffer_hit (operation -> rel );
1777+
1778+ pgstat_count_io_op (io_object , io_context , IOOP_HIT , 1 , 0 );
1779+
1780+ if (VacuumCostActive )
1781+ VacuumCostBalance += VacuumCostPageHit ;
1782+ }
1783+ }
1784+ else
1785+ {
1786+ /*
1787+ * We now are sure the IO completed. Check the results. This
1788+ * includes reporting on errors if there were any.
1789+ */
1790+ ProcessReadBuffersResult (operation );
1791+ }
16971792 }
16981793
16991794 /*
@@ -1779,6 +1874,43 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
17791874 io_object = IOOBJECT_RELATION ;
17801875 }
17811876
1877+ /*
1878+ * If AIO is in progress, be it in this backend or another backend, we
1879+ * just associate the wait reference with the operation and wait in
1880+ * WaitReadBuffers(). This turns out to be important for performance in
1881+ * two workloads:
1882+ *
1883+ * 1) A read stream that has to read the same block multiple times within
1884+ * the readahead distance. This can happen e.g. for the table accesses of
1885+ * an index scan.
1886+ *
1887+ * 2) Concurrent scans by multiple backends on the same relation.
1888+ *
1889+ * If we were to synchronously wait for the in-progress IO, we'd not be
1890+ * able to keep enough I/O in flight.
1891+ *
1892+ * If we do find there is ongoing I/O for the buffer, we set up a 1-block
1893+ * ReadBuffersOperation that WaitReadBuffers then can wait on.
1894+ *
1895+ * It's possible that another backend starts IO on the buffer between this
1896+ * check and the ReadBuffersCanStartIO(nowait = false) below. In that case
1897+ * we will synchronously wait for the IO below, but the window for that is
1898+ * small enough that it won't happen often enough to have a significant
1899+ * performance impact.
1900+ */
1901+ if (ReadBuffersIOAlreadyInProgress (operation , buffers [nblocks_done ]))
1902+ {
1903+ * nblocks_progress = 1 ;
1904+ operation -> foreign_io = true;
1905+
1906+ CheckReadBuffersOperation (operation , false);
1907+
1908+
1909+ return true;
1910+ }
1911+
1912+ operation -> foreign_io = false;
1913+
17821914 /*
17831915 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
17841916 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
@@ -1836,9 +1968,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
18361968 /*
18371969 * Check if we can start IO on the first to-be-read buffer.
18381970 *
1839- * If an I/O is already in progress in another backend, we want to wait
1840- * for the outcome: either done, or something went wrong and we will
1841- * retry.
1971+ * If a synchronous I/O is in progress in another backend (it can't be
1972+ * this backend), we want to wait for the outcome: either done, or
1973+ * something went wrong and we will retry.
18421974 */
18431975 if (!ReadBuffersCanStartIO (buffers [nblocks_done ], false))
18441976 {
0 commit comments