 #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
 
 #define NUM_BUFFERCACHE_NUMA_ELEM 3
+#define NUM_BUFFERCACHE_OS_PAGES_ELEM 2
+
+/*
+ * Get the maximum number of buffer cache entries needed.
+ */
+#define GET_MAX_BUFFER_ENTRIES(nbuffers, os_page_size) \
+	((nbuffers) * (Max(1, BLCKSZ / (os_page_size)) + 1))
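+
+/*
+ * Illustrative arithmetic for the macro above (assuming the default
+ * BLCKSZ of 8192 and a 4KB OS page): a buffer spans 8192 / 4096 = 2
+ * pages, plus 1 in case the buffer is not page-aligned, i.e. at most
+ * 3 entries per buffer; 16384 buffers (shared_buffers = 128MB) thus
+ * need at most 49152 entries.
+ */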
 
 PG_MODULE_MAGIC_EXT(
 	.name = "pg_buffercache",
@@ -88,12 +95,30 @@ typedef struct
 	BufferCacheNumaRec *record;
 } BufferCacheNumaContext;
 
+/*
+ * Record structure holding the cache data to be exposed.
+ */
+typedef struct
+{
+	uint32		bufferid;
+	int64		page_num;
+} BufferCacheOsPagesRec;
+
+/*
+ * Function context for data persisting over repeated calls.
+ */
+typedef struct
+{
+	TupleDesc	tupdesc;
+	BufferCacheOsPagesRec *record;
+} BufferCacheOsPagesContext;
 
 /*
  * Function returning data from the shared buffer cache - buffer number,
  * relation node/tablespace/database/blocknum and dirty indicator.
  */
 PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
 PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
 PG_FUNCTION_INFO_V1(pg_buffercache_summary);
 PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
@@ -105,6 +130,197 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
 /* Only need to touch memory once per backend process lifetime */
 static bool	firstNumaTouch = true;
 
+/*
+ * Helper function to get buffer page boundaries.
+ *
+ * Given a buffer pointer and OS page size, calculates the start/end
+ * pointers and first page number.
+ */
+static void
+get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
+						   char **startptr_buff, char **endptr_buff,
+						   int32 *page_num)
+{
+	char	   *start_ptr;
+	char	   *end_ptr;
+
+	/* start of the first page of this buffer */
+	start_ptr = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
+
+	/* end of the buffer (no need to align to memory page) */
+	end_ptr = buffptr + BLCKSZ;
+
+	Assert(start_ptr < end_ptr);
+
+	/* calculate ID of the first page for this buffer */
+	*page_num = (start_ptr - startptr) / os_page_size;
+	*startptr_buff = start_ptr;
+	*endptr_buff = end_ptr;
+}
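+
+/*
+ * Example of the calculation above (hypothetical offsets, assuming
+ * BLCKSZ = 8192 and os_page_size = 4096): a buffer at offset 12288
+ * from startptr is already aligned, so start_ptr stays at offset 12288,
+ * end_ptr is 20480, and *page_num = 12288 / 4096 = 3; the buffer maps
+ * to OS pages 3 and 4. Had the buffer started at an unaligned offset
+ * such as 14336, start_ptr would round down to 12288 and the buffer
+ * would overlap pages 3, 4 and 5.
+ */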
+
+/*
+ * Inquire about OS page mappings for shared buffers.
+ *
+ * Returns each OS memory page used by a buffer. Buffers may be smaller
+ * or larger than OS memory pages. For each buffer we return one entry
+ * for each memory page used by the buffer (if the buffer is smaller,
+ * it only uses a part of one memory page).
+ *
+ * We expect both sizes (for buffers and memory pages) to be a power-of-2,
+ * so one is always a multiple of the other.
+ */
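+/*
+ * Sketch of the expected output at the SQL level, assuming the extension
+ * exposes this function through a view of the same name (illustrative
+ * values, for BLCKSZ = 8192 on 4KB OS pages, i.e. two rows per buffer):
+ *
+ *   SELECT * FROM pg_buffercache_os_pages LIMIT 4;
+ *    bufferid | os_page_num
+ *   ----------+-------------
+ *           1 |           0
+ *           1 |           1
+ *           2 |           2
+ *           2 |           3
+ */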
+Datum
+pg_buffercache_os_pages(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	Datum		result;
+	MemoryContext oldcontext;
+	BufferCacheOsPagesContext *fctx;	/* User function context. */
+	TupleDesc	tupledesc;
+	TupleDesc	expected_tupledesc;
+	HeapTuple	tuple;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		int			i,
+					idx;
+		Size		os_page_size;
+		char	   *startptr;
+		int			max_entries;
+
+		/*
+		 * The database block size and OS memory page size are unlikely to be
+		 * the same. The block size is 1-32KB, the memory page size depends
+		 * on platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
+		 * there are also features like THP etc. Moreover, we don't quite
+		 * know how the pages and buffers "align" in memory - the buffers may
+		 * be shifted in some way, using more memory pages than necessary.
+		 *
+		 * So we need to be careful about mapping buffers to memory pages. We
+		 * calculate the maximum number of pages a buffer might use, so that
+		 * we allocate enough space for the entries. And then we count the
+		 * actual number of entries as we scan the buffers.
+		 */
+		os_page_size = pg_get_shmem_pagesize();
+
+		/* Initialize the multi-call context, load entries about buffers */
+		funcctx = SRF_FIRSTCALL_INIT();
+
+		/* Switch context when allocating stuff to be used in later calls */
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		/* Create a user function context for cross-call persistence */
+		fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext));
+
+		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+			elog(ERROR, "return type must be a row type");
+
+		if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
+			elog(ERROR, "incorrect number of output arguments");
+
+		/* Construct a tuple descriptor for the result rows. */
+		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+						   INT4OID, -1, 0);
+
+		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
+						   INT8OID, -1, 0);
+
+		fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+		/*
+		 * Each buffer needs at least one entry, but it might be offset in
+		 * some way, and use one extra entry. So we allocate space for the
+		 * maximum number of entries we might need, and then count the exact
+		 * number as we're walking buffers. That way we can do it in one
+		 * pass, without reallocating memory.
+		 */
+		max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
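+
+		/*
+		 * Worked example of the formula above (values for illustration
+		 * only): with BLCKSZ = 8192 on 4KB pages this reserves 2 + 1 = 3
+		 * entries per buffer; with BLCKSZ = 1024 on 64KB pages the integer
+		 * division yields 0, so Max(1, ...) clamps it and we still reserve
+		 * 1 + 1 = 2 entries per buffer, even though many buffers share a
+		 * single OS page.
+		 */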
+
+		/* Allocate max_entries worth of BufferCacheOsPagesRec records. */
+		fctx->record = (BufferCacheOsPagesRec *)
+			MemoryContextAllocHuge(CurrentMemoryContext,
+								   sizeof(BufferCacheOsPagesRec) * max_entries);
+
+		/* Return to original context when allocating transient memory */
+		MemoryContextSwitchTo(oldcontext);
+
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+		idx = 0;
+
+		/*
+		 * Scan through all the buffers, saving the relevant fields in the
+		 * fctx->record structure.
+		 *
+		 * We don't hold the partition locks, so we don't get a consistent
+		 * snapshot across all buffers, but we do grab the buffer header
+		 * locks, so the information of each buffer is self-consistent.
+		 */
+		for (i = 0; i < NBuffers; i++)
+		{
+			char	   *buffptr = (char *) BufferGetBlock(i + 1);
+			BufferDesc *bufHdr;
+			uint32		buf_state;
+			uint32		bufferid;
+			int32		page_num;
+			char	   *startptr_buff,
+					   *endptr_buff;
+
+			CHECK_FOR_INTERRUPTS();
+
+			bufHdr = GetBufferDescriptor(i);
+			/* Lock each buffer header before inspecting. */
+			buf_state = LockBufHdr(bufHdr);
+			bufferid = BufferDescriptorGetBuffer(bufHdr);
+			UnlockBufHdr(bufHdr, buf_state);
+
+			/* Get page boundaries for this buffer. */
+			get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+									   &startptr_buff, &endptr_buff, &page_num);
+
+			/* Add an entry for each OS page overlapping with this buffer. */
+			for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+			{
+				fctx->record[idx].bufferid = bufferid;
+				fctx->record[idx].page_num = page_num;
+				/* advance to the next entry/page */
+				++idx;
+				++page_num;
+			}
+		}
+
+		Assert(idx <= max_entries);
+
+		/* Set max calls and remember the user function context. */
+		funcctx->max_calls = idx;
+		funcctx->user_fctx = fctx;
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	/* Get the saved state */
+	fctx = funcctx->user_fctx;
+
+	if (funcctx->call_cntr < funcctx->max_calls)
+	{
+		uint32		i = funcctx->call_cntr;
+		Datum		values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+		bool		nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+
+		values[0] = Int32GetDatum(fctx->record[i].bufferid);
+		nulls[0] = false;
+		values[1] = Int64GetDatum(fctx->record[i].page_num);
+		nulls[1] = false;
+
+		/* Build and return the tuple. */
+		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+		result = HeapTupleGetDatum(tuple);
+
+		SRF_RETURN_NEXT(funcctx, result);
+	}
+	else
+		SRF_RETURN_DONE(funcctx);
+}
 
 Datum
 pg_buffercache_pages(PG_FUNCTION_ARGS)
@@ -320,7 +536,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
 	void	  **os_page_ptrs;
 	int		   *os_page_status;
 	uint64		os_page_count;
-	int			pages_per_buffer;
 	int			max_entries;
 	char	   *startptr,
 			   *endptr;
@@ -428,8 +643,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
 	 * number as we're walking buffers. That way we can do it in one pass,
 	 * without reallocating memory.
 	 */
-	pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
-	max_entries = NBuffers * pages_per_buffer;
+	max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
 
 	/* Allocate entries for BufferCachePagesRec records. */
 	fctx->record = (BufferCacheNumaRec *)
@@ -475,16 +689,9 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
 		bufferid = BufferDescriptorGetBuffer(bufHdr);
 		UnlockBufHdr(bufHdr, buf_state);
 
-		/* start of the first page of this buffer */
-		startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
-
-		/* end of the buffer (no need to align to memory page) */
-		endptr_buff = buffptr + BLCKSZ;
-
-		Assert(startptr_buff < endptr_buff);
-
-		/* calculate ID of the first page for this buffer */
-		page_num = (startptr_buff - startptr) / os_page_size;
+		/* Get page boundaries for this buffer. */
+		get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+								   &startptr_buff, &endptr_buff, &page_num);
 
 		/* Add an entry for each OS page overlapping with this buffer. */
 		for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)