27 | 27 | #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3 |
28 | 28 |
29 | 29 | #define NUM_BUFFERCACHE_NUMA_ELEM 3 |
| 30 | +#define NUM_BUFFERCACHE_OS_PAGES_ELEM 2 |
30 | 31 |
31 | 32 | /* |
32 | 33 | * Get the maximum buffer cache entries needed. |
@@ -94,12 +95,30 @@ typedef struct |
94 | 95 | BufferCacheNumaRec *record; |
95 | 96 | } BufferCacheNumaContext; |
96 | 97 |
| 98 | +/* |
| 99 | + * Record structure holding the buffer cache data to be exposed. |
| 100 | + */ |
| 101 | +typedef struct |
| 102 | +{ |
| 103 | + uint32 bufferid; |
| 104 | + int64 page_num; |
| 105 | +} BufferCacheOsPagesRec; |
| 106 | + |
| 107 | +/* |
| 108 | + * Function context for data persisting over repeated calls. |
| 109 | + */ |
| 110 | +typedef struct |
| 111 | +{ |
| 112 | + TupleDesc tupdesc; |
| 113 | + BufferCacheOsPagesRec *record; |
| 114 | +} BufferCacheOsPagesContext; |
97 | 115 |
98 | 116 | /* |
99 | 117 | * Function returning data from the shared buffer cache - buffer number, |
100 | 118 | * relation node/tablespace/database/blocknum and dirty indicator. |
101 | 119 | */ |
102 | 120 | PG_FUNCTION_INFO_V1(pg_buffercache_pages); |
| 121 | +PG_FUNCTION_INFO_V1(pg_buffercache_os_pages); |
103 | 122 | PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages); |
104 | 123 | PG_FUNCTION_INFO_V1(pg_buffercache_summary); |
105 | 124 | PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts); |
@@ -139,6 +158,170 @@ get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr, |
139 | 158 | *endptr_buff = end_ptr; |
140 | 159 | } |
141 | 160 |
| 161 | +/* |
| 162 | + * Inquire about OS page mappings for shared buffers. |
| 163 | + * |
| 164 | + * Returns one entry for each OS memory page used by each shared buffer. |
| 165 | + * Buffers may be smaller or larger than OS memory pages: a buffer smaller |
| 166 | + * than a memory page uses only part of that page, while a larger buffer |
| 167 | + * spans several pages. |
| 168 | + * |
| 169 | + * We expect both sizes (for buffers and memory pages) to be a power-of-2, so |
| 170 | + * one is always a multiple of the other. |
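| | + * |
| | + * For example, with the default 8kB block size and 4kB OS pages, each |
| | + * buffer spans two OS pages (or three, when the buffer does not start |
| | + * on a page boundary), yielding two or three rows per buffer. With |
| | + * 64kB OS pages, several consecutive buffers map to the same page. |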
| 171 | + */ |
| 172 | +Datum |
| 173 | +pg_buffercache_os_pages(PG_FUNCTION_ARGS) |
| 174 | +{ |
| 175 | + FuncCallContext *funcctx; |
| 176 | + Datum result; |
| 177 | + MemoryContext oldcontext; |
| 178 | + BufferCacheOsPagesContext *fctx; /* User function context. */ |
| 179 | + TupleDesc tupledesc; |
| 180 | + TupleDesc expected_tupledesc; |
| 181 | + HeapTuple tuple; |
| 182 | + |
| 183 | + if (SRF_IS_FIRSTCALL()) |
| 184 | + { |
| 185 | + int i, |
| 186 | + idx; |
| 187 | + Size os_page_size; |
| 188 | + char *startptr; |
| 189 | + int max_entries; |
| 190 | + |
| 191 | + /* |
| 192 | + * The database block size and OS memory page size are unlikely to be |
| 193 | + * the same. The block size is 1-32KB, the memory page size depends on |
| 194 | + * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but |
| 195 | + * there are also features like THP etc. Moreover, we don't quite know |
| 196 | + * how the pages and buffers "align" in memory - the buffers may be |
| 197 | + * shifted in some way, using more memory pages than necessary. |
| 198 | + * |
| 199 | + * So we need to be careful about mapping buffers to memory pages. We |
| 200 | + * calculate the maximum number of pages a buffer might use, so that |
| 201 | + * we allocate enough space for the entries. And then we count the |
| 202 | + * actual number of entries as we scan the buffers. |
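| | + * |
| | + * Note that pg_get_shmem_pagesize() returns the page size actually |
| | + * used by the main shared memory segment, so with huge pages enabled |
| | + * this is the huge page size (e.g. 2MB), not the regular OS page size. |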
| 203 | + */ |
| 204 | + os_page_size = pg_get_shmem_pagesize(); |
| 205 | + |
| 206 | + /* Initialize the multi-call context, load entries about buffers */ |
| 207 | + funcctx = SRF_FIRSTCALL_INIT(); |
| 208 | + |
| 209 | + /* Switch context when allocating stuff to be used in later calls */ |
| 210 | + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); |
| 211 | + |
| 212 | + /* Create a user function context for cross-call persistence */ |
| 213 | + fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext)); |
| 214 | + |
| 215 | + if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) |
| 216 | + elog(ERROR, "return type must be a row type"); |
| 217 | + |
| 218 | + if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM) |
| 219 | + elog(ERROR, "incorrect number of output arguments"); |
| 220 | + |
| 221 | + /* Construct a tuple descriptor for the result rows. */ |
| 222 | + tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); |
| 223 | + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", |
| 224 | + INT4OID, -1, 0); |
| 225 | + |
| 226 | + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num", |
| 227 | + INT8OID, -1, 0); |
| 228 | + |
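| | + /* Bless the descriptor so our tuples can be returned as composite Datums. */ |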
| 229 | + fctx->tupdesc = BlessTupleDesc(tupledesc); |
| 230 | + |
| 231 | + /* |
| 232 | + * Each buffer needs at least one entry, but it might be offset in |
| 233 | + * some way, and use one extra entry. So we allocate space for the |
| 234 | + * maximum number of entries we might need, and then count the exact |
| 235 | + * number as we're walking buffers. That way we can do it in one pass, |
| 236 | + * without reallocating memory. |
| 237 | + */ |
| 238 | + max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size); |
| 239 | + |
| 240 | + /* |
| | + * Allocate up to max_entries BufferCacheOsPagesRec records. Use a |
| | + * "huge" allocation, as with very large shared_buffers the array may |
| | + * exceed the regular 1GB palloc allocation limit. |
| | + */ |
| 241 | + fctx->record = (BufferCacheOsPagesRec *) |
| 242 | + MemoryContextAllocHuge(CurrentMemoryContext, |
| 243 | + sizeof(BufferCacheOsPagesRec) * max_entries); |
| 244 | + |
| 245 | + /* Return to original context when allocating transient memory */ |
| 246 | + MemoryContextSwitchTo(oldcontext); |
| 247 | + |
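| | + /* |
| | + * Align the first buffer's address down to an OS page boundary; the |
| | + * os_page_num values computed below are relative to this pointer. |
| | + */ |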
| 248 | + startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1)); |
| 249 | + idx = 0; |
| 250 | + |
| 251 | + /* |
| 252 | + * Scan through all the buffers, saving the relevant fields in the |
| 253 | + * fctx->record structure. |
| 254 | + * |
| 255 | + * We don't hold the partition locks, so we don't get a consistent |
| 256 | + * snapshot across all buffers, but we do grab the buffer header |
| 257 | + * locks, so the information of each buffer is self-consistent. |
| 258 | + */ |
| 259 | + for (i = 0; i < NBuffers; i++) |
| 260 | + { |
| 261 | + char *buffptr = (char *) BufferGetBlock(i + 1); |
| 262 | + BufferDesc *bufHdr; |
| 263 | + uint32 buf_state; |
| 264 | + uint32 bufferid; |
| 265 | + int32 page_num; |
| 266 | + char *startptr_buff, |
| 267 | + *endptr_buff; |
| 268 | + |
| 269 | + CHECK_FOR_INTERRUPTS(); |
| 270 | + |
| 271 | + bufHdr = GetBufferDescriptor(i); |
| 272 | + /* Lock each buffer header before inspecting. */ |
| 273 | + buf_state = LockBufHdr(bufHdr); |
| 274 | + bufferid = BufferDescriptorGetBuffer(bufHdr); |
| 275 | + UnlockBufHdr(bufHdr, buf_state); |
| 276 | + |
| 277 | + /* Get page boundaries for this buffer. */ |
| 278 | + get_buffer_page_boundaries(buffptr, os_page_size, startptr, |
| 279 | + &startptr_buff, &endptr_buff, &page_num); |
| 280 | + |
| 281 | + /* Add an entry for each OS page overlapping with this buffer. */ |
| 282 | + for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size) |
| 283 | + { |
| 284 | + fctx->record[idx].bufferid = bufferid; |
| 285 | + fctx->record[idx].page_num = page_num; |
| 286 | + /* advance to the next entry/page */ |
| 287 | + ++idx; |
| 288 | + ++page_num; |
| 289 | + } |
| 290 | + } |
| 291 | + |
| 292 | + Assert(idx <= max_entries); |
| 293 | + |
| 294 | + /* Set max calls and remember the user function context. */ |
| 295 | + funcctx->max_calls = idx; |
| 296 | + funcctx->user_fctx = fctx; |
| 297 | + } |
| 298 | + |
| 299 | + funcctx = SRF_PERCALL_SETUP(); |
| 300 | + |
| 301 | + /* Get the saved state */ |
| 302 | + fctx = funcctx->user_fctx; |
| 303 | + |
| 304 | + if (funcctx->call_cntr < funcctx->max_calls) |
| 305 | + { |
| 306 | + uint32 i = funcctx->call_cntr; |
| 307 | + Datum values[NUM_BUFFERCACHE_OS_PAGES_ELEM]; |
| 308 | + bool nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM]; |
| 309 | + |
| 310 | + values[0] = Int32GetDatum(fctx->record[i].bufferid); |
| 311 | + nulls[0] = false; |
| 312 | + values[1] = Int64GetDatum(fctx->record[i].page_num); |
| 313 | + nulls[1] = false; |
| 314 | + |
| 315 | + /* Build and return the tuple. */ |
| 316 | + tuple = heap_form_tuple(fctx->tupdesc, values, nulls); |
| 317 | + result = HeapTupleGetDatum(tuple); |
| 318 | + |
| 319 | + SRF_RETURN_NEXT(funcctx, result); |
| 320 | + } |
| 321 | + else |
| 322 | + SRF_RETURN_DONE(funcctx); |
| 323 | +} |
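| | + |
| | +/* |
| | + * Example usage (a sketch, assuming the extension's SQL interface |
| | + * exposes this function under the same name): |
| | + * |
| | + *   SELECT bufferid, count(*) AS os_pages |
| | + *     FROM pg_buffercache_os_pages |
| | + *    GROUP BY bufferid |
| | + *    ORDER BY bufferid |
| | + *    LIMIT 10; |
| | + * |
| | + * With 8kB buffers on 4kB OS pages this typically reports two OS pages |
| | + * per buffer. |
| | + */ |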
| 324 | + |
142 | 325 | Datum |
143 | 326 | pg_buffercache_pages(PG_FUNCTION_ARGS) |
144 | 327 | { |