Skip to content

Commit 319d60a

Browse files
lfittlCommitfest Bot
authored andcommitted
Use time stamp counter to measure time on Linux/x86
We switch to using the time stamp counter (TSC) instead of clock_gettime() to reduce the overhead of EXPLAIN (ANALYZE, TIMING ON). Tests showed that runtime is reduced by around 10% for queries moving lots of rows through the plan. For now this is only enabled on Linux/x86, when the system clocksource is reported as TSC. Relying on the Linux kernel simplifies the logic to detect if the present TSC is usable (frequency invariant, synchronized between sockets, etc.). In all other cases we fall back to clock_gettime(). Note that we intentionally use RDTSC in the fast paths, rather than RDTSCP. RDTSCP waits for outstanding instructions to retire on out-of-order CPUs. This adds noticeable overhead for little benefit in the typical InstrStartNode() / InstrStopNode() use case. The macro to be used in such cases is called INSTR_TIME_SET_CURRENT_FAST(). The original macro INSTR_TIME_SET_CURRENT() uses RDTSCP and is supposed to be used when precision is more important than performance. Author: David Geier <[email protected]> Author: Andres Freund <[email protected]> Author: Lukas Fittl <[email protected]> Reviewed-by: Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de
1 parent 42c7ba7 commit 319d60a

File tree

9 files changed

+348
-22
lines changed

9 files changed

+348
-22
lines changed

src/backend/access/heap/vacuumlazy.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3410,8 +3410,8 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
34103410
INSTR_TIME_SET_CURRENT(currenttime);
34113411
elapsed = currenttime;
34123412
INSTR_TIME_SUBTRACT(elapsed, starttime);
3413-
if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
3414-
>= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
3413+
if (INSTR_TIME_GET_MILLISEC(elapsed) >=
3414+
VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
34153415
{
34163416
if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
34173417
{

src/backend/executor/instrument.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,13 @@ InstrInit(Instrumentation *instr, int instrument_options)
6767
void
6868
InstrStartNode(Instrumentation *instr)
6969
{
70-
if (instr->need_timer &&
71-
!INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
72-
elog(ERROR, "InstrStartNode called twice in a row");
70+
if (instr->need_timer)
71+
{
72+
if (!INSTR_TIME_IS_ZERO(instr->starttime))
73+
elog(ERROR, "InstrStartNode called twice in a row");
74+
else
75+
INSTR_TIME_SET_CURRENT_FAST(instr->starttime);
76+
}
7377

7478
/* save buffer usage totals at node entry, if needed */
7579
if (instr->need_bufusage)
@@ -95,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples)
9599
if (INSTR_TIME_IS_ZERO(instr->starttime))
96100
elog(ERROR, "InstrStopNode called without start");
97101

98-
INSTR_TIME_SET_CURRENT(endtime);
102+
INSTR_TIME_SET_CURRENT_FAST(endtime);
99103
INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime);
100104

101105
INSTR_TIME_SET_ZERO(instr->starttime);

src/backend/utils/init/postinit.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -806,6 +806,9 @@ InitPostgres(const char *in_dbname, Oid dboid,
806806
/* Initialize portal manager */
807807
EnablePortalManager();
808808

809+
/* initialize high-precision interval timing */
810+
INSTR_TIME_INITIALIZE();
811+
809812
/*
810813
* Load relcache entries for the shared system catalogs. This must create
811814
* at least entries for pg_database and catalogs used for authentication.

src/bin/pgbench/pgbench.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7290,6 +7290,9 @@ main(int argc, char **argv)
72907290
initRandomState(&state[i].cs_func_rs);
72917291
}
72927292

7293+
/* initialize high-precision interval timing */
7294+
INSTR_TIME_INITIALIZE();
7295+
72937296
/* opening connection... */
72947297
con = doConnect();
72957298
if (con == NULL)

src/bin/psql/startup.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "help.h"
2525
#include "input.h"
2626
#include "mainloop.h"
27+
#include "portability/instr_time.h"
2728
#include "settings.h"
2829

2930
/*
@@ -327,6 +328,9 @@ main(int argc, char *argv[])
327328

328329
PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL);
329330

331+
/* initialize high-precision interval timing */
332+
INSTR_TIME_INITIALIZE();
333+
330334
SyncVariables();
331335

332336
if (options.list_dbs)

src/common/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ OBJS_COMMON = \
5959
file_perm.o \
6060
file_utils.o \
6161
hashfn.o \
62+
instr_time.o \
6263
ip.o \
6364
jsonapi.o \
6465
keywords.o \

src/common/instr_time.c

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* instr_time.c
4+
* Non-inline parts of the portable high-precision interval timing
5+
* implementation
6+
*
7+
* Portions Copyright (c) 2025, PostgreSQL Global Development Group
8+
*
9+
*
10+
* IDENTIFICATION
11+
* src/common/instr_time.c
12+
*
13+
*-------------------------------------------------------------------------
14+
*/
15+
#include "postgres.h"
16+
17+
#if defined(HAVE__GET_CPUID) || (defined(HAVE__CPUIDEX) && !defined(_MSC_VER))
18+
#include <cpuid.h>
19+
#endif
20+
21+
#if defined(HAVE__CPUID) || (defined(HAVE__CPUIDEX) && defined(_MSC_VER))
22+
#include <intrin.h>
23+
#endif
24+
25+
#include "portability/instr_time.h"
26+
27+
#ifndef WIN32
28+
/*
29+
* Stores what the number of cycles needs to be multiplied with to end up
30+
* with nanoseconds using integer math. See comment in pg_initialize_rdtsc()
31+
* for more details.
32+
*
33+
* By default assume we are using clock_gettime() as a fallback which uses
34+
* nanoseconds as ticks. Hence, we set the multiplier to the precision scalar
35+
* so that the division in INSTR_TIME_GET_NANOSEC() won't change the nanoseconds.
36+
*
37+
* When using the RDTSC instruction directly this is filled in during initialization
38+
* based on the relevant CPUID fields.
39+
*/
40+
int64 ticks_per_ns_scaled = TICKS_TO_NS_PRECISION;
41+
int64 ticks_per_sec = NS_PER_S;
42+
int64 max_ticks_no_overflow = PG_INT64_MAX / TICKS_TO_NS_PRECISION;
43+
44+
#if defined(__x86_64__) && defined(__linux__)
45+
/*
46+
* Indicates if RDTSC can be used (Linux/x86 only, when OS uses TSC clocksource)
47+
*/
48+
bool has_rdtsc = false;
49+
50+
/*
51+
* Indicates if RDTSCP can be used. True if RDTSC can be used and RDTSCP is available.
52+
*/
53+
bool has_rdtscp = false;
54+
55+
#define CPUID_HYPERVISOR_VMWARE(words) (words[1] == 0x61774d56 && words[2] == 0x4d566572 && words[3] == 0x65726177) /* VMwareVMware */
56+
#define CPUID_HYPERVISOR_KVM(words) (words[1] == 0x4b4d564b && words[2] == 0x564b4d56 && words[3] == 0x0000004d) /* KVMKVMKVM */
57+
58+
static bool
59+
get_tsc_frequency_khz(uint32 *tsc_freq)
60+
{
61+
uint32 r[4] = {0, 0, 0, 0};
62+
63+
#if defined(HAVE__GET_CPUID)
64+
__get_cpuid(0x15, &r[0] /* denominator */ , &r[1] /* numerator */ , &r[2] /* hz */ , &r[3]);
65+
#elif defined(HAVE__CPUID)
66+
__cpuid(r, 0x15);
67+
#else
68+
#error cpuid instruction not available
69+
#endif
70+
71+
if (r[2] > 0)
72+
{
73+
if (r[0] == 0 || r[1] == 0)
74+
return false;
75+
76+
*tsc_freq = r[2] / 1000 * r[1] / r[0];
77+
return true;
78+
}
79+
80+
/* Some CPUs only report frequency in 16H */
81+
82+
#if defined(HAVE__GET_CPUID)
83+
__get_cpuid(0x16, &r[0] /* base_mhz */ , &r[1], &r[2], &r[3]);
84+
#elif defined(HAVE__CPUID)
85+
__cpuid(r, 0x16);
86+
#else
87+
#error cpuid instruction not available
88+
#endif
89+
90+
if (r[0] > 0)
91+
{
92+
*tsc_freq = r[0] * 1000;
93+
return true;
94+
}
95+
96+
/*
97+
* Check if we have a KVM or VMware Hypervisor passing down TSC frequency
98+
* to us in a guest VM
99+
*
100+
* Note that accessing the 0x40000000 leaf for Hypervisor info requires
101+
* use of __cpuidex to set ECX to 0. The similar __get_cpuid_count
102+
* function does not work as expected since it contains a check for
103+
* __get_cpuid_max, which has been observed to be lower than the special
104+
* Hypervisor leaf.
105+
*/
106+
#if defined(HAVE__CPUIDEX)
107+
__cpuidex((int32 *) r, 0x40000000, 0);
108+
if (r[0] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(r) || CPUID_HYPERVISOR_KVM(r)))
109+
{
110+
__cpuidex((int32 *) r, 0x40000010, 0);
111+
if (r[0] > 0)
112+
{
113+
*tsc_freq = r[0];
114+
return true;
115+
}
116+
}
117+
#endif
118+
119+
return false;
120+
}
121+
122+
static bool
123+
is_rdtscp_available()
124+
{
125+
uint32 r[4] = {0, 0, 0, 0};
126+
127+
#if defined(HAVE__GET_CPUID)
128+
if (!__get_cpuid(0x80000001, &r[0], &r[1], &r[2], &r[3]))
129+
return false;
130+
#elif defined(HAVE__CPUID)
131+
__cpuid(r, 0x80000001);
132+
#else
133+
#error cpuid instruction not available
134+
#endif
135+
136+
return (r[3] & (1 << 27)) != 0;
137+
}
138+
139+
/*
140+
* Decide whether we use the RDTSC instruction at runtime, for Linux/x86,
141+
* instead of incurring the overhead of a full clock_gettime() call.
142+
*
143+
* This can't be reliably determined at compile time, since the
144+
* availability of an "invariant" TSC (that is not affected by CPU
145+
* frequency changes) is dependent on the CPU architecture. Additionally,
146+
* there are cases where TSC availability is impacted by virtualization,
147+
* where a simple cpuid feature check would not be enough.
148+
*
149+
* Since Linux already does a significant amount of work to determine
150+
* whether TSC is a viable clock source, decide based on that.
151+
*/
152+
void
153+
pg_initialize_rdtsc(void)
154+
{
155+
FILE *fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
156+
157+
if (fp)
158+
{
159+
char buf[128];
160+
161+
if (fgets(buf, sizeof(buf), fp) != NULL && strcmp(buf, "tsc\n") == 0)
162+
{
163+
/*
164+
* Compute baseline CPU peformance, determines speed at which
165+
* RDTSC advances.
166+
*/
167+
uint32 tsc_freq;
168+
169+
if (get_tsc_frequency_khz(&tsc_freq))
170+
{
171+
/*
172+
* Ticks to nanoseconds conversion requires floating point
173+
* math because because:
174+
*
175+
* sec = ticks / frequency_hz ns = ticks / frequency_hz *
176+
* 1,000,000,000 ns = ticks * (1,000,000,000 / frequency_hz)
177+
* ns = ticks * (1,000,000 / frequency_khz) <-- now in
178+
* kilohertz
179+
*
180+
* Here, 'ns' is usually a floating number. For example for a
181+
* 2.5 GHz CPU the scaling factor becomes 1,000,000 /
182+
* 2,500,000 = 1.2.
183+
*
184+
* To be able to use integer math we work around the lack of
185+
* precision. We first scale the integer up and after the
186+
* multiplication by the number of ticks in
187+
* INSTR_TIME_GET_NANOSEC() we divide again by the same value.
188+
* We picked the scaler such that it provides enough precision
189+
* and is a power-of-two which allows for shifting instead of
190+
* doing an integer division.
191+
*/
192+
ticks_per_ns_scaled = INT64CONST(1000000) * TICKS_TO_NS_PRECISION / tsc_freq;
193+
ticks_per_sec = tsc_freq * 1000; /* KHz->Hz */
194+
max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
195+
196+
has_rdtsc = true;
197+
has_rdtscp = is_rdtscp_available();
198+
}
199+
}
200+
201+
fclose(fp);
202+
}
203+
}
204+
#endif /* defined(__x86_64__) && defined(__linux__) */
205+
206+
#endif /* WIN32 */

src/common/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ common_sources = files(
1313
'file_perm.c',
1414
'file_utils.c',
1515
'hashfn.c',
16+
'instr_time.c',
1617
'ip.c',
1718
'jsonapi.c',
1819
'keywords.c',

0 commit comments

Comments
 (0)