|
| 1 | +/*------------------------------------------------------------------------- |
| 2 | + * |
| 3 | + * instr_time.c |
| 4 | + * Non-inline parts of the portable high-precision interval timing |
| 5 | + * implementation |
| 6 | + * |
| 7 | + * Portions Copyright (c) 2025, PostgreSQL Global Development Group |
| 8 | + * |
| 9 | + * |
| 10 | + * IDENTIFICATION |
| 11 | + * src/backend/port/instr_time.c |
| 12 | + * |
| 13 | + *------------------------------------------------------------------------- |
| 14 | + */ |
| 15 | +#include "postgres.h" |
| 16 | + |
| 17 | +#if defined(HAVE__GET_CPUID) || (defined(HAVE__CPUIDEX) && !defined(_MSC_VER)) |
| 18 | +#include <cpuid.h> |
| 19 | +#endif |
| 20 | + |
| 21 | +#if defined(HAVE__CPUID) || (defined(HAVE__CPUIDEX) && defined(_MSC_VER)) |
| 22 | +#include <intrin.h> |
| 23 | +#endif |
| 24 | + |
| 25 | +#include "portability/instr_time.h" |
| 26 | + |
#ifndef WIN32
/*
 * Stores what the number of cycles needs to be multiplied with to end up
 * with nanoseconds using integer math. See comment in pg_initialize_rdtsc()
 * for more details.
 *
 * By default assume we are using clock_gettime() as a fallback, which uses
 * nanoseconds as ticks. Hence, we set the multiplier to the precision scalar
 * so that the division in INSTR_TIME_GET_NANOSEC() won't change the
 * nanoseconds.
 *
 * When using the RDTSC instruction directly, these are filled in during
 * initialization based on the relevant CPUID fields (see
 * pg_initialize_rdtsc() below).
 */
int64		ticks_per_ns_scaled = TICKS_TO_NS_PRECISION;

/* Tick rate of the active clock source, in Hz (defaults to 1 tick == 1 ns) */
int64		ticks_per_sec = NS_PER_S;

/*
 * Largest tick count that can be multiplied by ticks_per_ns_scaled without
 * overflowing int64; INSTR_TIME_GET_NANOSEC() presumably switches to a
 * slower path beyond this — confirm against portability/instr_time.h.
 */
int64		max_ticks_no_overflow = PG_INT64_MAX / TICKS_TO_NS_PRECISION;
| 43 | + |
#if defined(__x86_64__) && defined(__linux__)
/*
 * Indicates if RDTSC can be used (Linux/x86 only, when OS uses TSC
 * clocksource).
 */
bool		has_rdtsc = false;

/*
 * Indicates if RDTSCP can be used. True if RDTSC can be used and RDTSCP is
 * available.
 */
bool		has_rdtscp = false;

/*
 * Hypervisor vendor signatures as returned by CPUID leaf 0x40000000 in
 * EBX/ECX/EDX (words[1..3]).  The 32-bit constants are the signature strings
 * stored little-endian, e.g. 0x61774d56 = "VMwa".
 */
#define CPUID_HYPERVISOR_VMWARE(words) (words[1] == 0x61774d56 && words[2] == 0x4d566572 && words[3] == 0x65726177) /* VMwareVMware */
#define CPUID_HYPERVISOR_KVM(words) (words[1] == 0x4b4d564b && words[2] == 0x564b4d56 && words[3] == 0x0000004d)	/* KVMKVMKVM */
| 57 | + |
/*
 * Determine the TSC frequency in kHz via CPUID.
 *
 * On success, stores the frequency in *tsc_freq and returns true; returns
 * false (leaving *tsc_freq untouched) if none of the probed CPUID leaves
 * report a usable frequency.
 *
 * Three sources are tried in order:
 *   1. Leaf 0x15 (TSC/crystal ratio plus crystal frequency in Hz)
 *   2. Leaf 0x16 (processor base frequency in MHz)
 *   3. Hypervisor leaf 0x40000010 (KVM/VMware guests only)
 */
static bool
get_tsc_frequency_khz(uint32 *tsc_freq)
{
	uint32		r[4] = {0, 0, 0, 0};

	/* Leaf 0x15: EAX=denominator, EBX=numerator, ECX=crystal Hz */
#if defined(HAVE__GET_CPUID)
	__get_cpuid(0x15, &r[0] /* denominator */ , &r[1] /* numerator */ , &r[2] /* hz */ , &r[3]);
#elif defined(HAVE__CPUID)
	__cpuid(r, 0x15);
#else
#error cpuid instruction not available
#endif

	if (r[2] > 0)
	{
		/* A zero ratio component means the leaf data is unusable */
		if (r[0] == 0 || r[1] == 0)
			return false;

		/*
		 * TSC kHz = crystal Hz / 1000 * numerator / denominator.  Dividing
		 * by 1000 first keeps the intermediate product within uint32 range.
		 */
		*tsc_freq = r[2] / 1000 * r[1] / r[0];
		return true;
	}

	/* Some CPUs only report frequency in 16H */

	/* Leaf 0x16: EAX = processor base frequency in MHz */
#if defined(HAVE__GET_CPUID)
	__get_cpuid(0x16, &r[0] /* base_mhz */ , &r[1], &r[2], &r[3]);
#elif defined(HAVE__CPUID)
	__cpuid(r, 0x16);
#else
#error cpuid instruction not available
#endif

	if (r[0] > 0)
	{
		*tsc_freq = r[0] * 1000;	/* MHz -> kHz */
		return true;
	}

	/*
	 * Check if we have a KVM or VMware Hypervisor passing down TSC frequency
	 * to us in a guest VM.
	 *
	 * Note that accessing the 0x40000000 leaf for Hypervisor info requires
	 * use of __cpuidex to set ECX to 0. The similar __get_cpuid_count
	 * function does not work as expected since it contains a check for
	 * __get_cpuid_max, which has been observed to be lower than the special
	 * Hypervisor leaf.
	 */
#if defined(HAVE__CPUIDEX)
	/* EAX of leaf 0x40000000 is the highest supported hypervisor leaf */
	__cpuidex((int32 *) r, 0x40000000, 0);
	if (r[0] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(r) || CPUID_HYPERVISOR_KVM(r)))
	{
		/* Leaf 0x40000010: EAX = TSC frequency in kHz */
		__cpuidex((int32 *) r, 0x40000010, 0);
		if (r[0] > 0)
		{
			*tsc_freq = r[0];
			return true;
		}
	}
#endif

	return false;
}
| 121 | + |
| 122 | +static bool |
| 123 | +is_rdtscp_available() |
| 124 | +{ |
| 125 | + uint32 r[4] = {0, 0, 0, 0}; |
| 126 | + |
| 127 | +#if defined(HAVE__GET_CPUID) |
| 128 | + if (!__get_cpuid(0x80000001, &r[0], &r[1], &r[2], &r[3])) |
| 129 | + return false; |
| 130 | +#elif defined(HAVE__CPUID) |
| 131 | + __cpuid(r, 0x80000001); |
| 132 | +#else |
| 133 | +#error cpuid instruction not available |
| 134 | +#endif |
| 135 | + |
| 136 | + return (r[3] & (1 << 27)) != 0; |
| 137 | +} |
| 138 | + |
| 139 | +/* |
| 140 | + * Decide whether we use the RDTSC instruction at runtime, for Linux/x86, |
| 141 | + * instead of incurring the overhead of a full clock_gettime() call. |
| 142 | + * |
| 143 | + * This can't be reliably determined at compile time, since the |
| 144 | + * availability of an "invariant" TSC (that is not affected by CPU |
| 145 | + * frequency changes) is dependent on the CPU architecture. Additionally, |
| 146 | + * there are cases where TSC availability is impacted by virtualization, |
| 147 | + * where a simple cpuid feature check would not be enough. |
| 148 | + * |
| 149 | + * Since Linux already does a significant amount of work to determine |
| 150 | + * whether TSC is a viable clock source, decide based on that. |
| 151 | + */ |
| 152 | +void |
| 153 | +pg_initialize_rdtsc(void) |
| 154 | +{ |
| 155 | + FILE *fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); |
| 156 | + |
| 157 | + if (fp) |
| 158 | + { |
| 159 | + char buf[128]; |
| 160 | + |
| 161 | + if (fgets(buf, sizeof(buf), fp) != NULL && strcmp(buf, "tsc\n") == 0) |
| 162 | + { |
| 163 | + /* |
| 164 | + * Compute baseline CPU peformance, determines speed at which |
| 165 | + * RDTSC advances. |
| 166 | + */ |
| 167 | + uint32 tsc_freq; |
| 168 | + |
| 169 | + if (get_tsc_frequency_khz(&tsc_freq)) |
| 170 | + { |
| 171 | + /* |
| 172 | + * Ticks to nanoseconds conversion requires floating point |
| 173 | + * math because because: |
| 174 | + * |
| 175 | + * sec = ticks / frequency_hz ns = ticks / frequency_hz * |
| 176 | + * 1,000,000,000 ns = ticks * (1,000,000,000 / frequency_hz) |
| 177 | + * ns = ticks * (1,000,000 / frequency_khz) <-- now in |
| 178 | + * kilohertz |
| 179 | + * |
| 180 | + * Here, 'ns' is usually a floating number. For example for a |
| 181 | + * 2.5 GHz CPU the scaling factor becomes 1,000,000 / |
| 182 | + * 2,500,000 = 1.2. |
| 183 | + * |
| 184 | + * To be able to use integer math we work around the lack of |
| 185 | + * precision. We first scale the integer up and after the |
| 186 | + * multiplication by the number of ticks in |
| 187 | + * INSTR_TIME_GET_NANOSEC() we divide again by the same value. |
| 188 | + * We picked the scaler such that it provides enough precision |
| 189 | + * and is a power-of-two which allows for shifting instead of |
| 190 | + * doing an integer division. |
| 191 | + */ |
| 192 | + ticks_per_ns_scaled = INT64CONST(1000000) * TICKS_TO_NS_PRECISION / tsc_freq; |
| 193 | + ticks_per_sec = tsc_freq * 1000; /* KHz->Hz */ |
| 194 | + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; |
| 195 | + |
| 196 | + has_rdtsc = true; |
| 197 | + has_rdtscp = is_rdtscp_available(); |
| 198 | + } |
| 199 | + } |
| 200 | + |
| 201 | + fclose(fp); |
| 202 | + } |
| 203 | +} |
| 204 | +#endif /* defined(__x86_64__) && defined(__linux__) */ |
| 205 | + |
| 206 | +#endif /* WIN32 */ |
0 commit comments