Skip to content

Commit a83fdf2

Browse files
authored
GH-90997: Improve inline cache performance for MSVC (GH-96781)
1 parent 9f18147 commit a83fdf2

File tree

2 files changed

+21
-74
lines changed

2 files changed

+21
-74
lines changed

Include/internal/pycore_code.h

Lines changed: 19 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -285,110 +285,55 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void);
285285
#define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0)
286286
#endif // !Py_STATS
287287

288-
// Cache values are only valid in memory, so use native endianness.
289-
#ifdef WORDS_BIGENDIAN
288+
// Utility functions for reading/writing 32/64-bit values in the inline caches.
289+
// Great care should be taken to ensure that these functions remain correct and
290+
// performant! They should compile to just "move" instructions on all supported
291+
// compilers and platforms.
292+
293+
// We use memcpy to let the C compiler handle unaligned accesses and endianness
294+
// issues for us. It also seems to produce better code than manual copying for
295+
// most compilers (see https://blog.regehr.org/archives/959 for more info).
290296

291297
static inline void
292298
write_u32(uint16_t *p, uint32_t val)
293299
{
294-
p[0] = (uint16_t)(val >> 16);
295-
p[1] = (uint16_t)(val >> 0);
300+
memcpy(p, &val, sizeof(val));
296301
}
297302

298303
static inline void
299304
write_u64(uint16_t *p, uint64_t val)
300305
{
301-
p[0] = (uint16_t)(val >> 48);
302-
p[1] = (uint16_t)(val >> 32);
303-
p[2] = (uint16_t)(val >> 16);
304-
p[3] = (uint16_t)(val >> 0);
305-
}
306-
307-
static inline uint32_t
308-
read_u32(uint16_t *p)
309-
{
310-
uint32_t val = 0;
311-
val |= (uint32_t)p[0] << 16;
312-
val |= (uint32_t)p[1] << 0;
313-
return val;
314-
}
315-
316-
static inline uint64_t
317-
read_u64(uint16_t *p)
318-
{
319-
uint64_t val = 0;
320-
val |= (uint64_t)p[0] << 48;
321-
val |= (uint64_t)p[1] << 32;
322-
val |= (uint64_t)p[2] << 16;
323-
val |= (uint64_t)p[3] << 0;
324-
return val;
325-
}
326-
327-
#else
328-
329-
static inline void
330-
write_u32(uint16_t *p, uint32_t val)
331-
{
332-
p[0] = (uint16_t)(val >> 0);
333-
p[1] = (uint16_t)(val >> 16);
306+
memcpy(p, &val, sizeof(val));
334307
}
335308

336309
static inline void
337-
write_u64(uint16_t *p, uint64_t val)
310+
write_obj(uint16_t *p, PyObject *val)
338311
{
339-
p[0] = (uint16_t)(val >> 0);
340-
p[1] = (uint16_t)(val >> 16);
341-
p[2] = (uint16_t)(val >> 32);
342-
p[3] = (uint16_t)(val >> 48);
312+
memcpy(p, &val, sizeof(val));
343313
}
344314

345315
static inline uint32_t
346316
read_u32(uint16_t *p)
347317
{
348-
uint32_t val = 0;
349-
val |= (uint32_t)p[0] << 0;
350-
val |= (uint32_t)p[1] << 16;
318+
uint32_t val;
319+
memcpy(&val, p, sizeof(val));
351320
return val;
352321
}
353322

354323
static inline uint64_t
355324
read_u64(uint16_t *p)
356325
{
357-
uint64_t val = 0;
358-
val |= (uint64_t)p[0] << 0;
359-
val |= (uint64_t)p[1] << 16;
360-
val |= (uint64_t)p[2] << 32;
361-
val |= (uint64_t)p[3] << 48;
326+
uint64_t val;
327+
memcpy(&val, p, sizeof(val));
362328
return val;
363329
}
364330

365-
#endif
366-
367-
static inline void
368-
write_obj(uint16_t *p, PyObject *obj)
369-
{
370-
uintptr_t val = (uintptr_t)obj;
371-
#if SIZEOF_VOID_P == 8
372-
write_u64(p, val);
373-
#elif SIZEOF_VOID_P == 4
374-
write_u32(p, val);
375-
#else
376-
#error "SIZEOF_VOID_P must be 4 or 8"
377-
#endif
378-
}
379-
380331
static inline PyObject *
381332
read_obj(uint16_t *p)
382333
{
383-
uintptr_t val;
384-
#if SIZEOF_VOID_P == 8
385-
val = read_u64(p);
386-
#elif SIZEOF_VOID_P == 4
387-
val = read_u32(p);
388-
#else
389-
#error "SIZEOF_VOID_P must be 4 or 8"
390-
#endif
391-
return (PyObject *)val;
334+
PyObject *val;
335+
memcpy(&val, p, sizeof(val));
336+
return val;
392337
}
393338

394339
/* See Objects/exception_handling_notes.txt for details.
Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Improve the performance of reading and writing inline bytecode caches on
2+
some platforms.

0 commit comments

Comments
 (0)