Skip to content

Commit 476f8e3

Browse files
j-naylor authored and Commitfest Bot committed
Use template file for parallel CRC computation
1 parent 194112e commit 476f8e3

File tree

4 files changed

+112
-86
lines changed

4 files changed

+112
-86
lines changed

src/port/pg_crc32c_armv8.c

Lines changed: 31 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -18,13 +18,42 @@
1818

1919
#include "port/pg_crc32c.h"
2020

21+
#define DEBUG_CRC /* XXX not for commit */
22+
23+
static pg_crc32c pg_comp_crc32c_armv8_tail(pg_crc32c crc, const void *data, size_t len);
24+
25+
2126
/*
 * pg_comp_crc32c_armv8
 *
 * Update the running CRC value 'crc' with the 'len' bytes at 'data' and
 * return the result.
 *
 * The work is split in two stages.  The included template
 * (pg_crc32c_parallel.h) is meant to consume inputs of at least
 * MIN_PARALLEL_LENGTH bytes, updating the local variables 'crc0', 'p' and
 * 'len' in place.  Whatever is left over is then handed to
 * pg_comp_crc32c_armv8_tail().
 */
pg_crc32c
2227
pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
28+
{
29+
const unsigned char *p = data;
30+
pg_crc32c crc0 = crc;
31+
32+
#ifdef DEBUG_CRC
33+
/* remember the original length; 'len' is modified by the template below */
const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len;
34+
#endif
35+
36+
/* min size to compute multiple segments in parallel */
37+
#define MIN_PARALLEL_LENGTH 600
38+
39+
/* map the template's 1-byte and 8-byte CRC steps onto the ARMv8 intrinsics */
#define PG_CRC32C_1B(c, w) __crc32cb(c, w)
40+
#define PG_CRC32C_8B(c, w) __crc32cd(c, w)
41+
#include "pg_crc32c_parallel.h"
42+
43+
/* process the remaining bytes, continuing from the CRC computed so far */
crc0 = pg_comp_crc32c_armv8_tail(crc0, p, len);
44+
45+
#ifdef DEBUG_CRC
46+
/* cross-check against pg_comp_crc32c_sb8() over the whole original input */
Assert(crc0 == pg_comp_crc32c_sb8(crc, data, orig_len));
47+
#endif
48+
49+
return crc0;
50+
}
51+
52+
static pg_crc32c
53+
pg_comp_crc32c_armv8_tail(pg_crc32c crc, const void *data, size_t len)
2354
{
2455
const unsigned char *p = data;
2556
const unsigned char *pend = p + len;
26-
const size_t min_blocklen = 42; /* Min size to consider interleaving */
27-
const pg_crc32c orig_crc = crc; // XXX not for commit
2857

2958
/*
3059
* ARMv8 doesn't require alignment, but aligned memory access is
@@ -50,36 +79,6 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
5079
p += 4;
5180
}
5281

53-
/* See pg_crc32c_sse42.c for explanation */
54-
while (p + min_blocklen * CRC_BYTES_PER_ITER <= pend)
55-
{
56-
const size_t block_len = Min(CRC_MAX_BLOCK_LEN, (pend - p) / CRC_BYTES_PER_ITER);
57-
const uint64 *in64 = (const uint64 *) (p);
58-
pg_crc32c crc0 = crc,
59-
crc1 = 0,
60-
crc2 = 0;
61-
uint64 mul0,
62-
mul1,
63-
precompute;
64-
65-
for (int i = 0; i < block_len; i++, in64++)
66-
{
67-
crc0 = __crc32cd(crc0, *(in64));
68-
crc1 = __crc32cd(crc1, *(in64 + block_len));
69-
crc2 = __crc32cd(crc2, *(in64 + block_len * 2));
70-
}
71-
72-
precompute = combine_crc_lookup[block_len - 1];
73-
mul0 = pg_clmul(crc0, (uint32) precompute);
74-
mul1 = pg_clmul(crc1, (uint32) (precompute >> 32));
75-
76-
crc0 = __crc32cd(0, mul0);
77-
crc1 = __crc32cd(0, mul1);
78-
crc = crc0 ^ crc1 ^ crc2;
79-
80-
p += block_len * CRC_BYTES_PER_ITER;
81-
}
82-
8382
/* Process eight bytes at a time, as far as we can. */
8483
while (p + 8 <= pend)
8584
{
@@ -103,7 +102,5 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
103102
crc = __crc32cb(crc, *p);
104103
}
105104

106-
// XXX not for commit
107-
Assert(crc == pg_comp_crc32c_sb8(orig_crc, data, len));
108105
return crc;
109106
}

src/port/pg_crc32c_parallel.h

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* pg_crc32c_parallel.h
4+
* Hardware-independent template for parallel CRC computation.
5+
*
6+
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7+
* Portions Copyright (c) 1994, Regents of the University of California
8+
*
9+
*
10+
* src/port/pg_crc32c_parallel.h
11+
*
12+
*-------------------------------------------------------------------------
13+
*/
14+
#ifndef PG_CRC32C_H
15+
#define PG_CRC32C_H
16+
17+
if (unlikely(len >= MIN_PARALLEL_LENGTH))
18+
{
19+
/*
20+
* Align pointer regardless of architecture to avoid straddling cacheline
21+
* boundaries, since we issue three loads per loop iteration below.
22+
*/
23+
for (; (uintptr_t) p & 7; len--)
24+
crc0 = PG_CRC32C_1B(crc0, *p++);
25+
26+
/*
27+
* A CRC instruction can be issued every cycle on many architectures, but
28+
* the latency of its result will take several cycles. We can take
29+
* advantage of this by dividing the input into 3 equal blocks and
30+
* computing the CRC of each independently.
31+
*/
32+
while (len >= MIN_PARALLEL_LENGTH)
33+
{
34+
const size_t block_len = Min(CRC_MAX_BLOCK_LEN,
35+
len / CRC_BYTES_PER_ITER);
36+
const uint64 *in64 = (const uint64 *) (p);
37+
pg_crc32c crc1 = 0,
38+
crc2 = 0;
39+
uint64 mul0,
40+
mul1,
41+
precompute;
42+
43+
for (int i = 0; i < block_len; i++, in64++)
44+
{
45+
crc0 = PG_CRC32C_8B(crc0, *(in64));
46+
crc1 = PG_CRC32C_8B(crc1, *(in64 + block_len));
47+
crc2 = PG_CRC32C_8B(crc2, *(in64 + block_len * 2));
48+
}
49+
50+
/*
51+
* Combine the partial CRCs using carryless multiplication on
52+
* pre-computed length-specific constants.
53+
*/
54+
precompute = combine_crc_lookup[block_len - 1];
55+
mul0 = pg_clmul(crc0, (uint32) precompute);
56+
mul1 = pg_clmul(crc1, (uint32) (precompute >> 32));
57+
crc0 = PG_CRC32C_8B(0, mul0);
58+
crc0 ^= PG_CRC32C_8B(0, mul1);
59+
crc0 ^= crc2;
60+
61+
p += block_len * CRC_BYTES_PER_ITER;
62+
len -= block_len * CRC_BYTES_PER_ITER;
63+
}
64+
}
65+
66+
#endif /* PG_CRC32C_H */

src/port/pg_crc32c_sb8.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,6 +1169,8 @@ static const uint32 pg_crc32c_table[8][256] = {
11691169
};
11701170

11711171

1172+
/* platform-independent infrastructure for parallel CRC computation */
1173+
11721174
/*
11731175
* Carryless multiplication in software
11741176
*/

src/port/pg_crc32c_sse42.c

Lines changed: 13 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@
1818

1919
#include "port/pg_crc32c.h"
2020

21-
/* min size to compute multiple segments in parallel */
22-
#define MIN_PARALLEL_LENGTH 600
21+
#define DEBUG_CRC /* XXX not for commit */
2322

2423
static pg_crc32c pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len);
2524

@@ -31,64 +30,26 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
3130
const unsigned char *p = data;
3231
pg_crc32c crc0 = crc;
3332

34-
/* XXX not for commit */
33+
#ifdef DEBUG_CRC
3534
const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len;
35+
#endif
3636

3737
#if SIZEOF_VOID_P >= 8
38-
if (unlikely(len >= MIN_PARALLEL_LENGTH))
39-
{
40-
/*
41-
* Align pointer to avoid straddling cacheline boundaries, since we
42-
* issue three loads per loop iteration below.
43-
*/
44-
for (; (uintptr_t) p & 7; len--)
45-
crc0 = _mm_crc32_u8(crc0, *p++);
46-
47-
/*
48-
* A CRC instruction can be issued every cycle but the latency of its
49-
* result will take several cycles. We can take advantage of this by
50-
* dividing the input into 3 equal blocks and computing the CRC of
51-
* each independently.
52-
*/
53-
while (len >= MIN_PARALLEL_LENGTH)
54-
{
55-
const size_t block_len = Min(CRC_MAX_BLOCK_LEN,
56-
len / CRC_BYTES_PER_ITER);
57-
const uint64 *in64 = (const uint64 *) (p);
58-
pg_crc32c crc1 = 0,
59-
crc2 = 0;
60-
uint64 mul0,
61-
mul1,
62-
precompute;
63-
64-
for (int i = 0; i < block_len; i++, in64++)
65-
{
66-
crc0 = _mm_crc32_u64(crc0, *(in64));
67-
crc1 = _mm_crc32_u64(crc1, *(in64 + block_len));
68-
crc2 = _mm_crc32_u64(crc2, *(in64 + block_len * 2));
69-
}
70-
71-
/*
72-
* Combine the partial CRCs using carryless multiplication on
73-
* pre-computed length-specific constants.
74-
*/
75-
precompute = combine_crc_lookup[block_len - 1];
76-
mul0 = pg_clmul(crc0, (uint32) precompute);
77-
mul1 = pg_clmul(crc1, (uint32) (precompute >> 32));
78-
crc0 = _mm_crc32_u64(0, mul0);
79-
crc0 ^= _mm_crc32_u64(0, mul1);
80-
crc0 ^= crc2;
81-
82-
p += block_len * CRC_BYTES_PER_ITER;
83-
len -= block_len * CRC_BYTES_PER_ITER;
84-
}
85-
}
38+
39+
/* min size to compute multiple segments in parallel */
40+
#define MIN_PARALLEL_LENGTH 600
41+
42+
#define PG_CRC32C_1B(c, w) _mm_crc32_u8(c, w)
43+
#define PG_CRC32C_8B(c, w) _mm_crc32_u64(c, w)
44+
#include "pg_crc32c_parallel.h"
45+
8646
#endif
8747

8848
crc0 = pg_comp_crc32c_sse42_tail(crc0, p, len);
8949

90-
/* XXX not for commit */
50+
#ifdef DEBUG_CRC
9151
Assert(crc0 == pg_comp_crc32c_sb8(crc, data, orig_len));
52+
#endif
9253

9354
return crc0;
9455
}

0 commit comments

Comments
 (0)