-
Notifications
You must be signed in to change notification settings - Fork 139
/
Copy pathprint-crc32-x86-sse42-code.go
167 lines (145 loc) · 7.43 KB
/
print-crc32-x86-sse42-code.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
// Copyright 2024 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//
// SPDX-License-Identifier: Apache-2.0 OR MIT
//go:build ignore
// +build ignore
package main
// print-crc32-x86-sse42-code.go prints the std/crc32 x86/SSE4.2 Wuffs code
// based on some C code generated by https://github.com/corsix/fast-crc32/
//
// Usage: go run print-crc32-x86-sse42-code.go
import (
"fmt"
"regexp"
"strconv"
"strings"
)
// Patterns for the C statements whose register numbers or constants vary from
// line to line. They are compiled once, at package scope.
var (
	reXEqLoadu    = regexp.MustCompile(`^__m128i x(\d+) = _mm_loadu_si128`)
	reKEqSetr     = regexp.MustCompile(`^k = _mm_setr_epi32\(([^,]+), ([^,]+), ([^,]+), ([^\)]+)\);$`)
	reYEqClmul    = regexp.MustCompile(`^y(\d+) = clmul_lo\(x(\d+), k\), x(\d+) = clmul_hi\(x(\d+), k\);$`)
	reYEqXorLoadu = regexp.MustCompile(`^y(\d+) = _mm_xor_si128\(y(\d+), _mm_loadu_si128`)
	reYEqXorYX    = regexp.MustCompile(`^y(\d+) = _mm_xor_si128\(y(\d+), x(\d+)\), x(\d+) = _mm_xor_si128\(x(\d+), y(\d+)\);$`)
)

// main translates the C code in srcSSECRC32V8 into Wuffs code, one input line
// at a time, and prints the result to standard output between a BEGIN and an
// END marker comment.
//
// The first line that cannot be translated is reported as a
// "// Could not process" comment and stops the translation; the END marker is
// still printed afterwards.
func main() {
	fmt.Println("// BEGIN script/print-crc32-x86-sse42-code.go generated code.")
	for src := srcSSECRC32V8; src != ""; {
		// Split off the next line. A final line that lacks a trailing newline
		// is consumed whole: IndexByte returns -1 in that case, which must not
		// be used as a slice bound.
		line := src
		if i := strings.IndexByte(src, '\n'); i >= 0 {
			line, src = src[:i], src[i+1:]
		} else {
			src = ""
		}
		line = strings.TrimSpace(line)

		out, ok := translateLine(line)
		if !ok {
			fmt.Printf("// Could not process %q.\n", line)
			break
		}
		fmt.Print(out)
	}
	fmt.Println("// END script/print-crc32-x86-sse42-code.go generated code.")
}

// translateLine returns the Wuffs code for one whitespace-trimmed line of C
// code. Each line of the returned text ends with a newline; the text is empty
// for input lines that produce no output. ok is false when the line matches
// no known statement, or when a register number in it does not fit in an int.
func translateLine(line string) (out string, ok bool) {
	// Both crc_u64 statements print the same kk constant and the same
	// three-line method-chain tail; only the "s = ..." line differs.
	const (
		reduceKK   = "kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)\n"
		reduceTail = " _mm_clmulepi64_si128(b: kk, imm8: 0x00).\n" +
			" _mm_clmulepi64_si128(b: kk, imm8: 0x10).\n" +
			" _mm_extract_epi32(imm8: 2)\n"
	)

	// Blank lines and C comments produce no output.
	if line == "" || strings.HasPrefix(line, "/*") {
		return "", true
	}

	// Statements that are matched verbatim.
	switch line {
	case "__m128i k;", "len -= 128;":
		// Recognized, but nothing is printed for them.
		return "", true
	case "x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);":
		return "x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))\n", true
	case "buf += 128;":
		return "args.x = args.x[128 ..]\n", true
	case "while (len >= 128) {":
		return "while args.x.length() >= 128 {\n", true
	case "}":
		return "} endwhile\n", true
	case "crc0 = crc_u64(0, _mm_extract_epi64(x0, 0));":
		return reduceKK +
			"s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 0)).\n" +
			reduceTail, true
	case "crc0 = crc_u64(crc0, _mm_extract_epi64(x0, 1));":
		return reduceKK +
			"s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 1) ^ (s as base.u64)).\n" +
			reduceTail, true
	}

	// Statements whose register numbers or constants vary.
	if m := reXEqLoadu.FindStringSubmatch(line); m != nil {
		n, err := strconv.Atoi(m[1])
		if err != nil {
			return "", false
		}
		// Register xN is printed as loading bytes [16*N, 16*(N+1)) of args.x.
		return fmt.Sprintf("x%d = util.make_m128i_slice128(a: args.x[0x%02X .. 0x%02X])\n",
			n, 16*n, 16*(n+1)), true
	}
	if m := reKEqSetr.FindStringSubmatch(line); m != nil {
		return fmt.Sprintf("kk = util.make_m128i_multiple_u32(a00: %s, a01: %s, a02: %s, a03: %s)\n",
			m[1], m[2], m[3], m[4]), true
	}
	if m := reYEqClmul.FindStringSubmatch(line); m != nil {
		return fmt.Sprintf("y%s = x%s._mm_clmulepi64_si128(b: kk, imm8: 0x00)\n"+
			"x%s = x%s._mm_clmulepi64_si128(b: kk, imm8: 0x11)\n",
			m[1], m[2], m[3], m[4]), true
	}
	if m := reYEqXorLoadu.FindStringSubmatch(line); m != nil {
		// Only the first register number is used: the printed slice bounds
		// and the trailing x-update are all derived from it.
		n, err := strconv.Atoi(m[1])
		if err != nil {
			return "", false
		}
		return fmt.Sprintf("y%d = y%d._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x%02X .. 0x%02X]))\n"+
			"x%d = x%d._mm_xor_si128(b: y%d)\n",
			n, n, 16*n, 16*(n+1), n, n, n), true
	}
	if m := reYEqXorYX.FindStringSubmatch(line); m != nil {
		return fmt.Sprintf("y%s = y%s._mm_xor_si128(b: x%s)\n"+
			"x%s = x%s._mm_xor_si128(b: y%s)\n",
			m[1], m[2], m[3], m[4], m[5], m[6]), true
	}

	return "", false
}
// srcSSECRC32V8 is the C code that main translates, line by line, into Wuffs
// code. main trims each line and skips blank lines and lines starting with
// "/*".
//
// It is the core (inside "if (len >= 128)") of the code produced by
// generate.c in https://github.com/corsix/fast-crc32/ when parameterized by
// "./generate -i sse -p crc32 -a v8".
const srcSSECRC32V8 = `
/* First vector chunk. */
__m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0;
__m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1;
__m128i x2 = _mm_loadu_si128((const __m128i*)(buf + 32)), y2;
__m128i x3 = _mm_loadu_si128((const __m128i*)(buf + 48)), y3;
__m128i x4 = _mm_loadu_si128((const __m128i*)(buf + 64)), y4;
__m128i x5 = _mm_loadu_si128((const __m128i*)(buf + 80)), y5;
__m128i x6 = _mm_loadu_si128((const __m128i*)(buf + 96)), y6;
__m128i x7 = _mm_loadu_si128((const __m128i*)(buf + 112)), y7;
__m128i k;
k = _mm_setr_epi32(0x33fff533, 0, 0x910eeec1, 0);
x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
buf += 128;
len -= 128;
/* Main loop. */
while (len >= 128) {
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k);
y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k);
y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0);
y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf + 16))), x1 = _mm_xor_si128(x1, y1);
y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf + 32))), x2 = _mm_xor_si128(x2, y2);
y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf + 48))), x3 = _mm_xor_si128(x3, y3);
y4 = _mm_xor_si128(y4, _mm_loadu_si128((const __m128i*)(buf + 64))), x4 = _mm_xor_si128(x4, y4);
y5 = _mm_xor_si128(y5, _mm_loadu_si128((const __m128i*)(buf + 80))), x5 = _mm_xor_si128(x5, y5);
y6 = _mm_xor_si128(y6, _mm_loadu_si128((const __m128i*)(buf + 96))), x6 = _mm_xor_si128(x6, y6);
y7 = _mm_xor_si128(y7, _mm_loadu_si128((const __m128i*)(buf + 112))), x7 = _mm_xor_si128(x7, y7);
buf += 128;
len -= 128;
}
/* Reduce x0 ... x7 to just x0. */
k = _mm_setr_epi32(0xae689191, 0, 0xccaa009e, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
y4 = _mm_xor_si128(y4, x5), x4 = _mm_xor_si128(x4, y4);
y6 = _mm_xor_si128(y6, x7), x6 = _mm_xor_si128(x6, y6);
k = _mm_setr_epi32(0xf1da05aa, 0, 0x81256527, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
y4 = _mm_xor_si128(y4, x6), x4 = _mm_xor_si128(x4, y4);
k = _mm_setr_epi32(0x8f352d95, 0, 0x1d9513d7, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y0 = _mm_xor_si128(y0, x4), x0 = _mm_xor_si128(x0, y0);
/* Reduce 128 bits to 32 bits, and multiply by x^32. */
crc0 = crc_u64(0, _mm_extract_epi64(x0, 0));
crc0 = crc_u64(crc0, _mm_extract_epi64(x0, 1));
`