Skip to content

Commit cd11541

Browse files
Andreas Auernhammer authored and agl committed
curve25519: improve cswap
Simplify the constant swap function.

On amd64: Replace the CMOVQEQ scheme with SSE2 code similar to the non-amd64 code.
On non-amd64: Avoid unnecessary loop iterations.

The result is less code that is also slightly faster.

name              old time/op  new time/op  delta
ScalarBaseMult-4  653µs ± 0%   636µs ± 0%   ~ (p=0.100 n=3+3)

name            old time/op  new time/op  delta
ConstantSwap-4  10.4ns ± 1%  6.2ns ± 0%   -39.86% (p=0.029 n=4+4)

On an i7-65000U

Change-Id: Ia5eea92e0b3eabb6c291d25229aa582b51278552
Reviewed-on: https://go-review.googlesource.com/39693
Reviewed-by: Adam Langley <[email protected]>
Run-TryBot: Adam Langley <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
1 parent 9b9c1af commit cd11541

File tree

3 files changed

+72
-92
lines changed

3 files changed

+72
-92
lines changed

curve25519/cswap_amd64.s

Lines changed: 54 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -2,87 +2,64 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
// This code was translated into a form compatible with 6a from the public
6-
// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
7-
85
// +build amd64,!gccgo,!appengine
96

10-
// func cswap(inout *[5]uint64, v uint64)
7+
// func cswap(inout *[4][5]uint64, v uint64)
118
TEXT ·cswap(SB),7,$0
129
MOVQ inout+0(FP),DI
1310
MOVQ v+8(FP),SI
1411

15-
CMPQ SI,$1
16-
MOVQ 0(DI),SI
17-
MOVQ 80(DI),DX
18-
MOVQ 8(DI),CX
19-
MOVQ 88(DI),R8
20-
MOVQ SI,R9
21-
CMOVQEQ DX,SI
22-
CMOVQEQ R9,DX
23-
MOVQ CX,R9
24-
CMOVQEQ R8,CX
25-
CMOVQEQ R9,R8
26-
MOVQ SI,0(DI)
27-
MOVQ DX,80(DI)
28-
MOVQ CX,8(DI)
29-
MOVQ R8,88(DI)
30-
MOVQ 16(DI),SI
31-
MOVQ 96(DI),DX
32-
MOVQ 24(DI),CX
33-
MOVQ 104(DI),R8
34-
MOVQ SI,R9
35-
CMOVQEQ DX,SI
36-
CMOVQEQ R9,DX
37-
MOVQ CX,R9
38-
CMOVQEQ R8,CX
39-
CMOVQEQ R9,R8
40-
MOVQ SI,16(DI)
41-
MOVQ DX,96(DI)
42-
MOVQ CX,24(DI)
43-
MOVQ R8,104(DI)
44-
MOVQ 32(DI),SI
45-
MOVQ 112(DI),DX
46-
MOVQ 40(DI),CX
47-
MOVQ 120(DI),R8
48-
MOVQ SI,R9
49-
CMOVQEQ DX,SI
50-
CMOVQEQ R9,DX
51-
MOVQ CX,R9
52-
CMOVQEQ R8,CX
53-
CMOVQEQ R9,R8
54-
MOVQ SI,32(DI)
55-
MOVQ DX,112(DI)
56-
MOVQ CX,40(DI)
57-
MOVQ R8,120(DI)
58-
MOVQ 48(DI),SI
59-
MOVQ 128(DI),DX
60-
MOVQ 56(DI),CX
61-
MOVQ 136(DI),R8
62-
MOVQ SI,R9
63-
CMOVQEQ DX,SI
64-
CMOVQEQ R9,DX
65-
MOVQ CX,R9
66-
CMOVQEQ R8,CX
67-
CMOVQEQ R9,R8
68-
MOVQ SI,48(DI)
69-
MOVQ DX,128(DI)
70-
MOVQ CX,56(DI)
71-
MOVQ R8,136(DI)
72-
MOVQ 64(DI),SI
73-
MOVQ 144(DI),DX
74-
MOVQ 72(DI),CX
75-
MOVQ 152(DI),R8
76-
MOVQ SI,R9
77-
CMOVQEQ DX,SI
78-
CMOVQEQ R9,DX
79-
MOVQ CX,R9
80-
CMOVQEQ R8,CX
81-
CMOVQEQ R9,R8
82-
MOVQ SI,64(DI)
83-
MOVQ DX,144(DI)
84-
MOVQ CX,72(DI)
85-
MOVQ R8,152(DI)
86-
MOVQ DI,AX
87-
MOVQ SI,DX
12+
SUBQ $1, SI
13+
NOTQ SI
14+
MOVQ SI, X15
15+
PSHUFD $0x44, X15, X15
16+
17+
MOVOU 0(DI), X0
18+
MOVOU 16(DI), X2
19+
MOVOU 32(DI), X4
20+
MOVOU 48(DI), X6
21+
MOVOU 64(DI), X8
22+
MOVOU 80(DI), X1
23+
MOVOU 96(DI), X3
24+
MOVOU 112(DI), X5
25+
MOVOU 128(DI), X7
26+
MOVOU 144(DI), X9
27+
28+
MOVO X1, X10
29+
MOVO X3, X11
30+
MOVO X5, X12
31+
MOVO X7, X13
32+
MOVO X9, X14
33+
34+
PXOR X0, X10
35+
PXOR X2, X11
36+
PXOR X4, X12
37+
PXOR X6, X13
38+
PXOR X8, X14
39+
PAND X15, X10
40+
PAND X15, X11
41+
PAND X15, X12
42+
PAND X15, X13
43+
PAND X15, X14
44+
PXOR X10, X0
45+
PXOR X10, X1
46+
PXOR X11, X2
47+
PXOR X11, X3
48+
PXOR X12, X4
49+
PXOR X12, X5
50+
PXOR X13, X6
51+
PXOR X13, X7
52+
PXOR X14, X8
53+
PXOR X14, X9
54+
55+
MOVOU X0, 0(DI)
56+
MOVOU X2, 16(DI)
57+
MOVOU X4, 32(DI)
58+
MOVOU X6, 48(DI)
59+
MOVOU X8, 64(DI)
60+
MOVOU X1, 80(DI)
61+
MOVOU X3, 96(DI)
62+
MOVOU X5, 112(DI)
63+
MOVOU X7, 128(DI)
64+
MOVOU X9, 144(DI)
8865
RET

curve25519/curve25519.go

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88

99
package curve25519
1010

11+
import (
12+
"encoding/binary"
13+
)
14+
1115
// This code is a port of the public domain, "ref10" implementation of
1216
// curve25519 from SUPERCOP 20130419 by D. J. Bernstein.
1317

@@ -50,17 +54,11 @@ func feCopy(dst, src *fieldElement) {
5054
//
5155
// Preconditions: b in {0,1}.
5256
func feCSwap(f, g *fieldElement, b int32) {
53-
var x fieldElement
5457
b = -b
55-
for i := range x {
56-
x[i] = b & (f[i] ^ g[i])
57-
}
58-
5958
for i := range f {
60-
f[i] ^= x[i]
61-
}
62-
for i := range g {
63-
g[i] ^= x[i]
59+
t := b & (f[i] ^ g[i])
60+
f[i] ^= t
61+
g[i] ^= t
6462
}
6563
}
6664

@@ -75,12 +73,7 @@ func load3(in []byte) int64 {
7573

7674
// load4 reads a 32-bit, little-endian value from in.
7775
func load4(in []byte) int64 {
78-
var r int64
79-
r = int64(in[0])
80-
r |= int64(in[1]) << 8
81-
r |= int64(in[2]) << 16
82-
r |= int64(in[3]) << 24
83-
return r
76+
return int64(binary.LittleEndian.Uint32(in))
8477
}
8578

8679
func feFromBytes(dst *fieldElement, src *[32]byte) {

curve25519/curve25519_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,13 @@ func TestBaseScalarMult(t *testing.T) {
2727
t.Errorf("incorrect result: got %s, want %s", result, expectedHex)
2828
}
2929
}
30+
31+
func BenchmarkScalarBaseMult(b *testing.B) {
32+
var in, out [32]byte
33+
in[0] = 1
34+
35+
b.SetBytes(32)
36+
for i := 0; i < b.N; i++ {
37+
ScalarBaseMult(&out, &in)
38+
}
39+
}

0 commit comments

Comments
 (0)