
Commit 367743d

mask_amd64.s: Cleanup
1 parent: 3f8c9e0

File tree: 2 files changed, +15 -12 lines

mask_amd64.s

Lines changed: 12 additions & 11 deletions
@@ -10,18 +10,18 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28
     MOVQ len+8(FP), CX
     MOVL key+16(FP), SI
 
-    // calculate the DI
-    // DI = SI<<32 | SI
+    // Calculate the DI aka the uint64 key.
+    // DI = uint64(SI) | uint64(SI)<<32
     MOVL SI, DI
     MOVQ DI, DX
     SHLQ $32, DI
     ORQ DX, DI
 
-    CMPQ CX, $7
-    JLE less_than_8
-    CMPQ CX, $63
-    JLE less_than_64
-    CMPQ CX, $128
+    CMPQ CX, $8
+    JL less_than_8
+    CMPQ CX, $64
+    JL less_than_64
+    CMPQ CX, $512
     JLE sse
     TESTQ $31, AX
     JNZ unaligned
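
The clarified comment spells out the trick: the 32-bit mask key is duplicated into both halves of a 64-bit register so the wide loops can XOR eight payload bytes per step. A minimal Go sketch of the same computation (names are illustrative, not from this commit):

    package main

    import "fmt"

    // broadenKey mirrors DI = uint64(SI) | uint64(SI)<<32.
    func broadenKey(key uint32) uint64 {
        return uint64(key) | uint64(key)<<32
    }

    func main() {
        fmt.Printf("%#016x\n", broadenKey(0xdeadbeef)) // 0xdeadbeefdeadbeef
    }

The comparison rewrites are behavior-preserving (CMPQ $8 with JL is the same test as CMPQ $7 with JLE), while the last threshold moves from 128 to 512 so small and medium buffers stay on the SSE path.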
@@ -34,8 +34,8 @@ unaligned_loop_1byte:
     TESTQ $7, AX
     JNZ unaligned_loop_1byte
 
-    // calculate DI again since SI was modified
-    // DI = SI<<32 | SI
+    // Calculate DI again since SI was modified.
+    // DI = uint64(SI) | uint64(SI)<<32
     MOVL SI, DI
     MOVQ DI, DX
     SHLQ $32, DI
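
The rewritten comment explains why the re-derivation is needed: the byte-at-a-time alignment loop above consumes mask bytes and rotates SI as it goes, leaving the 64-bit copy in DI stale. A rough Go equivalent of that per-byte rotation (the helper name and the little-endian assumption are mine, not the repo's):

    package mask // hypothetical package name

    import "math/bits"

    // maskBytewise XORs n leading bytes one at a time, rotating the
    // 32-bit key after each byte so later eight-byte steps stay in
    // phase with the data; this is the modification to SI that forces
    // DI to be rebuilt.
    func maskBytewise(b []byte, n int, key uint32) uint32 {
        for i := 0; i < n; i++ {
            b[i] ^= byte(key)
            key = bits.RotateLeft32(key, -8)
        }
        return key
    }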
@@ -45,11 +45,12 @@ unaligned_loop_1byte:
     JZ sse
 
 unaligned:
-    TESTQ $7, AX // AND $7 & len, if not zero jump to loop_1b.
+    // $7 & len, if not zero jump to loop_1b.
+    TESTQ $7, AX
     JNZ unaligned_loop_1byte
 
 unaligned_loop:
-    // we don't need to check the CX since we know it's above 128
+    // We don't need to check the CX since we know it's above 512.
     XORQ DI, (AX)
     ADDQ $8, AX
     SUBQ $8, CX
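
unaligned_loop is the scalar wide path the updated comment documents. In plain Go it would look roughly like the sketch below, with encoding/binary standing in for the assembly's direct 8-byte memory access (the helper name is hypothetical):

    package mask // hypothetical package name

    import "encoding/binary"

    // maskChunks XORs the broadened 64-bit key into b eight bytes at a
    // time, mirroring XORQ DI, (AX); ADDQ $8, AX; SUBQ $8, CX.
    func maskChunks(b []byte, key64 uint64) {
        for len(b) >= 8 {
            v := binary.LittleEndian.Uint64(b)
            binary.LittleEndian.PutUint64(b, v^key64)
            b = b[8:]
        }
    }

The assembly can omit a length check at loop entry because control only reaches unaligned_loop when CX is already known to exceed 512.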

mask_arm64.s

Lines changed: 3 additions & 1 deletion
@@ -1,6 +1,6 @@
 #include "textflag.h"
 
-// func maskAsm(b *byte,len, int, key uint32)
+// func maskAsm(b *byte, len int, key uint32)
 TEXT ·maskAsm(SB), NOSPLIT, $0-28
     // R0 = b
     // R1 = len
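
With the stray comma gone, the comment parses as a three-parameter function. The Go-side stub such a TEXT symbol is declared against would look roughly like this; the package name, the //go:noescape pragma, and the uint32 result (inferred from the $0-28 argument frame, which leaves room for a 4-byte return slot after the arguments) are assumptions:

    //go:build arm64

    package mask // hypothetical package name

    // maskAsm is implemented in mask_arm64.s.
    //
    //go:noescape
    func maskAsm(b *byte, len int, key uint32) uint32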
@@ -15,6 +15,8 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28
     CMP $64, R1
     BLT less_than_64
 
+    // TODO: align memory like amd64
+
 loop_64:
     VLD1 (R0), [V1.B16, V2.B16, V3.B16, V4.B16]
     VEOR V1.B16, V0.B16, V1.B16
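
The new TODO refers to the prologue the amd64 hunks above implement: mask leading bytes one at a time until the pointer is aligned, then switch to wide loads. The alignment arithmetic, sketched in Go under the assumption that arm64 would align to 8 bytes the same way (the helper name is mine):

    package mask // hypothetical package name

    import "unsafe"

    // bytesToAlign reports how many leading bytes must be masked one
    // by one before p is 8-byte aligned (the amd64 prologue instead
    // loops while TESTQ $7, AX is nonzero).
    func bytesToAlign(p unsafe.Pointer) int {
        return int(-uintptr(p) & 7)
    }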
