-
Notifications
You must be signed in to change notification settings - Fork 4
/
asm_vecAdd_sse.s
66 lines (50 loc) · 1009 Bytes
/
asm_vecAdd_sse.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
// +build sse
// +build amd64
#include "textflag.h"
// func addAsm(a, b []float64)
//
// addAsm adds b to a element-wise and stores the result in a:
//
//	for i := 0; i < min(len(a), len(b)); i++ { a[i] += b[i] }
//
// b is only read. Elements of a beyond len(b) are left unchanged.
//
// Argument layout (Go stack-based assembly convention, 48 bytes of args):
//	a_base+0(FP)   pointer to a's first element
//	a_len+8(FP)    len(a)
//	b_base+24(FP)  pointer to b's first element
//	b_len+32(FP)   len(b)
//
// Register roles:
//	SI     cursor into a (loaded from and stored to)
//	DI     cursor into b (loaded from only)
//	AX     number of elements still to process
//	CX     len(b), used only to clamp AX
//	X0-X7  scratch for the unrolled packed adds
//
// Alignment: the packed loads/stores use MOVUPD rather than MOVAPD. Slice
// data for []float64 is only guaranteed to be 8-byte aligned (think of
// buf[1:]), and MOVAPD faults on any address that is not a multiple of 16.
// For the same reason the second operand is loaded into a register first
// instead of being folded into ADDPD as a memory operand, which would
// reintroduce the 16-byte alignment requirement.
TEXT ·addAsm(SB), NOSPLIT, $0-48
	MOVQ a_base+0(FP), SI
	MOVQ b_base+24(FP), DI
	MOVQ a_len+8(FP), AX
	MOVQ b_len+32(FP), CX

	// AX = min(len(a), len(b)) so that a short b is never read out of bounds.
	CMPQ    CX, AX
	CMOVQLT CX, AX

	// Bias the counter by one unrolled step (8 elements). While AX stays
	// non-negative there are at least 8 elements left for the main loop.
	SUBQ $8, AX
	JLT  remainder

loop:
	// Elements 0-1 of the current 8-element group.
	MOVUPD (SI), X0
	MOVUPD (DI), X1
	ADDPD  X0, X1
	MOVUPD X1, (SI)

	// Elements 2-3.
	MOVUPD 16(SI), X2
	MOVUPD 16(DI), X3
	ADDPD  X2, X3
	MOVUPD X3, 16(SI)

	// Elements 4-5.
	MOVUPD 32(SI), X4
	MOVUPD 32(DI), X5
	ADDPD  X4, X5
	MOVUPD X5, 32(SI)

	// Elements 6-7.
	MOVUPD 48(SI), X6
	MOVUPD 48(DI), X7
	ADDPD  X6, X7
	MOVUPD X7, 48(SI)

	// Advance both cursors by 8 elements * 8 bytes each.
	ADDQ $64, SI
	ADDQ $64, DI

	// Eight fewer elements remain; keep going while a full group is left.
	SUBQ $8, AX
	JGE  loop

remainder:
	// Undo the bias: AX is now the true number of leftover elements (0..7).
	ADDQ $8, AX
	JEQ  done

remainderloop:
	// Scalar tail: one float64 per iteration.
	MOVSD (SI), X0
	MOVSD (DI), X1
	ADDSD X0, X1
	MOVSD X1, (SI)

	// Advance both cursors by one element.
	ADDQ $8, SI
	ADDQ $8, DI
	DECQ AX
	JNE  remainderloop

done:
	RET