-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathl2_amd64.s
50 lines (44 loc) · 820 Bytes
/
l2_amd64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
// func L2(X []float32) float32
//
// L2 returns sqrt(X[0]*X[0] + X[1]*X[1] + ... + X[len-1]*X[len-1]),
// i.e. the Euclidean (L2) norm of X. An empty slice yields 0.
//
// The squares are accumulated four lanes at a time and the lanes are
// combined before the tail elements are added, so rounding may differ
// from a strictly sequential sum.
//
// Stack layout (Go stack-based calling convention):
//   X_base+0(FP)   pointer to the first element
//   X_len+8(FP)    number of elements
//   r+24(FP)       float32 result
// NOTE(review): the result name "r" and the argument size 28 (24-byte slice
// header + 4-byte float32) are taken from the stores below; confirm them
// against the Go declaration of L2.
//
// Registers:
//   SI  pointer to the next unread element
//   CX  element counter (see the invariants at each loop)
//   X0  accumulator (four partial sums, then the scalar total in lane 0)
//   X1  scratch
//
// CX is used for the counter instead of BP: BP is the frame pointer on
// amd64 and the assembler only saves/restores it for non-zero frame sizes,
// so clobbering it here would break frame-pointer based stack unwinding.
//
// Flags: 7 = NOPROF|DUPOK|NOSPLIT (numeric values from textflag.h).
TEXT ·L2(SB), 7, $0-28
	MOVQ	X_base+0(FP), SI
	MOVQ	X_len+8(FP), CX

	// Clear the accumulator.
	XORPS	X0, X0

	// From here until "reduce", CX = (elements remaining) - 4, so a
	// negative value means fewer than four elements are left.
	SUBQ	$4, CX
	JL	reduce

simd_loop:
	// Load four elements (unaligned load) and square them.
	MOVUPS	(SI), X1
	MULPS	X1, X1
	// Add the four squares to the four partial sums.
	ADDPS	X1, X0
	// Advance past the four float32 values just consumed.
	ADDQ	$16, SI
	SUBQ	$4, CX
	JGE	simd_loop	// at least four more elements remain

reduce:
	// Horizontal sum of X0 = [a0, a1, a2, a3] into lane 0 of X0.
	MOVHLPS	X0, X1		// X1[0..1] = a2, a3
	ADDPS	X0, X1		// X1[0..1] = a0+a2, a1+a3 (upper lanes unused)
	MOVSS	X1, X0		// X0[0] = a0+a2
	SHUFPS	$0xe1, X1, X1	// swap lanes 0 and 1: X1[0] = a1+a3
	ADDSS	X1, X0		// X0[0] = a0+a1+a2+a3

	// Undo the last SUBQ so that CX = elements remaining (0..3).
	ADDQ	$4, CX
	JE	done		// nothing left to process

scalar_loop:
	// Square one element and add it to the running total.
	MOVSS	(SI), X1
	MULSS	X1, X1
	ADDSS	X1, X0
	// Advance to the next element.
	ADDQ	$4, SI
	DECQ	CX
	JNE	scalar_loop

done:
	// Square root of the sum of squares is the result.
	SQRTSS	X0, X0
	MOVSS	X0, r+24(FP)
	RET