-
Notifications
You must be signed in to change notification settings - Fork 23
/
_exp_arm64.s
183 lines (179 loc) · 5.28 KB
/
_exp_arm64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
//go:build !tinygo && !noasm
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#define Ln2Hi 6.9313812256e-01
#define Ln2Lo 9.0580006145e-06
#define Log2e 1.4426950216e+00
#define Overflow 7.097827e+02
#define Underflow -7.451332e+02
#define Overflow2 1.024000e+03
#define Underflow2 -1.0740e+03
#define NearZero 0x317fffff // 2**-28
#define PosInf 0x7f800000
#define FracMask 0x07fffff
#define C1 0x34000000 // 2**-23
#define P1 1.6666667163e-01 // 0x3FC55555; 0x55555555
#define P2 -2.7777778450e-03 // 0xBF66C16C; 0x16BEBD93
#define P3 6.6137559770e-05 // 0x3F11566A; 0xAF25DE2C
#define P4 -1.6533901999e-06 // 0xBEBBBD41; 0xC5D26BF1
#define P5 4.1381369442e-08 // 0x3E663769; 0x72BEA4D0
// Exp returns e**x, the base-e exponential of x.
// This is an assembly implementation of the method used for function Exp in file exp.go.
//
// func archExp(x float32) float32
TEXT ·archExp(SB),$0-12
FMOVS x+0(FP), F0 // F0 = x
FCMPS F0, F0
BNE isNaN // x = NaN, return NaN
FMOVS $Overflow, F1
FCMPS F1, F0
BGT overflow // x > Overflow, return PosInf
FMOVS $Underflow, F1
FCMPS F1, F0
BLT underflow // x < Underflow, return 0
MOVW $NearZero, R0
FMOVS R0, F2
FABSS F0, F3
FMOVS $1.0, F1 // F1 = 1.0
FCMPS F2, F3
BLT nearzero // fabs(x) < NearZero, return 1 + x
// argument reduction, x = k*ln2 + r, |r| <= 0.5*ln2
// computed as r = hi - lo for extra precision.
FMOVS $Log2e, F2
FMOVS $0.5, F3
FNMSUBS F0, F3, F2, F4 // Log2e*x - 0.5
FMADDS F0, F3, F2, F3 // Log2e*x + 0.5
FCMPS $0.0, F0
FCSELS LT, F4, F3, F3 // F3 = k
FCVTZSS F3, R1 // R1 = int(k)
SCVTFS R1, F3 // F3 = float32(int(k))
FMOVS $Ln2Hi, F4 // F4 = Ln2Hi
FMOVS $Ln2Lo, F5 // F5 = Ln2Lo
FMSUBS F3, F0, F4, F4 // F4 = hi = x - float32(int(k))*Ln2Hi
FMULS F3, F5 // F5 = lo = float32(int(k)) * Ln2Lo
FSUBS F5, F4, F6 // F6 = r = hi - lo
FMULS F6, F6, F7 // F7 = t = r * r
// compute y
FMOVS $P5, F8 // F8 = P5
FMOVS $P4, F9 // F9 = P4
FMADDS F7, F9, F8, F13 // P4+t*P5
FMOVS $P3, F10 // F10 = P3
FMADDS F7, F10, F13, F13 // P3+t*(P4+t*P5)
FMOVS $P2, F11 // F11 = P2
FMADDS F7, F11, F13, F13 // P2+t*(P3+t*(P4+t*P5))
FMOVS $P1, F12 // F12 = P1
FMADDS F7, F12, F13, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5)))
FMSUBS F7, F6, F13, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))))
FMOVS $2.0, F14
FSUBS F13, F14
FMULS F6, F13, F15
FDIVS F14, F15 // F15 = (r*c)/(2-c)
FSUBS F15, F5, F15 // lo-(r*c)/(2-c)
FSUBS F4, F15, F15 // (lo-(r*c)/(2-c))-hi
FSUBS F15, F1, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi)
// inline Ldexp(y, k), benefit:
// 1, no parameter pass overhead.
// 2, skip unnecessary checks for Inf/NaN/Zero
FMOVS F16, R0
ANDS $FracMask, R0, R2 // fraction
LSRW $23, R0, R5 // exponent
ADDS R1, R5 // R1 = int(k)
CMPW $1, R5
BGE normal
ADDS $23, R5 // denormal
MOVW $C1, R8
FMOVS R8, F1 // m = 2**-23
normal:
ORRW R5<<23, R2, R0
FMOVS R0, F0
FMULS F1, F0 // return m * x
FMOVS F0, ret+8(FP)
RET
nearzero:
FADDS F1, F0
isNaN:
FMOVS F0, ret+8(FP)
RET
underflow:
MOVW ZR, ret+8(FP)
RET
overflow:
MOVW $PosInf, R0
MOVW R0, ret+8(FP)
RET
// Exp2 returns 2**x, the base-2 exponential of x.
// This is an assembly implementation of the method used for function Exp2 in file exp.go.
//
// func archExp2(x float32) float32
TEXT ·archExp2(SB),$0-12 // Is this correct?
FMOVS x+0(FP), F0 // F0 = x
FCMPS F0, F0
BNE isNaN // x = NaN, return NaN
FMOVS $Overflow2, F1
FCMPS F1, F0
BGT overflow // x > Overflow, return PosInf
FMOVS $Underflow2, F1
FCMPS F1, F0
BLT underflow // x < Underflow, return 0
// argument reduction; x = r*lg(e) + k with |r| <= ln(2)/2
// computed as r = hi - lo for extra precision.
FMOVS $0.5, F2
FSUBS F2, F0, F3 // x + 0.5
FADDS F2, F0, F4 // x - 0.5
FCMPS $0.0, F0
FCSELS LT, F3, F4, F3 // F3 = k
FCVTZSS F3, R1 // R1 = int(k)
SCVTFS R1, F3 // F3 = float32(int(k))
FSUBS F3, F0, F3 // t = x - float32(int(k))
FMOVS $Ln2Hi, F4 // F4 = Ln2Hi
FMOVS $Ln2Lo, F5 // F5 = Ln2Lo
FMULS F3, F4 // F4 = hi = t * Ln2Hi
FNMULS F3, F5 // F5 = lo = -t * Ln2Lo
FSUBS F5, F4, F6 // F6 = r = hi - lo
FMULS F6, F6, F7 // F7 = t = r * r
// compute y
FMOVS $P5, F8 // F8 = P5
FMOVS $P4, F9 // F9 = P4
FMADDS F7, F9, F8, F13 // P4+t*P5
FMOVS $P3, F10 // F10 = P3
FMADDS F7, F10, F13, F13 // P3+t*(P4+t*P5)
FMOVS $P2, F11 // F11 = P2
FMADDS F7, F11, F13, F13 // P2+t*(P3+t*(P4+t*P5))
FMOVS $P1, F12 // F12 = P1
FMADDS F7, F12, F13, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5)))
FMSUBS F7, F6, F13, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))))
FMOVS $2.0, F14
FSUBS F13, F14
FMULS F6, F13, F15
FDIVS F14, F15 // F15 = (r*c)/(2-c)
FMOVS $1.0, F1 // F1 = 1.0
FSUBS F15, F5, F15 // lo-(r*c)/(2-c)
FSUBS F4, F15, F15 // (lo-(r*c)/(2-c))-hi
FSUBS F15, F1, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi)
// inline Ldexp(y, k), benefit:
// 1, no parameter pass overhead.
// 2, skip unnecessary checks for Inf/NaN/Zero
FMOVS F16, R0
ANDS $FracMask, R0, R2 // fraction
LSRW $23, R0, R5 // exponent
ADDS R1, R5 // R1 = int(k)
CMPW $1, R5
BGE normal
ADDS $23, R5 // denormal
MOVW $C1, R8
FMOVS R8, F1 // m = 2**-52
normal:
ORRW R5<<23, R2, R0
FMOVS R0, F0
FMULS F1, F0 // return m * x
isNaN:
FMOVS F0, ret+8(FP)
RET
underflow:
MOVW ZR, ret+8(FP)
RET
overflow:
MOVW $PosInf, R0
MOVW R0, ret+8(FP)
RET