1.file "erfl.s"
2
3
4// Copyright (c) 2001 - 2003, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39// History
40//==============================================================
41// 11/21/01  Initial version
42// 05/20/02  Cleaned up namespace and sf0 syntax
43// 08/14/02  Changed mli templates to mlx
44// 02/06/03  Reordered header: .section, .global, .proc, .align
45//
46// API
47//==============================================================
48// long double erfl(long double)
49//
50// Overview of operation
51//==============================================================
52//
53// Algorithm description
54// ---------------------
55//
56// There are 4 paths:
57//
58// 1. Special path: x = 0, Inf, NaNs, denormal
59//    Return erfl(x) = +/-0.0 for zeros
60//    Return erfl(x) = QNaN for NaNs
61//    Return erfl(x) = sign(x)*1.0 for Inf
62//    Return erfl(x) = (A0H+A0L)*x + x^2, ((A0H+A0L) = 2.0/sqrt(Pi))
63//                                             for denormals
64//
65// 2. [0;1/8] path: 0.0 < |x| < 1/8
66//    Return erfl(x) = x*(A1H+A1L) + x^3*A3 + ... + x^15*A15
67//
68// 3. Main path: 1/8 <= |x| < 6.53
69//    For several ranges of 1/8 <= |x| < 6.53
70//    Return erfl(x) = sign(x)*((A0H+A0L) + y*(A1H+A1L) + y^2*(A2H+A2L) +
71//                                       + y^3*A3 + y^4*A4 + ... + y^25*A25 )
72//    where y = (|x|/a) - b
73//
74//    For each range there is particular set of coefficients.
75//    Below is the list of ranges:
76//    1/8  <= |x| < 1/4     a = 0.125, b = 1.5
77//    1/4  <= |x| < 1/2     a = 0.25,  b = 1.5
78//    1/2  <= |x| < 1.0     a = 0.5,   b = 1.5
79//    1.0  <= |x| < 2.0     a = 1.0,   b = 1.5
80//    2.0  <= |x| < 3.25    a = 2.0,   b = 1.5
81//    3.25 <= |x| < 4.0     a = 2.0,   b = 2.0
82//    4.0  <= |x| < 6.53    a = 4.0,   b = 1.5
83//    ( [3.25;4.0] subrange separated for monotonicity issues resolve )
84//
85// 4. Saturation path: 6.53 <= |x| < +INF
86//    Return erfl(x) = sign(x)*(1.0 - tiny_value)
87//    (tiny_value ~ 1e-1233)
88//
89// Implementation notes
90// --------------------
91//
92// 1. Special path: x = 0, INF, NaNa, denormals
93//
94//    This branch is cut off by one fclass operation.
95//    Then zeros+nans, infinities and denormals processed separately.
96//    For denormals we had to use multiprecision A0 coefficient to reach
97//    necessary accuracy: (A0H+A0L)*x-x^2
98//
99// 2. [0;1/8] path: 0.0 < |x| < 1/8
100//
101//    First coefficient of polynomial we must split to multiprecision too.
102//    Also we can parallelise computations:
103//    (x*(A1H+A1L)) calculated in parallel with "tail" (x^3*A3 + ... + x^15*A15)
104//    Furthermore the second part is factorized using binary tree technique.
105//
106// 3. Main path: 1/8 <= |x| < 6.53
107//
108//    Multiprecision have to be performed only for first few
109//    polynomial iterations (up to 3-rd x degree)
110//    Here we use the same parallelisation way as above:
111//    Split whole polynomial to first, "multiprecision" part, and second,
112//    so called "tail", native precision part.
113//
114//    1) Multiprecision part:
115//    [v1=(A0H+A0L)+y*(A1H+A1L)] + [v2=y^2*((A2H+A2L)+y*A3)]
116//    v1 and v2 terms calculated in parallel
117//
118//    2) Tail part:
119//    v3 = x^4 * ( A4 + x*A5 + ... + x^21*A25 )
120//    v3 is splitted to 2 even parts (10 coefficient in each one).
121//    These 2 parts are also factorized using binary tree technique.
122//
123//    So Multiprecision and Tail parts cost is almost the same
124//    and we have both results ready before final summation.
125//
126// 4. Saturation path: 6.53 <= |x| < +INF
127//
128//    We use formula sign(x)*(1.0 - tiny_value) instead of simple sign(x)*1.0
129//    just to meet IEEE requirements for different rounding modes in this case.
130//
131// Registers used
132//==============================================================
133// Floating Point registers used:
134// f8 - input & output
135// f32 -> f90
136
137// General registers used:
138// r2, r3, r32 -> r52
139
140// Predicate registers used:
141// p0, p6 -> p11, p14, p15
142
143// p6  - arg is zero, denormal or special IEEE
144// p7  - arg is in [4;8] binary interval
145// p8  - arg is in [3.25;4] interval
146// p9  - arg < 1/8
147// p10 - arg is NOT in [3.25;4] interval
148// p11 - arg in saturation domain
149// p14 - arg is positive
150// p15 - arg is negative
151
152// Assembly macros
153//==============================================================
154rDataPtr           = r2
155rTailDataPtr       = r3
156
157rBias              = r33
158rSignBit           = r34
159rInterval          = r35
160
161rArgExp            = r36
162rArgSig            = r37
163r3p25Offset        = r38
164r2to4              = r39
165r1p25              = r40
166rOffset            = r41
167r1p5               = r42
168rSaturation        = r43
169r3p25Sign          = r44
170rTiny              = r45
171rAddr1             = r46
172rAddr2             = r47
173rTailAddr1         = r48
174rTailAddr2         = r49
175rTailOffset        = r50
176rTailAddOffset     = r51
177rShiftedDataPtr    = r52
178
179//==============================================================
180fA0H               = f32
181fA0L               = f33
182fA1H               = f34
183fA1L               = f35
184fA2H               = f36
185fA2L               = f37
186fA3                = f38
187fA4                = f39
188fA5                = f40
189fA6                = f41
190fA7                = f42
191fA8                = f43
192fA9                = f44
193fA10               = f45
194fA11               = f46
195fA12               = f47
196fA13               = f48
197fA14               = f49
198fA15               = f50
199fA16               = f51
200fA17               = f52
201fA18               = f53
202fA19               = f54
203fA20               = f55
204fA21               = f56
205fA22               = f57
206fA23               = f58
207fA24               = f59
208fA25               = f60
209
210fArgSqr            = f61
211fArgCube           = f62
212fArgFour           = f63
213fArgEight          = f64
214
215fArgAbsNorm        = f65
216fArgAbsNorm2       = f66
217fArgAbsNorm2L      = f67
218fArgAbsNorm3       = f68
219fArgAbsNorm4       = f69
220fArgAbsNorm11      = f70
221
222fRes               = f71
223fResH              = f72
224fResL              = f73
225fRes1H             = f74
226fRes1L             = f75
227fRes1Hd            = f76
228fRes2H             = f77
229fRes2L             = f78
230fRes3H             = f79
231fRes3L             = f80
232fRes4              = f81
233
234fTT                = f82
235fTH                = f83
236fTL                = f84
237fTT2               = f85
238fTH2               = f86
239fTL2               = f87
240
241f1p5               = f88
242f2p0               = f89
243fTiny              = f90
244
245
246// Data tables
247//==============================================================
248RODATA
249
250.align 64
251LOCAL_OBJECT_START(erfl_data)
252////////// Main tables ///////////
253_0p125_to_0p25_data: // exp = 2^-3
254// Polynomial coefficients for the erf(x), 1/8 <= |x| < 1/4
255data8 0xACD9ED470F0BB048, 0x0000BFF4 //A3 = -6.5937529303909561891162915809e-04
256data8 0xBF6A254428DDB452 //A2H = -3.1915980570631852578089571182e-03
257data8 0xBC131B3BE3AC5079 //A2L = -2.5893976889070198978842231134e-19
258data8 0x3FC16E2D7093CD8C //A1H = 1.3617485043469590433318217038e-01
259data8 0x3C6979A52F906B4C //A1L = 1.1048096806003284897639351952e-17
260data8 0x3FCAC45E37FE2526 //A0H = 2.0911767705937583938791135552e-01
261data8 0x3C648D48536C61E3 //A0L = 8.9129592834861155344147026365e-18
262data8 0xD1FC135B4A30E746, 0x00003F90 //A25 = 6.3189963203954877364460345654e-34
263data8 0xB1C79B06DD8C988C, 0x00003F97 //A24 = 6.8478253118093953461840838106e-32
264data8 0xCC7AE121D1DEDA30, 0x0000BF9A //A23 = -6.3010264109146390803803408666e-31
265data8 0x8927B8841D1E0CA8, 0x0000BFA1 //A22 = -5.4098171537601308358556861717e-29
266data8 0xB4E84D6D0C8F3515, 0x00003FA4 //A21 = 5.7084320046554628404861183887e-28
267data8 0xC190EAE69A67959A, 0x00003FAA //A20 = 3.9090359419467121266470910523e-26
268data8 0x90122425D312F680, 0x0000BFAE //A19 = -4.6551806872355374409398000522e-25
269data8 0xF8456C9C747138D6, 0x0000BFB3 //A18 = -2.5670639225386507569611436435e-23
270data8 0xCDCAE0B3C6F65A3A, 0x00003FB7 //A17 = 3.4045511783329546779285646369e-22
271data8 0x8F41909107C62DCC, 0x00003FBD //A16 = 1.5167830861896169812375771948e-20
272data8 0x82F0FCB8A4B8C0A3, 0x0000BFC1 //A15 = -2.2182328575376704666050112195e-19
273data8 0x92E992C58B7C3847, 0x0000BFC6 //A14 = -7.9641369349930600223371163611e-18
274LOCAL_OBJECT_END(erfl_data)
275
276LOCAL_OBJECT_START(_0p25_to_0p5_data)
277// Polynomial coefficients for the erf(x), 1/4 <= |x| < 1/2
278data8 0xF083628E8F7CE71D, 0x0000BFF6 //A3 = -3.6699405305266733332335619531e-03
279data8 0xBF978749A434FE4E //A2H = -2.2977018973732214746075186440e-02
280data8 0xBC30B3FAFBC21107 //A2L = -9.0547407100537663337591537643e-19
281data8 0x3FCF5F0CDAF15313 //A1H = 2.4508820238647696654332719390e-01
282data8 0x3C1DFF29F5AD8117 //A1L = 4.0653155218104625249413579084e-19
283data8 0x3FD9DD0D2B721F38 //A0H = 4.0411690943482225790717166092e-01
284data8 0x3C874C71FEF1759E //A0L = 4.0416653425001310671815863946e-17
285data8 0xA621D99B8C12595E, 0x0000BFAB //A25 = -6.7100271986703749013021666304e-26
286data8 0xBD7BBACB439992E5, 0x00003FAE //A24 = 6.1225362452814749024566661525e-25
287data8 0xFF2FEFF03A98E410, 0x00003FB2 //A23 = 1.3192871864994282747963195183e-23
288data8 0xAE8180957ABE6FD5, 0x0000BFB6 //A22 = -1.4434787102181180110707433640e-22
289data8 0xAF0566617B453AA6, 0x0000BFBA //A21 = -2.3163848847252215762970075142e-21
290data8 0x8F33D3616B9B8257, 0x00003FBE //A20 = 3.0324297082969526400202995913e-20
291data8 0xD58AB73354438856, 0x00003FC1 //A19 = 3.6175397854863872232142412590e-19
292data8 0xD214550E2F3210DF, 0x0000BFC5 //A18 = -5.6942141660091333278722310354e-18
293data8 0xE2CA60C328F3BBF5, 0x0000BFC8 //A17 = -4.9177359011428870333915211291e-17
294data8 0x88D9BB274F9B3873, 0x00003FCD //A16 = 9.4959118337089189766177270051e-16
295data8 0xCA4A00AB538A2DB2, 0x00003FCF //A15 = 5.6146496538690657993449251855e-15
296data8 0x9CC8FFFBDDCF9853, 0x0000BFD4 //A14 = -1.3925319209173383944263942226e-13
297LOCAL_OBJECT_END(_0p25_to_0p5_data)
298
299LOCAL_OBJECT_START(_0p5_to_1_data)
300// Polynomial coefficients for the erf(x), 1/2 <= |x| < 1
301data8 0xDB742C8FB372DBE0, 0x00003FF6 //A3 = 3.3485993187250381721535255963e-03
302data8 0xBFBEDC5644353C26 //A2H = -1.2054957547410136142751468924e-01
303data8 0xBC6D7215B023455F //A2L = -1.2770012232203569059818773287e-17
304data8 0x3FD492E42D78D2C4 //A1H = 3.2146553459760363047337250464e-01
305data8 0x3C83A163CAC22E05 //A1L = 3.4053365952542489137756724868e-17
306data8 0x3FE6C1C9759D0E5F //A0H = 7.1115563365351508462453011816e-01
307data8 0x3C8B1432F2CBC455 //A0L = 4.6974407716428899960674098333e-17
308data8 0x95A6B92162813FF8, 0x00003FC3 //A25 = 1.0140763985766801318711038400e-18
309data8 0xFE5EC3217F457B83, 0x0000BFC6 //A24 = -1.3789434273280972156856405853e-17
310data8 0x9B49651031B5310B, 0x0000BFC8 //A23 = -3.3672435142472427475576375889e-17
311data8 0xDBF73927E19B7C8D, 0x00003FCC //A22 = 7.6315938248752024965922341872e-16
312data8 0xF55CBA3052730592, 0x00003FCB //A21 = 4.2563559623888750271176552350e-16
313data8 0xA1DC9380DA82CFF6, 0x0000BFD2 //A20 = -3.5940500736023122607663701015e-14
314data8 0xAAD1AE1067F3D577, 0x00003FD2 //A19 = 3.7929451192558641569555227613e-14
315data8 0xCD1DB83F3B9D2090, 0x00003FD7 //A18 = 1.4574374961011929143375716362e-12
316data8 0x87235ACB5E8BB298, 0x0000BFD9 //A17 = -3.8408559294899660346666452560e-12
317data8 0xDA417B78FF9F46B4, 0x0000BFDC //A16 = -4.9625621225715971268115023451e-11
318data8 0xF075762685484436, 0x00003FDE //A15 = 2.1869603559309150844390066920e-10
319data8 0xB989FDB3795165C7, 0x00003FE1 //A14 = 1.3499740992928183247608593000e-09
320LOCAL_OBJECT_END(_0p5_to_1_data)
321
322LOCAL_OBJECT_START(_1_to_2_data)
323// Polynomial coefficients for the erf(x), 1 <= |x| < 2.0
324data8 0x8E15015F5B55BEAC, 0x00003FFC //A3 = 1.3875200409423426678618977531e-01
325data8 0xBFC6D5A95D0A1B7E //A2H = -1.7839543383544403942764233761e-01
326data8 0xBC7499F704C80E02 //A2L = -1.7868888188464394090788198634e-17
327data8 0x3FBE723726B824A8 //A1H = 1.1893028922362935961842822508e-01
328data8 0x3C6B77F399C2AD27 //A1L = 1.1912589318015368492508652194e-17
329data8 0x3FEEEA5557137ADF //A0H = 9.6610514647531064991170524081e-01
330data8 0x3C963D0DDD0A762F //A0L = 7.7155271023949055047261953350e-17
331data8 0x8FAA405DAD409771, 0x0000BFDB //A25 = -1.6332824616946528652252813763e-11
332data8 0x941386F4697976D8, 0x0000BFDD //A24 = -6.7337295147729213955410252613e-11
333data8 0xBCBE75234530B404, 0x00003FDF //A23 = 3.4332329029092304943838374908e-10
334data8 0xF55E2CE71A00D040, 0x00003FDF //A22 = 4.4632156034175937694868068394e-10
335data8 0xA6CADFE489D2671F, 0x0000BFE3 //A21 = -4.8543000253822277507724949798e-09
336data8 0xA4C69F11FEAFB3A8, 0x00003FE2 //A20 = 2.3978044150868471771557059958e-09
337data8 0xD63441E3BED59703, 0x00003FE6 //A19 = 4.9873285553412397317802071288e-08
338data8 0xDFDAED9D3089D732, 0x0000BFE7 //A18 = -1.0424069510877052249228047044e-07
339data8 0xB47287FF165756A5, 0x0000BFE9 //A17 = -3.3610945128073834488448164164e-07
340data8 0xCDAF2DC0A79A9059, 0x00003FEB //A16 = 1.5324673941628851136481785187e-06
341data8 0x9FD6A7B2ECE8EDA9, 0x00003FEA //A15 = 5.9544479989469083598476592569e-07
342data8 0xEC6E63BB4507B585, 0x0000BFEE //A14 = -1.4092398243085031882423746824e-05
343LOCAL_OBJECT_END(_1_to_2_data)
344
345LOCAL_OBJECT_START(_2_to_3p25_data)
346// Polynomial coefficients for the erf(x), 2 <= |x| < 3.25
347data8 0xCEDBA58E8EE6F055, 0x00003FF7 //A3 = 6.3128050215859026984338771121e-03
348data8 0xBF5B60D5E974CBBD //A2H = -1.6710366233609740427984435840e-03
349data8 0xBC0E11E2AEC18AF6 //A2L = -2.0376133202996259839305825162e-19
350data8 0x3F32408E9BA3327E //A1H = 2.7850610389349567379974059733e-04
351data8 0x3BE41010E4B3B224 //A1L = 3.3987633691879253781833531576e-20
352data8 0x3FEFFFD1AC4135F9 //A0H = 9.9997790950300136092465663751e-01
353data8 0x3C8EEAFA1E97EAE0 //A0L = 5.3633970564750967956196033852e-17
354data8 0xBF9C6F2C6D7263C1, 0x00003FF0 //A25 = 4.5683639377039166585098497471e-05
355data8 0xCB4167CC4798096D, 0x00003FF0 //A24 = 4.8459885139772945417160731273e-05
356data8 0xE1394FECFE972D32, 0x0000BFF2 //A23 = -2.1479022581129892562916533804e-04
357data8 0xC7F9E47581FC2A5F, 0x0000BFF2 //A22 = -1.9071211076537531370822343363e-04
358data8 0xDD612EDFAA41BEAE, 0x00003FF2 //A21 = 2.1112405918671957390188348542e-04
359data8 0x8C166AA4CB2AD8FD, 0x0000BFF4 //A20 = -5.3439165021555312536009227942e-04
360data8 0xEFBE33D9F62B68D4, 0x0000BFF2 //A19 = -2.2863672131516067770956697877e-04
361data8 0xCCB92F5D91562494, 0x00003FF5 //A18 = 1.5619154280865226092321881421e-03
362data8 0x80A5DBE71D4BA0E2, 0x0000BFF6 //A17 = -1.9630109664962540123775799179e-03
363data8 0xA0ADEB2D4C41347A, 0x0000BFF4 //A16 = -6.1294315248639348947483422457e-04
364data8 0xB1F5D4911B911665, 0x00003FF7 //A15 = 5.4309165882071876864550213817e-03
365data8 0xF2F3D8D21E8762E0, 0x0000BFF7 //A14 = -7.4143227286535936033409745884e-03
366LOCAL_OBJECT_END(_2_to_3p25_data)
367
368LOCAL_OBJECT_START(_4_to_6p53_data)
369// Polynomial coefficients for the erf(x), 4 <= |x| < 6.53
370data8 0xDF3151BE8652827E, 0x00003FD5 //A3 = 3.9646979666953349095427642209e-13
371data8 0xBD1C4A9787DF888B //A2H = -2.5127788450714750484839908889e-14
372data8 0xB99B35483E4603FD //A2L = -3.3536613901268985626466020210e-31
373data8 0x3CD2DBF507F1A1F3 //A1H = 1.0468963266736687758710258897e-15
374data8 0x398A97B60913B4BD //A1L = 1.6388968267515149775818013207e-31
375data8 0x3FEFFFFFFFFFFFFF //A0H = 9.9999999999999988897769753748e-01
376data8 0x3C99CC25E658129E //A0L = 8.9502895736398715695745861054e-17
377data8 0xB367B21294713D39, 0x00003FFB //A25 = 8.7600127403270828432337605471e-02
378data8 0xCEE3A423ADEC0F4C, 0x00003FFD //A24 = 4.0408051429309221404807497715e-01
379data8 0xC389626CF2D727C0, 0x00003FFE //A23 = 7.6381507072332210580356159947e-01
380data8 0xD15A03E082D0A307, 0x00003FFE //A22 = 8.1777977210259904277239787430e-01
381data8 0x8FD3DA92675E8E00, 0x00003FFE //A21 = 5.6182638239203638864793584264e-01
382data8 0xFD375E6EE167AA58, 0x00003FFC //A20 = 2.4728152801285544751731937424e-01
383data8 0x89A9482FADE66AE1, 0x00003FFB //A19 = 6.7217410998398471333985773237e-02
384data8 0xC62E1F02606C04DD, 0x00003FF7 //A18 = 6.0479785358923404401184993359e-03
385data8 0xEE7BF2BE71CC531C, 0x0000BFF5 //A17 = -1.8194898432032114199803271708e-03
386data8 0x8084081981CDC79C, 0x0000BFF5 //A16 = -9.8049734947701208487713246099e-04
387data8 0x8975DFB834C118C3, 0x0000BFF0 //A15 = -3.2773123965143773578608926094e-05
388data8 0x965DA4A80008B7BC, 0x0000BFEE //A14 = -8.9624997201558650125662820562e-06
389LOCAL_OBJECT_END(_4_to_6p53_data)
390
391LOCAL_OBJECT_START(_3p25_to_4_data)
392// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4
393data8 0xB01D29846286CE08, 0x00003FEE //A3 = 1.0497207328743021499800978059e-05
394data8 0xBEC10B1488AEB234 //A2H = -2.0317175474986489113480084279e-06
395data8 0xBB7F19701B8B74F9 //A2L = -4.1159669348226960337518214996e-22
396data8 0x3E910B1488AEB234 //A1H = 2.5396469343733111391850105348e-07
397data8 0x3B4F1944906D5D60 //A1L = 5.1448487494628801547474934193e-23
398data8 0x3FEFFFFFF7B91176 //A0H = 9.9999998458274208523732795584e-01
399data8 0x3C70B2865615DB3F //A0L = 1.4482653192002495179309994964e-17
400data8 0xA818D085D56F3021, 0x00003FEC //A25 = 2.5048394770210505593609705765e-06
401data8 0xD9C5C509AAE5561F, 0x00003FEC //A24 = 3.2450636894654766492719395406e-06
402data8 0x9682D71C549EEB07, 0x0000BFED //A23 = -4.4855801709974050650263470866e-06
403data8 0xBC230E1EB6FBF8B9, 0x00003FEA //A22 = 7.0086469577174843181452303996e-07
404data8 0xE1432649FF29D4DE, 0x0000BFEA //A21 = -8.3916747195472308725504497231e-07
405data8 0xB40CEEBD2803D2F0, 0x0000BFEF //A20 = -2.1463694318102769992677291330e-05
406data8 0xEAAB57ABFFA003EB, 0x00003FEF //A19 = 2.7974761309213643228699449426e-05
407data8 0xFBFA4D0B893A5BFB, 0x0000BFEE //A18 = -1.5019043571612821858165073446e-05
408data8 0xBB6AA248EED3E364, 0x0000BFF0 //A17 = -4.4683584873907316507141131797e-05
409data8 0x86C1B3AE3E500ED9, 0x00003FF2 //A16 = 1.2851395412345761361068234880e-04
410data8 0xB60729445F0C37B5, 0x0000BFF2 //A15 = -1.7359540313300841352152461287e-04
411data8 0xCA389F9E707337B1, 0x00003FF1 //A14 = 9.6426575465763394281615740282e-05
412LOCAL_OBJECT_END(_3p25_to_4_data)
413
414
415//////// "Tail" tables //////////
416LOCAL_OBJECT_START(_0p125_to_0p25_data_tail)
417// Polynomial coefficients for the erf(x), 1/8 <= |x| < 1/4
418data8 0x93086CBD21ED3962, 0x00003FCA //A13 = 1.2753071968462837024755878679e-16
419data8 0x83CB5045A6D4B419, 0x00003FCF //A12 = 3.6580237062957773626379648530e-15
420data8 0x8FCDB723209690EB, 0x0000BFD3 //A11 = -6.3861616307180801527566117146e-14
421data8 0xCAA173F680B5D56B, 0x0000BFD7 //A10 = -1.4397775466324880354578008779e-12
422data8 0xF0CEA934AD6AC013, 0x00003FDB //A9 = 2.7376616955640415767655526857e-11
423data8 0x81C69F9D0B5AB8EE, 0x00003FE0 //A8 = 4.7212187567505249115688961488e-10
424data8 0xA8B590298C20A194, 0x0000BFE4 //A7 = -9.8201697105565925460801441797e-09
425data8 0x84F3DE72AC964615, 0x0000BFE8 //A6 = -1.2382176987480830706988411266e-07
426data8 0xC01A1398868CC4BD, 0x00003FEC //A5 = 2.8625408039722670291121341583e-06
427data8 0xCC43247F4410C54A, 0x00003FEF //A4 = 2.4349960762505993017186935493e-05
428LOCAL_OBJECT_END(_0p125_to_0p25_data_tail)
429
430LOCAL_OBJECT_START(_0p25_to_0p5_data_tail)
431// Polynomial coefficients for the erf(x), 1/4 <= |x| < 1/2
432data8 0x8CEAC59AF361B78A, 0x0000BFD6 //A13 = -5.0063802958258679384986669123e-13
433data8 0x9BC67404F348C0CE, 0x00003FDB //A12 = 1.7709590771868743572061278273e-11
434data8 0xF4B5D0348AFAAC7A, 0x00003FDB //A11 = 2.7820329729584630464848160970e-11
435data8 0x83AB447FF619DA4A, 0x0000BFE2 //A10 = -1.9160363295631539615395477207e-09
436data8 0x82115AB487202E7B, 0x00003FE0 //A9 = 4.7318386460142606822119637959e-10
437data8 0xB84D5B0AE17054AA, 0x00003FE8 //A8 = 1.7164477188916895004843908951e-07
438data8 0xB2E085C1C4AA06E5, 0x0000BFE9 //A7 = -3.3318445266863554512523957574e-07
439data8 0xCD3CA2E6C3971666, 0x0000BFEE //A6 = -1.2233070175554502732980949519e-05
440data8 0xBA445C53F8DD40E6, 0x00003FF0 //A5 = 4.4409521535330413551781808621e-05
441data8 0xAA94D5E68033B764, 0x00003FF4 //A4 = 6.5071635765452563856926608000e-04
442LOCAL_OBJECT_END(_0p25_to_0p5_data_tail)
443
444LOCAL_OBJECT_START(_0p5_to_1_data_tail)
445// Polynomial coefficients for the erf(x), 1/2 <= |x| < 1
446data8 0x9ED99EDF111CB785, 0x0000BFE4 //A13 = -9.2462916180079278241704711522e-09
447data8 0xDEAF7539AE2FB062, 0x0000BFE5 //A12 = -2.5923990465973151101298441139e-08
448data8 0xA392D5E5CC9DB1A7, 0x00003FE9 //A11 = 3.0467952847327075747032372101e-07
449data8 0xC311A7619B96CA1A, 0x00003FE8 //A10 = 1.8167212632079596881709988649e-07
450data8 0x82082E6B6A93F116, 0x0000BFEE //A9 = -7.7505086843257228386931766018e-06
451data8 0x96D9997CF326A36D, 0x00003FEE //A8 = 8.9913605625817479172071008270e-06
452data8 0x97057D85DCB0ED99, 0x00003FF2 //A7 = 1.4402527482741758767786898553e-04
453data8 0xDC23BCB3599C0490, 0x0000BFF3 //A6 = -4.1988296144950673955519083419e-04
454data8 0xDA150C4867208A81, 0x0000BFF5 //A5 = -1.6638352864915033417887831090e-03
455data8 0x9A4DAF550A2CC29A, 0x00003FF8 //A4 = 9.4179355839141698591817907680e-03
456LOCAL_OBJECT_END(_0p5_to_1_data_tail)
457
458LOCAL_OBJECT_START(_1_to_2_data_tail)
459// Polynomial coefficients for the erf(x), 1 <= |x| < 2.0
460data8 0x969EAC5C7B46CAB9, 0x00003FEF //A13 = 1.7955281439310148162059582795e-05
461data8 0xA2ED832912E9FCD9, 0x00003FF1 //A12 = 7.7690020847111408916570845775e-05
462data8 0x85677C39C48E43E7, 0x0000BFF3 //A11 = -2.5444839340796031538582511806e-04
463data8 0xC2DAFA91683DAAE4, 0x0000BFF1 //A10 = -9.2914288456063075386925076097e-05
464data8 0xE01C061CBC6A2825, 0x00003FF5 //A9 = 1.7098195515864039518892834211e-03
465data8 0x9AD7271CAFD01C78, 0x0000BFF6 //A8 = -2.3626776207372761518718893636e-03
466data8 0x9B6B9D30EDD5F4FF, 0x0000BFF7 //A7 = -4.7430532011804570628999212874e-03
467data8 0x9E51EB9623F1D446, 0x00003FF9 //A6 = 1.9326171998839772791190405201e-02
468data8 0xF391B935C12546DE, 0x0000BFF8 //A5 = -1.4866286152953671441682166195e-02
469data8 0xB6AD4AE850DBF526, 0x0000BFFA //A4 = -4.4598858458861014323191919669e-02
470LOCAL_OBJECT_END(_1_to_2_data_tail)
471
472LOCAL_OBJECT_START(_2_to_3p25_data_tail)
473// Polynomial coefficients for the erf(x), 2 <= |x| < 3.25
474data8 0x847C24DAC7C7558B, 0x00003FF5 //A13 = 1.0107798565424606512130100541e-03
475data8 0xCB6340EAF02C3DF8, 0x00003FF8 //A12 = 1.2413800617425931997420375435e-02
476data8 0xB5163D252DBBC107, 0x0000BFF9 //A11 = -2.2105330871844825370020459523e-02
477data8 0x82FF9C0B68E331E4, 0x00003FF9 //A10 = 1.5991024756001692140897408128e-02
478data8 0xE9519E4A49752E04, 0x00003FF7 //A9 = 7.1203253651891723548763348088e-03
479data8 0x8D52F11B7AE846D9, 0x0000BFFA //A8 = -3.4502927613795425888684181521e-02
480data8 0xCCC5A3E32BC6FA30, 0x00003FFA //A7 = 4.9993171868423886228679106871e-02
481data8 0xC1791AD8284A1919, 0x0000BFFA //A6 = -4.7234635220336795411997070641e-02
482data8 0x853DAAA35A8A3C18, 0x00003FFA //A5 = 3.2529512934760303976755163452e-02
483data8 0x88E42D8F47FAB60E, 0x0000BFF9 //A4 = -1.6710366233609742619461063050e-02
484LOCAL_OBJECT_END(_2_to_3p25_data_tail)
485
486LOCAL_OBJECT_START(_4_to_6p53_data_tail)
487// Polynomial coefficients for the erf(x), 4 <= |x| < 6.53
488data8 0xD8235ABF08B8A6D1, 0x00003FEE //A13 = 1.2882834877224764938429832586e-05
489data8 0xAEDF44F9C77844C2, 0x0000BFEC //A12 = -2.6057980393716019511497492890e-06
490data8 0xCCD5490956A4FCFD, 0x00003FEA //A11 = 7.6306293047300300284923464089e-07
491data8 0xF71AF0126EE26AEA, 0x0000BFE8 //A10 = -2.3013467500738417953513680935e-07
492data8 0xE4CE68089858AC20, 0x00003FE6 //A9 = 5.3273112263151109935867439775e-08
493data8 0xBD15106FBBAEE593, 0x0000BFE4 //A8 = -1.1006037358336556244645388790e-08
494data8 0x8BBF9A5769B6E480, 0x00003FE2 //A7 = 2.0336075804332107927300019116e-09
495data8 0xB049D845D105E302, 0x0000BFDF //A6 = -3.2066683399502826067820249320e-10
496data8 0xBAC69B3F0DFE5483, 0x00003FDC //A5 = 4.2467901578369360007795282687e-11
497data8 0xA29C398F83F8A0D1, 0x0000BFD9 //A4 = -4.6216613698438694005327544047e-12
498LOCAL_OBJECT_END(_4_to_6p53_data_tail)
499
500LOCAL_OBJECT_START(_3p25_to_4_data_tail)
501// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4
502data8 0x95BE1BEAD738160F, 0x00003FF2 //A13 = 1.4280568455209843005829620687e-04
503data8 0x8108C8FFAC0F0B21, 0x0000BFF4 //A12 = -4.9222685622046459346377033307e-04
504data8 0xD72A7FAEE7832BBE, 0x00003FF4 //A11 = 8.2079319302109644436194651098e-04
505data8 0x823AB4281CA7BBE7, 0x0000BFF5 //A10 = -9.9357079675971109178261577703e-04
506data8 0xFA1232D476048D11, 0x00003FF4 //A9 = 9.5394549599882496825916138915e-04
507data8 0xC463D7AF88025FB2, 0x0000BFF4 //A8 = -7.4916843357898101689031755368e-04
508data8 0xFEBE32B6B379D072, 0x00003FF3 //A7 = 4.8588363901002111193445057206e-04
509data8 0x882829BB68409BF3, 0x0000BFF3 //A6 = -2.5969865184916169002074135516e-04
510data8 0xED2F886E29DAAB09, 0x00003FF1 //A5 = 1.1309894347742479284610149994e-04
511data8 0xA4C07129436555B2, 0x0000BFF0 //A4 = -3.9279872584973887163830479579e-05
512LOCAL_OBJECT_END(_3p25_to_4_data_tail)
513
514
515LOCAL_OBJECT_START(_0_to_1o8_data)
516// Polynomial coefficients for the erf(x), 0.0 <= |x| < 0.125
517data8 0x3FF20DD750429B6D, 0x3C71AE3A8DDFFEDE //A1H, A1L
518data8 0xF8B0DACE42525CC2, 0x0000BFEE //A15
519data8 0xFCD02E1BF0EC2C37, 0x00003FF1 //A13
520data8 0xE016D968FE473B5E, 0x0000BFF4 //A11
521data8 0xAB2DE68711BF5A79, 0x00003FF7 //A9
522data8 0xDC16718944518309, 0x0000BFF9 //A7
523data8 0xE71790D0215F0C8F, 0x00003FFB //A5
524data8 0xC093A3581BCF3612, 0x0000BFFD //A3
525LOCAL_OBJECT_END(_0_to_1o8_data)
526
527
528LOCAL_OBJECT_START(_denorm_data)
529data8 0x3FF20DD750429B6D //A1H = 1.1283791670955125585606992900e+00
530data8 0x3C71AE3A914FED80 //A1L = 1.5335459613165880745599768129e-17
531LOCAL_OBJECT_END(_denorm_data)
532
533
534.section .text
535GLOBAL_LIBM_ENTRY(erfl)
536
537{ .mfi
538      alloc          r32         = ar.pfs, 0, 21, 0, 0
539      fmerge.se      fArgAbsNorm = f1, f8      // normalized x (1.0 <= x < 2.0)
540      addl           rSignBit    = 0x20000, r0 // Set sign bit for exponent
541}
542{ .mlx
543      addl           rDataPtr    = @ltoff(erfl_data), gp // Get common data ptr
544      movl           r1p5        = 0x3FF8000000000000    // 1.5 in dbl repres.
545};;
546
547{ .mfi
548      getf.exp       rArgExp     = f8              // Get arg exponent
549      fclass.m       p6,p0       = f8, 0xEF // Filter 0, denormals and specials
550                            // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
551      addl           rBias       = 0xfffc, r0 // Value to subtract from exp
552                                              // to get actual interval number
553}
554{ .mfi
555      ld8            rDataPtr    = [rDataPtr]  // Get real common data pointer
556      fma.s1         fArgSqr     = f8, f8, f0  // x^2 (for [0;1/8] path)
557      addl           r2to4       = 0x10000, r0 // unbiased exponent
558                                               // for [2;4] binary interval
559};;
560
561{ .mfi
562      getf.sig       rArgSig     = f8              // Get arg significand
563      fcmp.lt.s1     p15, p14    = f8, f0          // Is arg negative/positive?
564      addl           rSaturation = 0xd0e, r0       // First 12 bits of
565                                                   // saturation value signif.
566}
567{ .mfi
568      setf.d         f1p5        = r1p5            // 1.5 construction
569      fma.s1         f2p0        = f1,f1,f1        // 2.0 construction
570      addl           r3p25Sign   = 0xd00, r0       // First 12 bits of
571                                                   // 3.25 value signif.
572};;
573
574{ .mfi
575      addl           rTailDataPtr = 0x700, rDataPtr  // Pointer to "tail" data
576      nop.f          0
577      andcm          rArgExp     = rArgExp, rSignBit // Remove sign of exp
578}
579{ .mfb
580      addl           rTiny       = 0xf000, r0 // Tiny value for saturation path
581      nop.f          0
582(p6)  br.cond.spnt   erfl_spec              // Branch to zero, denorm & specs
583};;
584
585{ .mfi
586      sub            rInterval   = rArgExp, rBias // Get actual interval number
587      nop.f          0
588      shr.u          rArgSig     = rArgSig, 52    // Leave only 12 bits of sign.
589}
590{ .mfi
591      adds           rShiftedDataPtr = 0x10, rDataPtr // Second ptr to data
592      nop.f          0
593      cmp.eq         p8, p10     = r2to4, rArgExp // If exp is in 2to4 interval?
594};;
595
596{ .mfi
597(p8)  cmp.le         p8, p10     = r3p25Sign, rArgSig // If sign. is greater
598                            //  than 1.25? (means arg is in [3.25;4] interval)
599      nop.f          0
600      shl            rOffset     = rInterval, 8 // Make offset from
601                                                // interval number
602}
603{ .mfi
604      cmp.gt         p9, p0      = 0x0, rInterval // If interval is less than 0
605                                                  // (means arg is in [0; 1/8])
606      nop.f          0
607      cmp.eq         p7, p0      = 0x5, rInterval // If arg is in [4:8] interv.?
608};;
609
610{ .mfi
611(p8)  adds           rOffset     = 0x200, rOffset // Add additional offset
612                                 // if arg is in [3.25;4] (another data set)
613      fma.s1         fArgCube    = fArgSqr, f8, f0  // x^3 (for [0;1/8] path)
614      shl            rTailOffset = rInterval, 7  // Make offset to "tail" data
615                                                 // from interval number
616}
617{ .mib
618      setf.exp       fTiny       = rTiny // Construct "tiny" value
619                                         // for saturation path
620      cmp.ltu        p11, p0     = 0x5, rInterval // if arg > 8
621(p9)  br.cond.spnt   _0_to_1o8
622};;
623
624{ .mfi
625      add            rAddr1      = rDataPtr, rOffset // Get address for
626                                                     // interval data
627      nop.f          0
628      shl            rTailAddOffset = rInterval, 5 // Offset to interval
629                                                   // "tail" data
630}
631{ .mib
632      add            rAddr2      = rShiftedDataPtr, rOffset // Get second
633                                                 // address for interval data
634(p7)  cmp.leu        p11, p0     = rSaturation, rArgSig // if arg is
635                                                        // in [6.53;8] interval
636(p11) br.cond.spnt   _saturation // Branch to Saturation path
637};;
638
639{ .mmi
640      ldfe           fA3         = [rAddr1], 0x90 // Load A3
641      ldfpd          fA2H, fA2L  = [rAddr2], 16 // Load A2High, A2Low
642      add            rTailOffset = rTailOffset, rTailAddOffset // "Tail" offset
643};;
644
645{ .mmi
646      ldfe           fA20        = [rAddr1], 16 // Load A20
647      ldfpd          fA1H, fA1L  = [rAddr2], 16 // Load A1High, A1Low
648(p8)  adds           rTailOffset = 0x140, rTailOffset // Additional offset
649                                                      //  for [3.24;4] interval
650};;
651
652{ .mmi
653      ldfe           fA19        = [rAddr1], 16 // Load A19
654      ldfpd          fA0H, fA0L  = [rAddr2], 16 // Load A0High, A0Low
655      add            rTailAddr1  = rTailDataPtr, rTailOffset // First tail
656                                                             // data address
657};;
658
659.pred.rel "mutex",p8,p10
660{ .mfi
661      ldfe           fA18        = [rAddr1], 16 // Load A18
662(p8)  fms.s1         fArgAbsNorm = fArgAbsNorm, f1, f2p0 // Add 2.0
663                          // to normalized arg (for [3.24;4] interval)
664      adds           rTailAddr2  = 0x10, rTailAddr1  // First tail
665                                                     // data address
666}
667{ .mfi
668      ldfe           fA25        = [rAddr2], 16 // Load A25
669(p10) fms.s1         fArgAbsNorm = fArgAbsNorm, f1, f1p5  // Add 1.5
670                                                // to normalized arg
671      nop.i          0
672};;
673
674{ .mmi
675      ldfe           fA17        = [rAddr1], 16 // Load A17
676      ldfe           fA24        = [rAddr2], 16 // Load A24
677      nop.i          0
678};;
679
680{ .mmi
681      ldfe           fA16        = [rAddr1], 16 // Load A16
682      ldfe           fA23        = [rAddr2], 16 // Load A23
683      nop.i          0
684};;
685
686{ .mmi
687      ldfe           fA15        = [rAddr1], 16 // Load A15
688      ldfe           fA22        = [rAddr2], 16 // Load A22
689      nop.i          0
690};;
691
692{ .mmi
693      ldfe           fA14        = [rAddr1], 16 // Load A14
694      ldfe           fA21        = [rAddr2], 16 // Load A21
695      nop.i          0
696};;
697
698{ .mfi
699      ldfe           fA13        = [rTailAddr1], 32              // Load A13
700      fms.s1         fArgAbsNorm2 = fArgAbsNorm, fArgAbsNorm, f0 // x^2
701      nop.i          0
702}
703{ .mfi
704      ldfe           fA12        = [rTailAddr2], 32 // Load A12
705      nop.f          0
706      nop.i          0
707};;
708
709{ .mfi
710      ldfe           fA11        = [rTailAddr1], 32       // Load A11
711      fma.s1         fRes3H      = fA3, fArgAbsNorm, fA2H // (A3*x+A2)*x^2
712      nop.i          0
713}
714{ .mfi
715      ldfe           fA10        = [rTailAddr2], 32     // Load A10
716      fma.s1         fTH         = fA3, fArgAbsNorm, f0 // (A3*x+A2)*x^2
717      nop.i          0
718};;
719
720{ .mfi
721      ldfe           fA9         = [rTailAddr1], 32 // Load A9
722      fma.s1         fTT2        = fA1L, fArgAbsNorm, f0 // A1*x+A0
723      nop.i          0
724}
725{ .mfi
726      ldfe           fA8         = [rTailAddr2], 32 // Load A8
727      nop.f          0
728      nop.i          0
729};;
730
731{ .mmi
732      ldfe           fA7         = [rTailAddr1], 32 // Load A7
733      ldfe           fA6         = [rTailAddr2], 32 // Load A6
734      nop.i          0
735};;
736
737{ .mmi
738      ldfe           fA5         = [rTailAddr1], 32 // Load A5
739      ldfe           fA4         = [rTailAddr2], 32 // Load A4
740      nop.i          0
741};;
742
743{ .mfi
744      nop.m          0
745      fms.s1         fArgAbsNorm2L = fArgAbsNorm, fArgAbsNorm, fArgAbsNorm2
746                                                    // Low part of x^2 (delta)
747      nop.i          0
748}
749{ .mfi
750      nop.m          0
751      fms.s1         fArgAbsNorm4  = fArgAbsNorm2, fArgAbsNorm2, f0 // x^4
752      nop.i          0
753};;
754
755{ .mfi
756      nop.m          0
757      fms.s1         fRes3L      = fA2H, f1, fRes3H // // (A3*x+A2)*x^2
758      nop.i          0
759};;
760
761{ .mfi
762      nop.m          0
763      fms.s1         fArgAbsNorm3 = fArgAbsNorm2, fArgAbsNorm, f0 // x^3
764      nop.i          0
765}
766{ .mfi
767      nop.m          0
768      fma.s1         fTH2        = fA1H, fArgAbsNorm, fTT2 // A1*x+A0
769      nop.i          0
770};;
771
772{ .mfi
773      nop.m          0
774      fma.s1         fA23        = fA24,  fArgAbsNorm, fA23 // Polynomial tail
775      nop.i          0
776}
777{ .mfi
778      nop.m          0
779      fma.s1         fA21        = fA22,  fArgAbsNorm, fA21 // Polynomial tail
780      nop.i          0
781};;
782
783{ .mfi
784      nop.m          0
785      fma.s1         fA12        = fA13,  fArgAbsNorm, fA12 // Polynomial tail
786      nop.i          0
787}
788;;
789
790{ .mfi
791      nop.m          0
792      fma.s1         fRes3L      = fRes3L, f1, fTH // (A3*x+A2)*x^2
793      nop.i          0
794}
795{ .mfi
796      nop.m          0
797      fma.s1         fA19        = fA20,  fArgAbsNorm, fA19 // Polynomial tail
798      nop.i          0
799};;
800
801{ .mfi
802      nop.m          0
803      fma.s1         fRes1H      = fTH2, f1, fA0H // A1*x+A0
804      nop.i          0
805}
806{ .mfi
807      nop.m          0
808      fms.s1         fTL2        = fA1H, fArgAbsNorm, fTH2 // A1*x+A0
809      nop.i          0
810};;
811
812{ .mfi
813      nop.m          0
814      fma.s1         fA8         = fA9,  fArgAbsNorm, fA8 // Polynomial tail
815      nop.i          0
816}
817{ .mfi
818      nop.m          0
819      fma.s1         fA10        = fA11,  fArgAbsNorm, fA10 // Polynomial tail
820      nop.i          0
821};;
822{ .mfi
823      nop.m          0
824      fma.s1         fA15        = fA16,  fArgAbsNorm, fA15 // Polynomial tail
825      nop.i          0
826}
827{ .mfi
828      nop.m          0
829      fma.s1         fA17        = fA18,  fArgAbsNorm, fA17 // Polynomial tail
830      nop.i          0
831};;
832{ .mfi
833      nop.m          0
834      fms.s1         fArgAbsNorm11 = fArgAbsNorm4, fArgAbsNorm4, f0 // x^8
835      nop.i          0
836}
837{ .mfi
838      nop.m          0
839      fma.s1         fA4         = fA5,  fArgAbsNorm, fA4 // Polynomial tail
840      nop.i          0
841};;
842
843{ .mfi
844      nop.m          0
845      fma.s1         fRes3L      = fRes3L, f1, fA2L // (A3*x+A2)*x^2
846      nop.i          0
847}
848{ .mfi
849      nop.m          0
850      fma.s1         fA6         = fA7,  fArgAbsNorm, fA6 // Polynomial tail
851      nop.i          0
852};;
853
854{ .mfi
855      nop.m          0
856      fma.s1         fTL2        = fTL2, f1, fTT2 // A1*x+A0
857      nop.i          0
858}
859{ .mfi
860      nop.m          0
861      fms.s1         fRes1L      = fA0H, f1, fRes1H // A1*x+A0
862      nop.i          0
863};;
864
865{ .mfi
866      nop.m          0
867      fma.s1         fA23        = fA25,  fArgAbsNorm2, fA23 // Polynomial tail
868      nop.i          0
869}
870{ .mfi
871      nop.m          0
872      fma.s1         fA12        = fA14,  fArgAbsNorm2, fA12 // Polynomial tail
873      nop.i          0
874};;
875
876{ .mfi
877      nop.m          0
878      fma.s1         fA19        = fA21,  fArgAbsNorm2, fA19  // Polynomial tail
879      nop.i          0
880}
881{ .mfi
882      nop.m          0
883      fma.s1         fA8         = fA10,  fArgAbsNorm2, fA8 // Polynomial tail
884      nop.i          0
885};;
886
887{ .mfi
888      nop.m          0
889      fma.s1         fA15        = fA17,  fArgAbsNorm2, fA15 // Polynomial tail
890      nop.i          0
891}
892{ .mfi
893      nop.m          0
894      fms.s1         fArgAbsNorm11 = fArgAbsNorm11, fArgAbsNorm3, f0 // x^11
895      nop.i          0
896};;
897
898{ .mfi
899      nop.m          0
900      fma.s1         fTT         = fRes3L, fArgAbsNorm2, f0 // (A3*x+A2)*x^2
901      nop.i          0
902}
903{ .mfi
904      nop.m          0
905      fma.s1         fA4         = fA6,  fArgAbsNorm2, fA4 // Polynomial tail
906      nop.i          0
907};;
908
909{ .mfi
910      nop.m          0
911      fma.s1         fRes1L      = fRes1L, f1, fTH2 // A1*x+A0
912      nop.i          0
913};;
914
915{ .mfi
916      nop.m          0
917      fma.s1         fA19        = fA23,  fArgAbsNorm4, fA19 // Polynomial tail
918      nop.i          0
919}
920{ .mfi
921      nop.m          0
922      fma.s1         fA8         = fA12,  fArgAbsNorm4, fA8 // Polynomial tail
923      nop.i          0
924};;
925
926{ .mfi
927      nop.m          0
928      fma.s1         fTT         = fRes3H, fArgAbsNorm2L, fTT // (A3*x+A2)*x^2
929      nop.i          0
930};;
931
932{ .mfi
933      nop.m          0
934      fma.s1         fRes1L      = fRes1L, f1, fTL2 // A1*x+A0
935      nop.i          0
936};;
937
938{ .mfi
939      nop.m          0
940      fma.s1         fA15        = fA19,  fArgAbsNorm4, fA15 // Polynomial tail
941      nop.i          0
942}
943{ .mfi
944      nop.m          0
945      fma.s1         fA4         = fA8,  fArgAbsNorm4, fA4 // Polynomial tail
946      nop.i          0
947};;
948
949{ .mfi
950      nop.m          0
951      fma.s1         fRes2H      = fRes3H, fArgAbsNorm2, fTT // (A3*x+A2)*x^2
952      nop.i          0
953};;
954
955{ .mfi
956      nop.m          0
957      fma.s1         fRes1L      = fRes1L, f1, fA0L // A1*x+A0
958      nop.i          0
959};;
960
961{ .mfi
962      nop.m          0
963      fma.s1         fRes4       = fA15, fArgAbsNorm11, fA4 // Result of
964                                                      // polynomial tail
965      nop.i          0
966};;
967
968{ .mfi
969      nop.m          0
970      fms.s1         fRes2L      = fRes3H, fArgAbsNorm2, fRes2H // (A3*x+A2)*x^2
971      nop.i          0
972}
973{ .mfi
974      nop.m          0
975      fma.s1         fResH       = fRes2H, f1, fRes1H // High result
976      nop.i          0
977};;
978
979{ .mfi
980      nop.m          0
981      fma.s1         fRes1L      = fRes4, fArgAbsNorm4, fRes1L // A1*x+A0
982      nop.i          0
983};;
984
985{ .mfi
986      nop.m          0
987      fma.s1         fRes2L      = fRes2L, f1, fTT // (A3*x+A2)*x^2
988      nop.i          0
989}
990{ .mfi
991      nop.m          0
992      fms.s1         fResL       = fRes1H, f1, fResH // Low result
993      nop.i          0
994};;
995
996{ .mfi
997      nop.m          0
998      fma.s1         fRes1L      = fRes1L, f1, fRes2L // Low result
999      nop.i          0
1000}
1001{ .mfi
1002      nop.m          0
1003      fma.s1         fResL       = fResL, f1, fRes2H // Low result
1004      nop.i          0
1005};;
1006
1007{ .mfi
1008      nop.m          0
1009(p15) fneg           fResH       = fResH // Invert high result if arg is neg.
1010      nop.i          0
1011};;
1012
1013{ .mfi
1014      nop.m          0
1015      fma.s1         fResL       = fResL, f1, fRes1L // Low result
1016      nop.i          0
1017};;
1018
1019.pred.rel "mutex",p14,p15
1020{ .mfi
1021      nop.m          0
1022(p14) fma.s0         f8          = fResH, f1, fResL // Add high and low results
1023      nop.i          0
1024}
1025{ .mfb
1026      nop.m          0
1027(p15) fms.s0         f8          = fResH, f1, fResL // Add high and low results
1028      br.ret.sptk    b0          // Main path return
1029};;
1030
1031//  satiration path ////////////////////////////////////////////////////////////
1032_saturation:
1033
1034.pred.rel "mutex",p14,p15
1035{ .mfi
1036      nop.m          0
1037(p14) fms.s0            f8          = f1, f1, fTiny // Saturation result r = 1-tiny
1038      nop.i 0
1039};;
1040{ .mfb
1041      nop.m          0
1042(p15) fnma.s0           f8          = f1, f1, fTiny // Saturation result r = tiny-1
1043      br.ret.sptk    b0         // Saturation path return
1044};;
1045
1046
1047//  0, denormals and special IEEE numbers path /////////////////////////////////
1048erfl_spec:
1049
1050{ .mfi
1051      addl           rDataPtr    = 0xBE0, rDataPtr // Ptr to denormals coeffs
1052      fclass.m       p6,p0       = f8, 0x23 // To filter infinities
1053                                          // 0x23 = @pos|@neg|@inf
1054      nop.i          0
1055};;
1056
1057{ .mfi
1058      ldfpd          fA1H, fA1L  = [rDataPtr] // Load denormals coeffs A1H, A1L
1059      fclass.m       p7,p0       = f8, 0xC7 // To filter NaNs & Zeros
1060                                 // 0xC7 = @pos|@neg|@zero|@qnan|@snan
1061      nop.i          0
1062};;
1063
1064{ .mfb
1065      nop.m          0
1066(p6)  fmerge.s       f8          = f8, f1     // +/-1 for INF args
1067(p6)  br.ret.spnt    b0                       // exit for x = INF
1068};;
1069
1070{ .mfb
1071      nop.m          0
1072(p7)  fma.s0         f8          = f8, f1, f8    // +/-0 for 0 args
1073                                                 // and NaNs for NaNs
1074(p7)  br.ret.spnt    b0                          // exit for x = NaN or +/-0
1075};;
1076
1077{ .mfi
1078      nop.m          0
1079      fnorm.s0       f8          = f8            // Normalize arg
1080      nop.i          0
1081};;
1082
1083{ .mfi
1084      nop.m          0
1085      fms.s1         fRes1H      = f8, fA1H, f0   // HighRes
1086      nop.i          0
1087}
1088{ .mfi
1089      nop.m          0
1090      fms.s1         fRes1L      = f8, fA1L, f0   // LowRes
1091      nop.i          0
1092};;
1093
1094{ .mfi
1095      nop.m          0
1096      fms.s1         fRes1Hd     = f8, fA1H, fRes1H // HighRes delta
1097      nop.i          0
1098};;
1099
1100{ .mfi
1101      nop.m          0
1102      fma.s1         fRes        = fRes1L, f1,  fRes1Hd // LowRes+HighRes delta
1103      nop.i          0
1104};;
1105
1106{ .mfi
1107      nop.m          0
1108      fma.s1         fRes        = f8, f8, fRes // r=x^2+r
1109      nop.i          0
1110};;
1111
1112{ .mfb
1113      nop.m          0
1114      fma.s0         f8          = fRes, f1, fRes1H  // res = r+ResHigh
1115      br.ret.sptk    b0          // 0, denormals, specials return
1116};;
1117
1118
1119//  0 < |x| < 1/8 path /////////////////////////////////////////////////////////
1120_0_to_1o8:
1121
1122{ .mmi
1123      adds           rAddr1      = 0xB60, rDataPtr // Ptr 1 to coeffs
1124      adds           rAddr2      = 0xB80, rDataPtr // Ptr 2 to coeffs
1125      nop.i          0
1126};;
1127
1128{ .mmi
1129      ldfpd          fA1H, fA1L  = [rAddr1], 16 // Load A1High, A1Low
1130      ldfe           fA13        = [rAddr2], 16 // Load A13
1131      nop.i          0
1132};;
1133
1134{ .mmi
1135      ldfe           fA15        = [rAddr1], 48 // Load A15
1136      ldfe           fA11        = [rAddr2], 32 // Load A11
1137      nop.i          0
1138};;
1139
1140{ .mmi
1141      ldfe           fA9         = [rAddr1], 32 // Load A9
1142      ldfe           fA7         = [rAddr2], 32 // Load A7
1143      nop.i          0
1144};;
1145
1146{ .mmi
1147      ldfe           fA5         = [rAddr1]  // Load A5
1148      ldfe           fA3         = [rAddr2] // Load A3
1149      nop.i          0
1150};;
1151
1152{ .mfi
1153      nop.m          0
1154      fms.s1         fRes1H      = f8, fA1H, f0 // x*(A1H+A1L)
1155      nop.i          0
1156}
1157{ .mfi
1158      nop.m          0
1159      fms.s1         fRes1L      = f8, fA1L, f0 // x*(A1H+A1L)
1160      nop.i          0
1161};;
1162
1163{ .mfi
1164      nop.m          0
1165      fma.s1         fA11        = fA13, fArgSqr, fA11 // Polynomial tail
1166      nop.i          0
1167}
1168{ .mfi
1169      nop.m          0
1170      fma.s1         fArgFour    = fArgSqr, fArgSqr, f0 // a^4
1171      nop.i          0
1172};;
1173
1174
1175{ .mfi
1176      nop.m          0
1177      fma.s1         fA3         = fA5, fArgSqr, fA3 // Polynomial tail
1178      nop.i          0
1179}
1180{ .mfi
1181      nop.m          0
1182      fma.s1         fA7         = fA9, fArgSqr, fA7 // Polynomial tail
1183      nop.i          0
1184};;
1185
1186
1187{ .mfi
1188      nop.m          0
1189      fms.s1         fRes1Hd     = f8, fA1H, fRes1H // x*(A1H+A1L) delta
1190      nop.i          0
1191};;
1192
1193{ .mfi
1194      nop.m          0
1195      fma.s1         fA11        = fA15, fArgFour, fA11 // Polynomial tail
1196      nop.i          0
1197};;
1198
1199{ .mfi
1200      nop.m          0
1201      fma.s1         fA3         = fA7, fArgFour, fA3 // Polynomial tail
1202      nop.i          0
1203}
1204{ .mfi
1205      nop.m          0
1206      fma.s1         fArgEight   = fArgFour, fArgFour, f0 // a^8
1207      nop.i          0
1208};;
1209
1210{ .mfi
1211      nop.m          0
1212      fma.s1         f8          = fRes1L, f1,  fRes1Hd // x*(A1H+A1L)
1213      nop.i          0
1214};;
1215
1216{ .mfi
1217      nop.m          0
1218      fma.s1         fRes        = fA11, fArgEight, fA3 //Polynomial tail result
1219      nop.i          0
1220};;
1221
1222{ .mfi
1223      nop.m          0
1224      fma.s1         f8          = fRes, fArgCube, f8 // (Polynomial tail)*x^3
1225      nop.i          0
1226};;
1227
1228{ .mfb
1229      nop.m          0
1230      fma.s0         f8          = f8, f1, fRes1H  // (Polynomial tail)*x^3 +
1231                                                   // + x*(A1H+A1L)
1232      br.ret.sptk    b0          // [0;1/8] interval return
1233};;
1234
1235
1236GLOBAL_LIBM_END(erfl)
1237libm_alias_ldouble_other (erf, erf)
1238