1.file "erfl.s" 2 3 4// Copyright (c) 2001 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//============================================================== 41// 11/21/01 Initial version 42// 05/20/02 Cleaned up namespace and sf0 syntax 43// 08/14/02 Changed mli templates to mlx 44// 02/06/03 Reordered header: .section, .global, .proc, .align 45// 46// API 47//============================================================== 48// long double erfl(long double) 49// 50// Overview of operation 51//============================================================== 52// 53// Algorithm description 54// --------------------- 55// 56// There are 4 paths: 57// 58// 1. Special path: x = 0, Inf, NaNs, denormal 59// Return erfl(x) = +/-0.0 for zeros 60// Return erfl(x) = QNaN for NaNs 61// Return erfl(x) = sign(x)*1.0 for Inf 62// Return erfl(x) = (A0H+A0L)*x + x^2, ((A0H+A0L) = 2.0/sqrt(Pi)) 63// for denormals 64// 65// 2. [0;1/8] path: 0.0 < |x| < 1/8 66// Return erfl(x) = x*(A1H+A1L) + x^3*A3 + ... + x^15*A15 67// 68// 3. Main path: 1/8 <= |x| < 6.53 69// For several ranges of 1/8 <= |x| < 6.53 70// Return erfl(x) = sign(x)*((A0H+A0L) + y*(A1H+A1L) + y^2*(A2H+A2L) + 71// + y^3*A3 + y^4*A4 + ... + y^25*A25 ) 72// where y = (|x|/a) - b 73// 74// For each range there is particular set of coefficients. 75// Below is the list of ranges: 76// 1/8 <= |x| < 1/4 a = 0.125, b = 1.5 77// 1/4 <= |x| < 1/2 a = 0.25, b = 1.5 78// 1/2 <= |x| < 1.0 a = 0.5, b = 1.5 79// 1.0 <= |x| < 2.0 a = 1.0, b = 1.5 80// 2.0 <= |x| < 3.25 a = 2.0, b = 1.5 81// 3.25 <= |x| < 4.0 a = 2.0, b = 2.0 82// 4.0 <= |x| < 6.53 a = 4.0, b = 1.5 83// ( [3.25;4.0] subrange separated for monotonicity issues resolve ) 84// 85// 4. Saturation path: 6.53 <= |x| < +INF 86// Return erfl(x) = sign(x)*(1.0 - tiny_value) 87// (tiny_value ~ 1e-1233) 88// 89// Implementation notes 90// -------------------- 91// 92// 1. Special path: x = 0, INF, NaNa, denormals 93// 94// This branch is cut off by one fclass operation. 95// Then zeros+nans, infinities and denormals processed separately. 96// For denormals we had to use multiprecision A0 coefficient to reach 97// necessary accuracy: (A0H+A0L)*x-x^2 98// 99// 2. [0;1/8] path: 0.0 < |x| < 1/8 100// 101// First coefficient of polynomial we must split to multiprecision too. 102// Also we can parallelise computations: 103// (x*(A1H+A1L)) calculated in parallel with "tail" (x^3*A3 + ... + x^15*A15) 104// Furthermore the second part is factorized using binary tree technique. 105// 106// 3. Main path: 1/8 <= |x| < 6.53 107// 108// Multiprecision have to be performed only for first few 109// polynomial iterations (up to 3-rd x degree) 110// Here we use the same parallelisation way as above: 111// Split whole polynomial to first, "multiprecision" part, and second, 112// so called "tail", native precision part. 113// 114// 1) Multiprecision part: 115// [v1=(A0H+A0L)+y*(A1H+A1L)] + [v2=y^2*((A2H+A2L)+y*A3)] 116// v1 and v2 terms calculated in parallel 117// 118// 2) Tail part: 119// v3 = x^4 * ( A4 + x*A5 + ... + x^21*A25 ) 120// v3 is splitted to 2 even parts (10 coefficient in each one). 121// These 2 parts are also factorized using binary tree technique. 122// 123// So Multiprecision and Tail parts cost is almost the same 124// and we have both results ready before final summation. 125// 126// 4. Saturation path: 6.53 <= |x| < +INF 127// 128// We use formula sign(x)*(1.0 - tiny_value) instead of simple sign(x)*1.0 129// just to meet IEEE requirements for different rounding modes in this case. 130// 131// Registers used 132//============================================================== 133// Floating Point registers used: 134// f8 - input & output 135// f32 -> f90 136 137// General registers used: 138// r2, r3, r32 -> r52 139 140// Predicate registers used: 141// p0, p6 -> p11, p14, p15 142 143// p6 - arg is zero, denormal or special IEEE 144// p7 - arg is in [4;8] binary interval 145// p8 - arg is in [3.25;4] interval 146// p9 - arg < 1/8 147// p10 - arg is NOT in [3.25;4] interval 148// p11 - arg in saturation domain 149// p14 - arg is positive 150// p15 - arg is negative 151 152// Assembly macros 153//============================================================== 154rDataPtr = r2 155rTailDataPtr = r3 156 157rBias = r33 158rSignBit = r34 159rInterval = r35 160 161rArgExp = r36 162rArgSig = r37 163r3p25Offset = r38 164r2to4 = r39 165r1p25 = r40 166rOffset = r41 167r1p5 = r42 168rSaturation = r43 169r3p25Sign = r44 170rTiny = r45 171rAddr1 = r46 172rAddr2 = r47 173rTailAddr1 = r48 174rTailAddr2 = r49 175rTailOffset = r50 176rTailAddOffset = r51 177rShiftedDataPtr = r52 178 179//============================================================== 180fA0H = f32 181fA0L = f33 182fA1H = f34 183fA1L = f35 184fA2H = f36 185fA2L = f37 186fA3 = f38 187fA4 = f39 188fA5 = f40 189fA6 = f41 190fA7 = f42 191fA8 = f43 192fA9 = f44 193fA10 = f45 194fA11 = f46 195fA12 = f47 196fA13 = f48 197fA14 = f49 198fA15 = f50 199fA16 = f51 200fA17 = f52 201fA18 = f53 202fA19 = f54 203fA20 = f55 204fA21 = f56 205fA22 = f57 206fA23 = f58 207fA24 = f59 208fA25 = f60 209 210fArgSqr = f61 211fArgCube = f62 212fArgFour = f63 213fArgEight = f64 214 215fArgAbsNorm = f65 216fArgAbsNorm2 = f66 217fArgAbsNorm2L = f67 218fArgAbsNorm3 = f68 219fArgAbsNorm4 = f69 220fArgAbsNorm11 = f70 221 222fRes = f71 223fResH = f72 224fResL = f73 225fRes1H = f74 226fRes1L = f75 227fRes1Hd = f76 228fRes2H = f77 229fRes2L = f78 230fRes3H = f79 231fRes3L = f80 232fRes4 = f81 233 234fTT = f82 235fTH = f83 236fTL = f84 237fTT2 = f85 238fTH2 = f86 239fTL2 = f87 240 241f1p5 = f88 242f2p0 = f89 243fTiny = f90 244 245 246// Data tables 247//============================================================== 248RODATA 249 250.align 64 251LOCAL_OBJECT_START(erfl_data) 252////////// Main tables /////////// 253_0p125_to_0p25_data: // exp = 2^-3 254// Polynomial coefficients for the erf(x), 1/8 <= |x| < 1/4 255data8 0xACD9ED470F0BB048, 0x0000BFF4 //A3 = -6.5937529303909561891162915809e-04 256data8 0xBF6A254428DDB452 //A2H = -3.1915980570631852578089571182e-03 257data8 0xBC131B3BE3AC5079 //A2L = -2.5893976889070198978842231134e-19 258data8 0x3FC16E2D7093CD8C //A1H = 1.3617485043469590433318217038e-01 259data8 0x3C6979A52F906B4C //A1L = 1.1048096806003284897639351952e-17 260data8 0x3FCAC45E37FE2526 //A0H = 2.0911767705937583938791135552e-01 261data8 0x3C648D48536C61E3 //A0L = 8.9129592834861155344147026365e-18 262data8 0xD1FC135B4A30E746, 0x00003F90 //A25 = 6.3189963203954877364460345654e-34 263data8 0xB1C79B06DD8C988C, 0x00003F97 //A24 = 6.8478253118093953461840838106e-32 264data8 0xCC7AE121D1DEDA30, 0x0000BF9A //A23 = -6.3010264109146390803803408666e-31 265data8 0x8927B8841D1E0CA8, 0x0000BFA1 //A22 = -5.4098171537601308358556861717e-29 266data8 0xB4E84D6D0C8F3515, 0x00003FA4 //A21 = 5.7084320046554628404861183887e-28 267data8 0xC190EAE69A67959A, 0x00003FAA //A20 = 3.9090359419467121266470910523e-26 268data8 0x90122425D312F680, 0x0000BFAE //A19 = -4.6551806872355374409398000522e-25 269data8 0xF8456C9C747138D6, 0x0000BFB3 //A18 = -2.5670639225386507569611436435e-23 270data8 0xCDCAE0B3C6F65A3A, 0x00003FB7 //A17 = 3.4045511783329546779285646369e-22 271data8 0x8F41909107C62DCC, 0x00003FBD //A16 = 1.5167830861896169812375771948e-20 272data8 0x82F0FCB8A4B8C0A3, 0x0000BFC1 //A15 = -2.2182328575376704666050112195e-19 273data8 0x92E992C58B7C3847, 0x0000BFC6 //A14 = -7.9641369349930600223371163611e-18 274LOCAL_OBJECT_END(erfl_data) 275 276LOCAL_OBJECT_START(_0p25_to_0p5_data) 277// Polynomial coefficients for the erf(x), 1/4 <= |x| < 1/2 278data8 0xF083628E8F7CE71D, 0x0000BFF6 //A3 = -3.6699405305266733332335619531e-03 279data8 0xBF978749A434FE4E //A2H = -2.2977018973732214746075186440e-02 280data8 0xBC30B3FAFBC21107 //A2L = -9.0547407100537663337591537643e-19 281data8 0x3FCF5F0CDAF15313 //A1H = 2.4508820238647696654332719390e-01 282data8 0x3C1DFF29F5AD8117 //A1L = 4.0653155218104625249413579084e-19 283data8 0x3FD9DD0D2B721F38 //A0H = 4.0411690943482225790717166092e-01 284data8 0x3C874C71FEF1759E //A0L = 4.0416653425001310671815863946e-17 285data8 0xA621D99B8C12595E, 0x0000BFAB //A25 = -6.7100271986703749013021666304e-26 286data8 0xBD7BBACB439992E5, 0x00003FAE //A24 = 6.1225362452814749024566661525e-25 287data8 0xFF2FEFF03A98E410, 0x00003FB2 //A23 = 1.3192871864994282747963195183e-23 288data8 0xAE8180957ABE6FD5, 0x0000BFB6 //A22 = -1.4434787102181180110707433640e-22 289data8 0xAF0566617B453AA6, 0x0000BFBA //A21 = -2.3163848847252215762970075142e-21 290data8 0x8F33D3616B9B8257, 0x00003FBE //A20 = 3.0324297082969526400202995913e-20 291data8 0xD58AB73354438856, 0x00003FC1 //A19 = 3.6175397854863872232142412590e-19 292data8 0xD214550E2F3210DF, 0x0000BFC5 //A18 = -5.6942141660091333278722310354e-18 293data8 0xE2CA60C328F3BBF5, 0x0000BFC8 //A17 = -4.9177359011428870333915211291e-17 294data8 0x88D9BB274F9B3873, 0x00003FCD //A16 = 9.4959118337089189766177270051e-16 295data8 0xCA4A00AB538A2DB2, 0x00003FCF //A15 = 5.6146496538690657993449251855e-15 296data8 0x9CC8FFFBDDCF9853, 0x0000BFD4 //A14 = -1.3925319209173383944263942226e-13 297LOCAL_OBJECT_END(_0p25_to_0p5_data) 298 299LOCAL_OBJECT_START(_0p5_to_1_data) 300// Polynomial coefficients for the erf(x), 1/2 <= |x| < 1 301data8 0xDB742C8FB372DBE0, 0x00003FF6 //A3 = 3.3485993187250381721535255963e-03 302data8 0xBFBEDC5644353C26 //A2H = -1.2054957547410136142751468924e-01 303data8 0xBC6D7215B023455F //A2L = -1.2770012232203569059818773287e-17 304data8 0x3FD492E42D78D2C4 //A1H = 3.2146553459760363047337250464e-01 305data8 0x3C83A163CAC22E05 //A1L = 3.4053365952542489137756724868e-17 306data8 0x3FE6C1C9759D0E5F //A0H = 7.1115563365351508462453011816e-01 307data8 0x3C8B1432F2CBC455 //A0L = 4.6974407716428899960674098333e-17 308data8 0x95A6B92162813FF8, 0x00003FC3 //A25 = 1.0140763985766801318711038400e-18 309data8 0xFE5EC3217F457B83, 0x0000BFC6 //A24 = -1.3789434273280972156856405853e-17 310data8 0x9B49651031B5310B, 0x0000BFC8 //A23 = -3.3672435142472427475576375889e-17 311data8 0xDBF73927E19B7C8D, 0x00003FCC //A22 = 7.6315938248752024965922341872e-16 312data8 0xF55CBA3052730592, 0x00003FCB //A21 = 4.2563559623888750271176552350e-16 313data8 0xA1DC9380DA82CFF6, 0x0000BFD2 //A20 = -3.5940500736023122607663701015e-14 314data8 0xAAD1AE1067F3D577, 0x00003FD2 //A19 = 3.7929451192558641569555227613e-14 315data8 0xCD1DB83F3B9D2090, 0x00003FD7 //A18 = 1.4574374961011929143375716362e-12 316data8 0x87235ACB5E8BB298, 0x0000BFD9 //A17 = -3.8408559294899660346666452560e-12 317data8 0xDA417B78FF9F46B4, 0x0000BFDC //A16 = -4.9625621225715971268115023451e-11 318data8 0xF075762685484436, 0x00003FDE //A15 = 2.1869603559309150844390066920e-10 319data8 0xB989FDB3795165C7, 0x00003FE1 //A14 = 1.3499740992928183247608593000e-09 320LOCAL_OBJECT_END(_0p5_to_1_data) 321 322LOCAL_OBJECT_START(_1_to_2_data) 323// Polynomial coefficients for the erf(x), 1 <= |x| < 2.0 324data8 0x8E15015F5B55BEAC, 0x00003FFC //A3 = 1.3875200409423426678618977531e-01 325data8 0xBFC6D5A95D0A1B7E //A2H = -1.7839543383544403942764233761e-01 326data8 0xBC7499F704C80E02 //A2L = -1.7868888188464394090788198634e-17 327data8 0x3FBE723726B824A8 //A1H = 1.1893028922362935961842822508e-01 328data8 0x3C6B77F399C2AD27 //A1L = 1.1912589318015368492508652194e-17 329data8 0x3FEEEA5557137ADF //A0H = 9.6610514647531064991170524081e-01 330data8 0x3C963D0DDD0A762F //A0L = 7.7155271023949055047261953350e-17 331data8 0x8FAA405DAD409771, 0x0000BFDB //A25 = -1.6332824616946528652252813763e-11 332data8 0x941386F4697976D8, 0x0000BFDD //A24 = -6.7337295147729213955410252613e-11 333data8 0xBCBE75234530B404, 0x00003FDF //A23 = 3.4332329029092304943838374908e-10 334data8 0xF55E2CE71A00D040, 0x00003FDF //A22 = 4.4632156034175937694868068394e-10 335data8 0xA6CADFE489D2671F, 0x0000BFE3 //A21 = -4.8543000253822277507724949798e-09 336data8 0xA4C69F11FEAFB3A8, 0x00003FE2 //A20 = 2.3978044150868471771557059958e-09 337data8 0xD63441E3BED59703, 0x00003FE6 //A19 = 4.9873285553412397317802071288e-08 338data8 0xDFDAED9D3089D732, 0x0000BFE7 //A18 = -1.0424069510877052249228047044e-07 339data8 0xB47287FF165756A5, 0x0000BFE9 //A17 = -3.3610945128073834488448164164e-07 340data8 0xCDAF2DC0A79A9059, 0x00003FEB //A16 = 1.5324673941628851136481785187e-06 341data8 0x9FD6A7B2ECE8EDA9, 0x00003FEA //A15 = 5.9544479989469083598476592569e-07 342data8 0xEC6E63BB4507B585, 0x0000BFEE //A14 = -1.4092398243085031882423746824e-05 343LOCAL_OBJECT_END(_1_to_2_data) 344 345LOCAL_OBJECT_START(_2_to_3p25_data) 346// Polynomial coefficients for the erf(x), 2 <= |x| < 3.25 347data8 0xCEDBA58E8EE6F055, 0x00003FF7 //A3 = 6.3128050215859026984338771121e-03 348data8 0xBF5B60D5E974CBBD //A2H = -1.6710366233609740427984435840e-03 349data8 0xBC0E11E2AEC18AF6 //A2L = -2.0376133202996259839305825162e-19 350data8 0x3F32408E9BA3327E //A1H = 2.7850610389349567379974059733e-04 351data8 0x3BE41010E4B3B224 //A1L = 3.3987633691879253781833531576e-20 352data8 0x3FEFFFD1AC4135F9 //A0H = 9.9997790950300136092465663751e-01 353data8 0x3C8EEAFA1E97EAE0 //A0L = 5.3633970564750967956196033852e-17 354data8 0xBF9C6F2C6D7263C1, 0x00003FF0 //A25 = 4.5683639377039166585098497471e-05 355data8 0xCB4167CC4798096D, 0x00003FF0 //A24 = 4.8459885139772945417160731273e-05 356data8 0xE1394FECFE972D32, 0x0000BFF2 //A23 = -2.1479022581129892562916533804e-04 357data8 0xC7F9E47581FC2A5F, 0x0000BFF2 //A22 = -1.9071211076537531370822343363e-04 358data8 0xDD612EDFAA41BEAE, 0x00003FF2 //A21 = 2.1112405918671957390188348542e-04 359data8 0x8C166AA4CB2AD8FD, 0x0000BFF4 //A20 = -5.3439165021555312536009227942e-04 360data8 0xEFBE33D9F62B68D4, 0x0000BFF2 //A19 = -2.2863672131516067770956697877e-04 361data8 0xCCB92F5D91562494, 0x00003FF5 //A18 = 1.5619154280865226092321881421e-03 362data8 0x80A5DBE71D4BA0E2, 0x0000BFF6 //A17 = -1.9630109664962540123775799179e-03 363data8 0xA0ADEB2D4C41347A, 0x0000BFF4 //A16 = -6.1294315248639348947483422457e-04 364data8 0xB1F5D4911B911665, 0x00003FF7 //A15 = 5.4309165882071876864550213817e-03 365data8 0xF2F3D8D21E8762E0, 0x0000BFF7 //A14 = -7.4143227286535936033409745884e-03 366LOCAL_OBJECT_END(_2_to_3p25_data) 367 368LOCAL_OBJECT_START(_4_to_6p53_data) 369// Polynomial coefficients for the erf(x), 4 <= |x| < 6.53 370data8 0xDF3151BE8652827E, 0x00003FD5 //A3 = 3.9646979666953349095427642209e-13 371data8 0xBD1C4A9787DF888B //A2H = -2.5127788450714750484839908889e-14 372data8 0xB99B35483E4603FD //A2L = -3.3536613901268985626466020210e-31 373data8 0x3CD2DBF507F1A1F3 //A1H = 1.0468963266736687758710258897e-15 374data8 0x398A97B60913B4BD //A1L = 1.6388968267515149775818013207e-31 375data8 0x3FEFFFFFFFFFFFFF //A0H = 9.9999999999999988897769753748e-01 376data8 0x3C99CC25E658129E //A0L = 8.9502895736398715695745861054e-17 377data8 0xB367B21294713D39, 0x00003FFB //A25 = 8.7600127403270828432337605471e-02 378data8 0xCEE3A423ADEC0F4C, 0x00003FFD //A24 = 4.0408051429309221404807497715e-01 379data8 0xC389626CF2D727C0, 0x00003FFE //A23 = 7.6381507072332210580356159947e-01 380data8 0xD15A03E082D0A307, 0x00003FFE //A22 = 8.1777977210259904277239787430e-01 381data8 0x8FD3DA92675E8E00, 0x00003FFE //A21 = 5.6182638239203638864793584264e-01 382data8 0xFD375E6EE167AA58, 0x00003FFC //A20 = 2.4728152801285544751731937424e-01 383data8 0x89A9482FADE66AE1, 0x00003FFB //A19 = 6.7217410998398471333985773237e-02 384data8 0xC62E1F02606C04DD, 0x00003FF7 //A18 = 6.0479785358923404401184993359e-03 385data8 0xEE7BF2BE71CC531C, 0x0000BFF5 //A17 = -1.8194898432032114199803271708e-03 386data8 0x8084081981CDC79C, 0x0000BFF5 //A16 = -9.8049734947701208487713246099e-04 387data8 0x8975DFB834C118C3, 0x0000BFF0 //A15 = -3.2773123965143773578608926094e-05 388data8 0x965DA4A80008B7BC, 0x0000BFEE //A14 = -8.9624997201558650125662820562e-06 389LOCAL_OBJECT_END(_4_to_6p53_data) 390 391LOCAL_OBJECT_START(_3p25_to_4_data) 392// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4 393data8 0xB01D29846286CE08, 0x00003FEE //A3 = 1.0497207328743021499800978059e-05 394data8 0xBEC10B1488AEB234 //A2H = -2.0317175474986489113480084279e-06 395data8 0xBB7F19701B8B74F9 //A2L = -4.1159669348226960337518214996e-22 396data8 0x3E910B1488AEB234 //A1H = 2.5396469343733111391850105348e-07 397data8 0x3B4F1944906D5D60 //A1L = 5.1448487494628801547474934193e-23 398data8 0x3FEFFFFFF7B91176 //A0H = 9.9999998458274208523732795584e-01 399data8 0x3C70B2865615DB3F //A0L = 1.4482653192002495179309994964e-17 400data8 0xA818D085D56F3021, 0x00003FEC //A25 = 2.5048394770210505593609705765e-06 401data8 0xD9C5C509AAE5561F, 0x00003FEC //A24 = 3.2450636894654766492719395406e-06 402data8 0x9682D71C549EEB07, 0x0000BFED //A23 = -4.4855801709974050650263470866e-06 403data8 0xBC230E1EB6FBF8B9, 0x00003FEA //A22 = 7.0086469577174843181452303996e-07 404data8 0xE1432649FF29D4DE, 0x0000BFEA //A21 = -8.3916747195472308725504497231e-07 405data8 0xB40CEEBD2803D2F0, 0x0000BFEF //A20 = -2.1463694318102769992677291330e-05 406data8 0xEAAB57ABFFA003EB, 0x00003FEF //A19 = 2.7974761309213643228699449426e-05 407data8 0xFBFA4D0B893A5BFB, 0x0000BFEE //A18 = -1.5019043571612821858165073446e-05 408data8 0xBB6AA248EED3E364, 0x0000BFF0 //A17 = -4.4683584873907316507141131797e-05 409data8 0x86C1B3AE3E500ED9, 0x00003FF2 //A16 = 1.2851395412345761361068234880e-04 410data8 0xB60729445F0C37B5, 0x0000BFF2 //A15 = -1.7359540313300841352152461287e-04 411data8 0xCA389F9E707337B1, 0x00003FF1 //A14 = 9.6426575465763394281615740282e-05 412LOCAL_OBJECT_END(_3p25_to_4_data) 413 414 415//////// "Tail" tables ////////// 416LOCAL_OBJECT_START(_0p125_to_0p25_data_tail) 417// Polynomial coefficients for the erf(x), 1/8 <= |x| < 1/4 418data8 0x93086CBD21ED3962, 0x00003FCA //A13 = 1.2753071968462837024755878679e-16 419data8 0x83CB5045A6D4B419, 0x00003FCF //A12 = 3.6580237062957773626379648530e-15 420data8 0x8FCDB723209690EB, 0x0000BFD3 //A11 = -6.3861616307180801527566117146e-14 421data8 0xCAA173F680B5D56B, 0x0000BFD7 //A10 = -1.4397775466324880354578008779e-12 422data8 0xF0CEA934AD6AC013, 0x00003FDB //A9 = 2.7376616955640415767655526857e-11 423data8 0x81C69F9D0B5AB8EE, 0x00003FE0 //A8 = 4.7212187567505249115688961488e-10 424data8 0xA8B590298C20A194, 0x0000BFE4 //A7 = -9.8201697105565925460801441797e-09 425data8 0x84F3DE72AC964615, 0x0000BFE8 //A6 = -1.2382176987480830706988411266e-07 426data8 0xC01A1398868CC4BD, 0x00003FEC //A5 = 2.8625408039722670291121341583e-06 427data8 0xCC43247F4410C54A, 0x00003FEF //A4 = 2.4349960762505993017186935493e-05 428LOCAL_OBJECT_END(_0p125_to_0p25_data_tail) 429 430LOCAL_OBJECT_START(_0p25_to_0p5_data_tail) 431// Polynomial coefficients for the erf(x), 1/4 <= |x| < 1/2 432data8 0x8CEAC59AF361B78A, 0x0000BFD6 //A13 = -5.0063802958258679384986669123e-13 433data8 0x9BC67404F348C0CE, 0x00003FDB //A12 = 1.7709590771868743572061278273e-11 434data8 0xF4B5D0348AFAAC7A, 0x00003FDB //A11 = 2.7820329729584630464848160970e-11 435data8 0x83AB447FF619DA4A, 0x0000BFE2 //A10 = -1.9160363295631539615395477207e-09 436data8 0x82115AB487202E7B, 0x00003FE0 //A9 = 4.7318386460142606822119637959e-10 437data8 0xB84D5B0AE17054AA, 0x00003FE8 //A8 = 1.7164477188916895004843908951e-07 438data8 0xB2E085C1C4AA06E5, 0x0000BFE9 //A7 = -3.3318445266863554512523957574e-07 439data8 0xCD3CA2E6C3971666, 0x0000BFEE //A6 = -1.2233070175554502732980949519e-05 440data8 0xBA445C53F8DD40E6, 0x00003FF0 //A5 = 4.4409521535330413551781808621e-05 441data8 0xAA94D5E68033B764, 0x00003FF4 //A4 = 6.5071635765452563856926608000e-04 442LOCAL_OBJECT_END(_0p25_to_0p5_data_tail) 443 444LOCAL_OBJECT_START(_0p5_to_1_data_tail) 445// Polynomial coefficients for the erf(x), 1/2 <= |x| < 1 446data8 0x9ED99EDF111CB785, 0x0000BFE4 //A13 = -9.2462916180079278241704711522e-09 447data8 0xDEAF7539AE2FB062, 0x0000BFE5 //A12 = -2.5923990465973151101298441139e-08 448data8 0xA392D5E5CC9DB1A7, 0x00003FE9 //A11 = 3.0467952847327075747032372101e-07 449data8 0xC311A7619B96CA1A, 0x00003FE8 //A10 = 1.8167212632079596881709988649e-07 450data8 0x82082E6B6A93F116, 0x0000BFEE //A9 = -7.7505086843257228386931766018e-06 451data8 0x96D9997CF326A36D, 0x00003FEE //A8 = 8.9913605625817479172071008270e-06 452data8 0x97057D85DCB0ED99, 0x00003FF2 //A7 = 1.4402527482741758767786898553e-04 453data8 0xDC23BCB3599C0490, 0x0000BFF3 //A6 = -4.1988296144950673955519083419e-04 454data8 0xDA150C4867208A81, 0x0000BFF5 //A5 = -1.6638352864915033417887831090e-03 455data8 0x9A4DAF550A2CC29A, 0x00003FF8 //A4 = 9.4179355839141698591817907680e-03 456LOCAL_OBJECT_END(_0p5_to_1_data_tail) 457 458LOCAL_OBJECT_START(_1_to_2_data_tail) 459// Polynomial coefficients for the erf(x), 1 <= |x| < 2.0 460data8 0x969EAC5C7B46CAB9, 0x00003FEF //A13 = 1.7955281439310148162059582795e-05 461data8 0xA2ED832912E9FCD9, 0x00003FF1 //A12 = 7.7690020847111408916570845775e-05 462data8 0x85677C39C48E43E7, 0x0000BFF3 //A11 = -2.5444839340796031538582511806e-04 463data8 0xC2DAFA91683DAAE4, 0x0000BFF1 //A10 = -9.2914288456063075386925076097e-05 464data8 0xE01C061CBC6A2825, 0x00003FF5 //A9 = 1.7098195515864039518892834211e-03 465data8 0x9AD7271CAFD01C78, 0x0000BFF6 //A8 = -2.3626776207372761518718893636e-03 466data8 0x9B6B9D30EDD5F4FF, 0x0000BFF7 //A7 = -4.7430532011804570628999212874e-03 467data8 0x9E51EB9623F1D446, 0x00003FF9 //A6 = 1.9326171998839772791190405201e-02 468data8 0xF391B935C12546DE, 0x0000BFF8 //A5 = -1.4866286152953671441682166195e-02 469data8 0xB6AD4AE850DBF526, 0x0000BFFA //A4 = -4.4598858458861014323191919669e-02 470LOCAL_OBJECT_END(_1_to_2_data_tail) 471 472LOCAL_OBJECT_START(_2_to_3p25_data_tail) 473// Polynomial coefficients for the erf(x), 2 <= |x| < 3.25 474data8 0x847C24DAC7C7558B, 0x00003FF5 //A13 = 1.0107798565424606512130100541e-03 475data8 0xCB6340EAF02C3DF8, 0x00003FF8 //A12 = 1.2413800617425931997420375435e-02 476data8 0xB5163D252DBBC107, 0x0000BFF9 //A11 = -2.2105330871844825370020459523e-02 477data8 0x82FF9C0B68E331E4, 0x00003FF9 //A10 = 1.5991024756001692140897408128e-02 478data8 0xE9519E4A49752E04, 0x00003FF7 //A9 = 7.1203253651891723548763348088e-03 479data8 0x8D52F11B7AE846D9, 0x0000BFFA //A8 = -3.4502927613795425888684181521e-02 480data8 0xCCC5A3E32BC6FA30, 0x00003FFA //A7 = 4.9993171868423886228679106871e-02 481data8 0xC1791AD8284A1919, 0x0000BFFA //A6 = -4.7234635220336795411997070641e-02 482data8 0x853DAAA35A8A3C18, 0x00003FFA //A5 = 3.2529512934760303976755163452e-02 483data8 0x88E42D8F47FAB60E, 0x0000BFF9 //A4 = -1.6710366233609742619461063050e-02 484LOCAL_OBJECT_END(_2_to_3p25_data_tail) 485 486LOCAL_OBJECT_START(_4_to_6p53_data_tail) 487// Polynomial coefficients for the erf(x), 4 <= |x| < 6.53 488data8 0xD8235ABF08B8A6D1, 0x00003FEE //A13 = 1.2882834877224764938429832586e-05 489data8 0xAEDF44F9C77844C2, 0x0000BFEC //A12 = -2.6057980393716019511497492890e-06 490data8 0xCCD5490956A4FCFD, 0x00003FEA //A11 = 7.6306293047300300284923464089e-07 491data8 0xF71AF0126EE26AEA, 0x0000BFE8 //A10 = -2.3013467500738417953513680935e-07 492data8 0xE4CE68089858AC20, 0x00003FE6 //A9 = 5.3273112263151109935867439775e-08 493data8 0xBD15106FBBAEE593, 0x0000BFE4 //A8 = -1.1006037358336556244645388790e-08 494data8 0x8BBF9A5769B6E480, 0x00003FE2 //A7 = 2.0336075804332107927300019116e-09 495data8 0xB049D845D105E302, 0x0000BFDF //A6 = -3.2066683399502826067820249320e-10 496data8 0xBAC69B3F0DFE5483, 0x00003FDC //A5 = 4.2467901578369360007795282687e-11 497data8 0xA29C398F83F8A0D1, 0x0000BFD9 //A4 = -4.6216613698438694005327544047e-12 498LOCAL_OBJECT_END(_4_to_6p53_data_tail) 499 500LOCAL_OBJECT_START(_3p25_to_4_data_tail) 501// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4 502data8 0x95BE1BEAD738160F, 0x00003FF2 //A13 = 1.4280568455209843005829620687e-04 503data8 0x8108C8FFAC0F0B21, 0x0000BFF4 //A12 = -4.9222685622046459346377033307e-04 504data8 0xD72A7FAEE7832BBE, 0x00003FF4 //A11 = 8.2079319302109644436194651098e-04 505data8 0x823AB4281CA7BBE7, 0x0000BFF5 //A10 = -9.9357079675971109178261577703e-04 506data8 0xFA1232D476048D11, 0x00003FF4 //A9 = 9.5394549599882496825916138915e-04 507data8 0xC463D7AF88025FB2, 0x0000BFF4 //A8 = -7.4916843357898101689031755368e-04 508data8 0xFEBE32B6B379D072, 0x00003FF3 //A7 = 4.8588363901002111193445057206e-04 509data8 0x882829BB68409BF3, 0x0000BFF3 //A6 = -2.5969865184916169002074135516e-04 510data8 0xED2F886E29DAAB09, 0x00003FF1 //A5 = 1.1309894347742479284610149994e-04 511data8 0xA4C07129436555B2, 0x0000BFF0 //A4 = -3.9279872584973887163830479579e-05 512LOCAL_OBJECT_END(_3p25_to_4_data_tail) 513 514 515LOCAL_OBJECT_START(_0_to_1o8_data) 516// Polynomial coefficients for the erf(x), 0.0 <= |x| < 0.125 517data8 0x3FF20DD750429B6D, 0x3C71AE3A8DDFFEDE //A1H, A1L 518data8 0xF8B0DACE42525CC2, 0x0000BFEE //A15 519data8 0xFCD02E1BF0EC2C37, 0x00003FF1 //A13 520data8 0xE016D968FE473B5E, 0x0000BFF4 //A11 521data8 0xAB2DE68711BF5A79, 0x00003FF7 //A9 522data8 0xDC16718944518309, 0x0000BFF9 //A7 523data8 0xE71790D0215F0C8F, 0x00003FFB //A5 524data8 0xC093A3581BCF3612, 0x0000BFFD //A3 525LOCAL_OBJECT_END(_0_to_1o8_data) 526 527 528LOCAL_OBJECT_START(_denorm_data) 529data8 0x3FF20DD750429B6D //A1H = 1.1283791670955125585606992900e+00 530data8 0x3C71AE3A914FED80 //A1L = 1.5335459613165880745599768129e-17 531LOCAL_OBJECT_END(_denorm_data) 532 533 534.section .text 535GLOBAL_LIBM_ENTRY(erfl) 536 537{ .mfi 538 alloc r32 = ar.pfs, 0, 21, 0, 0 539 fmerge.se fArgAbsNorm = f1, f8 // normalized x (1.0 <= x < 2.0) 540 addl rSignBit = 0x20000, r0 // Set sign bit for exponent 541} 542{ .mlx 543 addl rDataPtr = @ltoff(erfl_data), gp // Get common data ptr 544 movl r1p5 = 0x3FF8000000000000 // 1.5 in dbl repres. 545};; 546 547{ .mfi 548 getf.exp rArgExp = f8 // Get arg exponent 549 fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials 550 // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf 551 addl rBias = 0xfffc, r0 // Value to subtract from exp 552 // to get actual interval number 553} 554{ .mfi 555 ld8 rDataPtr = [rDataPtr] // Get real common data pointer 556 fma.s1 fArgSqr = f8, f8, f0 // x^2 (for [0;1/8] path) 557 addl r2to4 = 0x10000, r0 // unbiased exponent 558 // for [2;4] binary interval 559};; 560 561{ .mfi 562 getf.sig rArgSig = f8 // Get arg significand 563 fcmp.lt.s1 p15, p14 = f8, f0 // Is arg negative/positive? 564 addl rSaturation = 0xd0e, r0 // First 12 bits of 565 // saturation value signif. 566} 567{ .mfi 568 setf.d f1p5 = r1p5 // 1.5 construction 569 fma.s1 f2p0 = f1,f1,f1 // 2.0 construction 570 addl r3p25Sign = 0xd00, r0 // First 12 bits of 571 // 3.25 value signif. 572};; 573 574{ .mfi 575 addl rTailDataPtr = 0x700, rDataPtr // Pointer to "tail" data 576 nop.f 0 577 andcm rArgExp = rArgExp, rSignBit // Remove sign of exp 578} 579{ .mfb 580 addl rTiny = 0xf000, r0 // Tiny value for saturation path 581 nop.f 0 582(p6) br.cond.spnt erfl_spec // Branch to zero, denorm & specs 583};; 584 585{ .mfi 586 sub rInterval = rArgExp, rBias // Get actual interval number 587 nop.f 0 588 shr.u rArgSig = rArgSig, 52 // Leave only 12 bits of sign. 589} 590{ .mfi 591 adds rShiftedDataPtr = 0x10, rDataPtr // Second ptr to data 592 nop.f 0 593 cmp.eq p8, p10 = r2to4, rArgExp // If exp is in 2to4 interval? 594};; 595 596{ .mfi 597(p8) cmp.le p8, p10 = r3p25Sign, rArgSig // If sign. is greater 598 // than 1.25? (means arg is in [3.25;4] interval) 599 nop.f 0 600 shl rOffset = rInterval, 8 // Make offset from 601 // interval number 602} 603{ .mfi 604 cmp.gt p9, p0 = 0x0, rInterval // If interval is less than 0 605 // (means arg is in [0; 1/8]) 606 nop.f 0 607 cmp.eq p7, p0 = 0x5, rInterval // If arg is in [4:8] interv.? 608};; 609 610{ .mfi 611(p8) adds rOffset = 0x200, rOffset // Add additional offset 612 // if arg is in [3.25;4] (another data set) 613 fma.s1 fArgCube = fArgSqr, f8, f0 // x^3 (for [0;1/8] path) 614 shl rTailOffset = rInterval, 7 // Make offset to "tail" data 615 // from interval number 616} 617{ .mib 618 setf.exp fTiny = rTiny // Construct "tiny" value 619 // for saturation path 620 cmp.ltu p11, p0 = 0x5, rInterval // if arg > 8 621(p9) br.cond.spnt _0_to_1o8 622};; 623 624{ .mfi 625 add rAddr1 = rDataPtr, rOffset // Get address for 626 // interval data 627 nop.f 0 628 shl rTailAddOffset = rInterval, 5 // Offset to interval 629 // "tail" data 630} 631{ .mib 632 add rAddr2 = rShiftedDataPtr, rOffset // Get second 633 // address for interval data 634(p7) cmp.leu p11, p0 = rSaturation, rArgSig // if arg is 635 // in [6.53;8] interval 636(p11) br.cond.spnt _saturation // Branch to Saturation path 637};; 638 639{ .mmi 640 ldfe fA3 = [rAddr1], 0x90 // Load A3 641 ldfpd fA2H, fA2L = [rAddr2], 16 // Load A2High, A2Low 642 add rTailOffset = rTailOffset, rTailAddOffset // "Tail" offset 643};; 644 645{ .mmi 646 ldfe fA20 = [rAddr1], 16 // Load A20 647 ldfpd fA1H, fA1L = [rAddr2], 16 // Load A1High, A1Low 648(p8) adds rTailOffset = 0x140, rTailOffset // Additional offset 649 // for [3.24;4] interval 650};; 651 652{ .mmi 653 ldfe fA19 = [rAddr1], 16 // Load A19 654 ldfpd fA0H, fA0L = [rAddr2], 16 // Load A0High, A0Low 655 add rTailAddr1 = rTailDataPtr, rTailOffset // First tail 656 // data address 657};; 658 659.pred.rel "mutex",p8,p10 660{ .mfi 661 ldfe fA18 = [rAddr1], 16 // Load A18 662(p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f2p0 // Add 2.0 663 // to normalized arg (for [3.24;4] interval) 664 adds rTailAddr2 = 0x10, rTailAddr1 // First tail 665 // data address 666} 667{ .mfi 668 ldfe fA25 = [rAddr2], 16 // Load A25 669(p10) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1p5 // Add 1.5 670 // to normalized arg 671 nop.i 0 672};; 673 674{ .mmi 675 ldfe fA17 = [rAddr1], 16 // Load A17 676 ldfe fA24 = [rAddr2], 16 // Load A24 677 nop.i 0 678};; 679 680{ .mmi 681 ldfe fA16 = [rAddr1], 16 // Load A16 682 ldfe fA23 = [rAddr2], 16 // Load A23 683 nop.i 0 684};; 685 686{ .mmi 687 ldfe fA15 = [rAddr1], 16 // Load A15 688 ldfe fA22 = [rAddr2], 16 // Load A22 689 nop.i 0 690};; 691 692{ .mmi 693 ldfe fA14 = [rAddr1], 16 // Load A14 694 ldfe fA21 = [rAddr2], 16 // Load A21 695 nop.i 0 696};; 697 698{ .mfi 699 ldfe fA13 = [rTailAddr1], 32 // Load A13 700 fms.s1 fArgAbsNorm2 = fArgAbsNorm, fArgAbsNorm, f0 // x^2 701 nop.i 0 702} 703{ .mfi 704 ldfe fA12 = [rTailAddr2], 32 // Load A12 705 nop.f 0 706 nop.i 0 707};; 708 709{ .mfi 710 ldfe fA11 = [rTailAddr1], 32 // Load A11 711 fma.s1 fRes3H = fA3, fArgAbsNorm, fA2H // (A3*x+A2)*x^2 712 nop.i 0 713} 714{ .mfi 715 ldfe fA10 = [rTailAddr2], 32 // Load A10 716 fma.s1 fTH = fA3, fArgAbsNorm, f0 // (A3*x+A2)*x^2 717 nop.i 0 718};; 719 720{ .mfi 721 ldfe fA9 = [rTailAddr1], 32 // Load A9 722 fma.s1 fTT2 = fA1L, fArgAbsNorm, f0 // A1*x+A0 723 nop.i 0 724} 725{ .mfi 726 ldfe fA8 = [rTailAddr2], 32 // Load A8 727 nop.f 0 728 nop.i 0 729};; 730 731{ .mmi 732 ldfe fA7 = [rTailAddr1], 32 // Load A7 733 ldfe fA6 = [rTailAddr2], 32 // Load A6 734 nop.i 0 735};; 736 737{ .mmi 738 ldfe fA5 = [rTailAddr1], 32 // Load A5 739 ldfe fA4 = [rTailAddr2], 32 // Load A4 740 nop.i 0 741};; 742 743{ .mfi 744 nop.m 0 745 fms.s1 fArgAbsNorm2L = fArgAbsNorm, fArgAbsNorm, fArgAbsNorm2 746 // Low part of x^2 (delta) 747 nop.i 0 748} 749{ .mfi 750 nop.m 0 751 fms.s1 fArgAbsNorm4 = fArgAbsNorm2, fArgAbsNorm2, f0 // x^4 752 nop.i 0 753};; 754 755{ .mfi 756 nop.m 0 757 fms.s1 fRes3L = fA2H, f1, fRes3H // // (A3*x+A2)*x^2 758 nop.i 0 759};; 760 761{ .mfi 762 nop.m 0 763 fms.s1 fArgAbsNorm3 = fArgAbsNorm2, fArgAbsNorm, f0 // x^3 764 nop.i 0 765} 766{ .mfi 767 nop.m 0 768 fma.s1 fTH2 = fA1H, fArgAbsNorm, fTT2 // A1*x+A0 769 nop.i 0 770};; 771 772{ .mfi 773 nop.m 0 774 fma.s1 fA23 = fA24, fArgAbsNorm, fA23 // Polynomial tail 775 nop.i 0 776} 777{ .mfi 778 nop.m 0 779 fma.s1 fA21 = fA22, fArgAbsNorm, fA21 // Polynomial tail 780 nop.i 0 781};; 782 783{ .mfi 784 nop.m 0 785 fma.s1 fA12 = fA13, fArgAbsNorm, fA12 // Polynomial tail 786 nop.i 0 787} 788;; 789 790{ .mfi 791 nop.m 0 792 fma.s1 fRes3L = fRes3L, f1, fTH // (A3*x+A2)*x^2 793 nop.i 0 794} 795{ .mfi 796 nop.m 0 797 fma.s1 fA19 = fA20, fArgAbsNorm, fA19 // Polynomial tail 798 nop.i 0 799};; 800 801{ .mfi 802 nop.m 0 803 fma.s1 fRes1H = fTH2, f1, fA0H // A1*x+A0 804 nop.i 0 805} 806{ .mfi 807 nop.m 0 808 fms.s1 fTL2 = fA1H, fArgAbsNorm, fTH2 // A1*x+A0 809 nop.i 0 810};; 811 812{ .mfi 813 nop.m 0 814 fma.s1 fA8 = fA9, fArgAbsNorm, fA8 // Polynomial tail 815 nop.i 0 816} 817{ .mfi 818 nop.m 0 819 fma.s1 fA10 = fA11, fArgAbsNorm, fA10 // Polynomial tail 820 nop.i 0 821};; 822{ .mfi 823 nop.m 0 824 fma.s1 fA15 = fA16, fArgAbsNorm, fA15 // Polynomial tail 825 nop.i 0 826} 827{ .mfi 828 nop.m 0 829 fma.s1 fA17 = fA18, fArgAbsNorm, fA17 // Polynomial tail 830 nop.i 0 831};; 832{ .mfi 833 nop.m 0 834 fms.s1 fArgAbsNorm11 = fArgAbsNorm4, fArgAbsNorm4, f0 // x^8 835 nop.i 0 836} 837{ .mfi 838 nop.m 0 839 fma.s1 fA4 = fA5, fArgAbsNorm, fA4 // Polynomial tail 840 nop.i 0 841};; 842 843{ .mfi 844 nop.m 0 845 fma.s1 fRes3L = fRes3L, f1, fA2L // (A3*x+A2)*x^2 846 nop.i 0 847} 848{ .mfi 849 nop.m 0 850 fma.s1 fA6 = fA7, fArgAbsNorm, fA6 // Polynomial tail 851 nop.i 0 852};; 853 854{ .mfi 855 nop.m 0 856 fma.s1 fTL2 = fTL2, f1, fTT2 // A1*x+A0 857 nop.i 0 858} 859{ .mfi 860 nop.m 0 861 fms.s1 fRes1L = fA0H, f1, fRes1H // A1*x+A0 862 nop.i 0 863};; 864 865{ .mfi 866 nop.m 0 867 fma.s1 fA23 = fA25, fArgAbsNorm2, fA23 // Polynomial tail 868 nop.i 0 869} 870{ .mfi 871 nop.m 0 872 fma.s1 fA12 = fA14, fArgAbsNorm2, fA12 // Polynomial tail 873 nop.i 0 874};; 875 876{ .mfi 877 nop.m 0 878 fma.s1 fA19 = fA21, fArgAbsNorm2, fA19 // Polynomial tail 879 nop.i 0 880} 881{ .mfi 882 nop.m 0 883 fma.s1 fA8 = fA10, fArgAbsNorm2, fA8 // Polynomial tail 884 nop.i 0 885};; 886 887{ .mfi 888 nop.m 0 889 fma.s1 fA15 = fA17, fArgAbsNorm2, fA15 // Polynomial tail 890 nop.i 0 891} 892{ .mfi 893 nop.m 0 894 fms.s1 fArgAbsNorm11 = fArgAbsNorm11, fArgAbsNorm3, f0 // x^11 895 nop.i 0 896};; 897 898{ .mfi 899 nop.m 0 900 fma.s1 fTT = fRes3L, fArgAbsNorm2, f0 // (A3*x+A2)*x^2 901 nop.i 0 902} 903{ .mfi 904 nop.m 0 905 fma.s1 fA4 = fA6, fArgAbsNorm2, fA4 // Polynomial tail 906 nop.i 0 907};; 908 909{ .mfi 910 nop.m 0 911 fma.s1 fRes1L = fRes1L, f1, fTH2 // A1*x+A0 912 nop.i 0 913};; 914 915{ .mfi 916 nop.m 0 917 fma.s1 fA19 = fA23, fArgAbsNorm4, fA19 // Polynomial tail 918 nop.i 0 919} 920{ .mfi 921 nop.m 0 922 fma.s1 fA8 = fA12, fArgAbsNorm4, fA8 // Polynomial tail 923 nop.i 0 924};; 925 926{ .mfi 927 nop.m 0 928 fma.s1 fTT = fRes3H, fArgAbsNorm2L, fTT // (A3*x+A2)*x^2 929 nop.i 0 930};; 931 932{ .mfi 933 nop.m 0 934 fma.s1 fRes1L = fRes1L, f1, fTL2 // A1*x+A0 935 nop.i 0 936};; 937 938{ .mfi 939 nop.m 0 940 fma.s1 fA15 = fA19, fArgAbsNorm4, fA15 // Polynomial tail 941 nop.i 0 942} 943{ .mfi 944 nop.m 0 945 fma.s1 fA4 = fA8, fArgAbsNorm4, fA4 // Polynomial tail 946 nop.i 0 947};; 948 949{ .mfi 950 nop.m 0 951 fma.s1 fRes2H = fRes3H, fArgAbsNorm2, fTT // (A3*x+A2)*x^2 952 nop.i 0 953};; 954 955{ .mfi 956 nop.m 0 957 fma.s1 fRes1L = fRes1L, f1, fA0L // A1*x+A0 958 nop.i 0 959};; 960 961{ .mfi 962 nop.m 0 963 fma.s1 fRes4 = fA15, fArgAbsNorm11, fA4 // Result of 964 // polynomial tail 965 nop.i 0 966};; 967 968{ .mfi 969 nop.m 0 970 fms.s1 fRes2L = fRes3H, fArgAbsNorm2, fRes2H // (A3*x+A2)*x^2 971 nop.i 0 972} 973{ .mfi 974 nop.m 0 975 fma.s1 fResH = fRes2H, f1, fRes1H // High result 976 nop.i 0 977};; 978 979{ .mfi 980 nop.m 0 981 fma.s1 fRes1L = fRes4, fArgAbsNorm4, fRes1L // A1*x+A0 982 nop.i 0 983};; 984 985{ .mfi 986 nop.m 0 987 fma.s1 fRes2L = fRes2L, f1, fTT // (A3*x+A2)*x^2 988 nop.i 0 989} 990{ .mfi 991 nop.m 0 992 fms.s1 fResL = fRes1H, f1, fResH // Low result 993 nop.i 0 994};; 995 996{ .mfi 997 nop.m 0 998 fma.s1 fRes1L = fRes1L, f1, fRes2L // Low result 999 nop.i 0 1000} 1001{ .mfi 1002 nop.m 0 1003 fma.s1 fResL = fResL, f1, fRes2H // Low result 1004 nop.i 0 1005};; 1006 1007{ .mfi 1008 nop.m 0 1009(p15) fneg fResH = fResH // Invert high result if arg is neg. 1010 nop.i 0 1011};; 1012 1013{ .mfi 1014 nop.m 0 1015 fma.s1 fResL = fResL, f1, fRes1L // Low result 1016 nop.i 0 1017};; 1018 1019.pred.rel "mutex",p14,p15 1020{ .mfi 1021 nop.m 0 1022(p14) fma.s0 f8 = fResH, f1, fResL // Add high and low results 1023 nop.i 0 1024} 1025{ .mfb 1026 nop.m 0 1027(p15) fms.s0 f8 = fResH, f1, fResL // Add high and low results 1028 br.ret.sptk b0 // Main path return 1029};; 1030 1031// satiration path //////////////////////////////////////////////////////////// 1032_saturation: 1033 1034.pred.rel "mutex",p14,p15 1035{ .mfi 1036 nop.m 0 1037(p14) fms.s0 f8 = f1, f1, fTiny // Saturation result r = 1-tiny 1038 nop.i 0 1039};; 1040{ .mfb 1041 nop.m 0 1042(p15) fnma.s0 f8 = f1, f1, fTiny // Saturation result r = tiny-1 1043 br.ret.sptk b0 // Saturation path return 1044};; 1045 1046 1047// 0, denormals and special IEEE numbers path ///////////////////////////////// 1048erfl_spec: 1049 1050{ .mfi 1051 addl rDataPtr = 0xBE0, rDataPtr // Ptr to denormals coeffs 1052 fclass.m p6,p0 = f8, 0x23 // To filter infinities 1053 // 0x23 = @pos|@neg|@inf 1054 nop.i 0 1055};; 1056 1057{ .mfi 1058 ldfpd fA1H, fA1L = [rDataPtr] // Load denormals coeffs A1H, A1L 1059 fclass.m p7,p0 = f8, 0xC7 // To filter NaNs & Zeros 1060 // 0xC7 = @pos|@neg|@zero|@qnan|@snan 1061 nop.i 0 1062};; 1063 1064{ .mfb 1065 nop.m 0 1066(p6) fmerge.s f8 = f8, f1 // +/-1 for INF args 1067(p6) br.ret.spnt b0 // exit for x = INF 1068};; 1069 1070{ .mfb 1071 nop.m 0 1072(p7) fma.s0 f8 = f8, f1, f8 // +/-0 for 0 args 1073 // and NaNs for NaNs 1074(p7) br.ret.spnt b0 // exit for x = NaN or +/-0 1075};; 1076 1077{ .mfi 1078 nop.m 0 1079 fnorm.s0 f8 = f8 // Normalize arg 1080 nop.i 0 1081};; 1082 1083{ .mfi 1084 nop.m 0 1085 fms.s1 fRes1H = f8, fA1H, f0 // HighRes 1086 nop.i 0 1087} 1088{ .mfi 1089 nop.m 0 1090 fms.s1 fRes1L = f8, fA1L, f0 // LowRes 1091 nop.i 0 1092};; 1093 1094{ .mfi 1095 nop.m 0 1096 fms.s1 fRes1Hd = f8, fA1H, fRes1H // HighRes delta 1097 nop.i 0 1098};; 1099 1100{ .mfi 1101 nop.m 0 1102 fma.s1 fRes = fRes1L, f1, fRes1Hd // LowRes+HighRes delta 1103 nop.i 0 1104};; 1105 1106{ .mfi 1107 nop.m 0 1108 fma.s1 fRes = f8, f8, fRes // r=x^2+r 1109 nop.i 0 1110};; 1111 1112{ .mfb 1113 nop.m 0 1114 fma.s0 f8 = fRes, f1, fRes1H // res = r+ResHigh 1115 br.ret.sptk b0 // 0, denormals, specials return 1116};; 1117 1118 1119// 0 < |x| < 1/8 path ///////////////////////////////////////////////////////// 1120_0_to_1o8: 1121 1122{ .mmi 1123 adds rAddr1 = 0xB60, rDataPtr // Ptr 1 to coeffs 1124 adds rAddr2 = 0xB80, rDataPtr // Ptr 2 to coeffs 1125 nop.i 0 1126};; 1127 1128{ .mmi 1129 ldfpd fA1H, fA1L = [rAddr1], 16 // Load A1High, A1Low 1130 ldfe fA13 = [rAddr2], 16 // Load A13 1131 nop.i 0 1132};; 1133 1134{ .mmi 1135 ldfe fA15 = [rAddr1], 48 // Load A15 1136 ldfe fA11 = [rAddr2], 32 // Load A11 1137 nop.i 0 1138};; 1139 1140{ .mmi 1141 ldfe fA9 = [rAddr1], 32 // Load A9 1142 ldfe fA7 = [rAddr2], 32 // Load A7 1143 nop.i 0 1144};; 1145 1146{ .mmi 1147 ldfe fA5 = [rAddr1] // Load A5 1148 ldfe fA3 = [rAddr2] // Load A3 1149 nop.i 0 1150};; 1151 1152{ .mfi 1153 nop.m 0 1154 fms.s1 fRes1H = f8, fA1H, f0 // x*(A1H+A1L) 1155 nop.i 0 1156} 1157{ .mfi 1158 nop.m 0 1159 fms.s1 fRes1L = f8, fA1L, f0 // x*(A1H+A1L) 1160 nop.i 0 1161};; 1162 1163{ .mfi 1164 nop.m 0 1165 fma.s1 fA11 = fA13, fArgSqr, fA11 // Polynomial tail 1166 nop.i 0 1167} 1168{ .mfi 1169 nop.m 0 1170 fma.s1 fArgFour = fArgSqr, fArgSqr, f0 // a^4 1171 nop.i 0 1172};; 1173 1174 1175{ .mfi 1176 nop.m 0 1177 fma.s1 fA3 = fA5, fArgSqr, fA3 // Polynomial tail 1178 nop.i 0 1179} 1180{ .mfi 1181 nop.m 0 1182 fma.s1 fA7 = fA9, fArgSqr, fA7 // Polynomial tail 1183 nop.i 0 1184};; 1185 1186 1187{ .mfi 1188 nop.m 0 1189 fms.s1 fRes1Hd = f8, fA1H, fRes1H // x*(A1H+A1L) delta 1190 nop.i 0 1191};; 1192 1193{ .mfi 1194 nop.m 0 1195 fma.s1 fA11 = fA15, fArgFour, fA11 // Polynomial tail 1196 nop.i 0 1197};; 1198 1199{ .mfi 1200 nop.m 0 1201 fma.s1 fA3 = fA7, fArgFour, fA3 // Polynomial tail 1202 nop.i 0 1203} 1204{ .mfi 1205 nop.m 0 1206 fma.s1 fArgEight = fArgFour, fArgFour, f0 // a^8 1207 nop.i 0 1208};; 1209 1210{ .mfi 1211 nop.m 0 1212 fma.s1 f8 = fRes1L, f1, fRes1Hd // x*(A1H+A1L) 1213 nop.i 0 1214};; 1215 1216{ .mfi 1217 nop.m 0 1218 fma.s1 fRes = fA11, fArgEight, fA3 //Polynomial tail result 1219 nop.i 0 1220};; 1221 1222{ .mfi 1223 nop.m 0 1224 fma.s1 f8 = fRes, fArgCube, f8 // (Polynomial tail)*x^3 1225 nop.i 0 1226};; 1227 1228{ .mfb 1229 nop.m 0 1230 fma.s0 f8 = f8, f1, fRes1H // (Polynomial tail)*x^3 + 1231 // + x*(A1H+A1L) 1232 br.ret.sptk b0 // [0;1/8] interval return 1233};; 1234 1235 1236GLOBAL_LIBM_END(erfl) 1237libm_alias_ldouble_other (erf, erf) 1238