1.file "erf.s" 2 3 4// Copyright (c) 2001 - 2005, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//============================================================== 41// 08/15/01 Initial version 42// 05/20/02 Cleaned up namespace and sf0 syntax 43// 02/06/03 Reordered header: .section, .global, .proc, .align 44// 03/31/05 Reformatted delimiters between data tables 45// 46// API 47//============================================================== 48// double erf(double) 49// 50// Overview of operation 51//============================================================== 52// Background 53// 54// 55// There are 9 paths: 56// 1. x = +/-0.0 57// Return erf(x) = +/-0.0 58// 59// 2. 0.0 < |x| < 0.5 60// Return erf(x) = x *Pol9(x^2) 61// 62// 3. For several subranges of 0.5 <= |x| < 5.90625 63// Return erf(x) = sign(x)*Pol19(y), 64// where y = (|x|-b)/a, Pol19(y) = A0 + A1*y^1 + A2*y^2 + ... + A19*y^19 65// 66// For each subrange there is particular set of coefficients. 67// Below is the list of subranges: 68// 3.1 0.5 <= |x| < 1.0 b = a = 0.5 69// 3.2 1.0 <= |x| < 2.0, b = a = 1.0 70// 3.3 2.0 <= |x| < 3.25 b = a = 2.0 71// 3.4 4.0 <= |x| < 5.90625 b = 4.0, a = 2.0 72// 73// 4. 3.25 <= |x| < 4.0 74// Return erf(x) = sign(x)*Pol14(|x| - 3.25) 75// 76// 5. 5.90625 <= |x| < +INF 77// Return erf(x) = sign(x)*(1.0d - 2^(-63)) 78// 79// 6. |x| = INF 80// Return erf(x) = sign(x) * 1.0 81// 82// 7. x = [S,Q]NaN 83// Return erf(x) = QNaN 84// 85// 8. x is positive denormal 86// Return erf(x) = A0*x - x^2, 87// where A0 = 2.0/sqrt(Pi) 88// 89// 9. x is negative denormal 90// Return erf(x) = A0*x + x^2, 91// where A0 = 2.0/sqrt(Pi) 92// 93// Registers used 94//============================================================== 95// Floating Point registers used: 96// f8, input, output 97// f32 -> f63 98 99// General registers used: 100// r32 -> r48, r2, r3 101 102// Predicate registers used: 103// p0, p6 -> p15 104 105// p6 to filter out case when x = denormal 106// p7 to filter out case when x = [Q,S]NaN or +/-0, 107// used also to process denormals 108// p8 to filter out case when 3.25 <= |x| < 4.0, 109// used also to process denormals 110// p9 to filter out case when |x| = inf 111// p10 to filter out case when |x| < 0.5 112// p11 set when |x| < 3.25 or |x| > 4.0 113// p12 to filter out case when |x| >= 5.90625 114// p13 set if 4.0 <=|x| < 5.90625 115// p14 set to 1 for positive x 116// p15 set to 1 for negative x 117 118// Assembly macros 119//============================================================== 120rDataPtr = r2 121rDataPtr1 = r3 122 123rBias = r33 124rCoeffAddr3 = r34 125rThreeAndQ = r35 126rCoeffAddr2 = r36 127rMask = r37 128rArg = r38 129rSignBit = r39 130rAbsArg = r40 131rSaturation = r41 132rIndex = r42 133rCoeffAddr1 = r43 134rCoeffAddr4 = r44 135rShiftedArg = r45 136rShiftedArgMasked = r46 137rBiasedExpOf4 = r47 138rShiftedAbsArg = r48 139 140//============================================================== 141fA0 = f32 142fA1 = f33 143fA2 = f34 144fA3 = f35 145fA4 = f36 146fA5 = f37 147fA6 = f38 148fA7 = f39 149fA8 = f40 150fA9 = f41 151fA10 = f42 152fA11 = f43 153fA12 = f44 154fA13 = f45 155fA14 = f46 156fA15 = f47 157fA16 = f48 158fA17 = f49 159fA18 = f50 160fA19 = f51 161fArgSqr = f52 162fArgAbsNorm = f53 163fSignumX = f54 164fRes = f55 165fThreeAndQ = f56 166fArgAbs = f57 167fTSqr = f58 168fTQuadr = f59 169fTDeg3 = f60 170fTDeg7 = f61 171fArgAbsNormSgn = f62 172fTQuadrSgn = f63 173 174// Data tables 175//============================================================== 176RODATA 177 178.align 64 179 180LOCAL_OBJECT_START(erf_data) 181// Coefficients ##0..15 182// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0 183data8 0xB69AC40646D1F6C1, 0x00003FD2 //A19 184data8 0x90AD48C0118FA10C, 0x00003FD7 //A18 185data8 0x826FBAD055EA4AB8, 0x0000BFDB //A17 186data8 0x8DAB171246CC2B89, 0x00003FDC //A16 187data8 0xC0B1D6662F8A7564, 0x00003FDF //A15 188data8 0xA46374AC35099BAF, 0x0000BFE1 //A14 189data8 0xB2F230996346EF27, 0x0000BFE4 //A13 190data8 0xCDEC50950FACE04A, 0x00003FE6 //A12 191data8 0x826014649396E9D2, 0x00003FE9 //A11 192data8 0xCDB787DC718B13F9, 0x0000BFEB //A10 193data8 0x8E0B23C24EE0C8EE, 0x0000BFED //A9 194data8 0xA49EA40A4E5A3F76, 0x00003FF0 //A8 195data8 0xB11E30BE912617D3, 0x00003FF0 //A7 196data8 0xCCF89D9351CE26E3, 0x0000BFF4 //A6 197data8 0xEFF75AD1F0F22809, 0x00003FF2 //A5 198data8 0xBB793EF404C09A22, 0x00003FF8 //A4 199// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0 200data8 0xBAE93FF4174EA59B, 0x00003FE6 //A19 201data8 0x8A0FD46092F95D44, 0x0000BFEA //A18 202data8 0xA37B3242B7809E12, 0x00003FEC //A17 203data8 0xA0330A5CD2E91689, 0x0000BFED //A16 204data8 0x8E34A678F3497D17, 0x0000BFEC //A15 205data8 0xAC185D45A2772384, 0x00003FEF //A14 206data8 0xB0C11347CE7EEDE8, 0x00003FEF //A13 207data8 0xD3330DC14EA0E4EB, 0x0000BFF2 //A12 208data8 0xB4A6DFDE578A428F, 0x00003FF1 //A11 209data8 0xA0B4034310D2D9CB, 0x00003FF5 //A10 210data8 0xF71662D3132B7759, 0x0000BFF5 //A9 211data8 0x9C88BF157695E9EC, 0x0000BFF7 //A8 212data8 0xF84B80EFCA43895D, 0x00003FF8 //A7 213data8 0x9722D22DA628A17B, 0x00003FF7 //A6 214data8 0x8DB0A586F8F3381F, 0x0000BFFB //A5 215data8 0x8DB0A5879F87E5BE, 0x00003FFB //A4 216// Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25 217data8 0x9C4AF1F3A4B21AFC, 0x00003FF6 //A19 218data8 0x8D40D5D5DB741AB8, 0x0000BFF9 //A18 219data8 0xDEBE7099E0A75BA4, 0x00003FFA //A17 220data8 0xB99A33294D32429D, 0x0000BFFB //A16 221data8 0x8109D9C7197BC7C9, 0x00003FFB //A15 222data8 0xC30DE8E2EFC2D760, 0x00003FFA //A14 223data8 0x80DDA28C5B35DC73, 0x0000BFFC //A13 224data8 0x9BE4DE5095BACE0D, 0x00003FF9 //A12 225data8 0xDA4092509EE7D111, 0x00003FFC //A11 226data8 0x89D98C561B0C9040, 0x0000BFFD //A10 227data8 0xD20B26EB2F0881D4, 0x0000BFF9 //A9 228data8 0xD089C56948731561, 0x00003FFD //A8 229data8 0xDD704DEFFB21B7E7, 0x0000BFFD //A7 230data8 0xF0C9A6BBDE469115, 0x00003FF9 //A6 231data8 0xD673A02CB5766633, 0x00003FFD //A5 232data8 0x8D162CBAD8A12649, 0x0000BFFE //A4 233// Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0 234data8 0xD4428B75C6FE8FD1, 0x0000BFFC //A19 235data8 0xF76BE1935675D5C8, 0x00003FFE //A18 236data8 0xFD6BB3B14AA7A8E6, 0x0000BFFF //A17 237data8 0x8BE8F573D348DDA4, 0x00004000 //A16 238data8 0x81E91923A1030502, 0x0000BFFF //A15 239data8 0xCE7FE87B26CFD286, 0x0000BFFE //A14 240data8 0x84EF6B4E17404384, 0x00004000 //A13 241data8 0x91FEF33015404991, 0x0000C000 //A12 242data8 0xDEDF6A9370747E56, 0x00003FFF //A11 243data8 0x8397E6FF56CDFD9D, 0x0000BFFF //A10 244data8 0xFAD1CE912473937B, 0x00003FFD //A9 245data8 0xC48C1EA8AAA624EA, 0x0000BFFC //A8 246data8 0xFECAF0097ACF981B, 0x00003FFA //A7 247data8 0x8829A394065E4B95, 0x0000BFF9 //A6 248data8 0xED3003E477A53EE7, 0x00003FF6 //A5 249data8 0xA4C07E9BB3FCB0F3, 0x0000BFF4 //A4 250// 251// Coefficients ##16..19 252// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0 253data8 0x95FA98C337005D13, 0x0000BFF9 //A3 254data8 0xE0F7E524D2808A97, 0x0000BFFB //A2 255data8 0xE0F7E524D2808A98, 0x00003FFD //A1 256data8 0x853F7AE0C76E915F, 0x00003FFE //A0 257// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0 258data8 0x8DB0A587A96ABCF0, 0x00003FFC //A3 259data8 0xD488F84B7DE18DA8, 0x0000BFFD //A2 260data8 0xD488F84B7DE12E9C, 0x00003FFD //A1 261data8 0xD7BB3D3A08445636, 0x00003FFE //A0 262// Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25 263data8 0xC58571D23D5C4B3A, 0x00003FFD //A3 264data8 0xA94DCF467CD6AFF3, 0x0000BFFC //A2 265data8 0xA94DCF467CD10A16, 0x00003FFA //A1 266data8 0xFECD70A13CAF1997, 0x00003FFE //A0 267// Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0 268data8 0xB01D2B4F0D5AB8B0, 0x00003FF1 //A3 269data8 0x8858A465CE594BD1, 0x0000BFEE //A2 270data8 0x8858A447456DE61D, 0x00003FEA //A1 271data8 0xFFFFFFBDC88BB107, 0x00003FFE //A0 272// Polynomial coefficients for the erf(x), 0.0 <= |x| < 0.5 273data8 0xBE839EDBB36C7FCE //A9 274data8 0x3EBB7745A18DD242 //A8 275data8 0xBF4C02DB238F2AFC //A5 276data8 0x3F7565BCD0A9A3EA //A4 277data8 0xC093A3581BCF3333, 0x0000BFFD //A1 278data8 0xBEEF4BB82AD8AE22 //A7 279data8 0x3F1F9A2A57A218CD //A6 280data8 0xBF9B82CE3127F4E4 //A3 281data8 0x3FBCE2F21A042B25 //A2 282data8 0x906EBA8214DB688D, 0x00003FFF //A0 283// 1.0 - 2^(-63) 284data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE 285// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4.0 286data8 0x95E91576C7A12250, 0x00003FE7 //A14 287data8 0x8E5E0D0E1F5D3CB5, 0x0000BFEA //A13 288data8 0xED761DAFAF814DE9, 0x00003FEB //A12 289data8 0xB3A77D921D0ACFC7, 0x0000BFEC //A11 290data8 0xA662D27096B08D7C, 0x0000BFEC //A10 291data8 0xDA0F410AE6233EA5, 0x00003FEF //A9 292data8 0xAB4A8B16B3124327, 0x0000BFF1 //A8 293data8 0xB241E236A5EDCED3, 0x00003FF2 //A7 294data8 0x8A2A65BA1F551F77, 0x0000BFF3 //A6 295data8 0xA4852D0B1D87000A, 0x00003FF3 //A5 296data8 0x963EB00039489476, 0x0000BFF3 //A4 297data8 0xCD5244FF4F7313A5, 0x00003FF2 //A3 298data8 0xC6F1E695363BCB26, 0x0000BFF1 //A2 299data8 0xF4DAF4680DA54C02, 0x00003FEF //A1 300data8 0xFFFFB7CFB3F2ABBE, 0x00003FFE //A0 301// A = 2.0/sqrt(Pi) 302data8 0x906EBA8214DB688D, 0x00003FFF 303LOCAL_OBJECT_END(erf_data) 304 305 306.section .text 307GLOBAL_LIBM_ENTRY(erf) 308 309{ .mfi 310 alloc r32 = ar.pfs, 0, 17, 0, 0 311 fmerge.se fArgAbsNorm = f1, f8 // normalized x 312 adds rSignBit = 0x1, r0 313} 314{ .mfi 315 addl rDataPtr = @ltoff(erf_data), gp 316 fma.s1 fArgSqr = f8, f8, f0 // x^2 317 addl rThreeAndQ = 0x400A0, r0 // shifted bits of 3.25 318} 319;; 320{ .mfi 321 getf.d rArg = f8 // x in GR 322 fclass.m p6,p0 = f8, 0x0b // is x denormal ? 323 shl rThreeAndQ = rThreeAndQ, 44 // bits of 3.25 324} 325{ .mfi 326 ld8 rDataPtr = [rDataPtr] 327 nop.f 0 328 addl rBiasedExpOf4 = 0x40100, r0 // shifted bits of 4.0 329} 330;; 331{ .mfi 332 addl rSaturation = 0x4017A, r0 // shifted bits of 5.90625 333 fclass.m p7,p0 = f8, 0xc7 // is x [S,Q]NaN or +/-0 ? 334 shl rSignBit = rSignBit, 63 // mask for sign bit 335} 336{ .mfi 337 addl rMask = 0x7FF00, r0 // Mask for index bits 338 nop.f 0 339 addl rBias = 0x3FE00, r0 // bias of 0.5 << 8 340} 341;; 342{ .mfi 343 setf.d fThreeAndQ = rThreeAndQ // 3.25 if FP register 344 fclass.m p9,p0 = f8, 0x23 // is x +/- inf? 345 shr.u rShiftedArg = rArg, 44 346} 347{ .mfb 348 andcm rAbsArg = rArg, rSignBit // |x| in GR 349 nop.f 0 350(p6) br.cond.spnt erf_denormal // branch out if x is denormal 351} 352;; 353{ .mfi 354 and rShiftedArgMasked = rShiftedArg, rMask // bias of x << 8 355 fmerge.s fArgAbs = f1, f8 // |x| 356 shr rShiftedAbsArg = rAbsArg, 44 357} 358{ .mfb 359 cmp.lt p8, p11 = rThreeAndQ, rAbsArg // p8 = 1 if |x| >= 3.25 360(p7) fma.d.s0 f8 = f8,f1,f8 // NaN or +/-0 361(p7) br.ret.spnt b0 // exit for x = NaN or +/-0 362} 363;; 364{ .mfi 365 sub rIndex = rShiftedArgMasked, rBias // index << 8 366 nop.f 0 367 cmp.lt p10, p0 = rShiftedArgMasked, rBias // p10 = 1 if |x| < 0.5 368} 369{ .mfb 370 // p8 = 1 if 3.25 <= |x| < 4.0 371(p8) cmp.lt p8, p11 = rShiftedAbsArg, rBiasedExpOf4 372 fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1 373(p10) br.cond.spnt erf_near_zero // branch out if |x| < 0.5 374} 375;; 376.pred.rel "mutex", p8, p11 377{ .mfi 378(p8) adds rCoeffAddr1 = 1392, rDataPtr // coeff. for 3.25 <=|x|<4.0 379(p9) fmerge.s f8 = f8,f1 // +/- inf 380 nop.i 0 381} 382{ .mfb 383(p11) add rCoeffAddr1 = rDataPtr, rIndex// coeff. ##0,2,..14 384 nop.f 0 385(p9) br.ret.spnt b0 // exit for x = +/- inf 386} 387;; 388{ .mfi 389 adds rCoeffAddr2 = 16, rCoeffAddr1 390 fmerge.s fSignumX = f8, f1 // signum(x) 391 nop.i 0 392} 393{ .mfb 394 cmp.lt p12, p0 = rSaturation, rShiftedAbsArg // |x| > 5.90625? 395 nop.f 0 396(p12) br.cond.spnt erf_saturation // branch out if x |x| >= 6.0 397} 398;; 399// Here if paths #3,4 400// if path #4 we'll branch out after loading of 14 necessary coefficients 401{.mfi 402 ldfe fA19 = [rCoeffAddr1], 32 403 nop.f 0 404 nop.i 0 405} 406{.mfi 407 ldfe fA18 = [rCoeffAddr2], 32 408 nop.f 0 409 adds rCoeffAddr3 = 1024, rDataPtr 410} 411;; 412{.mfi 413 ldfe fA17 = [rCoeffAddr1], 32 414 nop.f 0 415 nop.i 0 416} 417{.mfi 418 ldfe fA16 = [rCoeffAddr2], 32 419 nop.f 0 420 nop.i 0 421} 422;; 423{.mfi 424 ldfe fA15 = [rCoeffAddr1], 32 425 fma.s1 fTSqr = fArgAbsNorm, fArgAbsNorm, f0 426 shr.u rIndex = rIndex, 2 427} 428{.mfi 429 ldfe fA14 = [rCoeffAddr2], 32 430 nop.f 0 431 adds rCoeffAddr4 = 16, r0 432} 433;; 434{.mfi 435 ldfe fA13 = [rCoeffAddr1], 32 436 nop.f 0 437 // address of coefficients ##16..23 438 add rCoeffAddr3 = rCoeffAddr3, rIndex 439} 440{.mfi 441 ldfe fA12 = [rCoeffAddr2], 32 442 nop.f 0 443 cmp.lt p15, p14 = rArg, r0 444} 445;; 446{.mfi 447 ldfe fA11 = [rCoeffAddr1], 32 448 nop.f 0 449 add rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4 450} 451{.mfi 452 ldfe fA10 = [rCoeffAddr2], 32 453 nop.f 0 454 nop.i 0 455} 456;; 457{.mfi 458 ldfe fA9 = [rCoeffAddr1], 32 459 nop.f 0 460 nop.i 0 461} 462{.mfi 463 ldfe fA8 = [rCoeffAddr2], 32 464 nop.f 0 465 nop.i 0 466} 467;; 468{.mfi 469 ldfe fA7 = [rCoeffAddr1], 32 470 fms.s1 fArgAbs = fArgAbs, f1, fThreeAndQ 471 nop.i 0 472} 473{.mfb 474 ldfe fA6 = [rCoeffAddr2], 32 475 nop.f 0 476(p8) br.cond.spnt erf_3q_4 // branch out if 3.25 < |x| < 4.0 477} 478;; 479{.mfi 480 ldfe fA5 = [rCoeffAddr1], 32 481 fma.s1 fTDeg3 = fArgAbsNorm, fTSqr, f0 482 nop.i 0 483} 484{.mfi 485 ldfe fA4 = [rCoeffAddr2], 32 486 fma.s1 fTQuadr = fTSqr, fTSqr, f0 487 nop.i 0 488} 489;; 490// Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm 491{.mfi 492 ldfe fA3 = [rCoeffAddr3], 32 493 fma.s1 fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0 494 nop.i 0 495} 496{.mfi 497 ldfe fA2 = [rCoeffAddr4], 32 498 nop.f 0 499 nop.i 0 500} 501;; 502{.mfi 503 ldfe fA1 = [rCoeffAddr3], 32 504 fma.s1 fRes = fA19, fArgAbsNorm, fA18 505 nop.i 0 506} 507{.mfi 508 ldfe fA0 = [rCoeffAddr4], 32 509 nop.f 0 510 nop.i 0 511} 512;; 513{ .mfi 514 nop.m 0 515 fma.s1 fA17 = fA17, fArgAbsNorm, fA16 516 nop.i 0 517} 518;; 519{ .mfi 520 nop.m 0 521 fma.s1 fA15 = fA15, fArgAbsNorm, fA14 522 nop.i 0 523} 524;; 525{ .mfi 526 nop.m 0 527 fma.s1 fTDeg7 = fTDeg3, fTQuadr, f0 528 nop.i 0 529} 530{ .mfi 531 nop.m 0 532 fma.s1 fA13 = fA13, fArgAbsNorm, fA12 533 nop.i 0 534} 535;; 536{ .mfi 537 nop.m 0 538 fma.s1 fA11 = fA11, fArgAbsNorm, fA10 539 nop.i 0 540} 541;; 542{ .mfi 543 nop.m 0 544 fma.s1 fA9 = fA9, fArgAbsNorm, fA8 545 nop.i 0 546} 547;; 548{ .mfi 549 nop.m 0 550 fma.s1 fRes = fRes, fTSqr, fA17 551 nop.i 0 552} 553{ .mfi 554 nop.m 0 555 fma.s1 fA7 = fA7, fArgAbsNorm, fA6 556 nop.i 0 557} 558;; 559{ .mfi 560 nop.m 0 561 fma.s1 fA5 = fA5, fArgAbsNorm, f0 562 nop.i 0 563} 564;; 565{ .mfi 566 nop.m 0 567 fma.s1 fA15 = fA15, fTSqr, fA13 568 nop.i 0 569} 570{ .mfi 571 nop.m 0 572 fma.s1 fA4 = fA4, fArgAbsNorm, fA3 573 nop.i 0 574} 575;; 576{ .mfi 577 nop.m 0 578 fma.s1 fA2 = fA2, fArgAbsNorm, fA1 579 nop.i 0 580} 581;; 582{ .mfi 583 nop.m 0 584 fma.s1 fA11 = fA11, fTSqr, fA9 585 nop.i 0 586} 587;; 588{ .mfi 589 nop.m 0 590 fma.s1 fA7 = fA7, fTSqr, fA5 591 nop.i 0 592} 593;; 594{ .mfi 595 nop.m 0 596 fma.s1 fRes = fRes, fTQuadr, fA15 597 nop.i 0 598} 599;; 600{ .mfi 601 nop.m 0 602 fma.s1 fA4 = fA4, fTSqr, fA2 603 nop.i 0 604} 605;; 606{ .mfi 607 nop.m 0 608 fma.s1 fRes = fRes, fTQuadr, fA11 609 nop.i 0 610} 611;; 612{ .mfi 613 nop.m 0 614 fma.s1 fA4 = fA7, fTDeg3, fA4 615 nop.i 0 616} 617;; 618{ .mfi 619 nop.m 0 620 fma.s1 fRes = fRes, fTDeg7, fA4 621 nop.i 0 622} 623;; 624{ .mfi 625 nop.m 0 626 // result for negative argument 627(p15) fms.d.s0 f8 = fRes, fArgAbsNormSgn, fA0 628 nop.i 0 629} 630{ .mfb 631 nop.m 0 632 // result for positive argument 633(p14) fma.d.s0 f8 = fRes, fArgAbsNormSgn, fA0 634 br.ret.sptk b0 635} 636 637// Here if 3.25 < |x| < 4.0 638.align 32 639erf_3q_4: 640.pred.rel "mutex", p14, p15 641{ .mfi 642 ldfe fA5 = [rCoeffAddr1], 32 643 fma.s1 fTSqr = fArgAbs, fArgAbs, f0 644 nop.i 0 645} 646{ .mfi 647 nop.m 0 648 fma.s1 fRes = fA19, fArgAbs, fA18 649 nop.i 0 650} 651;; 652{ .mfi 653 nop.m 0 654 fma.s1 fA17 = fA17, fArgAbs, fA16 655 nop.i 0 656} 657{ .mfi 658 nop.m 0 659 fma.s1 fA15 = fA15, fArgAbs, fA14 660 nop.i 0 661} 662;; 663{ .mfi 664 nop.m 0 665 fma.s1 fA13 = fA13, fArgAbs, fA12 666 nop.i 0 667} 668{ .mfi 669 nop.m 0 670 fma.s1 fA11 = fA11, fArgAbs, fA10 671 nop.i 0 672} 673;; 674{ .mfi 675 nop.m 0 676 fma.s1 fA9 = fA9, fArgAbs, fA8 677 nop.i 0 678} 679{ .mfi 680 nop.m 0 681 fma.s1 fArgAbsNormSgn = fArgAbs, fSignumX, f0 682 nop.i 0 683} 684;; 685{ .mfi 686 nop.m 0 687 fma.s1 fTQuadr = fTSqr, fTSqr, f0 688 nop.i 0 689} 690;; 691{ .mfi 692 nop.m 0 693 fma.s1 fRes = fRes, fTSqr, fA17 694 nop.i 0 695} 696;; 697{ .mfi 698 nop.m 0 699 fma.s1 fA15 = fA15, fTSqr, fA13 700 nop.i 0 701} 702;; 703{ .mfi 704 nop.m 0 705 fma.s1 fA11 = fA11, fTSqr, fA9 706 nop.i 0 707} 708{ .mfi 709 nop.m 0 710 fma.s1 fA7 = fA7, fArgAbs, fA6 711 nop.i 0 712} 713;; 714{ .mfi 715 nop.m 0 716 fma.s1 fTDeg7 = fTQuadr, fTSqr, f0 717 nop.i 0 718} 719{ .mfi 720 nop.m 0 721 fma.s1 fRes = fRes, fTQuadr, fA15 722 nop.i 0 723} 724;; 725{ .mfi 726 nop.m 0 727 fma.s1 fA11 = fA11, fTSqr, fA7 728 nop.i 0 729} 730;; 731{ .mfi 732 nop.m 0 733 fma.s1 fRes = fRes, fTDeg7, fA11 734 nop.i 0 735} 736;; 737{ .mfi 738 nop.m 0 739 // result for negative argument 740(p15) fms.d.s0 f8 = fRes, fArgAbsNormSgn, fA5 741 nop.i 0 742} 743{ .mfb 744 nop.m 0 745 // result for positive argument 746(p14) fma.d.s0 f8 = fRes, fArgAbsNormSgn, fA5 747 br.ret.sptk b0 748} 749;; 750 751// Here if |x| < 0.5 752.align 32 753erf_near_zero: 754{ .mfi 755 adds rCoeffAddr1 = 1280, rDataPtr // address of A9 756 fma.s1 fTSqr = fArgSqr, fArgSqr, f0 // x^4 757 nop.i 0 758} 759{ .mfi 760 adds rCoeffAddr2 = 1328, rDataPtr // address of A7 761 nop.f 0 762 nop.i 0 763} 764;; 765{ .mfi 766 ldfpd fA9, fA8 = [rCoeffAddr1], 16 767 nop.f 0 768 nop.i 0 769} 770{ .mfi 771 ldfpd fA7, fA6 = [rCoeffAddr2], 16 772 nop.f 0 773 nop.i 0 774} 775;; 776{ .mfi 777 ldfpd fA5, fA4 = [rCoeffAddr1], 16 778 nop.f 0 779 nop.i 0 780} 781{ .mfi 782 ldfpd fA3, fA2 = [rCoeffAddr2], 16 783 nop.f 0 784 nop.i 0 785} 786;; 787{ .mfi 788 ldfe fA1 = [rCoeffAddr1] 789 nop.f 0 790 nop.i 0 791} 792{ .mfi 793 ldfe fA0 = [rCoeffAddr2] 794 nop.f 0 795 nop.i 0 796} 797;; 798{ .mfi 799 nop.m 0 800 fma.s1 fTQuadr = fTSqr, fTSqr, f0 801 nop.i 0 802} 803;; 804{ .mfi 805 nop.m 0 806 fma.s1 fRes = fA9, fArgSqr, fA8 807 nop.i 0 808} 809{ .mfi 810 nop.m 0 811 fma.s1 fA7 = fA7, fArgSqr, fA6 812 nop.i 0 813} 814;; 815{ .mfi 816 nop.m 0 817 fma.s1 fA3 = fA3, fArgSqr, fA2 818 nop.i 0 819} 820{ .mfi 821 nop.m 0 822 fma.s1 fA5 = fA5, fArgSqr, fA4 823 nop.i 0 824} 825;; 826{ .mfi 827 nop.m 0 828 fma.s1 fA1 = fA1, fArgSqr, fA0 829 nop.i 0 830} 831{ .mfi 832 nop.m 0 833 fma.s1 fTQuadrSgn = fTQuadr, f8, f0 834 nop.i 0 835} 836;; 837{ .mfi 838 nop.m 0 839 fma.s1 fRes = fRes, fTSqr, fA7 840 nop.i 0 841} 842;; 843{ .mfi 844 nop.m 0 845 fma.s1 fA1 = fA3, fTSqr, fA1 846 nop.i 0 847} 848;; 849{ .mfi 850 nop.m 0 851 fma.s1 fRes = fRes, fTSqr, fA5 852 nop.i 0 853} 854;; 855{ .mfi 856 nop.m 0 857 fma.s1 fA1 = fA1, f8, f0 858 nop.i 0 859} 860;; 861{ .mfb 862 nop.m 0 863 fma.d.s0 f8 = fRes, fTQuadrSgn, fA1 // x*Pol9(x^2) 864 br.ret.sptk b0 // Exit for |x| < 0.5 865};; 866 867// Here if 5.90625 <= |x| < +inf 868.align 32 869erf_saturation: 870{ .mfi 871 adds rDataPtr = 1376, rDataPtr // address of A0 872 nop.f 0 873 nop.i 0 874} 875;; 876{ .mfi 877 ldfe fA0 = [rDataPtr] 878 nop.f 0 879 nop.i 0 880} 881;; 882{ .mfb 883 nop.m 0 884 fma.d.s0 f8 = fA0, fSignumX, f0 // sign(x)*(1.0 - 2^(-63)) 885 // Exit for 5.90625 <= |x| < +inf 886 br.ret.sptk b0 // Exit for 5.90625 <=|x|< +inf 887} 888;; 889 890// Here if x is double precision denormal 891.align 32 892erf_denormal: 893{ .mfi 894 adds rDataPtr = 1632, rDataPtr // address of A0 895 fclass.m p7,p8 = f8, 0x0a // is x -denormal ? 896 nop.i 0 897} 898;; 899{ .mfi 900 ldfe fA0 = [rDataPtr] // A0 901 nop.f 0 902 nop.i 0 903} 904;; 905{ .mfi 906 nop.m 0 907 fma.s1 fA0 = fA0,f8,f0 // A0*x 908 nop.i 0 909} 910;; 911{ .mfi 912 nop.m 0 913(p7) fma.d.s0 f8 = f8,f8,fA0 // -denormal 914 nop.i 0 915} 916{ .mfb 917 nop.m 0 918(p8) fnma.d.s0 f8 = f8,f8,fA0 // +denormal 919 br.ret.sptk b0 // Exit for denormal 920} 921;; 922 923GLOBAL_LIBM_END(erf) 924libm_alias_double_other (erf, erf) 925