1.file "erff.s" 2 3 4// Copyright (c) 2001 - 2005, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//============================================================== 41// 08/14/01 Initial version 42// 05/20/02 Cleaned up namespace and sf0 syntax 43// 02/06/03 Reordered header: .section, .global, .proc, .align 44// 03/31/05 Reformatted delimiters between data tables 45// 46// API 47//============================================================== 48// float erff(float) 49// 50// Overview of operation 51//============================================================== 52// Background 53// 54// 55// There are 8 paths: 56// 1. x = +/-0.0 57// Return erff(x) = +/-0.0 58// 59// 2. 0.0 < |x| < 0.125 60// Return erff(x) = x *Pol3(x^2), 61// where Pol3(x^2) = C3*x^6 + C2*x^4 + C1*x^2 + C0 62// 63// 3. 0.125 <= |x| < 4.0 64// Return erff(x) = sign(x)*PolD(x)*PolC(|x|) + sign(x)*PolA(|x|), 65// where sign(x)*PolD(x) = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4), 66// PolC(|x|) = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0, 67// PolA(|x|) = A3|x|^3 + A2*x^2 + A1*|x| + A0 68// 69// Actually range 0.125<=|x|< 4.0 is splitted to 5 subranges. 70// For each subrange there is particular set of coefficients. 71// Below is the list of subranges: 72// 3.1 0.125 <= |x| < 0.25 73// 3.2 0.25 <= |x| < 0.5 74// 3.3 0.5 <= |x| < 1.0 75// 3.4 1.0 <= |x| < 2.0 76// 3.5 2.0 <= |x| < 4.0 77// 78// 4. 4.0 <= |x| < +INF 79// Return erff(x) = sign(x)*(1.0d - 2^(-52)) 80// 81// 5. |x| = INF 82// Return erff(x) = sign(x) * 1.0 83// 84// 6. x = [S,Q]NaN 85// Return erff(x) = QNaN 86// 87// 7. x is positive denormal 88// Return erff(x) = C0*x - x^2, 89// where C0 = 2.0/sqrt(Pi) 90// 91// 8. x is negative denormal 92// Return erff(x) = C0*x + x^2, 93// where C0 = 2.0/sqrt(Pi) 94// 95// Registers used 96//============================================================== 97// Floating Point registers used: 98// f8, input 99// f32 -> f59 100 101// General registers used: 102// r32 -> r45, r2, r3 103 104// Predicate registers used: 105// p0, p6 -> p12, p14, p15 106 107// p6 to filter out case when x = [Q,S]NaN or +/-0 108// p7 to filter out case when x = denormal 109// p8 set if |x| >= 0.3125, used also to process denormal input 110// p9 to filter out case when |x| = inf 111// p10 to filter out case when |x| < 0.125 112// p11 to filter out case when 0.125 <= |x| < 4.0 113// p12 to filter out case when |x| >= 4.0 114// p14 set to 1 for positive x 115// p15 set to 1 for negative x 116 117// Assembly macros 118//============================================================== 119rDataPtr = r2 120rDataPtr1 = r3 121 122rBias = r33 123rCoeffAddr3 = r34 124rCoeffAddr1 = r35 125rCoeffAddr2 = r36 126rOffset2 = r37 127rBias2 = r38 128rMask = r39 129rArg = r40 130rBound = r41 131rSignBit = r42 132rAbsArg = r43 133rDataPtr2 = r44 134rSaturation = r45 135 136//============================================================== 137fA0 = f32 138fA1 = f33 139fA2 = f34 140fA3 = f35 141fC0 = f36 142fC1 = f37 143fC2 = f38 144fC3 = f39 145fD0 = f40 146fD1 = f41 147fD2 = f42 148fB0 = f43 149fArgSqr = f44 150fAbsArg = f45 151fSignumX = f46 152fArg4 = f47 153fArg4Sgn = f48 154fArg3 = f49 155fArg3Sgn = f50 156fArg7Sgn = f51 157fArg6Sgn = f52 158fPolC = f53 159fPolCTmp = f54 160fPolA = f55 161fPolATmp = f56 162fPolD = f57 163fPolDTmp = f58 164fArgSqrSgn = f59 165 166// Data tables 167//============================================================== 168 169RODATA 170 171.align 16 172 173LOCAL_OBJECT_START(erff_data) 174// Polynomial coefficients for the erf(x), 0.125 <= |x| < 0.25 175data8 0xBE4218BB56B49E66 // C0 176data8 0x3F7AFB8315DA322B // C1 177data8 0x3F615D6EBEE0CA32 // C2 178data8 0xBF468D71CF4F0918 // C3 179data8 0x40312115B0932F24 // D0 180data8 0xC0160D6CD0991EA3 // D1 181data8 0xBFE04A567A6DBE4A // D2 182data8 0xBF4207BC640D1509 // B0 183// Polynomial coefficients for the erf(x), 0.25 <= |x| < 0.5 184data8 0x3F90849356383F58 // C0 185data8 0x3F830BD5BA240F09 // C1 186data8 0xBF3FA4970E2BCE23 // C2 187data8 0xBF6061798E58D0FD // C3 188data8 0xBF68C0D83DD22E02 // D0 189data8 0x401C0A9EE4108F94 // D1 190data8 0xC01056F9B5E387F5 // D2 191data8 0x3F1C9744E36A5706 // B0 192// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0 193data8 0x3F85F7D419A13DE3 // C0 194data8 0x3F791A13FF66D45A // C1 195data8 0x3F46B17B16B5929F // C2 196data8 0xBF5124947A8BF45E // C3 197data8 0x3FA1B3FD95EA9564 // D0 198data8 0x40250CECD79A020A // D1 199data8 0xC0190DC96FF66CCD // D2 200data8 0x3F4401AE28BA4DD5 // B0 201// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0 202data8 0xBF49E07E3584C3AE // C0 203data8 0x3F3166621131445C // C1 204data8 0xBF65B7FC1EAC2099 // C2 205data8 0x3F508C6BD211D736 // C3 206data8 0xC053FABD70601067 // D0 207data8 0x404A06640EE87808 // D1 208data8 0xC0283F30817A3F08 // D2 209data8 0xBF2F6DBBF4D6257F // B0 210// Polynomial coefficients for the erf(x), 2.0 <= |x| < 4.0 211data8 0xBF849855D67E9407 // C0 212data8 0x3F5ECA5FEC01C70C // C1 213data8 0xBF483110C30FABA4 // C2 214data8 0x3F1618DA72860403 // C3 215data8 0xC08A5C9D5FE8B9F6 // D0 216data8 0x406EFF5F088CEC4B // D1 217data8 0xC03A5743DF38FDE0 // D2 218data8 0xBEE397A9FA5686A2 // B0 219// Polynomial coefficients for the erf(x), -0.125 < x < 0.125 220data8 0x3FF20DD7504270CB // C0 221data8 0xBFD8127465AFE719 // C1 222data8 0x3FBCE2D77791DD77 // C2 223data8 0xBF9B582755CDF345 // C3 224// Polynomial coefficients for the erf(x), 0.125 <= |x| < 0.25 225data8 0xBD54E7E451AF0E36 // A0 226data8 0x3FF20DD75043FE20 // A1 227data8 0xBE05680ACF8280E4 // A2 228data8 0xBFD812745E92C3D3 // A3 229// Polynomial coefficients for the erf(x), 0.25 <= |x| < 0.5 230data8 0xBE1ACEC2859CB55F // A0 231data8 0x3FF20DD75E8D2B64 // A1 232data8 0xBEABC6A83208FCFC // A2 233data8 0xBFD81253E42E7B99 // A3 234// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0 235data8 0x3EABD5A2482B4979 // A0 236data8 0x3FF20DCAA52085D5 // A1 237data8 0x3F13A994A348795B // A2 238data8 0xBFD8167B2DFCDE44 // A3 239// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0 240data8 0xBF5BA377DDAB4E17 // A0 241data8 0x3FF2397F1D8FC0ED // A1 242data8 0xBF9945BFC1915C21 // A2 243data8 0xBFD747AAABB690D8 // A3 244// Polynomial coefficients for the erf(x), 2.0 <= |x| < 4.0 245data8 0x3FF0E2920E0391AF // A0 246data8 0xC00D249D1A95A5AE // A1 247data8 0x40233905061C3803 // A2 248data8 0xC027560B851F7690 // A3 249// 250data8 0x3FEFFFFFFFFFFFFF // 1.0 - epsilon 251data8 0x3FF20DD750429B6D // C0 = 2.0/sqrt(Pi) 252LOCAL_OBJECT_END(erff_data) 253 254 255.section .text 256GLOBAL_LIBM_ENTRY(erff) 257 258{ .mfi 259 alloc r32 = ar.pfs, 0, 14, 0, 0 260 fmerge.s fAbsArg = f1, f8 // |x| 261 addl rMask = 0x806, r0 262} 263{ .mfi 264 addl rDataPtr = @ltoff(erff_data), gp 265 fma.s1 fArgSqr = f8, f8, f0 // x^2 266 adds rSignBit = 0x1, r0 267} 268;; 269 270{ .mfi 271 getf.s rArg = f8 // x in GR 272 fclass.m p7,p0 = f8, 0x0b // is x denormal ? 273 // sign bit and 2 most bits in significand 274 shl rMask = rMask, 20 275} 276{ .mfi 277 ld8 rDataPtr = [rDataPtr] 278 nop.f 0 279 adds rBias2 = 0x1F0, r0 280} 281;; 282 283{ .mfi 284 nop.m 0 285 fmerge.s fSignumX = f8, f1 // signum(x) 286 shl rSignBit = rSignBit, 31 // mask for sign bit 287} 288{ .mfi 289 adds rBound = 0x3E0, r0 290 nop.f 0 291 adds rSaturation = 0x408, r0 292} 293;; 294 295{ .mfi 296 andcm rOffset2 = rArg, rMask 297 fclass.m p6,p0 = f8, 0xc7 // is x [S,Q]NaN or +/-0 ? 298 shl rBound = rBound, 20 // 0.125f in GR 299} 300{ .mfb 301 andcm rAbsArg = rArg, rSignBit // |x| in GR 302 nop.f 0 303(p7) br.cond.spnt erff_denormal // branch out if x is denormal 304} 305;; 306 307{ .mfi 308 adds rCoeffAddr2 = 352, rDataPtr 309 fclass.m p9,p0 = f8, 0x23 // is x +/- inf? 310 shr rOffset2 = rOffset2, 21 311} 312{ .mfi 313 cmp.lt p10, p8 = rAbsArg, rBound // |x| < 0.125? 314 nop.f 0 315 adds rCoeffAddr3 = 16, rDataPtr 316} 317;; 318 319{ .mfi 320(p8) sub rBias = rOffset2, rBias2 321 fma.s1 fArg4 = fArgSqr, fArgSqr, f0 // x^4 322 shl rSaturation = rSaturation, 20// 4.0 in GR (saturation bound) 323} 324{ .mfb 325(p10) adds rBias = 0x14, r0 326(p6) fma.s.s0 f8 = f8,f1,f8 // NaN or +/-0 327(p6) br.ret.spnt b0 // exit for x = NaN or +/-0 328} 329;; 330 331{ .mfi 332 shladd rCoeffAddr1 = rBias, 4, rDataPtr 333 fma.s1 fArg3Sgn = fArgSqr, f8, f0 // sign(x)*|x|^3 334 // is |x| < 4.0? 335 cmp.lt p11, p12 = rAbsArg, rSaturation 336} 337{ .mfi 338 shladd rCoeffAddr3 = rBias, 4, rCoeffAddr3 339 fma.s1 fArg3 = fArgSqr, fAbsArg, f0 // |x|^3 340 shladd rCoeffAddr2 = rBias, 3, rCoeffAddr2 341} 342;; 343 344{ .mfi 345(p11) ldfpd fC0, fC1 = [rCoeffAddr1] 346(p9) fmerge.s f8 = f8,f1 // +/- inf 347(p12) adds rDataPtr = 512, rDataPtr 348} 349{ .mfb 350(p11) ldfpd fC2, fC3 = [rCoeffAddr3], 16 351 nop.f 0 352(p9) br.ret.spnt b0 // exit for x = +/- inf 353} 354;; 355 356{ .mfi 357(p11) ldfpd fA0, fA1 = [rCoeffAddr2], 16 358 nop.f 0 359 nop.i 0 360} 361{ .mfi 362 add rCoeffAddr1 = 48, rCoeffAddr1 363 nop.f 0 364 nop.i 0 365} 366;; 367 368{ .mfi 369(p11) ldfpd fD0, fD1 = [rCoeffAddr3] 370 nop.f 0 371 nop.i 0 372} 373{ .mfb 374(p11) ldfpd fD2, fB0 = [rCoeffAddr1] 375 // sign(x)*|x|^2 376 fma.s1 fArgSqrSgn = fArgSqr, fSignumX, f0 377(p10) br.cond.spnt erff_near_zero 378} 379;; 380 381{ .mfi 382(p11) ldfpd fA2, fA3 = [rCoeffAddr2], 16 383 fcmp.lt.s1 p15, p14 = f8,f0 384 nop.i 0 385} 386{ .mfb 387(p12) ldfd fA0 = [rDataPtr] 388 fma.s1 fArg4Sgn = fArg4, fSignumX, f0 // sign(x)*|x|^4 389(p12) br.cond.spnt erff_saturation 390} 391;; 392{ .mfi 393 nop.m 0 394 fma.s1 fArg7Sgn = fArg4, fArg3Sgn, f0 // sign(x)*|x|^7 395 nop.i 0 396} 397{ .mfi 398 nop.m 0 399 fma.s1 fArg6Sgn = fArg3, fArg3Sgn, f0 // sign(x)*|x|^6 400 nop.i 0 401} 402;; 403 404{ .mfi 405 nop.m 0 406 fma.s1 fPolC = fC3, fAbsArg, fC2 // C3*|x| + C2 407 nop.i 0 408} 409{ .mfi 410 nop.m 0 411 fma.s1 fPolCTmp = fC1, fAbsArg, fC0 // C1*|x| + C0 412 nop.i 0 413};; 414 415{ .mfi 416 nop.m 0 417 fma.s1 fPolA = fA1, fAbsArg, fA0 // A1*|x| + A0 418 nop.i 0 419} 420;; 421 422{ .mfi 423 nop.m 0 424 fma.s1 fPolD = fD1, fAbsArg, fD0 // D1*|x| + D0 425 nop.i 0 426} 427{ .mfi 428 nop.m 0 429 // sign(x)*(|x|^7 + D2*x^6) 430 fma.s1 fPolDTmp = fArg6Sgn, fD2, fArg7Sgn 431 nop.i 0 432};; 433 434{ .mfi 435 nop.m 0 436 fma.s1 fPolATmp = fA3, fAbsArg, fA2 // A3*|x| + A2 437 nop.i 0 438} 439{ .mfi 440 nop.m 0 441 fma.s1 fB0 = fB0, fArg4, f0 // B0*x^4 442 nop.i 0 443};; 444 445{ .mfi 446 nop.m 0 447 // C3*|x|^3 + C2*x^2 + C1*|x| + C0 448 fma.s1 fPolC = fPolC, fArgSqr, fPolCTmp 449 nop.i 0 450} 451;; 452 453{ .mfi 454 nop.m 0 455 // PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4) 456 fma.d.s1 fPolD = fPolD, fArg4Sgn, fPolDTmp 457 nop.i 0 458} 459;; 460 461{ .mfi 462 nop.m 0 463 // PolA = A3|x|^3 + A2*x^2 + A1*|x| + A0 464 fma.d.s1 fPolA = fPolATmp, fArgSqr, fPolA 465 nop.i 0 466} 467;; 468 469{ .mfi 470 nop.m 0 471 // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0 472 fma.d.s1 fPolC = fPolC, f1, fB0 473 nop.i 0 474} 475;; 476 477{ .mfi 478 nop.m 0 479(p14) fma.s.s0 f8 = fPolC, fPolD, fPolA // for positive x 480 nop.i 0 481} 482{ .mfb 483 nop.m 0 484(p15) fms.s.s0 f8 = fPolC, fPolD, fPolA // for negative x 485 br.ret.sptk b0 // Exit for 0.125 <=|x|< 4.0 486};; 487 488 489// Here if |x| < 0.125 490erff_near_zero: 491{ .mfi 492 nop.m 0 493 fma.s1 fPolC = fC3, fArgSqr, fC2 // C3*x^2 + C2 494 nop.i 0 495} 496{ .mfi 497 nop.m 0 498 fma.s1 fPolCTmp = fC1, fArgSqr, fC0 // C1*x^2 + C0 499 nop.i 0 500};; 501 502{ .mfi 503 nop.m 0 504 fma.s1 fPolC = fPolC, fArg4, fPolCTmp // C3*x^6 + C2*x^4 + C1*x^2 + C0 505 nop.i 0 506};; 507 508{ .mfb 509 nop.m 0 510 // x*(C3*x^6 + C2*x^4 + C1*x^2 + C0) 511 fma.s.s0 f8 = fPolC, f8, f0 512 br.ret.sptk b0 // Exit for |x| < 0.125 513};; 514 515// Here if 4.0 <= |x| < +inf 516erff_saturation: 517{ .mfb 518 nop.m 0 519 fma.s.s0 f8 = fA0, fSignumX, f0 // sign(x)*(1.0d - 2^(-52)) 520 // Exit for 4.0 <= |x| < +inf 521 br.ret.sptk b0 // Exit for 4.0 <=|x|< +inf 522} 523;; 524 525// Here if x is single precision denormal 526erff_denormal: 527{ .mfi 528 adds rDataPtr = 520, rDataPtr // address of C0 529 fclass.m p7,p8 = f8, 0x0a // is x -denormal ? 530 nop.i 0 531} 532;; 533{ .mfi 534 ldfd fC0 = [rDataPtr] // C0 535 nop.f 0 536 nop.i 0 537} 538;; 539{ .mfi 540 nop.m 0 541 fma.s1 fC0 = fC0,f8,f0 // C0*x 542 nop.i 0 543} 544;; 545{ .mfi 546 nop.m 0 547(p7) fma.s.s0 f8 = f8,f8,fC0 // -denormal 548 nop.i 0 549} 550{ .mfb 551 nop.m 0 552(p8) fnma.s.s0 f8 = f8,f8,fC0 // +denormal 553 br.ret.sptk b0 // Exit for denormal 554} 555;; 556 557GLOBAL_LIBM_END(erff) 558libm_alias_float_other (erf, erf) 559