.file "tanhf.s"


// Copyright (c) 2001 - 2005, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.

// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 05/30/01 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
// 03/31/05 Reformatted delimiters between data tables
//
// API
//==============================================================
// float tanhf(float)
//
// Overview of operation
//==============================================================
// Background
//
//
// There are 9 paths:
// 1. x = +/-0.0
//    Return tanhf(x) = +/-0.0
//
// 2. 0.0 < |x| < 0.3125
//    Return tanhf(x) = x + x^3*Pol3(x^2),
//    where Pol3(x^2) = C3*x^6 + C2*x^4 + C1*x^2 + C0
//
// 3. 0.3125 <= |x| < 8.0
//    Return tanhf(x) = sign(x)*PolD(x)*PolC(|x|) + sign(x)*PolA(|x|),
//    where sign(x)*PolD(x) = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4),
//          PolC(|x|) = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0,
//          PolA(|x|) = A3|x|^3 + A2*x^2 + A1*|x| + A0
//
//    Actually range 0.3125<=|x|< 8.0 is split to 5 subranges.
//    For each subrange there is particular set of coefficients.
//    Below is the list of subranges:
//    3.1 0.3125 <= |x| < 0.5
//    3.2 0.5 <= |x| < 1.0
//    3.3 1.0 <= |x| < 2.0
//    3.4 2.0 <= |x| < 4.0
//    3.5 4.0 <= |x| < 8.0
//
// 4. 8.0 <= |x| < 9.125
//    Return tanhf(x) = sign(x)*(A3|x|^3 + A2*x^2 + A1*|x| + A0)
//
// 5. 9.125 <= |x| < +INF
//    Return tanhf(x) = sign(x)*(1.0d - 2^(-53))
//
// 6. |x| = INF
//    Return tanhf(x) = sign(x) * 1.0
//
// 7. x = [S,Q]NaN
//    Return tanhf(x) = QNaN
//
// 8. x is positive denormal
//    Return tanhf(x) = x - x^2
//
// 9. x is negative denormal
//    Return tanhf(x) = x + x^2
//
// Registers used
//==============================================================
// Floating Point registers used:
// f8, input
// f32 -> f59

// General registers used:
// r32 -> r46, r2, r3

// Predicate registers used:
// p0, p6 -> p15

// p6           to filter out case when x = [Q,S]NaN or +/-0
// p7           to filter out case when x = denormal
// p8           set if |x| >= 0.3125, used also to process denormal input
// p9           to filter out case when |x| = inf
// p10          to filter out case when |x| < 0.3125
// p11          to filter out case when 0.3125 <= |x| < 9.125
// p12          to filter out case when |x| >= 9.125
// p13          to filter out case when 8.0 <= |x| < 9.125
// p14          set to 1 for positive x
// p15          set to 1 for negative x

// Assembly macros
//==============================================================
rDataPtr           = r2
rDataPtr1          = r3

rBias              = r33
rCoeffAddr3        = r34
rNearSaturation    = r35
rCoeffAddr1        = r36
rCoeffAddr2        = r37
rOffset2           = r38
rBias2             = r39
rMask              = r40
rArg               = r41
rBound             = r42
rSignBit           = r43
rAbsArg            = r44
rDataPtr2          = r45
rSaturation        = r46

//==============================================================
fA0                = f32
fA1                = f33
fA2                = f34
fA3                = f35
fC0                = f36
fC1                = f37
fC2                = f38
fC3                = f39
fD0                = f40
fD1                = f41
fD2                = f42
fB0                = f43
fArgSqr            = f44
fAbsArg            = f45
fSignumX           = f46
fArg4              = f47
fArg4Sgn           = f48
fArg3              = f49
fArg3Sgn           = f50
fArg7Sgn           = f51
fArg6Sgn           = f52
fPolC              = f53
fPolCTmp           = f54
fPolA              = f55
fPolATmp           = f56
fPolD              = f57
fPolDTmp           = f58
fArgSqrSgn         = f59

// Data tables
//==============================================================
//
// Layout of tanhf_data (byte offsets from the start of the object):
//     0 .. 319   5 sets of {C0,C1,C2,C3,D0,D1,D2,B0}, 64 bytes per subrange
//   320 .. 351   {A0..A3} for |x| < 0.3125 (loaded into fC0..fC3 by the code)
//   352 .. 511   5 sets of {A0,A1,A2,A3}, 32 bytes per subrange
//   512 .. 543   {A0..A3} for 8.0 <= |x| < 9.125
//   544          1.0 - 2^(-53)

RODATA

.align 16

LOCAL_OBJECT_START(tanhf_data)
// Polynomial coefficients for the tanh(x), 0.3125 <= |x| < 0.5
data8 0x3F9BEEDFDD177D7B // C0
data8 0x3F970D10C7F32458 // C1
data8 0x3F766D6B051F3A38 // C2
data8 0xBF732F2001B23402 // C3
data8 0xBF854BE1CE1ED499 // D0
data8 0x4013C944F3999A16 // D1
data8 0xC01106C6975222C0 // D2
data8 0x3F783D5ACCF9EBE8 // B0
// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
data8 0xBF5D631440786869 // C0
data8 0xBF575D79A0D52069 // C1
data8 0xBF7E2237B7EFC705 // C2
data8 0x3F6A7ACBC273041F // C3
data8 0xC040E32EA52D91EB // D0
data8 0x403D19463E5DB4D7 // D1
data8 0xC02216F61F759F39 // D2
data8 0xBF55B4EA0B844BE7 // B0
// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
data8 0x3F8637DBE5B3E690 // C0
data8 0xBF7F7FEC158C07F5 // C1
data8 0x3F711C586706838A // C2
data8 0xBF50EF7EF605554E // C3
data8 0xC054D45448354E25 // D0
data8 0x404ADFEEA282E730 // D1
data8 0xC028AEE456D59549 // D2
data8 0x3F25232D1BED59A8 // B0
// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 4.0
data8 0xBF52602285F2D06C // C0
data8 0x3F2E57C298FFE1E0 // C1
data8 0xBF15ED575DB3C811 // C2
data8 0x3EE428878A08525C // C3
data8 0xC0895A26849039C1 // D0
data8 0x406E3C60BBFBB575 // D1
data8 0xC03A06F62867C75A // D2
data8 0xBEB114C70F1C723E // B0
// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 8.0
data8 0x3EF4B22BD17039A3 // C0
data8 0xBEB704ADC040C57F // C1
data8 0x3E937A98288AFE1A // C2
data8 0xBE4F33B2C9FFE7E7 // C3
data8 0xC0BE48CFADE2431E // D0
data8 0x4090E74249760FDD // D1
data8 0xC04B6F537FCF2F1E // D2
data8 0x3E0DCD879C91ADEA // B0
// Polynomial coefficients for the tanh(x), -0.3125 < x < 0.3125
data8 0xBFD555551E8245B7 // A0
data8 0x3FC110E63F52E689 // A1
data8 0xBFAB8CD6A5B7BAFA // A2
data8 0x3F945D467FCEB553 // A3
// Polynomial coefficients for the tanh(x), 0.3125 <= |x| < 0.5
data8 0xBE3DCC92FCAECBB6 // A0
data8 0x3FF0000043B7D267 // A1
data8 0xBED18BF28ACFC4B1 // A2
data8 0xBFD554A56F82837E // A3
// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
data8 0x3EFD6054758539F9 // A0
data8 0x3FEFFBFC77198EBE // A1
data8 0x3F700327CA98D237 // A2
data8 0xBFD68955F5BB2FA1 // A3
// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
data8 0xBF71A53F229DF01B // A0
data8 0x3FF0AECFD730DE50 // A1
data8 0xBFC882F88E5DF3BA // A2
data8 0x3FC6EDF212CA2A8D // A3
// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 4.0
data8 0xBFAF0B712E9EDA47 // A0
data8 0x3FF1C208080BEA64 // A1
data8 0x3FC3D29B20C8946E // A2
data8 0xBFF04514ED900A6A // A3
// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 8.0
data8 0xBFB1DEA49A831CBC // A0
data8 0x3FFA729FC7085674 // A1
data8 0xBFF2F44D923A8FA4 // A2
data8 0x3FE092FC5712227E // A3
// Polynomial coefficients for the tanh(x), 8.0 <= |x| <= 9.125
data8 0x3FEFFF5769EE3041 // A0
data8 0x3EFBBF148D850891 // A1
data8 0xBEC86BCEF0F5C2FE // A2
data8 0x3E7CBA4F3A885A5C // A3
//
data8 0x3FEFFFFFFFFFFFFF // 1.0 - epsilon (largest double below 1.0)
LOCAL_OBJECT_END(tanhf_data)

.section .text
GLOBAL_LIBM_ENTRY(tanhf)

{ .mfi
      alloc           r32 = ar.pfs, 1, 14, 0, 0
      fmerge.s        fAbsArg = f1, f8             // |x|
      addl            rMask = 0x806, r0
}
{ .mfi
      addl            rDataPtr = @ltoff(tanhf_data), gp
      fma.s1          fArgSqr = f8, f8, f0         // x^2
      adds            rSignBit = 0x1, r0
}
;;

{ .mfi
      getf.s          rArg = f8                    // x in GR
      fclass.m        p7,p0 = f8, 0x0b             // is x denormal ?
      // 0x80600000: sign bit and 2 most bits in significand
      shl             rMask = rMask, 20
}
{ .mfi
      ld8             rDataPtr = [rDataPtr]
      nop.f           0
      // 4 * 0x7D, where 0x7D is the biased exponent of 0.3125f
      adds            rBias2 = 0x1F4, r0
}
;;

{ .mfi
      // value of rBias when the exponent of x is that of 8.0f
      adds            rNearSaturation = 0x14, r0
      fmerge.s        fSignumX = f8, f1            // signum(x)
      shl             rSignBit = rSignBit, 31      // mask for sign bit
}
{ .mfi
      adds            rBound = 0x3EA, r0
      nop.f           0
      addl            rSaturation = 0x4112, r0
}
;;

{ .mfi
      andcm           rOffset2 = rArg, rMask
      fclass.m        p6,p0 = f8, 0xc7             // is x [S,Q]NaN or +/-0 ?
      shl             rBound = rBound, 20          // 0.3125f in GR
}
{ .mfb
      andcm           rAbsArg = rArg, rSignBit     // |x| in GR
      nop.f           0
(p7)  br.cond.spnt    tanhf_denormal               // branch out if x is denormal
}
;;

{ .mfi
      // A coefficients of the first subrange (0.3125 <= |x| < 0.5)
      adds            rCoeffAddr2 = 352, rDataPtr
      fclass.m        p9,p0 = f8, 0x23             // is x +/- inf?
      shr             rOffset2 = rOffset2, 21      // 4 * biased exponent of x
}
{ .mfi
      cmp.lt          p10, p8 = rAbsArg, rBound    // |x| < 0.3125?
      nop.f           0
      adds            rCoeffAddr3 = 16, rDataPtr   // C2 of the first subrange
}
;;

{ .mfi
      // 4 * (biased exponent of x - 0x7D): 0, 4, 8, ... per subrange
(p8)  sub             rBias = rOffset2, rBias2
      fma.s1          fArg4 = fArgSqr, fArgSqr, f0 // x^4
      shl             rSaturation = rSaturation, 16 // 9.125f in GR
}
{ .mfb
      // |x| < 0.3125: select the coefficients at tanhf_data + 320
(p10) adds            rBias = 0x14, r0
(p6)  fma.s.s0        f8 = f8,f1,f8                // NaN or +/-0
(p6)  br.ret.spnt     b0                           // exit for x = NaN or +/-0
}
;;

{ .mfi
      // C0 of the selected set (64 bytes per subrange)
      shladd          rCoeffAddr1 = rBias, 4, rDataPtr
      fma.s1          fArg3Sgn = fArgSqr, f8, f0   // sign(x)*|x|^3
      // is |x| < 9.125?
      cmp.lt          p11, p12 = rAbsArg, rSaturation
}
{ .mfi
      // C2 of the selected set
      shladd          rCoeffAddr3 = rBias, 4, rCoeffAddr3
      fma.s1          fArg3 = fArgSqr, fAbsArg, f0 // |x|^3
      // A0 of the selected set (32 bytes per subrange)
      shladd          rCoeffAddr2 = rBias, 3, rCoeffAddr2
}
;;

{ .mfi
(p11) ldfpd           fC0, fC1 = [rCoeffAddr1]
(p9)  fmerge.s        f8 = f8,f1                   // +/- inf
      // address of the (1.0 - epsilon) entry
(p12) adds            rDataPtr = 544, rDataPtr
}
{ .mfb
(p11) ldfpd           fC2, fC3 = [rCoeffAddr3], 16
      nop.f           0
(p9)  br.ret.spnt     b0                           // exit for x = +/- inf
}
;;

{ .mfi
(p11) ldfpd           fA0, fA1 = [rCoeffAddr2], 16
      nop.f           0
(p8)  cmp.eq.unc      p13, p0 = rBias, rNearSaturation
}
{ .mfi
      add             rCoeffAddr1 = 48, rCoeffAddr1 // D2 of the selected set
      nop.f           0
      nop.i           0
}
;;

{ .mfi
(p11) ldfpd           fD0, fD1 = [rCoeffAddr3]
      nop.f           0
      nop.i           0
}
{ .mfb
(p11) ldfpd           fD2, fB0 = [rCoeffAddr1]
      // sign(x)*|x|^2
      fma.s1          fArgSqrSgn = fArgSqr, fSignumX, f0
(p10) br.cond.spnt    tanhf_near_zero
}
;;

{ .mfi
(p11) ldfpd           fA2, fA3 = [rCoeffAddr2], 16
      fcmp.lt.s1      p15, p14 = f8,f0
      nop.i           0
}
{ .mfb
(p12) ldfd            fA0 = [rDataPtr]
      fma.s1          fArg4Sgn = fArg4, fSignumX, f0 // sign(x)*|x|^4
(p12) br.cond.spnt    tanhf_saturation
}
;;
{ .mfi
      nop.m           0
      fma.s1          fArg7Sgn = fArg4, fArg3Sgn, f0 // sign(x)*|x|^7
      nop.i           0
}
{ .mfb
      nop.m           0
      fma.s1          fArg6Sgn = fArg3, fArg3Sgn, f0 // sign(x)*|x|^6
(p13) br.cond.spnt    tanhf_close_to_saturation
}
;;

{ .mfi
      nop.m           0
      fma.s1          fPolC = fC3, fAbsArg, fC2    // C3*|x| + C2
      nop.i           0
}
{ .mfi
      nop.m           0
      fma.s1          fPolCTmp = fC1, fAbsArg, fC0 // C1*|x| + C0
      nop.i           0
};;

{ .mfi
      nop.m           0
      fma.s1          fPolA = fA1, fAbsArg, fA0    // A1*|x| + A0
      nop.i           0
}
;;

{ .mfi
      nop.m           0
      fma.s1          fPolD = fD1, fAbsArg, fD0    // D1*|x| + D0
      nop.i           0
}
{ .mfi
      nop.m           0
      // sign(x)*(|x|^7 + D2*x^6)
      fma.s1          fPolDTmp = fArg6Sgn, fD2, fArg7Sgn
      nop.i           0
};;

{ .mfi
      nop.m           0
      fma.s1          fPolATmp = fA3, fAbsArg, fA2 // A3*|x| + A2
      nop.i           0
}
{ .mfi
      nop.m           0
      fma.s1          fB0 = fB0, fArg4, f0         // B0*x^4
      nop.i           0
};;

{ .mfi
      nop.m           0
      // C3*|x|^3 + C2*x^2 + C1*|x| + C0
      fma.s1          fPolC = fPolC, fArgSqr, fPolCTmp
      nop.i           0
}
;;

{ .mfi
      nop.m           0
      // PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4)
      fma.d.s1        fPolD = fPolD, fArg4Sgn, fPolDTmp
      nop.i           0
}
;;

{ .mfi
      nop.m           0
      // PolA = A3|x|^3 + A2*x^2 + A1*|x| + A0
      fma.d.s1        fPolA = fPolATmp, fArgSqr, fPolA
      nop.i           0
}
;;

{ .mfi
      nop.m           0
      // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
      fma.d.s1        fPolC = fPolC, f1, fB0
      nop.i           0
}
;;

{ .mfi
      nop.m           0
(p14) fma.s.s0        f8 = fPolC, fPolD, fPolA     // for positive x
      nop.i           0
}
{ .mfb
      nop.m           0
(p15) fms.s.s0        f8 = fPolC, fPolD, fPolA     // for negative x
      br.ret.sptk     b0                           // Exit for 0.3125 <=|x|< 8.0
};;


// Here if |x| < 0.3125
tanhf_near_zero:
{ .mfi
      nop.m           0
      fma.s1          fPolC = fC3, fArgSqr, fC2    // C3*x^2 + C2
      nop.i           0
}
{ .mfi
      nop.m           0
      fma.s1          fPolCTmp = fC1, fArgSqr, fC0 // C1*x^2 + C0
      nop.i           0
};;

{ .mfi
      nop.m           0
      // C3*x^6 + C2*x^4 + C1*x^2 + C0
      fma.s1          fPolC = fPolC, fArg4, fPolCTmp
      nop.i           0
};;

{ .mfb
      nop.m           0
      // x + x^3*(C3*x^6 + C2*x^4 + C1*x^2 + C0)
      fma.s.s0        f8 = fPolC, fArg3Sgn, f8
      br.ret.sptk     b0                           // Exit for |x| < 0.3125
};;

// Here if 9.125 <= |x| < +inf
tanhf_saturation:
{ .mfb
      nop.m           0
      fma.s.s0        f8 = fA0, fSignumX, f0       // sign(x)*(1.0d - 2^(-53))
      br.ret.sptk     b0                           // Exit for 9.125 <=|x|< +inf
}
;;

// Here if 8.0 <= |x| < 9.125
tanhf_close_to_saturation:
{ .mfi
      nop.m           0
      fma.s1          fPolATmp = fA1, fAbsArg, fA0 // A1*|x| + A0
      nop.i           0
}
{ .mfi
      nop.m           0
      fma.s1          fPolA = fA3, fAbsArg, fA2    // A3*|x| + A2
      nop.i           0
}
;;

.pred.rel "mutex", p14, p15
{ .mfi
      nop.m           0
      // for positive x
(p14) fma.s.s0        f8 = fPolA, fArgSqr, fPolATmp
      nop.i           0
}
{ .mfb
      nop.m           0
      // for negative x
(p15) fms.s.s0        f8 = fPolA, fArgSqrSgn, fPolATmp
      br.ret.sptk     b0                           // Exit for 8.0 <=|x|< 9.125
};;

// Here if x is single precision denormal
tanhf_denormal:
{ .mfi
      nop.m           0
      fclass.m        p7,p8 = f8, 0x0a             // is x -denormal ?
      nop.i           0
}
;;

{ .mfi
      nop.m           0
(p7)  fma.s.s0        f8 = f8,f8,f8                // -denormal
      nop.i           0
}
{ .mfb
      nop.m           0
(p8)  fnma.s.s0       f8 = f8,f8,f8                // +denormal
      br.ret.sptk     b0                           // Exit for denormal
}
;;

GLOBAL_LIBM_END(tanhf)
libm_alias_float_other (tanh, tanh)