1.file "atanhl.s" 2 3 4// Copyright (c) 2001 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT 25// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL, 28// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code,and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39//********************************************************************* 40// 41// History: 42// 09/10/01 Initial version 43// 12/11/01 Corrected .restore syntax 44// 05/20/02 Cleaned up namespace and sf0 syntax 45// 02/10/03 Reordered header: .section, .global, .proc, .align; 46// used data8 for long double table values 47// 48//********************************************************************* 49// 50//********************************************************************* 51// 52// Function: atanhl(x) computes the principle value of the inverse 53// hyperbolic tangent of x. 54// 55//********************************************************************* 56// 57// Resources Used: 58// 59// Floating-Point Registers: f8 (Input and Return Value) 60// f33-f73 61// 62// General Purpose Registers: 63// r32-r52 64// r49-r52 (Used to pass arguments to error handling routine) 65// 66// Predicate Registers: p6-p15 67// 68//********************************************************************* 69// 70// IEEE Special Conditions: 71// 72// atanhl(inf) = QNaN 73// atanhl(-inf) = QNaN 74// atanhl(+/-0) = +/-0 75// atanhl(1) = +inf 76// atanhl(-1) = -inf 77// atanhl(|x|>1) = QNaN 78// atanhl(SNaN) = QNaN 79// atanhl(QNaN) = QNaN 80// 81//********************************************************************* 82// 83// Overview 84// 85// The method consists of two cases. 86// 87// If |x| < 1/32 use case atanhl_near_zero; 88// else use case atanhl_regular; 89// 90// Case atanhl_near_zero: 91// 92// atanhl(x) can be approximated by the Taylor series expansion 93// up to order 17. 94// 95// Case atanhl_regular: 96// 97// Here we use formula atanhl(x) = sign(x)*log1pl(2*|x|/(1-|x|))/2 and 98// calculation is subdivided into two stages. The first stage is 99// calculating of X = 2*|x|/(1-|x|). The second one is calculating of 100// sign(x)*log1pl(X)/2. To obtain required accuracy we use precise division 101// algorithm output of which is a pair of two extended precision values those 102// approximate result of division with accuracy higher than working 103// precision. This pair is passed to modified log1pl function. 104// 105// 106// 1. calculating of X = 2*|x|/(1-|x|) 107// ( based on Peter Markstein's "IA-64 and Elementary Functions" book ) 108// ******************************************************************** 109// 110// a = 2*|x| 111// b = 1 - |x| 112// b_lo = |x| - (1 - b) 113// 114// y = frcpa(b) initial approximation of 1/b 115// q = a*y initial approximation of a/b 116// 117// e = 1 - b*y 118// e2 = e + e^2 119// e1 = e^2 120// y1 = y + y*e2 = y + y*(e+e^2) 121// 122// e3 = e + e1^2 123// y2 = y + y1*e3 = y + y*(e+e^2+..+e^6) 124// 125// r = a - b*q 126// e = 1 - b*y2 127// X = q + r*y2 high part of a/b 128// 129// y3 = y2 + y2*e4 130// r1 = a - b*X 131// r1 = r1 - b_lo*X 132// X_lo = r1*y3 low part of a/b 133// 134// 2. special log1p algorithm overview 135// *********************************** 136// 137// Here we use a table lookup method. The basic idea is that in 138// order to compute logl(Arg) = log1pl (Arg-1) for an argument Arg in [1,2), 139// we construct a value G such that G*Arg is close to 1 and that 140// logl(1/G) is obtainable easily from a table of values calculated 141// beforehand. Thus 142// 143// logl(Arg) = logl(1/G) + logl(G*Arg) 144// = logl(1/G) + logl(1 + (G*Arg - 1)) 145// 146// Because |G*Arg - 1| is small, the second term on the right hand 147// side can be approximated by a short polynomial. We elaborate 148// this method in several steps. 149// 150// Step 0: Initialization 151// ------ 152// We need to calculate logl(X + X_lo + 1). Obtain N, S_hi such that 153// 154// X + X_lo + 1 = 2^N * ( S_hi + S_lo ) exactly 155// 156// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense 157// that |S_lo| <= ulp(S_hi). 158// 159// For the special version of log1p we add X_lo to S_lo (S_lo = S_lo + X_lo) 160// !-----------------------------------------------------------------------! 161// 162// Step 1: Argument Reduction 163// ------ 164// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate 165// 166// G := G_1 * G_2 * G_3 167// r := (G * S_hi - 1) + G * S_lo 168// 169// These G_j's have the property that the product is exactly 170// representable and that |r| < 2^(-12) as a result. 171// 172// Step 2: Approximation 173// ------ 174// logl(1 + r) is approximated by a short polynomial poly(r). 175// 176// Step 3: Reconstruction 177// ------ 178// Finally, log1pl(X + X_lo) = logl(X + X_lo + 1) is given by 179// 180// logl(X + X_lo + 1) = logl(2^N * (S_hi + S_lo)) 181// ~=~ N*logl(2) + logl(1/G) + logl(1 + r) 182// ~=~ N*logl(2) + logl(1/G) + poly(r). 183// 184// For detailed description see log1p1 function, regular path. 185// 186//********************************************************************* 187 188RODATA 189.align 64 190 191// ************* DO NOT CHANGE THE ORDER OF THESE TABLES ************* 192 193LOCAL_OBJECT_START(Constants_TaylorSeries) 194data8 0xF0F0F0F0F0F0F0F1,0x00003FFA // C17 195data8 0x8888888888888889,0x00003FFB // C15 196data8 0x9D89D89D89D89D8A,0x00003FFB // C13 197data8 0xBA2E8BA2E8BA2E8C,0x00003FFB // C11 198data8 0xE38E38E38E38E38E,0x00003FFB // C9 199data8 0x9249249249249249,0x00003FFC // C7 200data8 0xCCCCCCCCCCCCCCCD,0x00003FFC // C5 201data8 0xAAAAAAAAAAAAAAAA,0x00003FFD // C3 202data4 0x3f000000 // 1/2 203data4 0x00000000 // pad 204data4 0x00000000 205data4 0x00000000 206LOCAL_OBJECT_END(Constants_TaylorSeries) 207 208LOCAL_OBJECT_START(Constants_Q) 209data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 // log2_hi 210data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 // log2_lo 211data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 // Q4 212data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 // Q3 213data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 // Q2 214data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 // Q1 215LOCAL_OBJECT_END(Constants_Q) 216 217 218// Z1 - 16 bit fixed 219LOCAL_OBJECT_START(Constants_Z_1) 220data4 0x00008000 221data4 0x00007879 222data4 0x000071C8 223data4 0x00006BCB 224data4 0x00006667 225data4 0x00006187 226data4 0x00005D18 227data4 0x0000590C 228data4 0x00005556 229data4 0x000051EC 230data4 0x00004EC5 231data4 0x00004BDB 232data4 0x00004925 233data4 0x0000469F 234data4 0x00004445 235data4 0x00004211 236LOCAL_OBJECT_END(Constants_Z_1) 237 238// G1 and H1 - IEEE single and h1 - IEEE double 239LOCAL_OBJECT_START(Constants_G_H_h1) 240data4 0x3F800000,0x00000000 241data8 0x0000000000000000 242data4 0x3F70F0F0,0x3D785196 243data8 0x3DA163A6617D741C 244data4 0x3F638E38,0x3DF13843 245data8 0x3E2C55E6CBD3D5BB 246data4 0x3F579430,0x3E2FF9A0 247data8 0xBE3EB0BFD86EA5E7 248data4 0x3F4CCCC8,0x3E647FD6 249data8 0x3E2E6A8C86B12760 250data4 0x3F430C30,0x3E8B3AE7 251data8 0x3E47574C5C0739BA 252data4 0x3F3A2E88,0x3EA30C68 253data8 0x3E20E30F13E8AF2F 254data4 0x3F321640,0x3EB9CEC8 255data8 0xBE42885BF2C630BD 256data4 0x3F2AAAA8,0x3ECF9927 257data8 0x3E497F3497E577C6 258data4 0x3F23D708,0x3EE47FC5 259data8 0x3E3E6A6EA6B0A5AB 260data4 0x3F1D89D8,0x3EF8947D 261data8 0xBDF43E3CD328D9BE 262data4 0x3F17B420,0x3F05F3A1 263data8 0x3E4094C30ADB090A 264data4 0x3F124920,0x3F0F4303 265data8 0xBE28FBB2FC1FE510 266data4 0x3F0D3DC8,0x3F183EBF 267data8 0x3E3A789510FDE3FA 268data4 0x3F088888,0x3F20EC80 269data8 0x3E508CE57CC8C98F 270data4 0x3F042108,0x3F29516A 271data8 0xBE534874A223106C 272LOCAL_OBJECT_END(Constants_G_H_h1) 273 274// Z2 - 16 bit fixed 275LOCAL_OBJECT_START(Constants_Z_2) 276data4 0x00008000 277data4 0x00007F81 278data4 0x00007F02 279data4 0x00007E85 280data4 0x00007E08 281data4 0x00007D8D 282data4 0x00007D12 283data4 0x00007C98 284data4 0x00007C20 285data4 0x00007BA8 286data4 0x00007B31 287data4 0x00007ABB 288data4 0x00007A45 289data4 0x000079D1 290data4 0x0000795D 291data4 0x000078EB 292LOCAL_OBJECT_END(Constants_Z_2) 293 294// G2 and H2 - IEEE single and h2 - IEEE double 295LOCAL_OBJECT_START(Constants_G_H_h2) 296data4 0x3F800000,0x00000000 297data8 0x0000000000000000 298data4 0x3F7F00F8,0x3B7F875D 299data8 0x3DB5A11622C42273 300data4 0x3F7E03F8,0x3BFF015B 301data8 0x3DE620CF21F86ED3 302data4 0x3F7D08E0,0x3C3EE393 303data8 0xBDAFA07E484F34ED 304data4 0x3F7C0FC0,0x3C7E0586 305data8 0xBDFE07F03860BCF6 306data4 0x3F7B1880,0x3C9E75D2 307data8 0x3DEA370FA78093D6 308data4 0x3F7A2328,0x3CBDC97A 309data8 0x3DFF579172A753D0 310data4 0x3F792FB0,0x3CDCFE47 311data8 0x3DFEBE6CA7EF896B 312data4 0x3F783E08,0x3CFC15D0 313data8 0x3E0CF156409ECB43 314data4 0x3F774E38,0x3D0D874D 315data8 0xBE0B6F97FFEF71DF 316data4 0x3F766038,0x3D1CF49B 317data8 0xBE0804835D59EEE8 318data4 0x3F757400,0x3D2C531D 319data8 0x3E1F91E9A9192A74 320data4 0x3F748988,0x3D3BA322 321data8 0xBE139A06BF72A8CD 322data4 0x3F73A0D0,0x3D4AE46F 323data8 0x3E1D9202F8FBA6CF 324data4 0x3F72B9D0,0x3D5A1756 325data8 0xBE1DCCC4BA796223 326data4 0x3F71D488,0x3D693B9D 327data8 0xBE049391B6B7C239 328LOCAL_OBJECT_END(Constants_G_H_h2) 329 330// G3 and H3 - IEEE single and h3 - IEEE double 331LOCAL_OBJECT_START(Constants_G_H_h3) 332data4 0x3F7FFC00,0x38800100 333data8 0x3D355595562224CD 334data4 0x3F7FF400,0x39400480 335data8 0x3D8200A206136FF6 336data4 0x3F7FEC00,0x39A00640 337data8 0x3DA4D68DE8DE9AF0 338data4 0x3F7FE400,0x39E00C41 339data8 0xBD8B4291B10238DC 340data4 0x3F7FDC00,0x3A100A21 341data8 0xBD89CCB83B1952CA 342data4 0x3F7FD400,0x3A300F22 343data8 0xBDB107071DC46826 344data4 0x3F7FCC08,0x3A4FF51C 345data8 0x3DB6FCB9F43307DB 346data4 0x3F7FC408,0x3A6FFC1D 347data8 0xBD9B7C4762DC7872 348data4 0x3F7FBC10,0x3A87F20B 349data8 0xBDC3725E3F89154A 350data4 0x3F7FB410,0x3A97F68B 351data8 0xBD93519D62B9D392 352data4 0x3F7FAC18,0x3AA7EB86 353data8 0x3DC184410F21BD9D 354data4 0x3F7FA420,0x3AB7E101 355data8 0xBDA64B952245E0A6 356data4 0x3F7F9C20,0x3AC7E701 357data8 0x3DB4B0ECAABB34B8 358data4 0x3F7F9428,0x3AD7DD7B 359data8 0x3D9923376DC40A7E 360data4 0x3F7F8C30,0x3AE7D474 361data8 0x3DC6E17B4F2083D3 362data4 0x3F7F8438,0x3AF7CBED 363data8 0x3DAE314B811D4394 364data4 0x3F7F7C40,0x3B03E1F3 365data8 0xBDD46F21B08F2DB1 366data4 0x3F7F7448,0x3B0BDE2F 367data8 0xBDDC30A46D34522B 368data4 0x3F7F6C50,0x3B13DAAA 369data8 0x3DCB0070B1F473DB 370data4 0x3F7F6458,0x3B1BD766 371data8 0xBDD65DDC6AD282FD 372data4 0x3F7F5C68,0x3B23CC5C 373data8 0xBDCDAB83F153761A 374data4 0x3F7F5470,0x3B2BC997 375data8 0xBDDADA40341D0F8F 376data4 0x3F7F4C78,0x3B33C711 377data8 0x3DCD1BD7EBC394E8 378data4 0x3F7F4488,0x3B3BBCC6 379data8 0xBDC3532B52E3E695 380data4 0x3F7F3C90,0x3B43BAC0 381data8 0xBDA3961EE846B3DE 382data4 0x3F7F34A0,0x3B4BB0F4 383data8 0xBDDADF06785778D4 384data4 0x3F7F2CA8,0x3B53AF6D 385data8 0x3DCC3ED1E55CE212 386data4 0x3F7F24B8,0x3B5BA620 387data8 0xBDBA31039E382C15 388data4 0x3F7F1CC8,0x3B639D12 389data8 0x3D635A0B5C5AF197 390data4 0x3F7F14D8,0x3B6B9444 391data8 0xBDDCCB1971D34EFC 392data4 0x3F7F0CE0,0x3B7393BC 393data8 0x3DC7450252CD7ADA 394data4 0x3F7F04F0,0x3B7B8B6D 395data8 0xBDB68F177D7F2A42 396LOCAL_OBJECT_END(Constants_G_H_h3) 397 398 399 400// Floating Point Registers 401 402FR_C17 = f50 403FR_C15 = f51 404FR_C13 = f52 405FR_C11 = f53 406FR_C9 = f54 407FR_C7 = f55 408FR_C5 = f56 409FR_C3 = f57 410FR_x2 = f58 411FR_x3 = f59 412FR_x4 = f60 413FR_x8 = f61 414 415FR_Rcp = f61 416 417FR_A = f33 418FR_R1 = f33 419 420FR_E1 = f34 421FR_E3 = f34 422FR_Y2 = f34 423FR_Y3 = f34 424 425FR_E2 = f35 426FR_Y1 = f35 427 428FR_B = f36 429FR_Y0 = f37 430FR_E0 = f38 431FR_E4 = f39 432FR_Q0 = f40 433FR_R0 = f41 434FR_B_lo = f42 435 436FR_abs_x = f43 437FR_Bp = f44 438FR_Bn = f45 439FR_Yp = f46 440FR_Yn = f47 441 442FR_X = f48 443FR_BB = f48 444FR_X_lo = f49 445 446FR_G = f50 447FR_Y_hi = f51 448FR_H = f51 449FR_h = f52 450FR_G2 = f53 451FR_H2 = f54 452FR_h2 = f55 453FR_G3 = f56 454FR_H3 = f57 455FR_h3 = f58 456 457FR_Q4 = f59 458FR_poly_lo = f59 459FR_Y_lo = f59 460 461FR_Q3 = f60 462FR_Q2 = f61 463 464FR_Q1 = f62 465FR_poly_hi = f62 466 467FR_float_N = f63 468 469FR_AA = f64 470FR_S_lo = f64 471 472FR_S_hi = f65 473FR_r = f65 474 475FR_log2_hi = f66 476FR_log2_lo = f67 477FR_Z = f68 478FR_2_to_minus_N = f69 479FR_rcub = f70 480FR_rsq = f71 481FR_05r = f72 482FR_Half = f73 483 484FR_Arg_X = f50 485FR_Arg_Y = f0 486FR_RESULT = f8 487 488 489 490// General Purpose Registers 491 492GR_ad_05 = r33 493GR_Index1 = r34 494GR_ArgExp = r34 495GR_Index2 = r35 496GR_ExpMask = r35 497GR_NearZeroBound = r36 498GR_signif = r36 499GR_X_0 = r37 500GR_X_1 = r37 501GR_X_2 = r38 502GR_Index3 = r38 503GR_minus_N = r39 504GR_Z_1 = r40 505GR_Z_2 = r40 506GR_N = r41 507GR_Bias = r42 508GR_M = r43 509GR_ad_taylor = r44 510GR_ad_taylor_2 = r45 511GR_ad2_tbl_3 = r45 512GR_ad_tbl_1 = r46 513GR_ad_tbl_2 = r47 514GR_ad_tbl_3 = r48 515GR_ad_q = r49 516GR_ad_z_1 = r50 517GR_ad_z_2 = r51 518GR_ad_z_3 = r52 519 520// 521// Added for unwind support 522// 523GR_SAVE_PFS = r46 524GR_SAVE_B0 = r47 525GR_SAVE_GP = r48 526GR_Parameter_X = r49 527GR_Parameter_Y = r50 528GR_Parameter_RESULT = r51 529GR_Parameter_TAG = r52 530 531 532 533.section .text 534GLOBAL_LIBM_ENTRY(atanhl) 535 536{ .mfi 537 alloc r32 = ar.pfs,0,17,4,0 538 fnma.s1 FR_Bp = f8,f1,f1 // b = 1 - |arg| (for x>0) 539 mov GR_ExpMask = 0x1ffff 540} 541{ .mfi 542 addl GR_ad_taylor = @ltoff(Constants_TaylorSeries),gp 543 fma.s1 FR_Bn = f8,f1,f1 // b = 1 - |arg| (for x<0) 544 mov GR_NearZeroBound = 0xfffa // biased exp of 1/32 545};; 546{ .mfi 547 getf.exp GR_ArgExp = f8 548 fcmp.lt.s1 p6,p7 = f8,f0 // is negative? 549 nop.i 0 550} 551{ .mfi 552 ld8 GR_ad_taylor = [GR_ad_taylor] 553 fmerge.s FR_abs_x = f1,f8 554 nop.i 0 555};; 556{ .mfi 557 nop.m 0 558 fclass.m p8,p0 = f8,0x1C7 // is arg NaT,Q/SNaN or +/-0 ? 559 nop.i 0 560} 561{ .mfi 562 nop.m 0 563 fma.s1 FR_x2 = f8,f8,f0 564 nop.i 0 565};; 566{ .mfi 567 add GR_ad_z_1 = 0x0F0,GR_ad_taylor 568 fclass.m p9,p0 = f8,0x0a // is arg -denormal ? 569 add GR_ad_taylor_2 = 0x010,GR_ad_taylor 570} 571{ .mfi 572 add GR_ad_05 = 0x080,GR_ad_taylor 573 nop.f 0 574 nop.i 0 575};; 576{ .mfi 577 ldfe FR_C17 = [GR_ad_taylor],32 578 fclass.m p10,p0 = f8,0x09 // is arg +denormal ? 579 add GR_ad_tbl_1 = 0x040,GR_ad_z_1 // point to Constants_G_H_h1 580} 581{ .mfb 582 add GR_ad_z_2 = 0x140,GR_ad_z_1 // point to Constants_Z_2 583 (p8) fma.s0 f8 = f8,f1,f0 // NaN or +/-0 584 (p8) br.ret.spnt b0 // exit for Nan or +/-0 585};; 586{ .mfi 587 ldfe FR_C15 = [GR_ad_taylor_2],32 588 fclass.m p15,p0 = f8,0x23 // is +/-INF ? 589 add GR_ad_tbl_2 = 0x180,GR_ad_z_1 // point to Constants_G_H_h2 590} 591{ .mfb 592 ldfe FR_C13 = [GR_ad_taylor],32 593 (p9) fnma.s0 f8 = f8,f8,f8 // -denormal 594 (p9) br.ret.spnt b0 // exit for -denormal 595};; 596{ .mfi 597 ldfe FR_C11 = [GR_ad_taylor_2],32 598 fcmp.eq.s0 p13,p0 = FR_abs_x,f1 // is |arg| = 1? 599 nop.i 0 600} 601{ .mfb 602 ldfe FR_C9 = [GR_ad_taylor],32 603(p10) fma.s0 f8 = f8,f8,f8 // +denormal 604(p10) br.ret.spnt b0 // exit for +denormal 605};; 606{ .mfi 607 ldfe FR_C7 = [GR_ad_taylor_2],32 608 (p6) frcpa.s1 FR_Yn,p11 = f1,FR_Bn // y = frcpa(b) 609 and GR_ArgExp = GR_ArgExp,GR_ExpMask // biased exponent 610} 611{ .mfb 612 ldfe FR_C5 = [GR_ad_taylor],32 613 fnma.s1 FR_B = FR_abs_x,f1,f1 // b = 1 - |arg| 614(p15) br.cond.spnt atanhl_gt_one // |arg| > 1 615};; 616{ .mfb 617 cmp.gt p14,p0 = GR_NearZeroBound,GR_ArgExp 618 (p7) frcpa.s1 FR_Yp,p12 = f1,FR_Bp // y = frcpa(b) 619(p13) br.cond.spnt atanhl_eq_one // |arg| = 1/32 620} 621{ .mfb 622 ldfe FR_C3 = [GR_ad_taylor_2],32 623 fma.s1 FR_A = FR_abs_x,f1,FR_abs_x // a = 2 * |arg| 624(p14) br.cond.spnt atanhl_near_zero // |arg| < 1/32 625};; 626{ .mfi 627 nop.m 0 628 fcmp.gt.s0 p8,p0 = FR_abs_x,f1 // is |arg| > 1 ? 629 nop.i 0 630};; 631.pred.rel "mutex",p6,p7 632{ .mfi 633 nop.m 0 634 (p6) fnma.s1 FR_B_lo = FR_Bn,f1,f1 // argt = 1 - (1 - |arg|) 635 nop.i 0 636} 637{ .mfi 638 ldfs FR_Half = [GR_ad_05] 639 (p7) fnma.s1 FR_B_lo = FR_Bp,f1,f1 640 nop.i 0 641};; 642{ .mfi 643 nop.m 0 644 (p6) fnma.s1 FR_E0 = FR_Yn,FR_Bn,f1 // e = 1-b*y 645 nop.i 0 646} 647{ .mfb 648 nop.m 0 649 (p6) fma.s1 FR_Y0 = FR_Yn,f1,f0 650 (p8) br.cond.spnt atanhl_gt_one // |arg| > 1 651};; 652{ .mfi 653 nop.m 0 654 (p7) fnma.s1 FR_E0 = FR_Yp,FR_Bp,f1 655 nop.i 0 656} 657{ .mfi 658 nop.m 0 659 (p6) fma.s1 FR_Q0 = FR_A,FR_Yn,f0 // q = a*y 660 nop.i 0 661};; 662{ .mfi 663 nop.m 0 664 (p7) fma.s1 FR_Q0 = FR_A,FR_Yp,f0 665 nop.i 0 666} 667{ .mfi 668 nop.m 0 669 (p7) fma.s1 FR_Y0 = FR_Yp,f1,f0 670 nop.i 0 671};; 672{ .mfi 673 nop.m 0 674 fclass.nm p10,p0 = f8,0x1FF // test for unsupported 675 nop.i 0 676};; 677{ .mfi 678 nop.m 0 679 fma.s1 FR_E2 = FR_E0,FR_E0,FR_E0 // e2 = e+e^2 680 nop.i 0 681} 682{ .mfi 683 nop.m 0 684 fma.s1 FR_E1 = FR_E0,FR_E0,f0 // e1 = e^2 685 nop.i 0 686};; 687{ .mfb 688 nop.m 0 689// Return generated NaN or other value for unsupported values. 690(p10) fma.s0 f8 = f8, f0, f0 691(p10) br.ret.spnt b0 692};; 693{ .mfi 694 nop.m 0 695 fma.s1 FR_Y1 = FR_Y0,FR_E2,FR_Y0 // y1 = y+y*e2 696 nop.i 0 697} 698{ .mfi 699 nop.m 0 700 fma.s1 FR_E3 = FR_E1,FR_E1,FR_E0 // e3 = e+e1^2 701 nop.i 0 702};; 703{ .mfi 704 nop.m 0 705 fnma.s1 FR_B_lo = FR_abs_x,f1,FR_B_lo // b_lo = argt-|arg| 706 nop.i 0 707};; 708{ .mfi 709 nop.m 0 710 fma.s1 FR_Y2 = FR_Y1,FR_E3,FR_Y0 // y2 = y+y1*e3 711 nop.i 0 712} 713{ .mfi 714 nop.m 0 715 fnma.s1 FR_R0 = FR_B,FR_Q0,FR_A // r = a-b*q 716 nop.i 0 717};; 718{ .mfi 719 nop.m 0 720 fnma.s1 FR_E4 = FR_B,FR_Y2,f1 // e4 = 1-b*y2 721 nop.i 0 722} 723{ .mfi 724 nop.m 0 725 fma.s1 FR_X = FR_R0,FR_Y2,FR_Q0 // x = q+r*y2 726 nop.i 0 727};; 728{ .mfi 729 nop.m 0 730 fma.s1 FR_Z = FR_X,f1,f1 // x+1 731 nop.i 0 732};; 733{ .mfi 734 nop.m 0 735 (p6) fnma.s1 FR_Half = FR_Half,f1,f0 // sign(arg)/2 736 nop.i 0 737};; 738{ .mfi 739 nop.m 0 740 fma.s1 FR_Y3 = FR_Y2,FR_E4,FR_Y2 // y3 = y2+y2*e4 741 nop.i 0 742} 743{ .mfi 744 nop.m 0 745 fnma.s1 FR_R1 = FR_B,FR_X,FR_A // r1 = a-b*x 746 nop.i 0 747};; 748{ .mfi 749 getf.sig GR_signif = FR_Z // get significand of x+1 750 nop.f 0 751 nop.i 0 752};; 753 754 755{ .mfi 756 add GR_ad_q = -0x060,GR_ad_z_1 757 nop.f 0 758 extr.u GR_Index1 = GR_signif,59,4 // get high 4 bits of signif 759} 760{ .mfi 761 add GR_ad_tbl_3 = 0x280,GR_ad_z_1 // point to Constants_G_H_h3 762 nop.f 0 763 nop.i 0 764};; 765{ .mfi 766 shladd GR_ad_z_1 = GR_Index1,2,GR_ad_z_1 // point to Z_1 767 nop.f 0 768 extr.u GR_X_0 = GR_signif,49,15 // get high 15 bits of significand 769};; 770{ .mfi 771 ld4 GR_Z_1 = [GR_ad_z_1] // load Z_1 772 fmax.s1 FR_AA = FR_X,f1 // for S_lo,form AA = max(X,1.0) 773 nop.i 0 774} 775{ .mfi 776 shladd GR_ad_tbl_1 = GR_Index1,4,GR_ad_tbl_1 // point to G_1 777 nop.f 0 778 mov GR_Bias = 0x0FFFF // exponent bias 779};; 780{ .mfi 781 ldfps FR_G,FR_H = [GR_ad_tbl_1],8 // load G_1,H_1 782 fmerge.se FR_S_hi = f1,FR_Z // form |x+1| 783 nop.i 0 784};; 785{ .mfi 786 getf.exp GR_N = FR_Z // get N = exponent of x+1 787 nop.f 0 788 nop.i 0 789} 790{ .mfi 791 ldfd FR_h = [GR_ad_tbl_1] // load h_1 792 fnma.s1 FR_R1 = FR_B_lo,FR_X,FR_R1 // r1 = r1-b_lo*x 793 nop.i 0 794};; 795{ .mfi 796 ldfe FR_log2_hi = [GR_ad_q],16 // load log2_hi 797 nop.f 0 798 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // get bits 30-15 of X_0 * Z_1 799};; 800// 801// For performance,don't use result of pmpyshr2.u for 4 cycles. 802// 803{ .mfi 804 ldfe FR_log2_lo = [GR_ad_q],16 // load log2_lo 805 nop.f 0 806 sub GR_N = GR_N,GR_Bias 807};; 808{ .mfi 809 ldfe FR_Q4 = [GR_ad_q],16 // load Q4 810 fms.s1 FR_S_lo = FR_AA,f1,FR_Z // form S_lo = AA - Z 811 sub GR_minus_N = GR_Bias,GR_N // form exponent of 2^(-N) 812};; 813{ .mmf 814 ldfe FR_Q3 = [GR_ad_q],16 // load Q3 815 // put integer N into rightmost significand 816 setf.sig FR_float_N = GR_N 817 fmin.s1 FR_BB = FR_X,f1 // for S_lo,form BB = min(X,1.0) 818};; 819{ .mfi 820 ldfe FR_Q2 = [GR_ad_q],16 // load Q2 821 nop.f 0 822 extr.u GR_Index2 = GR_X_1,6,4 // extract bits 6-9 of X_1 823};; 824{ .mmi 825 ldfe FR_Q1 = [GR_ad_q] // load Q1 826 shladd GR_ad_z_2 = GR_Index2,2,GR_ad_z_2 // point to Z_2 827 nop.i 0 828};; 829{ .mmi 830 ld4 GR_Z_2 = [GR_ad_z_2] // load Z_2 831 shladd GR_ad_tbl_2 = GR_Index2,4,GR_ad_tbl_2 // point to G_2 832 nop.i 0 833};; 834{ .mfi 835 ldfps FR_G2,FR_H2 = [GR_ad_tbl_2],8 // load G_2,H_2 836 nop.f 0 837 nop.i 0 838};; 839{ .mfi 840 ldfd FR_h2 = [GR_ad_tbl_2] // load h_2 841 fma.s1 FR_S_lo = FR_S_lo,f1,FR_BB // S_lo = S_lo + BB 842 nop.i 0 843} 844{ .mfi 845 setf.exp FR_2_to_minus_N = GR_minus_N // form 2^(-N) 846 fma.s1 FR_X_lo = FR_R1,FR_Y3,f0 // x_lo = r1*y3 847 nop.i 0 848};; 849{ .mfi 850 nop.m 0 851 nop.f 0 852 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // get bits 30-15 of X_1 * Z_2 853};; 854// 855// For performance,don't use result of pmpyshr2.u for 4 cycles 856// 857{ .mfi 858 add GR_ad2_tbl_3 = 8,GR_ad_tbl_3 859 nop.f 0 860 nop.i 0 861} 862{ .mfi 863 nop.m 0 864 nop.f 0 865 nop.i 0 866};; 867{ .mfi 868 nop.m 0 869 nop.f 0 870 nop.i 0 871};; 872{ .mfi 873 nop.m 0 874 nop.f 0 875 nop.i 0 876};; 877 878// 879// Now GR_X_2 can be used 880// 881{ .mfi 882 nop.m 0 883 nop.f 0 884 extr.u GR_Index3 = GR_X_2,1,5 // extract bits 1-5 of X_2 885} 886{ .mfi 887 nop.m 0 888 fma.s1 FR_S_lo = FR_S_lo,f1,FR_X_lo // S_lo = S_lo + Arg_lo 889 nop.i 0 890};; 891 892{ .mfi 893 shladd GR_ad_tbl_3 = GR_Index3,4,GR_ad_tbl_3 // point to G_3 894 fcvt.xf FR_float_N = FR_float_N 895 nop.i 0 896} 897{ .mfi 898 shladd GR_ad2_tbl_3 = GR_Index3,4,GR_ad2_tbl_3 // point to h_3 899 fma.s1 FR_Q1 = FR_Q1,FR_Half,f0 // sign(arg)*Q1/2 900 nop.i 0 901};; 902{ .mmi 903 ldfps FR_G3,FR_H3 = [GR_ad_tbl_3],8 // load G_3,H_3 904 ldfd FR_h3 = [GR_ad2_tbl_3] // load h_3 905 nop.i 0 906};; 907{ .mfi 908 nop.m 0 909 fmpy.s1 FR_G = FR_G,FR_G2 // G = G_1 * G_2 910 nop.i 0 911} 912{ .mfi 913 nop.m 0 914 fadd.s1 FR_H = FR_H,FR_H2 // H = H_1 + H_2 915 nop.i 0 916};; 917{ .mfi 918 nop.m 0 919 fadd.s1 FR_h = FR_h,FR_h2 // h = h_1 + h_2 920 nop.i 0 921};; 922{ .mfi 923 nop.m 0 924 // S_lo = S_lo * 2^(-N) 925 fma.s1 FR_S_lo = FR_S_lo,FR_2_to_minus_N,f0 926 nop.i 0 927};; 928{ .mfi 929 nop.m 0 930 fmpy.s1 FR_G = FR_G,FR_G3 // G = (G_1 * G_2) * G_3 931 nop.i 0 932} 933{ .mfi 934 nop.m 0 935 fadd.s1 FR_H = FR_H,FR_H3 // H = (H_1 + H_2) + H_3 936 nop.i 0 937};; 938{ .mfi 939 nop.m 0 940 fadd.s1 FR_h = FR_h,FR_h3 // h = (h_1 + h_2) + h_3 941 nop.i 0 942};; 943{ .mfi 944 nop.m 0 945 fms.s1 FR_r = FR_G,FR_S_hi,f1 // r = G * S_hi - 1 946 nop.i 0 947} 948{ .mfi 949 nop.m 0 950 // Y_hi = N * log2_hi + H 951 fma.s1 FR_Y_hi = FR_float_N,FR_log2_hi,FR_H 952 nop.i 0 953};; 954{ .mfi 955 nop.m 0 956 fma.s1 FR_h = FR_float_N,FR_log2_lo,FR_h // h = N * log2_lo + h 957 nop.i 0 958};; 959{ .mfi 960 nop.m 0 961 fma.s1 FR_r = FR_G,FR_S_lo,FR_r // r = G * S_lo + (G * S_hi - 1) 962 nop.i 0 963};; 964{ .mfi 965 nop.m 0 966 fma.s1 FR_poly_lo = FR_r,FR_Q4,FR_Q3 // poly_lo = r * Q4 + Q3 967 nop.i 0 968} 969{ .mfi 970 nop.m 0 971 fmpy.s1 FR_rsq = FR_r,FR_r // rsq = r * r 972 nop.i 0 973};; 974{ .mfi 975 nop.m 0 976 fma.s1 FR_05r = FR_r,FR_Half,f0 // sign(arg)*r/2 977 nop.i 0 978};; 979{ .mfi 980 nop.m 0 981 // poly_lo = poly_lo * r + Q2 982 fma.s1 FR_poly_lo = FR_poly_lo,FR_r,FR_Q2 983 nop.i 0 984} 985{ .mfi 986 nop.m 0 987 fma.s1 FR_rcub = FR_rsq,FR_r,f0 // rcub = r^3 988 nop.i 0 989};; 990{ .mfi 991 nop.m 0 992 // poly_hi = sing(arg)*(Q1*r^2 + r)/2 993 fma.s1 FR_poly_hi = FR_Q1,FR_rsq,FR_05r 994 nop.i 0 995};; 996{ .mfi 997 nop.m 0 998 // poly_lo = poly_lo*r^3 + h 999 fma.s1 FR_poly_lo = FR_poly_lo,FR_rcub,FR_h 1000 nop.i 0 1001};; 1002{ .mfi 1003 nop.m 0 1004 // Y_lo = poly_hi + poly_lo/2 1005 fma.s0 FR_Y_lo = FR_poly_lo,FR_Half,FR_poly_hi 1006 nop.i 0 1007};; 1008{ .mfb 1009 nop.m 0 1010 // Result = arctanh(x) = Y_hi/2 + Y_lo 1011 fma.s0 f8 = FR_Y_hi,FR_Half,FR_Y_lo 1012 br.ret.sptk b0 1013};; 1014 1015// Taylor's series 1016atanhl_near_zero: 1017{ .mfi 1018 nop.m 0 1019 fma.s1 FR_x3 = FR_x2,f8,f0 1020 nop.i 0 1021} 1022{ .mfi 1023 nop.m 0 1024 fma.s1 FR_x4 = FR_x2,FR_x2,f0 1025 nop.i 0 1026};; 1027{ .mfi 1028 nop.m 0 1029 fma.s1 FR_C17 = FR_C17,FR_x2,FR_C15 1030 nop.i 0 1031} 1032{ .mfi 1033 nop.m 0 1034 fma.s1 FR_C13 = FR_C13,FR_x2,FR_C11 1035 nop.i 0 1036};; 1037{ .mfi 1038 nop.m 0 1039 fma.s1 FR_C9 = FR_C9,FR_x2,FR_C7 1040 nop.i 0 1041} 1042{ .mfi 1043 nop.m 0 1044 fma.s1 FR_C5 = FR_C5,FR_x2,FR_C3 1045 nop.i 0 1046};; 1047{ .mfi 1048 nop.m 0 1049 fma.s1 FR_x8 = FR_x4,FR_x4,f0 1050 nop.i 0 1051};; 1052{ .mfi 1053 nop.m 0 1054 fma.s1 FR_C17 = FR_C17,FR_x4,FR_C13 1055 nop.i 0 1056};; 1057{ .mfi 1058 nop.m 0 1059 fma.s1 FR_C9 = FR_C9,FR_x4,FR_C5 1060 nop.i 0 1061};; 1062{ .mfi 1063 nop.m 0 1064 fma.s1 FR_C17 = FR_C17,FR_x8,FR_C9 1065 nop.i 0 1066};; 1067{ .mfb 1068 nop.m 0 1069 fma.s0 f8 = FR_C17,FR_x3,f8 1070 br.ret.sptk b0 1071};; 1072 1073atanhl_eq_one: 1074{ .mfi 1075 nop.m 0 1076 frcpa.s0 FR_Rcp,p0 = f1,f0 // get inf,and raise Z flag 1077 nop.i 0 1078} 1079{ .mfi 1080 nop.m 0 1081 fmerge.s FR_Arg_X = f8, f8 1082 nop.i 0 1083};; 1084{ .mfb 1085 mov GR_Parameter_TAG = 130 1086 fmerge.s FR_RESULT = f8,FR_Rcp // result is +-inf 1087 br.cond.sptk __libm_error_region // exit if |x| = 1.0 1088};; 1089 1090atanhl_gt_one: 1091{ .mfi 1092 nop.m 0 1093 fmerge.s FR_Arg_X = f8, f8 1094 nop.i 0 1095};; 1096{ .mfb 1097 mov GR_Parameter_TAG = 129 1098 frcpa.s0 FR_RESULT,p0 = f0,f0 // get QNaN,and raise invalid 1099 br.cond.sptk __libm_error_region // exit if |x| > 1.0 1100};; 1101 1102GLOBAL_LIBM_END(atanhl) 1103libm_alias_ldouble_other (atanh, atanh) 1104 1105LOCAL_LIBM_ENTRY(__libm_error_region) 1106.prologue 1107{ .mfi 1108 add GR_Parameter_Y=-32,sp // Parameter 2 value 1109 nop.f 0 1110.save ar.pfs,GR_SAVE_PFS 1111 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 1112} 1113{ .mfi 1114.fframe 64 1115 add sp=-64,sp // Create new stack 1116 nop.f 0 1117 mov GR_SAVE_GP=gp // Save gp 1118};; 1119{ .mmi 1120 stfe [GR_Parameter_Y] = FR_Arg_Y,16 // Save Parameter 2 on stack 1121 add GR_Parameter_X = 16,sp // Parameter 1 address 1122.save b0,GR_SAVE_B0 1123 mov GR_SAVE_B0=b0 // Save b0 1124};; 1125.body 1126{ .mib 1127 stfe [GR_Parameter_X] = FR_Arg_X // Store Parameter 1 on stack 1128 add GR_Parameter_RESULT = 0,GR_Parameter_Y 1129 nop.b 0 // Parameter 3 address 1130} 1131{ .mib 1132 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack 1133 add GR_Parameter_Y = -16,GR_Parameter_Y 1134 br.call.sptk b0=__libm_error_support# // Call error handling function 1135};; 1136{ .mmi 1137 nop.m 0 1138 nop.m 0 1139 add GR_Parameter_RESULT = 48,sp 1140};; 1141{ .mmi 1142 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack 1143.restore sp 1144 add sp = 64,sp // Restore stack pointer 1145 mov b0 = GR_SAVE_B0 // Restore return address 1146};; 1147{ .mib 1148 mov gp = GR_SAVE_GP // Restore gp 1149 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 1150 br.ret.sptk b0 // Return 1151};; 1152 1153LOCAL_LIBM_END(__libm_error_region#) 1154 1155.type __libm_error_support#,@function 1156.global __libm_error_support# 1157