1.file "asinhl.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39//********************************************************************* 40// 41// History: 42// 09/04/01 Initial version 43// 09/13/01 Performance improved, symmetry problems fixed 44// 10/10/01 Performance improved, split issues removed 45// 12/11/01 Changed huges_logp to not be global 46// 05/20/02 Cleaned up namespace and sf0 syntax 47// 02/10/03 Reordered header: .section, .global, .proc, .align; 48// used data8 for long double table values 49// 50//********************************************************************* 51// 52// API 53//============================================================== 54// long double asinhl(long double); 55// 56// Overview of operation 57//============================================================== 58// 59// There are 6 paths: 60// 1. x = 0, [S,Q]Nan or +/-INF 61// Return asinhl(x) = x + x; 62// 63// 2. x = + denormal 64// Return asinhl(x) = x - x^2; 65// 66// 3. x = - denormal 67// Return asinhl(x) = x + x^2; 68// 69// 4. 'Near 0': max denormal < |x| < 1/128 70// Return asinhl(x) = sign(x)*(x+x^3*(c3+x^2*(c5+x^2*(c7+x^2*(c9))))); 71// 72// 5. 'Huges': |x| > 2^63 73// Return asinhl(x) = sign(x)*(logl(2*x)); 74// 75// 6. 'Main path': 1/128 < |x| < 2^63 76// b_hi + b_lo = x + sqrt(x^2 + 1); 77// asinhl(x) = sign(x)*(log_special(b_hi, b_lo)); 78// 79// Algorithm description 80//============================================================== 81// 82// Main path algorithm 83// ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! ) 84// ************************************************************************* 85// 86// There are 3 parts of x+sqrt(x^2+1) computation: 87// 88// 1) p2 = (p2_hi+p2_lo) = x^2+1 obtaining 89// ------------------------------------ 90// p2_hi = x2_hi + 1, where x2_hi = x * x; 91// p2_lo = x2_lo + p1_lo, where 92// x2_lo = FMS(x*x-x2_hi), 93// p1_lo = (1 - p2_hi) + x2_hi; 94// 95// 2) g = (g_hi+g_lo) = sqrt(p2) = sqrt(p2_hi+p2_lo) 96// ---------------------------------------------- 97// r = invsqrt(p2_hi) (8-bit reciprocal square root approximation); 98// g = p2_hi * r (first 8 bit-approximation of sqrt); 99// 100// h = 0.5 * r; 101// e = 0.5 - g * h; 102// g = g * e + g (second 16 bit-approximation of sqrt); 103// 104// h = h * e + h; 105// e = 0.5 - g * h; 106// g = g * e + g (third 32 bit-approximation of sqrt); 107// 108// h = h * e + h; 109// e = 0.5 - g * h; 110// g_hi = g * e + g (fourth 64 bit-approximation of sqrt); 111// 112// Remainder computation: 113// h = h * e + h; 114// d = (p2_hi - g_hi * g_hi) + p2_lo; 115// g_lo = d * h; 116// 117// 3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2+1) 118// ------------------------------------------------------------------- 119// b_hi = (g_hi + x) + gl; 120// b_lo = (g_hi - b_hi) + x + gl; 121// 122// Now we pass b presented as sum b_hi + b_lo to special version 123// of logl function which accept a pair of arguments as 124// 'mutiprecision' value. 125// 126// Special log algorithm overview 127// ================================ 128// Here we use a table lookup method. The basic idea is that in 129// order to compute logl(Arg) = logl (Arg-1) for an argument Arg in [1,2), 130// we construct a value G such that G*Arg is close to 1 and that 131// logl(1/G) is obtainable easily from a table of values calculated 132// beforehand. Thus 133// 134// logl(Arg) = logl(1/G) + logl((G*Arg - 1)) 135// 136// Because |G*Arg - 1| is small, the second term on the right hand 137// side can be approximated by a short polynomial. We elaborate 138// this method in four steps. 139// 140// Step 0: Initialization 141// 142// We need to calculate logl( X ). Obtain N, S_hi such that 143// 144// X = 2^N * ( S_hi + S_lo ) exactly 145// 146// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense 147// that |S_lo| <= ulp(S_hi). 148// 149// For the special version of logl: S_lo = b_lo 150// !-----------------------------------------------! 151// 152// Step 1: Argument Reduction 153// 154// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate 155// 156// G := G_1 * G_2 * G_3 157// r := (G * S_hi - 1) + G * S_lo 158// 159// These G_j's have the property that the product is exactly 160// representable and that |r| < 2^(-12) as a result. 161// 162// Step 2: Approximation 163// 164// logl(1 + r) is approximated by a short polynomial poly(r). 165// 166// Step 3: Reconstruction 167// 168// Finally, 169// 170// logl( X ) = logl( 2^N * (S_hi + S_lo) ) 171// ~=~ N*logl(2) + logl(1/G) + logl(1 + r) 172// ~=~ N*logl(2) + logl(1/G) + poly(r). 173// 174// For detailed description see logl or log1pl function, regular path. 175// 176// Registers used 177//============================================================== 178// Floating Point registers used: 179// f8, input 180// f32 -> f101 (70 registers) 181 182// General registers used: 183// r32 -> r57 (26 registers) 184 185// Predicate registers used: 186// p6 -> p11 187// p6 for '0, NaNs, Inf' path 188// p7 for '+ denormals' path 189// p8 for 'near 0' path 190// p9 for 'huges' path 191// p10 for '- denormals' path 192// p11 for negative values 193// 194// Data tables 195//============================================================== 196 197RODATA 198.align 64 199 200// C7, C9 'near 0' polynomial coefficients 201LOCAL_OBJECT_START(Poly_C_near_0_79) 202data8 0xF8DC939BBEDD5A54, 0x00003FF9 203data8 0xB6DB6DAB21565AC5, 0x0000BFFA 204LOCAL_OBJECT_END(Poly_C_near_0_79) 205 206// C3, C5 'near 0' polynomial coefficients 207LOCAL_OBJECT_START(Poly_C_near_0_35) 208data8 0x999999999991D582, 0x00003FFB 209data8 0xAAAAAAAAAAAAAAA9, 0x0000BFFC 210LOCAL_OBJECT_END(Poly_C_near_0_35) 211 212// Q coeffs 213LOCAL_OBJECT_START(Constants_Q) 214data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 215data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 216data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 217data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 218data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 219data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 220LOCAL_OBJECT_END(Constants_Q) 221 222// Z1 - 16 bit fixed 223LOCAL_OBJECT_START(Constants_Z_1) 224data4 0x00008000 225data4 0x00007879 226data4 0x000071C8 227data4 0x00006BCB 228data4 0x00006667 229data4 0x00006187 230data4 0x00005D18 231data4 0x0000590C 232data4 0x00005556 233data4 0x000051EC 234data4 0x00004EC5 235data4 0x00004BDB 236data4 0x00004925 237data4 0x0000469F 238data4 0x00004445 239data4 0x00004211 240LOCAL_OBJECT_END(Constants_Z_1) 241 242// G1 and H1 - IEEE single and h1 - IEEE double 243LOCAL_OBJECT_START(Constants_G_H_h1) 244data4 0x3F800000,0x00000000 245data8 0x0000000000000000 246data4 0x3F70F0F0,0x3D785196 247data8 0x3DA163A6617D741C 248data4 0x3F638E38,0x3DF13843 249data8 0x3E2C55E6CBD3D5BB 250data4 0x3F579430,0x3E2FF9A0 251data8 0xBE3EB0BFD86EA5E7 252data4 0x3F4CCCC8,0x3E647FD6 253data8 0x3E2E6A8C86B12760 254data4 0x3F430C30,0x3E8B3AE7 255data8 0x3E47574C5C0739BA 256data4 0x3F3A2E88,0x3EA30C68 257data8 0x3E20E30F13E8AF2F 258data4 0x3F321640,0x3EB9CEC8 259data8 0xBE42885BF2C630BD 260data4 0x3F2AAAA8,0x3ECF9927 261data8 0x3E497F3497E577C6 262data4 0x3F23D708,0x3EE47FC5 263data8 0x3E3E6A6EA6B0A5AB 264data4 0x3F1D89D8,0x3EF8947D 265data8 0xBDF43E3CD328D9BE 266data4 0x3F17B420,0x3F05F3A1 267data8 0x3E4094C30ADB090A 268data4 0x3F124920,0x3F0F4303 269data8 0xBE28FBB2FC1FE510 270data4 0x3F0D3DC8,0x3F183EBF 271data8 0x3E3A789510FDE3FA 272data4 0x3F088888,0x3F20EC80 273data8 0x3E508CE57CC8C98F 274data4 0x3F042108,0x3F29516A 275data8 0xBE534874A223106C 276LOCAL_OBJECT_END(Constants_G_H_h1) 277 278// Z2 - 16 bit fixed 279LOCAL_OBJECT_START(Constants_Z_2) 280data4 0x00008000 281data4 0x00007F81 282data4 0x00007F02 283data4 0x00007E85 284data4 0x00007E08 285data4 0x00007D8D 286data4 0x00007D12 287data4 0x00007C98 288data4 0x00007C20 289data4 0x00007BA8 290data4 0x00007B31 291data4 0x00007ABB 292data4 0x00007A45 293data4 0x000079D1 294data4 0x0000795D 295data4 0x000078EB 296LOCAL_OBJECT_END(Constants_Z_2) 297 298// G2 and H2 - IEEE single and h2 - IEEE double 299LOCAL_OBJECT_START(Constants_G_H_h2) 300data4 0x3F800000,0x00000000 301data8 0x0000000000000000 302data4 0x3F7F00F8,0x3B7F875D 303data8 0x3DB5A11622C42273 304data4 0x3F7E03F8,0x3BFF015B 305data8 0x3DE620CF21F86ED3 306data4 0x3F7D08E0,0x3C3EE393 307data8 0xBDAFA07E484F34ED 308data4 0x3F7C0FC0,0x3C7E0586 309data8 0xBDFE07F03860BCF6 310data4 0x3F7B1880,0x3C9E75D2 311data8 0x3DEA370FA78093D6 312data4 0x3F7A2328,0x3CBDC97A 313data8 0x3DFF579172A753D0 314data4 0x3F792FB0,0x3CDCFE47 315data8 0x3DFEBE6CA7EF896B 316data4 0x3F783E08,0x3CFC15D0 317data8 0x3E0CF156409ECB43 318data4 0x3F774E38,0x3D0D874D 319data8 0xBE0B6F97FFEF71DF 320data4 0x3F766038,0x3D1CF49B 321data8 0xBE0804835D59EEE8 322data4 0x3F757400,0x3D2C531D 323data8 0x3E1F91E9A9192A74 324data4 0x3F748988,0x3D3BA322 325data8 0xBE139A06BF72A8CD 326data4 0x3F73A0D0,0x3D4AE46F 327data8 0x3E1D9202F8FBA6CF 328data4 0x3F72B9D0,0x3D5A1756 329data8 0xBE1DCCC4BA796223 330data4 0x3F71D488,0x3D693B9D 331data8 0xBE049391B6B7C239 332LOCAL_OBJECT_END(Constants_G_H_h2) 333 334// G3 and H3 - IEEE single and h3 - IEEE double 335LOCAL_OBJECT_START(Constants_G_H_h3) 336data4 0x3F7FFC00,0x38800100 337data8 0x3D355595562224CD 338data4 0x3F7FF400,0x39400480 339data8 0x3D8200A206136FF6 340data4 0x3F7FEC00,0x39A00640 341data8 0x3DA4D68DE8DE9AF0 342data4 0x3F7FE400,0x39E00C41 343data8 0xBD8B4291B10238DC 344data4 0x3F7FDC00,0x3A100A21 345data8 0xBD89CCB83B1952CA 346data4 0x3F7FD400,0x3A300F22 347data8 0xBDB107071DC46826 348data4 0x3F7FCC08,0x3A4FF51C 349data8 0x3DB6FCB9F43307DB 350data4 0x3F7FC408,0x3A6FFC1D 351data8 0xBD9B7C4762DC7872 352data4 0x3F7FBC10,0x3A87F20B 353data8 0xBDC3725E3F89154A 354data4 0x3F7FB410,0x3A97F68B 355data8 0xBD93519D62B9D392 356data4 0x3F7FAC18,0x3AA7EB86 357data8 0x3DC184410F21BD9D 358data4 0x3F7FA420,0x3AB7E101 359data8 0xBDA64B952245E0A6 360data4 0x3F7F9C20,0x3AC7E701 361data8 0x3DB4B0ECAABB34B8 362data4 0x3F7F9428,0x3AD7DD7B 363data8 0x3D9923376DC40A7E 364data4 0x3F7F8C30,0x3AE7D474 365data8 0x3DC6E17B4F2083D3 366data4 0x3F7F8438,0x3AF7CBED 367data8 0x3DAE314B811D4394 368data4 0x3F7F7C40,0x3B03E1F3 369data8 0xBDD46F21B08F2DB1 370data4 0x3F7F7448,0x3B0BDE2F 371data8 0xBDDC30A46D34522B 372data4 0x3F7F6C50,0x3B13DAAA 373data8 0x3DCB0070B1F473DB 374data4 0x3F7F6458,0x3B1BD766 375data8 0xBDD65DDC6AD282FD 376data4 0x3F7F5C68,0x3B23CC5C 377data8 0xBDCDAB83F153761A 378data4 0x3F7F5470,0x3B2BC997 379data8 0xBDDADA40341D0F8F 380data4 0x3F7F4C78,0x3B33C711 381data8 0x3DCD1BD7EBC394E8 382data4 0x3F7F4488,0x3B3BBCC6 383data8 0xBDC3532B52E3E695 384data4 0x3F7F3C90,0x3B43BAC0 385data8 0xBDA3961EE846B3DE 386data4 0x3F7F34A0,0x3B4BB0F4 387data8 0xBDDADF06785778D4 388data4 0x3F7F2CA8,0x3B53AF6D 389data8 0x3DCC3ED1E55CE212 390data4 0x3F7F24B8,0x3B5BA620 391data8 0xBDBA31039E382C15 392data4 0x3F7F1CC8,0x3B639D12 393data8 0x3D635A0B5C5AF197 394data4 0x3F7F14D8,0x3B6B9444 395data8 0xBDDCCB1971D34EFC 396data4 0x3F7F0CE0,0x3B7393BC 397data8 0x3DC7450252CD7ADA 398data4 0x3F7F04F0,0x3B7B8B6D 399data8 0xBDB68F177D7F2A42 400LOCAL_OBJECT_END(Constants_G_H_h3) 401 402// Assembly macros 403//============================================================== 404 405// Floating Point Registers 406 407FR_Arg = f8 408FR_Res = f8 409FR_AX = f32 410FR_XLog_Hi = f33 411FR_XLog_Lo = f34 412 413 // Special logl registers 414FR_Y_hi = f35 415FR_Y_lo = f36 416 417FR_Scale = f37 418FR_X_Prime = f38 419FR_S_hi = f39 420FR_W = f40 421FR_G = f41 422 423FR_H = f42 424FR_wsq = f43 425FR_w4 = f44 426FR_h = f45 427FR_w6 = f46 428 429FR_G2 = f47 430FR_H2 = f48 431FR_poly_lo = f49 432FR_P8 = f50 433FR_poly_hi = f51 434 435FR_P7 = f52 436FR_h2 = f53 437FR_rsq = f54 438FR_P6 = f55 439FR_r = f56 440 441FR_log2_hi = f57 442FR_log2_lo = f58 443 444FR_float_N = f59 445FR_Q4 = f60 446 447FR_G3 = f61 448FR_H3 = f62 449FR_h3 = f63 450 451FR_Q3 = f64 452FR_Q2 = f65 453FR_1LN10_hi = f66 454 455FR_Q1 = f67 456FR_1LN10_lo = f68 457FR_P5 = f69 458FR_rcub = f70 459 460FR_Neg_One = f71 461FR_Z = f72 462FR_AA = f73 463FR_BB = f74 464FR_S_lo = f75 465FR_2_to_minus_N = f76 466 467 468 // Huge & Main path prolog registers 469FR_Half = f77 470FR_Two = f78 471FR_X2 = f79 472FR_P2 = f80 473FR_P2L = f81 474FR_Rcp = f82 475FR_GG = f83 476FR_HH = f84 477FR_EE = f85 478FR_DD = f86 479FR_GL = f87 480FR_A = f88 481FR_AL = f89 482FR_B = f90 483FR_BL = f91 484FR_Tmp = f92 485 486 // Near 0 & Huges path prolog registers 487FR_C3 = f93 488FR_C5 = f94 489FR_C7 = f95 490FR_C9 = f96 491 492FR_X3 = f97 493FR_X4 = f98 494FR_P9 = f99 495FR_P5 = f100 496FR_P3 = f101 497 498 499// General Purpose Registers 500 501 // General prolog registers 502GR_PFS = r32 503GR_TwoN7 = r40 504GR_TwoP63 = r41 505GR_ExpMask = r42 506GR_ArgExp = r43 507GR_Half = r44 508 509 // Near 0 path prolog registers 510GR_Poly_C_35 = r45 511GR_Poly_C_79 = r46 512 513 // Special logl registers 514GR_Index1 = r34 515GR_Index2 = r35 516GR_signif = r36 517GR_X_0 = r37 518GR_X_1 = r38 519GR_X_2 = r39 520GR_Z_1 = r40 521GR_Z_2 = r41 522GR_N = r42 523GR_Bias = r43 524GR_M = r44 525GR_Index3 = r45 526GR_exp_2tom80 = r45 527GR_exp_mask = r47 528GR_exp_2tom7 = r48 529GR_ad_ln10 = r49 530GR_ad_tbl_1 = r50 531GR_ad_tbl_2 = r51 532GR_ad_tbl_3 = r52 533GR_ad_q = r53 534GR_ad_z_1 = r54 535GR_ad_z_2 = r55 536GR_ad_z_3 = r56 537GR_minus_N = r57 538 539 540 541.section .text 542GLOBAL_LIBM_ENTRY(asinhl) 543 544{ .mfi 545 alloc GR_PFS = ar.pfs,0,27,0,0 546 fma.s1 FR_P2 = FR_Arg, FR_Arg, f1 // p2 = x^2 + 1 547 mov GR_Half = 0xfffe // 0.5's exp 548} 549{ .mfi 550 addl GR_Poly_C_79 = @ltoff(Poly_C_near_0_79), gp // C7, C9 coeffs 551 fma.s1 FR_X2 = FR_Arg, FR_Arg, f0 // Obtain x^2 552 addl GR_Poly_C_35 = @ltoff(Poly_C_near_0_35), gp // C3, C5 coeffs 553};; 554 555{ .mfi 556 getf.exp GR_ArgExp = FR_Arg // get arument's exponent 557 fabs FR_AX = FR_Arg // absolute value of argument 558 mov GR_TwoN7 = 0xfff8 // 2^-7 exp 559} 560{ .mfi 561 ld8 GR_Poly_C_79 = [GR_Poly_C_79] // get actual coeff table address 562 fma.s0 FR_Two = f1, f1, f1 // construct 2.0 563 mov GR_ExpMask = 0x1ffff // mask for exp 564};; 565 566{ .mfi 567 ld8 GR_Poly_C_35 = [GR_Poly_C_35] // get actual coeff table address 568 fclass.m p6,p0 = FR_Arg, 0xe7 // if arg NaN inf zero 569 mov GR_TwoP63 = 0x1003e // 2^63 exp 570} 571{ .mfi 572 addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp 573 nop.f 0 574 nop.i 0 575};; 576 577{ .mfi 578 setf.exp FR_Half = GR_Half // construct 0.5 579 fclass.m p7,p0 = FR_Arg, 0x09 // if arg + denorm 580 and GR_ArgExp = GR_ExpMask, GR_ArgExp // select exp 581} 582{ .mfb 583 ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1 584 nop.f 0 585 nop.b 0 586};; 587{ .mfi 588 ldfe FR_C9 = [GR_Poly_C_79],16 // load C9 589 fclass.m p10,p0 = FR_Arg, 0x0a // if arg - denorm 590 cmp.gt p8, p0 = GR_TwoN7, GR_ArgExp // if arg < 2^-7 ('near 0') 591} 592{ .mfb 593 cmp.le p9, p0 = GR_TwoP63, GR_ArgExp // if arg > 2^63 ('huges') 594(p6) fma.s0 FR_Res = FR_Arg,f1,FR_Arg // r = a + a 595(p6) br.ret.spnt b0 // return 596};; 597// (X^2 + 1) computation 598{ .mfi 599(p8) ldfe FR_C5 = [GR_Poly_C_35],16 // load C5 600 fms.s1 FR_Tmp = f1, f1, FR_P2 // Tmp = 1 - p2 601 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1 602} 603{ .mfb 604(p8) ldfe FR_C7 = [GR_Poly_C_79],16 // load C7 605(p7) fnma.s0 FR_Res = FR_Arg,FR_Arg,FR_Arg // r = a - a*a 606(p7) br.ret.spnt b0 // return 607};; 608 609{ .mfi 610(p8) ldfe FR_C3 = [GR_Poly_C_35],16 // load C3 611 fcmp.lt.s1 p11, p12 = FR_Arg, f0 // if arg is negative 612 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P 613} 614{ .mfb 615 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2 616(p10) fma.s0 FR_Res = FR_Arg,FR_Arg,FR_Arg // r = a + a*a 617(p10) br.ret.spnt b0 // return 618};; 619 620{ .mfi 621 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2 622 frsqrta.s1 FR_Rcp, p0 = FR_P2 // Rcp = 1/p2 reciprocal appr. 623 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3 624} 625{ .mfi 626 nop.m 0 627 fms.s1 FR_P2L = FR_AX, FR_AX, FR_X2 //low part of p2=fma(X*X-p2) 628 mov GR_Bias = 0x0FFFF // Create exponent bias 629};; 630 631{ .mfb 632 nop.m 0 633(p9) fms.s1 FR_XLog_Hi = FR_Two, FR_AX, f0 // Hi of log1p arg = 2*X - 1 634(p9) br.cond.spnt huges_logl // special version of log1p 635};; 636 637{ .mfb 638 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi 639(p8) fma.s1 FR_X3 = FR_X2, FR_Arg, f0 // x^3 = x^2 * x 640(p8) br.cond.spnt near_0 // Go to near 0 branch 641};; 642 643{ .mfi 644 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo 645 nop.f 0 646 nop.i 0 647};; 648 649{ .mfi 650 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4 651 fma.s1 FR_Tmp = FR_Tmp, f1, FR_X2 // Tmp = Tmp + x^2 652 mov GR_exp_mask = 0x1FFFF // Create exponent mask 653};; 654 655{ .mfi 656 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3 657 fma.s1 FR_GG = FR_Rcp, FR_P2, f0 // g = Rcp * p2 658 // 8 bit Newton Raphson iteration 659 nop.i 0 660} 661{ .mfi 662 nop.m 0 663 fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp 664 nop.i 0 665};; 666{ .mfi 667 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2 668 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h 669 nop.i 0 670} 671{ .mfi 672 nop.m 0 673 fma.s1 FR_P2L = FR_Tmp, f1, FR_P2L // low part of p2 = Tmp + p2l 674 nop.i 0 675};; 676 677{ .mfi 678 ldfe FR_Q1 = [GR_ad_q] // Load Q1 679 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g 680 // 16 bit Newton Raphson iteration 681 nop.i 0 682} 683{ .mfi 684 nop.m 0 685 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h 686 nop.i 0 687};; 688 689{ .mfi 690 nop.m 0 691 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h 692 nop.i 0 693};; 694 695{ .mfi 696 nop.m 0 697 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g 698 // 32 bit Newton Raphson iteration 699 nop.i 0 700} 701{ .mfi 702 nop.m 0 703 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h 704 nop.i 0 705};; 706 707{ .mfi 708 nop.m 0 709 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h 710 nop.i 0 711};; 712 713{ .mfi 714 nop.m 0 715 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g 716 // 64 bit Newton Raphson iteration 717 nop.i 0 718} 719{ .mfi 720 nop.m 0 721 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h 722 nop.i 0 723};; 724 725{ .mfi 726 nop.m 0 727 fnma.s1 FR_DD = FR_GG, FR_GG, FR_P2 // Remainder d = g * g - p2 728 nop.i 0 729} 730{ .mfi 731 nop.m 0 732 fma.s1 FR_XLog_Hi = FR_AX, f1, FR_GG // bh = z + gh 733 nop.i 0 734};; 735 736{ .mfi 737 nop.m 0 738 fma.s1 FR_DD = FR_DD, f1, FR_P2L // add p2l: d = d + p2l 739 nop.i 0 740};; 741 742 743{ .mfi 744 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1 745 fmerge.ns FR_Neg_One = f1, f1 // Form -1.0 746 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7 747};; 748 749{ .mfi 750 nop.m 0 751 fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h 752 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif 753} 754{ .mfi 755 nop.m 0 756 fma.s1 FR_XLog_Hi = FR_DD, FR_HH, FR_XLog_Hi // bh = bh + gl 757 nop.i 0 758};; 759 760{ .mmi 761 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1 762 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1 763 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif. 764};; 765 766{ .mmi 767 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1 768 nop.m 0 769 nop.i 0 770};; 771 772{ .mmi 773 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1 774 nop.m 0 775 nop.i 0 776};; 777 778{ .mfi 779 nop.m 0 780 fms.s1 FR_XLog_Lo = FR_GG, f1, FR_XLog_Hi // bl = gh - bh 781 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1 782};; 783 784// WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL! 785// "DEAD" ZONE! 786 787{ .mfi 788 nop.m 0 789 nop.f 0 790 nop.i 0 791};; 792 793{ .mfi 794 nop.m 0 795 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1| 796 nop.i 0 797};; 798 799{ .mmi 800 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1 801 ldfd FR_h = [GR_ad_tbl_1] // Load h_1 802 nop.i 0 803};; 804 805{ .mfi 806 nop.m 0 807 nop.f 0 808 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 809};; 810 811 812{ .mfi 813 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2 814 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_AX // bl = bl + x 815 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80 816} 817{ .mfi 818 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2 819 nop.f 0 820 sub GR_N = GR_N, GR_Bias // sub bias from exp 821};; 822 823{ .mmi 824 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2 825 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 826 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N) 827};; 828 829{ .mmi 830 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2 831 nop.m 0 832 nop.i 0 833};; 834 835{ .mmi 836 setf.sig FR_float_N = GR_N // Put integer N into rightmost sign 837 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N) 838 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2 839};; 840 841// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!) 842// BECAUSE OF POSSIBLE 10 CLOCKS STALL! 843// So we can negate Q coefficients there for negative values 844 845{ .mfi 846 nop.m 0 847(p11) fma.s1 FR_Q1 = FR_Q1, FR_Neg_One, f0 // Negate Q1 848 nop.i 0 849} 850{ .mfi 851 nop.m 0 852 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GL // bl = bl + gl 853 nop.i 0 854};; 855 856{ .mfi 857 nop.m 0 858(p11) fma.s1 FR_Q2 = FR_Q2, FR_Neg_One, f0 // Negate Q2 859 nop.i 0 860};; 861 862{ .mfi 863 nop.m 0 864(p11) fma.s1 FR_Q3 = FR_Q3, FR_Neg_One, f0 // Negate Q3 865 nop.i 0 866};; 867 868{ .mfi 869 nop.m 0 870(p11) fma.s1 FR_Q4 = FR_Q4, FR_Neg_One, f0 // Negate Q4 871 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 872};; 873 874{ .mfi 875 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3 876 nop.f 0 877 nop.i 0 878};; 879 880{ .mfi 881 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3 882 nop.f 0 883 nop.i 0 884};; 885 886{ .mfi 887 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3 888 fcvt.xf FR_float_N = FR_float_N 889 nop.i 0 890};; 891 892{ .mfi 893 nop.m 0 894 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 895 nop.i 0 896} 897{ .mfi 898 nop.m 0 899 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 900 nop.i 0 901};; 902 903{ .mfi 904 nop.m 0 905 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 906 nop.i 0 907} 908{ .mfi 909 nop.m 0 910 fma.s1 FR_S_lo = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^-N 911 nop.i 0 912};; 913 914{ .mfi 915 nop.m 0 916 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 917 nop.i 0 918} 919{ .mfi 920 nop.m 0 921 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 922 nop.i 0 923};; 924 925{ .mfi 926 nop.m 0 927 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 928 nop.i 0 929};; 930 931{ .mfi 932 nop.m 0 933 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1 934 nop.i 0 935} 936{ .mfi 937 nop.m 0 938 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H 939 nop.i 0 940};; 941 942{ .mfi 943 nop.m 0 944 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h 945 nop.i 0 946} 947{ .mfi 948 nop.m 0 949 fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r=G*S_lo+(G*S_hi-1) 950 nop.i 0 951};; 952 953{ .mfi 954 nop.m 0 955 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3 956 nop.i 0 957} 958{ .mfi 959 nop.m 0 960 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r 961 nop.i 0 962};; 963 964{ .mfi 965 nop.m 0 966 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2 967 nop.i 0 968} 969{ .mfi 970 nop.m 0 971 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 972 nop.i 0 973};; 974 975.pred.rel "mutex",p12,p11 976{ .mfi 977 nop.m 0 978(p12) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r 979 nop.i 0 980} 981{ .mfi 982 nop.m 0 983(p11) fms.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r 984 nop.i 0 985};; 986 987 988.pred.rel "mutex",p12,p11 989{ .mfi 990 nop.m 0 991(p12) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h 992 nop.i 0 993} 994{ .mfi 995 nop.m 0 996(p11) fms.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h 997 nop.i 0 998} 999;; 1000 1001{ .mfi 1002 nop.m 0 1003 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo 1004 // Y_lo=poly_hi+poly_lo 1005 nop.i 0 1006} 1007{ .mfi 1008 nop.m 0 1009(p11) fma.s0 FR_Y_hi = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg 1010 nop.i 0 1011};; 1012 1013{ .mfb 1014 nop.m 0 1015 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi 1016 br.ret.sptk b0 // Common exit for 2^-7 < x < inf 1017};; 1018 1019// * SPECIAL VERSION OF LOGL FOR HUGE ARGUMENTS * 1020 1021huges_logl: 1022{ .mfi 1023 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1 1024 fmerge.ns FR_Neg_One = f1, f1 // Form -1.0 1025 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7 1026};; 1027 1028{ .mfi 1029 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1 1030 nop.f 0 1031 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P 1032} 1033{ .mfi 1034 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2 1035 nop.f 0 1036 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2 1037};; 1038 1039{ .mfi 1040 nop.m 0 1041 nop.f 0 1042 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif 1043} 1044{ .mfi 1045 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3 1046 nop.f 0 1047 nop.i 0 1048};; 1049 1050{ .mfi 1051 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1 1052 nop.f 0 1053 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif. 1054};; 1055 1056{ .mfi 1057 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1 1058 nop.f 0 1059 mov GR_exp_mask = 0x1FFFF // Create exponent mask 1060} 1061{ .mfi 1062 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1 1063 nop.f 0 1064 mov GR_Bias = 0x0FFFF // Create exponent bias 1065};; 1066 1067{ .mfi 1068 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1 1069 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1| 1070 nop.i 0 1071};; 1072 1073{ .mmi 1074 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1 1075 ldfd FR_h = [GR_ad_tbl_1] // Load h_1 1076 nop.i 0 1077};; 1078 1079{ .mfi 1080 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi 1081 nop.f 0 1082 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1 1083};; 1084 1085// WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL! 1086// "DEAD" ZONE! 1087 1088{ .mmi 1089 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo 1090 sub GR_N = GR_N, GR_Bias 1091 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80 1092};; 1093 1094{ .mfi 1095 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4 1096 nop.f 0 1097 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N) 1098};; 1099 1100{ .mmf 1101 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3 1102 setf.sig FR_float_N = GR_N // Put integer N into rightmost sign 1103 nop.f 0 1104};; 1105 1106{ .mmi 1107 nop.m 0 1108 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2 1109 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 1110};; 1111 1112{ .mmi 1113 ldfe FR_Q1 = [GR_ad_q] // Load Q1 1114 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2 1115 nop.i 0 1116};; 1117 1118{ .mmi 1119 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 1120 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2 1121 nop.i 0 1122};; 1123 1124{ .mmi 1125 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2 1126 nop.m 0 1127 nop.i 0 1128};; 1129 1130{ .mfi 1131 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2 1132 nop.f 0 1133 nop.i 0 1134} 1135{ .mfi 1136 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N) 1137 nop.f 0 1138 nop.i 0 1139};; 1140 1141{ .mfi 1142 nop.m 0 1143 nop.f 0 1144 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2 1145};; 1146 1147// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL! 1148// "DEAD" ZONE! 1149// JUST HAVE TO INSERT 3 NOP CYCLES (nothing to do here) 1150 1151{ .mfi 1152 nop.m 0 1153 nop.f 0 1154 nop.i 0 1155};; 1156 1157{ .mfi 1158 nop.m 0 1159 nop.f 0 1160 nop.i 0 1161};; 1162 1163{ .mfi 1164 nop.m 0 1165 nop.f 0 1166 nop.i 0 1167};; 1168 1169{ .mfi 1170 nop.m 0 1171(p11) fma.s1 FR_Q4 = FR_Q4, FR_Neg_One, f0 // Negate Q4 1172 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 1173 };; 1174 1175{ .mfi 1176 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3 1177 fcvt.xf FR_float_N = FR_float_N 1178 nop.i 0 1179} 1180{ .mfi 1181 nop.m 0 1182(p11) fma.s1 FR_Q3 = FR_Q3, FR_Neg_One, f0 // Negate Q3 1183 nop.i 0 1184};; 1185 1186{ .mfi 1187 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3 1188(p11) fma.s1 FR_Q2 = FR_Q2, FR_Neg_One, f0 // Negate Q2 1189 nop.i 0 1190} 1191{ .mfi 1192 nop.m 0 1193(p11) fma.s1 FR_Q1 = FR_Q1, FR_Neg_One, f0 // Negate Q1 1194 nop.i 0 1195};; 1196 1197{ .mfi 1198 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3 1199 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 1200 nop.i 0 1201} 1202{ .mfi 1203 nop.m 0 1204 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 1205 nop.i 0 1206};; 1207 1208{ .mmf 1209 nop.m 0 1210 nop.m 0 1211 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 1212};; 1213 1214{ .mfi 1215 nop.m 0 1216 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 1217 nop.i 0 1218} 1219{ .mfi 1220 nop.m 0 1221 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 1222 nop.i 0 1223};; 1224 1225{ .mfi 1226 nop.m 0 1227 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 1228 nop.i 0 1229};; 1230 1231{ .mfi 1232 nop.m 0 1233 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1 1234 nop.i 0 1235} 1236{ .mfi 1237 nop.m 0 1238 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H 1239 nop.i 0 1240};; 1241 1242{ .mfi 1243 nop.m 0 1244 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h 1245 nop.i 0 1246};; 1247 1248{ .mfi 1249 nop.m 0 1250 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3 1251 nop.i 0 1252} 1253{ .mfi 1254 nop.m 0 1255 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r 1256 nop.i 0 1257};; 1258 1259{ .mfi 1260 nop.m 0 1261 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2 1262 nop.i 0 1263} 1264{ .mfi 1265 nop.m 0 1266 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 1267 nop.i 0 1268};; 1269 1270.pred.rel "mutex",p12,p11 1271{ .mfi 1272 nop.m 0 1273(p12) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r 1274 nop.i 0 1275} 1276{ .mfi 1277 nop.m 0 1278(p11) fms.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r 1279 nop.i 0 1280};; 1281 1282 1283.pred.rel "mutex",p12,p11 1284{ .mfi 1285 nop.m 0 1286(p12) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h 1287 nop.i 0 1288} 1289{ .mfi 1290 nop.m 0 1291(p11) fms.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h 1292 nop.i 0 1293};; 1294 1295{ .mfi 1296 nop.m 0 1297 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo=poly_hi+poly_lo 1298 nop.i 0 1299} 1300{ .mfi 1301 nop.m 0 1302(p11) fma.s0 FR_Y_hi = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg 1303 nop.i 0 1304};; 1305 1306{ .mfb 1307 nop.m 0 1308 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi 1309 br.ret.sptk b0 // Common exit for 2^-7 < x < inf 1310};; 1311 1312// NEAR ZERO POLYNOMIAL INTERVAL 1313near_0: 1314{ .mfi 1315 nop.m 0 1316 fma.s1 FR_X4 = FR_X2, FR_X2, f0 // x^4 = x^2 * x^2 1317 nop.i 0 1318};; 1319 1320{ .mfi 1321 nop.m 0 1322 fma.s1 FR_P9 = FR_C9,FR_X2,FR_C7 // p9 = C9*x^2 + C7 1323 nop.i 0 1324} 1325{ .mfi 1326 nop.m 0 1327 fma.s1 FR_P5 = FR_C5,FR_X2,FR_C3 // p5 = C5*x^2 + C3 1328 nop.i 0 1329};; 1330 1331{ .mfi 1332 nop.m 0 1333 fma.s1 FR_P3 = FR_P9,FR_X4,FR_P5 // p3 = p9*x^4 + p5 1334 nop.i 0 1335};; 1336 1337{ .mfb 1338 nop.m 0 1339 fma.s0 FR_Res = FR_P3,FR_X3,FR_Arg // res = p3*C3 + x 1340 br.ret.sptk b0 // Near 0 path return 1341};; 1342 1343GLOBAL_LIBM_END(asinhl) 1344libm_alias_ldouble_other (asinh, asinh) 1345