1.file "asinf.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38 39// History 40//============================================================== 41// 02/02/00 Initial version 42// 06/28/00 Improved speed 43// 06/31/00 Changed register allocation because of some duplicate macros 44// moved nan exit bundle up to gain a cycle. 45// 08/08/00 Improved speed by avoiding SIR flush. 46// 08/15/00 Bundle added after call to __libm_error_support to properly 47// set [the previously overwritten] GR_Parameter_RESULT. 48// 08/17/00 Changed predicate register macro-usage to direct predicate 49// names due to an assembler bug. 50// 10/17/00 Improved speed of x=0 and x=1 paths, set D flag if x denormal. 51// 03/13/01 Corrected sign of imm1 value in dep instruction. 52// 05/20/02 Cleaned up namespace and sf0 syntax 53// 02/06/03 Reordered header: .section, .global, .proc, .align 54 55 56// Description 57//========================================= 58// The asinf function computes the arc sine of x in the range [-pi,+pi]. 59// A doman error occurs for arguments not in the range [-1,+1]. 60// asinf(+-0) returns +-0 61// asinf(x) returns a Nan and raises the invalid exception for |x| >1 62 63// The acosf function returns the arc cosine in the range [0, +pi] radians. 64// A doman error occurs for arguments not in the range [-1,+1]. 65// acosf(1) returns +0 66// acosf(x) returns a Nan and raises the invalid exception for |x| >1 67 68 69// |x| <= sqrt(2)/2. get Ax and Bx 70 71// poly_p1 = x p1 72// poly_p3 = x2 p4 + p3 73// poly_p1 = x2 (poly_p1) + x = x2(x p1) + x 74// poly_p2 = x2( poly_p3) + p2 = x2(x2 p4 + p3) + p2 75 76// poly_Ax = x5(x2( poly_p3) + p2) + x2(x p1) + x 77// = x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x 78 79// poly_p7 = x2 p8 + p7 80// poly_p5 = x2 p6 + p5 81 82// poly_p7 = x4 p9 + (poly_p7) 83// poly_p7 = x4 p9 + (x2 p8 + p7) 84// poly_Bx = x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5 85 86// answer1 = x11(x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5) + x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x 87// = x19 p9 + x17 p8 + x15 p7 x13 p6 + x11 p5 + x9 p4 + x7 p3 + x5 p2 + x3 p1 + x 88 89 90 91// |x| > sqrt(2)/2 92 93// Get z = sqrt(1-x2) 94 95// Get polynomial in t = 1-x2 96 97// t2 = t t 98// t4 = t2 t2 99 100// poly_p4 = t p5 + p4 101// poly_p1 = t p1 + 1 102 103// poly_p6 = t p7 + p6 104// poly_p2 = t p3 + p2 105 106// poly_p8 = t p9 + p8 107 108// poly_p4 = t2 poly_p6 + poly_p4 109// = t2 (t p7 + p6) + (t p5 + p4) 110 111// poly_p2 = t2 poly_p2 + poly_p1 112// = t2 (t p3 + p2) + (t p1 + 1) 113 114// poly_p4 = t4 poly_p8 + poly_p4 115// = t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4)) 116 117// P(t) = poly_p2 + t4 poly_p8 118// = t2 (t p3 + p2) + (t p1 + 1) + t4 (t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4))) 119// = t3 p3 + t2 p2 + t p1 + 1 + t9 p9 + t8 p8 + t7 p7 + t6 p6 + t5 p5 + t4 p4 120 121 122// answer2 = - sign(x) z P(t) + (sign(x) pi/2) 123// 124 125 126// Assembly macros 127//========================================= 128 129// predicate registers 130//asinf_pred_LEsqrt2by2 = p7 131//asinf_pred_GTsqrt2by2 = p8 132 133// integer registers 134ASINF_Addr1 = r33 135ASINF_Addr2 = r34 136ASINF_GR_1by2 = r35 137 138ASINF_GR_3by2 = r36 139ASINF_GR_5by2 = r37 140 141GR_SAVE_B0 = r38 142GR_SAVE_PFS = r39 143GR_SAVE_GP = r40 144 145GR_Parameter_X = r41 146GR_Parameter_Y = r42 147GR_Parameter_RESULT = r43 148GR_Parameter_TAG = r44 149 150// floating point registers 151 152asinf_y = f32 153asinf_abs_x = f33 154asinf_x2 = f34 155asinf_sgn_x = f35 156 157asinf_1by2 = f36 158asinf_3by2 = f37 159asinf_5by2 = f38 160asinf_coeff_P3 = f39 161asinf_coeff_P8 = f40 162 163asinf_coeff_P1 = f41 164asinf_coeff_P4 = f42 165asinf_coeff_P5 = f43 166asinf_coeff_P2 = f44 167asinf_coeff_P7 = f45 168 169asinf_coeff_P6 = f46 170asinf_coeff_P9 = f47 171asinf_x2 = f48 172asinf_x3 = f49 173asinf_x4 = f50 174 175asinf_x8 = f51 176asinf_x5 = f52 177asinf_const_piby2 = f53 178asinf_const_sqrt2by2 = f54 179asinf_x11 = f55 180 181asinf_poly_p1 = f56 182asinf_poly_p3 = f57 183asinf_sinf1 = f58 184asinf_poly_p2 = f59 185asinf_poly_Ax = f60 186 187asinf_poly_p7 = f61 188asinf_poly_p5 = f62 189asinf_sgnx_t4 = f63 190asinf_poly_Bx = f64 191asinf_t = f65 192 193asinf_yby2 = f66 194asinf_B = f67 195asinf_B2 = f68 196asinf_Az = f69 197asinf_dz = f70 198 199asinf_Sz = f71 200asinf_d2z = f72 201asinf_Fz = f73 202asinf_z = f74 203asinf_sgnx_z = f75 204 205asinf_t2 = f76 206asinf_2poly_p4 = f77 207asinf_2poly_p6 = f78 208asinf_2poly_p1 = f79 209asinf_2poly_p2 = f80 210 211asinf_2poly_p8 = f81 212asinf_t4 = f82 213asinf_Pt = f83 214asinf_sgnx_2poly_p2 = f84 215asinf_sgn_x_piby2 = f85 216 217asinf_poly_p7a = f86 218asinf_2poly_p4a = f87 219asinf_2poly_p4b = f88 220asinf_2poly_p2a = f89 221asinf_poly_p1a = f90 222 223 224 225 226 227// Data tables 228//============================================================== 229 230RODATA 231 232.align 16 233 234LOCAL_OBJECT_START(asinf_coeff_1_table) 235data8 0x3FC5555607DCF816 // P1 236data8 0x3F9CF81AD9BAB2C6 // P4 237data8 0x3FC59E0975074DF3 // P7 238data8 0xBFA6F4CC2780AA1D // P6 239data8 0x3FC2DD45292E93CB // P9 240data8 0x3fe6a09e667f3bcd // sqrt(2)/2 241LOCAL_OBJECT_END(asinf_coeff_1_table) 242 243LOCAL_OBJECT_START(asinf_coeff_2_table) 244data8 0x3FA6F108E31EFBA6 // P3 245data8 0xBFCA31BF175D82A0 // P8 246data8 0x3FA30C0337F6418B // P5 247data8 0x3FB332C9266CB1F9 // P2 248data8 0x3ff921fb54442d18 // pi_by_2 249LOCAL_OBJECT_END(asinf_coeff_2_table) 250 251 252.section .text 253GLOBAL_LIBM_ENTRY(asinf) 254 255// Load the addresses of the two tables. 256// Then, load the coefficients and other constants. 257 258{ .mfi 259 alloc r32 = ar.pfs,1,8,4,0 260 fnma.s1 asinf_t = f8,f8,f1 261 dep.z ASINF_GR_1by2 = 0x3f,24,8 // 0x3f000000 262} 263{ .mfi 264 addl ASINF_Addr1 = @ltoff(asinf_coeff_1_table),gp 265 fma.s1 asinf_x2 = f8,f8,f0 266 addl ASINF_Addr2 = @ltoff(asinf_coeff_2_table),gp ;; 267} 268 269 270{ .mfi 271 ld8 ASINF_Addr1 = [ASINF_Addr1] 272 fmerge.s asinf_abs_x = f1,f8 273 dep ASINF_GR_3by2 = -1,r0,22,8 // 0x3fc00000 274} 275{ .mlx 276 nop.m 999 277 movl ASINF_GR_5by2 = 0x40200000;; 278} 279 280 281 282{ .mfi 283 setf.s asinf_1by2 = ASINF_GR_1by2 284 fmerge.s asinf_sgn_x = f8,f1 285 nop.i 999 286} 287{ .mfi 288 ld8 ASINF_Addr2 = [ASINF_Addr2] 289 nop.f 0 290 nop.i 999;; 291} 292 293 294{ .mfi 295 setf.s asinf_5by2 = ASINF_GR_5by2 296 fcmp.lt.s1 p11,p12 = f8,f0 297 nop.i 999;; 298} 299 300{ .mmf 301 ldfpd asinf_coeff_P1,asinf_coeff_P4 = [ASINF_Addr1],16 302 setf.s asinf_3by2 = ASINF_GR_3by2 303 fclass.m.unc p8,p0 = f8, 0xc3 ;; //@qnan | @snan 304} 305 306 307{ .mfi 308 ldfpd asinf_coeff_P7,asinf_coeff_P6 = [ASINF_Addr1],16 309 fma.s1 asinf_t2 = asinf_t,asinf_t,f0 310 nop.i 999 311} 312{ .mfi 313 ldfpd asinf_coeff_P3,asinf_coeff_P8 = [ASINF_Addr2],16 314 fma.s1 asinf_x4 = asinf_x2,asinf_x2,f0 315 nop.i 999;; 316} 317 318 319{ .mfi 320 ldfpd asinf_coeff_P9,asinf_const_sqrt2by2 = [ASINF_Addr1] 321 fclass.m.unc p10,p0 = f8, 0x07 //@zero 322 nop.i 999 323} 324{ .mfi 325 ldfpd asinf_coeff_P5,asinf_coeff_P2 = [ASINF_Addr2],16 326 fma.s1 asinf_x3 = f8,asinf_x2,f0 327 nop.i 999;; 328} 329 330 331{ .mfi 332 ldfd asinf_const_piby2 = [ASINF_Addr2] 333 frsqrta.s1 asinf_B,p0 = asinf_t 334 nop.i 999 335} 336{ .mfb 337 nop.m 999 338(p8) fma.s.s0 f8 = f8,f1,f0 339(p8) br.ret.spnt b0 ;; // Exit if x=nan 340} 341 342 343{ .mfb 344 nop.m 999 345 fcmp.eq.s1 p6,p0 = asinf_abs_x,f1 346(p10) br.ret.spnt b0 ;; // Exit if x=0 347} 348 349{ .mfi 350 nop.m 999 351 fcmp.gt.s1 p9,p0 = asinf_abs_x,f1 352 nop.i 999;; 353} 354 355{ .mfi 356 nop.m 999 357 fma.s1 asinf_x8 = asinf_x4,asinf_x4,f0 358 nop.i 999 359} 360{ .mfb 361 nop.m 999 362 fma.s1 asinf_t4 = asinf_t2,asinf_t2,f0 363(p6) br.cond.spnt ASINF_ABS_ONE ;; // Branch if |x|=1 364} 365 366{ .mfi 367 nop.m 999 368 fma.s1 asinf_x5 = asinf_x2,asinf_x3,f0 369 nop.i 999 370} 371{ .mfb 372(p9) mov GR_Parameter_TAG = 62 373 fma.s1 asinf_yby2 = asinf_t,asinf_1by2,f0 374(p9) br.cond.spnt __libm_error_region ;; // Branch if |x|>1 375} 376 377 378{ .mfi 379 nop.m 999 380 fma.s1 asinf_Az = asinf_t,asinf_B,f0 381 nop.i 999 382} 383{ .mfi 384 nop.m 999 385 fma.s1 asinf_B2 = asinf_B,asinf_B,f0 386 nop.i 999;; 387} 388 389{ .mfi 390 nop.m 999 391 fma.s1 asinf_poly_p1 = f8,asinf_coeff_P1,f0 392 nop.i 999 393} 394{ .mfi 395 nop.m 999 396 fma.s1 asinf_2poly_p1 = asinf_coeff_P1,asinf_t,f1 397 nop.i 999;; 398} 399 400{ .mfi 401 nop.m 999 402 fma.s1 asinf_poly_p3 = asinf_coeff_P4,asinf_x2,asinf_coeff_P3 403 nop.i 999 404} 405{ .mfi 406 nop.m 999 407 fma.s1 asinf_2poly_p6 = asinf_coeff_P7,asinf_t,asinf_coeff_P6 408 nop.i 999;; 409} 410 411{ .mfi 412 nop.m 999 413 fma.s1 asinf_poly_p7 = asinf_x2,asinf_coeff_P8,asinf_coeff_P7 414 nop.i 999 415} 416{ .mfi 417 nop.m 999 418 fma.s1 asinf_2poly_p2 = asinf_coeff_P3,asinf_t,asinf_coeff_P2 419 nop.i 999;; 420} 421 422 423{ .mfi 424 nop.m 999 425 fma.s1 asinf_poly_p5 = asinf_x2,asinf_coeff_P6,asinf_coeff_P5 426 nop.i 999 427} 428{ .mfi 429 nop.m 999 430 fma.s1 asinf_2poly_p4 = asinf_coeff_P5,asinf_t,asinf_coeff_P4 431 nop.i 999;; 432} 433 434 435{ .mfi 436 nop.m 999 437 fma.d.s1 asinf_x11 = asinf_x8,asinf_x3,f0 438 nop.i 999 439} 440{ .mfi 441 nop.m 999 442 fnma.s1 asinf_dz = asinf_B2,asinf_yby2,asinf_1by2 443 nop.i 999;; 444} 445 446 447{ .mfi 448 nop.m 999 449 fma.s1 asinf_poly_p1a = asinf_x2,asinf_poly_p1,f8 450 nop.i 999 451} 452{ .mfi 453 nop.m 999 454 fma.s1 asinf_2poly_p8 = asinf_coeff_P9,asinf_t,asinf_coeff_P8 455 nop.i 999;; 456} 457 458 459// Get the absolute value of x and determine the region in which x lies 460 461{ .mfi 462 nop.m 999 463 fcmp.le.s1 p7,p8 = asinf_abs_x,asinf_const_sqrt2by2 464 nop.i 999 465} 466{ .mfi 467 nop.m 999 468 fma.s1 asinf_poly_p2 = asinf_x2,asinf_poly_p3,asinf_coeff_P2 469 nop.i 999;; 470} 471 472 473{ .mfi 474 nop.m 999 475 fma.s1 asinf_poly_p7a = asinf_x4,asinf_coeff_P9,asinf_poly_p7 476 nop.i 999 477} 478{ .mfi 479 nop.m 999 480 fma.s1 asinf_2poly_p2a = asinf_2poly_p2,asinf_t2,asinf_2poly_p1 481 nop.i 999;; 482} 483 484 485{ .mfi 486 nop.m 999 487(p8) fma.s1 asinf_sgnx_t4 = asinf_sgn_x,asinf_t4,f0 488 nop.i 999 489} 490{ .mfi 491 nop.m 999 492(p8) fma.s1 asinf_2poly_p4a = asinf_2poly_p6,asinf_t2,asinf_2poly_p4 493 nop.i 999;; 494} 495 496 497{ .mfi 498 nop.m 999 499(p8) fma.s1 asinf_Sz = asinf_5by2,asinf_dz,asinf_3by2 500 nop.i 999 501} 502{ .mfi 503 nop.m 999 504(p8) fma.s1 asinf_d2z = asinf_dz,asinf_dz,f0 505 nop.i 999;; 506} 507 508 509{ .mfi 510 nop.m 999 511(p8) fma.s1 asinf_sgn_x_piby2 = asinf_sgn_x,asinf_const_piby2,f0 512 nop.i 999 513} 514{ .mfi 515 nop.m 999 516(p7) fma.d.s1 asinf_poly_Ax = asinf_x5,asinf_poly_p2,asinf_poly_p1a 517 nop.i 999;; 518} 519 520{ .mfi 521 nop.m 999 522(p7) fma.d.s1 asinf_poly_Bx = asinf_x4,asinf_poly_p7a,asinf_poly_p5 523 nop.i 999 524} 525{ .mfi 526 nop.m 999 527(p8) fma.s1 asinf_sgnx_2poly_p2 = asinf_sgn_x,asinf_2poly_p2a,f0 528 nop.i 999;; 529} 530 531{ .mfi 532 nop.m 999 533 fcmp.eq.s0 p6,p0 = f8,f0 // Only purpose is to set D if x denormal 534 nop.i 999 535} 536{ .mfi 537 nop.m 999 538(p8) fma.s1 asinf_2poly_p4b = asinf_2poly_p8,asinf_t4,asinf_2poly_p4a 539 nop.i 999;; 540} 541 542 543{ .mfi 544 nop.m 999 545(p8) fma.s1 asinf_Fz = asinf_d2z,asinf_Sz,asinf_dz 546 nop.i 999;; 547} 548 549 550{ .mfi 551 nop.m 999 552(p8) fma.d.s1 asinf_Pt = asinf_2poly_p4b,asinf_sgnx_t4,asinf_sgnx_2poly_p2 553 nop.i 999;; 554} 555 556{ .mfi 557 nop.m 999 558(p8) fma.d.s1 asinf_z = asinf_Az,asinf_Fz,asinf_Az 559 nop.i 999;; 560} 561 562.pred.rel "mutex",p8,p7 //asinf_pred_GTsqrt2by2,asinf_pred_LEsqrt2by2 563{ .mfi 564 nop.m 999 565(p8) fnma.s.s0 f8 = asinf_z,asinf_Pt,asinf_sgn_x_piby2 566 nop.i 999 567} 568 569{ .mfb 570 nop.m 999 571(p7) fma.s.s0 f8 = asinf_x11,asinf_poly_Bx,asinf_poly_Ax 572 br.ret.sptk b0 ;; 573} 574 575ASINF_ABS_ONE: 576// Here for short exit if |x|=1 577{ .mfb 578 nop.m 999 579 fma.s.s0 f8 = asinf_sgn_x,asinf_const_piby2,f0 580 br.ret.sptk b0 581} 582;; 583 584GLOBAL_LIBM_END(asinf) 585libm_alias_float_other (asin, asin) 586 587// Stack operations when calling error support. 588// (1) (2) 589// sp -> + psp -> + 590// | | 591// | | <- GR_Y 592// | | 593// | <-GR_Y Y2->| 594// | | 595// | | <- GR_X 596// | | 597// sp-64 -> + sp -> + 598// save ar.pfs save b0 599// save gp 600 601 602// Stack operations when calling error support. 603// (3) (call) (4) 604// psp -> + sp -> + 605// | | 606// R3 ->| <- GR_RESULT | -> f8 607// | | 608// Y2 ->| <- GR_Y | 609// | | 610// X1 ->| | 611// | | 612// sp -> + + 613// restore gp 614// restore ar.pfs 615 616LOCAL_LIBM_ENTRY(__libm_error_region) 617.prologue 618{ .mfi 619 add GR_Parameter_Y=-32,sp // Parameter 2 value 620 nop.f 999 621.save ar.pfs,GR_SAVE_PFS 622 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 623} 624{ .mfi 625.fframe 64 626 add sp=-64,sp // Create new stack 627 nop.f 0 628 mov GR_SAVE_GP=gp // Save gp 629};; 630{ .mmi 631 stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack 632 add GR_Parameter_X = 16,sp // Parameter 1 address 633.save b0, GR_SAVE_B0 634 mov GR_SAVE_B0=b0 // Save b0 635};; 636 637.body 638{ .mfi 639 nop.m 0 640 frcpa.s0 f9,p0 = f0,f0 641 nop.i 0 642};; 643 644{ .mib 645 stfs [GR_Parameter_X] = f8 // Store Parameter 1 on stack 646 add GR_Parameter_RESULT = 0,GR_Parameter_Y 647 nop.b 0 // Parameter 3 address 648} 649{ .mib 650 stfs [GR_Parameter_Y] = f9 // Store Parameter 3 on stack 651 add GR_Parameter_Y = -16,GR_Parameter_Y 652 br.call.sptk b0=__libm_error_support# // Call error handling function 653};; 654{ .mmi 655 nop.m 0 656 nop.m 0 657 add GR_Parameter_RESULT = 48,sp 658};; 659 660{ .mmi 661 ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack 662.restore sp 663 add sp = 64,sp // Restore stack pointer 664 mov b0 = GR_SAVE_B0 // Restore return address 665};; 666{ .mib 667 mov gp = GR_SAVE_GP // Restore gp 668 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 669 br.ret.sptk b0 // Return 670};; 671 672LOCAL_LIBM_END(__libm_error_region) 673 674.type __libm_error_support#,@function 675.global __libm_error_support# 676