1.file "acosf.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38 39// History 40//============================================================== 41// 02/02/00 Initial version 42// 06/28/00 Improved speed 43// 06/31/00 Changed register allocation because of some duplicate macros 44// moved nan exit bundle up to gain a cycle. 45// 08/15/00 Bundle added after call to __libm_error_support to properly 46// set [the previously overwritten] GR_Parameter_RESULT. 47// 08/17/00 Changed predicate register macro-usage to direct predicate 48// names due to an assembler bug. 49// 10/17/00 Improved speed of x=0 and x=1 paths, set D flag if x denormal. 50// 03/13/01 Corrected sign of imm1 value in dep instruction. 51// 05/20/02 Cleaned up namespace and sf0 syntax 52// 02/06/03 Reordered header: .section, .global, .proc, .align 53// 04/17/03 Moved mutex after label 54 55 56// Description 57//========================================= 58// The acosf function computes the principle value of the arc sine of x. 59// A doman error occurs for arguments not in the range [-1,+1]. 60 61// The acosf function returns the arc cosine in the range [0, +pi] radians. 62// acos(1) returns +0 63// acos(x) returns a Nan and raises the invalid exception for |x| >1 64 65// |x| <= sqrt(2)/2. get Ax and Bx 66 67// poly_p1 = x p1 68// poly_p3 = x2 p4 + p3 69// poly_p1 = x2 (poly_p1) + x = x2(x p1) + x 70// poly_p2 = x2( poly_p3) + p2 = x2(x2 p4 + p3) + p2 71 72// poly_Ax = x5(x2( poly_p3) + p2) + x2(x p1) + x 73// = x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x 74 75// poly_p7 = x2 p8 + p7 76// poly_p5 = x2 p6 + p5 77 78// poly_p7 = x4 p9 + (x2 p8 + p7) 79// poly_Bx = x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5 80 81// sinf1 = x11(x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5) + x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x 82// = x19 p9 + x17 p8 + x15 p7 x13 p6 + x11 p5 + x9 p4 + x7 p3 + x5 p2 + x3 p1 + x 83// answer1 = pi/2 - sinf1 84 85 86 87// |x| > sqrt(2)/2 88 89// Get z = sqrt(1-x2) 90 91// Get polynomial in t = 1-x2 92 93// t2 = t t 94// t4 = t2 t2 95 96// poly_p4 = t p5 + p4 97// poly_p1 = t p1 + 1 98 99// poly_p6 = t p7 + p6 100// poly_p2 = t p3 + p2 101 102// poly_p8 = t p9 + p8 103 104// poly_p4 = t2 poly_p6 + poly_p4 105// = t2 (t p7 + p6) + (t p5 + p4) 106 107// poly_p2 = t2 poly_p2 + poly_p1 108// = t2 (t p3 + p2) + (t p1 + 1) 109 110// poly_p4 = t4 poly_p8 + poly_p4 111// = t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4)) 112 113// P(t) = poly_p2 + t4 poly_p8 114// = t2 (t p3 + p2) + (t p1 + 1) + t4 (t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4))) 115// = t3 p3 + t2 p2 + t p1 + 1 + t9 p9 + t8 p8 + t7 p7 + t6 p6 + t5 p5 + t4 p4 116 117 118// answer2 = sign(x) z P(t) if x>0 119// = sign(x) z P(t) + pi if x<0 120 121 122// 123// Assembly macros 124//========================================= 125 126// predicate registers 127//acosf_pred_LEsqrt2by2 = p7 128//acosf_pred_GTsqrt2by2 = p8 129 130// integer registers 131ACOSF_Addr1 = r33 132ACOSF_Addr2 = r34 133ACOSF_GR_1by2 = r35 134 135ACOSF_GR_3by2 = r36 136ACOSF_GR_5by2 = r37 137 138GR_SAVE_B0 = r38 139GR_SAVE_PFS = r39 140GR_SAVE_GP = r40 141 142GR_Parameter_X = r41 143GR_Parameter_Y = r42 144GR_Parameter_RESULT = r43 145GR_Parameter_TAG = r44 146 147// floating point registers 148 149acosf_y = f32 150acosf_abs_x = f33 151acosf_x2 = f34 152acosf_sgn_x = f35 153 154acosf_1by2 = f36 155acosf_3by2 = f37 156acosf_5by2 = f38 157acosf_coeff_P3 = f39 158acosf_coeff_P8 = f40 159 160acosf_coeff_P1 = f41 161acosf_coeff_P4 = f42 162acosf_coeff_P5 = f43 163acosf_coeff_P2 = f44 164acosf_coeff_P7 = f45 165 166acosf_coeff_P6 = f46 167acosf_coeff_P9 = f47 168acosf_x2 = f48 169acosf_x3 = f49 170acosf_x4 = f50 171 172acosf_x8 = f51 173acosf_x5 = f52 174acosf_const_piby2 = f53 175acosf_const_sqrt2by2 = f54 176acosf_x11 = f55 177 178acosf_poly_p1 = f56 179acosf_poly_p3 = f57 180acosf_sinf1 = f58 181acosf_poly_p2 = f59 182acosf_poly_Ax = f60 183 184acosf_poly_p7 = f61 185acosf_poly_p5 = f62 186acosf_sgnx_t4 = f63 187acosf_poly_Bx = f64 188acosf_t = f65 189 190acosf_yby2 = f66 191acosf_B = f67 192acosf_B2 = f68 193acosf_Az = f69 194acosf_dz = f70 195 196acosf_Sz = f71 197acosf_d2z = f72 198acosf_Fz = f73 199acosf_z = f74 200acosf_sgnx_z = f75 201 202acosf_t2 = f76 203acosf_2poly_p4 = f77 204acosf_2poly_p6 = f78 205acosf_2poly_p1 = f79 206acosf_2poly_p2 = f80 207 208acosf_2poly_p8 = f81 209acosf_t4 = f82 210acosf_Pt = f83 211acosf_sgnx_2poly_p2 = f84 212acosf_sgn_x_piby2 = f85 213 214acosf_poly_p7a = f86 215acosf_2poly_p4a = f87 216acosf_2poly_p4b = f88 217acosf_2poly_p2a = f89 218acosf_poly_p1a = f90 219 220 221 222 223 224// Data tables 225//============================================================== 226 227RODATA 228 229.align 16 230 231LOCAL_OBJECT_START(acosf_coeff_1_table) 232data8 0x3FC5555607DCF816 // P1 233data8 0x3F9CF81AD9BAB2C6 // P4 234data8 0x3FC59E0975074DF3 // P7 235data8 0xBFA6F4CC2780AA1D // P6 236data8 0x3FC2DD45292E93CB // P9 237data8 0x3fe6a09e667f3bcd // sqrt(2)/2 238LOCAL_OBJECT_END(acosf_coeff_1_table) 239 240LOCAL_OBJECT_START(acosf_coeff_2_table) 241data8 0x3FA6F108E31EFBA6 // P3 242data8 0xBFCA31BF175D82A0 // P8 243data8 0x3FA30C0337F6418B // P5 244data8 0x3FB332C9266CB1F9 // P2 245data8 0x3ff921fb54442d18 // pi_by_2 246LOCAL_OBJECT_END(acosf_coeff_2_table) 247 248 249.section .text 250GLOBAL_LIBM_ENTRY(acosf) 251 252// Load the addresses of the two tables. 253// Then, load the coefficients and other constants. 254 255{ .mfi 256 alloc r32 = ar.pfs,1,8,4,0 257 fnma.s1 acosf_t = f8,f8,f1 258 dep.z ACOSF_GR_1by2 = 0x3f,24,8 // 0x3f000000 259} 260{ .mfi 261 addl ACOSF_Addr1 = @ltoff(acosf_coeff_1_table),gp 262 fma.s1 acosf_x2 = f8,f8,f0 263 addl ACOSF_Addr2 = @ltoff(acosf_coeff_2_table),gp ;; 264} 265 266 267{ .mfi 268 ld8 ACOSF_Addr1 = [ACOSF_Addr1] 269 fmerge.s acosf_abs_x = f1,f8 270 dep ACOSF_GR_3by2 = -1,r0,22,8 // 0x3fc00000 271} 272{ .mlx 273 nop.m 999 274 movl ACOSF_GR_5by2 = 0x40200000;; 275} 276 277 278 279{ .mfi 280 setf.s acosf_1by2 = ACOSF_GR_1by2 281 fmerge.s acosf_sgn_x = f8,f1 282 nop.i 999 283} 284{ .mfi 285 ld8 ACOSF_Addr2 = [ACOSF_Addr2] 286 nop.f 0 287 nop.i 999;; 288} 289 290 291{ .mfi 292 setf.s acosf_5by2 = ACOSF_GR_5by2 293 fcmp.lt.s1 p11,p12 = f8,f0 294 nop.i 999;; 295} 296 297{ .mmf 298 ldfpd acosf_coeff_P1,acosf_coeff_P4 = [ACOSF_Addr1],16 299 setf.s acosf_3by2 = ACOSF_GR_3by2 300 fclass.m.unc p8,p0 = f8, 0xc3 ;; //@qnan | @snan 301} 302 303 304{ .mfi 305 ldfpd acosf_coeff_P7,acosf_coeff_P6 = [ACOSF_Addr1],16 306 fma.s1 acosf_t2 = acosf_t,acosf_t,f0 307 nop.i 999 308} 309{ .mfi 310 ldfpd acosf_coeff_P3,acosf_coeff_P8 = [ACOSF_Addr2],16 311 fma.s1 acosf_x4 = acosf_x2,acosf_x2,f0 312 nop.i 999;; 313} 314 315 316{ .mfi 317 ldfpd acosf_coeff_P9,acosf_const_sqrt2by2 = [ACOSF_Addr1] 318 fclass.m.unc p10,p0 = f8, 0x07 //@zero 319 nop.i 999 320} 321{ .mfi 322 ldfpd acosf_coeff_P5,acosf_coeff_P2 = [ACOSF_Addr2],16 323 fma.s1 acosf_x3 = f8,acosf_x2,f0 324 nop.i 999;; 325} 326 327 328{ .mfi 329 ldfd acosf_const_piby2 = [ACOSF_Addr2] 330 frsqrta.s1 acosf_B,p0 = acosf_t 331 nop.i 999 332} 333{ .mfb 334 nop.m 999 335(p8) fma.s.s0 f8 = f8,f1,f0 336(p8) br.ret.spnt b0 ;; // Exit if x=nan 337} 338 339 340{ .mfb 341 nop.m 999 342 fcmp.eq.s1 p6,p0 = acosf_abs_x,f1 343(p10) br.cond.spnt ACOSF_ZERO ;; // Branch if x=0 344} 345 346{ .mfi 347 nop.m 999 348 fcmp.gt.s1 p9,p0 = acosf_abs_x,f1 349 nop.i 999;; 350} 351 352{ .mfi 353 nop.m 999 354 fma.s1 acosf_x8 = acosf_x4,acosf_x4,f0 355 nop.i 999 356} 357{ .mfb 358 nop.m 999 359 fma.s1 acosf_t4 = acosf_t2,acosf_t2,f0 360(p6) br.cond.spnt ACOSF_ABS_ONE ;; // Branch if |x|=1 361} 362 363{ .mfi 364 nop.m 999 365 fma.s1 acosf_x5 = acosf_x2,acosf_x3,f0 366 nop.i 999 367} 368{ .mfb 369(p9) mov GR_Parameter_TAG = 59 370 fma.s1 acosf_yby2 = acosf_t,acosf_1by2,f0 371(p9) br.cond.spnt __libm_error_region ;; // Branch if |x|>1 372} 373 374 375{ .mfi 376 nop.m 999 377 fma.s1 acosf_Az = acosf_t,acosf_B,f0 378 nop.i 999 379} 380{ .mfi 381 nop.m 999 382 fma.s1 acosf_B2 = acosf_B,acosf_B,f0 383 nop.i 999;; 384} 385 386{ .mfi 387 nop.m 999 388 fma.s1 acosf_poly_p1 = f8,acosf_coeff_P1,f0 389 nop.i 999 390} 391{ .mfi 392 nop.m 999 393 fma.s1 acosf_2poly_p1 = acosf_coeff_P1,acosf_t,f1 394 nop.i 999;; 395} 396 397{ .mfi 398 nop.m 999 399 fma.s1 acosf_poly_p3 = acosf_coeff_P4,acosf_x2,acosf_coeff_P3 400 nop.i 999 401} 402{ .mfi 403 nop.m 999 404 fma.s1 acosf_2poly_p6 = acosf_coeff_P7,acosf_t,acosf_coeff_P6 405 nop.i 999;; 406} 407 408{ .mfi 409 nop.m 999 410 fma.s1 acosf_poly_p7 = acosf_x2,acosf_coeff_P8,acosf_coeff_P7 411 nop.i 999 412} 413{ .mfi 414 nop.m 999 415 fma.s1 acosf_2poly_p2 = acosf_coeff_P3,acosf_t,acosf_coeff_P2 416 nop.i 999;; 417} 418 419 420{ .mfi 421 nop.m 999 422 fma.s1 acosf_poly_p5 = acosf_x2,acosf_coeff_P6,acosf_coeff_P5 423 nop.i 999 424} 425{ .mfi 426 nop.m 999 427 fma.s1 acosf_2poly_p4 = acosf_coeff_P5,acosf_t,acosf_coeff_P4 428 nop.i 999;; 429} 430 431 432{ .mfi 433 nop.m 999 434 fma.s1 acosf_x11 = acosf_x8,acosf_x3,f0 435 nop.i 999 436} 437{ .mfi 438 nop.m 999 439 fnma.s1 acosf_dz = acosf_B2,acosf_yby2,acosf_1by2 440 nop.i 999;; 441} 442 443 444{ .mfi 445 nop.m 999 446 fma.s1 acosf_poly_p1a = acosf_x2,acosf_poly_p1,f8 447 nop.i 999 448} 449{ .mfi 450 nop.m 999 451 fma.s1 acosf_2poly_p8 = acosf_coeff_P9,acosf_t,acosf_coeff_P8 452 nop.i 999;; 453} 454 455 456// Get the absolute value of x and determine the region in which x lies 457 458{ .mfi 459 nop.m 999 460 fcmp.le.s1 p7,p8 = acosf_abs_x,acosf_const_sqrt2by2 461 nop.i 999 462} 463{ .mfi 464 nop.m 999 465 fma.s1 acosf_poly_p2 = acosf_x2,acosf_poly_p3,acosf_coeff_P2 466 nop.i 999;; 467} 468 469 470{ .mfi 471 nop.m 999 472 fma.s1 acosf_poly_p7a = acosf_x4,acosf_coeff_P9,acosf_poly_p7 473 nop.i 999 474} 475{ .mfi 476 nop.m 999 477 fma.s1 acosf_2poly_p2a = acosf_2poly_p2,acosf_t2,acosf_2poly_p1 478 nop.i 999;; 479} 480 481 482{ .mfi 483 nop.m 999 484(p8) fma.s1 acosf_sgnx_t4 = acosf_sgn_x,acosf_t4,f0 485 nop.i 999 486} 487{ .mfi 488 nop.m 999 489(p8) fma.s1 acosf_2poly_p4a = acosf_2poly_p6,acosf_t2,acosf_2poly_p4 490 nop.i 999;; 491} 492 493 494{ .mfi 495 nop.m 999 496(p8) fma.s1 acosf_Sz = acosf_5by2,acosf_dz,acosf_3by2 497 nop.i 999 498} 499{ .mfi 500 nop.m 999 501(p8) fma.s1 acosf_d2z = acosf_dz,acosf_dz,f0 502 nop.i 999;; 503} 504 505 506{ .mfi 507 nop.m 999 508(p8) fnma.d.s1 acosf_sgn_x_piby2 = acosf_sgn_x,acosf_const_piby2,acosf_const_piby2 509 nop.i 999 510} 511{ .mfi 512 nop.m 999 513(p7) fma.s1 acosf_poly_Ax = acosf_x5,acosf_poly_p2,acosf_poly_p1a 514 nop.i 999;; 515} 516 517{ .mfi 518 nop.m 999 519(p7) fma.s1 acosf_poly_Bx = acosf_x4,acosf_poly_p7a,acosf_poly_p5 520 nop.i 999 521} 522{ .mfi 523 nop.m 999 524(p8) fma.s1 acosf_sgnx_2poly_p2 = acosf_sgn_x,acosf_2poly_p2a,f0 525 nop.i 999;; 526} 527 528{ .mfi 529 nop.m 999 530 fcmp.eq.s0 p6,p0 = f8,f0 // Only purpose is to set D if x denormal 531 nop.i 999 532} 533{ .mfi 534 nop.m 999 535(p8) fma.s1 acosf_2poly_p4b = acosf_2poly_p8,acosf_t4,acosf_2poly_p4a 536 nop.i 999;; 537} 538 539 540{ .mfi 541 nop.m 999 542(p8) fma.s1 acosf_Fz = acosf_d2z,acosf_Sz,acosf_dz 543 nop.i 999;; 544} 545 546 547{ .mfi 548 nop.m 999 549(p8) fma.d.s1 acosf_Pt = acosf_2poly_p4b,acosf_sgnx_t4,acosf_sgnx_2poly_p2 550 nop.i 999;; 551} 552 553{ .mfi 554 nop.m 999 555(p8) fma.d.s1 acosf_z = acosf_Az,acosf_Fz,acosf_Az 556 nop.i 999 ;; 557} 558 559{ .mfi 560 nop.m 999 561(p7) fma.d.s1 acosf_sinf1 = acosf_x11,acosf_poly_Bx,acosf_poly_Ax 562 nop.i 999;; 563} 564 565.pred.rel "mutex",p8,p7 //acosf_pred_GTsqrt2by2,acosf_pred_LEsqrt2by2 566{ .mfi 567 nop.m 999 568(p8) fma.s.s0 f8 = acosf_z,acosf_Pt,acosf_sgn_x_piby2 569 nop.i 999 570} 571 572{ .mfb 573 nop.m 999 574(p7) fms.s.s0 f8 = acosf_const_piby2,f1,acosf_sinf1 575 br.ret.sptk b0 ;; 576} 577 578ACOSF_ZERO: 579// Here if x=0 580{ .mfb 581 nop.m 999 582 fma.s.s0 f8 = acosf_const_piby2,f1,f0 // acosf(0)=pi/2 583 br.ret.sptk b0 ;; 584} 585 586 587ACOSF_ABS_ONE: 588.pred.rel "mutex",p11,p12 589// Here if |x|=1 590{ .mfi 591 nop.m 999 592(p11) fma.s.s0 f8 = acosf_const_piby2,f1,acosf_const_piby2 // acosf(-1)=pi 593 nop.i 999 594} 595{ .mfb 596 nop.m 999 597(p12) fma.s.s0 f8 = f1,f0,f0 // acosf(1)=0 598 br.ret.sptk b0 ;; 599} 600 601GLOBAL_LIBM_END(acosf) 602libm_alias_float_other (acos, acos) 603 604 605// Stack operations when calling error support. 606// (1) (2) 607// sp -> + psp -> + 608// | | 609// | | <- GR_Y 610// | | 611// | <-GR_Y Y2->| 612// | | 613// | | <- GR_X 614// | | 615// sp-64 -> + sp -> + 616// save ar.pfs save b0 617// save gp 618 619 620// Stack operations when calling error support. 621// (3) (call) (4) 622// psp -> + sp -> + 623// | | 624// R3 ->| <- GR_RESULT | -> f8 625// | | 626// Y2 ->| <- GR_Y | 627// | | 628// X1 ->| | 629// | | 630// sp -> + + 631// restore gp 632// restore ar.pfs 633 634 635LOCAL_LIBM_ENTRY(__libm_error_region) 636.prologue 637{ .mfi 638 add GR_Parameter_Y=-32,sp // Parameter 2 value 639 nop.f 999 640.save ar.pfs,GR_SAVE_PFS 641 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 642} 643{ .mfi 644.fframe 64 645 add sp=-64,sp // Create new stack 646 nop.f 0 647 mov GR_SAVE_GP=gp // Save gp 648};; 649{ .mmi 650 stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack 651 add GR_Parameter_X = 16,sp // Parameter 1 address 652.save b0, GR_SAVE_B0 653 mov GR_SAVE_B0=b0 // Save b0 654};; 655 656.body 657{ .mfi 658 nop.m 0 659 frcpa.s0 f9,p0 = f0,f0 660 nop.i 0 661};; 662 663{ .mib 664 stfs [GR_Parameter_X] = f8 // Store Parameter 1 on stack 665 add GR_Parameter_RESULT = 0,GR_Parameter_Y 666 nop.b 0 // Parameter 3 address 667} 668{ .mib 669 stfs [GR_Parameter_Y] = f9 // Store Parameter 3 on stack 670 add GR_Parameter_Y = -16,GR_Parameter_Y 671 br.call.sptk b0=__libm_error_support# // Call error handling function 672};; 673{ .mmi 674 nop.m 0 675 nop.m 0 676 add GR_Parameter_RESULT = 48,sp 677};; 678 679{ .mmi 680 ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack 681.restore sp 682 add sp = 64,sp // Restore stack pointer 683 mov b0 = GR_SAVE_B0 // Restore return address 684};; 685{ .mib 686 mov gp = GR_SAVE_GP // Restore gp 687 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 688 br.ret.sptk b0 // Return 689};; 690 691LOCAL_LIBM_END(__libm_error_region) 692 693.type __libm_error_support#,@function 694.global __libm_error_support# 695