1.file "tancotf.s" 2 3 4// Copyright (c) 2000 - 2005, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//============================================================== 41// 02/02/00 Initial version 42// 04/04/00 Unwind support added 43// 12/27/00 Improved speed 44// 02/21/01 Updated to call tanl 45// 05/30/02 Improved speed, added cotf. 46// 11/25/02 Added explicit completer on fnorm 47// 02/10/03 Reordered header: .section, .global, .proc, .align 48// 04/17/03 Eliminated redundant stop bits 49// 03/31/05 Reformatted delimiters between data tables 50// 51// APIs 52//============================================================== 53// float tanf(float) 54// float cotf(float) 55// 56// Algorithm Description for tanf 57//============================================================== 58// The tanf function computes the principle value of the tangent of x, 59// where x is radian argument. 60// 61// There are 5 paths: 62// 1. x = +/-0.0 63// Return tanf(x) = +/-0.0 64// 65// 2. x = [S,Q]NaN 66// Return tanf(x) = QNaN 67// 68// 3. x = +/-Inf 69// Return tanf(x) = QNaN 70// 71// 4. x = r + (Pi/2)*N, N = RoundInt(x*(2/Pi)), N is even, |r|<Pi/4 72// Return tanf(x) = P19(r) = A1*r + A3*r^3 + A5*r^5 + ... + A19*r^19 = 73// = r*(A1 + A3*t + A5*t^2 + ... + A19*t^9) = r*P9(t), where t = r^2 74// 75// 5. x = r + (Pi/2)*N, N = RoundInt(x*(2/Pi)), N is odd, |r|<Pi/4 76// Return tanf(x) = -1/r + P11(r) = -1/r + B1*r + B3*r^3 + ... + B11*r^11 = 77// = -1/r + r*(B1 + B3*t + B5*t^2 + ... + B11*t^5) = -1/r + r*P11(t), 78// where t = r^2 79// 80// Algorithm Description for cotf 81//============================================================== 82// The cotf function computes the principle value of the cotangent of x, 83// where x is radian argument. 84// 85// There are 5 paths: 86// 1. x = +/-0.0 87// Return cotf(x) = +/-Inf and error handling is called 88// 89// 2. x = [S,Q]NaN 90// Return cotf(x) = QNaN 91// 92// 3. x = +/-Inf 93// Return cotf(x) = QNaN 94// 95// 4. x = r + (Pi/2)*N, N = RoundInt(x*(2/Pi)), N is odd, |r|<Pi/4 96// Return cotf(x) = P19(-r) = A1*(-r) + A3*(-r^3) + ... + A19*(-r^19) = 97// = -r*(A1 + A3*t + A5*t^2 + ... + A19*t^9) = -r*P9(t), where t = r^2 98// 99// 5. x = r + (Pi/2)*N, N = RoundInt(x*(2/Pi)), N is even, |r|<Pi/4 100// Return cotf(x) = 1/r + P11(-r) = 1/r + B1*(-r) + ... + B11*(-r^11) = 101// = 1/r - r*(B1 + B3*t + B5*t^2 + ... + B11*t^5) = 1/r - r*P11(t), 102// where t = r^2 103// 104// We set p10 and clear p11 if computing tanf, vice versa for cotf. 105// 106// 107// Registers used 108//============================================================== 109// Floating Point registers used: 110// f8, input 111// f32 -> f80 112// 113// General registers used: 114// r14 -> r23, r32 -> r39 115// 116// Predicate registers used: 117// p6 -> p13 118// 119// Assembly macros 120//============================================================== 121// integer registers 122rExp = r14 123rSignMask = r15 124rRshf = r16 125rScFctrExp = r17 126rIntN = r18 127rSigRcpPiby2 = r19 128rScRshf = r20 129rCoeffA = r21 130rCoeffB = r22 131rExpCut = r23 132 133GR_SAVE_B0 = r33 134GR_SAVE_PFS = r34 135GR_SAVE_GP = r35 136GR_Parameter_X = r36 137GR_Parameter_Y = r37 138GR_Parameter_RESULT = r38 139GR_Parameter_Tag = r39 140 141//============================================================== 142// floating point registers 143fScRcpPiby2 = f32 144fScRshf = f33 145fNormArg = f34 146fScFctr = f35 147fRshf = f36 148fShiftedN = f37 149fN = f38 150fR = f39 151fA01 = f40 152fA03 = f41 153fA05 = f42 154fA07 = f43 155fA09 = f44 156fA11 = f45 157fA13 = f46 158fA15 = f47 159fA17 = f48 160fA19 = f49 161fB01 = f50 162fB03 = f51 163fB05 = f52 164fB07 = f53 165fB09 = f54 166fB11 = f55 167fA03_01 = f56 168fA07_05 = f57 169fA11_09 = f58 170fA15_13 = f59 171fA19_17 = f60 172fA11_05 = f61 173fA19_13 = f62 174fA19_05 = f63 175fRbyA03_01 = f64 176fB03_01 = f65 177fB07_05 = f66 178fB11_09 = f67 179fB11_05 = f68 180fRbyB03_01 = f69 181fRbyB11_01 = f70 182fRp2 = f71 183fRp4 = f72 184fRp8 = f73 185fRp5 = f74 186fY0 = f75 187fY1 = f76 188fD = f77 189fDp2 = f78 190fInvR = f79 191fPiby2 = f80 192//============================================================== 193 194 195RODATA 196.align 16 197 198LOCAL_OBJECT_START(coeff_A) 199data8 0x3FF0000000000000 // A1 = 1.00000000000000000000e+00 200data8 0x3FD5555556BCE758 // A3 = 3.33333334641442641606e-01 201data8 0x3FC111105C2DAE48 // A5 = 1.33333249100689099175e-01 202data8 0x3FABA1F876341060 // A7 = 5.39701122561673229739e-02 203data8 0x3F965FB86D12A38D // A9 = 2.18495194027670719750e-02 204data8 0x3F8265F62415F9D6 // A11 = 8.98353860497717439465e-03 205data8 0x3F69E3AE64CCF58D // A13 = 3.16032468108912746342e-03 206data8 0x3F63920D09D0E6F6 // A15 = 2.38897844840557235331e-03 207LOCAL_OBJECT_END(coeff_A) 208 209LOCAL_OBJECT_START(coeff_B) 210data8 0xC90FDAA22168C235, 0x3FFF // pi/2 211data8 0x3FD55555555358DB // B1 = 3.33333333326107426583e-01 212data8 0x3F96C16C252F643F // B3 = 2.22222230621336129239e-02 213data8 0x3F61566243AB3C60 // B5 = 2.11638633968606896785e-03 214data8 0x3F2BC1169BD4438B // B7 = 2.11748132564551094391e-04 215data8 0x3EF611B4CEA056A1 // B9 = 2.10467959860990200942e-05 216data8 0x3EC600F9E32194BF // B11 = 2.62305891234274186608e-06 217data8 0xBF42BA7BCC177616 // A17 =-5.71546981685324877205e-04 218data8 0x3F4F2614BC6D3BB8 // A19 = 9.50584530849832782542e-04 219LOCAL_OBJECT_END(coeff_B) 220 221 222.section .text 223 224LOCAL_LIBM_ENTRY(cotf) 225 226{ .mlx 227 getf.exp rExp = f8 // ***** Get 2^17 * s + E 228 movl rSigRcpPiby2= 0xA2F9836E4E44152A // significand of 2/Pi 229} 230{ .mlx 231 addl rCoeffA = @ltoff(coeff_A), gp 232 movl rScRshf = 0x47e8000000000000 // 1.5*2^(63+63+1) 233} 234;; 235 236{ .mfi 237 alloc r32 = ar.pfs, 0, 4, 4, 0 238 fclass.m p9, p0 = f8, 0xc3 // Test for x=nan 239 cmp.eq p11, p10 = r0, r0 // if p11=1 we compute cotf 240} 241{ .mib 242 ld8 rCoeffA = [rCoeffA] 243 mov rExpCut = 0x10009 // cutoff for exponent 244 br.cond.sptk Common_Path 245} 246;; 247 248LOCAL_LIBM_END(cotf) 249 250 251GLOBAL_IEEE754_ENTRY(tanf) 252 253{ .mlx 254 getf.exp rExp = f8 // ***** Get 2^17 * s + E 255 movl rSigRcpPiby2= 0xA2F9836E4E44152A // significand of 2/Pi 256} 257{ .mlx 258 addl rCoeffA = @ltoff(coeff_A), gp 259 movl rScRshf = 0x47e8000000000000 // 1.5*2^(63+63+1) 260} 261;; 262 263{ .mfi 264 alloc r32 = ar.pfs, 0, 4, 4, 0 265 fclass.m p9, p0 = f8, 0xc3 // Test for x=nan 266 cmp.eq p10, p11 = r0, r0 // if p10=1 we compute tandf 267} 268{ .mib 269 ld8 rCoeffA = [rCoeffA] 270 mov rExpCut = 0x10009 // cutoff for exponent 271 nop.b 0 272} 273;; 274 275// Below is common path for both tandf and cotdf 276Common_Path: 277{ .mfi 278 setf.sig fScRcpPiby2 = rSigRcpPiby2 // 2^(63+1)*(2/Pi) 279 fclass.m p8, p0 = f8, 0x23 // Test for x=inf 280 mov rSignMask = 0x1ffff // mask for sign bit 281} 282{ .mlx 283 setf.d fScRshf = rScRshf // 1.5*2^(63+63+1) 284 movl rRshf = 0x43e8000000000000 // 1.5 2^63 for right shift 285} 286;; 287 288{ .mfi 289 and rSignMask = rSignMask, rExp // clear sign bit 290(p10) fclass.m.unc p7, p0 = f8, 0x07 // Test for x=0 (for tanf) 291 mov rScFctrExp = 0xffff-64 // exp of scaling factor 292} 293{ .mfb 294 adds rCoeffB = coeff_B - coeff_A, rCoeffA 295(p9) fma.s.s0 f8 = f8, f1, f8 // Set qnan if x=nan 296(p9) br.ret.spnt b0 // Exit for x=nan 297} 298;; 299 300{ .mfi 301 cmp.ge p6, p0 = rSignMask, rExpCut // p6 = (E => 0x10009) 302(p8) frcpa.s0 f8, p0 = f0, f0 // Set qnan indef if x=inf 303 mov GR_Parameter_Tag = 227 // (cotf) 304} 305{ .mbb 306 ldfe fPiby2 = [rCoeffB], 16 307(p8) br.ret.spnt b0 // Exit for x=inf 308(p6) br.cond.spnt Huge_Argument // Branch if |x|>=2^10 309} 310;; 311 312{ .mfi 313 nop.m 0 314(p11) fclass.m.unc p6, p0 = f8, 0x07 // Test for x=0 (for cotf) 315 nop.i 0 316} 317{ .mfb 318 nop.m 0 319 fnorm.s0 fNormArg = f8 320(p7) br.ret.spnt b0 // Exit for x=0 (for tanf) 321} 322;; 323 324{ .mmf 325 ldfpd fA01, fA03 = [rCoeffA], 16 326 ldfpd fB01, fB03 = [rCoeffB], 16 327 fmerge.s f10 = f8, f8 // Save input for error call 328} 329;; 330 331{ .mmf 332 setf.exp fScFctr = rScFctrExp // get as real 333 setf.d fRshf = rRshf // get right shifter as real 334(p6) frcpa.s0 f8, p0 = f1, f8 // cotf(+-0) = +-Inf 335} 336;; 337 338{ .mmb 339 ldfpd fA05, fA07 = [rCoeffA], 16 340 ldfpd fB05, fB07 = [rCoeffB], 16 341(p6) br.cond.spnt __libm_error_region // call error support if cotf(+-0) 342} 343;; 344 345{ .mmi 346 ldfpd fA09, fA11 = [rCoeffA], 16 347 ldfpd fB09, fB11 = [rCoeffB], 16 348 nop.i 0 349} 350;; 351 352{ .mfi 353 nop.m 0 354 fma.s1 fShiftedN = fNormArg,fScRcpPiby2,fScRshf // x*2^70*(2/Pi)+ScRshf 355 nop.i 0 356} 357;; 358 359{ .mfi 360 nop.m 0 361 fms.s1 fN = fShiftedN, fScFctr, fRshf // N = Y*2^(-70) - Rshf 362 nop.i 0 363} 364;; 365 366.pred.rel "mutex", p10, p11 367{ .mfi 368 getf.sig rIntN = fShiftedN // get N as integer 369(p10) fnma.s1 fR = fN, fPiby2, fNormArg // R = x - (Pi/2)*N (tanf) 370 nop.i 0 371} 372{ .mfi 373 nop.m 0 374(p11) fms.s1 fR = fN, fPiby2, fNormArg // R = (Pi/2)*N - x (cotf) 375 nop.i 0 376} 377;; 378 379{ .mmi 380 ldfpd fA13, fA15 = [rCoeffA], 16 381 ldfpd fA17, fA19 = [rCoeffB], 16 382 nop.i 0 383} 384;; 385 386Return_From_Huges: 387{ .mfi 388 nop.m 0 389 fma.s1 fRp2 = fR, fR, f0 // R^2 390(p11) add rIntN = 0x1, rIntN // N = N + 1 (cotf) 391} 392;; 393 394{ .mfi 395 nop.m 0 396 frcpa.s1 fY0, p0 = f1, fR // Y0 ~ 1/R 397 tbit.z p8, p9 = rIntN, 0 // p8=1 if N is even 398} 399;; 400 401// Below are mixed polynomial calculations (mixed for even and odd N) 402{ .mfi 403 nop.m 0 404(p9) fma.s1 fB03_01 = fRp2, fB03, fB01 // R^2*B3 + B1 405 nop.i 0 406} 407{ .mfi 408 nop.m 0 409 fma.s1 fRp4 = fRp2, fRp2, f0 // R^4 410 nop.i 0 411} 412;; 413 414{ .mfi 415 nop.m 0 416(p8) fma.s1 fA15_13 = fRp2, fA15, fA13 // R^2*A15 + A13 417 nop.i 0 418} 419{ .mfi 420 nop.m 0 421(p8) fma.s1 fA19_17 = fRp2, fA19, fA17 // R^2*A19 + A17 422 nop.i 0 423} 424;; 425 426{ .mfi 427 nop.m 0 428(p8) fma.s1 fA07_05 = fRp2, fA07, fA05 // R^2*A7 + A5 429 nop.i 0 430} 431{ .mfi 432 nop.m 0 433(p8) fma.s1 fA11_09 = fRp2, fA11, fA09 // R^2*A11 + A9 434 nop.i 0 435} 436;; 437 438{ .mfi 439 nop.m 0 440(p9) fma.s1 fB07_05 = fRp2, fB07, fB05 // R^2*B7 + B5 441 nop.i 0 442} 443{ .mfi 444 nop.m 0 445(p9) fma.s1 fB11_09 = fRp2, fB11, fB09 // R^2*B11 + B9 446 nop.i 0 447} 448;; 449 450{ .mfi 451 nop.m 0 452(p9) fnma.s1 fD = fR, fY0, f1 // D = 1 - R*Y0 453 nop.i 0 454} 455{ .mfi 456 nop.m 0 457(p8) fma.s1 fA03_01 = fRp2, fA03, fA01 // R^2*A3 + A1 458 nop.i 0 459} 460;; 461 462{ .mfi 463 nop.m 0 464 fma.s1 fRp8 = fRp4, fRp4, f0 // R^8 465 nop.i 0 466} 467{ .mfi 468 nop.m 0 469 fma.s1 fRp5 = fR, fRp4, f0 // R^5 470 nop.i 0 471} 472;; 473 474{ .mfi 475 nop.m 0 476(p8) fma.s1 fA11_05 = fRp4, fA11_09, fA07_05 // R^4*(R^2*A11 + A9) + ... 477 nop.i 0 478} 479{ .mfi 480 nop.m 0 481(p8) fma.s1 fA19_13 = fRp4, fA19_17, fA15_13 // R^4*(R^2*A19 + A17) + .. 482 nop.i 0 483} 484;; 485 486{ .mfi 487 nop.m 0 488(p9) fma.s1 fB11_05 = fRp4, fB11_09, fB07_05 // R^4*(R^2*B11 + B9) + ... 489 nop.i 0 490} 491{ .mfi 492 nop.m 0 493(p9) fma.s1 fRbyB03_01 = fR, fB03_01, f0 // R*(R^2*B3 + B1) 494 nop.i 0 495} 496;; 497 498{ .mfi 499 nop.m 0 500(p9) fma.s1 fY1 = fY0, fD, fY0 // Y1 = Y0*D + Y0 501 nop.i 0 502} 503{ .mfi 504 nop.m 0 505(p9) fma.s1 fDp2 = fD, fD, f0 // D^2 506 nop.i 0 507} 508;; 509 510{ .mfi 511 nop.m 0 512 // R^8*(R^6*A19 + R^4*A17 + R^2*A15 + A13) + R^6*A11 + R^4*A9 + R^2*A7 + A5 513(p8) fma.d.s1 fA19_05 = fRp8, fA19_13, fA11_05 514 nop.i 0 515} 516{ .mfi 517 nop.m 0 518(p8) fma.d.s1 fRbyA03_01 = fR, fA03_01, f0 // R*(R^2*A3 + A1) 519 nop.i 0 520} 521;; 522 523{ .mfi 524 nop.m 0 525(p9) fma.d.s1 fInvR = fY1, fDp2, fY1 // 1/R = Y1*D^2 + Y1 526 nop.i 0 527} 528{ .mfi 529 nop.m 0 530 // R^5*(R^6*B11 + R^4*B9 + R^2*B7 + B5) + R^3*B3 + R*B1 531(p9) fma.d.s1 fRbyB11_01 = fRp5, fB11_05, fRbyB03_01 532 nop.i 0 533} 534;; 535 536.pred.rel "mutex", p8, p9 537{ .mfi 538 nop.m 0 539 // Result = R^5*(R^14*A19 + R^12*A17 + R^10*A15 + ...) + R^3*A3 + R*A1 540(p8) fma.s.s0 f8 = fRp5, fA19_05, fRbyA03_01 541 nop.i 0 542} 543{ .mfb 544 nop.m 0 545 // Result = -1/R + R^11*B11 + R^9*B9 + R^7*B7 + R^5*B5 + R^3*B3 + R*B1 546(p9) fnma.s.s0 f8 = f1, fInvR, fRbyB11_01 547 br.ret.sptk b0 // exit for main path 548} 549;; 550 551GLOBAL_IEEE754_END(tanf) 552libm_alias_float_other (__tan, tan) 553 554 555LOCAL_LIBM_ENTRY(__libm_callout) 556Huge_Argument: 557.prologue 558 559{ .mfi 560 nop.m 0 561 fmerge.s f9 = f0,f0 562.save ar.pfs,GR_SAVE_PFS 563 mov GR_SAVE_PFS=ar.pfs 564} 565;; 566 567{ .mfi 568 mov GR_SAVE_GP=gp 569 nop.f 0 570.save b0, GR_SAVE_B0 571 mov GR_SAVE_B0=b0 572} 573 574.body 575{ .mmb 576 nop.m 999 577 nop.m 999 578(p10) br.cond.sptk.many call_tanl ;; 579} 580 581// Here if we should call cotl (p10=0, p11=1) 582{ .mmb 583 nop.m 999 584 nop.m 999 585 br.call.sptk.many b0=__libm_cotl# ;; 586} 587 588{ .mfi 589 mov gp = GR_SAVE_GP 590 fnorm.s.s0 f8 = f8 591 mov b0 = GR_SAVE_B0 592} 593;; 594 595{ .mib 596 nop.m 999 597 mov ar.pfs = GR_SAVE_PFS 598 br.ret.sptk b0 599;; 600} 601 602// Here if we should call tanl (p10=1, p11=0) 603call_tanl: 604{ .mmb 605 nop.m 999 606 nop.m 999 607 br.call.sptk.many b0=__libm_tanl# ;; 608} 609 610{ .mfi 611 mov gp = GR_SAVE_GP 612 fnorm.s.s0 f8 = f8 613 mov b0 = GR_SAVE_B0 614} 615;; 616 617{ .mib 618 nop.m 999 619 mov ar.pfs = GR_SAVE_PFS 620 br.ret.sptk b0 621;; 622} 623 624LOCAL_LIBM_END(__libm_callout) 625 626.type __libm_tanl#,@function 627.global __libm_tanl# 628.type __libm_cotl#,@function 629.global __libm_cotl# 630 631 632LOCAL_LIBM_ENTRY(__libm_error_region) 633.prologue 634 635// (1) 636{ .mfi 637 add GR_Parameter_Y=-32,sp // Parameter 2 value 638 nop.f 0 639.save ar.pfs,GR_SAVE_PFS 640 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 641} 642{ .mfi 643.fframe 64 644 add sp=-64,sp // Create new stack 645 nop.f 0 646 mov GR_SAVE_GP=gp // Save gp 647};; 648 649// (2) 650{ .mmi 651 stfs [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack 652 add GR_Parameter_X = 16,sp // Parameter 1 address 653.save b0, GR_SAVE_B0 654 mov GR_SAVE_B0=b0 // Save b0 655};; 656 657.body 658// (3) 659{ .mib 660 stfs [GR_Parameter_X] = f10 // STORE Parameter 1 on stack 661 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address 662 nop.b 0 663} 664{ .mib 665 stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack 666 add GR_Parameter_Y = -16,GR_Parameter_Y 667 br.call.sptk b0=__libm_error_support# // Call error handling function 668};; 669{ .mmi 670 nop.m 0 671 nop.m 0 672 add GR_Parameter_RESULT = 48,sp 673};; 674 675// (4) 676{ .mmi 677 ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack 678.restore sp 679 add sp = 64,sp // Restore stack pointer 680 mov b0 = GR_SAVE_B0 // Restore return address 681};; 682{ .mib 683 mov gp = GR_SAVE_GP // Restore gp 684 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 685 br.ret.sptk b0 // Return 686};; 687 688LOCAL_LIBM_END(__libm_error_region) 689 690.type __libm_error_support#,@function 691.global __libm_error_support# 692