1.file "fmodl.s" 2 3 4// Copyright (c) 2000 - 2004, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//==================================================================== 41// 02/02/00 Initial version 42// 03/02/00 New Algorithm 43// 04/04/00 Unwind support added 44// 08/15/00 Bundle added after call to __libm_error_support to properly 45// set [ the previously overwritten ] GR_Parameter_RESULT. 46// 11/28/00 Set FR_Y to f9 47// 03/11/02 Fixed flags for fmodl(qnan, zero) 48// 05/20/02 Cleaned up namespace and sf0 syntax 49// 02/10/03 Reordered header:.section,.global,.proc,.align 50// 04/28/03 Fix: fmod(sNaN, 0) no longer sets errno 51// 11/23/04 Reformatted routine and improved speed 52// 53// API 54//==================================================================== 55// long double fmodl(long double, long double); 56// 57// Overview of operation 58//==================================================================== 59// fmod(a, b)= a-i*b, 60// where i is an integer such that, if b!= 0, 61// |i|<|a/b| and |a/b-i|<1 62// 63// Algorithm 64//==================================================================== 65// a). if |a|<|b|, return a 66// b). get quotient and reciprocal overestimates accurate to 67// 33 bits (q2, y2) 68// c). if the exponent difference (exponent(a)-exponent(b)) 69// is less than 32, truncate quotient to integer and 70// finish in one iteration 71// d). if exponent(a)-exponent(b)>= 32 (q2>= 2^32) 72// round quotient estimate to single precision (k= RN(q2)), 73// calculate partial remainder (a'= a-k*b), 74// get quotient estimate (a'*y2), and repeat from c). 75// 76// Registers used 77//==================================================================== 78 79GR_SMALLBIASEXP = r2 80GR_2P32 = r3 81GR_SMALLBIASEXP = r20 82GR_ROUNDCONST = r21 83GR_SIG_B = r22 84GR_ARPFS = r23 85GR_TMP1 = r24 86GR_TMP2 = r25 87GR_TMP3 = r26 88 89GR_SAVE_B0 = r33 90GR_SAVE_PFS = r34 91GR_SAVE_GP = r35 92GR_SAVE_SP = r36 93 94GR_Parameter_X = r37 95GR_Parameter_Y = r38 96GR_Parameter_RESULT = r39 97GR_Parameter_TAG = r40 98 99FR_X = f10 100FR_Y = f9 101FR_RESULT = f8 102 103FR_ABS_A = f6 104FR_ABS_B = f7 105FR_Y_INV = f10 106FR_SMALLBIAS = f11 107FR_E0 = f12 108FR_Q = f13 109FR_E1 = f14 110FR_2P32 = f15 111FR_TMPX = f32 112FR_TMPY = f33 113FR_ROUNDCONST = f34 114FR_QINT = f35 115FR_QRND24 = f36 116FR_NORM_B = f37 117FR_TMP = f38 118FR_TMP2 = f39 119FR_DFLAG = f40 120FR_Y_INV0 = f41 121FR_Y_INV1 = f42 122FR_Q0 = f43 123FR_Q1 = f44 124FR_QINT_Z = f45 125FR_QREM = f46 126FR_B_SGN_A = f47 127 128.section .text 129GLOBAL_IEEE754_ENTRY(fmodl) 130 131// inputs in f8, f9 132// result in f8 133 134{ .mfi 135 getf.sig GR_SIG_B = f9 136 // FR_ABS_A = |a| 137 fmerge.s FR_ABS_A = f0, f8 138 mov GR_SMALLBIASEXP = 0x0ffdd 139} 140{ .mfi 141 nop.m 0 142 // FR_ABS_B = |b| 143 fmerge.s FR_ABS_B = f0, f9 144 nop.i 0 145} 146;; 147 148{ .mfi 149 setf.exp FR_SMALLBIAS = GR_SMALLBIASEXP 150 // (1) y0 151 frcpa.s1 FR_Y_INV0, p6 = FR_ABS_A, FR_ABS_B 152 nop.i 0 153} 154;; 155 156{ .mlx 157 nop.m 0 158 movl GR_ROUNDCONST = 0x33a00000 159} 160;; 161 162// eliminate special cases 163{ .mmi 164 nop.m 0 165 nop.m 0 166 // y pseudo-zero ? 167 cmp.eq p7, p10 = GR_SIG_B, r0 168} 169;; 170 171// set p7 if b +/-NAN, +/-inf, +/-0 172{ .mfi 173 nop.m 0 174 (p10) fclass.m p7, p10 = f9, 0xe7 175 nop.i 0 176} 177;; 178 179{ .mfi 180 mov GR_2P32 = 0x1001f 181 // (2) q0 = a*y0 182 (p6) fma.s1 FR_Q0 = FR_ABS_A, FR_Y_INV0, f0 183 nop.i 0 184} 185{ .mfi 186 nop.m 0 187 // (3) e0 = 1 - b * y0 188 (p6) fnma.s1 FR_E0 = FR_ABS_B, FR_Y_INV0, f1 189 nop.i 0 190} 191;; 192 193// set p9 if a +/-NAN, +/-inf 194{ .mfi 195 nop.m 0 196 fclass.m.unc p9, p11 = f8, 0xe3 197 nop.i 0 198} 199 // |a| < |b|? Return a, p8=1 200{ .mfi 201 nop.m 0 202 (p10) fcmp.lt.unc.s1 p8, p0 = FR_ABS_A, FR_ABS_B 203 nop.i 0 204} 205;; 206 207// set p7 if b +/-NAN, +/-inf, +/-0 208{ .mfi 209 nop.m 0 210 // pseudo-NaN ? 211 (p10) fclass.nm p7, p0 = f9, 0xff 212 nop.i 0 213} 214;; 215 216// set p9 if a is +/-NaN, +/-Inf 217{ .mfi 218 nop.m 0 219 (p11) fclass.nm p9, p0 = f8, 0xff 220 nop.i 0 221} 222{ .mfi 223 nop.m 0 224 // b denormal ? set D flag (if |a|<|b|) 225 (p8) fnma.s0 FR_DFLAG = f9, f1, f9 226 nop.i 0 227} 228;; 229 230{ .mfi 231 // FR_2P32 = 2^32 232 setf.exp FR_2P32 = GR_2P32 233 // (4) q1 = q0+e0*q0 234 (p6) fma.s1 FR_Q1 = FR_E0, FR_Q0, FR_Q0 235 nop.i 0 236} 237{ .mfi 238 nop.m 0 239 // (5) e1 = e0 * e0 + 2^-34 240 (p6) fma.s1 FR_E1 = FR_E0, FR_E0, FR_SMALLBIAS 241 nop.i 0 242} 243;; 244 245{ .mfi 246 nop.m 0 247 // normalize a (if |a|<|b|) 248 (p8) fma.s0 f8 = f8, f1, f0 249 nop.i 0 250} 251{ .bbb 252 (p9) br.cond.spnt FMOD_A_NAN_INF 253 (p7) br.cond.spnt FMOD_B_NAN_INF_ZERO 254 // if |a|<|b|, return 255 (p8) br.ret.spnt b0 256} 257;; 258 259 260{ .mfi 261 nop.m 0 262 // (6) y1 = y0 + e0 * y0 263 (p6) fma.s1 FR_Y_INV1 = FR_E0, FR_Y_INV0, FR_Y_INV0 264 nop.i 0 265} 266;; 267 268{ .mfi 269 nop.m 0 270 // a denormal ? set D flag 271 // b denormal ? set D flag 272 fcmp.eq.s0 p12,p0 = FR_ABS_A, FR_ABS_B 273 nop.i 0 274} 275{ .mfi 276 // set FR_ROUNDCONST = 1.25*2^{-24} 277 setf.s FR_ROUNDCONST = GR_ROUNDCONST 278 // (7) q2 = q1+e1*q1 279 (p6) fma.s1 FR_Q = FR_Q1, FR_E1, FR_Q1 280 nop.i 0 281} 282;; 283 284{ .mfi 285 nop.m 0 286 fmerge.s FR_B_SGN_A = f8, f9 287 nop.i 0 288} 289{ .mfi 290 nop.m 0 291 // (8) y2 = y1 + e1 * y1 292 (p6) fma.s1 FR_Y_INV = FR_E1, FR_Y_INV1, FR_Y_INV1 293 // set p6 = 0, p10 = 0 294 cmp.ne.and p6, p10 = r0, r0 295} 296;; 297 298// will compute integer quotient bits (24 bits per iteration) 299.align 32 300loop64: 301{ .mfi 302 nop.m 0 303 // compare q2, 2^32 304 fcmp.lt.unc.s1 p8, p7 = FR_Q, FR_2P32 305 nop.i 0 306} 307{ .mfi 308 nop.m 0 309 // will truncate quotient to integer, if exponent<32 (in advance) 310 fcvt.fx.trunc.s1 FR_QINT = FR_Q 311 nop.i 0 312} 313;; 314 315{ .mfi 316 nop.m 0 317 // if exponent>32 round quotient to single precision (perform in advance) 318 fma.s.s1 FR_QRND24 = FR_Q, f1, f0 319 nop.i 0 320} 321;; 322 323{ .mfi 324 nop.m 0 325 // set FR_ROUNDCONST = sgn(a) 326 (p8) fmerge.s FR_ROUNDCONST = f8, f1 327 nop.i 0 328} 329{ .mfi 330 nop.m 0 331 // normalize truncated quotient 332 (p8) fcvt.xf FR_QRND24 = FR_QINT 333 nop.i 0 334} 335;; 336 337{ .mfi 338 nop.m 0 339 // calculate remainder (assuming FR_QRND24 = RZ(Q)) 340 (p7) fnma.s1 FR_E1 = FR_QRND24, FR_ABS_B, FR_ABS_A 341 nop.i 0 342} 343{ .mfi 344 nop.m 0 345 // also if exponent>32, round quotient to single precision 346 // and subtract 1 ulp: q = q-q*(1.25*2^{-24}) 347 (p7) fnma.s.s1 FR_QINT_Z = FR_QRND24, FR_ROUNDCONST, FR_QRND24 348 nop.i 0 349} 350;; 351 352{ .mfi 353 nop.m 0 354 // (p8) calculate remainder (82-bit format) 355 (p8) fnma.s1 FR_QREM = FR_QRND24, FR_ABS_B, FR_ABS_A 356 nop.i 0 357} 358{ .mfi 359 nop.m 0 360 // (p7) calculate remainder (assuming FR_QINT_Z = RZ(Q)) 361 (p7) fnma.s1 FR_ABS_A = FR_QINT_Z, FR_ABS_B, FR_ABS_A 362 nop.i 0 363} 364;; 365 366{ .mfi 367 nop.m 0 368 // Final iteration (p8): is FR_ABS_A the correct remainder 369 // (quotient was not overestimated) ? 370 (p8) fcmp.lt.unc.s1 p6, p10 = FR_QREM, f0 371 nop.i 0 372} 373;; 374 375{ .mfi 376 nop.m 0 377 // get new quotient estimation: a'*y2 378 (p7) fma.s1 FR_Q = FR_E1, FR_Y_INV, f0 379 nop.i 0 380} 381{ .mfb 382 nop.m 0 383 // was FR_Q = RZ(Q) ? (then new remainder FR_E1> = 0) 384 (p7) fcmp.lt.unc.s1 p7, p9 = FR_E1, f0 385 nop.b 0 386} 387;; 388 389.pred.rel "mutex", p6, p10 390{ .mfb 391 nop.m 0 392 // add b to estimated remainder (to cover the case when the quotient was 393 // overestimated) 394 // also set correct sign by using 395 // FR_B_SGN_A = |b|*sgn(a), FR_ROUNDCONST = sgn(a) 396 (p6) fma.s0 f8 = FR_QREM, FR_ROUNDCONST, FR_B_SGN_A 397 nop.b 0 398} 399{ .mfb 400 nop.m 0 401 // set correct sign of result before returning: FR_ROUNDCONST = sgn(a) 402 (p10) fma.s0 f8 = FR_QREM, FR_ROUNDCONST, f0 403 (p8) br.ret.sptk b0 404} 405;; 406 407{ .mfi 408 nop.m 0 409 // if f13! = RZ(Q), get alternative quotient estimation: a''*y2 410 (p7) fma.s1 FR_Q = FR_ABS_A, FR_Y_INV, f0 411 nop.i 0 412} 413{ .mfb 414 nop.m 0 415 // if FR_E1 was RZ(Q), set remainder to FR_E1 416 (p9) fma.s1 FR_ABS_A = FR_E1, f1, f0 417 br.cond.sptk loop64 418} 419;; 420 421FMOD_A_NAN_INF: 422 423// b zero ? 424{ .mfi 425 nop.m 0 426 fclass.m p10, p0 = f8, 0xc3 // Test a = nan 427 nop.i 0 428} 429{ .mfi 430 nop.m 0 431 fma.s1 FR_NORM_B = f9, f1, f0 432 nop.i 0 433} 434;; 435 436{ .mfi 437 nop.m 0 438 fma.s0 f8 = f8, f1, f0 439 nop.i 0 440} 441{ .mfi 442 nop.m 0 443 (p10) fclass.m p10, p0 = f9, 0x07 // Test x = nan, and y = zero 444 nop.i 0 445} 446;; 447 448{ .mfb 449 nop.m 0 450 fcmp.eq.unc.s1 p11, p0 = FR_NORM_B, f0 451 (p10) br.ret.spnt b0 // Exit with result = a if a = nan and b = zero 452} 453;; 454 455{ .mib 456 nop.m 0 457 nop.i 0 458 // if Y zero 459 (p11) br.cond.spnt FMOD_B_ZERO 460} 461;; 462 463// a= infinity? Return QNAN indefinite 464{ .mfi 465 // set p7 t0 0 466 cmp.ne p7, p0 = r0, r0 467 fclass.m.unc p8, p9 = f8, 0x23 468 nop.i 0 469} 470;; 471 472// b NaN ? 473{ .mfi 474 nop.m 0 475 (p8) fclass.m p9, p8 = f9, 0xc3 476 nop.i 0 477} 478;; 479 480// b not pseudo-zero ? (GR_SIG_B holds significand) 481{ .mii 482 nop.m 0 483 (p8) cmp.ne p7, p0 = GR_SIG_B, r0 484 nop.i 0 485} 486;; 487 488{ .mfi 489 nop.m 0 490 (p8) frcpa.s0 f8, p0 = f8, f8 491 nop.i 0 492} 493{ .mfi 494 nop.m 0 495 // also set Denormal flag if necessary 496 (p7) fnma.s0 f9 = f9, f1, f9 497 nop.i 0 498} 499;; 500 501{ .mfb 502 nop.m 0 503 (p8) fma.s0 f8 = f8, f1, f0 504 nop.b 0 505} 506;; 507 508{ .mfb 509 nop.m 0 510 (p9) frcpa.s0 f8, p7 = f8, f9 511 br.ret.sptk b0 512} 513;; 514 515FMOD_B_NAN_INF_ZERO: 516// b INF 517{ .mfi 518 nop.m 0 519 fclass.m.unc p7, p0 = f9, 0x23 520 nop.i 0 521} 522;; 523 524{ .mfb 525 nop.m 0 526 (p7) fma.s0 f8 = f8, f1, f0 527 (p7) br.ret.spnt b0 528} 529;; 530 531// b NAN? 532{ .mfi 533 nop.m 0 534 fclass.m.unc p9, p10 = f9, 0xc3 535 nop.i 0 536} 537;; 538 539{ .mfi 540 nop.m 0 541 (p10) fclass.nm p9, p0 = f9, 0xff 542 nop.i 0 543} 544;; 545 546{ .mfb 547 nop.m 0 548 (p9) fma.s0 f8 = f9, f1, f0 549 (p9) br.ret.spnt b0 550} 551;; 552 553FMOD_B_ZERO: 554// Y zero? Must be zero at this point 555// because it is the only choice left. 556// Return QNAN indefinite 557 558{ .mfi 559 nop.m 0 560 // set Invalid 561 frcpa.s0 FR_TMP, p0 = f0, f0 562 nop.i 0 563} 564;; 565 566// a NAN? 567{ .mfi 568 nop.m 0 569 fclass.m.unc p9, p10 = f8, 0xc3 570 nop.i 0 571} 572;; 573 574{ .mfi 575 alloc GR_ARPFS = ar.pfs, 1, 4, 4, 0 576 (p10) fclass.nm p9, p10 = f8, 0xff 577 nop.i 0 578} 579;; 580 581{ .mfi 582 nop.m 0 583 (p9) frcpa.s0 FR_TMP2, p7 = f8, f0 584 nop.i 0 585} 586;; 587 588{ .mfi 589 nop.m 0 590 (p10) frcpa.s0 FR_TMP2, p7 = f9, f9 591 mov GR_Parameter_TAG = 120 592} 593;; 594 595{ .mfi 596 nop.m 0 597 fmerge.s FR_X = f8, f8 598 nop.i 0 599} 600{ .mfb 601 nop.m 0 602 fma.s0 f8 = FR_TMP2, f1, f0 603 br.sptk __libm_error_region 604} 605;; 606 607GLOBAL_IEEE754_END(fmodl) 608libm_alias_ldouble_other (__fmod, fmod) 609 610LOCAL_LIBM_ENTRY(__libm_error_region) 611.prologue 612{ .mfi 613 add GR_Parameter_Y = -32, sp // Parameter 2 value 614 nop.f 0 615.save ar.pfs, GR_SAVE_PFS 616 mov GR_SAVE_PFS = ar.pfs // Save ar.pfs 617} 618{ .mfi 619.fframe 64 620 add sp = -64, sp // Create new stack 621 nop.f 0 622 mov GR_SAVE_GP = gp // Save gp 623} 624;; 625 626{ .mmi 627 stfe [ GR_Parameter_Y ] = FR_Y, 16 // Save Parameter 2 on stack 628 add GR_Parameter_X = 16, sp // Parameter 1 address 629.save b0, GR_SAVE_B0 630 mov GR_SAVE_B0 = b0 // Save b0 631} 632;; 633 634.body 635{ .mib 636 stfe [ GR_Parameter_X ] = FR_X // Store Parameter 1 on stack 637 add GR_Parameter_RESULT = 0, GR_Parameter_Y 638 nop.b 0 // Parameter 3 address 639} 640{ .mib 641 stfe [ GR_Parameter_Y ] = FR_RESULT // Store Parameter 3 on stack 642 add GR_Parameter_Y = -16, GR_Parameter_Y 643 br.call.sptk b0 = __libm_error_support# // Call error handling function 644} 645;; 646 647{ .mmi 648 nop.m 0 649 nop.m 0 650 add GR_Parameter_RESULT = 48, sp 651} 652;; 653 654{ .mmi 655 ldfe f8 = [ GR_Parameter_RESULT ] // Get return result off stack 656.restore sp 657 add sp = 64, sp // Restore stack pointer 658 mov b0 = GR_SAVE_B0 // Restore return address 659} 660;; 661 662{ .mib 663 mov gp = GR_SAVE_GP // Restore gp 664 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 665 br.ret.sptk b0 // Return 666} 667;; 668 669LOCAL_LIBM_END(__libm_error_region) 670 671.type __libm_error_support#, @function 672.global __libm_error_support# 673