1.file "fmod.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//==================================================================== 41// 02/02/00 Initial version 42// 03/02/00 New Algorithm 43// 04/04/00 Unwind support added 44// 08/15/00 Bundle added after call to __libm_error_support to properly 45// set [the previously overwritten] GR_Parameter_RESULT. 46// 11/28/00 Set FR_Y to f9 47// 03/11/02 Fixed flags for fmod(qnan,zero) 48// 05/20/02 Cleaned up namespace and sf0 syntax 49// 02/10/03 Reordered header: .section, .global, .proc, .align 50// 04/28/03 Fix: fmod(sNaN,0) no longer sets errno 51// 52// API 53//==================================================================== 54// double fmod(double,double); 55// 56// Overview of operation 57//==================================================================== 58// fmod(a,b)=a-i*b, 59// where i is an integer such that, if b!=0, 60// |i|<|a/b| and |a/b-i|<1 61// 62// Algorithm 63//==================================================================== 64// a). if |a|<|b|, return a 65// b). get quotient and reciprocal overestimates accurate to 66// 33 bits (q2,y2) 67// c). if the exponent difference (exponent(a)-exponent(b)) 68// is less than 32, truncate quotient to integer and 69// finish in one iteration 70// d). if exponent(a)-exponent(b)>=32 (q2>=2^32) 71// round quotient estimate to single precision (k=RN(q2)), 72// calculate partial remainder (a'=a-k*b), 73// get quotient estimate (a'*y2), and repeat from c). 74// 75// Special cases 76//==================================================================== 77// b=+/-0: return NaN, call libm_error_support 78// a=+/-Inf, a=NaN or b=NaN: return NaN 79// 80// Registers used 81//==================================================================== 82// Predicate registers: p6-p11 83// General registers: r2,r29,r32 (ar.pfs), r33-r39 84// Floating point registers: f6-f15 85 86GR_SAVE_B0 = r33 87GR_SAVE_PFS = r34 88GR_SAVE_GP = r35 89GR_SAVE_SP = r36 90 91GR_Parameter_X = r37 92GR_Parameter_Y = r38 93GR_Parameter_RESULT = r39 94GR_Parameter_TAG = r40 95 96FR_X = f10 97FR_Y = f9 98FR_RESULT = f8 99 100 101.section .text 102GLOBAL_IEEE754_ENTRY(fmod) 103 104// inputs in f8, f9 105// result in f8 106 107{ .mfi 108 alloc r32=ar.pfs,1,4,4,0 109 // f6=|a| 110 fmerge.s f6=f0,f8 111 mov r2 = 0x0ffdd 112} 113 {.mfi 114 nop.m 0 115 // f7=|b| 116 fmerge.s f7=f0,f9 117 nop.i 0;; 118} 119 120{ .mfi 121 setf.exp f11 = r2 122 // (1) y0 123 frcpa.s1 f10,p6=f6,f7 124 nop.i 0 125} 126 127// Y +-NAN, +-inf, +-0? p7 128{ .mfi 129 nop.m 999 130 fclass.m.unc p7,p0 = f9, 0xe7 131 nop.i 999;; 132} 133 134// qnan snan inf norm unorm 0 -+ 135// 1 1 1 0 0 0 11 136// e 3 137// X +-NAN, +-inf, ? p9 138 139{ .mfi 140 nop.m 999 141 fclass.m.unc p9,p0 = f8, 0xe3 142 nop.i 999 143} 144 145// |x| < |y|? Return x p8 146{ .mfi 147 nop.m 999 148 fcmp.lt.unc.s1 p8,p0 = f6,f7 149 nop.i 999 ;; 150} 151 152{ .mfi 153 nop.m 0 154 // normalize y (if |x|<|y|) 155 (p8) fma.s0 f9=f9,f1,f0 156 nop.i 0;; 157} 158 159 { .mfi 160 mov r2=0x1001f 161 // (2) q0=a*y0 162 (p6) fma.s1 f13=f6,f10,f0 163 nop.i 0 164} 165{ .mfi 166 nop.m 0 167 // (3) e0 = 1 - b * y0 168 (p6) fnma.s1 f12=f7,f10,f1 169 nop.i 0;; 170} 171 172 {.mfi 173 nop.m 0 174 // normalize x (if |x|<|y|) 175 (p8) fma.d.s0 f8=f8,f1,f0 176 nop.i 0 177} 178{.bbb 179 (p9) br.cond.spnt FMOD_X_NAN_INF 180 (p7) br.cond.spnt FMOD_Y_NAN_INF_ZERO 181 // if |x|<|y|, return 182 (p8) br.ret.spnt b0;; 183} 184 185 {.mfi 186 nop.m 0 187 // normalize x 188 fma.s0 f6=f6,f1,f0 189 nop.i 0 190} 191{.mfi 192 nop.m 0 193 // normalize y 194 fma.s0 f7=f7,f1,f0 195 nop.i 0;; 196} 197 198 {.mfi 199 // f15=2^32 200 setf.exp f15=r2 201 // (4) q1=q0+e0*q0 202 (p6) fma.s1 f13=f12,f13,f13 203 nop.i 0 204} 205{ .mfi 206 nop.m 0 207 // (5) e1 = e0 * e0 + 2^-34 208 (p6) fma.s1 f14=f12,f12,f11 209 nop.i 0;; 210} 211{.mlx 212 nop.m 0 213 movl r2=0x33a00000;; 214} 215{ .mfi 216 nop.m 0 217 // (6) y1 = y0 + e0 * y0 218 (p6) fma.s1 f10=f12,f10,f10 219 nop.i 0;; 220} 221{.mfi 222 // set f12=1.25*2^{-24} 223 setf.s f12=r2 224 // (7) q2=q1+e1*q1 225 (p6) fma.s1 f13=f13,f14,f13 226 nop.i 0;; 227} 228{.mfi 229 nop.m 0 230 fmerge.s f9=f8,f9 231 nop.i 0 232} 233{ .mfi 234 nop.m 0 235 // (8) y2 = y1 + e1 * y1 236 (p6) fma.s1 f10=f14,f10,f10 237 // set p6=0, p10=0 238 cmp.ne.and p6,p10=r0,r0;; 239} 240 241.align 32 242loop53: 243 {.mfi 244 nop.m 0 245 // compare q2, 2^32 246 fcmp.lt.unc.s1 p8,p7=f13,f15 247 nop.i 0 248} 249 {.mfi 250 nop.m 0 251 // will truncate quotient to integer, if exponent<32 (in advance) 252 fcvt.fx.trunc.s1 f11=f13 253 nop.i 0;; 254} 255 {.mfi 256 nop.m 0 257 // if exponent>32, round quotient to single precision (perform in advance) 258 fma.s.s1 f13=f13,f1,f0 259 nop.i 0;; 260} 261 {.mfi 262 nop.m 0 263 // set f12=sgn(a) 264 (p8) fmerge.s f12=f8,f1 265 nop.i 0 266} 267 {.mfi 268 nop.m 0 269 // normalize truncated quotient 270 (p8) fcvt.xf f13=f11 271 nop.i 0;; 272} 273 { .mfi 274 nop.m 0 275 // calculate remainder (assuming f13=RZ(Q)) 276 (p7) fnma.s1 f14=f13,f7,f6 277 nop.i 0 278} 279 {.mfi 280 nop.m 0 281 // also if exponent>32, round quotient to single precision 282 // and subtract 1 ulp: q=q-q*(1.25*2^{-24}) 283 (p7) fnma.s.s1 f11=f13,f12,f13 284 nop.i 0;; 285} 286 287 {.mfi 288 nop.m 0 289 // (p8) calculate remainder (82-bit format) 290 (p8) fnma.s1 f11=f13,f7,f6 291 nop.i 0 292} 293 {.mfi 294 nop.m 0 295 // (p7) calculate remainder (assuming f11=RZ(Q)) 296 (p7) fnma.s1 f6=f11,f7,f6 297 nop.i 0;; 298} 299 300 301 {.mfi 302 nop.m 0 303 // Final iteration (p8): is f6 the correct remainder (quotient was not overestimated) ? 304 (p8) fcmp.lt.unc.s1 p6,p10=f11,f0 305 nop.i 0;; 306} 307 {.mfi 308 nop.m 0 309 // get new quotient estimation: a'*y2 310 (p7) fma.s1 f13=f14,f10,f0 311 nop.i 0 312} 313 {.mfb 314 nop.m 0 315 // was f14=RZ(Q) ? (then new remainder f14>=0) 316 (p7) fcmp.lt.unc.s1 p7,p9=f14,f0 317 nop.b 0;; 318} 319 320 321.pred.rel "mutex",p6,p10 322 {.mfb 323 nop.m 0 324 // add b to estimated remainder (to cover the case when the quotient was overestimated) 325 // also set correct sign by using f9=|b|*sgn(a), f12=sgn(a) 326 (p6) fma.d.s0 f8=f11,f12,f9 327 nop.b 0 328} 329 {.mfb 330 nop.m 0 331 // calculate remainder (single precision) 332 // set correct sign of result before returning 333 (p10) fma.d.s0 f8=f11,f12,f0 334 (p8) br.ret.sptk b0;; 335} 336 {.mfi 337 nop.m 0 338 // if f13!=RZ(Q), get alternative quotient estimation: a''*y2 339 (p7) fma.s1 f13=f6,f10,f0 340 nop.i 0 341} 342 {.mfb 343 nop.m 0 344 // if f14 was RZ(Q), set remainder to f14 345 (p9) mov f6=f14 346 br.cond.sptk loop53;; 347} 348 349 350 351FMOD_X_NAN_INF: 352 353// Y zero ? 354{.mfi 355 nop.m 0 356 fclass.m p10,p0=f8,0xc3 // Test x=nan 357 nop.i 0 358} 359{.mfi 360 nop.m 0 361 fma.s1 f10=f9,f1,f0 362 nop.i 0;; 363} 364 365{.mfi 366 nop.m 0 367 fma.s0 f8=f8,f1,f0 368 nop.i 0 369} 370{.mfi 371 nop.m 0 372(p10) fclass.m p10,p0=f9,0x07 // Test x=nan, and y=zero 373 nop.i 0;; 374} 375 376{.mfb 377 nop.m 0 378 fcmp.eq.unc.s1 p11,p0=f10,f0 379(p10) br.ret.spnt b0;; // Exit with result=x if x=nan and y=zero 380} 381{.mib 382 nop.m 0 383 nop.i 0 384 // if Y zero 385 (p11) br.cond.spnt FMOD_Y_ZERO;; 386} 387 388// X infinity? Return QNAN indefinite 389{ .mfi 390 nop.m 999 391 fclass.m.unc p8,p9 = f8, 0x23 392 nop.i 999;; 393} 394// Y NaN ? 395{.mfi 396 nop.m 999 397(p8) fclass.m p9,p8=f9,0xc3 398 nop.i 0;; 399} 400{.mfi 401 nop.m 999 402(p8) frcpa.s0 f8,p0 = f8,f8 403 nop.i 0 404} 405{ .mfi 406 nop.m 999 407 // also set Denormal flag if necessary 408(p8) fma.s0 f9=f9,f1,f0 409 nop.i 999 ;; 410} 411 412{ .mfb 413 nop.m 999 414(p8) fma.d.s0 f8=f8,f1,f0 415 nop.b 999 ;; 416} 417 418{ .mfb 419 nop.m 999 420(p9) frcpa.s0 f8,p7=f8,f9 421 br.ret.sptk b0 ;; 422} 423 424 425FMOD_Y_NAN_INF_ZERO: 426 427// Y INF 428{ .mfi 429 nop.m 999 430 fclass.m.unc p7,p0 = f9, 0x23 431 nop.i 999 ;; 432} 433 434{ .mfb 435 nop.m 999 436(p7) fma.d.s0 f8=f8,f1,f0 437(p7) br.ret.spnt b0 ;; 438} 439 440// Y NAN? 441{ .mfi 442 nop.m 999 443 fclass.m.unc p9,p0 = f9, 0xc3 444 nop.i 999 ;; 445} 446 447{ .mfb 448 nop.m 999 449(p9) fma.d.s0 f8=f9,f1,f0 450(p9) br.ret.spnt b0 ;; 451} 452 453FMOD_Y_ZERO: 454// Y zero? Must be zero at this point 455// because it is the only choice left. 456// Return QNAN indefinite 457 458{.mfi 459 nop.m 0 460 // set Invalid 461 frcpa.s0 f12,p0=f0,f0 462 nop.i 0 463} 464// X NAN? 465{ .mfi 466 nop.m 999 467 fclass.m.unc p9,p10 = f8, 0xc3 468 nop.i 999 ;; 469} 470{ .mfi 471 nop.m 999 472(p10) fclass.nm p9,p10 = f8, 0xff 473 nop.i 999 ;; 474} 475 476{.mfi 477 nop.m 999 478 (p9) frcpa.s0 f11,p7=f8,f0 479 nop.i 0;; 480} 481 482{ .mfi 483 nop.m 999 484(p10) frcpa.s0 f11,p7 = f9,f9 485 mov GR_Parameter_TAG = 121 ;; 486} 487 488{ .mfi 489 nop.m 999 490 fmerge.s f10 = f8, f8 491 nop.i 999 492} 493 494{ .mfb 495 nop.m 999 496 fma.d.s0 f8=f11,f1,f0 497 br.sptk __libm_error_region;; 498} 499 500GLOBAL_IEEE754_END(fmod) 501libm_alias_double_other (__fmod, fmod) 502 503LOCAL_LIBM_ENTRY(__libm_error_region) 504.prologue 505{ .mfi 506 add GR_Parameter_Y=-32,sp // Parameter 2 value 507 nop.f 0 508.save ar.pfs,GR_SAVE_PFS 509 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 510} 511{ .mfi 512.fframe 64 513 add sp=-64,sp // Create new stack 514 nop.f 0 515 mov GR_SAVE_GP=gp // Save gp 516};; 517{ .mmi 518 stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack 519 add GR_Parameter_X = 16,sp // Parameter 1 address 520.save b0, GR_SAVE_B0 521 mov GR_SAVE_B0=b0 // Save b0 522};; 523.body 524{ .mib 525 stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack 526 add GR_Parameter_RESULT = 0,GR_Parameter_Y 527 nop.b 0 // Parameter 3 address 528} 529{ .mib 530 stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack 531 add GR_Parameter_Y = -16,GR_Parameter_Y 532 br.call.sptk b0=__libm_error_support# // Call error handling function 533};; 534{ .mmi 535 nop.m 0 536 nop.m 0 537 add GR_Parameter_RESULT = 48,sp 538};; 539{ .mmi 540 ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack 541.restore sp 542 add sp = 64,sp // Restore stack pointer 543 mov b0 = GR_SAVE_B0 // Restore return address 544};; 545{ .mib 546 mov gp = GR_SAVE_GP // Restore gp 547 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 548 br.ret.sptk b0 // Return 549};; 550 551LOCAL_LIBM_END(__libm_error_region) 552 553 554.type __libm_error_support#,@function 555.global __libm_error_support# 556