1.file "remainderf.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//==================================================================== 41// 02/02/00 Initial version 42// 03/02/00 New algorithm 43// 04/04/00 Unwind support added 44// 07/21/00 Fixed quotient=2^{24*m+23} bug 45// 08/15/00 Bundle added after call to __libm_error_support to properly 46// set [the previously overwritten] GR_Parameter_RESULT. 47// 11/29/00 Set FR_Y to f9 48// 05/20/02 Cleaned up namespace and sf0 syntax 49// 02/10/03 Reordered header: .section, .global, .proc, .align 50// 51// API 52//==================================================================== 53// float remainderf(float,float); 54// 55// Overview of operation 56//==================================================================== 57// remainder(a,b)=a-i*b, 58// where i is an integer such that, if b!=0 and a is finite, 59// |a/b-i|<=1/2. If |a/b-i|=1/2, i is even. 60// 61// Algorithm 62//==================================================================== 63// a). eliminate special cases 64// b). if |a/b|<0.25 (first quotient estimate), return a 65// c). use single precision divide algorithm to get quotient q 66// rounded to 24 bits of precision 67// d). calculate partial remainders (using both q and q-ulp); 68// select one and RZ(a/b) based on the sign of |a|-|b|*q 69// e). if the exponent difference (exponent(a)-exponent(b)) 70// is less than 24 (quotient estimate<2^{24}-2), use RZ(a/b) 71// and sticky bits to round to integer; exit loop and 72// calculate final remainder 73// f). if exponent(a)-exponent(b)>=24, select new value of a as 74// the partial remainder calculated using RZ(a/b); 75// repeat from c). 76// 77// Special cases 78//==================================================================== 79// a=+/- Inf, or b=+/-0: return NaN, call libm_error_support 80// a=NaN or b=NaN: return NaN 81// 82// Registers used 83//==================================================================== 84// Predicate registers: p6-p12 85// General registers: r2,r3,r28,r29,r32 (ar.pfs), r33-r39 86// Floating point registers: f6-f15 87// 88 89GR_SAVE_B0 = r33 90GR_SAVE_PFS = r34 91GR_SAVE_GP = r35 92GR_SAVE_SP = r36 93 94GR_Parameter_X = r37 95GR_Parameter_Y = r38 96GR_Parameter_RESULT = r39 97GR_Parameter_TAG = r40 98 99FR_X = f10 100FR_Y = f9 101FR_RESULT = f8 102 103 104.section .text 105GLOBAL_IEEE754_ENTRY(remainderf) 106 107// inputs in f8, f9 108// result in f8 109 110{ .mfi 111 alloc r32=ar.pfs,1,4,4,0 112 // f13=|a| 113 fmerge.s f13=f0,f8 114 nop.i 0 115} 116 {.mfi 117 nop.m 0 118 // f14=|b| 119 fmerge.s f14=f0,f9 120 nop.i 0;; 121} 122 {.mlx 123 nop.m 0 124 // r2=2^{24}-2 125 movl r3=0x4b7ffffe;; 126} 127 128// Y +-NAN, +-inf, +-0? p11 129{ .mfi 130 nop.m 999 131 fclass.m.unc p11,p0 = f9, 0xe7 132 nop.i 999 133} 134// qnan snan inf norm unorm 0 -+ 135// 1 1 1 0 0 0 11 136// e 3 137// X +-NAN, +-inf, ? p9 138{ .mfi 139 nop.m 999 140 fclass.m.unc p9,p0 = f8, 0xe3 141 nop.i 999;; 142} 143 144{.mfi 145 nop.m 0 146 mov f15=f0 147 nop.i 0 148} 149{ .mfi 150 // set p7=1 151 cmp.eq.unc p7,p0=r0,r0 152 // Step (1) 153 // y0 = 1 / b in f10 154 frcpa.s1 f10,p6=f13,f14 155 nop.i 0;; 156} 157{.bbb 158 (p9) br.cond.spnt FREM_X_NAN_INF 159 (p11) br.cond.spnt FREM_Y_NAN_INF_ZERO 160 nop.b 0 161} {.mfi 162 nop.m 0 163 // set D flag if a (f8) is denormal 164 fnma.s0 f6=f8,f1,f8 165 nop.i 0;; 166} 167 168.align 32 169remloop24: 170 { .mfi 171 // f12=2^{24}-2 172 setf.s f12=r3 173 // Step (2) 174 // q0 = a * y0 in f15 175 (p6) fma.s1 f15=f13,f10,f0 176 nop.i 0 177} 178{ .mfi 179 nop.m 0 180 // Step (3) 181 // e0 = 1 - b * y0 in f7 182 (p6) fnma.s1 f7=f14,f10,f1 183 nop.i 0;; 184} 185{.mlx 186 nop.m 0 187 // r2=1.25*2^{-24} 188 movl r2=0x33a00000;; 189} 190 { .mfi 191 nop.m 0 192 // Step (4) 193 // q1 = q0 + e0 * q0 in f6 194 (p6) fma.s1 f6=f7,f15,f15 195 nop.i 0 196} 197{ .mfi 198 nop.m 0 199 // Step (5) 200 // e1 = e0 * e0 in f7 201 (p6) fma.s1 f7=f7,f7,f0 202 nop.i 0;; 203} 204 {.mii 205 (p7) getf.exp r29=f15 206 (p7) mov r28=0xfffd 207 nop.i 0;; 208} 209 210 { .mfi 211 // f15=1.25*2^{-24} 212 setf.s f15=r2 213 // Step (6) 214 // q2 = q1 + e1 * q1 in f6 215 (p6) fma.s1 f6=f7,f6,f6 216 nop.i 0 217} 218{ .mfi 219 mov r2=0x3e7 220 // Step (7) 221 // e2 = e1 * e1 in f7 222 (p6) fma.s1 f7=f7,f7,f0 223 nop.i 0;; 224} 225 226 {.mmi 227 // q<1/4 ? (i.e. expon< -2) 228 (p7) cmp.gt.unc p7,p0=r28,r29 229 nop.m 0 230 // r2=0x3e7000000 231 shl r2=r2,24;; 232} 233 234{.mfb 235 // r2=0x3e7000001 236 add r2=1,r2 237 // if |a/b|<1/4, set D flag before returning 238 (p7) fma.s.s0 f9=f9,f0,f8 239 nop.b 0;; 240} 241 {.mfb 242 nop.m 0 243 // can be combined with bundle above if sign of 0 or 244 // FTZ enabled are not important 245 (p7) fmerge.s f8=f8,f9 246 // return if |a|<4*|b| (estimated quotient < 1/4) 247 (p7) br.ret.spnt b0;; 248} 249 {.mfi 250 nop.m 0 251 // set f8 to current a value | sign 252 fmerge.s f8=f8,f13 253 // r2=2^{-24}+2^{-48} (double prec.) 254 shl r2=r2,28;; 255} 256 257 258{ .mfi 259 // r29= -32+bias 260 mov r29=0xffdf 261 // Step (8) 262 // q3 = q2 + e2 * q2 in f6 263 (p6) fma.d.s1 f6=f7,f6,f6 264 nop.i 0;; 265} 266{ .mfi 267 nop.m 0 268 // Step (9) 269 // q = q3 in f11 270 (p6) fma.s.s1 f11=f6,f1,f0 271 nop.i 0;; 272} 273 {.mfi 274 // f7=2^{-24} 275 setf.d f7=r2 276 // last step ? (q3<2^{24}-2 --> q<2^{24}) 277 fcmp.lt.unc.s1 p0,p12=f6,f12 278 nop.i 0 279} {.mfi 280 // f12=2^{-32} 281 setf.exp f12=r29 282 nop.f 0 283 nop.i 0;; 284} 285 {.mfi 286 nop.m 0 287 // r=a-b*q 288 fnma.s1 f6=f14,f11,f13 289 nop.i 0 290} 291{.mfi 292 nop.m 0 293 // q'=q-q*(1.25*2^{-24}) (q'=q-ulp) 294 fnma.s.s1 f15=f11,f15,f11 295 nop.i 0;; 296} 297 298 {.mfi 299 nop.m 0 300 // r2=a-b*q' 301 fnma.s1 f13=f14,f15,f13 302 nop.i 0;; 303} 304 {.mfi 305 nop.m 0 306 // r>0 iff q=RZ(a/b) and inexact 307 fcmp.gt.unc.s1 p8,p0=f6,f0 308 nop.i 0 309} 310{.mfi 311 nop.m 0 312 // r<0 iff q'=RZ(a/b) and inexact 313 fcmp.lt.unc.s1 p9,p10=f6,f0 314 nop.i 0;; 315} 316.pred.rel "mutex",p8,p9 317 {.mfi 318 nop.m 0 319 // (p8) Q=q+(last iteration ? sticky bits:0) 320 // i.e. Q=q+q*x (x=2^{-32} or 0) 321 (p8) fma.s1 f11=f11,f12,f11 322 nop.i 0 323} 324{.mfi 325 nop.m 0 326 // (p9) Q=q'+(last iteration ? sticky bits:0) 327 // i.e. Q=q'+q'*x (x=2^{-24} or 0: if expon. difference=23, want to round back to q) 328 (p9) fma.s1 f11=f15,f7,f15 329 nop.i 0;; 330} 331 332 {.mfb 333 nop.m 0 334 // (p9) set r=r2 (new a, if not last iteration) 335 // (p10) new a =r 336 (p10) mov f13=f6 337 (p12) br.cond.sptk remloop24;; 338} 339 340// last iteration 341 {.mfi 342 nop.m 0 343 // set f9=|b|*sgn(a) 344 fmerge.s f9=f8,f9 345 nop.i 0 346} 347 {.mfi 348 nop.m 0 349 // round to integer 350 fcvt.fx.s1 f11=f11 351 nop.i 0;; 352} 353 {.mfi 354 nop.m 0 355 // save sign of a 356 fmerge.s f7=f8,f8 357 nop.i 0 358} 359{.mfi 360 nop.m 0 361 // normalize 362 fcvt.xf f11=f11 363 nop.i 0;; 364} 365 {.mfi 366 nop.m 0 367 // This can be removed if sign of 0 is not important 368 // get remainder using sf1 369 fnma.s.s1 f12=f9,f11,f8 370 nop.i 0 371} 372 {.mfi 373 nop.m 0 374 // get remainder 375 fnma.s.s0 f8=f9,f11,f8 376 nop.i 0;; 377} 378 379 380 381 {.mfi 382 nop.m 0 383 // f12=0? 384 // This can be removed if sign of 0 is not important 385 fcmp.eq.unc.s1 p8,p0=f12,f0 386 nop.i 0;; 387} 388 {.mfb 389 nop.m 0 390 // if f8=0, set sign correctly 391 // This can be removed if sign of 0 is not important 392 (p8) fmerge.s f8=f7,f8 393 // return 394 br.ret.sptk b0;; 395} 396 397 398FREM_X_NAN_INF: 399 400// Y zero ? 401{.mfi 402 nop.m 0 403 fma.s1 f10=f9,f1,f0 404 nop.i 0;; 405} 406{.mfi 407 nop.m 0 408 fcmp.eq.unc.s1 p11,p0=f10,f0 409 nop.i 0;; 410} 411{.mib 412 nop.m 0 413 nop.i 0 414 // if Y zero 415 (p11) br.cond.spnt FREM_Y_ZERO;; 416} 417 418// X infinity? Return QNAN indefinite 419{ .mfi 420 nop.m 999 421 fclass.m.unc p8,p0 = f8, 0x23 422 nop.i 999 423} 424// X infinity? Return QNAN indefinite 425{ .mfi 426 nop.m 999 427 fclass.m.unc p11,p0 = f8, 0x23 428 nop.i 999;; 429} 430// Y NaN ? 431{.mfi 432 nop.m 999 433(p8) fclass.m.unc p0,p8=f9,0xc3 434 nop.i 0;; 435} 436{.mfi 437 nop.m 999 438 // also set Denormal flag if necessary 439(p8) fma.s0 f9=f9,f1,f0 440 nop.i 0 441} 442{ .mfi 443 nop.m 999 444(p8) frcpa.s0 f8,p7 = f8,f8 445 nop.i 999 ;; 446} 447 448{.mfi 449 nop.m 999 450(p11) mov f10=f8 451 nop.i 0 452} 453{ .mfi 454 nop.m 999 455(p8) fma.s.s0 f8=f8,f1,f0 456 nop.i 0 ;; 457} 458 459{ .mfb 460 nop.m 999 461 frcpa.s0 f8,p7=f8,f9 462 (p11) br.cond.spnt EXP_ERROR_RETURN;; 463} 464{ .mib 465 nop.m 0 466 nop.i 0 467 br.ret.spnt b0 ;; 468} 469 470 471FREM_Y_NAN_INF_ZERO: 472 473// Y INF 474{ .mfi 475 nop.m 999 476 fclass.m.unc p7,p0 = f9, 0x23 477 nop.i 999 ;; 478} 479 480{ .mfb 481 nop.m 999 482(p7) fma.s.s0 f8=f8,f1,f0 483(p7) br.ret.spnt b0 ;; 484} 485 486// Y NAN? 487{ .mfi 488 nop.m 999 489 fclass.m.unc p9,p0 = f9, 0xc3 490 nop.i 999 ;; 491} 492 493{ .mfb 494 nop.m 999 495(p9) fma.s.s0 f8=f9,f1,f0 496(p9) br.ret.spnt b0 ;; 497} 498 499FREM_Y_ZERO: 500// Y zero? Must be zero at this point 501// because it is the only choice left. 502// Return QNAN indefinite 503 504// X NAN? 505{ .mfi 506 nop.m 999 507 fclass.m.unc p9,p10 = f8, 0xc3 508 nop.i 999 ;; 509} 510{ .mfi 511 nop.m 999 512(p10) fclass.nm p9,p10 = f8, 0xff 513 nop.i 999 ;; 514} 515 516{.mfi 517 nop.m 999 518 (p9) frcpa.s0 f11,p7=f8,f0 519 nop.i 0;; 520} 521 522{ .mfi 523 nop.m 999 524(p10) frcpa.s0 f11,p7 = f0,f0 525nop.i 999;; 526} 527 528{ .mfi 529 nop.m 999 530 fmerge.s f10 = f8, f8 531 nop.i 999 532} 533 534{ .mfi 535 nop.m 999 536 fma.s.s0 f8=f11,f1,f0 537 nop.i 999 538} 539 540 541EXP_ERROR_RETURN: 542 543{ .mib 544 mov GR_Parameter_TAG = 125 545 nop.i 999 546 br.sptk __libm_error_region;; 547} 548 549GLOBAL_IEEE754_END(remainderf) 550libm_alias_float_other (__remainder, remainder) 551weak_alias (__remainderf, dremf) 552 553 554 555LOCAL_LIBM_ENTRY(__libm_error_region) 556.prologue 557{ .mfi 558 add GR_Parameter_Y=-32,sp // Parameter 2 value 559 nop.f 0 560.save ar.pfs,GR_SAVE_PFS 561 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 562} 563{ .mfi 564.fframe 64 565 add sp=-64,sp // Create new stack 566 nop.f 0 567 mov GR_SAVE_GP=gp // Save gp 568};; 569{ .mmi 570 stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack 571 add GR_Parameter_X = 16,sp // Parameter 1 address 572.save b0, GR_SAVE_B0 573 mov GR_SAVE_B0=b0 // Save b0 574};; 575.body 576{ .mib 577 stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack 578 add GR_Parameter_RESULT = 0,GR_Parameter_Y 579 nop.b 0 // Parameter 3 address 580} 581{ .mib 582 stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack 583 add GR_Parameter_Y = -16,GR_Parameter_Y 584 br.call.sptk b0=__libm_error_support#;; // Call error handling function 585} 586{ .mmi 587 nop.m 0 588 nop.m 0 589 add GR_Parameter_RESULT = 48,sp 590};; 591{ .mmi 592 ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack 593.restore sp 594 add sp = 64,sp // Restore stack pointer 595 mov b0 = GR_SAVE_B0 // Restore return address 596};; 597{ .mib 598 mov gp = GR_SAVE_GP // Restore gp 599 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 600 br.ret.sptk b0 // Return 601};; 602 603LOCAL_LIBM_END(__libm_error_region) 604 605 606.type __libm_error_support#,@function 607.global __libm_error_support# 608