1.file "remainder.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//==================================================================== 41// 02/02/00 Initial version 42// 03/02/00 New Algorithm 43// 04/04/00 Unwind support added 44// 07/21/00 Fixed quotient=2^{24*m+23}*1.q1...q23 1 bug 45// 08/15/00 Bundle added after call to __libm_error_support to properly 46// set [the previously overwritten] GR_Parameter_RESULT. 47// 11/29/00 Set FR_Y to f9 48// 05/20/02 Cleaned up namespace and sf0 syntax 49// 02/10/03 Reordered header: .section, .global, .proc, .align 50// 51// API 52//==================================================================== 53// double remainder(double,double); 54// 55// Overview of operation 56//==================================================================== 57// remainder(a,b)=a-i*b, 58// where i is an integer such that, if b!=0 and a is finite, 59// |a/b-i|<=1/2. If |a/b-i|=1/2, i is even. 60// 61// Algorithm 62//==================================================================== 63// a). eliminate special cases 64// b). if |a/b|<0.25 (first quotient estimate), return a 65// c). use single precision divide algorithm to get quotient q 66// rounded to 24 bits of precision 67// d). calculate partial remainders (using both q and q-ulp); 68// select one and RZ(a/b) based on the sign of |a|-|b|*q 69// e). if the exponent difference (exponent(a)-exponent(b)) 70// is less than 24 (quotient estimate<2^{24}-2), use RZ(a/b) 71// and sticky bits to round to integer; exit loop and 72// calculate final remainder 73// f). if exponent(a)-exponent(b)>=24, select new value of a as 74// the partial remainder calculated using RZ(a/b); 75// repeat from c). 76// 77// Special cases 78//==================================================================== 79// a=+/- Inf, or b=+/-0: return NaN, call libm_error_support 80// a=NaN or b=NaN: return NaN 81 82// Registers used 83//==================================================================== 84// Predicate registers: p6-p14 85// General registers: r2,r3,r28,r29,r32 (ar.pfs), r33-r39 86// Floating point registers: f6-f15,f32 87 88GR_SAVE_B0 = r33 89GR_SAVE_PFS = r34 90GR_SAVE_GP = r35 91GR_SAVE_SP = r36 92 93GR_Parameter_X = r37 94GR_Parameter_Y = r38 95GR_Parameter_RESULT = r39 96GR_Parameter_TAG = r40 97 98FR_X = f10 99FR_Y = f9 100FR_RESULT = f8 101 102 103.section .text 104GLOBAL_IEEE754_ENTRY(remainder) 105 106// inputs in f8, f9 107// result in f8 108 109{ .mfi 110 alloc r32=ar.pfs,1,4,4,0 111 // f13=|a| 112 fmerge.s f13=f0,f8 113 nop.i 0 114} 115 {.mfi 116 nop.m 0 117 // f14=|b| 118 fmerge.s f14=f0,f9 119 nop.i 0;; 120} 121 {.mlx 122 mov r28=0x2ffdd 123 // r2=2^{23} 124 movl r3=0x4b000000;; 125} 126 127// Y +-NAN, +-inf, +-0? p11 128{ .mfi 129 setf.exp f32=r28 130 fclass.m.unc p11,p0 = f9, 0xe7 131 nop.i 999 132} 133// qnan snan inf norm unorm 0 -+ 134// 1 1 1 0 0 0 11 135// e 3 136// X +-NAN, +-inf, ? p9 137{ .mfi 138 nop.m 999 139 fclass.m.unc p9,p0 = f8, 0xe3 140 nop.i 999;; 141} 142 143{.mfi 144 nop.m 0 145 mov f12=f0 146 nop.i 0 147} 148{ .mfi 149 // set p7=1 150 cmp.eq.unc p7,p0=r0,r0 151 // Step (1) 152 // y0 = 1 / b in f10 153 frcpa.s1 f10,p6=f13,f14 154 nop.i 0;; 155} 156 157{.bbb 158 (p9) br.cond.spnt FREM_X_NAN_INF 159 (p11) br.cond.spnt FREM_Y_NAN_INF_ZERO 160 nop.b 0 161} {.mfi 162 nop.m 0 163 // set D flag if a (f8) is denormal 164 fnma.s0 f6=f8,f1,f8 165 nop.i 0;; 166} 167 168 169remloop24: 170 { .mfi 171 nop.m 0 172 // Step (2) 173 // q0 = a * y0 in f12 174 (p6) fma.s1 f12=f13,f10,f0 175 nop.i 0 176} { .mfi 177 nop.m 0 178 // Step (3) 179 // e0 = 1 - b * y0 in f7 180 (p6) fnma.s1 f7=f14,f10,f1 181 nop.i 0;; 182} {.mlx 183 nop.m 0 184 // r2=1.25*2^{-24} 185 movl r2=0x33a00000;; 186} 187 188{.mfi 189 nop.m 0 190 // q1=q0*(1+e0) 191 (p6) fma.s1 f15=f12,f7,f12 192 nop.i 0 193} 194{ .mfi 195 nop.m 0 196 // Step (4) 197 // e1 = e0 * e0 + E in f7 198 (p6) fma.s1 f7=f7,f7,f32 199 nop.i 0;; 200} 201 {.mii 202 (p7) getf.exp r29=f12 203 (p7) mov r28=0xfffd 204 nop.i 0;; 205} 206 { .mfi 207 // f12=2^{23} 208 setf.s f12=r3 209 // Step (5) 210 // q2 = q1 + e1 * q1 in f11 211 (p6) fma.s.s1 f11=f7,f15,f15 212 nop.i 0 213} { .mfi 214 nop.m 0 215 // Step (6) 216 // q2 = q1 + e1 * q1 in f6 217 (p6) fma.s1 f6=f7,f15,f15 218 nop.i 0;; 219} 220 221 {.mmi 222 // f15=1.25*2^{-24} 223 setf.s f15=r2 224 // q<1/4 ? (i.e. expon< -2) 225 (p7) cmp.gt p7,p0=r28,r29 226 nop.i 0;; 227} 228 229{.mfb 230 // r29= -32+bias 231 mov r29=0xffdf 232 // if |a/b|<1/4, set D flag before returning 233 (p7) fma.d.s0 f9=f9,f0,f8 234 nop.b 0;; 235} 236 {.mfb 237 nop.m 0 238 // can be combined with bundle above if sign of 0 or 239 // FTZ enabled are not important 240 (p7) fmerge.s f8=f8,f9 241 // return if |a|<4*|b| (estimated quotient < 1/4) 242 (p7) br.ret.spnt b0;; 243} 244 {.mfi 245 // f7=2^{-32} 246 setf.exp f7=r29 247 // set f8 to current a value | sign 248 fmerge.s f8=f8,f13 249 nop.i 0;; 250} 251 252 253 {.mfi 254 getf.exp r28=f6 255 // last step ? (q<2^{23}) 256 fcmp.lt.unc.s1 p0,p12=f6,f12 257 nop.i 0;; 258} 259 {.mfi 260 nop.m 0 261 // r=a-b*q 262 fnma.s1 f6=f14,f11,f13 263 nop.i 0 264} {.mfi 265 // r2=23+bias 266 mov r2=0xffff+23 267 // q'=q-q*(1.25*2^{-24}) (q'=q-ulp) 268 fnma.s.s1 f15=f11,f15,f11 269 nop.i 0;; 270} 271 {.mmi 272 nop.m 0 273 cmp.eq p11,p14=r2,r28 274 nop.i 0;; 275} 276 277.pred.rel "mutex",p11,p14 278 {.mfi 279 nop.m 0 280 // if exp_q=2^23, then r=a-b*2^{23} 281 (p11) fnma.s1 f13=f12,f14,f13 282 nop.i 0 283} 284{.mfi 285 nop.m 0 286 // r2=a-b*q' 287 (p14) fnma.s1 f13=f14,f15,f13 288 nop.i 0;; 289} 290 {.mfi 291 nop.m 0 292 // r>0 iff q=RZ(a/b) and inexact 293 fcmp.gt.unc.s1 p8,p0=f6,f0 294 nop.i 0 295} {.mfi 296 nop.m 0 297 // r<0 iff q'=RZ(a/b) and inexact 298 (p14) fcmp.lt.unc.s1 p9,p10=f6,f0 299 nop.i 0;; 300} 301 302.pred.rel "mutex",p8,p9 303 {.mfi 304 nop.m 0 305 // (p8) Q=q+(last iteration ? sticky bits:0) 306 // i.e. Q=q+q*x (x=2^{-32} or 0) 307 (p8) fma.s1 f11=f11,f7,f11 308 nop.i 0 309} {.mfi 310 nop.m 0 311 // (p9) Q=q'+(last iteration ? sticky bits:0) 312 // i.e. Q=q'+q'*x (x=2^{-32} or 0) 313 (p9) fma.s1 f11=f15,f7,f15 314 nop.i 0;; 315} 316 317 {.mfb 318 nop.m 0 319 // (p9) set r=r2 (new a, if not last iteration) 320 // (p10) new a =r 321 (p10) mov f13=f6 322 (p12) br.cond.sptk remloop24;; 323} 324 325// last iteration 326 {.mfi 327 nop.m 0 328 // set f9=|b|*sgn(a) 329 fmerge.s f9=f8,f9 330 nop.i 0 331} 332 {.mfi 333 nop.m 0 334 // round to integer 335 fcvt.fx.s1 f11=f11 336 nop.i 0;; 337} 338 {.mfi 339 nop.m 0 340 // save sign of a 341 fmerge.s f7=f8,f8 342 nop.i 0 343} {.mfi 344 nop.m 0 345 // normalize 346 fcvt.xf f11=f11 347 nop.i 0;; 348} 349 {.mfi 350 nop.m 0 351 // This can be removed if sign of 0 is not important 352 // get remainder using sf1 353 fnma.d.s1 f12=f9,f11,f8 354 nop.i 0 355} 356 {.mfi 357 nop.m 0 358 // get remainder 359 fnma.d.s0 f8=f9,f11,f8 360 nop.i 0;; 361} 362 {.mfi 363 nop.m 0 364 // f12=0? 365 // This can be removed if sign of 0 is not important 366 fcmp.eq.unc.s1 p8,p0=f12,f0 367 nop.i 0;; 368} 369 {.mfb 370 nop.m 0 371 // if f8=0, set sign correctly 372 // This can be removed if sign of 0 is not important 373 (p8) fmerge.s f8=f7,f8 374 // return 375 br.ret.sptk b0;; 376} 377 378 379FREM_X_NAN_INF: 380 381// Y zero ? 382{.mfi 383 nop.m 0 384 fma.s1 f10=f9,f1,f0 385 nop.i 0;; 386} 387{.mfi 388 nop.m 0 389 fcmp.eq.unc.s1 p11,p0=f10,f0 390 nop.i 0;; 391} 392{.mib 393 nop.m 0 394 nop.i 0 395 // if Y zero 396 (p11) br.cond.spnt FREM_Y_ZERO;; 397} 398 399// X infinity? Return QNAN indefinite 400{ .mfi 401 nop.m 999 402 fclass.m.unc p8,p0 = f8, 0x23 403 nop.i 999 404} 405// X infinity? Return QNAN indefinite 406{ .mfi 407 nop.m 999 408 fclass.m.unc p11,p0 = f8, 0x23 409 nop.i 999;; 410} 411// Y NaN ? 412{.mfi 413 nop.m 999 414(p8) fclass.m.unc p0,p8=f9,0xc3 415 nop.i 0;; 416} 417{.mfi 418 nop.m 999 419 // also set Denormal flag if necessary 420(p8) fma.s0 f9=f9,f1,f0 421 nop.i 0 422} 423{ .mfi 424 nop.m 999 425(p8) frcpa.s0 f8,p7 = f8,f8 426 nop.i 999 ;; 427} 428 429{.mfi 430 nop.m 999 431(p11) mov f10=f8 432 nop.i 0 433} 434{ .mfi 435 nop.m 999 436(p8) fma.d.s0 f8=f8,f1,f0 437 nop.i 0 ;; 438} 439 440{ .mfb 441 nop.m 999 442 frcpa.s0 f8,p7=f8,f9 443 (p11) br.cond.spnt EXP_ERROR_RETURN;; 444} 445{ .mib 446 nop.m 0 447 nop.i 0 448 br.ret.spnt b0 ;; 449} 450 451 452FREM_Y_NAN_INF_ZERO: 453 454// Y INF 455{ .mfi 456 nop.m 999 457 fclass.m.unc p7,p0 = f9, 0x23 458 nop.i 999 ;; 459} 460 461{ .mfb 462 nop.m 999 463(p7) fma.d.s0 f8=f8,f1,f0 464(p7) br.ret.spnt b0 ;; 465} 466 467// Y NAN? 468{ .mfi 469 nop.m 999 470 fclass.m.unc p9,p0 = f9, 0xc3 471 nop.i 999 ;; 472} 473 474{ .mfb 475 nop.m 999 476(p9) fma.d.s0 f8=f9,f1,f0 477(p9) br.ret.spnt b0 ;; 478} 479 480FREM_Y_ZERO: 481// Y zero? Must be zero at this point 482// because it is the only choice left. 483// Return QNAN indefinite 484 485// X NAN? 486{ .mfi 487 nop.m 999 488 fclass.m.unc p9,p10 = f8, 0xc3 489 nop.i 999 ;; 490} 491{ .mfi 492 nop.m 999 493(p10) fclass.nm p9,p10 = f8, 0xff 494 nop.i 999 ;; 495} 496 497{.mfi 498 nop.m 999 499 (p9) frcpa.s0 f11,p7=f8,f0 500 nop.i 0;; 501} 502 503{ .mfi 504 nop.m 999 505(p10) frcpa.s0 f11,p7 = f0,f0 506 nop.i 999;; 507} 508 509{ .mfi 510 nop.m 999 511 fmerge.s f10 = f8, f8 512 nop.i 999 513} 514 515{ .mfi 516 nop.m 999 517 fma.d.s0 f8=f11,f1,f0 518 nop.i 999 519} 520 521 522EXP_ERROR_RETURN: 523 524{ .mib 525 mov GR_Parameter_TAG = 124 526 nop.i 999 527 br.sptk __libm_error_region;; 528} 529 530GLOBAL_IEEE754_END(remainder) 531libm_alias_double_other (__remainder, remainder) 532weak_alias (__remainder, drem) 533 534 535 536LOCAL_LIBM_ENTRY(__libm_error_region) 537.prologue 538{ .mfi 539 add GR_Parameter_Y=-32,sp // Parameter 2 value 540 nop.f 0 541.save ar.pfs,GR_SAVE_PFS 542 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 543} 544{ .mfi 545.fframe 64 546 add sp=-64,sp // Create new stack 547 nop.f 0 548 mov GR_SAVE_GP=gp // Save gp 549};; 550{ .mmi 551 stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack 552 add GR_Parameter_X = 16,sp // Parameter 1 address 553.save b0, GR_SAVE_B0 554 mov GR_SAVE_B0=b0 // Save b0 555};; 556.body 557{ .mib 558 stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack 559 add GR_Parameter_RESULT = 0,GR_Parameter_Y 560 nop.b 0 // Parameter 3 address 561} 562{ .mib 563 stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack 564 add GR_Parameter_Y = -16,GR_Parameter_Y 565 br.call.sptk b0=__libm_error_support# // Call error handling function 566};; 567{ .mmi 568 nop.m 0 569 nop.m 0 570 add GR_Parameter_RESULT = 48,sp 571};; 572{ .mmi 573 ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack 574.restore sp 575 add sp = 64,sp // Restore stack pointer 576 mov b0 = GR_SAVE_B0 // Restore return address 577};; 578{ .mib 579 mov gp = GR_SAVE_GP // Restore gp 580 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 581 br.ret.sptk b0 // Return 582};; 583 584LOCAL_LIBM_END(__libm_error_region) 585 586 587 588.type __libm_error_support#,@function 589.global __libm_error_support# 590