1.file "remainderl.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//==================================================================== 41// 02/02/00 Initial version 42// 03/02/00 New algorithm 43// 04/04/00 Unwind support added 44// 07/21/00 Fixed quotient=2^{24*m+23}*1.q1...q23 1 bug 45// 08/15/00 Bundle added after call to __libm_error_support to properly 46// set [the previously overwritten] GR_Parameter_RESULT. 47// 11/29/00 Set FR_Y to f9 48// 05/20/02 Cleaned up namespace and sf0 syntax 49// 02/10/03 Reordered header: .section, .global, .proc, .align 50// 51// API 52//==================================================================== 53// long double remainderl(long double,long double); 54// 55// Overview of operation 56//==================================================================== 57// remainder(a,b)=a-i*b, 58// where i is an integer such that, if b!=0 and a is finite, 59// |a/b-i|<=1/2. If |a/b-i|=1/2, i is even. 60// 61// Algorithm 62//==================================================================== 63// a). eliminate special cases 64// b). if |a/b|<0.25 (first quotient estimate), return a 65// c). use single precision divide algorithm to get quotient q 66// rounded to 24 bits of precision 67// d). calculate partial remainders (using both q and q-ulp); 68// select one and RZ(a/b) based on the sign of |a|-|b|*q 69// e). if the exponent difference (exponent(a)-exponent(b)) 70// is less than 24 (quotient estimate<2^{24}-2), use RZ(a/b) 71// and sticky bits to round to integer; exit loop and 72// calculate final remainder 73// f). if exponent(a)-exponent(b)>=24, select new value of a as 74// the partial remainder calculated using RZ(a/b); 75// repeat from c). 76// 77// Special cases 78//==================================================================== 79// a=+/- Inf, or b=+/-0: return NaN, call libm_error_support 80// a=NaN or b=NaN: return NaN 81// 82// Registers used 83//==================================================================== 84// Predicate registers: p6-p14 85// General registers: r2,r3,r28,r29,r32 (ar.pfs), r33-r39 86// Floating point registers: f6-f15,f32 87// 88 89GR_SAVE_B0 = r33 90GR_SAVE_PFS = r34 91GR_SAVE_GP = r35 92GR_SAVE_SP = r36 93 94GR_Parameter_X = r37 95GR_Parameter_Y = r38 96GR_Parameter_RESULT = r39 97GR_Parameter_TAG = r40 98 99FR_X = f10 100FR_Y = f9 101FR_RESULT = f8 102 103 104.section .text 105GLOBAL_IEEE754_ENTRY(remainderl) 106 107// inputs in f8, f9 108// result in f8 109 110{ .mfi 111 alloc r32=ar.pfs,1,4,4,0 112 // f13=|a| 113 fmerge.s f13=f0,f8 114 nop.i 0 115} 116 {.mfi 117 getf.sig r29=f9 118 // f14=|b| 119 fmerge.s f14=f0,f9 120 nop.i 0;; 121} 122 {.mlx 123 mov r28=0x2ffdd 124 // r2=2^{23} 125 movl r3=0x4b000000;; 126} 127 128 129{.mmi 130setf.exp f32=r28 131nop.m 0 132// y pseudo-zero ? 133cmp.eq p11,p10=r29,r0;; 134} 135 136// Y +-NAN, +-inf, +-0? p11 137{ .mfi 138 nop.m 999 139(p10) fclass.m p11,p10 = f9, 0xe7 140 nop.i 999 141} 142// qnan snan inf norm unorm 0 -+ 143// 1 1 1 0 0 0 11 144// e 3 145// X +-NAN, +-inf, ? p9 146{ .mfi 147 nop.m 999 148 fclass.m.unc p9,p8 = f8, 0xe3 149 nop.i 999;; 150} 151 152{.mfi 153 nop.m 0 154 mov f12=f0 155 nop.i 0 156} 157{ .mfi 158 // set p7=1 159 cmp.eq.unc p7,p0=r0,r0 160 // Step (1) 161 // y0 = 1 / b in f10 162 frcpa.s1 f10,p6=f13,f14 163 nop.i 0;; 164} 165// Y +-NAN, +-inf, +-0? p11 166{ .mfi 167 nop.m 999 168 // pseudo-NaN ? 169(p10) fclass.nm p11,p0 = f9, 0xff 170 nop.i 999 171} 172 173// qnan snan inf norm unorm 0 -+ 174// 1 1 1 0 0 0 11 175// e 3 176// X +-NAN, +-inf, ? p9 177 178{ .mfi 179 nop.m 999 180(p8) fclass.nm p9,p0 = f8, 0xff 181 nop.i 999;; 182} 183 184{.bbb 185 (p9) br.cond.spnt FREM_X_NAN_INF 186 (p11) br.cond.spnt FREM_Y_NAN_INF_ZERO 187 nop.b 0 188} {.mfi 189 nop.m 0 190 // set D flag if a (f8) is denormal 191 fnma.s0 f6=f8,f1,f8 192 nop.i 0;; 193} 194 195remloop24: 196 { .mfi 197 nop.m 0 198 // Step (2) 199 // q0 = a * y0 in f15 200 (p6) fma.s1 f12=f13,f10,f0 201 nop.i 0 202} { .mfi 203 nop.m 0 204 // Step (3) 205 // e0 = 1 - b * y0 in f7 206 (p6) fnma.s1 f7=f14,f10,f1 207 nop.i 0;; 208} {.mlx 209 nop.m 0 210 // r2=1.25*2^{-24} 211 movl r2=0x33a00000;; 212} 213 214{.mfi 215 nop.m 0 216 // q1=q0*(1+e0) 217 (p6) fma.s1 f15=f12,f7,f12 218 nop.i 0 219} 220{ .mfi 221 nop.m 0 222 // Step (4) 223 // e1 = e0 * e0 + E in f7 224 (p6) fma.s1 f7=f7,f7,f32 225 nop.i 0;; 226} 227 {.mii 228 (p7) getf.exp r29=f12 229 (p7) mov r28=0xfffd 230 nop.i 0;; 231} 232 233 { .mfi 234 // f12=2^{23} 235 setf.s f12=r3 236 // Step (5) 237 // q2 = q1 + e1 * q1 in f11 238 (p6) fma.s.s1 f11=f7,f15,f15 239 nop.i 0 240} { .mfi 241 nop.m 0 242 // Step (6) 243 // q2 = q1 + e1 * q1 in f6 244 (p6) fma.s1 f6=f7,f15,f15 245 nop.i 0;; 246} 247 248 {.mmi 249 // f15=1.25*2^{-24} 250 setf.s f15=r2 251 // q<1/4 ? (i.e. expon< -2) 252 (p7) cmp.gt p7,p0=r28,r29 253 nop.i 0;; 254} 255 256{.mfb 257 // r29= -32+bias 258 mov r29=0xffdf 259 // if |a/b|<1/4, set D flag before returning 260 (p7) fma.s0 f9=f9,f0,f8 261 nop.b 0;; 262} 263 {.mfb 264 nop.m 0 265 // can be combined with bundle above if sign of 0 or 266 // FTZ enabled are not important 267 (p7) fmerge.s f8=f8,f9 268 // return if |a|<4*|b| (estimated quotient < 1/4) 269 (p7) br.ret.spnt b0;; 270} 271 {.mfi 272 // f7=2^{-32} 273 setf.exp f7=r29 274 // set f8 to current a value | sign 275 fmerge.s f8=f8,f13 276 nop.i 0;; 277} 278 {.mfi 279 getf.exp r28=f6 280 // last step ? (q<2^{23}) 281 fcmp.lt.unc.s1 p0,p12=f6,f12 282 nop.i 0;; 283} 284 {.mfi 285 nop.m 0 286 // r=a-b*q 287 fnma.s1 f6=f14,f11,f13 288 nop.i 0 289} {.mfi 290 // r2=23+bias 291 mov r2=0xffff+23 292 // q'=q-q*(1.25*2^{-24}) (q'=q-ulp) 293 fnma.s.s1 f15=f11,f15,f11 294 nop.i 0;; 295} 296 {.mmi 297 nop.m 0 298 cmp.eq p11,p14=r2,r28 299 nop.i 0;; 300} 301 302.pred.rel "mutex",p11,p14 303 {.mfi 304 nop.m 0 305 // if exp_q=2^23, then r=a-b*2^{23} 306 (p11) fnma.s1 f13=f12,f14,f13 307 nop.i 0 308} 309{.mfi 310 nop.m 0 311 // r2=a-b*q' 312 (p14) fnma.s1 f13=f14,f15,f13 313 nop.i 0;; 314} 315 {.mfi 316 nop.m 0 317 // r>0 iff q=RZ(a/b) and inexact 318 fcmp.gt.unc.s1 p8,p0=f6,f0 319 nop.i 0 320} {.mfi 321 nop.m 0 322 // r<0 iff q'=RZ(a/b) and inexact 323 (p14) fcmp.lt.unc.s1 p9,p10=f6,f0 324 nop.i 0;; 325} 326 327.pred.rel "mutex",p8,p9 328 {.mfi 329 nop.m 0 330 // (p8) Q=q+(last iteration ? sticky bits:0) 331 // i.e. Q=q+q*x (x=2^{-32} or 0) 332 (p8) fma.s1 f11=f11,f7,f11 333 nop.i 0 334} {.mfi 335 nop.m 0 336 // (p9) Q=q'+(last iteration ? sticky bits:0) 337 // i.e. Q=q'+q'*x (x=2^{-32} or 0) 338 (p9) fma.s1 f11=f15,f7,f15 339 nop.i 0;; 340} 341 342 {.mfb 343 nop.m 0 344 // (p9) set r=r2 (new a, if not last iteration) 345 // (p10) new a =r 346 (p10) mov f13=f6 347 (p12) br.cond.sptk remloop24;; 348} 349 350// last iteration 351 {.mfi 352 nop.m 0 353 // set f9=|b|*sgn(a) 354 fmerge.s f9=f8,f9 355 nop.i 0 356} 357 {.mfi 358 nop.m 0 359 // round to integer 360 fcvt.fx.s1 f11=f11 361 nop.i 0;; 362} 363 {.mfi 364 nop.m 0 365 // save sign of a 366 fmerge.s f7=f8,f8 367 nop.i 0 368} {.mfi 369 nop.m 0 370 // normalize 371 fcvt.xf f11=f11 372 nop.i 0;; 373} 374 {.mfi 375 nop.m 0 376 // This can be removed if sign of 0 is not important 377 // get remainder using sf1 378 fnma.s1 f12=f9,f11,f8 379 nop.i 0 380} 381 {.mfi 382 nop.m 0 383 // get remainder 384 fnma.s0 f8=f9,f11,f8 385 nop.i 0;; 386} 387 {.mfi 388 nop.m 0 389 // f12=0? 390 // This can be removed if sign of 0 is not important 391 fcmp.eq.unc.s1 p8,p0=f12,f0 392 nop.i 0;; 393} 394 {.mfb 395 nop.m 0 396 // if f8=0, set sign correctly 397 // This can be removed if sign of 0 is not important 398 (p8) fmerge.s f8=f7,f8 399 // return 400 br.ret.sptk b0;; 401} 402 403 404 405FREM_X_NAN_INF: 406 407// Y zero ? 408{.mfi 409 nop.m 0 410 fma.s1 f10=f9,f1,f0 411 nop.i 0;; 412} 413{.mfi 414 nop.m 0 415 fcmp.eq.unc.s1 p11,p0=f10,f0 416 nop.i 0;; 417} 418{.mib 419 nop.m 0 420 nop.i 0 421 // if Y zero 422 (p11) br.cond.spnt FREM_Y_ZERO;; 423} 424 425// X infinity? Return QNAN indefinite 426{ .mfi 427 nop.m 999 428 fclass.m.unc p8,p0 = f8, 0x23 429 nop.i 999 430} 431// X infinity? Return QNAN indefinite 432{ .mfi 433 nop.m 999 434 fclass.m.unc p11,p0 = f8, 0x23 435 nop.i 999;; 436} 437// Y NaN ? 438{.mfi 439 nop.m 999 440(p8) fclass.m.unc p0,p8=f9,0xc3 441 nop.i 0;; 442} 443{.mfi 444 nop.m 999 445 // also set Denormal flag if necessary 446(p8) fnma.s0 f9=f9,f1,f9 447 nop.i 0 448} 449{ .mfi 450 nop.m 999 451(p8) frcpa.s0 f8,p7 = f8,f8 452 nop.i 999 ;; 453} 454 455{.mfi 456 nop.m 999 457(p11) mov f10=f8 458 nop.i 0 459} 460{ .mfi 461 nop.m 999 462(p8) fma.s0 f8=f8,f1,f0 463 nop.i 0 ;; 464} 465 466{ .mfb 467 nop.m 999 468 frcpa.s0 f8,p7=f8,f9 469 (p11) br.cond.spnt EXP_ERROR_RETURN;; 470} 471{ .mib 472 nop.m 0 473 nop.i 0 474 br.ret.spnt b0 ;; 475} 476 477 478FREM_Y_NAN_INF_ZERO: 479// Y INF 480{ .mfi 481 nop.m 999 482 fclass.m.unc p7,p0 = f9, 0x23 483 nop.i 999 ;; 484} 485 486{ .mfb 487 nop.m 999 488(p7) fma.s0 f8=f8,f1,f0 489(p7) br.ret.spnt b0 ;; 490} 491 492// Y NAN? 493{ .mfi 494 nop.m 999 495 fclass.m.unc p9,p10 = f9, 0xc3 496 nop.i 999 ;; 497} 498{ .mfi 499 nop.m 999 500(p10) fclass.nm p9,p0 = f9, 0xff 501 nop.i 999 ;; 502} 503 504{ .mfb 505 nop.m 999 506(p9) fma.s0 f8=f9,f1,f0 507(p9) br.ret.spnt b0 ;; 508} 509 510FREM_Y_ZERO: 511// Y zero? Must be zero at this point 512// because it is the only choice left. 513// Return QNAN indefinite 514 515// X NAN? 516{ .mfi 517 nop.m 999 518 fclass.m.unc p9,p10 = f8, 0xc3 519 nop.i 999 ;; 520} 521{ .mfi 522 nop.m 999 523(p10) fclass.nm p9,p10 = f8, 0xff 524 nop.i 999 ;; 525} 526 527{.mfi 528 nop.m 999 529 (p9) frcpa.s0 f11,p7=f8,f0 530 nop.i 0;; 531} 532{ .mfi 533 nop.m 999 534(p10) frcpa.s0 f11,p7 = f0,f0 535 nop.i 999;; 536} 537 538{ .mfi 539 nop.m 999 540 fmerge.s f10 = f8, f8 541 nop.i 999 542} 543 544{ .mfi 545 nop.m 999 546 fma.s0 f8=f11,f1,f0 547 nop.i 999;; 548} 549 550EXP_ERROR_RETURN: 551 552{ .mib 553 mov GR_Parameter_TAG = 123 554 nop.i 999 555 br.sptk __libm_error_region;; 556} 557 558GLOBAL_IEEE754_END(remainderl) 559libm_alias_ldouble_other (__remainder, remainder) 560weak_alias (__remainderl, dreml) 561 562LOCAL_LIBM_ENTRY(__libm_error_region) 563.prologue 564{ .mfi 565 add GR_Parameter_Y=-32,sp // Parameter 2 value 566 nop.f 0 567.save ar.pfs,GR_SAVE_PFS 568 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 569} 570{ .mfi 571.fframe 64 572 add sp=-64,sp // Create new stack 573 nop.f 0 574 mov GR_SAVE_GP=gp // Save gp 575};; 576{ .mmi 577 stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack 578 add GR_Parameter_X = 16,sp // Parameter 1 address 579.save b0, GR_SAVE_B0 580 mov GR_SAVE_B0=b0 // Save b0 581};; 582.body 583{ .mib 584 stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack 585 add GR_Parameter_RESULT = 0,GR_Parameter_Y 586 nop.b 0 // Parameter 3 address 587} 588{ .mib 589 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack 590 add GR_Parameter_Y = -16,GR_Parameter_Y 591 br.call.sptk b0=__libm_error_support# // Call error handling function 592};; 593{ .mmi 594 nop.m 0 595 nop.m 0 596 add GR_Parameter_RESULT = 48,sp 597};; 598{ .mmi 599 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack 600.restore sp 601 add sp = 64,sp // Restore stack pointer 602 mov b0 = GR_SAVE_B0 // Restore return address 603};; 604{ .mib 605 mov gp = GR_SAVE_GP // Restore gp 606 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 607 br.ret.sptk b0 // Return 608};; 609 610LOCAL_LIBM_END(__libm_error_region) 611 612 613.type __libm_error_support#,@function 614.global __libm_error_support# 615