1.file "exp2.s" 2 3 4// Copyright (c) 2000 - 2005, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//============================================================== 41// 08/25/00 Initial version 42// 05/20/02 Cleaned up namespace and sf0 syntax 43// 09/05/02 Improved performance 44// 01/17/03 Fixed to call error support when x=1024.0 45// 03/31/05 Reformatted delimiters between data tables 46// 47// API 48//============================================================== 49// double exp2(double) 50// 51// Overview of operation 52//============================================================== 53// Background 54// 55// Implementation 56// 57// Let x= (K + fh + fl + r), where 58// K is an integer, fh= 0.b1 b2 b3 b4 b5, 59// fl= 2^{-5}* 0.b6 b7 b8 b8 b10 (fh, fl >= 0), 60// and |r|<2^{-11} 61// Th is a table that stores 2^fh (32 entries) rounded to 62// double extended precision (only mantissa is stored) 63// Tl is a table that stores 2^fl (32 entries) rounded to 64// double extended precision (only mantissa is stored) 65// 66// 2^x is approximated as 67// 2^K * Th [ f ] * Tl [ f ] * (1+c1*r+c2*r^2+c3*r^3+c4*r^4) 68 69// Note: We use the following trick to speed up conversion from FP to integer: 70// 71// Let x = K + r, where K is an integer, and |r| <= 0.5 72// Let N be the number of significand bits for the FP format used 73// ( N=64 for double-extended, N=53 for double) 74// 75// Then let y = 1.5 * 2^(N-1) + x for RN mode 76// K = y - 1.5 * 2^(N-1) 77// r = x - K 78// 79// If we want to obtain the integer part and the first m fractional bits of x, 80// we can use the same trick, but with a constant of 1.5 * 2^(N-1-m): 81// 82// Let x = K + f + r 83// f = 0.b_1 b_2 ... b_m 84// |r| <= 2^(-m-1) 85// 86// Then let y = 1.5 * 2^(N-1-m) + x for RN mode 87// (K+f) = y - 1.5 * 2^(N-1-m) 88// r = x - K 89 90 91// Special values 92//============================================================== 93// exp2(0)= 1 94// exp2(+inf)= inf 95// exp2(-inf)= 0 96// 97 98// Registers used 99//============================================================== 100// r2-r3, r14-r40 101// f6-f15, f32-f45 102// p6-p8, p12 103// 104 105 106GR_TBL_START = r2 107GR_LOG_TBL = r3 108 109GR_OF_LIMIT = r14 110GR_UF_LIMIT = r15 111GR_EXP_CORR = r16 112GR_F_low = r17 113GR_F_high = r18 114GR_K = r19 115GR_Flow_ADDR = r20 116 117GR_BIAS = r21 118GR_Fh = r22 119GR_Fh_ADDR = r23 120GR_EXPMAX = r24 121GR_EMIN = r25 122 123GR_ROUNDVAL = r26 124GR_MASK = r27 125GR_KF0 = r28 126GR_MASK_low = r29 127GR_COEFF_START = r30 128 129GR_SAVE_B0 = r33 130GR_SAVE_PFS = r34 131GR_SAVE_GP = r35 132GR_SAVE_SP = r36 133 134GR_Parameter_X = r37 135GR_Parameter_Y = r38 136GR_Parameter_RESULT = r39 137GR_Parameter_TAG = r40 138 139 140FR_X = f10 141FR_Y = f1 142FR_RESULT = f8 143 144 145FR_COEFF1 = f6 146FR_COEFF2 = f7 147FR_R = f9 148 149FR_KF0 = f12 150FR_COEFF3 = f13 151FR_COEFF4 = f14 152FR_UF_LIMIT = f15 153 154FR_OF_LIMIT = f32 155FR_EXPMIN = f33 156FR_ROUNDVAL = f34 157FR_KF = f35 158 159FR_2_TO_K = f36 160FR_T_low = f37 161FR_T_high = f38 162FR_P34 = f39 163FR_R2 = f40 164 165FR_P12 = f41 166FR_T_low_K = f42 167FR_P14 = f43 168FR_T = f44 169FR_P = f45 170 171 172// Data tables 173//============================================================== 174 175RODATA 176 177.align 16 178 179LOCAL_OBJECT_START(poly_coeffs) 180 181data8 0x3fac6b08d704a0c0, 0x3f83b2ab6fba4e77 // C_3 and C_4 182data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1 183data8 0xf5fdeffc162c7541, 0x00003ffc // C_2 184LOCAL_OBJECT_END(poly_coeffs) 185 186 187LOCAL_OBJECT_START(T_table) 188 189// 2^{0.00000 b6 b7 b8 b9 b10} 190data8 0x8000000000000000, 0x8016302f17467628 191data8 0x802c6436d0e04f50, 0x80429c17d77c18ed 192data8 0x8058d7d2d5e5f6b0, 0x806f17687707a7af 193data8 0x80855ad965e88b83, 0x809ba2264dada76a 194data8 0x80b1ed4fd999ab6c, 0x80c83c56b50cf77f 195data8 0x80de8f3b8b85a0af, 0x80f4e5ff089f763e 196data8 0x810b40a1d81406d4, 0x81219f24a5baa59d 197data8 0x813801881d886f7b, 0x814e67cceb90502c 198data8 0x8164d1f3bc030773, 0x817b3ffd3b2f2e47 199data8 0x8191b1ea15813bfd, 0x81a827baf7838b78 200data8 0x81bea1708dde6055, 0x81d51f0b8557ec1c 201data8 0x81eba08c8ad4536f, 0x820225f44b55b33b 202data8 0x8218af4373fc25eb, 0x822f3c7ab205c89a 203data8 0x8245cd9ab2cec048, 0x825c62a423d13f0c 204data8 0x8272fb97b2a5894c, 0x828998760d01faf3 205data8 0x82a0393fe0bb0ca8, 0x82b6ddf5dbc35906 206// 207// 2^{0.b1 b2 b3 b4 b5} 208data8 0x8000000000000000, 0x82cd8698ac2ba1d7 209data8 0x85aac367cc487b14, 0x88980e8092da8527 210data8 0x8b95c1e3ea8bd6e6, 0x8ea4398b45cd53c0 211data8 0x91c3d373ab11c336, 0x94f4efa8fef70961 212data8 0x9837f0518db8a96f, 0x9b8d39b9d54e5538 213data8 0x9ef5326091a111ad, 0xa27043030c496818 214data8 0xa5fed6a9b15138ea, 0xa9a15ab4ea7c0ef8 215data8 0xad583eea42a14ac6, 0xb123f581d2ac258f 216data8 0xb504f333f9de6484, 0xb8fbaf4762fb9ee9 217data8 0xbd08a39f580c36be, 0xc12c4cca66709456 218data8 0xc5672a115506dadd, 0xc9b9bd866e2f27a2 219data8 0xce248c151f8480e3, 0xd2a81d91f12ae45a 220data8 0xd744fccad69d6af4, 0xdbfbb797daf23755 221data8 0xe0ccdeec2a94e111, 0xe5b906e77c8348a8 222data8 0xeac0c6e7dd24392e, 0xefe4b99bdcdaf5cb 223data8 0xf5257d152486cc2c, 0xfa83b2db722a033a 224LOCAL_OBJECT_END(T_table) 225 226 227 228.section .text 229WEAK_LIBM_ENTRY(exp2) 230 231 232{.mfi 233 alloc r32= ar.pfs, 1, 4, 4, 0 234 // will continue only for non-zero normal/denormal numbers 235 fclass.nm p12, p0= f8, 0x1b 236 // GR_TBL_START= pointer to C_1...C_4 followed by T_table 237 addl GR_TBL_START= @ltoff(poly_coeffs), gp 238} 239{.mlx 240 mov GR_OF_LIMIT= 0xffff + 10 // Exponent of overflow limit 241 movl GR_ROUNDVAL= 0x5a400000 // 1.5*2^(63-10) (SP) 242} 243;; 244 245// Form special constant 1.5*2^(63-10) to give integer part and first 10 246// fractional bits of x 247{.mfi 248 setf.s FR_ROUNDVAL= GR_ROUNDVAL // Form special constant 249 fcmp.lt.s1 p6, p8= f8, f0 // X<0 ? 250 nop.i 0 251} 252{.mfb 253 ld8 GR_COEFF_START= [ GR_TBL_START ] // Load pointer to coeff table 254 nop.f 0 255 (p12) br.cond.spnt SPECIAL_exp2 // Branch if nan, inf, zero 256} 257;; 258 259{.mlx 260 setf.exp FR_OF_LIMIT= GR_OF_LIMIT // Set overflow limit 261 movl GR_UF_LIMIT= 0xc4866000 // (-2^10-51) = -1075 262} 263;; 264 265{.mfi 266 ldfpd FR_COEFF3, FR_COEFF4= [ GR_COEFF_START ], 16 // load C_3, C_4 267 fma.s0 f8= f8, f1, f0 // normalize x 268 nop.i 0 269} 270;; 271 272{.mmi 273 setf.s FR_UF_LIMIT= GR_UF_LIMIT // Set underflow limit 274 ldfe FR_COEFF1= [ GR_COEFF_START ], 16 // load C_1 275 mov GR_EXP_CORR= 0xffff-126 276} 277;; 278 279{.mfi 280 ldfe FR_COEFF2= [ GR_COEFF_START ], 16 // load C_2 281 fma.s1 FR_KF0= f8, f1, FR_ROUNDVAL // y= x + 1.5*2^(63-10) 282 nop.i 0 283} 284;; 285 286{.mfi 287 mov GR_MASK= 1023 288 fms.s1 FR_KF= FR_KF0, f1, FR_ROUNDVAL // (K+f) 289 mov GR_MASK_low= 31 290} 291;; 292 293{.mfi 294 getf.sig GR_KF0= FR_KF0 // (K+f)*2^10= round_to_int(y) 295 fcmp.ge.s1 p12, p7= f8, FR_OF_LIMIT // x >= overflow threshold ? 296 add GR_LOG_TBL= 256, GR_COEFF_START // Pointer to high T_table 297} 298;; 299 300{.mmi 301 and GR_F_low= GR_KF0, GR_MASK_low // f_low 302 and GR_F_high= GR_MASK, GR_KF0 // f_high*32 303 shr GR_K= GR_KF0, 10 // K 304} 305;; 306 307{.mmi 308 shladd GR_Flow_ADDR= GR_F_low, 3, GR_COEFF_START // address of 2^{f_low} 309 add GR_BIAS= GR_K, GR_EXP_CORR // K= bias-2*63 310 shr GR_Fh= GR_F_high, 5 // f_high 311} 312;; 313 314{.mfi 315 setf.exp FR_2_TO_K= GR_BIAS // 2^{K-126} 316 fnma.s1 FR_R= FR_KF, f1, f8 // r= x - (K+f) 317 shladd GR_Fh_ADDR= GR_Fh, 3, GR_LOG_TBL // address of 2^{f_high} 318} 319{.mlx 320 ldf8 FR_T_low= [ GR_Flow_ADDR ] // load T_low= 2^{f_low} 321 movl GR_EMIN= 0xc47f8000 // EMIN= -1022 322} 323;; 324 325{.mfi 326 ldf8 FR_T_high= [ GR_Fh_ADDR ] // load T_high= 2^{f_high} 327 (p7) fcmp.lt.s1 p12, p7= f8, FR_UF_LIMIT // x<underflow threshold ? 328 nop.i 0 329} 330;; 331 332{.mfi 333 setf.s FR_EXPMIN= GR_EMIN // FR_EXPMIN= EMIN 334 fma.s1 FR_P34= FR_COEFF4, FR_R, FR_COEFF3 // P34= C_3+C_4*r 335 nop.i 0 336} 337{.mfb 338 nop.m 0 339 fma.s1 FR_R2= FR_R, FR_R, f0 // r*r 340 (p12) br.cond.spnt OUT_RANGE_exp2 341} 342;; 343 344{.mfi 345 nop.m 0 346 fma.s1 FR_P12= FR_COEFF2, FR_R, FR_COEFF1 // P12= C_1+C_2*r 347 nop.i 0 348} 349;; 350 351{.mfi 352 nop.m 0 353 fma.s1 FR_T_low_K= FR_T_low, FR_2_TO_K, f0 // T= 2^{K-126}*T_low 354 nop.i 0 355} 356;; 357 358{.mfi 359 nop.m 0 360 fma.s1 FR_P14= FR_R2, FR_P34, FR_P12 // P14= P12+r2*P34 361 nop.i 0 362} 363;; 364 365{.mfi 366 nop.m 0 367 fma.s1 FR_T= FR_T_low_K, FR_T_high, f0 // T= T*T_high 368 nop.i 0 369} 370;; 371 372{.mfi 373 nop.m 0 374 fcmp.lt.s0 p6, p8= f8, FR_EXPMIN // underflow (x<EMIN) ? 375 nop.i 0 376} 377;; 378 379{.mfi 380 nop.m 0 381 fma.s1 FR_P= FR_P14, FR_R, f0 // P= P14*r 382 nop.i 0 383} 384;; 385 386{.mfb 387 nop.m 0 388 fma.d.s0 f8= FR_P, FR_T, FR_T // result= T+T*P 389 (p8) br.ret.sptk b0 // return 390} 391;; 392 393{.mfb 394 (p6) mov GR_Parameter_TAG= 162 395 nop.f 0 396 (p6) br.cond.sptk __libm_error_region 397} 398;; 399 400 401SPECIAL_exp2: 402{.mfi 403 nop.m 0 404 fclass.m p6, p0= f8, 0x22 // x= -Infinity ? 405 nop.i 0 406} 407;; 408 409{.mfi 410 nop.m 0 411 fclass.m p7, p0= f8, 0x21 // x= +Infinity ? 412 nop.i 0 413} 414;; 415 416{.mfi 417 nop.m 0 418 fclass.m p8, p0= f8, 0x7 // x= +/-Zero ? 419 nop.i 0 420} 421{.mfb 422 nop.m 0 423 (p6) mov f8= f0 // exp2(-Infinity)= 0 424 (p6) br.ret.spnt b0 425} 426;; 427 428{.mfb 429 nop.m 0 430 nop.f 0 431 (p7) br.ret.spnt b0 // exp2(+Infinity)= +Infinity 432} 433;; 434 435{.mfb 436 nop.m 0 437 (p8) mov f8= f1 // exp2(+/-0)= 1 438 (p8) br.ret.spnt b0 439} 440;; 441 442{.mfb 443 nop.m 0 444 fma.d.s0 f8= f8, f1, f0 // Remaining cases: NaNs 445 br.ret.sptk b0 446} 447;; 448 449 450OUT_RANGE_exp2: 451 452// overflow: p8= 1 453 454{.mii 455 (p8) mov GR_EXPMAX= 0x1fffe 456 nop.i 0 457 nop.i 0 458} 459;; 460 461{.mmb 462 (p8) mov GR_Parameter_TAG= 161 463 (p8) setf.exp FR_R= GR_EXPMAX 464 nop.b 999 465} 466;; 467 468{.mfi 469 nop.m 999 470 (p8) fma.d.s0 f8= FR_R, FR_R, f0 // Create overflow 471 nop.i 999 472} 473// underflow: p6= 1 474{.mii 475 (p6) mov GR_Parameter_TAG= 162 476 (p6) mov GR_EXPMAX= 1 477 nop.i 0 478} 479;; 480 481{.mmb 482 nop.m 0 483 (p6) setf.exp FR_R= GR_EXPMAX 484 nop.b 999 485} 486;; 487 488{.mfb 489 nop.m 999 490 (p6) fma.d.s0 f8= FR_R, FR_R, f0 // Create underflow 491 nop.b 0 492} 493;; 494 495WEAK_LIBM_END(exp2) 496libm_alias_double_other (__exp2, exp2) 497#ifdef SHARED 498.symver exp2,exp2@@GLIBC_2.29 499.weak __exp2_compat 500.set __exp2_compat,__exp2 501.symver __exp2_compat,exp2@GLIBC_2.2 502#endif 503 504 505LOCAL_LIBM_ENTRY(__libm_error_region) 506 507.prologue 508{.mfi 509 add GR_Parameter_Y= -32, sp // Parameter 2 value 510 nop.f 0 511.save ar.pfs, GR_SAVE_PFS 512 mov GR_SAVE_PFS= ar.pfs // Save ar.pfs 513} 514 515{.mfi 516.fframe 64 517 add sp= -64, sp // Create new stack 518 nop.f 0 519 mov GR_SAVE_GP= gp // Save gp 520} 521;; 522 523{.mmi 524 stfd [ GR_Parameter_Y ]= FR_Y, 16 // STORE Parameter 2 on stack 525 add GR_Parameter_X= 16, sp // Parameter 1 address 526.save b0, GR_SAVE_B0 527 mov GR_SAVE_B0= b0 // Save b0 528} 529;; 530 531.body 532{.mib 533 stfd [ GR_Parameter_X ]= FR_X // STORE Parameter 1 on stack 534 add GR_Parameter_RESULT= 0, GR_Parameter_Y // Parameter 3 address 535 nop.b 0 536} 537{.mib 538 stfd [ GR_Parameter_Y ]= FR_RESULT // STORE Parameter 3 on stack 539 add GR_Parameter_Y= -16, GR_Parameter_Y 540 br.call.sptk b0= __libm_error_support# // Call error handling function 541} 542;; 543 544{.mmi 545 add GR_Parameter_RESULT= 48, sp 546 nop.m 0 547 nop.i 0 548} 549;; 550 551{.mmi 552 ldfd f8= [ GR_Parameter_RESULT ] // Get return result off stack 553.restore sp 554 add sp= 64, sp // Restore stack pointer 555 mov b0= GR_SAVE_B0 // Restore return address 556} 557;; 558 559{.mib 560 mov gp= GR_SAVE_GP // Restore gp 561 mov ar.pfs= GR_SAVE_PFS // Restore ar.pfs 562 br.ret.sptk b0 // Return 563} 564;; 565 566 567LOCAL_LIBM_END(__libm_error_region) 568 569.type __libm_error_support#, @function 570.global __libm_error_support# 571