1.file "exp2f.s" 2 3 4// Copyright (c) 2000 - 2005, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//============================================================== 41// 08/25/00 Initial version 42// 05/20/02 Cleaned up namespace and sf0 syntax 43// 09/05/02 Improved performance and accuracy 44// 01/17/03 Fixed to call error support when x=128.0 45// 03/31/05 Reformatted delimiters between data tables 46// 47// API 48//============================================================== 49// float exp2f(float) 50// 51// Overview of operation 52//============================================================== 53// Background 54// 55// Implementation 56// 57// Let x= (K + fh + fl + r), where 58// K is an integer, fh= 0.b1 b2 b3 b4 b5, 59// fl= 2^{-5}* 0.b6 b7 b8 b8 b10 (fh, fl >= 0), 60// and |r|<2^{-11} 61// Th is a table that stores 2^fh (32 entries) rounded to 62// double extended precision (only mantissa is stored) 63// Tl is a table that stores 2^fl (32 entries) rounded to 64// double extended precision (only mantissa is stored) 65// 66// 2^x is approximated as 67// 2^K * Th [ f ] * Tl [ f ] * (1+c1*r+c2*r^2) 68 69// Note: We use the following trick to speed up conversion from FP to integer: 70// 71// Let x = K + r, where K is an integer, and |r| <= 0.5 72// Let N be the number of significand bits for the FP format used 73// ( N=64 for double-extended, N=53 for double) 74// 75// Then let y = 1.5 * 2^(N-1) + x for RN mode 76// K = y - 1.5 * 2^(N-1) 77// r = x - K 78// 79// If we want to obtain the integer part and the first m fractional bits of x, 80// we can use the same trick, but with a constant of 1.5 * 2^(N-1-m): 81// 82// Let x = K + f + r 83// f = 0.b_1 b_2 ... b_m 84// |r| <= 2^(-m-1) 85// 86// Then let y = 1.5 * 2^(N-1-m) + x for RN mode 87// (K+f) = y - 1.5 * 2^(N-1-m) 88// r = x - K 89 90 91// Special values 92//============================================================== 93// exp2(0)= 1 94// exp2(+inf)= inf 95// exp2(-inf)= 0 96// 97 98// Registers used 99//============================================================== 100// r2-r3, r14-r40 101// f6-f15, f32-f45 102// p6-p8, p12 103// 104 105 106GR_TBL_START = r2 107GR_LOG_TBL = r3 108 109GR_OF_LIMIT = r14 110GR_UF_LIMIT = r15 111GR_EXP_CORR = r16 112GR_F_low = r17 113GR_F_high = r18 114GR_K = r19 115GR_Flow_ADDR = r20 116 117GR_BIAS = r21 118GR_Fh = r22 119GR_Fh_ADDR = r23 120GR_EXPMAX = r24 121GR_EMIN = r25 122 123GR_ROUNDVAL = r26 124GR_MASK = r27 125GR_KF0 = r28 126GR_MASK_low = r29 127GR_COEFF_START = r30 128 129GR_SAVE_B0 = r33 130GR_SAVE_PFS = r34 131GR_SAVE_GP = r35 132GR_SAVE_SP = r36 133 134GR_Parameter_X = r37 135GR_Parameter_Y = r38 136GR_Parameter_RESULT = r39 137GR_Parameter_TAG = r40 138 139 140FR_X = f10 141FR_Y = f1 142FR_RESULT = f8 143 144 145FR_COEFF1 = f6 146FR_COEFF2 = f7 147FR_R = f9 148 149FR_KF0 = f12 150FR_UF_LIMIT = f15 151 152FR_OF_LIMIT = f32 153FR_EXPMIN = f33 154FR_ROUNDVAL = f34 155FR_KF = f35 156 157FR_2_TO_K = f36 158FR_T_low = f37 159FR_T_high = f38 160 161FR_P12 = f41 162FR_T_low_K = f42 163FR_T = f44 164FR_P = f45 165 166 167// Data tables 168//============================================================== 169 170RODATA 171 172.align 16 173 174LOCAL_OBJECT_START(poly_coeffs) 175 176data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1 177data8 0xf5fdeffc162c7541, 0x00003ffc // C_2 178LOCAL_OBJECT_END(poly_coeffs) 179 180 181LOCAL_OBJECT_START(T_table) 182 183// 2^{0.00000 b6 b7 b8 b9 b10} 184data8 0x8000000000000000, 0x8016302f17467628 185data8 0x802c6436d0e04f50, 0x80429c17d77c18ed 186data8 0x8058d7d2d5e5f6b0, 0x806f17687707a7af 187data8 0x80855ad965e88b83, 0x809ba2264dada76a 188data8 0x80b1ed4fd999ab6c, 0x80c83c56b50cf77f 189data8 0x80de8f3b8b85a0af, 0x80f4e5ff089f763e 190data8 0x810b40a1d81406d4, 0x81219f24a5baa59d 191data8 0x813801881d886f7b, 0x814e67cceb90502c 192data8 0x8164d1f3bc030773, 0x817b3ffd3b2f2e47 193data8 0x8191b1ea15813bfd, 0x81a827baf7838b78 194data8 0x81bea1708dde6055, 0x81d51f0b8557ec1c 195data8 0x81eba08c8ad4536f, 0x820225f44b55b33b 196data8 0x8218af4373fc25eb, 0x822f3c7ab205c89a 197data8 0x8245cd9ab2cec048, 0x825c62a423d13f0c 198data8 0x8272fb97b2a5894c, 0x828998760d01faf3 199data8 0x82a0393fe0bb0ca8, 0x82b6ddf5dbc35906 200// 201// 2^{0.b1 b2 b3 b4 b5} 202data8 0x8000000000000000, 0x82cd8698ac2ba1d7 203data8 0x85aac367cc487b14, 0x88980e8092da8527 204data8 0x8b95c1e3ea8bd6e6, 0x8ea4398b45cd53c0 205data8 0x91c3d373ab11c336, 0x94f4efa8fef70961 206data8 0x9837f0518db8a96f, 0x9b8d39b9d54e5538 207data8 0x9ef5326091a111ad, 0xa27043030c496818 208data8 0xa5fed6a9b15138ea, 0xa9a15ab4ea7c0ef8 209data8 0xad583eea42a14ac6, 0xb123f581d2ac258f 210data8 0xb504f333f9de6484, 0xb8fbaf4762fb9ee9 211data8 0xbd08a39f580c36be, 0xc12c4cca66709456 212data8 0xc5672a115506dadd, 0xc9b9bd866e2f27a2 213data8 0xce248c151f8480e3, 0xd2a81d91f12ae45a 214data8 0xd744fccad69d6af4, 0xdbfbb797daf23755 215data8 0xe0ccdeec2a94e111, 0xe5b906e77c8348a8 216data8 0xeac0c6e7dd24392e, 0xefe4b99bdcdaf5cb 217data8 0xf5257d152486cc2c, 0xfa83b2db722a033a 218LOCAL_OBJECT_END(T_table) 219 220 221 222.section .text 223WEAK_LIBM_ENTRY(exp2f) 224 225 226{.mfi 227 alloc r32= ar.pfs, 1, 4, 4, 0 228 // will continue only for non-zero normal/denormal numbers 229 fclass.nm p12, p0= f8, 0x1b 230 // GR_TBL_START= pointer to C_1...C_2 followed by T_table 231 addl GR_TBL_START= @ltoff(poly_coeffs), gp 232} 233{.mlx 234 mov GR_OF_LIMIT= 0xffff + 7 // Exponent of overflow limit 235 movl GR_ROUNDVAL= 0x5a400000 // 1.5*2^(63-10) (SP) 236} 237;; 238 239// Form special constant 1.5*2^(63-10) to give integer part and first 10 240// fractional bits of x 241{.mfi 242 setf.s FR_ROUNDVAL= GR_ROUNDVAL // Form special constant 243 fcmp.lt.s1 p6, p8= f8, f0 // X<0 ? 244 nop.i 0 245} 246{.mfb 247 ld8 GR_COEFF_START= [ GR_TBL_START ] // Load pointer to coeff table 248 nop.f 0 249 (p12) br.cond.spnt SPECIAL_exp2 // Branch if nan, inf, zero 250} 251;; 252 253{.mlx 254 setf.exp FR_OF_LIMIT= GR_OF_LIMIT // Set overflow limit 255 movl GR_UF_LIMIT= 0xc3160000 // (-2^7-22) = -150 256} 257;; 258 259{.mfi 260 ldfe FR_COEFF1= [ GR_COEFF_START ], 16 // load C_1 261 fma.s0 f8= f8, f1, f0 // normalize x 262 nop.i 0 263} 264;; 265 266{.mmi 267 ldfe FR_COEFF2= [ GR_COEFF_START ], 16 // load C_2 268 setf.s FR_UF_LIMIT= GR_UF_LIMIT // Set underflow limit 269 mov GR_EXP_CORR= 0xffff-126 270} 271;; 272 273{.mfi 274 nop.m 0 275 fma.s1 FR_KF0= f8, f1, FR_ROUNDVAL // y= x + 1.5*2^(63-10) 276 nop.i 0 277} 278;; 279 280{.mfi 281 mov GR_MASK= 1023 282 fms.s1 FR_KF= FR_KF0, f1, FR_ROUNDVAL // (K+f) 283 mov GR_MASK_low= 31 284} 285;; 286 287{.mfi 288 getf.sig GR_KF0= FR_KF0 // (K+f)*2^10= round_to_int(y) 289 fcmp.ge.s1 p12, p7= f8, FR_OF_LIMIT // x >= overflow threshold ? 290 add GR_LOG_TBL= 256, GR_COEFF_START // Pointer to high T_table 291} 292;; 293 294{.mmi 295 and GR_F_low= GR_KF0, GR_MASK_low // f_low 296 and GR_F_high= GR_MASK, GR_KF0 // f_high*32 297 shr GR_K= GR_KF0, 10 // K 298} 299;; 300 301{.mmi 302 shladd GR_Flow_ADDR= GR_F_low, 3, GR_COEFF_START // address of 2^{f_low} 303 add GR_BIAS= GR_K, GR_EXP_CORR // K= bias-2*63 304 shr GR_Fh= GR_F_high, 5 // f_high 305} 306;; 307 308{.mfi 309 setf.exp FR_2_TO_K= GR_BIAS // 2^{K-126} 310 fnma.s1 FR_R= FR_KF, f1, f8 // r= x - (K+f) 311 shladd GR_Fh_ADDR= GR_Fh, 3, GR_LOG_TBL // address of 2^{f_high} 312} 313{.mlx 314 ldf8 FR_T_low= [ GR_Flow_ADDR ] // load T_low= 2^{f_low} 315 movl GR_EMIN= 0xc2fc0000 // EMIN= -126 316} 317;; 318 319{.mfi 320 ldf8 FR_T_high= [ GR_Fh_ADDR ] // load T_high= 2^{f_high} 321 (p7) fcmp.lt.s1 p12, p7= f8, FR_UF_LIMIT // x<underflow threshold ? 322 nop.i 0 323} 324;; 325 326{.mfb 327 setf.s FR_EXPMIN= GR_EMIN // FR_EXPMIN= EMIN 328 fma.s1 FR_P12= FR_COEFF2, FR_R, FR_COEFF1 // P12= C_1+C_2*r 329 (p12) br.cond.spnt OUT_RANGE_exp2 330} 331;; 332 333{.mfi 334 nop.m 0 335 fma.s1 FR_T_low_K= FR_T_low, FR_2_TO_K, f0 // T= 2^{K-126}*T_low 336 nop.i 0 337} 338;; 339 340{.mfi 341 nop.m 0 342 fma.s1 FR_P= FR_R, FR_P12, f0 // P= P12+r 343 nop.i 0 344} 345;; 346 347{.mfi 348 nop.m 0 349 fma.s1 FR_T= FR_T_low_K, FR_T_high, f0 // T= T*T_high 350 nop.i 0 351} 352;; 353 354{.mfi 355 nop.m 0 356 fcmp.lt.s0 p6, p8= f8, FR_EXPMIN // underflow (x<EMIN) ? 357 nop.i 0 358} 359;; 360 361{.mfb 362 nop.m 0 363 fma.s.s0 f8= FR_P, FR_T, FR_T // result= T+T*P 364 (p8) br.ret.sptk b0 // return 365} 366;; 367 368{.mfb 369 (p6) mov GR_Parameter_TAG= 164 370 nop.f 0 371 (p6) br.cond.sptk __libm_error_region 372} 373;; 374 375 376SPECIAL_exp2: 377{.mfi 378 nop.m 0 379 fclass.m p6, p0= f8, 0x22 // x= -Infinity ? 380 nop.i 0 381} 382;; 383 384{.mfi 385 nop.m 0 386 fclass.m p7, p0= f8, 0x21 // x= +Infinity ? 387 nop.i 0 388} 389;; 390 391{.mfi 392 nop.m 0 393 fclass.m p8, p0= f8, 0x7 // x= +/-Zero ? 394 nop.i 0 395} 396{.mfb 397 nop.m 0 398 (p6) mov f8= f0 // exp2(-Infinity)= 0 399 (p6) br.ret.spnt b0 400} 401;; 402 403{.mfb 404 nop.m 0 405 nop.f 0 406 (p7) br.ret.spnt b0 // exp2(+Infinity)= +Infinity 407} 408;; 409 410{.mfb 411 nop.m 0 412 (p8) mov f8= f1 // exp2(+/-0)= 1 413 (p8) br.ret.spnt b0 414} 415;; 416 417{.mfb 418 nop.m 0 419 fma.s.s0 f8= f8, f1, f0 // Remaining cases: NaNs 420 br.ret.sptk b0 421} 422;; 423 424 425OUT_RANGE_exp2: 426 427// overflow: p8= 1 428 429{.mii 430 (p8) mov GR_EXPMAX= 0x1fffe 431 nop.i 0 432 nop.i 0 433} 434;; 435 436{.mmb 437 (p8) mov GR_Parameter_TAG= 163 438 (p8) setf.exp FR_R= GR_EXPMAX 439 nop.b 999 440} 441;; 442 443{.mfi 444 nop.m 999 445 (p8) fma.s.s0 f8= FR_R, FR_R, f0 // Create overflow 446 nop.i 999 447} 448// underflow: p6= 1 449{.mii 450 (p6) mov GR_Parameter_TAG= 164 451 (p6) mov GR_EXPMAX= 1 452 nop.i 0 453} 454;; 455 456{.mmb 457 nop.m 0 458 (p6) setf.exp FR_R= GR_EXPMAX 459 nop.b 999 460} 461;; 462 463{.mfb 464 nop.m 999 465 (p6) fma.s.s0 f8= FR_R, FR_R, f0 // Create underflow 466 nop.b 0 467} 468;; 469 470WEAK_LIBM_END(exp2f) 471libm_alias_float_other (__exp2, exp2) 472#ifdef SHARED 473.symver exp2f,exp2f@@GLIBC_2.27 474.weak __exp2f_compat 475.set __exp2f_compat,__exp2f 476.symver __exp2f_compat,exp2f@GLIBC_2.2 477#endif 478 479 480LOCAL_LIBM_ENTRY(__libm_error_region) 481 482.prologue 483{.mfi 484 add GR_Parameter_Y= -32, sp // Parameter 2 value 485 nop.f 0 486.save ar.pfs, GR_SAVE_PFS 487 mov GR_SAVE_PFS= ar.pfs // Save ar.pfs 488} 489 490{.mfi 491.fframe 64 492 add sp= -64, sp // Create new stack 493 nop.f 0 494 mov GR_SAVE_GP= gp // Save gp 495} 496;; 497 498{.mmi 499 stfs [ GR_Parameter_Y ]= FR_Y, 16 // STORE Parameter 2 on stack 500 add GR_Parameter_X= 16, sp // Parameter 1 address 501.save b0, GR_SAVE_B0 502 mov GR_SAVE_B0= b0 // Save b0 503} 504;; 505 506.body 507{.mib 508 stfs [ GR_Parameter_X ]= FR_X // STORE Parameter 1 on stack 509 add GR_Parameter_RESULT= 0, GR_Parameter_Y // Parameter 3 address 510 nop.b 0 511} 512{.mib 513 stfs [ GR_Parameter_Y ]= FR_RESULT // STORE Parameter 3 on stack 514 add GR_Parameter_Y= -16, GR_Parameter_Y 515 br.call.sptk b0= __libm_error_support# // Call error handling function 516} 517;; 518 519{.mmi 520 add GR_Parameter_RESULT= 48, sp 521 nop.m 0 522 nop.i 0 523} 524;; 525 526{.mmi 527 ldfs f8= [ GR_Parameter_RESULT ] // Get return result off stack 528.restore sp 529 add sp= 64, sp // Restore stack pointer 530 mov b0= GR_SAVE_B0 // Restore return address 531} 532;; 533 534{.mib 535 mov gp= GR_SAVE_GP // Restore gp 536 mov ar.pfs= GR_SAVE_PFS // Restore ar.pfs 537 br.ret.sptk b0 // Return 538} 539;; 540 541 542LOCAL_LIBM_END(__libm_error_region) 543 544.type __libm_error_support#, @function 545.global __libm_error_support# 546