1.file "coshl.s" 2 3 4// Copyright (c) 2000 - 2002, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//============================================================== 41// 02/02/00 Initial version 42// 04/04/00 Unwind support added 43// 08/15/00 Bundle added after call to __libm_error_support to properly 44// set [the previously overwritten] GR_Parameter_RESULT. 45// 01/23/01 Set inexact flag for large args. 46// 05/07/01 Reworked to improve speed of all paths 47// 05/20/02 Cleaned up namespace and sf0 syntax 48// 12/06/02 Improved performance 49// 50// API 51//============================================================== 52// long double = coshl(long double) 53// input floating point f8 54// output floating point f8 55// 56// Registers used 57//============================================================== 58// general registers: 59// r14 -> r40 60// predicate registers used: 61// p6 -> p11 62// floating-point registers used: 63// f9 -> f15; f32 -> f90; 64// f8 has input, then output 65// 66// Overview of operation 67//============================================================== 68// There are seven paths 69// 1. 0 < |x| < 0.25 COSH_BY_POLY 70// 2. 0.25 <=|x| < 32 COSH_BY_TBL 71// 3. 32 <= |x| < 11357.21655 COSH_BY_EXP (merged path with COSH_BY_TBL) 72// 4. |x| >= 11357.21655 COSH_HUGE 73// 5. x=0 Done with early exit 74// 6. x=inf,nan Done with early exit 75// 7. x=denormal COSH_DENORM 76// 77// For double extended we get overflow for x >= 400c b174 ddc0 31ae c0ea 78// >= 11357.21655 79// 80// 81// 1. COSH_BY_POLY 0 < |x| < 0.25 82// =============== 83// Evaluate cosh(x) by a 12th order polynomial 84// Care is take for the order of multiplication; and P2 is not exactly 1/4!, 85// P3 is not exactly 1/6!, etc. 86// cosh(x) = 1 + (P1*x^2 + P2*x^4 + P3*x^6 + P4*x^8 + P5*x^10 + P6*x^12) 87// 88// 2. COSH_BY_TBL 0.25 <= |x| < 32.0 89// ============= 90// cosh(x) = cosh(B+R) 91// = cosh(B)cosh(R) + sinh(B)sinh(R) 92// 93// ax = |x| = M*log2/64 + R 94// B = M*log2/64 95// M = 64*N + j 96// We will calculate M and get N as (M-j)/64 97// The division is a shift. 98// exp(B) = exp(N*log2 + j*log2/64) 99// = 2^N * 2^(j*log2/64) 100// cosh(B) = 1/2(e^B + e^-B) 101// = 1/2(2^N * 2^(j*log2/64) + 2^-N * 2^(-j*log2/64)) 102// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64)) 103// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64)) 104// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32 105// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit) 106// 107// R = ax - M*log2/64 108// R = ax - M*log2_by_64_hi - M*log2_by_64_lo 109// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...) 110// = 1 + p_odd + p_even 111// where the p_even uses the A coefficients and the p_even uses 112// the B coefficients 113// 114// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd 115// cosh(R) = 1 + p_even 116// cosh(B) = C_hi + C_lo 117// sinh(B) = S_hi 118// cosh(x) = cosh(B)cosh(R) + sinh(B)sinh(R) 119// 120// 3. COSH_BY_EXP 32.0 <= |x| < 11357.21655 ( 400c b174 ddc0 31ae c0ea ) 121// ============== 122// Can approximate result by exp(x)/2 in this region. 123// Y_hi = Tjhi 124// Y_lo = Tjhi * (p_odd + p_even) + Tjlo 125// cosh(x) = Y_hi + Y_lo 126// 127// 4. COSH_HUGE |x| >= 11357.21655 ( 400c b174 ddc0 31ae c0ea ) 128// ============ 129// Set error tag and call error support 130// 131// 132// Assembly macros 133//============================================================== 134r_ad5 = r14 135r_rshf_2to57 = r15 136r_exp_denorm = r15 137r_ad_mJ_lo = r15 138r_ad_J_lo = r16 139r_2Nm1 = r17 140r_2mNm1 = r18 141r_exp_x = r18 142r_ad_J_hi = r19 143r_ad2o = r19 144r_ad_mJ_hi = r20 145r_mj = r21 146r_ad2e = r22 147r_ad3 = r23 148r_ad1 = r24 149r_Mmj = r24 150r_rshf = r25 151r_M = r25 152r_N = r25 153r_jshf = r26 154r_exp_2tom57 = r26 155r_j = r26 156r_exp_mask = r27 157r_signexp_x = r28 158r_signexp_0_5 = r28 159r_exp_0_25 = r29 160r_sig_inv_ln2 = r30 161r_exp_32 = r30 162r_exp_huge = r30 163r_ad4 = r31 164 165GR_SAVE_PFS = r34 166GR_SAVE_B0 = r35 167GR_SAVE_GP = r36 168 169GR_Parameter_X = r37 170GR_Parameter_Y = r38 171GR_Parameter_RESULT = r39 172GR_Parameter_TAG = r40 173 174 175f_ABS_X = f9 176f_X2 = f10 177f_X4 = f11 178f_tmp = f14 179f_RSHF = f15 180 181f_Inv_log2by64 = f32 182f_log2by64_lo = f33 183f_log2by64_hi = f34 184f_A1 = f35 185 186f_A2 = f36 187f_A3 = f37 188f_Rcub = f38 189f_M_temp = f39 190f_R_temp = f40 191 192f_Rsq = f41 193f_R = f42 194f_M = f43 195f_B1 = f44 196f_B2 = f45 197 198f_B3 = f46 199f_peven_temp1 = f47 200f_peven_temp2 = f48 201f_peven = f49 202f_podd_temp1 = f50 203 204f_podd_temp2 = f51 205f_podd = f52 206f_poly65 = f53 207f_poly6543 = f53 208f_poly6to1 = f53 209f_poly43 = f54 210f_poly21 = f55 211 212f_X3 = f56 213f_INV_LN2_2TO63 = f57 214f_RSHF_2TO57 = f58 215f_2TOM57 = f59 216f_smlst_oflow_input = f60 217 218f_pre_result = f61 219f_huge = f62 220f_spos = f63 221f_sneg = f64 222f_Tjhi = f65 223 224f_Tjlo = f66 225f_Tmjhi = f67 226f_Tmjlo = f68 227f_S_hi = f69 228f_SC_hi_temp = f70 229 230f_C_lo_temp1 = f71 231f_C_lo_temp2 = f72 232f_C_lo_temp3 = f73 233f_C_lo_temp4 = f73 234f_C_lo = f74 235f_C_hi = f75 236 237f_Y_hi = f77 238f_Y_lo_temp = f78 239f_Y_lo = f79 240f_NORM_X = f80 241 242f_P1 = f81 243f_P2 = f82 244f_P3 = f83 245f_P4 = f84 246f_P5 = f85 247 248f_P6 = f86 249f_Tjhi_spos = f87 250f_Tjlo_spos = f88 251f_huge = f89 252f_signed_hi_lo = f90 253 254 255// Data tables 256//============================================================== 257 258// DO NOT CHANGE ORDER OF THESE TABLES 259RODATA 260 261.align 16 262LOCAL_OBJECT_START(cosh_arg_reduction) 263// data8 0xB8AA3B295C17F0BC, 0x00004005 // 64/log2 -- signif loaded with setf 264 data8 0xB17217F7D1000000, 0x00003FF8 // log2/64 high part 265 data8 0xCF79ABC9E3B39804, 0x00003FD0 // log2/64 low part 266 data8 0xb174ddc031aec0ea, 0x0000400c // Smallest x to overflow (11357.21655) 267LOCAL_OBJECT_END(cosh_arg_reduction) 268 269LOCAL_OBJECT_START(cosh_p_table) 270 data8 0x8FA02AC65BCBD5BC, 0x00003FE2 // P6 271 data8 0xD00D00D1021D7370, 0x00003FEF // P4 272 data8 0xAAAAAAAAAAAAAB80, 0x00003FFA // P2 273 data8 0x93F27740C0C2F1CC, 0x00003FE9 // P5 274 data8 0xB60B60B60B4FE884, 0x00003FF5 // P3 275 data8 0x8000000000000000, 0x00003FFE // P1 276LOCAL_OBJECT_END(cosh_p_table) 277 278LOCAL_OBJECT_START(cosh_ab_table) 279 data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC // A1 280 data8 0x88888888884ECDD5, 0x00003FF8 // A2 281 data8 0xD00D0C6DCC26A86B, 0x00003FF2 // A3 282 data8 0x8000000000000002, 0x00003FFE // B1 283 data8 0xAAAAAAAAAA402C77, 0x00003FFA // B2 284 data8 0xB60B6CC96BDB144D, 0x00003FF5 // B3 285LOCAL_OBJECT_END(cosh_ab_table) 286 287LOCAL_OBJECT_START(cosh_j_hi_table) 288 data8 0xB504F333F9DE6484, 0x00003FFE 289 data8 0xB6FD91E328D17791, 0x00003FFE 290 data8 0xB8FBAF4762FB9EE9, 0x00003FFE 291 data8 0xBAFF5AB2133E45FB, 0x00003FFE 292 data8 0xBD08A39F580C36BF, 0x00003FFE 293 data8 0xBF1799B67A731083, 0x00003FFE 294 data8 0xC12C4CCA66709456, 0x00003FFE 295 data8 0xC346CCDA24976407, 0x00003FFE 296 data8 0xC5672A115506DADD, 0x00003FFE 297 data8 0xC78D74C8ABB9B15D, 0x00003FFE 298 data8 0xC9B9BD866E2F27A3, 0x00003FFE 299 data8 0xCBEC14FEF2727C5D, 0x00003FFE 300 data8 0xCE248C151F8480E4, 0x00003FFE 301 data8 0xD06333DAEF2B2595, 0x00003FFE 302 data8 0xD2A81D91F12AE45A, 0x00003FFE 303 data8 0xD4F35AABCFEDFA1F, 0x00003FFE 304 data8 0xD744FCCAD69D6AF4, 0x00003FFE 305 data8 0xD99D15C278AFD7B6, 0x00003FFE 306 data8 0xDBFBB797DAF23755, 0x00003FFE 307 data8 0xDE60F4825E0E9124, 0x00003FFE 308 data8 0xE0CCDEEC2A94E111, 0x00003FFE 309 data8 0xE33F8972BE8A5A51, 0x00003FFE 310 data8 0xE5B906E77C8348A8, 0x00003FFE 311 data8 0xE8396A503C4BDC68, 0x00003FFE 312 data8 0xEAC0C6E7DD24392F, 0x00003FFE 313 data8 0xED4F301ED9942B84, 0x00003FFE 314 data8 0xEFE4B99BDCDAF5CB, 0x00003FFE 315 data8 0xF281773C59FFB13A, 0x00003FFE 316 data8 0xF5257D152486CC2C, 0x00003FFE 317 data8 0xF7D0DF730AD13BB9, 0x00003FFE 318 data8 0xFA83B2DB722A033A, 0x00003FFE 319 data8 0xFD3E0C0CF486C175, 0x00003FFE 320 data8 0x8000000000000000, 0x00003FFF // Center of table 321 data8 0x8164D1F3BC030773, 0x00003FFF 322 data8 0x82CD8698AC2BA1D7, 0x00003FFF 323 data8 0x843A28C3ACDE4046, 0x00003FFF 324 data8 0x85AAC367CC487B15, 0x00003FFF 325 data8 0x871F61969E8D1010, 0x00003FFF 326 data8 0x88980E8092DA8527, 0x00003FFF 327 data8 0x8A14D575496EFD9A, 0x00003FFF 328 data8 0x8B95C1E3EA8BD6E7, 0x00003FFF 329 data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF 330 data8 0x8EA4398B45CD53C0, 0x00003FFF 331 data8 0x9031DC431466B1DC, 0x00003FFF 332 data8 0x91C3D373AB11C336, 0x00003FFF 333 data8 0x935A2B2F13E6E92C, 0x00003FFF 334 data8 0x94F4EFA8FEF70961, 0x00003FFF 335 data8 0x96942D3720185A00, 0x00003FFF 336 data8 0x9837F0518DB8A96F, 0x00003FFF 337 data8 0x99E0459320B7FA65, 0x00003FFF 338 data8 0x9B8D39B9D54E5539, 0x00003FFF 339 data8 0x9D3ED9A72CFFB751, 0x00003FFF 340 data8 0x9EF5326091A111AE, 0x00003FFF 341 data8 0xA0B0510FB9714FC2, 0x00003FFF 342 data8 0xA27043030C496819, 0x00003FFF 343 data8 0xA43515AE09E6809E, 0x00003FFF 344 data8 0xA5FED6A9B15138EA, 0x00003FFF 345 data8 0xA7CD93B4E965356A, 0x00003FFF 346 data8 0xA9A15AB4EA7C0EF8, 0x00003FFF 347 data8 0xAB7A39B5A93ED337, 0x00003FFF 348 data8 0xAD583EEA42A14AC6, 0x00003FFF 349 data8 0xAF3B78AD690A4375, 0x00003FFF 350 data8 0xB123F581D2AC2590, 0x00003FFF 351 data8 0xB311C412A9112489, 0x00003FFF 352 data8 0xB504F333F9DE6484, 0x00003FFF 353LOCAL_OBJECT_END(cosh_j_hi_table) 354 355LOCAL_OBJECT_START(cosh_j_lo_table) 356 data4 0x1EB2FB13 357 data4 0x1CE2CBE2 358 data4 0x1DDC3CBC 359 data4 0x1EE9AA34 360 data4 0x9EAEFDC1 361 data4 0x9DBF517B 362 data4 0x1EF88AFB 363 data4 0x1E03B216 364 data4 0x1E78AB43 365 data4 0x9E7B1747 366 data4 0x9EFE3C0E 367 data4 0x9D36F837 368 data4 0x9DEE53E4 369 data4 0x9E24AE8E 370 data4 0x1D912473 371 data4 0x1EB243BE 372 data4 0x1E669A2F 373 data4 0x9BBC610A 374 data4 0x1E761035 375 data4 0x9E0BE175 376 data4 0x1CCB12A1 377 data4 0x1D1BFE90 378 data4 0x1DF2F47A 379 data4 0x1EF22F22 380 data4 0x9E3F4A29 381 data4 0x1EC01A5B 382 data4 0x1E8CAC3A 383 data4 0x9DBB3FAB 384 data4 0x1EF73A19 385 data4 0x9BB795B5 386 data4 0x1EF84B76 387 data4 0x9EF5818B 388 data4 0x00000000 // Center of table 389 data4 0x1F77CACA 390 data4 0x1EF8A91D 391 data4 0x1E57C976 392 data4 0x9EE8DA92 393 data4 0x1EE85C9F 394 data4 0x1F3BF1AF 395 data4 0x1D80CA1E 396 data4 0x9D0373AF 397 data4 0x9F167097 398 data4 0x1EB70051 399 data4 0x1F6EB029 400 data4 0x1DFD6D8E 401 data4 0x9EB319B0 402 data4 0x1EBA2BEB 403 data4 0x1F11D537 404 data4 0x1F0D5A46 405 data4 0x9E5E7BCA 406 data4 0x9F3AAFD1 407 data4 0x9E86DACC 408 data4 0x9F3EDDC2 409 data4 0x1E496E3D 410 data4 0x9F490BF6 411 data4 0x1DD1DB48 412 data4 0x1E65EBFB 413 data4 0x9F427496 414 data4 0x1F283C4A 415 data4 0x1F4B0047 416 data4 0x1F130152 417 data4 0x9E8367C0 418 data4 0x9F705F90 419 data4 0x1EFB3C53 420 data4 0x1F32FB13 421LOCAL_OBJECT_END(cosh_j_lo_table) 422 423 424.section .text 425GLOBAL_IEEE754_ENTRY(coshl) 426 427{ .mlx 428 getf.exp r_signexp_x = f8 // Get signexp of x, must redo if unorm 429 movl r_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 430} 431{ .mlx 432 addl r_ad1 = @ltoff(cosh_arg_reduction), gp 433 movl r_rshf_2to57 = 0x4778000000000000 // 1.10000 2^(63+57) 434} 435;; 436 437{ .mfi 438 ld8 r_ad1 = [r_ad1] 439 fmerge.s f_ABS_X = f0,f8 440 mov r_exp_0_25 = 0x0fffd // Form exponent for 0.25 441} 442{ .mfi 443 nop.m 0 444 fnorm.s1 f_NORM_X = f8 445 mov r_exp_2tom57 = 0xffff-57 446} 447;; 448 449{ .mfi 450 setf.d f_RSHF_2TO57 = r_rshf_2to57 // Form const 1.100 * 2^120 451 fclass.m p10,p0 = f8, 0x0b // Test for denorm 452 mov r_exp_mask = 0x1ffff 453} 454{ .mlx 455 setf.sig f_INV_LN2_2TO63 = r_sig_inv_ln2 // Form 1/ln2 * 2^63 456 movl r_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift 457} 458;; 459 460{ .mfi 461 nop.m 0 462 fclass.m p7,p0 = f8, 0x07 // Test if x=0 463 nop.i 0 464} 465{ .mfi 466 setf.exp f_2TOM57 = r_exp_2tom57 // Form 2^-57 for scaling 467 nop.f 0 468 add r_ad3 = 0x90, r_ad1 // Point to ab_table 469} 470;; 471 472{ .mfi 473 setf.d f_RSHF = r_rshf // Form right shift const 1.100 * 2^63 474 fclass.m p6,p0 = f8, 0xe3 // Test if x nan, inf 475 add r_ad4 = 0x2f0, r_ad1 // Point to j_hi_table midpoint 476} 477{ .mib 478 add r_ad2e = 0x20, r_ad1 // Point to p_table 479 nop.i 0 480(p10) br.cond.spnt COSH_DENORM // Branch if x denorm 481} 482;; 483 484// Common path -- return here from COSH_DENORM if x is unnorm 485COSH_COMMON: 486{ .mfi 487 ldfe f_smlst_oflow_input = [r_ad2e],16 488(p7) fma.s0 f8 = f1, f1, f0 // Result = 1.0 if x=0 489 add r_ad5 = 0x580, r_ad1 // Point to j_lo_table midpoint 490} 491{ .mib 492 ldfe f_log2by64_hi = [r_ad1],16 493 and r_exp_x = r_exp_mask, r_signexp_x 494(p7) br.ret.spnt b0 // Exit if x=0 495} 496;; 497 498// Get the A coefficients for COSH_BY_TBL 499{ .mfi 500 ldfe f_A1 = [r_ad3],16 501 fcmp.lt.s1 p8,p9 = f8,f0 // Test for x<0 502 cmp.lt p7,p0 = r_exp_x, r_exp_0_25 // Test x < 0.25 503} 504{ .mfb 505 add r_ad2o = 0x30, r_ad2e // Point to p_table odd coeffs 506(p6) fma.s0 f8 = f8,f8,f0 // Result for x nan, inf 507(p6) br.ret.spnt b0 // Exit for x nan, inf 508} 509;; 510 511// Calculate X2 = ax*ax for COSH_BY_POLY 512{ .mfi 513 ldfe f_log2by64_lo = [r_ad1],16 514 nop.f 0 515 nop.i 0 516} 517{ .mfb 518 ldfe f_A2 = [r_ad3],16 519 fma.s1 f_X2 = f_NORM_X, f_NORM_X, f0 520(p7) br.cond.spnt COSH_BY_POLY 521} 522;; 523 524// Here if |x| >= 0.25 525COSH_BY_TBL: 526// ****************************************************** 527// STEP 1 (TBL and EXP) - Argument reduction 528// ****************************************************** 529// Get the following constants. 530// Inv_log2by64 531// log2by64_hi 532// log2by64_lo 533 534 535// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and 536// put them in an exponent. 537// f_spos = 2^(N-1) and f_sneg = 2^(-N-1) 538// 0xffff + (N-1) = 0xffff +N -1 539// 0xffff - (N +1) = 0xffff -N -1 540 541 542// Calculate M and keep it as integer and floating point. 543// M = round-to-integer(x*Inv_log2by64) 544// f_M = M = truncate(ax/(log2/64)) 545// Put the integer representation of M in r_M 546// and the floating point representation of M in f_M 547 548// Get the remaining A,B coefficients 549{ .mmi 550 ldfe f_A3 = [r_ad3],16 551 nop.m 0 552 nop.i 0 553} 554;; 555 556// Use constant (1.100*2^(63-6)) to get rounded M into rightmost significand 557// |x| * 64 * 1/ln2 * 2^(63-6) + 1.1000 * 2^(63+(63-6)) 558{ .mfi 559 nop.m 0 560 fma.s1 f_M_temp = f_ABS_X, f_INV_LN2_2TO63, f_RSHF_2TO57 561 mov r_signexp_0_5 = 0x0fffe // signexp of +0.5 562} 563;; 564 565// Test for |x| >= overflow limit 566{ .mfi 567 ldfe f_B1 = [r_ad3],16 568 fcmp.ge.s1 p6,p0 = f_ABS_X, f_smlst_oflow_input 569 nop.i 0 570} 571;; 572 573{ .mfi 574 ldfe f_B2 = [r_ad3],16 575 nop.f 0 576 mov r_exp_32 = 0x10004 577} 578;; 579 580// Subtract RSHF constant to get rounded M as a floating point value 581// M_temp * 2^(63-6) - 2^63 582{ .mfb 583 ldfe f_B3 = [r_ad3],16 584 fms.s1 f_M = f_M_temp, f_2TOM57, f_RSHF 585(p6) br.cond.spnt COSH_HUGE // Branch if result will overflow 586} 587;; 588 589{ .mfi 590 getf.sig r_M = f_M_temp 591 nop.f 0 592 cmp.ge p7,p6 = r_exp_x, r_exp_32 // Test if x >= 32 593} 594;; 595 596// Calculate j. j is the signed extension of the six lsb of M. It 597// has a range of -32 thru 31. 598 599// Calculate R 600// ax - M*log2by64_hi 601// R = (ax - M*log2by64_hi) - M*log2by64_lo 602 603{ .mfi 604 nop.m 0 605 fnma.s1 f_R_temp = f_M, f_log2by64_hi, f_ABS_X 606 and r_j = 0x3f, r_M 607} 608;; 609 610{ .mii 611 nop.m 0 612 shl r_jshf = r_j, 0x2 // Shift j so can sign extend it 613;; 614 sxt1 r_jshf = r_jshf 615} 616;; 617 618{ .mii 619 nop.m 0 620 shr r_j = r_jshf, 0x2 // Now j has range -32 to 31 621 nop.i 0 622} 623;; 624 625{ .mmi 626 shladd r_ad_J_hi = r_j, 4, r_ad4 // pointer to Tjhi 627 sub r_Mmj = r_M, r_j // M-j 628 sub r_mj = r0, r_j // Form -j 629} 630;; 631 632// The TBL and EXP branches are merged and predicated 633// If TBL, p6 true, 0.25 <= |x| < 32 634// If EXP, p7 true, 32 <= |x| < overflow_limit 635// 636// N = (M-j)/64 637{ .mfi 638 ldfe f_Tjhi = [r_ad_J_hi] 639 fnma.s1 f_R = f_M, f_log2by64_lo, f_R_temp 640 shr r_N = r_Mmj, 0x6 // N = (M-j)/64 641} 642{ .mfi 643 shladd r_ad_mJ_hi = r_mj, 4, r_ad4 // pointer to Tmjhi 644 nop.f 0 645 shladd r_ad_mJ_lo = r_mj, 2, r_ad5 // pointer to Tmjlo 646} 647;; 648 649{ .mfi 650 sub r_2mNm1 = r_signexp_0_5, r_N // signexp 2^(-N-1) 651 nop.f 0 652 shladd r_ad_J_lo = r_j, 2, r_ad5 // pointer to Tjlo 653} 654{ .mfi 655 ldfe f_Tmjhi = [r_ad_mJ_hi] 656 nop.f 0 657 add r_2Nm1 = r_signexp_0_5, r_N // signexp 2^(N-1) 658} 659;; 660 661{ .mmf 662 ldfs f_Tmjlo = [r_ad_mJ_lo] 663 setf.exp f_sneg = r_2mNm1 // Form 2^(-N-1) 664 nop.f 0 665} 666;; 667 668{ .mmf 669 ldfs f_Tjlo = [r_ad_J_lo] 670 setf.exp f_spos = r_2Nm1 // Form 2^(N-1) 671 nop.f 0 672} 673;; 674 675// ****************************************************** 676// STEP 2 (TBL and EXP) 677// ****************************************************** 678// Calculate Rsquared and Rcubed in preparation for p_even and p_odd 679 680{ .mmf 681 nop.m 0 682 nop.m 0 683 fma.s1 f_Rsq = f_R, f_R, f0 684} 685;; 686 687 688// Calculate p_even 689// B_2 + Rsq *B_3 690// B_1 + Rsq * (B_2 + Rsq *B_3) 691// p_even = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) 692{ .mfi 693 nop.m 0 694 fma.s1 f_peven_temp1 = f_Rsq, f_B3, f_B2 695 nop.i 0 696} 697// Calculate p_odd 698// A_2 + Rsq *A_3 699// A_1 + Rsq * (A_2 + Rsq *A_3) 700// podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) 701{ .mfi 702 nop.m 0 703 fma.s1 f_podd_temp1 = f_Rsq, f_A3, f_A2 704 nop.i 0 705} 706;; 707 708{ .mfi 709 nop.m 0 710 fma.s1 f_Rcub = f_Rsq, f_R, f0 711 nop.i 0 712} 713;; 714 715// 716// If TBL, 717// Calculate S_hi and S_lo, and C_hi 718// SC_hi_temp = sneg * Tmjhi 719// S_hi = spos * Tjhi - SC_hi_temp 720// S_hi = spos * Tjhi - (sneg * Tmjhi) 721// C_hi = spos * Tjhi + SC_hi_temp 722// C_hi = spos * Tjhi + (sneg * Tmjhi) 723 724{ .mfi 725 nop.m 0 726(p6) fma.s1 f_SC_hi_temp = f_sneg, f_Tmjhi, f0 727 nop.i 0 728} 729;; 730 731// If TBL, 732// C_lo_temp3 = sneg * Tmjlo 733// C_lo_temp4 = spos * Tjlo + C_lo_temp3 734// C_lo_temp4 = spos * Tjlo + (sneg * Tmjlo) 735{ .mfi 736 nop.m 0 737(p6) fma.s1 f_C_lo_temp3 = f_sneg, f_Tmjlo, f0 738 nop.i 0 739} 740;; 741 742{ .mfi 743 nop.m 0 744 fma.s1 f_peven_temp2 = f_Rsq, f_peven_temp1, f_B1 745 nop.i 0 746} 747{ .mfi 748 nop.m 0 749 fma.s1 f_podd_temp2 = f_Rsq, f_podd_temp1, f_A1 750 nop.i 0 751} 752;; 753 754// If EXP, 755// Compute 2^(N-1) * Tjhi and 2^(N-1) * Tjlo 756{ .mfi 757 nop.m 0 758(p7) fma.s1 f_Tjhi_spos = f_Tjhi, f_spos, f0 759 nop.i 0 760} 761{ .mfi 762 nop.m 0 763(p7) fma.s1 f_Tjlo_spos = f_Tjlo, f_spos, f0 764 nop.i 0 765} 766;; 767 768{ .mfi 769 nop.m 0 770(p6) fma.s1 f_C_hi = f_spos, f_Tjhi, f_SC_hi_temp 771 nop.i 0 772} 773;; 774 775{ .mfi 776 nop.m 0 777(p6) fms.s1 f_S_hi = f_spos, f_Tjhi, f_SC_hi_temp 778 nop.i 0 779} 780{ .mfi 781 nop.m 0 782(p6) fma.s1 f_C_lo_temp4 = f_spos, f_Tjlo, f_C_lo_temp3 783 nop.i 0 784} 785;; 786 787{ .mfi 788 nop.m 0 789 fma.s1 f_peven = f_Rsq, f_peven_temp2, f0 790 nop.i 0 791} 792{ .mfi 793 nop.m 0 794 fma.s1 f_podd = f_podd_temp2, f_Rcub, f_R 795 nop.i 0 796} 797;; 798 799// If TBL, 800// C_lo_temp1 = spos * Tjhi - C_hi 801// C_lo_temp2 = sneg * Tmjlo + C_lo_temp1 802// C_lo_temp2 = sneg * Tmjlo + (spos * Tjhi - C_hi) 803 804{ .mfi 805 nop.m 0 806(p6) fms.s1 f_C_lo_temp1 = f_spos, f_Tjhi, f_C_hi 807 nop.i 0 808} 809;; 810 811{ .mfi 812 nop.m 0 813(p6) fma.s1 f_C_lo_temp2 = f_sneg, f_Tmjhi, f_C_lo_temp1 814 nop.i 0 815} 816;; 817 818// If EXP, 819// Y_hi = 2^(N-1) * Tjhi 820// Y_lo = 2^(N-1) * Tjhi * (p_odd + p_even) + 2^(N-1) * Tjlo 821{ .mfi 822 nop.m 0 823(p7) fma.s1 f_Y_lo_temp = f_peven, f1, f_podd 824 nop.i 0 825} 826;; 827 828// If TBL, 829// C_lo = C_lo_temp4 + C_lo_temp2 830{ .mfi 831 nop.m 0 832(p6) fma.s1 f_C_lo = f_C_lo_temp4, f1, f_C_lo_temp2 833 nop.i 0 834} 835;; 836 837// If TBL, 838// Y_hi = C_hi 839// Y_lo = S_hi*p_odd + (C_hi*p_even + C_lo) 840{ .mfi 841 nop.m 0 842(p6) fma.s1 f_Y_lo_temp = f_C_hi, f_peven, f_C_lo 843 nop.i 0 844} 845;; 846 847{ .mfi 848 nop.m 0 849(p7) fma.s1 f_Y_lo = f_Tjhi_spos, f_Y_lo_temp, f_Tjlo_spos 850 nop.i 0 851} 852;; 853 854// Dummy multiply to generate inexact 855{ .mfi 856 nop.m 0 857 fmpy.s0 f_tmp = f_B2, f_B2 858 nop.i 0 859} 860{ .mfi 861 nop.m 0 862(p6) fma.s1 f_Y_lo = f_S_hi, f_podd, f_Y_lo_temp 863 nop.i 0 864} 865;; 866 867// f8 = answer = Y_hi + Y_lo 868{ .mfi 869 nop.m 0 870(p7) fma.s0 f8 = f_Y_lo, f1, f_Tjhi_spos 871 nop.i 0 872} 873;; 874 875// f8 = answer = Y_hi + Y_lo 876{ .mfb 877 nop.m 0 878(p6) fma.s0 f8 = f_Y_lo, f1, f_C_hi 879 br.ret.sptk b0 // Exit for COSH_BY_TBL and COSH_BY_EXP 880} 881;; 882 883 884// Here if 0 < |x| < 0.25 885COSH_BY_POLY: 886{ .mmf 887 ldfe f_P6 = [r_ad2e],16 888 ldfe f_P5 = [r_ad2o],16 889 nop.f 0 890} 891;; 892 893{ .mmi 894 ldfe f_P4 = [r_ad2e],16 895 ldfe f_P3 = [r_ad2o],16 896 nop.i 0 897} 898;; 899 900{ .mmi 901 ldfe f_P2 = [r_ad2e],16 902 ldfe f_P1 = [r_ad2o],16 903 nop.i 0 904} 905;; 906 907{ .mfi 908 nop.m 0 909 fma.s1 f_X3 = f_NORM_X, f_X2, f0 910 nop.i 0 911} 912{ .mfi 913 nop.m 0 914 fma.s1 f_X4 = f_X2, f_X2, f0 915 nop.i 0 916} 917;; 918 919{ .mfi 920 nop.m 0 921 fma.s1 f_poly65 = f_X2, f_P6, f_P5 922 nop.i 0 923} 924{ .mfi 925 nop.m 0 926 fma.s1 f_poly43 = f_X2, f_P4, f_P3 927 nop.i 0 928} 929;; 930 931{ .mfi 932 nop.m 0 933 fma.s1 f_poly21 = f_X2, f_P2, f_P1 934 nop.i 0 935} 936;; 937 938{ .mfi 939 nop.m 0 940 fma.s1 f_poly6543 = f_X4, f_poly65, f_poly43 941 nop.i 0 942} 943;; 944 945{ .mfi 946 nop.m 0 947 fma.s1 f_poly6to1 = f_X4, f_poly6543, f_poly21 948 nop.i 0 949} 950;; 951 952// Dummy multiply to generate inexact 953{ .mfi 954 nop.m 0 955 fmpy.s0 f_tmp = f_P6, f_P6 956 nop.i 0 957} 958{ .mfb 959 nop.m 0 960 fma.s0 f8 = f_poly6to1, f_X2, f1 961 br.ret.sptk b0 // Exit COSH_BY_POLY 962} 963;; 964 965 966// Here if x denorm or unorm 967COSH_DENORM: 968// Determine if x really a denorm and not a unorm 969{ .mmf 970 getf.exp r_signexp_x = f_NORM_X 971 mov r_exp_denorm = 0x0c001 // Real denorms have exp < this 972 fmerge.s f_ABS_X = f0, f_NORM_X 973} 974;; 975 976{ .mfi 977 nop.m 0 978 fcmp.eq.s0 p10,p0 = f8, f0 // Set denorm flag 979 nop.i 0 980} 981;; 982 983// Set p8 if really a denorm 984{ .mmi 985 and r_exp_x = r_exp_mask, r_signexp_x 986;; 987 cmp.lt p8,p9 = r_exp_x, r_exp_denorm 988 nop.i 0 989} 990;; 991 992// Identify denormal operands. 993{ .mfb 994 nop.m 0 995(p8) fma.s0 f8 = f8,f8,f1 // If x denorm, result=1+x^2 996(p9) br.cond.sptk COSH_COMMON // Return to main path if x unorm 997} 998;; 999 1000{ .mfb 1001 nop.m 0 1002 nop.f 0 1003 br.ret.sptk b0 // Exit if x denorm 1004} 1005;; 1006 1007 1008// Here if |x| >= overflow limit 1009COSH_HUGE: 1010// for COSH_HUGE, put 24000 in exponent; take sign from input 1011{ .mmi 1012 mov r_exp_huge = 0x15dbf 1013;; 1014 setf.exp f_huge = r_exp_huge 1015 nop.i 0 1016} 1017;; 1018 1019{ .mfi 1020 alloc r32 = ar.pfs,0,5,4,0 1021 fma.s1 f_signed_hi_lo = f_huge, f1, f1 1022 nop.i 0 1023} 1024;; 1025 1026{ .mfi 1027 nop.m 0 1028 fma.s0 f_pre_result = f_signed_hi_lo, f_huge, f0 1029 mov GR_Parameter_TAG = 63 1030} 1031;; 1032 1033GLOBAL_IEEE754_END(coshl) 1034libm_alias_ldouble_other (__cosh, cosh) 1035 1036 1037LOCAL_LIBM_ENTRY(__libm_error_region) 1038.prologue 1039 1040{ .mfi 1041 add GR_Parameter_Y=-32,sp // Parameter 2 value 1042 nop.f 0 1043.save ar.pfs,GR_SAVE_PFS 1044 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 1045} 1046{ .mfi 1047.fframe 64 1048 add sp=-64,sp // Create new stack 1049 nop.f 0 1050 mov GR_SAVE_GP=gp // Save gp 1051};; 1052 1053{ .mmi 1054 stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack 1055 add GR_Parameter_X = 16,sp // Parameter 1 address 1056.save b0, GR_SAVE_B0 1057 mov GR_SAVE_B0=b0 // Save b0 1058};; 1059 1060.body 1061{ .mib 1062 stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack 1063 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address 1064 nop.b 0 1065} 1066{ .mib 1067 stfe [GR_Parameter_Y] = f_pre_result // STORE Parameter 3 on stack 1068 add GR_Parameter_Y = -16,GR_Parameter_Y 1069 br.call.sptk b0=__libm_error_support# // Call error handling function 1070};; 1071 1072{ .mmi 1073 add GR_Parameter_RESULT = 48,sp 1074 nop.m 0 1075 nop.i 0 1076};; 1077 1078{ .mmi 1079 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack 1080.restore sp 1081 add sp = 64,sp // Restore stack pointer 1082 mov b0 = GR_SAVE_B0 // Restore return address 1083};; 1084 1085{ .mib 1086 mov gp = GR_SAVE_GP // Restore gp 1087 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 1088 br.ret.sptk b0 // Return 1089};; 1090 1091LOCAL_LIBM_END(__libm_error_region) 1092 1093 1094.type __libm_error_support#,@function 1095.global __libm_error_support# 1096