1.file "sinhl.s" 2 3 4// Copyright (c) 2000 - 2002, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//============================================================== 41// 02/02/00 Initial version 42// 04/04/00 Unwind support added 43// 08/15/00 Bundle added after call to __libm_error_support to properly 44// set [the previously overwritten] GR_Parameter_RESULT. 45// 10/12/00 Update to set denormal operand and underflow flags 46// 01/22/01 Fixed to set inexact flag for small args. Fixed incorrect 47// call to __libm_error_support for 710.476 < x < 11357.2166. 48// 05/02/01 Reworked to improve speed of all paths 49// 05/20/02 Cleaned up namespace and sf0 syntax 50// 12/04/02 Improved performance 51// 52// API 53//============================================================== 54// long double = sinhl(long double) 55// input floating point f8 56// output floating point f8 57// 58// Registers used 59//============================================================== 60// general registers: 61// r14 -> r40 62// predicate registers used: 63// p6 -> p11 64// floating-point registers used: 65// f9 -> f15; f32 -> f90; 66// f8 has input, then output 67// 68// Overview of operation 69//============================================================== 70// There are seven paths 71// 1. 0 < |x| < 0.25 SINH_BY_POLY 72// 2. 0.25 <=|x| < 32 SINH_BY_TBL 73// 3. 32 <= |x| < 11357.21655 SINH_BY_EXP (merged path with SINH_BY_TBL) 74// 4. |x| >= 11357.21655 SINH_HUGE 75// 5. x=0 Done with early exit 76// 6. x=inf,nan Done with early exit 77// 7. x=denormal SINH_DENORM 78// 79// For double extended we get overflow for x >= 400c b174 ddc0 31ae c0ea 80// >= 11357.21655 81// 82// 83// 1. SINH_BY_POLY 0 < |x| < 0.25 84// =============== 85// Evaluate sinh(x) by a 13th order polynomial 86// Care is take for the order of multiplication; and P_1 is not exactly 1/3!, 87// P_2 is not exactly 1/5!, etc. 88// sinh(x) = sign * (series(e^x) - series(e^-x))/2 89// = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11! 90// + ax^13/13!) 91// = sign * (ax + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) ) 92// + ax * ( ax^4 * (1/5! + ax^4 * (1/9! + ax^4*1/13!)) )) 93// = sign * (ax + ax*p_odd + (ax*p_even)) 94// = sign * (ax + Y_lo) 95// sinh(x) = sign * (Y_hi + Y_lo) 96// Note that ax = |x| 97// 98// 2. SINH_BY_TBL 0.25 <= |x| < 32.0 99// ============= 100// sinh(x) = sinh(B+R) 101// = sinh(B)cosh(R) + cosh(B)sinh(R) 102// 103// ax = |x| = M*log2/64 + R 104// B = M*log2/64 105// M = 64*N + j 106// We will calculate M and get N as (M-j)/64 107// The division is a shift. 108// exp(B) = exp(N*log2 + j*log2/64) 109// = 2^N * 2^(j*log2/64) 110// sinh(B) = 1/2(e^B -e^-B) 111// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64)) 112// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64)) 113// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64)) 114// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32 115// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit) 116// 117// R = ax - M*log2/64 118// R = ax - M*log2_by_64_hi - M*log2_by_64_lo 119// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...) 120// = 1 + p_odd + p_even 121// where the p_even uses the A coefficients and the p_even uses 122// the B coefficients 123// 124// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd 125// cosh(R) = 1 + p_even 126// sinh(B) = S_hi + S_lo 127// cosh(B) = C_hi 128// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R) 129// 130// 3. SINH_BY_EXP 32.0 <= |x| < 11357.21655 ( 400c b174 ddc0 31ae c0ea ) 131// ============== 132// Can approximate result by exp(x)/2 in this region. 133// Y_hi = Tjhi 134// Y_lo = Tjhi * (p_odd + p_even) + Tjlo 135// sinh(x) = Y_hi + Y_lo 136// 137// 4. SINH_HUGE |x| >= 11357.21655 ( 400c b174 ddc0 31ae c0ea ) 138// ============ 139// Set error tag and call error support 140// 141// 142// Assembly macros 143//============================================================== 144r_ad5 = r14 145r_rshf_2to57 = r15 146r_exp_denorm = r15 147r_ad_mJ_lo = r15 148r_ad_J_lo = r16 149r_2Nm1 = r17 150r_2mNm1 = r18 151r_exp_x = r18 152r_ad_J_hi = r19 153r_ad2o = r19 154r_ad_mJ_hi = r20 155r_mj = r21 156r_ad2e = r22 157r_ad3 = r23 158r_ad1 = r24 159r_Mmj = r24 160r_rshf = r25 161r_M = r25 162r_N = r25 163r_jshf = r26 164r_exp_2tom57 = r26 165r_j = r26 166r_exp_mask = r27 167r_signexp_x = r28 168r_signexp_sgnx_0_5 = r28 169r_exp_0_25 = r29 170r_sig_inv_ln2 = r30 171r_exp_32 = r30 172r_exp_huge = r30 173r_ad4 = r31 174 175GR_SAVE_PFS = r34 176GR_SAVE_B0 = r35 177GR_SAVE_GP = r36 178 179GR_Parameter_X = r37 180GR_Parameter_Y = r38 181GR_Parameter_RESULT = r39 182GR_Parameter_TAG = r40 183 184 185f_ABS_X = f9 186f_X2 = f10 187f_X4 = f11 188f_tmp = f14 189f_RSHF = f15 190 191f_Inv_log2by64 = f32 192f_log2by64_lo = f33 193f_log2by64_hi = f34 194f_A1 = f35 195 196f_A2 = f36 197f_A3 = f37 198f_Rcub = f38 199f_M_temp = f39 200f_R_temp = f40 201 202f_Rsq = f41 203f_R = f42 204f_M = f43 205f_B1 = f44 206f_B2 = f45 207 208f_B3 = f46 209f_peven_temp1 = f47 210f_peven_temp2 = f48 211f_peven = f49 212f_podd_temp1 = f50 213 214f_podd_temp2 = f51 215f_podd = f52 216f_poly65 = f53 217f_poly6543 = f53 218f_poly6to1 = f53 219f_poly43 = f54 220f_poly21 = f55 221 222f_X3 = f56 223f_INV_LN2_2TO63 = f57 224f_RSHF_2TO57 = f58 225f_2TOM57 = f59 226f_smlst_oflow_input = f60 227 228f_pre_result = f61 229f_huge = f62 230f_spos = f63 231f_sneg = f64 232f_Tjhi = f65 233 234f_Tjlo = f66 235f_Tmjhi = f67 236f_Tmjlo = f68 237f_S_hi = f69 238f_SC_hi_temp = f70 239 240f_S_lo_temp1 = f71 241f_S_lo_temp2 = f72 242f_S_lo_temp3 = f73 243f_S_lo_temp4 = f73 244f_S_lo = f74 245f_C_hi = f75 246 247f_Y_hi = f77 248f_Y_lo_temp = f78 249f_Y_lo = f79 250f_NORM_X = f80 251 252f_P1 = f81 253f_P2 = f82 254f_P3 = f83 255f_P4 = f84 256f_P5 = f85 257 258f_P6 = f86 259f_Tjhi_spos = f87 260f_Tjlo_spos = f88 261f_huge = f89 262f_signed_hi_lo = f90 263 264 265// Data tables 266//============================================================== 267 268// DO NOT CHANGE ORDER OF THESE TABLES 269RODATA 270 271.align 16 272LOCAL_OBJECT_START(sinh_arg_reduction) 273// data8 0xB8AA3B295C17F0BC, 0x00004005 // 64/log2 -- signif loaded with setf 274 data8 0xB17217F7D1000000, 0x00003FF8 // log2/64 high part 275 data8 0xCF79ABC9E3B39804, 0x00003FD0 // log2/64 low part 276 data8 0xb174ddc031aec0ea, 0x0000400c // Smallest x to overflow (11357.21655) 277LOCAL_OBJECT_END(sinh_arg_reduction) 278 279LOCAL_OBJECT_START(sinh_p_table) 280 data8 0xB08AF9AE78C1239F, 0x00003FDE // P6 281 data8 0xB8EF1D28926D8891, 0x00003FEC // P4 282 data8 0x8888888888888412, 0x00003FF8 // P2 283 data8 0xD732377688025BE9, 0x00003FE5 // P5 284 data8 0xD00D00D00D4D39F2, 0x00003FF2 // P3 285 data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC // P1 286LOCAL_OBJECT_END(sinh_p_table) 287 288LOCAL_OBJECT_START(sinh_ab_table) 289 data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC // A1 290 data8 0x88888888884ECDD5, 0x00003FF8 // A2 291 data8 0xD00D0C6DCC26A86B, 0x00003FF2 // A3 292 data8 0x8000000000000002, 0x00003FFE // B1 293 data8 0xAAAAAAAAAA402C77, 0x00003FFA // B2 294 data8 0xB60B6CC96BDB144D, 0x00003FF5 // B3 295LOCAL_OBJECT_END(sinh_ab_table) 296 297LOCAL_OBJECT_START(sinh_j_hi_table) 298 data8 0xB504F333F9DE6484, 0x00003FFE 299 data8 0xB6FD91E328D17791, 0x00003FFE 300 data8 0xB8FBAF4762FB9EE9, 0x00003FFE 301 data8 0xBAFF5AB2133E45FB, 0x00003FFE 302 data8 0xBD08A39F580C36BF, 0x00003FFE 303 data8 0xBF1799B67A731083, 0x00003FFE 304 data8 0xC12C4CCA66709456, 0x00003FFE 305 data8 0xC346CCDA24976407, 0x00003FFE 306 data8 0xC5672A115506DADD, 0x00003FFE 307 data8 0xC78D74C8ABB9B15D, 0x00003FFE 308 data8 0xC9B9BD866E2F27A3, 0x00003FFE 309 data8 0xCBEC14FEF2727C5D, 0x00003FFE 310 data8 0xCE248C151F8480E4, 0x00003FFE 311 data8 0xD06333DAEF2B2595, 0x00003FFE 312 data8 0xD2A81D91F12AE45A, 0x00003FFE 313 data8 0xD4F35AABCFEDFA1F, 0x00003FFE 314 data8 0xD744FCCAD69D6AF4, 0x00003FFE 315 data8 0xD99D15C278AFD7B6, 0x00003FFE 316 data8 0xDBFBB797DAF23755, 0x00003FFE 317 data8 0xDE60F4825E0E9124, 0x00003FFE 318 data8 0xE0CCDEEC2A94E111, 0x00003FFE 319 data8 0xE33F8972BE8A5A51, 0x00003FFE 320 data8 0xE5B906E77C8348A8, 0x00003FFE 321 data8 0xE8396A503C4BDC68, 0x00003FFE 322 data8 0xEAC0C6E7DD24392F, 0x00003FFE 323 data8 0xED4F301ED9942B84, 0x00003FFE 324 data8 0xEFE4B99BDCDAF5CB, 0x00003FFE 325 data8 0xF281773C59FFB13A, 0x00003FFE 326 data8 0xF5257D152486CC2C, 0x00003FFE 327 data8 0xF7D0DF730AD13BB9, 0x00003FFE 328 data8 0xFA83B2DB722A033A, 0x00003FFE 329 data8 0xFD3E0C0CF486C175, 0x00003FFE 330 data8 0x8000000000000000, 0x00003FFF // Center of table 331 data8 0x8164D1F3BC030773, 0x00003FFF 332 data8 0x82CD8698AC2BA1D7, 0x00003FFF 333 data8 0x843A28C3ACDE4046, 0x00003FFF 334 data8 0x85AAC367CC487B15, 0x00003FFF 335 data8 0x871F61969E8D1010, 0x00003FFF 336 data8 0x88980E8092DA8527, 0x00003FFF 337 data8 0x8A14D575496EFD9A, 0x00003FFF 338 data8 0x8B95C1E3EA8BD6E7, 0x00003FFF 339 data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF 340 data8 0x8EA4398B45CD53C0, 0x00003FFF 341 data8 0x9031DC431466B1DC, 0x00003FFF 342 data8 0x91C3D373AB11C336, 0x00003FFF 343 data8 0x935A2B2F13E6E92C, 0x00003FFF 344 data8 0x94F4EFA8FEF70961, 0x00003FFF 345 data8 0x96942D3720185A00, 0x00003FFF 346 data8 0x9837F0518DB8A96F, 0x00003FFF 347 data8 0x99E0459320B7FA65, 0x00003FFF 348 data8 0x9B8D39B9D54E5539, 0x00003FFF 349 data8 0x9D3ED9A72CFFB751, 0x00003FFF 350 data8 0x9EF5326091A111AE, 0x00003FFF 351 data8 0xA0B0510FB9714FC2, 0x00003FFF 352 data8 0xA27043030C496819, 0x00003FFF 353 data8 0xA43515AE09E6809E, 0x00003FFF 354 data8 0xA5FED6A9B15138EA, 0x00003FFF 355 data8 0xA7CD93B4E965356A, 0x00003FFF 356 data8 0xA9A15AB4EA7C0EF8, 0x00003FFF 357 data8 0xAB7A39B5A93ED337, 0x00003FFF 358 data8 0xAD583EEA42A14AC6, 0x00003FFF 359 data8 0xAF3B78AD690A4375, 0x00003FFF 360 data8 0xB123F581D2AC2590, 0x00003FFF 361 data8 0xB311C412A9112489, 0x00003FFF 362 data8 0xB504F333F9DE6484, 0x00003FFF 363LOCAL_OBJECT_END(sinh_j_hi_table) 364 365LOCAL_OBJECT_START(sinh_j_lo_table) 366 data4 0x1EB2FB13 367 data4 0x1CE2CBE2 368 data4 0x1DDC3CBC 369 data4 0x1EE9AA34 370 data4 0x9EAEFDC1 371 data4 0x9DBF517B 372 data4 0x1EF88AFB 373 data4 0x1E03B216 374 data4 0x1E78AB43 375 data4 0x9E7B1747 376 data4 0x9EFE3C0E 377 data4 0x9D36F837 378 data4 0x9DEE53E4 379 data4 0x9E24AE8E 380 data4 0x1D912473 381 data4 0x1EB243BE 382 data4 0x1E669A2F 383 data4 0x9BBC610A 384 data4 0x1E761035 385 data4 0x9E0BE175 386 data4 0x1CCB12A1 387 data4 0x1D1BFE90 388 data4 0x1DF2F47A 389 data4 0x1EF22F22 390 data4 0x9E3F4A29 391 data4 0x1EC01A5B 392 data4 0x1E8CAC3A 393 data4 0x9DBB3FAB 394 data4 0x1EF73A19 395 data4 0x9BB795B5 396 data4 0x1EF84B76 397 data4 0x9EF5818B 398 data4 0x00000000 // Center of table 399 data4 0x1F77CACA 400 data4 0x1EF8A91D 401 data4 0x1E57C976 402 data4 0x9EE8DA92 403 data4 0x1EE85C9F 404 data4 0x1F3BF1AF 405 data4 0x1D80CA1E 406 data4 0x9D0373AF 407 data4 0x9F167097 408 data4 0x1EB70051 409 data4 0x1F6EB029 410 data4 0x1DFD6D8E 411 data4 0x9EB319B0 412 data4 0x1EBA2BEB 413 data4 0x1F11D537 414 data4 0x1F0D5A46 415 data4 0x9E5E7BCA 416 data4 0x9F3AAFD1 417 data4 0x9E86DACC 418 data4 0x9F3EDDC2 419 data4 0x1E496E3D 420 data4 0x9F490BF6 421 data4 0x1DD1DB48 422 data4 0x1E65EBFB 423 data4 0x9F427496 424 data4 0x1F283C4A 425 data4 0x1F4B0047 426 data4 0x1F130152 427 data4 0x9E8367C0 428 data4 0x9F705F90 429 data4 0x1EFB3C53 430 data4 0x1F32FB13 431LOCAL_OBJECT_END(sinh_j_lo_table) 432 433 434.section .text 435GLOBAL_IEEE754_ENTRY(sinhl) 436 437{ .mlx 438 getf.exp r_signexp_x = f8 // Get signexp of x, must redo if unorm 439 movl r_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 440} 441{ .mlx 442 addl r_ad1 = @ltoff(sinh_arg_reduction), gp 443 movl r_rshf_2to57 = 0x4778000000000000 // 1.10000 2^(63+57) 444} 445;; 446 447{ .mfi 448 ld8 r_ad1 = [r_ad1] 449 fmerge.s f_ABS_X = f0,f8 450 mov r_exp_0_25 = 0x0fffd // Form exponent for 0.25 451} 452{ .mfi 453 nop.m 0 454 fnorm.s1 f_NORM_X = f8 455 mov r_exp_2tom57 = 0xffff-57 456} 457;; 458 459{ .mfi 460 setf.d f_RSHF_2TO57 = r_rshf_2to57 // Form const 1.100 * 2^120 461 fclass.m p10,p0 = f8, 0x0b // Test for denorm 462 mov r_exp_mask = 0x1ffff 463} 464{ .mlx 465 setf.sig f_INV_LN2_2TO63 = r_sig_inv_ln2 // Form 1/ln2 * 2^63 466 movl r_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift 467} 468;; 469 470{ .mfi 471 nop.m 0 472 fclass.m p7,p0 = f8, 0x07 // Test if x=0 473 nop.i 0 474} 475{ .mfi 476 setf.exp f_2TOM57 = r_exp_2tom57 // Form 2^-57 for scaling 477 nop.f 0 478 add r_ad3 = 0x90, r_ad1 // Point to ab_table 479} 480;; 481 482{ .mfi 483 setf.d f_RSHF = r_rshf // Form right shift const 1.100 * 2^63 484 fclass.m p6,p0 = f8, 0xe3 // Test if x nan, inf 485 add r_ad4 = 0x2f0, r_ad1 // Point to j_hi_table midpoint 486} 487{ .mib 488 add r_ad2e = 0x20, r_ad1 // Point to p_table 489 nop.i 0 490(p10) br.cond.spnt SINH_DENORM // Branch if x denorm 491} 492;; 493 494// Common path -- return here from SINH_DENORM if x is unnorm 495SINH_COMMON: 496{ .mfi 497 ldfe f_smlst_oflow_input = [r_ad2e],16 498 nop.f 0 499 add r_ad5 = 0x580, r_ad1 // Point to j_lo_table midpoint 500} 501{ .mib 502 ldfe f_log2by64_hi = [r_ad1],16 503 and r_exp_x = r_exp_mask, r_signexp_x 504(p7) br.ret.spnt b0 // Exit if x=0 505} 506;; 507 508// Get the A coefficients for SINH_BY_TBL 509{ .mfi 510 ldfe f_A1 = [r_ad3],16 511 fcmp.lt.s1 p8,p9 = f8,f0 // Test for x<0 512 cmp.lt p7,p0 = r_exp_x, r_exp_0_25 // Test x < 0.25 513} 514{ .mfb 515 add r_ad2o = 0x30, r_ad2e // Point to p_table odd coeffs 516(p6) fma.s0 f8 = f8,f1,f0 // Result for x nan, inf 517(p6) br.ret.spnt b0 // Exit for x nan, inf 518} 519;; 520 521// Calculate X2 = ax*ax for SINH_BY_POLY 522{ .mfi 523 ldfe f_log2by64_lo = [r_ad1],16 524 nop.f 0 525 nop.i 0 526} 527{ .mfb 528 ldfe f_A2 = [r_ad3],16 529 fma.s1 f_X2 = f_NORM_X, f_NORM_X, f0 530(p7) br.cond.spnt SINH_BY_POLY 531} 532;; 533 534// Here if |x| >= 0.25 535SINH_BY_TBL: 536// ****************************************************** 537// STEP 1 (TBL and EXP) - Argument reduction 538// ****************************************************** 539// Get the following constants. 540// Inv_log2by64 541// log2by64_hi 542// log2by64_lo 543 544 545// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and 546// put them in an exponent. 547// f_spos = 2^(N-1) and f_sneg = 2^(-N-1) 548// 0xffff + (N-1) = 0xffff +N -1 549// 0xffff - (N +1) = 0xffff -N -1 550 551 552// Calculate M and keep it as integer and floating point. 553// M = round-to-integer(x*Inv_log2by64) 554// f_M = M = truncate(ax/(log2/64)) 555// Put the integer representation of M in r_M 556// and the floating point representation of M in f_M 557 558// Get the remaining A,B coefficients 559{ .mmi 560 ldfe f_A3 = [r_ad3],16 561 nop.m 0 562 nop.i 0 563} 564;; 565 566.pred.rel "mutex",p8,p9 567// Use constant (1.100*2^(63-6)) to get rounded M into rightmost significand 568// |x| * 64 * 1/ln2 * 2^(63-6) + 1.1000 * 2^(63+(63-6)) 569{ .mfi 570(p8) mov r_signexp_sgnx_0_5 = 0x2fffe // signexp of -0.5 571 fma.s1 f_M_temp = f_ABS_X, f_INV_LN2_2TO63, f_RSHF_2TO57 572(p9) mov r_signexp_sgnx_0_5 = 0x0fffe // signexp of +0.5 573} 574;; 575 576// Test for |x| >= overflow limit 577{ .mfi 578 ldfe f_B1 = [r_ad3],16 579 fcmp.ge.s1 p6,p0 = f_ABS_X, f_smlst_oflow_input 580 nop.i 0 581} 582;; 583 584{ .mfi 585 ldfe f_B2 = [r_ad3],16 586 nop.f 0 587 mov r_exp_32 = 0x10004 588} 589;; 590 591// Subtract RSHF constant to get rounded M as a floating point value 592// M_temp * 2^(63-6) - 2^63 593{ .mfb 594 ldfe f_B3 = [r_ad3],16 595 fms.s1 f_M = f_M_temp, f_2TOM57, f_RSHF 596(p6) br.cond.spnt SINH_HUGE // Branch if result will overflow 597} 598;; 599 600{ .mfi 601 getf.sig r_M = f_M_temp 602 nop.f 0 603 cmp.ge p7,p6 = r_exp_x, r_exp_32 // Test if x >= 32 604} 605;; 606 607// Calculate j. j is the signed extension of the six lsb of M. It 608// has a range of -32 thru 31. 609 610// Calculate R 611// ax - M*log2by64_hi 612// R = (ax - M*log2by64_hi) - M*log2by64_lo 613 614{ .mfi 615 nop.m 0 616 fnma.s1 f_R_temp = f_M, f_log2by64_hi, f_ABS_X 617 and r_j = 0x3f, r_M 618} 619;; 620 621{ .mii 622 nop.m 0 623 shl r_jshf = r_j, 0x2 // Shift j so can sign extend it 624;; 625 sxt1 r_jshf = r_jshf 626} 627;; 628 629{ .mii 630 nop.m 0 631 shr r_j = r_jshf, 0x2 // Now j has range -32 to 31 632 nop.i 0 633} 634;; 635 636{ .mmi 637 shladd r_ad_J_hi = r_j, 4, r_ad4 // pointer to Tjhi 638 sub r_Mmj = r_M, r_j // M-j 639 sub r_mj = r0, r_j // Form -j 640} 641;; 642 643// The TBL and EXP branches are merged and predicated 644// If TBL, p6 true, 0.25 <= |x| < 32 645// If EXP, p7 true, 32 <= |x| < overflow_limit 646// 647// N = (M-j)/64 648{ .mfi 649 ldfe f_Tjhi = [r_ad_J_hi] 650 fnma.s1 f_R = f_M, f_log2by64_lo, f_R_temp 651 shr r_N = r_Mmj, 0x6 // N = (M-j)/64 652} 653{ .mfi 654 shladd r_ad_mJ_hi = r_mj, 4, r_ad4 // pointer to Tmjhi 655 nop.f 0 656 shladd r_ad_mJ_lo = r_mj, 2, r_ad5 // pointer to Tmjlo 657} 658;; 659 660{ .mfi 661 sub r_2mNm1 = r_signexp_sgnx_0_5, r_N // signexp sgnx*2^(-N-1) 662 nop.f 0 663 shladd r_ad_J_lo = r_j, 2, r_ad5 // pointer to Tjlo 664} 665{ .mfi 666 ldfe f_Tmjhi = [r_ad_mJ_hi] 667 nop.f 0 668 add r_2Nm1 = r_signexp_sgnx_0_5, r_N // signexp sgnx*2^(N-1) 669} 670;; 671 672{ .mmf 673 ldfs f_Tmjlo = [r_ad_mJ_lo] 674 setf.exp f_sneg = r_2mNm1 // Form sgnx * 2^(-N-1) 675 nop.f 0 676} 677;; 678 679{ .mmf 680 ldfs f_Tjlo = [r_ad_J_lo] 681 setf.exp f_spos = r_2Nm1 // Form sgnx * 2^(N-1) 682 nop.f 0 683} 684;; 685 686// ****************************************************** 687// STEP 2 (TBL and EXP) 688// ****************************************************** 689// Calculate Rsquared and Rcubed in preparation for p_even and p_odd 690 691{ .mmf 692 nop.m 0 693 nop.m 0 694 fma.s1 f_Rsq = f_R, f_R, f0 695} 696;; 697 698 699// Calculate p_even 700// B_2 + Rsq *B_3 701// B_1 + Rsq * (B_2 + Rsq *B_3) 702// p_even = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) 703{ .mfi 704 nop.m 0 705 fma.s1 f_peven_temp1 = f_Rsq, f_B3, f_B2 706 nop.i 0 707} 708// Calculate p_odd 709// A_2 + Rsq *A_3 710// A_1 + Rsq * (A_2 + Rsq *A_3) 711// podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) 712{ .mfi 713 nop.m 0 714 fma.s1 f_podd_temp1 = f_Rsq, f_A3, f_A2 715 nop.i 0 716} 717;; 718 719{ .mfi 720 nop.m 0 721 fma.s1 f_Rcub = f_Rsq, f_R, f0 722 nop.i 0 723} 724;; 725 726// 727// If TBL, 728// Calculate S_hi and S_lo, and C_hi 729// SC_hi_temp = sneg * Tmjhi 730// S_hi = spos * Tjhi - SC_hi_temp 731// S_hi = spos * Tjhi - (sneg * Tmjhi) 732// C_hi = spos * Tjhi + SC_hi_temp 733// C_hi = spos * Tjhi + (sneg * Tmjhi) 734 735{ .mfi 736 nop.m 0 737(p6) fma.s1 f_SC_hi_temp = f_sneg, f_Tmjhi, f0 738 nop.i 0 739} 740;; 741 742// If TBL, 743// S_lo_temp3 = sneg * Tmjlo 744// S_lo_temp4 = spos * Tjlo - S_lo_temp3 745// S_lo_temp4 = spos * Tjlo -(sneg * Tmjlo) 746{ .mfi 747 nop.m 0 748(p6) fma.s1 f_S_lo_temp3 = f_sneg, f_Tmjlo, f0 749 nop.i 0 750} 751;; 752 753{ .mfi 754 nop.m 0 755 fma.s1 f_peven_temp2 = f_Rsq, f_peven_temp1, f_B1 756 nop.i 0 757} 758{ .mfi 759 nop.m 0 760 fma.s1 f_podd_temp2 = f_Rsq, f_podd_temp1, f_A1 761 nop.i 0 762} 763;; 764 765// If EXP, 766// Compute sgnx * 2^(N-1) * Tjhi and sgnx * 2^(N-1) * Tjlo 767{ .mfi 768 nop.m 0 769(p7) fma.s1 f_Tjhi_spos = f_Tjhi, f_spos, f0 770 nop.i 0 771} 772{ .mfi 773 nop.m 0 774(p7) fma.s1 f_Tjlo_spos = f_Tjlo, f_spos, f0 775 nop.i 0 776} 777;; 778 779{ .mfi 780 nop.m 0 781(p6) fms.s1 f_S_hi = f_spos, f_Tjhi, f_SC_hi_temp 782 nop.i 0 783} 784;; 785 786{ .mfi 787 nop.m 0 788(p6) fma.s1 f_C_hi = f_spos, f_Tjhi, f_SC_hi_temp 789 nop.i 0 790} 791{ .mfi 792 nop.m 0 793(p6) fms.s1 f_S_lo_temp4 = f_spos, f_Tjlo, f_S_lo_temp3 794 nop.i 0 795} 796;; 797 798{ .mfi 799 nop.m 0 800 fma.s1 f_peven = f_Rsq, f_peven_temp2, f0 801 nop.i 0 802} 803{ .mfi 804 nop.m 0 805 fma.s1 f_podd = f_podd_temp2, f_Rcub, f_R 806 nop.i 0 807} 808;; 809 810// If TBL, 811// S_lo_temp1 = spos * Tjhi - S_hi 812// S_lo_temp2 = -sneg * Tmjlo + S_lo_temp1 813// S_lo_temp2 = -sneg * Tmjlo + (spos * Tjhi - S_hi) 814 815{ .mfi 816 nop.m 0 817(p6) fms.s1 f_S_lo_temp1 = f_spos, f_Tjhi, f_S_hi 818 nop.i 0 819} 820;; 821 822{ .mfi 823 nop.m 0 824(p6) fnma.s1 f_S_lo_temp2 = f_sneg, f_Tmjhi, f_S_lo_temp1 825 nop.i 0 826} 827;; 828 829// If EXP, 830// Y_hi = sgnx * 2^(N-1) * Tjhi 831// Y_lo = sgnx * 2^(N-1) * Tjhi * (p_odd + p_even) + sgnx * 2^(N-1) * Tjlo 832{ .mfi 833 nop.m 0 834(p7) fma.s1 f_Y_lo_temp = f_peven, f1, f_podd 835 nop.i 0 836} 837;; 838 839// If TBL, 840// S_lo = S_lo_temp4 + S_lo_temp2 841{ .mfi 842 nop.m 0 843(p6) fma.s1 f_S_lo = f_S_lo_temp4, f1, f_S_lo_temp2 844 nop.i 0 845} 846;; 847 848// If TBL, 849// Y_hi = S_hi 850// Y_lo = C_hi*p_odd + (S_hi*p_even + S_lo) 851{ .mfi 852 nop.m 0 853(p6) fma.s1 f_Y_lo_temp = f_S_hi, f_peven, f_S_lo 854 nop.i 0 855} 856;; 857 858{ .mfi 859 nop.m 0 860(p7) fma.s1 f_Y_lo = f_Tjhi_spos, f_Y_lo_temp, f_Tjlo_spos 861 nop.i 0 862} 863;; 864 865// Dummy multiply to generate inexact 866{ .mfi 867 nop.m 0 868 fmpy.s0 f_tmp = f_B2, f_B2 869 nop.i 0 870} 871{ .mfi 872 nop.m 0 873(p6) fma.s1 f_Y_lo = f_C_hi, f_podd, f_Y_lo_temp 874 nop.i 0 875} 876;; 877 878// f8 = answer = Y_hi + Y_lo 879{ .mfi 880 nop.m 0 881(p7) fma.s0 f8 = f_Y_lo, f1, f_Tjhi_spos 882 nop.i 0 883} 884;; 885 886// f8 = answer = Y_hi + Y_lo 887{ .mfb 888 nop.m 0 889(p6) fma.s0 f8 = f_Y_lo, f1, f_S_hi 890 br.ret.sptk b0 // Exit for SINH_BY_TBL and SINH_BY_EXP 891} 892;; 893 894 895// Here if 0 < |x| < 0.25 896SINH_BY_POLY: 897{ .mmf 898 ldfe f_P6 = [r_ad2e],16 899 ldfe f_P5 = [r_ad2o],16 900 nop.f 0 901} 902;; 903 904{ .mmi 905 ldfe f_P4 = [r_ad2e],16 906 ldfe f_P3 = [r_ad2o],16 907 nop.i 0 908} 909;; 910 911{ .mmi 912 ldfe f_P2 = [r_ad2e],16 913 ldfe f_P1 = [r_ad2o],16 914 nop.i 0 915} 916;; 917 918{ .mfi 919 nop.m 0 920 fma.s1 f_X3 = f_NORM_X, f_X2, f0 921 nop.i 0 922} 923{ .mfi 924 nop.m 0 925 fma.s1 f_X4 = f_X2, f_X2, f0 926 nop.i 0 927} 928;; 929 930{ .mfi 931 nop.m 0 932 fma.s1 f_poly65 = f_X2, f_P6, f_P5 933 nop.i 0 934} 935{ .mfi 936 nop.m 0 937 fma.s1 f_poly43 = f_X2, f_P4, f_P3 938 nop.i 0 939} 940;; 941 942{ .mfi 943 nop.m 0 944 fma.s1 f_poly21 = f_X2, f_P2, f_P1 945 nop.i 0 946} 947;; 948 949{ .mfi 950 nop.m 0 951 fma.s1 f_poly6543 = f_X4, f_poly65, f_poly43 952 nop.i 0 953} 954;; 955 956{ .mfi 957 nop.m 0 958 fma.s1 f_poly6to1 = f_X4, f_poly6543, f_poly21 959 nop.i 0 960} 961;; 962 963// Dummy multiply to generate inexact 964{ .mfi 965 nop.m 0 966 fmpy.s0 f_tmp = f_P6, f_P6 967 nop.i 0 968} 969{ .mfb 970 nop.m 0 971 fma.s0 f8 = f_poly6to1, f_X3, f_NORM_X 972 br.ret.sptk b0 // Exit SINH_BY_POLY 973} 974;; 975 976 977// Here if x denorm or unorm 978SINH_DENORM: 979// Determine if x really a denorm and not a unorm 980{ .mmf 981 getf.exp r_signexp_x = f_NORM_X 982 mov r_exp_denorm = 0x0c001 // Real denorms have exp < this 983 fmerge.s f_ABS_X = f0, f_NORM_X 984} 985;; 986 987{ .mfi 988 nop.m 0 989 fcmp.eq.s0 p10,p0 = f8, f0 // Set denorm flag 990 nop.i 0 991} 992;; 993 994// Set p8 if really a denorm 995{ .mmi 996 and r_exp_x = r_exp_mask, r_signexp_x 997;; 998 cmp.lt p8,p9 = r_exp_x, r_exp_denorm 999 nop.i 0 1000} 1001;; 1002 1003// Identify denormal operands. 1004{ .mfb 1005 nop.m 0 1006(p8) fcmp.ge.unc.s1 p6,p7 = f8, f0 // Test sign of denorm 1007(p9) br.cond.sptk SINH_COMMON // Return to main path if x unorm 1008} 1009;; 1010 1011{ .mfi 1012 nop.m 0 1013(p6) fma.s0 f8 = f8,f8,f8 // If x +denorm, result=x+x^2 1014 nop.i 0 1015} 1016{ .mfb 1017 nop.m 0 1018(p7) fnma.s0 f8 = f8,f8,f8 // If x -denorm, result=x-x^2 1019 br.ret.sptk b0 // Exit if x denorm 1020} 1021;; 1022 1023 1024// Here if |x| >= overflow limit 1025SINH_HUGE: 1026// for SINH_HUGE, put 24000 in exponent; take sign from input 1027{ .mmi 1028 mov r_exp_huge = 0x15dbf 1029;; 1030 setf.exp f_huge = r_exp_huge 1031 nop.i 0 1032} 1033;; 1034 1035.pred.rel "mutex",p8,p9 1036{ .mfi 1037 alloc r32 = ar.pfs,0,5,4,0 1038(p8) fnma.s1 f_signed_hi_lo = f_huge, f1, f1 1039 nop.i 0 1040} 1041{ .mfi 1042 nop.m 0 1043(p9) fma.s1 f_signed_hi_lo = f_huge, f1, f1 1044 nop.i 0 1045} 1046;; 1047 1048{ .mfi 1049 nop.m 0 1050 fma.s0 f_pre_result = f_signed_hi_lo, f_huge, f0 1051 mov GR_Parameter_TAG = 126 1052} 1053;; 1054 1055GLOBAL_IEEE754_END(sinhl) 1056libm_alias_ldouble_other (__sinh, sinh) 1057 1058 1059LOCAL_LIBM_ENTRY(__libm_error_region) 1060.prologue 1061 1062{ .mfi 1063 add GR_Parameter_Y=-32,sp // Parameter 2 value 1064 nop.f 0 1065.save ar.pfs,GR_SAVE_PFS 1066 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 1067} 1068{ .mfi 1069.fframe 64 1070 add sp=-64,sp // Create new stack 1071 nop.f 0 1072 mov GR_SAVE_GP=gp // Save gp 1073};; 1074 1075{ .mmi 1076 stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack 1077 add GR_Parameter_X = 16,sp // Parameter 1 address 1078.save b0, GR_SAVE_B0 1079 mov GR_SAVE_B0=b0 // Save b0 1080};; 1081 1082.body 1083{ .mib 1084 stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack 1085 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address 1086 nop.b 0 1087} 1088{ .mib 1089 stfe [GR_Parameter_Y] = f_pre_result // STORE Parameter 3 on stack 1090 add GR_Parameter_Y = -16,GR_Parameter_Y 1091 br.call.sptk b0=__libm_error_support# // Call error handling function 1092};; 1093 1094{ .mmi 1095 add GR_Parameter_RESULT = 48,sp 1096 nop.m 0 1097 nop.i 0 1098};; 1099 1100{ .mmi 1101 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack 1102.restore sp 1103 add sp = 64,sp // Restore stack pointer 1104 mov b0 = GR_SAVE_B0 // Restore return address 1105};; 1106 1107{ .mib 1108 mov gp = GR_SAVE_GP // Restore gp 1109 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 1110 br.ret.sptk b0 // Return 1111};; 1112 1113LOCAL_LIBM_END(__libm_error_region) 1114 1115 1116.type __libm_error_support#,@function 1117.global __libm_error_support# 1118