1.file "powl.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39//********************************************************************* 40// 41// Function: powl(x,y), where 42// y 43// powl(x,y) = x , for double extended precision x and y values 44// 45//********************************************************************* 46// 47// History: 48// 02/02/00 (Hand Optimized) 49// 04/04/00 Unwind support added 50// 08/15/00 Bundle added after call to __libm_error_support to properly 51// set [the previously overwritten] GR_Parameter_RESULT. 52// 01/22/01 Corrected results for powl(1,inf), powl(1,nan), and 53// powl(snan,0) to be 1 per C99, not nan. Fixed many flag settings. 54// 02/06/01 Call __libm_error support if over/underflow when y=2. 55// 04/17/01 Support added for y close to 1 and x a non-special value. 56// Shared software under/overflow detection for all paths 57// 02/07/02 Corrected sf3 setting to disable traps 58// 05/13/02 Improved performance of all paths 59// 02/10/03 Reordered header: .section, .global, .proc, .align; 60// used data8 for long double table values 61// 04/17/03 Added missing mutex directive 62// 10/13/03 Corrected .endp names to match .proc names 63// 64//********************************************************************* 65// 66// Resources Used: 67// 68// Floating-Point Registers: 69// f8 (Input x and Return Value) 70// f9 (Input y) 71// f10-f15,f32-f79 72// 73// General Purpose Registers: 74// Locals r14-24,r32-r65 75// Parameters to __libm_error_support r62,r63,r64,r65 76// 77// Predicate Registers: p6-p15 78// 79//********************************************************************* 80// 81// Special Cases and IEEE special conditions: 82// 83// Denormal fault raised on denormal inputs 84// Overflow exceptions raised when appropriate for pow 85// Underflow exceptions raised when appropriate for pow 86// (Error Handling Routine called for overflow and Underflow) 87// Inexact raised when appropriate by algorithm 88// 89// 1. (anything) ** NatVal or (NatVal) ** anything is NatVal 90// 2. X or Y unsupported or sNaN is qNaN/Invalid 91// 3. (anything) ** 0 is 1 92// 4. (anything) ** 1 is itself 93// 5. (anything except 1) ** qNAN is qNAN 94// 6. qNAN ** (anything except 0) is qNAN 95// 7. +-(|x| > 1) ** +INF is +INF 96// 8. +-(|x| > 1) ** -INF is +0 97// 9. +-(|x| < 1) ** +INF is +0 98// 10. +-(|x| < 1) ** -INF is +INF 99// 11. +-1 ** +-INF is +1 100// 12. +0 ** (+anything except 0, NAN) is +0 101// 13. -0 ** (+anything except 0, NAN, odd integer) is +0 102// 14. +0 ** (-anything except 0, NAN) is +INF/div_0 103// 15. -0 ** (-anything except 0, NAN, odd integer) is +INF/div_0 104// 16. -0 ** (odd integer) = -( +0 ** (odd integer) ) 105// 17. +INF ** (+anything except 0,NAN) is +INF 106// 18. +INF ** (-anything except 0,NAN) is +0 107// 19. -INF ** (anything except NAN) = -0 ** (-anything) 108// 20. (-anything) ** (integer) is (-1)**(integer)*(+anything**integer) 109// 21. (-anything except 0 and inf) ** (non-integer) is qNAN/Invalid 110// 22. X or Y denorm/unorm and denorm/unorm operand trap is enabled, 111// generate denorm/unorm fault except if invalid or div_0 raised. 112// 113//********************************************************************* 114// 115// Algorithm 116// ========= 117// 118// Special Cases 119// 120// If Y = 2, return X*X. 121// If Y = 0.5, return sqrt(X). 122// 123// Compute log(X) to extra precision. 124// 125// ker_log_80( X, logX_hi, logX_lo, Safe ); 126// 127// ...logX_hi + logX_lo approximates log(X) to roughly 80 128// ...significant bits of accuracy. 129// 130// Compute Y*log(X) to extra precision. 131// 132// P_hi := Y * logX_hi 133// P_lo := Y * logX_hi - P_hi ...using FMA 134// P_lo := Y * logX_lo + P_lo ...using FMA 135// 136// Compute exp(P_hi + P_lo) 137// 138// Flag := 2; 139// Expo_Range := 2; (assuming double-extended power function) 140// ker_exp_64( P_hi, P_lo, Flag, Expo_Range, 141// Z_hi, Z_lo, scale, Safe ) 142// 143// scale := sgn * scale 144// 145// If (Safe) then ...result will not over/underflow 146// return scale*Z_hi + (scale*Z_lo) 147// quickly 148// Else 149// take necessary precaution in computing 150// scale*Z_hi + (scale*Z_lo) 151// to set possible exceptions correctly. 152// End If 153// 154// Case_Y_Special 155// 156// ...Follow the order of the case checks 157// 158// If Y is +-0, return +1 without raising any exception. 159// If Y is +1, return X without raising any exception. 160// If Y is qNaN, return Y without exception. 161// If X is qNaN, return X without exception. 162// 163// At this point, X is real and Y is +-inf. 164// Thus |X| can only be 1, strictly bigger than 1, or 165// strictly less than 1. 166// 167// If |X| < 1, then 168// return ( Y == +inf? +0 : +inf ) 169// elseif |X| > 1, then 170// return ( Y == +inf? +0 : +inf ) 171// else 172// goto Case_Invalid 173// 174// Case_X_Special 175// 176// ...Follow the order of the case checks 177// ...Note that Y is real, finite, non-zero, and not +1. 178// 179// If X is qNaN, return X without exception. 180// 181// If X is +-0, 182// return ( Y > 0 ? +0 : +inf ) 183// 184// If X is +inf 185// return ( Y > 0 ? +inf : +0 ) 186// 187// If X is -inf 188// return -0 ** -Y 189// return ( Y > 0 ? +inf : +0 ) 190// 191// Case_Invalid 192// 193// Return 0 * inf to generate a quiet NaN together 194// with an invalid exception. 195// 196// Implementation 197// ============== 198// 199// We describe the quick branch since this part is important 200// in reaching the normal case efficiently. 201// 202// STAGE 1 203// ------- 204// This stage contains two threads. 205// 206// Stage1.Thread1 207// 208// fclass.m X_excep, X_ok = X, (NatVal or s/qNaN) or 209// +-0, +-infinity 210// 211// fclass.nm X_unsupp, X_supp = X, (NatVal or s/qNaN) or 212// +-(0, unnorm, norm, infinity) 213// 214// X_norm := fnorm( X ) with traps disabled 215// 216// If (X_excep) goto Filtering (Step 2) 217// If (X_unsupp) goto Filtering (Step 2) 218// 219// Stage1.Thread2 220// .............. 221// 222// fclass.m Y_excep, Y_ok = Y, (NatVal or s/qNaN) or 223// +-0, +-infinity 224// 225// fclass.nm Y_unsupp, Y_supp = Y, (NatVal or s/qNaN) or 226// +-(0, unnorm, norm, infinity) 227// 228// Y_norm := fnorm( Y ) with traps disabled 229// 230// If (Y_excep) goto Filtering (Step 2) 231// If (Y_unsupp) goto Filtering (Step 2) 232// 233// 234// STAGE 2 235// ------- 236// This stage contains two threads. 237// 238// Stage2.Thread1 239// .............. 240// 241// Set X_lt_0 if X < 0 (using fcmp) 242// sgn := +1.0 243// If (X_lt_0) goto Filtering (Step 2) 244// 245// Stage2.Thread2 246// .............. 247// 248// Set Y_is_1 if Y = +1 (using fcmp) 249// If (Y_is_1) goto Filtering (Step 2) 250// 251// STAGE 3 252// ------- 253// This stage contains two threads. 254// 255// 256// Stage3.Thread1 257// .............. 258// 259// X := fnorm(X) in prevailing traps 260// 261// 262// Stage3.Thread2 263// .............. 264// 265// Y := fnorm(Y) in prevailing traps 266// 267// STAGE 4 268// ------- 269// 270// Go to Case_Normal. 271// 272 273 274// ************* DO NOT CHANGE ORDER OF THESE TABLES ******************** 275 276// double-extended 1/ln(2) 277// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88 278// 3fff b8aa 3b29 5c17 f0bc 279// For speed the significand will be loaded directly with a movl and setf.sig 280// and the exponent will be bias+63 instead of bias+0. Thus subsequent 281// computations need to scale appropriately. 282// The constant 2^12/ln(2) is needed for the computation of N. This is also 283// obtained by scaling the computations. 284// 285// Two shifting constants are loaded directly with movl and setf.d. 286// 1. RSHF_2TO51 = 1.1000..00 * 2^(63-12) 287// This constant is added to x*1/ln2 to shift the integer part of 288// x*2^12/ln2 into the rightmost bits of the significand. 289// The result of this fma is N_signif. 290// 2. RSHF = 1.1000..00 * 2^(63) 291// This constant is subtracted from N_signif * 2^(-51) to give 292// the integer part of N, N_fix, as a floating-point number. 293// The result of this fms is float_N. 294RODATA 295 296.align 16 297// L_hi, L_lo 298LOCAL_OBJECT_START(Constants_exp_64_Arg) 299data8 0xB17217F400000000,0x00003FF2 // L_hi = hi part log(2)/2^12 300data8 0xF473DE6AF278ECE6,0x00003FD4 // L_lo = lo part log(2)/2^12 301LOCAL_OBJECT_END(Constants_exp_64_Arg) 302 303LOCAL_OBJECT_START(Constants_exp_64_A) 304// Reversed 305data8 0xAAAAAAABB1B736A0,0x00003FFA 306data8 0xAAAAAAAB90CD6327,0x00003FFC 307data8 0xFFFFFFFFFFFFFFFF,0x00003FFD 308LOCAL_OBJECT_END(Constants_exp_64_A) 309 310LOCAL_OBJECT_START(Constants_exp_64_P) 311// Reversed 312data8 0xD00D6C8143914A8A,0x00003FF2 313data8 0xB60BC4AC30304B30,0x00003FF5 314data8 0x888888887474C518,0x00003FF8 315data8 0xAAAAAAAA8DAE729D,0x00003FFA 316data8 0xAAAAAAAAAAAAAF61,0x00003FFC 317data8 0x80000000000004C7,0x00003FFE 318LOCAL_OBJECT_END(Constants_exp_64_P) 319 320LOCAL_OBJECT_START(Constants_exp_64_T1) 321data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 322data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 323data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC 324data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D 325data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA 326data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516 327data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A 328data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4 329data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B 330data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD 331data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15 332data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B 333data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5 334data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A 335data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177 336data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C 337LOCAL_OBJECT_END(Constants_exp_64_T1) 338 339LOCAL_OBJECT_START(Constants_exp_64_T2) 340data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 341data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 342data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E 343data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349 344data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987 345data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA 346data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610 347data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A 348data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8 349data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA 350data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50 351data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA 352data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 353data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 354data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE 355data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37 356LOCAL_OBJECT_END(Constants_exp_64_T2) 357 358LOCAL_OBJECT_START(Constants_exp_64_W1) 359data8 0x0000000000000000, 0xBE384454171EC4B4 360data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8 361data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36 362data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE 363data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F 364data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329 365data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5 366data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F 367data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF 368data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F 369data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92 370data8 0x3E1336DE2BF82BF8, 0xBE5FF1CBD0F7BD9E 371data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D 372data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29 373data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A 374data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA 375data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6 376data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF 377data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC 378data8 0xBE51C2141AA42614, 0xBE48D087C37293F4 379data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38 380data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962 381data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788 382data8 0xBE60B7EC7857D689, 0xBE594D3DD14F1AD7 383data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2 384data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4 385data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA 386data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B 387data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A 388data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719 389data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D 390data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707 391LOCAL_OBJECT_END(Constants_exp_64_W1) 392 393LOCAL_OBJECT_START(Constants_exp_64_W2) 394data8 0x0000000000000000, 0xBE641F2537A3D7A2 395data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6 396data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE 397data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3 398data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4 399data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B 400data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7 401data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA 402data8 0xBE56856B49BFF529, 0x3E66DD3300508651 403data8 0x3E51165FC114BC13, 0x3E53333DC453290F 404data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696 405data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93 406data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE 407data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22 408data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97 409data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8 410data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC 411data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1 412data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7 413data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D 414data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C 415data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5 416data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9 417data8 0xBE559725ADE45917, 0xBE68C29C042FC476 418data8 0xBE67593B01E511FA, 0xBE4A4313398801ED 419data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E 420data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D 421data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F 422data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1 423data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795 424data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E 425data8 0x3E68BF5C17365712, 0x3E3956F9B3785569 426LOCAL_OBJECT_END(Constants_exp_64_W2) 427 428LOCAL_OBJECT_START(Constants_log_80_P) 429// P_8, P_7, ..., P_1 430data8 0xCCCE8B883B1042BC, 0x0000BFFB // P_8 431data8 0xE38997B7CADC2149, 0x00003FFB // P_7 432data8 0xFFFFFFFEB1ACB090, 0x0000BFFB // P_6 433data8 0x9249249806481C81, 0x00003FFC // P_5 434data8 0x0000000000000000, 0x00000000 // Pad for bank conflicts 435data8 0xAAAAAAAAAAAAB0EF, 0x0000BFFC // P_4 436data8 0xCCCCCCCCCCC91416, 0x00003FFC // P_3 437data8 0x8000000000000000, 0x0000BFFD // P_2 438data8 0xAAAAAAAAAAAAAAAB, 0x00003FFD // P_1 439LOCAL_OBJECT_END(Constants_log_80_P) 440 441LOCAL_OBJECT_START(Constants_log_80_Q) 442// log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1 443data8 0xB172180000000000,0x00003FFE 444data8 0x82E308654361C4C6,0x0000BFE2 445data8 0x92492453A51BE0AF,0x00003FFC 446data8 0xAAAAAB73A0CFD29F,0x0000BFFC 447data8 0xCCCCCCCCCCCE3872,0x00003FFC 448data8 0xFFFFFFFFFFFFB4FB,0x0000BFFC 449data8 0xAAAAAAAAAAAAAAAB,0x00003FFD 450data8 0x8000000000000000,0x0000BFFE 451LOCAL_OBJECT_END(Constants_log_80_Q) 452 453LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h1) 454// Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double 455data4 0x00008000,0x3F800000,0x00000000,0x00000000 456data4 0x00000000,0x00000000,0x00000000,0x00000000 457data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000 458data4 0xEBA0E0D1,0x8B1D330B,0x00003FDA,0x00000000 459data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000 460data4 0x9EADD553,0xE2AF365E,0x00003FE2,0x00000000 461data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000 462data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000 463data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000 464data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000 465data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000 466data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000 467data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000 468data4 0x457978A1,0x8718789F,0x00003FE2,0x00000000 469data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000 470data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000 471data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000 472data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000 473data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000 474data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000 475data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000 476data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000 477data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000 478data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000 479data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000 480data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000 481data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000 482data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000 483data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000 484data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000 485data4 0x00004211,0x3F042108,0x3F29516A,0x00000000 486data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000 487LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h1) 488 489LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h2) 490// Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 IEEE double 491data4 0x00008000,0x3F800000,0x00000000,0x00000000 492data4 0x00000000,0x00000000,0x00000000,0x00000000 493data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000 494data4 0x211398BF,0xAD08B116,0x00003FDB,0x00000000 495data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000 496data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000 497data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000 498data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000 499data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000 500data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000 501data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000 502data4 0x049EB22F,0xD1B87D3C,0x00003FDE,0x00000000 503data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000 504data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000 505data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000 506data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000 507data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000 508data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000 509data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000 510data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000 511data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000 512data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000 513data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000 514data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000 515data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000 516data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000 517data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000 518data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000 519data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000 520data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000 521data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000 522data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000 523LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h2) 524 525LOCAL_OBJECT_START(Constants_log_80_h3_G_H) 526// h3 IEEE double extended, H3 and G3 IEEE single 527data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00 528data4 0x9B7FAD21,0x90051030,0x00003FD8,0x3F7FF400 529data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00 530data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400 531data4 0xCA964D95,0xCE65C1D8,0x0000BFD8,0x3F7FDC00 532data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400 533data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08 534data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408 535data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10 536data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410 537data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18 538data4 0x2F053150,0xB25CA912,0x0000BFDA,0x3F7FA420 539data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20 540data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428 541data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30 542data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438 543data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40 544data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448 545data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50 546data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458 547data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68 548data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470 549data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78 550data4 0x1F34A7EB,0x9A995A97,0x0000BFDC,0x3F7F4488 551data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90 552data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0 553data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8 554data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8 555data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8 556data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8 557data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0 558data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0 559data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here 560data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D 561data4 0x3A87F20B,0x3A97F68B,0x3AA7EB86,0x3AB7E101 562data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED 563data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766 564data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6 565data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620 566data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D 567LOCAL_OBJECT_END(Constants_log_80_h3_G_H) 568 569GR_sig_inv_ln2 = r14 570GR_rshf_2to51 = r15 571GR_exp_2tom51 = r16 572GR_rshf = r17 573GR_exp_half = r18 574GR_sign_mask = r19 575GR_exp_square_oflow = r20 576GR_exp_square_uflow = r21 577GR_exp_ynear1_oflow = r22 578GR_exp_ynear1_uflow = r23 579GR_signif_Z = r24 580 581GR_signexp_x = r32 582 583GR_exp_x = r33 584 585GR_Table_Ptr = r34 586 587GR_Table_Ptr1 = r35 588 589GR_Index1 = r36 590 591GR_Index2 = r37 592GR_Expo_X = r37 593 594GR_M = r38 595 596GR_X_0 = r39 597GR_Mask = r39 598 599GR_X_1 = r40 600GR_W1_ptr = r40 601 602GR_W2_ptr = r41 603GR_X_2 = r41 604 605GR_Z_1 = r42 606GR_M2 = r42 607 608GR_M1 = r43 609GR_Z_2 = r43 610 611GR_N = r44 612GR_k = r44 613 614GR_Big_Pos_Exp = r45 615 616GR_exp_pos_max = r46 617 618GR_exp_bias_p_k = r47 619 620GR_Index3 = r48 621GR_temp = r48 622 623GR_vsm_expo = r49 624 625GR_T1_ptr = r50 626GR_P_ptr1 = r50 627GR_T2_ptr = r51 628GR_P_ptr2 = r51 629GR_N_fix = r52 630GR_exp_y = r53 631GR_signif_y = r54 632GR_signexp_y = r55 633GR_fraction_y = r55 634GR_low_order_bit = r56 635GR_exp_mask = r57 636GR_exp_bias = r58 637GR_y_sign = r59 638GR_table_base = r60 639GR_ptr_exp_Arg = r61 640GR_Delta_Exp = r62 641GR_Special_Exp = r63 642GR_exp_neg_max = r64 643GR_Big_Neg_Exp = r65 644 645//** Registers for unwind support 646 647GR_SAVE_PFS = r59 648GR_SAVE_B0 = r60 649GR_SAVE_GP = r61 650GR_Parameter_X = r62 651GR_Parameter_Y = r63 652GR_Parameter_RESULT = r64 653GR_Parameter_TAG = r65 654 655//** 656 657FR_Input_X = f8 658FR_Result = f8 659FR_Input_Y = f9 660 661FR_Neg = f10 662FR_P_hi = f10 663FR_X = f10 664 665FR_Half = f11 666FR_h_3 = f11 667FR_poly_hi = f11 668 669FR_Sgn = f12 670 671FR_half_W = f13 672 673FR_X_cor = f14 674FR_P_lo = f14 675 676FR_W = f15 677 678FR_X_lo = f32 679 680FR_S = f33 681FR_W3 = f33 682 683FR_Y_hi = f34 684FR_logx_hi = f34 685 686FR_Z = f35 687FR_logx_lo = f35 688FR_GS_hi = f35 689FR_Y_lo = f35 690 691FR_r_cor = f36 692FR_Scale = f36 693 694FR_G_1 = f37 695FR_G = f37 696FR_Wsq = f37 697FR_temp = f37 698 699FR_H_1 = f38 700FR_H = f38 701FR_W4 = f38 702 703FR_h = f39 704FR_h_1 = f39 705FR_N = f39 706FR_P_7 = f39 707 708FR_G_2 = f40 709FR_P_8 = f40 710FR_L_hi = f40 711 712FR_H_2 = f41 713FR_L_lo = f41 714FR_A_1 = f41 715 716FR_h_2 = f42 717 718FR_W1 = f43 719 720FR_G_3 = f44 721FR_P_8 = f44 722FR_T1 = f44 723 724FR_log2_hi = f45 725FR_W2 = f45 726 727FR_GS_lo = f46 728FR_T2 = f46 729 730FR_W_1_p1 = f47 731FR_H_3 = f47 732 733FR_float_N = f48 734 735FR_A_2 = f49 736 737FR_Q_4 = f50 738FR_r4 = f50 739 740FR_Q_3 = f51 741FR_A_3 = f51 742 743FR_Q_2 = f52 744FR_P_2 = f52 745 746FR_Q_1 = f53 747FR_P_1 = f53 748FR_T = f53 749 750FR_Wp1 = f54 751FR_Q_5 = f54 752FR_P_3 = f54 753 754FR_Q_6 = f55 755 756FR_log2_lo = f56 757FR_Two = f56 758 759FR_Big = f57 760 761FR_neg_2_mK = f58 762 763FR_r = f59 764 765FR_poly_lo = f60 766 767FR_poly = f61 768 769FR_P_5 = f62 770FR_Result_small = f62 771 772FR_rsq = f63 773 774FR_Delta = f64 775 776FR_save_Input_X = f65 777FR_norm_X = f66 778FR_norm_Y = f67 779FR_Y_lo_2 = f68 780 781FR_P_6 = f69 782FR_Result_big = f69 783 784FR_RSHF_2TO51 = f70 785FR_INV_LN2_2TO63 = f71 786FR_2TOM51 = f72 787FR_RSHF = f73 788FR_TMP1 = f74 789FR_TMP2 = f75 790FR_TMP3 = f76 791FR_Tscale = f77 792FR_P_4 = f78 793FR_NBig = f79 794 795 796.section .text 797GLOBAL_LIBM_ENTRY(powl) 798// 799// Get significand of x. It is the critical path. 800// 801{ .mfi 802 getf.sig GR_signif_Z = FR_Input_X // Get significand of x 803 fclass.m p11, p12 = FR_Input_X, 0x0b // Test x unorm 804 nop.i 999 805} 806{ .mfi 807 nop.m 999 808 fnorm.s1 FR_norm_X = FR_Input_X // Normalize x 809 mov GR_exp_half = 0xffff - 1 // Exponent for 0.5 810} 811;; 812 813{ .mfi 814 alloc r32 = ar.pfs,0,30,4,0 815 fclass.m p7, p0 = FR_Input_Y, 0x1E7 // Test y natval, nan, inf, zero 816 mov GR_exp_pos_max = 0x13fff // Max exponent for pos oflow test 817} 818{ .mfi 819 addl GR_table_base = @ltoff(Constants_exp_64_Arg#), gp // Ptr to tables 820 fnorm.s1 FR_norm_Y = FR_Input_Y // Normalize y 821 mov GR_exp_neg_max = 0x33fff // Max exponent for neg oflow test 822} 823;; 824 825{ .mfi 826 getf.exp GR_signexp_y = FR_Input_Y // Get sign and exp of y 827(p12) fclass.m p11, p0 = FR_Input_Y, 0x0b // Test y unorm 828 mov GR_sign_mask = 0x20000 // Sign mask 829} 830{ .mfi 831 ld8 GR_table_base = [GR_table_base] // Get base address for tables 832 fadd.s1 FR_Two = f1, f1 // Form 2.0 for square test 833 mov GR_exp_mask = 0x1FFFF // Exponent mask 834} 835;; 836 837{ .mfi 838 getf.sig GR_signif_y = FR_Input_Y // Get significand of y 839 fclass.m p6, p0 = FR_Input_X, 0x1E7 // Test x natval, nan, inf, zero 840 nop.i 999 841} 842;; 843 844{ .mfi 845 getf.exp GR_signexp_x = FR_Input_X // Get signexp of x 846 fmerge.s FR_save_Input_X = FR_Input_X, FR_Input_X 847 extr.u GR_Index1 = GR_signif_Z, 59, 4 // Extract upper 4 signif bits of x 848} 849{ .mfb 850 setf.exp FR_Half = GR_exp_half // Load half 851 nop.f 999 852(p11) br.cond.spnt POWL_DENORM // Branch if x or y denorm/unorm 853} 854;; 855 856// Return here from POWL_DENORM 857POWL_COMMON: 858{ .mfi 859 setf.exp FR_Big = GR_exp_pos_max // Form big pos value for oflow test 860 fclass.nm p11, p0 = FR_Input_Y, 0x1FF // Test Y unsupported 861 shl GR_Index1 = GR_Index1,5 // Adjust index1 pointer x 32 862} 863{ .mfi 864 add GR_Table_Ptr = 0x7c0, GR_table_base // Constants_log_80_Z_G_H_h1 865 fma.s1 FR_Sgn = f1,f1,f0 // Assume result positive 866 mov GR_exp_bias = 0xFFFF // Form exponent bias 867} 868;; 869 870// 871// Identify NatVals, NaNs, Infs, and Zeros. 872// 873// 874// Remove sign bit from exponent of y. 875// Check for x = 1 876// Branch on Infs, Nans, Zeros, and Natvals 877// Check to see that exponent < 0 878// 879{ .mfi 880 setf.exp FR_NBig = GR_exp_neg_max // Form big neg value for oflow test 881 fclass.nm p8, p0 = FR_Input_X, 0x1FF // Test X unsupported 882 and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent of y 883} 884{ .mfb 885 add GR_Index1 = GR_Index1,GR_Table_Ptr 886 nop.f 999 887(p6) br.cond.spnt POWL_64_SPECIAL // Branch if x natval, nan, inf, zero 888} 889;; 890 891// load Z_1 from Index1 892 893// There is logic starting here to determine if y is an integer when x < 0. 894// If 0 < |y| < 1 then clearly y is not an integer. 895// If |y| > 1, then the significand of y is shifted left by the size of 896// the exponent of y. This preserves the lsb of the integer part + the 897// fractional bits. The lsb of the integer can be tested to determine if 898// the integer is even or odd. The fractional bits can be tested. If zero, 899// then y is an integer. 900// 901{ .mfi 902 ld2 GR_Z_1 =[GR_Index1],4 // Load Z_1 903 fmerge.s FR_Z = f0, FR_norm_X // Z = |x| 904 extr.u GR_X_0 = GR_signif_Z, 49, 15 // Extract X_0 from significand 905} 906{ .mfb 907 cmp.lt p9, p0 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1 908 nop.f 999 909(p7) br.cond.spnt POWL_64_SPECIAL // Branch if y natval, nan, inf, zero 910} 911;; 912 913{ .mfb 914 ldfs FR_G_1 = [GR_Index1],4 // Load G_1 915 fcmp.eq.s1 p10, p0 = FR_Input_Y, f1 // Test Y = +1.0 916(p8) br.cond.spnt POWL_64_UNSUPPORT // Branch if x unsupported 917} 918;; 919 920// 921// X_0 = High order 15 bit of Z 922// 923{ .mfb 924 ldfs FR_H_1 = [GR_Index1],8 // Load H_1 925(p9) fcmp.lt.unc.s1 p9, p0 = FR_Input_X, f0 // Test x<0, 0 <|y|<1 926(p11) br.cond.spnt POWL_64_UNSUPPORT // Branch if y unsupported 927} 928;; 929 930{ .mfi 931 ldfe FR_h_1 = [GR_Index1] // Load h_1 932 fcmp.eq.s1 p7, p0 = FR_Input_Y, FR_Two // Test y = 2.0 933 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // X_1 = X_0 * Z_1 (bits 15-30) 934 // Wait 4 cycles to use result 935} 936{ .mfi 937 add GR_Table_Ptr = 0x9c0, GR_table_base // Constants_log_80_Z_G_H_h2 938 nop.f 999 939 sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent of y 940} 941;; 942 943// 944// Branch for (x < 0) and Y not an integer. 945// 946{ .mfb 947 nop.m 999 948 fcmp.lt.s1 p6, p0 = FR_Input_X, f0 // Test x < 0 949(p9) br.cond.spnt POWL_64_XNEG // Branch if x < 0, 0 < |y| < 1 950} 951;; 952 953{ .mfi 954 nop.m 999 955 fcmp.eq.s1 p12, p0 = FR_Input_X, f1 // Test x=+1.0 956 nop.i 999 957} 958{ .mfb 959 nop.m 999 960 fsub.s1 FR_W = FR_Z, f1 // W = Z - 1 961(p7) br.cond.spnt POWL_64_SQUARE // Branch if y=2 962} 963;; 964 965{ .mfi 966 nop.m 999 967(p10) fmpy.s0 FR_Result = FR_Input_X, f1 // If y=+1.0, result=x 968(p6) shl GR_fraction_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction 969 // Wait 4 cycles to use result 970} 971;; 972 973{ .mfi 974 nop.m 999 975(p12) fma.s0 FR_Result = FR_Input_Y, f0, f1 // If x=1.0, result=1, chk denorm 976 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract index2 977} 978;; 979 980// 981// N = exponent of Z 982// 983{ .mib 984 getf.exp GR_N = FR_Z // Get exponent of Z (also x) 985 shl GR_Index2=GR_Index2,5 // Index2 x 32 bytes 986(p10) br.ret.spnt b0 // Exit if y=+1.0 987} 988;; 989 990{ .mib 991 add GR_Index2 = GR_Index2, GR_Table_Ptr // Pointer to table 2 992 nop.i 999 993(p12) br.ret.spnt b0 // Exit if x=+1.0 994} 995;; 996 997{ .mmi 998 ld2 GR_Z_2 =[GR_Index2],4 // Load Z_2 999;; 1000 ldfs FR_G_2 = [GR_Index2],4 // Load G_2 1001 nop.i 999 1002} 1003;; 1004 1005{ .mii 1006 ldfs FR_H_2 = [GR_Index2],8 // Load H_2 1007(p6) tbit.nz.unc p9, p0 = GR_fraction_y, 63 // Test x<0 and y odd integer 1008 add GR_Table_Ptr = 0xbcc, GR_table_base // Constants_log_80_h3_G_H, G_3 1009} 1010;; 1011 1012// 1013// For x < 0 and y odd integer,, set sign = -1. 1014// 1015{ .mfi 1016 getf.exp GR_M = FR_W // Get signexp of W 1017 nop.f 999 1018 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // X_2 = X_1 * Z_2 (bits 15-30) 1019} 1020{ .mfi 1021 ldfe FR_h_2 = [GR_Index2] // Load h_2 1022(p9) fnma.s1 FR_Sgn = f1, f1, f0 // If x<0, y odd int, result negative 1023 sub GR_N = GR_N, GR_exp_bias // Get true exponent of x = N 1024} 1025;; 1026 1027{ .mfi 1028 add GR_Table_Ptr1 = 0xdc0, GR_table_base // Ptr to H_3 1029 fcmp.eq.s0 p11, p0 = FR_Input_Y, FR_Half // Test y=0.5, also set denorm 1030(p6) shl GR_fraction_y= GR_fraction_y, 1 // Shift left 1 to get fraction 1031} 1032;; 1033 1034{ .mmb 1035 setf.sig FR_float_N = GR_N 1036(p6) cmp.ne.unc p8, p0 = GR_fraction_y, r0 // Test x<0 and y not integer 1037(p8) br.cond.spnt POWL_64_XNEG // Branch if x<0 and y not int 1038} 1039;; 1040 1041// 1042// Raise possible denormal operand exception for both X and Y. 1043// Set pointers in case |x| near 1 1044// Branch to embedded sqrt(x) if y=0.5 1045// 1046{ .mfi 1047 add GR_P_ptr1 = 0x6b0, GR_table_base // Constants_log_80_P, P8, NEAR path 1048 fcmp.eq.s0 p12, p0 = FR_Input_X, FR_Input_Y // Dummy to set denormal 1049 add GR_P_ptr2 = 0x700, GR_table_base // Constants_log_80_P, P4, NEAR path 1050} 1051{ .mfb 1052 cmp.eq p15, p14 = r0, r0 // Assume result safe (no over/under) 1053 fsub.s1 FR_Delta = FR_Input_Y,f1 // Delta = y - 1.0 1054(p11) br.cond.spnt POWL_64_SQRT // Branch if y=0.5 1055} 1056;; 1057 1058// 1059// Computes ln( x ) to extra precision 1060// Input FR 1: FR_X 1061// Output FR 2: FR_Y_hi 1062// Output FR 3: FR_Y_lo 1063// Output PR 1: PR_Safe 1064// 1065{ .mfi 1066 and GR_M = GR_exp_mask, GR_M // Mask to get exponent of W 1067 nop.f 999 1068 extr.u GR_Index3 = GR_X_2, 1, 5 // Get index3 1069} 1070;; 1071 1072{ .mmi 1073 shladd GR_Table_Ptr1 = GR_Index3,2,GR_Table_Ptr1 // Ptr to H_3 1074 shladd GR_Index3 = GR_Index3,4,GR_Table_Ptr // Ptr to G_3 1075 sub GR_M = GR_M, GR_exp_bias // Get true exponent of W 1076} 1077;; 1078 1079{ .mib 1080 ldfs FR_G_3 = [GR_Index3],-12 // Load G_3 1081 cmp.gt p7, p14 = -8, GR_M // Test if |x-1| < 2^-8 1082(p7) br.cond.spnt LOGL80_NEAR // Branch if |x-1| < 2^-8 1083} 1084;; 1085 1086// Here if |x-1| >= 2^-8 1087{ .mmf 1088 ldfs FR_H_3 = [GR_Table_Ptr1] // Load H_3 1089 nop.m 999 1090 nop.f 999 1091} 1092;; 1093 1094{ .mfi 1095 ldfe FR_h_3 = [GR_Index3] // Load h_3 1096 fmerge.se FR_S = f1,FR_Z // S = merge of 1.0 and signif(Z) 1097 nop.i 999 1098} 1099{ .mfi 1100 add GR_Table_Ptr = 0x740, GR_table_base // Constants_log_80_Q 1101 fmpy.s1 FR_G = FR_G_1, FR_G_2 // G = G_1 * G_2 1102 nop.i 999 1103} 1104;; 1105 1106// 1107// Begin Loading Q's - load log2_hi part 1108// 1109{ .mfi 1110 ldfe FR_log2_hi = [GR_Table_Ptr],16 // Load log2_hi 1111 fadd.s1 FR_H = FR_H_1, FR_H_2 // H = H_1 + H_2 1112 nop.i 999 1113};; 1114 1115// 1116// h = h_1 + h_2 1117// 1118{ .mfi 1119 ldfe FR_log2_lo = [GR_Table_Ptr],16 // Load log2_lo 1120 fadd.s1 FR_h = FR_h_1, FR_h_2 // h = h_1 + h_2 1121 nop.i 999 1122} 1123;; 1124 1125{ .mfi 1126 ldfe FR_Q_6 = [GR_Table_Ptr],16 // Load Q_6 1127 fcvt.xf FR_float_N = FR_float_N 1128 nop.i 999 1129} 1130;; 1131 1132{ .mfi 1133 ldfe FR_Q_5 = [GR_Table_Ptr],16 // Load Q_5 1134 nop.f 999 1135 nop.i 999 1136} 1137;; 1138 1139// 1140// G = G_1 * G_2 * G_3 1141// 1142{ .mfi 1143 ldfe FR_Q_4 = [GR_Table_Ptr],16 // Load Q_4 1144 fmpy.s1 FR_G = FR_G, FR_G_3 1145 nop.i 999 1146} 1147;; 1148 1149// 1150// H = H_1 + H_2 + H_3 1151// 1152{ .mfi 1153 ldfe FR_Q_3 = [GR_Table_Ptr],16 // Load Q_3 1154 fadd.s1 FR_H = FR_H, FR_H_3 1155 nop.i 999 1156} 1157;; 1158 1159// 1160// Y_lo = poly + Y_lo 1161// 1162// h = h_1 + h_2 + h_3 1163// 1164{ .mfi 1165 ldfe FR_Q_2 = [GR_Table_Ptr],16 // Load Q_2 1166 fadd.s1 FR_h = FR_h, FR_h_3 1167 nop.i 999 1168} 1169;; 1170 1171// 1172// GS_hi = G*S 1173// r = G*S -1 1174// 1175{ .mfi 1176 ldfe FR_Q_1 = [GR_Table_Ptr],16 // Load Q_1 1177 fmpy.s1 FR_GS_hi = FR_G, FR_S 1178 nop.i 999 1179} 1180{ .mfi 1181 nop.m 999 1182 fms.s1 FR_r = FR_G, FR_S, f1 1183 nop.i 999 1184} 1185;; 1186 1187// 1188// poly_lo = Q_5 + r * Q_6 1189// 1190{ .mfi 1191 getf.exp GR_Delta_Exp = FR_Delta // Get signexp of y-1 for exp calc 1192 fma.s1 FR_poly_lo = FR_r, FR_Q_6, FR_Q_5 1193 nop.i 999 1194} 1195// 1196// r_cor = GS_hi -1 1197// 1198{ .mfi 1199 nop.m 999 1200 fsub.s1 FR_r_cor = FR_GS_hi, f1 1201 nop.i 999 1202} 1203;; 1204 1205// 1206// GS_lo = G*S - GS_hi 1207// 1208{ .mfi 1209 nop.m 999 1210 fms.s1 FR_GS_lo = FR_G, FR_S, FR_GS_hi 1211 nop.i 999 1212} 1213;; 1214 1215// 1216// rsq = r * r 1217// 1218{ .mfi 1219 nop.m 999 1220 fmpy.s1 FR_rsq = FR_r, FR_r 1221 nop.i 999 1222} 1223// 1224// G = float_N*log2_hi + H 1225// 1226{ .mfi 1227 nop.m 999 1228 fma.s1 FR_G = FR_float_N, FR_log2_hi, FR_H 1229 nop.i 999 1230} 1231;; 1232 1233// 1234// Y_lo = float_N*log2_lo + h 1235// 1236{ .mfi 1237 nop.m 999 1238 fma.s1 FR_Y_lo = FR_float_N, FR_log2_lo, FR_h 1239 nop.i 999 1240} 1241;; 1242 1243// 1244// poly_lo = Q_4 + r * poly_lo 1245// r_cor = r_cor - r 1246// 1247{ .mfi 1248 nop.m 999 1249 fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_4 1250 nop.i 999 1251} 1252{ .mfi 1253 nop.m 999 1254 fsub.s1 FR_r_cor = FR_r_cor, FR_r 1255 nop.i 999 1256} 1257;; 1258 1259// 1260// poly_hi = r * Q_2 + Q_1 1261// Y_hi = G + r 1262// 1263{ .mfi 1264 nop.m 999 1265 fma.s1 FR_poly = FR_r, FR_Q_2, FR_Q_1 1266 nop.i 999 1267} 1268{ .mfi 1269 nop.m 999 1270 fadd.s1 FR_Y_hi = FR_G, FR_r 1271 nop.i 999 1272} 1273;; 1274 1275// 1276// poly_lo = Q_3 + r * poly_lo 1277// r_cor = r_cor + GS_lo 1278// 1279{ .mfi 1280 nop.m 999 1281 fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_3 1282 nop.i 999 1283} 1284{ .mfi 1285 nop.m 999 1286 fadd.s1 FR_r_cor = FR_r_cor, FR_GS_lo 1287 nop.i 999 1288} 1289;; 1290 1291// 1292// Y_lo = G - Y_hi 1293// 1294{ .mfi 1295 nop.m 999 1296 fsub.s1 FR_Y_lo_2 = FR_G, FR_Y_hi 1297 nop.i 999 1298} 1299;; 1300 1301// 1302// r_cor = r_cor + Y_lo 1303// poly = poly_hi + rsq * poly_lo 1304// 1305{ .mfi 1306 add GR_Table_Ptr = 0x0, GR_table_base // Constants_exp_64_Arg 1307 fadd.s1 FR_r_cor = FR_r_cor, FR_Y_lo 1308 nop.i 999 1309} 1310{ .mfi 1311 nop.m 999 1312 fma.s1 FR_poly = FR_rsq, FR_poly_lo, FR_poly 1313 nop.i 999 1314} 1315;; 1316 1317// 1318// Load L_hi 1319// Load L_lo 1320// all long before they are needed. 1321// They are used in LOGL_RETURN PATH 1322// 1323// Y_lo = Y_lo + r 1324// poly = rsq * poly + r_cor 1325// 1326{ .mfi 1327 ldfe FR_L_hi = [GR_Table_Ptr],16 // Load L_hi 1328 fadd.s1 FR_Y_lo = FR_Y_lo_2, FR_r 1329 nop.i 999 1330} 1331{ .mfi 1332 nop.m 999 1333 fma.s1 FR_poly = FR_rsq, FR_poly, FR_r_cor 1334 nop.i 999 1335} 1336;; 1337 1338{ .mfb 1339 ldfe FR_L_lo = [GR_Table_Ptr],16 // Load L_lo 1340 fadd.s1 FR_Y_lo = FR_Y_lo, FR_poly 1341 br.cond.sptk LOGL_RETURN // Branch to common code 1342} 1343;; 1344 1345 1346LOGL80_NEAR: 1347// Here if |x-1| < 2^-8 1348// 1349// Branch LOGL80_NEAR 1350// 1351 1352{ .mmf 1353 ldfe FR_P_8 = [GR_P_ptr1],16 // Load P_8 1354 ldfe FR_P_4 = [GR_P_ptr2],16 // Load P_4 1355 fmpy.s1 FR_Wsq = FR_W, FR_W 1356} 1357;; 1358 1359{ .mmi 1360 ldfe FR_P_7 = [GR_P_ptr1],16 // Load P_7 1361 ldfe FR_P_3 = [GR_P_ptr2],16 // Load P_3 1362 nop.i 999 1363} 1364;; 1365 1366{ .mmi 1367 ldfe FR_P_6 = [GR_P_ptr1],16 // Load P_6 1368 ldfe FR_P_2 = [GR_P_ptr2],16 // Load P_2 1369 nop.i 999 1370} 1371;; 1372 1373{ .mmi 1374 ldfe FR_P_5 = [GR_P_ptr1],16 // Load P_5 1375 ldfe FR_P_1 = [GR_P_ptr2],16 // Load P_1 1376 nop.i 999 1377} 1378;; 1379 1380{ .mfi 1381 getf.exp GR_Delta_Exp = FR_Delta // Get signexp of y-1 for exp calc 1382 fmpy.s1 FR_W4 = FR_Wsq, FR_Wsq 1383 nop.i 999 1384} 1385{ .mfi 1386 add GR_Table_Ptr = 0x0, GR_table_base // Constants_exp_64_Arg 1387 fmpy.s1 FR_W3 = FR_Wsq, FR_W 1388 nop.i 999 1389} 1390;; 1391 1392{ .mfi 1393 nop.m 999 1394 fmpy.s1 FR_half_W = FR_Half, FR_W 1395 nop.i 999 1396} 1397;; 1398 1399{ .mfi 1400 ldfe FR_L_hi = [GR_Table_Ptr],16 1401 fma.s1 FR_poly_lo = FR_W, FR_P_8,FR_P_7 1402 nop.i 999 1403} 1404{ .mfi 1405 nop.m 999 1406 fma.s1 FR_poly = FR_W, FR_P_4, FR_P_3 1407 nop.i 999 1408} 1409;; 1410 1411{ .mfi 1412 ldfe FR_L_lo = [GR_Table_Ptr],16 1413 fnma.s1 FR_Y_hi = FR_W, FR_half_W, FR_W 1414 nop.i 999 1415} 1416;; 1417 1418{ .mfi 1419 nop.m 999 1420 fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_6 1421 nop.i 999 1422} 1423{ .mfi 1424 nop.m 999 1425 fma.s1 FR_poly = FR_W, FR_poly, FR_P_2 1426 nop.i 999 1427} 1428;; 1429 1430{ .mfi 1431 nop.m 999 1432 fsub.s1 FR_Y_lo = FR_W, FR_Y_hi 1433 nop.i 999 1434} 1435;; 1436 1437{ .mfi 1438 nop.m 999 1439 fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_5 1440 nop.i 999 1441} 1442{ .mfi 1443 nop.m 999 1444 fma.s1 FR_poly = FR_W, FR_poly, FR_P_1 1445 nop.i 999 1446} 1447;; 1448 1449{ .mfi 1450 nop.m 999 1451 fnma.s1 FR_Y_lo = FR_W, FR_half_W, FR_Y_lo 1452 nop.i 999 1453} 1454;; 1455 1456{ .mfi 1457 nop.m 999 1458 fma.s1 FR_poly = FR_poly_lo, FR_W4, FR_poly 1459 nop.i 999 1460} 1461;; 1462 1463{ .mfi 1464 nop.m 999 1465 fma.s1 FR_Y_lo = FR_poly, FR_W3, FR_Y_lo 1466 nop.i 999 1467} 1468;; 1469 1470 1471LOGL_RETURN: 1472// Common code for completion of both logx paths 1473 1474// 1475// L_hi, L_lo already loaded. 1476// 1477// 1478// kernel_log_80 computed ln(X) 1479// and return logX_hi and logX_lo as results. 1480// PR_pow_Safe set as well. 1481// 1482// 1483// Compute Y * (logX_hi + logX_lo) 1484// P_hi -> X 1485// P_lo -> X_cor 1486// (Manipulate names so that inputs are in 1487// the place kernel_exp expects them) 1488// 1489// This function computes exp( x + x_cor) 1490// Input FR 1: FR_X 1491// Input FR 2: FR_X_cor 1492// Output FR 3: FR_Y_hi 1493// Output FR 4: FR_Y_lo 1494// Output FR 5: FR_Scale 1495// Output PR 1: PR_Safe 1496// 1497// P15 is True 1498// 1499// Load constants used in computing N using right-shift technique 1500{ .mlx 1501 mov GR_exp_2tom51 = 0xffff-51 1502 movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 1503} 1504{ .mlx 1505 add GR_Special_Exp = -50,GR_exp_bias 1506 movl GR_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51) 1507} 1508;; 1509 1510// 1511// Point to Table of W1s 1512// Point to Table of W2s 1513// 1514{ .mmi 1515 add GR_W1_ptr = 0x2b0, GR_table_base // Constants_exp_64_W1 1516 add GR_W2_ptr = 0x4b0, GR_table_base // Constants_exp_64_W2 1517 cmp.le p6,p0= GR_Delta_Exp,GR_Special_Exp 1518};; 1519 1520// Form two constants we need 1521// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128 1522// 1.1000..000 * 2^(63+63-12) to right shift int(N) into the significand 1523 1524{ .mfi 1525 setf.sig FR_INV_LN2_2TO63 = GR_sig_inv_ln2 // form 1/ln2 * 2^63 1526 nop.f 999 1527 and GR_Delta_Exp=GR_Delta_Exp,GR_exp_mask // Get exponent of y-1 1528} 1529{ .mlx 1530 setf.d FR_RSHF_2TO51 = GR_rshf_2to51 // Form const 1.1000 * 2^(63+51) 1531 movl GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift 1532} 1533;; 1534 1535{ .mfi 1536 nop.m 999 1537 fmpy.s1 FR_X_lo = FR_Input_Y, FR_logx_lo // logx_lo is Y_lo 1538 cmp.eq p15, p0= r0, r0 // Set p15, assume safe 1539};; 1540 1541{ .mmi 1542 setf.exp FR_2TOM51 = GR_exp_2tom51 // Form 2^-51 for scaling float_N 1543 setf.d FR_RSHF = GR_rshf // Form right shift const 1.1000 * 2^63 1544 add GR_Table_Ptr1 = 0x50, GR_table_base // Constants_exp_64_P for 1545 // EXPL_SMALL path 1546} 1547;; 1548 1549{ .mmi 1550 ldfe FR_P_6 = [GR_Table_Ptr1],16 // Load P_6 for EXPL_SMALL path 1551;; 1552 ldfe FR_P_5 = [GR_Table_Ptr1],16 // Load P_5 for EXPL_SMALL path 1553 nop.i 999 1554} 1555;; 1556 1557{ .mfi 1558 ldfe FR_P_4 = [GR_Table_Ptr1],16 // Load P_4 for EXPL_SMALL path 1559 fma.s1 FR_P_hi = FR_Input_Y, FR_logx_hi,FR_X_lo // logx_hi ix Y_hi 1560 nop.i 999 1561} 1562;; 1563 1564{ .mmi 1565 ldfe FR_P_3 = [GR_Table_Ptr1],16 // Load P_3 for EXPL_SMALL path 1566;; 1567 ldfe FR_P_2 = [GR_Table_Ptr1],16 // Load P_2 for EXPL_SMALL path 1568 nop.i 999 1569} 1570;; 1571 1572// N = X * Inv_log2_by_2^12 1573// By adding 1.10...0*2^63 we shift and get round_int(N_signif) in significand. 1574// We actually add 1.10...0*2^51 to X * Inv_log2 to do the same thing. 1575{ .mfi 1576 ldfe FR_P_1 = [GR_Table_Ptr1] // Load P_1 for EXPL_SMALL path 1577 fma.s1 FR_N = FR_X, FR_INV_LN2_2TO63, FR_RSHF_2TO51 1578 nop.i 999 1579} 1580{ .mfb 1581 nop.m 999 1582 fms.s1 FR_P_lo= FR_Input_Y, FR_logx_hi, FR_P_hi // P_hi is X 1583(p6) br.cond.spnt POWL_Y_ALMOST_1 // Branch if |y-1| < 2^-50 1584} 1585;; 1586 1587{ .mmi 1588 getf.exp GR_Expo_X = FR_X 1589 add GR_T1_ptr = 0x0b0, GR_table_base // Constants_exp_64_T1 1590 add GR_T2_ptr = 0x1b0, GR_table_base // Constants_exp_64_T2 1591} 1592;; 1593 1594// float_N = round_int(N) 1595// The signficand of N contains the rounded integer part of X * 2^12/ln2, 1596// as a twos complement number in the lower bits (that is, it may be negative). 1597// That twos complement number (called N) is put into GR_N_fix. 1598 1599// Since N is scaled by 2^51, it must be multiplied by 2^-51 1600// before the shift constant 1.10000 * 2^63 is subtracted to yield float_N. 1601// Thus, float_N contains the floating point version of N 1602 1603 1604{ .mfi 1605 add GR_Table_Ptr = 0x20, GR_table_base // Constants_exp_64_A 1606 fms.s1 FR_float_N = FR_N, FR_2TOM51, FR_RSHF // Form float_N 1607 nop.i 999 1608} 1609// Create low part of Y(ln(x)_hi + ln(x)_lo) as P_lo 1610{ .mfi 1611 mov GR_Big_Pos_Exp = 0x3ffe // 16382, largest safe exponent 1612 fadd.s1 FR_P_lo = FR_P_lo, FR_X_lo 1613 mov GR_Big_Neg_Exp = -0x3ffd // -16381 smallest safe exponent 1614};; 1615 1616{ .mfi 1617 nop.m 999 1618 fmpy.s1 FR_rsq = FR_X, FR_X // rsq = X*X for EXPL_SMALL path 1619 mov GR_vsm_expo = -70 // Exponent for very small path 1620} 1621{ .mfi 1622 nop.m 999 1623 fma.s1 FR_poly_lo = FR_P_6, FR_X, FR_P_5 // poly_lo for EXPL_SMALL path 1624 add GR_temp = 0x1,r0 // For tiny signif if small path 1625} 1626;; 1627 1628// 1629// If expo_X < -6 goto exp_small 1630// 1631{ .mmi 1632 getf.sig GR_N_fix = FR_N 1633 ldfe FR_A_3 = [GR_Table_Ptr],16 // Load A_3 1634 and GR_Expo_X = GR_Expo_X, GR_exp_mask // Get exponent of X 1635} 1636;; 1637 1638{ .mfi 1639 ldfe FR_A_2 = [GR_Table_Ptr],16 // Load A_2 1640 nop.f 999 1641 sub GR_Expo_X = GR_Expo_X, GR_exp_bias // Get true exponent of X 1642} 1643;; 1644 1645// 1646// If -6 > Expo_X, set P9 and branch 1647// 1648{ .mfb 1649 cmp.gt p9, p0 = -6, GR_Expo_X 1650 fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_X // r = X - L_hi * float_N 1651(p9) br.cond.spnt EXPL_SMALL // Branch if |X| < 2^-6 1652} 1653;; 1654 1655// 1656// If 14 <= Expo_X, set P10 1657// 1658{ .mib 1659 cmp.le p10, p0 = 14, GR_Expo_X 1660 nop.i 999 1661(p10) br.cond.spnt EXPL_HUGE // Branch if |X| >= 2^14 1662} 1663;; 1664 1665// 1666// Load single T1 1667// Load single T2 1668// W_1_p1 = W_1 + 1 1669// 1670{ .mmi 1671 nop.m 999 1672 nop.m 999 1673 extr.u GR_M1 = GR_N_fix, 6, 6 // Extract index M_1 1674} 1675;; 1676 1677// 1678// k = extr.u(N_fix,0,6) 1679// 1680{ .mmi 1681 shladd GR_W1_ptr = GR_M1,3,GR_W1_ptr // Point to W1 1682 shladd GR_T1_ptr = GR_M1,2,GR_T1_ptr // Point to T1 1683 extr.u GR_M2 = GR_N_fix, 0, 6 // Extract index M_2 1684} 1685;; 1686 1687// N_fix is only correct up to 50 bits because of our right shift technique. 1688// Actually in the normal path we will have restricted K to about 14 bits. 1689// Somewhat arbitrarily we extract 32 bits. 1690{ .mmi 1691 ldfd FR_W1 = [GR_W1_ptr] 1692 shladd GR_W2_ptr = GR_M2,3,GR_W2_ptr // Point to W2 1693 extr GR_k = GR_N_fix, 12, 32 // Extract k 1694} 1695;; 1696 1697{ .mfi 1698 ldfs FR_T1 = [GR_T1_ptr] 1699 fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r 1700 shladd GR_T2_ptr = GR_M2,2,GR_T2_ptr // Point to T2 1701} 1702{ .mfi 1703 add GR_exp_bias_p_k = GR_exp_bias, GR_k 1704 nop.f 999 1705 cmp.gt p14,p15 = GR_k,GR_Big_Pos_Exp 1706} 1707;; 1708 1709// 1710// if k < big_neg_exp, set p14 and Safe=False 1711// 1712{ .mmi 1713 ldfs FR_T2 = [GR_T2_ptr] 1714(p15) cmp.lt p14,p15 = GR_k,GR_Big_Neg_Exp 1715 nop.i 999 1716} 1717;; 1718 1719{ .mmi 1720 setf.exp FR_Scale = GR_exp_bias_p_k 1721 ldfd FR_W2 = [GR_W2_ptr] 1722 nop.i 999 1723} 1724;; 1725 1726{ .mfi 1727 ldfe FR_A_1 = [GR_Table_Ptr],16 1728 fadd.s1 FR_r = FR_r, FR_X_cor 1729 nop.i 999 1730} 1731;; 1732 1733{ .mfi 1734 nop.m 999 1735 fadd.s1 FR_W_1_p1 = FR_W1, f1 1736 nop.i 999 1737} 1738;; 1739 1740{ .mfi 1741 nop.m 999 1742 fma.s1 FR_poly = FR_r, FR_A_3, FR_A_2 1743 nop.i 999 1744} 1745{ .mfi 1746 nop.m 999 1747 fmpy.s1 FR_rsq = FR_r, FR_r 1748 nop.i 999 1749} 1750;; 1751 1752{ .mfi 1753 nop.m 999 1754 fmpy.s1 FR_T = FR_T1, FR_T2 1755 nop.i 999 1756} 1757;; 1758 1759{ .mfi 1760 nop.m 999 1761 fma.s1 FR_W = FR_W2, FR_W_1_p1, FR_W1 1762 nop.i 999 1763} 1764;; 1765 1766{ .mfi 1767 nop.m 999 1768 fma.s1 FR_TMP1 = FR_Scale, FR_Sgn, f0 1769 nop.i 999 1770} 1771;; 1772 1773{ .mfi 1774 nop.m 999 1775 fma.s1 FR_poly = FR_r, FR_poly, FR_A_1 1776 nop.i 999 1777} 1778;; 1779 1780{ .mfi 1781 nop.m 999 1782 fma.s1 FR_TMP2 = FR_T, f1, f0 // TMP2 = Y_hi = T 1783 nop.i 999 1784} 1785;; 1786 1787{ .mfi 1788 nop.m 999 1789 fadd.s1 FR_Wp1 = FR_W, f1 1790 nop.i 999 1791} 1792;; 1793 1794{ .mfi 1795 nop.m 999 1796 fma.s1 FR_poly = FR_rsq, FR_poly,FR_r 1797 nop.i 999 1798} 1799;; 1800 1801{ .mfi 1802 nop.m 999 1803 fma.s1 FR_Tscale = FR_T, FR_TMP1, f0 // Scale * Sgn * T 1804 nop.i 999 1805} 1806{ .mfi 1807 nop.m 999 1808 fma.s1 FR_Y_lo = FR_Wp1, FR_poly, FR_W 1809 nop.i 999 1810} 1811;; 1812 1813{ .mfb 1814 nop.m 999 1815 fmpy.s1 FR_TMP3 = FR_Y_lo, FR_Tscale 1816 br.cond.sptk POWL_64_SHARED 1817} 1818;; 1819 1820 1821EXPL_SMALL: 1822// Here if |ylogx| < 2^-6 1823// 1824// Begin creating lsb to perturb final result 1825// 1826{ .mfi 1827 setf.sig FR_temp = GR_temp 1828 fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_4 1829 cmp.lt p12, p0 = GR_Expo_X, GR_vsm_expo // Test |ylogx| < 2^-70 1830} 1831{ .mfi 1832 nop.m 999 1833 fma.s1 FR_poly_hi = FR_P_2, FR_X, FR_P_1 1834 nop.i 999 1835} 1836;; 1837 1838{ .mfi 1839 nop.m 999 1840 fmpy.s1 FR_TMP2 = f1, f1 1841 nop.i 999 1842} 1843{ .mfi 1844 nop.m 999 1845 fmpy.s1 FR_TMP1 = FR_Sgn, f1 1846 nop.i 999 1847} 1848;; 1849 1850{ .mfi 1851 nop.m 999 1852 fmpy.s1 FR_r4 = FR_rsq, FR_rsq 1853(p12) cmp.eq p15, p0 = r0, r0 // Set safe if |ylogx| < 2^-70 1854} 1855{ .mfb 1856 nop.m 999 1857(p12) fmpy.s1 FR_TMP3 = FR_Sgn, FR_X 1858(p12) br.cond.spnt POWL_64_SHARED // Branch if |ylogx| < 2^-70 1859} 1860;; 1861 1862{ .mfi 1863 nop.m 999 1864 fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_3 1865 nop.i 999 1866} 1867{ .mfi 1868 nop.m 999 1869 fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, FR_X 1870 nop.i 999 1871} 1872;; 1873 1874{ .mfi 1875 nop.m 999 1876 fma.s1 FR_Y_lo = FR_poly_lo, FR_r4, FR_poly_hi 1877 nop.i 999 1878} 1879;; 1880 1881{ .mfi 1882 nop.m 999 1883 fmpy.s1 FR_TMP3 = FR_Y_lo, FR_TMP1 // Add sign info 1884 nop.i 999 1885} 1886;; 1887 1888// 1889// Toggle on last bit of Y_lo 1890// Set lsb of Y_lo to 1 1891// 1892{ .mfi 1893 nop.m 999 1894 for FR_temp = FR_Y_lo,FR_temp 1895 nop.i 999 1896} 1897;; 1898 1899{ .mfb 1900 nop.m 999 1901 fmerge.se FR_TMP3 = FR_TMP3,FR_temp 1902 br.cond.sptk POWL_64_SHARED 1903} 1904;; 1905 1906 1907EXPL_HUGE: 1908// Here if |ylogx| >= 2^14 1909{ .mfi 1910 mov GR_temp = 0x0A1DC // If X < 0, exponent -24100 1911 fcmp.gt.s1 p12, p13 = FR_X, f0 // Test X > 0 1912 cmp.eq p14, p15 = r0, r0 // Set Safe to false 1913} 1914;; 1915 1916{ .mmi 1917(p12) mov GR_Mask = 0x15DC0 // If X > 0, exponent +24000 1918(p13) mov GR_Mask = 0x0A240 // If X < 0, exponent -24000 1919 nop.i 999 1920} 1921;; 1922 1923{ .mmf 1924 setf.exp FR_TMP2 = GR_Mask // Form Y_hi = TMP2 1925(p13) setf.exp FR_Y_lo = GR_temp // If X < 0, Y_lo = 2^-24100 1926(p12) mov FR_Y_lo = f1 // IF X > 0, Y_lo = 1.0 1927} 1928;; 1929 1930{ .mfi 1931 nop.m 999 1932 fmpy.s1 FR_TMP1 = FR_TMP2, FR_Sgn // TMP1 = Y_hi * Sgn 1933 nop.i 999 1934} 1935;; 1936 1937{ .mfb 1938 nop.m 999 1939 fmpy.s1 FR_TMP3 = FR_Y_lo,FR_TMP1 // TMP3 = Y_lo * (Y_hi * Sgn) 1940 br.cond.sptk POWL_64_SHARED 1941} 1942;; 1943 1944POWL_Y_ALMOST_1: 1945// Here if delta = |y-1| < 2^-50 1946// 1947// x**(1 + delta) = x * e (ln(x)*delta) = x ( 1 + ln(x) * delta) 1948// 1949// Computation will be safe for 2^-16381 <= x < 2^16383 1950 1951{ .mfi 1952 mov GR_exp_ynear1_oflow = 0xffff + 16383 1953 fma.s1 FR_TMP1 = FR_Input_X,FR_Delta,f0 1954 and GR_exp_x = GR_exp_mask, GR_signexp_x 1955} 1956;; 1957 1958{ .mfi 1959 cmp.lt p15, p14 = GR_exp_x, GR_exp_ynear1_oflow 1960 fma.s1 FR_TMP2 = FR_logx_hi,f1,FR_X_lo 1961 mov GR_exp_ynear1_uflow = 0xffff - 16381 1962} 1963;; 1964 1965{ .mfb 1966(p15) cmp.ge p15, p14 = GR_exp_x, GR_exp_ynear1_uflow 1967 fma.s1 FR_TMP3 = FR_Input_X,f1,f0 1968 br.cond.sptk POWL_64_SHARED 1969};; 1970 1971POWL_64_SQUARE: 1972// 1973// Here if x not zero and y=2. 1974// 1975// Setup for multipath code 1976// 1977{ .mfi 1978 mov GR_exp_square_oflow = 0xffff + 8192 // Exponent where x*x overflows 1979 fmerge.se FR_TMP1 = FR_Input_X, FR_Input_X 1980 and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x 1981} 1982;; 1983 1984{ .mfi 1985 cmp.lt p15, p14 = GR_exp_x, GR_exp_square_oflow // Decide safe/unsafe 1986 fmerge.se FR_TMP2 = FR_Input_X, FR_Input_X 1987 mov GR_exp_square_uflow = 0xffff - 8191 // Exponent where x*x underflows 1988} 1989;; 1990 1991{ .mfi 1992(p15) cmp.ge p15, p14 = GR_exp_x, GR_exp_square_uflow // Decide safe/unsafe 1993 fma.s1 FR_TMP3 = f0,f0,f0 1994 nop.i 999 1995} 1996;; 1997 1998// 1999// This is the shared path that will set overflow and underflow. 2000// 2001POWL_64_SHARED: 2002 2003// 2004// Return if no danger of over or underflow. 2005// 2006{ .mfb 2007 nop.m 999 2008 fma.s0 FR_Result = FR_TMP1, FR_TMP2, FR_TMP3 2009(p15) br.ret.sptk b0 // Main path return if certain no over/underflow 2010} 2011;; 2012 2013// 2014// S0 user supplied status 2015// S2 user supplied status + WRE + TD (Overflows) 2016// S2 user supplied status + FZ + TD (Underflows) 2017// 2018// 2019// If (Safe) is true, then 2020// Compute result using user supplied status field. 2021// No overflow or underflow here, but perhaps inexact. 2022// Return 2023// Else 2024// Determine if overflow or underflow was raised. 2025// Fetch +/- overflow threshold for IEEE double extended 2026 2027{ .mfi 2028 nop.m 999 2029 fsetc.s2 0x7F,0x41 // For underflow test, set S2=User+TD+FTZ 2030 nop.i 999 2031} 2032;; 2033 2034{ .mfi 2035 nop.m 999 2036 fma.s2 FR_Result_small = FR_TMP1, FR_TMP2, FR_TMP3 2037 nop.i 999 2038} 2039;; 2040 2041{ .mfi 2042 nop.m 999 2043 fsetc.s2 0x7F,0x42 // For overflow test, set S2=User+TD+WRE 2044 nop.i 999 2045} 2046;; 2047 2048{ .mfi 2049 nop.m 999 2050 fma.s2 FR_Result_big = FR_TMP1, FR_TMP2,FR_TMP3 2051 nop.i 999 2052} 2053;; 2054 2055{ .mfi 2056 nop.m 999 2057 fsetc.s2 0x7F,0x40 // Reset S2=User 2058 nop.i 999 2059} 2060;; 2061 2062{ .mfi 2063 nop.m 999 2064 fclass.m p11, p0 = FR_Result_small, 0x00F // Test small result unorm/zero 2065 nop.i 999 2066} 2067;; 2068 2069{ .mfi 2070 nop.m 999 2071 fcmp.ge.s1 p8, p0 = FR_Result_big , FR_Big // Test >= + oflow threshold 2072 nop.i 999 2073} 2074;; 2075 2076{ .mfb 2077(p11) mov GR_Parameter_TAG = 19 // Set tag for underflow 2078 fcmp.le.s1 p9, p0 = FR_Result_big, FR_NBig // Test <= - oflow threshold 2079(p11) br.cond.spnt __libm_error_region // Branch if pow underflowed 2080} 2081;; 2082 2083{ .mfb 2084(p8) mov GR_Parameter_TAG = 18 // Set tag for overflow 2085 nop.f 999 2086(p8) br.cond.spnt __libm_error_region // Branch if pow +overflow 2087} 2088;; 2089 2090{ .mbb 2091(p9) mov GR_Parameter_TAG = 18 // Set tag for overflow 2092(p9) br.cond.spnt __libm_error_region // Branch if pow -overflow 2093 br.ret.sptk b0 // Branch if result really ok 2094} 2095;; 2096 2097 2098POWL_64_SPECIAL: 2099// Here if x or y is NatVal, nan, inf, or zero 2100{ .mfi 2101 nop.m 999 2102 fcmp.eq.s1 p15, p0 = FR_Input_X, f1 // Test x=+1 2103 nop.i 999 2104} 2105;; 2106 2107{ .mfi 2108 nop.m 999 2109 fclass.m p8, p0 = FR_Input_X, 0x143 // Test x natval, snan 2110 nop.i 999 2111} 2112;; 2113 2114{ .mfi 2115 nop.m 999 2116(p15) fcmp.eq.unc.s0 p6,p0 = FR_Input_Y, f0 // If x=1, flag invalid if y=SNaN 2117 nop.i 999 2118} 2119{ .mfb 2120 nop.m 999 2121(p15) fmpy.s0 FR_Result = f1,f1 // If x=1, result=1 2122(p15) br.ret.spnt b0 // Exit if x=1 2123} 2124;; 2125 2126{ .mfi 2127 nop.m 999 2128 fclass.m p6, p0 = FR_Input_Y, 0x007 // Test y zero 2129 nop.i 999 2130} 2131;; 2132 2133{ .mfi 2134 nop.m 999 2135 fclass.m p9, p0 = FR_Input_Y, 0x143 // Test y natval, snan 2136 nop.i 999 2137} 2138;; 2139 2140{ .mfi 2141 nop.m 999 2142 fclass.m p10, p0 = FR_Input_X, 0x083 // Test x qnan 2143 nop.i 999 2144} 2145{ .mfi 2146 nop.m 999 2147(p8) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If x=snan, result=qnan 2148(p6) cmp.ne p8,p0 = r0,r0 // Don't exit if x=snan, y=0 ==> result=+1 2149} 2150;; 2151 2152{ .mfi 2153 nop.m 999 2154(p6) fclass.m.unc p15, p0 = FR_Input_X,0x007 // Test x=0, y=0 2155 nop.i 999 2156} 2157{ .mfb 2158 nop.m 999 2159(p9) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If y=snan, result=qnan 2160(p8) br.ret.spnt b0 // Exit if x=snan, y not 0, 2161 // result=qnan 2162} 2163;; 2164 2165{ .mfi 2166 nop.m 999 2167 fcmp.eq.s1 p7, p0 = FR_Input_Y, f1 // Test y +1.0 2168 nop.i 999 2169} 2170{ .mfb 2171 nop.m 999 2172(p10) fmpy.s0 FR_Result = FR_Input_X, f0 // If x=qnan, result=qnan 2173(p9) br.ret.spnt b0 // Exit if y=snan, result=qnan 2174} 2175;; 2176 2177{ .mfi 2178 nop.m 999 2179(p6) fclass.m.unc p8, p0 = FR_Input_X,0x0C3 // Test x=nan, y=0 2180 nop.i 999 2181} 2182;; 2183 2184{ .mfi 2185 nop.m 999 2186(p6) fcmp.eq.s0 p9,p0 = FR_Input_X, f0 // If y=0, flag if x denormal 2187 nop.i 999 2188} 2189{ .mfi 2190 nop.m 999 2191(p6) fadd.s0 FR_Result = f1, f0 // If y=0, result=1 2192 nop.i 999 2193} 2194;; 2195 2196{ .mfi 2197 nop.m 999 2198 fclass.m p11, p0 = FR_Input_Y, 0x083 // Test y qnan 2199 nop.i 999 2200} 2201{ .mfb 2202(p15) mov GR_Parameter_TAG = 20 // Error tag for x=0, y=0 2203(p7) fmpy.s0 FR_Result = FR_Input_X,f1 // If y=1, result=x 2204(p15) br.cond.spnt __libm_error_region // Branch if x=0, y=0, result=1 2205} 2206;; 2207 2208{ .mfb 2209(p8) mov GR_Parameter_TAG = 23 // Error tag for x=nan, y=0 2210 fclass.m p14, p0 = FR_Input_Y, 0x023 // Test y inf 2211(p8) br.cond.spnt __libm_error_region // Branch if x=snan, y=0, 2212 // result=1 2213} 2214;; 2215 2216{ .mfb 2217 nop.m 999 2218 fclass.m p13, p0 = FR_Input_X, 0x023 // Test x inf 2219(p6) br.ret.spnt b0 // Exit y=0, x not nan or 0, 2220 // result=1 2221} 2222;; 2223 2224{ .mfb 2225 nop.m 999 2226(p14) fcmp.eq.unc.s1 p0,p14 = FR_Input_X,f0 // Test x not 0, y=inf 2227(p7) br.ret.spnt b0 // Exit y=1, x not snan, 2228 // result=x 2229} 2230;; 2231 2232{ .mfb 2233 nop.m 999 2234(p10) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X // If x=qnan, y not snan, 2235 // result=qnan 2236(p10) br.ret.spnt b0 // Exit x=qnan, y not snan, 2237 // result=qnan 2238} 2239;; 2240 2241{ .mfb 2242 nop.m 999 2243(p11) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X // If y=qnan, x not nan or 1, 2244 // result=qnan 2245(p11) br.ret.spnt b0 // Exit y=qnan, x not nan or 1, 2246 // result=qnan 2247} 2248;; 2249 2250{ .mbb 2251 nop.m 999 2252(p14) br.cond.spnt POWL_64_Y_IS_INF // Branch if y=inf, x not 1 or nan 2253(p13) br.cond.spnt POWL_64_X_IS_INF // Branch if x=inf, y not 1 or nan 2254} 2255;; 2256 2257 2258POWL_64_X_IS_ZERO: 2259// Here if x=0, y not nan or 1 or inf or 0 2260 2261// There is logic starting here to determine if y is an integer when x = 0. 2262// If 0 < |y| < 1 then clearly y is not an integer. 2263// If |y| > 1, then the significand of y is shifted left by the size of 2264// the exponent of y. This preserves the lsb of the integer part + the 2265// fractional bits. The lsb of the integer can be tested to determine if 2266// the integer is even or odd. The fractional bits can be tested. If zero, 2267// then y is an integer. 2268// 2269{ .mfi 2270 and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent of y 2271 nop.f 999 2272 and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y 2273} 2274;; 2275 2276// 2277// Maybe y is < 1 already, so 2278// can never be an integer. 2279// 2280{ .mfi 2281 cmp.lt p9, p8 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1 2282 nop.f 999 2283 sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent of y 2284} 2285;; 2286 2287// 2288// Shift significand of y looking for nonzero bits 2289// For y > 1, shift signif_y exp_y bits to the left 2290// For y < 1, turn on 4 low order bits of significand of y 2291// so that the fraction will always be non-zero 2292// 2293{ .mmi 2294(p9) or GR_exp_y= 0xF,GR_signif_y // Force nonzero fraction if y<1 2295;; 2296 nop.m 999 2297(p8) shl GR_exp_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction 2298 // Wait 4 cycles to use result 2299} 2300;; 2301 2302{ .mmi 2303 nop.m 999 2304;; 2305 nop.m 999 2306 nop.i 999 2307} 2308;; 2309 2310{ .mmi 2311 nop.m 999 2312;; 2313 nop.m 999 2314 shl GR_fraction_y= GR_exp_y,1 // Shift left 1 to get fraction 2315} 2316;; 2317 2318// 2319// Integer part of y shifted off. 2320// Get y's low even or odd bit - y might not be an int. 2321// 2322{ .mii 2323 cmp.eq p13,p0 = GR_fraction_y, r0 // Test for y integer 2324 cmp.eq p8,p0 = GR_y_sign, r0 // Test for y > 0 2325;; 2326(p13) tbit.nz.unc p13,p0 = GR_exp_y, 63 // Test if y an odd integer 2327} 2328;; 2329 2330{ .mfi 2331(p13) cmp.eq.unc p13,p14 = GR_y_sign, r0 // Test y pos odd integer 2332(p8) fcmp.eq.s0 p12,p0 = FR_Input_Y, f0 // If x=0 and y>0 flag if y denormal 2333 nop.i 999 2334} 2335;; 2336 2337// 2338// Return +/-0 when x=+/-0 and y is positive odd integer 2339// 2340{ .mfb 2341 nop.m 999 2342(p13) mov FR_Result = FR_Input_X // If x=0, y pos odd int, result=x 2343(p13) br.ret.spnt b0 // Exit x=0, y pos odd int, result=x 2344} 2345;; 2346 2347// 2348// Return +/-inf when x=+/-0 and y is negative odd int 2349// 2350{ .mfb 2351(p14) mov GR_Parameter_TAG = 21 2352(p14) frcpa.s0 FR_Result, p0 = f1, FR_Input_X // Result +-inf, set Z flag 2353(p14) br.cond.spnt __libm_error_region 2354} 2355;; 2356 2357// 2358// Return +0 when x=+/-0 and y positive and not an odd integer 2359// 2360{ .mfb 2361 nop.m 999 2362(p8) mov FR_Result = f0 // If x=0, y>0 and not odd integer, result=+0 2363(p8) br.ret.sptk b0 // Exit x=0, y>0 and not odd integer, result=+0 2364} 2365;; 2366 2367// 2368// Return +inf when x=+/-0 and y is negative and not odd int 2369// 2370{ .mfb 2371 mov GR_Parameter_TAG = 21 2372 frcpa.s0 FR_Result, p10 = f1,f0 // Result +inf, raise Z flag 2373 br.cond.sptk __libm_error_region 2374} 2375;; 2376 2377 2378POWL_64_X_IS_INF: 2379// 2380// Here if x=inf, y not 1 or nan 2381// 2382{ .mfi 2383 and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent y 2384 fclass.m p13, p0 = FR_Input_X,0x022 // Test x=-inf 2385 nop.i 999 2386} 2387;; 2388 2389{ .mfi 2390 and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y 2391 fcmp.eq.s0 p9,p0 = FR_Input_Y, f0 // Dummy to set flag if y denorm 2392 nop.i 999 2393} 2394;; 2395 2396// 2397// Maybe y is < 1 already, so 2398// isn't an int. 2399// 2400{ .mfi 2401(p13) cmp.lt.unc p9, p8 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1 if x=-inf 2402 fclass.m p11, p0 = FR_Input_X,0x021 // Test x=+inf 2403 sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent y 2404} 2405;; 2406 2407// 2408// Shift significand of y looking for nonzero bits 2409// For y > 1, shift signif_y exp_y bits to the left 2410// For y < 1, turn on 4 low order bits of significand of y 2411// so that the fraction will always be non-zero 2412// 2413{ .mmi 2414(p9) or GR_exp_y= 0xF,GR_signif_y // Force nonzero fraction if y<1 2415;; 2416(p11) cmp.eq.unc p14,p12 = GR_y_sign, r0 // Test x=+inf, y>0 2417(p8) shl GR_exp_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction 2418 // Wait 4 cycles to use result 2419} 2420;; 2421 2422// 2423// Return +inf for x=+inf, y > 0 2424// Return +0 for x=+inf, y < 0 2425// 2426{ .mfi 2427 nop.m 999 2428(p12) mov FR_Result = f0 // If x=+inf, y<0, result=+0 2429 nop.i 999 2430} 2431{ .mfb 2432 nop.m 999 2433(p14) fma.s0 FR_Result = FR_Input_X,f1,f0 // If x=+inf, y>0, result=+inf 2434(p11) br.ret.sptk b0 // Exit x=+inf 2435} 2436;; 2437 2438// 2439// Here only if x=-inf. Wait until can use result of shl... 2440// 2441{ .mmi 2442 nop.m 999 2443;; 2444 nop.m 999 2445 nop.i 999 2446} 2447;; 2448 2449{ .mfi 2450 cmp.eq p8,p9 = GR_y_sign, r0 // Test y pos 2451 nop.f 999 2452 shl GR_fraction_y = GR_exp_y,1 // Shift left 1 to get fraction 2453} 2454;; 2455 2456{ .mmi 2457 cmp.eq p13,p0 = GR_fraction_y, r0 // Test y integer 2458;; 2459 nop.m 999 2460(p13) tbit.nz.unc p13,p0 = GR_exp_y, 63 // Test y odd integer 2461} 2462;; 2463 2464// 2465// Is y even or odd? 2466// 2467{ .mii 2468(p13) cmp.eq.unc p14,p10 = GR_y_sign, r0 // Test x=-inf, y pos odd int 2469(p13) cmp.ne.and p8,p9 = r0,r0 // If y odd int, turn off p8,p9 2470 nop.i 999 2471} 2472;; 2473 2474// 2475// Return -0 for x = -inf and y < 0 and odd int. 2476// Return -Inf for x = -inf and y > 0 and odd int. 2477// 2478{ .mfi 2479 nop.m 999 2480(p10) fmerge.ns FR_Result = f0, f0 // If x=-inf, y neg odd int, result=-0 2481 nop.i 999 2482} 2483{ .mfi 2484 nop.m 999 2485(p14) fmpy.s0 FR_Result = FR_Input_X,f1 // If x=-inf, y pos odd int, result=-inf 2486 nop.i 999 2487} 2488;; 2489 2490// 2491// Return Inf for x = -inf and y > 0 not an odd int. 2492// Return +0 for x = -inf and y < 0 not an odd int. 2493// 2494.pred.rel "mutex",p8,p9 2495{ .mfi 2496 nop.m 999 2497(p8) fmerge.ns FR_Result = FR_Input_X, FR_Input_X // If x=-inf, y>0 not odd int 2498 // result=+inf 2499 nop.i 999 2500} 2501{ .mfb 2502 nop.m 999 2503(p9) fmpy.s0 FR_Result = f0,f0 // If x=-inf, y<0 not odd int 2504 // result=+0 2505 br.ret.sptk b0 // Exit for x=-inf 2506} 2507;; 2508 2509 2510POWL_64_Y_IS_INF: 2511// Here if y=inf, x not 1 or nan 2512// 2513// For y = +Inf and |x| < 1 returns 0 2514// For y = +Inf and |x| > 1 returns Inf 2515// For y = -Inf and |x| < 1 returns Inf 2516// For y = -Inf and |x| > 1 returns 0 2517// For y = Inf and |x| = 1 returns 1 2518// 2519{ .mfi 2520 nop.m 999 2521 fclass.m p8, p0 = FR_Input_Y, 0x021 // Test y=+inf 2522 nop.i 999 2523} 2524;; 2525 2526{ .mfi 2527 nop.m 999 2528 fclass.m p9, p0 = FR_Input_Y, 0x022 // Test y=-inf 2529 nop.i 999 2530} 2531;; 2532 2533{ .mfi 2534 nop.m 999 2535 fabs FR_X = FR_Input_X // Form |x| 2536 nop.i 999 2537} 2538;; 2539 2540{ .mfi 2541 nop.m 999 2542 fcmp.eq.s0 p10,p0 = FR_Input_X, f0 // flag if x denormal 2543 nop.i 999 2544} 2545;; 2546 2547{ .mfi 2548 nop.m 999 2549(p8) fcmp.lt.unc.s1 p6, p0 = FR_X, f1 // Test y=+inf, |x|<1 2550 nop.i 999 2551} 2552;; 2553 2554{ .mfi 2555 nop.m 999 2556(p8) fcmp.gt.unc.s1 p7, p0 = FR_X, f1 // Test y=+inf, |x|>1 2557 nop.i 999 2558} 2559;; 2560 2561{ .mfi 2562 nop.m 999 2563(p9) fcmp.lt.unc.s1 p12, p0 = FR_X, f1 // Test y=-inf, |x|<1 2564 nop.i 999 2565} 2566{ .mfi 2567 nop.m 999 2568(p6) fmpy.s0 FR_Result = f0,f0 // If y=+inf, |x|<1, result=+0 2569 nop.i 999 2570} 2571;; 2572 2573{ .mfi 2574 nop.m 999 2575(p9) fcmp.gt.unc.s1 p13, p0 = FR_X, f1 // Test y=-inf, |x|>1 2576 nop.i 999 2577} 2578{ .mfi 2579 nop.m 999 2580(p7) fmpy.s0 FR_Result = FR_Input_Y, f1 // If y=+inf, |x|>1, result=+inf 2581 nop.i 999 2582} 2583;; 2584 2585{ .mfi 2586 nop.m 999 2587 fcmp.eq.s1 p14, p0 = FR_X, f1 // Test y=inf, |x|=1 2588 nop.i 999 2589} 2590{ .mfi 2591 nop.m 999 2592(p12) fnma.s0 FR_Result = FR_Input_Y, f1, f0 // If y=-inf, |x|<1, result=+inf 2593 nop.i 999 2594} 2595;; 2596 2597{ .mfi 2598 nop.m 999 2599(p13) mov FR_Result = f0 // If y=-inf, |x|>1, result=+0 2600 nop.i 999 2601} 2602;; 2603 2604{ .mfb 2605 nop.m 999 2606(p14) fmpy.s0 FR_Result = f1,f1 // If y=inf, |x|=1, result=+1 2607 br.ret.sptk b0 // Common return for y=inf 2608} 2609;; 2610 2611 2612// Here if x or y denorm/unorm 2613POWL_DENORM: 2614{ .mmi 2615 getf.sig GR_signif_Z = FR_norm_X // Get significand of x 2616;; 2617 getf.exp GR_signexp_y = FR_norm_Y // Get sign and exp of y 2618 nop.i 999 2619} 2620;; 2621 2622{ .mfi 2623 getf.sig GR_signif_y = FR_norm_Y // Get significand of y 2624 nop.f 999 2625 nop.i 999 2626} 2627;; 2628 2629{ .mib 2630 getf.exp GR_signexp_x = FR_norm_X // Get sign and exp of x 2631 extr.u GR_Index1 = GR_signif_Z, 59, 4 // Extract upper 4 signif bits of x 2632 br.cond.sptk POWL_COMMON // Branch back to main path 2633} 2634;; 2635 2636 2637POWL_64_UNSUPPORT: 2638// 2639// Raise exceptions for specific 2640// values - pseudo NaN and 2641// infinities. 2642// Return NaN and raise invalid 2643// 2644{ .mfb 2645 nop.m 999 2646 fmpy.s0 FR_Result = FR_Input_X,f0 2647 br.ret.sptk b0 2648} 2649;; 2650 2651POWL_64_XNEG: 2652// 2653// Raise invalid for x < 0 and 2654// y not an integer 2655// 2656{ .mfi 2657 nop.m 999 2658 frcpa.s0 FR_Result, p8 = f0, f0 2659 mov GR_Parameter_TAG = 22 2660} 2661{ .mib 2662 nop.m 999 2663 nop.i 999 2664 br.cond.sptk __libm_error_region 2665} 2666;; 2667 2668POWL_64_SQRT: 2669{ .mfi 2670 nop.m 999 2671 frsqrta.s0 FR_Result,p10 = FR_save_Input_X 2672 nop.i 999 ;; 2673} 2674{ .mfi 2675 nop.m 999 2676(p10) fma.s1 f62=FR_Half,FR_save_Input_X,f0 2677 nop.i 999 ;; 2678} 2679{ .mfi 2680 nop.m 999 2681(p10) fma.s1 f63=FR_Result,FR_Result,f0 2682 nop.i 999 ;; 2683} 2684{ .mfi 2685 nop.m 999 2686(p10) fnma.s1 f32=f63,f62,FR_Half 2687 nop.i 999 ;; 2688} 2689{ .mfi 2690 nop.m 999 2691(p10) fma.s1 f33=f32,FR_Result,FR_Result 2692 nop.i 999 ;; 2693} 2694{ .mfi 2695 nop.m 999 2696(p10) fma.s1 f34=f33,f62,f0 2697 nop.i 999 ;; 2698} 2699{ .mfi 2700 nop.m 999 2701(p10) fnma.s1 f35=f34,f33,FR_Half 2702 nop.i 999 ;; 2703} 2704{ .mfi 2705 nop.m 999 2706(p10) fma.s1 f63=f35,f33,f33 2707 nop.i 999 ;; 2708} 2709{ .mfi 2710 nop.m 999 2711(p10) fma.s1 f32=FR_save_Input_X,f63,f0 2712 nop.i 999 2713} 2714{ .mfi 2715 nop.m 999 2716(p10) fma.s1 FR_Result=f63,f62,f0 2717 nop.i 999 ;; 2718} 2719{ .mfi 2720 nop.m 999 2721(p10) fma.s1 f33=f11,f63,f0 2722 nop.i 999 ;; 2723} 2724{ .mfi 2725 nop.m 999 2726(p10) fnma.s1 f34=f32,f32,FR_save_Input_X 2727 nop.i 999 2728} 2729{ .mfi 2730 nop.m 999 2731(p10) fnma.s1 f35=FR_Result,f63,FR_Half 2732 nop.i 999 ;; 2733} 2734{ .mfi 2735 nop.m 999 2736(p10) fma.s1 f62=f33,f34,f32 2737 nop.i 999 2738} 2739{ .mfi 2740 nop.m 999 2741(p10) fma.s1 f63=f33,f35,f33 2742 nop.i 999 ;; 2743} 2744{ .mfi 2745 nop.m 999 2746(p10) fnma.s1 f32=f62,f62,FR_save_Input_X 2747 nop.i 999 ;; 2748} 2749{ .mfb 2750 nop.m 999 2751(p10) fma.s0 FR_Result=f32,f63,f62 2752 br.ret.sptk b0 // Exit for x > 0, y = 0.5 2753} 2754;; 2755 2756GLOBAL_LIBM_END(powl) 2757libm_alias_ldouble_other (pow, pow) 2758 2759 2760LOCAL_LIBM_ENTRY(__libm_error_region) 2761.prologue 2762{ .mfi 2763 add GR_Parameter_Y=-32,sp // Parameter 2 value 2764 nop.f 0 2765.save ar.pfs,GR_SAVE_PFS 2766 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 2767} 2768{ .mfi 2769.fframe 64 2770 add sp=-64,sp // Create new stack 2771 nop.f 0 2772 mov GR_SAVE_GP=gp // Save gp 2773};; 2774{ .mmi 2775 stfe [GR_Parameter_Y] = FR_Input_Y,16 // Save Parameter 2 on stack 2776 add GR_Parameter_X = 16,sp // Parameter 1 address 2777.save b0, GR_SAVE_B0 2778 mov GR_SAVE_B0=b0 // Save b0 2779};; 2780.body 2781{ .mib 2782 stfe [GR_Parameter_X] = FR_save_Input_X // Store Parameter 1 on stack 2783 add GR_Parameter_RESULT = 0,GR_Parameter_Y 2784 nop.b 0 // Parameter 3 address 2785} 2786{ .mib 2787 stfe [GR_Parameter_Y] = FR_Result // Store Parameter 3 on stack 2788 add GR_Parameter_Y = -16,GR_Parameter_Y 2789 br.call.sptk b0=__libm_error_support# // Call error handling function 2790};; 2791{ .mmi 2792 add GR_Parameter_RESULT = 48,sp 2793 nop.m 0 2794 nop.i 0 2795};; 2796{ .mmi 2797 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack 2798.restore sp 2799 add sp = 64,sp // Restore stack pointer 2800 mov b0 = GR_SAVE_B0 // Restore return address 2801};; 2802{ .mib 2803 mov gp = GR_SAVE_GP // Restore gp 2804 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 2805 br.ret.sptk b0 // Return 2806};; 2807 2808LOCAL_LIBM_END(__libm_error_region#) 2809.type __libm_error_support#,@function 2810.global __libm_error_support# 2811