1.file "powl.s"
2
3
4// Copyright (c) 2000 - 2003, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39//*********************************************************************
40//
41// Function:   powl(x,y), where
42//                          y
43//             powl(x,y) = x , for double extended precision x and y values
44//
45//*********************************************************************
46//
47// History:
48// 02/02/00 (Hand Optimized)
49// 04/04/00 Unwind support added
50// 08/15/00 Bundle added after call to __libm_error_support to properly
51//          set [the previously overwritten] GR_Parameter_RESULT.
52// 01/22/01 Corrected results for powl(1,inf), powl(1,nan), and
53//          powl(snan,0) to be 1 per C99, not nan.  Fixed many flag settings.
54// 02/06/01 Call __libm_error support if over/underflow when y=2.
55// 04/17/01 Support added for y close to 1 and x a non-special value.
56//          Shared software under/overflow detection for all paths
57// 02/07/02 Corrected sf3 setting to disable traps
58// 05/13/02 Improved performance of all paths
59// 02/10/03 Reordered header: .section, .global, .proc, .align;
60//          used data8 for long double table values
61// 04/17/03 Added missing mutex directive
62// 10/13/03 Corrected .endp names to match .proc names
63//
64//*********************************************************************
65//
66// Resources Used:
67//
68//    Floating-Point Registers:
69//                        f8  (Input x and Return Value)
70//                        f9  (Input y)
71//                        f10-f15,f32-f79
72//
73//    General Purpose Registers:
74//                        Locals r14-24,r32-r65
75//                        Parameters to __libm_error_support r62,r63,r64,r65
76//
77//    Predicate Registers: p6-p15
78//
79//*********************************************************************
80//
81//  Special Cases and IEEE special conditions:
82//
83//    Denormal fault raised on denormal inputs
84//    Overflow exceptions raised when appropriate for pow
85//    Underflow exceptions raised when appropriate for pow
86//    (Error Handling Routine called for overflow and Underflow)
87//    Inexact raised when appropriate by algorithm
88//
89//  1.  (anything) ** NatVal or (NatVal) ** anything  is NatVal
90//  2.  X or Y unsupported or sNaN                    is qNaN/Invalid
91//  3.  (anything) ** 0  is 1
92//  4.  (anything) ** 1  is itself
93//  5.  (anything except 1) ** qNAN is qNAN
94//  6.  qNAN ** (anything except 0) is qNAN
95//  7.  +-(|x| > 1) **  +INF is +INF
96//  8.  +-(|x| > 1) **  -INF is +0
97//  9.  +-(|x| < 1) **  +INF is +0
98//  10. +-(|x| < 1) **  -INF is +INF
99//  11. +-1         ** +-INF is +1
100//  12. +0 ** (+anything except 0, NAN)               is +0
101//  13. -0 ** (+anything except 0, NAN, odd integer)  is +0
102//  14. +0 ** (-anything except 0, NAN)               is +INF/div_0
103//  15. -0 ** (-anything except 0, NAN, odd integer)  is +INF/div_0
104//  16. -0 ** (odd integer) = -( +0 ** (odd integer) )
105//  17. +INF ** (+anything except 0,NAN) is +INF
106//  18. +INF ** (-anything except 0,NAN) is +0
107//  19. -INF ** (anything except NAN)  = -0 ** (-anything)
108//  20. (-anything) ** (integer) is (-1)**(integer)*(+anything**integer)
109//  21. (-anything except 0 and inf) ** (non-integer) is qNAN/Invalid
110//  22. X or Y denorm/unorm and denorm/unorm operand trap is enabled,
111//      generate denorm/unorm fault except if invalid or div_0 raised.
112//
113//*********************************************************************
114//
115//  Algorithm
116//  =========
117//
118//  Special Cases
119//
120//    If Y = 2,    return X*X.
121//    If Y = 0.5,  return sqrt(X).
122//
123//  Compute log(X) to extra precision.
124//
125//  ker_log_80( X, logX_hi, logX_lo, Safe );
126//
127//   ...logX_hi + logX_lo approximates log(X) to roughly 80
128//   ...significant bits of accuracy.
129//
130//  Compute Y*log(X) to extra precision.
131//
132//    P_hi := Y * logX_hi
133//    P_lo := Y * logX_hi - P_hi       ...using FMA
134//    P_lo := Y * logX_lo + P_lo       ...using FMA
135//
136//  Compute exp(P_hi + P_lo)
137//
138//    Flag := 2;
139//    Expo_Range := 2; (assuming double-extended power function)
140//    ker_exp_64( P_hi, P_lo, Flag, Expo_Range,
141//                Z_hi, Z_lo, scale, Safe )
142//
143//    scale := sgn * scale
144//
145//    If (Safe) then ...result will not over/underflow
146//       return scale*Z_hi + (scale*Z_lo)
147//       quickly
148//    Else
149//       take necessary precaution in computing
150//       scale*Z_hi + (scale*Z_lo)
151//       to set possible exceptions correctly.
152//    End If
153//
154//  Case_Y_Special
155//
156//   ...Follow the order of the case checks
157//
158//   If Y is +-0, return +1 without raising any exception.
159//   If Y is +1,  return X  without raising any exception.
160//   If Y is qNaN, return Y without exception.
161//   If X is qNaN, return X without exception.
162//
163//   At this point, X is real and Y is +-inf.
164//   Thus |X| can only be 1, strictly bigger than 1, or
165//   strictly less than 1.
166//
167//   If |X| < 1, then
168//   return ( Y == +inf?  +0 : +inf )
169//   elseif |X| > 1, then
170//   return ( Y == +inf? +0 : +inf )
171//   else
172//   goto Case_Invalid
173//
174//  Case_X_Special
175//
176//   ...Follow the order of the case checks
177//   ...Note that Y is real, finite, non-zero, and not +1.
178//
179//   If X is qNaN, return X without exception.
180//
181//   If X is +-0,
182//   return ( Y > 0 ? +0 : +inf )
183//
184//   If X is +inf
185//   return ( Y > 0 ? +inf : +0 )
186//
187//   If X is -inf
188//   return -0 ** -Y
189//   return ( Y > 0 ? +inf : +0 )
190//
191//  Case_Invalid
192//
193//   Return 0 * inf to generate a quiet NaN together
194//   with an invalid exception.
195//
196//  Implementation
197//  ==============
198//
199//   We describe the quick branch since this part is important
200//   in reaching the normal case efficiently.
201//
202//  STAGE 1
203//  -------
204//   This stage contains two threads.
205//
206//   Stage1.Thread1
207//
208//     fclass.m   X_excep,  X_ok   = X, (NatVal or s/qNaN) or
209//                              +-0, +-infinity
210//
211//     fclass.nm  X_unsupp, X_supp = X, (NatVal or s/qNaN) or
212//                              +-(0, unnorm, norm, infinity)
213//
214//     X_norm := fnorm( X ) with traps disabled
215//
216//     If (X_excep)  goto Filtering (Step 2)
217//     If (X_unsupp) goto Filtering (Step 2)
218//
219//     Stage1.Thread2
220//     ..............
221//
222//     fclass.m   Y_excep,  Y_ok   = Y, (NatVal or s/qNaN) or
223//                              +-0, +-infinity
224//
225//     fclass.nm  Y_unsupp, Y_supp = Y, (NatVal or s/qNaN) or
226//                              +-(0, unnorm, norm, infinity)
227//
228//     Y_norm := fnorm( Y ) with traps disabled
229//
230//     If (Y_excep)  goto Filtering (Step 2)
231//     If (Y_unsupp) goto Filtering (Step 2)
232//
233//
234//  STAGE 2
235//  -------
236//  This stage contains two threads.
237//
238//     Stage2.Thread1
239//     ..............
240//
241//     Set X_lt_0 if X < 0 (using fcmp)
242//     sgn := +1.0
243//     If (X_lt_0) goto Filtering (Step 2)
244//
245//     Stage2.Thread2
246//     ..............
247//
248//     Set Y_is_1 if Y = +1 (using fcmp)
249//     If (Y_is_1) goto Filtering (Step 2)
250//
251//   STAGE 3
252//   -------
253//   This stage contains two threads.
254//
255//
256//   Stage3.Thread1
257//   ..............
258//
259//     X := fnorm(X) in prevailing traps
260//
261//
262//     Stage3.Thread2
263//     ..............
264//
265//     Y := fnorm(Y) in prevailing traps
266//
267//   STAGE 4
268//   -------
269//
270//   Go to Case_Normal.
271//
272
273
274// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
275
276// double-extended 1/ln(2)
277// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
278// 3fff b8aa 3b29 5c17 f0bc
279// For speed the significand will be loaded directly with a movl and setf.sig
280//   and the exponent will be bias+63 instead of bias+0.  Thus subsequent
281//   computations need to scale appropriately.
282// The constant 2^12/ln(2) is needed for the computation of N.  This is also
283//   obtained by scaling the computations.
284//
285// Two shifting constants are loaded directly with movl and setf.d.
286//   1. RSHF_2TO51 = 1.1000..00 * 2^(63-12)
287//        This constant is added to x*1/ln2 to shift the integer part of
288//        x*2^12/ln2 into the rightmost bits of the significand.
289//        The result of this fma is N_signif.
290//   2. RSHF       = 1.1000..00 * 2^(63)
291//        This constant is subtracted from N_signif * 2^(-51) to give
292//        the integer part of N, N_fix, as a floating-point number.
293//        The result of this fms is float_N.
294RODATA
295
296.align 16
297// L_hi, L_lo
298LOCAL_OBJECT_START(Constants_exp_64_Arg)
299data8 0xB17217F400000000,0x00003FF2 // L_hi = hi part log(2)/2^12
300data8 0xF473DE6AF278ECE6,0x00003FD4 // L_lo = lo part log(2)/2^12
301LOCAL_OBJECT_END(Constants_exp_64_Arg)
302
303LOCAL_OBJECT_START(Constants_exp_64_A)
304// Reversed
305data8 0xAAAAAAABB1B736A0,0x00003FFA
306data8 0xAAAAAAAB90CD6327,0x00003FFC
307data8 0xFFFFFFFFFFFFFFFF,0x00003FFD
308LOCAL_OBJECT_END(Constants_exp_64_A)
309
310LOCAL_OBJECT_START(Constants_exp_64_P)
311// Reversed
312data8 0xD00D6C8143914A8A,0x00003FF2
313data8 0xB60BC4AC30304B30,0x00003FF5
314data8 0x888888887474C518,0x00003FF8
315data8 0xAAAAAAAA8DAE729D,0x00003FFA
316data8 0xAAAAAAAAAAAAAF61,0x00003FFC
317data8 0x80000000000004C7,0x00003FFE
318LOCAL_OBJECT_END(Constants_exp_64_P)
319
320LOCAL_OBJECT_START(Constants_exp_64_T1)
321data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
322data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
323data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
324data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
325data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
326data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516
327data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A
328data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4
329data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B
330data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD
331data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15
332data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B
333data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
334data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
335data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
336data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
337LOCAL_OBJECT_END(Constants_exp_64_T1)
338
339LOCAL_OBJECT_START(Constants_exp_64_T2)
340data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
341data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
342data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
343data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
344data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
345data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
346data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
347data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
348data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
349data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
350data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
351data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
352data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
353data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
354data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
355data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
356LOCAL_OBJECT_END(Constants_exp_64_T2)
357
358LOCAL_OBJECT_START(Constants_exp_64_W1)
359data8 0x0000000000000000, 0xBE384454171EC4B4
360data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8
361data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36
362data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE
363data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F
364data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329
365data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5
366data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F
367data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF
368data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F
369data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92
370data8 0x3E1336DE2BF82BF8, 0xBE5FF1CBD0F7BD9E
371data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D
372data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29
373data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A
374data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA
375data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6
376data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF
377data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC
378data8 0xBE51C2141AA42614, 0xBE48D087C37293F4
379data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38
380data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962
381data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788
382data8 0xBE60B7EC7857D689, 0xBE594D3DD14F1AD7
383data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2
384data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4
385data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA
386data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B
387data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A
388data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719
389data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D
390data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707
391LOCAL_OBJECT_END(Constants_exp_64_W1)
392
393LOCAL_OBJECT_START(Constants_exp_64_W2)
394data8 0x0000000000000000, 0xBE641F2537A3D7A2
395data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6
396data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE
397data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3
398data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4
399data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B
400data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7
401data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA
402data8 0xBE56856B49BFF529, 0x3E66DD3300508651
403data8 0x3E51165FC114BC13, 0x3E53333DC453290F
404data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696
405data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93
406data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE
407data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22
408data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97
409data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8
410data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC
411data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1
412data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7
413data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D
414data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C
415data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5
416data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9
417data8 0xBE559725ADE45917, 0xBE68C29C042FC476
418data8 0xBE67593B01E511FA, 0xBE4A4313398801ED
419data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E
420data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D
421data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F
422data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1
423data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795
424data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E
425data8 0x3E68BF5C17365712, 0x3E3956F9B3785569
426LOCAL_OBJECT_END(Constants_exp_64_W2)
427
428LOCAL_OBJECT_START(Constants_log_80_P)
429// P_8, P_7, ..., P_1
430data8 0xCCCE8B883B1042BC, 0x0000BFFB // P_8
431data8 0xE38997B7CADC2149, 0x00003FFB // P_7
432data8 0xFFFFFFFEB1ACB090, 0x0000BFFB // P_6
433data8 0x9249249806481C81, 0x00003FFC // P_5
434data8 0x0000000000000000, 0x00000000 // Pad for bank conflicts
435data8 0xAAAAAAAAAAAAB0EF, 0x0000BFFC // P_4
436data8 0xCCCCCCCCCCC91416, 0x00003FFC // P_3
437data8 0x8000000000000000, 0x0000BFFD // P_2
438data8 0xAAAAAAAAAAAAAAAB, 0x00003FFD // P_1
439LOCAL_OBJECT_END(Constants_log_80_P)
440
441LOCAL_OBJECT_START(Constants_log_80_Q)
442// log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1
443data8 0xB172180000000000,0x00003FFE
444data8 0x82E308654361C4C6,0x0000BFE2
445data8 0x92492453A51BE0AF,0x00003FFC
446data8 0xAAAAAB73A0CFD29F,0x0000BFFC
447data8 0xCCCCCCCCCCCE3872,0x00003FFC
448data8 0xFFFFFFFFFFFFB4FB,0x0000BFFC
449data8 0xAAAAAAAAAAAAAAAB,0x00003FFD
450data8 0x8000000000000000,0x0000BFFE
451LOCAL_OBJECT_END(Constants_log_80_Q)
452
453LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h1)
454// Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double
455data4 0x00008000,0x3F800000,0x00000000,0x00000000
456data4 0x00000000,0x00000000,0x00000000,0x00000000
457data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000
458data4 0xEBA0E0D1,0x8B1D330B,0x00003FDA,0x00000000
459data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000
460data4 0x9EADD553,0xE2AF365E,0x00003FE2,0x00000000
461data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000
462data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000
463data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000
464data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000
465data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000
466data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000
467data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000
468data4 0x457978A1,0x8718789F,0x00003FE2,0x00000000
469data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000
470data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000
471data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000
472data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000
473data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000
474data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000
475data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000
476data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000
477data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000
478data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000
479data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000
480data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000
481data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000
482data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000
483data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000
484data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000
485data4 0x00004211,0x3F042108,0x3F29516A,0x00000000
486data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000
487LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h1)
488
489LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h2)
490// Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 IEEE double
491data4 0x00008000,0x3F800000,0x00000000,0x00000000
492data4 0x00000000,0x00000000,0x00000000,0x00000000
493data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000
494data4 0x211398BF,0xAD08B116,0x00003FDB,0x00000000
495data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000
496data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000
497data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000
498data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000
499data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000
500data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000
501data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000
502data4 0x049EB22F,0xD1B87D3C,0x00003FDE,0x00000000
503data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000
504data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000
505data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000
506data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000
507data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000
508data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000
509data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000
510data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000
511data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000
512data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000
513data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000
514data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000
515data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000
516data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000
517data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000
518data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000
519data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000
520data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000
521data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000
522data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000
523LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h2)
524
525LOCAL_OBJECT_START(Constants_log_80_h3_G_H)
526// h3 IEEE double extended, H3 and G3 IEEE single
527data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00
528data4 0x9B7FAD21,0x90051030,0x00003FD8,0x3F7FF400
529data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00
530data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400
531data4 0xCA964D95,0xCE65C1D8,0x0000BFD8,0x3F7FDC00
532data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400
533data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08
534data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408
535data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10
536data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410
537data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18
538data4 0x2F053150,0xB25CA912,0x0000BFDA,0x3F7FA420
539data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20
540data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428
541data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30
542data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438
543data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40
544data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448
545data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50
546data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458
547data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68
548data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470
549data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78
550data4 0x1F34A7EB,0x9A995A97,0x0000BFDC,0x3F7F4488
551data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90
552data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0
553data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8
554data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8
555data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8
556data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8
557data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0
558data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0
559data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here
560data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D
561data4 0x3A87F20B,0x3A97F68B,0x3AA7EB86,0x3AB7E101
562data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED
563data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766
564data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6
565data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620
566data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D
567LOCAL_OBJECT_END(Constants_log_80_h3_G_H)
568
569GR_sig_inv_ln2      = r14
570GR_rshf_2to51       = r15
571GR_exp_2tom51       = r16
572GR_rshf             = r17
573GR_exp_half         = r18
574GR_sign_mask        = r19
575GR_exp_square_oflow = r20
576GR_exp_square_uflow = r21
577GR_exp_ynear1_oflow = r22
578GR_exp_ynear1_uflow = r23
579GR_signif_Z         = r24
580
581GR_signexp_x        = r32
582
583GR_exp_x            = r33
584
585GR_Table_Ptr        = r34
586
587GR_Table_Ptr1       = r35
588
589GR_Index1           = r36
590
591GR_Index2           = r37
592GR_Expo_X           = r37
593
594GR_M                = r38
595
596GR_X_0              = r39
597GR_Mask             = r39
598
599GR_X_1              = r40
600GR_W1_ptr           = r40
601
602GR_W2_ptr           = r41
603GR_X_2              = r41
604
605GR_Z_1              = r42
606GR_M2               = r42
607
608GR_M1               = r43
609GR_Z_2              = r43
610
611GR_N                = r44
612GR_k                = r44
613
614GR_Big_Pos_Exp      = r45
615
616GR_exp_pos_max      = r46
617
618GR_exp_bias_p_k     = r47
619
620GR_Index3           = r48
621GR_temp             = r48
622
623GR_vsm_expo         = r49
624
625GR_T1_ptr           = r50
626GR_P_ptr1           = r50
627GR_T2_ptr           = r51
628GR_P_ptr2           = r51
629GR_N_fix            = r52
630GR_exp_y            = r53
631GR_signif_y         = r54
632GR_signexp_y        = r55
633GR_fraction_y       = r55
634GR_low_order_bit    = r56
635GR_exp_mask         = r57
636GR_exp_bias         = r58
637GR_y_sign           = r59
638GR_table_base       = r60
639GR_ptr_exp_Arg      = r61
640GR_Delta_Exp        = r62
641GR_Special_Exp      = r63
642GR_exp_neg_max      = r64
643GR_Big_Neg_Exp      = r65
644
645//** Registers for unwind support
646
647GR_SAVE_PFS         = r59
648GR_SAVE_B0          = r60
649GR_SAVE_GP          = r61
650GR_Parameter_X      = r62
651GR_Parameter_Y      = r63
652GR_Parameter_RESULT = r64
653GR_Parameter_TAG    = r65
654
655//**
656
657FR_Input_X          = f8
658FR_Result           = f8
659FR_Input_Y          = f9
660
661FR_Neg              = f10
662FR_P_hi             = f10
663FR_X                = f10
664
665FR_Half             = f11
666FR_h_3              = f11
667FR_poly_hi          = f11
668
669FR_Sgn              = f12
670
671FR_half_W           = f13
672
673FR_X_cor            = f14
674FR_P_lo             = f14
675
676FR_W                = f15
677
678FR_X_lo             = f32
679
680FR_S                = f33
681FR_W3               = f33
682
683FR_Y_hi             = f34
684FR_logx_hi          = f34
685
686FR_Z                = f35
687FR_logx_lo          = f35
688FR_GS_hi            = f35
689FR_Y_lo             = f35
690
691FR_r_cor            = f36
692FR_Scale            = f36
693
694FR_G_1              = f37
695FR_G                = f37
696FR_Wsq              = f37
697FR_temp             = f37
698
699FR_H_1              = f38
700FR_H                = f38
701FR_W4               = f38
702
703FR_h                = f39
704FR_h_1              = f39
705FR_N                = f39
706FR_P_7              = f39
707
708FR_G_2              = f40
709FR_P_8              = f40
710FR_L_hi             = f40
711
712FR_H_2              = f41
713FR_L_lo             = f41
714FR_A_1              = f41
715
716FR_h_2              = f42
717
718FR_W1               = f43
719
720FR_G_3              = f44
721FR_P_8              = f44
722FR_T1               = f44
723
724FR_log2_hi          = f45
725FR_W2               = f45
726
727FR_GS_lo            = f46
728FR_T2               = f46
729
730FR_W_1_p1           = f47
731FR_H_3              = f47
732
733FR_float_N          = f48
734
735FR_A_2              = f49
736
737FR_Q_4              = f50
738FR_r4               = f50
739
740FR_Q_3              = f51
741FR_A_3              = f51
742
743FR_Q_2              = f52
744FR_P_2              = f52
745
746FR_Q_1              = f53
747FR_P_1              = f53
748FR_T                = f53
749
750FR_Wp1              = f54
751FR_Q_5              = f54
752FR_P_3              = f54
753
754FR_Q_6              = f55
755
756FR_log2_lo          = f56
757FR_Two              = f56
758
759FR_Big              = f57
760
761FR_neg_2_mK         = f58
762
763FR_r                = f59
764
765FR_poly_lo          = f60
766
767FR_poly             = f61
768
769FR_P_5              = f62
770FR_Result_small     = f62
771
772FR_rsq              = f63
773
774FR_Delta            = f64
775
776FR_save_Input_X     = f65
777FR_norm_X           = f66
778FR_norm_Y           = f67
779FR_Y_lo_2           = f68
780
781FR_P_6              = f69
782FR_Result_big       = f69
783
784FR_RSHF_2TO51       = f70
785FR_INV_LN2_2TO63    = f71
786FR_2TOM51           = f72
787FR_RSHF             = f73
788FR_TMP1             = f74
789FR_TMP2             = f75
790FR_TMP3             = f76
791FR_Tscale           = f77
792FR_P_4              = f78
793FR_NBig             = f79
794
795
796.section .text
797GLOBAL_LIBM_ENTRY(powl)
798//
799//     Get significand of x.  It is the critical path.
800//
801{ .mfi
802      getf.sig GR_signif_Z = FR_Input_X    // Get significand of x
803      fclass.m p11, p12 = FR_Input_X, 0x0b // Test x unorm
804      nop.i 999
805}
806{ .mfi
807      nop.m 999
808      fnorm.s1 FR_norm_X = FR_Input_X      // Normalize x
809      mov GR_exp_half = 0xffff - 1         // Exponent for 0.5
810}
811;;
812
813{ .mfi
814      alloc  r32 = ar.pfs,0,30,4,0
815      fclass.m p7, p0 =  FR_Input_Y, 0x1E7 // Test y natval, nan, inf, zero
816      mov GR_exp_pos_max = 0x13fff         // Max exponent for pos oflow test
817}
818{ .mfi
819      addl GR_table_base = @ltoff(Constants_exp_64_Arg#), gp // Ptr to tables
820      fnorm.s1 FR_norm_Y = FR_Input_Y      // Normalize y
821      mov GR_exp_neg_max = 0x33fff         // Max exponent for neg oflow test
822}
823;;
824
825{ .mfi
826      getf.exp GR_signexp_y = FR_Input_Y   // Get sign and exp of y
827(p12) fclass.m p11, p0 =  FR_Input_Y, 0x0b // Test y unorm
828      mov GR_sign_mask = 0x20000           // Sign mask
829}
830{ .mfi
831      ld8 GR_table_base = [GR_table_base]  // Get base address for tables
832      fadd.s1 FR_Two = f1, f1              // Form 2.0 for square test
833      mov GR_exp_mask = 0x1FFFF            // Exponent mask
834}
835;;
836
837{ .mfi
838      getf.sig GR_signif_y = FR_Input_Y    // Get significand of y
839      fclass.m p6, p0 =  FR_Input_X, 0x1E7 // Test x natval, nan, inf, zero
840      nop.i 999
841}
842;;
843
844{ .mfi
845      getf.exp GR_signexp_x = FR_Input_X   // Get signexp of x
846      fmerge.s FR_save_Input_X = FR_Input_X, FR_Input_X
847      extr.u GR_Index1 = GR_signif_Z, 59, 4  // Extract upper 4 signif bits of x
848}
849{ .mfb
850      setf.exp FR_Half = GR_exp_half       // Load half
851      nop.f 999
852(p11) br.cond.spnt  POWL_DENORM            // Branch if x or y denorm/unorm
853}
854;;
855
856// Return here from POWL_DENORM
857POWL_COMMON:
858{ .mfi
859      setf.exp FR_Big = GR_exp_pos_max     // Form big pos value for oflow test
860      fclass.nm p11, p0 = FR_Input_Y, 0x1FF // Test Y unsupported
861      shl GR_Index1 = GR_Index1,5          // Adjust index1 pointer x 32
862}
863{ .mfi
864      add GR_Table_Ptr = 0x7c0, GR_table_base // Constants_log_80_Z_G_H_h1
865      fma.s1 FR_Sgn = f1,f1,f0             // Assume result positive
866      mov GR_exp_bias = 0xFFFF             // Form exponent bias
867}
868;;
869
870//
871//     Identify NatVals, NaNs, Infs, and Zeros.
872//
873//
874//     Remove sign bit from exponent of y.
875//     Check for x = 1
876//     Branch on Infs, Nans, Zeros, and Natvals
877//     Check to see that exponent < 0
878//
879{ .mfi
880      setf.exp FR_NBig = GR_exp_neg_max    // Form big neg value for oflow test
881      fclass.nm p8, p0 =  FR_Input_X, 0x1FF  // Test X unsupported
882      and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent of y
883}
884{ .mfb
885      add GR_Index1 = GR_Index1,GR_Table_Ptr
886      nop.f 999
887(p6)  br.cond.spnt POWL_64_SPECIAL         // Branch if x natval, nan, inf, zero
888}
889;;
890
891//     load Z_1 from Index1
892
893// There is logic starting here to determine if y is an integer when x < 0.
894// If 0 < |y| < 1 then clearly y is not an integer.
895// If |y| > 1, then the significand of y is shifted left by the size of
896//    the exponent of y.  This preserves the lsb of the integer part + the
897//    fractional bits.  The lsb of the integer can be tested to determine if
898//    the integer is even or odd.  The fractional bits can be tested.  If zero,
899//    then y is an integer.
900//
901{ .mfi
902      ld2 GR_Z_1 =[GR_Index1],4            // Load Z_1
903      fmerge.s FR_Z = f0, FR_norm_X        // Z = |x|
904      extr.u GR_X_0 = GR_signif_Z, 49, 15  // Extract X_0 from significand
905}
906{ .mfb
907      cmp.lt p9, p0 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1
908      nop.f 999
909(p7)  br.cond.spnt POWL_64_SPECIAL         // Branch if y natval, nan, inf, zero
910}
911;;
912
913{ .mfb
914      ldfs  FR_G_1 = [GR_Index1],4         // Load G_1
915      fcmp.eq.s1 p10, p0 =  FR_Input_Y, f1 // Test Y = +1.0
916(p8)  br.cond.spnt POWL_64_UNSUPPORT       // Branch if x unsupported
917}
918;;
919
920//
921//     X_0  = High order 15 bit of Z
922//
923{ .mfb
924      ldfs  FR_H_1 = [GR_Index1],8             // Load H_1
925(p9)  fcmp.lt.unc.s1 p9, p0 = FR_Input_X, f0   // Test x<0, 0 <|y|<1
926(p11) br.cond.spnt POWL_64_UNSUPPORT           // Branch if y unsupported
927}
928;;
929
930{ .mfi
931      ldfe FR_h_1 = [GR_Index1]                // Load h_1
932      fcmp.eq.s1 p7, p0 =  FR_Input_Y, FR_Two  // Test y = 2.0
933      pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15     // X_1 = X_0 * Z_1 (bits 15-30)
934                                               // Wait 4 cycles to use result
935}
936{ .mfi
937      add GR_Table_Ptr = 0x9c0, GR_table_base  // Constants_log_80_Z_G_H_h2
938      nop.f 999
939      sub GR_exp_y = GR_exp_y,GR_exp_bias      // Get true exponent of y
940}
941;;
942
943//
944//      Branch for (x < 0) and Y not an integer.
945//
946{ .mfb
947      nop.m 999
948      fcmp.lt.s1 p6, p0  =  FR_Input_X, f0     // Test x < 0
949(p9)  br.cond.spnt POWL_64_XNEG                // Branch if x < 0, 0 < |y| < 1
950}
951;;
952
953{ .mfi
954      nop.m 999
955      fcmp.eq.s1 p12, p0 =  FR_Input_X, f1     // Test x=+1.0
956      nop.i 999
957}
958{ .mfb
959      nop.m 999
960      fsub.s1 FR_W = FR_Z, f1                  // W = Z - 1
961(p7)  br.cond.spnt POWL_64_SQUARE              // Branch if y=2
962}
963;;
964
965{ .mfi
966      nop.m 999
967(p10) fmpy.s0 FR_Result = FR_Input_X, f1       // If y=+1.0, result=x
968(p6)  shl GR_fraction_y=  GR_signif_y,GR_exp_y // Get lsb of int + fraction
969                                               // Wait 4 cycles to use result
970}
971;;
972
973{ .mfi
974      nop.m 999
975(p12) fma.s0 FR_Result = FR_Input_Y, f0, f1    // If x=1.0, result=1, chk denorm
976      extr.u GR_Index2 = GR_X_1, 6, 4          // Extract index2
977}
978;;
979
980//
981//     N = exponent of Z
982//
983{ .mib
984      getf.exp GR_N =  FR_Z                    // Get exponent of Z (also x)
985      shl GR_Index2=GR_Index2,5                // Index2  x 32 bytes
986(p10) br.ret.spnt  b0                          // Exit if y=+1.0
987}
988;;
989
990{ .mib
991      add GR_Index2 = GR_Index2, GR_Table_Ptr  // Pointer to table 2
992      nop.i 999
993(p12) br.ret.spnt  b0                          // Exit if x=+1.0
994}
995;;
996
997{ .mmi
998      ld2 GR_Z_2 =[GR_Index2],4                // Load Z_2
999;;
1000      ldfs  FR_G_2 = [GR_Index2],4             // Load G_2
1001      nop.i 999
1002}
1003;;
1004
1005{ .mii
1006      ldfs  FR_H_2 = [GR_Index2],8             // Load H_2
1007(p6)  tbit.nz.unc p9, p0 = GR_fraction_y, 63   // Test x<0 and y odd integer
1008      add GR_Table_Ptr = 0xbcc, GR_table_base  // Constants_log_80_h3_G_H, G_3
1009}
1010;;
1011
1012//
1013//      For x < 0 and y odd integer,, set sign = -1.
1014//
1015{ .mfi
1016      getf.exp GR_M = FR_W                      // Get signexp of W
1017      nop.f 999
1018      pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15      // X_2 = X_1 * Z_2 (bits 15-30)
1019}
1020{ .mfi
1021      ldfe FR_h_2 = [GR_Index2]                // Load h_2
1022(p9)  fnma.s1 FR_Sgn = f1, f1, f0          // If x<0, y odd int, result negative
1023      sub GR_N = GR_N, GR_exp_bias             // Get true exponent of x = N
1024}
1025;;
1026
1027{ .mfi
1028      add GR_Table_Ptr1 = 0xdc0, GR_table_base // Ptr to H_3
1029      fcmp.eq.s0 p11, p0 = FR_Input_Y, FR_Half // Test y=0.5, also set denorm
1030(p6)  shl GR_fraction_y=  GR_fraction_y, 1     // Shift left 1 to get fraction
1031}
1032;;
1033
1034{ .mmb
1035      setf.sig FR_float_N = GR_N
1036(p6)  cmp.ne.unc p8, p0 = GR_fraction_y, r0    // Test x<0 and y not integer
1037(p8)  br.cond.spnt POWL_64_XNEG                // Branch if x<0 and y not int
1038}
1039;;
1040
1041//
1042//      Raise possible denormal operand exception for both X and Y.
1043//      Set pointers in case |x| near 1
1044//      Branch to embedded sqrt(x) if y=0.5
1045//
1046{ .mfi
1047      add GR_P_ptr1 = 0x6b0, GR_table_base // Constants_log_80_P, P8, NEAR path
1048      fcmp.eq.s0 p12, p0 =  FR_Input_X, FR_Input_Y // Dummy to set denormal
1049      add GR_P_ptr2 = 0x700, GR_table_base // Constants_log_80_P, P4, NEAR path
1050}
1051{ .mfb
1052      cmp.eq p15, p14 =  r0, r0            // Assume result safe (no over/under)
1053      fsub.s1  FR_Delta = FR_Input_Y,f1    // Delta = y - 1.0
1054(p11) br.cond.spnt POWL_64_SQRT            // Branch if y=0.5
1055}
1056;;
1057
1058//
1059//     Computes ln( x ) to extra precision
1060//     Input  FR 1: FR_X
1061//     Output FR 2: FR_Y_hi
1062//     Output FR 3: FR_Y_lo
1063//     Output PR 1: PR_Safe
1064//
1065{ .mfi
1066      and GR_M = GR_exp_mask, GR_M            // Mask to get exponent of W
1067      nop.f 999
1068      extr.u GR_Index3 = GR_X_2, 1, 5         // Get index3
1069}
1070;;
1071
1072{ .mmi
1073      shladd GR_Table_Ptr1 = GR_Index3,2,GR_Table_Ptr1 // Ptr to H_3
1074      shladd GR_Index3 = GR_Index3,4,GR_Table_Ptr      // Ptr to G_3
1075      sub GR_M = GR_M, GR_exp_bias            // Get true exponent of W
1076}
1077;;
1078
1079{ .mib
1080      ldfs FR_G_3 = [GR_Index3],-12           // Load G_3
1081      cmp.gt  p7, p14 =  -8, GR_M             // Test if |x-1| < 2^-8
1082(p7)  br.cond.spnt LOGL80_NEAR                // Branch if |x-1| < 2^-8
1083}
1084;;
1085
1086// Here if |x-1| >= 2^-8
1087{ .mmf
1088      ldfs FR_H_3 = [GR_Table_Ptr1]           // Load H_3
1089      nop.m 999
1090      nop.f 999
1091}
1092;;
1093
1094{ .mfi
1095      ldfe FR_h_3 = [GR_Index3]               // Load h_3
1096      fmerge.se FR_S =  f1,FR_Z               // S = merge of 1.0 and signif(Z)
1097      nop.i 999
1098}
1099{ .mfi
1100      add GR_Table_Ptr = 0x740, GR_table_base // Constants_log_80_Q
1101      fmpy.s1 FR_G = FR_G_1, FR_G_2           // G = G_1 * G_2
1102      nop.i 999
1103}
1104;;
1105
1106//
1107//     Begin Loading Q's -  load log2_hi part
1108//
1109{ .mfi
1110      ldfe FR_log2_hi = [GR_Table_Ptr],16     // Load log2_hi
1111      fadd.s1 FR_H = FR_H_1, FR_H_2           // H = H_1 + H_2
1112      nop.i 999
1113};;
1114
1115//
1116//     h = h_1 + h_2
1117//
1118{ .mfi
1119      ldfe FR_log2_lo = [GR_Table_Ptr],16     // Load log2_lo
1120      fadd.s1 FR_h = FR_h_1, FR_h_2           // h = h_1 + h_2
1121      nop.i 999
1122}
1123;;
1124
1125{ .mfi
1126      ldfe FR_Q_6 = [GR_Table_Ptr],16         // Load Q_6
1127      fcvt.xf FR_float_N = FR_float_N
1128      nop.i 999
1129}
1130;;
1131
1132{ .mfi
1133      ldfe FR_Q_5 = [GR_Table_Ptr],16         // Load Q_5
1134      nop.f 999
1135      nop.i 999
1136}
1137;;
1138
1139//
1140//     G = G_1 * G_2 * G_3
1141//
1142{ .mfi
1143      ldfe FR_Q_4 = [GR_Table_Ptr],16         // Load Q_4
1144      fmpy.s1 FR_G = FR_G, FR_G_3
1145      nop.i 999
1146}
1147;;
1148
1149//
1150//     H = H_1 + H_2 + H_3
1151//
1152{ .mfi
1153      ldfe FR_Q_3 = [GR_Table_Ptr],16         // Load Q_3
1154      fadd.s1 FR_H = FR_H, FR_H_3
1155      nop.i 999
1156}
1157;;
1158
1159//
1160//     Y_lo = poly + Y_lo
1161//
1162//     h = h_1 + h_2 + h_3
1163//
1164{ .mfi
1165      ldfe FR_Q_2 = [GR_Table_Ptr],16         // Load Q_2
1166      fadd.s1 FR_h = FR_h, FR_h_3
1167      nop.i 999
1168}
1169;;
1170
1171//
1172//     GS_hi = G*S
1173//     r = G*S -1
1174//
1175{ .mfi
1176      ldfe FR_Q_1 = [GR_Table_Ptr],16         // Load Q_1
1177      fmpy.s1 FR_GS_hi = FR_G, FR_S
1178      nop.i 999
1179}
1180{ .mfi
1181      nop.m 999
1182      fms.s1 FR_r = FR_G, FR_S, f1
1183      nop.i 999
1184}
1185;;
1186
1187//
1188//     poly_lo = Q_5 + r * Q_6
1189//
1190{ .mfi
1191      getf.exp GR_Delta_Exp =  FR_Delta     // Get signexp of y-1 for exp calc
1192      fma.s1 FR_poly_lo = FR_r, FR_Q_6, FR_Q_5
1193      nop.i 999
1194}
1195//
1196//     r_cor = GS_hi -1
1197//
1198{ .mfi
1199      nop.m 999
1200      fsub.s1 FR_r_cor = FR_GS_hi, f1
1201      nop.i 999
1202}
1203;;
1204
1205//
1206//     GS_lo  = G*S - GS_hi
1207//
1208{ .mfi
1209      nop.m 999
1210      fms.s1 FR_GS_lo = FR_G, FR_S, FR_GS_hi
1211      nop.i 999
1212}
1213;;
1214
1215//
1216//     rsq = r * r
1217//
1218{ .mfi
1219      nop.m 999
1220      fmpy.s1 FR_rsq = FR_r, FR_r
1221      nop.i 999
1222}
1223//
1224//     G = float_N*log2_hi + H
1225//
1226{ .mfi
1227      nop.m 999
1228      fma.s1 FR_G = FR_float_N, FR_log2_hi, FR_H
1229      nop.i 999
1230}
1231;;
1232
1233//
1234//     Y_lo = float_N*log2_lo + h
1235//
1236{ .mfi
1237      nop.m 999
1238      fma.s1 FR_Y_lo = FR_float_N, FR_log2_lo, FR_h
1239      nop.i 999
1240}
1241;;
1242
1243//
1244//      poly_lo = Q_4 + r * poly_lo
1245//      r_cor = r_cor - r
1246//
1247{ .mfi
1248      nop.m 999
1249      fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_4
1250      nop.i 999
1251}
1252{ .mfi
1253      nop.m 999
1254      fsub.s1 FR_r_cor = FR_r_cor, FR_r
1255      nop.i 999
1256}
1257;;
1258
1259//
1260//      poly_hi = r * Q_2 + Q_1
1261//      Y_hi = G + r
1262//
1263{ .mfi
1264      nop.m 999
1265      fma.s1 FR_poly = FR_r, FR_Q_2, FR_Q_1
1266      nop.i 999
1267}
1268{ .mfi
1269      nop.m 999
1270      fadd.s1 FR_Y_hi = FR_G, FR_r
1271      nop.i 999
1272}
1273;;
1274
1275//
1276//      poly_lo = Q_3 + r * poly_lo
1277//      r_cor = r_cor + GS_lo
1278//
1279{ .mfi
1280      nop.m 999
1281      fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_3
1282      nop.i 999
1283}
1284{ .mfi
1285      nop.m 999
1286      fadd.s1 FR_r_cor = FR_r_cor, FR_GS_lo
1287      nop.i 999
1288}
1289;;
1290
1291//
1292//      Y_lo = G - Y_hi
1293//
1294{ .mfi
1295      nop.m 999
1296      fsub.s1 FR_Y_lo_2 = FR_G, FR_Y_hi
1297      nop.i 999
1298}
1299;;
1300
1301//
1302//      r_cor = r_cor + Y_lo
1303//      poly = poly_hi + rsq * poly_lo
1304//
1305{ .mfi
1306      add  GR_Table_Ptr   = 0x0, GR_table_base   // Constants_exp_64_Arg
1307      fadd.s1 FR_r_cor = FR_r_cor, FR_Y_lo
1308      nop.i 999
1309}
1310{ .mfi
1311      nop.m 999
1312      fma.s1 FR_poly = FR_rsq, FR_poly_lo, FR_poly
1313      nop.i 999
1314}
1315;;
1316
1317//
1318//      Load L_hi
1319//      Load L_lo
1320//      all long before they are needed.
1321//      They are used in LOGL_RETURN PATH
1322//
1323//      Y_lo =  Y_lo + r
1324//      poly = rsq * poly + r_cor
1325//
1326{ .mfi
1327      ldfe FR_L_hi = [GR_Table_Ptr],16           // Load L_hi
1328      fadd.s1 FR_Y_lo = FR_Y_lo_2, FR_r
1329      nop.i 999
1330}
1331{ .mfi
1332      nop.m 999
1333      fma.s1 FR_poly = FR_rsq, FR_poly, FR_r_cor
1334      nop.i 999
1335}
1336;;
1337
1338{ .mfb
1339      ldfe FR_L_lo = [GR_Table_Ptr],16           // Load L_lo
1340      fadd.s1 FR_Y_lo = FR_Y_lo, FR_poly
1341      br.cond.sptk LOGL_RETURN                   // Branch to common code
1342}
1343;;
1344
1345
1346LOGL80_NEAR:
1347// Here if |x-1| < 2^-8
1348//
1349//     Branch LOGL80_NEAR
1350//
1351
1352{ .mmf
1353      ldfe FR_P_8 = [GR_P_ptr1],16           // Load P_8
1354      ldfe FR_P_4 = [GR_P_ptr2],16           // Load P_4
1355      fmpy.s1 FR_Wsq = FR_W, FR_W
1356}
1357;;
1358
1359{ .mmi
1360      ldfe FR_P_7 = [GR_P_ptr1],16           // Load P_7
1361      ldfe FR_P_3 = [GR_P_ptr2],16           // Load P_3
1362      nop.i 999
1363}
1364;;
1365
1366{ .mmi
1367      ldfe FR_P_6 = [GR_P_ptr1],16           // Load P_6
1368      ldfe FR_P_2 = [GR_P_ptr2],16           // Load P_2
1369      nop.i 999
1370}
1371;;
1372
1373{ .mmi
1374      ldfe FR_P_5 = [GR_P_ptr1],16           // Load P_5
1375      ldfe FR_P_1 = [GR_P_ptr2],16           // Load P_1
1376      nop.i 999
1377}
1378;;
1379
1380{ .mfi
1381      getf.exp GR_Delta_Exp =  FR_Delta      // Get signexp of y-1 for exp calc
1382      fmpy.s1 FR_W4 = FR_Wsq, FR_Wsq
1383      nop.i 999
1384}
1385{ .mfi
1386      add  GR_Table_Ptr = 0x0, GR_table_base // Constants_exp_64_Arg
1387      fmpy.s1 FR_W3 = FR_Wsq, FR_W
1388      nop.i 999
1389}
1390;;
1391
1392{ .mfi
1393      nop.m 999
1394      fmpy.s1 FR_half_W = FR_Half, FR_W
1395      nop.i 999
1396}
1397;;
1398
1399{ .mfi
1400      ldfe FR_L_hi = [GR_Table_Ptr],16
1401      fma.s1 FR_poly_lo = FR_W, FR_P_8,FR_P_7
1402      nop.i 999
1403}
1404{ .mfi
1405      nop.m 999
1406      fma.s1 FR_poly = FR_W, FR_P_4, FR_P_3
1407      nop.i 999
1408}
1409;;
1410
1411{ .mfi
1412      ldfe FR_L_lo = [GR_Table_Ptr],16
1413      fnma.s1 FR_Y_hi = FR_W, FR_half_W, FR_W
1414      nop.i 999
1415}
1416;;
1417
1418{ .mfi
1419      nop.m 999
1420      fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_6
1421      nop.i 999
1422}
1423{ .mfi
1424      nop.m 999
1425      fma.s1 FR_poly = FR_W, FR_poly, FR_P_2
1426      nop.i 999
1427}
1428;;
1429
1430{ .mfi
1431      nop.m 999
1432      fsub.s1 FR_Y_lo = FR_W, FR_Y_hi
1433      nop.i 999
1434}
1435;;
1436
1437{ .mfi
1438      nop.m 999
1439      fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_5
1440      nop.i 999
1441}
1442{ .mfi
1443      nop.m 999
1444      fma.s1 FR_poly = FR_W, FR_poly, FR_P_1
1445      nop.i 999
1446}
1447;;
1448
1449{ .mfi
1450      nop.m 999
1451      fnma.s1 FR_Y_lo = FR_W, FR_half_W, FR_Y_lo
1452      nop.i 999
1453}
1454;;
1455
1456{ .mfi
1457      nop.m 999
1458      fma.s1 FR_poly = FR_poly_lo, FR_W4, FR_poly
1459      nop.i 999
1460}
1461;;
1462
1463{ .mfi
1464      nop.m 999
1465      fma.s1 FR_Y_lo = FR_poly, FR_W3, FR_Y_lo
1466      nop.i 999
1467}
1468;;
1469
1470
1471LOGL_RETURN:
1472// Common code for completion of both logx paths
1473
1474//
1475//     L_hi, L_lo already loaded.
1476//
1477//
1478//     kernel_log_80 computed ln(X)
1479//     and return logX_hi and logX_lo as results.
1480//     PR_pow_Safe set as well.
1481//
1482//
1483//     Compute Y * (logX_hi + logX_lo)
1484//     P_hi -> X
1485//     P_lo -> X_cor
1486//     (Manipulate names so that inputs are in
1487//     the place kernel_exp expects them)
1488//
1489//     This function computes exp( x  + x_cor)
1490//     Input  FR 1: FR_X
1491//     Input  FR 2: FR_X_cor
1492//     Output FR 3: FR_Y_hi
1493//     Output FR 4: FR_Y_lo
1494//     Output FR 5: FR_Scale
1495//     Output PR 1: PR_Safe
1496//
1497//     P15 is True
1498//
1499// Load constants used in computing N using right-shift technique
1500{ .mlx
1501      mov GR_exp_2tom51 = 0xffff-51
1502      movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc  // significand of 1/ln2
1503}
1504{ .mlx
1505      add  GR_Special_Exp = -50,GR_exp_bias
1506      movl GR_rshf_2to51 = 0x4718000000000000   // 1.10000 2^(63+51)
1507}
1508;;
1509
1510//
1511//     Point to Table of W1s
1512//     Point to Table of W2s
1513//
1514{ .mmi
1515      add GR_W1_ptr   = 0x2b0, GR_table_base    // Constants_exp_64_W1
1516      add GR_W2_ptr   = 0x4b0, GR_table_base    // Constants_exp_64_W2
1517      cmp.le p6,p0= GR_Delta_Exp,GR_Special_Exp
1518};;
1519
1520// Form two constants we need
1521//  1/ln2 * 2^63  to compute  w = x * 1/ln2 * 128
1522//  1.1000..000 * 2^(63+63-12) to right shift int(N) into the significand
1523
1524{ .mfi
1525      setf.sig  FR_INV_LN2_2TO63 = GR_sig_inv_ln2 // form 1/ln2 * 2^63
1526      nop.f 999
1527      and GR_Delta_Exp=GR_Delta_Exp,GR_exp_mask  // Get exponent of y-1
1528}
1529{ .mlx
1530      setf.d  FR_RSHF_2TO51 = GR_rshf_2to51    // Form const 1.1000 * 2^(63+51)
1531      movl GR_rshf = 0x43e8000000000000        // 1.10000 2^63 for right shift
1532}
1533;;
1534
1535{ .mfi
1536      nop.m 999
1537      fmpy.s1 FR_X_lo = FR_Input_Y, FR_logx_lo // logx_lo is Y_lo
1538      cmp.eq  p15, p0=  r0, r0                 // Set p15, assume safe
1539};;
1540
1541{ .mmi
1542      setf.exp FR_2TOM51 = GR_exp_2tom51 // Form 2^-51 for scaling float_N
1543      setf.d  FR_RSHF = GR_rshf          // Form right shift const 1.1000 * 2^63
1544      add GR_Table_Ptr1   = 0x50, GR_table_base // Constants_exp_64_P for
1545                                                // EXPL_SMALL path
1546}
1547;;
1548
1549{ .mmi
1550      ldfe FR_P_6 = [GR_Table_Ptr1],16          // Load P_6 for EXPL_SMALL path
1551;;
1552      ldfe FR_P_5 = [GR_Table_Ptr1],16          // Load P_5 for EXPL_SMALL path
1553      nop.i 999
1554}
1555;;
1556
1557{ .mfi
1558      ldfe FR_P_4 = [GR_Table_Ptr1],16          // Load P_4 for EXPL_SMALL path
1559      fma.s1 FR_P_hi = FR_Input_Y, FR_logx_hi,FR_X_lo  // logx_hi ix Y_hi
1560      nop.i 999
1561}
1562;;
1563
1564{ .mmi
1565      ldfe FR_P_3 = [GR_Table_Ptr1],16          // Load P_3 for EXPL_SMALL path
1566;;
1567      ldfe FR_P_2 = [GR_Table_Ptr1],16          // Load P_2 for EXPL_SMALL path
1568      nop.i 999
1569}
1570;;
1571
1572// N = X * Inv_log2_by_2^12
1573// By adding 1.10...0*2^63 we shift and get round_int(N_signif) in significand.
1574// We actually add 1.10...0*2^51 to X * Inv_log2 to do the same thing.
1575{ .mfi
1576      ldfe FR_P_1 = [GR_Table_Ptr1]             // Load P_1 for EXPL_SMALL path
1577      fma.s1 FR_N = FR_X, FR_INV_LN2_2TO63, FR_RSHF_2TO51
1578      nop.i 999
1579}
1580{ .mfb
1581      nop.m 999
1582      fms.s1 FR_P_lo= FR_Input_Y, FR_logx_hi, FR_P_hi  // P_hi is X
1583(p6)  br.cond.spnt POWL_Y_ALMOST_1              // Branch if |y-1| < 2^-50
1584}
1585;;
1586
1587{ .mmi
1588      getf.exp GR_Expo_X = FR_X
1589      add GR_T1_ptr   = 0x0b0, GR_table_base    // Constants_exp_64_T1
1590      add GR_T2_ptr   = 0x1b0, GR_table_base    // Constants_exp_64_T2
1591}
1592;;
1593
1594// float_N = round_int(N)
1595// The signficand of N contains the rounded integer part of X * 2^12/ln2,
1596// as a twos complement number in the lower bits (that is, it may be negative).
1597// That twos complement number (called N) is put into GR_N_fix.
1598
1599// Since N is scaled by 2^51, it must be multiplied by 2^-51
1600// before the shift constant 1.10000 * 2^63 is subtracted to yield float_N.
1601// Thus, float_N contains the floating point version of N
1602
1603
1604{ .mfi
1605      add  GR_Table_Ptr   = 0x20, GR_table_base    // Constants_exp_64_A
1606      fms.s1 FR_float_N = FR_N, FR_2TOM51, FR_RSHF // Form float_N
1607      nop.i 999
1608}
1609//     Create low part of Y(ln(x)_hi + ln(x)_lo) as P_lo
1610{ .mfi
1611      mov GR_Big_Pos_Exp = 0x3ffe               // 16382, largest safe exponent
1612      fadd.s1 FR_P_lo = FR_P_lo, FR_X_lo
1613      mov GR_Big_Neg_Exp = -0x3ffd              // -16381 smallest safe exponent
1614};;
1615
1616{ .mfi
1617      nop.m 999
1618      fmpy.s1 FR_rsq = FR_X, FR_X               // rsq = X*X for EXPL_SMALL path
1619      mov GR_vsm_expo = -70                     // Exponent for very small path
1620}
1621{ .mfi
1622      nop.m 999
1623      fma.s1 FR_poly_lo = FR_P_6, FR_X, FR_P_5  // poly_lo for EXPL_SMALL path
1624      add GR_temp = 0x1,r0                      // For tiny signif if small path
1625}
1626;;
1627
1628//
1629//      If expo_X < -6 goto exp_small
1630//
1631{ .mmi
1632      getf.sig GR_N_fix = FR_N
1633      ldfe FR_A_3 = [GR_Table_Ptr],16         // Load A_3
1634      and GR_Expo_X = GR_Expo_X, GR_exp_mask  // Get exponent of X
1635}
1636;;
1637
1638{ .mfi
1639      ldfe FR_A_2 = [GR_Table_Ptr],16         // Load A_2
1640      nop.f 999
1641      sub GR_Expo_X = GR_Expo_X, GR_exp_bias  // Get true exponent of X
1642}
1643;;
1644
1645//
1646//     If -6 > Expo_X, set P9 and branch
1647//
1648{ .mfb
1649      cmp.gt  p9, p0  =  -6, GR_Expo_X
1650      fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_X // r = X - L_hi * float_N
1651(p9)  br.cond.spnt EXPL_SMALL                  // Branch if |X| < 2^-6
1652}
1653;;
1654
1655//
1656//     If 14 <= Expo_X, set P10
1657//
1658{ .mib
1659      cmp.le  p10, p0 =  14, GR_Expo_X
1660      nop.i 999
1661(p10) br.cond.spnt EXPL_HUGE                   // Branch if |X| >= 2^14
1662}
1663;;
1664
1665//
1666//      Load single T1
1667//      Load single T2
1668//      W_1_p1 = W_1 + 1
1669//
1670{ .mmi
1671      nop.m 999
1672      nop.m 999
1673      extr.u GR_M1 = GR_N_fix, 6, 6            // Extract index M_1
1674}
1675;;
1676
1677//
1678//      k = extr.u(N_fix,0,6)
1679//
1680{ .mmi
1681      shladd GR_W1_ptr = GR_M1,3,GR_W1_ptr     // Point to W1
1682      shladd GR_T1_ptr = GR_M1,2,GR_T1_ptr     // Point to T1
1683      extr.u GR_M2 = GR_N_fix, 0, 6            // Extract index M_2
1684}
1685;;
1686
1687// N_fix is only correct up to 50 bits because of our right shift technique.
1688// Actually in the normal path we will have restricted K to about 14 bits.
1689// Somewhat arbitrarily we extract 32 bits.
1690{ .mmi
1691      ldfd  FR_W1 = [GR_W1_ptr]
1692      shladd GR_W2_ptr = GR_M2,3,GR_W2_ptr     // Point to W2
1693      extr GR_k = GR_N_fix, 12, 32             // Extract k
1694}
1695;;
1696
1697{ .mfi
1698      ldfs  FR_T1 = [GR_T1_ptr]
1699      fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r
1700      shladd GR_T2_ptr = GR_M2,2,GR_T2_ptr     // Point to T2
1701}
1702{ .mfi
1703      add GR_exp_bias_p_k = GR_exp_bias, GR_k
1704      nop.f 999
1705      cmp.gt  p14,p15 = GR_k,GR_Big_Pos_Exp
1706}
1707;;
1708
1709//
1710//      if k < big_neg_exp, set p14 and Safe=False
1711//
1712{ .mmi
1713      ldfs  FR_T2 = [GR_T2_ptr]
1714(p15) cmp.lt p14,p15 = GR_k,GR_Big_Neg_Exp
1715      nop.i 999
1716}
1717;;
1718
1719{ .mmi
1720      setf.exp FR_Scale = GR_exp_bias_p_k
1721      ldfd  FR_W2 = [GR_W2_ptr]
1722      nop.i 999
1723}
1724;;
1725
1726{ .mfi
1727      ldfe FR_A_1 = [GR_Table_Ptr],16
1728      fadd.s1 FR_r = FR_r, FR_X_cor
1729      nop.i 999
1730}
1731;;
1732
1733{ .mfi
1734      nop.m 999
1735      fadd.s1 FR_W_1_p1 = FR_W1, f1
1736      nop.i 999
1737}
1738;;
1739
1740{ .mfi
1741      nop.m 999
1742      fma.s1 FR_poly = FR_r, FR_A_3, FR_A_2
1743      nop.i 999
1744}
1745{ .mfi
1746      nop.m 999
1747      fmpy.s1 FR_rsq = FR_r, FR_r
1748      nop.i 999
1749}
1750;;
1751
1752{ .mfi
1753      nop.m 999
1754      fmpy.s1 FR_T = FR_T1, FR_T2
1755      nop.i 999
1756}
1757;;
1758
1759{ .mfi
1760      nop.m 999
1761      fma.s1 FR_W = FR_W2, FR_W_1_p1, FR_W1
1762      nop.i 999
1763}
1764;;
1765
1766{ .mfi
1767      nop.m 999
1768      fma.s1 FR_TMP1 = FR_Scale, FR_Sgn, f0
1769      nop.i 999
1770}
1771;;
1772
1773{ .mfi
1774      nop.m 999
1775      fma.s1 FR_poly = FR_r, FR_poly, FR_A_1
1776      nop.i 999
1777}
1778;;
1779
1780{ .mfi
1781      nop.m 999
1782      fma.s1 FR_TMP2 = FR_T, f1, f0            // TMP2 = Y_hi = T
1783      nop.i 999
1784}
1785;;
1786
1787{ .mfi
1788      nop.m 999
1789      fadd.s1 FR_Wp1 = FR_W, f1
1790      nop.i 999
1791}
1792;;
1793
1794{ .mfi
1795      nop.m 999
1796      fma.s1 FR_poly = FR_rsq, FR_poly,FR_r
1797      nop.i 999
1798}
1799;;
1800
1801{ .mfi
1802      nop.m 999
1803      fma.s1 FR_Tscale = FR_T, FR_TMP1, f0    // Scale * Sgn * T
1804      nop.i 999
1805}
1806{ .mfi
1807      nop.m 999
1808      fma.s1 FR_Y_lo = FR_Wp1, FR_poly, FR_W
1809      nop.i 999
1810}
1811;;
1812
1813{ .mfb
1814      nop.m 999
1815      fmpy.s1 FR_TMP3 = FR_Y_lo, FR_Tscale
1816      br.cond.sptk POWL_64_SHARED
1817}
1818;;
1819
1820
1821EXPL_SMALL:
1822// Here if |ylogx| < 2^-6
1823//
1824//     Begin creating lsb to perturb final result
1825//
1826{ .mfi
1827      setf.sig FR_temp = GR_temp
1828      fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_4
1829      cmp.lt  p12, p0 =  GR_Expo_X, GR_vsm_expo   // Test |ylogx| < 2^-70
1830}
1831{ .mfi
1832      nop.m 999
1833      fma.s1 FR_poly_hi = FR_P_2, FR_X, FR_P_1
1834      nop.i 999
1835}
1836;;
1837
1838{ .mfi
1839      nop.m 999
1840      fmpy.s1 FR_TMP2 = f1, f1
1841      nop.i 999
1842}
1843{ .mfi
1844      nop.m 999
1845      fmpy.s1 FR_TMP1 = FR_Sgn, f1
1846      nop.i 999
1847}
1848;;
1849
1850{ .mfi
1851      nop.m 999
1852      fmpy.s1 FR_r4 = FR_rsq, FR_rsq
1853(p12) cmp.eq  p15, p0 =  r0, r0                   // Set safe if |ylogx| < 2^-70
1854}
1855{ .mfb
1856      nop.m 999
1857(p12) fmpy.s1 FR_TMP3 = FR_Sgn, FR_X
1858(p12) br.cond.spnt POWL_64_SHARED                 // Branch if |ylogx| < 2^-70
1859}
1860;;
1861
1862{ .mfi
1863      nop.m 999
1864      fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_3
1865      nop.i 999
1866}
1867{ .mfi
1868      nop.m 999
1869      fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, FR_X
1870      nop.i 999
1871}
1872;;
1873
1874{ .mfi
1875      nop.m 999
1876      fma.s1 FR_Y_lo = FR_poly_lo, FR_r4, FR_poly_hi
1877      nop.i 999
1878}
1879;;
1880
1881{ .mfi
1882      nop.m 999
1883      fmpy.s1 FR_TMP3 = FR_Y_lo, FR_TMP1      // Add sign info
1884      nop.i 999
1885}
1886;;
1887
1888//
1889//     Toggle on last bit of Y_lo
1890//     Set lsb of Y_lo to 1
1891//
1892{ .mfi
1893      nop.m 999
1894      for FR_temp = FR_Y_lo,FR_temp
1895      nop.i 999
1896}
1897;;
1898
1899{ .mfb
1900      nop.m 999
1901      fmerge.se FR_TMP3 = FR_TMP3,FR_temp
1902      br.cond.sptk POWL_64_SHARED
1903}
1904;;
1905
1906
1907EXPL_HUGE:
1908// Here if |ylogx| >= 2^14
1909{ .mfi
1910      mov GR_temp = 0x0A1DC               // If X < 0, exponent -24100
1911      fcmp.gt.s1 p12, p13 =  FR_X, f0     // Test X > 0
1912      cmp.eq  p14, p15 =  r0, r0          // Set Safe to false
1913}
1914;;
1915
1916{ .mmi
1917(p12) mov GR_Mask = 0x15DC0               // If X > 0, exponent +24000
1918(p13) mov GR_Mask = 0x0A240               // If X < 0, exponent -24000
1919      nop.i 999
1920}
1921;;
1922
1923{ .mmf
1924      setf.exp FR_TMP2 = GR_Mask          // Form Y_hi = TMP2
1925(p13) setf.exp FR_Y_lo = GR_temp          // If X < 0, Y_lo = 2^-24100
1926(p12) mov FR_Y_lo = f1                    // IF X > 0, Y_lo = 1.0
1927}
1928;;
1929
1930{ .mfi
1931      nop.m 999
1932      fmpy.s1 FR_TMP1 = FR_TMP2, FR_Sgn   // TMP1 = Y_hi * Sgn
1933      nop.i 999
1934}
1935;;
1936
1937{ .mfb
1938      nop.m 999
1939      fmpy.s1 FR_TMP3 = FR_Y_lo,FR_TMP1   // TMP3 = Y_lo * (Y_hi * Sgn)
1940      br.cond.sptk POWL_64_SHARED
1941}
1942;;
1943
1944POWL_Y_ALMOST_1:
1945// Here if delta = |y-1| < 2^-50
1946//
1947//  x**(1 + delta) = x * e (ln(x)*delta) = x ( 1 + ln(x) * delta)
1948//
1949// Computation will be safe for 2^-16381 <= x < 2^16383
1950
1951{ .mfi
1952       mov GR_exp_ynear1_oflow = 0xffff + 16383
1953       fma.s1 FR_TMP1 = FR_Input_X,FR_Delta,f0
1954       and GR_exp_x = GR_exp_mask, GR_signexp_x
1955}
1956;;
1957
1958{ .mfi
1959       cmp.lt  p15, p14 =  GR_exp_x, GR_exp_ynear1_oflow
1960       fma.s1 FR_TMP2 = FR_logx_hi,f1,FR_X_lo
1961       mov GR_exp_ynear1_uflow = 0xffff - 16381
1962}
1963;;
1964
1965{ .mfb
1966(p15)  cmp.ge  p15, p14 =  GR_exp_x, GR_exp_ynear1_uflow
1967       fma.s1 FR_TMP3 = FR_Input_X,f1,f0
1968       br.cond.sptk POWL_64_SHARED
1969};;
1970
1971POWL_64_SQUARE:
1972//
1973//      Here if x not zero and y=2.
1974//
1975//      Setup for multipath code
1976//
1977{ .mfi
1978      mov GR_exp_square_oflow = 0xffff + 8192   // Exponent where x*x overflows
1979      fmerge.se FR_TMP1 = FR_Input_X, FR_Input_X
1980      and GR_exp_x = GR_exp_mask, GR_signexp_x  // Get exponent of x
1981}
1982;;
1983
1984{ .mfi
1985      cmp.lt  p15, p14 =  GR_exp_x, GR_exp_square_oflow // Decide safe/unsafe
1986      fmerge.se FR_TMP2 = FR_Input_X, FR_Input_X
1987      mov GR_exp_square_uflow = 0xffff - 8191   // Exponent where x*x underflows
1988}
1989;;
1990
1991{ .mfi
1992(p15) cmp.ge  p15, p14 =  GR_exp_x, GR_exp_square_uflow // Decide safe/unsafe
1993      fma.s1 FR_TMP3 = f0,f0,f0
1994      nop.i 999
1995}
1996;;
1997
1998//
1999//      This is the shared path that will set overflow and underflow.
2000//
2001POWL_64_SHARED:
2002
2003//
2004//      Return if no danger of over or underflow.
2005//
2006{ .mfb
2007      nop.m 999
2008      fma.s0 FR_Result = FR_TMP1, FR_TMP2, FR_TMP3
2009(p15) br.ret.sptk  b0      // Main path return if certain no over/underflow
2010}
2011;;
2012
2013//
2014//      S0 user supplied status
2015//      S2 user supplied status + WRE + TD  (Overflows)
2016//      S2 user supplied status + FZ + TD   (Underflows)
2017//
2018//
2019//     If (Safe) is true, then
2020//        Compute result using user supplied status field.
2021//        No overflow or underflow here, but perhaps inexact.
2022//        Return
2023//     Else
2024//       Determine if overflow or underflow was raised.
2025//       Fetch +/- overflow threshold for IEEE double extended
2026
2027{ .mfi
2028      nop.m 999
2029      fsetc.s2 0x7F,0x41       // For underflow test, set S2=User+TD+FTZ
2030      nop.i 999
2031}
2032;;
2033
2034{ .mfi
2035      nop.m 999
2036      fma.s2 FR_Result_small = FR_TMP1, FR_TMP2, FR_TMP3
2037      nop.i 999
2038}
2039;;
2040
2041{ .mfi
2042      nop.m 999
2043      fsetc.s2 0x7F,0x42       // For overflow test, set S2=User+TD+WRE
2044      nop.i 999
2045}
2046;;
2047
2048{ .mfi
2049      nop.m 999
2050      fma.s2 FR_Result_big = FR_TMP1, FR_TMP2,FR_TMP3
2051      nop.i 999
2052}
2053;;
2054
2055{ .mfi
2056      nop.m 999
2057      fsetc.s2 0x7F,0x40       // Reset S2=User
2058      nop.i 999
2059}
2060;;
2061
2062{ .mfi
2063      nop.m 999
2064      fclass.m p11, p0 = FR_Result_small, 0x00F // Test small result unorm/zero
2065      nop.i 999
2066}
2067;;
2068
2069{ .mfi
2070      nop.m 999
2071      fcmp.ge.s1 p8, p0 = FR_Result_big , FR_Big // Test >= + oflow threshold
2072      nop.i 999
2073}
2074;;
2075
2076{ .mfb
2077(p11) mov   GR_Parameter_TAG = 19                // Set tag for underflow
2078      fcmp.le.s1 p9, p0 = FR_Result_big, FR_NBig // Test <= - oflow threshold
2079(p11) br.cond.spnt __libm_error_region           // Branch if pow underflowed
2080}
2081;;
2082
2083{ .mfb
2084(p8)  mov   GR_Parameter_TAG = 18                // Set tag for overflow
2085      nop.f 999
2086(p8)  br.cond.spnt __libm_error_region           // Branch if pow +overflow
2087}
2088;;
2089
2090{ .mbb
2091(p9)  mov   GR_Parameter_TAG = 18                // Set tag for overflow
2092(p9)  br.cond.spnt __libm_error_region           // Branch if pow -overflow
2093      br.ret.sptk  b0                            // Branch if result really ok
2094}
2095;;
2096
2097
2098POWL_64_SPECIAL:
2099// Here if x or y is NatVal, nan, inf, or zero
2100{ .mfi
2101      nop.m 999
2102      fcmp.eq.s1 p15, p0 =  FR_Input_X, f1  // Test x=+1
2103      nop.i 999
2104}
2105;;
2106
2107{ .mfi
2108      nop.m 999
2109      fclass.m p8, p0 =  FR_Input_X, 0x143  // Test x natval, snan
2110      nop.i 999
2111}
2112;;
2113
2114{ .mfi
2115      nop.m 999
2116(p15) fcmp.eq.unc.s0 p6,p0 = FR_Input_Y, f0 // If x=1, flag invalid if y=SNaN
2117      nop.i 999
2118}
2119{ .mfb
2120      nop.m 999
2121(p15) fmpy.s0 FR_Result = f1,f1             // If x=1, result=1
2122(p15) br.ret.spnt b0                        // Exit if x=1
2123}
2124;;
2125
2126{ .mfi
2127      nop.m 999
2128      fclass.m p6, p0 =  FR_Input_Y, 0x007  // Test y zero
2129      nop.i 999
2130}
2131;;
2132
2133{ .mfi
2134      nop.m 999
2135      fclass.m p9, p0 =  FR_Input_Y, 0x143  // Test y natval, snan
2136      nop.i 999
2137}
2138;;
2139
2140{ .mfi
2141      nop.m 999
2142      fclass.m p10, p0 =  FR_Input_X, 0x083 // Test x qnan
2143      nop.i 999
2144}
2145{ .mfi
2146      nop.m 999
2147(p8)  fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If x=snan, result=qnan
2148(p6)  cmp.ne p8,p0 = r0,r0     // Don't exit if x=snan, y=0 ==> result=+1
2149}
2150;;
2151
2152{ .mfi
2153      nop.m 999
2154(p6)  fclass.m.unc p15, p0 =  FR_Input_X,0x007   // Test x=0, y=0
2155      nop.i 999
2156}
2157{ .mfb
2158      nop.m 999
2159(p9)  fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If y=snan, result=qnan
2160(p8)  br.ret.spnt b0                             // Exit if x=snan, y not 0,
2161                                                 //   result=qnan
2162}
2163;;
2164
2165{ .mfi
2166      nop.m 999
2167      fcmp.eq.s1 p7, p0 =  FR_Input_Y, f1        // Test y +1.0
2168      nop.i 999
2169}
2170{ .mfb
2171      nop.m 999
2172(p10) fmpy.s0 FR_Result = FR_Input_X, f0         // If x=qnan, result=qnan
2173(p9)  br.ret.spnt b0                             // Exit if y=snan, result=qnan
2174}
2175;;
2176
2177{ .mfi
2178      nop.m 999
2179(p6)  fclass.m.unc p8, p0 =  FR_Input_X,0x0C3    // Test x=nan, y=0
2180      nop.i 999
2181}
2182;;
2183
2184{ .mfi
2185      nop.m 999
2186(p6)  fcmp.eq.s0 p9,p0 = FR_Input_X, f0          // If y=0, flag if x denormal
2187      nop.i 999
2188}
2189{ .mfi
2190      nop.m 999
2191(p6)  fadd.s0 FR_Result = f1, f0                 // If y=0, result=1
2192      nop.i 999
2193}
2194;;
2195
2196{ .mfi
2197      nop.m 999
2198      fclass.m p11, p0 =  FR_Input_Y, 0x083      // Test y qnan
2199      nop.i 999
2200}
2201{ .mfb
2202(p15) mov GR_Parameter_TAG = 20                  // Error tag for x=0, y=0
2203(p7)  fmpy.s0 FR_Result = FR_Input_X,f1          // If y=1, result=x
2204(p15) br.cond.spnt __libm_error_region           // Branch if x=0, y=0, result=1
2205}
2206;;
2207
2208{ .mfb
2209(p8)  mov GR_Parameter_TAG = 23                  // Error tag for x=nan, y=0
2210      fclass.m p14, p0 =  FR_Input_Y, 0x023      // Test y inf
2211(p8)  br.cond.spnt __libm_error_region           // Branch if x=snan, y=0,
2212                                                 //   result=1
2213}
2214;;
2215
2216{ .mfb
2217      nop.m 999
2218      fclass.m p13, p0 =  FR_Input_X, 0x023      // Test x inf
2219(p6)  br.ret.spnt b0                             // Exit y=0, x not nan or 0,
2220                                                 //   result=1
2221}
2222;;
2223
2224{ .mfb
2225      nop.m 999
2226(p14) fcmp.eq.unc.s1 p0,p14 = FR_Input_X,f0      // Test x not 0, y=inf
2227(p7)  br.ret.spnt b0                             // Exit y=1, x not snan,
2228                                                 //   result=x
2229}
2230;;
2231
2232{ .mfb
2233      nop.m 999
2234(p10) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X  // If x=qnan, y not snan,
2235                                                 //   result=qnan
2236(p10) br.ret.spnt b0                             // Exit x=qnan, y not snan,
2237                                                 //   result=qnan
2238}
2239;;
2240
2241{ .mfb
2242      nop.m 999
2243(p11) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X  // If y=qnan, x not nan or 1,
2244                                                 //   result=qnan
2245(p11) br.ret.spnt b0                             // Exit y=qnan, x not nan or 1,
2246                                                 //   result=qnan
2247}
2248;;
2249
2250{ .mbb
2251      nop.m 999
2252(p14) br.cond.spnt POWL_64_Y_IS_INF           // Branch if y=inf, x not 1 or nan
2253(p13) br.cond.spnt POWL_64_X_IS_INF           // Branch if x=inf, y not 1 or nan
2254}
2255;;
2256
2257
2258POWL_64_X_IS_ZERO:
2259// Here if x=0, y not nan or 1 or inf or 0
2260
2261// There is logic starting here to determine if y is an integer when x = 0.
2262// If 0 < |y| < 1 then clearly y is not an integer.
2263// If |y| > 1, then the significand of y is shifted left by the size of
2264//    the exponent of y.  This preserves the lsb of the integer part + the
2265//    fractional bits.  The lsb of the integer can be tested to determine if
2266//    the integer is even or odd.  The fractional bits can be tested.  If zero,
2267//    then y is an integer.
2268//
2269{ .mfi
2270      and GR_exp_y = GR_exp_mask,GR_signexp_y   // Get biased exponent of y
2271      nop.f 999
2272      and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y
2273}
2274;;
2275
2276//
2277//     Maybe y is < 1 already, so
2278//     can never be an integer.
2279//
2280{ .mfi
2281      cmp.lt  p9, p8 = GR_exp_y,GR_exp_bias     // Test 0 < |y| < 1
2282      nop.f 999
2283      sub GR_exp_y = GR_exp_y,GR_exp_bias       // Get true exponent of y
2284}
2285;;
2286
2287//
2288//     Shift significand of y looking for nonzero bits
2289//     For y > 1, shift signif_y exp_y bits to the left
2290//     For y < 1, turn on 4 low order bits of significand of y
2291//     so that the fraction will always be non-zero
2292//
2293{ .mmi
2294(p9)  or  GR_exp_y=  0xF,GR_signif_y            // Force nonzero fraction if y<1
2295;;
2296      nop.m 999
2297(p8)  shl GR_exp_y=  GR_signif_y,GR_exp_y       // Get lsb of int + fraction
2298                                                // Wait 4 cycles to use result
2299}
2300;;
2301
2302{ .mmi
2303      nop.m 999
2304;;
2305      nop.m 999
2306      nop.i 999
2307}
2308;;
2309
2310{ .mmi
2311      nop.m 999
2312;;
2313      nop.m 999
2314      shl GR_fraction_y=  GR_exp_y,1            // Shift left 1 to get fraction
2315}
2316;;
2317
2318//
2319//     Integer part of y  shifted off.
2320//     Get y's low even or odd bit - y might not be an int.
2321//
2322{ .mii
2323      cmp.eq  p13,p0  =  GR_fraction_y, r0      // Test for y integer
2324      cmp.eq  p8,p0 =  GR_y_sign, r0            // Test for y > 0
2325;;
2326(p13) tbit.nz.unc p13,p0 = GR_exp_y, 63         // Test if y an odd integer
2327}
2328;;
2329
2330{ .mfi
2331(p13) cmp.eq.unc p13,p14 =  GR_y_sign, r0   // Test y pos odd integer
2332(p8)  fcmp.eq.s0 p12,p0 = FR_Input_Y, f0    // If x=0 and y>0 flag if y denormal
2333      nop.i 999
2334}
2335;;
2336
2337//
2338//     Return +/-0 when x=+/-0 and y is positive odd integer
2339//
2340{ .mfb
2341      nop.m 999
2342(p13) mov FR_Result = FR_Input_X            // If x=0,  y pos odd int, result=x
2343(p13) br.ret.spnt b0                        // Exit x=0, y pos odd int, result=x
2344}
2345;;
2346
2347//
2348//     Return +/-inf when x=+/-0 and y is negative odd int
2349//
2350{ .mfb
2351(p14) mov GR_Parameter_TAG = 21
2352(p14) frcpa.s0 FR_Result, p0 = f1, FR_Input_X  // Result +-inf, set Z flag
2353(p14) br.cond.spnt __libm_error_region
2354}
2355;;
2356
2357//
2358//     Return +0 when x=+/-0 and y positive and not an odd integer
2359//
2360{ .mfb
2361      nop.m 999
2362(p8)  mov FR_Result = f0      // If x=0, y>0 and not odd integer, result=+0
2363(p8)  br.ret.sptk b0          // Exit x=0, y>0 and not odd integer, result=+0
2364}
2365;;
2366
2367//
2368//     Return +inf when x=+/-0 and y is negative and not odd int
2369//
2370{ .mfb
2371      mov GR_Parameter_TAG = 21
2372      frcpa.s0 FR_Result, p10 = f1,f0   // Result +inf, raise Z flag
2373      br.cond.sptk __libm_error_region
2374}
2375;;
2376
2377
2378POWL_64_X_IS_INF:
2379//
2380// Here if x=inf, y not 1 or nan
2381//
2382{ .mfi
2383      and GR_exp_y = GR_exp_mask,GR_signexp_y   // Get biased exponent y
2384      fclass.m p13, p0 =  FR_Input_X,0x022      // Test x=-inf
2385      nop.i 999
2386}
2387;;
2388
2389{ .mfi
2390      and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y
2391      fcmp.eq.s0 p9,p0 = FR_Input_Y, f0         // Dummy to set flag if y denorm
2392      nop.i 999
2393}
2394;;
2395
2396//
2397//     Maybe y is < 1 already, so
2398//     isn't an int.
2399//
2400{ .mfi
2401(p13) cmp.lt.unc  p9, p8 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1 if x=-inf
2402      fclass.m p11, p0 =  FR_Input_X,0x021      // Test x=+inf
2403      sub GR_exp_y = GR_exp_y,GR_exp_bias       // Get true exponent y
2404}
2405;;
2406
2407//
2408//     Shift significand of y looking for nonzero bits
2409//     For y > 1, shift signif_y exp_y bits to the left
2410//     For y < 1, turn on 4 low order bits of significand of y
2411//     so that the fraction will always be non-zero
2412//
2413{ .mmi
2414(p9)  or  GR_exp_y=  0xF,GR_signif_y          // Force nonzero fraction if y<1
2415;;
2416(p11) cmp.eq.unc  p14,p12 = GR_y_sign, r0     // Test x=+inf, y>0
2417(p8)  shl GR_exp_y=  GR_signif_y,GR_exp_y     // Get lsb of int + fraction
2418                                              // Wait 4 cycles to use result
2419}
2420;;
2421
2422//
2423//     Return +inf for x=+inf, y > 0
2424//     Return +0   for x=+inf, y < 0
2425//
2426{ .mfi
2427      nop.m 999
2428(p12) mov FR_Result = f0                      // If x=+inf, y<0, result=+0
2429      nop.i 999
2430}
2431{ .mfb
2432      nop.m 999
2433(p14) fma.s0 FR_Result = FR_Input_X,f1,f0     // If x=+inf, y>0, result=+inf
2434(p11) br.ret.sptk b0                          // Exit x=+inf
2435}
2436;;
2437
2438//
2439// Here only if x=-inf.  Wait until can use result of shl...
2440//
2441{ .mmi
2442      nop.m 999
2443;;
2444      nop.m 999
2445      nop.i 999
2446}
2447;;
2448
2449{ .mfi
2450      cmp.eq  p8,p9 = GR_y_sign, r0           // Test y pos
2451      nop.f 999
2452      shl GR_fraction_y = GR_exp_y,1          // Shift left 1 to get fraction
2453}
2454;;
2455
2456{ .mmi
2457      cmp.eq  p13,p0 = GR_fraction_y, r0      // Test y integer
2458;;
2459      nop.m 999
2460(p13) tbit.nz.unc  p13,p0 = GR_exp_y, 63      // Test y odd integer
2461}
2462;;
2463
2464//
2465//     Is y even or odd?
2466//
2467{ .mii
2468(p13) cmp.eq.unc  p14,p10 = GR_y_sign, r0     // Test x=-inf, y pos odd int
2469(p13) cmp.ne.and  p8,p9 = r0,r0               // If y odd int, turn off p8,p9
2470      nop.i 999
2471}
2472;;
2473
2474//
2475//     Return -0   for x = -inf and y < 0 and odd int.
2476//     Return -Inf for x = -inf and y > 0 and odd int.
2477//
2478{ .mfi
2479      nop.m 999
2480(p10) fmerge.ns FR_Result = f0, f0      // If x=-inf, y neg odd int, result=-0
2481      nop.i 999
2482}
2483{ .mfi
2484      nop.m 999
2485(p14) fmpy.s0 FR_Result = FR_Input_X,f1 // If x=-inf, y pos odd int, result=-inf
2486      nop.i 999
2487}
2488;;
2489
2490//
2491//     Return Inf for x = -inf and y > 0 not an odd int.
2492//     Return +0  for x = -inf and y < 0 not an odd int.
2493//
2494.pred.rel "mutex",p8,p9
2495{ .mfi
2496      nop.m 999
2497(p8)  fmerge.ns FR_Result = FR_Input_X, FR_Input_X // If x=-inf, y>0 not odd int
2498                                                   //   result=+inf
2499      nop.i 999
2500}
2501{ .mfb
2502      nop.m 999
2503(p9)  fmpy.s0 FR_Result = f0,f0                    // If x=-inf, y<0 not odd int
2504                                                   //   result=+0
2505      br.ret.sptk b0                               // Exit for x=-inf
2506}
2507;;
2508
2509
2510POWL_64_Y_IS_INF:
2511// Here if y=inf, x not 1 or nan
2512//
2513//     For y = +Inf and |x| < 1  returns 0
2514//     For y = +Inf and |x| > 1  returns Inf
2515//     For y = -Inf and |x| < 1  returns Inf
2516//     For y = -Inf and |x| > 1  returns 0
2517//     For y =  Inf and |x| = 1  returns 1
2518//
2519{ .mfi
2520      nop.m 999
2521      fclass.m p8, p0 =  FR_Input_Y, 0x021    // Test y=+inf
2522      nop.i 999
2523}
2524;;
2525
2526{ .mfi
2527      nop.m 999
2528      fclass.m p9, p0 =  FR_Input_Y, 0x022    // Test y=-inf
2529      nop.i 999
2530}
2531;;
2532
2533{ .mfi
2534      nop.m 999
2535      fabs FR_X = FR_Input_X                  // Form |x|
2536      nop.i 999
2537}
2538;;
2539
2540{ .mfi
2541      nop.m 999
2542      fcmp.eq.s0 p10,p0 = FR_Input_X, f0      // flag if x denormal
2543      nop.i 999
2544}
2545;;
2546
2547{ .mfi
2548      nop.m 999
2549(p8)  fcmp.lt.unc.s1 p6, p0  =  FR_X, f1      // Test y=+inf, |x|<1
2550      nop.i 999
2551}
2552;;
2553
2554{ .mfi
2555      nop.m 999
2556(p8)  fcmp.gt.unc.s1 p7, p0  =  FR_X, f1      // Test y=+inf, |x|>1
2557      nop.i 999
2558}
2559;;
2560
2561{ .mfi
2562      nop.m 999
2563(p9)  fcmp.lt.unc.s1 p12, p0 =  FR_X, f1      // Test y=-inf, |x|<1
2564      nop.i 999
2565}
2566{ .mfi
2567      nop.m 999
2568(p6)  fmpy.s0 FR_Result = f0,f0               // If y=+inf, |x|<1, result=+0
2569      nop.i 999
2570}
2571;;
2572
2573{ .mfi
2574      nop.m 999
2575(p9)  fcmp.gt.unc.s1 p13, p0 =  FR_X, f1      // Test y=-inf, |x|>1
2576      nop.i 999
2577}
2578{ .mfi
2579      nop.m 999
2580(p7)  fmpy.s0 FR_Result = FR_Input_Y, f1      // If y=+inf, |x|>1, result=+inf
2581      nop.i 999
2582}
2583;;
2584
2585{ .mfi
2586      nop.m 999
2587      fcmp.eq.s1 p14, p0 =  FR_X, f1          // Test y=inf, |x|=1
2588      nop.i 999
2589}
2590{ .mfi
2591      nop.m 999
2592(p12) fnma.s0 FR_Result = FR_Input_Y, f1, f0  // If y=-inf, |x|<1, result=+inf
2593      nop.i 999
2594}
2595;;
2596
2597{ .mfi
2598      nop.m 999
2599(p13) mov FR_Result = f0                      // If y=-inf, |x|>1, result=+0
2600      nop.i 999
2601}
2602;;
2603
2604{ .mfb
2605      nop.m 999
2606(p14) fmpy.s0 FR_Result = f1,f1               // If y=inf, |x|=1, result=+1
2607      br.ret.sptk b0                          // Common return for y=inf
2608}
2609;;
2610
2611
2612// Here if x or y denorm/unorm
2613POWL_DENORM:
2614{ .mmi
2615      getf.sig GR_signif_Z = FR_norm_X   // Get significand of x
2616;;
2617      getf.exp GR_signexp_y = FR_norm_Y  // Get sign and exp of y
2618      nop.i 999
2619}
2620;;
2621
2622{ .mfi
2623      getf.sig GR_signif_y = FR_norm_Y   // Get significand of y
2624      nop.f 999
2625      nop.i 999
2626}
2627;;
2628
2629{ .mib
2630      getf.exp GR_signexp_x = FR_norm_X  // Get sign and exp of x
2631      extr.u GR_Index1 = GR_signif_Z, 59, 4  // Extract upper 4 signif bits of x
2632      br.cond.sptk  POWL_COMMON          // Branch back to main path
2633}
2634;;
2635
2636
2637POWL_64_UNSUPPORT:
2638//
2639//     Raise exceptions for specific
2640//     values - pseudo NaN and
2641//     infinities.
2642//     Return NaN and raise invalid
2643//
2644{ .mfb
2645      nop.m 999
2646      fmpy.s0 FR_Result = FR_Input_X,f0
2647      br.ret.sptk b0
2648}
2649;;
2650
2651POWL_64_XNEG:
2652//
2653//     Raise invalid for x < 0  and
2654//     y not an integer
2655//
2656{ .mfi
2657      nop.m 999
2658      frcpa.s0 FR_Result, p8 =  f0, f0
2659      mov GR_Parameter_TAG = 22
2660}
2661{ .mib
2662      nop.m 999
2663      nop.i 999
2664      br.cond.sptk __libm_error_region
2665}
2666;;
2667
2668POWL_64_SQRT:
2669{ .mfi
2670      nop.m 999
2671      frsqrta.s0 FR_Result,p10 = FR_save_Input_X
2672      nop.i 999 ;;
2673}
2674{ .mfi
2675      nop.m 999
2676(p10) fma.s1   f62=FR_Half,FR_save_Input_X,f0
2677      nop.i 999 ;;
2678}
2679{ .mfi
2680      nop.m 999
2681(p10) fma.s1   f63=FR_Result,FR_Result,f0
2682      nop.i 999 ;;
2683}
2684{ .mfi
2685      nop.m 999
2686(p10) fnma.s1  f32=f63,f62,FR_Half
2687      nop.i 999 ;;
2688}
2689{ .mfi
2690      nop.m 999
2691(p10) fma.s1   f33=f32,FR_Result,FR_Result
2692      nop.i 999 ;;
2693}
2694{ .mfi
2695      nop.m 999
2696(p10) fma.s1   f34=f33,f62,f0
2697      nop.i 999 ;;
2698}
2699{ .mfi
2700      nop.m 999
2701(p10) fnma.s1  f35=f34,f33,FR_Half
2702      nop.i 999 ;;
2703}
2704{ .mfi
2705      nop.m 999
2706(p10) fma.s1   f63=f35,f33,f33
2707      nop.i 999 ;;
2708}
2709{ .mfi
2710      nop.m 999
2711(p10) fma.s1   f32=FR_save_Input_X,f63,f0
2712      nop.i 999
2713}
2714{ .mfi
2715      nop.m 999
2716(p10) fma.s1   FR_Result=f63,f62,f0
2717      nop.i 999 ;;
2718}
2719{ .mfi
2720      nop.m 999
2721(p10) fma.s1   f33=f11,f63,f0
2722      nop.i 999 ;;
2723}
2724{ .mfi
2725      nop.m 999
2726(p10) fnma.s1  f34=f32,f32,FR_save_Input_X
2727      nop.i 999
2728}
2729{ .mfi
2730      nop.m 999
2731(p10) fnma.s1  f35=FR_Result,f63,FR_Half
2732      nop.i 999 ;;
2733}
2734{ .mfi
2735      nop.m 999
2736(p10) fma.s1   f62=f33,f34,f32
2737      nop.i 999
2738}
2739{ .mfi
2740      nop.m 999
2741(p10) fma.s1   f63=f33,f35,f33
2742      nop.i 999 ;;
2743}
2744{ .mfi
2745      nop.m 999
2746(p10) fnma.s1  f32=f62,f62,FR_save_Input_X
2747      nop.i 999 ;;
2748}
2749{ .mfb
2750      nop.m 999
2751(p10) fma.s0 FR_Result=f32,f63,f62
2752      br.ret.sptk   b0                // Exit for x > 0, y = 0.5
2753}
2754;;
2755
2756GLOBAL_LIBM_END(powl)
2757libm_alias_ldouble_other (pow, pow)
2758
2759
2760LOCAL_LIBM_ENTRY(__libm_error_region)
2761.prologue
2762{ .mfi
2763        add   GR_Parameter_Y=-32,sp             // Parameter 2 value
2764        nop.f 0
2765.save   ar.pfs,GR_SAVE_PFS
2766        mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
2767}
2768{ .mfi
2769.fframe 64
2770        add sp=-64,sp                           // Create new stack
2771        nop.f 0
2772        mov GR_SAVE_GP=gp                       // Save gp
2773};;
2774{ .mmi
2775        stfe [GR_Parameter_Y] = FR_Input_Y,16   // Save Parameter 2 on stack
2776        add GR_Parameter_X = 16,sp              // Parameter 1 address
2777.save   b0, GR_SAVE_B0
2778        mov GR_SAVE_B0=b0                       // Save b0
2779};;
2780.body
2781{ .mib
2782        stfe [GR_Parameter_X] = FR_save_Input_X // Store Parameter 1 on stack
2783        add   GR_Parameter_RESULT = 0,GR_Parameter_Y
2784        nop.b 0                                 // Parameter 3 address
2785}
2786{ .mib
2787        stfe [GR_Parameter_Y] = FR_Result       // Store Parameter 3 on stack
2788        add   GR_Parameter_Y = -16,GR_Parameter_Y
2789        br.call.sptk b0=__libm_error_support#   // Call error handling function
2790};;
2791{ .mmi
2792        add   GR_Parameter_RESULT = 48,sp
2793        nop.m 0
2794        nop.i 0
2795};;
2796{ .mmi
2797        ldfe  f8 = [GR_Parameter_RESULT]        // Get return result off stack
2798.restore sp
2799        add   sp = 64,sp                        // Restore stack pointer
2800        mov   b0 = GR_SAVE_B0                   // Restore return address
2801};;
2802{ .mib
2803        mov   gp = GR_SAVE_GP                  // Restore gp
2804        mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
2805        br.ret.sptk     b0                     // Return
2806};;
2807
2808LOCAL_LIBM_END(__libm_error_region#)
2809.type   __libm_error_support#,@function
2810.global __libm_error_support#
2811