1.file "asinhl.s"
2
3
4// Copyright (c) 2000 - 2003, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39//*********************************************************************
40//
41// History:
42// 09/04/01 Initial version
43// 09/13/01 Performance improved, symmetry problems fixed
44// 10/10/01 Performance improved, split issues removed
45// 12/11/01 Changed huges_logp to not be global
46// 05/20/02 Cleaned up namespace and sf0 syntax
47// 02/10/03 Reordered header: .section, .global, .proc, .align;
48//          used data8 for long double table values
49//
50//*********************************************************************
51//
52// API
53//==============================================================
54// long double asinhl(long double);
55//
56// Overview of operation
57//==============================================================
58//
59// There are 6 paths:
60// 1. x = 0, [S,Q]Nan or +/-INF
61//    Return asinhl(x) = x + x;
62//
63// 2. x = + denormal
64//    Return asinhl(x) = x - x^2;
65//
66// 3. x = - denormal
67//    Return asinhl(x) = x + x^2;
68//
69// 4. 'Near 0': max denormal < |x| < 1/128
70//    Return asinhl(x) = sign(x)*(x+x^3*(c3+x^2*(c5+x^2*(c7+x^2*(c9)))));
71//
72// 5. 'Huges': |x| > 2^63
73//    Return asinhl(x) = sign(x)*(logl(2*x));
74//
75// 6. 'Main path': 1/128 < |x| < 2^63
76//    b_hi + b_lo = x + sqrt(x^2 + 1);
77//    asinhl(x) = sign(x)*(log_special(b_hi, b_lo));
78//
79// Algorithm description
80//==============================================================
81//
82// Main path algorithm
83// ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
84// *************************************************************************
85//
86// There are 3 parts of x+sqrt(x^2+1) computation:
87//
88//  1) p2 = (p2_hi+p2_lo) = x^2+1 obtaining
89//     ------------------------------------
90//     p2_hi = x2_hi + 1, where x2_hi = x * x;
91//     p2_lo = x2_lo + p1_lo, where
92//                            x2_lo = FMS(x*x-x2_hi),
93//                            p1_lo = (1 - p2_hi) + x2_hi;
94//
95//  2) g = (g_hi+g_lo) = sqrt(p2) = sqrt(p2_hi+p2_lo)
96//     ----------------------------------------------
97//     r = invsqrt(p2_hi) (8-bit reciprocal square root approximation);
98//     g = p2_hi * r (first 8 bit-approximation of sqrt);
99//
100//     h = 0.5 * r;
101//     e = 0.5 - g * h;
102//     g = g * e + g (second 16 bit-approximation of sqrt);
103//
104//     h = h * e + h;
105//     e = 0.5 - g * h;
106//     g = g * e + g (third 32 bit-approximation of sqrt);
107//
108//     h = h * e + h;
109//     e = 0.5 - g * h;
110//     g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
111//
112//     Remainder computation:
113//     h = h * e + h;
114//     d = (p2_hi - g_hi * g_hi) + p2_lo;
115//     g_lo = d * h;
116//
117//  3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2+1)
118//     -------------------------------------------------------------------
119//     b_hi = (g_hi + x) + gl;
120//     b_lo = (g_hi - b_hi) + x + gl;
121//
122//  Now we pass b presented as sum b_hi + b_lo to special version
123//  of logl function which accept a pair of arguments as
124//  'mutiprecision' value.
125//
126//  Special log algorithm overview
127//  ================================
128//   Here we use a table lookup method. The basic idea is that in
129//   order to compute logl(Arg) = logl (Arg-1) for an argument Arg in [1,2),
130//   we construct a value G such that G*Arg is close to 1 and that
131//   logl(1/G) is obtainable easily from a table of values calculated
132//   beforehand. Thus
133//
134//      logl(Arg) = logl(1/G) + logl((G*Arg - 1))
135//
136//   Because |G*Arg - 1| is small, the second term on the right hand
137//   side can be approximated by a short polynomial. We elaborate
138//   this method in four steps.
139//
140//   Step 0: Initialization
141//
142//   We need to calculate logl( X ). Obtain N, S_hi such that
143//
144//      X = 2^N * ( S_hi + S_lo )   exactly
145//
146//   where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
147//   that |S_lo| <= ulp(S_hi).
148//
149//   For the special version of logl: S_lo = b_lo
150//   !-----------------------------------------------!
151//
152//   Step 1: Argument Reduction
153//
154//   Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
155//
156//      G := G_1 * G_2 * G_3
157//      r := (G * S_hi - 1) + G * S_lo
158//
159//   These G_j's have the property that the product is exactly
160//   representable and that |r| < 2^(-12) as a result.
161//
162//   Step 2: Approximation
163//
164//   logl(1 + r) is approximated by a short polynomial poly(r).
165//
166//   Step 3: Reconstruction
167//
168//   Finally,
169//
170//   logl( X )   =   logl( 2^N * (S_hi + S_lo) )
171//                 ~=~  N*logl(2) + logl(1/G) + logl(1 + r)
172//                 ~=~  N*logl(2) + logl(1/G) + poly(r).
173//
174//   For detailed description see logl or log1pl function, regular path.
175//
176// Registers used
177//==============================================================
178// Floating Point registers used:
179// f8, input
180// f32 -> f101 (70 registers)
181
182// General registers used:
183// r32 -> r57 (26 registers)
184
185// Predicate registers used:
186// p6 -> p11
187// p6  for '0, NaNs, Inf' path
188// p7  for '+ denormals' path
189// p8  for 'near 0' path
190// p9  for 'huges' path
191// p10 for '- denormals' path
192// p11 for negative values
193//
194// Data tables
195//==============================================================
196
197RODATA
198.align 64
199
200// C7, C9 'near 0' polynomial coefficients
201LOCAL_OBJECT_START(Poly_C_near_0_79)
202data8 0xF8DC939BBEDD5A54, 0x00003FF9
203data8 0xB6DB6DAB21565AC5, 0x0000BFFA
204LOCAL_OBJECT_END(Poly_C_near_0_79)
205
206// C3, C5 'near 0' polynomial coefficients
207LOCAL_OBJECT_START(Poly_C_near_0_35)
208data8 0x999999999991D582, 0x00003FFB
209data8 0xAAAAAAAAAAAAAAA9, 0x0000BFFC
210LOCAL_OBJECT_END(Poly_C_near_0_35)
211
212// Q coeffs
213LOCAL_OBJECT_START(Constants_Q)
214data4  0x00000000,0xB1721800,0x00003FFE,0x00000000
215data4  0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
216data4  0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
217data4  0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
218data4  0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
219data4  0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
220LOCAL_OBJECT_END(Constants_Q)
221
222// Z1 - 16 bit fixed
223LOCAL_OBJECT_START(Constants_Z_1)
224data4  0x00008000
225data4  0x00007879
226data4  0x000071C8
227data4  0x00006BCB
228data4  0x00006667
229data4  0x00006187
230data4  0x00005D18
231data4  0x0000590C
232data4  0x00005556
233data4  0x000051EC
234data4  0x00004EC5
235data4  0x00004BDB
236data4  0x00004925
237data4  0x0000469F
238data4  0x00004445
239data4  0x00004211
240LOCAL_OBJECT_END(Constants_Z_1)
241
242// G1 and H1 - IEEE single and h1 - IEEE double
243LOCAL_OBJECT_START(Constants_G_H_h1)
244data4  0x3F800000,0x00000000
245data8  0x0000000000000000
246data4  0x3F70F0F0,0x3D785196
247data8  0x3DA163A6617D741C
248data4  0x3F638E38,0x3DF13843
249data8  0x3E2C55E6CBD3D5BB
250data4  0x3F579430,0x3E2FF9A0
251data8  0xBE3EB0BFD86EA5E7
252data4  0x3F4CCCC8,0x3E647FD6
253data8  0x3E2E6A8C86B12760
254data4  0x3F430C30,0x3E8B3AE7
255data8  0x3E47574C5C0739BA
256data4  0x3F3A2E88,0x3EA30C68
257data8  0x3E20E30F13E8AF2F
258data4  0x3F321640,0x3EB9CEC8
259data8  0xBE42885BF2C630BD
260data4  0x3F2AAAA8,0x3ECF9927
261data8  0x3E497F3497E577C6
262data4  0x3F23D708,0x3EE47FC5
263data8  0x3E3E6A6EA6B0A5AB
264data4  0x3F1D89D8,0x3EF8947D
265data8  0xBDF43E3CD328D9BE
266data4  0x3F17B420,0x3F05F3A1
267data8  0x3E4094C30ADB090A
268data4  0x3F124920,0x3F0F4303
269data8  0xBE28FBB2FC1FE510
270data4  0x3F0D3DC8,0x3F183EBF
271data8  0x3E3A789510FDE3FA
272data4  0x3F088888,0x3F20EC80
273data8  0x3E508CE57CC8C98F
274data4  0x3F042108,0x3F29516A
275data8  0xBE534874A223106C
276LOCAL_OBJECT_END(Constants_G_H_h1)
277
278// Z2 - 16 bit fixed
279LOCAL_OBJECT_START(Constants_Z_2)
280data4  0x00008000
281data4  0x00007F81
282data4  0x00007F02
283data4  0x00007E85
284data4  0x00007E08
285data4  0x00007D8D
286data4  0x00007D12
287data4  0x00007C98
288data4  0x00007C20
289data4  0x00007BA8
290data4  0x00007B31
291data4  0x00007ABB
292data4  0x00007A45
293data4  0x000079D1
294data4  0x0000795D
295data4  0x000078EB
296LOCAL_OBJECT_END(Constants_Z_2)
297
298// G2 and H2 - IEEE single and h2 - IEEE double
299LOCAL_OBJECT_START(Constants_G_H_h2)
300data4  0x3F800000,0x00000000
301data8  0x0000000000000000
302data4  0x3F7F00F8,0x3B7F875D
303data8  0x3DB5A11622C42273
304data4  0x3F7E03F8,0x3BFF015B
305data8  0x3DE620CF21F86ED3
306data4  0x3F7D08E0,0x3C3EE393
307data8  0xBDAFA07E484F34ED
308data4  0x3F7C0FC0,0x3C7E0586
309data8  0xBDFE07F03860BCF6
310data4  0x3F7B1880,0x3C9E75D2
311data8  0x3DEA370FA78093D6
312data4  0x3F7A2328,0x3CBDC97A
313data8  0x3DFF579172A753D0
314data4  0x3F792FB0,0x3CDCFE47
315data8  0x3DFEBE6CA7EF896B
316data4  0x3F783E08,0x3CFC15D0
317data8  0x3E0CF156409ECB43
318data4  0x3F774E38,0x3D0D874D
319data8  0xBE0B6F97FFEF71DF
320data4  0x3F766038,0x3D1CF49B
321data8  0xBE0804835D59EEE8
322data4  0x3F757400,0x3D2C531D
323data8  0x3E1F91E9A9192A74
324data4  0x3F748988,0x3D3BA322
325data8  0xBE139A06BF72A8CD
326data4  0x3F73A0D0,0x3D4AE46F
327data8  0x3E1D9202F8FBA6CF
328data4  0x3F72B9D0,0x3D5A1756
329data8  0xBE1DCCC4BA796223
330data4  0x3F71D488,0x3D693B9D
331data8  0xBE049391B6B7C239
332LOCAL_OBJECT_END(Constants_G_H_h2)
333
334// G3 and H3 - IEEE single and h3 - IEEE double
335LOCAL_OBJECT_START(Constants_G_H_h3)
336data4  0x3F7FFC00,0x38800100
337data8  0x3D355595562224CD
338data4  0x3F7FF400,0x39400480
339data8  0x3D8200A206136FF6
340data4  0x3F7FEC00,0x39A00640
341data8  0x3DA4D68DE8DE9AF0
342data4  0x3F7FE400,0x39E00C41
343data8  0xBD8B4291B10238DC
344data4  0x3F7FDC00,0x3A100A21
345data8  0xBD89CCB83B1952CA
346data4  0x3F7FD400,0x3A300F22
347data8  0xBDB107071DC46826
348data4  0x3F7FCC08,0x3A4FF51C
349data8  0x3DB6FCB9F43307DB
350data4  0x3F7FC408,0x3A6FFC1D
351data8  0xBD9B7C4762DC7872
352data4  0x3F7FBC10,0x3A87F20B
353data8  0xBDC3725E3F89154A
354data4  0x3F7FB410,0x3A97F68B
355data8  0xBD93519D62B9D392
356data4  0x3F7FAC18,0x3AA7EB86
357data8  0x3DC184410F21BD9D
358data4  0x3F7FA420,0x3AB7E101
359data8  0xBDA64B952245E0A6
360data4  0x3F7F9C20,0x3AC7E701
361data8  0x3DB4B0ECAABB34B8
362data4  0x3F7F9428,0x3AD7DD7B
363data8  0x3D9923376DC40A7E
364data4  0x3F7F8C30,0x3AE7D474
365data8  0x3DC6E17B4F2083D3
366data4  0x3F7F8438,0x3AF7CBED
367data8  0x3DAE314B811D4394
368data4  0x3F7F7C40,0x3B03E1F3
369data8  0xBDD46F21B08F2DB1
370data4  0x3F7F7448,0x3B0BDE2F
371data8  0xBDDC30A46D34522B
372data4  0x3F7F6C50,0x3B13DAAA
373data8  0x3DCB0070B1F473DB
374data4  0x3F7F6458,0x3B1BD766
375data8  0xBDD65DDC6AD282FD
376data4  0x3F7F5C68,0x3B23CC5C
377data8  0xBDCDAB83F153761A
378data4  0x3F7F5470,0x3B2BC997
379data8  0xBDDADA40341D0F8F
380data4  0x3F7F4C78,0x3B33C711
381data8  0x3DCD1BD7EBC394E8
382data4  0x3F7F4488,0x3B3BBCC6
383data8  0xBDC3532B52E3E695
384data4  0x3F7F3C90,0x3B43BAC0
385data8  0xBDA3961EE846B3DE
386data4  0x3F7F34A0,0x3B4BB0F4
387data8  0xBDDADF06785778D4
388data4  0x3F7F2CA8,0x3B53AF6D
389data8  0x3DCC3ED1E55CE212
390data4  0x3F7F24B8,0x3B5BA620
391data8  0xBDBA31039E382C15
392data4  0x3F7F1CC8,0x3B639D12
393data8  0x3D635A0B5C5AF197
394data4  0x3F7F14D8,0x3B6B9444
395data8  0xBDDCCB1971D34EFC
396data4  0x3F7F0CE0,0x3B7393BC
397data8  0x3DC7450252CD7ADA
398data4  0x3F7F04F0,0x3B7B8B6D
399data8  0xBDB68F177D7F2A42
400LOCAL_OBJECT_END(Constants_G_H_h3)
401
402// Assembly macros
403//==============================================================
404
405// Floating Point Registers
406
407FR_Arg          = f8
408FR_Res          = f8
409FR_AX           = f32
410FR_XLog_Hi      = f33
411FR_XLog_Lo      = f34
412
413    // Special logl registers
414FR_Y_hi         = f35
415FR_Y_lo         = f36
416
417FR_Scale        = f37
418FR_X_Prime      = f38
419FR_S_hi         = f39
420FR_W            = f40
421FR_G            = f41
422
423FR_H            = f42
424FR_wsq          = f43
425FR_w4           = f44
426FR_h            = f45
427FR_w6           = f46
428
429FR_G2           = f47
430FR_H2           = f48
431FR_poly_lo      = f49
432FR_P8           = f50
433FR_poly_hi      = f51
434
435FR_P7           = f52
436FR_h2           = f53
437FR_rsq          = f54
438FR_P6           = f55
439FR_r            = f56
440
441FR_log2_hi      = f57
442FR_log2_lo      = f58
443
444FR_float_N      = f59
445FR_Q4           = f60
446
447FR_G3           = f61
448FR_H3           = f62
449FR_h3           = f63
450
451FR_Q3           = f64
452FR_Q2           = f65
453FR_1LN10_hi     = f66
454
455FR_Q1           = f67
456FR_1LN10_lo     = f68
457FR_P5           = f69
458FR_rcub         = f70
459
460FR_Neg_One      = f71
461FR_Z            = f72
462FR_AA           = f73
463FR_BB           = f74
464FR_S_lo         = f75
465FR_2_to_minus_N = f76
466
467
468    // Huge & Main path prolog registers
469FR_Half         = f77
470FR_Two          = f78
471FR_X2           = f79
472FR_P2           = f80
473FR_P2L          = f81
474FR_Rcp          = f82
475FR_GG           = f83
476FR_HH           = f84
477FR_EE           = f85
478FR_DD           = f86
479FR_GL           = f87
480FR_A            = f88
481FR_AL           = f89
482FR_B            = f90
483FR_BL           = f91
484FR_Tmp          = f92
485
486    // Near 0 & Huges path prolog registers
487FR_C3           = f93
488FR_C5           = f94
489FR_C7           = f95
490FR_C9           = f96
491
492FR_X3           = f97
493FR_X4           = f98
494FR_P9           = f99
495FR_P5           = f100
496FR_P3           = f101
497
498
499// General Purpose Registers
500
501    // General prolog registers
502GR_PFS          = r32
503GR_TwoN7        = r40
504GR_TwoP63       = r41
505GR_ExpMask      = r42
506GR_ArgExp       = r43
507GR_Half         = r44
508
509    // Near 0 path prolog registers
510GR_Poly_C_35    = r45
511GR_Poly_C_79    = r46
512
513    // Special logl registers
514GR_Index1       = r34
515GR_Index2       = r35
516GR_signif       = r36
517GR_X_0          = r37
518GR_X_1          = r38
519GR_X_2          = r39
520GR_Z_1          = r40
521GR_Z_2          = r41
522GR_N            = r42
523GR_Bias         = r43
524GR_M            = r44
525GR_Index3       = r45
526GR_exp_2tom80   = r45
527GR_exp_mask     = r47
528GR_exp_2tom7    = r48
529GR_ad_ln10      = r49
530GR_ad_tbl_1     = r50
531GR_ad_tbl_2     = r51
532GR_ad_tbl_3     = r52
533GR_ad_q         = r53
534GR_ad_z_1       = r54
535GR_ad_z_2       = r55
536GR_ad_z_3       = r56
537GR_minus_N      = r57
538
539
540
541.section .text
542GLOBAL_LIBM_ENTRY(asinhl)
543
544{ .mfi
545      alloc     GR_PFS        = ar.pfs,0,27,0,0
546      fma.s1    FR_P2         = FR_Arg, FR_Arg, f1  // p2 = x^2 + 1
547      mov   	GR_Half       = 0xfffe              // 0.5's exp
548}
549{ .mfi
550      addl      GR_Poly_C_79  = @ltoff(Poly_C_near_0_79), gp // C7, C9 coeffs
551      fma.s1    FR_X2         = FR_Arg, FR_Arg, f0           // Obtain x^2
552      addl      GR_Poly_C_35  = @ltoff(Poly_C_near_0_35), gp // C3, C5 coeffs
553};;
554
555{ .mfi
556      getf.exp  GR_ArgExp     = FR_Arg        // get arument's exponent
557      fabs      FR_AX         = FR_Arg        // absolute value of argument
558      mov       GR_TwoN7      = 0xfff8        // 2^-7 exp
559}
560{ .mfi
561      ld8       GR_Poly_C_79  = [GR_Poly_C_79] // get actual coeff table address
562      fma.s0       FR_Two        = f1, f1, f1        // construct 2.0
563      mov       GR_ExpMask    = 0x1ffff        // mask for exp
564};;
565
566{ .mfi
567      ld8       GR_Poly_C_35  = [GR_Poly_C_35] // get actual coeff table address
568      fclass.m  p6,p0         = FR_Arg, 0xe7   // if arg NaN inf zero
569      mov       GR_TwoP63     = 0x1003e        // 2^63 exp
570}
571{ .mfi
572      addl      GR_ad_z_1     = @ltoff(Constants_Z_1#),gp
573      nop.f 0
574      nop.i 0
575};;
576
577{ .mfi
578      setf.exp	FR_Half       = GR_Half              // construct 0.5
579      fclass.m  p7,p0         = FR_Arg, 0x09  //  if arg + denorm
580      and       GR_ArgExp     = GR_ExpMask, GR_ArgExp // select exp
581}
582{ .mfb
583      ld8       GR_ad_z_1     = [GR_ad_z_1]   // Get pointer to Constants_Z_1
584      nop.f 0
585      nop.b 0
586};;
587{ .mfi
588      ldfe      FR_C9         = [GR_Poly_C_79],16 // load C9
589      fclass.m  p10,p0        = FR_Arg, 0x0a    //  if arg - denorm
590      cmp.gt    p8, p0        = GR_TwoN7,  GR_ArgExp // if arg < 2^-7 ('near 0')
591}
592{ .mfb
593      cmp.le    p9, p0        = GR_TwoP63, GR_ArgExp  // if arg > 2^63 ('huges')
594(p6)  fma.s0    FR_Res        = FR_Arg,f1,FR_Arg     // r = a + a
595(p6)  br.ret.spnt b0                            // return
596};;
597// (X^2 + 1) computation
598{ .mfi
599(p8)  ldfe      FR_C5         = [GR_Poly_C_35],16        // load C5
600      fms.s1    FR_Tmp        = f1, f1, FR_P2           // Tmp = 1 - p2
601      add       GR_ad_tbl_1   = 0x040, GR_ad_z_1    // Point to Constants_G_H_h1
602}
603{ .mfb
604(p8)  ldfe      FR_C7         = [GR_Poly_C_79],16        // load C7
605(p7)  fnma.s0   FR_Res        =  FR_Arg,FR_Arg,FR_Arg // r = a - a*a
606(p7)  br.ret.spnt b0                              // return
607};;
608
609{ .mfi
610(p8)  ldfe      FR_C3         = [GR_Poly_C_35],16     // load C3
611      fcmp.lt.s1 p11, p12      = FR_Arg, f0     // if arg is negative
612      add       GR_ad_q       = -0x60, GR_ad_z_1    // Point to Constants_P
613}
614{ .mfb
615      add       GR_ad_z_2     = 0x140, GR_ad_z_1    // Point to Constants_Z_2
616(p10) fma.s0    FR_Res        =  FR_Arg,FR_Arg,FR_Arg // r = a + a*a
617(p10) br.ret.spnt b0                             // return
618};;
619
620{ .mfi
621      add       GR_ad_tbl_2   = 0x180, GR_ad_z_1    // Point to Constants_G_H_h2
622      frsqrta.s1 FR_Rcp, p0   = FR_P2           // Rcp = 1/p2 reciprocal appr.
623      add       GR_ad_tbl_3   = 0x280, GR_ad_z_1    // Point to Constants_G_H_h3
624}
625{ .mfi
626      nop.m 0
627      fms.s1    FR_P2L        = FR_AX, FR_AX, FR_X2 //low part of p2=fma(X*X-p2)
628      mov       GR_Bias       = 0x0FFFF            // Create exponent bias
629};;
630
631{ .mfb
632      nop.m 0
633(p9)  fms.s1    FR_XLog_Hi    = FR_Two, FR_AX, f0  // Hi  of log1p arg = 2*X - 1
634(p9)  br.cond.spnt huges_logl                      // special version of log1p
635};;
636
637{ .mfb
638      ldfe      FR_log2_hi    = [GR_ad_q],16      // Load log2_hi
639(p8)  fma.s1    FR_X3         = FR_X2, FR_Arg, f0        // x^3 = x^2 * x
640(p8)  br.cond.spnt near_0                                // Go to near 0 branch
641};;
642
643{ .mfi
644      ldfe      FR_log2_lo    = [GR_ad_q],16      // Load log2_lo
645      nop.f 0
646      nop.i 0
647};;
648
649{ .mfi
650      ldfe      FR_Q4         = [GR_ad_q],16          // Load Q4
651      fma.s1    FR_Tmp        = FR_Tmp, f1, FR_X2       // Tmp = Tmp + x^2
652      mov       GR_exp_mask   = 0x1FFFF        // Create exponent mask
653};;
654
655{ .mfi
656      ldfe      FR_Q3         = [GR_ad_q],16   // Load Q3
657      fma.s1    FR_GG         = FR_Rcp, FR_P2, f0        // g = Rcp * p2
658                                               // 8 bit Newton Raphson iteration
659      nop.i 0
660}
661{ .mfi
662      nop.m 0
663      fma.s1    FR_HH         = FR_Half, FR_Rcp, f0      // h = 0.5 * Rcp
664      nop.i 0
665};;
666{ .mfi
667      ldfe      FR_Q2         = [GR_ad_q],16      // Load Q2
668      fnma.s1   FR_EE         = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
669      nop.i 0
670}
671{ .mfi
672      nop.m 0
673      fma.s1    FR_P2L        = FR_Tmp, f1, FR_P2L // low part of p2 = Tmp + p2l
674      nop.i 0
675};;
676
677{ .mfi
678      ldfe      FR_Q1         = [GR_ad_q]                // Load Q1
679      fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
680                                              // 16 bit Newton Raphson iteration
681      nop.i 0
682}
683{ .mfi
684      nop.m 0
685      fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
686      nop.i 0
687};;
688
689{ .mfi
690      nop.m 0
691      fnma.s1   FR_EE         = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
692      nop.i 0
693};;
694
695{ .mfi
696      nop.m 0
697      fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
698                                              // 32 bit Newton Raphson iteration
699      nop.i 0
700}
701{ .mfi
702      nop.m 0
703      fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
704      nop.i 0
705};;
706
707{ .mfi
708      nop.m 0
709      fnma.s1   FR_EE         = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
710      nop.i 0
711};;
712
713{ .mfi
714      nop.m 0
715      fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
716                                              // 64 bit Newton Raphson iteration
717      nop.i 0
718}
719{ .mfi
720      nop.m 0
721      fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
722      nop.i 0
723};;
724
725{ .mfi
726      nop.m 0
727      fnma.s1   FR_DD         = FR_GG, FR_GG, FR_P2  // Remainder d = g * g - p2
728      nop.i 0
729}
730{ .mfi
731      nop.m 0
732      fma.s1    FR_XLog_Hi     = FR_AX, f1, FR_GG // bh = z + gh
733      nop.i 0
734};;
735
736{ .mfi
737      nop.m 0
738      fma.s1    FR_DD         = FR_DD, f1, FR_P2L       // add p2l: d = d + p2l
739      nop.i 0
740};;
741
742
743{ .mfi
744      getf.sig  GR_signif     = FR_XLog_Hi     // Get significand of x+1
745      fmerge.ns FR_Neg_One    = f1, f1         // Form -1.0
746      mov       GR_exp_2tom7  = 0x0fff8        // Exponent of 2^-7
747};;
748
749{ .mfi
750      nop.m 0
751      fma.s1    FR_GL         = FR_DD, FR_HH, f0        // gl = d * h
752      extr.u    GR_Index1     = GR_signif, 59, 4    // Get high 4 bits of signif
753}
754{ .mfi
755      nop.m 0
756      fma.s1    FR_XLog_Hi     = FR_DD,  FR_HH, FR_XLog_Hi // bh = bh + gl
757      nop.i 0
758};;
759
760{ .mmi
761      shladd    GR_ad_z_1     = GR_Index1, 2, GR_ad_z_1  // Point to Z_1
762      shladd    GR_ad_tbl_1   = GR_Index1, 4, GR_ad_tbl_1  // Point to G_1
763      extr.u    GR_X_0        = GR_signif, 49, 15 // Get high 15 bits of signif.
764};;
765
766{ .mmi
767      ld4       GR_Z_1        = [GR_ad_z_1]    // Load Z_1
768      nop.m 0
769      nop.i 0
770};;
771
772{ .mmi
773      ldfps     FR_G, FR_H    = [GR_ad_tbl_1],8     // Load G_1, H_1
774      nop.m 0
775      nop.i 0
776};;
777
778{ .mfi
779      nop.m 0
780      fms.s1    FR_XLog_Lo     = FR_GG,  f1,   FR_XLog_Hi // bl = gh - bh
781      pmpyshr2.u GR_X_1       = GR_X_0,GR_Z_1,15  // Get bits 30-15 of X_0 * Z_1
782};;
783
784// WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
785// "DEAD" ZONE!
786
787{ .mfi
788      nop.m 0
789      nop.f 0
790      nop.i 0
791};;
792
793{ .mfi
794      nop.m 0
795      fmerge.se FR_S_hi       =  f1,FR_XLog_Hi            // Form |x+1|
796      nop.i 0
797};;
798
799{ .mmi
800      getf.exp  GR_N          =  FR_XLog_Hi    // Get N = exponent of x+1
801      ldfd      FR_h          = [GR_ad_tbl_1]        // Load h_1
802      nop.i 0
803};;
804
805{ .mfi
806      nop.m 0
807      nop.f 0
808      extr.u    GR_Index2     = GR_X_1, 6, 4      // Extract bits 6-9 of X_1
809};;
810
811
812{ .mfi
813      shladd    GR_ad_tbl_2   = GR_Index2, 4, GR_ad_tbl_2  // Point to G_2
814      fma.s1    FR_XLog_Lo    = FR_XLog_Lo, f1, FR_AX // bl = bl + x
815      mov       GR_exp_2tom80 = 0x0ffaf           // Exponent of 2^-80
816}
817{ .mfi
818      shladd    GR_ad_z_2     = GR_Index2, 2, GR_ad_z_2  // Point to Z_2
819      nop.f 0
820      sub       GR_N          = GR_N, GR_Bias // sub bias from exp
821};;
822
823{ .mmi
824      ldfps     FR_G2, FR_H2  = [GR_ad_tbl_2],8       // Load G_2, H_2
825      ld4       GR_Z_2        = [GR_ad_z_2]                // Load Z_2
826      sub       GR_minus_N    = GR_Bias, GR_N         // Form exponent of 2^(-N)
827};;
828
829{ .mmi
830      ldfd      FR_h2         = [GR_ad_tbl_2]             // Load h_2
831      nop.m 0
832      nop.i 0
833};;
834
835{ .mmi
836      setf.sig  FR_float_N    = GR_N        // Put integer N into rightmost sign
837      setf.exp  FR_2_to_minus_N = GR_minus_N   // Form 2^(-N)
838      pmpyshr2.u GR_X_2       = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
839};;
840
841// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
842// BECAUSE OF POSSIBLE 10 CLOCKS STALL!
843// So we can negate Q coefficients there for negative values
844
845{ .mfi
846      nop.m 0
847(p11) fma.s1    FR_Q1         = FR_Q1, FR_Neg_One, f0 // Negate Q1
848      nop.i 0
849}
850{ .mfi
851      nop.m 0
852      fma.s1    FR_XLog_Lo     = FR_XLog_Lo, f1, FR_GL // bl = bl + gl
853      nop.i 0
854};;
855
856{ .mfi
857      nop.m 0
858(p11) fma.s1    FR_Q2         = FR_Q2, FR_Neg_One, f0 // Negate Q2
859      nop.i 0
860};;
861
862{ .mfi
863      nop.m 0
864(p11) fma.s1    FR_Q3         = FR_Q3, FR_Neg_One, f0 // Negate Q3
865      nop.i 0
866};;
867
868{ .mfi
869      nop.m 0
870(p11) fma.s1    FR_Q4         = FR_Q4, FR_Neg_One, f0 // Negate Q4
871      extr.u    GR_Index3     = GR_X_2, 1, 5         // Extract bits 1-5 of X_2
872};;
873
874{ .mfi
875      shladd    GR_ad_tbl_3   = GR_Index3, 4, GR_ad_tbl_3  // Point to G_3
876      nop.f 0
877      nop.i 0
878};;
879
880{ .mfi
881      ldfps     FR_G3, FR_H3  = [GR_ad_tbl_3],8   // Load G_3, H_3
882      nop.f 0
883      nop.i 0
884};;
885
886{ .mfi
887      ldfd      FR_h3         = [GR_ad_tbl_3]            // Load h_3
888	  fcvt.xf   FR_float_N    = FR_float_N
889      nop.i 0
890};;
891
892{ .mfi
893      nop.m 0
894      fmpy.s1   FR_G          = FR_G, FR_G2              // G = G_1 * G_2
895      nop.i 0
896}
897{ .mfi
898      nop.m 0
899      fadd.s1   FR_H          = FR_H, FR_H2              // H = H_1 + H_2
900      nop.i 0
901};;
902
903{ .mfi
904      nop.m 0
905      fadd.s1   FR_h          = FR_h, FR_h2              // h = h_1 + h_2
906      nop.i 0
907}
908{ .mfi
909      nop.m 0
910      fma.s1    FR_S_lo       = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^-N
911      nop.i 0
912};;
913
914{ .mfi
915      nop.m 0
916      fmpy.s1   FR_G          = FR_G, FR_G3             // G = (G_1 * G_2) * G_3
917      nop.i 0
918}
919{ .mfi
920      nop.m 0
921      fadd.s1   FR_H          = FR_H, FR_H3             // H = (H_1 + H_2) + H_3
922      nop.i 0
923};;
924
925{ .mfi
926      nop.m 0
927      fadd.s1   FR_h          = FR_h, FR_h3             // h = (h_1 + h_2) + h_3
928      nop.i 0
929};;
930
931{ .mfi
932      nop.m 0
933      fms.s1    FR_r          = FR_G, FR_S_hi, f1           // r = G * S_hi - 1
934      nop.i 0
935}
936{ .mfi
937      nop.m 0
938      fma.s1    FR_Y_hi       = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
939      nop.i 0
940};;
941
942{ .mfi
943      nop.m 0
944      fma.s1    FR_h          = FR_float_N, FR_log2_lo, FR_h  // h=N*log2_lo+h
945      nop.i 0
946}
947{ .mfi
948      nop.m 0
949      fma.s1    FR_r          = FR_G, FR_S_lo, FR_r  // r=G*S_lo+(G*S_hi-1)
950      nop.i 0
951};;
952
953{ .mfi
954      nop.m 0
955      fma.s1    FR_poly_lo    = FR_r, FR_Q4, FR_Q3      // poly_lo = r * Q4 + Q3
956      nop.i 0
957}
958{ .mfi
959      nop.m 0
960      fmpy.s1   FR_rsq        = FR_r, FR_r              // rsq = r * r
961      nop.i 0
962};;
963
964{ .mfi
965      nop.m 0
966      fma.s1    FR_poly_lo    = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
967      nop.i 0
968}
969{ .mfi
970      nop.m 0
971      fma.s1    FR_rcub       = FR_rsq, FR_r, f0        // rcub = r^3
972      nop.i 0
973};;
974
975.pred.rel "mutex",p12,p11
976{ .mfi
977      nop.m 0
978(p12) fma.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
979      nop.i 0
980}
981{ .mfi
982      nop.m 0
983(p11) fms.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r     // poly_hi = Q1*rsq + r
984      nop.i 0
985};;
986
987
988.pred.rel "mutex",p12,p11
989{ .mfi
990      nop.m 0
991(p12) fma.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
992      nop.i 0
993}
994{ .mfi
995      nop.m 0
996(p11) fms.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
997      nop.i 0
998}
999;;
1000
1001{ .mfi
1002      nop.m 0
1003      fadd.s0   FR_Y_lo       = FR_poly_hi, FR_poly_lo
1004	                                                     // Y_lo=poly_hi+poly_lo
1005      nop.i 0
1006}
1007{ .mfi
1008      nop.m 0
1009(p11) fma.s0    FR_Y_hi       = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1010      nop.i 0
1011};;
1012
1013{ .mfb
1014      nop.m 0
1015      fadd.s0   FR_Res        = FR_Y_lo,FR_Y_hi    // Result=Y_lo+Y_hi
1016      br.ret.sptk   b0                         // Common exit for 2^-7 < x < inf
1017};;
1018
1019// * SPECIAL VERSION OF LOGL FOR HUGE ARGUMENTS *
1020
1021huges_logl:
1022{ .mfi
1023      getf.sig  GR_signif     = FR_XLog_Hi     // Get significand of x+1
1024      fmerge.ns FR_Neg_One    = f1, f1         // Form -1.0
1025      mov       GR_exp_2tom7  = 0x0fff8        // Exponent of 2^-7
1026};;
1027
1028{ .mfi
1029      add       GR_ad_tbl_1   = 0x040, GR_ad_z_1    // Point to Constants_G_H_h1
1030      nop.f 0
1031      add       GR_ad_q       = -0x60, GR_ad_z_1    // Point to Constants_P
1032}
1033{ .mfi
1034      add       GR_ad_z_2     = 0x140, GR_ad_z_1    // Point to Constants_Z_2
1035      nop.f 0
1036      add       GR_ad_tbl_2   = 0x180, GR_ad_z_1    // Point to Constants_G_H_h2
1037};;
1038
1039{ .mfi
1040      nop.m 0
1041      nop.f 0
1042      extr.u    GR_Index1     = GR_signif, 59, 4    // Get high 4 bits of signif
1043}
1044{ .mfi
1045      add       GR_ad_tbl_3   = 0x280, GR_ad_z_1    // Point to Constants_G_H_h3
1046      nop.f 0
1047      nop.i 0
1048};;
1049
1050{ .mfi
1051      shladd    GR_ad_z_1     = GR_Index1, 2, GR_ad_z_1  // Point to Z_1
1052      nop.f 0
1053      extr.u    GR_X_0        = GR_signif, 49, 15 // Get high 15 bits of signif.
1054};;
1055
1056{ .mfi
1057      ld4       GR_Z_1        = [GR_ad_z_1]    // Load Z_1
1058      nop.f 0
1059      mov       GR_exp_mask   = 0x1FFFF        // Create exponent mask
1060}
1061{ .mfi
1062      shladd    GR_ad_tbl_1   = GR_Index1, 4, GR_ad_tbl_1  // Point to G_1
1063      nop.f 0
1064      mov       GR_Bias       = 0x0FFFF            // Create exponent bias
1065};;
1066
1067{ .mfi
1068      ldfps     FR_G, FR_H    = [GR_ad_tbl_1],8     // Load G_1, H_1
1069      fmerge.se FR_S_hi       =  f1,FR_XLog_Hi            // Form |x+1|
1070      nop.i 0
1071};;
1072
1073{ .mmi
1074      getf.exp  GR_N          =  FR_XLog_Hi          // Get N = exponent of x+1
1075      ldfd      FR_h          = [GR_ad_tbl_1]        // Load h_1
1076      nop.i 0
1077};;
1078
1079{ .mfi
1080      ldfe      FR_log2_hi    = [GR_ad_q],16      // Load log2_hi
1081      nop.f 0
1082      pmpyshr2.u GR_X_1       = GR_X_0,GR_Z_1,15  // Get bits 30-15 of X_0 * Z_1
1083};;
1084
1085// WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1086// "DEAD" ZONE!
1087
1088{ .mmi
1089      ldfe      FR_log2_lo    = [GR_ad_q],16      // Load log2_lo
1090      sub       GR_N          = GR_N, GR_Bias
1091      mov       GR_exp_2tom80 = 0x0ffaf           // Exponent of 2^-80
1092};;
1093
1094{ .mfi
1095      ldfe      FR_Q4         = [GR_ad_q],16          // Load Q4
1096      nop.f 0
1097      sub       GR_minus_N    = GR_Bias, GR_N         // Form exponent of 2^(-N)
1098};;
1099
1100{ .mmf
1101      ldfe      FR_Q3         = [GR_ad_q],16   // Load Q3
1102      setf.sig  FR_float_N    = GR_N        // Put integer N into rightmost sign
1103      nop.f 0
1104};;
1105
1106{ .mmi
1107      nop.m 0
1108      ldfe      FR_Q2         = [GR_ad_q],16      // Load Q2
1109      extr.u    GR_Index2     = GR_X_1, 6, 4      // Extract bits 6-9 of X_1
1110};;
1111
1112{ .mmi
1113      ldfe      FR_Q1         = [GR_ad_q]                // Load Q1
1114      shladd    GR_ad_z_2     = GR_Index2, 2, GR_ad_z_2  // Point to Z_2
1115      nop.i 0
1116};;
1117
1118{ .mmi
1119      ld4       GR_Z_2        = [GR_ad_z_2]                // Load Z_2
1120      shladd    GR_ad_tbl_2   = GR_Index2, 4, GR_ad_tbl_2  // Point to G_2
1121      nop.i 0
1122};;
1123
1124{ .mmi
1125      ldfps     FR_G2, FR_H2  = [GR_ad_tbl_2],8       // Load G_2, H_2
1126      nop.m 0
1127      nop.i 0
1128};;
1129
1130{ .mfi
1131      ldfd      FR_h2         = [GR_ad_tbl_2]             // Load h_2
1132      nop.f 0
1133      nop.i 0
1134}
1135{ .mfi
1136      setf.exp  FR_2_to_minus_N = GR_minus_N   // Form 2^(-N)
1137      nop.f 0
1138      nop.i 0
1139};;
1140
1141{ .mfi
1142      nop.m 0
1143      nop.f 0
1144      pmpyshr2.u GR_X_2       = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
1145};;
1146
1147// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1148// "DEAD" ZONE!
1149// JUST HAVE TO INSERT 3 NOP CYCLES (nothing to do here)
1150
1151{ .mfi
1152      nop.m 0
1153      nop.f 0
1154      nop.i 0
1155};;
1156
1157{ .mfi
1158      nop.m 0
1159      nop.f 0
1160      nop.i 0
1161};;
1162
1163{ .mfi
1164      nop.m 0
1165      nop.f 0
1166      nop.i 0
1167};;
1168
1169{ .mfi
1170      nop.m 0
1171(p11) fma.s1    FR_Q4         = FR_Q4, FR_Neg_One, f0 // Negate Q4
1172      extr.u    GR_Index3     = GR_X_2, 1, 5          // Extract bits 1-5 of X_2
1173 };;
1174
1175{ .mfi
1176      shladd    GR_ad_tbl_3   = GR_Index3, 4, GR_ad_tbl_3  // Point to G_3
1177	  fcvt.xf   FR_float_N    = FR_float_N
1178      nop.i 0
1179}
1180{ .mfi
1181      nop.m 0
1182(p11) fma.s1    FR_Q3         = FR_Q3, FR_Neg_One, f0 // Negate Q3
1183      nop.i 0
1184};;
1185
1186{ .mfi
1187      ldfps     FR_G3, FR_H3  = [GR_ad_tbl_3],8   // Load G_3, H_3
1188(p11) fma.s1    FR_Q2         = FR_Q2, FR_Neg_One, f0 // Negate Q2
1189      nop.i 0
1190}
1191{ .mfi
1192      nop.m 0
1193(p11) fma.s1    FR_Q1         = FR_Q1, FR_Neg_One, f0 // Negate Q1
1194      nop.i 0
1195};;
1196
1197{ .mfi
1198      ldfd      FR_h3         = [GR_ad_tbl_3]            // Load h_3
1199      fmpy.s1   FR_G          = FR_G, FR_G2              // G = G_1 * G_2
1200      nop.i 0
1201}
1202{ .mfi
1203      nop.m 0
1204      fadd.s1   FR_H          = FR_H, FR_H2              // H = H_1 + H_2
1205      nop.i 0
1206};;
1207
1208{ .mmf
1209      nop.m 0
1210      nop.m 0
1211      fadd.s1   FR_h          = FR_h, FR_h2              // h = h_1 + h_2
1212};;
1213
1214{ .mfi
1215      nop.m 0
1216      fmpy.s1   FR_G          = FR_G, FR_G3             // G = (G_1 * G_2) * G_3
1217      nop.i 0
1218}
1219{ .mfi
1220      nop.m 0
1221      fadd.s1   FR_H          = FR_H, FR_H3             // H = (H_1 + H_2) + H_3
1222      nop.i 0
1223};;
1224
1225{ .mfi
1226      nop.m 0
1227      fadd.s1   FR_h          = FR_h, FR_h3             // h = (h_1 + h_2) + h_3
1228      nop.i 0
1229};;
1230
1231{ .mfi
1232      nop.m 0
1233      fms.s1    FR_r          = FR_G, FR_S_hi, f1           // r = G * S_hi - 1
1234      nop.i 0
1235}
1236{ .mfi
1237      nop.m 0
1238      fma.s1    FR_Y_hi       = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1239      nop.i 0
1240};;
1241
1242{ .mfi
1243      nop.m 0
1244      fma.s1    FR_h          = FR_float_N, FR_log2_lo, FR_h  // h=N*log2_lo+h
1245      nop.i 0
1246};;
1247
1248{ .mfi
1249      nop.m 0
1250      fma.s1    FR_poly_lo    = FR_r, FR_Q4, FR_Q3      // poly_lo = r * Q4 + Q3
1251      nop.i 0
1252}
1253{ .mfi
1254      nop.m 0
1255      fmpy.s1   FR_rsq        = FR_r, FR_r              // rsq = r * r
1256      nop.i 0
1257};;
1258
1259{ .mfi
1260      nop.m 0
1261      fma.s1    FR_poly_lo    = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1262      nop.i 0
1263}
1264{ .mfi
1265      nop.m 0
1266      fma.s1    FR_rcub       = FR_rsq, FR_r, f0        // rcub = r^3
1267      nop.i 0
1268};;
1269
1270.pred.rel "mutex",p12,p11
1271{ .mfi
1272      nop.m 0
1273(p12) fma.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1274      nop.i 0
1275}
1276{ .mfi
1277      nop.m 0
1278(p11) fms.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1279      nop.i 0
1280};;
1281
1282
1283.pred.rel "mutex",p12,p11
1284{ .mfi
1285      nop.m 0
1286(p12) fma.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1287      nop.i 0
1288}
1289{ .mfi
1290      nop.m 0
1291(p11) fms.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1292      nop.i 0
1293};;
1294
1295{ .mfi
1296      nop.m 0
1297      fadd.s0   FR_Y_lo       = FR_poly_hi, FR_poly_lo  // Y_lo=poly_hi+poly_lo
1298      nop.i 0
1299}
1300{ .mfi
1301      nop.m 0
1302(p11) fma.s0    FR_Y_hi       = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1303      nop.i 0
1304};;
1305
1306{ .mfb
1307      nop.m 0
1308      fadd.s0   FR_Res        = FR_Y_lo,FR_Y_hi    // Result=Y_lo+Y_hi
1309      br.ret.sptk   b0                         // Common exit for 2^-7 < x < inf
1310};;
1311
1312// NEAR ZERO POLYNOMIAL INTERVAL
1313near_0:
1314{ .mfi
1315      nop.m 0
1316      fma.s1    FR_X4         = FR_X2, FR_X2, f0 // x^4 = x^2 * x^2
1317      nop.i 0
1318};;
1319
1320{ .mfi
1321      nop.m 0
1322      fma.s1    FR_P9         = FR_C9,FR_X2,FR_C7  // p9 = C9*x^2 + C7
1323      nop.i 0
1324}
1325{ .mfi
1326      nop.m 0
1327      fma.s1    FR_P5         = FR_C5,FR_X2,FR_C3  // p5 = C5*x^2 + C3
1328      nop.i 0
1329};;
1330
1331{ .mfi
1332      nop.m 0
1333      fma.s1    FR_P3         = FR_P9,FR_X4,FR_P5  // p3 = p9*x^4 + p5
1334      nop.i 0
1335};;
1336
1337{ .mfb
1338      nop.m 0
1339      fma.s0    FR_Res        = FR_P3,FR_X3,FR_Arg // res = p3*C3 + x
1340      br.ret.sptk   b0                          // Near 0 path return
1341};;
1342
1343GLOBAL_LIBM_END(asinhl)
1344libm_alias_ldouble_other (asinh, asinh)
1345