1.file "libm_lgammal.s"
2
3
4// Copyright (c) 2002 - 2005, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
25// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
28// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code,and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39//*********************************************************************
40//
41// History:
42// 03/28/02  Original version
43// 05/20/02  Cleaned up namespace and sf0 syntax
44// 08/21/02  Added support of SIGN(GAMMA(x)) calculation
45// 09/26/02  Algorithm description improved
46// 10/21/02  Now it returns SIGN(GAMMA(x))=-1 for negative zero
47// 02/10/03  Reordered header: .section, .global, .proc, .align
48// 03/31/05  Reformatted delimiters between data tables
49//
50//*********************************************************************
51//
52// Function: __libm_lgammal(long double x, int* signgam, int szsigngam)
53// computes the principal value of the logarithm of the GAMMA function
54// of x. Signum of GAMMA(x) is stored to memory starting at the address
55// specified by the signgam.
56//
57//*********************************************************************
58//
59// Resources Used:
60//
61//    Floating-Point Registers: f8 (Input and Return Value)
62//                              f9-f15
63//                              f32-f127
64//
65//    General Purpose Registers:
66//      r2, r3, r8-r11, r14-r31
67//      r32-r65
68//      r66-r69 (Used to pass arguments to error handling routine)
69//
70//    Predicate Registers:      p6-p15
71//
72//*********************************************************************
73//
74// IEEE Special Conditions:
75//
76//    __libm_lgammal(+inf) = +inf
77//    __libm_lgammal(-inf) = QNaN
78//    __libm_lgammal(+/-0) = +inf
79//    __libm_lgammal(x<0, x - integer) = QNaN
80//    __libm_lgammal(SNaN) = QNaN
81//    __libm_lgammal(QNaN) = QNaN
82//
83//*********************************************************************
84//
85// ALGORITHM DESCRIPTION
86//
87// Below we suppose that there is a log(z) function which takes a long
88// double argument and returns the result as a pair of long double numbers
89// lnHi and lnLo (such that the sum lnHi + lnLo provides ~80 correct bits
90// of significand). The algorithm for such a log(z) function is described
91// below.
92// Also, in this algorithm description we use the following notational
93// conventions:
94// a) pair A = (Ahi, Alo) means number A represented as sum of Ahi and Alo
95// b) C = A + B = (Ahi, Alo) + (Bhi, Blo) means multi-precision addition.
96//    The result is C = (Chi, Clo). Note that Clo is not necessarily
97//    equal to Alo + Blo.
98// c) D = A*B = (Ahi, Alo)*(Bhi, Blo) = (Dhi, Dlo) means multi-precision
99//    multiplication.
100//
101// So, lgammal has the following computational paths:
102// 1) |x| < 0.5
103//    P = A1*|x| + A2*|x|^2 + ... + A22*|x|^22
104//    A1, A2, A3 represented as a sum of two double precision
105//    numbers and multi-precision computations are used for 3 higher
106//    terms of the polynomial. We get polynomial as a sum of two
107//    double extended numbers: P = (Phi, Plo)
108//    1.1) x > 0
109//         lgammal(x) = P - log(|x|) = (Phi, Plo) - (lnHi(|x|), lnLo(|x|))
110//    1.2) x < 0
111//         lgammal(x) = -P - log(|x|) - log(sin(Pi*x)/(Pi*x))
112//         P and log(|x|) are computed by the same way as in 1.1;
113//         - log(sin(Pi*x)/(Pi*x)) is approximated by a polynomial Plnsin.
114//         Plnsin:= fLnSin2*|x|^2 + fLnSin4*|x|^4 + ... + fLnSin36*|x|^36
115//         The first coefficient of Plnsin is represented as sum of two
116//         double precision numbers (fLnSin2, fLnSin2L). Multi-precision
117//         computations for higher two terms of Plnsin are used.
118//         So, the final result is reconstructed by the following formula
119//         lgammal(x) = (-(Phi, Plo) - (lnHi(|x|), lnLo(|x|))) -
120//                      - (PlnsinHi,PlnsinLo)
121//
122// 2)    0.5 <= x <   0.75  -> t = x - 0.625
123//     -0.75 <  x <= -0.5   -> t = x + 0.625
124//      2.25 <= x <   4.0   -> t = x/2 - 1.5
125//       4.0 <= x <   8.0   -> t = x/4 - 1.5
126//       -0.5 < x <= -0.40625 -> t = x + 0.5
127//       -2.6005859375 < x <= -2.5 -> t = x + 2.5
128//       1.3125 <= x < 1.5625 -> t = x - LOC_MIN, where LOC_MIN is point in
129//                                   which lgammal has local minimum. Exact
130//                                   value can be found in the table below,
131//                                   approximate value is ~1.46
132//
133//    lgammal(x) is approximated by the polynomial of 25th degree: P25(t)
134//    P25(t) = A0 + A1*t + ... + A25*t^25 = (Phi, Plo) + t^4*P21(t),
135//    where
136//    (Phi, Plo) is sum of four highest terms of the polynomial P25(t):
137//    (Phi, Plo) = ((A0, A0L) + (A1, A1L)*t) + t^2 *((A2, A2L) + (A3, A3L)*t),
138//    (Ai, AiL) - coefficients represented as pairs of DP numbers.
139//
140//    P21(t) = (PolC(t)*t^8 + PolD(t))*t^8 + PolE(t),
141//    where
142//    PolC(t) = C21*t^5 + C20*t^4 + ... + C16,
143//    C21 = A25, C20 = A24, ..., C16 = A20
144//
145//    PolD(t) = D7*t^7 + D6*t^6 + ... + D0,
146//    D7 = A19, D6 = A18, ..., D0 = A12
147//
148//    PolE(t) = E7*t^7 + E6*t^6 + ... + E0,
149//    E7 = A11, E6 = A10, ..., E0 = A4
150//
151//    Cis and Dis are represented as double precision numbers,
152//    Eis are represented as double extended numbers.
153//
154// 3) 0.75 <=  x < 1.3125   -> t = x - 1.0
155//    1.5625 <= x < 2.25   -> t = x - 2.0
156//    lgammal(x) is approximated by the polynomial of 25th degree: P25(t)
157//    P25(t) = A1*t + ... + A25*t^25, and computations are carried out
158//    by similar way as in the previous case
159//
160// 4) 10.0 < x <= Overflow Bound ("positive Stirling" range)
161//    lgammal(x) is approximated using Stirling's formula:
162//    lgammal(x) ~ ((x*(lnHi(x) - 1, lnLo(x))) - 0.5*(lnHi(x), lnLo(x))) +
163//                 + ((Chi, Clo) + S(1/x))
164//    where
165//    C = (Chi, Clo) - pair of double precision numbers representing constant
166//    0.5*ln(2*Pi);
167//    S(1/x) = 1/x * (B2 + B4*(1/x)^2 + ... + B20*(1/x)^18), B2, ..., B20 are
168//    Bernoulli numbers. S is computed in native precision and then added to
169//    Clo;
170//    lnHi(x) - 1 is computed in native precision and the multiprecision
171//    multiplication (x, 0) *(lnHi(x) - 1, lnLo(x)) is used.
172//
173// 5) -INF < x <= -2^63, any negative integer < 0
174//    All numbers in this range are integers -> error handler is called
175//
176// 6) -2^63 < x <= -0.75 ("negative Stirling" range), x is "far" from root,
177//    lgammal(-t) for positive t is approximated using the following formula:
178//    lgammal(-t) = -lgammal(t)-log(t)-log(|dT|)+log(sin(Pi*|dT|)/(Pi*|dT|))
179//        where dT = -t -round_to_nearest_integer(-t)
180//    Last item is approximated by the same polynomial as described in 1.2.
181//    We split the whole range into three subranges due to different ways of
182//    approximation of the first terms.
183//    6.1) -2^63 < x < -6.0 ("negative Stirling" range)
184//       lgammal(t) is approximated exactly as in #4. The only difference is that
185//       for the -13.0 < x < -6.0 subrange, instead of Bernoulli numbers we use
186//       their minimax approximation on this range.
187//       log(t), log(|dT|) are approximated by the log routine mentioned above.
188//    6.2) -6.0 < x <= -0.75, |x + 1|> 2^(-7)
189//       log(t), log(|dT|) are approximated by the log routine mentioned above,
190//       lgammal(t) is approximated by polynomials of the 25th degree similar
191//       to ones from #2. Arguments z of the polynomials are as follows
192//       a) 0.75 <= t < 1.0 - 2^(-7),  z = 2*t - 1.5
193//       b) 1.0 - 2^(-7)  < t < 2.0,   z = t - 1.5
194//       c) 2.0  < t < 3.0,   z = t/2 - 1.5
195//       d) 3.0  < t < 4.0,   z = t/2 - 1.5. Notice, that range reduction is
196//          the same as in case c) but the set of coefficients is different
197//       e) 4.0  < t < 6.0,   z = t/4 - 1.5
198//    6.3) |x + 1| <= 2^(-7)
199//       log(1 + (x-1)) is approximated by Taylor series,
200//       log(sin(Pi*|dT|)/(Pi*|dT|)) is still approximated by polynomial but
201//       it has just 4th degree.
202//       log(|dT|) is approximated by the log routine mentioned above.
203//       lgammal(-x) is approximated by polynomial of 8th degree from (-x + 1).
204//
205// 7) -20.0 < x < -2.0, x falls in root "neighbourhood".
206//    "Neighbourhood" means that |lgammal(x)| < epsilon, where epsilon is
207//    different for every root (and it is stored in the table), but typically
208//    it is ~ 0.15. There are 35 roots significant from "double extended"
209//    point of view. We split all the roots into two subsets: "left" and "right"
210//    roots. Considering [-(N+1), -N] range we call root as "left" one if it
211//    lies closer to -(N+1) and "right" otherwise. There is no "left" root in
212//    the [-20, -19] range (it exists, but is insignificant for double extended
213//    precision). To determine if x falls in root "neighbourhood" we store
214//    significands of all the 35 roots as well as epsilon values (expressed
215//    by the left and right bound).
216//    In these ranges we approximate lgammal(x) by polynomial series of 19th
217//    degree:
218//    lgammal(x) = P19(t) = A0 + A1*t + ...+ A19*t^19, where t = x - EDP_Root,
219//    EDP_Root is the exact value of the corresponding root rounded to double
220//    extended precision. So, we have 35 different polynomials which make our
221//    table rather big. We may hope that x falls in root "neighbourhood"
222//    quite rarely -> there might be no need in frequent use of different
223//    polynomials.
224//    A0, A1, A2, A3 are represented as pairs of double precision numbers,
225//    A4, A5 are long doubles, and to decrease the size of the table we
226//    keep the rest of coefficients in just double precision
227//
228//*********************************************************************
229// Algorithm for log(X) = (lnHi(X), lnLo(X))
230//
231//   ALGORITHM
232//
233//   Here we use a table lookup method. The basic idea is that in
234//   order to compute logl(Arg) for an argument Arg in [1,2), we
235//   construct a value G such that G*Arg is close to 1 and that
236//   logl(1/G) is obtainable easily from a table of values calculated
237//   beforehand. Thus
238//
239//      logl(Arg) = logl(1/G) + logl(G*Arg)
240//                = logl(1/G) + logl(1 + (G*Arg - 1))
241//
242//   Because |G*Arg - 1| is small, the second term on the right hand
243//   side can be approximated by a short polynomial. We elaborate
244//   this method in four steps.
245//
246//   Step 0: Initialization
247//
248//   We need to calculate logl( X ). Obtain N, S_hi such that
249//
250//      X = 2^N * S_hi exactly
251//
252//   where S_hi in [1,2)
253//
254//   Step 1: Argument Reduction
255//
256//   Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
257//
258//      G := G_1 * G_2 * G_3
259//      r := (G * S_hi - 1)
260//
261//   These G_j's have the property that the product is exactly
262//   representable and that |r| < 2^(-12) as a result.
263//
264//   Step 2: Approximation
265//
266//
267//   logl(1 + r) is approximated by a short polynomial poly(r).
268//
269//   Step 3: Reconstruction
270//
271//
272//   Finally, logl( X ) is given by
273//
274//   logl( X )   =   logl( 2^N * S_hi )
275//                 ~=~  N*logl(2) + logl(1/G) + logl(1 + r)
276//                 ~=~  N*logl(2) + logl(1/G) + poly(r).
277//
278//   IMPLEMENTATION
279//
280//   Step 0. Initialization
281//   ----------------------
282//
283//   Z := X
284//   N := unbiased exponent of Z
285//   S_hi := 2^(-N) * Z
286//
287//   Step 1. Argument Reduction
288//   --------------------------
289//
290//   Let
291//
292//      Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63
293//
294//   We obtain G_1, G_2, G_3 by the following steps.
295//
296//
297//      Define          X_0 := 1.d_1 d_2 ... d_14. This is extracted
298//                      from S_hi.
299//
300//      Define          A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated
301//                      to lsb = 2^(-4).
302//
303//      Define          index_1 := [ d_1 d_2 d_3 d_4 ].
304//
305//      Fetch           Z_1 := (1/A_1) rounded UP in fixed point with
306//      fixed point     lsb = 2^(-15).
307//                      Z_1 looks like z_0.z_1 z_2 ... z_15
308//                      Note that the fetching is done using index_1.
309//                      A_1 is actually not needed in the implementation
310//                      and is used here only to explain how the value
311//                      Z_1 is defined.
312//
313//      Fetch           G_1 := (1/A_1) truncated to 21 sig. bits.
314//      floating pt.    Again, fetching is done using index_1. A_1
315//                      explains how G_1 is defined.
316//
317//      Calculate       X_1 := X_0 * Z_1 truncated to lsb = 2^(-14)
318//                           = 1.0 0 0 0 d_5 ... d_14
319//                      This is accomplished by integer multiplication.
320//                      It is proved that X_1 indeed always begins
321//                      with 1.0000 in fixed point.
322//
323//
324//      Define          A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
325//                      truncated to lsb = 2^(-8). Similar to A_1,
326//                      A_2 is not needed in actual implementation. It
327//                      helps explain how some of the values are defined.
328//
329//      Define          index_2 := [ d_5 d_6 d_7 d_8 ].
330//
331//      Fetch           Z_2 := (1/A_2) rounded UP in fixed point with
332//      fixed point     lsb = 2^(-15). Fetch done using index_2.
333//                      Z_2 looks like z_0.z_1 z_2 ... z_15
334//
335//      Fetch           G_2 := (1/A_2) truncated to 21 sig. bits.
336//      floating pt.
337//
338//      Calculate       X_2 := X_1 * Z_2 truncated to lsb = 2^(-14)
339//                           = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14
340//                      This is accomplished by integer multiplication.
341//                      It is proved that X_2 indeed always begins
342//                      with 1.00000000 in fixed point.
343//
344//
345//      Define          A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1.
346//                      This is 2^(-14) + X_2 truncated to lsb = 2^(-13).
347//
348//      Define          index_3 := [ d_9 d_10 d_11 d_12 d_13 ].
349//
350//      Fetch           G_3 := (1/A_3) truncated to 21 sig. bits.
351//      floating pt.    Fetch is done using index_3.
352//
353//      Compute         G := G_1 * G_2 * G_3.
354//
355//      This is done exactly since each of G_j only has 21 sig. bits.
356//
357//      Compute
358//
359//               r := (G*S_hi - 1)
360//
361//
362//  Step 2. Approximation
363//  ---------------------
364//
365//   This step computes an approximation to logl( 1 + r ) where r is the
366//   reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13);
367//   thus logl(1+r) can be approximated by a short polynomial:
368//
369//      logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5
370//
371//
372//  Step 3. Reconstruction
373//  ----------------------
374//
375//   This step computes the desired result of logl(X):
376//
377//      logl(X)  =   logl( 2^N * S_hi )
378//               =   N*logl(2) + logl( S_hi )
379//               =   N*logl(2) + logl(1/G) +
380//                      logl(1 + G*S_hi - 1 )
381//
382//   logl(2), logl(1/G_j) are stored as pairs of (single,double) numbers:
383//   log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are
384//   single-precision numbers and the low parts are double precision
385//   numbers. These have the property that
386//
387//      N*log2_hi + SUM ( log1byGj_hi )
388//
389//   is computable exactly in double-extended precision (64 sig. bits).
390//   Finally
391//
392//      lnHi(X) := N*log2_hi + SUM ( log1byGj_hi )
393//      lnLo(X) := poly_hi + [ poly_lo +
394//              ( SUM ( log1byGj_lo ) + N*log2_lo ) ]
395//
396//
397//*********************************************************************
398// General Purpose Registers
// Symbolic names for the general registers used by this routine. Each line
// below only binds a name to a register. Several names are bound to the
// same register (see the trailing notes), so changing the use of one such
// name also affects every other name sharing that register.
399// scratch registers
400rPolDataPtr              = r2
401rLnSinDataPtr            = r3
402rExpX                    = r8
403rSignifX                 = r9
404rDelta                   = r10
405rSignExpX                = r11
406GR_ad_z_1                = r14 // NOTE(review): GR_ad_z_1 is bound again below (= r30); confirm which binding is intended and whether this one is dead
407r17Ones                  = r15
408GR_Index1                = r16
409rSignif1andQ             = r17
410GR_X_0                   = r18
411GR_X_1                   = r19
412GR_X_2                   = r20
413GR_Z_1                   = r21
414GR_Z_2                   = r22
415GR_N                     = r23
416rExpHalf                 = r24
417rExp8                    = r25
418rX0Dx                    = r25 // shares r25 with rExp8
419GR_ad_tbl_1              = r26
420GR_ad_tbl_2              = r27
421GR_ad_tbl_3              = r28
422GR_ad_q                  = r29
423GR_ad_z_1                = r30 // second binding of GR_ad_z_1 (first one above is r14)
424GR_ad_z_2                = r31
425// stacked registers
426rPFS_SAVED               = r32
427GR_ad_z_3                = r33
428rSgnGamAddr              = r34 // presumably the signgam pointer argument from the function header - TODO confirm
429rSgnGamSize              = r35 // presumably the szsigngam argument from the function header - TODO confirm
430rLogDataPtr              = r36
431rZ1offsett               = r37
432rTmpPtr                  = r38
433rTmpPtr2                 = r39
434rTmpPtr3                 = r40
435rExp2                    = r41
436rExp2tom7                = r42
437rZ625                    = r42 // shares r42 with rExp2tom7
438rExpOne                  = r43
439rNegSingularity          = r44
440rXint                    = r45
441rTbl1Addr                = r46
442rTbl2Addr                = r47
443rTbl3Addr                = r48
444rZ2Addr                  = r49
445rRootsAddr               = r50
446rRootsBndAddr            = r51
447rRoot                    = r52
448rRightBound              = r53
449rLeftBound               = r54
450rSignifDx                = r55
451rBernulliPtr             = r56
452rLnSinTmpPtr             = r56 // shares r56 with rBernulliPtr
453rIndex1Dx                = r57
454rIndexPol                = r58
455GR_Index3                = r59
456GR_Index2                = r60
457rSgnGam                  = r61
458rXRnd                    = r62
459
// Names with the SAVE prefix: presumably hold b0, gp and ar.pfs around the
// call to the error handling routine - TODO confirm against the call site.
460GR_SAVE_B0               = r63
461GR_SAVE_GP               = r64
462GR_SAVE_PFS              = r65
463// output parameters when calling error handling routine
464GR_Parameter_X           = r66
465GR_Parameter_Y           = r67
466GR_Parameter_RESULT      = r68
467GR_Parameter_TAG         = r69
468
469//********************************************************************
470// Floating Point Registers
471// CAUTION: due to the lack of registers there exist (below in the code)
472// sometimes "unconventional" use of declared registers
473//
// Symbolic names for the floating-point registers used by this routine.
// Many names below are bound to the same physical register (for example
// fB2, fLnDeltaL and fXSqr are all f9; fA15, fYL and fh3Dx are all f53).
// NOTE(review): presumably names sharing a register are never live at the
// same time - verify the live ranges before reusing or renaming any of them.
474fAbsX                    = f6
475fDelX4                   = f6
476fSignifX                 = f7
477// macros for error handling routine
// Note: FR_X shares f10 with fB4 and fX4 defined below.
478FR_X                     = f10 // first argument
479FR_Y                     = f1  // second argument (lgammal has just one)
480FR_RESULT                = f8  // result
481
482// First 7 Bernoulli numbers (B2, B4, ..., B14)
483fB2                      = f9
484fLnDeltaL                = f9
485fXSqr                    = f9
486fB4                      = f10
487fX4                      = f10
488fB6                      = f11
489fX6                      = f11
490fB8                      = f12
491fXSqrL                   = f12
492fB10                     = f13
493fRes7H                   = f13
494fB12                     = f14
495fRes7L                   = f14
496fB14                     = f15
497
498// stack registers
499// Polynomial coefficients: A0, ..., A25
// The "L" suffix (fA0L ... fA5L) names the low part of a coefficient that
// the algorithm description represents as a (high, low) pair.
500fA0                      = f32
501fA0L                     = f33
502fInvXL                   = f33
503fA1                      = f34
504fA1L                     = f35
505fA2                      = f36
506fA2L                     = f37
507fA3                      = f38
508fA3L                     = f39
509fA4                      = f40
510fA4L                     = f41
511fRes6H                   = f41
512fA5                      = f42
513fB2L                     = f42
514fA5L                     = f43
515fMinNegStir              = f43
516fRes6L                   = f43
517fA6                      = f44
518fMaxNegStir              = f44
519fA7                      = f45
520fLnDeltaH                = f45
521fA8                      = f46
522fBrnL                    = f46
523fA9                      = f47
524fBrnH                    = f47
525fA10                     = f48
526fRes5L                   = f48
527fA11                     = f49
528fRes5H                   = f49
529fA12                     = f50
530fDx6                     = f50
531fA13                     = f51
532fDx8                     = f51
533fA14                     = f52
534fDx4                     = f52
535fA15                     = f53
536fYL                      = f53
537fh3Dx                    = f53
538fA16                     = f54
539fYH                      = f54
540fH3Dx                    = f54
541fA17                     = f55
542fResLnDxL                = f55
543fG3Dx                    = f55
544fA18                     = f56
545fResLnDxH                = f56
546fh2Dx                    = f56
547fA19                     = f57
548fFloatNDx                = f57
549fA20                     = f58
550fPolyHiDx                = f58
551fhDx                     = f58
552fA21                     = f59
553fRDxCub                  = f59
554fHDx                     = f59
555fA22                     = f60
556fRDxSq                   = f60
557fGDx                     = f60
558fA23                     = f61
559fPolyLoDx                = f61
560fInvX3                   = f61
561fA24                     = f62
562fRDx                     = f62
563fInvX8                   = f62
564fA25                     = f63
565fInvX4                   = f63
566fPol                     = f64
567fPolL                    = f65
568// Coefficients of ln(sin(Pi*x)/(Pi*x))
569fLnSin2                  = f66
570fLnSin2L                 = f67
571fLnSin4                  = f68
572fLnSin6                  = f69
573fLnSin8                  = f70
574fLnSin10                 = f71
575fLnSin12                 = f72
576fLnSin14                 = f73
577fLnSin16                 = f74
578fLnSin18                 = f75
579fDelX8                   = f75
580fLnSin20                 = f76
581fLnSin22                 = f77
582fDelX6                   = f77
583fLnSin24                 = f78
584fLnSin26                 = f79
585fLnSin28                 = f80
586fLnSin30                 = f81
587fhDelX                   = f81
588fLnSin32                 = f82
589fLnSin34                 = f83
590fLnSin36                 = f84
591fXint                    = f85
592fDxSqr                   = f85
593fRes3L                   = f86
594fRes3H                   = f87
595fRes4H                   = f88
596fRes4L                   = f89
597fResH                    = f90
598fResL                    = f91
599fDx                      = f92
600FR_MHalf                 = f93
601fRes1H                   = f94
602fRes1L                   = f95
603fRes2H                   = f96
604fRes2L                   = f97
605FR_FracX                 = f98
606fRcpX                    = f99
607fLnSinH                  = f99
608fTwo                     = f100
609fMOne                    = f100
// FR_G/FR_H/FR_h, FR_r, FR_Q1..FR_Q4, FR_log2_hi/lo: names matching the
// quantities G, H, h, r, Q1..Q4 and log2_hi/lo of the log(X) algorithm
// described in the file header.
610FR_G                     = f101
611FR_H                     = f102
612FR_h                     = f103
613FR_G2                    = f104
614FR_H2                    = f105
615FR_poly_lo               = f106
616FR_poly_hi               = f107
617FR_h2                    = f108
618FR_rsq                   = f109
619FR_r                     = f110
620FR_log2_hi               = f111
621FR_log2_lo               = f112
622fFloatN                  = f113
623FR_Q4                    = f114
624FR_G3                    = f115
625FR_H3                    = f116
626FR_h3                    = f117
627FR_Q3                    = f118
628FR_Q2                    = f119
629FR_Q1                    = f120
630fThirteen                = f121
631fSix                     = f121
632FR_rcub                  = f121
633// Last three Bernoulli numbers (B16, B18, B20)
634fB16                     = f122
635fB18                     = f123
636fB20                     = f124
637fInvX                    = f125
638fLnSinL                  = f125
639fDxSqrL                  = f126
640fFltIntX                 = f126
641fRoot                    = f127
642fNormDx                  = f127
643
644// Data tables
645//==============================================================
646RODATA
647// ************* DO NOT CHANGE THE ORDER OF THESE TABLES *************
648.align 16
// lgammal_right_roots_data: despite its name, this object holds four
// consecutive sub-tables used for the root "neighbourhood" test described
// in item 7 of the algorithm description:
//   1) 18 "right" roots         (ranges [-3,-2] ... [-20,-19])
//   2) 18 bound pairs for the right roots
//   3) 17 "left" roots + 1 zero pad entry (no left root for [-20,-19])
//   4) 17 bound pairs for the left roots
// 18 + 17 = 35 roots, matching the count given in the algorithm description.
// Root entries are two data8 words: the 64-bit significand followed by a
// word whose low bits (0xC000..0xC003) look like the sign and biased
// exponent of a double-extended number - NOTE(review): confirm against the
// code that loads them.
// Bound entries are two data8 significands; in every pair below the first
// value is the larger one and the root's significand lies between the two.
649LOCAL_OBJECT_START(lgammal_right_roots_data)
650// List of all right roots themselves
651data8 0x9D3FE4B007C360AB, 0x0000C000 // Range [-3, -2]
652data8 0xC9306DE4F2CD7BEE, 0x0000C000 // Range [-4, -3]
653data8 0x814273C2CCAC0618, 0x0000C001 // Range [-5, -4]
654data8 0xA04352BF85B6C865, 0x0000C001 // Range [-6, -5]
655data8 0xC00B592C4BE4676C, 0x0000C001 // Range [-7, -6]
656data8 0xE0019FEF6FF0F5BF, 0x0000C001 // Range [-8, -7]
657data8 0x80001A01459FC9F6, 0x0000C002 // Range [-9, -8]
658data8 0x900002E3BB47D86D, 0x0000C002 // Range [-10, -9]
659data8 0xA0000049F93BB992, 0x0000C002 // Range [-11, -10]
660data8 0xB0000006B9915316, 0x0000C002 // Range [-12, -11]
661data8 0xC00000008F76C773, 0x0000C002 // Range [-13, -12]
662data8 0xD00000000B09230A, 0x0000C002 // Range [-14, -13]
663data8 0xE000000000C9CBA5, 0x0000C002 // Range [-15, -14]
664data8 0xF0000000000D73FA, 0x0000C002 // Range [-16, -15]
665data8 0x8000000000006BA0, 0x0000C003 // Range [-17, -16]
666data8 0x8800000000000655, 0x0000C003 // Range [-18, -17]
667data8 0x900000000000005A, 0x0000C003 // Range [-19, -18]
668data8 0x9800000000000005, 0x0000C003 // Range [-20, -19]
669// List of bounds of ranges with special polynomial approximation near root
670// Only significands of bounds are actually stored
671data8 0xA000000000000000, 0x9800000000000000 // Bounds for root on [-3, -2]
672data8 0xCAB88035C5EFBB41, 0xC7E05E31F4B02115 // Bounds for root on [-4, -3]
673data8 0x817831B899735C72, 0x8114633941B8053A // Bounds for root on [-5, -4]
674data8 0xA04E8B34C6AA9476, 0xA039B4A42978197B // Bounds for root on [-6, -5]
675data8 0xC00D3D5E588A78A9, 0xC009BA25F7E858A6 // Bounds for root on [-7, -6]
676data8 0xE001E54202991EB4, 0xE001648416CE897F // Bounds for root on [-8, -7]
677data8 0x80001E56D13A6B9F, 0x8000164A3BAD888A // Bounds for root on [-9, -8]
678data8 0x9000035F0529272A, 0x9000027A0E3D94F0 // Bounds for root on [-10, -9]
679data8 0xA00000564D705880, 0xA000003F67EA0CC7 // Bounds for root on [-11, -10]
680data8 0xB0000007D87EE0EF, 0xB0000005C3A122A5 // Bounds for root on [-12, -11]
681data8 0xC0000000A75FE8B1, 0xC00000007AF818AC // Bounds for root on [-13, -12]
682data8 0xD00000000CDFFE36, 0xD000000009758BBF // Bounds for root on [-14, -13]
683data8 0xE000000000EB6D96, 0xE000000000ACF7B2 // Bounds for root on [-15, -14]
684data8 0xF0000000000FB1F9, 0xF0000000000B87FB // Bounds for root on [-16, -15]
685data8 0x8000000000007D90, 0x8000000000005C40 // Bounds for root on [-17, -16]
686data8 0x8800000000000763, 0x880000000000056D // Bounds for root on [-18, -17]
687data8 0x9000000000000069, 0x900000000000004D // Bounds for root on [-19, -18]
688data8 0x9800000000000006, 0x9800000000000005 // Bounds for root on [-20, -19]
689// List of all left roots themselves
690data8 0xAFDA0850DEC8065E, 0x0000C000 // Range [-3, -2]
691data8 0xFD238AA3E17F285C, 0x0000C000 // Range [-4, -3]
692data8 0x9FBABBD37757E6A2, 0x0000C001 // Range [-5, -4]
693data8 0xBFF497AC8FA06AFC, 0x0000C001 // Range [-6, -5]
694data8 0xDFFE5FBB5C377FE8, 0x0000C001 // Range [-7, -6]
695data8 0xFFFFCBFC0ACE7879, 0x0000C001 // Range [-8, -7]
696data8 0x8FFFFD1C425E8100, 0x0000C002 // Range [-9, -8]
697data8 0x9FFFFFB606BDFDCD, 0x0000C002 // Range [-10, -9]
698data8 0xAFFFFFF9466E9F1B, 0x0000C002 // Range [-11, -10]
699data8 0xBFFFFFFF70893874, 0x0000C002 // Range [-12, -11]
700data8 0xCFFFFFFFF4F6DCF6, 0x0000C002 // Range [-13, -12]
701data8 0xDFFFFFFFFF36345B, 0x0000C002 // Range [-14, -13]
702data8 0xEFFFFFFFFFF28C06, 0x0000C002 // Range [-15, -14]
703data8 0xFFFFFFFFFFFF28C0, 0x0000C002 // Range [-16, -15]
704data8 0x87FFFFFFFFFFF9AB, 0x0000C003 // Range [-17, -16]
705data8 0x8FFFFFFFFFFFFFA6, 0x0000C003 // Range [-18, -17]
706data8 0x97FFFFFFFFFFFFFB, 0x0000C003 // Range [-19, -18]
// The zero entry below keeps the left-root list the same length (18 entries)
// as the right-root list above.
707data8 0x0000000000000000, 0x00000000 // pad to keep logic in the main path
708// List of bounds of ranges with special polynomial approximation near root
709// Only significands of bounds are actually stored
710data8 0xB235880944CC758E, 0xADD2F1A9FBE76C8B // Bounds for root on [-3, -2]
711data8 0xFD8E7844F307B07C, 0xFCA655C2152BDE4D // Bounds for root on [-4, -3]
712data8 0x9FC4D876EE546967, 0x9FAEE4AF68BC4292 // Bounds for root on [-5, -4]
713data8 0xBFF641FFBFCC44F1, 0xBFF2A47919F4BA89 // Bounds for root on [-6, -5]
714data8 0xDFFE9C803DEFDD59, 0xDFFE18932EB723FE // Bounds for root on [-7, -6]
715data8 0xFFFFD393FA47AFC3, 0xFFFFC317CF638AE1 // Bounds for root on [-8, -7]
716data8 0x8FFFFD8840279925, 0x8FFFFC9DCECEEE92 // Bounds for root on [-9, -8]
717data8 0x9FFFFFC0D34E2AF8, 0x9FFFFFA9619AA3B7 // Bounds for root on [-10, -9]
718data8 0xAFFFFFFA41C18246, 0xAFFFFFF82025A23C // Bounds for root on [-11, -10]
719data8 0xBFFFFFFF857ACB4E, 0xBFFFFFFF58032378 // Bounds for root on [-12, -11]
720data8 0xCFFFFFFFF6934AB8, 0xCFFFFFFFF313EF0A // Bounds for root on [-13, -12]
721data8 0xDFFFFFFFFF53A9E9, 0xDFFFFFFFFF13B5A5 // Bounds for root on [-14, -13]
722data8 0xEFFFFFFFFFF482CB, 0xEFFFFFFFFFF03F4F // Bounds for root on [-15, -14]
723data8 0xFFFFFFFFFFFF482D, 0xFFFFFFFFFFFF03F5 // Bounds for root on [-16, -15]
724data8 0x87FFFFFFFFFFFA98, 0x87FFFFFFFFFFF896 // Bounds for root on [-17, -16]
725data8 0x8FFFFFFFFFFFFFB3, 0x8FFFFFFFFFFFFF97 // Bounds for root on [-18, -17]
726data8 0x97FFFFFFFFFFFFFC, 0x97FFFFFFFFFFFFFB // Bounds for root on [-19, -18]
727LOCAL_OBJECT_END(lgammal_right_roots_data)
728
729LOCAL_OBJECT_START(lgammal_0_Half_data)
730// Polynomial coefficients for the lgammal(x), 0.0 < |x| < 0.5
// These are the coefficients A1..A22 of path 1 in the algorithm description
// (P = A1*|x| + ... + A22*|x|^22).
// Storage order: A3, A2, A1 come first, then A4..A22 in ascending order.
// A1..A3 are each stored as two data8 words; the algorithm description says
// they are sums of two double precision numbers, so presumably
// (high part, low part) - NOTE(review): confirm against the loading code.
// A4..A22 are each stored as a 64-bit significand followed by a word that
// looks like the sign/exponent of a double-extended number.
731data8 0xBFD9A4D55BEAB2D6, 0xBC8AA3C097746D1F //A3
732data8 0x3FEA51A6625307D3, 0x3C7180E7BD2D0DCC //A2
733data8 0xBFE2788CFC6FB618, 0xBC9E9346C4692BCC //A1
734data8 0x8A8991563EC1BD13, 0x00003FFD //A4
735data8 0xD45CE0BD52C27EF2, 0x0000BFFC //A5
736data8 0xADA06587FA2BBD47, 0x00003FFC //A6
737data8 0x9381D0ED2194902A, 0x0000BFFC //A7
738data8 0x80859B3CF92D4192, 0x00003FFC //A8
739data8 0xE4033517C622A946, 0x0000BFFB //A9
740data8 0xCD00CE67A51FC82A, 0x00003FFB //A10
741data8 0xBA44E2A96C3B5700, 0x0000BFFB //A11
742data8 0xAAAD008FA46DBD99, 0x00003FFB //A12
743data8 0x9D604AC65A41153D, 0x0000BFFB //A13
744data8 0x917CECB864B5A861, 0x00003FFB //A14
745data8 0x85A4810EB730FDE4, 0x0000BFFB //A15
746data8 0xEF2761C38BD21F77, 0x00003FFA //A16
747data8 0xC913043A128367DA, 0x0000BFFA //A17
748data8 0x96A29B71FF7AFFAA, 0x00003FFA //A18
749data8 0xBB9FFA1A5FE649BB, 0x0000BFF9 //A19
750data8 0xB17982CD2DAA0EE3, 0x00003FF8 //A20
751data8 0xDE1DDCBFFB9453F0, 0x0000BFF6 //A21
752data8 0x87FBF5D7ACD9FA9D, 0x00003FF4 //A22
753LOCAL_OBJECT_END(lgammal_0_Half_data)
754
755LOCAL_OBJECT_START(Constants_Q)
756// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1
// Six rows, one per constant, in the order listed above.
// Each row is four 32-bit words (16 bytes).  Presumably the words are: low
// half of the significand, high half of the significand, sign/exponent, and
// zero padding - confirm against the code that loads these constants.
757data4  0x00000000,0xB1721800,0x00003FFE,0x00000000
758data4  0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
759data4  0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
760data4  0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
761data4  0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
762data4  0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
763LOCAL_OBJECT_END(Constants_Q)
764
765LOCAL_OBJECT_START(Constants_Z_1)
766// Z1 - 16 bit fixed
// 16 entries, each stored in its own 32-bit word; the upper 16 bits of
// every word are zero.  The values decrease monotonically from 0x8000.
// NOTE(review): the fixed-point scaling and the way the table is indexed are
// not visible here - confirm against the code that reads this object.
767data4  0x00008000
768data4  0x00007879
769data4  0x000071C8
770data4  0x00006BCB
771data4  0x00006667
772data4  0x00006187
773data4  0x00005D18
774data4  0x0000590C
775data4  0x00005556
776data4  0x000051EC
777data4  0x00004EC5
778data4  0x00004BDB
779data4  0x00004925
780data4  0x0000469F
781data4  0x00004445
782data4  0x00004211
783LOCAL_OBJECT_END(Constants_Z_1)
784
785LOCAL_OBJECT_START(Constants_G_H_h1)
786// G1 and H1 - IEEE single and h1 - IEEE double
// 16 rows of four 32-bit words (16 bytes per row).  Going by the comment
// above, word 0 is G1 (single), word 1 is H1 (single) and words 2-3 together
// form h1 (double) - presumably low word first; confirm against the loads.
// The first row is G1 = 1.0f (0x3F800000) with H1 = h1 = 0.
787data4  0x3F800000,0x00000000,0x00000000,0x00000000
788data4  0x3F70F0F0,0x3D785196,0x617D741C,0x3DA163A6
789data4  0x3F638E38,0x3DF13843,0xCBD3D5BB,0x3E2C55E6
790data4  0x3F579430,0x3E2FF9A0,0xD86EA5E7,0xBE3EB0BF
791data4  0x3F4CCCC8,0x3E647FD6,0x86B12760,0x3E2E6A8C
792data4  0x3F430C30,0x3E8B3AE7,0x5C0739BA,0x3E47574C
793data4  0x3F3A2E88,0x3EA30C68,0x13E8AF2F,0x3E20E30F
794data4  0x3F321640,0x3EB9CEC8,0xF2C630BD,0xBE42885B
795data4  0x3F2AAAA8,0x3ECF9927,0x97E577C6,0x3E497F34
796data4  0x3F23D708,0x3EE47FC5,0xA6B0A5AB,0x3E3E6A6E
797data4  0x3F1D89D8,0x3EF8947D,0xD328D9BE,0xBDF43E3C
798data4  0x3F17B420,0x3F05F3A1,0x0ADB090A,0x3E4094C3
799data4  0x3F124920,0x3F0F4303,0xFC1FE510,0xBE28FBB2
800data4  0x3F0D3DC8,0x3F183EBF,0x10FDE3FA,0x3E3A7895
801data4  0x3F088888,0x3F20EC80,0x7CC8C98F,0x3E508CE5
802data4  0x3F042108,0x3F29516A,0xA223106C,0xBE534874
803LOCAL_OBJECT_END(Constants_G_H_h1)
804
805LOCAL_OBJECT_START(Constants_Z_2)
806// Z2 - 16 bit fixed
// 16 entries, each stored in its own 32-bit word; the upper 16 bits of
// every word are zero.  The values decrease monotonically from 0x8000 in
// much finer steps than those of Constants_Z_1.
// NOTE(review): the fixed-point scaling and the way the table is indexed are
// not visible here - confirm against the code that reads this object.
807data4  0x00008000
808data4  0x00007F81
809data4  0x00007F02
810data4  0x00007E85
811data4  0x00007E08
812data4  0x00007D8D
813data4  0x00007D12
814data4  0x00007C98
815data4  0x00007C20
816data4  0x00007BA8
817data4  0x00007B31
818data4  0x00007ABB
819data4  0x00007A45
820data4  0x000079D1
821data4  0x0000795D
822data4  0x000078EB
823LOCAL_OBJECT_END(Constants_Z_2)
824
825LOCAL_OBJECT_START(Constants_G_H_h2)
826// G2 and H2 - IEEE single and h2 - IEEE double
// 16 rows of four 32-bit words (16 bytes per row).  Going by the comment
// above, word 0 is G2 (single), word 1 is H2 (single) and words 2-3 together
// form h2 (double) - presumably low word first; confirm against the loads.
// The first row is G2 = 1.0f (0x3F800000) with H2 = h2 = 0.
827data4  0x3F800000,0x00000000,0x00000000,0x00000000
828data4  0x3F7F00F8,0x3B7F875D,0x22C42273,0x3DB5A116
829data4  0x3F7E03F8,0x3BFF015B,0x21F86ED3,0x3DE620CF
830data4  0x3F7D08E0,0x3C3EE393,0x484F34ED,0xBDAFA07E
831data4  0x3F7C0FC0,0x3C7E0586,0x3860BCF6,0xBDFE07F0
832data4  0x3F7B1880,0x3C9E75D2,0xA78093D6,0x3DEA370F
833data4  0x3F7A2328,0x3CBDC97A,0x72A753D0,0x3DFF5791
834data4  0x3F792FB0,0x3CDCFE47,0xA7EF896B,0x3DFEBE6C
835data4  0x3F783E08,0x3CFC15D0,0x409ECB43,0x3E0CF156
836data4  0x3F774E38,0x3D0D874D,0xFFEF71DF,0xBE0B6F97
837data4  0x3F766038,0x3D1CF49B,0x5D59EEE8,0xBE080483
838data4  0x3F757400,0x3D2C531D,0xA9192A74,0x3E1F91E9
839data4  0x3F748988,0x3D3BA322,0xBF72A8CD,0xBE139A06
840data4  0x3F73A0D0,0x3D4AE46F,0xF8FBA6CF,0x3E1D9202
841data4  0x3F72B9D0,0x3D5A1756,0xBA796223,0xBE1DCCC4
842data4  0x3F71D488,0x3D693B9D,0xB6B7C239,0xBE049391
843LOCAL_OBJECT_END(Constants_G_H_h2)
844
845LOCAL_OBJECT_START(Constants_G_H_h3)
846// G3 and H3 - IEEE single and h3 - IEEE double
// 32 rows of four 32-bit words (16 bytes per row).  Going by the comment
// above, word 0 is G3 (single), word 1 is H3 (single) and words 2-3 together
// form h3 (double) - presumably low word first; confirm against the loads.
// Unlike the G1/H1/h1 and G2/H2/h2 tables, this one has no leading
// (1.0, 0, 0) row and holds 32 rows rather than 16.
847data4  0x3F7FFC00,0x38800100,0x562224CD,0x3D355595
848data4  0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2
849data4  0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D
850data4  0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291
851data4  0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8
852data4  0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707
853data4  0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9
854data4  0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47
855data4  0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E
856data4  0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D
857data4  0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441
858data4  0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95
859data4  0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC
860data4  0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337
861data4  0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B
862data4  0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B
863data4  0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21
864data4  0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4
865data4  0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070
866data4  0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC
867data4  0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83
868data4  0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40
869data4  0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7
870data4  0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B
871data4  0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E
872data4  0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06
873data4  0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1
874data4  0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103
875data4  0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B
876data4  0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19
877data4  0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502
878data4  0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17
879LOCAL_OBJECT_END(Constants_G_H_h3)
880
881LOCAL_OBJECT_START(lgammal_data)
882// Positive overflow value
// Single entry of two 64-bit words; presumably a 64-bit significand followed
// by a sign/exponent word (extended precision).
// NOTE(review): presumably this is the argument threshold above which
// lgammal(x) overflows - confirm against the code that compares with it.
883data8 0xB8D54C8BFFFDEBF4, 0x00007FF1
884LOCAL_OBJECT_END(lgammal_data)
885
886LOCAL_OBJECT_START(lgammal_Stirling)
887// Coefficients needed for Stirling's formula
// The object has three parts, in this order:
//   1. 0.5*ln(2*Pi) as separate high and low 64-bit words,
//   2. B1 (high/low pair) followed by B2..B10 (two 64-bit words each),
//   3. A0 (high/low pair) followed by A1..A9 (two 64-bit words each).
// The two-word entries with a 0x0000xxxx second word are presumably a 64-bit
// significand followed by sign/exponent (extended precision) - confirm
// against the loads.
888data8 0x3FED67F1C864BEB4 // High part of 0.5*ln(2*Pi)
889data8 0x3C94D252F2400510 // Low part of 0.5*ln(2*Pi)
890//
891// Bernoulli numbers used in Stirling's formula for -2^63 < |x| < -13.0
// NOTE(review): the range above mixes |x| with negative bounds; presumably
// it means 13.0 < |x| < 2^63 for negative x - confirm against the code.
892//(B1H, B1L) = 8.3333333333333333333262747254e-02
893data8 0x3FB5555555555555, 0x3C55555555555555
894data8 0xB60B60B60B60B60B, 0x0000BFF6 //B2 = -2.7777777777777777777777777778e-03
895data8 0xD00D00D00D00D00D, 0x00003FF4 //B3 = 7.9365079365079365079365079365e-04
896data8 0x9C09C09C09C09C0A, 0x0000BFF4 //B4 = -5.9523809523809523809523809524e-04
897data8 0xDCA8F158C7F91AB8, 0x00003FF4 //B5 = 8.4175084175084175084175084175e-04
898data8 0xFB5586CCC9E3E410, 0x0000BFF5 //B6 = -1.9175269175269175269175269175e-03
899data8 0xD20D20D20D20D20D, 0x00003FF7 //B7 = 6.4102564102564102564102564103e-03
900data8 0xF21436587A9CBEE1, 0x0000BFF9 //B8 = -2.9550653594771241830065359477e-02
901data8 0xB7F4B1C0F033FFD1, 0x00003FFC //B9 = 1.7964437236883057316493849002e-01
902data8 0xB23B3808C0F9CF6E, 0x0000BFFF //B10 = -1.3924322169059011164274322169e+00
903// Polynomial coefficients for Stirling's formula, -13.0 < x < -6.0
// The high word of A0 is identical to the high word of B1 above; only the
// low word differs.
904data8 0x3FB5555555555555, 0x3C4D75060289C58B //A0
905data8 0xB60B60B60B0F0876, 0x0000BFF6 //A1
906data8 0xD00D00CE54B1256C, 0x00003FF4 //A2
907data8 0x9C09BF46B58F75E1, 0x0000BFF4 //A3
908data8 0xDCA8483BC91ACC6D, 0x00003FF4 //A4
909data8 0xFB3965C939CC9FEE, 0x0000BFF5 //A5
910data8 0xD0723ADE3F0BC401, 0x00003FF7 //A6
911data8 0xE1ED7434E81F0B73, 0x0000BFF9 //A7
912data8 0x8069C6982F993283, 0x00003FFC //A8
913data8 0xC271F65BFA5BEE3F, 0x0000BFFD //A9
914LOCAL_OBJECT_END(lgammal_Stirling)
915
916LOCAL_OBJECT_START(lgammal_lnsin_data)
917// polynomial approximation of -ln(sin(Pi*x)/(Pi*x)), 0 < x <= 0.5
// Only even-indexed coefficients A2..A36 are stored.  Layout as visible
// below:
//   A2        - two 64-bit words that look like an IEEE double high part and
//               a much smaller low part (presumably a high/low pair),
//   A4 .. A16 - two 64-bit words each, presumably significand followed by
//               sign/exponent (extended precision),
//   A18.. A36 - one 64-bit word each, presumably an IEEE double.
// Confirm the formats against the code that loads this table.
918data8 0x3FFA51A6625307D3, 0x3C81873332FAF94C //A2
919data8 0x8A8991563EC241C3, 0x00003FFE //A4
920data8 0xADA06588061805DF, 0x00003FFD //A6
921data8 0x80859B57C338D0F7, 0x00003FFD //A8
922data8 0xCD00F1C2D78754BD, 0x00003FFC //A10
923data8 0xAAB56B1D3A1F4655, 0x00003FFC //A12
924data8 0x924B6F2FBBED12B1, 0x00003FFC //A14
925data8 0x80008E58765F43FC, 0x00003FFC //A16
926data8 0x3FBC718EC115E429//A18
927data8 0x3FB99CE544FE183E//A20
928data8 0x3FB7251C09EAAD89//A22
929data8 0x3FB64A970733628C//A24
930data8 0x3FAC92D6802A3498//A26
931data8 0x3FC47E1165261586//A28
932data8 0xBFCA1BAA434750D4//A30
933data8 0x3FE460001C4D5961//A32
934data8 0xBFE6F06A3E4908AD//A34
935data8 0x3FE300889EBB203A//A36
936LOCAL_OBJECT_END(lgammal_lnsin_data)
937
938LOCAL_OBJECT_START(lgammal_half_3Q_data)
939// Polynomial coefficients for the lgammal(x), 0.5 <= x < 0.75
// Each line holds two 64-bit words; the trailing comment names what they
// hold.  Lines naming two coefficients (A*/D*/C*) hold one 64-bit value per
// name.  Lines naming a single E coefficient hold, presumably, a 64-bit
// significand followed by sign/exponent (extended precision) - confirm
// against the loads.
// A names with an "L" suffix are the low parts of the A coefficients.  As
// the comments show, they are interleaved: the A3 line carries A0L and the
// A0 line carries A3L.
// The entry order is significant to the consumer and is not sorted by
// coefficient index.
940data8 0xBFF7A648EE90C62E, 0x3C713F326857E066 // A3, A0L
941data8 0xBFF73E4B8BA780AE, 0xBCA953BC788877EF // A1, A1L
942data8 0x403774DCD58D0291, 0xC0415254D5AE6623 // D0, D1
943data8 0x40B07213855CBFB0, 0xC0B8855E25D2D229 // C20, C21
944data8 0x3FFB359F85FF5000, 0x3C9BAECE6EF9EF3A // A2, A2L
945data8 0x3FD717D498A3A8CC, 0xBC9088E101CFEDFA  // A0, A3L
946data8 0xAFEF36CC5AEC3FF0, 0x00004002 // E6
947data8 0xABE2054E1C34E791, 0x00004001 // E4
948data8 0xB39343637B2900D1, 0x00004000 // E2
949data8 0xD74FB710D53F58F6, 0x00003FFF // E0
950data8 0x4070655963BA4256, 0xC078DA9D263C4EA3 // D6, D7
951data8 0x405CD2B6A9B90978, 0xC065B3B9F4F4F171 // D4, D5
952data8 0x4049BC2204CF61FF, 0xC05337227E0BA152 // D2, D3
953data8 0x4095509A50C07A96, 0xC0A0747949D2FB45 // C18, C19
954data8 0x4082ECCBAD709414, 0xC08CD02FB088A702 // C16, C17
955data8 0xFFE4B2A61B508DD5, 0x0000C002 // E7
956data8 0xF461ADB8AE17E0A5, 0x0000C001 // E5
957data8 0xF5BE8B0B90325F20, 0x0000C000 // E3
958data8 0x877B275F3FB78DCA, 0x0000C000 // E1
959LOCAL_OBJECT_END(lgammal_half_3Q_data)
960
961LOCAL_OBJECT_START(lgammal_half_3Q_neg_data)
962// Polynomial coefficients for the lgammal(x), -0.75 < x <= -0.5
// Same entry order as lgammal_half_3Q_data (A3/A0L, A1/A1L, D0/D1, C20/C21,
// A2/A2L, A0/A3L, E6, E4, E2, E0, D6/D7, D4/D5, D2/D3, C18/C19, C16/C17,
// E7, E5, E3, E1), per the trailing comments.
// Lines naming two coefficients hold one 64-bit value per name; lines naming
// a single E coefficient presumably hold a 64-bit significand followed by
// sign/exponent (extended precision) - confirm against the loads.
963data8 0xC014836EFD94899C, 0x3C9835679663B44F // A3, A0L
964data8 0xBFF276C7B4FB1875, 0xBC92D3D9FA29A1C0 // A1, A1L
965data8 0x40C5178F24E1A435, 0xC0D9DE84FBC5D76A // D0, D1
966data8 0x41D4D1B236BF6E93, 0xC1EBB0445CE58550 // C20, C21
967data8 0x4015718CD67F63D3, 0x3CC5354B6F04B59C // A2, A2L
968data8 0x3FF554493087E1ED, 0xBCB72715E37B02B9 // A0, A3L
969data8 0xE4AC7E915FA72229, 0x00004009 // E6
970data8 0xA28244206395FCC6, 0x00004007 // E4
971data8 0xFB045F19C07B2544, 0x00004004 // E2
972data8 0xE5C8A6E6A9BA7D7B, 0x00004002 // E0
973data8 0x4143943B55BF5118, 0xC158AC05EA675406 // D6, D7
974data8 0x4118F6833D19717C, 0xC12F51A6F375CC80 // D4, D5
975data8 0x40F00C209483481C, 0xC103F1DABF750259 // D2, D3
976data8 0x4191038F2D8F9E40, 0xC1A413066DA8AE4A // C18, C19
977data8 0x4170B537EDD833DE, 0xC1857E79424C61CE // C16, C17
978data8 0x8941D8AB4855DB73, 0x0000C00B // E7
979data8 0xBB822B131BD2E813, 0x0000C008 // E5
980data8 0x852B4C03B83D2D4F, 0x0000C006 // E3
981data8 0xC754CA7E2DDC0F1F, 0x0000C003 // E1
982LOCAL_OBJECT_END(lgammal_half_3Q_neg_data)
983
984LOCAL_OBJECT_START(lgammal_2Q_4_data)
985// Polynomial coefficients for the lgammal(x), 2.25 <= |x| < 4.0
// Entry order, per the trailing comments: A3/A0L, A1/A1L, D0/D1, C20/C21,
// A2/A2L, A0/A3L, E6, E4, E2, E0, D6/D7, D4/D5, D2/D3, C18/C19, C16/C17,
// E7, E5, E3, E1.
// Lines naming two coefficients hold one 64-bit value per name; lines naming
// a single E coefficient presumably hold a 64-bit significand followed by
// sign/exponent (extended precision) - confirm against the loads.
// The "L"-suffixed names are low parts; note the A3 line carries A0L and the
// A0 line carries A3L.
986data8 0xBFCA4D55BEAB2D6F, 0x3C7ABC9DA14141F5 // A3, A0L
987data8 0x3FFD8773039049E7, 0x3C66CB7957A95BA4 // A1, A1L
988data8 0x3F45C3CC79E91E7D, 0xBF3A8E5005937E97 // D0, D1
989data8 0x3EC951E35E1C9203, 0xBEB030A90026C5DF // C20, C21
990data8 0x3FE94699894C1F4C, 0x3C91884D21D123F1 // A2, A2L
991data8 0x3FE62E42FEFA39EF, 0xBC66480CEB70870F // A0, A3L
992data8 0xF1C2EAFF0B3A7579, 0x00003FF5 // E6
993data8 0xB36AF863926B55A3, 0x00003FF7 // E4
994data8 0x9620656185BB44CA, 0x00003FF9 // E2
995data8 0xA264558FB0906AFF, 0x00003FFB // E0
996data8 0x3F03D59E9666C961, 0xBEF91115893D84A6 // D6, D7
997data8 0x3F19333611C46225, 0xBF0F89EB7D029870 // D4, D5
998data8 0x3F3055A96B347AFE, 0xBF243B5153E178A8 // D2, D3
999data8 0x3ED9A4AEF30C4BB2, 0xBED388138B1CEFF2 // C18, C19
1000data8 0x3EEF7945A3C3A254, 0xBEE36F32A938EF11 // C16, C17
1001data8 0x9028923F47C82118, 0x0000BFF5 // E7
1002data8 0xCE0DAAFB6DC93B22, 0x0000BFF6 // E5
1003data8 0xA0D0983B34AC4C8D, 0x0000BFF8 // E3
1004data8 0x94D6C50FEB8B0CE7, 0x0000BFFA // E1
1005LOCAL_OBJECT_END(lgammal_2Q_4_data)
1006
1007LOCAL_OBJECT_START(lgammal_4_8_data)
1008// Polynomial coefficients for the lgammal(x), 4.0 <= |x| < 8.0
// Entry order, per the trailing comments: A3/A0L, A1/A1L, D0/D1, C20/C21,
// A2/A2L, A0/A3L, E6, E4, E2, E0, D6/D7, D4/D5, D2/D3, C18/C19, C16/C17,
// E7, E5, E3, E1.
// Lines naming two coefficients hold one 64-bit value per name; lines naming
// a single E coefficient presumably hold a 64-bit significand followed by
// sign/exponent (extended precision) - confirm against the loads.
// The "L"-suffixed names are low parts; note the A3 line carries A0L and the
// A0 line carries A3L.
1009data8 0xBFD6626BC9B31B54, 0x3CAA53C82493A92B // A3, A0L
1010data8 0x401B4C420A50AD7C, 0x3C8C6E9929F789A3 // A1, A1L
1011data8 0x3F49410427E928C2, 0xBF3E312678F8C146 // D0, D1
1012data8 0x3ED51065F7CD5848, 0xBED052782A03312F // C20, C21
1013data8 0x3FF735973273D5EC, 0x3C831DFC65BF8CCF // A2, A2L
1014data8 0x401326643C4479C9, 0xBC6FA0498C5548A6 // A0, A3L
1015data8 0x9382D8B3CD4EB7E3, 0x00003FF6 // E6
1016data8 0xE9F92CAD8A85CBCD, 0x00003FF7 // E4
1017data8 0xD58389FE38258CEC, 0x00003FF9 // E2
1018data8 0x81310136363AE8AA, 0x00003FFC // E0
1019data8 0x3F04F0AE38E78570, 0xBEF9E2144BB8F03C // D6, D7
1020data8 0x3F1B5E992A6CBC2A, 0xBF10F3F400113911 // D4, D5
1021data8 0x3F323EE00AAB7DEE, 0xBF2640FDFA9FB637 // D2, D3
1022data8 0x3ED2143EBAFF067A, 0xBEBBDEB92D6FF35D // C18, C19
1023data8 0x3EF173A42B69AAA4, 0xBEE78B9951A2EAA5 // C16, C17
1024data8 0xAB3CCAC6344E52AA, 0x0000BFF5 // E7
1025data8 0x81ACCB8915B16508, 0x0000BFF7 // E5
1026data8 0xDA62C7221102C426, 0x0000BFF8 // E3
1027data8 0xDF1BD44C4083580A, 0x0000BFFA // E1
1028LOCAL_OBJECT_END(lgammal_4_8_data)
1029
1030LOCAL_OBJECT_START(lgammal_loc_min_data)
1031// Polynomial coefficients for the lgammal(x), 1.3125 <= x < 1.5625
// The first entry is xMin (the point of local minimum); the polynomial
// coefficients follow in the order A3/A0L, A1/A1L, D0/D1, C20/C21, A2/A2L,
// A0/A3L, E6, E4, E2, E0, D6/D7, D4/D5, D2/D3, C18/C19, C16/C17, E7, E5,
// E3, E1, per the trailing comments.
// xMin and the single-name E lines presumably hold a 64-bit significand
// followed by sign/exponent (extended precision); lines naming two
// coefficients hold one 64-bit value per name - confirm against the loads.
// NOTE(review): presumably the polynomial is evaluated in (x - xMin) -
// confirm against the code that uses this table.
1032data8 0xBB16C31AB5F1FB71, 0x00003FFF // xMin - point of local minimum
1033data8 0xBFC2E4278DC6BC23, 0xBC683DA8DDCA9650 // A3, A0L
1034data8 0x3BD4DB7D0CA61D5F, 0x386E719EDD01D801 // A1, A1L
1035data8 0x3F4CC72638E1D93F, 0xBF4228EC9953CCB9 // D0, D1
1036data8 0x3ED222F97A04613E,0xBED3DDD58095CB6C  // C20, C21
1037data8 0x3FDEF72BC8EE38AB, 0x3C863AFF3FC48940 // A2, A2L
1038data8 0xBFBF19B9BCC38A41,  0xBC7425F1BFFC1442// A0, A3L
1039data8 0x941890032BEB34C3, 0x00003FF6 // E6
1040data8 0xC7E701591CE534BC, 0x00003FF7 // E4
1041data8 0x93373CBD05138DD4, 0x00003FF9 // E2
1042data8 0x845A14A6A81C05D6, 0x00003FFB // E0
1043data8 0x3F0F6C4DF6D47A13, 0xBF045DCDB5B49E19 // D6, D7
1044data8 0x3F22E23345DDE59C, 0xBF1851159AFB1735 // D4, D5
1045data8 0x3F37101EA4022B78, 0xBF2D721E6323AF13 // D2, D3
1046data8 0x3EE691EBE82DF09D, 0xBEDD42550961F730 // C18, C19
1047data8 0x3EFA793EDE99AD85, 0xBEF14000108E70BE // C16, C17
1048data8 0xB7CBC033ACE0C99C, 0x0000BFF5 // E7
1049data8 0xF178D1F7B1A45E27, 0x0000BFF6 // E5
1050data8 0xA8FCFCA8106F471C, 0x0000BFF8 // E3
1051data8 0x864D46FA898A9AD2, 0x0000BFFA // E1
1052LOCAL_OBJECT_END(lgammal_loc_min_data)
1053
1054LOCAL_OBJECT_START(lgammal_03Q_1Q_data)
1055// Polynomial coefficients for the lgammal(x), 0.75 <= |x| < 1.3125
// Each line holds two 64-bit words; the trailing comment names what they
// hold.  Lines naming two items hold one 64-bit value per name ("L"-suffixed
// names are low parts of the A coefficients).  Lines naming a single E
// coefficient presumably hold a 64-bit significand followed by sign/exponent
// (extended precision) - confirm against the loads.
// This table's entry order differs from the A3/A0L-first tables: it starts
// with A4/A2L, and E7 shares a line with C16 instead of having a
// two-word entry of its own.
1056data8 0x3FD151322AC7D848, 0x3C7184DE0DB7B4EE // A4, A2L
1057data8 0x3FD9A4D55BEAB2D6, 0x3C9E934AAB10845F // A3, A1L
1058data8 0x3FB111289C381259, 0x3FAFFFCFB32AE18D // D2, D3
1059data8 0x3FB3B1D9E0E3E00D, 0x3FB2496F0D3768DF // D0, D1
1060data8 0xBA461972C057D439, 0x00003FFB         // E6
1061data8 0x3FEA51A6625307D3, 0x3C76ABC886A72DA2 // A2, A4L
1062data8 0x3FA8EFE46B32A70E, 0x3F8F31B3559576B6 // C17, C20
1063data8 0xE403383700387D85, 0x00003FFB // E4
1064data8 0x9381D0EE74BF7251, 0x00003FFC // E2
1065data8 0x3FAA2177A6D28177, 0x3FA4895E65FBD995 // C18, C19
1066data8 0x3FAAED2C77DBEE5D, 0x3FA94CA59385512C // D6, D7
1067data8 0x3FAE1F522E8A5941, 0x3FAC785EF56DD87E // D4, D5
1068data8 0x3FB556AD5FA56F0A, 0x3FA81F416E87C783 // E7, C16
1069data8 0xCD00F1C2DC2C9F1E, 0x00003FFB // E5
1070data8 0x3FE2788CFC6FB618, 0x3C8E52519B5B17CB // A1, A3L
1071data8 0x80859B57C3E7F241, 0x00003FFC // E3
1072data8 0xADA065880615F401, 0x00003FFC // E1
1073data8 0xD45CE0BD530AB50E, 0x00003FFC // E0
1074LOCAL_OBJECT_END(lgammal_03Q_1Q_data)
1075
1076LOCAL_OBJECT_START(lgammal_13Q_2Q_data)
1077// Polynomial coefficients for the lgammal(x), 1.5625 <= |x| < 2.25
// Same entry order as lgammal_03Q_1Q_data, per the trailing comments:
// A4/A2L, A3/A1L, D2/D3, D0/D1, E6, A2/A4L, C17/C20, E4, E2, C18/C19,
// D6/D7, D4/D5, E7/C16, E5, A1/A3L, E3, E1, E0.
// Lines naming two items hold one 64-bit value per name; lines naming a
// single E coefficient presumably hold a 64-bit significand followed by
// sign/exponent (extended precision) - confirm against the loads.
1078data8 0x3F951322AC7D8483, 0x3C71873D88C6539D // A4, A2L
1079data8 0xBFB13E001A557606, 0x3C56CB907018A101 // A3, A1L
1080data8 0xBEC11B2EC1E7F6FC, 0x3EB0064ED9824CC7 // D2, D3
1081data8 0xBEE3CBC963EC103A, 0x3ED2597A330C107D // D0, D1
1082data8 0xBC6F2DEBDFE66F38, 0x0000BFF0 // E6
1083data8 0x3FD4A34CC4A60FA6, 0x3C3AFC9BF775E8A0 // A2, A4L
1084data8 0x3E48B0C542F85B32, 0xBE347F12EAF787AB // C17, C20
1085data8 0xE9FEA63B6984FA1E, 0x0000BFF2 // E4
1086data8 0x9C562E15FC703BBF, 0x0000BFF5 // E2
1087data8 0xBE3C12A50AB0355E, 0xBE1C941626AE4717 // C18, C19
1088data8 0xBE7AFA8714342BC4,0x3E69A12D2B7761CB // D6, D7
1089data8 0xBE9E25EF1D526730, 0x3E8C762291889B99 // D4, D5
1090data8 0x3EF580DCEE754733, 0xBE57C811D070549C // E7, C16
1091data8 0xD093D878BE209C98, 0x00003FF1 // E5
1092data8 0x3FDB0EE6072093CE, 0xBC6024B9E81281C4 // A1, A3L
1093data8 0x859B57C31CB77D96, 0x00003FF4 // E3
1094data8 0xBD6EB756DB617E8D, 0x00003FF6 // E1
1095data8 0xF2027E10C7AF8C38, 0x0000BFF7 // E0
1096LOCAL_OBJECT_END(lgammal_13Q_2Q_data)
1097
1098LOCAL_OBJECT_START(lgammal_8_10_data)
1099// Polynomial coefficients for the lgammal(x), 8.0 <= |x| < 10.0
// Two groups of entries, two 64-bit words per entry:
//   - "multi precision" A1 and A0 (stored in that order); the word pairs
//     look like an IEEE double plus a much smaller double, presumably a
//     high/low pair,
//   - "native precision" A2..A15; presumably a 64-bit significand followed
//     by sign/exponent (extended precision).
// Confirm both formats against the code that loads this table.
1100// Multi Precision terms
1101data8 0x40312008A3A23E5C, 0x3CE020B4F2E4083A //A1
1102data8 0x4025358E82FCB70C, 0x3CD4A5A74AF7B99C //A0
1103// Native precision terms
1104data8 0xF0AA239FFBC616D2, 0x00004000 //A2
1105data8 0x96A8EA798FE57D66, 0x0000BFFF //A3
1106data8 0x8D501B7E3B9B9BDB, 0x00003FFE //A4
1107data8 0x9EE062401F4B1DC2, 0x0000BFFD //A5
1108data8 0xC63FD8CD31E93431, 0x00003FFC //A6
1109data8 0x8461101709C23C30, 0x0000BFFC //A7
1110data8 0xB96D7EA7EF3648B2, 0x00003FFB //A8
1111data8 0x86886759D2ACC906, 0x0000BFFB //A9
1112data8 0xC894B6E28265B183, 0x00003FFA //A10
1113data8 0x98C4348CAD821662, 0x0000BFFA //A11
1114data8 0xEC9B092226A94DF2, 0x00003FF9 //A12
1115data8 0xB9F169FF9B98CDDC, 0x0000BFF9 //A13
1116data8 0x9A3A32BB040894D3, 0x00003FF9 //A14
1117data8 0xF9504CCC1003B3C3, 0x0000BFF8 //A15
1118LOCAL_OBJECT_END(lgammal_8_10_data)
1119
1120LOCAL_OBJECT_START(lgammal_03Q_6_data)
// Five polynomial coefficient sets of identical shape, stored back to back
// in this order of ranges: [0.75, 1.0), [1.0, 2.0), [2.0, 3.0), [4.0, 6.0),
// [3.0, 4.0).
// NOTE(review): the 4.0..6.0 set is stored BEFORE the 3.0..4.0 set; do not
// reorder without checking how the code indexes into this object.
// Shape of every set (26 coefficients, A0..A25):
//   A3, A0, A1, A2 - two 64-bit words each, in that storage order; they look
//                    like an IEEE double plus a much smaller double
//                    (presumably a high/low pair),
//   A4 .. A13      - two 64-bit words each, presumably significand followed
//                    by sign/exponent (extended precision),
//   A14 .. A25     - one 64-bit word each, presumably an IEEE double.
// Confirm the formats against the code that loads this table.
1121// Polynomial coefficients for the lgammal(x), 0.75 <= |x| < 1.0
1122data8 0xBFBC47DCA479E295, 0xBC607E6C1A379D55 //A3
1123data8 0x3FCA051C372609ED, 0x3C7B02D73EB7D831 //A0
1124data8 0xBFE15FAFA86B04DB, 0xBC3F52EE4A8945B5 //A1
1125data8 0x3FD455C4FF28F0BF, 0x3C75F8C6C99F30BB //A2
1126data8 0xD2CF04CD934F03E1, 0x00003FFA //A4
1127data8 0xDB4ED667E29256E1, 0x0000BFF9 //A5
1128data8 0xF155A33A5B6021BF, 0x00003FF8 //A6
1129data8 0x895E9B9D386E0338, 0x0000BFF8 //A7
1130data8 0xA001BE94B937112E, 0x00003FF7 //A8
1131data8 0xBD82846E490ED048, 0x0000BFF6 //A9
1132data8 0xE358D24EC30DBB5D, 0x00003FF5 //A10
1133data8 0x89C4F3652446B78B, 0x0000BFF5 //A11
1134data8 0xA86043E10280193D, 0x00003FF4 //A12
1135data8 0xCF3A2FBA61EB7682, 0x0000BFF3 //A13
1136data8 0x3F300900CC9200EC //A14
1137data8 0xBF23F42264B94AE8 //A15
1138data8 0x3F18EEF29895FE73 //A16
1139data8 0xBF0F3C4563E3EDFB //A17
1140data8 0x3F0387DBBC385056 //A18
1141data8 0xBEF81B4004F92900 //A19
1142data8 0x3EECA6692A9A5B81 //A20
1143data8 0xBEDF61A0059C15D3 //A21
1144data8 0x3ECDA9F40DCA0111 //A22
1145data8 0xBEB60FE788217BAF //A23
1146data8 0x3E9661D795DFC8C6 //A24
1147data8 0xBE66C7756A4EDEE5 //A25
1148// Polynomial coefficients for the lgammal(x), 1.0 <= |x| < 2.0
1149data8 0xBFC1AE55B180726B, 0xBC7DE1BC478453F5 //A3
1150data8 0xBFBEEB95B094C191, 0xBC53456FF6F1C9D9 //A0
1151data8 0x3FA2AED059BD608A, 0x3C0B65CC647D557F //A1
1152data8 0x3FDDE9E64DF22EF2, 0x3C8993939A8BA8E4 //A2
1153data8 0xF07C206D6B100CFF, 0x00003FFA //A4
1154data8 0xED2CEA9BA52FE7FB, 0x0000BFF9 //A5
1155data8 0xFCE51CED52DF3602, 0x00003FF8 //A6
1156data8 0x8D45D27872326619, 0x0000BFF8 //A7
1157data8 0xA2B78D6BCEBE27F7, 0x00003FF7 //A8
1158data8 0xBF6DC0996A895B6F, 0x0000BFF6 //A9
1159data8 0xE4B9AD335AF82D79, 0x00003FF5 //A10
1160data8 0x8A451880195362A1, 0x0000BFF5 //A11
1161data8 0xA8BE35E63089A7A9, 0x00003FF4 //A12
1162data8 0xCF7FA175FA11C40C, 0x0000BFF3 //A13
1163data8 0x3F300C282FAA3B02 //A14
1164data8 0xBF23F6AEBDA68B80 //A15
1165data8 0x3F18F6860E2224DD //A16
1166data8 0xBF0F542B3CE32F28 //A17
1167data8 0x3F039436218C9BF8 //A18
1168data8 0xBEF8AE6307677AEC //A19
1169data8 0x3EF0B55527B3A211 //A20
1170data8 0xBEE576AC995E7605 //A21
1171data8 0x3ED102DDC1365D2D //A22
1172data8 0xBEC442184F97EA54 //A23
1173data8 0x3ED4D2283DFE5FC6 //A24
1174data8 0xBECB9219A9B46787 //A25
1175// Polynomial coefficients for the lgammal(x), 2.0 <= |x| < 3.0
1176data8 0xBFCA4D55BEAB2D6F, 0xBC66F80E5BFD5AF5 //A3
1177data8 0x3FE62E42FEFA39EF, 0x3C7ABC9E3B347E3D //A0
1178data8 0x3FFD8773039049E7, 0x3C66CB9007C426EA //A1
1179data8 0x3FE94699894C1F4C, 0x3C918726EB111663 //A2
1180data8 0xA264558FB0906209, 0x00003FFB //A4
1181data8 0x94D6C50FEB902ADC, 0x0000BFFA //A5
1182data8 0x9620656184243D17, 0x00003FF9 //A6
1183data8 0xA0D0983B8BCA910B, 0x0000BFF8 //A7
1184data8 0xB36AF8559B222BD3, 0x00003FF7 //A8
1185data8 0xCE0DACB3260AE6E5, 0x0000BFF6 //A9
1186data8 0xF1C2C0BF0437C7DB, 0x00003FF5 //A10
1187data8 0x902A2F2F3AB74A92, 0x0000BFF5 //A11
1188data8 0xAE05009B1B2C6E4C, 0x00003FF4 //A12
1189data8 0xD5B71F6456D7D4CB, 0x0000BFF3 //A13
1190data8 0x3F2F0351D71BC9C6 //A14
1191data8 0xBF2B53BC56A3B793 //A15
1192data8 0xBF18B12DC6F6B861 //A16
1193data8 0xBF43EE6EB5215C2F //A17
1194data8 0xBF5474787CDD455E //A18
1195data8 0xBF642B503C9C060A //A19
1196data8 0xBF6E07D1AA254AA3 //A20
1197data8 0xBF71C785443AAEE8 //A21
1198data8 0xBF6F67BF81B71052 //A22
1199data8 0xBF63E4BCCF4FFABF //A23
1200data8 0xBF50067F8C671D5A //A24
1201data8 0xBF29C770D680A5AC //A25
1202// Polynomial coefficients for the lgammal(x), 4.0 <= |x| < 6.0
1203data8 0xBFD6626BC9B31B54, 0xBC85AABE08680902 //A3
1204data8 0x401326643C4479C9, 0x3CAA53C26F31E364 //A0
1205data8 0x401B4C420A50AD7C, 0x3C8C76D55E57DD8D //A1
1206data8 0x3FF735973273D5EC, 0x3C83A0B78E09188A //A2
1207data8 0x81310136363AAB6D, 0x00003FFC //A4
1208data8 0xDF1BD44C4075C0E6, 0x0000BFFA //A5
1209data8 0xD58389FE38D8D664, 0x00003FF9 //A6
1210data8 0xDA62C7221D5B5F87, 0x0000BFF8 //A7
1211data8 0xE9F92CAD0263E157, 0x00003FF7 //A8
1212data8 0x81ACCB8606C165FE, 0x0000BFF7 //A9
1213data8 0x9382D8D263D1C2A3, 0x00003FF6 //A10
1214data8 0xAB3CCBA4C853B12C, 0x0000BFF5 //A11
1215data8 0xCA0818BBCCC59296, 0x00003FF4 //A12
1216data8 0xF18912691CBB5BD0, 0x0000BFF3 //A13
1217data8 0x3F323EF5D8330339 //A14
1218data8 0xBF2641132EA571F7 //A15
1219data8 0x3F1B5D9576175CA9 //A16
1220data8 0xBF10F56A689C623D //A17
1221data8 0x3F04CACA9141A18D //A18
1222data8 0xBEFA307AC9B4E85D //A19
1223data8 0x3EF4B625939FBE32 //A20
1224data8 0xBECEE6AC1420F86F //A21
1225data8 0xBE9A95AE2E485964 //A22
1226data8 0xBF039EF47F8C09BB //A23
1227data8 0xBF05345957F7B7A9 //A24
1228data8 0xBEF85AE6385D4CCC //A25
1229// Polynomial coefficients for the lgammal(x), 3.0 <= |x| < 4.0
1230data8 0xBFCA4D55BEAB2D6F, 0xBC667B20FF46C6A8 //A3
1231data8 0x3FE62E42FEFA39EF, 0x3C7ABC9E3B398012 //A0
1232data8 0x3FFD8773039049E7, 0x3C66CB9070238D77 //A1
1233data8 0x3FE94699894C1F4C, 0x3C91873D8839B1CD //A2
1234data8 0xA264558FB0906D7E, 0x00003FFB //A4
1235data8 0x94D6C50FEB8AFD72, 0x0000BFFA //A5
1236data8 0x9620656185B68F14, 0x00003FF9 //A6
1237data8 0xA0D0983B34B7088A, 0x0000BFF8 //A7
1238data8 0xB36AF863964AA440, 0x00003FF7 //A8
1239data8 0xCE0DAAFB5497AFB8, 0x0000BFF6 //A9
1240data8 0xF1C2EAFA79CC2864, 0x00003FF5 //A10
1241data8 0x9028922A839572B8, 0x0000BFF5 //A11
1242data8 0xAE1E62F870BA0278, 0x00003FF4 //A12
1243data8 0xD4726F681E2ABA29, 0x0000BFF3 //A13
1244data8 0x3F30559B9A02FADF //A14
1245data8 0xBF243ADEB1266CAE //A15
1246data8 0x3F19303B6F552603 //A16
1247data8 0xBF0F768C288EC643 //A17
1248data8 0x3F039D5356C21DE1 //A18
1249data8 0xBEF81BCA8168E6BE //A19
1250data8 0x3EEC74A53A06AD54 //A20
1251data8 0xBEDED52D1A5DACDF //A21
1252data8 0x3ECCB4C2C7087342 //A22
1253data8 0xBEB4F1FAFDFF5C2F //A23
1254data8 0x3E94C80B52D58904 //A24
1255data8 0xBE64A328CBE92A27 //A25
1256LOCAL_OBJECT_END(lgammal_03Q_6_data)
1257
1258LOCAL_OBJECT_START(lgammal_1pEps_data)
// Three coefficient groups stored back to back:
//   1. lgammal polynomial A1..A8 for arguments near 1,
//   2. log1p polynomial P8..P1 (stored from P8 down to P1),
//   3. a short version of the "lnsin" polynomial (A2, A4, A6, A8).
// Two-word entries presumably hold a 64-bit significand followed by
// sign/exponent (extended precision); one-word entries presumably hold an
// IEEE double - confirm against the code that loads this table.
1259// Polynomial coefficients for the lgammal(x), 1 - 2^(-7) <= |x| < 1 + 2^(-7)
1260data8 0x93C467E37DB0C7A5, 0x00003FFE //A1
1261data8 0xD28D3312983E9919, 0x00003FFE //A2
1262data8 0xCD26AADF559A47E3, 0x00003FFD //A3
1263data8 0x8A8991563EC22E81, 0x00003FFD //A4
1264data8 0x3FCA8B9C168D52FE //A5
1265data8 0x3FC5B40CB0696370 //A6
1266data8 0x3FC270AC2229A65D //A7
1267data8 0x3FC0110AF10FCBFC //A8
1268// Polynomial coefficients for the log1p(x), - 2^(-7) <= |x| <  2^(-7)
// Read as IEEE doubles, P8..P1 are approximately 1/9, -1/8, 1/7, -1/6, 1/5,
// -1/4, 1/3 and -1/2.
1269data8 0x3FBC71C71C71C71C //P8
1270data8 0xBFC0000000000000 //P7
1271data8 0x3FC2492492492492 //P6
1272data8 0xBFC5555555555555 //P5
1273data8 0x3FC999999999999A //P4
1274data8 0xBFD0000000000000 //P3
1275data8 0x3FD5555555555555 //P2
1276data8 0xBFE0000000000000 //P1
1277// short version of "lnsin" polynomial
1278data8 0xD28D3312983E9918, 0x00003FFF //A2
1279data8 0x8A8991563EC241B6, 0x00003FFE //A4
1280data8 0xADA06588061830A5, 0x00003FFD //A6
1281data8 0x80859B57C31CB746, 0x00003FFD //A8
1282LOCAL_OBJECT_END(lgammal_1pEps_data)
1283
1284LOCAL_OBJECT_START(lgammal_neg2andHalf_data)
1285// Polynomial coefficients for the lgammal(x), -2.005859375 <= x < -2.5
// NOTE(review): the bounds above are written in reverse order (the interval
// is empty as stated); presumably the intended range lies between -2.5 and
// -2.005859375 - confirm the exact inclusive/exclusive ends against the
// range-selection code.
// Entry order, per the trailing comments: A3/A0L, A1/A1L, D0/D1, C20/C21,
// A2/A2L, A0/A3L, E6, E4, E2, E0, D6/D7, D4/D5, D2/D3, C18/C19, C16/C17,
// E7, E5, E3, E1.
// Lines naming two coefficients hold one 64-bit value per name; lines naming
// a single E coefficient presumably hold a 64-bit significand followed by
// sign/exponent (extended precision) - confirm against the loads.
1286data8 0xBF927781D4BB093A, 0xBC511D86D85B7045 // A3, A0L
1287data8 0x3FF1A68793DEFC15, 0x3C9852AE2DA7DEEF // A1, A1L
1288data8 0x408555562D45FAFD, 0xBF972CDAFE5FEFAD // D0, D1
1289data8 0xC18682331EF492A5, 0xC1845E3E0D29606B // C20, C21
1290data8 0x4013141822E16979, 0x3CCF8718B6E75F6C // A2, A2L
1291data8 0xBFACCBF9F5ED0F15, 0xBBDD1AEB73297401 // A0, A3L
1292data8 0xCCCDB17423046445, 0x00004006 // E6
1293data8 0x800514E230A3A452, 0x00004005 // E4
1294data8 0xAAE9A48EC162E76F, 0x00004003 // E2
1295data8 0x81D4F88B3F3EA0FC, 0x00004002 // E0
1296data8 0x40CF3F3E35238DA0, 0xC0F8B340945F1A7E // D6, D7
1297data8 0x40BF89EC0BD609C6, 0xC095897242AEFEE2 // D4, D5
1298data8 0x40A2482FF01DBC5C, 0xC02095E275FDCF62 // D2, D3
1299data8 0xC1641354F2312A6A, 0xC17B3657F85258E9 // C18, C19
1300data8 0xC11F964E9ECBE2C9, 0xC146D7A90F70696C // C16, C17
1301data8 0xE7AECDE6AF8EA816, 0x0000BFEF // E7
1302data8 0xD711252FEBBE1091, 0x0000BFEB // E5
1303data8 0xE648BD10F8C43391, 0x0000BFEF // E3
1304data8 0x948A1E78AA00A98D, 0x0000BFF4 // E1
1305LOCAL_OBJECT_END(lgammal_neg2andHalf_data)
1306
1307LOCAL_OBJECT_START(lgammal_near_neg_half_data)
1308// Polynomial coefficients for the lgammal(x), -0.5 < x < -0.40625
// Entry order, per the trailing comments: A3/A0L, A1/A1L, D0/D1, C20/C21,
// A2/A2L, A0/A3L, E6, E4, E2, E0, D6/D7, D4/D5, D2/D3, C18/C19, C16/C17,
// E7, E5, E3, E1.
// Lines naming two coefficients hold one 64-bit value per name; lines naming
// a single E coefficient presumably hold a 64-bit significand followed by
// sign/exponent (extended precision) - confirm against the loads.
// The "L"-suffixed names are low parts; note the A3 line carries A0L and the
// A0 line carries A3L.
1309data8 0xBFC1AE55B180726C, 0x3C8053CD734E6A1D // A3, A0L
1310data8 0x3FA2AED059BD608A, 0x3C0CD3D2CDBA17F4 // A1, A1L
1311data8 0x40855554DBCD1E1E, 0x3F96C51AC2BEE9E1 // D0, D1
1312data8 0xC18682331EF4927D, 0x41845E3E0D295DFC // C20, C21
1313data8 0x4011DE9E64DF22EF, 0x3CA692B70DAD6B7B // A2, A2L
1314data8 0x3FF43F89A3F0EDD6, 0xBC4955AED0FA087D // A0, A3L
1315data8 0xCCCD3F1DF4A2C1DD, 0x00004006 // E6
1316data8 0x80028ADE33C7FCD9, 0x00004005 // E4
1317data8 0xAACA474E485507EF, 0x00004003 // E2
1318data8 0x80F07C206D6B0ECD, 0x00004002 // E0
1319data8 0x40CF3F3E33E83056, 0x40F8B340944633D9 // D6, D7
1320data8 0x40BF89EC059931F0, 0x409589723307AD20 // D4, D5
1321data8 0x40A2482FD0054824, 0x402095CE7F19D011 // D2, D3
1322data8 0xC1641354F2313614, 0x417B3657F8525354 // C18, C19
1323data8 0xC11F964E9ECFD21C, 0x4146D7A90F701836 // C16, C17
1324data8 0x86A9C01F0EA11E5A, 0x0000BFF5 // E7
1325data8 0xBF6D8469142881C0, 0x0000BFF6 // E5
1326data8 0x8D45D277BA8255F1, 0x0000BFF8 // E3
1327data8 0xED2CEA9BA528BCC3, 0x0000BFF9 // E1
1328LOCAL_OBJECT_END(lgammal_near_neg_half_data)
1329
1330//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1331////////////// POLYNOMIAL COEFFICIENTS FOR "NEAR ROOTS" RANGES    /////////////
1332////////////// THIS PART OF THE TABLE SHOULD BE ACCESSED RARELY   /////////////
1333//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1334LOCAL_OBJECT_START(lgammal_right_roots_polynomial_data)
1335// Polynomial coefficients for right root on [-3, -2]
1336// Lgammal is approximated by polynomial within [-.056244 ; .158208 ] range
1337data8 0xBBBD5E9DCD11030B, 0xB867411D9FF87DD4 //A0
1338data8 0x3FF83FE966AF535E, 0x3CAA21235B8A769A //A1
1339data8 0x40136EEBB002F55C, 0x3CC3959A6029838E //A2
1340data8 0xB4A5302C53C2BEDD, 0x00003FFF //A3
1341data8 0x8B8C6BE504F2DA1C, 0x00004002 //A4
1342data8 0xB99CFF02593B4D98, 0x00004001 //A5
1343data8 0x4038D32F682AA1CF //A6
1344data8 0x403809F04EE6C5B5 //A7
1345data8 0x40548EAA81634CEE //A8
1346data8 0x4059297ADB6BC03D //A9
1347data8 0x407286FB8EC5C9DA //A10
1348data8 0x407A92E05B744CFB //A11
1349data8 0x4091A9D4144258CD //A12
1350data8 0x409C4D01D24F367E //A13
1351data8 0x40B1871B9A426A83 //A14
1352data8 0x40BE51C48BD9A583 //A15
1353data8 0x40D2140D0C6153E7 //A16
1354data8 0x40E0FB2C989CE4A3 //A17
1355data8 0x40E52739AB005641 //A18
1356data8 0x41161E3E6DDF503A //A19
1357// Polynomial coefficients for right root on [-4, -3]
1358// Lgammal is approximated by polynomial within [-.172797 ; .171573 ] range
1359data8 0x3C172712B248E42E, 0x38CB8D17801A5D67 //A0
1360data8 0x401F20A65F2FAC54, 0x3CCB9EA1817A824E //A1
1361data8 0x4039D4D2977150EF, 0x3CDA42E149B6276A //A2
1362data8 0xE089B8926AE2D9CB, 0x00004005 //A3
1363data8 0x933901EBBB586C37, 0x00004008 //A4
1364data8 0xCCD319BED1CFA1CD, 0x0000400A //A5
1365data8 0x40D293C3F78D3C37 //A6
1366data8 0x40FBB97AA0B6DD02 //A7
1367data8 0x41251EA3345E5EB9 //A8
1368data8 0x415057F65C92E7B0 //A9
1369data8 0x41799C865241B505 //A10
1370data8 0x41A445209EFE896B //A11
1371data8 0x41D02D21880C953B //A12
1372data8 0x41F9FFDE8C63E16D //A13
1373data8 0x422504DC8302D2BE //A14
1374data8 0x425111BF18C95414 //A15
1375data8 0x427BCBE74A2B8EF7 //A16
1376data8 0x42A7256F59B286F7 //A17
1377data8 0x42D462D1586DE61F //A18
1378data8 0x42FBB1228D6C5118 //A19
1379// Polynomial coefficients for right root on [-5, -4]
1380// Lgammal is approximated by polynomial within [-.163171 ; .161988 ] range
1381data8 0x3C5840FBAFDEE5BB, 0x38CAC0336E8C490A //A0
1382data8 0x403ACA5CF4921642, 0x3CCEDCDDA5491E56 //A1
1383data8 0x40744415CD813F8E, 0x3CFBFEBC17E39146 //A2
1384data8 0xAACD88D954E3E1BD, 0x0000400B //A3
1385data8 0xCB68C710D75ED802, 0x0000400F //A4
1386data8 0x8130F5AB997277AC, 0x00004014 //A5
1387data8 0x41855E3DBF99EBA7 //A6
1388data8 0x41CD14FE49C49FC2 //A7
1389data8 0x421433DCE281F07D //A8
1390data8 0x425C8399C7A92B6F //A9
1391data8 0x42A45FBE67840F1A //A10
1392data8 0x42ED68D75F9E6C98 //A11
1393data8 0x433567291C27E5BE //A12
1394data8 0x437F5ED7A9D9FD28 //A13
1395data8 0x43C720A65C8AB711 //A14
1396data8 0x441120A6C1D40B9B //A15
1397data8 0x44596F561F2D1CBE //A16
1398data8 0x44A3507DA81D5C01 //A17
1399data8 0x44EF06A31E39EEDF //A18
1400data8 0x45333774C99F523F //A19
1401// Polynomial coefficients for right root on [-6, -5]
1402// Lgammal is approximated by polynomial within [-.156450 ; .156126 ] range
1403data8 0x3C71B82D6B2B3304, 0x3917186E3C0DC231 //A0
1404data8 0x405ED72E0829AE02, 0x3C960C25157980EB //A1
1405data8 0x40BCECC32EC22F9B, 0x3D5D8335A32F019C //A2
1406data8 0x929EC2B1FB931F17, 0x00004012 //A3
1407data8 0xD112EF96D37316DE, 0x00004018 //A4
1408data8 0x9F00BB9BB13416AB, 0x0000401F //A5
1409data8 0x425F7D8D5BDCB223 //A6
1410data8 0x42C9A8D00C776CC6 //A7
1411data8 0x433557FD8C481424 //A8
1412data8 0x43A209221A953EF0 //A9
1413data8 0x440EDC98D5618AB7 //A10
1414data8 0x447AABD25E367378 //A11
1415data8 0x44E73DE20CC3B288 //A12
1416data8 0x455465257B4E0BD8 //A13
1417data8 0x45C2011532085353 //A14
1418data8 0x462FEE4CC191945B //A15
1419data8 0x469C63AEEFEF0A7F //A16
1420data8 0x4709D045390A3810 //A17
1421data8 0x4778D360873C9F64 //A18
1422data8 0x47E26965BE9A682A //A19
1423// Polynomial coefficients for right root on [-7, -6]
1424// Lgammal is approximated by polynomial within [-.154582 ; .154521 ] range
1425data8 0x3C75F103A1B00A48, 0x391C041C190C726D //A0
1426data8 0x40869DE49E3AF2AA, 0x3D1C17E1F813063B //A1
1427data8 0x410FCE23484CFD10, 0x3DB6F38C2F11DAB9 //A2
1428data8 0xEF281D1E1BE2055A, 0x00004019 //A3
1429data8 0xFCE3DA92AC55DFF8, 0x00004022 //A4
1430data8 0x8E9EA838A20BD58E, 0x0000402C //A5
1431data8 0x4354F21E2FB9E0C9 //A6
1432data8 0x43E9500994CD4F09 //A7
1433data8 0x447F3A2C23C033DF //A8
1434data8 0x45139152656606D8 //A9
1435data8 0x45A8D45F8D3BF2E8 //A10
1436data8 0x463FD32110E5BFE5 //A11
1437data8 0x46D490B3BDBAE0BE //A12
1438data8 0x476AC3CAD905DD23 //A13
1439data8 0x48018558217AD473 //A14
1440data8 0x48970AF371D30585 //A15
1441data8 0x492E6273A8BEFFE3 //A16
1442data8 0x49C47CC9AE3F1073 //A17
1443data8 0x4A5D38E8C35EFF45 //A18
1444data8 0x4AF0123E89694CD8 //A19
1445// Polynomial coefficients for right root on [-8, -7]
1446// Lgammal is approximated by polynomial within [-.154217 ; .154208 ] range
1447data8 0xBCD2507D818DDD68, 0xB97F6940EA2871A0 //A0
1448data8 0x40B3B407AA387BCB, 0x3D6320238F2C43D1 //A1
1449data8 0x41683E85DAAFBAC7, 0x3E148D085958EA3A //A2
1450data8 0x9F2A95AF1E10A548, 0x00004022 //A3
1451data8 0x92F21522F482300E, 0x0000402E //A4
1452data8 0x90B51AB03A1F244D, 0x0000403A //A5
1453data8 0x44628E1C70EF534F //A6
1454data8 0x452393E2BC32D244 //A7
1455data8 0x45E5164141F4BA0B //A8
1456data8 0x46A712B3A8AF5808 //A9
1457data8 0x47698FD36CEDD0F2 //A10
1458data8 0x482C9AE6BBAA3637 //A11
1459data8 0x48F023821857C8E9 //A12
1460data8 0x49B2569053FC106F //A13
1461data8 0x4A74F646D5C1604B //A14
1462data8 0x4B3811CF5ABA4934 //A15
1463data8 0x4BFBB5DD6C84E233 //A16
1464data8 0x4CC05021086F637B //A17
1465data8 0x4D8450A345B0FB49 //A18
1466data8 0x4E43825848865DB2 //A19
1467// Polynomial coefficients for right root on [-9, -8]
1468// Lgammal is approximated by polynomial within [-.154160 ; .154158 ] range
1469data8 0x3CDF4358564F2B46, 0x397969BEE6042F81 //A0
1470data8 0x40E3B088FED67721, 0x3D82787BA937EE85 //A1
1471data8 0x41C83A3893550EF4, 0x3E542ED57E244DA8 //A2
1472data8 0x9F003C6DC56E0B8E, 0x0000402B //A3
1473data8 0x92BDF64A3213A699, 0x0000403A //A4
1474data8 0x9074F503AAD417AF, 0x00004049 //A5
1475data8 0x4582843E1313C8CD //A6
1476data8 0x467387BD6A7826C1 //A7
1477data8 0x4765074E788CF440 //A8
1478data8 0x4857004DD9D1E09D //A9
1479data8 0x4949792ED7530EAF //A10
1480data8 0x4A3C7F089A292ED3 //A11
1481data8 0x4B30125BF0AABB86 //A12
1482data8 0x4C224175195E307E //A13
1483data8 0x4D14DC4C8B32C08D //A14
1484data8 0x4E07F1DB2786197E //A15
1485data8 0x4EFB8EA1C336DACB //A16
1486data8 0x4FF03797EACD0F23 //A17
1487data8 0x50E4304A8E68A730 //A18
1488data8 0x51D3618FB2EC9F93 //A19
1489// Polynomial coefficients for right root on [-10, -9]
1490// Lgammal is approximated by polynomial within [-.154152 ; .154152 ] range
1491data8 0x3D42F34DA97ECF0C, 0x39FD1256F345B0D0 //A0
1492data8 0x4116261203919787, 0x3DC12D44055588EB //A1
1493data8 0x422EA8F32FB7FE99, 0x3ED849CE4E7B2D77 //A2
1494data8 0xE25BAF73477A57B5, 0x00004034 //A3
1495data8 0xEB021FD10060504A, 0x00004046 //A4
1496data8 0x8220A208EE206C5F, 0x00004059 //A5
1497data8 0x46B2C3903EC9DA14 //A6
1498data8 0x47D64393744B9C67 //A7
1499data8 0x48FAF79CCDC604DD //A8
1500data8 0x4A20975DB8061EBA //A9
1501data8 0x4B44AB9CBB38DB21 //A10
1502data8 0x4C6A032F60094FE9 //A11
1503data8 0x4D908103927634B4 //A12
1504data8 0x4EB516CA21D30861 //A13
1505data8 0x4FDB1BF12C58D318 //A14
1506data8 0x510180AAE094A553 //A15
1507data8 0x5226A8F2A2D45D57 //A16
1508data8 0x534E00B6B0C8B809 //A17
1509data8 0x5475022FE21215B2 //A18
1510data8 0x5596B02BF6C5E19B //A19
1511// Polynomial coefficients for right root on [-11, -10]
1512// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1513data8 0x3D7AA9C2E2B1029C, 0x3A15FB37578544DB //A0
1514data8 0x414BAF825A0C91D4, 0x3DFB9DA2CE398747 //A1
1515data8 0x4297F3EC8AE0AF03, 0x3F34208B55FB8781 //A2
1516data8 0xDD0C97D3197F56DE, 0x0000403E //A3
1517data8 0x8F6F3AF7A5499674, 0x00004054 //A4
1518data8 0xC68DA1AF6D878EEB, 0x00004069 //A5
1519data8 0x47F1E4E1E2197CE0 //A6
1520data8 0x494A8A28E597C3EB //A7
1521data8 0x4AA4175D0D35D705 //A8
1522data8 0x4BFEE6F0AF69E814 //A9
1523data8 0x4D580FE7B3DBB3C6 //A10
1524data8 0x4EB2ECE60E4608AF //A11
1525data8 0x500E04BE3E2B4F24 //A12
1526data8 0x5167F9450F0FB8FD //A13
1527data8 0x52C342BDE747603F //A14
1528data8 0x541F1699D557268C //A15
1529data8 0x557927C5F079864E //A16
1530data8 0x56D4D10FEEDB030C //A17
1531data8 0x5832385DF86AD28A //A18
1532data8 0x598898914B4D6523 //A19
1533// Polynomial coefficients for right root on [-12, -11]
1534// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1535data8 0xBD96F61647C58B03, 0xBA3ABB0C2A6C755B //A0
1536data8 0x418308A82714B70D, 0x3E1088FC6A104C39 //A1
1537data8 0x4306A493DD613C39, 0x3FB2341ECBF85741 //A2
1538data8 0x8FA8FE98339474AB, 0x00004049 //A3
1539data8 0x802CCDF570BA7942, 0x00004062 //A4
1540data8 0xF3F748AF11A32890, 0x0000407A //A5
1541data8 0x493E3B567EF178CF //A6
1542data8 0x4ACED38F651BA362 //A7
1543data8 0x4C600B357337F946 //A8
1544data8 0x4DF0F71A52B54CCF //A9
1545data8 0x4F8229F3B9FA2C70 //A10
1546data8 0x5113A4C4979B770E //A11
1547data8 0x52A56BC367F298D5 //A12
1548data8 0x543785CF31842DC0 //A13
1549data8 0x55C9FC37E3E40896 //A14
1550data8 0x575CD5D1BA556C82 //A15
1551data8 0x58F00A7AD99A9E08 //A16
1552data8 0x5A824088688B008D //A17
1553data8 0x5C15F75EF7E08EBD //A18
1554data8 0x5DA462EA902F0C90 //A19
1555// Polynomial coefficients for right root on [-13, -12]
1556// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1557data8 0x3DC3191752ACFC9D, 0x3A26CB6629532DBF //A0
1558data8 0x41BC8CFC051191BD, 0x3E68A84DA4E62AF2 //A1
1559data8 0x43797926294A0148, 0x400F345FF3723CFF //A2
1560data8 0xF26D2AF700B82625, 0x00004053 //A3
1561data8 0xA238B24A4B1F7B15, 0x00004070 //A4
1562data8 0xE793B5C0A41A264F, 0x0000408C //A5
1563data8 0x4A9585BDDACE863D //A6
1564data8 0x4C6075953448088A //A7
1565data8 0x4E29B2F38D1FC670 //A8
1566data8 0x4FF4619B079C440F //A9
1567data8 0x51C05DAE118D8AD9 //A10
1568data8 0x538A8C7F87326AD4 //A11
1569data8 0x5555B6937588DAB3 //A12
1570data8 0x5721E1F8B6E6A7DB //A13
1571data8 0x58EDA1D7A77DD6E5 //A14
1572data8 0x5AB8A9616B7DC9ED //A15
1573data8 0x5C84942AA209ED17 //A16
1574data8 0x5E518FC34C6F54EF //A17
1575data8 0x601FB3F17BCCD9A0 //A18
1576data8 0x61E61128D512FE97 //A19
1577// Polynomial coefficients for right root on [-14, -13]
1578// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1579data8 0xBE170D646421B3F5, 0xBAAD95F79FCB5097 //A0
1580data8 0x41F7328CBFCD9AC7, 0x3E743B8B1E8AEDB1 //A1
1581data8 0x43F0D0FA2DBDA237, 0x40A0422D6A227B55 //A2
1582data8 0x82082DF2D32686CC, 0x0000405F //A3
1583data8 0x8D64EE9B42E68B43, 0x0000407F //A4
1584data8 0xA3FFD82E08C5F1F1, 0x0000409F //A5
1585data8 0x4BF8C49D99123454 //A6
1586data8 0x4DFEC79DDF11342F //A7
1587data8 0x50038615A892F6BD //A8
1588data8 0x520929453DB32EF1 //A9
1589data8 0x54106A7808189A7F //A10
1590data8 0x5615A302D03C207B //A11
1591data8 0x581CC175AA736F5E //A12
1592data8 0x5A233E071147C017 //A13
1593data8 0x5C29E81917243F22 //A14
1594data8 0x5E3184B0B5AC4707 //A15
1595data8 0x6037C11DE62D8388 //A16
1596data8 0x6240787C4B1C9D6C //A17
1597data8 0x6448289235E80977 //A18
1598data8 0x664B5352C6C3449E //A19
1599// Polynomial coefficients for right root on [-15, -14]
1600// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1601data8 0x3E562C2E34A9207D, 0x3ADC00DA3DFF7A83 //A0
1602data8 0x42344C3B2F0D90AB, 0x3EB8A2E979F24536 //A1
1603data8 0x4469BFFF28B50D07, 0x41181E3D05C1C294 //A2
1604data8 0xAE38F64DCB24D9F8, 0x0000406A //A3
1605data8 0xA5C3F52C1B350702, 0x0000408E //A4
1606data8 0xA83BC857BCD67A1B, 0x000040B2 //A5
1607data8 0x4D663B4727B4D80A //A6
1608data8 0x4FA82C965B0F7788 //A7
1609data8 0x51EAD58C02908D95 //A8
1610data8 0x542E427970E073D8 //A9
1611data8 0x56714644C558A818 //A10
1612data8 0x58B3EC2040C77BAE //A11
1613data8 0x5AF72AE6A83D45B1 //A12
1614data8 0x5D3B214F611F5D12 //A13
1615data8 0x5F7FF5E49C54E92A //A14
1616data8 0x61C2E917AB765FB2 //A15
1617data8 0x64066FD70907B4C1 //A16
1618data8 0x664B3998D60D0F9B //A17
1619data8 0x689178710782FA8B //A18
1620data8 0x6AD14A66C1C7BEC3 //A19
1621// Polynomial coefficients for right root on [-16, -15]
1622// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1623data8 0xBE6D7E7192615BAE, 0xBB0137677D7CC719 //A0
1624data8 0x4273077763F6628C, 0x3F09250FB8FC8EC9 //A1
1625data8 0x44E6A1BF095B1AB3, 0x4178D5A74F6CB3B3 //A2
1626data8 0x8F8E0D5060FCC76E, 0x00004076 //A3
1627data8 0x800CC1DCFF092A63, 0x0000409E //A4
1628data8 0xF3AB0BA9D14CDA72, 0x000040C5 //A5
1629data8 0x4EDE3000A2F6D54F //A6
1630data8 0x515EC613B9C8E241 //A7
1631data8 0x53E003309FEEEA96 //A8
1632data8 0x5660ED908D7C9A90 //A9
1633data8 0x58E21E9B517B1A50 //A10
1634data8 0x5B639745E4374EE2 //A11
1635data8 0x5DE55BB626B2075D //A12
1636data8 0x606772B7506BA747 //A13
1637data8 0x62E9E581AB2E057B //A14
1638data8 0x656CBAD1CF85D396 //A15
1639data8 0x67EFF4EBD7989872 //A16
1640data8 0x6A722D2B19B7E2F9 //A17
1641data8 0x6CF5DEB3073B0743 //A18
1642data8 0x6F744AC11550B93A //A19
1643// Polynomial coefficients for right root on [-17, -16]
1644// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1645data8 0xBEDCC6291188207E, 0xBB872E3FDD48F5B7 //A0
1646data8 0x42B3076EE7525EF9, 0x3F6687A5038CA81C //A1
1647data8 0x4566A1AAD96EBCB5, 0x421F0FEDFBF548D2 //A2
1648data8 0x8F8D4D3DE9850DBA, 0x00004082 //A3
1649data8 0x800BDD6DA2CE1859, 0x000040AE //A4
1650data8 0xF3A8EC4C9CDC1CE5, 0x000040D9 //A5
1651data8 0x505E2FAFDB812628 //A6
1652data8 0x531EC5B3A7508719 //A7
1653data8 0x55E002F77E99B628 //A8
1654data8 0x58A0ED4C9B4DAE54 //A9
1655data8 0x5B621E4A8240F90C //A10
1656data8 0x5E2396E5C8849814 //A11
1657data8 0x60E55B43D8C5CE71 //A12
1658data8 0x63A7722F5D45D01D //A13
1659data8 0x6669E4E010DCE45A //A14
1660data8 0x692CBA120D5E78F6 //A15
1661data8 0x6BEFF4045350B22E //A16
1662data8 0x6EB22C9807C21819 //A17
1663data8 0x7175DE20D04617C4 //A18
1664data8 0x74344AB87C6D655F //A19
1665// Polynomial coefficients for right root on [-18, -17]
1666// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1667data8 0xBF28AEEE7B61D77C, 0xBBDBBB5FC57ABF79 //A0
1668data8 0x42F436F56B3B8A0C, 0x3FA43EE3C5C576E9 //A1
1669data8 0x45E98A22535D115D, 0x42984678BE78CC48 //A2
1670data8 0xAC176F3775E6FCFC, 0x0000408E //A3
1671data8 0xA3114F53A9FEB922, 0x000040BE //A4
1672data8 0xA4D168A8334ABF41, 0x000040EE //A5
1673data8 0x51E5B0E7EC7182BB //A6
1674data8 0x54E77D67B876EAB6 //A7
1675data8 0x57E9F7C30C09C4B6 //A8
1676data8 0x5AED29B0488614CA //A9
1677data8 0x5DF09486F87E79F9 //A10
1678data8 0x60F30B199979654E //A11
1679data8 0x63F60E02C7DCCC5F //A12
1680data8 0x66F9B8A00EB01684 //A13
1681data8 0x69FE2D3ED0700044 //A14
1682data8 0x6D01C8363C7DCC84 //A15
1683data8 0x700502B29C2F06E3 //A16
1684data8 0x730962B4500F4A61 //A17
1685data8 0x76103C6ED099192A //A18
1686data8 0x79100C7132CFD6E3 //A19
1687// Polynomial coefficients for right root on [-19, -18]
1688// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1689data8 0x3F3C19A53328A0C3, 0x3BE04ADC3FBE1458 //A0
1690data8 0x4336C16C16C16C19, 0x3FE58CE3AC4A7C28 //A1
1691data8 0x46702E85C0898B70, 0x432C922E412CEC6E //A2
1692data8 0xF57B99A1C034335D, 0x0000409A //A3
1693data8 0x82EC9634223DF909, 0x000040CF //A4
1694data8 0x94F66D7557E2EA60, 0x00004103 //A5
1695data8 0x5376118B79AE34D0 //A6
1696data8 0x56BAE7106D52E548 //A7
1697data8 0x5A00BD48CC8E25AB //A8
1698data8 0x5D4529722821B493 //A9
1699data8 0x608B1654AF31BBC1 //A10
1700data8 0x63D182CC98AEA859 //A11
1701data8 0x6716D43D5EEB05E8 //A12
1702data8 0x6A5DF884FC172E1C //A13
1703data8 0x6DA3CA7EBB97976B //A14
1704data8 0x70EA416D0BE6D2EF //A15
1705data8 0x743176C31EBB65F2 //A16
1706data8 0x7777C401A8715CF9 //A17
1707data8 0x7AC1110C6D350440 //A18
1708data8 0x7E02D0971CF84865 //A19
1709// Polynomial coefficients for right root on [-20, -19]
1710// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
1711data8 0xBFAB767F9BE21803, 0xBC5ACEF5BB1BD8B5 //A0
1712data8 0x4379999999999999, 0x4029241C7F5914C8 //A1
1713data8 0x46F47AE147AE147A, 0x43AC2979B64B9D7E //A2
1714data8 0xAEC33E1F67152993, 0x000040A7 //A3
1715data8 0xD1B71758E219616F, 0x000040DF //A4
1716data8 0x8637BD05AF6CF468, 0x00004118 //A5
1717data8 0x55065E9F80F293DE //A6
1718data8 0x588EADA78C44EE66 //A7
1719data8 0x5C15798EE22DEF09 //A8
1720data8 0x5F9E8ABFD644FA63 //A9
1721data8 0x6325FD7FE29BD7CD //A10
1722data8 0x66AFFC5C57E1F802 //A11
1723data8 0x6A3774CD7D5C0181 //A12
1724data8 0x6DC152724DE2A6FE //A13
1725data8 0x7149BB138EB3D0C2 //A14
1726data8 0x74D32FF8A70896C2 //A15
1727data8 0x785D3749F9C72BD7 //A16
1728data8 0x7BE5CCF65EBC4E40 //A17
1729data8 0x7F641A891B5FC652 //A18
1730data8 0x7FEFFFFFFFFFFFFF //A19
1731LOCAL_OBJECT_END(lgammal_right_roots_polynomial_data)
1732
1733LOCAL_OBJECT_START(lgammal_left_roots_polynomial_data)
// Coefficient table for the polynomial approximations of lgammal near its
// "left" roots. There is one coefficient set per unit interval, stored in
// order from [-3, -2] down to [-19, -18] (17 sets in total).
//
// Every set holds 20 coefficients A0..A19 with the following layout:
//   A0..A2  - two data8 words each (presumably a high/low pair of doubles
//             -- TODO confirm against the code that loads this table)
//   A3..A5  - two data8 words each: a 64-bit word followed by a 16-bit value
//             (presumably the significand and sign/exponent of an 80-bit
//             extended-precision number -- TODO confirm)
//   A6..A19 - one data8 word each (presumably IEEE double bit patterns
//             -- TODO confirm)
// so each set occupies 6*16 + 14*8 = 208 bytes.
//
// The range quoted in each per-set comment is copied from the original
// source; it is presumably the offset from the root over which the
// polynomial is used -- TODO confirm.
1734// Polynomial coefficients for left root on [-3, -2]
1735// Lgammal is approximated by polynomial within [.084641 ; -.059553 ] range
1736data8 0xBC0844590979B82E, 0xB8BC7CE8CE2ECC3B //A0
1737data8 0xBFFEA12DA904B18C, 0xBC91A6B2BAD5EF6E //A1
1738data8 0x4023267F3C265A51, 0x3CD7055481D03AED //A2
1739data8 0xA0C2D618645F8E00, 0x0000C003 //A3
1740data8 0xFA8256664F8CD2BE, 0x00004004 //A4
1741data8 0xC2C422C103F57158, 0x0000C006 //A5
1742data8 0x4084373F7CC70AF5 //A6
1743data8 0xC0A12239BDD6BB95 //A7
1744data8 0x40BDBA65E2709397 //A8
1745data8 0xC0DA2D2504DFB085 //A9
1746data8 0x40F758173CA5BF3C //A10
1747data8 0xC11506C65C267E72 //A11
1748data8 0x413318EE3A6B05FC //A12
1749data8 0xC1517767F247DA98 //A13
1750data8 0x41701237B4754D73 //A14
1751data8 0xC18DB8A03BC5C3D8 //A15
1752data8 0x41AB80953AC14A07 //A16
1753data8 0xC1C9B7B76638D0A4 //A17
1754data8 0x41EA727E3033E2D9 //A18
1755data8 0xC20812C297729142 //A19
1756//
1757// Polynomial coefficients for left root on [-4, -3]
1758// Lgammal is approximated by polynomial within [.147147 ; -.145158 ] range
1759data8 0xBC3130AE5C4F54DB, 0xB8ED23294C13398A //A0
1760data8 0xC034B99D966C5646, 0xBCE2E5FE3BC3DBB9 //A1
1761data8 0x406F76DEAE0436BD, 0x3D14974DDEC057BD //A2
1762data8 0xE929ACEA5979BE96, 0x0000C00A //A3
1763data8 0xF47C14F8A0D52771, 0x0000400E //A4
1764data8 0x88B7BC036937481C, 0x0000C013 //A5
1765data8 0x4173E8F3AB9FC266 //A6
1766data8 0xC1B7DBBE062FB11B //A7
1767data8 0x41FD2F76DE7A47A7 //A8
1768data8 0xC242225FE53B124D //A9
1769data8 0x4286D12AE2FBFA30 //A10
1770data8 0xC2CCFFC267A3C4C0 //A11
1771data8 0x431294E10008E014 //A12
1772data8 0xC357FAC8C9A2DF6A //A13
1773data8 0x439F2190AB9FAE01 //A14
1774data8 0xC3E44C1D8E8C67C3 //A15
1775data8 0x442A8901105D5A38 //A16
1776data8 0xC471C4421E908C3A //A17
1777data8 0x44B92CD4D59D6D17 //A18
1778data8 0xC4FB3A078B5247FA //A19
1779// Polynomial coefficients for left root on [-5, -4]
1780// Lgammal is approximated by polynomial within [.155671 ; -.155300 ] range
1781data8 0xBC57BF3C6E8A94C1, 0xB902FB666934AC9E //A0
1782data8 0xC05D224A3EF9E41F, 0xBCF6F5713913E440 //A1
1783data8 0x40BB533C678A3955, 0x3D688E53E3C72538 //A2
1784data8 0x869FBFF732E99B84, 0x0000C012 //A3
1785data8 0xBA9537AD61392DEC, 0x00004018 //A4
1786data8 0x89EAE8B1DEA06B05, 0x0000C01F //A5
1787data8 0x425A8C5C53458D3C //A6
1788data8 0xC2C5068B3ED6509B //A7
1789data8 0x4330FFA575E99B4E //A8
1790data8 0xC39BEC12DDDF7669 //A9
1791data8 0x44073825725F74F9 //A10
1792data8 0xC47380EBCA299047 //A11
1793data8 0x44E084DD9B666437 //A12
1794data8 0xC54C2DA6BF787ACF //A13
1795data8 0x45B82D65C8D6FA42 //A14
1796data8 0xC624D62113FE950A //A15
1797data8 0x469200CC19B45016 //A16
1798data8 0xC6FFDDC6DD938E2E //A17
1799data8 0x476DD7C07184B9F9 //A18
1800data8 0xC7D554A30085C052 //A19
1801// Polynomial coefficients for left root on [-6, -5]
1802// Lgammal is approximated by polynomial within [.157425 ; -.157360 ] range
1803data8 0x3C9E20A87C8B79F1, 0x39488BE34B2427DB //A0
1804data8 0xC08661F6A43A5E12, 0xBD3D912526D759CC //A1
1805data8 0x410F79DCB794F270, 0x3DB9BEE7CD3C1BF5 //A2
1806data8 0xEB7404450D0005DB, 0x0000C019 //A3
1807data8 0xF7AE9846DFE4D4AB, 0x00004022 //A4
1808data8 0x8AF535855A95B6DA, 0x0000C02C //A5
1809data8 0x43544D54E9FE240E //A6
1810data8 0xC3E8684E40CE6CFC //A7
1811data8 0x447DF44C1D803454 //A8
1812data8 0xC512AC305439B2BA //A9
1813data8 0x45A79226AF79211A //A10
1814data8 0xC63E0DFF7244893A //A11
1815data8 0x46D35216C3A83AF3 //A12
1816data8 0xC76903BE0C390E28 //A13
1817data8 0x48004A4DECFA4FD5 //A14
1818data8 0xC8954FBD243DB8BE //A15
1819data8 0x492BF3A31EB18DDA //A16
1820data8 0xC9C2C6A864521F3A //A17
1821data8 0x4A5AB127C62E8DA1 //A18
1822data8 0xCAECF60EF3183C57 //A19
1823// Polynomial coefficients for left root on [-7, -6]
1824// Lgammal is approximated by polynomial within [.157749 ; -.157739 ] range
1825data8 0x3CC9B9E8B8D551D6, 0x3961813C8E1E10DB //A0
1826data8 0xC0B3ABF7A5CEA91F, 0xBD55638D4BCB4CC4 //A1
1827data8 0x4168349A25504236, 0x3E0287ECE50CCF76 //A2
1828data8 0x9EC8ED6E4C219E67, 0x0000C022 //A3
1829data8 0x9279EB1B799A3FF3, 0x0000402E //A4
1830data8 0x90213EF8D9A5DBCF, 0x0000C03A //A5
1831data8 0x4462775E857FB71C //A6
1832data8 0xC52377E70B45FDBF //A7
1833data8 0x45E4F3D28EDA8C28 //A8
1834data8 0xC6A6E85571BD2D0B //A9
1835data8 0x47695BB17E74DF74 //A10
1836data8 0xC82C5AC0ED6A662F //A11
1837data8 0x48EFF8159441C2E3 //A12
1838data8 0xC9B22602C1B68AE5 //A13
1839data8 0x4A74BA8CE7B34100 //A14
1840data8 0xCB37C7E208482E4B //A15
1841data8 0x4BFB5A1D57352265 //A16
1842data8 0xCCC01CB3021212FF //A17
1843data8 0x4D841613AC3431D1 //A18
1844data8 0xCE431C9E9EE43AD9 //A19
1845// Polynomial coefficients for left root on [-8, -7]
1846// Lgammal is approximated by polynomial within [.157799 ; -.157798 ] range
1847data8 0xBCF9C7A33AD9478C, 0xB995B0470F11E5ED //A0
1848data8 0xC0E3AF76FE4C2F8B, 0xBD8DBCD503250511 //A1
1849data8 0x41C838E76CAAF0D5, 0x3E5D79F5E2E069C3 //A2
1850data8 0x9EF345992B262CE0, 0x0000C02B //A3
1851data8 0x92AE0292985FD559, 0x0000403A //A4
1852data8 0x90615420C08F7D8C, 0x0000C049 //A5
1853data8 0x45828139342CEEB7 //A6
1854data8 0xC67384066C31E2D3 //A7
1855data8 0x476502BC4DAC2C35 //A8
1856data8 0xC856FAADFF22ADC6 //A9
1857data8 0x49497243255AB3CE //A10
1858data8 0xCA3C768489520F6B //A11
1859data8 0x4B300D1EA47AF838 //A12
1860data8 0xCC223B0508AC620E //A13
1861data8 0x4D14D46583338CD8 //A14
1862data8 0xCE07E7A87AA068E4 //A15
1863data8 0x4EFB811AD2F8BEAB //A16
1864data8 0xCFF0351B51508523 //A17
1865data8 0x50E4364CCBF53100 //A18
1866data8 0xD1D33CFD0BF96FA6 //A19
1867// Polynomial coefficients for left root on [-9, -8]
1868// Lgammal is approximated by polynomial within [.157806 ; -.157806 ] range
1869data8 0x3D333E4438B1B9D4, 0x39E7B956B83964C1 //A0
1870data8 0xC11625EDFC63DCD8, 0xBDCF39625709EFAC //A1
1871data8 0x422EA8C150480F16, 0x3EC16ED908AB7EDD //A2
1872data8 0xE2598725E2E11646, 0x0000C034 //A3
1873data8 0xEAFF2346DE3EBC98, 0x00004046 //A4
1874data8 0x821E90DE12A0F05F, 0x0000C059 //A5
1875data8 0x46B2C334AE5366FE //A6
1876data8 0xC7D64314B43191B6 //A7
1877data8 0x48FAF6ED5899E01B //A8
1878data8 0xCA2096E4472AF37D //A9
1879data8 0x4B44AAF49FB7E4C8 //A10
1880data8 0xCC6A02469F2BD920 //A11
1881data8 0x4D9080626D2EFC07 //A12
1882data8 0xCEB515EDCF0695F7 //A13
1883data8 0x4FDB1AC69BF36960 //A14
1884data8 0xD1017F8274339270 //A15
1885data8 0x5226A684961BAE2F //A16
1886data8 0xD34E085C088404A5 //A17
1887data8 0x547511892FF8960E //A18
1888data8 0xD5968FA3B1ED67A9 //A19
1889// Polynomial coefficients for left root on [-10, -9]
1890// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
1891data8 0xBD355818A2B42BA2, 0xB9B7320B6A0D61EA //A0
1892data8 0xC14BAF7DA5F3770E, 0xBDE64AF9A868F719 //A1
1893data8 0x4297F3E8791F9CD3, 0x3F2A553E59B4835E //A2
1894data8 0xDD0C5F7E551BD13C, 0x0000C03E //A3
1895data8 0x8F6F0A3B2EB08BBB, 0x00004054 //A4
1896data8 0xC68D4D5AD230BA08, 0x0000C069 //A5
1897data8 0x47F1E4D8C35D1A3E //A6
1898data8 0xC94A8A191DB0A466 //A7
1899data8 0x4AA4174F65FE6AE8 //A8
1900data8 0xCBFEE6D90F94E9DD //A9
1901data8 0x4D580FD3438BE16C //A10
1902data8 0xCEB2ECD456D50224 //A11
1903data8 0x500E049F7FE64546 //A12
1904data8 0xD167F92D9600F378 //A13
1905data8 0x52C342AE2B43261A //A14
1906data8 0xD41F15DEEDA4B67E //A15
1907data8 0x55792638748AFB7D //A16
1908data8 0xD6D4D760074F6E6B //A17
1909data8 0x5832469D58ED3FA9 //A18
1910data8 0xD988769F3DC76642 //A19
1911// Polynomial coefficients for left root on [-11, -10]
1912// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
1913data8 0xBDA050601F39778A, 0xBA0D4D1CE53E8241 //A0
1914data8 0xC18308A7D8EA4039, 0xBE370C379D3EAD41 //A1
1915data8 0x4306A49380644E6C, 0x3FBBB143C0E7B5C8 //A2
1916data8 0x8FA8FB233E4AA6D2, 0x0000C049 //A3
1917data8 0x802CC9D8AEAC207D, 0x00004062 //A4
1918data8 0xF3F73EE651A37A13, 0x0000C07A //A5
1919data8 0x493E3B550A7B9568 //A6
1920data8 0xCACED38DAA060929 //A7
1921data8 0x4C600B346BAB3BC6 //A8
1922data8 0xCDF0F719193E3D26 //A9
1923data8 0x4F8229F24528B151 //A10
1924data8 0xD113A4C2D32FBBE2 //A11
1925data8 0x52A56BC13DC4474D //A12
1926data8 0xD43785CFAF5E3CE3 //A13
1927data8 0x55C9FC3EA5941202 //A14
1928data8 0xD75CD545A3341AF5 //A15
1929data8 0x58F009911F77C282 //A16
1930data8 0xDA8246294D210BEC //A17
1931data8 0x5C1608AAC32C3A8E //A18
1932data8 0xDDA446E570A397D5 //A19
1933// Polynomial coefficients for left root on [-12, -11]
1934// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
1935data8 0x3DEACBB3081C502E, 0x3A8AA6F01DEDF745 //A0
1936data8 0xC1BC8CFBFB0A9912, 0xBE6556B6504A2AE6 //A1
1937data8 0x43797926206941D7, 0x40289A9644C2A216 //A2
1938data8 0xF26D2A78446D0839, 0x0000C053 //A3
1939data8 0xA238B1D937FFED38, 0x00004070 //A4
1940data8 0xE793B4F6DE470538, 0x0000C08C //A5
1941data8 0x4A9585BDC44DC45D //A6
1942data8 0xCC60759520342C47 //A7
1943data8 0x4E29B2F3694C0404 //A8
1944data8 0xCFF4619AE7B6BBAB //A9
1945data8 0x51C05DADF52B89E8 //A10
1946data8 0xD38A8C7F48819A4A //A11
1947data8 0x5555B6932D687860 //A12
1948data8 0xD721E1FACB6C1B5B //A13
1949data8 0x58EDA1E2677C8F91 //A14
1950data8 0xDAB8A8EC523C1F71 //A15
1951data8 0x5C84930133F30411 //A16
1952data8 0xDE51952FDFD1EC49 //A17
1953data8 0x601FCCEC1BBD25F1 //A18
1954data8 0xE1E5F2D76B610920 //A19
1955// Polynomial coefficients for left root on [-13, -12]
1956// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
1957data8 0xBE01612F373268ED, 0xBA97B7A18CDF103B //A0
1958data8 0xC1F7328CBF7A4FAC, 0xBE89A25A6952F481 //A1
1959data8 0x43F0D0FA2DBDA237, 0x40A0422EC1CE6084 //A2
1960data8 0x82082DF2D32686C5, 0x0000C05F //A3
1961data8 0x8D64EE9B42E68B36, 0x0000407F //A4
1962data8 0xA3FFD82E08C630C9, 0x0000C09F //A5
1963data8 0x4BF8C49D99123466 //A6
1964data8 0xCDFEC79DDF1119ED //A7
1965data8 0x50038615A892D242 //A8
1966data8 0xD20929453DC8B537 //A9
1967data8 0x54106A78083BA1EE //A10
1968data8 0xD615A302C69E27B2 //A11
1969data8 0x581CC175870FF16F //A12
1970data8 0xDA233E0979E12B74 //A13
1971data8 0x5C29E822BC568C80 //A14
1972data8 0xDE31845DB5340FBC //A15
1973data8 0x6037BFC6D498D5F9 //A16
1974data8 0xE2407D92CD613E82 //A17
1975data8 0x64483B9B62367EB7 //A18
1976data8 0xE64B2DC830E8A799 //A19
1977// Polynomial coefficients for left root on [-14, -13]
1978// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
1979data8 0x3E563D0B930B371F, 0x3AE779957E14F012 //A0
1980data8 0xC2344C3B2F083767, 0xBEC0B7769AA3DD66 //A1
1981data8 0x4469BFFF28B50D07, 0x41181E3F13ED2401 //A2
1982data8 0xAE38F64DCB24D9EE, 0x0000C06A //A3
1983data8 0xA5C3F52C1B3506F2, 0x0000408E //A4
1984data8 0xA83BC857BCD6BA92, 0x0000C0B2 //A5
1985data8 0x4D663B4727B4D81A //A6
1986data8 0xCFA82C965B0F62E9 //A7
1987data8 0x51EAD58C02905B71 //A8
1988data8 0xD42E427970FA56AD //A9
1989data8 0x56714644C57D8476 //A10
1990data8 0xD8B3EC2037EC95F2 //A11
1991data8 0x5AF72AE68BBA5B3D //A12
1992data8 0xDD3B2152C67AA6B7 //A13
1993data8 0x5F7FF5F082861B8B //A14
1994data8 0xE1C2E8BE125A5B7A //A15
1995data8 0x64066E92FE9EBE7D //A16
1996data8 0xE64B4201CDF9F138 //A17
1997data8 0x689186351E58AA88 //A18
1998data8 0xEAD132A585DFC60A //A19
1999// Polynomial coefficients for left root on [-15, -14]
2000// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
2001data8 0xBE6D7DDE12700AC1, 0xBB1E025BF1667FB5 //A0
2002data8 0xC273077763F60AD5, 0xBF2A1698184C7A9A //A1
2003data8 0x44E6A1BF095B1AB3, 0x4178D5AE8A4A2874 //A2
2004data8 0x8F8E0D5060FCC767, 0x0000C076 //A3
2005data8 0x800CC1DCFF092A57, 0x0000409E //A4
2006data8 0xF3AB0BA9D14D37D1, 0x0000C0C5 //A5
2007data8 0x4EDE3000A2F6D565 //A6
2008data8 0xD15EC613B9C8C800 //A7
2009data8 0x53E003309FEECCAA //A8
2010data8 0xD660ED908D8B15C4 //A9
2011data8 0x58E21E9B51A1C4AE //A10
2012data8 0xDB639745DB82210D //A11
2013data8 0x5DE55BB60C68FCF6 //A12
2014data8 0xE06772BA3FCA23C6 //A13
2015data8 0x62E9E58B4F702C31 //A14
2016data8 0xE56CBA49B071ABE2 //A15
2017data8 0x67EFF31E4F2BA36A //A16
2018data8 0xEA7232C8804F32C3 //A17
2019data8 0x6CF5EFEE929A0928 //A18
2020data8 0xEF742EE03EC3E8FF //A19
2021// Polynomial coefficients for left root on [-16, -15]
2022// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
2023data8 0xBEDCC628FEAC7A1B, 0xBB80582C8BEBB198 //A0
2024data8 0xC2B3076EE752595E, 0xBF5388F55AFAE53E //A1
2025data8 0x4566A1AAD96EBCB5, 0x421F0FEFE2444293 //A2
2026data8 0x8F8D4D3DE9850DB2, 0x0000C082 //A3
2027data8 0x800BDD6DA2CE184C, 0x000040AE //A4
2028data8 0xF3A8EC4C9CDC7A43, 0x0000C0D9 //A5
2029data8 0x505E2FAFDB81263F //A6
2030data8 0xD31EC5B3A7506CD9 //A7
2031data8 0x55E002F77E999810 //A8
2032data8 0xD8A0ED4C9B5C2900 //A9
2033data8 0x5B621E4A8267C401 //A10
2034data8 0xDE2396E5BFCFDA7A //A11
2035data8 0x60E55B43BE6F9A79 //A12
2036data8 0xE3A772324C7405FA //A13
2037data8 0x6669E4E9B7E57A2D //A14
2038data8 0xE92CB989F8A8FB37 //A15
2039data8 0x6BEFF2368849A36E //A16
2040data8 0xEEB23234FE191D55 //A17
2041data8 0x7175EF5D1080B105 //A18
2042data8 0xF4342ED7B1B7BE31 //A19
2043// Polynomial coefficients for left root on [-17, -16]
2044// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
2045data8 0xBF28AEEE7B58C790, 0xBBC4448DE371FA0A //A0
2046data8 0xC2F436F56B3B89B1, 0xBF636755245AC63A //A1
2047data8 0x45E98A22535D115D, 0x4298467DA93DB784 //A2
2048data8 0xAC176F3775E6FCF2, 0x0000C08E //A3
2049data8 0xA3114F53A9FEB908, 0x000040BE //A4
2050data8 0xA4D168A8334AFE5A, 0x0000C0EE //A5
2051data8 0x51E5B0E7EC7182CF //A6
2052data8 0xD4E77D67B876D6B4 //A7
2053data8 0x57E9F7C30C098C83 //A8
2054data8 0xDAED29B0489EF7A7 //A9
2055data8 0x5DF09486F8A524B8 //A10
2056data8 0xE0F30B19910A2393 //A11
2057data8 0x63F60E02AB3109F4 //A12
2058data8 0xE6F9B8A3431854D5 //A13
2059data8 0x69FE2D4A6D94218E //A14
2060data8 0xED01C7E272A73560 //A15
2061data8 0x7005017D82B186B6 //A16
2062data8 0xF3096A81A69BD8AE //A17
2063data8 0x76104951BAD67D5C //A18
2064data8 0xF90FECC99786FD5B //A19
2065// Polynomial coefficients for left root on [-18, -17]
2066// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
2067data8 0x3F3C19A53328E26A, 0x3BE238D7BA036B3B //A0
2068data8 0xC336C16C16C16C13, 0xBFEACE245DEC56F3 //A1
2069data8 0x46702E85C0898B70, 0x432C922B64FD1DA4 //A2
2070data8 0xF57B99A1C0343350, 0x0000C09A //A3
2071data8 0x82EC9634223DF90D, 0x000040CF //A4
2072data8 0x94F66D7557E3237D, 0x0000C103 //A5
2073data8 0x5376118B79AE34D6 //A6
2074data8 0xD6BAE7106D52CE49 //A7
2075data8 0x5A00BD48CC8E11AB //A8
2076data8 0xDD4529722833E2DF //A9
2077data8 0x608B1654AF5F46AF //A10
2078data8 0xE3D182CC90D8723F //A11
2079data8 0x6716D43D46706AA0 //A12
2080data8 0xEA5DF888C5B428D3 //A13
2081data8 0x6DA3CA85888931A6 //A14
2082data8 0xF0EA40EF2AC7E070 //A15
2083data8 0x743175D1A251AFCD //A16
2084data8 0xF777CB6E2B550D73 //A17
2085data8 0x7AC11E468A134A51 //A18
2086data8 0xFE02B6BDD0FC40AA //A19
2087// Polynomial coefficients for left root on [-19, -18]
2088// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
2089data8 0xBFAB767F9BE217FC, 0xBC4A5541CE0D8D0D //A0
2090data8 0xC379999999999999, 0xC01A84981B490BE8 //A1
2091data8 0x46F47AE147AE147A, 0x43AC2987BBC466EB //A2
2092data8 0xAEC33E1F67152987, 0x0000C0A7 //A3
2093data8 0xD1B71758E2196153, 0x000040DF //A4
2094data8 0x8637BD05AF6D420E, 0x0000C118 //A5
2095data8 0x55065E9F80F293B2 //A6
2096data8 0xD88EADA78C44BFA7 //A7
2097data8 0x5C15798EE22EC6CD //A8
2098data8 0xDF9E8ABFD67895CF //A9
2099data8 0x6325FD7FE13B0DE0 //A10
2100data8 0xE6AFFC5C3DE70858 //A11
2101data8 0x6A3774CE81C70D43 //A12
2102data8 0xEDC1527412D8129F //A13
2103data8 0x7149BABCDA8B7A72 //A14
2104data8 0xF4D330AD49071BB5 //A15
2105data8 0x785D4046F4C5F1FD //A16
2106data8 0xFBE59BFEDBA73FAF //A17
2107data8 0x7F64BEF2B2EC8DA1 //A18
2108data8 0xFFEFFFFFFFFFFFFF //A19
2109LOCAL_OBJECT_END(lgammal_left_roots_polynomial_data)
2110
2111
2112//==============================================================
2113// Code
2114//==============================================================
2115
2116.section .text
2117GLOBAL_LIBM_ENTRY(__libm_lgammal)
2118{ .mfi
2119      getf.exp           rSignExpX = f8
2120      // Test x for NaTVal, NaN, +/-0, +/-INF, denormals
2121      fclass.m           p6,p0  = f8,0x1EF
2122      addl               r17Ones = 0x1FFFF, r0 // exponent mask
2123}
2124{ .mfi
2125      addl               GR_ad_z_1 = @ltoff(Constants_Z_1#),gp
2126      fcvt.fx.s1         fXint = f8 // Convert arg to int (int repres. in FR)
2127      adds               rDelta = 0x3FC, r0
2128}
2129;;
2130{ .mfi
2131      getf.sig           rSignifX = f8
2132      fcmp.lt.s1         p15, p14 = f8, f0
2133      shl                rDelta = rDelta, 20 // single precision 1.5
2134}
2135{ .mfi
2136      ld8                GR_ad_z_1 = [GR_ad_z_1]// get pointer to Constants_Z_1
2137      fma.s1             fTwo = f1, f1, f1      // 2.0
2138      addl               rExp8 = 0x10002, r0    // exponent of 8.0
2139}
2140;;
2141{ .mfi
2142      alloc              rPFS_SAVED = ar.pfs, 0, 34, 4, 0 // get some registers
2143      fmerge.s           fAbsX = f1, f8                   // |x|
2144      and                rExpX = rSignExpX, r17Ones       // mask sign bit
2145}
2146{ .mib
2147      addl               rExpHalf = 0xFFFE, r0 // exponent of 0.5
2148      addl               rExp2 = 0x10000, r0 // exponent of 2.0
2149      // branch out if x is NaTVal, NaN, +/-0, +/-INF, or denormalized number
2150(p6)  br.cond.spnt       lgammal_spec
2151}
2152;;
2153_deno_back_to_main_path:
2154{ .mfi
2155      // Point to Constants_G_H_h1
2156      add                rTbl1Addr = 0x040, GR_ad_z_1
2157      frcpa.s1           fRcpX, p0 = f1, f8 // initial approximation of 1/x
2158      extr.u             GR_Index1 = rSignifX, 59, 4
2159}
2160{ .mib
2161(p14) cmp.ge.unc         p8, p0 = rExpX, rExp8 // p8 = 1 if x >= 8.0
2162      adds               rZ625 = 0x3F2, r0
2163(p8)  br.cond.spnt       lgammal_big_positive // branch out if x >= 8.0
2164}
2165;;
2166{ .mfi
2167      shladd             rZ1offsett = GR_Index1, 2, GR_ad_z_1  // Point to Z_1
2168      fmerge.se          fSignifX =  f1, f8 // significand of x
2169      // Get high 15 bits of significand
2170      extr.u             GR_X_0 = rSignifX, 49, 15
2171}
2172{ .mib
2173      cmp.lt.unc         p9, p0 = rExpX, rExpHalf // p9 = 1 if |x| < 0.5
2174      // set p11 if 2 <= x < 4
2175(p14) cmp.eq.unc         p11, p0 = rExpX, rExp2
2176(p9)  br.cond.spnt       lgammal_0_half // branch out if |x| < 0.5
2177}
2178;;
2179{ .mfi
2180      ld4                GR_Z_1 = [rZ1offsett] // Load Z_1
2181      fms.s1             fA5L = f1, f1, f8 // for 0.75 <= x < 1.3125 path
2182      shl                rZ625 = rZ625, 20 // single precision 0.625
2183}
2184{ .mib
2185      setf.s             FR_MHalf = rDelta
2186      // set p10 if x >= 4.0
2187(p14) cmp.gt.unc         p10, p0 = rExpX, rExp2
2188      // branch to special path for 4.0 <= x < 8
2189(p10) br.cond.spnt       lgammal_4_8
2190}
2191;;
2192{ .mfi
2193      // for 1.3125 <= x < 1.5625 path
2194      addl               rPolDataPtr= @ltoff(lgammal_loc_min_data),gp
2195      // argument of polynomial approximation for 1.5625 <= x < 2.25
2196      fms.s1             fB4 = f8, f1, fTwo
2197      cmp.eq             p12, p0 = rExpX, rExpHalf
2198}
2199{ .mib
2200      addl               rExpOne = 0xFFFF, r0 // exponent of 1.0
2201     // set p11 if significand of x >= 1.125
2202(p11) cmp.le             p11, p0 = 2, GR_Index1
2203(p11) br.cond.spnt       lgammal_2Q_4
2204}
2205;;
2206{ .mfi
2207      // point to xMin for 1.3125 <= x < 1.5625 path
2208      ld8                rPolDataPtr = [rPolDataPtr]
2209      fcvt.xf            fFltIntX = fXint // RTN(x)
2210(p14) cmp.eq.unc         p13, p7 = rExpX, rExpOne // p13 set if 1.0 <= x < 2.0
2211}
2212{ .mib
2213      setf.s             FR_FracX = rZ625
2214      // set p12 if |x| < 0.75
2215(p12) cmp.gt.unc         p12, p0 = 8, GR_Index1
2216      // branch out to special path for |x| < 0.75
2217(p12) br.cond.spnt       lgammal_half_3Q
2218}
2219;;
2220.pred.rel "mutex", p7, p13
2221{ .mfi
2222      getf.sig           rXRnd = fXint // integer part of the input value
2223      fnma.s1            fInvX = f8, fRcpX, f1 // start of 1st NR iteration
2224      // Get bits 30-15 of X_0 * Z_1
2225      pmpyshr2.u         GR_X_1 = GR_X_0,GR_Z_1,15
2226}
2227{ .mib
2228(p7)  cmp.eq             p6, p0 = rExpX, rExp2 // p6 set if 2.0 <= x < 2.25
2229(p13) cmp.le             p6, p0 = 9, GR_Index1
2230      // branch to special path 1.5625 <= x < 2.25
2231(p6)  br.cond.spnt       lgammal_13Q_2Q
2232}
2233;;
2234//
2235//    For performance, don't use result of pmpyshr2.u for 4 cycles.
2236//
2237{ .mfi
2238      shladd             GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr // Point to G_1
2239      fma.s1             fSix = fTwo, fTwo, fTwo // 6.0
2240      add                GR_ad_q = -0x60, GR_ad_z_1   // Point to Constants_Q
2241}
2242{ .mib
2243      add                rTmpPtr3 = -0x50, GR_ad_z_1
2244(p13) cmp.gt             p7, p0 = 5, GR_Index1
2245      // branch to special path 0.75 <= x < 1.3125
2246(p7)  br.cond.spnt       lgammal_03Q_1Q
2247}
2248;;
2249{ .mfi
2250      add                rTmpPtr = 8, GR_ad_tbl_1
2251      fma.s1             fRoot = f8, f1, f1 // x + 1
2252       // Absolute value of int arg. Will be used as index in table with roots
2253      sub                rXRnd = r0, rXRnd
2254}
2255{ .mib
2256      ldfe               fA5L = [rPolDataPtr], 16 // xMin
2257      addl               rNegSingularity = 0x3003E, r0
2258(p14) br.cond.spnt       lgammal_loc_min
2259}
2260;;
2261{ .mfi // Fall-through path (the comments below treat x as negative): table loads for log(|x|) and singularity checks
2262      ldfps              FR_G, FR_H = [GR_ad_tbl_1], 8 // Load G_1, H_1
2263      nop.f              0
2264      add                rZ2Addr = 0x140, GR_ad_z_1  // Point to Constants_Z_2
2265}
2266{ .mib
2267      ldfd               FR_h = [rTmpPtr] // Load h_1
2268      // If arg is less or equal to -2^63
2269      cmp.geu.unc        p8,p0 = rSignExpX, rNegSingularity
2270      // Singularity for x < -2^63 since all such arguments are integers
2271      // branch to special code which deals with singularity
2272(p8)  br.cond.spnt       lgammal_singularity
2273}
2274;;
2275{ .mfi
2276      ldfe               FR_log2_hi = [GR_ad_q], 32 // Load log2_hi
2277      nop.f              0
2278      extr.u             GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
2279}
2280{ .mfi
2281      ldfe               FR_log2_lo = [rTmpPtr3], 32 // Load log2_lo
2282      fms.s1             fDx = f8, f1, fFltIntX // x - RTN(x)
2283      // index in table with roots and bounds
2284      adds               rXint = -2, rXRnd // |RTN(x)| - 2
2285}
2286;;
2287{ .mfi
2288      ldfe               FR_Q4 = [GR_ad_q], 32      // Load Q4
2289      nop.f              0
2290      // set p12 if x may be close to negative root: -19.5 < x < -2.0
2291      cmp.gtu            p12, p0 = 18, rXint
2292}
2293{ .mfi
2294      shladd             GR_ad_z_2 = GR_Index2, 2, rZ2Addr  // Point to Z_2
2295      fma.s1             fRcpX = fInvX, fRcpX, fRcpX // end of 1st NR iteration
2296      // Point to Constants_G_H_h2
2297      add                rTbl2Addr = 0x180, GR_ad_z_1
2298}
2299;;
2300{ .mfi
2301      shladd             GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
2302      // set p9 if x is integer and negative
2303      fcmp.eq.s1         p9, p0 = f8,fFltIntX
2304      // Point to Constants_G_H_h3
2305      add                rTbl3Addr = 0x280, GR_ad_z_1
2306}
2307{ .mfi
2308      ld4                GR_Z_2 = [GR_ad_z_2] // Load Z_2
2309      nop.f              0
2310      sub                GR_N = rExpX, rExpHalf, 1 // N = rExpX - rExpHalf - 1: unbiased exponent of x
2311}
2312;;
2313{ .mfi
2314      ldfe               FR_Q3 = [rTmpPtr3], 32 // Load Q3
2315      nop.f              0
2316      // Point to lnsin polynomial coefficients
2317      adds               rLnSinDataPtr = 864, rTbl3Addr
2318}
2319{ .mfi
2320      ldfe               FR_Q2 = [GR_ad_q],32 // Load Q2
2321      nop.f              0
2322      add                rTmpPtr = 8, GR_ad_tbl_2 // Point to h_2
2323}
2324;;
2325{ .mfi
2326      ldfe               FR_Q1 = [rTmpPtr3] // Load Q1
2327      fcmp.lt.s1         p0, p15 = fAbsX, fSix // p15 is set when fAbsX >= 6.0 (x <= -6.0)
2328      // point to table with roots and bounds
2329      adds               rRootsBndAddr = -1296, GR_ad_z_1
2330}
2331{ .mfb
2332      // Put integer N into rightmost significand
2333      setf.sig           fFloatN = GR_N
2334      fma.s1             fThirteen = fSix, fTwo, f1 // 13.0
2335      // Singularity if -2^63 < x < 0 and x is integer
2336      // branch to special code which deals with singularity
2337(p9)  br.cond.spnt       lgammal_singularity
2338}
2339;;
2340{ .mfi
2341      ldfps              FR_G2, FR_H2 = [GR_ad_tbl_2]  // Load G_2, H_2
2342      // y = |x|/2^(exponent(x)) - 1.5
2343      fms.s1             FR_FracX = fSignifX, f1, FR_MHalf
2344      // Get bits 30-15 of X_1 * Z_2
2345      pmpyshr2.u         GR_X_2 = GR_X_1,GR_Z_2,15
2346}
2347{ .mfi
2348      ldfd               FR_h2 = [rTmpPtr] // Load h_2
2349      fma.s1             fDxSqr = fDx, fDx, f0 // deltaX^2
2350      adds               rTmpPtr3 = 128, rLnSinDataPtr // Point to LnSin18
2351}
2352;;
2353//
2354//    For performance, don't use result of pmpyshr2.u for 4 cycles.
2355//
2356{ .mfi
2357      getf.exp           rRoot = fRoot // sign and biased exponent of (x + 1)
2358      nop.f              0
2359      // set p6 if -4 < x <= -2
2360      cmp.eq             p6, p0 = rExpX, rExp2
2361}
2362{ .mfi
2363      ldfpd              fLnSin2, fLnSin2L = [rLnSinDataPtr], 16
2364      fnma.s1            fInvX = f8, fRcpX, f1 // start of 2nd NR iteration
2365      sub                rIndexPol = rExpX, rExpHalf // index of polynomial
2366}
2367;;
2368{ .mfi
2369      ldfe               fLnSin4 = [rLnSinDataPtr], 96
2370      // p10 is set if x is potential "right" root
2371      // p11 set for possible "left" root
2372      fcmp.lt.s1         p10, p11 = fDx, f0 // p10: x < RTN(x); p11: otherwise
2373      shl                rIndexPol = rIndexPol, 6  // (i*16)*4
2374}
2375{ .mfi
2376      ldfpd              fLnSin18, fLnSin20 = [rTmpPtr3], 16
2377      nop.f              0
2378      mov                rExp2tom7 = 0x0fff8 // Exponent of 2^-7
2379}
2380;;
2381{ .mfi
2382      getf.sig           rSignifDx = fDx // Get significand of DeltaX = x - RTN(x)
2383      nop.f              0
2384      // set p6 if -4 < x <= -3.0
2385(p6)  cmp.le.unc         p6, p0 = 0x8, GR_Index1
2386}
2387{ .mfi
2388      ldfpd              fLnSin22, fLnSin24 = [rTmpPtr3], 16
2389      nop.f              0
2390      // mask sign bit in the exponent of (x + 1)
2391      and                rRoot = rRoot, r17Ones
2392}
2393;;
2394{ .mfi
2395      ldfe               fLnSin16 = [rLnSinDataPtr], -80
2396      nop.f              0
2397      extr.u             GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
2398}
2399{ .mfi
2400      ldfpd              fLnSin26, fLnSin28 = [rTmpPtr3], 16
2401      nop.f              0
2402      and                rXRnd = 1, rXRnd // keep only the parity bit of |RTN(x)|; selects the sign of GAMMA(x) later
2403}
2404;;
2405{ .mfi
2406      shladd             GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
2407      fms.s1             fDxSqrL = fDx, fDx, fDxSqr // low part of deltaX^2
2408      // potential "left" root
2409(p11) adds               rRootsBndAddr = 560, rRootsBndAddr
2410}
2411{ .mib
2412      ldfpd              fLnSin30, fLnSin32 = [rTmpPtr3], 16
2413      // set p7 if |x+1| < 2^-7
2414      cmp.lt             p7, p0 =  rRoot, rExp2tom7
2415      // branch to special path for |x+1| < 2^-7
2416(p7)  br.cond.spnt       _closeToNegOne
2417}
2418;;
2419{ .mfi
2420      ldfps              FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3
2421      fcmp.lt.s1         p14, p0 = fAbsX, fThirteen // set p14 if x > -13.0
2422      // base address of polynomial on range [-6.0, -0.75]
2423      adds               rPolDataPtr = 3440, rTbl3Addr
2424}
2425{ .mfi
2426      // (i*64)*4 + (i*64) = i*320 - offset of polynomial on range [-6.0, -0.75]
2427      shladd             rTmpPtr = rIndexPol, 2, rIndexPol
2428      fma.s1             fXSqr = FR_FracX, FR_FracX, f0 // y^2
2429      // point to left "near root" bound
2430(p12) shladd             rRootsBndAddr = rXint, 4, rRootsBndAddr
2431}
2432;;
2433{ .mfi
2434      ldfpd              fLnSin34, fLnSin36 = [rTmpPtr3], 16
2435      fma.s1             fRcpX = fInvX, fRcpX, fRcpX // end of 2nd NR iteration
2436      // add special offset if -4 < x <= -3.0
2437(p6)  adds               rPolDataPtr = 640, rPolDataPtr
2438}
2439{ .mfi
2440      // point to right "near root" bound
2441      adds               rTmpPtr2 = 8, rRootsBndAddr
2442      fnma.s1            fMOne = f1, f1, f0 // -1.0
2443      // Point to Bernoulli numbers
2444      adds               rBernulliPtr = 544, rTbl3Addr
2445}
2446;;
2447{ .mfi
2448      // left bound of "near root" range
2449(p12) ld8                rLeftBound = [rRootsBndAddr]
2450      fmerge.se          fNormDx = f1, fDx // significand of DeltaX
2451      // base address + offset for polynomial coeff. on range [-6.0, -0.75]
2452      add                rPolDataPtr = rPolDataPtr, rTmpPtr
2453}
2454{ .mfi
2455      // right bound of "near root" range
2456(p12) ld8                rRightBound = [rTmpPtr2]
2457      fcvt.xf            fFloatN = fFloatN
2458      // special "Bernoulli" numbers for Stirling's formula for -13 < x < -6
2459(p14) adds               rBernulliPtr = 160, rBernulliPtr
2460}
2461;;
2462{ .mfi
2463      ldfd               FR_h3 = [GR_ad_tbl_3] // Load h_3
2464      fmpy.s1            FR_G = FR_G, FR_G2 // G = G_1 * G_2
2465      adds               rTmpPtr3 = -160, rTmpPtr3 // NOTE(review): overwritten before any read on the fall-through path; presumably for _negStirling
2466}
2467{ .mfb
2468      adds               rTmpPtr = 80, rPolDataPtr // Point to A5
2469      fadd.s1            FR_H = FR_H, FR_H2 // H = H_1 + H_2
2470      // p15 is set if -2^63 < x < -6.0 and x is not an integer
2471      // branch to path with implementation using Stirling's formula for neg. x
2472(p15) br.cond.spnt       _negStirling
2473}
2474;;
2475{ .mfi // Load polynomial coefficients, start the LnSin series, test the "near root" and "x close to -2.5" ranges
2476      ldfpd              fA3, fA3L = [rPolDataPtr], 16 // A3
2477      fma.s1             fDelX4 = fDxSqr, fDxSqr, f0 // deltaX^4
2478      // Get high 4 bits of signif
2479      extr.u             rIndex1Dx = rSignifDx, 59, 4
2480}
2481{ .mfi
2482      ldfe               fA5 = [rTmpPtr], -16 // A5
2483      fadd.s1            FR_h = FR_h, FR_h2 // h = h_1 + h_2
2484      adds               rLnSinTmpPtr = 16, rLnSinDataPtr // second pointer, 16 bytes past rLnSinDataPtr
2485}
2486;;
2487{ .mfi
2488      ldfpd              fA0, fA0L = [rPolDataPtr], 16 // A0
2489      fma.s1             fLnSin20 = fLnSin20, fDxSqr, fLnSin18 // LnSin20*deltaX^2 + LnSin18
2490      // Get high 15 bits of significand
2491      extr.u             rX0Dx = rSignifDx, 49, 15
2492}
2493{ .mfi
2494      ldfe               fA4 = [rTmpPtr], 192 // A4
2495      fms.s1             fXSqrL = FR_FracX, FR_FracX, fXSqr // low part of y^2
2496      shladd             GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1 (GR_ad_z_1 no longer holds the table base)
2497}
2498;;
2499{ .mfi
2500      ldfpd              fA1, fA1L = [rPolDataPtr], 16 // A1
2501      fma.s1             fX4 = fXSqr, fXSqr, f0 // y^4
2502      adds               rTmpPtr2 = 32, rTmpPtr // Point to A22
2503}
2504{ .mfi
2505      ldfpd              fA18, fA19 = [rTmpPtr], 16 // A18, A19
2506      fma.s1             fLnSin24 = fLnSin24, fDxSqr, fLnSin22 // LnSin24*deltaX^2 + LnSin22
2507      nop.i              0
2508}
2509;;
2510{ .mfi
2511      ldfe               fLnSin6 = [rLnSinDataPtr], 32
2512      fma.s1             fLnSin28 = fLnSin28, fDxSqr, fLnSin26 // LnSin28*deltaX^2 + LnSin26
2513      nop.i              0
2514}
2515{ .mfi
2516      ldfe               fLnSin8 = [rLnSinTmpPtr], 32
2517      nop.f              0
2518      nop.i              0
2519}
2520;;
2521{ .mfi
2522      ldfpd              fA20, fA21 = [rTmpPtr], 16 // A20, A21
2523      fma.s1             fLnSin32 = fLnSin32, fDxSqr, fLnSin30 // LnSin32*deltaX^2 + LnSin30
2524      nop.i              0
2525}
2526{ .mfi
2527      ldfpd              fA22, fA23 = [rTmpPtr2], 16 // A22, A23
2528      fma.s1             fB20 = f1, f1, FR_MHalf // 2.5 (not read on the fall-through path before fB20 is rewritten)
2529(p12) cmp.ltu.unc        p6, p0 = rSignifX, rLeftBound // p6 = 1 if p12 and rSignifX is below the left bound
2530}
2531;;
2532{ .mfi
2533      ldfpd              fA2, fA2L = [rPolDataPtr], 16 // A2
2534      fmpy.s1            FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
2535      // set p6 if x falls in "near root" range
2536(p6)  cmp.geu.unc        p6, p0 = rSignifX, rRightBound
2537}
2538{ .mfb
2539      adds               rTmpPtr3 = -64, rTmpPtr // Point to A14
2540      fadd.s1            FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
2541      // branch to special path if x falls in "near root" range
2542(p6)  br.cond.spnt       _negRoots
2543}
2544;;
2545{ .mfi
2546      ldfpd              fA24, fA25 = [rTmpPtr2], 16 // A24, A25
2547      fma.s1             fLnSin36 = fLnSin36, fDxSqr, fLnSin34 // LnSin36*deltaX^2 + LnSin34
2548(p11) cmp.eq.unc         p7, p0 = 1,rXint // p7 set if  -3.0 < x < -2.5
2549}
2550{ .mfi
2551      adds               rTmpPtr = -48, rTmpPtr // Point to A16
2552      fma.s1             fLnSin20 = fLnSin20, fDxSqr, fLnSin16 // (LnSin20*deltaX^2 + LnSin18)*deltaX^2 + LnSin16
2553      addl               rDelta = 0x5338, r0 // 15-bit significand of 2.6005859375 (0x5338 / 2^13)
2554}
2555;;
2556{ .mfi
2557      getf.exp           GR_N =  fDx // Get N = exponent of DeltaX
2558      fma.s1             fX6 = fX4, fXSqr, f0 // y^6
2559      // p7 set if -2.6005859375 < x < -2.5
2560(p7)  cmp.gt.unc         p7, p0 = rDelta, GR_X_0
2561}
2562{ .mfb
2563      ld4                GR_Z_1 = [GR_ad_z_1] // Load Z_1
2564      fma.s1             fDelX8 = fDelX4, fDelX4, f0 // deltaX^8
2565      // branch to special path for -2.6005859375 < x < -2.5
2566(p7)  br.cond.spnt       _neg2andHalf
2567}
2568;;
2569{ .mfi // Remaining coefficient loads, table lookup for log(|DeltaX|), sign of GAMMA(x) stored through rSgnGamAddr
2570      ldfpd              fA14, fA15 = [rTmpPtr3], 16 // A14, A15
2571      fadd.s1            FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
2572      adds               rTmpPtr2 = 128 , rPolDataPtr // Point to A12
2573}
2574{ .mfi
2575      ldfpd              fA16, fA17 = [rTmpPtr], 16 // A16, A17
2576      fma.s1             fLnSin28 = fLnSin28, fDelX4, fLnSin24 // LnSin22..LnSin28 part
2577      adds               rPolDataPtr = 144 , rPolDataPtr // Point to A13
2578}
2579;;
2580{ .mfi
2581      ldfe               fLnSin10 = [rLnSinDataPtr], 32
2582      fma.s1             fRes1H = fA3, FR_FracX, f0 // (A3*y)hi
2583      and                GR_N = GR_N, r17Ones // mask sign bit
2584}
2585{ .mfi
2586      ldfe               fLnSin12 = [rLnSinTmpPtr]
2587      fma.s1             fDelX6 = fDxSqr, fDelX4, f0 // DeltaX^6
2588      shladd             GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1
2589}
2590;;
2591{ .mfi
2592      ldfe               fA13 = [rPolDataPtr], -32 // A13
2593      fma.s1             fA4 = fA5, FR_FracX, fA4   // A5*y + A4
2594      // Get bits 30-15 of X_0 * Z_1
2595      pmpyshr2.u         GR_X_1 = rX0Dx, GR_Z_1, 15
2596}
2597{ .mfi
2598      ldfe               fA12 = [rTmpPtr2], -32 // A12
2599      fms.s1             FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1
2600      sub                GR_N = GR_N, rExpHalf, 1 // unbiased exponent of DeltaX
2601}
2602;;
2603//
2604//    For performance, don't use result of pmpyshr2.u for 4 cycles.
2605//
2606.pred.rel "mutex",p10,p11
2607{ .mfi
2608      ldfe               fA11 = [rPolDataPtr], -32 // A11
2609      // High part of log(|x|) = Y_hi = N * log2_hi + H
2610      fma.s1             fResH = fFloatN, FR_log2_hi, FR_H
2611(p10) cmp.eq             p8, p9 = rXRnd, r0 // x < RTN(x): p8 (negative sign) if |RTN(x)| is even, else p9
2612}
2613{ .mfi
2614      ldfe               fA10 = [rTmpPtr2], -32 // A10
2615      fma.s1             fRes6H = fA1, FR_FracX, f0 // (A1*y)hi
2616(p11) cmp.eq             p9, p8 = rXRnd, r0 // x > RTN(x): p9 (positive sign) if |RTN(x)| is even, else p8
2617}
2618;;
2619{ .mfi
2620      ldfe               fA9 = [rPolDataPtr], -32 // A9
2621      fma.s1             fB14 = fLnSin6, fDxSqr, f0 // (LnSin6*deltaX^2)hi
2622      cmp.eq             p6, p7 = 4, rSgnGamSize // p6: 4-byte signgam, p7: otherwise
2623}
2624{ .mfi
2625      ldfe               fA8 = [rTmpPtr2], -32 // A8
2626      fma.s1             fA18 = fA19, FR_FracX, fA18 // A19*y + A18
2627      nop.i              0
2628}
2629;;
2630{ .mfi
2631      ldfe               fA7 = [rPolDataPtr] // A7
2632      fma.s1             fA23 = fA23, FR_FracX, fA22 // A23*y + A22
2633      nop.i              0
2634}
2635{ .mfi
2636      ldfe               fA6 = [rTmpPtr2] // A6
2637      fma.s1             fA21 = fA21, FR_FracX, fA20 // A21*y + A20
2638      nop.i              0
2639}
2640;;
2641{ .mfi
2642      ldfe               fLnSin14 = [rLnSinDataPtr]
2643      fms.s1             fRes1L = fA3, FR_FracX, fRes1H // delta((A3*y)hi)
2644      extr.u             GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
2645}
2646{ .mfi
2647      setf.sig           fFloatNDx = GR_N
2648      fadd.s1            fPol = fRes1H, fA2 // (A3*y + A2)hi
2649      nop.i              0
2650}
2651;;
2652{ .mfi
2653      ldfps              FR_G, FR_H = [GR_ad_tbl_1], 8 // Load G_1, H_1 (for DeltaX; the values for x were consumed above)
2654      fma.s1             fRes2H = fA4, fXSqr, f0 // ((A5*y + A4)*y^2)hi
2655      nop.i              0
2656}
2657{ .mfi
2658      shladd             GR_ad_z_2 = GR_Index2, 2, rZ2Addr  // Point to Z_2
2659      fma.s1             fA25 = fA25, FR_FracX, fA24 // A25*y + A24
2660      shladd             GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr  // Point to G_2
2661}
2662;;
2663.pred.rel "mutex",p8,p9
2664{ .mfi
2665      ld4                GR_Z_2 = [GR_ad_z_2] // Load Z_2
2666      fms.s1             fRes6L = fA1, FR_FracX, fRes6H // delta((A1*y)hi)
2667      // sign of GAMMA(x) is negative
2668(p8)  adds               rSgnGam = -1, r0
2669}
2670{ .mfi
2671      adds               rTmpPtr = 8, GR_ad_tbl_2 // Point to h_2
2672      fadd.s1            fRes3H = fRes6H, fA0 // (A1*y + A0)hi
2673      // sign of GAMMA(x) is positive
2674(p9)  adds               rSgnGam = 1, r0
2675}
2676;;
2677{ .mfi
2678      ldfps              FR_G2, FR_H2 = [GR_ad_tbl_2] // Load G_2, H_2
2679      // (LnSin6*deltaX^2 + LnSin4)hi
2680      fadd.s1            fLnSinH = fB14, fLnSin4
2681      nop.i              0
2682}
2683{ .mfi
2684      ldfd               FR_h2 = [rTmpPtr] // Load h_2
2685      fms.s1             fB16 = fLnSin6, fDxSqr, fB14 // delta(LnSin6*deltaX^2)
2686      nop.i              0
2687}
2688;;
2689{ .mfi
2690      ldfd               fhDelX = [GR_ad_tbl_1] // Load h_1
2691      fma.s1             fA21 = fA21, fXSqr, fA18 // (A21*y + A20)*y^2 + (A19*y + A18)
2692      nop.i              0
2693}
2694{ .mfi
2695      nop.m              0
2696      fma.s1             fLnSin36 = fLnSin36, fDelX4, fLnSin32 // LnSin30..LnSin36 part
2697      nop.i              0
2698}
2699;;
2700{ .mfi
2701      nop.m              0
2702      fma.s1             fRes1L = fA3L, FR_FracX, fRes1L // (A3*y)lo
2703      // Get bits 30-15 of X_1 * Z_2
2704      pmpyshr2.u         GR_X_2 = GR_X_1,GR_Z_2,15
2705}
2706{ .mfi
2707      nop.m              0
2708      fsub.s1            fPolL = fA2, fPol // A2 - (A3*y + A2)hi
2709      nop.i              0
2710}
2711;;
2712//
2713//    For performance, don't use result of pmpyshr2.u for 4 cycles.
2714//
2715{ .mfi
2716      nop.m              0
2717     // delta(((A5*y + A4)*y^2)hi)
2718      fms.s1             fRes2L = fA4, fXSqr, fRes2H
2719      nop.i              0
2720}
2721{ .mfi
2722      nop.m              0
2723      // (((A5*y + A4)*y^2) + A3*y + A2)hi
2724      fadd.s1            fRes4H = fRes2H, fPol
2725      nop.i              0
2726}
2727;;
2728{ .mfi
2729      // store signgam if size of variable is 4 bytes
2730(p6)  st4                [rSgnGamAddr] = rSgnGam
2731      fma.s1             fRes6L = fA1L, FR_FracX, fRes6L // (A1*y)lo
2732      nop.i              0
2733}
2734{ .mfi
2735      // store signgam if size of variable is 8 bytes
2736(p7)  st8                [rSgnGamAddr] = rSgnGam
2737      fsub.s1            fRes3L = fA0, fRes3H // A0 - (A1*y + A0)hi
2738      nop.i              0
2739}
2740;;
2741{ .mfi // Floating-point tail: hi/lo evaluation of the polynomial in y and of the LnSin series in deltaX^2
2742      nop.m              0
2743      fsub.s1            fLnSinL = fLnSin4, fLnSinH // LnSin4 - (LnSin6*deltaX^2 + LnSin4)hi
2744      nop.i              0
2745}
2746{ .mfi
2747      nop.m              0
2748      // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)hi
2749      fma.s1             fB18 = fLnSinH, fDxSqr, f0
2750      nop.i              0
2751}
2752;;
2753{ .mfi
2754      adds               rTmpPtr = 8, rTbl3Addr // base for h_3 (G_3 table + 8)
2755      fma.s1             fB16 = fLnSin6, fDxSqrL, fB16 // (LnSin6*deltaX^2)lo
2756      extr.u             GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
2757}
2758{ .mfi
2759      nop.m              0
2760      fma.s1             fA25 = fA25, fXSqr, fA23 // (A25*y + A24)*y^2 + (A23*y + A22)
2761      nop.i              0
2762}
2763;;
2764{ .mfi
2765      shladd             GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
2766      fadd.s1            fPolL = fPolL, fRes1H // (A2 - (A3*y + A2)hi) + (A3*y)hi
2767      nop.i              0
2768}
2769{ .mfi
2770      shladd             rTmpPtr = GR_Index3, 4, rTmpPtr // Point to h_3
2771      fadd.s1            fRes1L = fRes1L, fA2L // (A3*y)lo + A2lo
2772      nop.i              0
2773}
2774;;
2775{ .mfi
2776      ldfps              FR_G3, FR_H3 = [GR_ad_tbl_3] // Load G_3, H_3
2777      fma.s1             fRes2L = fA4, fXSqrL, fRes2L // ((A5*y + A4)*y^2)lo
2778      nop.i              0
2779}
2780{ .mfi
2781      ldfd               FR_h3 = [rTmpPtr] // Load h_3
2782      fsub.s1            fRes4L = fPol, fRes4H // (A3*y + A2)hi - fRes4H
2783      nop.i              0
2784}
2785;;
2786{ .mfi
2787      nop.m              0
2788      // ((((A5*y + A4)*y^2) + A3*y + A2)*y^2)hi
2789      fma.s1             fRes7H = fRes4H, fXSqr, f0
2790      nop.i              0
2791}
2792{ .mfi
2793      nop.m              0
2794      fma.s1             fA15 = fA15, FR_FracX, fA14 // A15*y + A14
2795      nop.i              0
2796}
2797;;
2798{ .mfi
2799      nop.m              0
2800      fadd.s1            fRes3L = fRes3L, fRes6H // (A0 - (A1*y + A0)hi) + (A1*y)hi
2801      nop.i              0
2802}
2803{ .mfi
2804      nop.m              0
2805      fadd.s1            fRes6L = fRes6L, fA0L // (A1*y)lo + A0lo
2806      nop.i              0
2807}
2808;;
2809{ .mfi
2810      nop.m              0
2811      fadd.s1            fLnSinL = fLnSinL, fB14 // (LnSin4 - fLnSinH) + (LnSin6*deltaX^2)hi
2812
2813      nop.i              0
2814}
2815{ .mfi
2816      nop.m              0
2817      // delta((LnSin6*deltaX^2 + LnSin4)*deltaX^2)
2818      fms.s1             fB20 = fLnSinH, fDxSqr, fB18
2819      nop.i              0
2820}
2821;;
2822{ .mfi
2823      nop.m              0
2824      fadd.s1            fPolL = fPolL, fRes1L // (A3*y + A2)lo
2825
2826      nop.i              0
2827}
2828{ .mfi
2829      nop.m              0
2830      // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)hi
2831      fadd.s1            fLnSin6 = fB18, fLnSin2 // fLnSin6 is reused to hold this sum
2832      nop.i              0
2833}
2834;;
2835{ .mfi
2836      nop.m              0
2837      fadd.s1            fRes4L = fRes4L, fRes2H // ((A3*y + A2)hi - fRes4H) + ((A5*y + A4)*y^2)hi
2838      nop.i              0
2839}
2840{ .mfi
2841      nop.m              0
2842      fma.s1             fA17 = fA17, FR_FracX, fA16 // A17*y + A16
2843      nop.i              0
2844}
2845;;
2846{ .mfi
2847      nop.m              0
2848      // delta(((((A5*y + A4)*y^2) + A3*y + A2)*y^2)
2849      fms.s1             fRes7L = fRes4H, fXSqr, fRes7H
2850      nop.i              0
2851}
2852{ .mfi
2853      nop.m              0
2854      fadd.s1            fPol = fRes7H, fRes3H // fRes7H + (A1*y + A0)hi; fPol is reused for the A0..A5 sum
2855      nop.i              0
2856}
2857;;
2858{ .mfi
2859      nop.m              0
2860      fadd.s1            fRes3L = fRes3L, fRes6L // (A1*y + A0)lo
2861      nop.i              0
2862}
2863{ .mfi
2864      nop.m              0
2865      fma.s1             fA25 = fA25, fX4, fA21 // A18..A25 part: fA25*y^4 + fA21
2866      nop.i              0
2867}
2868;;
2869{ .mfi
2870      nop.m              0
2871      // (LnSin6*deltaX^2 + LnSin4)lo
2872      fadd.s1            fLnSinL = fLnSinL, fB16
2873      nop.i              0
2874}
2875{ .mfi
2876      nop.m              0
2877      fma.s1             fB20 = fLnSinH, fDxSqrL, fB20 // + fLnSinH * (deltaX^2)lo
2878      nop.i              0
2879}
2880;;
2881{ .mfi
2882      nop.m              0
2883      fsub.s1            fLnSin4 = fLnSin2, fLnSin6 // LnSin2 - (fB18 + LnSin2)hi; fLnSin4 is reused
2884      nop.i              0
2885}
2886{ .mfi
2887      nop.m              0
2888      // (((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)*DeltaX^2)hi
2889      fma.s1             fLnSinH = fLnSin6, fDxSqr, f0
2890      nop.i              0
2891}
2892;;
2893{ .mfi
2894      nop.m              0
2895      // ((A5*y + A4)*y^2)lo + (A3*y + A2)lo
2896      fadd.s1            fRes2L = fRes2L, fPolL
2897      nop.i              0
2898}
2899{ .mfi
2900      nop.m              0
2901      fma.s1             fA17 = fA17, fXSqr, fA15 // (A17*y + A16)*y^2 + (A15*y + A14)
2902      nop.i              0
2903}
2904;;
2905{ .mfi
2906      nop.m              0
2907      // ((((A5*y + A4)*y^2) + A3*y + A2)*y^2)lo
2908      fma.s1             fRes7L = fRes4H, fXSqrL, fRes7L
2909      nop.i              0
2910}
2911{ .mfi
2912      nop.m              0
2913      fsub.s1            fPolL = fRes3H, fPol // (A1*y + A0)hi - fPol
2914      nop.i              0
2915}
2916;;
2917{ .mfi
2918      nop.m              0
2919      fma.s1             fA13 = fA13, FR_FracX, fA12 // A13*y + A12
2920      nop.i              0
2921}
2922{ .mfi
2923      nop.m              0
2924      fma.s1             fA11 = fA11, FR_FracX, fA10 // A11*y + A10
2925      nop.i              0
2926}
2927;;
2928{ .mfi
2929      nop.m              0
2930      // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)lo
2931      fma.s1             fB20 = fLnSinL, fDxSqr, fB20
2932      nop.i              0
2933}
2934{ .mfi
2935      nop.m              0
2936      fmpy.s1            FR_G = FR_G, FR_G2 // G = G_1 * G_2 (table values for DeltaX)
2937      nop.i              0
2938}
2939;;
2940{ .mfi
2941      nop.m              0
2942      fadd.s1            fLnSin4 = fLnSin4, fB18 // (LnSin2 - sum) + fB18
2943      nop.i              0
2944}
2945{ .mfi
2946      nop.m              0
2947      fms.s1             fLnSinL = fLnSin6, fDxSqr, fLnSinH // delta(fLnSin6*deltaX^2); fLnSinL is reused
2948      nop.i              0
2949}
2950;;
2951{ .mfi
2952      nop.m              0
2953      // (((A5*y + A4)*y^2) + A3*y + A2)lo
2954      fadd.s1            fRes4L = fRes4L, fRes2L
2955      nop.i              0
2956}
2957{ .mfi
2958      nop.m              0
2959      fadd.s1            fhDelX = fhDelX, FR_h2 // h = h_1 + h_2
2960      nop.i              0
2961}
2962;;
2963{ .mfi
2964      nop.m              0
2965      fadd.s1            fRes7L = fRes7L, fRes3L // + (A1*y + A0)lo
2966      nop.i              0
2967}
2968{ .mfi
2969      nop.m              0
2970      fadd.s1            fPolL = fPolL, fRes7H // ((A1*y + A0)hi - fPol) + fRes7H
2971      nop.i              0
2972}
2973;;
2974{ .mfi
2975      nop.m              0
2976      fcvt.xf            fFloatNDx = fFloatNDx // integer exponent of DeltaX converted to floating point
2977      nop.i              0
2978}
2979{ .mfi
2980      nop.m              0
2981      fadd.s1            FR_H = FR_H, FR_H2 // H = H_1 + H_2
2982      nop.i              0
2983}
2984;;
2985{ .mfi
2986      nop.m              0
2987      fmpy.s1            FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
2988      nop.i              0
2989}
2990{ .mfi
2991      nop.m              0
2992      // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)lo + (LnSin2)lo
2993      fadd.s1            fLnSin2L = fLnSin2L, fB20
2994      nop.i              0
2995}
2996;;
2997{ .mfi // Continuation: finish the A6..A25 and LnSin8..LnSin36 tails and the log polynomials for x and DeltaX
2998      nop.m              0
2999      fma.s1             fA25 = fA25, fX4, fA17 // A14..A25 part
3000      nop.i              0
3001}
3002{ .mfi
3003      nop.m              0
3004      fma.s1             fA13 = fA13, fXSqr, fA11 // (A13*y + A12)*y^2 + (A11*y + A10)
3005      nop.i              0
3006}
3007;;
3008{ .mfi
3009      nop.m              0
3010      fma.s1             fA9 = fA9, FR_FracX, fA8 // A9*y + A8
3011      nop.i              0
3012}
3013{ .mfi
3014      nop.m              0
3015      fma.s1             fA7 = fA7, FR_FracX, fA6 // A7*y + A6
3016      nop.i              0
3017}
3018;;
3019{ .mfi
3020      nop.m              0
3021      fma.s1             fLnSin36 = fLnSin36, fDelX8, fLnSin28 // LnSin22..LnSin36 part
3022      nop.i              0
3023}
3024{ .mfi
3025      nop.m              0
3026      fma.s1             fLnSin14 = fLnSin14, fDxSqr, fLnSin12 // LnSin14*deltaX^2 + LnSin12
3027      nop.i              0
3028}
3029;;
3030{ .mfi
3031      nop.m              0
3032      fma.s1             fLnSin10 = fLnSin10, fDxSqr, fLnSin8 // LnSin10*deltaX^2 + LnSin8
3033      nop.i              0
3034}
3035{ .mfi
3036      nop.m              0
3037      fadd.s1            FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
3038      nop.i              0
3039}
3040;;
3041{ .mfi
3042      nop.m              0
3043      fms.s1             fRDx = FR_G, fNormDx, f1 // r = G * S_hi - 1 (for DeltaX)
3044      nop.i              0
3045}
3046{ .mfi
3047      nop.m              0
3048      // poly_lo = r * Q4 + Q3
3049      fma.s1             FR_poly_lo = FR_r, FR_Q4, FR_Q3
3050      nop.i              0
3051}
3052;;
3053{ .mfi
3054      nop.m              0
3055      fmpy.s1            FR_rsq = FR_r, FR_r // rsq = r * r
3056      nop.i              0
3057}
3058{ .mfi
3059      nop.m              0
3060      // ((((A5*y + A4)*y^2) + A3*y + A2)*y^2)lo + (A1*y + A0)lo
3061      fma.s1             fRes7L = fRes4L, fXSqr, fRes7L
3062      nop.i              0
3063}
3064;;
3065{ .mfi
3066      nop.m              0
3067      fma.s1             fA25 = fA25, fX4, fA13 // A10..A25 part
3068      nop.i              0
3069}
3070{ .mfi
3071      nop.m              0
3072      fma.s1             fA9 = fA9, fXSqr, fA7 // (A9*y + A8)*y^2 + (A7*y + A6)
3073      nop.i              0
3074}
3075;;
3076{ .mfi
3077      nop.m              0
3078      // h = N * log2_lo + h
3079      fma.s1             FR_h = fFloatN, FR_log2_lo, FR_h
3080      nop.i              0
3081}
3082{ .mfi
3083      nop.m              0
3084      fadd.s1            fhDelX = fhDelX, FR_h3 // h = (h_1 + h_2) + h_3
3085      nop.i              0
3086}
3087;;
3088{ .mfi
3089      nop.m              0
3090      fma.s1             fLnSin36 = fLnSin36, fDelX6, fLnSin20 // LnSin16..LnSin36 part
3091      nop.i              0
3092}
3093{ .mfi
3094      nop.m              0
3095      fma.s1             fLnSin14 = fLnSin14, fDelX4, fLnSin10 // LnSin8..LnSin14 part
3096      nop.i              0
3097}
3098;;
3099{ .mfi
3100      nop.m              0
3101      // poly_lo = r * Q4 + Q3
3102      fma.s1             fPolyLoDx = fRDx, FR_Q4, FR_Q3
3103      nop.i              0
3104}
3105{ .mfi
3106      nop.m              0
3107      fmpy.s1            fRDxSq = fRDx, fRDx // rsq = r * r
3108      nop.i              0
3109}
3110;;
3111{ .mfi
3112      nop.m              0
3113      // Y_hi = N * log2_hi + H
3114      fma.s1             fResLnDxH = fFloatNDx, FR_log2_hi, FR_H // (for DeltaX)
3115      nop.i              0
3116}
3117{ .mfi
3118      nop.m              0
3119      fma.s1             FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
3120      nop.i              0
3121}
3122;;
3123{ .mfi
3124      nop.m              0
3125      fma.s1             fA9 = fA25, fX4, fA9 // A6..A25 part of the polynomial
3126      nop.i              0
3127}
3128{ .mfi
3129      nop.m              0
3130      fadd.s1            fPolL = fPolL, fRes7L // accumulate the low parts of the A0..A5 sum
3131      nop.i              0
3132}
3133;;
3134{ .mfi
3135      nop.m              0
3136      fadd.s1            fLnSin4 = fLnSin4, fLnSin2L // add the accumulated low part fLnSin2L
3137      nop.i              0
3138}
3139{ .mfi
3140      nop.m              0
3141      // h = N * log2_lo + h
3142      fma.s1             fhDelX = fFloatNDx, FR_log2_lo, fhDelX
3143      nop.i              0
3144}
3145;;
3146{ .mfi
3147      nop.m              0
3148      fma.s1             fLnSin36 = fLnSin36, fDelX8, fLnSin14 // LnSin8..LnSin36 part
3149      nop.i              0
3150}
3151{ .mfi
3152      nop.m              0
3153      // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)lo
3154      fma.s1             fLnSinL = fLnSin6, fDxSqrL, fLnSinL
3155      nop.i              0
3156}
3157;;
3158{ .mfi
3159      nop.m              0
3160      // poly_lo = poly_lo * r + Q2
3161      fma.s1             fPolyLoDx = fPolyLoDx, fRDx, FR_Q2
3162      nop.i              0
3163}
3164{ .mfi
3165      nop.m              0
3166      fma.s1             fRDxCub = fRDxSq, fRDx, f0 // rcub = r^3
3167      nop.i              0
3168}
3169;;
3170{ .mfi
3171      nop.m              0
3172      famax.s0              fRes5H = fPol, fResH // operand with the larger magnitude
3173      nop.i              0
3174}
3175{ .mfi
3176      nop.m              0
3177      //  High part of (lgammal(|x|) + log(|x|))
3178      fadd.s1            fRes1H = fPol, fResH
3179      nop.i              0
3180}
3181;;
3182{ .mfi
3183      nop.m              0
3184      // poly_lo = poly_lo * r + Q2
3185      fma.s1             FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
3186      nop.i              0
3187}
3188{ .mfi
3189      nop.m              0
3190      fma.s1             fPolL = fA9, fX6, fPolL // P25lo
3191      nop.i              0
3192}
3193;;
3194
3195{ .mfi
3196      nop.m              0
3197      famin.s0              fRes5L = fPol, fResH // operand with the smaller magnitude
3198      nop.i              0
3199}
3200{ .mfi
3201      nop.m              0
3202      // High part of -(LnSin + log(|DeltaX|))
3203      fnma.s1            fRes2H = fResLnDxH, f1, fLnSinH
3204      nop.i              0
3205}
3206;;
3207
3208{ .mfi
3209      nop.m              0
3210      // (((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)*DeltaX^2)lo
3211      fma.s1             fLnSinL = fLnSin4, fDxSqr, fLnSinL
3212      nop.i              0
3213}
3214{ .mfi
3215      nop.m              0
3216      fma.s1             fLnSin36 = fLnSin36, fDelX6, f0 // (LnSin8..LnSin36 part) * deltaX^6
3217      nop.i              0
3218}
3219;;
3220{ .mfi
3221      nop.m              0
3222      // poly_hi = Q1 * rsq + r
3223      fma.s1             fPolyHiDx = FR_Q1, fRDxSq, fRDx
3224      nop.i              0
3225}
3226{ .mfi
3227      nop.m              0
3228      // poly_lo = poly_lo*r^3 + h
3229      fma.s1             fPolyLoDx = fPolyLoDx, fRDxCub, fhDelX
3230      nop.i              0
3231}
3232;;
3233{ .mfi
3234      nop.m              0
3235      fsub.s1            fRes1L = fRes5H, fRes1H // larger-magnitude operand - (fPol + fResH)
3236      nop.i              0
3237}
3238{ .mfi
3239      nop.m              0
3240      //  -(lgammal(|x|) + log(|x|))hi
3241      fnma.s1            fRes1H = fRes1H, f1, f0
3242
3243      nop.i              0
3244}
3245;;
3246{ .mfi
3247      nop.m              0
3248      // poly_hi = Q1 * rsq + r
3249      fma.s1             FR_poly_hi = FR_Q1, FR_rsq, FR_r
3250      nop.i              0
3251}
3252{ .mfi
3253      nop.m              0
3254      // poly_lo = poly_lo*r^3 + h
3255      fma.s1             FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
3256      nop.i              0
3257}
3258;;
3259{ .mfi
3260      nop.m              0
3261      fms.s1             fRes2L = fResLnDxH, fMOne, fRes2H
3262      nop.i              0
3263}
3264;;
3265{ .mfi
3266      nop.m              0
3267      fma.s1             fLnSinL = fLnSin36, fDxSqr, fLnSinL
3268      nop.i              0
3269}
3270{ .mfi
3271      nop.m              0
3272      // Y_lo = poly_hi + poly_lo
3273      fadd.s1            fResLnDxL = fPolyHiDx, fPolyLoDx
3274      nop.i              0
3275}
3276;;
3277{ .mfi
3278      nop.m              0
3279      fadd.s1            fRes1L = fRes1L, fRes5L
3280      nop.i              0
3281}
3282{ .mfi
3283      nop.m              0
3284      // high part of the final result
3285      fadd.s1            fYH = fRes2H, fRes1H
3286      nop.i              0
3287}
3288;;
3289{ .mfi
3290      nop.m              0
3291      // Y_lo = poly_hi + poly_lo
3292      fadd.s1            fResL = FR_poly_hi, FR_poly_lo
3293      nop.i              0
3294}
3295;;
3296{ .mfi
3297      nop.m              0
3298      famax.s0              fRes4H = fRes2H, fRes1H
3299      nop.i              0
3300}
3301;;
3302{ .mfi
3303      nop.m              0
3304      famin.s0              fRes4L = fRes2H, fRes1H
3305      nop.i              0
3306}
3307;;
3308{ .mfi
3309      nop.m              0
3310      // (LnSin)lo + (log(|DeltaX|))lo
3311      fsub.s1            fLnSinL = fLnSinL, fResLnDxL
3312      nop.i              0
3313}
3314{ .mfi
3315      nop.m              0
3316      fadd.s1            fRes2L = fRes2L, fLnSinH
3317      nop.i              0
3318}
3319;;
3320{ .mfi
3321      nop.m              0
3322      //(lgammal(|x|))lo + (log(|x|))lo
3323      fadd.s1            fPolL = fResL, fPolL
3324      nop.i              0
3325}
3326;;
3327{ .mfi
3328      nop.m              0
3329      fsub.s1            fYL = fRes4H, fYH
3330      nop.i              0
3331}
3332;;
3333{ .mfi
3334      nop.m              0
3335      // Low part of -(LnSin + log(|DeltaX|))
3336      fadd.s1            fRes2L = fRes2L, fLnSinL
3337      nop.i              0
3338}
3339{ .mfi
3340      nop.m              0
3341      //  Low part of (lgammal(|x|) + log(|x|))
3342      fadd.s1            fRes1L = fRes1L, fPolL
3343      nop.i              0
3344}
3345;;
3346{ .mfi
3347      nop.m              0
3348      fadd.s1            fYL = fYL, fRes4L
3349      nop.i              0
3350}
3351{ .mfi
3352      nop.m              0
3353      fsub.s1            fRes2L = fRes2L, fRes1L
3354      nop.i              0
3355}
3356;;
3357{ .mfi
3358      nop.m              0
3359      // low part of the final result
3360      fadd.s1            fYL = fYL, fRes2L
3361      nop.i              0
3362}
3363;;
3364{ .mfb
3365      nop.m              0
3366      // final result for -6.0 < x <= -0.75, non-integer, "far" from roots
3367      fma.s0             f8 = fYH, f1, fYL
3368      // exit here for -6.0 < x <= -0.75, non-integer, "far" from roots
3369      br.ret.sptk        b0
3370}
3371;;
3372
3373// here if |x+1| < 2^(-7)
//
// Summary of what this path computes (deltaX = x + 1 is held in fDx, and
// deltaX^2 in fDxSqr; both are produced before this label):
//   f8 = LnSinPoly(deltaX^2) * deltaX^2
//        - ( lgammal(|x|) + log(|x|) )      // two polynomials, see below
//        - log(|deltaX|)                    // table-driven logarithm
// where
//   * log(|x|)     is a polynomial in -deltaX with coefficients P1..P8
//                  (registers fA13..fA20),
//   * lgammal(|x|) is deltaX * polynomial in deltaX with coefficients A1..A8,
//   * LnSinPoly    uses coefficients fLnSin2, fLnSin4, fLnSin6, fLnSin8,
//   * log(|deltaX|) = N*log2 + H + h + poly(r), r = G*S_hi - 1, with G/H/h
//                  taken from three lookup tables indexed by the significand.
// The sign of GAMMA(x) is written to [rSgnGamAddr] as 4 or 8 bytes,
// depending on rSgnGamSize.
//
// Values expected to be live on entry (set up outside the visible code):
//   fDx, fDxSqr, rSignifDx, GR_ad_z_1, rTbl1Addr, rZ2Addr, rTbl2Addr,
//   rTbl3Addr, r17Ones, rExpHalf, FR_log2_hi, FR_log2_lo, FR_Q1..FR_Q4,
//   rSgnGamAddr, rSgnGamSize, predicates p10/p11.
//
// Layout of lgammal_1pEps_data as consumed below (byte offsets):
//     0.. 63  A1, A2, A3, A4        (ldfe, 16 bytes each)
//    64.. 95  A5, A6, A7, A8        (ldfpd pairs)
//    96..159  P8, P7, P6, P5, P4, P3, P2, P1 (ldfpd pairs)
//   160, 176  LnSin2, LnSin4        (ldfe)
//   192, 208  LnSin6, LnSin8        (ldfe)
3374.align 32
3375_closeToNegOne:
3376{ .mfi
3377      getf.exp           GR_N =  fDx // N = exponent field of deltaX (sign masked off below)
3378      fmerge.se          fAbsX =  f1, fDx // sign/exponent of 1.0 with significand of deltaX (S_hi)
3379      // Get high 4 bits of significand of deltaX
3380      extr.u             rIndex1Dx = rSignifDx, 59, 4
3381}
3382{ .mfi
3383      addl               rPolDataPtr= @ltoff(lgammal_1pEps_data),gp
3384      fma.s1             fA0L = fDxSqr, fDxSqr, f0 // deltaX^4
3385      // sign of GAMMA is positive if p10 is set to 1
3386(p10) adds               rSgnGam = 1, r0
3387}
3388;;
3389{ .mfi
3390      shladd             GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1
3391      fnma.s1            fResL = fDx, f1, f0 // -(x+1)
3392      // Get high 15 bits of significand
3393      extr.u             GR_X_0 = rSignifDx, 49, 15
3394}
3395{ .mfi
      // replace the linkage-table slot address with the table address itself
3396      ld8                rPolDataPtr = [rPolDataPtr]
3397      nop.f              0
3398      shladd             GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1
3399}
3400;;
3401{ .mfi
3402      ld4                GR_Z_1 = [GR_ad_z_1] // Load Z_1
3403      nop.f              0
3404      and                GR_N = GR_N, r17Ones // mask sign bit
3405}
3406{ .mfi
3407      adds               rTmpPtr = 8, GR_ad_tbl_1
3408      nop.f              0
      // p6 = signgam variable is 4 bytes wide, p7 = any other size
3409      cmp.eq             p6, p7 = 4, rSgnGamSize
3410}
3411;;
3412{ .mfi
3413      ldfps              FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
3414      nop.f              0
3415      adds               rTmpPtr2 = 96, rPolDataPtr
3416}
3417{ .mfi
3418      ldfd               FR_h = [rTmpPtr] // Load h_1
3419      nop.f              0
3420      // unbiased exponent of deltaX (GR_N - rExpHalf - 1)
3421      sub                GR_N = GR_N, rExpHalf, 1
3422}
3423;;
3424{ .mfi
3425      adds               rTmpPtr3 = 192, rPolDataPtr
3426      nop.f              0
3427      // sign of GAMMA is negative if p11 is set to 1
3428(p11) adds               rSgnGam = -1, r0
3429}
3430{ .mfi
3431      ldfe               fA1 = [rPolDataPtr], 16 // A1
3432      nop.f              0
3433      nop.i              0
3434}
3435;;
3436{.mfi
3437      ldfe               fA2 = [rPolDataPtr], 16 // A2
3438      nop.f              0
3439      // Get bits 30-15 of X_0 * Z_1
3440      pmpyshr2.u         GR_X_1 = GR_X_0,GR_Z_1,15
3441}
3442{ .mfi
3443      ldfpd              fA20, fA19 = [rTmpPtr2], 16 // P8, P7
3444      nop.f              0
3445      nop.i              0
3446}
3447;;
3448//
3449//    For performance, don't use result of pmpyshr2.u for 4 cycles.
3450//
3451{ .mfi
3452      ldfe               fA3 = [rPolDataPtr], 16 // A3
3453      nop.f              0
3454      nop.i              0
3455}
3456{ .mfi
3457      ldfpd              fA18, fA17 = [rTmpPtr2], 16 // P6, P5
3458      nop.f              0
3459      nop.i              0
3460}
3461;;
3462{ .mfi
3463      ldfe               fA4 = [rPolDataPtr], 16 // A4
3464      nop.f              0
3465      nop.i              0
3466}
3467{ .mfi
3468      ldfpd              fA16, fA15 = [rTmpPtr2], 16 // P4, P3
3469      nop.f              0
3470      nop.i              0
3471}
3472;;
3473{ .mfi
3474      ldfpd              fA5L, fA6 = [rPolDataPtr], 16 // A5, A6
3475      nop.f              0
3476      nop.i              0
3477}
3478{ .mfi
3479      ldfpd              fA14, fA13 = [rTmpPtr2], 16 // P2, P1
3480      nop.f              0
3481      nop.i              0
3482}
3483;;
3484{ .mfi
3485      ldfpd              fA7, fA8 = [rPolDataPtr], 16 // A7, A8
3486      nop.f              0
3487      extr.u             GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
3488}
3489{ .mfi
3490      ldfe               fLnSin2 = [rTmpPtr2], 16 // LnSin2 (table offset 160)
3491      nop.f              0
3492      nop.i              0
3493}
3494;;
3495{ .mfi
3496      shladd             GR_ad_z_2 = GR_Index2, 2, rZ2Addr  // Point to Z_2
3497      nop.f              0
3498      shladd             GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
3499}
3500{ .mfi
      // LnSin4 (offset 176); step by 32 to skip LnSin6, which rTmpPtr3 loads
3501      ldfe               fLnSin4 = [rTmpPtr2], 32
3502      nop.f              0
3503      nop.i              0
3504}
3505;;
3506{ .mfi
3507      ld4                GR_Z_2 = [GR_ad_z_2] // Load Z_2
3508      nop.f              0
3509      adds               rTmpPtr = 8, GR_ad_tbl_2
3510}
3511{ .mfi
3512      // Put integer N into rightmost significand
3513      setf.sig           fFloatN = GR_N
3514      nop.f              0
3515      nop.i              0
3516}
3517;;
3518{ .mfi
3519      ldfe               fLnSin6 = [rTmpPtr3] // LnSin6 (table offset 192)
3520      nop.f              0
3521      nop.i              0
3522}
3523{ .mfi
3524      ldfe               fLnSin8 = [rTmpPtr2] // LnSin8 (table offset 208)
3525      nop.f              0
3526      nop.i              0
3527}
3528;;
3529{ .mfi
3530      ldfps              FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
3531      nop.f              0
3532      nop.i              0
3533}
3534{ .mfi
3535      ldfd               FR_h2 = [rTmpPtr] // Load h_2
3536      nop.f              0
3537      nop.i              0
3538}
3539;;
3540{ .mfi
3541      // store signgam if size of variable is 4 bytes
3542(p6)  st4                [rSgnGamAddr] = rSgnGam
3543      fma.s1             fResH = fA20, fResL, fA19 //polynomial for log(|x|)
3544     // Get bits 30-15 of X_1 * Z_2
3545      pmpyshr2.u         GR_X_2 = GR_X_1,GR_Z_2,15
3546}
3547{ .mfi
3548      // store signgam if size of variable is 8 bytes
3549(p7)  st8                [rSgnGamAddr] = rSgnGam
3550      fma.s1             fA2 = fA2, fDx, fA1 // polynomial for lgammal(|x|)
3551      nop.i              0
3552}
3553;;
3554//
3555//    For performance, don't use result of pmpyshr2.u for 4 cycles.
3556//
3557{ .mfi
3558      nop.m              0
3559      fma.s1             fA18 = fA18, fResL, fA17 //polynomial for log(|x|)
3560      nop.i              0
3561}
3562;;
3563{ .mfi
3564      nop.m              0
3565      fma.s1             fA16 = fA16, fResL, fA15 //polynomial for log(|x|)
3566      nop.i              0
3567}
3568{ .mfi
3569      nop.m              0
3570      fma.s1             fA4 = fA4, fDx, fA3 // polynomial for lgammal(|x|)
3571      nop.i              0
3572}
3573;;
3574{ .mfi
3575      nop.m              0
3576      fma.s1             fA14 = fA14, fResL, fA13 //polynomial for log(|x|)
3577      nop.i              0
3578}
3579{ .mfi
3580      nop.m              0
3581      fma.s1             fA6 = fA6, fDx, fA5L // polynomial for lgammal(|x|)
3582      nop.i              0
3583}
3584;;
3585{ .mfi
3586      nop.m              0
3587      fma.s1             fPol = fA8, fDx, fA7 // polynomial for lgammal(|x|)
3588      extr.u             GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
3589}
3590;;
3591{ .mfi
3592      shladd             GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
3593      // low part of lnsin polynomial: LnSin4*deltaX^2 + LnSin2
3594      fma.s1             fRes3L = fLnSin4, fDxSqr, fLnSin2
3595      nop.i              0
3596}
3597;;
3598{ .mfi
3599      ldfps              FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3
3600      fcvt.xf            fFloatN = fFloatN // N as FP number
3601      nop.i              0
3602}
3603{ .mfi
3604      nop.m              0
3605      fma.s1             fResH = fResH, fDxSqr, fA18 // High part of log(|x|)
3606      nop.i              0
3607}
3608;;
3609{ .mfi
3610      ldfd               FR_h3 = [GR_ad_tbl_3] // Load h_3
3611      fma.s1             fA4 = fA4, fDxSqr, fA2 // Low part of lgammal(|x|)
3612      nop.i              0
3613}
3614{ .mfi
3615      nop.m              0
3616      // high part of lnsin polynomial: LnSin8*deltaX^2 + LnSin6
3617      fma.s1             fRes3H = fLnSin8, fDxSqr, fLnSin6
3618      nop.i              0
3619}
3620;;
3621{ .mfi
3622      nop.m              0
3623      fmpy.s1            FR_G = FR_G, FR_G2 // G = G_1 * G_2
3624      nop.i              0
3625}
3626{ .mfi
3627      nop.m              0
3628      fadd.s1            FR_H = FR_H, FR_H2 // H = H_1 + H_2
3629      nop.i              0
3630}
3631;;
3632{ .mfi
3633      nop.m              0
3634      fadd.s1            FR_h = FR_h, FR_h2 // h = h_1 + h_2
3635      nop.i              0
3636}
3637{ .mfi
3638      nop.m              0
3639      fma.s1             fA16 = fA16, fDxSqr, fA14 // Low part of log(|x|)
3640      nop.i              0
3641}
3642;;
3643{ .mfi
3644      nop.m              0
3645      fma.s1             fPol = fPol, fDxSqr, fA6 // High part of lgammal(|x|)
3646      nop.i              0
3647}
3648;;
3649{ .mfi
3650      nop.m              0
3651      fma.s1             fResH = fResH, fA0L, fA16 // log(|x|)/deltaX^2 - deltaX
3652      nop.i              0
3653}
3654;;
3655{ .mfi
3656      nop.m              0
3657      fmpy.s1            FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
3658      nop.i              0
3659}
3660{ .mfi
3661      nop.m              0
3662      fadd.s1            FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
3663      nop.i              0
3664}
3665;;
3666{ .mfi
3667      nop.m              0
3668      fadd.s1            FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
3669      nop.i              0
3670}
3671;;
3672{ .mfi
3673      nop.m              0
      // fResL = -deltaX supplies the linear term of the series
3674      fma.s1             fResH = fResH, fDxSqr, fResL // log(|x|)
3675      nop.i              0
3676}
3677{ .mfi
3678      nop.m              0
      // the factor deltaX is applied later, when fPol is folded into fResH
3679      fma.s1             fPol = fPol, fA0L, fA4 // lgammal(|x|)/deltaX
3680      nop.i              0
3681}
3682;;
3683{ .mfi
3684      nop.m              0
3685      fms.s1             FR_r = FR_G, fAbsX, f1 // r = G * S_hi - 1
3686      nop.i              0
3687}
3688{ .mfi
3689      nop.m              0
3690      // high part of log(deltaX)= Y_hi = N * log2_hi + H
3691      fma.s1             fRes4H = fFloatN, FR_log2_hi, FR_H
3692      nop.i              0
3693}
3694;;
3695{ .mfi
3696      nop.m              0
3697      // h = N * log2_lo + h
3698      fma.s1             FR_h = fFloatN, FR_log2_lo, FR_h
3699      nop.i              0
3700}
3701;;
3702{ .mfi
3703      nop.m              0
3704      fma.s1             fResH = fPol, fDx, fResH // lgammal(|x|) + log(|x|)
3705      nop.i              0
3706}
3707{ .mfi
3708      nop.m              0
3709      // lnsin/deltaX^2
3710      fma.s1             fRes3H = fRes3H, fA0L, fRes3L
3711      nop.i              0
3712}
3713;;
3714{ .mfi
3715      nop.m              0
3716      // poly_lo = r * Q4 + Q3
3717      fma.s1             FR_poly_lo = FR_r, FR_Q4, FR_Q3
3718      nop.i              0
3719}
3720{ .mfi
3721      nop.m              0
3722      fmpy.s1            FR_rsq = FR_r, FR_r // rsq = r * r
3723      nop.i              0
3724}
3725;;
3726{ .mfi
3727      nop.m              0
3728      // lnSin - log(|x|) - lgammal(|x|)
3729      fms.s1             fResH = fRes3H, fDxSqr, fResH
3730      nop.i              0
3731}
3732;;
3733
3734{ .mfi
3735      nop.m              0
3736      // poly_lo = poly_lo * r + Q2
3737      fma.s1             FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
3738      nop.i              0
3739}
3740{ .mfi
3741      nop.m              0
3742      fma.s1             FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
3743      nop.i              0
3744}
3745;;
3746
3747{ .mfi
3748      nop.m              0
3749      // poly_hi = Q1 * rsq + r
3750      fma.s1             FR_poly_hi = FR_Q1, FR_rsq, FR_r
3751      nop.i              0
3752}
3753;;
3754
3755{ .mfi
3756      nop.m              0
3757      // poly_lo = poly_lo*r^3 + h
3758      fma.s1             FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
3759      nop.i              0
3760}
3761;;
3762
3763{ .mfi
3764      nop.m              0
3765      // low part of log(|deltaX|) = Y_lo = poly_hi + poly_lo
3766      fadd.s1            fRes4L = FR_poly_hi, FR_poly_lo
3767      nop.i              0
3768}
3769;;
3770{ .mfi
3771      nop.m              0
      // subtract the low part of log(|deltaX|) first; the high part goes last
3772      fsub.s1            fResH = fResH, fRes4L
3773      nop.i              0
3774}
3775;;
3776{ .mfb
3777      nop.m              0
3778      // final result for |x+1|< 2^(-7) path
      // (the only arithmetic operation on this path issued with .s0)
3779      fsub.s0            f8 = fResH, fRes4H
3780      // exit for |x+1|< 2^(-7) path
3781      br.ret.sptk        b0
3782}
3783;;
3784
3785
3786// here if -2^63 < x < -6.0 and x is not an integer
3787// Also we are going to filter out cases when x falls in
3788// range which is "close enough" to negative root. This case
3789// may occur only for -19.5 < x since other roots of lgamma are
3790// insignificant from double extended point of view (they are closer
3791// to RTN(x) than one ulp(x)).
3792.align 32
3793_negStirling:
3794{ .mfi
3795      ldfe               fLnSin6 = [rLnSinDataPtr], 32
3796      fnma.s1            fInvX = f8, fRcpX, f1 // start of 3rd NR iteration
3797      // Get high 4 bits of significand of deltaX
3798      extr.u             rIndex1Dx = rSignifDx, 59, 4
3799}
3800{ .mfi
3801      ldfe               fLnSin8 = [rTmpPtr3], 32
3802      fadd.s1            FR_h = FR_h, FR_h2 // h = h_1 + h_2
3803(p12) cmp.ltu.unc        p6, p0 = rSignifX, rLeftBound
3804}
3805;;
3806{ .mfi
3807      ldfe               fLnSin10 = [rLnSinDataPtr], 32
3808      fmpy.s1            FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
3809      // Get high 15 bits of significand
3810      extr.u             GR_X_0 = rSignifDx, 49, 15
3811}
3812{ .mfi
3813      shladd             GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1  // Point to Z_1
3814      fadd.s1            FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
3815      // set p6 if x falls in "near root" range
3816(p6)  cmp.geu.unc        p6, p0 = rSignifX, rRightBound
3817}
3818;;
3819{ .mfi
3820      getf.exp           GR_N =  fDx // Get N = exponent of x
3821      fma.s1             fDx4 = fDxSqr, fDxSqr, f0 // deltaX^4
3822      adds               rTmpPtr = 96, rBernulliPtr
3823}
3824{ .mfb
3825      ld4                GR_Z_1 = [GR_ad_z_1] // Load Z_1
3826      fma.s1             fLnSin34 = fLnSin34, fDxSqr, fLnSin32
3827      // branch to special path if x falls in "near root" range
3828(p6)  br.cond.spnt       _negRoots
3829}
3830;;
3831.pred.rel "mutex",p10,p11
3832{ .mfi
3833      ldfe               fLnSin12 = [rTmpPtr3]
3834      fma.s1             fLnSin26 = fLnSin26, fDxSqr, fLnSin24
3835(p10) cmp.eq             p8, p9 = rXRnd, r0
3836}
3837{ .mfi
3838      ldfe               fLnSin14 = [rLnSinDataPtr]
3839      fma.s1             fLnSin30 = fLnSin30, fDxSqr, fLnSin28
3840(p11) cmp.eq             p9, p8 = rXRnd, r0
3841}
3842;;
3843{ .mfi
3844      ldfpd              fB2, fB2L = [rBernulliPtr], 16
3845      fma.s1             fLnSin18 = fLnSin18, fDxSqr, fLnSin16
3846      shladd             GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1
3847
3848}
3849{ .mfi
3850      ldfe               fB14 = [rTmpPtr], 16
3851      fma.s1             fLnSin22 = fLnSin22, fDxSqr, fLnSin20
3852      and                GR_N = GR_N, r17Ones // mask sign bit
3853}
3854;;
3855{ .mfi
3856      ldfe               fB4 = [rBernulliPtr], 16
3857      fma.s1             fInvX = fInvX, fRcpX, fRcpX // end of 3rd NR iteration
3858      // Get bits 30-15 of X_0 * Z_1
3859      pmpyshr2.u         GR_X_1 = GR_X_0,GR_Z_1,15
3860}
3861{ .mfi
3862      ldfe               fB16 = [rTmpPtr], 16
3863      fadd.s1            FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
3864      adds               rTmpPtr2 = 8, GR_ad_tbl_1
3865}
3866;;
3867//
3868//    For performance, don't use result of pmpyshr2.u for 4 cycles.
3869//
3870{ .mfi
3871      ldfe               fB6 = [rBernulliPtr], 16
3872      fms.s1             FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1
3873      adds               rTmpPtr3 = -48, rTmpPtr
3874}
3875{ .mfi
3876      ldfe               fB18 = [rTmpPtr], 16
3877      // High part of the log(|x|) = Y_hi = N * log2_hi + H
3878      fma.s1             fResH = fFloatN, FR_log2_hi, FR_H
3879      sub                GR_N = GR_N, rExpHalf, 1 // unbiased exponent of deltaX
3880}
3881;;
3882.pred.rel "mutex",p8,p9
3883{ .mfi
3884      ldfe               fB8 = [rBernulliPtr], 16
3885      fma.s1             fLnSin36 = fLnSin36, fDx4, fLnSin34
3886      // sign of GAMMA(x) is negative
3887(p8)  adds               rSgnGam = -1, r0
3888}
3889{ .mfi
3890      ldfe               fB20 = [rTmpPtr], -160
3891      fma.s1             fRes5H = fLnSin4, fDxSqr, f0
3892      // sign of GAMMA(x) is positive
3893(p9)  adds               rSgnGam = 1, r0
3894
3895}
3896;;
3897{ .mfi
3898      ldfe               fB10 = [rBernulliPtr], 16
3899      fma.s1             fLnSin30 = fLnSin30, fDx4, fLnSin26
3900(p14) adds               rTmpPtr = -160, rTmpPtr
3901}
3902{ .mfi
3903      ldfe               fB12 = [rTmpPtr3], 16
3904      fma.s1             fDx8 = fDx4, fDx4, f0 // deltaX^8
3905      cmp.eq             p6, p7 = 4, rSgnGamSize
3906}
3907;;
3908{ .mfi
3909      ldfps              fGDx, fHDx = [GR_ad_tbl_1], 8 // Load G_1, H_1
3910      fma.s1             fDx6 = fDx4, fDxSqr, f0 // deltaX^6
3911      extr.u             GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
3912}
3913{ .mfi
3914      ldfd               fhDx = [rTmpPtr2] // Load h_1
3915      fma.s1             fLnSin22 = fLnSin22, fDx4, fLnSin18
3916      nop.i              0
3917}
3918;;
3919{ .mfi
3920      // Load two parts of C
3921      ldfpd              fRes1H, fRes1L = [rTmpPtr], 16
3922      fma.s1             fRcpX = fInvX, fInvX, f0  // (1/x)^2
3923      shladd             GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
3924}
3925{ .mfi
3926      shladd             GR_ad_z_2 = GR_Index2, 2, rZ2Addr  // Point to Z_2
3927      fma.s1             FR_h = fFloatN, FR_log2_lo, FR_h// h = N * log2_lo + h
3928      nop.i              0
3929}
3930;;
3931{ .mfi
3932      ld4                GR_Z_2 = [GR_ad_z_2] // Load Z_2
3933      fnma.s1            fInvXL = f8, fInvX, f1 // relative error of 1/x
3934      nop.i              0
3935}
3936{ .mfi
3937      adds               rTmpPtr2 = 8, GR_ad_tbl_2
3938      fma.s1             fLnSin8 = fLnSin8, fDxSqr, fLnSin6
3939      nop.i              0
3940}
3941;;
3942{ .mfi
3943      ldfps              FR_G2, FR_H2 = [GR_ad_tbl_2],8   // Load G_2, H_2
3944      // poly_lo = r * Q4 + Q3
3945      fma.s1             FR_poly_lo = FR_r, FR_Q4, FR_Q3
3946      nop.i              0
3947}
3948{ .mfi
3949      ldfd               fh2Dx = [rTmpPtr2] // Load h_2
3950      fmpy.s1            FR_rsq = FR_r, FR_r // rsq = r * r
3951      nop.i              0
3952}
3953;;
3954{ .mfi
3955      nop.m              0
3956      fma.s1             fA1L = fB2, fInvX, f0 // (B2*(1/x))hi
3957      nop.i              0
3958}
3959{ .mfi
3960      // Put integer N into rightmost significand
3961      setf.sig           fFloatNDx = GR_N
3962      fms.s1             fRes4H = fResH, f1, f1  // ln(|x|)hi - 1
3963      nop.i              0
3964}
3965;;
3966{ .mfi
3967      nop.m              0
3968      fadd.s1            fRes2H = fRes5H, fLnSin2//(lnSin4*DeltaX^2 + lnSin2)hi
3969      // Get bits 30-15 of X_1 * Z_2
3970      pmpyshr2.u         GR_X_2 = GR_X_1,GR_Z_2,15
3971}
3972{ .mfi
3973      nop.m              0
3974      fms.s1             fRes5L = fLnSin4, fDxSqr, fRes5H
3975      nop.i              0
3976}
3977;;
3978//
3979//    For performance, don't use result of pmpyshr2.u for 4 cycles.
3980//
3981{ .mfi
3982      nop.m              0
3983      fma.s1             fInvX4 = fRcpX, fRcpX, f0 // (1/x)^4
3984      nop.i              0
3985}
3986{ .mfi
3987      nop.m              0
3988      fma.s1             fB6 = fB6, fRcpX, fB4
3989      nop.i              0
3990}
3991;;
3992{ .mfi
3993      // store signgam if size of variable is 4 bytes
3994(p6)  st4                [rSgnGamAddr] = rSgnGam
3995      fma.s1             fB18 = fB18, fRcpX, fB16
3996      nop.i              0
3997}
3998{ .mfi
3999      // store signgam if size of variable is 8 bytes
4000(p7)  st8                [rSgnGamAddr] = rSgnGam
4001      fma.s1             fInvXL = fInvXL, fInvX, f0 // low part of 1/x
4002      nop.i              0
4003}
4004;;
4005{ .mfi
4006      nop.m              0
4007      // poly_lo = poly_lo * r + Q2
4008      fma.s1             FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
4009      nop.i              0
4010}
4011{ .mfi
4012      nop.m              0
4013      fma.s1             FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
4014      nop.i              0
4015}
4016;;
4017{ .mfi
4018      nop.m              0
4019      fma.s1             fRes3H = fRes4H, f8, f0 // (-|x|*(ln(|x|)-1))hi
4020      extr.u             GR_Index3 = GR_X_2, 1, 5  // Extract bits 1-5 of X_2
4021}
4022{ .mfi
4023      nop.m              0
4024      // poly_hi = Q1 * rsq + r
4025      fma.s1             FR_poly_hi = FR_Q1, FR_rsq, FR_r
4026      nop.i              0
4027}
4028;;
4029{ .mfi
4030      shladd             GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
4031      fms.s1             fA2L = fB2, fInvX, fA1L // delta(B2*(1/x))
4032      nop.i              0
4033}
4034{ .mfi
4035      nop.m              0
4036      fnma.s1            fBrnH = fRes1H, f1, fA1L // (-C - S(1/x))hi
4037      nop.i              0
4038}
4039;;
4040{ .mfi
4041      ldfps              fG3Dx, fH3Dx = [GR_ad_tbl_3],8 // Load G_3, H_3
4042      fma.s1             fInvX8 = fInvX4, fInvX4, f0 // (1/x)^8
4043      nop.i              0
4044}
4045{ .mfi
4046      nop.m              0
4047      fma.s1             fB10 = fB10, fRcpX, fB8
4048      nop.i              0
4049}
4050;;
4051
4052{ .mfi
4053      ldfd               fh3Dx = [GR_ad_tbl_3] // Load h_3
4054      fma.s1             fB20 = fB20, fInvX4, fB18
4055      nop.i              0
4056}
4057{ .mfi
4058      nop.m              0
4059      fma.s1             fB14 = fB14, fRcpX, fB12
4060      nop.i              0
4061}
4062;;
4063{ .mfi
4064      nop.m              0
4065      fma.s1             fLnSin36 = fLnSin36, fDx8, fLnSin30
4066      nop.i              0
4067}
4068{ .mfi
4069      nop.m              0
4070      fma.s1             fLnSin12 = fLnSin12, fDxSqr, fLnSin10
4071      nop.i              0
4072}
4073;;
4074{ .mfi
4075      nop.m              0
4076      fsub.s1            fRes2L = fLnSin2, fRes2H
4077      nop.i              0
4078}
4079{ .mfi
4080      nop.m              0
4081      fma.s1             fPol = fRes2H, fDxSqr, f0 // high part of LnSin
4082      nop.i              0
4083}
4084;;
4085{ .mfi
4086      nop.m              0
4087      fnma.s1            fResH = fResH, FR_MHalf, fResH // -0.5*ln(|x|)hi
4088      nop.i              0
4089}
4090{ .mfi
4091      nop.m              0
4092      fmpy.s1            fGDx = fGDx, FR_G2 // G = G_1 * G_2
4093      nop.i              0
4094}
4095;;
4096{ .mfi
4097      nop.m              0
4098      // poly_lo = poly_lo*r^3 + h
4099      fma.s1             FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
4100      nop.i              0
4101}
4102{ .mfi
4103      nop.m              0
4104      // B2lo*(1/x)hi+ delta(B2*(1/x))
4105      fma.s1             fA2L = fB2L, fInvX, fA2L
4106      nop.i              0
4107}
4108;;
4109{ .mfi
4110      nop.m              0
4111      fma.s1             fB20 = fB20, fInvX4, fB14
4112      nop.i              0
4113}
4114{ .mfi
4115      nop.m              0
4116      fma.s1             fB10 = fB10, fInvX4, fB6
4117      nop.i              0
4118}
4119;;
4120{ .mfi
4121      nop.m              0
4122      fcvt.xf            fFloatNDx = fFloatNDx
4123      nop.i              0
4124}
4125{ .mfi
4126      nop.m              0
4127      fma.s1             fLnSin14 = fLnSin14, fDx4, fLnSin12
4128      nop.i              0
4129}
4130;;
4131{ .mfi
4132      nop.m              0
4133      fma.s1             fLnSin36 = fLnSin36, fDx8, fLnSin22
4134      nop.i              0
4135}
4136{ .mfi
4137      nop.m              0
4138      fms.s1             fRes3L = fRes4H, f8, fRes3H // delta(-|x|*(ln(|x|)-1))
4139      nop.i              0
4140}
4141;;
4142{ .mfi
4143      nop.m              0
4144      fmpy.s1            fGDx = fGDx, fG3Dx // G = (G_1 * G_2) * G_3
4145      nop.i              0
4146}
4147{ .mfi
4148      nop.m              0
4149      // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))hi
4150      fadd.s1            fRes4H = fRes3H, fResH
4151      nop.i              0
4152}
4153;;
4154{ .mfi
4155      nop.m              0
4156      fma.s1             fA2L = fInvXL, fB2, fA2L //(B2*(1/x))lo
4157      nop.i              0
4158}
4159{ .mfi
4160      nop.m              0
4161      // low part of log(|x|) = Y_lo = poly_hi + poly_lo
4162      fadd.s1            fResL = FR_poly_hi, FR_poly_lo
4163      nop.i              0
4164}
4165;;
4166{ .mfi
4167      nop.m              0
4168      fma.s1             fB20 = fB20, fInvX8, fB10
4169      nop.i              0
4170}
4171{ .mfi
4172      nop.m              0
4173      fma.s1             fInvX3 = fInvX, fRcpX, f0 // (1/x)^3
4174      nop.i              0
4175}
4176;;
4177{ .mfi
4178      nop.m              0
4179      fadd.s1            fHDx = fHDx, FR_H2 // H = H_1 + H_2
4180      nop.i              0
4181}
4182{ .mfi
4183      nop.m              0
4184      fadd.s1            fRes5L = fRes5L, fLnSin2L
4185      nop.i              0
4186}
4187;;
4188{ .mfi
4189      nop.m              0
4190      fadd.s1            fRes2L = fRes2L, fRes5H
4191      nop.i              0
4192}
4193{ .mfi
4194      nop.m              0
4195      fadd.s1            fhDx = fhDx, fh2Dx // h = h_1 + h_2
4196      nop.i              0
4197}
4198;;
4199{ .mfi
4200      nop.m              0
4201      fms.s1             fBrnL = fRes1H, fMOne, fBrnH
4202      nop.i              0
4203}
4204{ .mfi
4205      nop.m              0
4206      fms.s1             FR_r = fGDx, fNormDx, f1 // r = G * S_hi - 1
4207      nop.i              0
4208}
4209;;
4210{ .mfi
4211      nop.m              0
4212      fma.s1             fRes3L = fResL, f8 , fRes3L // (-|x|*(ln(|x|)-1))lo
4213      nop.i              0
4214}
4215{ .mfi
4216      nop.m              0
4217      fsub.s1            fRes4L = fRes3H, fRes4H
4218      nop.i              0
4219}
4220;;
4221{ .mfi
4222      nop.m              0
4223      // low part of "Bernulli" polynomial
4224      fma.s1             fB20 = fB20, fInvX3, fA2L
4225      nop.i              0
4226}
4227{ .mfi
4228      nop.m              0
4229      fnma.s1            fResL = fResL, FR_MHalf, fResL // -0.5*ln(|x|)lo
4230      nop.i              0
4231}
4232;;
4233{ .mfi
4234      nop.m              0
4235      fadd.s1            fHDx = fHDx, fH3Dx // H = (H_1 + H_2) + H_3
4236      nop.i              0
4237}
4238{ .mfi
4239      nop.m              0
4240      fms.s1             fPolL = fRes2H, fDxSqr, fPol
4241      nop.i              0
4242}
4243;;
4244{ .mfi
4245      nop.m              0
4246      fadd.s1            fhDx = fhDx, fh3Dx // h = (h_1 + h_2) + h_3
4247      nop.i              0
4248}
4249{ .mfi
4250      nop.m              0
4251      // (-|x|*(ln(|x|)-1) - 0.5ln(|x|) - C - S(1/x))hi
4252      fadd.s1            fB14 = fRes4H, fBrnH
4253      nop.i              0
4254}
4255;;
4256{ .mfi
4257      nop.m              0
4258      // poly_lo = r * Q4 + Q3
4259      fma.s1             FR_poly_lo = FR_r, FR_Q4, FR_Q3
4260      nop.i              0
4261}
4262{ .mfi
4263      nop.m              0
4264      fmpy.s1            FR_rsq = FR_r, FR_r // rsq = r * r
4265      nop.i              0
4266}
4267;;
4268{ .mfi
4269      nop.m              0
4270      fadd.s1            fRes4L = fRes4L, fResH
4271      nop.i              0
4272}
4273{ .mfi
4274      nop.m              0
4275      fadd.s1            fBrnL = fBrnL, fA1L
4276      nop.i              0
4277}
4278;;
4279{ .mfi
4280      nop.m              0
4281      // (-|x|*(ln(|x|)-1))lo + (-0.5ln(|x|))lo
4282      fadd.s1            fRes3L = fRes3L, fResL
4283      nop.i              0
4284}
4285{ .mfi
4286      nop.m              0
4287      fnma.s1            fB20 = fRes1L, f1, fB20 // -Clo - S(1/x)lo
4288      nop.i              0
4289}
4290;;
4291{ .mfi
4292      nop.m              0
4293      fadd.s1            fRes2L = fRes2L, fRes5L // (lnSin4*DeltaX^2 + lnSin2)lo
4294      nop.i              0
4295}
4296{ .mfi
4297      nop.m              0
4298      fma.s1             fPolL = fDxSqrL, fRes2H, fPolL
4299      nop.i              0
4300}
4301;;
4302{ .mfi
4303      nop.m              0
4304      fma.s1             fLnSin14 = fLnSin14, fDx4, fLnSin8
4305      nop.i              0
4306}
4307{ .mfi
4308      nop.m              0
4309      fma.s1             fLnSin36 = fLnSin36,  fDx8, f0
4310      nop.i              0
4311}
4312;;
4313{ .mfi
4314      nop.m              0
4315      // poly_lo = poly_lo * r + Q2
4316      fma.s1             FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
4317      nop.i              0
4318}
4319{ .mfi
4320      nop.m              0
4321      fma.s1             FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
4322      nop.i              0
4323}
4324;;
4325{ .mfi
4326      nop.m              0
4327      // poly_hi = Q1 * rsq + r
4328      fma.s1             FR_poly_hi = FR_Q1, FR_rsq, FR_r
4329      nop.i              0
4330}
4331{ .mfi
4332      nop.m              0
4333      fsub.s1            fB12 = fRes4H, fB14
4334      nop.i              0
4335}
4336;;
4337{ .mfi
4338      nop.m              0
4339      // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))lo
4340      fadd.s1            fRes4L = fRes4L, fRes3L
4341      nop.i              0
4342}
4343{ .mfi
4344      nop.m              0
4345      fadd.s1            fBrnL = fBrnL, fB20 // (-C - S(1/x))lo
4346      nop.i              0
4347}
4348;;
4349{ .mfi
4350      nop.m              0
4351      // high part of log(|DeltaX|) = Y_hi = N * log2_hi + H
4352      fma.s1             fLnDeltaH = fFloatNDx, FR_log2_hi, fHDx
4353      nop.i              0
4354}
4355{ .mfi
4356      nop.m              0
4357      // h = N * log2_lo + h
4358      fma.s1             fhDx = fFloatNDx, FR_log2_lo, fhDx
4359      nop.i              0
4360}
4361;;
4362{ .mfi
4363      nop.m              0
4364      fma.s1             fPolL = fRes2L, fDxSqr, fPolL
4365      nop.i              0
4366}
4367{ .mfi
4368      nop.m              0
4369      fma.s1             fLnSin14 = fLnSin36,  fDxSqr, fLnSin14
4370      nop.i              0
4371}
4372;;
4373{ .mfi
4374      nop.m              0
4375      // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))lo + (- C - S(1/x))lo
4376      fadd.s1            fBrnL = fBrnL, fRes4L
4377      nop.i              0
4378}
4379{ .mfi
4380      nop.m              0
4381      fadd.s1            fB12 = fB12, fBrnH
4382      nop.i              0
4383}
4384;;
4385{ .mfi
4386      nop.m              0
4387      // poly_lo = poly_lo*r^3 + h
4388      fma.s1             FR_poly_lo = FR_poly_lo, FR_rcub, fhDx
4389      nop.i              0
4390}
4391{ .mfi
4392      nop.m              0
4393      fnma.s1            fRes1H = fLnDeltaH, f1, fPol//(-ln(|DeltaX|) + LnSin)hi
4394      nop.i              0
4395}
4396;;
4397{ .mfi
4398      nop.m              0
4399      fma.s1             fPolL = fDxSqrL, fRes2L, fPolL
4400      nop.i              0
4401}
4402{ .mfi
4403      nop.m              0
4404      fma.s1             fLnSin36 = fLnSin14, fDx6, f0
4405      nop.i              0
4406}
4407;;
4408{ .mfi
4409      nop.m              0
4410      // (-|x|*(ln(|x|)-1) - 0.5ln(|x|) - C - S(1/x))lo
4411      fadd.s1            fB12 = fB12, fBrnL
4412      nop.i              0
4413}
4414;;
4415{ .mfi
4416      nop.m              0
4417      // low part of log(|DeltaX|) =  Y_lo = poly_hi + poly_lo
4418      fadd.s1            fLnDeltaL= FR_poly_hi, FR_poly_lo
4419      nop.i              0
4420}
4421{ .mfi
4422      nop.m              0
4423      fms.s1             fRes1L = fLnDeltaH, fMOne, fRes1H
4424      nop.i              0
4425}
4426;;
4427{ .mfi
4428      nop.m              0
4429      fadd.s1            fPolL = fPolL, fLnSin36
4430      nop.i              0
4431}
4432{ .mfi
4433      nop.m              0
4434      //(-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi + (-ln(|DeltaX|) + LnSin)hi
4435      fadd.s1            f8 = fRes1H, fB14
4436      nop.i              0
4437}
4438;;
4439{ .mfi
4440      nop.m              0
4441      //max((-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi,
4442      //    (-ln(|DeltaX|) + LnSin)hi)
4443      famax.s1           fMaxNegStir = fRes1H, fB14
4444      nop.i              0
4445}
4446{ .mfi
4447      nop.m              0
4448      //min((-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi,
4449      //    (-ln(|DeltaX|) + LnSin)hi)
4450      famin.s1           fMinNegStir = fRes1H, fB14
4451      nop.i              0
4452}
4453;;
4454{ .mfi
4455      nop.m              0
4456      fadd.s1            fRes1L = fRes1L, fPol
4457      nop.i              0
4458}
4459{ .mfi
4460      nop.m              0
4461      // (-ln(|DeltaX|))lo + (LnSin)lo
4462      fnma.s1            fPolL = fLnDeltaL, f1, fPolL
4463      nop.i              0
4464}
4465;;
4466{ .mfi
4467      nop.m              0
4468      fsub.s1            f9 = fMaxNegStir, f8 // delta1
4469      nop.i              0
4470}
4471;;
4472{ .mfi
4473      nop.m              0
4474      fadd.s1            fRes1L = fRes1L, fPolL // (-ln(|DeltaX|) + LnSin)lo
4475      nop.i              0
4476}
4477;;
4478{ .mfi
4479      nop.m              0
4480      fadd.s1            f9 = f9, fMinNegStir
4481      nop.i              0
4482}
4483;;
4484{ .mfi
4485      nop.m              0
4486      fadd.s1            fRes1L = fRes1L, fB12
4487      nop.i              0
4488}
4489;;
4490{ .mfi
4491      // low part of the result
4492      fadd.s1            f9 = f9, fRes1L
4493      nop.i              0
4494}
4495;;
4496{ .mfb
4497      nop.m              0
4498      // final result for -2^63 < x < -6.0 path
4499      fma.s0             f8 = f8, f1, f9
4500      // exit here  for -2^63 < x < -6.0 path
4501      br.ret.sptk        b0
4502}
4503;;
4504
4505// _negRoots: here if x falls in the neighbourhood of any negative root of
4506// lgammal; "neighbourhood" typically means that |lgammal(x)| < 0.17, and on
4507// the [-3.0,-2.0] range |lgammal(x)| has even less magnitude.
4508// On entry: rXint contains the index of the root, p10 is set if the root
4509// belongs to the "right" ones, p11 is set if it belongs to the "left" ones.
4510// lgammal(x) is approximated by a polynomial of 19th degree in r = x - root:
4511//   A0 + A1*r + A2*r^2 + r^3*(A3 + A4*r + ... + A19*r^16)
4512// A0..A2 are loaded as (hi, lo) pairs and combined with explicit low parts.
4513// signgam is stored through rSgnGamAddr; the result is returned in f8.
4514.align 32
4515_negRoots:
4516{ .mfi
4517      addl          rPolDataPtr= @ltoff(lgammal_right_roots_polynomial_data),gp
4518      nop.f              0
4519      shl                rTmpPtr2 = rXint, 7 // i*128 = (i*16)*8, i = rXint
4520}
4521{ .mfi
4522      adds               rRootsAddr = -288, rRootsBndAddr // address the root is loaded from
4523      nop.f              0
4524      nop.i              0
4525}
4526;;
4527{ .mfi
4528      ldfe               fRoot = [rRootsAddr] // FP representation of root
4529      nop.f              0
4530      shl                rTmpPtr = rXint, 6  // i*64 = (i*16)*4
4531}
4532{ .mfi
4533(p11) adds               rTmpPtr2 = 3536, rTmpPtr2 // "left" root: +3536 (= 17*208)
4534      nop.f              0
4535      nop.i              0
4536}
4537;;
4538{ .mfi
4539      ld8                rPolDataPtr = [rPolDataPtr] // table address from the @ltoff slot
4540      nop.f              0
4541      shladd             rTmpPtr = rXint, 4, rTmpPtr // (i*16) + (i*16)*4 = i*80
4542}
4543{ .mfi
4544      adds               rTmpPtr3 = 32, rTmpPtr2 // 32 = offset of A2 within a record
4545      nop.f              0
4546      nop.i              0
4547}
4548;;
4549.pred.rel "mutex",p10,p11
4550{ .mfi
4551      add                rTmpPtr3 = rTmpPtr, rTmpPtr3 // record offset + 32
4552      nop.f              0
4553(p10) cmp.eq             p8, p9 = rXRnd, r0 // "right" root: p8 if rXRnd == 0, else p9
4554}
4555{ .mfi
4556      // record offset = (i*16) + (i*16)*4 + (i*16)*8 = i*208 (+3536 if p11)
4557      add                rTmpPtr = rTmpPtr, rTmpPtr2
4558      nop.f              0
4559(p11) cmp.eq             p9, p8 = rXRnd, r0 // "left" root: p9 if rXRnd == 0, else p8
4560}
4561;;
4562{ .mfi
4563      add                rTmpPtr2 = rPolDataPtr, rTmpPtr3 // -> A2 of this root
4564      nop.f              0
4565      nop.i              0
4566}
4567{ .mfi
4568      add                rPolDataPtr = rPolDataPtr, rTmpPtr // begin + offset -> A0
4569      nop.f              0
4570      nop.i              0
4571}
4572;;
4573{ .mfi
4574      ldfpd              fA0, fA0L = [rPolDataPtr], 16 // A0 (hi, lo)
4575      nop.f              0
4576      adds               rTmpPtr = 112, rTmpPtr2 // -> A12
4577}
4578{ .mfi
4579      ldfpd              fA2, fA2L = [rTmpPtr2], 16 // A2 (hi, lo)
4580      nop.f              0
4581      cmp.eq             p12, p13 = 4, rSgnGamSize // p12: signgam is 4 bytes wide
4582}
4583;;
4584{ .mfi
4585      ldfpd              fA1, fA1L = [rPolDataPtr], 16 // A1 (hi, lo)
4586      nop.f              0
4587      nop.i              0
4588}
4589{ .mfi
4590      ldfe               fA3 = [rTmpPtr2], 128 // A3; pointer advances to A16
4591      nop.f              0
4592      nop.i              0
4593}
4594;;
4595{ .mfi
4596      ldfpd              fA12, fA13 = [rTmpPtr], 16 // A12, A13
4597      nop.f              0
4598      adds               rTmpPtr3 = 64, rPolDataPtr // -> A6
4599}
4600{ .mfi
4601      ldfpd              fA16, fA17 = [rTmpPtr2], 16 // A16, A17
4602      nop.f              0
4603      adds               rPolDataPtr = 32, rPolDataPtr // -> A4
4604}
4605;;
4606.pred.rel "mutex",p8,p9
4607{ .mfi
4608      ldfpd              fA14, fA15 = [rTmpPtr], 16 // A14, A15
4609      nop.f              0
4610      // sign of GAMMA(x) is negative
4611(p8)  adds               rSgnGam = -1, r0
4612}
4613{ .mfi
4614      ldfpd              fA18, fA19 = [rTmpPtr2], 16 // A18, A19
4615      nop.f              0
4616      // sign of GAMMA(x) is positive
4617(p9)  adds               rSgnGam = 1, r0
4618}
4619;;
4620{ .mfi
4621      ldfe               fA4 = [rPolDataPtr], 16 // A4
4622      nop.f              0
4623      nop.i              0
4624}
4625{ .mfi
4626      ldfpd              fA6, fA7 = [rTmpPtr3], 16 // A6, A7
4627      nop.f              0
4628      nop.i              0
4629}
4630;;
4631{ .mfi
4632      ldfe               fA5 = [rPolDataPtr], 16 // A5
4633      // p6 is set if x equals the (rounded) root exactly
4634      fcmp.eq.s1         p6, p0 = f8, fRoot
4635      nop.i              0
4636}
4637{ .mfi
4638      ldfpd              fA8, fA9 = [rTmpPtr3], 16 // A8, A9
4639      fms.s1             FR_FracX = f8, f1, fRoot // r = x - root
4640      nop.i              0
4641}
4642;;
4643{ .mfi
4644      // store signgam if size of variable is 4 bytes
4645(p12) st4                [rSgnGamAddr] = rSgnGam
4646      nop.f              0
4647      nop.i              0
4648}
4649{ .mfb
4650      // store signgam if size of variable is 8 bytes
4651(p13) st8                [rSgnGamAddr] = rSgnGam
4652      // answer if x equals the (rounded) root exactly: A0 + A0L
4653(p6)  fadd.s0            f8 = fA0, fA0L
4654      // exit if x equals the (rounded) root exactly
4655(p6)  br.ret.spnt        b0
4656}
4657;;
4658{ .mmf
4659      ldfpd              fA10, fA11 = [rTmpPtr3], 16 // A10, A11
4660      nop.m              0
4661      nop.f              0
4662}
4663;;
4664{ .mfi
4665      nop.m              0
4666      fma.s1             fResH = fA2, FR_FracX, f0 // (A2*r)hi
4667      nop.i              0
4668}
4669{ .mfi
4670      nop.m              0
4671      fma.s1             fA4L = FR_FracX, FR_FracX, f0 // r^2
4672      nop.i              0
4673}
4674;;
4675{ .mfi
4676      nop.m              0
4677      fma.s1             fA17 = fA17, FR_FracX, fA16 // A17*r + A16
4678      nop.i              0
4679}
4680{.mfi
4681      nop.m              0
4682      fma.s1             fA13 = fA13, FR_FracX, fA12 // A13*r + A12
4683      nop.i              0
4684}
4685;;
4686{ .mfi
4687      nop.m              0
4688      fma.s1             fA19 = fA19, FR_FracX, fA18 // A19*r + A18
4689      nop.i              0
4690}
4691{.mfi
4692      nop.m              0
4693      fma.s1             fA15 = fA15, FR_FracX, fA14 // A15*r + A14
4694      nop.i              0
4695}
4696;;
4697{.mfi
4698      nop.m              0
4699      fma.s1             fPol = fA7, FR_FracX, fA6 // A7*r + A6
4700      nop.i              0
4701}
4702;;
4703{.mfi
4704      nop.m              0
4705      fma.s1             fA9 = fA9, FR_FracX, fA8 // A9*r + A8
4706      nop.i              0
4707}
4708;;
4709{ .mfi
4710      nop.m              0
4711      fms.s1             fResL = fA2, FR_FracX, fResH // delta(A2*r)
4712      nop.i              0
4713}
4714{.mfi
4715      nop.m              0
4716      fadd.s1            fRes1H = fResH, fA1 // (A2*r + A1)hi
4717      nop.i              0
4718}
4719;;
4720{ .mfi
4721      nop.m              0
4722      fma.s1             fA11 = fA11, FR_FracX, fA10 // A11*r + A10
4723      nop.i              0
4724}
4725{.mfi
4726      nop.m              0
4727      fma.s1             fA5L = fA4L, fA4L, f0 // r^4
4728      nop.i              0
4729}
4730;;
4731{ .mfi
4732      nop.m              0
4733      fma.s1             fA19 = fA19, fA4L, fA17 // A16 + A17*r + A18*r^2 + A19*r^3
4734      nop.i              0
4735}
4736{.mfi
4737      nop.m              0
4738      fma.s1             fA15 = fA15, fA4L, fA13 // A12 + A13*r + A14*r^2 + A15*r^3
4739      nop.i              0
4740}
4741;;
4742{ .mfi
4743      nop.m              0
4744      fma.s1             fPol = fPol, FR_FracX, fA5 // A5 + A6*r + A7*r^2
4745      nop.i              0
4746}
4747{.mfi
4748      nop.m              0
4749      fma.s1             fA3L = fA4L, FR_FracX, f0 // r^3
4750      nop.i              0
4751}
4752;;
4753{ .mfi
4754      nop.m              0
4755      // delta(A2*r) + A2L*r = (A2*r)lo
4756      fma.s1             fResL = fA2L, FR_FracX, fResL
4757      nop.i              0
4758}
4759{.mfi
4760      nop.m              0
4761      fsub.s1            fRes1L = fA1, fRes1H // A1 - (A2*r + A1)hi
4762      nop.i              0
4763}
4764;;
4765{ .mfi
4766      nop.m              0
4767      fma.s1             fA11 = fA11, fA4L, fA9 // A8 + A9*r + A10*r^2 + A11*r^3
4768      nop.i              0
4769}
4770{.mfi
4771      nop.m              0
4772      fma.s1             fA19 = fA19, fA5L, fA15 // A12 + A13*r + ... + A19*r^7
4773      nop.i              0
4774}
4775;;
4776{.mfi
4777      nop.m              0
4778      fma.s1             fPol = fPol, FR_FracX, fA4 // A4 + A5*r + A6*r^2 + A7*r^3
4779      nop.i              0
4780}
4781;;
4782{ .mfi
4783      nop.m              0
4784      fadd.s1            fResL = fResL, fA1L // (A2*r)lo + A1L
4785      nop.i              0
4786}
4787{.mfi
4788      nop.m              0
4789      fadd.s1            fRes1L = fRes1L, fResH // A1 - (A2*r + A1)hi + (A2*r)hi
4790      nop.i              0
4791}
4792;;
4793{ .mfi
4794      nop.m              0
4795      fma.s1             fRes2H = fRes1H, FR_FracX, f0 // ((A2*r + A1)*r)hi
4796      nop.i              0
4797}
4798;;
4799{.mfi
4800      nop.m              0
4801      fma.s1             fA19 = fA19, fA5L, fA11 // A8 + A9*r + ... + A19*r^11
4802      nop.i              0
4803}
4804;;
4805{.mfi
4806      nop.m              0
4807      fma.s1             fPol = fPol, FR_FracX, fA3 // A3 + A4*r + ... + A7*r^4
4808      nop.i              0
4809}
4810;;
4811{ .mfi
4812      nop.m              0
4813      fadd.s1            fRes1L = fRes1L, fResL // (A2*r + A1)lo
4814      nop.i              0
4815}
4816;;
4817{ .mfi
4818      nop.m              0
4819      // delta((A2*r + A1)*r)
4820      fms.s1             fRes2L = fRes1H, FR_FracX, fRes2H
4821      nop.i              0
4822}
4823{.mfi
4824      nop.m              0
4825      fadd.s1            fRes3H = fRes2H, fA0 // ((A2*r + A1)*r + A0)hi
4826      nop.i              0
4827}
4828;;
4829{ .mfi
4830      nop.m              0
4831      fma.s1             fA19 = fA19, fA5L, f0 // (A8 + A9*r + ... + A19*r^11)*r^4
4832      nop.i              0
4833}
4834
4835;;
4836{ .mfi
4837      nop.m              0
4838      fma.s1             fRes2L = fRes1L, FR_FracX, fRes2L // ((A2*r + A1)*r)lo
4839      nop.i              0
4840}
4841{.mfi
4842      nop.m              0
4843      fsub.s1            fRes3L = fRes2H, fRes3H // ((A2*r + A1)*r)hi - fRes3H
4844      nop.i              0
4845}
4846;;
4847{.mfi
4848      nop.m              0
4849      fma.s1             fPol = fA19, FR_FracX, fPol // A3 + A4*r + ... + A19*r^16
4850      nop.i              0
4851}
4852;;
4853{ .mfi
4854      nop.m              0
4855      fadd.s1            fRes3L = fRes3L, fA0 // ((A2*r + A1)*r)hi - fRes3H + A0
4856      nop.i              0
4857}
4858{.mfi
4859      nop.m              0
4860      fadd.s1            fRes2L = fRes2L, fA0L // ((A2*r + A1)*r)lo + A0L
4861      nop.i              0
4862}
4863;;
4864{ .mfi
4865      nop.m              0
4866      fadd.s1            fRes3L = fRes3L, fRes2L // (((A2*r + A1)*r) + A0)lo
4867      nop.i              0
4868}
4869;;
4870{.mfi
4871      nop.m              0
4872      fma.s1             fRes3L = fPol, fA3L, fRes3L // r^3*(A3 + ... + A19*r^16) + lo
4873      nop.i              0
4874}
4875;;
4876{ .mfb
4877      nop.m              0
4878      // final result for arguments which are close to negative roots
4879      fma.s0             f8 = fRes3H, f1, fRes3L
4880      // exit here for arguments which are close to negative roots
4881      br.ret.sptk        b0
4882}
4883;;
4884
4885// here if |x| < 0.5
4886.align 32
4887lgammal_0_half:
4888{ .mfi
4889      ld4                GR_Z_1 = [rZ1offsett] // Load Z_1
4890      fma.s1             fA4L = f8, f8, f0 // x^2
4891      addl               rPolDataPtr    = @ltoff(lgammal_0_Half_data), gp
4892}
4893{ .mfi
4894      shladd             GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr// Point to G_1
4895      nop.f              0
4896      addl               rLnSinDataPtr    = @ltoff(lgammal_lnsin_data), gp
4897}
4898;;
4899{ .mfi
4900      ldfps              FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
4901      nop.f              0
4902      // Point to Constants_Z_2
4903      add                GR_ad_z_2 = 0x140, GR_ad_z_1
4904}
4905{ .mfi
4906      add                GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q
4907      nop.f              0
4908      // Point to Constants_G_H_h2
4909      add                GR_ad_tbl_2 = 0x180, GR_ad_z_1
4910}
4911;;
4912{ .mfi
4913      ld8                rPolDataPtr = [rPolDataPtr]
4914      nop.f              0
4915      // Point to Constants_G_H_h3
4916      add                GR_ad_tbl_3 = 0x280, GR_ad_z_1
4917}
4918{ .mfi
4919      ldfd               FR_h = [GR_ad_tbl_1] // Load h_1
4920      nop.f              0
4921      sub                GR_N = rExpX, rExpHalf, 1
4922}
4923;;
4924{ .mfi
4925      ld8                rLnSinDataPtr    = [rLnSinDataPtr]
4926      nop.f              0
4927      // Get bits 30-15 of X_0 * Z_1
4928      pmpyshr2.u         GR_X_1 = GR_X_0,GR_Z_1,15
4929}
4930{ .mfi
4931      ldfe               FR_log2_hi = [GR_ad_q],16 // Load log2_hi
4932      nop.f              0
4933      sub                GR_N = r0, GR_N
4934}
4935;;
4936//
4937//    For performance, don't use result of pmpyshr2.u for 4 cycles.
4938//
4939{ .mfi
4940      ldfe               FR_log2_lo = [GR_ad_q], 16 // Load log2_lo
4941      nop.f              0
4942      add                rTmpPtr2 = 320, rPolDataPtr
4943}
4944{ .mfi
4945      add                rTmpPtr = 32, rPolDataPtr
4946      nop.f              0
4947      // exponent of 0.25
4948      adds               rExp2 = -1, rExpHalf
4949}
4950;;
4951{ .mfi
4952      ldfpd              fA3, fA3L = [rPolDataPtr], 16 // A3
4953      fma.s1             fA5L = fA4L, fA4L, f0 // x^4
4954      nop.i              0
4955}
4956{ .mfi
4957      ldfpd              fA1, fA1L = [rTmpPtr], 16 // A1
4958      fms.s1             fB8 = f8, f8, fA4L // x^2 - <x^2>
4959      // set p6 if -0.5 < x <= -0.25
4960(p15) cmp.eq.unc         p6, p0 = rExpX, rExp2
4961}
4962;;
4963{ .mfi
4964      ldfpd              fA2, fA2L = [rPolDataPtr], 16 // A2
4965      nop.f              0
4966      // set p6 if -0.5 < x <= -0.40625
4967(p6)  cmp.le.unc         p6, p0 = 10, GR_Index1
4968}
4969{ .mfi
4970      ldfe               fA21 = [rTmpPtr2], -16 // A21
4971      // Put integer N into rightmost significand
4972      nop.f              0
4973      adds               rTmpPtr = 240, rTmpPtr
4974}
4975;;
4976{ .mfi
4977      setf.sig           fFloatN = GR_N
4978      nop.f              0
4979      extr.u             GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
4980}
4981{ .mfi
4982      ldfe               FR_Q4 = [GR_ad_q], 16 // Load Q4
4983      nop.f              0
4984      adds               rPolDataPtr = 304, rPolDataPtr
4985}
4986;;
4987{ .mfi
4988      ldfe               fA20 = [rTmpPtr2], -32 // A20
4989      nop.f              0
4990      shladd             GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2  // Point to Z_2
4991}
4992{ .mfi
4993      ldfe               fA19 = [rTmpPtr], -32 // A19
4994      nop.f              0
4995      shladd             GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2// Point to G_2
4996}
4997;;
4998{ .mfi
4999      ldfe               fA17 = [rTmpPtr], -32 // A17
5000      nop.f              0
5001      adds               rTmpPtr3 = 8, GR_ad_tbl_2
5002}
5003{ .mfb
5004      ldfe               fA18 = [rTmpPtr2], -32 // A18
5005      nop.f              0
5006      // branch to special path for -0.5 < x <= -0.40625
5007(p6)  br.cond.spnt       lgammal_near_neg_half
5008}
5009;;
5010{ .mmf
5011      ld4                GR_Z_2 = [GR_ad_z_2] // Load Z_2
5012      ldfe               fA15 = [rTmpPtr], -32 // A15
5013      fma.s1             fB20 = fA5L, fA5L, f0 // x^8
5014}
5015;;
5016{ .mmf
5017      ldfe               fA16 = [rTmpPtr2], -32 // A16
5018      ldfe               fA13 = [rTmpPtr], -32 // A13
5019      fms.s1             fB16 = fA4L, fA4L, fA5L
5020}
5021;;
5022{ .mmf
5023      ldfps              FR_G2, FR_H2 = [GR_ad_tbl_2], 8 // Load G_2, H_2
5024      ldfd               FR_h2 = [rTmpPtr3] // Load h_2
5025      fmerge.s           fB10 = f8, fA5L // sign(x) * x^4
5026}
5027;;
5028{ .mmi
5029      ldfe               fA14 = [rTmpPtr2], -32 // A14
5030      ldfe               fA11 = [rTmpPtr], -32 // A11
5031      // Get bits 30-15 of X_1 * Z_2
5032      pmpyshr2.u         GR_X_2 = GR_X_1,GR_Z_2,15
5033}
5034;;
5035//
5036//    For performance, don't use result of pmpyshr2.u for 4 cycles.
5037//
5038{ .mfi
5039      ldfe               fA12 = [rTmpPtr2], -32 // A12
5040      fma.s1             fRes4H = fA3, fAbsX, f0
5041      adds               rTmpPtr3 = 16, GR_ad_q
5042}
5043{ .mfi
5044      ldfe               fA9 = [rTmpPtr], -32 // A9
5045      nop.f              0
5046      nop.i              0
5047}
5048;;
5049{ .mmf
5050      ldfe               fA10 = [rTmpPtr2], -32 // A10
5051      ldfe               fA7 = [rTmpPtr], -32 // A7
5052      fma.s1             fB18 = fB20, fB20, f0 // x^16
5053}
5054;;
5055{ .mmf
5056      ldfe               fA8 = [rTmpPtr2], -32 // A8
5057      ldfe               fA22 = [rPolDataPtr], 16 // A22
5058      fcvt.xf            fFloatN = fFloatN
5059}
5060;;
5061{ .mfi
5062      ldfe               fA5 = [rTmpPtr], -32 // A5
5063      fma.s1             fA21 = fA21, fAbsX, fA20 // v16
5064      extr.u             GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
5065}
5066{ .mfi
5067      ldfe               fA6 = [rTmpPtr2], -32 // A6
5068      nop.f              0
5069      nop.i              0
5070}
5071;;
5072{ .mmf
5073      // Point to G_3
5074      shladd             GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3
5075      ldfe               fA4 = [rTmpPtr2], -32 // A4
5076      fma.s1             fA19 = fA19, fAbsX, fA18 // v13
5077}
5078;;
5079.pred.rel "mutex",p14,p15
5080{ .mfi
5081      ldfps              FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
5082      fms.s1             fRes4L = fA3, fAbsX, fRes4H
5083(p14) adds               rSgnGam = 1, r0
5084}
5085{ .mfi
5086      cmp.eq             p6, p7 = 4, rSgnGamSize
5087      fadd.s1            fRes2H = fRes4H, fA2
5088(p15) adds               rSgnGam = -1, r0
5089}
5090;;
5091
5092{ .mfi
5093      ldfd               FR_h3 = [GR_ad_tbl_3] // Load h_3
5094      fma.s1             fA17 = fA17, fAbsX, fA16 // v12
5095      nop.i              0
5096}
5097;;
5098{ .mfi
5099      ldfe               FR_Q3 = [GR_ad_q], 32 // Load Q3
5100      fmpy.s1            FR_G = FR_G, FR_G2 // G = G_1 * G_2
5101      nop.i              0
5102}
5103{ .mfi
5104      ldfe               FR_Q2 = [rTmpPtr3], 16 // Load Q2
5105      fadd.s1            FR_H = FR_H, FR_H2 // H = H_1 + H_2
5106      nop.i              0
5107}
5108;;
5109{ .mfi
5110      ldfe               FR_Q1 = [GR_ad_q] // Load Q1
5111      fma.s1             fA15 = fA15, fAbsX, fA14 // v8
5112      nop.i              0
5113}
5114{ .mfi
5115      adds               rTmpPtr3 = 32, rLnSinDataPtr
5116      fadd.s1            FR_h = FR_h, FR_h2 // h = h_1 + h_2
5117      nop.i              0
5118}
5119;;
5120{ .mmf
5121      ldfpd              fLnSin2, fLnSin2L = [rLnSinDataPtr], 16
5122      ldfe               fLnSin6 = [rTmpPtr3], 32
5123      fma.s1             fA13 = fA13, fAbsX, fA12 // v7
5124
5125}
5126;;
5127{ .mfi
5128      ldfe               fLnSin4 = [rLnSinDataPtr], 32
5129      fma.s1             fRes4L = fA3L, fAbsX, fRes4L
5130      nop.i              0
5131}
5132{ .mfi
5133      ldfe               fLnSin10 = [rTmpPtr3], 32
5134      fsub.s1            fRes2L = fA2, fRes2H
5135      nop.i              0
5136}
5137;;
5138{ .mfi
5139      ldfe               fLnSin8 = [rLnSinDataPtr], 32
5140      fma.s1             fResH = fRes2H, fAbsX, f0
5141      nop.i              0
5142}
5143{ .mfi
5144      ldfe               fLnSin14 = [rTmpPtr3], 32
5145      fma.s1             fA22 = fA22, fA4L, fA21 // v15
5146      nop.i              0
5147}
5148;;
5149{ .mfi
5150      ldfe               fLnSin12 = [rLnSinDataPtr], 32
5151      fma.s1             fA9 = fA9, fAbsX, fA8 // v4
5152      nop.i              0
5153}
5154{ .mfi
5155      ldfd               fLnSin18 = [rTmpPtr3], 16
5156      fma.s1             fA11 = fA11, fAbsX, fA10 // v5
5157      nop.i              0
5158}
5159;;
5160{ .mfi
5161      ldfe               fLnSin16 = [rLnSinDataPtr], 24
5162      fma.s1             fA19 = fA19, fA4L, fA17 // v11
5163      nop.i              0
5164}
5165{ .mfi
5166      ldfd               fLnSin22 = [rTmpPtr3], 16
5167      fma.s1             fPolL = fA7, fAbsX, fA6
5168      nop.i              0
5169}
5170;;
5171{ .mfi
5172      ldfd               fLnSin20 = [rLnSinDataPtr], 16
5173      fmpy.s1            FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
5174      nop.i              0
5175}
5176{ .mfi
5177      ldfd               fLnSin26 = [rTmpPtr3], 16
5178      fadd.s1            FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
5179      nop.i              0
5180}
5181;;
5182{ .mfi
5183      ldfd               fLnSin24 = [rLnSinDataPtr], 16
5184      fadd.s1            fRes2L = fRes2L, fRes4H
5185      nop.i              0
5186}
5187{ .mfi
5188      ldfd               fLnSin30 = [rTmpPtr3], 16
5189      fadd.s1            fA2L = fA2L, fRes4L
5190      nop.i              0
5191}
5192;;
5193{ .mfi
5194      ldfd               fLnSin28 = [rLnSinDataPtr], 16
5195      fms.s1             fResL = fRes2H, fAbsX, fResH
5196      nop.i              0
5197}
5198{ .mfi
5199      ldfd               fLnSin34 = [rTmpPtr3], 8
5200      fadd.s1            fRes2H = fResH, fA1
5201      nop.i              0
5202}
5203;;
5204{ .mfi
5205      ldfd               fLnSin32 = [rLnSinDataPtr]
5206      fma.s1             fA11 = fA11, fA4L, fA9 // v3
5207      nop.i              0
5208}
5209{ .mfi
5210      ldfd               fLnSin36 = [rTmpPtr3]
5211      fma.s1             fA15 = fA15, fA4L, fA13 // v6
5212      nop.i              0
5213}
5214;;
5215
5216{ .mfi
5217      // store signgam if size of variable is 4 bytes
5218(p6)  st4                [rSgnGamAddr] = rSgnGam
5219      fadd.s1            FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
5220      nop.i              0
5221}
5222{ .mfi
5223      // store signgam if size of variable is 8 bytes
5224(p7)  st8                [rSgnGamAddr] = rSgnGam
5225      fma.s1             fA5 = fA5, fAbsX, fA4
5226      nop.i              0
5227}
5228;;
5229{ .mfi
5230      nop.m              0
5231      fms.s1             FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1
5232      nop.i              0
5233}
5234{ .mfi
5235      nop.m               0
5236      // High part of the log(|x|): Y_hi = N * log2_hi + H
5237      fms.s1             FR_log2_hi = fFloatN, FR_log2_hi, FR_H
5238      nop.i              0
5239}
5240;;
5241{ .mfi
5242      nop.m              0
5243      fadd.s1            fA3L = fRes2L, fA2L
5244      nop.i              0
5245}
5246{ .mfi
5247      nop.m              0
5248      fma.s1             fA22 = fA22, fA5L, fA19
5249      nop.i              0
5250}
5251;;
5252{ .mfi
5253      nop.m              0
5254      fsub.s1            fRes2L = fA1, fRes2H
5255      nop.i              0
5256}
5257{ .mfi
5258      nop.m              0
5259      fma.s1             fRes3H = fRes2H, f8, f0
5260      nop.i              0
5261}
5262;;
5263{ .mfi
5264      nop.m              0
5265      fma.s1             fA15 = fA15, fA5L, fA11 // v2
5266      nop.i              0
5267}
5268{ .mfi
5269      nop.m              0
5270      fma.s1             fLnSin18 = fLnSin18, fA4L, fLnSin16
5271      nop.i              0
5272}
5273;;
5274{ .mfi
5275      nop.m              0
5276      // h = N * log2_lo + h
5277      fms.s1             FR_h = fFloatN, FR_log2_lo, FR_h
5278      nop.i              0
5279}
5280{ .mfi
5281      nop.m              0
5282      fma.s1             fPolL = fPolL, fA4L, fA5
5283      nop.i              0
5284}
5285;;
5286{ .mfi
5287      nop.m              0
5288      // poly_lo = r * Q4 + Q3
5289      fma.s1             FR_poly_lo = FR_r, FR_Q4, FR_Q3
5290      nop.i              0
5291}
5292{ .mfi
5293      nop.m              0
5294      fmpy.s1            FR_rsq = FR_r, FR_r // rsq = r * r
5295      nop.i              0
5296}
5297;;
5298{ .mfi
5299      nop.m              0
5300      fma.s1             fResL = fA3L, fAbsX, fResL
5301      nop.i              0
5302}
5303{ .mfi
5304      nop.m              0
5305      fma.s1             fLnSin30 = fLnSin30, fA4L, fLnSin28
5306      nop.i              0
5307}
5308;;
5309{ .mfi
5310      nop.m              0
5311      fadd.s1            fRes2L = fRes2L, fResH
5312      nop.i              0
5313}
5314{ .mfi
5315      nop.m              0
5316      fms.s1             fRes3L = fRes2H, f8, fRes3H
5317      nop.i              0
5318}
5319;;
5320{ .mfi
5321      nop.m              0
5322      fadd.s1            fRes1H = fRes3H, FR_log2_hi
5323      nop.i              0
5324}
5325{ .mfi
5326      nop.m              0
5327      fma.s1             fPol = fB20, fA22, fA15
5328      nop.i              0
5329}
5330;;
5331{ .mfi
5332      nop.m              0
5333      fma.s1             fLnSin34 = fLnSin34, fA4L, fLnSin32
5334      nop.i              0
5335}
5336{ .mfi
5337      nop.m              0
5338      fma.s1             fLnSin14 = fLnSin14, fA4L, fLnSin12
5339      nop.i              0
5340}
5341;;
5342
5343{ .mfi
5344      nop.m              0
5345      // poly_lo = poly_lo * r + Q2
5346      fma.s1             FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
5347      nop.i              0
5348}
5349{ .mfi
5350      nop.m              0
5351      fnma.s1            FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
5352      nop.i              0
5353}
5354;;
5355{ .mfi
5356      nop.m              0
5357      // poly_hi = Q1 * rsq + r
5358      fma.s1             FR_poly_hi = FR_Q1, FR_rsq, FR_r
5359      nop.i              0
5360}
5361{ .mfi
5362      nop.m              0
5363      fadd.s1            fA1L = fA1L, fResL
5364      nop.i              0
5365}
5366;;
5367
5368{ .mfi
5369      nop.m              0
5370      fma.s1             fLnSin22 = fLnSin22, fA4L, fLnSin20
5371      nop.i              0
5372}
5373{ .mfi
5374      nop.m              0
5375      fma.s1             fLnSin26 = fLnSin26, fA4L, fLnSin24
5376      nop.i              0
5377}
5378;;
5379
5380{ .mfi
5381      nop.m              0
5382      fsub.s1            fRes1L = FR_log2_hi, fRes1H
5383      nop.i              0
5384}
5385{ .mfi
5386      nop.m              0
5387      fma.s1             fPol = fPol, fA5L, fPolL
5388      nop.i              0
5389}
5390;;
5391{ .mfi
5392      nop.m              0
5393      fma.s1             fLnSin34 = fLnSin36, fA5L, fLnSin34
5394      nop.i              0
5395}
5396{ .mfi
5397      nop.m              0
5398      fma.s1             fLnSin18 = fLnSin18, fA5L, fLnSin14
5399      nop.i              0
5400}
5401;;
5402{ .mfi
5403      nop.m              0
5404      fma.s1             fLnSin6 = fLnSin6, fA4L, fLnSin4
5405      nop.i              0
5406}
5407{ .mfi
5408      nop.m              0
5409      fma.s1             fLnSin10 = fLnSin10, fA4L, fLnSin8
5410      nop.i              0
5411}
5412;;
5413{ .mfi
5414      nop.m              0
5415      // poly_hi = Q1 * rsq + r
5416      fma.s1             FR_poly_hi = FR_Q1, FR_rsq, FR_r
5417      nop.i              0
5418}
5419{ .mfi
5420      nop.m              0
5421      fadd.s1            fRes2L = fRes2L, fA1L
5422      nop.i              0
5423}
5424;;
5425{ .mfi
5426      nop.m              0
5427      // poly_lo = poly_lo*r^3 + h
5428      fma.s1             FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
5429      nop.i              0
5430}
5431{ .mfi
5432      nop.m              0
5433      fma.s1             fB2 = fLnSin2, fA4L, f0
5434      nop.i              0
5435}
5436;;
5437{ .mfi
5438      nop.m              0
5439      fadd.s1            fRes1L = fRes1L, fRes3H
5440      nop.i              0
5441}
5442{ .mfi
5443      nop.m              0
5444      fma.s1             fPol = fPol, fB10, f0
5445      nop.i              0
5446}
5447;;
5448{ .mfi
5449      nop.m              0
5450      fma.s1             fLnSin26 = fLnSin26, fA5L, fLnSin22
5451      nop.i              0
5452}
5453{ .mfi
5454      nop.m              0
5455      fma.s1             fLnSin34 = fLnSin34, fA5L, fLnSin30
5456      nop.i              0
5457}
5458;;
5459{ .mfi
5460      nop.m              0
5461      fma.s1             fLnSin10 = fLnSin10, fA5L, fLnSin6
5462      nop.i              0
5463}
5464{ .mfi
5465      nop.m              0
5466      fma.s1             fLnSin2L = fLnSin2L, fA4L, f0
5467      nop.i              0
5468}
5469;;
5470
5471{ .mfi
5472      nop.m              0
5473      fma.s1             fRes3L = fRes2L, f8, fRes3L
5474      nop.i              0
5475}
5476;;
5477{ .mfi
5478      nop.m              0
5479      // Y_lo = poly_hi + poly_lo
5480      fsub.s1            FR_log2_lo = FR_poly_lo, FR_poly_hi
5481      nop.i              0
5482}
5483{ .mfi
5484      nop.m              0
5485      fms.s1             fB4 = fLnSin2, fA4L, fB2
5486      nop.i              0
5487}
5488;;
5489{ .mfi
5490      nop.m              0
5491      fadd.s1            fRes2H = fRes1H, fPol
5492      nop.i              0
5493}
5494;;
5495{ .mfi
5496      nop.m              0
5497      fma.s1             fLnSin34 = fLnSin34, fB20, fLnSin26
5498      nop.i              0
5499}
5500;;
5501{ .mfi
5502      nop.m              0
5503      fma.s1             fLnSin18 = fLnSin18, fB20, fLnSin10
5504      nop.i              0
5505}
5506{ .mfi
5507      nop.m              0
5508      fma.s1             fLnSin2L = fB8, fLnSin2, fLnSin2L
5509      nop.i              0
5510}
5511;;
5512
5513{ .mfi
5514      nop.m              0
5515      fadd.s1            FR_log2_lo = FR_log2_lo, fRes3L
5516      nop.i              0
5517}
5518;;
5519{ .mfi
5520      nop.m              0
5521      fsub.s1            fRes2L = fRes1H, fRes2H
5522      nop.i              0
5523}
5524;;
5525{ .mfi
5526      nop.m              0
5527      fma.s1             fB6 = fLnSin34, fB18, fLnSin18
5528      nop.i              0
5529}
5530{ .mfi
5531      nop.m              0
5532      fadd.s1            fB4 = fLnSin2L, fB4
5533      nop.i              0
5534}
5535;;
5536
5537{ .mfi
5538      nop.m              0
5539      fadd.s1            fRes1L = fRes1L, FR_log2_lo
5540      nop.i              0
5541}
5542;;
5543{ .mfi
5544      nop.m              0
5545      fadd.s1            fRes2L = fRes2L, fPol
5546      nop.i              0
5547}
5548;;
5549{ .mfi
5550      nop.m              0
5551      fma.s1             fB12 = fB6, fA5L, f0
5552      nop.i              0
5553}
5554;;
5555{ .mfi
5556      nop.m              0
5557      fadd.s1            fRes2L = fRes2L, fRes1L
5558      nop.i              0
5559}
5560;;
5561
5562{ .mfi
5563      nop.m              0
5564      fms.s1             fB14 = fB6, fA5L, fB12
5565      nop.i              0
5566}
5567{ .mfb
5568      nop.m              0
5569      fadd.s1            fLnSin30 = fB2, fB12
5570      // branch out if x is negative
5571(p15) br.cond.spnt       _O_Half_neg
5572}
5573;;
5574{ .mfb
5575      nop.m              0
5576      // sign(x)*Pol(|x|) - log(|x|)
5577      fma.s0             f8 = fRes2H, f1, fRes2L
5578      // it's an answer already for positive x
5579      // exit if 0 < x < 0.5
5580      br.ret.sptk        b0
5581}
5582;;
5583
5584// here if x is negative and |x| < 0.5
// Reached through the (p15) branch of the preceding path, which has already
// formed fLnSin30 = fB2 + fB12. The code below adds that term to the
// hi/lo pair (fRes2H, fRes2L) and returns the rounded sum in f8.
// Every sum s = a + b is paired with (a - s) + b so that its rounding error
// is kept in a separate low-order accumulator.
// NOTE(review): fB*/fLnSin* presumably carry the log-sine (reflection) series
// terms -- inferred from the register names only, confirm against the caller.
5585.align 32
5586_O_Half_neg:
5587{ .mfi
5588      nop.m              0
5589      fma.s1             fB14 = fB16, fB6, fB14 // fB14 += fB16 * fB6
5590      nop.i              0
5591}
5592{ .mfi
5593      nop.m              0
5594      fsub.s1            fLnSin16 = fB2, fLnSin30 // fB2 - (fB2 + fB12)
5595      nop.i              0
5596}
5597;;
5598{ .mfi
5599      nop.m              0
5600      fadd.s1            fResH = fLnSin30, fRes2H // high part of the result
5601      nop.i              0
5602}
5603;;
5604{ .mfi
5605      nop.m              0
5606      fadd.s1            fLnSin16 = fLnSin16, fB12 // rounding error of fB2 + fB12
5607      nop.i              0
5608}
5609{ .mfi
5610      nop.m              0
5611      fadd.s1            fB4 = fB14, fB4 // merge the two low-order terms
5612      nop.i              0
5613}
5614;;
5615{ .mfi
5616      nop.m              0
5617      fadd.s1            fLnSin16 = fB4, fLnSin16 // low-order accumulator
5618      nop.i              0
5619}
5620{ .mfi
5621      nop.m              0
5622      fsub.s1            fResL = fRes2H, fResH // fRes2H - (fLnSin30 + fRes2H)
5623      nop.i              0
5624}
5625;;
5626{ .mfi
5627      nop.m              0
5628      fadd.s1            fResL = fResL, fLnSin30 // rounding error of fResH
5629      nop.i              0
5630}
5631{ .mfi
5632      nop.m              0
5633      fadd.s1            fLnSin16 = fLnSin16, fRes2L // add low part of the incoming pair
5634      nop.i              0
5635}
5636;;
5637{ .mfi
5638      nop.m              0
5639      fadd.s1            fResL = fResL, fLnSin16 // complete low part of the result
5640      nop.i              0
5641}
5642;;
5643{ .mfb
5644      nop.m              0
5645      // final result for -0.5 < x < 0
5646      fma.s0             f8 = fResH, f1, fResL
5647      // exit for -0.5 < x < 0
5648      br.ret.sptk        b0
5649}
5650;;
5651
5652// here if x >= 8.0
5653// there are two computational paths:
5654// 1) For x >10.0 Stirling's formula is used
5655// 2) Polynomial approximation for 8.0 <= x <= 10.0
// Values read here but established before this label (not visible in this
// section): f8 = x, fRcpX (approximation of 1/x, refined below by three
// Newton-Raphson steps), rSignifX, rExpX, rExp8, rExpHalf, GR_Index1,
// GR_ad_z_1, rTbl1Addr, rSgnGamAddr, rSgnGamSize.
// The Stirling path computes, as hi/lo pairs,
//   x*(ln(x)-1) - 0.5*ln(x) + C + S(1/x)
// with ln(x) built from the G/H/h table lookups plus a polynomial in r.
5656.align 32
5657lgammal_big_positive:
5658{ .mfi
5659      addl               rPolDataPtr = @ltoff(lgammal_data), gp
5660      fmerge.se          fSignifX =  f1, f8 // significand of x with exponent of 1.0
5661      // Get high 15 bits of significand
5662      extr.u             GR_X_0 = rSignifX, 49, 15
5663}
5664{.mfi
5665      shladd             rZ1offsett = GR_Index1, 2, GR_ad_z_1  // Point to Z_1
5666      fnma.s1            fInvX = f8, fRcpX, f1 // start of 1st NR iteration
5667      adds               rSignif1andQ = 0x5, r0
5668}
5669;;
5670{.mfi
5671      ld4                GR_Z_1 = [rZ1offsett] // Load Z_1
5672      nop.f              0
5673      shl                rSignif1andQ = rSignif1andQ, 61 // significand of 1.25
5674}
5675{  .mfi
5676      cmp.eq             p8, p0 = rExpX, rExp8 // p8 = 1 if 8.0 <= x < 16
5677      nop.f              0
5678      adds               rSgnGam = 1, r0 // gamma is positive at this range
5679}
5680;;
5681{ .mfi
5682      shladd             GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr// Point to G_1
5683      nop.f              0
5684      add                GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q
5685}
5686{ .mlx
5687      ld8                rPolDataPtr    = [rPolDataPtr]
5688      movl               rDelta = 0x3FF2000000000000 // 1.125 in DP
5689}
5690;;
5691{ .mfi
5692      ldfps              FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
5693      nop.f              0
5694      add                GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
5695}
5696{ .mfi
5697      // Point to Constants_G_H_h2
5698      add                GR_ad_tbl_2 = 0x180, GR_ad_z_1
5699      nop.f              0
5700      // p8 = 1 if 8.0 <= x <= 10.0
5701(p8)  cmp.leu.unc        p8, p0 = rSignifX, rSignif1andQ
5702}
5703;;
5704{ .mfi
5705      ldfd               FR_h = [GR_ad_tbl_1] // Load h_1
5706      nop.f              0
5707      // Get bits 30-15 of X_0 * Z_1
5708      pmpyshr2.u         GR_X_1 = GR_X_0,GR_Z_1,15
5709}
5710{ .mfb
5711(p8)  setf.d             FR_MHalf = rDelta // FR_MHalf = 1.125 for the 8..10 path
5712      nop.f              0
5713(p8)  br.cond.spnt       lgammal_8_10 // branch out if 8.0 <= x <= 10.0
5714}
5715;;
5716//
5717//    For performance, don't use result of pmpyshr2.u for 4 cycles.
5718//
5719{ .mfi
5720      ldfe               fA1 = [rPolDataPtr], 16 // Load overflow threshold
5721      fma.s1             fRcpX = fInvX, fRcpX, fRcpX // end of 1st NR iteration
5722      // Point to Constants_G_H_h3
5723      add                GR_ad_tbl_3 = 0x280, GR_ad_z_1
5724}
5725{ .mlx
5726      nop.m              0
5727      movl               rDelta = 0xBFE0000000000000 // -0.5 in DP
5728}
5729;;
5730{ .mfi
5731      ldfe               FR_log2_hi = [GR_ad_q],16 // Load log2_hi
5732      nop.f              0
5733      sub                GR_N = rExpX, rExpHalf, 1 // unbiased exponent of x
5734}
5735;;
5736{ .mfi
5737      ldfe               FR_log2_lo = [GR_ad_q],16 // Load log2_lo
5738      nop.f              0
5739      nop.i              0
5740}
5741{ .mfi
5742      setf.d             FR_MHalf = rDelta // FR_MHalf = -0.5 on the Stirling path
5743      nop.f              0
5744      nop.i              0
5745}
5746;;
5747{ .mfi
5748      // Put integer N into rightmost significand
5749      setf.sig           fFloatN = GR_N
5750      nop.f              0
5751      extr.u             GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
5752}
5753{ .mfi
5754      ldfe               FR_Q4 = [GR_ad_q], 16 // Load Q4
5755      nop.f              0
5756      nop.i              0
5757}
5758;;
5759{ .mfi
5760      shladd             GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2  // Point to Z_2
5761      nop.f              0
5762      shladd             GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2// Point to G_2
5763}
5764{ .mfi
5765      ldfe               FR_Q3 = [GR_ad_q], 16 // Load Q3
5766      nop.f              0
5767      nop.i              0
5768}
5769;;
5770{ .mfi
5771      ld4                GR_Z_2 = [GR_ad_z_2] // Load Z_2
5772      fnma.s1            fInvX = f8, fRcpX, f1 // start of 2nd NR iteration
5773      nop.i              0
5774}
5775;;
5776{ .mfi
5777      ldfps              FR_G2, FR_H2 = [GR_ad_tbl_2], 8 // Load G_2, H_2
5778      nop.f              0
5779      nop.i              0
5780}
5781;;
5782{ .mfi
5783      ldfd               FR_h2 = [GR_ad_tbl_2] // Load h_2
5784      nop.f              0
5785      nop.i              0
5786}
5787;;
5788{ .mfi
5789      ldfe               FR_Q2 = [GR_ad_q],16 // Load Q2
5790      nop.f              0
5791      // Get bits 30-15 of X_1 * Z_2
5792      pmpyshr2.u         GR_X_2 = GR_X_1,GR_Z_2,15
5793}
5794;;
5795//
5796//    For performance, don't use result of pmpyshr2.u for 4 cycles.
5797//
5798{ .mfi
5799      ldfe               FR_Q1 = [GR_ad_q] // Load Q1
5800      fcmp.gt.s1         p7,p0 = f8, fA1 // check if x > overflow threshold
5801      nop.i              0
5802}
5803;;
5804{.mfi
5805      ldfpd              fA0, fA0L = [rPolDataPtr], 16 // Load two parts of C
5806      fma.s1             fRcpX = fInvX, fRcpX, fRcpX // end of 2nd NR iteration
5807      nop.i              0
5808}
5809;;
5810{ .mfb
5811      ldfpd              fB2, fA1 = [rPolDataPtr], 16 // two doubles, summed into fB2 below
5812      nop.f              0
5813(p7)  br.cond.spnt       lgammal_overflow // branch if x > overflow threshold
5814}
5815;;
5816{.mfi
5817      ldfe               fB4 = [rPolDataPtr], 16
5818      fcvt.xf            fFloatN = fFloatN // integer N -> floating point
5819      extr.u             GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
5820}
5821;;
5822{ .mfi
5823      shladd             GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3// Point to G_3
5824      nop.f              0
5825      nop.i              0
5826}
5827{ .mfi
5828      ldfe               fB6 = [rPolDataPtr], 16
5829      nop.f              0
5830      nop.i              0
5831}
5832;;
5833{ .mfi
5834      ldfps              FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3
5835      nop.f              0
5836      nop.i              0
5837}
5838;;
5839{ .mfi
5840      ldfd               FR_h3 = [GR_ad_tbl_3] // Load h_3
5841      fmpy.s1            FR_G = FR_G, FR_G2 // G = G_1 * G_2
5842      nop.i              0
5843}
5844{ .mfi
5845      nop.m              0
5846      fadd.s1            FR_H = FR_H, FR_H2 // H = H_1 + H_2
5847      nop.i              0
5848}
5849;;
5850
5851{ .mfi
5852      ldfe               fB8 = [rPolDataPtr], 16
5853      fadd.s1            FR_h = FR_h, FR_h2 // h = h_1 + h_2
5854      nop.i              0
5855}
5856{ .mfi
5857      nop.m              0
5858      fnma.s1            fInvX = f8, fRcpX, f1 // start of 3rd NR iteration
5859      nop.i              0
5860}
5861;;
5862{ .mfi
5863      ldfe               fB10 = [rPolDataPtr], 16
5864      nop.f              0
5865      cmp.eq             p6, p7 = 4, rSgnGamSize // p6: 4-byte signgam, p7: otherwise
5866}
5867;;
5868{ .mfi
5869      ldfe               fB12 = [rPolDataPtr], 16
5870      nop.f              0
5871      nop.i              0
5872}
5873;;
5874{ .mfi
5875      ldfe               fB14 = [rPolDataPtr], 16
5876      nop.f              0
5877      nop.i              0
5878}
5879;;
5880
5881{ .mfi
5882      ldfe               fB16 = [rPolDataPtr], 16
5883      // get double extended coefficients from two doubles
5884      // two doubles are needed in Stirling's formula for negative x
5885      fadd.s1            fB2 = fB2, fA1
5886      nop.i              0
5887}
5888;;
5889{ .mfi
5890      ldfe               fB18 = [rPolDataPtr], 16
5891      fma.s1             fInvX = fInvX, fRcpX, fRcpX // end of 3rd NR iteration
5892      nop.i              0
5893}
5894;;
5895{ .mfi
5896      ldfe               fB20 = [rPolDataPtr], 16
5897      nop.f              0
5898      nop.i              0
5899}
5900;;
5901{ .mfi
5902      // store signgam if size of variable is 4 bytes
5903(p6)  st4                [rSgnGamAddr] = rSgnGam
5904      fmpy.s1            FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
5905      nop.i              0
5906}
5907{ .mfi
5908      // store signgam if size of variable is 8 bytes
5909(p7)  st8                [rSgnGamAddr] = rSgnGam
5910      fadd.s1            FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
5911      nop.i              0
5912}
5913;;
5914{ .mfi
5915      nop.m              0
5916      fadd.s1            FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
5917      nop.i              0
5918}
5919;;
5920{ .mfi
5921      nop.m              0
5922      fma.s1             fRcpX = fInvX, fInvX, f0 // 1/x^2
5923      nop.i              0
5924}
5925{ .mfi
5926      nop.m              0
5927      fma.s1             fA0L = fB2, fInvX, fA0L // Clo + fB2 * (1/x)
5928      nop.i              0
5929}
5930;;
5931{ .mfi
5932      nop.m              0
5933      fms.s1             FR_r = fSignifX, FR_G, f1 // r = G * S_hi - 1
5934      nop.i              0
5935}
5936{ .mfi
5937      nop.m              0
5938      // High part of the log(x): Y_hi = N * log2_hi + H
5939      fma.s1             fRes2H = fFloatN, FR_log2_hi, FR_H
5940      nop.i              0
5941}
5942;;
5943
5944{ .mfi
5945      nop.m              0
5946      // h = N * log2_lo + h
5947      fma.s1             FR_h = fFloatN, FR_log2_lo, FR_h
5948      nop.i              0
5949}
5950{ .mfi
5951      nop.m              0
5952      // second copy of Y_hi = N * log2_hi + H (scaled by -0.5 below)
5953      fma.s1             fRes1H = fFloatN, FR_log2_hi, FR_H
5954      nop.i              0
5955}
5956;;
5957{.mfi
5958      nop.m              0
5959      fma.s1             fPol = fB18, fRcpX, fB16 // v9
5960      nop.i              0
5961}
5962{ .mfi
5963      nop.m              0
5964      fma.s1             fA2L = fRcpX, fRcpX, f0   // v10 = 1/x^4
5965      nop.i              0
5966}
5967;;
5968{.mfi
5969      nop.m              0
5970      fma.s1             fA3 = fB6, fRcpX, fB4     // v3
5971      nop.i              0
5972}
5973{ .mfi
5974      nop.m              0
5975      fma.s1             fA4 = fB10, fRcpX, fB8    // v4
5976      nop.i              0
5977}
5978;;
5979{ .mfi
5980      nop.m              0
5981      fms.s1             fRes2H =fRes2H, f1, f1 //  log_Hi(x) -1
5982      nop.i              0
5983}
5984{ .mfi
5985      nop.m              0
5986      // poly_lo = r * Q4 + Q3
5987      fma.s1             FR_poly_lo = FR_r, FR_Q4, FR_Q3
5988      nop.i              0
5989}
5990;;
5991{ .mfi
5992      nop.m              0
5993      fma.s1             fRes1H = fRes1H, FR_MHalf, f0 // -0.5*log_Hi(x)
5994      nop.i              0
5995}
5996{ .mfi
5997      nop.m              0
5998      fmpy.s1            FR_rsq = FR_r, FR_r // rsq = r * r
5999      nop.i              0
6000}
6001;;
6002{ .mfi
6003      nop.m              0
6004      fma.s1             fA7 = fB14, fRcpX, fB12  // v7
6005      nop.i              0
6006}
6007{ .mfi
6008      nop.m              0
6009      fma.s1             fA8 = fA2L, fB20, fPol   // v8
6010      nop.i              0
6011}
6012;;
6013{ .mfi
6014      nop.m              0
6015      fma.s1             fA2 = fA4, fA2L, fA3    // v2
6016      nop.i              0
6017}
6018{ .mfi
6019      nop.m              0
6020      fma.s1             fA4L = fA2L, fA2L, f0    // v5 = 1/x^8
6021      nop.i              0
6022}
6023;;
6024{ .mfi
6025      nop.m              0
6026      fma.s1             fResH = fRes2H, f8, f0 // (x*(ln(x)-1))hi
6027      nop.i              0
6028}
6029{ .mfi
6030      nop.m              0
6031      // poly_lo = poly_lo * r + Q2
6032      fma.s1             FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
6033      nop.i              0
6034}
6035;;
6036{ .mfi
6037      nop.m              0
6038      fma.s1             FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
6039      nop.i              0
6040}
6041{ .mfi
6042      nop.m              0
6043      // poly_hi = Q1 * rsq + r
6044      fma.s1             FR_poly_hi = FR_Q1, FR_rsq, FR_r
6045      nop.i              0
6046}
6047;;
6048{ .mfi
6049      nop.m              0
6050      fma.s1             fA11 = fRcpX, fInvX, f0 // 1/x^3
6051      nop.i              0
6052}
6053{ .mfi
6054      nop.m              0
6055      fma.s1             fA6 = fA8, fA2L, fA7   // v6
6056      nop.i              0
6057}
6058;;
6059{ .mfi
6060      nop.m              0
6061      fms.s1             fResL = fRes2H, f8, fResH // d(x*(ln(x)-1))
6062      nop.i              0
6063}
6064{ .mfi
6065      nop.m              0
6066      fadd.s1            fRes3H = fResH, fRes1H // (x*(ln(x)-1) -0.5ln(x))hi
6067      nop.i              0
6068}
6069;;
6070{ .mfi
6071      nop.m              0
6072      // poly_lo = poly_lo*r^3 + h
6073      fma.s1             FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
6074      nop.i              0
6075}
6076;;
6077{ .mfi
6078      nop.m              0
6079      fma.s1             fPol = fA4L, fA6, fA2   // v1 = v2 + v6/x^8
6080      nop.i              0
6081}
6082{ .mfi
6083      nop.m              0
6084      // raise inexact exception
6085      fma.s0             FR_log2_lo = FR_log2_lo, FR_log2_lo, f0
6086      nop.i              0
6087}
6088;;
6089{ .mfi
6090      nop.m              0
6091      fadd.s1            fRes4H = fRes3H, fA0 // (x*(ln(x)-1) -0.5ln(x))hi + Chi
6092      nop.i              0
6093}
6094{ .mfi
6095      nop.m              0
6096      fsub.s1            fRes3L = fResH, fRes3H // start of error of fResH + fRes1H
6097      nop.i              0
6098}
6099;;
6100{ .mfi
6101      nop.m              0
6102      // Y_lo = poly_hi + poly_lo
6103      fadd.s1            fRes2L = FR_poly_hi, FR_poly_lo
6104      nop.i              0
6105}
6106;;
6107
6108{ .mfi
6109      nop.m              0
6110      fma.s1             fA0L = fPol, fA11, fA0L // S(1/x) + Clo
6111      nop.i              0
6112}
6113;;
6114{ .mfi
6115      nop.m              0
6116      fadd.s1            fRes3L = fRes3L, fRes1H // rounding error of fRes3H
6117      nop.i              0
6118}
6119{ .mfi
6120      nop.m              0
6121      fsub.s1            fRes4L = fRes3H, fRes4H // start of error of fRes3H + fA0
6122      nop.i              0
6123}
6124;;
6125{ .mfi
6126      nop.m              0
6127      fma.s1             fResL = fRes2L, f8 , fResL // lo part of x*(ln(x)-1)
6128      nop.i              0
6129}
6130;;
6131{ .mfi
6132      nop.m              0
6133      // Clo + S(1/x) - 0.5*logLo(x)
6134      fma.s1             fA0L = fRes2L, FR_MHalf, fA0L
6135      nop.i              0
6136}
6137;;
6138{ .mfi
6139      nop.m              0
6140      fadd.s1            fRes4L = fRes4L, fA0 // rounding error of fRes4H
6141      nop.i              0
6142}
6143;;
6144{ .mfi
6145      nop.m              0
6146      // Clo + S(1/x) - 0.5*logLo(x) + (x*(ln(x)-1))lo
6147      fadd.s1            fA0L = fA0L, fResL
6148      nop.i              0
6149}
6150;;
6151{ .mfi
6152      nop.m              0
6153      fadd.s1            fRes4L = fRes4L, fRes3L // combine both rounding errors
6154      nop.i              0
6155}
6156;;
6157{ .mfi
6158      nop.m              0
6159      fadd.s1            fRes4L = fRes4L, fA0L // complete low part of the result
6160      nop.i              0
6161}
6162;;
6163{ .mfb
6164      nop.m              0
6165      fma.s0             f8 = fRes4H, f1, fRes4L // result = hi + lo
6166      // exit for x > 10.0
6167      br.ret.sptk        b0
6168}
6169;;
6170// here if 8.0 <= x <= 10.0
6171// Result = P15(y), where y = x/8.0 - 1.125
// fSignifX is x with its exponent replaced by that of 1.0 (fmerge.se in
// lgammal_big_positive), i.e. x/8.0 on this range. FR_MHalf was loaded with
// 0x3FF2000000000000 (1.125 in DP) on the branch that leads here.
// A0 and A1 are each read as a pair of doubles and handled as hi/lo pairs;
// A2..A15 are read as extended-precision values through two interleaved
// pointers (even and odd coefficients, stride 32 bytes).
6172.align 32
6173lgammal_8_10:
6174{ .mfi
6175      addl               rPolDataPtr    = @ltoff(lgammal_8_10_data), gp
6176      fms.s1             FR_FracX = fSignifX, f1, FR_MHalf // y = x/8.0 - 1.125
6177      cmp.eq             p6, p7 = 4, rSgnGamSize // p6: 4-byte signgam, p7: otherwise
6178}
6179;;
6180{ .mfi
6181      ld8                rLnSinDataPtr = [rPolDataPtr] // second copy of the table address
6182      nop.f              0
6183      nop.i              0
6184}
6185{ .mfi
6186      ld8                rPolDataPtr = [rPolDataPtr] // table address
6187      nop.f              0
6188      nop.i              0
6189}
6190;;
6191{ .mfi
6192      adds               rZ1offsett = 32, rLnSinDataPtr // -> A2 (even coefficients)
6193      nop.f              0
6194      nop.i              0
6195}
6196{ .mfi
6197      adds               rLnSinDataPtr = 48, rLnSinDataPtr // -> A3 (odd coefficients)
6198      nop.f              0
6199      nop.i              0
6200}
6201;;
6202{ .mfi
6203      ldfpd              fA1, fA1L = [rPolDataPtr], 16 // A1
6204      nop.f              0
6205      nop.i              0
6206}
6207{ .mfi
6208      ldfe               fA2 = [rZ1offsett], 32 // A2
6209      nop.f              0
6210      nop.i              0
6211}
6212;;
6213{ .mfi
6214      ldfpd              fA0, fA0L = [rPolDataPtr], 16 // A0
6215      fma.s1             FR_rsq = FR_FracX, FR_FracX, f0 // y^2
6216      nop.i              0
6217}
6218{ .mfi
6219      ldfe               fA3 = [rLnSinDataPtr],32 // A3
6220      nop.f              0
6221      nop.i              0
6222}
6223;;
6224{ .mmf
6225      ldfe               fA4 = [rZ1offsett], 32 // A4
6226      ldfe               fA5 = [rLnSinDataPtr], 32 // A5
6227      nop.f              0
6228}
6229;;
6230{ .mmf
6231      ldfe               fA6 = [rZ1offsett], 32 // A6
6232      ldfe               fA7 = [rLnSinDataPtr], 32 // A7
6233      nop.f              0
6234}
6235;;
6236{ .mmf
6237      ldfe               fA8 = [rZ1offsett], 32 // A8
6238      ldfe               fA9 = [rLnSinDataPtr], 32 // A9
6239      nop.f              0
6240}
6241;;
6242{ .mmf
6243      ldfe               fA10 = [rZ1offsett], 32 // A10
6244      ldfe               fA11 = [rLnSinDataPtr], 32 // A11
6245      nop.f              0
6246}
6247;;
6248{ .mmf
6249      ldfe               fA12 = [rZ1offsett], 32 // A12
6250      ldfe               fA13 = [rLnSinDataPtr], 32 // A13
6251      fma.s1             FR_Q4 = FR_rsq, FR_rsq, f0 // y^4
6252}
6253;;
6254{ .mmf
6255      ldfe               fA14 = [rZ1offsett], 32 // A14
6256      ldfe               fA15 = [rLnSinDataPtr], 32 // A15
6257      nop.f              0
6258}
6259;;
6260{ .mfi
6261      nop.m              0
6262      fma.s1             fRes1H = FR_FracX, fA1, f0 // (A1*y)hi
6263      nop.i              0
6264}
6265;;
6266{ .mfi
6267      nop.m              0
6268      fma.s1             fA3 = fA3, FR_FracX, fA2 // v4
6269      nop.i              0
6270}
6271;;
6272{ .mfi
6273      nop.m              0
6274      fma.s1             fA5 = fA5, FR_FracX, fA4 // v5
6275      nop.i              0
6276}
6277;;
6278{ .mfi
6279      // store sign of GAMMA(x) if size of variable is 4 bytes
6280(p6)  st4                [rSgnGamAddr] = rSgnGam
6281      fma.s1             fA3L = FR_Q4, FR_Q4, f0 // v9 = y^8
6282      nop.i              0
6283}
6284{ .mfi
6285      // store sign of GAMMA(x) if size of variable is 8 bytes
6286(p7)  st8                [rSgnGamAddr] = rSgnGam
6287      fma.s1             fA7 = fA7, FR_FracX, fA6 // v7
6288      nop.i              0
6289}
6290;;
6291{ .mfi
6292      nop.m              0
6293      fma.s1             fA9 = fA9, FR_FracX, fA8 // v8
6294      nop.i              0
6295}
6296;;
6297{ .mfi
6298      nop.m              0
6299      fms.s1             fRes1L = FR_FracX, fA1, fRes1H // rounding error of A1*y
6300      nop.i              0
6301}
6302{ .mfi
6303      nop.m              0
6304      fma.s1             fA11 = fA11, FR_FracX, fA10 // v12
6305      nop.i              0
6306}
6307;;
6308{ .mfi
6309      nop.m              0
6310      fma.s1             fA13 = fA13, FR_FracX, fA12 // v13
6311      nop.i              0
6312}
6313{ .mfi
6314      nop.m              0
6315      fma.s1             fRes2H = fRes1H, f1, fA0 // (A0 + A1*y)hi
6316      nop.i              0
6317}
6318;;
6319{ .mfi
6320      nop.m              0
6321      fma.s1             fA15 = fA15, FR_FracX, fA14 // v16
6322      nop.i              0
6323}
6324{ .mfi
6325      nop.m              0
6326      fma.s1             fA5 = fA5, FR_rsq, fA3 // v3
6327      nop.i              0
6328}
6329;;
6330{ .mfi
6331      nop.m              0
6332      fma.s1             fA9 = fA9, FR_rsq, fA7 // v6
6333      nop.i              0
6334}
6335;;
6336{ .mfi
6337      nop.m              0
6338      fma.s1             fRes1L = FR_FracX, fA1L, fRes1L // + y * (low double of A1)
6339      nop.i              0
6340}
6341;;
6342{ .mfi
6343      nop.m              0
6344      fms.s1             fRes2L = fA0, f1, fRes2H // start of error of A0 + fRes1H
6345      nop.i              0
6346}
6347{ .mfi
6348      nop.m              0
6349      fma.s1             fA13 = fA13, FR_rsq, fA11 // v11
6350      nop.i              0
6351}
6352;;
6353{ .mfi
6354      nop.m              0
6355      fma.s1             fA9 = fA9, FR_Q4, fA5 // v2
6356      nop.i              0
6357}
6358;;
6359{ .mfi
6360      nop.m              0
6361      fma.s1             fRes1L = fRes1L, f1, fA0L // + low double of A0
6362      nop.i              0
6363}
6364;;
6365{ .mfi
6366      nop.m              0
6367      fma.s1             fRes2L = fRes2L, f1, fRes1H // rounding error of fRes2H
6368      nop.i              0
6369}
6370{ .mfi
6371      nop.m              0
6372      fma.s1             fA15 = fA15, FR_Q4, fA13 // v10
6373      nop.i              0
6374}
6375;;
6376{ .mfi
6377      nop.m              0
6378      fma.s1             fRes2L = fRes1L, f1, fRes2L // low part of A0 + A1*y
6379      nop.i              0
6380}
6381{ .mfi
6382      nop.m              0
6383      fma.s1             fPol = fA3L, fA15, fA9 // v2 + y^8 * v10
6384      nop.i              0
6385}
6386;;
6387{ .mfi
6388      nop.m              0
6389      fma.s1             f8 = FR_rsq , fPol, fRes2H // high part of the result
6390      nop.i              0
6391}
6392{ .mfi
6393      nop.m              0
6394      fma.s1             fPol = fPol, FR_rsq, f0 // y^2 * fPol
6395      nop.i              0
6396}
6397;;
6398{ .mfi
6399      nop.m              0
6400      fms.s1             fRes1L = fRes2H, f1, f8 // start of error of the high part
6401      nop.i              0
6402}
6403;;
6404{ .mfi
6405      nop.m              0
6406      fma.s1             fRes1L = fRes1L, f1, fPol // rounding error of the high part
6407      nop.i              0
6408}
6409;;
6410{.mfi
6411      nop.m              0
6412      fma.s1             fRes1L = fRes1L, f1, fRes2L // complete low part
6413      nop.i              0
6414}
6415;;
6416{ .mfb
6417      nop.m              0
6418      fma.s0             f8 = f8, f1, fRes1L // result = hi + lo
6419      // exit for 8.0 <= x <= 10.0
6420      br.ret.sptk        b0
6421}
6422;;
6423
6424// here if 4.0 <=x < 8.0
// Sets up the arguments of lgamma_polynom25 and jumps there:
//   FR_FracX    = fSignifX - FR_MHalf (polynomial argument)
//   rSgnGam     = 1
//   rPolDataPtr = address of lgammal_4_8_data, rTmpPtr = rPolDataPtr + 160
// NOTE(review): fSignifX and FR_MHalf are set before this label; their
// values for this range are not visible in this section -- confirm at the caller.
6425.align 32
6426lgammal_4_8:
6427{ .mfi
6428      addl               rPolDataPtr= @ltoff(lgammal_4_8_data),gp
6429      fms.s1             FR_FracX = fSignifX, f1, FR_MHalf
6430      adds               rSgnGam = 1, r0
6431}
6432;;
6433{ .mfi
6434      ld8                rPolDataPtr = [rPolDataPtr] // fetch the table address
6435      nop.f              0
6436      nop.i              0
6437}
6438;;
6439
6440{ .mfb
6441      adds               rTmpPtr = 160, rPolDataPtr // second pointer into the same table
6442      nop.f              0
6443      // branch to special path which computes polynomial of 25th degree
6444      br.sptk            lgamma_polynom25
6445}
6446;;
6447
6448// here if 2.25 <=x < 4.0
// Sets up the arguments of lgamma_polynom25 and jumps there:
//   FR_FracX    = fSignifX - FR_MHalf (polynomial argument)
//   rSgnGam     = 1
//   rPolDataPtr = address of lgammal_2Q_4_data, rTmpPtr = rPolDataPtr + 160
// NOTE(review): fSignifX and FR_MHalf are set before this label; their
// values for this range are not visible in this section -- confirm at the caller.
6449.align 32
6450lgammal_2Q_4:
6451{ .mfi
6452      addl               rPolDataPtr= @ltoff(lgammal_2Q_4_data),gp
6453      fms.s1             FR_FracX = fSignifX, f1, FR_MHalf
6454      adds               rSgnGam = 1, r0
6455}
6456;;
6457{ .mfi
6458      ld8                rPolDataPtr = [rPolDataPtr] // fetch the table address
6459      nop.f              0
6460      nop.i              0
6461}
6462;;
6463
6464{ .mfb
6465      adds               rTmpPtr = 160, rPolDataPtr // second pointer into the same table
6466      nop.f              0
6467      // branch to special path which computes polynomial of 25th degree
6468      br.sptk            lgamma_polynom25
6469}
6470;;
6471
6472// here if 0.5 <= |x| < 0.75
// p14 and p15 are declared mutually exclusive below; the p14 bundle handles
// positive x and the p15 bundle negative x. Each selects its own coefficient
// table and value of rSgnGam (+1 / -1) and shifts x by the constant held in
// FR_FracX on entry before jumping to lgamma_polynom25.
// NOTE(review): FR_FracX is expected to hold 0.625 on entry (per the comments
// below); it is set before this label -- confirm at the caller.
6473.align 32
6474lgammal_half_3Q:
6475.pred.rel "mutex", p14, p15
6476{ .mfi
6477(p14) addl               rPolDataPtr= @ltoff(lgammal_half_3Q_data),gp
6478     // FR_FracX = x - 0.625 for positive x
6479(p14) fms.s1             FR_FracX = f8, f1, FR_FracX
6480(p14) adds               rSgnGam = 1, r0
6481}
6482{ .mfi
6483(p15) addl               rPolDataPtr= @ltoff(lgammal_half_3Q_neg_data),gp
6484     // FR_FracX = x + 0.625 for negative x
6485(p15) fma.s1             FR_FracX = f8, f1, FR_FracX
6486(p15) adds               rSgnGam = -1, r0
6487}
6488;;
6489{ .mfi
6490      ld8                rPolDataPtr = [rPolDataPtr] // fetch the selected table address
6491       nop.f              0
6492       nop.i              0
6493}
6494;;
6495{ .mfb
6496      adds               rTmpPtr = 160, rPolDataPtr // second pointer into the same table
6497      nop.f              0
6498      // branch to special path which computes polynomial of 25th degree
6499      br.sptk            lgamma_polynom25
6500}
6501;;
6502// here if 1.3125 <= x < 1.5625
// Sets rSgnGam = 1, forms the polynomial argument FR_FracX = x - fA5L and
// jumps to lgamma_polynom25. Unlike the neighbouring entry points this one
// does not load rPolDataPtr itself; it only derives rTmpPtr from it.
// NOTE(review): rPolDataPtr and fA5L must be set before this label; fA5L is
// presumably the expansion point near the local minimum -- confirm at the caller.
6503.align 32
6504lgammal_loc_min:
6505{ .mfi
6506      adds               rSgnGam = 1, r0
6507      nop.f              0
6508      nop.i              0
6509}
6510{ .mfb
6511      adds               rTmpPtr = 160, rPolDataPtr // second pointer into the same table
6512      fms.s1             FR_FracX = f8, f1, fA5L // FR_FracX = x - fA5L
6513      br.sptk            lgamma_polynom25
6514}
6515;;
6516// here if -2.605859375 <= x < -2.5
6517// special polynomial approximation used since neither "near root"
6518// approximation nor reflection formula give satisfactory accuracy on
6519// this range
// Sets up the arguments of lgamma_polynom25 and jumps there:
//   FR_FracX    = fB20 + x
//   rSgnGam     = -1
//   rPolDataPtr = address of lgammal_neg2andHalf_data, rTmpPtr = rPolDataPtr + 160
// NOTE(review): fB20 is expected to hold 2.5 on entry (per the comment
// below); it is set before this label -- confirm at the caller.
6520.align 32
6521_neg2andHalf:
6522{ .mfi
6523      addl               rPolDataPtr= @ltoff(lgammal_neg2andHalf_data),gp
6524      fma.s1             FR_FracX = fB20, f1, f8 // 2.5 + x
6525      adds               rSgnGam = -1, r0
6526}
6527;;
6528{.mfi
6529      ld8                rPolDataPtr = [rPolDataPtr] // fetch the table address
6530      nop.f              0
6531      nop.i              0
6532}
6533;;
6534{ .mfb
6535      adds               rTmpPtr = 160, rPolDataPtr // second pointer into the same table
6536      nop.f              0
6537      // branch to special path which computes polynomial of 25th degree
6538      br.sptk            lgamma_polynom25
6539}
6540;;
6541
// Entry for -0.5 < x <= -0.40625.
// Sets signgam to -1, builds a constant in FR_FracX from rExpHalf, adds x to
// it, loads the address of lgammal_near_neg_half_data and continues in
// lgamma_polynom25.
6542// here if -0.5 < x <= -0.40625
6543.align 32
6544lgammal_near_neg_half:
6545{ .mmf
6546      addl               rPolDataPtr= @ltoff(lgammal_near_neg_half_data),gp
      // NOTE(review): presumably FR_FracX = 0.5 here, so that the fma below
      // yields x + 0.5 -- confirm against the definition of rExpHalf
6547      setf.exp           FR_FracX = rExpHalf
6548      nop.f              0
6549}
6550;;
6551{ .mfi
      // replace the linkage-table slot address with the table address itself
6552      ld8                rPolDataPtr = [rPolDataPtr]
6553      nop.f              0
6554      adds               rSgnGam = -1, r0
6555}
6556;;
6557{ .mfb
      // second load pointer, 160 bytes into the same table
6558      adds               rTmpPtr = 160, rPolDataPtr
6559      fma.s1             FR_FracX = FR_FracX, f1, f8
6560      // branch to special path which computes polynomial of 25th degree
6561      br.sptk            lgamma_polynom25
6562}
6563;;
6564
// Common tail for all ranges whose result is a degree-25 polynomial in
// FR_FracX (written "x" in the comments below).
//
// On entry:
//   FR_FracX                  polynomial argument
//   rPolDataPtr, rTmpPtr      two pointers into the coefficient table; loads
//                             are issued through both, each post-incremented
//                             by 16
//   rSgnGam                   value to store as signgam
//   rSgnGamAddr, rSgnGamSize  destination and width (4 selects st4,
//                             anything else st8) of the signgam store
// On exit: f8 holds the result; returns through b0.
//
// The low-order part A3*x^3 + A2*x^2 + A1*x + A0 is carried as hi/lo pairs
// (registers named ...H / ...L); the remaining terms are evaluated in plain
// working precision as three sub-polynomials C, D and E that are combined
// with x^8 and x^4.
6565// here if the answer is P25(x)
6566// rPolDataPtr, rTmpPtr point to coefficients
6567// x is in FR_FracX register
6568.align 32
6569lgamma_polynom25:
6570{ .mfi
6571      ldfpd              fA3, fA0L = [rPolDataPtr], 16 // A3
6572      nop.f              0
      // p6: signgam is 4 bytes wide, p7: otherwise
6573      cmp.eq             p6, p7 = 4, rSgnGamSize
6574}
6575{ .mfi
6576      ldfpd              fA18, fA19 = [rTmpPtr], 16 // D7, D6
6577      nop.f              0
6578      nop.i              0
6579}
6580;;
6581{ .mfi
6582      ldfpd              fA1, fA1L = [rPolDataPtr], 16 // A1
6583      nop.f              0
6584      nop.i              0
6585}
// NOTE(review): the next .mfi bundle names only two instructions; unlike its
// neighbours it has no explicit nop.i for the third slot.
6586{ .mfi
6587      ldfpd              fA16, fA17 = [rTmpPtr], 16 // D4, D5
6588      nop.f              0
6589}
6590;;
6591{ .mfi
6592      ldfpd              fA12, fA13 = [rPolDataPtr], 16 // D0, D1
6593      nop.f              0
6594      nop.i              0
6595}
6596{ .mfi
6597      ldfpd              fA14, fA15 = [rTmpPtr], 16 // D2, D3
6598      nop.f              0
6599      nop.i              0
6600}
6601;;
6602{ .mfi
6603      ldfpd              fA24, fA25 = [rPolDataPtr], 16 // C21, C20
6604      nop.f              0
6605      nop.i              0
6606}
6607{ .mfi
6608      ldfpd              fA22, fA23 = [rTmpPtr], 16 // C19, C18
6609      nop.f              0
6610      nop.i              0
6611}
6612;;
6613{ .mfi
6614      ldfpd              fA2, fA2L = [rPolDataPtr], 16 // A2
6615      fma.s1             fA4L = FR_FracX, FR_FracX, f0 // x^2
6616      nop.i              0
6617}
6618{ .mfi
6619      ldfpd              fA20, fA21 = [rTmpPtr], 16 // C17, C16
6620      nop.f              0
6621      nop.i              0
6622}
6623;;
6624{ .mfi
6625      ldfe               fA11 = [rTmpPtr], 16 // E7
6626      nop.f              0
6627      nop.i              0
6628}
6629{ .mfi
6630      ldfpd              fA0, fA3L = [rPolDataPtr], 16 // A0
6631      nop.f              0
6632      nop.i              0
6633};;
6634{ .mfi
6635      ldfe               fA10 = [rPolDataPtr], 16 // E6
6636      nop.f              0
6637      nop.i              0
6638}
6639{ .mfi
6640      ldfe               fA9 = [rTmpPtr], 16 // E5
6641      nop.f              0
6642      nop.i              0
6643}
6644;;
6645{ .mmf
6646      ldfe               fA8 = [rPolDataPtr], 16 // E4
6647      ldfe               fA7 = [rTmpPtr], 16 // E3
6648      nop.f              0
6649}
6650;;
6651{ .mmf
6652      ldfe               fA6 = [rPolDataPtr], 16 // E2
6653      ldfe               fA5 = [rTmpPtr], 16 // E1
6654      nop.f              0
6655}
6656;;
6657{ .mfi
6658      ldfe               fA4 = [rPolDataPtr], 16 // E0
6659      fma.s1             fA5L = fA4L, fA4L, f0 // x^4
6660      nop.i              0
6661}
6662{ .mfi
6663      nop.m              0
6664      fms.s1             fB2 = FR_FracX, FR_FracX, fA4L // x^2 - <x^2>
6665      nop.i              0
6666}
6667;;
6668{ .mfi
6669      // store signgam if size of variable is 4 bytes
6670(p6)  st4                [rSgnGamAddr] = rSgnGam
6671      fma.s1             fRes4H = fA3, FR_FracX, f0 // (A3*x)hi
6672      nop.i              0
6673}
6674{ .mfi
6675      // store signgam if size of variable is 8 bytes
6676(p7)  st8                [rSgnGamAddr] = rSgnGam
6677      fma.s1             fA19 = fA19, FR_FracX, fA18 // D7*x + D6
6678      nop.i              0
6679}
6680;;
6681{ .mfi
6682      nop.m              0
6683      fma.s1             fResH = fA1, FR_FracX, f0 // (A1*x)hi
6684      nop.i              0
6685}
6686{ .mfi
6687      nop.m              0
6688      fma.s1             fB6 = fA1L, FR_FracX, fA0L // A1L*x + A0L
6689      nop.i              0
6690}
6691;;
6692{ .mfi
6693      nop.m              0
6694      fma.s1             fA17 = fA17, FR_FracX, fA16 // D5*x + D4
6695      nop.i              0
6696}
6697{ .mfi
6698      nop.m              0
6699      fma.s1             fA15 = fA15, FR_FracX, fA14 // D3*x + D2
6700      nop.i              0
6701}
6702;;
6703{ .mfi
6704      nop.m              0
6705      fma.s1             fA25 = fA25, FR_FracX, fA24 // C21*x + C20
6706      nop.i              0
6707}
6708{ .mfi
6709      nop.m              0
6710      fma.s1             fA13 = fA13, FR_FracX, fA12 // D1*x + D0
6711      nop.i              0
6712}
6713;;
6714{ .mfi
6715      nop.m              0
6716      fma.s1             fA23 = fA23, FR_FracX, fA22 // C19*x + C18
6717      nop.i              0
6718}
6719{ .mfi
6720      nop.m              0
6721      fma.s1             fA21 = fA21, FR_FracX, fA20 // C17*x + C16
6722      nop.i              0
6723}
6724;;
6725{ .mfi
6726      nop.m              0
6727      fms.s1             fRes4L = fA3, FR_FracX, fRes4H // delta((A3*x)hi)
6728      nop.i              0
6729}
6730{ .mfi
6731      nop.m              0
6732      fadd.s1            fRes2H = fRes4H, fA2 // (A3*x + A2)hi
6733      nop.i              0
6734}
6735;;
6736{ .mfi
6737      nop.m              0
6738      fms.s1             fResL = fA1, FR_FracX, fResH // d(A1*x)
6739      nop.i              0
6740}
6741{ .mfi
6742      nop.m              0
6743      fadd.s1            fRes1H = fResH, fA0 // (A1*x + A0)hi
6744      nop.i              0
6745}
6746;;
6747{ .mfi
6748      nop.m              0
6749      fma.s1             fA19 = fA19, fA4L, fA17 // Dhi
6750      nop.i              0
6751}
6752{ .mfi
6753      nop.m              0
6754      fma.s1             fA11 = fA11, FR_FracX, fA10 // E7*x + E6
6755      nop.i              0
6756}
6757;;
6758{ .mfi
6759      nop.m              0
6760      // Doing this to raise inexact flag
      // (status field s0; E6 in fA10 was consumed by the fma above and fA10
      // is not read again in this block, so the product itself is unused)
6761      fma.s0             fA10 = fA0, fA0, f0
6762      nop.i              0
6763}
6764;;
6765{ .mfi
6766      nop.m              0
6767      fma.s1             fA15 = fA15, fA4L, fA13 // Dlo
6768      nop.i              0
6769}
6770{ .mfi
6771      nop.m              0
6772      // (C21*x + C20)*x^2 + C19*x + C18
6773      fma.s1             fA25 = fA25, fA4L, fA23
6774      nop.i              0
6775}
6776;;
6777{ .mfi
6778      nop.m              0
6779      fma.s1             fA9 = fA9, FR_FracX, fA8 // E5*x + E4
6780      nop.i              0
6781}
6782{ .mfi
6783      nop.m              0
6784      fma.s1             fA7 = fA7, FR_FracX, fA6 // E3*x + E2
6785      nop.i              0
6786}
6787;;
6788{ .mfi
6789      nop.m              0
6790      fma.s1             fRes4L = fA3L, FR_FracX, fRes4L // (A3*x)lo
6791      nop.i              0
6792}
6793{ .mfi
6794      nop.m              0
      // A2 - (A3*x + A2)hi; (A3*x)hi is added further down to obtain the
      // rounding error of the (A3*x)hi + A2 addition
6795      fsub.s1            fRes2L = fA2, fRes2H
6796      nop.i              0
6797}
6798;;
6799{ .mfi
6800      nop.m              0
6801      fadd.s1            fResL = fResL, fB6 // (A1L*x + A0L) + d(A1*x)
6802      nop.i              0
6803}
6804{ .mfi
6805      nop.m              0
      // A0 - (A1*x + A0)hi; (A1*x)hi is added further down to obtain the
      // rounding error of the (A1*x)hi + A0 addition
6806      fsub.s1            fRes1L = fA0, fRes1H
6807      nop.i              0
6808}
6809;;
6810{ .mfi
6811      nop.m              0
6812      fma.s1             fA5 = fA5, FR_FracX, fA4 // E1*x + E0
6813      nop.i              0
6814}
6815{ .mfi
6816      nop.m              0
6817      fma.s1             fB8 = fA5L, fA5L, f0  // x^8
6818      nop.i              0
6819}
6820;;
6821{ .mfi
6822      nop.m              0
6823      // ((C21*x + C20)*x^2 + C19*x + C18)*x^2 + C17*x + C16
6824      fma.s1             fA25 = fA25, fA4L, fA21
6825      nop.i              0
6826}
6827{ .mfi
6828      nop.m              0
6829      fma.s1             fA19 = fA19, fA5L, fA15 // D
6830      nop.i              0
6831}
6832;;
6833{ .mfi
6834      nop.m              0
6835      fma.s1             fA11 = fA11, fA4L, fA9 // Ehi
6836      nop.i              0
6837}
6838;;
6839{ .mfi
6840      nop.m              0
      // (A2 - (A3*x + A2)hi) + (A3*x)hi
6841      fadd.s1            fRes2L = fRes2L, fRes4H
6842      nop.i              0
6843}
6844{ .mfi
6845      nop.m              0
6846      fadd.s1            fRes4L = fRes4L, fA2L // (A3*x)lo + A2L
6847      nop.i              0
6848}
6849;;
6850{ .mfi
6851      nop.m              0
6852      fma.s1             fRes3H = fRes2H, fA4L, f0 //  ((A3*x + A2)*x^2)hi
6853      nop.i              0
6854}
6855{ .mfi
6856      nop.m              0
      // (A0 - (A1*x + A0)hi) + (A1*x)hi
6857      fadd.s1            fRes1L = fRes1L, fResH
6858      nop.i              0
6859}
6860;;
6861{ .mfi
6862      nop.m              0
6863      fma.s1             fRes3L = fRes2H, fB2, f0 // (A3*x + A2)hi*d(x^2)
6864      nop.i              0
6865}
6866{ .mfi
6867      nop.m              0
6868      fma.s1             fA7 = fA7, fA4L, fA5 // Elo
6869      nop.i              0
6870}
6871;;
6872{ .mfi
6873      nop.m              0
6874      fma.s1             fA25 = fA25, fB8, fA19 // C*x^8 + D
6875      nop.i              0
6876}
6877;;
6878{ .mfi
6879      nop.m              0
6880      fadd.s1            fRes2L = fRes2L, fRes4L // (A3*x + A2)lo
6881      nop.i              0
6882}
6883;;
6884{ .mfi
6885      nop.m              0
6886      fms.s1             fB4 = fRes2H, fA4L, fRes3H // d((A3*x + A2)*x^2))
6887      nop.i              0
6888}
6889{ .mfi
6890      nop.m              0
6891      fadd.s1            fRes1L = fRes1L, fResL // (A1*x + A0)lo
6892      nop.i              0
6893}
6894;;
6895{ .mfi
6896      nop.m              0
6897      fadd.s1            fB20 = fRes3H, fRes1H // Phi
6898      nop.i              0
6899}
6900{ .mfi
6901      nop.m              0
6902      fma.s1             fA11 = fA11, fA5L, fA7 // E
6903      nop.i              0
6904}
6905;;
6906{ .mfi
6907      nop.m              0
6908      //  ( (A3*x + A2)lo*<x^2> + (A3*x + A2)hi*d(x^2))
6909      fma.s1             fRes3L = fRes2L, fA4L, fRes3L
6910      nop.i              0
6911}
6912;;
6913{ .mfi
6914      nop.m              0
6915      // d((A3*x + A2)*x^2)) + (A1*x + A0)lo
6916      fadd.s1            fRes1L = fRes1L, fB4
6917      nop.i              0
6918}
6919;;
6920{ .mfi
6921      nop.m              0
      // (A1*x + A0)hi - Phi; ((A3*x + A2)*x^2)hi is added below to obtain
      // the rounding error of the Phi addition
6922      fsub.s1            fB18 = fRes1H, fB20
6923      nop.i              0
6924}
6925{ .mfi
6926      nop.m              0
      // (C*x^8 + D)*x^8 + E
6927      fma.s1             fPol = fA25, fB8, fA11
6928      nop.i              0
6929}
6930;;
6931{ .mfi
6932      nop.m              0
      // collect the low-order terms of the A3..A0 part
6933      fadd.s1            fRes1L = fRes1L, fRes3L
6934      nop.i              0
6935}
6936;;
6937{ .mfi
6938      nop.m              0
      // ((A1*x + A0)hi - Phi) + ((A3*x + A2)*x^2)hi
6939      fadd.s1            fB18 = fB18, fRes3H
6940      nop.i              0
6941}
6942{ .mfi
6943      nop.m              0
      // high part of the result: fPol*x^4 + Phi
6944      fma.s1             fRes4H = fPol, fA5L, fB20
6945      nop.i              0
6946}
6947;;
6948{ .mfi
6949      nop.m              0
      // fPol*x^4 on its own, used for the low part of the result
6950      fma.s1             fPolL = fPol, fA5L, f0
6951      nop.i              0
6952}
6953;;
6954{ .mfi
6955      nop.m              0
6956      fadd.s1            fB18 = fB18, fRes1L // Plo
6957      nop.i              0
6958}
6959{ .mfi
6960      nop.m              0
      // Phi - (fPol*x^4 + Phi)
6961      fsub.s1            fRes4L = fB20, fRes4H
6962      nop.i              0
6963}
6964;;
6965{ .mfi
6966      nop.m              0
      // Plo + fPol*x^4
6967      fadd.s1            fB18 = fB18, fPolL
6968      nop.i              0
6969}
6970;;
6971{ .mfi
6972      nop.m              0
      // low part of the result
6973      fadd.s1            fRes4L = fRes4L, fB18
6974      nop.i              0
6975}
6976;;
6977{ .mfb
6978      nop.m              0
      // final sum of high and low parts, computed in status field s0
6979      fma.s0             f8 = fRes4H, f1, fRes4L
6980      // P25(x) computed, exit here
6981      br.ret.sptk        b0
6982}
6983;;
6984
6985
// Entry for 0.75 <= x < 1.3125.
// Sets signgam to +1, copies fA5L into FR_FracX, puts the rounded square of
// that value into fB4 (lgamma_polynom24x reads fB4 on entry), loads the
// address of lgammal_03Q_1Q_data and continues in lgamma_polynom24x.
// NOTE(review): fA5L is presumably the polynomial argument prepared by the
// caller -- confirm.
6986// here if 0.75 <= x < 1.3125
6987.align 32
6988lgammal_03Q_1Q:
6989{ .mfi
6990      addl               rPolDataPtr= @ltoff(lgammal_03Q_1Q_data),gp
6991      fma.s1             FR_FracX = fA5L, f1, f0 // x
6992      adds               rSgnGam = 1, r0
6993}
6994{ .mfi
6995      nop.m              0
6996      fma.s1             fB4 = fA5L, fA5L, f0 // x^2
6997      nop.i              0
6998}
6999;;
7000{ .mfi
      // replace the linkage-table slot address with the table address itself
7001      ld8                rPolDataPtr = [rPolDataPtr]
7002      nop.f              0
7003      nop.i              0
7004}
7005;;
7006{ .mfb
      // second load pointer, 144 bytes into the same table
7007      adds               rTmpPtr = 144, rPolDataPtr
7008      nop.f              0
7009      br.sptk            lgamma_polynom24x
7010}
7011;;
7012
// Entry for 1.5625 <= x < 2.25.
// Sets signgam to +1, copies fB4 into FR_FracX and replaces fB4 by its
// square (lgamma_polynom24x reads fB4 on entry), loads the address of
// lgammal_13Q_2Q_data and continues in lgamma_polynom24x.
// The two fma's below sit in the same instruction group (no stop between
// the bundles), so both read the incoming value of fB4.
7013// here if 1.5625 <= x < 2.25
7014.align 32
7015lgammal_13Q_2Q:
7016{ .mfi
7017      addl               rPolDataPtr= @ltoff(lgammal_13Q_2Q_data),gp
7018      fma.s1             FR_FracX = fB4, f1, f0 // x
7019      adds               rSgnGam = 1, r0
7020}
7021{ .mfi
7022      nop.m              0
7023      fma.s1             fB4 = fB4, fB4, f0 // x^2
7024      nop.i              0
7025}
7026;;
7027{ .mfi
      // replace the linkage-table slot address with the table address itself
7028      ld8                rPolDataPtr = [rPolDataPtr]
7029      nop.f              0
7030      nop.i              0
7031}
7032;;
7033{ .mfb
      // second load pointer, 144 bytes into the same table
7034      adds               rTmpPtr = 144, rPolDataPtr
7035      nop.f              0
7036      br.sptk            lgamma_polynom24x
7037}
7038;;
7039
// Common tail for the ranges whose result is Pol24(x), where "x" in the
// comments below is the value in FR_FracX.
//
// On entry:
//   FR_FracX                  polynomial argument
//   fB4                       rounded x^2, written <x^2> below (read here
//                             before it is ever written in this block)
//   rPolDataPtr, rTmpPtr      two pointers into the coefficient table; loads
//                             are issued through both, each post-incremented
//                             by 16
//   rSgnGam                   value to store as signgam
//   rSgnGamAddr, rSgnGamSize  destination and width (4 selects st4,
//                             anything else st8) of the signgam store
// On exit: f8 holds the result; returns through b0.
//
// The terms with coefficients A4..A1 are carried as hi/lo pairs; the C, D and
// E sub-polynomials are evaluated in working precision, combined with x^8
// and finally multiplied by x^5.
7040// here if result is Pol24(x)
7041// x is in FR_FracX,
7042// rPolDataPtr, rTmpPtr point to coefficients
7043.align 32
7044lgamma_polynom24x:
7045{ .mfi
7046      ldfpd              fA4, fA2L = [rPolDataPtr], 16
7047      nop.f              0
      // p6: signgam is 4 bytes wide, p7: otherwise
7048      cmp.eq             p6, p7 = 4, rSgnGamSize
7049}
7050{ .mfi
7051      ldfpd              fA23, fA24 = [rTmpPtr], 16 // C18, C19
7052      nop.f              0
7053      nop.i              0
7054}
7055;;
7056{ .mfi
7057      ldfpd              fA3, fA1L = [rPolDataPtr], 16
7058      fma.s1             fA5L = fB4, fB4, f0 // x^4
7059      nop.i              0
7060}
7061{ .mfi
7062      ldfpd              fA19, fA20 = [rTmpPtr], 16 // D6, D7
7063      fms.s1             fB2 = FR_FracX, FR_FracX, fB4 // x^2 - <x^2>
7064      nop.i              0
7065}
7066;;
7067{ .mmf
7068      ldfpd              fA15, fA16 = [rPolDataPtr], 16 // D2, D3
7069      ldfpd              fA17, fA18 = [rTmpPtr], 16 // D4, D5
7070      nop.f              0
7071}
7072;;
7073{ .mmf
7074      ldfpd              fA13, fA14 = [rPolDataPtr], 16 // D0, D1
7075      ldfpd              fA12, fA21 = [rTmpPtr], 16 // E7, C16
7076      nop.f              0
7077}
7078;;
7079{ .mfi
7080      ldfe               fA11 = [rPolDataPtr], 16 // E6
7081      nop.f              0
7082      nop.i              0
7083}
7084{ .mfi
7085      ldfe               fA10 = [rTmpPtr], 16 // E5
7086      nop.f              0
7087      nop.i              0
7088}
7089;;
7090{ .mfi
7091      ldfpd              fA2, fA4L = [rPolDataPtr], 16
7092      nop.f              0
7093      nop.i              0
7094}
7095{ .mfi
7096      ldfpd              fA1, fA3L = [rTmpPtr], 16
7097      nop.f              0
7098      nop.i              0
7099}
7100;;
7101{ .mfi
7102      ldfpd              fA22, fA25 = [rPolDataPtr], 16 // C17, C20
7103      fma.s1             fA0 = fA5L, fA5L, f0 // x^8
7104      nop.i              0
7105}
7106{ .mfi
7107      nop.m              0
7108      fma.s1             fA0L = fA5L, FR_FracX, f0 // x^5
7109      nop.i              0
7110}
7111;;
7112{ .mmf
7113      ldfe               fA9 = [rPolDataPtr], 16 // E4
7114      ldfe               fA8 = [rTmpPtr], 16 // E3
7115      nop.f              0
7116}
7117;;
7118{ .mmf
7119      ldfe               fA7 = [rPolDataPtr], 16 // E2
7120      ldfe               fA6 = [rTmpPtr], 16 // E1
7121      nop.f              0
7122}
7123;;
7124{ .mfi
7125      ldfe               fA5 = [rTmpPtr], 16 // E0
7126      fma.s1             fRes4H = fA4, fB4, f0 // A4*<x^2>
7127      nop.i              0
7128}
7129{ .mfi
7130      nop.m              0
7131      fma.s1             fPol = fA24, FR_FracX, fA23 // C19*x + C18
7132      nop.i              0
7133}
7134;;
7135{ .mfi
7136      // store signgam if size of variable is 4 bytes
7137(p6)  st4                [rSgnGamAddr] = rSgnGam
7138      fma.s1             fRes1H = fA3, fB4, f0 // A3*<x^2>
7139      nop.i              0
7140}
7141{ .mfi
7142      // store signgam if size of variable is 8 bytes
7143(p7)  st8                [rSgnGamAddr] = rSgnGam
7144      fma.s1             fA1L = fA3, fB2,fA1L //  A3*d(x^2) + A1L
7145      nop.i              0
7146}
7147;;
7148{ .mfi
7149      nop.m              0
7150      fma.s1             fA20 = fA20, FR_FracX, fA19 // D7*x + D6
7151      nop.i              0
7152}
7153{ .mfi
7154      nop.m              0
7155      fma.s1             fA18 = fA18, FR_FracX, fA17 // D5*x + D4
7156      nop.i              0
7157}
7158;;
7159{ .mfi
7160      nop.m              0
7161      fma.s1             fA16 = fA16, FR_FracX, fA15 // D3*x + D2
7162      nop.i              0
7163}
7164{ .mfi
7165      nop.m              0
7166      fma.s1             fA14 = fA14, FR_FracX, fA13 // D1*x + D0
7167      nop.i              0
7168}
7169;;
7170{ .mfi
7171      nop.m              0
7172      fma.s1             fA2L = fA4, fB2,fA2L //  A4*d(x^2) + A2L
7173      nop.i              0
7174}
7175{ .mfi
7176      nop.m              0
7177      fma.s1             fA12 = fA12, FR_FracX, fA11 // E7*x + E6
7178      nop.i              0
7179}
7180;;
7181{ .mfi
7182      nop.m              0
7183      fms.s1             fRes2L = fA4, fB4, fRes4H  // delta(A4*<x^2>)
7184      nop.i              0
7185}
7186{ .mfi
7187      nop.m              0
7188      fadd.s1            fRes2H = fRes4H, fA2       // A4*<x^2> + A2
7189      nop.i              0
7190}
7191;;
7192{ .mfi
7193      nop.m              0
7194      fms.s1             fRes3L = fA3, fB4, fRes1H  // delta(A3*<x^2>)
7195      nop.i              0
7196}
7197{ .mfi
7198      nop.m              0
7199      fadd.s1            fRes3H = fRes1H, fA1       // A3*<x^2> + A1
7200      nop.i              0
7201}
7202;;
7203{ .mfi
7204      nop.m              0
7205      fma.s1             fA20 = fA20, fB4, fA18 // (D7*x + D6)*x^2 + D5*x + D4
7206      nop.i              0
7207}
7208{ .mfi
7209      nop.m              0
7210      fma.s1             fA22 = fA22, FR_FracX, fA21 // C17*x + C16
7211      nop.i              0
7212}
7213;;
7214{ .mfi
7215      nop.m              0
7216      fma.s1             fA16 = fA16, fB4, fA14 // (D3*x + D2)*x^2 + D1*x + D0
7217      nop.i              0
7218}
7219{ .mfi
7220      nop.m              0
7221      fma.s1             fPol = fA25, fB4, fPol // C20*x^2 + C19*x + C18
7222      nop.i              0
7223}
7224;;
7225{ .mfi
7226      nop.m              0
7227      fma.s1             fA2L = fA4L, fB4, fA2L //  A4L*<x^2> + A4*d(x^2) + A2L
7228      nop.i              0
7229}
7230{ .mfi
7231      nop.m              0
7232      fma.s1             fA1L = fA3L, fB4, fA1L //  A3L*<x^2> + A3*d(x^2) + A1L
7233      nop.i              0
7234}
7235;;
7236{ .mfi
7237      nop.m              0
7238      fsub.s1            fRes4L = fA2, fRes2H // d1
7239      nop.i              0
7240}
7241{ .mfi
7242      nop.m              0
7243      fma.s1             fResH = fRes2H, fB4, f0 // (A4*<x^2> + A2)*x^2
7244      nop.i              0
7245}
7246;;
7247{ .mfi
7248      nop.m              0
7249      fsub.s1            fRes1L = fA1, fRes3H // d1
7250      nop.i              0
7251}
7252{ .mfi
7253      nop.m              0
7254      fma.s1             fB6 = fRes3H, FR_FracX, f0 // (A3*<x^2> + A1)*x
7255      nop.i              0
7256}
7257;;
7258{ .mfi
7259      nop.m              0
7260      fma.s1             fA10 = fA10, FR_FracX, fA9  // E5*x + E4
7261      nop.i              0
7262}
7263{ .mfi
7264      nop.m              0
7265      fma.s1             fA8 = fA8, FR_FracX, fA7 // E3*x + E2
7266      nop.i              0
7267}
7268;;
7269{ .mfi
7270      nop.m              0
7271      // (C20*x^2 + C19*x + C18)*x^2 + C17*x + C16
7272      fma.s1             fPol = fPol, fB4, fA22
7273      nop.i              0
7274}
7275{ .mfi
7276      nop.m              0
7277      fma.s1             fA6 = fA6, FR_FracX, fA5 // E1*x + E0
7278      nop.i              0
7279}
7280;;
7281{ .mfi
7282      nop.m              0
7283      // A4L*<x^2> + A4*d(x^2) + A2L + delta(A4*<x^2>)
7284      fadd.s1            fRes2L = fA2L, fRes2L
7285      nop.i              0
7286}
7287{ .mfi
7288      nop.m              0
7289      // A3L*<x^2> + A3*d(x^2) + A1L + delta(A3*<x^2>)
7290      fadd.s1            fRes3L = fA1L, fRes3L
7291      nop.i              0
7292}
7293;;
7294{ .mfi
7295      nop.m              0
7296      fadd.s1            fRes4L = fRes4L, fRes4H // d2
7297      nop.i              0
7298}
7299{ .mfi
7300      nop.m              0
7301      fms.s1             fResL = fRes2H, fB4, fResH  // d((A4*<x^2> + A2)*x^2)
7302      nop.i              0
7303}
7304;;
7305{ .mfi
7306      nop.m              0
7307      fadd.s1            fRes1L = fRes1L, fRes1H // d2
7308      nop.i              0
7309}
7310{ .mfi
7311      nop.m              0
7312      fms.s1             fB8 = fRes3H, FR_FracX, fB6 // d((A3*<x^2> + A1)*x)
7313      nop.i              0
7314}
7315;;
7316{ .mfi
7317      nop.m              0
7318      fadd.s1            fB10 = fResH, fB6 // (A4*x^4 + .. + A1*x)hi
7319      nop.i              0
7320}
7321{ .mfi
7322      nop.m              0
7323      fma.s1             fA12 = fA12, fB4, fA10 // Ehi
7324      nop.i              0
7325}
7326;;
7327{ .mfi
7328      nop.m              0
7329      // ((D7*x + D6)*x^2 + D5*x + D4)*x^4 + (D3*x + D2)*x^2 + D1*x + D0
7330      fma.s1             fA20 = fA20, fA5L, fA16
7331      nop.i              0
7332}
7333{ .mfi
7334      nop.m              0
7335      fma.s1             fA8 = fA8, fB4, fA6 // Elo
7336      nop.i              0
7337}
7338;;
7339{ .mfi
7340      nop.m              0
7341      fadd.s1            fRes2L = fRes2L, fRes4L // (A4*<x^2> + A2)lo
7342      nop.i              0
7343}
7344{ .mfi
7345      nop.m              0
7346      // d((A4*<x^2> + A2)*x^2) + (A4*<x^2> + A2)*d(x^2)
7347      fma.s1             fResL = fRes2H, fB2, fResL
7348      nop.i              0
7349}
7350;;
7351{ .mfi
7352      nop.m              0
7353      fadd.s1            fRes3L = fRes3L, fRes1L   // (A3*<x^2> + A1)lo
7354      nop.i              0
7355}
7356;;
7357{ .mfi
7358      nop.m              0
      // fB6 - (fResH + fB6)hi; fResH is added below to obtain the rounding
      // error of the fB10 addition
7359      fsub.s1            fB12 = fB6, fB10
7360      nop.i              0
7361}
7362;;
7363{ .mfi
7364      nop.m              0
7365      fma.s1             fPol = fPol, fA0, fA20 // PolC*x^8 + PolD
7366      nop.i              0
7367}
7368{ .mfi
7369      nop.m              0
7370      fma.s1             fPolL = fA12, fA5L, fA8 // E
7371      nop.i              0
7372}
7373;;
7374{ .mfi
7375      nop.m              0
7376      fma.s1             fResL = fB4, fRes2L, fResL // ((A4*<x^2> + A2)*x^2)lo
7377      nop.i              0
7378}
7379;;
7380{ .mfi
7381      nop.m              0
7382      fma.s1             fRes3L = fRes3L, FR_FracX, fB8 // ((A3*<x^2> + A1)*x)lo
7383      nop.i              0
7384}
7385;;
7386{ .mfi
7387      nop.m              0
      // (fB6 - fB10) + fResH
7388      fadd.s1            fB12 = fB12, fResH
7389      nop.i              0
7390}
7391;;
7392{ .mfi
7393      nop.m              0
      // (PolC*x^8 + PolD)*x^8 + E
7394      fma.s1             fPol = fPol, fA0, fPolL
7395      nop.i              0
7396}
7397;;
7398{ .mfi
7399      nop.m              0
      // sum of the two low parts computed above
7400      fadd.s1            fRes3L = fRes3L, fResL
7401      nop.i              0
7402}
7403;;
7404{ .mfi
7405      nop.m              0
      // high part of the result: fPol*x^5 + fB10
7406      fma.s1             fRes2H = fPol, fA0L, fB10
7407      nop.i              0
7408}
7409;;
7410{ .mfi
7411      nop.m              0
      // add the rounding error of the fB10 addition to the low parts
7412      fadd.s1            fRes3L = fB12, fRes3L
7413      nop.i              0
7414}
7415;;
7416{ .mfi
7417      nop.m              0
      // fB10 - (fPol*x^5 + fB10)
7418      fsub.s1            fRes4L = fB10, fRes2H
7419      nop.i              0
7420}
7421;;
7422{ .mfi
7423      nop.m              0
      // fPol*x^5 + (fB10 - fRes2H)
7424      fma.s1             fRes4L = fPol, fA0L, fRes4L
7425      nop.i              0
7426}
7427;;
7428{ .mfi
7429      nop.m              0
      // low part of the result
7430      fadd.s1            fRes4L = fRes4L, fRes3L
7431      nop.i              0
7432}
7433;;
7434{ .mfb
7435      nop.m              0
7436      // final result for all paths for which the result is Pol24(x)
7437      fma.s0             f8 = fRes2H, f1, fRes4L
7438      // here is the exit for all paths for which the result is Pol24(x)
7439      br.ret.sptk        b0
7440}
7441;;
7442
7443
// Dispatcher for special arguments.  Classifies f8 with three fclass tests
// and branches to lgammal_denormal_input, lgammal_nan_pinf or
// lgammal_singularity; when none of the tests matches, falls through to the
// -inf case, which stores signgam = 1 and returns f8*f8.
7444//  here if x is natval, nan, +/-inf, +/-0, or denormal
7445.align 32
7446lgammal_spec:
7447{ .mfi
7448      nop.m              0
7449      fclass.m           p9, p0 =  f8, 0xB // +/-denormals
7450      nop.i              0
7451};;
7452{ .mfi
7453      nop.m              0
7454      fclass.m           p6, p0 =  f8, 0x1E1 // Test x for natval, nan, +inf
7455      nop.i              0
7456};;
7457{ .mfb
7458      nop.m              0
7459      fclass.m           p7, p0 =  f8, 0x7 // +/-0
      // denormal input: normalize it and rejoin the main path
7460(p9)  br.cond.sptk       lgammal_denormal_input
7461};;
7462{ .mfb
7463      nop.m              0
7464      nop.f              0
7465      // branch out if x is natval, nan, +inf
7466(p6)  br.cond.spnt       lgammal_nan_pinf
7467};;
7468{ .mfb
7469      nop.m              0
7470      nop.f              0
      // branch out if x is +/-0
7471(p7)  br.cond.spnt       lgammal_singularity
7472};;
7473// if we are still here then x = -inf
7474{ .mfi
      // p6: signgam is 4 bytes wide, p7: otherwise
7475      cmp.eq             p6, p7 = 4, rSgnGamSize
7476      nop.f              0
7477      adds               rSgnGam = 1, r0
7478};;
7479{ .mfi
7480      // store signgam if size of variable is 4 bytes
7481(p6)  st4                [rSgnGamAddr] = rSgnGam
7482      nop.f              0
7483      nop.i              0
7484}
7485{ .mfb
7486      // store signgam if size of variable is 8 bytes
7487(p7)  st8                [rSgnGamAddr] = rSgnGam
7488      fma.s0             f8 = f8,f8,f0 // return +inf, no call to error support
7489      br.ret.spnt        b0
7490};;
7491
// Stores signgam = 1 (width chosen by rSgnGamSize) and returns x + x in f8.
7492// here if x is NaN, NatVal or +INF
7493.align 32
7494lgammal_nan_pinf:
7495{ .mfi
      // p6: signgam is 4 bytes wide, p7: otherwise
7496      cmp.eq             p6, p7 = 4, rSgnGamSize
7497      nop.f              0
7498      adds               rSgnGam = 1, r0
7499}
7500;;
7501{ .mfi
7502      // store signgam if size of variable is 4 bytes
7503(p6)  st4                [rSgnGamAddr] = rSgnGam
7504      fma.s0             f8 = f8,f1,f8 // return x+x if x is natval, nan, +inf
7505      nop.i              0
7506}
7507{ .mfb
7508      // store signgam if size of variable is 8 bytes
7509(p7)  st8                [rSgnGamAddr] = rSgnGam
7510      nop.f              0
7511      br.ret.sptk        b0
7512}
7513;;
7514
// Normalizes f8 in place, then recomputes the values derived from the
// argument (rSignifX, fSignifX, rSignExpX, fXint, p14/p15, rExpX, fAbsX)
// from the normalized value before branching to _deno_back_to_main_path.
7515// here if x denormal or unnormal
7516.align 32
7517lgammal_denormal_input:
7518{ .mfi
7519      nop.m              0
7520      fma.s0             fResH = f1, f1, f8 // raise denormal exception
7521      nop.i              0
7522}
7523{ .mfi
7524      nop.m              0
7525      fnorm.s1           f8 = f8 // normalize input value
7526      nop.i              0
7527}
7528;;
7529{ .mfi
7530      getf.sig           rSignifX = f8
      // sign and exponent taken from f1, significand from f8
7531      fmerge.se          fSignifX =  f1, f8
7532      nop.i              0
7533}
7534{ .mfi
7535      getf.exp           rSignExpX = f8
7536      fcvt.fx.s1         fXint = f8 // Convert arg to int (int repres. in FR)
7537      nop.i              0
7538}
7539;;
7540{ .mfi
      // NOTE(review): same getf.exp as in the previous instruction group;
      // f8 is not modified in between, so this repeat looks redundant
7541      getf.exp           rSignExpX = f8
      // p15: x < 0, p14: otherwise
7542      fcmp.lt.s1         p15, p14 = f8, f0
7543      nop.i              0
7544}
7545;;
7546{ .mfb
      // presumably strips the sign bit, leaving the exponent field --
      // confirm against the definition of r17Ones
7547      and                rExpX = rSignExpX, r17Ones
7548      fmerge.s           fAbsX = f1, f8 // |x|
7549      br.cond.sptk       _deno_back_to_main_path
7550}
7551;;
7552
7553
// Stores signgam = 1, copies the argument into FR_X, sets the error tag to
// 102 and produces the overflowed result by squaring a value whose exponent
// field is 0x1FFFE, then continues in __libm_error_region.
7554// here if overflow (x > overflow_bound)
7555.align 32
7556lgammal_overflow:
7557{ .mfi
      // exponent field for the huge value squared below
7558      addl               r8 = 0x1FFFE, r0
7559      nop.f              0
      // p6: signgam is 4 bytes wide, p7: otherwise
7560      cmp.eq             p6, p7 = 4, rSgnGamSize
7561}
7562{ .mfi
7563      adds               rSgnGam = 1, r0
7564      nop.f              0
7565      nop.i              0
7566}
7567;;
7568{ .mfi
7569      setf.exp           f9 = r8
      // copy of the argument; __libm_error_region stores FR_X as parameter 1
7570      fmerge.s           FR_X = f8,f8
7571      mov                GR_Parameter_TAG = 102 // overflow
7572};;
7573{ .mfi
7574      // store signgam if size of variable is 4 bytes
7575(p6)  st4                [rSgnGamAddr] = rSgnGam
7576      nop.f              0
7577      nop.i              0
7578}
7579{ .mfb
7580      // store signgam if size of variable is 8 bytes
7581(p7)  st8                [rSgnGamAddr] = rSgnGam
7582      fma.s0             FR_RESULT = f9,f9,f0 // Set I,O and +INF result
7583      br.cond.sptk       __libm_error_region
7584};;
7585
// Stores signgam (+1, or -1 when x is -0), copies the argument into FR_X,
// sets the error tag to 103, forms the result with frcpa of f1 by f0 and
// continues in __libm_error_region.
7586// here if x is negative integer or +/-0 (SINGULARITY)
7587.align 32
7588lgammal_singularity:
7589{ .mfi
7590      adds               rSgnGam = 1, r0
7591      fclass.m           p8,p0 = f8,0x6 // is x -0?
7592      mov                GR_Parameter_TAG = 103 // negative
7593}
7594{ .mfi
      // p6: signgam is 4 bytes wide, p7: otherwise
7595      cmp.eq             p6, p7 = 4, rSgnGamSize
      // FR_X = 0*0 + x; __libm_error_region stores FR_X as parameter 1
7596      fma.s1             FR_X = f0,f0,f8
7597      nop.i              0
7598};;
7599{ .mfi
      // x is -0: signgam = 0 - 1 = -1
7600(p8)  sub                rSgnGam = r0, rSgnGam
7601      nop.f              0
7602      nop.i              0
7603}
7604{ .mfi
7605      nop.m              0
7606      nop.f              0
7607      nop.i              0
7608};;
7609{ .mfi
7610      // store signgam if size of variable is 4 bytes
7611(p6)  st4                [rSgnGamAddr] = rSgnGam
7612      nop.f              0
7613      nop.i              0
7614}
7615{ .mfb
7616      // store signgam if size of variable is 8 bytes
7617(p7)  st8                [rSgnGamAddr] = rSgnGam
      // reciprocal approximation of 1/0 in status field s0
      // NOTE(review): presumably chosen to deliver +inf and raise the
      // divide-by-zero flag -- confirm against the frcpa definition
7618      frcpa.s0           FR_RESULT, p0 = f1, f0
7619      br.cond.sptk       __libm_error_region
7620};;
7621
7622GLOBAL_LIBM_END(__libm_lgammal)
7623
7624
7625
// Common error exit.  Builds a 64-byte stack frame, spills FR_Y, FR_X and
// FR_RESULT into it, calls __libm_error_support with GR_Parameter_X,
// GR_Parameter_Y and GR_Parameter_RESULT pointing at those slots, then
// reloads the (possibly updated) result into f8 and returns to the original
// caller.
//
// Frame layout relative to the new sp:
//   sp+16  FR_X       (parameter 1)
//   sp+32  FR_Y       (parameter 2)
//   sp+48  FR_RESULT  (parameter 3, read back into f8 after the call)
//
// ar.pfs, gp and b0 are saved in GR_SAVE_PFS, GR_SAVE_GP and GR_SAVE_B0
// across the call and restored before returning.
// NOTE(review): GR_Parameter_TAG is set by the branching code and is
// presumably read by __libm_error_support; it is not touched here.
7626LOCAL_LIBM_ENTRY(__libm_error_region)
7627.prologue
7628{ .mfi
7629        add   GR_Parameter_Y=-32,sp             // Parameter 2 value
7630        nop.f 0
7631.save   ar.pfs,GR_SAVE_PFS
7632        mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
7633}
7634{ .mfi
7635.fframe 64
7636        add sp=-64,sp                           // Create new stack
7637        nop.f 0
7638        mov GR_SAVE_GP=gp                       // Save gp
7639};;
7640{ .mmi
7641        stfe [GR_Parameter_Y] = FR_Y,16         // Save Parameter 2 on stack
7642        add GR_Parameter_X = 16,sp              // Parameter 1 address
7643.save   b0, GR_SAVE_B0
7644        mov GR_SAVE_B0=b0                       // Save b0
7645};;
7646.body
7647{ .mib
7648        stfe [GR_Parameter_X] = FR_X            // Store Parameter 1 on stack
7649        add   GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
7650        nop.b 0
7651}
7652{ .mib
7653        stfe [GR_Parameter_Y] = FR_RESULT      // Store Parameter 3 on stack
        // step GR_Parameter_Y back to the slot holding parameter 2
7654        add   GR_Parameter_Y = -16,GR_Parameter_Y
7655        br.call.sptk b0=__libm_error_support#  // Call error handling function
7656};;
7657{ .mmi
        // recompute the address of the result slot after the call
7658        add   GR_Parameter_RESULT = 48,sp
7659        nop.m 999
7660        nop.i 999
7661};;
7662{ .mmi
7663        ldfe  f8 = [GR_Parameter_RESULT]       // Get return result off stack
7664.restore sp
7665        add   sp = 64,sp                       // Restore stack pointer
7666        mov   b0 = GR_SAVE_B0                  // Restore return address
7667};;
7668{ .mib
7669        mov   gp = GR_SAVE_GP                  // Restore gp
7670        mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
7671        br.ret.sptk     b0                     // Return
7672};;
7673
7674LOCAL_LIBM_END(__libm_error_region#)
7675
7676.type   __libm_error_support#,@function
7677.global __libm_error_support#
7678