1.file "tgamma.s"
2
3
4// Copyright (c) 2001 - 2005, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
25// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
28// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code,and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39//*********************************************************************
40//
41// History:
42// 10/12/01  Initial version
43// 05/20/02  Cleaned up namespace and sf0 syntax
44// 02/10/03  Reordered header: .section, .global, .proc, .align
45// 04/04/03  Changed error codes for overflow and negative integers
46// 04/10/03  Changed code for overflow near zero handling
47// 03/31/05  Reformatted delimiters between data tables
48//
49//*********************************************************************
50//
51//*********************************************************************
52//
53// Function: tgamma(x) computes the principle value of the GAMMA
54// function of x.
55//
56//*********************************************************************
57//
58// Resources Used:
59//
60//    Floating-Point Registers: f8-f15
61//                              f33-f87
62//
63//    General Purpose Registers:
64//      r8-r11
65//      r14-r28
66//      r32-r36
67//      r37-r40 (Used to pass arguments to error handling routine)
68//
69//    Predicate Registers:      p6-p15
70//
71//*********************************************************************
72//
73// IEEE Special Conditions:
74//
75//    tgamma(+inf) = +inf
76//    tgamma(-inf) = QNaN
77//    tgamma(+/-0) = +/-inf
78//    tgamma(x<0, x - integer) = QNaN
79//    tgamma(SNaN) = QNaN
80//    tgamma(QNaN) = QNaN
81//
82//*********************************************************************
83//
84// Overview
85//
86// The method consists of three cases.
87//
88// If       2 <= x < OVERFLOW_BOUNDARY      use case tgamma_regular;
89// else if    0 < x < 2                     use case tgamma_from_0_to_2;
90// else    if  -(i+1) <  x < -i, i = 0...184 use case tgamma_negatives;
91//
92// Case 2 <= x < OVERFLOW_BOUNDARY
93// -------------------------------
94//   Here we use algorithm based on the recursive formula
95//   GAMMA(x+1) = x*GAMMA(x). For that we subdivide interval
96//   [2; OVERFLOW_BOUNDARY] into intervals [16*n; 16*(n+1)] and
97//   approximate GAMMA(x) by polynomial of 22th degree on each
98//   [16*n; 16*n+1], recursive formula is used to expand GAMMA(x)
99//   to [16*n; 16*n+1]. In other words we need to find n, i and r
100//   such that x = 16 * n + i + r where n and i are integer numbers
101//   and r is fractional part of x. So GAMMA(x) = GAMMA(16*n+i+r) =
102//   = (x-1)*(x-2)*...*(x-i)*GAMMA(x-i) =
103//   = (x-1)*(x-2)*...*(x-i)*GAMMA(16*n+r) ~
104//   ~ (x-1)*(x-2)*...*(x-i)*P22n(r).
105//
106//   Step 1: Reduction
107//   -----------------
108//    N = [x] with truncate
109//    r = x - N, note 0 <= r < 1
110//
111//    n = N & ~0xF - index of table that contains coefficient of
112//                   polynomial approximation
113//    i = N & 0xF  - is used in recursive formula
114//
115//
116//   Step 2: Approximation
117//   ---------------------
118//    We use factorized minimax approximation polynomials
119//    P22n(r) = A22*(r^2+C01(n)*R+C00(n))*
120//              *(r^2+C11(n)*R+C10(n))*...*(r^2+CA1(n)*R+CA0(n))
121//
122//   Step 3: Recursion
123//   -----------------
124//    In case when i > 0 we need to multiply P22n(r) by product
125//    R(i)=(x-1)*(x-2)*...*(x-i). To reduce number of fp-instructions
126//    we can calculate R as follow:
127//    R(i) = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-1))*(x-i)) if i is
128//    even or R = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-2))*(x-(i-1)))*
129//    *(i-1) if i is odd. In both cases we need to calculate
130//    R2(i) = (x^2-3*x+2)*(x^2-7*x+12)*...*(x^2+x+2*j*(2*j-1)) =
131//    = (x^2-3*x+2)*(x^2-7*x+12)*...*((x^2+x)+2*j*(2*(j-1)+(1-2*x))) =
132//    = (RA+2*(2-RB))*(RA+4*(4-RB))*...*(RA+2*j*(2*(j-1)+RB))
133//    where j = 1..[i/2], RA = x^2+x, RB = 1-2*x.
134//
135//   Step 4: Reconstruction
136//   ----------------------
137//    Reconstruction is just simple multiplication i.e.
138//    GAMMA(x) = P22n(r)*R(i)
139//
140// Case 0 < x < 2
141// --------------
142//    To calculate GAMMA(x) on this interval we do following
143//        if 1 <= x < 1.25   than  GAMMA(x) = P15(x-1)
144//        if 1.25 <= x < 1.5 than  GAMMA(x) = P15(x-x_min) where
145//        x_min is point of local minimum on [1; 2] interval.
146//        if 1.5  <= x < 2.0 than  GAMMA(x) = P15(x-1.5)
147//    and
148//        if 0 < x < 1 than GAMMA(x) = GAMMA(x+1)/x
149//
150// Case -(i+1) <  x < -i, i = 0...184
151// ----------------------------------
152//    Here we use the fact that GAMMA(-x) = PI/(x*GAMMA(x)*sin(PI*x)) and
153//    so we need to calculate GAMMA(x), sin(PI*x)/PI. Calculation of
154//    GAMMA(x) is described above.
155//
156//   Step 1: Reduction
157//   -----------------
158//    Note that period of sin(PI*x) is 2 and range reduction for
159//    sin(PI*x) is like to range reduction for GAMMA(x)
160//    i.e r = x - [x] with exception of cases
161//    when r > 0.5 (in such cases r = 1 - (x - [x])).
162//
163//   Step 2: Approximation
164//   ---------------------
165//    To approximate sin(PI*x)/PI = sin(PI*(2*n+r))/PI =
166//    = (-1)^n*sin(PI*r)/PI Taylor series is used.
167//    sin(PI*r)/PI ~ S21(r).
168//
169//   Step 3: Division
170//   ----------------
171//    To calculate 1/(x*GAMMA(x)*S21(r)) we use frcpa instruction
172//    with following Newton-Raphson interations.
173//
174//
175//*********************************************************************
176
177GR_Sig                  = r8
178GR_TAG                  = r8
179GR_ad_Data              = r9
180GR_SigRqLin             = r10
181GR_iSig                 = r11
182GR_ExpOf1               = r11
183GR_ExpOf8               = r11
184
185
186GR_Sig2                 = r14
187GR_Addr_Mask1           = r15
188GR_Sign_Exp             = r16
189GR_Tbl_Offs             = r17
190GR_Addr_Mask2           = r18
191GR_ad_Co                = r19
192GR_Bit2                 = r19
193GR_ad_Ce                = r20
194GR_ad_Co7               = r21
195GR_NzOvfBound           = r21
196GR_ad_Ce7               = r22
197GR_Tbl_Ind              = r23
198GR_Tbl_16xInd           = r24
199GR_ExpOf025             = r24
200GR_ExpOf05              = r25
201GR_0x30033              = r26
202GR_10                   = r26
203GR_12                   = r27
204GR_185                  = r27
205GR_14                   = r28
206GR_2                    = r28
207GR_fpsr                 = r28
208
209GR_SAVE_B0              = r33
210GR_SAVE_PFS             = r34
211GR_SAVE_GP              = r35
212GR_SAVE_SP              = r36
213
214GR_Parameter_X          = r37
215GR_Parameter_Y          = r38
216GR_Parameter_RESULT     = r39
217GR_Parameter_TAG        = r40
218
219
220
221FR_X                    = f10
222FR_Y                    = f1 // tgamma is single argument function
223FR_RESULT               = f8
224
225FR_AbsX                 = f9
226FR_NormX                = f9
227FR_r02                  = f11
228FR_AbsXp1               = f12
229FR_X2pX                 = f13
230FR_1m2X                 = f14
231FR_Rq1                  = f14
232FR_Xt                   = f15
233
234FR_r                    = f33
235FR_OvfBound             = f34
236FR_Xmin                 = f35
237FR_2                    = f36
238FR_Rcp1                 = f36
239FR_Rcp3                 = f36
240FR_4                    = f37
241FR_5                    = f38
242FR_6                    = f39
243FR_8                    = f40
244FR_10                   = f41
245FR_12                   = f42
246FR_14                   = f43
247FR_GAMMA                = f43
248FR_05                   = f44
249
250FR_Rq2                  = f45
251FR_Rq3                  = f46
252FR_Rq4                  = f47
253FR_Rq5                  = f48
254FR_Rq6                  = f49
255FR_Rq7                  = f50
256FR_RqLin                = f51
257
258FR_InvAn                = f52
259
260FR_C01                  = f53
261FR_A15                  = f53
262FR_C11                  = f54
263FR_A14                  = f54
264FR_C21                  = f55
265FR_A13                  = f55
266FR_C31                  = f56
267FR_A12                  = f56
268FR_C41                  = f57
269FR_A11                  = f57
270FR_C51                  = f58
271FR_A10                  = f58
272FR_C61                  = f59
273FR_A9                   = f59
274FR_C71                  = f60
275FR_A8                   = f60
276FR_C81                  = f61
277FR_A7                   = f61
278FR_C91                  = f62
279FR_A6                   = f62
280FR_CA1                  = f63
281FR_A5                   = f63
282FR_C00                  = f64
283FR_A4                   = f64
284FR_rs2                  = f64
285FR_C10                  = f65
286FR_A3                   = f65
287FR_rs3                  = f65
288FR_C20                  = f66
289FR_A2                   = f66
290FR_rs4                  = f66
291FR_C30                  = f67
292FR_A1                   = f67
293FR_rs7                  = f67
294FR_C40                  = f68
295FR_A0                   = f68
296FR_rs8                  = f68
297FR_C50                  = f69
298FR_r2                   = f69
299FR_C60                  = f70
300FR_r3                   = f70
301FR_C70                  = f71
302FR_r4                   = f71
303FR_C80                  = f72
304FR_r7                   = f72
305FR_C90                  = f73
306FR_r8                   = f73
307FR_CA0                  = f74
308FR_An                   = f75
309
310FR_S21                  = f76
311FR_S19                  = f77
312FR_Rcp0                 = f77
313FR_Rcp2                 = f77
314FR_S17                  = f78
315FR_S15                  = f79
316FR_S13                  = f80
317FR_S11                  = f81
318FR_S9                   = f82
319FR_S7                   = f83
320FR_S5                   = f84
321FR_S3                   = f85
322
323FR_iXt                  = f86
324FR_rs                   = f87
325
326
327// Data tables
328//==============================================================
329RODATA
330.align 16
331
332LOCAL_OBJECT_START(tgamma_data)
333data8 0x406573FAE561F648 // overflow boundary (171.624376956302739927196)
334data8 0x3FDD8B618D5AF8FE // point of local minium (0.461632144968362356785)
335//
336//[2; 3]
337data8 0xEF0E85C9AE40ABE2,0x00004000 // C01
338data8 0xCA2049DDB4096DD8,0x00004000 // C11
339data8 0x99A203B4DC2D1A8C,0x00004000 // C21
340data8 0xBF5D9D9C0C295570,0x00003FFF // C31
341data8 0xE8DD037DEB833BAB,0x00003FFD // C41
342data8 0xB6AE39A2A36AA03A,0x0000BFFE // C51
343data8 0x804960DC2850277B,0x0000C000 // C61
344data8 0xD9F3973841C09F80,0x0000C000 // C71
345data8 0x9C198A676F8A2239,0x0000C001 // C81
346data8 0xC98B7DAE02BE3226,0x0000C001 // C91
347data8 0xE9CAF31AC69301BA,0x0000C001 // CA1
348data8 0xFBBDD58608A0D172,0x00004000 // C00
349data8 0xFDD0316D1E078301,0x00004000 // C10
350data8 0x8630B760468C15E4,0x00004001 // C20
351data8 0x93EDE20E47D9152E,0x00004001 // C30
352data8 0xA86F3A38C77D6B19,0x00004001 // C40
353//[16; 17]
354data8 0xF87F757F365EE813,0x00004000 // C01
355data8 0xECA84FBA92759DA4,0x00004000 // C11
356data8 0xD4E0A55E07A8E913,0x00004000 // C21
357data8 0xB0EB45E94C8A5F7B,0x00004000 // C31
358data8 0x8050D6B4F7C8617D,0x00004000 // C41
359data8 0x8471B111AA691E5A,0x00003FFF // C51
360data8 0xADAF462AF96585C9,0x0000BFFC // C61
361data8 0xD327C7A587A8C32B,0x0000BFFF // C71
362data8 0xDEF5192B4CF5E0F1,0x0000C000 // C81
363data8 0xBADD64BB205AEF02,0x0000C001 // C91
364data8 0x9330A24AA67D6860,0x0000C002 // CA1
365data8 0xF57EEAF36D8C47BE,0x00004000 // C00
366data8 0x807092E12A251B38,0x00004001 // C10
367data8 0x8C458F80DEE7ED1C,0x00004001 // C20
368data8 0x9F30C731DC77F1A6,0x00004001 // C30
369data8 0xBAC4E7E099C3A373,0x00004001 // C40
370//[32; 33]
371data8 0xC3059A415F142DEF,0x00004000 // C01
372data8 0xB9C1DAC24664587A,0x00004000 // C11
373data8 0xA7101D910992FFB2,0x00004000 // C21
374data8 0x8A9522B8E4AA0AB4,0x00004000 // C31
375data8 0xC76A271E4BA95DCC,0x00003FFF // C41
376data8 0xC5D6DE2A38DB7FF2,0x00003FFE // C51
377data8 0xDBA42086997818B2,0x0000BFFC // C61
378data8 0xB8EDDB1424C1C996,0x0000BFFF // C71
379data8 0xBF7372FB45524B5D,0x0000C000 // C81
380data8 0xA03DDE759131580A,0x0000C001 // C91
381data8 0xFDA6FC4022C1FFE3,0x0000C001 // CA1
382data8 0x9759ABF797B2533D,0x00004000 // C00
383data8 0x9FA160C6CF18CEC5,0x00004000 // C10
384data8 0xB0EFF1E3530E0FCD,0x00004000 // C20
385data8 0xCCD60D5C470165D1,0x00004000 // C30
386data8 0xF5E53F6307B0B1C1,0x00004000 // C40
387//[48; 49]
388data8 0xAABE577FBCE37F5E,0x00004000 // C01
389data8 0xA274CAEEB5DF7172,0x00004000 // C11
390data8 0x91B90B6646C1B924,0x00004000 // C21
391data8 0xF06718519CA256D9,0x00003FFF // C31
392data8 0xAA9EE181C0E30263,0x00003FFF // C41
393data8 0xA07BDB5325CB28D2,0x00003FFE // C51
394data8 0x86C8B873204F9219,0x0000BFFD // C61
395data8 0xB0192C5D3E4787D6,0x0000BFFF // C71
396data8 0xB1E0A6263D4C19EF,0x0000C000 // C81
397data8 0x93BA32A118EAC9AE,0x0000C001 // C91
398data8 0xE942A39CD9BEE887,0x0000C001 // CA1
399data8 0xE838B0957B0D3D0D,0x00003FFF // C00
400data8 0xF60E0F00074FCF34,0x00003FFF // C10
401data8 0x89869936AE00C2A5,0x00004000 // C20
402data8 0xA0FE4E8AA611207F,0x00004000 // C30
403data8 0xC3B1229CFF1DDAFE,0x00004000 // C40
404//[64; 65]
405data8 0x9C00DDF75CDC6183,0x00004000 // C01
406data8 0x9446AE9C0F6A833E,0x00004000 // C11
407data8 0x84ABC5083310B774,0x00004000 // C21
408data8 0xD9BA3A0977B1ED83,0x00003FFF // C31
409data8 0x989B18C99411D300,0x00003FFF // C41
410data8 0x886E66402318CE6F,0x00003FFE // C51
411data8 0x99028C2468F18F38,0x0000BFFD // C61
412data8 0xAB72D17DCD40CCE1,0x0000BFFF // C71
413data8 0xA9D9AC9BE42C2EF9,0x0000C000 // C81
414data8 0x8C11D983AA177AD2,0x0000C001 // C91
415data8 0xDC779E981C1F0F06,0x0000C001 // CA1
416data8 0xC1FD4AC85965E8D6,0x00003FFF // C00
417data8 0xCE3D2D909D389EC2,0x00003FFF // C10
418data8 0xE7F79980AD06F5D8,0x00003FFF // C20
419data8 0x88DD9F73C8680B5D,0x00004000 // C30
420data8 0xA7D6CB2CB2D46F9D,0x00004000 // C40
421//[80; 81]
422data8 0x91C7FF4E993430D0,0x00004000 // C01
423data8 0x8A6E7AB83E45A7E9,0x00004000 // C11
424data8 0xF72D6382E427BEA9,0x00003FFF // C21
425data8 0xC9E2E4F9B3B23ED6,0x00003FFF // C31
426data8 0x8BEFEF56AE05D775,0x00003FFF // C41
427data8 0xEE9666AB6A185560,0x00003FFD // C51
428data8 0xA6AFAF5CEFAEE04D,0x0000BFFD // C61
429data8 0xA877EAFEF1F9C880,0x0000BFFF // C71
430data8 0xA45BD433048ECA15,0x0000C000 // C81
431data8 0x86BD1636B774CC2E,0x0000C001 // C91
432data8 0xD3721BE006E10823,0x0000C001 // CA1
433data8 0xA97EFABA91854208,0x00003FFF // C00
434data8 0xB4AF0AEBB3F97737,0x00003FFF // C10
435data8 0xCC38241936851B0B,0x00003FFF // C20
436data8 0xF282A6261006EA84,0x00003FFF // C30
437data8 0x95B8E9DB1BD45BAF,0x00004000 // C40
438//[96; 97]
439data8 0x8A1FA3171B35A106,0x00004000 // C01
440data8 0x830D5B8843890F21,0x00004000 // C11
441data8 0xE98B0F1616677A23,0x00003FFF // C21
442data8 0xBDF8347F5F67D4EC,0x00003FFF // C31
443data8 0x825F15DE34EC055D,0x00003FFF // C41
444data8 0xD4846186B8AAC7BE,0x00003FFD // C51
445data8 0xB161093AB14919B1,0x0000BFFD // C61
446data8 0xA65758EEA4800EF4,0x0000BFFF // C71
447data8 0xA046B67536FA329C,0x0000C000 // C81
448data8 0x82BBEC1BCB9E9068,0x0000C001 // C91
449data8 0xCC9DE2B23BA91B0B,0x0000C001 // CA1
450data8 0x983B16148AF77F94,0x00003FFF // C00
451data8 0xA2A4D8EE90FEE5DD,0x00003FFF // C10
452data8 0xB89446FA37FF481C,0x00003FFF // C20
453data8 0xDC5572648485FB01,0x00003FFF // C30
454data8 0x88CD5D7DB976129A,0x00004000 // C40
455//[112; 113]
456data8 0x8417098FD62AC5E3,0x00004000 // C01
457data8 0xFA7896486B779CBB,0x00003FFF // C11
458data8 0xDEC98B14AF5EEBD1,0x00003FFF // C21
459data8 0xB48E153C6BF0B5A3,0x00003FFF // C31
460data8 0xF597B038BC957582,0x00003FFE // C41
461data8 0xBFC6F0884A415694,0x00003FFD // C51
462data8 0xBA075A1392BDB5E5,0x0000BFFD // C61
463data8 0xA4B79E01B44C7DB4,0x0000BFFF // C71
464data8 0x9D12FA7711BFAB0F,0x0000C000 // C81
465data8 0xFF24C47C8E108AB4,0x0000C000 // C91
466data8 0xC7325EC86562606A,0x0000C001 // CA1
467data8 0x8B47DCD9E1610938,0x00003FFF // C00
468data8 0x9518B111B70F88B8,0x00003FFF // C10
469data8 0xA9CC197206F68682,0x00003FFF // C20
470data8 0xCB98294CC0D7A6A6,0x00003FFF // C30
471data8 0xFE09493EA9165181,0x00003FFF // C40
472//[128; 129]
473data8 0xFE53D03442270D90,0x00003FFF // C01
474data8 0xF0F857BAEC1993E4,0x00003FFF // C11
475data8 0xD5FF6D70DBBC2FD3,0x00003FFF // C21
476data8 0xACDAA5F4988B1074,0x00003FFF // C31
477data8 0xE92E069F8AD75B54,0x00003FFE // C41
478data8 0xAEBB64645BD94234,0x00003FFD // C51
479data8 0xC13746249F39B43C,0x0000BFFD // C61
480data8 0xA36B74F5B6297A1F,0x0000BFFF // C71
481data8 0x9A77860DF180F6E5,0x0000C000 // C81
482data8 0xF9F8457D84410A0C,0x0000C000 // C91
483data8 0xC2BF44C649EB8597,0x0000C001 // CA1
484data8 0x81225E7489BCDC0E,0x00003FFF // C00
485data8 0x8A788A09CE0EED11,0x00003FFF // C10
486data8 0x9E2E6F86D1B1D89C,0x00003FFF // C20
487data8 0xBE6866B21CF6CCB5,0x00003FFF // C30
488data8 0xEE94426EC1486AAE,0x00003FFF // C40
489//[144; 145]
490data8 0xF6113E09732A6497,0x00003FFF // C01
491data8 0xE900D45931B04FC8,0x00003FFF // C11
492data8 0xCE9FD58F745EBA5D,0x00003FFF // C21
493data8 0xA663A9636C864C86,0x00003FFF // C31
494data8 0xDEBF5315896CE629,0x00003FFE // C41
495data8 0xA05FEA415EBD7737,0x00003FFD // C51
496data8 0xC750F112BD9C4031,0x0000BFFD // C61
497data8 0xA2593A35C51C6F6C,0x0000BFFF // C71
498data8 0x9848E1DA7FB40C8C,0x0000C000 // C81
499data8 0xF59FEE87A5759A4B,0x0000C000 // C91
500data8 0xBF00203909E45A1D,0x0000C001 // CA1
501data8 0xF1D8E157200127E5,0x00003FFE // C00
502data8 0x81DD5397CB08D487,0x00003FFF // C10
503data8 0x94C1DC271A8B766F,0x00003FFF // C20
504data8 0xB3AFAF9B5D6EDDCF,0x00003FFF // C30
505data8 0xE1FB4C57CA81BE1E,0x00003FFF // C40
506//[160; 161]
507data8 0xEEFFE5122AC72FFD,0x00003FFF // C01
508data8 0xE22F70BB52AD54B3,0x00003FFF // C11
509data8 0xC84FF021FE993EEA,0x00003FFF // C21
510data8 0xA0DA2208EB5B2752,0x00003FFF // C31
511data8 0xD5CDD2FCF8AD2DF5,0x00003FFE // C41
512data8 0x940BEC6DCD811A59,0x00003FFD // C51
513data8 0xCC954EF4FD4EBB81,0x0000BFFD // C61
514data8 0xA1712E29A8C04554,0x0000BFFF // C71
515data8 0x966B55DFB243521A,0x0000C000 // C81
516data8 0xF1E6A2B9CEDD0C4C,0x0000C000 // C91
517data8 0xBBC87BCC031012DB,0x0000C001 // CA1
518data8 0xE43974E6D2818583,0x00003FFE // C00
519data8 0xF5702A516B64C5B7,0x00003FFE // C10
520data8 0x8CEBCB1B32E19471,0x00003FFF // C20
521data8 0xAAC10F05BB77E0AF,0x00003FFF // C30
522data8 0xD776EFCAB205CC58,0x00003FFF // C40
523//[176; 177]
524data8 0xE8DA614119811E5D,0x00003FFF // C01
525data8 0xDC415E0288B223D8,0x00003FFF // C11
526data8 0xC2D2243E44EC970E,0x00003FFF // C21
527data8 0x9C086664B5307BEA,0x00003FFF // C31
528data8 0xCE03D7A08B461156,0x00003FFE // C41
529data8 0x894BE3BAAAB66ADC,0x00003FFD // C51
530data8 0xD131EDD71A702D4D,0x0000BFFD // C61
531data8 0xA0A907CDDBE10898,0x0000BFFF // C71
532data8 0x94CC3CD9C765C808,0x0000C000 // C81
533data8 0xEEA85F237815FC0D,0x0000C000 // C91
534data8 0xB8FA04B023E43F91,0x0000C001 // CA1
535data8 0xD8B2C7D9FCBD7EF9,0x00003FFE // C00
536data8 0xE9566E93AAE7E38F,0x00003FFE // C10
537data8 0x8646E78AABEF0255,0x00003FFF // C20
538data8 0xA32AEDB62E304345,0x00003FFF // C30
539data8 0xCE83E40280EE7DF0,0x00003FFF // C40
540//
541//[2; 3]
542data8 0xC44FB47E90584083,0x00004001 // C50
543data8 0xE863EE77E1C45981,0x00004001 // C60
544data8 0x8AC15BE238B9D70E,0x00004002 // C70
545data8 0xA5D94B6592350EF4,0x00004002 // C80
546data8 0xC379DB3E20A148B3,0x00004002 // C90
547data8 0xDACA49B73974F6C9,0x00004002 // CA0
548data8 0x810E496A1AFEC895,0x00003FE1 // An
549//[16; 17]
550data8 0xE17C0357AAF3F817,0x00004001 // C50
551data8 0x8BA8804750FBFBFE,0x00004002 // C60
552data8 0xB18EAB3CB64BEBEE,0x00004002 // C70
553data8 0xE90AB7015AF1C28F,0x00004002 // C80
554data8 0xA0AB97CE9E259196,0x00004003 // C90
555data8 0xF5E0E0A000C2D720,0x00004003 // CA0
556data8 0xD97F0F87EC791954,0x00004005 // An
557//[32; 33]
558data8 0x980C293F3696040D,0x00004001 // C50
559data8 0xC0DBFFBB948A9A4E,0x00004001 // C60
560data8 0xFAB54625E9A588A2,0x00004001 // C70
561data8 0xA7E08176D6050FBF,0x00004002 // C80
562data8 0xEBAAEC4952270A9F,0x00004002 // C90
563data8 0xB7479CDAD20550FE,0x00004003 // CA0
564data8 0xAACD45931C3FF634,0x00004054 // An
565//[48; 49]
566data8 0xF5180F0000419AD5,0x00004000 // C50
567data8 0x9D507D07BFBB2273,0x00004001 // C60
568data8 0xCEB53F7A13A383E3,0x00004001 // C70
569data8 0x8BAFEF9E0A49128F,0x00004002 // C80
570data8 0xC58EF912D39E228C,0x00004002 // C90
571data8 0x9A88118422BA208E,0x00004003 // CA0
572data8 0xBD6C0E2477EC12CB,0x000040AC // An
573//[64; 65]
574data8 0xD410AC48BF7748DA,0x00004000 // C50
575data8 0x89399B90AFEBD931,0x00004001 // C60
576data8 0xB596DF8F77EB8560,0x00004001 // C70
577data8 0xF6D9445A047FB4A6,0x00004001 // C80
578data8 0xAF52F0DD65221357,0x00004002 // C90
579data8 0x8989B45BFC881989,0x00004003 // CA0
580data8 0xB7FCAE86E6E10D5A,0x0000410B // An
581//[80; 81]
582data8 0xBE759740E3B5AA84,0x00004000 // C50
583data8 0xF8037B1B07D27609,0x00004000 // C60
584data8 0xA4F6F6C7F0977D4F,0x00004001 // C70
585data8 0xE131960233BF02C4,0x00004001 // C80
586data8 0xA06DF43D3922BBE2,0x00004002 // C90
587data8 0xFC266AB27255A360,0x00004002 // CA0
588data8 0xD9F4B012EDAFEF2F,0x0000416F // An
589//[96; 97]
590data8 0xAEFC84CDA8E1EAA6,0x00004000 // C50
591data8 0xE5009110DB5F3C8A,0x00004000 // C60
592data8 0x98F5F48738E7B232,0x00004001 // C70
593data8 0xD17EE64E21FFDC6B,0x00004001 // C80
594data8 0x9596F7A7E36145CC,0x00004002 // C90
595data8 0xEB64DBE50E125CAF,0x00004002 // CA0
596data8 0xA090530D79E32D2E,0x000041D8 // An
597//[112; 113]
598data8 0xA33AEA22A16B2655,0x00004000 // C50
599data8 0xD682B93BD7D7945C,0x00004000 // C60
600data8 0x8FC854C6E6E30CC3,0x00004001 // C70
601data8 0xC5754D828AFFDC7A,0x00004001 // C80
602data8 0x8D41216B397139C2,0x00004002 // C90
603data8 0xDE78D746848116E5,0x00004002 // CA0
604data8 0xB8A297A2DC0630DB,0x00004244 // An
605//[128; 129]
606data8 0x99EB00F11D95E292,0x00004000 // C50
607data8 0xCB005CB911EB779A,0x00004000 // C60
608data8 0x8879AA2FDFF3A37A,0x00004001 // C70
609data8 0xBBDA538AD40CAC2C,0x00004001 // C80
610data8 0x8696D849D311B9DE,0x00004002 // C90
611data8 0xD41E1C041481199F,0x00004002 // CA0
612data8 0xEBA1A43D34EE61EE,0x000042B3 // An
613//[144; 145]
614data8 0x924F822578AA9F3D,0x00004000 // C50
615data8 0xC193FAF9D3B36960,0x00004000 // C60
616data8 0x827AE3A6B68ED0CA,0x00004001 // C70
617data8 0xB3F52A27EED23F0B,0x00004001 // C80
618data8 0x811A079FB3C94D79,0x00004002 // C90
619data8 0xCB94415470B6F8D2,0x00004002 // CA0
620data8 0x80A0260DCB3EC9AC,0x00004326 // An
621//[160; 161]
622data8 0x8BF24091E88B331D,0x00004000 // C50
623data8 0xB9ADE01187E65201,0x00004000 // C60
624data8 0xFAE4508F6E7625FE,0x00004000 // C70
625data8 0xAD516668AD6D7367,0x00004001 // C80
626data8 0xF8F5FF171154F637,0x00004001 // C90
627data8 0xC461321268990C82,0x00004002 // CA0
628data8 0xC3B693F344B0E6FE,0x0000439A // An
629//
630//[176; 177]
631data8 0x868545EB42A258ED,0x00004000 // C50
632data8 0xB2EF04ACE8BA0E6E,0x00004000 // C60
633data8 0xF247D22C22E69230,0x00004000 // C70
634data8 0xA7A1AB93E3981A90,0x00004001 // C80
635data8 0xF10951733E2C697F,0x00004001 // C90
636data8 0xBE3359BFAD128322,0x00004002 // CA0
637data8 0x8000000000000000,0x00003fff
638//
639//[160; 161] for negatives
640data8 0xA76DBD55B2E32D71,0x00003C63 // 1/An
641//
642// sin(pi*x)/pi
643data8 0xBCBC4342112F52A2,0x00003FDE // S21
644data8 0xFAFCECB86536F655,0x0000BFE3 // S19
645data8 0x87E4C97F9CF09B92,0x00003FE9 // S17
646data8 0xEA124C68E704C5CB,0x0000BFED // S15
647data8 0x9BA38CFD59C8AA1D,0x00003FF2 // S13
648data8 0x99C0B552303D5B21,0x0000BFF6 // S11
649//
650//[176; 177] for negatives
651data8 0xBA5D5869211696FF,0x00003BEC // 1/An
652//
653// sin(pi*x)/pi
654data8 0xD63402E79A853175,0x00003FF9 // S9
655data8 0xC354723906DB36BA,0x0000BFFC // S7
656data8 0xCFCE5A015E236291,0x00003FFE // S5
657data8 0xD28D3312983E9918,0x0000BFFF // S3
658//
659//
660// [1.0;1.25]
661data8 0xA405530B067ECD3C,0x0000BFFC // A15
662data8 0xF5B5413F95E1C282,0x00003FFD // A14
663data8 0xC4DED71C782F76C8,0x0000BFFE // A13
664data8 0xECF7DDDFD27C9223,0x00003FFE // A12
665data8 0xFB73D31793068463,0x0000BFFE // A11
666data8 0xFF173B7E66FD1D61,0x00003FFE // A10
667data8 0xFFA5EF3959089E94,0x0000BFFE // A9
668data8 0xFF8153BD42E71A4F,0x00003FFE // A8
669data8 0xFEF9CAEE2CB5B533,0x0000BFFE // A7
670data8 0xFE3F02E5EDB6811E,0x00003FFE // A6
671data8 0xFB64074CED2658FB,0x0000BFFE // A5
672data8 0xFB52882A095B18A4,0x00003FFE // A4
673data8 0xE8508C7990A0DAC0,0x0000BFFE // A3
674data8 0xFD32C611D8A881D0,0x00003FFE // A2
675data8 0x93C467E37DB0C536,0x0000BFFE // A1
676data8 0x8000000000000000,0x00003FFF // A0
677//
678// [1.25;1.5]
679data8 0xD038092400619677,0x0000BFF7 // A15
680data8 0xEA6DE925E6EB8C8F,0x00003FF3 // A14
681data8 0xC53F83645D4597FC,0x0000BFF7 // A13
682data8 0xE366DB2FB27B7ECD,0x00003FF7 // A12
683data8 0xAC8FD5E11F6EEAD8,0x0000BFF8 // A11
684data8 0xFB14010FB3697785,0x00003FF8 // A10
685data8 0xB6F91CB5C371177B,0x0000BFF9 // A9
686data8 0x85A262C6F8FEEF71,0x00003FFA // A8
687data8 0xC038E6E3261568F9,0x0000BFFA // A7
688data8 0x8F4BDE8883232364,0x00003FFB // A6
689data8 0xBCFBBD5786537E9A,0x0000BFFB // A5
690data8 0xA4C08BAF0A559479,0x00003FFC // A4
691data8 0x85D74FA063E81476,0x0000BFFC // A3
692data8 0xDB629FB9BBDC1C4E,0x00003FFD // A2
693data8 0xF4F8FBC7C0C9D317,0x00003FC6 // A1
694data8 0xE2B6E4153A57746C,0x00003FFE // A0
695//
696// [1.25;1.5]
697data8 0x9533F9D3723B448C,0x0000BFF2 // A15
698data8 0xF1F75D3C561CBBAF,0x00003FF5 // A14
699data8 0xBA55A9A1FC883523,0x0000BFF8 // A13
700data8 0xB5D5E9E5104FA995,0x00003FFA // A12
701data8 0xFD84F35B70CD9AE2,0x0000BFFB // A11
702data8 0x87445235F4688CC5,0x00003FFD // A10
703data8 0xE7F236EBFB9F774E,0x0000BFFD // A9
704data8 0xA6605F2721F787CE,0x00003FFE // A8
705data8 0xCF579312AD7EAD72,0x0000BFFE // A7
706data8 0xE96254A2407A5EAC,0x00003FFE // A6
707data8 0xF41312A8572ED346,0x0000BFFE // A5
708data8 0xF9535027C1B1F795,0x00003FFE // A4
709data8 0xE7E82D0C613A8DE4,0x0000BFFE // A3
710data8 0xFD23CD9741B460B8,0x00003FFE // A2
711data8 0x93C30FD9781DBA88,0x0000BFFE // A1
712data8 0xFFFFF1781FDBEE84,0x00003FFE // A0
713LOCAL_OBJECT_END(tgamma_data)
714
715
716//==============================================================
717// Code
718//==============================================================
719
720.section .text
721GLOBAL_LIBM_ENTRY(tgamma)
722{ .mfi
723      getf.exp      GR_Sign_Exp = f8
724      fma.s1        FR_1m2X = f8,f1,f8 // 2x
725      addl          GR_ad_Data = @ltoff(tgamma_data), gp
726}
727{ .mfi
728      mov           GR_ExpOf8 = 0x10002 // 8
729      fcvt.fx.trunc.s1 FR_iXt = f8 // [x]
730      mov           GR_ExpOf05 = 0xFFFE // 0.5
731};;
732{ .mfi
733      getf.sig      GR_Sig = f8
734      fma.s1        FR_2 = f1,f1,f1 // 2
735      mov           GR_Addr_Mask1 = 0x780
736}
737{ .mlx
738      setf.exp      FR_8 = GR_ExpOf8
739      movl          GR_10 = 0x4024000000000000
740};;
741{ .mfi
742      ld8           GR_ad_Data = [GR_ad_Data]
743      fcmp.lt.s1    p14,p15 = f8,f0
744      tbit.z        p12,p13 = GR_Sign_Exp,0x10 // p13 if x >= 2
745}
746{ .mlx
747      and           GR_Bit2 = 4,GR_Sign_Exp
748      movl          GR_12 = 0x4028000000000000
749};;
750{ .mfi
751      setf.d        FR_10 = GR_10
752      fma.s1        FR_r02 = f8,f1,f0
753      extr.u        GR_Tbl_Offs = GR_Sig,58,6
754}
755{ .mfi
756(p12) mov           GR_Addr_Mask1 = r0
757      fma.s1        FR_NormX = f8,f1,f0
758      cmp.ne        p8,p0 = GR_Bit2,r0
759};;
760{ .mfi
761(p8)  shladd        GR_Tbl_Offs = GR_Tbl_Offs,4,r0
762      fclass.m      p10,p0 =  f8,0x1E7 // Test x for NaTVal, NaN, +/-0, +/-INF
763      tbit.nz       p11,p0 = GR_Sign_Exp,1
764}
765{ .mlx
766      add           GR_Addr_Mask2 = GR_Addr_Mask1,GR_Addr_Mask1
767      movl          GR_14 = 0x402C000000000000
768};;
769.pred.rel "mutex",p14,p15
770{ .mfi
771      setf.d        FR_12 = GR_12
772(p14) fma.s1        FR_1m2X = f1,f1,FR_1m2X // RB=1-2|x|
773      tbit.nz       p8,p9 = GR_Sign_Exp,0
774}
775{ .mfi
776      ldfpd         FR_OvfBound,FR_Xmin = [GR_ad_Data],16
777(p15) fms.s1        FR_1m2X = f1,f1,FR_1m2X // RB=1-2|x|
778(p11) shladd        GR_Tbl_Offs = GR_Tbl_Offs,2,r0
779};;
780.pred.rel "mutex",p9,p8
781{ .mfi
782      setf.d        FR_14 = GR_14
783      fma.s1        FR_4 = FR_2,FR_2,f0
784(p8)  and           GR_Tbl_Offs = GR_Tbl_Offs, GR_Addr_Mask1
785}
786{ .mfi
787      setf.exp      FR_05 = GR_ExpOf05
788      fma.s1        FR_6 = FR_2,FR_2,FR_2
789(p9)  and           GR_Tbl_Offs = GR_Tbl_Offs, GR_Addr_Mask2
790};;
791.pred.rel "mutex",p9,p8
792{ .mfi
793(p8)  shladd        GR_ad_Co = GR_Tbl_Offs,1,GR_ad_Data
794      fcvt.xf       FR_Xt = FR_iXt // [x]
795(p15) tbit.z.unc    p11,p0 = GR_Sign_Exp,0x10 // p11 if 0 < x < 2
796}
797{ .mfi
798(p9)  add           GR_ad_Co = GR_ad_Data,GR_Tbl_Offs
799      fma.s1        FR_5 = FR_2,FR_2,f1
800(p15) cmp.lt.unc    p7,p6 = GR_ExpOf05,GR_Sign_Exp // p7 if 0 < x < 1
801};;
802{ .mfi
803      add           GR_ad_Ce = 16,GR_ad_Co
804(p11) frcpa.s1      FR_Rcp0,p0 = f1,f8
805      sub           GR_Tbl_Offs = GR_ad_Co,GR_ad_Data
806}
807{ .mfb
808      ldfe          FR_C01 = [GR_ad_Co],32
809(p7)  fms.s1        FR_r02 = FR_r02,f1,f1
810      // jump if x is NaTVal, NaN, +/-0, +/-INF
811(p10) br.cond.spnt  tgamma_spec
812};;
813.pred.rel "mutex",p14,p15
814{ .mfi
815      ldfe          FR_C11 = [GR_ad_Ce],32
816(p14) fms.s1        FR_X2pX = f8,f8,f8 // RA=x^2+|x|
817      shr           GR_Tbl_Ind = GR_Tbl_Offs,8
818}
819{ .mfb
820      ldfe          FR_C21 = [GR_ad_Co],32
821(p15) fma.s1        FR_X2pX = f8,f8,f8 // RA=x^2+x
822      // jump if 0 < x < 2
823(p11) br.cond.spnt  tgamma_from_0_to_2
824};;
825{ .mfi
826      ldfe          FR_C31 = [GR_ad_Ce],32
827      fma.s1        FR_Rq2 = FR_2,f1,FR_1m2X  // 2 + B
828      cmp.ltu       p7,p0=0xB,GR_Tbl_Ind
829}
830{ .mfb
831      ldfe          FR_C41 = [GR_ad_Co],32
832      fma.s1        FR_Rq3 = FR_2,FR_2,FR_1m2X  // 4 + B
833      // jump if GR_Tbl_Ind > 11, i.e |x| is more than 192
834(p7)  br.cond.spnt  tgamma_spec_res
835};;
836{ .mfi
837      ldfe          FR_C51 = [GR_ad_Ce],32
838      fma.s1        FR_Rq4 = FR_6,f1,FR_1m2X  // 6 + B
839      shr           GR_Tbl_Offs = GR_Tbl_Offs,1
840}
841{ .mfi
842      ldfe          FR_C61 = [GR_ad_Co],32
843      fma.s1        FR_Rq5 = FR_4,FR_2,FR_1m2X // 8 + B
844      nop.i         0
845};;
846{ .mfi
847      ldfe          FR_C71 = [GR_ad_Ce],32
848(p14) fms.s1        FR_r = FR_Xt,f1,f8 // r = |x| - [|x|]
849      shr           GR_Tbl_16xInd = GR_Tbl_Offs,3
850}
851{ .mfi
852      ldfe          FR_C81 = [GR_ad_Co],32
853(p15) fms.s1        FR_r = f8,f1,FR_Xt // r = x - [x]
854      add           GR_ad_Data = 0xC00,GR_ad_Data
855};;
856{ .mfi
857      ldfe          FR_C91 = [GR_ad_Ce],32
858      fma.s1        FR_Rq6 = FR_5,FR_2,FR_1m2X  // 10 + B
859(p14) mov           GR_0x30033 = 0x30033
860}
861{ .mfi
862      ldfe          FR_CA1 = [GR_ad_Co],32
863      fma.s1        FR_Rq7 = FR_6,FR_2,FR_1m2X // 12 + B
864      sub           GR_Tbl_Offs = GR_Tbl_Offs,GR_Tbl_16xInd
865};;
866{ .mfi
867      ldfe          FR_C00 = [GR_ad_Ce],32
868      fma.s1        FR_Rq1 = FR_Rq1,FR_2,FR_X2pX // (x-1)*(x-2)
869(p13) cmp.eq.unc    p8,p0 = r0,GR_Tbl_16xInd // index is 0 i.e. arg from [2;16)
870}
871{ .mfi
872      ldfe          FR_C10 = [GR_ad_Co],32
873(p14) fms.s1        FR_AbsX = f0,f0,FR_NormX // absolute value of argument
874      add           GR_ad_Co7 = GR_ad_Data,GR_Tbl_Offs
875};;
876{ .mfi
877      ldfe          FR_C20 = [GR_ad_Ce],32
878      fma.s1        FR_Rq2 = FR_Rq2,FR_4,FR_X2pX // (x-3)*(x-4)
879      add           GR_ad_Ce7 = 16,GR_ad_Co7
880}
881{ .mfi
882      ldfe          FR_C30 = [GR_ad_Co],32
883      fma.s1        FR_Rq3 = FR_Rq3,FR_6,FR_X2pX // (x-5)*(x-6)
884      nop.i         0
885};;
886{ .mfi
887      ldfe          FR_C40 = [GR_ad_Ce],32
888      fma.s1        FR_Rq4 = FR_Rq4,FR_8,FR_X2pX // (x-7)*(x-8)
889(p14) cmp.leu.unc   p7,p0 = GR_0x30033,GR_Sign_Exp
890}
891{ .mfb
892      ldfe          FR_C50 = [GR_ad_Co7],32
893      fma.s1        FR_Rq5 = FR_Rq5,FR_10,FR_X2pX // (x-9)*(x-10)
894      // jump if x is less or equal to -2^52, i.e. x is big negative integer
895(p7)  br.cond.spnt  tgamma_singularity
896};;
897{ .mfi
898      ldfe          FR_C60 = [GR_ad_Ce7],32
899      fma.s1        FR_C01 = FR_C01,f1,FR_r
900      add           GR_ad_Ce = 0x560,GR_ad_Data
901}
902{ .mfi
903      ldfe          FR_C70 = [GR_ad_Co7],32
904      fma.s1        FR_rs = f0,f0,FR_r // reduced arg for sin(pi*x)
905      add           GR_ad_Co = 0x550,GR_ad_Data
906};;
907{ .mfi
908      ldfe          FR_C80 = [GR_ad_Ce7],32
909      fma.s1        FR_C11 = FR_C11,f1,FR_r
910      nop.i         0
911}
912{ .mfi
913      ldfe          FR_C90 = [GR_ad_Co7],32
914      fma.s1        FR_C21 = FR_C21,f1,FR_r
915      nop.i         0
916};;
917.pred.rel "mutex",p12,p13
918{ .mfi
919(p13) getf.sig      GR_iSig = FR_iXt
920      fcmp.lt.s1    p11,p0 = FR_05,FR_r
921      mov           GR_185 = 185
922}
923{ .mfi
924      nop.m         0
925      fma.s1        FR_Rq6 = FR_Rq6,FR_12,FR_X2pX // (x-11)*(x-12)
926      nop.i         0
927};;
928{ .mfi
929      ldfe          FR_CA0 = [GR_ad_Ce7],32
930      fma.s1        FR_C31 = FR_C31,f1,FR_r
931(p12) mov           GR_iSig = 0
932}
933{ .mfi
934      ldfe          FR_An = [GR_ad_Co7],0x80
935      fma.s1        FR_C41 = FR_C41,f1,FR_r
936      nop.i         0
937};;
938{ .mfi
939(p14) getf.sig      GR_Sig = FR_r
940      fma.s1        FR_C51 = FR_C51,f1,FR_r
941(p14) sub           GR_iSig = r0,GR_iSig
942}
943{ .mfi
944      ldfe          FR_S21 = [GR_ad_Co],32
945      fma.s1        FR_C61 = FR_C61,f1,FR_r
946      nop.i         0
947};;
948{ .mfi
949      ldfe          FR_S19 = [GR_ad_Ce],32
950      fma.s1        FR_C71 = FR_C71,f1,FR_r
951      and           GR_SigRqLin = 0xF,GR_iSig
952}
953{ .mfi
954      ldfe          FR_S17 = [GR_ad_Co],32
955      fma.s1        FR_C81 = FR_C81,f1,FR_r
956      mov           GR_2 = 2
957};;
958{ .mfi
959(p14) ldfe          FR_InvAn = [GR_ad_Co7]
960      fma.s1        FR_C91 = FR_C91,f1,FR_r
961      // if significand of r is 0 tnan argument is negative integer
962(p14) cmp.eq.unc    p12,p0 = r0,GR_Sig
963}
964{ .mfb
965(p8)  sub           GR_SigRqLin = GR_SigRqLin,GR_2 // subtract 2 if 2 <= x < 16
966      fma.s1        FR_CA1 = FR_CA1,f1,FR_r
967      // jump if x is negative integer such that -2^52 < x < -185
968(p12) br.cond.spnt  tgamma_singularity
969};;
970{ .mfi
971      setf.sig      FR_Xt = GR_SigRqLin
972(p11) fms.s1        FR_rs = f1,f1,FR_r
973(p14) cmp.ltu.unc   p7,p0 = GR_185,GR_iSig
974}
975{ .mfb
976      ldfe          FR_S15 = [GR_ad_Ce],32
977      fma.s1        FR_Rq7 = FR_Rq7,FR_14,FR_X2pX // (x-13)*(x-14)
978      // jump if x is noninteger such that -2^52 < x < -185
979(p7)  br.cond.spnt  tgamma_underflow
980};;
981{ .mfi
982      ldfe          FR_S13 = [GR_ad_Co],48
983      fma.s1        FR_C01 = FR_C01,FR_r,FR_C00
984      and           GR_Sig2 = 0xE,GR_SigRqLin
985}
986{ .mfi
987      ldfe          FR_S11 = [GR_ad_Ce],48
988      fma.s1        FR_C11 = FR_C11,FR_r,FR_C10
989      nop.i         0
990};;
991{ .mfi
992      ldfe          FR_S9 = [GR_ad_Co],32
993      fma.s1        FR_C21 = FR_C21,FR_r,FR_C20
994      // should we mul by polynomial of recursion?
995      cmp.eq        p13,p12 = r0,GR_SigRqLin
996}
997{ .mfi
998      ldfe          FR_S7 = [GR_ad_Ce],32
999      fma.s1        FR_C31 = FR_C31,FR_r,FR_C30
1000      nop.i         0
1001};;
1002{ .mfi
1003      ldfe          FR_S5 = [GR_ad_Co],32
1004      fma.s1        FR_C41 = FR_C41,FR_r,FR_C40
1005      nop.i         0
1006}
1007{ .mfi
1008      ldfe          FR_S3 = [GR_ad_Ce],32
1009      fma.s1        FR_C51 = FR_C51,FR_r,FR_C50
1010      nop.i         0
1011};;
1012{ .mfi
1013      nop.m         0
1014      fma.s1        FR_C61 = FR_C61,FR_r,FR_C60
1015      nop.i         0
1016}
1017{ .mfi
1018      nop.m         0
1019      fma.s1        FR_C71 = FR_C71,FR_r,FR_C70
1020      nop.i         0
1021};;
1022{ .mfi
1023      nop.m         0
1024      fma.s1        FR_C81 = FR_C81,FR_r,FR_C80
1025      nop.i         0
1026}
1027{ .mfi
1028      nop.m         0
1029      fma.s1        FR_C91 = FR_C91,FR_r,FR_C90
1030      nop.i         0
1031};;
1032{ .mfi
1033      nop.m         0
1034      fma.s1        FR_CA1 = FR_CA1,FR_r,FR_CA0
1035      nop.i         0
1036}
1037{ .mfi
1038      nop.m         0
1039      fma.s1        FR_C01 = FR_C01,FR_C11,f0
1040      nop.i         0
1041};;
1042{ .mfi
1043      nop.m         0
1044      fma.s1        FR_C21 = FR_C21,FR_C31,f0
1045      nop.i         0
1046}
1047{ .mfi
1048      nop.m         0
1049      fma.s1        FR_rs2 = FR_rs,FR_rs,f0
1050(p12) cmp.lt.unc    p7,p0 = 2,GR_Sig2 // should mul by FR_Rq2?
1051};;
1052{ .mfi
1053      nop.m         0
1054      fma.s1        FR_C41 = FR_C41,FR_C51,f0
1055      nop.i         0
1056}
1057{ .mfi
1058      nop.m         0
1059(p7)  fma.s1        FR_Rq1 = FR_Rq1,FR_Rq2,f0
1060(p12) cmp.lt.unc    p9,p0 = 6,GR_Sig2 // should mul by FR_Rq4?
1061};;
1062{ .mfi
1063      nop.m         0
1064      fma.s1        FR_C61 = FR_C61,FR_C71,f0
1065(p15) cmp.eq        p11,p0 = r0,r0
1066}
1067{ .mfi
1068      nop.m         0
1069(p9)  fma.s1        FR_Rq3 = FR_Rq3,FR_Rq4,f0
1070(p12) cmp.lt.unc    p8,p0 = 10,GR_Sig2 // should mul by FR_Rq6?
1071};;
1072{ .mfi
1073      nop.m         0
1074      fma.s1        FR_C81 = FR_C81,FR_C91,f0
1075      nop.i         0
1076}
1077{ .mfi
1078      nop.m         0
1079(p8)  fma.s1        FR_Rq5 = FR_Rq5,FR_Rq6,f0
1080(p14) cmp.ltu       p0,p11 = 0x9,GR_Tbl_Ind
1081};;
1082{ .mfi
1083      nop.m         0
1084      fcvt.xf       FR_RqLin = FR_Xt
1085      nop.i         0
1086}
1087{ .mfi
1088      nop.m         0
1089(p11) fma.s1        FR_CA1 = FR_CA1,FR_An,f0
1090      nop.i         0
1091};;
1092{ .mfi
1093      nop.m         0
1094      fma.s1        FR_S21 = FR_S21,FR_rs2,FR_S19
1095      nop.i         0
1096}
1097{ .mfi
1098      nop.m         0
1099      fma.s1        FR_S17 = FR_S17,FR_rs2,FR_S15
1100      nop.i         0
1101};;
1102{ .mfi
1103      nop.m         0
1104      fma.s1        FR_C01 = FR_C01,FR_C21,f0
1105      nop.i         0
1106}
1107{ .mfi
1108      nop.m         0
1109      fma.s1        FR_rs4 = FR_rs2,FR_rs2,f0
1110(p12) cmp.lt.unc    p8,p0 = 4,GR_Sig2 // should mul by FR_Rq3?
1111};;
1112{ .mfi
1113      nop.m         0
1114(p8)  fma.s1        FR_Rq1 = FR_Rq1,FR_Rq3,f0
1115      nop.i         0
1116}
1117{ .mfi
1118      nop.m         0
1119      fma.s1        FR_S13 = FR_S13,FR_rs2,FR_S11
1120(p12) cmp.lt.unc    p9,p0 = 12,GR_Sig2 // should mul by FR_Rq7?
1121};;
1122{ .mfi
1123      nop.m         0
1124      fma.s1        FR_C41 = FR_C41,FR_C61,f0
1125      nop.i         0
1126}
1127{ .mfi
1128      nop.m         0
1129(p9)  fma.s1        FR_Rq5 = FR_Rq5,FR_Rq7,f0
1130      nop.i         0
1131};;
1132{ .mfi
1133      nop.m         0
1134      fma.s1        FR_C81 = FR_C81,FR_CA1,f0
1135      nop.i         0
1136}
1137{ .mfi
1138      nop.m         0
1139      fma.s1        FR_S9 = FR_S9,FR_rs2,FR_S7
1140      nop.i         0
1141};;
1142{ .mfi
1143      nop.m         0
1144      fma.s1        FR_S5 = FR_S5,FR_rs2,FR_S3
1145      nop.i         0
1146};;
1147{ .mfi
1148      nop.m         0
1149      fma.s1        FR_rs3 = FR_rs2,FR_rs,f0
1150(p12) tbit.nz.unc   p6,p0 = GR_SigRqLin,0
1151}
1152{ .mfi
1153      nop.m         0
1154      fma.s1        FR_rs8 = FR_rs4,FR_rs4,f0
1155      nop.i         0
1156};;
1157{ .mfi
1158      nop.m         0
1159      fma.s1        FR_S21 = FR_S21,FR_rs4,FR_S17
1160      mov           GR_ExpOf1 = 0x2FFFF
1161}
1162{ .mfi
1163      nop.m         0
1164(p6)  fms.s1        FR_RqLin = FR_AbsX,f1,FR_RqLin
1165(p12) cmp.lt.unc    p8,p0 = 8,GR_Sig2 // should mul by FR_Rq5?
1166};;
1167{ .mfi
1168      nop.m         0
1169      fma.s1        FR_C01 = FR_C01,FR_C41,f0
1170      nop.i         0
1171}
1172{ .mfi
1173      nop.m         0
1174(p8)  fma.s1        FR_Rq1 = FR_Rq1,FR_Rq5,f0
1175(p14) cmp.gtu.unc   p7,p0 = GR_Sign_Exp,GR_ExpOf1
1176};;
1177{ .mfi
1178      nop.m         0
1179      fma.s1        FR_S13 = FR_S13,FR_rs4,FR_S9
1180      nop.i         0
1181}
1182{ .mfi
1183      nop.m         0
1184(p7)  fma.s1        FR_C81 = FR_C81,FR_AbsX,f0
1185      nop.i         0
1186};;
1187{ .mfi
1188      nop.m         0
1189(p14) fma.s1        FR_AbsXp1 = f1,f1,FR_AbsX // |x|+1
1190      nop.i         0
1191}
1192{ .mfi
1193      nop.m         0
1194(p15) fcmp.lt.unc.s1 p0,p10 = FR_AbsX,FR_OvfBound // x >= overflow_boundary
1195      nop.i         0
1196};;
1197{ .mfi
1198      nop.m         0
1199      fma.s1        FR_rs7 = FR_rs4,FR_rs3,f0
1200      nop.i         0
1201}
1202{ .mfi
1203      nop.m         0
1204      fma.s1        FR_S5 = FR_S5,FR_rs3,FR_rs
1205      nop.i         0
1206};;
1207{ .mib
1208(p14) cmp.lt        p13,p0 = r0,r0 // set p13 to 0 if x < 0
1209(p12) cmp.eq.unc    p8,p9 = 1,GR_SigRqLin
1210(p10) br.cond.spnt  tgamma_spec_res
1211};;
1212{ .mfi
1213      getf.sig      GR_Sig = FR_iXt
1214(p6)  fma.s1        FR_Rq1 = FR_Rq1,FR_RqLin,f0
1215      // should we mul by polynomial of recursion?
1216(p15) cmp.eq.unc    p0,p11 = r0,GR_SigRqLin
1217}
1218{ .mfb
1219      nop.m         0
1220      fma.s1        FR_GAMMA = FR_C01,FR_C81,f0
1221(p11) br.cond.spnt  tgamma_positives
1222};;
1223{ .mfi
1224      nop.m         0
1225      fma.s1        FR_S21 = FR_S21,FR_rs8,FR_S13
1226      nop.i         0
1227}
1228{ .mfb
1229      nop.m         0
1230(p13) fma.d.s0      f8 = FR_C01,FR_C81,f0
1231(p13) br.ret.spnt   b0
1232};;
1233.pred.rel "mutex",p8,p9
1234{ .mfi
1235      nop.m         0
1236(p9)  fma.s1        FR_GAMMA = FR_GAMMA,FR_Rq1,f0
1237      tbit.z        p6,p7 = GR_Sig,0 // p6 if sin<0, p7 if sin>0
1238}
1239{ .mfi
1240      nop.m         0
1241(p8)  fma.s1        FR_GAMMA = FR_GAMMA,FR_RqLin,f0
1242      nop.i         0
1243};;
1244{ .mfi
1245      nop.m         0
1246      fma.s1        FR_S21 = FR_S21,FR_rs7,FR_S5
1247      nop.i         0
1248};;
1249.pred.rel "mutex",p6,p7
1250{ .mfi
1251      nop.m         0
1252(p6)  fnma.s1       FR_GAMMA = FR_GAMMA,FR_S21,f0
1253      nop.i         0
1254}
1255{ .mfi
1256      nop.m         0
1257(p7)  fma.s1        FR_GAMMA = FR_GAMMA,FR_S21,f0
1258      mov           GR_Sig2 = 1
1259};;
1260{ .mfi
1261      nop.m         0
1262      frcpa.s1      FR_Rcp0,p0 = f1,FR_GAMMA
1263      cmp.ltu       p13,p0 = GR_Sign_Exp,GR_ExpOf1
1264};;
1265// NR method: ineration #1
1266{ .mfi
1267(p13) getf.exp      GR_Sign_Exp = FR_AbsX
1268      fnma.s1       FR_Rcp1 = FR_Rcp0,FR_GAMMA,f1 // t = 1 - r0*x
1269(p13) shl           GR_Sig2 = GR_Sig2,63
1270};;
1271{ .mfi
1272(p13) getf.sig      GR_Sig = FR_AbsX
1273      nop.f         0
1274(p13) mov           GR_NzOvfBound = 0xFBFF
1275};;
1276{ .mfi
1277(p13) cmp.ltu.unc   p8,p0 = GR_Sign_Exp,GR_NzOvfBound // p8 <- overflow
1278      nop.f         0
1279(p13) cmp.eq.unc    p9,p0 = GR_Sign_Exp,GR_NzOvfBound
1280};;
1281{ .mfb
1282      nop.m         0
1283(p13) fma.d.s0      FR_X = f1,f1,f8 // set deno & inexact flags
1284(p8)  br.cond.spnt  tgamma_ovf_near_0 //tgamma_neg_overflow
1285};;
1286{ .mib
1287      nop.m         0
1288(p9)  cmp.eq.unc    p8,p0 = GR_Sig,GR_Sig2
1289(p8)  br.cond.spnt  tgamma_ovf_near_0_boundary //tgamma_neg_overflow
1290};;
1291{ .mfi
1292      nop.m         0
1293      fma.s1        FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0
1294      nop.i         0
1295};;
1296// NR method: ineration #2
1297{ .mfi
1298      nop.m         0
1299      fnma.s1       FR_Rcp2 = FR_Rcp1,FR_GAMMA,f1 // t = 1 - r1*x
1300      nop.i         0
1301};;
1302{ .mfi
1303      nop.m         0
1304      fma.s1        FR_Rcp2 = FR_Rcp1,FR_Rcp2,FR_Rcp1
1305      nop.i         0
1306};;
1307// NR method: ineration #3
1308{ .mfi
1309      nop.m         0
1310      fnma.s1       FR_Rcp3 = FR_Rcp2,FR_GAMMA,f1 // t = 1 - r2*x
1311      nop.i         0
1312}
1313{ .mfi
1314      nop.m         0
1315(p13) fma.s1        FR_Rcp2 = FR_Rcp2,FR_AbsXp1,f0
1316(p14) cmp.ltu       p10,p11 = 0x9,GR_Tbl_Ind
1317};;
1318.pred.rel "mutex",p10,p11
1319{ .mfi
1320      nop.m         0
1321(p10) fma.s1        FR_GAMMA = FR_Rcp2,FR_Rcp3,FR_Rcp2
1322      nop.i         0
1323}
1324{ .mfi
1325      nop.m         0
1326(p11) fma.d.s0      f8 = FR_Rcp2,FR_Rcp3,FR_Rcp2
1327      nop.i         0
1328};;
1329{ .mfb
1330      nop.m         0
1331(p10) fma.d.s0      f8 = FR_GAMMA,FR_InvAn,f0
1332      br.ret.sptk   b0
1333};;
1334
1335
1336// here if x >= 3
1337//--------------------------------------------------------------------
1338.align 32
1339tgamma_positives:
1340.pred.rel "mutex",p8,p9
1341{ .mfi
1342      nop.m         0
1343(p9)  fma.d.s0      f8 = FR_GAMMA,FR_Rq1,f0
1344      nop.i         0
1345}
1346{ .mfb
1347      nop.m         0
1348(p8)  fma.d.s0      f8 = FR_GAMMA,FR_RqLin,f0
1349      br.ret.sptk   b0
1350};;
1351
1352// here if 0 < x < 1
1353//--------------------------------------------------------------------
1354.align 32
1355tgamma_from_0_to_2:
1356{ .mfi
1357      getf.exp      GR_Sign_Exp = FR_r02
1358      fms.s1        FR_r = FR_r02,f1,FR_Xmin
1359      mov           GR_ExpOf025 = 0xFFFD
1360}
1361{ .mfi
1362      add           GR_ad_Co = 0x1200,GR_ad_Data
1363(p6)  fnma.s1       FR_Rcp1 = FR_Rcp0,FR_NormX,f1  // t = 1 - r0*x
1364(p6)  mov           GR_Sig2 = 1
1365};;
1366{ .mfi
1367(p6)  getf.sig      GR_Sig = FR_NormX
1368      nop.f         0
1369(p6)  shl           GR_Sig2 = GR_Sig2,63
1370}
1371{ .mfi
1372      add           GR_ad_Ce = 0x1210,GR_ad_Data
1373      nop.f         0
1374(p6)  mov           GR_NzOvfBound = 0xFBFF
1375};;
1376{ .mfi
1377      cmp.eq        p8,p0 = GR_Sign_Exp,GR_ExpOf05 // r02 >= 1/2
1378      nop.f         0
1379      cmp.eq        p9,p10 = GR_Sign_Exp,GR_ExpOf025 // r02 >= 1/4
1380}
1381{ .mfi
1382(p6)  cmp.ltu.unc   p11,p0 = GR_Sign_Exp,GR_NzOvfBound // p11 <- overflow
1383      nop.f         0
1384(p6)  cmp.eq.unc    p12,p0 = GR_Sign_Exp,GR_NzOvfBound
1385};;
1386.pred.rel "mutex",p8,p9
1387{ .mfi
1388(p8)  add           GR_ad_Co = 0x200,GR_ad_Co
1389(p6)  fma.d.s0      FR_X = f1,f1,f8 // set deno & inexact flags
1390(p9)  add           GR_ad_Co = 0x100,GR_ad_Co
1391}
1392{ .mib
1393(p8)  add           GR_ad_Ce = 0x200,GR_ad_Ce
1394(p9)  add           GR_ad_Ce = 0x100,GR_ad_Ce
1395(p11) br.cond.spnt  tgamma_ovf_near_0 //tgamma_spec_res
1396};;
1397{ .mfi
1398      ldfe          FR_A15 = [GR_ad_Co],32
1399      nop.f         0
1400(p12) cmp.eq.unc    p13,p0 = GR_Sig,GR_Sig2
1401}
1402{ .mfb
1403      ldfe          FR_A14 = [GR_ad_Ce],32
1404      nop.f         0
1405(p13) br.cond.spnt  tgamma_ovf_near_0_boundary //tgamma_spec_res
1406};;
1407{ .mfi
1408      ldfe          FR_A13 = [GR_ad_Co],32
1409      nop.f         0
1410      nop.i         0
1411}
1412{ .mfi
1413      ldfe          FR_A12 = [GR_ad_Ce],32
1414      nop.f         0
1415      nop.i         0
1416};;
1417.pred.rel "mutex",p9,p10
1418{ .mfi
1419      ldfe          FR_A11 = [GR_ad_Co],32
1420(p10) fma.s1        FR_r2 = FR_r02,FR_r02,f0
1421      nop.i         0
1422}
1423{ .mfi
1424      ldfe          FR_A10 = [GR_ad_Ce],32
1425(p9)  fma.s1        FR_r2 = FR_r,FR_r,f0
1426      nop.i         0
1427};;
1428{ .mfi
1429      ldfe          FR_A9 = [GR_ad_Co],32
1430(p6)  fma.s1        FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0
1431      nop.i         0
1432}
1433{ .mfi
1434      ldfe          FR_A8 = [GR_ad_Ce],32
1435(p10) fma.s1        FR_r = f0,f0,FR_r02
1436      nop.i         0
1437};;
1438{ .mfi
1439      ldfe          FR_A7 = [GR_ad_Co],32
1440      nop.f         0
1441      nop.i         0
1442}
1443{ .mfi
1444      ldfe          FR_A6 = [GR_ad_Ce],32
1445      nop.f         0
1446      nop.i         0
1447};;
1448{ .mfi
1449      ldfe          FR_A5 = [GR_ad_Co],32
1450      nop.f         0
1451      nop.i         0
1452}
1453{ .mfi
1454      ldfe          FR_A4 = [GR_ad_Ce],32
1455      nop.f         0
1456      nop.i         0
1457};;
1458{ .mfi
1459      ldfe          FR_A3 = [GR_ad_Co],32
1460      nop.f         0
1461      nop.i         0
1462}
1463{ .mfi
1464      ldfe          FR_A2 = [GR_ad_Ce],32
1465      nop.f         0
1466      nop.i         0
1467};;
1468{ .mfi
1469      ldfe          FR_A1 = [GR_ad_Co],32
1470      fma.s1        FR_r4 = FR_r2,FR_r2,f0
1471      nop.i         0
1472}
1473{ .mfi
1474      ldfe          FR_A0 = [GR_ad_Ce],32
1475      nop.f         0
1476      nop.i         0
1477};;
1478{ .mfi
1479      nop.m         0
1480(p6)  fnma.s1       FR_Rcp2 = FR_Rcp1,FR_NormX,f1 // t = 1 - r1*x
1481      nop.i         0
1482};;
1483{ .mfi
1484      nop.m         0
1485      fma.s1        FR_A15 = FR_A15,FR_r,FR_A14
1486      nop.i         0
1487}
1488{ .mfi
1489      nop.m         0
1490      fma.s1        FR_A11 = FR_A11,FR_r,FR_A10
1491      nop.i         0
1492};;
1493{ .mfi
1494      nop.m         0
1495      fma.s1        FR_r8 = FR_r4,FR_r4,f0
1496      nop.i         0
1497};;
1498{ .mfi
1499      nop.m         0
1500(p6)  fma.s1        FR_Rcp2 = FR_Rcp1,FR_Rcp2,FR_Rcp1
1501      nop.i         0
1502};;
1503{ .mfi
1504      nop.m         0
1505      fma.s1        FR_A7 = FR_A7,FR_r,FR_A6
1506      nop.i         0
1507}
1508{ .mfi
1509      nop.m         0
1510      fma.s1        FR_A3 = FR_A3,FR_r,FR_A2
1511      nop.i         0
1512};;
1513{ .mfi
1514      nop.m         0
1515      fma.s1        FR_A15 = FR_A15,FR_r,FR_A13
1516      nop.i         0
1517}
1518{ .mfi
1519      nop.m         0
1520      fma.s1        FR_A11 = FR_A11,FR_r,FR_A9
1521      nop.i         0
1522};;
1523{ .mfi
1524      nop.m         0
1525(p6)  fnma.s1       FR_Rcp3 = FR_Rcp2,FR_NormX,f1 // t = 1 - r1*x
1526      nop.i         0
1527};;
1528{ .mfi
1529      nop.m         0
1530      fma.s1        FR_A7 = FR_A7,FR_r,FR_A5
1531      nop.i         0
1532}
1533{ .mfi
1534      nop.m         0
1535      fma.s1        FR_A3 = FR_A3,FR_r,FR_A1
1536      nop.i         0
1537};;
1538{ .mfi
1539      nop.m         0
1540      fma.s1        FR_A15 = FR_A15,FR_r,FR_A12
1541      nop.i         0
1542}
1543{ .mfi
1544      nop.m         0
1545      fma.s1        FR_A11 = FR_A11,FR_r,FR_A8
1546      nop.i         0
1547};;
1548{ .mfi
1549      nop.m         0
1550(p6)  fma.s1        FR_Rcp3 = FR_Rcp2,FR_Rcp3,FR_Rcp2
1551      nop.i         0
1552};;
1553{ .mfi
1554      nop.m         0
1555      fma.s1        FR_A7 = FR_A7,FR_r,FR_A4
1556      nop.i         0
1557}
1558{ .mfi
1559      nop.m         0
1560      fma.s1        FR_A3 = FR_A3,FR_r,FR_A0
1561      nop.i         0
1562};;
1563{ .mfi
1564      nop.m         0
1565      fma.s1        FR_A15 = FR_A15,FR_r4,FR_A11
1566      nop.i         0
1567}
1568{ .mfi
1569      nop.m         0
1570      fma.s1        FR_A7 = FR_A7,FR_r4,FR_A3
1571      nop.i         0
1572};;
1573.pred.rel "mutex",p6,p7
1574{ .mfi
1575      nop.m         0
1576(p6)  fma.s1        FR_A15 = FR_A15,FR_r8,FR_A7
1577      nop.i         0
1578}
1579{ .mfi
1580      nop.m         0
1581(p7)  fma.d.s0      f8 = FR_A15,FR_r8,FR_A7
1582      nop.i         0
1583};;
1584{ .mfb
1585      nop.m         0
1586(p6)  fma.d.s0      f8 = FR_A15,FR_Rcp3,f0
1587      br.ret.sptk   b0
1588};;
1589
1590// overflow
1591//--------------------------------------------------------------------
1592.align 32
1593tgamma_ovf_near_0_boundary:
1594.pred.rel "mutex",p14,p15
1595{ .mfi
1596	  mov           GR_fpsr = ar.fpsr
1597	  nop.f         0
1598(p15) mov           r8 = 0x7ff
1599}
1600{ .mfi
1601      nop.m         0
1602      nop.f         0
1603(p14) mov           r8 = 0xfff
1604};;
1605{ .mfi
1606	  nop.m         0
1607	  nop.f         0
1608	  shl           r8 = r8,52
1609};;
1610{ .mfi
1611      sub           r8 = r8,r0,1
1612      nop.f         0
1613	  extr.u        GR_fpsr = GR_fpsr,10,2 // rounding mode
1614};;
1615.pred.rel "mutex",p14,p15
1616{ .mfi
1617      // set p8 to 0 in case of overflow and to 1 otherwise
1618	  // for negative arg:
1619	  //    no overflow if rounding mode either Z or +Inf, i.e.
1620	  //    GR_fpsr > 1
1621(p14) cmp.lt        p8,p0 = 1,GR_fpsr
1622      nop.f         0
1623	  // for positive arg:
1624	  //    no overflow if rounding mode either Z or -Inf, i.e.
1625	  //    (GR_fpsr & 1) == 0
1626(p15) tbit.z        p0,p8 = GR_fpsr,0
1627};;
1628{ .mib
1629(p8)  setf.d        f8 = r8 // set result to 0x7fefffffffffffff without
1630                            // OVERFLOW flag raising
1631      nop.i         0
1632(p8)  br.ret.sptk   b0
1633};;
1634.align 32
1635tgamma_ovf_near_0:
1636{ .mfi
1637      mov           r8 = 0x1FFFE
1638      nop.f         0
1639      nop.i         0
1640};;
1641{ .mfi
1642      setf.exp      f9 = r8
1643      fmerge.s      FR_X = f8,f8
1644      mov           GR_TAG = 258 // overflow
1645};;
1646.pred.rel "mutex",p14,p15
1647{ .mfi
1648      nop.m         0
1649(p15) fma.d.s0      f8 = f9,f9,f0 // Set I,O and +INF result
1650      nop.i         0
1651}
1652{ .mfb
1653      nop.m         0
1654(p14) fnma.d.s0     f8 = f9,f9,f0 // Set I,O and -INF result
1655      br.cond.sptk  tgamma_libm_err
1656};;
1657// overflow or absolute value of x is too big
1658//--------------------------------------------------------------------
1659.align 32
1660tgamma_spec_res:
1661{ .mfi
1662      mov           GR_0x30033 = 0x30033
1663(p14) fcmp.eq.unc.s1 p10,p11 = f8,FR_Xt
1664(p15) mov           r8 = 0x1FFFE
1665};;
1666{ .mfi
1667(p15) setf.exp      f9 = r8
1668      nop.f         0
1669      nop.i         0
1670};;
1671{ .mfb
1672(p11) cmp.ltu.unc   p7,p8 = GR_0x30033,GR_Sign_Exp
1673      nop.f         0
1674(p10) br.cond.spnt  tgamma_singularity
1675};;
1676.pred.rel "mutex",p7,p8
1677{ .mbb
1678      nop.m         0
1679(p7)  br.cond.spnt  tgamma_singularity
1680(p8)  br.cond.spnt  tgamma_underflow
1681};;
1682{ .mfi
1683      nop.m         0
1684      fmerge.s      FR_X = f8,f8
1685      mov           GR_TAG = 258 // overflow
1686}
1687{ .mfb
1688      nop.m         0
1689(p15) fma.d.s0      f8 = f9,f9,f0 // Set I,O and +INF result
1690      br.cond.sptk  tgamma_libm_err
1691};;
1692
1693// x is negative integer or +/-0
1694//--------------------------------------------------------------------
1695.align 32
1696tgamma_singularity:
1697{ .mfi
1698      nop.m         0
1699      fmerge.s      FR_X = f8,f8
1700      mov           GR_TAG = 259 // negative
1701}
1702{ .mfb
1703      nop.m         0
1704      frcpa.s0      f8,p0 = f0,f0
1705      br.cond.sptk  tgamma_libm_err
1706};;
1707// x is negative noninteger with big absolute value
1708//--------------------------------------------------------------------
1709.align 32
1710tgamma_underflow:
1711{ .mmi
1712      getf.sig      GR_Sig = FR_iXt
1713      mov           r11 = 0x00001
1714      nop.i         0
1715};;
1716{ .mfi
1717      setf.exp      f9 = r11
1718      nop.f         0
1719      nop.i         0
1720};;
1721{ .mfi
1722      nop.m         0
1723      nop.f         0
1724      tbit.z        p6,p7 = GR_Sig,0
1725};;
1726.pred.rel "mutex",p6,p7
1727{ .mfi
1728      nop.m         0
1729(p6)  fms.d.s0      f8 = f9,f9,f9
1730      nop.i         0
1731}
1732{ .mfb
1733      nop.m         0
1734(p7)  fma.d.s0      f8 = f9,f9,f9
1735      br.ret.sptk   b0
1736};;
1737
1738//  x for natval, nan, +/-inf or +/-0
1739//--------------------------------------------------------------------
1740.align 32
1741tgamma_spec:
1742{ .mfi
1743      nop.m         0
1744      fclass.m      p6,p0 =  f8,0x1E1 // Test x for natval, nan, +inf
1745      nop.i         0
1746};;
1747{ .mfi
1748      nop.m         0
1749      fclass.m      p7,p8 =  f8,0x7 // +/-0
1750      nop.i         0
1751};;
1752{ .mfi
1753      nop.m         0
1754      fmerge.s      FR_X = f8,f8
1755      nop.i         0
1756}
1757{ .mfb
1758      nop.m         0
1759(p6)  fma.d.s0      f8 = f8,f1,f8
1760(p6)  br.ret.spnt   b0
1761};;
1762.pred.rel "mutex",p7,p8
1763{ .mfi
1764(p7)  mov           GR_TAG = 259 // negative
1765(p7)  frcpa.s0      f8,p0 = f1,f8
1766      nop.i         0
1767}
1768{ .mib
1769      nop.m         0
1770      nop.i         0
1771(p8)  br.cond.spnt  tgamma_singularity
1772};;
1773
1774.align 32
1775tgamma_libm_err:
1776{ .mfi
1777       alloc        r32 = ar.pfs,1,4,4,0
1778       nop.f        0
1779       mov          GR_Parameter_TAG = GR_TAG
1780};;
1781
1782GLOBAL_LIBM_END(tgamma)
1783libm_alias_double_other (tgamma, tgamma)
1784
1785LOCAL_LIBM_ENTRY(__libm_error_region)
1786.prologue
1787{ .mfi
1788        add   GR_Parameter_Y=-32,sp             // Parameter 2 value
1789        nop.f 0
1790.save   ar.pfs,GR_SAVE_PFS
1791        mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
1792}
1793{ .mfi
1794.fframe 64
1795        add sp=-64,sp                           // Create new stack
1796        nop.f 0
1797        mov GR_SAVE_GP=gp                       // Save gp
1798};;
1799{ .mmi
1800        stfd [GR_Parameter_Y] = FR_Y,16         // STORE Parameter 2 on stack
1801        add GR_Parameter_X = 16,sp              // Parameter 1 address
1802.save   b0, GR_SAVE_B0
1803        mov GR_SAVE_B0=b0                       // Save b0
1804};;
1805.body
1806{ .mib
1807        stfd [GR_Parameter_X] = FR_X                  // STORE Parameter 1 on stack
1808        add   GR_Parameter_RESULT = 0,GR_Parameter_Y  // Parameter 3 address
1809        nop.b 0
1810}
1811{ .mib
1812        stfd [GR_Parameter_Y] = FR_RESULT             // STORE Parameter 3 on stack
1813        add   GR_Parameter_Y = -16,GR_Parameter_Y
1814        br.call.sptk b0=__libm_error_support#         // Call error handling function
1815};;
1816{ .mmi
1817        nop.m 0
1818        nop.m 0
1819        add   GR_Parameter_RESULT = 48,sp
1820};;
1821{ .mmi
1822        ldfd  f8 = [GR_Parameter_RESULT]       // Get return result off stack
1823.restore sp
1824        add   sp = 64,sp                       // Restore stack pointer
1825        mov   b0 = GR_SAVE_B0                  // Restore return address
1826};;
1827{ .mib
1828        mov   gp = GR_SAVE_GP                  // Restore gp
1829        mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
1830        br.ret.sptk     b0                     // Return
1831};;
1832
1833LOCAL_LIBM_END(__libm_error_region)
1834.type   __libm_error_support#,@function
1835.global __libm_error_support#
1836