1.file "tgammaf.s"
2
3
4// Copyright (c) 2001 - 2005, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
25// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
28// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code,and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39//*********************************************************************
40//
41// History:
42// 11/30/01  Initial version
43// 05/20/02  Cleaned up namespace and sf0 syntax
44// 02/10/03  Reordered header: .section, .global, .proc, .align
45// 04/04/03  Changed error codes for overflow and negative integers
46// 04/10/03  Changed code for overflow near zero handling
47// 12/16/03  Fixed parameter passing to/from error handling routine
48// 03/31/05  Reformatted delimiters between data tables
49//
50//*********************************************************************
51//
52//*********************************************************************
53//
54// Function: tgammaf(x) computes the principle value of the GAMMA
55// function of x.
56//
57//*********************************************************************
58//
59// Resources Used:
60//
61//    Floating-Point Registers: f8-f15
62//                              f33-f75
63//
64//    General Purpose Registers:
65//      r8-r11
66//      r14-r29
67//      r32-r36
68//      r37-r40 (Used to pass arguments to error handling routine)
69//
70//    Predicate Registers:      p6-p15
71//
72//*********************************************************************
73//
74// IEEE Special Conditions:
75//
76//    tgammaf(+inf) = +inf
77//    tgammaf(-inf) = QNaN
78//    tgammaf(+/-0) = +/-inf
79//    tgammaf(x<0, x - integer) = QNaN
80//    tgammaf(SNaN) = QNaN
81//    tgammaf(QNaN) = QNaN
82//
83//*********************************************************************
84//
85// Overview
86//
87// The method consists of three cases.
88//
89// If       2 <= x < OVERFLOW_BOUNDARY   use case tgamma_regular;
90// else if  0 < x < 2                    use case tgamma_from_0_to_2;
91// else if  -(i+1) <  x < -i, i = 0...43 use case tgamma_negatives;
92//
93// Case 2 <= x < OVERFLOW_BOUNDARY
94// -------------------------------
95//   Here we use algorithm based on the recursive formula
96//   GAMMA(x+1) = x*GAMMA(x). For that we subdivide interval
97//   [2; OVERFLOW_BOUNDARY] into intervals [8*n; 8*(n+1)] and
98//   approximate GAMMA(x) by polynomial of 22th degree on each
99//   [8*n; 8*n+1], recursive formula is used to expand GAMMA(x)
100//   to [8*n; 8*n+1]. In other words we need to find n, i and r
101//   such that x = 8 * n + i + r where n and i are integer numbers
102//   and r is fractional part of x. So GAMMA(x) = GAMMA(8*n+i+r) =
103//   = (x-1)*(x-2)*...*(x-i)*GAMMA(x-i) =
104//   = (x-1)*(x-2)*...*(x-i)*GAMMA(8*n+r) ~
105//   ~ (x-1)*(x-2)*...*(x-i)*P12n(r).
106//
107//   Step 1: Reduction
108//   -----------------
109//    N = [x] with truncate
110//    r = x - N, note 0 <= r < 1
111//
112//    n = N & ~0xF - index of table that contains coefficient of
113//                   polynomial approximation
114//    i = N & 0xF  - is used in recursive formula
115//
116//
117//   Step 2: Approximation
118//   ---------------------
119//    We use factorized minimax approximation polynomials
120//    P12n(r) = A12*(r^2+C01(n)*r+C00(n))*
121//              *(r^2+C11(n)*r+C10(n))*...*(r^2+C51(n)*r+C50(n))
122//
123//   Step 3: Recursion
124//   -----------------
125//    In case when i > 0 we need to multiply P12n(r) by product
126//    R(i,x)=(x-1)*(x-2)*...*(x-i). To reduce number of fp-instructions
127//    we can calculate R as follow:
128//    R(i,x) = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-1))*(x-i)) if i is
129//    even or R = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-2))*(x-(i-1)))*
130//    *(i-1) if i is odd. In both cases we need to calculate
131//    R2(i,x) = (x^2-3*x+2)*(x^2-7*x+12)*...*(x^2+x+2*j*(2*j-1)) =
132//    = ((x^2-x)+2*(1-x))*((x^2-x)+6*(2-x))*...*((x^2-x)+2*(2*j-1)*(j-x)) =
133//    = (RA+2*RB)*(RA+6*(1-RB))*...*(RA+2*(2*j-1)*(j-1+RB))
134//    where j = 1..[i/2], RA = x^2-x, RB = 1-x.
135//
136//   Step 4: Reconstruction
137//   ----------------------
138//    Reconstruction is just simple multiplication i.e.
139//    GAMMA(x) = P12n(r)*R(i,x)
140//
141// Case 0 < x < 2
142// --------------
143//    To calculate GAMMA(x) on this interval we do following
144//        if 1.0  <= x < 1.25  than  GAMMA(x) = P7(x-1)
145//        if 1.25 <= x < 1.5   than  GAMMA(x) = P7(x-x_min) where
146//              x_min is point of local minimum on [1; 2] interval.
147//        if 1.5  <= x < 1.75  than  GAMMA(x) = P7(x-1.5)
148//        if 1.75 <= x < 2.0   than  GAMMA(x) = P7(x-1.5)
149//    and
150//        if 0 < x < 1 than GAMMA(x) = GAMMA(x+1)/x
151//
152// Case -(i+1) <  x < -i, i = 0...43
153// ----------------------------------
154//    Here we use the fact that GAMMA(-x) = PI/(x*GAMMA(x)*sin(PI*x)) and
155//    so we need to calculate GAMMA(x), sin(PI*x)/PI. Calculation of
156//    GAMMA(x) is described above.
157//
158//   Step 1: Reduction
159//   -----------------
160//    Note that period of sin(PI*x) is 2 and range reduction for
161//    sin(PI*x) is like to range reduction for GAMMA(x)
162//    i.e rs = x - round(x) and |rs| <= 0.5.
163//
164//   Step 2: Approximation
165//   ---------------------
166//    To approximate sin(PI*x)/PI = sin(PI*(2*n+rs))/PI =
167//    = (-1)^n*sin(PI*rs)/PI Taylor series is used.
168//    sin(PI*rs)/PI ~ S17(rs).
169//
170//   Step 3: Division
171//   ----------------
172//    To calculate 1/x and 1/(GAMMA(x)*S12(rs)) we use frcpa
173//    instruction with following Newton-Raphson interations.
174//
175//
176//*********************************************************************
177
178GR_ad_Data              = r8
179GR_TAG                  = r8
180GR_SignExp              = r9
181GR_Sig                  = r10
182GR_ArgNz                = r10
183GR_RqDeg                = r11
184
185GR_NanBound             = r14
186GR_ExpOf025             = r15
187GR_ExpOf05              = r16
188GR_ad_Co                = r17
189GR_ad_Ce                = r18
190GR_TblOffs              = r19
191GR_Arg                  = r20
192GR_Exp2Ind              = r21
193GR_TblOffsMask          = r21
194GR_Offs                 = r22
195GR_OvfNzBound           = r23
196GR_ZeroResBound         = r24
197GR_ad_SinO              = r25
198GR_ad_SinE              = r26
199GR_Correction           = r27
200GR_Tbl12Offs            = r28
201GR_NzBound              = r28
202GR_ExpOf1               = r29
203GR_fpsr                 = r29
204
205GR_SAVE_B0              = r33
206GR_SAVE_PFS             = r34
207GR_SAVE_GP              = r35
208GR_SAVE_SP              = r36
209
210GR_Parameter_X          = r37
211GR_Parameter_Y          = r38
212GR_Parameter_RESULT     = r39
213GR_Parameter_TAG        = r40
214
215
216FR_X                    = f10
217FR_Y                    = f1
218FR_RESULT               = f8
219
220FR_iXt                  = f11
221FR_Xt                   = f12
222FR_r                    = f13
223FR_r2                   = f14
224FR_r4                   = f15
225
226FR_C01                  = f33
227FR_A7                   = f33
228FR_C11                  = f34
229FR_A6                   = f34
230FR_C21                  = f35
231FR_A5                   = f35
232FR_C31                  = f36
233FR_A4                   = f36
234FR_C41                  = f37
235FR_A3                   = f37
236FR_C51                  = f38
237FR_A2                   = f38
238
239FR_C00                  = f39
240FR_A1                   = f39
241FR_C10                  = f40
242FR_A0                   = f40
243FR_C20                  = f41
244FR_C30                  = f42
245FR_C40                  = f43
246FR_C50                  = f44
247FR_An                   = f45
248FR_OvfBound             = f46
249FR_InvAn                = f47
250
251FR_Multplr              = f48
252FR_NormX                = f49
253FR_X2mX                 = f50
254FR_1mX                  = f51
255FR_Rq0                  = f51
256FR_Rq1                  = f52
257FR_Rq2                  = f53
258FR_Rq3                  = f54
259
260FR_Rcp0                 = f55
261FR_Rcp1                 = f56
262FR_Rcp2                 = f57
263
264FR_InvNormX1            = f58
265FR_InvNormX2            = f59
266
267FR_rs                   = f60
268FR_rs2                  = f61
269
270FR_LocalMin             = f62
271FR_10                   = f63
272
273FR_05                   = f64
274
275FR_S32                  = f65
276FR_S31                  = f66
277FR_S01                  = f67
278FR_S11                  = f68
279FR_S21                  = f69
280FR_S00                  = f70
281FR_S10                  = f71
282FR_S20                  = f72
283
284FR_GAMMA                = f73
285FR_2                    = f74
286FR_6                    = f75
287
288
289
290
291// Data tables
292//==============================================================
293RODATA
294.align 16
295LOCAL_OBJECT_START(tgammaf_data)
296data8 0x3FDD8B618D5AF8FE // local minimum (0.461632144968362356785)
297data8 0x4024000000000000 // 10.0
298data8 0x3E90FC992FF39E13 // S32
299data8 0xBEC144B2760626E2 // S31
300//
301//[2; 8)
302data8 0x4009EFD1BA0CB3B4 // C01
303data8 0x3FFFB35378FF4822 // C11
304data8 0xC01032270413B896 // C41
305data8 0xC01F171A4C0D6827 // C51
306data8 0x40148F8E197396AC // C20
307data8 0x401C601959F1249C // C30
308data8 0x3EE21AD881741977 // An
309data8 0x4041852200000000 // overflow boundary (35.04010009765625)
310data8 0x3FD9CE68F695B198 // C21
311data8 0xBFF8C30AC900DA03 // C31
312data8 0x400E17D2F0535C02 // C00
313data8 0x4010689240F7FAC8 // C10
314data8 0x402563147DDCCF8D // C40
315data8 0x4033406D0480A21C // C50
316//
317//[8; 16)
318data8 0x4006222BAE0B793B // C01
319data8 0x4002452733473EDA // C11
320data8 0xC0010EF3326FDDB3 // C41
321data8 0xC01492B817F99C0F // C51
322data8 0x40099C905A249B75 // C20
323data8 0x4012B972AE0E533D // C30
324data8 0x3FE6F6DB91D0D4CC // An
325data8 0x4041852200000000 // overflow boundary
326data8 0x3FF545828F7B73C5 // C21
327data8 0xBFBBD210578764DF // C31
328data8 0x4000542098F53CFC // C00
329data8 0x40032C1309AD6C81 // C10
330data8 0x401D7331E19BD2E1 // C40
331data8 0x402A06807295EF57 // C50
332//
333//[16; 24)
334data8 0x4000131002867596 // C01
335data8 0x3FFAA362D5D1B6F2 // C11
336data8 0xBFFCB6985697DB6D // C41
337data8 0xC0115BEE3BFC3B3B // C51
338data8 0x3FFE62FF83456F73 // C20
339data8 0x4007E33478A114C4 // C30
340data8 0x41E9B2B73795ED57 // An
341data8 0x4041852200000000 // overflow boundary
342data8 0x3FEEB1F345BC2769 // C21
343data8 0xBFC3BBE6E7F3316F // C31
344data8 0x3FF14E07DA5E9983 // C00
345data8 0x3FF53B76BF81E2C0 // C10
346data8 0x4014051E0269A3DC // C40
347data8 0x40229D4227468EDB // C50
348//
349//[24; 32)
350data8 0x3FFAF7BD498384DE // C01
351data8 0x3FF62AD8B4D1C3D2 // C11
352data8 0xBFFABCADCD004C32 // C41
353data8 0xC00FADE97C097EC9 // C51
354data8 0x3FF6DA9ED737707E // C20
355data8 0x4002A29E9E0C782C // C30
356data8 0x44329D5B5167C6C3 // An
357data8 0x4041852200000000 // overflow boundary
358data8 0x3FE8943CBBB4B727 // C21
359data8 0xBFCB39D466E11756 // C31
360data8 0x3FE879AF3243D8C1 // C00
361data8 0x3FEEC7DEBB14CE1E // C10
362data8 0x401017B79BA80BCB // C40
363data8 0x401E941DC3C4DE80 // C50
364//
365//[32; 40)
366data8 0x3FF7ECB3A0E8FE5C // C01
367data8 0x3FF3815A8516316B // C11
368data8 0xBFF9ABD8FCC000C3 // C41
369data8 0xC00DD89969A4195B // C51
370data8 0x3FF2E43139CBF563 // C20
371data8 0x3FFF96DC3474A606 // C30
372data8 0x46AFF4CA9B0DDDF0 // An
373data8 0x4041852200000000 // overflow boundary
374data8 0x3FE4CE76DA1B5783 // C21
375data8 0xBFD0524DB460BC4E // C31
376data8 0x3FE35852DF14E200 // C00
377data8 0x3FE8C7610359F642 // C10
378data8 0x400BCF750EC16173 // C40
379data8 0x401AC14E02EA701C // C50
380//
381//[40; 48)
382data8 0x3FF5DCE4D8193097 // C01
383data8 0x3FF1B0D8C4974FFA // C11
384data8 0xBFF8FB450194CAEA // C41
385data8 0xC00C9658E030A6C4 // C51
386data8 0x3FF068851118AB46 // C20
387data8 0x3FFBF7C7BB46BF7D // C30
388data8 0x3FF0000000000000 // An
389data8 0x4041852200000000 // overflow boundary
390data8 0x3FE231DEB11D847A // C21
391data8 0xBFD251ECAFD7E935 // C31
392data8 0x3FE0368AE288F6BF // C00
393data8 0x3FE513AE4215A70C // C10
394data8 0x4008F960F7141B8B // C40
395data8 0x40183BA08134397B // C50
396//
397//[1.0; 1.25)
398data8 0xBFD9909648921868 // A7
399data8 0x3FE96FFEEEA8520F // A6
400data8 0xBFED0800D93449B8 // A3
401data8 0x3FEFA648D144911C // A2
402data8 0xBFEE3720F7720B4D // A5
403data8 0x3FEF4857A010CA3B // A4
404data8 0xBFE2788CCD545AA4 // A1
405data8 0x3FEFFFFFFFE9209E // A0
406//
407//[1.25; 1.5)
408data8 0xBFB421236426936C // A7
409data8 0x3FAF237514F36691 // A6
410data8 0xBFC0BADE710A10B9 // A3
411data8 0x3FDB6C5465BBEF1F // A2
412data8 0xBFB7E7F83A546EBE // A5
413data8 0x3FC496A01A545163 // A4
414data8 0xBDEE86A39D8452EB // A1
415data8 0x3FEC56DC82A39AA2 // A0
416//
417//[1.5; 1.75)
418data8 0xBF94730B51795867 // A7
419data8 0x3FBF4203E3816C7B // A6
420data8 0xBFE85B427DBD23E4 // A3
421data8 0x3FEE65557AB26771 // A2
422data8 0xBFD59D31BE3AB42A // A5
423data8 0x3FE3C90CC8F09147 // A4
424data8 0xBFE245971DF735B8 // A1
425data8 0x3FEFFC613AE7FBC8 // A0
426//
427//[1.75; 2.0)
428data8 0xBF7746A85137617E // A7
429data8 0x3FA96E37D09735F3 // A6
430data8 0xBFE3C24AC40AC0BB // A3
431data8 0x3FEC56A80A977CA5 // A2
432data8 0xBFC6F0E707560916 // A5
433data8 0x3FDB262D949175BE // A4
434data8 0xBFE1C1AEDFB25495 // A1
435data8 0x3FEFEE1E644B2022 // A0
436//
437// sin(pi*x)/pi
438data8 0xC026FB0D377656CC // S01
439data8 0x3FFFB15F95A22324 // S11
440data8 0x406CE58F4A41C6E7 // S10
441data8 0x404453786302C61E // S20
442data8 0xC023D59A47DBFCD3 // S21
443data8 0x405541D7ABECEFCA // S00
444//
445// 1/An for [40; 48)
446data8 0xCAA7576DE621FCD5, 0x3F68
447LOCAL_OBJECT_END(tgammaf_data)
448
449//==============================================================
450// Code
451//==============================================================
452
453.section .text
454GLOBAL_LIBM_ENTRY(tgammaf)
455{ .mfi
456      getf.exp      GR_SignExp = f8
457      fma.s1        FR_NormX = f8,f1,f0
458      addl          GR_ad_Data = @ltoff(tgammaf_data), gp
459}
460{ .mfi
461      mov           GR_ExpOf05 = 0xFFFE
462      fcvt.fx.trunc.s1 FR_iXt = f8 // [x]
463      mov           GR_Offs = 0 // 2 <= x < 8
464};;
465{ .mfi
466      getf.d        GR_Arg = f8
467      fcmp.lt.s1    p14,p15 = f8,f0
468      mov           GR_Tbl12Offs = 0
469}
470{ .mfi
471      setf.exp      FR_05 = GR_ExpOf05
472      fma.s1        FR_2 = f1,f1,f1 // 2
473      mov           GR_Correction = 0
474};;
475{ .mfi
476      ld8           GR_ad_Data = [GR_ad_Data]
477      fclass.m      p10,p0 = f8,0x1E7 // is x  NaTVal, NaN, +/-0 or +/-INF?
478      tbit.z        p12,p13 = GR_SignExp,16 // p13 if |x| >= 2
479}
480{ .mfi
481      mov           GR_ExpOf1 = 0xFFFF
482      fcvt.fx.s1    FR_rs = f8 // round(x)
483      and           GR_Exp2Ind = 7,GR_SignExp
484};;
485.pred.rel "mutex",p14,p15
486{ .mfi
487(p15) cmp.eq.unc    p11,p0 = GR_ExpOf1,GR_SignExp // p11 if 1 <= x < 2
488(p14) fma.s1        FR_1mX = f1,f1,f8 // 1 - |x|
489      mov           GR_Sig = 0 // if |x| < 2
490}
491{ .mfi
492(p13) cmp.eq.unc    p7,p0 = 2,GR_Exp2Ind
493(p15) fms.s1        FR_1mX = f1,f1,f8 // 1 - |x|
494(p13) cmp.eq.unc    p8,p0 = 3,GR_Exp2Ind
495};;
496.pred.rel "mutex",p7,p8
497{ .mfi
498(p7)  mov           GR_Offs = 0x7    // 8 <= |x| < 16
499      nop.f         0
500(p8)  tbit.z.unc    p0,p6 = GR_Arg,51
501}
502{ .mib
503(p13) cmp.lt.unc    p9,p0 = 3,GR_Exp2Ind
504(p8)  mov           GR_Offs = 0xE // 16 <= |x| < 32
505      // jump if x is NaTVal, NaN, +/-0 or +/-INF?
506(p10) br.cond.spnt  tgammaf_spec_args
507};;
508.pred.rel "mutex",p14,p15
509.pred.rel "mutex",p6,p9
510{ .mfi
511(p9)  mov           GR_Offs = 0x1C // 32 <= |x|
512(p14) fma.s1        FR_X2mX = FR_NormX,FR_NormX,FR_NormX // x^2-|x|
513(p9)  tbit.z.unc    p0,p8 = GR_Arg,50
514}
515{ .mfi
516      ldfpd         FR_LocalMin,FR_10 = [GR_ad_Data],16
517(p15) fms.s1        FR_X2mX = FR_NormX,FR_NormX,FR_NormX // x^2-|x|
518(p6)  add           GR_Offs = 0x7,GR_Offs // 24 <= x < 32
519};;
520.pred.rel "mutex",p8,p12
521{ .mfi
522      add           GR_ad_Ce = 0x50,GR_ad_Data
523(p15) fcmp.lt.unc.s1 p10,p0 = f8,f1 // p10 if 0 <= x < 1
524      mov           GR_OvfNzBound = 2
525}
526{ .mib
527      ldfpd         FR_S32,FR_S31 = [GR_ad_Data],16
528(p8)  add           GR_Offs = 0x7,GR_Offs // 40 <= |x|
529      // jump if 1 <= x < 2
530(p11) br.cond.spnt  tgammaf_from_1_to_2
531};;
532{ .mfi
533      shladd        GR_ad_Ce = GR_Offs,4,GR_ad_Ce
534      fcvt.xf       FR_Xt = FR_iXt // [x]
535(p13) cmp.eq.unc    p7,p0 = r0,GR_Offs // p7 if 2 <= |x| < 8
536}
537{ .mfi
538      shladd        GR_ad_Co = GR_Offs,4,GR_ad_Data
539      fma.s1        FR_6 = FR_2,FR_2,FR_2
540      mov           GR_ExpOf05 = 0x7FC
541};;
542{ .mfi
543(p13) getf.sig      GR_Sig = FR_iXt // if |x| >= 2
544      frcpa.s1      FR_Rcp0,p0 = f1,FR_NormX
545(p10) shr           GR_Arg = GR_Arg,51
546}
547{ .mib
548      ldfpd         FR_C01,FR_C11 = [GR_ad_Co],16
549(p7)  mov           GR_Correction = 2
550      // jump if 0 < x < 1
551(p10) br.cond.spnt  tgammaf_from_0_to_1
552};;
553{ .mfi
554      ldfpd         FR_C21,FR_C31 = [GR_ad_Ce],16
555      fma.s1        FR_Rq2 = f1,f1,FR_1mX // 2 - |x|
556(p14) sub           GR_Correction = r0,GR_Correction
557}
558{ .mfi
559      ldfpd         FR_C41,FR_C51 = [GR_ad_Co],16
560(p14) fcvt.xf       FR_rs = FR_rs
561(p14) add           GR_ad_SinO = 0x3A0,GR_ad_Data
562};;
563.pred.rel "mutex",p14,p15
564{ .mfi
565      ldfpd         FR_C00,FR_C10 = [GR_ad_Ce],16
566      nop.f         0
567(p14) sub           GR_Sig = GR_Correction,GR_Sig
568}
569{ .mfi
570      ldfpd         FR_C20,FR_C30 = [GR_ad_Co],16
571      fma.s1        FR_Rq1 = FR_1mX,FR_2,FR_X2mX // (x-1)*(x-2)
572(p15) sub           GR_Sig = GR_Sig,GR_Correction
573};;
574{ .mfi
575(p14) ldfpd         FR_S01,FR_S11 = [GR_ad_SinO],16
576      fma.s1        FR_Rq3 = FR_2,f1,FR_1mX // 3 - |x|
577      and           GR_RqDeg = 0x6,GR_Sig
578}
579{ .mfi
580      ldfpd         FR_C40,FR_C50 = [GR_ad_Ce],16
581(p14) fma.d.s0      FR_X = f0,f0,f8 // set deno flag
582      mov           GR_NanBound = 0x30016 // -2^23
583};;
584.pred.rel "mutex",p14,p15
585{ .mfi
586(p14) add           GR_ad_SinE = 0x3C0,GR_ad_Data
587(p15) fms.s1        FR_r = FR_NormX,f1,FR_Xt // r = x - [x]
588      cmp.eq        p8,p0 = 2,GR_RqDeg
589}
590{ .mfi
591      ldfpd         FR_An,FR_OvfBound = [GR_ad_Co]
592(p14) fms.s1        FR_r = FR_Xt,f1,FR_NormX // r = |x - [x]|
593      cmp.eq        p9,p0 = 4,GR_RqDeg
594};;
595.pred.rel "mutex",p8,p9
596{ .mfi
597(p14) ldfpd         FR_S21,FR_S00 = [GR_ad_SinE],16
598(p8)  fma.s1        FR_Rq0 = FR_2,f1,FR_1mX // (3-x)
599      tbit.z        p0,p6 = GR_Sig,0
600}
601{ .mfi
602(p14) ldfpd         FR_S10,FR_S20 = [GR_ad_SinO],16
603(p9)  fma.s1        FR_Rq0 = FR_2,FR_2,FR_1mX // (5-x)
604      cmp.eq        p10,p0 = 6,GR_RqDeg
605};;
606{ .mfi
607(p14) getf.s        GR_Arg = f8
608(p14) fcmp.eq.unc.s1 p13,p0 = FR_NormX,FR_Xt
609(p14) mov           GR_ZeroResBound = 0xC22C // -43
610}
611{ .mfi
612(p14) ldfe          FR_InvAn = [GR_ad_SinE]
613(p10) fma.s1        FR_Rq0 = FR_6,f1,FR_1mX // (7-x)
614      cmp.eq        p7,p0 = r0,GR_RqDeg
615};;
616{ .mfi
617(p14) cmp.ge.unc    p11,p0 = GR_SignExp,GR_NanBound
618      fma.s1        FR_Rq2 = FR_Rq2,FR_6,FR_X2mX // (x-3)*(x-4)
619(p14) shl           GR_ZeroResBound = GR_ZeroResBound,16
620}
621{ .mfb
622(p14) mov           GR_OvfNzBound = 0x802
623(p14) fms.s1        FR_rs = FR_rs,f1,FR_NormX // rs = round(x) - x
624      // jump if  x < -2^23 i.e. x is negative integer
625(p11) br.cond.spnt  tgammaf_singularity
626};;
627{ .mfi
628      nop.m         0
629(p7)  fma.s1        FR_Rq1 = f0,f0,f1
630(p14) shl           GR_OvfNzBound = GR_OvfNzBound,20
631}
632{ .mfb
633      nop.m         0
634      fma.s1        FR_Rq3 = FR_Rq3,FR_10,FR_X2mX // (x-5)*(x-6)
635      // jump if x is negative integer such that -2^23 < x < 0
636(p13) br.cond.spnt  tgammaf_singularity
637};;
638{ .mfi
639      nop.m         0
640      fma.s1        FR_C01 = FR_C01,f1,FR_r
641(p14) mov           GR_ExpOf05 = 0xFFFE
642}
643{ .mfi
644(p14) cmp.eq.unc    p7,p0 = GR_Arg,GR_OvfNzBound
645      fma.s1        FR_C11 = FR_C11,f1,FR_r
646(p14) cmp.ltu.unc   p11,p0 = GR_Arg,GR_OvfNzBound
647};;
648{ .mfi
649      nop.m         0
650      fma.s1        FR_C21 = FR_C21,f1,FR_r
651(p14) cmp.ltu.unc   p9,p0 = GR_ZeroResBound,GR_Arg
652}
653{ .mfb
654      nop.m         0
655      fma.s1        FR_C31 = FR_C31,f1,FR_r
656      // jump if argument is close to 0 negative
657(p11) br.cond.spnt  tgammaf_overflow
658};;
659{ .mfi
660      nop.m         0
661      fma.s1        FR_C41 = FR_C41,f1,FR_r
662      nop.i         0
663}
664{ .mfb
665      nop.m         0
666      fma.s1        FR_C51 = FR_C51,f1,FR_r
667      // jump if x is negative noninteger such that -2^23 < x < -43
668(p9)  br.cond.spnt  tgammaf_underflow
669};;
670{ .mfi
671      nop.m         0
672(p14) fma.s1        FR_rs2 = FR_rs,FR_rs,f0
673      nop.i         0
674}
675{ .mfb
676      nop.m         0
677(p14) fma.s1        FR_S01 = FR_rs,FR_rs,FR_S01
678      // jump if argument is 0x80200000
679(p7)  br.cond.spnt  tgammaf_overflow_near0_bound
680};;
681{ .mfi
682      nop.m         0
683(p6)  fnma.s1       FR_Rq1 = FR_Rq1,FR_Rq0,f0
684      nop.i         0
685}
686{ .mfi
687      nop.m         0
688(p10) fma.s1        FR_Rq2 = FR_Rq2,FR_Rq3,f0
689      and           GR_Sig = 0x7,GR_Sig
690};;
691{ .mfi
692      nop.m         0
693      fma.s1        FR_C01 = FR_C01,FR_r,FR_C00
694      nop.i         0
695}
696{ .mfi
697      nop.m         0
698      fma.s1        FR_C11 = FR_C11,FR_r,FR_C10
699      cmp.eq        p6,p7 = r0,GR_Sig // p6 if |x| from one of base intervals
700};;
701{ .mfi
702      nop.m         0
703      fma.s1        FR_C21 = FR_C21,FR_r,FR_C20
704      nop.i         0
705}
706{ .mfi
707      nop.m         0
708      fma.s1        FR_C31 = FR_C31,FR_r,FR_C30
709(p7)  cmp.lt.unc    p9,p0 = 2,GR_RqDeg
710};;
711{ .mfi
712      nop.m         0
713(p14) fma.s1        FR_S11 = FR_rs,FR_rs,FR_S11
714      nop.i         0
715}
716{ .mfi
717      nop.m         0
718(p14) fma.s1        FR_S21 = FR_rs,FR_rs,FR_S21
719      nop.i         0
720};;
721{ .mfi
722      nop.m         0
723      fma.s1        FR_C41 = FR_C41,FR_r,FR_C40
724      nop.i         0
725}
726{ .mfi
727      nop.m         0
728(p14) fma.s1        FR_S32 = FR_rs2,FR_S32,FR_S31
729      nop.i         0
730};;
731{ .mfi
732      nop.m         0
733(p9)  fma.s1        FR_Rq1 = FR_Rq1,FR_Rq2,f0
734      nop.i         0
735}
736{ .mfi
737      nop.m         0
738      fma.s1        FR_C51 = FR_C51,FR_r,FR_C50
739      nop.i         0
740};;
741{ .mfi
742(p14) getf.exp      GR_SignExp = FR_rs
743      fma.s1        FR_C01 = FR_C01,FR_C11,f0
744      nop.i         0
745}
746{ .mfi
747      nop.m         0
748(p14) fma.s1        FR_S01 = FR_S01,FR_rs2,FR_S00
749      nop.i         0
750};;
751{ .mfi
752      nop.m         0
753      fma.s1        FR_C21 = FR_C21,FR_C31,f0
754      nop.i         0
755}
756{ .mfi
757      nop.m         0
758      // NR-iteration
759(p14) fnma.s1       FR_InvNormX1 = FR_Rcp0,FR_NormX,f1
760      nop.i         0
761};;
762{ .mfi
763      nop.m         0
764(p14) fma.s1        FR_S11 = FR_S11,FR_rs2,FR_S10
765(p14) tbit.z.unc    p11,p12 = GR_SignExp,17
766}
767{ .mfi
768      nop.m         0
769(p14) fma.s1        FR_S21 = FR_S21,FR_rs2,FR_S20
770      nop.i         0
771};;
772{ .mfi
773      nop.m         0
774(p15) fcmp.lt.unc.s1 p0,p13 = FR_NormX,FR_OvfBound
775      nop.i         0
776}
777{ .mfi
778      nop.m         0
779(p14) fma.s1        FR_S32 = FR_rs2,FR_S32,f0
780      nop.i         0
781};;
782{ .mfi
783      nop.m         0
784      fma.s1        FR_C41 = FR_C41,FR_C51,f0
785      nop.i         0
786}
787{ .mfi
788      nop.m         0
789(p7)  fma.s1        FR_An = FR_Rq1,FR_An,f0
790      nop.i         0
791};;
792{ .mfb
793      nop.m         0
794      nop.f         0
795      // jump if x > 35.04010009765625
796(p13) br.cond.spnt  tgammaf_overflow
797};;
798{ .mfi
799      nop.m         0
800      // NR-iteration
801(p14) fma.s1        FR_InvNormX1 = FR_Rcp0,FR_InvNormX1,FR_Rcp0
802      nop.i         0
803};;
804{ .mfi
805      nop.m         0
806(p14) fma.s1        FR_S01 = FR_S01,FR_S11,f0
807      nop.i         0
808};;
809{ .mfi
810      nop.m         0
811(p14) fma.s1        FR_S21 = FR_S21,FR_S32,f0
812      nop.i         0
813};;
814{ .mfi
815(p14) getf.exp      GR_SignExp = FR_NormX
816      fma.s1        FR_C01 = FR_C01,FR_C21,f0
817      nop.i         0
818}
819{ .mfi
820      nop.m         0
821      fma.s1        FR_C41 = FR_C41,FR_An,f0
822(p14) mov           GR_ExpOf1 = 0x2FFFF
823};;
824{ .mfi
825      nop.m         0
826      // NR-iteration
827(p14) fnma.s1       FR_InvNormX2 = FR_InvNormX1,FR_NormX,f1
828      nop.i         0
829};;
830.pred.rel "mutex",p11,p12
831{ .mfi
832      nop.m         0
833(p12) fnma.s1       FR_S01 = FR_S01,FR_S21,f0
834      nop.i         0
835}
836{ .mfi
837      nop.m         0
838(p11) fma.s1        FR_S01 = FR_S01,FR_S21,f0
839      nop.i         0
840};;
841
842{ .mfi
843      nop.m         0
844(p14) fma.s1        FR_GAMMA = FR_C01,FR_C41,f0
845(p14) tbit.z.unc    p6,p7 = GR_Sig,0
846}
847{ .mfb
848      nop.m         0
849(p15) fma.s.s0      f8 = FR_C01,FR_C41,f0
850(p15) br.ret.spnt   b0 // exit for positives
851};;
852.pred.rel "mutex",p11,p12
853{ .mfi
854      nop.m         0
855(p12) fms.s1        FR_S01 = FR_rs,FR_S01,FR_rs
856      nop.i         0
857}
858{ .mfi
859      nop.m         0
860(p11) fma.s1        FR_S01 = FR_rs,FR_S01,FR_rs
861      nop.i         0
862};;
863{ .mfi
864      nop.m         0
865      // NR-iteration
866      fma.s1        FR_InvNormX2 = FR_InvNormX1,FR_InvNormX2,FR_InvNormX1
867      cmp.eq        p10,p0 = 0x23,GR_Offs
868};;
869.pred.rel "mutex",p6,p7
870{ .mfi
871      nop.m         0
872(p6)  fma.s1        FR_GAMMA = FR_S01,FR_GAMMA,f0
873      cmp.gtu       p8,p0 = GR_SignExp,GR_ExpOf1
874}
875{ .mfi
876      nop.m         0
877(p7)  fnma.s1       FR_GAMMA = FR_S01,FR_GAMMA,f0
878      cmp.eq        p9,p0 = GR_SignExp,GR_ExpOf1
879};;
880{ .mfi
881      nop.m         0
882      // NR-iteration
883      fnma.s1       FR_InvNormX1 = FR_InvNormX2,FR_NormX,f1
884      nop.i         0
885}
886{ .mfi
887      nop.m         0
888(p10) fma.s1        FR_InvNormX2 = FR_InvNormX2,FR_InvAn,f0
889      nop.i         0
890};;
891{ .mfi
892      nop.m         0
893      frcpa.s1      FR_Rcp0,p0 = f1,FR_GAMMA
894      nop.i         0
895};;
896{ .mfi
897      nop.m         0
898      fms.s1        FR_Multplr = FR_NormX,f1,f1 // x - 1
899      nop.i         0
900};;
901{ .mfi
902      nop.m         0
903      // NR-iteration
904      fnma.s1       FR_Rcp1 = FR_Rcp0,FR_GAMMA,f1
905      nop.i         0
906};;
907.pred.rel "mutex",p8,p9
908{ .mfi
909      nop.m         0
910      // 1/x or 1/(An*x)
911(p8)  fma.s1        FR_Multplr = FR_InvNormX2,FR_InvNormX1,FR_InvNormX2
912      nop.i         0
913}
914{ .mfi
915      nop.m         0
916(p9)  fma.s1        FR_Multplr = f1,f1,f0
917      nop.i         0
918};;
919{ .mfi
920      nop.m         0
921      // NR-iteration
922      fma.s1        FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0
923      nop.i         0
924};;
925{ .mfi
926      nop.m         0
927      // NR-iteration
928      fnma.s1       FR_Rcp2 = FR_Rcp1,FR_GAMMA,f1
929      nop.i         0
930}
931{ .mfi
932      nop.m         0
933      // NR-iteration
934      fma.s1        FR_Rcp1 = FR_Rcp1,FR_Multplr,f0
935      nop.i         0
936};;
937{ .mfb
938      nop.m         0
939      fma.s.s0      f8 = FR_Rcp1,FR_Rcp2,FR_Rcp1
940      br.ret.sptk   b0
941};;
942
943// here if 0 < x < 1
944//--------------------------------------------------------------------
945.align 32
946tgammaf_from_0_to_1:
947{ .mfi
948      cmp.lt        p7,p0 = GR_Arg,GR_ExpOf05
949      // NR-iteration
950      fnma.s1       FR_Rcp1 = FR_Rcp0,FR_NormX,f1
951      cmp.eq        p8,p0 = GR_Arg,GR_ExpOf05
952}
953{ .mfi
954      cmp.gt        p9,p0 = GR_Arg,GR_ExpOf05
955      fma.s1        FR_r = f0,f0,FR_NormX // reduced arg for (0;1)
956      mov           GR_ExpOf025 = 0x7FA
957};;
958{ .mfi
959      getf.s        GR_ArgNz = f8
960      fma.d.s0      FR_X = f0,f0,f8 // set deno flag
961      shl           GR_OvfNzBound = GR_OvfNzBound,20
962}
963{ .mfi
964(p8)  mov           GR_Tbl12Offs = 0x80 // 0.5 <= x < 0.75
965      nop.f         0
966(p7)  cmp.ge.unc    p6,p0 = GR_Arg,GR_ExpOf025
967};;
968.pred.rel "mutex",p6,p9
969{ .mfi
970(p9)  mov           GR_Tbl12Offs = 0xC0 // 0.75 <= x < 1
971      nop.f         0
972(p6)  mov           GR_Tbl12Offs = 0x40 // 0.25 <= x < 0.5
973}
974{ .mfi
975      add           GR_ad_Ce = 0x2C0,GR_ad_Data
976      nop.f         0
977      add           GR_ad_Co = 0x2A0,GR_ad_Data
978};;
979{ .mfi
980      add           GR_ad_Co = GR_ad_Co,GR_Tbl12Offs
981      nop.f         0
982      cmp.lt        p12,p0 = GR_ArgNz,GR_OvfNzBound
983}
984{ .mib
985      add           GR_ad_Ce = GR_ad_Ce,GR_Tbl12Offs
986      cmp.eq        p7,p0 = GR_ArgNz,GR_OvfNzBound
987      // jump if argument is 0x00200000
988(p7)  br.cond.spnt  tgammaf_overflow_near0_bound
989};;
990{ .mmb
991      ldfpd         FR_A7,FR_A6 = [GR_ad_Co],16
992      ldfpd         FR_A5,FR_A4 = [GR_ad_Ce],16
993      // jump if argument is close to 0 positive
994(p12) br.cond.spnt  tgammaf_overflow
995};;
996{ .mfi
997      ldfpd         FR_A3,FR_A2 = [GR_ad_Co],16
998      // NR-iteration
999      fma.s1        FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0
1000      nop.i         0
1001}
1002{ .mfb
1003      ldfpd         FR_A1,FR_A0 = [GR_ad_Ce],16
1004      nop.f         0
1005      br.cond.sptk  tgamma_from_0_to_2
1006};;
1007
1008// here if 1 < x < 2
1009//--------------------------------------------------------------------
1010.align 32
1011tgammaf_from_1_to_2:
1012{ .mfi
1013      add           GR_ad_Co = 0x2A0,GR_ad_Data
1014      fms.s1        FR_r = f0,f0,FR_1mX
1015      shr           GR_TblOffs = GR_Arg,47
1016}
1017{ .mfi
1018      add           GR_ad_Ce = 0x2C0,GR_ad_Data
1019      nop.f         0
1020      mov           GR_TblOffsMask = 0x18
1021};;
1022{ .mfi
1023      nop.m         0
1024      nop.f         0
1025      and           GR_TblOffs = GR_TblOffs,GR_TblOffsMask
1026};;
1027{ .mfi
1028      shladd        GR_ad_Co = GR_TblOffs,3,GR_ad_Co
1029      nop.f         0
1030      nop.i         0
1031}
1032{ .mfi
1033      shladd        GR_ad_Ce = GR_TblOffs,3,GR_ad_Ce
1034      nop.f         0
1035      cmp.eq        p6,p7 = 8,GR_TblOffs
1036};;
1037{ .mmi
1038      ldfpd         FR_A7,FR_A6 = [GR_ad_Co],16
1039      ldfpd         FR_A5,FR_A4 = [GR_ad_Ce],16
1040      nop.i         0
1041};;
1042{ .mmi
1043      ldfpd         FR_A3,FR_A2 = [GR_ad_Co],16
1044      ldfpd         FR_A1,FR_A0 = [GR_ad_Ce],16
1045      nop.i         0
1046};;
1047
1048.align 32
1049tgamma_from_0_to_2:
1050{ .mfi
1051      nop.m         0
1052(p6)  fms.s1        FR_r = FR_r,f1,FR_LocalMin
1053      nop.i         0
1054};;
1055{ .mfi
1056      nop.m         0
1057      // NR-iteration
1058(p10) fnma.s1       FR_Rcp2 = FR_Rcp1,FR_NormX,f1
1059      nop.i         0
1060};;
1061{ .mfi
1062      nop.m         0
1063      fms.s1        FR_r2 = FR_r,FR_r,f0
1064      nop.i         0
1065};;
1066{ .mfi
1067      nop.m         0
1068      fma.s1        FR_A7 = FR_A7,FR_r,FR_A6
1069      nop.i         0
1070}
1071{ .mfi
1072      nop.m         0
1073      fma.s1        FR_A5 = FR_A5,FR_r,FR_A4
1074      nop.i         0
1075};;
1076{ .mfi
1077      nop.m         0
1078      fma.s1        FR_A3 = FR_A3,FR_r,FR_A2
1079      nop.i         0
1080}
1081{ .mfi
1082      nop.m         0
1083      fma.s1        FR_A1 = FR_A1,FR_r,FR_A0
1084      nop.i         0
1085};;
1086{ .mfi
1087      nop.m         0
1088      // NR-iteration
1089(p10) fma.s1        FR_Rcp2 = FR_Rcp1,FR_Rcp2,FR_Rcp1
1090      nop.i         0
1091};;
1092{ .mfi
1093      nop.m         0
1094      fma.s1        FR_A7 = FR_A7,FR_r2,FR_A5
1095      nop.i         0
1096}
1097{ .mfi
1098      nop.m         0
1099      fma.s1        FR_r4 = FR_r2,FR_r2,f0
1100      nop.i         0
1101};;
1102{ .mfi
1103      nop.m         0
1104      fma.s1        FR_A3 = FR_A3,FR_r2,FR_A1
1105      nop.i         0
1106};;
1107{ .mfi
1108      nop.m         0
1109(p10) fma.s1        FR_GAMMA = FR_A7,FR_r4,FR_A3
1110      nop.i         0
1111}
1112{ .mfi
1113      nop.m         0
1114(p11) fma.s.s0      f8 = FR_A7,FR_r4,FR_A3
1115      nop.i         0
1116};;
1117{ .mfb
1118      nop.m         0
1119(p10) fma.s.s0      f8 = FR_GAMMA,FR_Rcp2,f0
1120      br.ret.sptk   b0
1121};;
1122
1123
1124// overflow
1125//--------------------------------------------------------------------
1126.align 32
1127tgammaf_overflow_near0_bound:
1128.pred.rel "mutex",p14,p15
1129{ .mfi
1130	  mov           GR_fpsr = ar.fpsr
1131	  nop.f         0
1132(p15) mov           r8 = 0x7f8
1133}
1134{ .mfi
1135      nop.m         0
1136      nop.f         0
1137(p14) mov           r8 = 0xff8
1138};;
1139{ .mfi
1140	  nop.m         0
1141	  nop.f         0
1142	  shl           r8 = r8,20
1143};;
1144{ .mfi
1145      sub           r8 = r8,r0,1
1146      nop.f         0
1147	  extr.u        GR_fpsr = GR_fpsr,10,2 // rounding mode
1148};;
1149.pred.rel "mutex",p14,p15
1150{ .mfi
1151      // set p8 to 0 in case of overflow and to 1 otherwise
1152	  // for negative arg:
1153	  //    no overflow if rounding mode either Z or +Inf, i.e.
1154	  //    GR_fpsr > 1
1155(p14) cmp.lt        p8,p0 = 1,GR_fpsr
1156      nop.f         0
1157	  // for positive arg:
1158	  //    no overflow if rounding mode either Z or -Inf, i.e.
1159	  //    (GR_fpsr & 1) == 0
1160(p15) tbit.z        p0,p8 = GR_fpsr,0
1161};;
1162{ .mib
1163(p8)  setf.s        f8 = r8 // set result to 0x7f7fffff without
1164                            // OVERFLOW flag raising
1165      nop.i         0
1166(p8)  br.ret.sptk   b0
1167};;
1168
1169.align 32
1170tgammaf_overflow:
1171{ .mfi
1172      nop.m         0
1173      nop.f         0
1174      mov           r8 = 0x1FFFE
1175};;
1176{ .mfi
1177      setf.exp      f9 = r8
1178      fmerge.s      FR_X = f8,f8
1179      nop.i         0
1180};;
1181.pred.rel "mutex",p14,p15
1182{ .mfi
1183      nop.m         0
1184(p14) fnma.s.s0     f8 = f9,f9,f0 // set I,O and -INF result
1185      mov           GR_TAG = 261 // overflow
1186}
1187{ .mfb
1188      nop.m         0
1189(p15) fma.s.s0      f8 = f9,f9,f0 // set I,O and +INF result
1190      br.cond.sptk  tgammaf_libm_err
1191};;
1192
1193// x is negative integer or +/-0
1194//--------------------------------------------------------------------
1195.align 32
1196tgammaf_singularity:
1197{ .mfi
1198      nop.m         0
1199      fmerge.s      FR_X = f8,f8
1200      mov           GR_TAG = 262 // negative
1201}
1202{ .mfb
1203      nop.m         0
1204      frcpa.s0      f8,p0 = f0,f0
1205      br.cond.sptk  tgammaf_libm_err
1206};;
1207// x is negative noninteger with big absolute value
1208//--------------------------------------------------------------------
1209.align 32
1210tgammaf_underflow:
1211{ .mfi
1212      mov           r8 = 0x00001
1213      nop.f         0
1214      tbit.z        p6,p7 = GR_Sig,0
1215};;
1216{ .mfi
1217      setf.exp      f9 = r8
1218      nop.f         0
1219      nop.i         0
1220};;
1221.pred.rel "mutex",p6,p7
1222{ .mfi
1223      nop.m         0
1224(p6)  fms.s.s0      f8 = f9,f9,f9
1225      nop.i         0
1226}
1227{ .mfb
1228      nop.m         0
1229(p7)  fma.s.s0      f8 = f9,f9,f9
1230      br.ret.sptk   b0
1231};;
1232
1233//  x for natval, nan, +/-inf or +/-0
1234//--------------------------------------------------------------------
1235.align 32
1236tgammaf_spec_args:
1237{ .mfi
1238      nop.m         0
1239      fclass.m      p6,p0 =  f8,0x1E1 // Test x for natval, nan, +inf
1240      nop.i         0
1241};;
1242{ .mfi
1243      nop.m         0
1244      fclass.m      p7,p8 =  f8,0x7 // +/-0
1245      nop.i         0
1246};;
1247{ .mfi
1248      nop.m         0
1249      fmerge.s      FR_X = f8,f8
1250      nop.i         0
1251}
1252{ .mfb
1253      nop.m         0
1254(p6)  fma.s.s0      f8 = f8,f1,f8
1255(p6)  br.ret.spnt   b0
1256};;
1257.pred.rel "mutex",p7,p8
1258{ .mfi
1259(p7)  mov           GR_TAG = 262 // negative
1260(p7)  frcpa.s0      f8,p0 = f1,f8
1261      nop.i         0
1262}
1263{ .mib
1264      nop.m         0
1265      nop.i         0
1266(p8)  br.cond.spnt  tgammaf_singularity
1267};;
1268
1269.align 32
1270tgammaf_libm_err:
1271{ .mfi
1272      alloc        r32 = ar.pfs,1,4,4,0
1273      nop.f        0
1274      mov          GR_Parameter_TAG = GR_TAG
1275};;
1276
1277GLOBAL_LIBM_END(tgammaf)
1278libm_alias_float_other (tgamma, tgamma)
1279
1280LOCAL_LIBM_ENTRY(__libm_error_region)
1281.prologue
1282{ .mfi
1283        add   GR_Parameter_Y=-32,sp             // Parameter 2 value
1284        nop.f 0
1285.save   ar.pfs,GR_SAVE_PFS
1286        mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
1287}
1288{ .mfi
1289.fframe 64
1290        add sp=-64,sp                           // Create new stack
1291        nop.f 0
1292        mov GR_SAVE_GP=gp                       // Save gp
1293};;
1294{ .mmi
1295        stfs [GR_Parameter_Y] = FR_Y,16         // STORE Parameter 2 on stack
1296        add GR_Parameter_X = 16,sp              // Parameter 1 address
1297.save   b0, GR_SAVE_B0
1298        mov GR_SAVE_B0=b0                       // Save b0
1299};;
1300.body
1301{ .mib
1302        stfs [GR_Parameter_X] = FR_X           // STORE Parameter 1 on stack
1303        add   GR_Parameter_RESULT = 0,GR_Parameter_Y  // Parameter 3 address
1304        nop.b 0
1305}
1306{ .mib
1307        stfs [GR_Parameter_Y] = FR_RESULT      // STORE Parameter 3 on stack
1308        add   GR_Parameter_Y = -16,GR_Parameter_Y
1309        br.call.sptk b0=__libm_error_support# // Call error handling function
1310};;
1311{ .mmi
1312        nop.m 0
1313        nop.m 0
1314        add   GR_Parameter_RESULT = 48,sp
1315};;
1316{ .mmi
1317        ldfs  f8 = [GR_Parameter_RESULT]       // Get return result off stack
1318.restore sp
1319        add   sp = 64,sp                       // Restore stack pointer
1320        mov   b0 = GR_SAVE_B0                  // Restore return address
1321};;
1322{ .mib
1323        mov   gp = GR_SAVE_GP                  // Restore gp
1324        mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
1325        br.ret.sptk     b0                     // Return
1326};;
1327
1328LOCAL_LIBM_END(__libm_error_region)
1329.type   __libm_error_support#,@function
1330.global __libm_error_support#
1331