1.file "acoshl.s"
2
3
4// Copyright (c) 2000 - 2005, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39//*********************************************************************
40//
41// History:
42// 10/01/01 Initial version
43// 10/10/01 Performance inproved
44// 12/11/01 Changed huges_logp to not be global
45// 01/02/02 Corrected .restore syntax
46// 05/20/02 Cleaned up namespace and sf0 syntax
47// 08/14/02 Changed mli templates to mlx
48// 02/06/03 Reorganized data tables
49// 03/31/05 Reformatted delimiters between data tables
50//
51//*********************************************************************
52//
53// API
54//==============================================================
55// long double acoshl(long double);
56//
57// Overview of operation
58//==============================================================
59//
60// There are 6 paths:
61// 1. x = 1
62//    Return acoshl(x) = 0;
63//
64// 2. x < 1
65//    Return acoshl(x) = Nan (Domain error, error handler call with tag 135);
66//
67// 3. x = [S,Q]Nan or +INF
68//    Return acoshl(x) = x + x;
69//
70// 4. 'Near 1': 1 < x < 1+1/8
71//    Return acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
72//                   where y = 1, P(y)/Q(y) - rational approximation
73//
74// 5. 'Huges': x > 0.5*2^64
75//    Return acoshl(x) = (logl(2*x-1));
76//
77// 6. 'Main path': 1+1/8 < x < 0.5*2^64
78//    b_hi + b_lo = x + sqrt(x^2 - 1);
79//    acoshl(x) = logl_special(b_hi, b_lo);
80//
81// Algorithm description
82//==============================================================
83//
84// I. Near 1 path algorithm
85// **************************************************************
86// The formula is acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
87//                 where y = 1, P(y)/Q(y) - rational approximation
88//
89// 1) y = x - 1, y2 = 2 * y
90//
91// 2) Compute in parallel sqrtl(2*y) and P(y)/Q(y)
92//    a) sqrtl computation method described below (main path algorithm, item 2))
93//       As result we obtain (gg+gl) - multiprecision result
94//       as pair of double extended values
95//    b) P(y) and Q(y) calculated without any extra precision manipulations
96//    c) P/Q division:
97//       y = frcpa(Q)         initial approximation of 1/Q
98//       z = P*y              initial approximation of P/Q
99//
100//       e = 1 - b*y
101//       e2 = e + e^2
102//       e1 = e^2
103//       y1 = y + y*e2 = y + y*(e+e^2)
104//
105//       e3 = e + e1^2
106//       y2 = y + y1*e3 = y + y*(e+e^2+..+e^6)
107//
108//       r = P - Q*z
109//       e = 1 - Q*y2
110//       xx = z + r*y2         high part of a/b
111//
112//       y3 = y2 + y2*e4
113//       r1 = P  - Q*xx
114//       xl = r1*y3            low part of a/b
115//
116// 3) res = sqrt(2*y) - sqrt(2*y)*(P(y)/Q(y)) =
117//        = (gg+gl) - (gg + gl)*(xx+xl);
118//
119//    a) hh = gg*xx; hl = gg*xl; lh = gl*xx; ll = gl*xl;
120//    b) res = ((((gl + ll) + lh) + hl) + hh) + gg;
121//       (exactly in this order)
122//
123// II. Main path algorithm
124// ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
125// **********************************************************************
126//
127// There are 3 parts of x+sqrt(x^2-1) computation:
128//
129//  1) m2 = (m2_hi+m2_lo) = x^2-1 obtaining
130//     ------------------------------------
131//     m2_hi = x2_hi - 1, where x2_hi = x * x;
132//     m2_lo = x2_lo + p1_lo, where
133//                            x2_lo = FMS(x*x-x2_hi),
134//                            p1_lo = (1 + m2_hi) - x2_hi;
135//
136//  2) g = (g_hi+g_lo) = sqrt(m2) = sqrt(m2_hi+m2_lo)
137//     ----------------------------------------------
138//     r = invsqrt(m2_hi) (8-bit reciprocal square root approximation);
139//     g = m2_hi * r (first 8 bit-approximation of sqrt);
140//
141//     h = 0.5 * r;
142//     e = 0.5 - g * h;
143//     g = g * e + g (second 16 bit-approximation of sqrt);
144//
145//     h = h * e + h;
146//     e = 0.5 - g * h;
147//     g = g * e + g (third 32 bit-approximation of sqrt);
148//
149//     h = h * e + h;
150//     e = 0.5 - g * h;
151//     g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
152//
153//     Remainder computation:
154//     h = h * e + h;
155//     d = (m2_hi - g_hi * g_hi) + m2_lo;
156//     g_lo = d * h;
157//
158//  3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2-1)
159//     -------------------------------------------------------------------
160//     b_hi = (g_hi + x) + gl;
161//     b_lo = (x - b_hi) + g_hi + gl;
162//
163//  Now we pass b presented as sum b_hi + b_lo to special version
164//  of logl function which accept a pair of arguments as
165//  mutiprecision value.
166//
167//  Special log algorithm overview
168//  ================================
169//   Here we use a table lookup method. The basic idea is that in
170//   order to compute logl(Arg) for an argument Arg in [1,2),
171//   we construct a value G such that G*Arg is close to 1 and that
172//   logl(1/G) is obtainable easily from a table of values calculated
173//   beforehand. Thus
174//
175//      logl(Arg) = logl(1/G) + logl((G*Arg - 1))
176//
177//   Because |G*Arg - 1| is small, the second term on the right hand
178//   side can be approximated by a short polynomial. We elaborate
179//   this method in four steps.
180//
181//   Step 0: Initialization
182//
183//   We need to calculate logl( X+1 ). Obtain N, S_hi such that
184//
185//      X = 2^N * ( S_hi + S_lo )   exactly
186//
187//   where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
188//   that |S_lo| <= ulp(S_hi).
189//
190//   For the special version of logl: S_lo = b_lo
191//   !-----------------------------------------------!
192//
193//   Step 1: Argument Reduction
194//
195//   Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
196//
197//      G := G_1 * G_2 * G_3
198//      r := (G * S_hi - 1) + G * S_lo
199//
200//   These G_j's have the property that the product is exactly
201//   representable and that |r| < 2^(-12) as a result.
202//
203//   Step 2: Approximation
204//
205//   logl(1 + r) is approximated by a short polynomial poly(r).
206//
207//   Step 3: Reconstruction
208//
209//   Finally, logl( X ) = logl( X+1 ) is given by
210//
211//   logl( X )   =   logl( 2^N * (S_hi + S_lo) )
212//                 ~=~  N*logl(2) + logl(1/G) + logl(1 + r)
213//                 ~=~  N*logl(2) + logl(1/G) + poly(r).
214//
215//   For detailed description see logl or log1pl function, regular path.
216//
217// Registers used
218//==============================================================
219// Floating Point registers used:
220// f8, input
221// f32 -> f95 (64 registers)
222
223// General registers used:
224// r32 -> r67 (36 registers)
225
226// Predicate registers used:
227// p7 -> p11
228// p7  for 'NaNs, Inf' path
229// p8  for 'near 1' path
230// p9  for 'huges' path
231// p10 for x = 1
232// p11 for x < 1
233//
234//*********************************************************************
235// IEEE Special Conditions:
236//
237//    acoshl(+inf)  = +inf
238//    acoshl(-inf) = QNaN
239//    acoshl(1)    = 0
240//    acoshl(x<1)  = QNaN
241//    acoshl(SNaN) = QNaN
242//    acoshl(QNaN) = QNaN
243//
244
245// Data tables
246//==============================================================
247
248RODATA
249.align 64
250
251// Near 1 path rational approximation coefficients
252LOCAL_OBJECT_START(Poly_P)
253data8 0xB0978143F695D40F, 0x3FF1  // .84205539791447100108478906277453574946e-4
254data8 0xB9800D841A8CAD29, 0x3FF6  // .28305085180397409672905983082168721069e-2
255data8 0xC889F455758C1725, 0x3FF9  // .24479844297887530847660233111267222945e-1
256data8 0x9BE1DFF006F45F12, 0x3FFB  // .76114415657565879842941751209926938306e-1
257data8 0x9E34AF4D372861E0, 0x3FFB  // .77248925727776366270605984806795850504e-1
258data8 0xF3DC502AEE14C4AE, 0x3FA6  // .3077953476682583606615438814166025592e-26
259LOCAL_OBJECT_END(Poly_P)
260
261//
262LOCAL_OBJECT_START(Poly_Q)
263data8 0xF76E3FD3C7680357, 0x3FF1  // .11798413344703621030038719253730708525e-3
264data8 0xD107D2E7273263AE, 0x3FF7  // .63791065024872525660782716786703188820e-2
265data8 0xB609BE5CDE206AEF, 0x3FFB  // .88885771950814004376363335821980079985e-1
266data8 0xF7DEACAC28067C8A, 0x3FFD  // .48412074662702495416825113623936037072302
267data8 0x8F9BE5890CEC7E38, 0x3FFF  // 1.1219450873557867470217771071068369729526
268data8 0xED4F06F3D2BC92D1, 0x3FFE  // .92698710873331639524734537734804056798748
269LOCAL_OBJECT_END(Poly_Q)
270
271// Q coeffs
272LOCAL_OBJECT_START(Constants_Q)
273data4  0x00000000,0xB1721800,0x00003FFE,0x00000000
274data4  0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
275data4  0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
276data4  0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
277data4  0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
278data4  0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
279LOCAL_OBJECT_END(Constants_Q)
280
281// Z1 - 16 bit fixed
282LOCAL_OBJECT_START(Constants_Z_1)
283data4  0x00008000
284data4  0x00007879
285data4  0x000071C8
286data4  0x00006BCB
287data4  0x00006667
288data4  0x00006187
289data4  0x00005D18
290data4  0x0000590C
291data4  0x00005556
292data4  0x000051EC
293data4  0x00004EC5
294data4  0x00004BDB
295data4  0x00004925
296data4  0x0000469F
297data4  0x00004445
298data4  0x00004211
299LOCAL_OBJECT_END(Constants_Z_1)
300
301// G1 and H1 - IEEE single and h1 - IEEE double
302LOCAL_OBJECT_START(Constants_G_H_h1)
303data4  0x3F800000,0x00000000
304data8  0x0000000000000000
305data4  0x3F70F0F0,0x3D785196
306data8  0x3DA163A6617D741C
307data4  0x3F638E38,0x3DF13843
308data8  0x3E2C55E6CBD3D5BB
309data4  0x3F579430,0x3E2FF9A0
310data8  0xBE3EB0BFD86EA5E7
311data4  0x3F4CCCC8,0x3E647FD6
312data8  0x3E2E6A8C86B12760
313data4  0x3F430C30,0x3E8B3AE7
314data8  0x3E47574C5C0739BA
315data4  0x3F3A2E88,0x3EA30C68
316data8  0x3E20E30F13E8AF2F
317data4  0x3F321640,0x3EB9CEC8
318data8  0xBE42885BF2C630BD
319data4  0x3F2AAAA8,0x3ECF9927
320data8  0x3E497F3497E577C6
321data4  0x3F23D708,0x3EE47FC5
322data8  0x3E3E6A6EA6B0A5AB
323data4  0x3F1D89D8,0x3EF8947D
324data8  0xBDF43E3CD328D9BE
325data4  0x3F17B420,0x3F05F3A1
326data8  0x3E4094C30ADB090A
327data4  0x3F124920,0x3F0F4303
328data8  0xBE28FBB2FC1FE510
329data4  0x3F0D3DC8,0x3F183EBF
330data8  0x3E3A789510FDE3FA
331data4  0x3F088888,0x3F20EC80
332data8  0x3E508CE57CC8C98F
333data4  0x3F042108,0x3F29516A
334data8  0xBE534874A223106C
335LOCAL_OBJECT_END(Constants_G_H_h1)
336
337// Z2 - 16 bit fixed
338LOCAL_OBJECT_START(Constants_Z_2)
339data4  0x00008000
340data4  0x00007F81
341data4  0x00007F02
342data4  0x00007E85
343data4  0x00007E08
344data4  0x00007D8D
345data4  0x00007D12
346data4  0x00007C98
347data4  0x00007C20
348data4  0x00007BA8
349data4  0x00007B31
350data4  0x00007ABB
351data4  0x00007A45
352data4  0x000079D1
353data4  0x0000795D
354data4  0x000078EB
355LOCAL_OBJECT_END(Constants_Z_2)
356
357// G2 and H2 - IEEE single and h2 - IEEE double
358LOCAL_OBJECT_START(Constants_G_H_h2)
359data4  0x3F800000,0x00000000
360data8  0x0000000000000000
361data4  0x3F7F00F8,0x3B7F875D
362data8  0x3DB5A11622C42273
363data4  0x3F7E03F8,0x3BFF015B
364data8  0x3DE620CF21F86ED3
365data4  0x3F7D08E0,0x3C3EE393
366data8  0xBDAFA07E484F34ED
367data4  0x3F7C0FC0,0x3C7E0586
368data8  0xBDFE07F03860BCF6
369data4  0x3F7B1880,0x3C9E75D2
370data8  0x3DEA370FA78093D6
371data4  0x3F7A2328,0x3CBDC97A
372data8  0x3DFF579172A753D0
373data4  0x3F792FB0,0x3CDCFE47
374data8  0x3DFEBE6CA7EF896B
375data4  0x3F783E08,0x3CFC15D0
376data8  0x3E0CF156409ECB43
377data4  0x3F774E38,0x3D0D874D
378data8  0xBE0B6F97FFEF71DF
379data4  0x3F766038,0x3D1CF49B
380data8  0xBE0804835D59EEE8
381data4  0x3F757400,0x3D2C531D
382data8  0x3E1F91E9A9192A74
383data4  0x3F748988,0x3D3BA322
384data8  0xBE139A06BF72A8CD
385data4  0x3F73A0D0,0x3D4AE46F
386data8  0x3E1D9202F8FBA6CF
387data4  0x3F72B9D0,0x3D5A1756
388data8  0xBE1DCCC4BA796223
389data4  0x3F71D488,0x3D693B9D
390data8  0xBE049391B6B7C239
391LOCAL_OBJECT_END(Constants_G_H_h2)
392
393// G3 and H3 - IEEE single and h3 - IEEE double
394LOCAL_OBJECT_START(Constants_G_H_h3)
395data4  0x3F7FFC00,0x38800100
396data8  0x3D355595562224CD
397data4  0x3F7FF400,0x39400480
398data8  0x3D8200A206136FF6
399data4  0x3F7FEC00,0x39A00640
400data8  0x3DA4D68DE8DE9AF0
401data4  0x3F7FE400,0x39E00C41
402data8  0xBD8B4291B10238DC
403data4  0x3F7FDC00,0x3A100A21
404data8  0xBD89CCB83B1952CA
405data4  0x3F7FD400,0x3A300F22
406data8  0xBDB107071DC46826
407data4  0x3F7FCC08,0x3A4FF51C
408data8  0x3DB6FCB9F43307DB
409data4  0x3F7FC408,0x3A6FFC1D
410data8  0xBD9B7C4762DC7872
411data4  0x3F7FBC10,0x3A87F20B
412data8  0xBDC3725E3F89154A
413data4  0x3F7FB410,0x3A97F68B
414data8  0xBD93519D62B9D392
415data4  0x3F7FAC18,0x3AA7EB86
416data8  0x3DC184410F21BD9D
417data4  0x3F7FA420,0x3AB7E101
418data8  0xBDA64B952245E0A6
419data4  0x3F7F9C20,0x3AC7E701
420data8  0x3DB4B0ECAABB34B8
421data4  0x3F7F9428,0x3AD7DD7B
422data8  0x3D9923376DC40A7E
423data4  0x3F7F8C30,0x3AE7D474
424data8  0x3DC6E17B4F2083D3
425data4  0x3F7F8438,0x3AF7CBED
426data8  0x3DAE314B811D4394
427data4  0x3F7F7C40,0x3B03E1F3
428data8  0xBDD46F21B08F2DB1
429data4  0x3F7F7448,0x3B0BDE2F
430data8  0xBDDC30A46D34522B
431data4  0x3F7F6C50,0x3B13DAAA
432data8  0x3DCB0070B1F473DB
433data4  0x3F7F6458,0x3B1BD766
434data8  0xBDD65DDC6AD282FD
435data4  0x3F7F5C68,0x3B23CC5C
436data8  0xBDCDAB83F153761A
437data4  0x3F7F5470,0x3B2BC997
438data8  0xBDDADA40341D0F8F
439data4  0x3F7F4C78,0x3B33C711
440data8  0x3DCD1BD7EBC394E8
441data4  0x3F7F4488,0x3B3BBCC6
442data8  0xBDC3532B52E3E695
443data4  0x3F7F3C90,0x3B43BAC0
444data8  0xBDA3961EE846B3DE
445data4  0x3F7F34A0,0x3B4BB0F4
446data8  0xBDDADF06785778D4
447data4  0x3F7F2CA8,0x3B53AF6D
448data8  0x3DCC3ED1E55CE212
449data4  0x3F7F24B8,0x3B5BA620
450data8  0xBDBA31039E382C15
451data4  0x3F7F1CC8,0x3B639D12
452data8  0x3D635A0B5C5AF197
453data4  0x3F7F14D8,0x3B6B9444
454data8  0xBDDCCB1971D34EFC
455data4  0x3F7F0CE0,0x3B7393BC
456data8  0x3DC7450252CD7ADA
457data4  0x3F7F04F0,0x3B7B8B6D
458data8  0xBDB68F177D7F2A42
459LOCAL_OBJECT_END(Constants_G_H_h3)
460
461// Assembly macros
462//==============================================================
463
464// Floating Point Registers
465
466FR_Arg          = f8
467FR_Res          = f8
468
469
470FR_PP0          = f32
471FR_PP1          = f33
472FR_PP2          = f34
473FR_PP3          = f35
474FR_PP4          = f36
475FR_PP5          = f37
476FR_QQ0          = f38
477FR_QQ1          = f39
478FR_QQ2          = f40
479FR_QQ3          = f41
480FR_QQ4          = f42
481FR_QQ5          = f43
482
483FR_Q1           = f44
484FR_Q2           = f45
485FR_Q3           = f46
486FR_Q4           = f47
487
488FR_Half         = f48
489FR_Two          = f49
490
491FR_log2_hi      = f50
492FR_log2_lo      = f51
493
494
495FR_X2           = f52
496FR_M2           = f53
497FR_M2L          = f54
498FR_Rcp          = f55
499FR_GG           = f56
500FR_HH           = f57
501FR_EE           = f58
502FR_DD           = f59
503FR_GL           = f60
504FR_Tmp          = f61
505
506
507FR_XM1          = f62
508FR_2XM1         = f63
509FR_XM12         = f64
510
511
512
513    // Special logl registers
514FR_XLog_Hi      = f65
515FR_XLog_Lo      = f66
516
517FR_Y_hi         = f67
518FR_Y_lo         = f68
519
520FR_S_hi         = f69
521FR_S_lo         = f70
522
523FR_poly_lo      = f71
524FR_poly_hi      = f72
525
526FR_G            = f73
527FR_H            = f74
528FR_h            = f75
529
530FR_G2           = f76
531FR_H2           = f77
532FR_h2           = f78
533
534FR_r            = f79
535FR_rsq          = f80
536FR_rcub         = f81
537
538FR_float_N      = f82
539
540FR_G3           = f83
541FR_H3           = f84
542FR_h3           = f85
543
544FR_2_to_minus_N = f86
545
546
547   // Near 1  registers
548FR_PP           = f65
549FR_QQ           = f66
550
551
552FR_PV6          = f69
553FR_PV4          = f70
554FR_PV3          = f71
555FR_PV2          = f72
556
557FR_QV6          = f73
558FR_QV4          = f74
559FR_QV3          = f75
560FR_QV2          = f76
561
562FR_Y0           = f77
563FR_Q0           = f78
564FR_E0           = f79
565FR_E2           = f80
566FR_E1           = f81
567FR_Y1           = f82
568FR_E3           = f83
569FR_Y2           = f84
570FR_R0           = f85
571FR_E4           = f86
572FR_Y3           = f87
573FR_R1           = f88
574FR_X_Hi         = f89
575FR_X_lo         = f90
576
577FR_HH           = f91
578FR_LL           = f92
579FR_HL           = f93
580FR_LH           = f94
581
582
583
584	// Error handler registers
585FR_Arg_X        = f95
586FR_Arg_Y        = f0
587
588
589// General Purpose Registers
590
591    // General prolog registers
592GR_PFS          = r32
593GR_OneP125      = r33
594GR_TwoP63       = r34
595GR_Arg          = r35
596GR_Half         = r36
597
598    // Near 1 path registers
599GR_Poly_P       = r37
600GR_Poly_Q       = r38
601
602    // Special logl registers
603GR_Index1       = r39
604GR_Index2       = r40
605GR_signif       = r41
606GR_X_0          = r42
607GR_X_1          = r43
608GR_X_2          = r44
609GR_minus_N      = r45
610GR_Z_1          = r46
611GR_Z_2          = r47
612GR_N            = r48
613GR_Bias         = r49
614GR_M            = r50
615GR_Index3       = r51
616GR_exp_2tom80   = r52
617GR_exp_mask     = r53
618GR_exp_2tom7    = r54
619GR_ad_ln10      = r55
620GR_ad_tbl_1     = r56
621GR_ad_tbl_2     = r57
622GR_ad_tbl_3     = r58
623GR_ad_q         = r59
624GR_ad_z_1       = r60
625GR_ad_z_2       = r61
626GR_ad_z_3       = r62
627
628//
629// Added for unwind support
630//
631GR_SAVE_PFS         = r32
632GR_SAVE_B0          = r33
633GR_SAVE_GP          = r34
634
635GR_Parameter_X      = r64
636GR_Parameter_Y      = r65
637GR_Parameter_RESULT = r66
638GR_Parameter_TAG    = r67
639
640
641
642.section .text
643GLOBAL_LIBM_ENTRY(acoshl)
644
645{ .mfi
646      alloc      GR_PFS       = ar.pfs,0,32,4,0     // Local frame allocation
647      fcmp.lt.s1 p11, p0      = FR_Arg, f1          // if arg is less than 1
648      mov	     GR_Half      = 0xfffe              // 0.5's exp
649}
650{ .mfi
651      addl       GR_Poly_Q    = @ltoff(Poly_Q), gp  // Address of Q-coeff table
652      fma.s1     FR_X2        = FR_Arg, FR_Arg, f0  // Obtain x^2
653      addl       GR_Poly_P    = @ltoff(Poly_P), gp  // Address of P-coeff table
654};;
655
656{ .mfi
657      getf.d     GR_Arg       = FR_Arg        // get argument as double (int64)
658      fma.s0        FR_Two       = f1, f1, f1    // construct 2.0
659      addl       GR_ad_z_1    = @ltoff(Constants_Z_1#),gp // logl tables
660}
661{ .mlx
662      nop.m 0
663      movl       GR_TwoP63    = 0x43E8000000000000 // 0.5*2^63 (huge arguments)
664};;
665
666{ .mfi
667      ld8        GR_Poly_P    = [GR_Poly_P]  // get actual P-coeff table address
668      fcmp.eq.s1 p10, p0      = FR_Arg, f1   // if arg == 1 (return 0)
669      nop.i 0
670}
671{ .mlx
672      ld8        GR_Poly_Q    = [GR_Poly_Q]  // get actual Q-coeff table address
673      movl       GR_OneP125   = 0x3FF2000000000000  // 1.125 (near 1 path bound)
674};;
675
676{ .mfi
677      ld8        GR_ad_z_1    = [GR_ad_z_1]      // Get pointer to Constants_Z_1
678      fclass.m   p7,p0        = FR_Arg, 0xe3       // if arg NaN inf
679      cmp.le     p9, p0       = GR_TwoP63, GR_Arg // if arg > 0.5*2^63 ('huges')
680}
681{ .mfb
682      cmp.ge     p8, p0       = GR_OneP125, GR_Arg // if arg<1.125 -near 1 path
683	  fms.s1     FR_XM1       = FR_Arg, f1, f1     // X0 = X-1 (for near 1 path)
684(p11) br.cond.spnt acoshl_lt_pone                  // error branch (less than 1)
685};;
686
687{ .mmi
688      setf.exp	FR_Half       = GR_Half     // construct 0.5
689(p9)  setf.s    FR_XLog_Lo    = r0          // Low of logl arg=0 (Huges path)
690      mov        GR_exp_mask  = 0x1FFFF         // Create exponent mask
691};;
692
693{ .mmf
694(p8)  ldfe       FR_PP5       = [GR_Poly_P],16     // Load P5
695(p8)  ldfe       FR_QQ5       = [GR_Poly_Q],16     // Load Q5
696      fms.s1     FR_M2        = FR_X2, f1, f1      // m2 = x^2 - 1
697};;
698
699{ .mfi
700(p8)  ldfe       FR_QQ4       = [GR_Poly_Q],16         // Load Q4
701      fms.s1     FR_M2L       = FR_Arg, FR_Arg, FR_X2  // low part of
702	                                                   //    m2 = fma(X*X - m2)
703      add        GR_ad_tbl_1  = 0x040, GR_ad_z_1    // Point to Constants_G_H_h1
704}
705{ .mfb
706(p8)  ldfe       FR_PP4       = [GR_Poly_P],16     // Load P4
707(p7)  fma.s0     FR_Res       = FR_Arg,f1,FR_Arg   // r = a + a (Nan, Inf)
708(p7)  br.ret.spnt b0                               // return    (Nan, Inf)
709};;
710
711{ .mfi
712(p8)  ldfe       FR_PP3       = [GR_Poly_P],16      // Load P3
713      nop.f 0
714      add        GR_ad_q      = -0x60, GR_ad_z_1    // Point to Constants_P
715}
716{ .mfb
717(p8)  ldfe       FR_QQ3       = [GR_Poly_Q],16      // Load Q3
718(p9)  fms.s1 FR_XLog_Hi       = FR_Two, FR_Arg, f1  // Hi  of log arg = 2*X-1
719(p9)  br.cond.spnt huges_logl                       // special version of log
720}
721;;
722
723{ .mfi
724(p8)  ldfe       FR_PP2       = [GR_Poly_P],16       // Load P2
725(p8)  fma.s1     FR_2XM1      = FR_Two, FR_XM1, f0   // 2X0 = 2 * X0
726      add        GR_ad_z_2    = 0x140, GR_ad_z_1    // Point to Constants_Z_2
727}
728{ .mfb
729(p8)  ldfe       FR_QQ2       = [GR_Poly_Q],16       // Load Q2
730(p10) fma.s0   FR_Res         = f0,f1,f0             // r = 0  (arg = 1)
731(p10) br.ret.spnt b0                                 // return (arg = 1)
732};;
733
734{ .mmi
735(p8)  ldfe       FR_PP1       = [GR_Poly_P],16       // Load P1
736(p8)  ldfe       FR_QQ1       = [GR_Poly_Q],16       // Load Q1
737      add        GR_ad_tbl_2  = 0x180, GR_ad_z_1    // Point to Constants_G_H_h2
738}
739;;
740
741{ .mfi
742(p8)  ldfe       FR_PP0       = [GR_Poly_P]          // Load P0
743      fma.s1     FR_Tmp       = f1, f1, FR_M2        // Tmp = 1 + m2
744      add        GR_ad_tbl_3  = 0x280, GR_ad_z_1    // Point to Constants_G_H_h3
745}
746{ .mfb
747(p8)  ldfe       FR_QQ0       = [GR_Poly_Q]
748      nop.f 0
749(p8)  br.cond.spnt near_1                            // near 1 path
750};;
751{ .mfi
752      ldfe       FR_log2_hi   = [GR_ad_q],16      // Load log2_hi
753      nop.f 0
754      mov        GR_Bias      = 0x0FFFF                  // Create exponent bias
755};;
756{ .mfi
757      nop.m 0
758      frsqrta.s1 FR_Rcp, p0   = FR_M2           // Rcp = 1/m2 reciprocal appr.
759      nop.i 0
760};;
761
762{ .mfi
763      ldfe       FR_log2_lo   = [GR_ad_q],16     // Load log2_lo
764      fms.s1     FR_Tmp       = FR_X2, f1, FR_Tmp  // Tmp =  x^2 - Tmp
765      nop.i 0
766};;
767
768{ .mfi
769      ldfe       FR_Q4        = [GR_ad_q],16          // Load Q4
770      fma.s1     FR_GG        = FR_Rcp, FR_M2, f0   // g = Rcp * m2
771                                               // 8 bit Newton Raphson iteration
772      nop.i 0
773}
774{ .mfi
775      nop.m 0
776      fma.s1     FR_HH 		  = FR_Half, FR_Rcp, f0      // h = 0.5 * Rcp
777      nop.i 0
778};;
779{ .mfi
780      ldfe       FR_Q3        = [GR_ad_q],16   // Load Q3
781      fnma.s1    FR_EE        = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
782      nop.i 0
783}
784{ .mfi
785      nop.m 0
786      fma.s1     FR_M2L       = FR_Tmp, f1, FR_M2L  // low part of m2 = Tmp+m2l
787      nop.i 0
788};;
789
790{ .mfi
791      ldfe       FR_Q2        = [GR_ad_q],16      // Load Q2
792      fma.s1     FR_GG        = FR_GG, FR_EE, FR_GG     // g = g * e + g
793                                              // 16 bit Newton Raphson iteration
794      nop.i 0
795}
796{ .mfi
797      nop.m 0
798      fma.s1     FR_HH        = FR_HH, FR_EE, FR_HH     // h = h * e + h
799      nop.i 0
800};;
801
802{ .mfi
803      ldfe       FR_Q1        = [GR_ad_q]                // Load Q1
804      fnma.s1    FR_EE        = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
805      nop.i 0
806};;
807{ .mfi
808      nop.m 0
809      fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
810                                              // 32 bit Newton Raphson iteration
811      nop.i 0
812}
813{ .mfi
814      nop.m 0
815      fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
816      nop.i 0
817};;
818
819{ .mfi
820      nop.m 0
821      fnma.s1   FR_EE         = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
822      nop.i 0
823};;
824
825{ .mfi
826      nop.m 0
827      fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
828                                              // 64 bit Newton Raphson iteration
829      nop.i 0
830}
831{ .mfi
832      nop.m 0
833      fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
834      nop.i 0
835};;
836
837{ .mfi
838      nop.m 0
839      fnma.s1   FR_DD         = FR_GG, FR_GG, FR_M2  // Remainder d = g * g - p2
840      nop.i 0
841}
842{ .mfi
843      nop.m 0
844      fma.s1    FR_XLog_Hi     = FR_Arg, f1, FR_GG // bh = z + gh
845      nop.i 0
846};;
847
848{ .mfi
849      nop.m 0
850      fma.s1    FR_DD         = FR_DD, f1, FR_M2L       // add p2l: d = d + p2l
851      nop.i 0
852};;
853
854{ .mfi
855      getf.sig  GR_signif     = FR_XLog_Hi     // Get significand of x+1
856      nop.f 0
857      mov       GR_exp_2tom7  = 0x0fff8        // Exponent of 2^-7
858};;
859
860{ .mfi
861      nop.m 0
862      fma.s1    FR_GL         = FR_DD, FR_HH, f0        // gl = d * h
863      extr.u    GR_Index1     = GR_signif, 59, 4    // Get high 4 bits of signif
864}
865{ .mfi
866      nop.m 0
867      fma.s1    FR_XLog_Hi     = FR_DD,  FR_HH, FR_XLog_Hi // bh = bh + gl
868      nop.i 0
869};;
870
871
872
873{ .mmi
874      shladd    GR_ad_z_1     = GR_Index1, 2, GR_ad_z_1  // Point to Z_1
875      shladd    GR_ad_tbl_1   = GR_Index1, 4, GR_ad_tbl_1  // Point to G_1
876      extr.u    GR_X_0        = GR_signif, 49, 15 // Get high 15 bits of signif.
877};;
878
879{ .mmi
880      ld4       GR_Z_1        = [GR_ad_z_1]    // Load Z_1
881      nop.m 0
882      nop.i 0
883};;
884
885{ .mmi
886      ldfps     FR_G, FR_H    = [GR_ad_tbl_1],8     // Load G_1, H_1
887      nop.m 0
888      nop.i 0
889};;
890
891{ .mfi
892      nop.m 0
893      fms.s1    FR_XLog_Lo     = FR_Arg,  f1,   FR_XLog_Hi // bl = x - bh
894      pmpyshr2.u GR_X_1       = GR_X_0,GR_Z_1,15  // Get bits 30-15 of X_0 * Z_1
895};;
896
897// WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
898// "DEAD" ZONE!
899
900{ .mfi
901      nop.m 0
902      nop.f 0
903      nop.i 0
904};;
905
906{ .mfi
907      nop.m 0
908      fmerge.se FR_S_hi       =  f1,FR_XLog_Hi            // Form |x+1|
909      nop.i 0
910};;
911
912
913{ .mmi
914      getf.exp  GR_N          =  FR_XLog_Hi    // Get N = exponent of x+1
915      ldfd      FR_h          = [GR_ad_tbl_1]        // Load h_1
916      nop.i 0
917};;
918
919{ .mfi
920      nop.m 0
921      nop.f 0
922      extr.u    GR_Index2     = GR_X_1, 6, 4      // Extract bits 6-9 of X_1
923};;
924
925{ .mfi
926      shladd    GR_ad_tbl_2   = GR_Index2, 4, GR_ad_tbl_2  // Point to G_2
927      fma.s1    FR_XLog_Lo    = FR_XLog_Lo, f1, FR_GG // bl = bl + gg
928      mov       GR_exp_2tom80 = 0x0ffaf           // Exponent of 2^-80
929}
930{ .mfi
931      shladd    GR_ad_z_2     = GR_Index2, 2, GR_ad_z_2  // Point to Z_2
932      nop.f 0
933      sub       GR_N          = GR_N, GR_Bias // sub bias from exp
934};;
935
936{ .mmi
937      ldfps     FR_G2, FR_H2  = [GR_ad_tbl_2],8       // Load G_2, H_2
938      ld4       GR_Z_2        = [GR_ad_z_2]                // Load Z_2
939      sub       GR_minus_N    = GR_Bias, GR_N         // Form exponent of 2^(-N)
940};;
941
942{ .mmi
943      ldfd      FR_h2         = [GR_ad_tbl_2]             // Load h_2
944      nop.m 0
945      nop.i 0
946};;
947
948{ .mmi
949      setf.sig  FR_float_N    = GR_N        // Put integer N into rightmost sign
950      setf.exp  FR_2_to_minus_N = GR_minus_N   // Form 2^(-N)
951      pmpyshr2.u GR_X_2       = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
952};;
953
954// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
955// BECAUSE OF POSSIBLE 10 CLOCKS STALL!
956// (Just nops added - nothing to do here)
957
958{ .mfi
959      nop.m 0
960      fma.s1    FR_XLog_Lo     = FR_XLog_Lo, f1, FR_GL // bl = bl + gl
961      nop.i 0
962};;
963{ .mfi
964      nop.m 0
965      nop.f 0
966      nop.i 0
967};;
968{ .mfi
969      nop.m 0
970      nop.f 0
971      nop.i 0
972};;
973
974{ .mfi
975      nop.m 0
976      nop.f 0
977      extr.u    GR_Index3     = GR_X_2, 1, 5         // Extract bits 1-5 of X_2
978};;
979
980{ .mfi
981      shladd    GR_ad_tbl_3   = GR_Index3, 4, GR_ad_tbl_3  // Point to G_3
982      nop.f 0
983      nop.i 0
984};;
985
986{ .mfi
987      ldfps     FR_G3, FR_H3  = [GR_ad_tbl_3],8   // Load G_3, H_3
988      nop.f 0
989      nop.i 0
990};;
991
992{ .mfi
993      ldfd      FR_h3         = [GR_ad_tbl_3]            // Load h_3
994	  fcvt.xf   FR_float_N    = FR_float_N
995      nop.i 0
996};;
997
998{ .mfi
999      nop.m 0
1000      fmpy.s1   FR_G          = FR_G, FR_G2              // G = G_1 * G_2
1001      nop.i 0
1002}
1003{ .mfi
1004      nop.m 0
1005      fadd.s1   FR_H          = FR_H, FR_H2              // H = H_1 + H_2
1006      nop.i 0
1007};;
1008
1009{ .mfi
1010      nop.m 0
1011      fadd.s1   FR_h          = FR_h, FR_h2              // h = h_1 + h_2
1012      nop.i 0
1013}
1014{ .mfi
1015      nop.m 0
1016      fma.s1    FR_S_lo     = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^(-N)
1017      nop.i 0
1018};;
1019
1020{ .mfi
1021      nop.m 0
1022      fmpy.s1   FR_G          = FR_G, FR_G3             // G = (G_1 * G_2) * G_3
1023      nop.i 0
1024}
1025{ .mfi
1026      nop.m 0
1027      fadd.s1   FR_H          = FR_H, FR_H3             // H = (H_1 + H_2) + H_3
1028      nop.i 0
1029};;
1030
1031{ .mfi
1032      nop.m 0
1033      fadd.s1   FR_h          = FR_h, FR_h3             // h = (h_1 + h_2) + h_3
1034      nop.i 0
1035};;
1036
1037{ .mfi
1038      nop.m 0
1039      fms.s1    FR_r          = FR_G, FR_S_hi, f1           // r = G * S_hi - 1
1040      nop.i 0
1041}
1042{ .mfi
1043      nop.m 0
1044      fma.s1    FR_Y_hi       = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1045      nop.i 0
1046};;
1047
1048{ .mfi
1049      nop.m 0
1050      fma.s1    FR_h          = FR_float_N, FR_log2_lo, FR_h  // h=N*log2_lo+h
1051      nop.i 0
1052}
1053{ .mfi
1054      nop.m 0
1055      fma.s1    FR_r          = FR_G, FR_S_lo, FR_r  // r=G*S_lo+(G*S_hi-1)
1056      nop.i 0
1057};;
1058
1059{ .mfi
1060      nop.m 0
1061      fma.s1    FR_poly_lo    = FR_r, FR_Q4, FR_Q3      // poly_lo = r * Q4 + Q3
1062      nop.i 0
1063}
1064{ .mfi
1065      nop.m 0
1066      fmpy.s1   FR_rsq        = FR_r, FR_r              // rsq = r * r
1067      nop.i 0
1068};;
1069
1070{ .mfi
1071      nop.m 0
1072      fma.s1    FR_poly_lo    = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1073      nop.i 0
1074}
1075{ .mfi
1076      nop.m 0
1077      fma.s1    FR_rcub       = FR_rsq, FR_r, f0        // rcub = r^3
1078      nop.i 0
1079};;
1080
1081{ .mfi
1082      nop.m 0
1083      fma.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1084      nop.i 0
1085};;
1086
1087{ .mfi
1088      nop.m 0
1089      fma.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1090      nop.i 0
1091};;
1092
1093{ .mfi
1094      nop.m 0
1095      fadd.s0   FR_Y_lo       = FR_poly_hi, FR_poly_lo
1096	                                                     // Y_lo=poly_hi+poly_lo
1097      nop.i 0
1098};;
1099
1100{ .mfb
1101      nop.m 0
1102      fadd.s0   FR_Res        = FR_Y_lo,FR_Y_hi    // Result=Y_lo+Y_hi
1103      br.ret.sptk   b0                         // Common exit for 2^-7 < x < inf
1104};;
1105
1106
1107huges_logl:
1108{ .mmi
1109      getf.sig   GR_signif    = FR_XLog_Hi               // Get significand of x+1
1110      mov        GR_exp_2tom7 = 0x0fff8            // Exponent of 2^-7
1111      nop.i 0
1112};;
1113
1114{ .mfi
1115      add        GR_ad_tbl_1  = 0x040, GR_ad_z_1    // Point to Constants_G_H_h1
1116      nop.f 0
1117      add        GR_ad_q      = -0x60, GR_ad_z_1    // Point to Constants_P
1118}
1119{ .mfi
1120      add        GR_ad_z_2    = 0x140, GR_ad_z_1    // Point to Constants_Z_2
1121      nop.f 0
1122      add        GR_ad_tbl_2  = 0x180, GR_ad_z_1    // Point to Constants_G_H_h2
1123};;
1124
1125{ .mfi
1126      add        GR_ad_tbl_3  = 0x280, GR_ad_z_1    // Point to Constants_G_H_h3
1127      nop.f 0
1128      extr.u     GR_Index1    = GR_signif, 59, 4    // Get high 4 bits of signif
1129};;
1130
1131{ .mfi
1132      shladd     GR_ad_z_1    = GR_Index1, 2, GR_ad_z_1  // Point to Z_1
1133      nop.f 0
1134      extr.u     GR_X_0       = GR_signif, 49, 15 // Get high 15 bits of signif.
1135};;
1136
1137{ .mfi
1138      ld4        GR_Z_1       = [GR_ad_z_1]     // Load Z_1
1139      nop.f 0
1140      mov        GR_exp_mask  = 0x1FFFF         // Create exponent mask
1141}
1142{ .mfi
1143      shladd     GR_ad_tbl_1  = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
1144      nop.f 0
1145      mov        GR_Bias      = 0x0FFFF                  // Create exponent bias
1146};;
1147
1148{ .mfi
1149      ldfps      FR_G, FR_H   = [GR_ad_tbl_1],8     // Load G_1, H_1
1150      fmerge.se  FR_S_hi      =  f1,FR_XLog_Hi            // Form |x|
1151      nop.i 0
1152};;
1153
1154{ .mmi
1155      getf.exp   GR_N         =  FR_XLog_Hi         // Get N = exponent of x+1
1156      ldfd       FR_h         = [GR_ad_tbl_1] // Load h_1
1157      nop.i 0
1158};;
1159
1160{ .mfi
1161      ldfe       FR_log2_hi   = [GR_ad_q],16      // Load log2_hi
1162      nop.f 0
1163      pmpyshr2.u GR_X_1       = GR_X_0,GR_Z_1,15  // Get bits 30-15 of X_0 * Z_1
1164};;
1165
1166{ .mmi
1167      ldfe       FR_log2_lo   = [GR_ad_q],16     // Load log2_lo
1168      sub        GR_N         = GR_N, GR_Bias
1169      mov        GR_exp_2tom80 = 0x0ffaf         // Exponent of 2^-80
1170};;
1171
1172{ .mfi
1173      ldfe       FR_Q4        = [GR_ad_q],16          // Load Q4
1174      nop.f 0
1175      sub        GR_minus_N   = GR_Bias, GR_N         // Form exponent of 2^(-N)
1176};;
1177
1178{ .mmf
1179      ldfe       FR_Q3        = [GR_ad_q],16   // Load Q3
1180      setf.sig   FR_float_N   = GR_N        // Put integer N into rightmost sign
1181      nop.f 0
1182};;
1183
1184{ .mmi
1185      ldfe       FR_Q2        = [GR_ad_q],16      // Load Q2
1186	  nop.m 0
1187      extr.u     GR_Index2    = GR_X_1, 6, 4      // Extract bits 6-9 of X_1
1188};;
1189
1190{ .mmi
1191      ldfe       FR_Q1        = [GR_ad_q]                // Load Q1
1192      shladd     GR_ad_z_2    = GR_Index2, 2, GR_ad_z_2  // Point to Z_2
1193      nop.i 0
1194};;
1195
1196{ .mmi
1197      ld4        GR_Z_2       = [GR_ad_z_2]                // Load Z_2
1198      shladd     GR_ad_tbl_2  = GR_Index2, 4, GR_ad_tbl_2  // Point to G_2
1199	  nop.i 0
1200};;
1201
1202{ .mmi
1203      ldfps      FR_G2, FR_H2 = [GR_ad_tbl_2],8       // Load G_2, H_2
1204      nop.m 0
1205      nop.i 0
1206};;
1207
1208{ .mmf
1209      ldfd       FR_h2        = [GR_ad_tbl_2]         // Load h_2
1210      setf.exp FR_2_to_minus_N = GR_minus_N   // Form 2^(-N)
1211      nop.f 0
1212};;
1213
1214{ .mfi
1215      nop.m 0
1216      nop.f 0
1217      pmpyshr2.u GR_X_2       = GR_X_1,GR_Z_2,15   // Get bits 30-15 of X_1*Z_2
1218};;
1219
1220// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
1221// BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1222// (Just nops added - nothing to do here)
1223
1224{ .mfi
1225      nop.m 0
1226      nop.f 0
1227      nop.i 0
1228};;
1229
1230{ .mfi
1231      nop.m 0
1232      nop.f 0
1233      nop.i 0
1234};;
1235
1236{ .mfi
1237      nop.m 0
1238      nop.f 0
1239      nop.i 0
1240};;
1241
1242{ .mfi
1243      nop.m 0
1244      nop.f 0
1245      extr.u     GR_Index3    = GR_X_2, 1, 5          // Extract bits 1-5 of X_2
1246};;
1247
1248{ .mfi
1249      shladd     GR_ad_tbl_3  = GR_Index3, 4, GR_ad_tbl_3  // Point to G_3
1250	  fcvt.xf    FR_float_N   = FR_float_N
1251      nop.i 0
1252};;
1253
1254{ .mfi
1255      ldfps      FR_G3, FR_H3 = [GR_ad_tbl_3],8   // Load G_3, H_3
1256      nop.f 0
1257      nop.i 0
1258};;
1259
1260{ .mfi
1261      ldfd       FR_h3        = [GR_ad_tbl_3]            // Load h_3
1262      fmpy.s1    FR_G         = FR_G, FR_G2              // G = G_1 * G_2
1263      nop.i 0
1264}
1265{ .mfi
1266      nop.m 0
1267      fadd.s1    FR_H         = FR_H, FR_H2              // H = H_1 + H_2
1268      nop.i 0
1269};;
1270
1271{ .mmf
1272      nop.m 0
1273      nop.m 0
1274      fadd.s1    FR_h         = FR_h, FR_h2              // h = h_1 + h_2
1275};;
1276
1277{ .mfi
1278      nop.m 0
1279      fmpy.s1    FR_G         = FR_G, FR_G3              // G = (G_1 * G_2)*G_3
1280      nop.i 0
1281}
1282{ .mfi
1283      nop.m 0
1284      fadd.s1    FR_H         = FR_H, FR_H3              // H = (H_1 + H_2)+H_3
1285      nop.i 0
1286};;
1287
1288{ .mfi
1289      nop.m 0
1290      fadd.s1    FR_h         = FR_h, FR_h3            // h = (h_1 + h_2) + h_3
1291      nop.i 0
1292};;
1293
1294{ .mfi
1295      nop.m 0
1296      fms.s1     FR_r         = FR_G, FR_S_hi, f1           // r = G * S_hi - 1
1297      nop.i 0
1298}
1299{ .mfi
1300      nop.m 0
1301      fma.s1     FR_Y_hi      = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1302      nop.i 0
1303};;
1304
1305{ .mfi
1306      nop.m 0
1307      fma.s1     FR_h         = FR_float_N, FR_log2_lo, FR_h  // h = N*log2_lo+h
1308      nop.i 0
1309};;
1310
1311{ .mfi
1312      nop.m 0
1313      fma.s1     FR_poly_lo   = FR_r, FR_Q4, FR_Q3      // poly_lo = r * Q4 + Q3
1314      nop.i 0
1315}
1316{ .mfi
1317      nop.m 0
1318      fmpy.s1    FR_rsq       = FR_r, FR_r              // rsq = r * r
1319      nop.i 0
1320};;
1321
1322{ .mfi
1323      nop.m 0
1324      fma.s1     FR_poly_lo   = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1325      nop.i 0
1326}
1327{ .mfi
1328      nop.m 0
1329      fma.s1     FR_rcub      = FR_rsq, FR_r, f0        // rcub = r^3
1330      nop.i 0
1331};;
1332
1333{ .mfi
1334      nop.m 0
1335      fma.s1     FR_poly_hi   = FR_Q1, FR_rsq, FR_r     // poly_hi = Q1*rsq + r
1336      nop.i 0
1337};;
1338
1339{ .mfi
1340      nop.m 0
1341      fma.s1     FR_poly_lo   = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1342      nop.i 0
1343};;
1344{ .mfi
1345      nop.m 0
1346      fadd.s0    FR_Y_lo      = FR_poly_hi, FR_poly_lo  // Y_lo=poly_hi+poly_lo
1347      nop.i 0
1348};;
1349{ .mfb
1350      nop.m 0
1351      fadd.s0    FR_Res       = FR_Y_lo,FR_Y_hi    // Result=Y_lo+Y_hi
1352      br.ret.sptk   b0                        // Common exit
1353};;
1354
1355
1356// NEAR ONE INTERVAL
1357near_1:
1358{ .mfi
1359      nop.m 0
1360      frsqrta.s1 FR_Rcp, p0   = FR_2XM1 // Rcp = 1/x reciprocal appr. &SQRT&
1361      nop.i 0
1362};;
1363
1364{ .mfi
1365      nop.m 0
1366      fma.s1     FR_PV6       = FR_PP5, FR_XM1, FR_PP4 // pv6 = P5*xm1+P4 $POLY$
1367      nop.i 0
1368}
1369{ .mfi
1370      nop.m 0
1371	  fma.s1     FR_QV6       = FR_QQ5, FR_XM1, FR_QQ4 // qv6 = Q5*xm1+Q4 $POLY$
1372      nop.i 0
1373};;
1374
1375{ .mfi
1376      nop.m 0
1377	  fma.s1     FR_PV4       = FR_PP3, FR_XM1, FR_PP2 // pv4 = P3*xm1+P2 $POLY$
1378      nop.i 0
1379}
1380{ .mfi
1381      nop.m 0
1382	  fma.s1     FR_QV4       = FR_QQ3, FR_XM1, FR_QQ2 // qv4 = Q3*xm1+Q2 $POLY$
1383      nop.i 0
1384};;
1385
1386{ .mfi
1387      nop.m 0
1388	  fma.s1     FR_XM12      = FR_XM1, FR_XM1, f0 // xm1^2 = xm1 * xm1 $POLY$
1389      nop.i 0
1390};;
1391
1392{ .mfi
1393      nop.m 0
1394	  fma.s1     FR_PV2       = FR_PP1, FR_XM1, FR_PP0 // pv2 = P1*xm1+P0 $POLY$
1395      nop.i 0
1396}
1397{ .mfi
1398      nop.m 0
1399	  fma.s1     FR_QV2       = FR_QQ1, FR_XM1, FR_QQ0 // qv2 = Q1*xm1+Q0 $POLY$
1400      nop.i 0
1401};;
1402
1403{ .mfi
1404      nop.m 0
1405      fma.s1     FR_GG        = FR_Rcp, FR_2XM1, f0 // g = Rcp * x &SQRT&
1406      nop.i 0
1407}
1408{ .mfi
1409      nop.m 0
1410      fma.s1     FR_HH        = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp &SQRT&
1411      nop.i 0
1412};;
1413
1414
1415{ .mfi
1416      nop.m 0
1417	  fma.s1    FR_PV3       = FR_XM12, FR_PV6, FR_PV4//pv3=pv6*xm1^2+pv4 $POLY$
1418      nop.i 0
1419}
1420{ .mfi
1421      nop.m 0
1422	  fma.s1    FR_QV3       = FR_XM12, FR_QV6, FR_QV4//qv3=qv6*xm1^2+qv4 $POLY$
1423      nop.i 0
1424};;
1425
1426
1427{ .mfi
1428      nop.m 0
1429      fnma.s1   FR_EE        = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h &SQRT&
1430      nop.i 0
1431};;
1432
1433{ .mfi
1434      nop.m 0
1435	  fma.s1    FR_PP        = FR_XM12, FR_PV3, FR_PV2 //pp=pv3*xm1^2+pv2 $POLY$
1436      nop.i 0
1437}
1438{ .mfi
1439      nop.m 0
1440	  fma.s1    FR_QQ        = FR_XM12, FR_QV3, FR_QV2 //qq=qv3*xm1^2+qv2 $POLY$
1441      nop.i 0
1442};;
1443
1444{ .mfi
1445      nop.m 0
1446      fma.s1     FR_GG        = FR_GG, FR_EE, FR_GG  // g = g * e + g &SQRT&
1447      nop.i 0
1448}
1449{ .mfi
1450      nop.m 0
1451      fma.s1     FR_HH        = FR_HH, FR_EE, FR_HH  // h = h * e + h &SQRT&
1452      nop.i 0
1453};;
1454
1455{ .mfi
1456      nop.m 0
1457      frcpa.s1   FR_Y0,p0     = f1,FR_QQ // y = frcpa(b)  #DIV#
1458      nop.i 0
1459}
1460{ .mfi
1461      nop.m 0
1462      fnma.s1    FR_EE        = FR_GG, FR_HH, FR_Half // e = 0.5 - g*h &SQRT&
1463      nop.i 0
1464};;
1465
1466{ .mfi
1467      nop.m 0
1468      fma.s1     FR_Q0        = FR_PP,FR_Y0,f0 // q = a*y  #DIV#
1469      nop.i 0
1470}
1471{ .mfi
1472      nop.m 0
1473      fnma.s1    FR_E0        = FR_Y0,FR_QQ,f1 // e = 1 - b*y  #DIV#
1474      nop.i 0
1475};;
1476
1477{ .mfi
1478      nop.m 0
1479      fma.s1     FR_GG        = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT&
1480      nop.i 0
1481}
1482{ .mfi
1483      nop.m 0
1484      fma.s1     FR_HH        = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
1485      nop.i 0
1486};;
1487
1488{ .mfi
1489      nop.m 0
1490      fma.s1     FR_E2        = FR_E0,FR_E0,FR_E0 // e2 = e+e^2 #DIV#
1491      nop.i 0
1492}
1493{ .mfi
1494      nop.m 0
1495      fma.s1     FR_E1        = FR_E0,FR_E0,f0 // e1 = e^2 #DIV#
1496      nop.i 0
1497};;
1498
1499{ .mfi
1500      nop.m 0
1501      fnma.s1   FR_EE        = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h &SQRT&
1502      nop.i 0
1503}
1504{ .mfi
1505      nop.m 0
1506	  fnma.s1   FR_DD        = FR_GG, FR_GG, FR_2XM1   // d = x - g * g &SQRT&
1507      nop.i 0
1508};;
1509
1510{ .mfi
1511      nop.m 0
1512      fma.s1     FR_Y1        = FR_Y0,FR_E2,FR_Y0 // y1 = y+y*e2 #DIV#
1513      nop.i 0
1514}
1515{ .mfi
1516      nop.m 0
1517      fma.s1     FR_E3        = FR_E1,FR_E1,FR_E0 // e3 = e+e1^2 #DIV#
1518      nop.i 0
1519};;
1520
1521{ .mfi
1522      nop.m 0
1523      fma.s1     FR_GG        = FR_DD, FR_HH, FR_GG // g = d * h + g &SQRT&
1524      nop.i 0
1525}
1526{ .mfi
1527      nop.m 0
1528      fma.s1     FR_HH        = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
1529      nop.i 0
1530};;
1531
1532{ .mfi
1533      nop.m 0
1534      fma.s1     FR_Y2        = FR_Y1,FR_E3,FR_Y0 // y2 = y+y1*e3 #DIV#
1535      nop.i 0
1536}
1537{ .mfi
1538      nop.m 0
1539      fnma.s1    FR_R0        = FR_QQ,FR_Q0,FR_PP // r = a-b*q #DIV#
1540      nop.i 0
1541};;
1542
1543{ .mfi
1544      nop.m 0
1545      fnma.s1    FR_DD        = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT&
1546      nop.i 0
1547};;
1548
1549{ .mfi
1550      nop.m 0
1551      fnma.s1    FR_E4        = FR_QQ,FR_Y2,f1    // e4 = 1-b*y2 #DIV#
1552      nop.i 0
1553}
1554{ .mfi
1555      nop.m 0
1556      fma.s1     FR_X_Hi      = FR_R0,FR_Y2,FR_Q0 // x = q+r*y2 #DIV#
1557      nop.i 0
1558};;
1559
1560{ .mfi
1561      nop.m 0
1562      fma.s1     FR_GL        = FR_DD, FR_HH, f0   // gl = d * h &SQRT&
1563      nop.i 0
1564};;
1565
1566{ .mfi
1567      nop.m 0
1568      fma.s1     FR_Y3        = FR_Y2,FR_E4,FR_Y2 // y3 = y2+y2*e4 #DIV#
1569      nop.i 0
1570}
1571{ .mfi
1572      nop.m 0
1573      fnma.s1    FR_R1        = FR_QQ,FR_X_Hi,FR_PP // r1 = a-b*x #DIV#
1574      nop.i 0
1575};;
1576
1577{ .mfi
1578      nop.m 0
1579      fma.s1     FR_HH        = FR_GG, FR_X_Hi, f0 // hh = gg * x_hi
1580      nop.i 0
1581}
1582{ .mfi
1583      nop.m 0
1584      fma.s1     FR_LH        = FR_GL, FR_X_Hi, f0 // lh = gl * x_hi
1585      nop.i 0
1586};;
1587
1588{ .mfi
1589      nop.m 0
1590      fma.s1     FR_X_lo      = FR_R1,FR_Y3,f0 // x_lo = r1*y3 #DIV#
1591      nop.i 0
1592};;
1593
1594{ .mfi
1595      nop.m 0
1596      fma.s1     FR_LL        = FR_GL, FR_X_lo, f0 // ll = gl*x_lo
1597      nop.i 0
1598}
1599{ .mfi
1600      nop.m 0
1601      fma.s1     FR_HL        = FR_GG, FR_X_lo, f0 // hl = gg * x_lo
1602      nop.i 0
1603};;
1604
1605{ .mfi
1606      nop.m 0
1607	  fms.s1     FR_Res       = FR_GL,  f1, FR_LL // res = gl + ll
1608      nop.i 0
1609};;
1610
1611{ .mfi
1612      nop.m 0
1613	  fms.s1     FR_Res       = FR_Res, f1, FR_LH // res = res + lh
1614      nop.i 0
1615};;
1616
1617{ .mfi
1618      nop.m 0
1619	  fms.s1     FR_Res       = FR_Res, f1, FR_HL // res = res + hl
1620      nop.i 0
1621};;
1622
1623{ .mfi
1624      nop.m 0
1625	  fms.s1     FR_Res       = FR_Res, f1, FR_HH // res = res + hh
1626      nop.i 0
1627};;
1628
1629{ .mfb
1630      nop.m 0
1631	  fma.s0     FR_Res       = FR_Res, f1, FR_GG  // result = res + gg
1632      br.ret.sptk   b0                     // Exit for near 1 path
1633};;
1634// NEAR ONE INTERVAL END
1635
1636
1637
1638
1639acoshl_lt_pone:
1640{ .mfi
1641      nop.m 0
1642      fmerge.s   FR_Arg_X            = FR_Arg, FR_Arg
1643      nop.i 0
1644};;
1645{ .mfb
1646      mov        GR_Parameter_TAG    = 135
1647      frcpa.s0   FR_Res,p0           = f0,f0 // get QNaN,and raise invalid
1648      br.cond.sptk  __libm_error_region      // exit if x < 1.0
1649};;
1650
1651GLOBAL_LIBM_END(acoshl)
1652libm_alias_ldouble_other (acosh, acosh)
1653
1654
1655
1656LOCAL_LIBM_ENTRY(__libm_error_region)
1657.prologue
1658{ .mfi
1659        add      GR_Parameter_Y      = -32,sp        // Parameter 2 value
1660        nop.f 0
1661.save   ar.pfs,GR_SAVE_PFS
1662        mov      GR_SAVE_PFS         = ar.pfs        // Save ar.pfs
1663}
1664{ .mfi
1665.fframe 64
1666        add      sp                  = -64,sp        // Create new stack
1667        nop.f 0
1668        mov      GR_SAVE_GP          = gp            // Save gp
1669};;
1670
1671{ .mmi
1672        stfe     [GR_Parameter_Y]    = FR_Arg_Y,16   // Parameter 2 to stack
1673        add      GR_Parameter_X      = 16,sp         // Parameter 1 address
1674.save   b0,GR_SAVE_B0
1675        mov      GR_SAVE_B0          = b0            // Save b0
1676};;
1677
1678.body
1679{ .mib
1680        stfe     [GR_Parameter_X]    = FR_Arg_X         // Parameter 1 to stack
1681        add      GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
1682        nop.b 0
1683}
1684{ .mib
1685        stfe     [GR_Parameter_Y]    = FR_Res        // Parameter 3 to stack
1686        add      GR_Parameter_Y      = -16,GR_Parameter_Y
1687        br.call.sptk b0 = __libm_error_support#      // Error handling function
1688};;
1689
1690{ .mmi
1691        nop.m 0
1692        nop.m 0
1693        add      GR_Parameter_RESULT = 48,sp
1694};;
1695
1696{ .mmi
1697        ldfe     f8                  = [GR_Parameter_RESULT]  // Get return res
1698.restore sp
1699        add      sp                  = 64,sp       // Restore stack pointer
1700        mov      b0                  = GR_SAVE_B0  // Restore return address
1701};;
1702
1703{ .mib
1704        mov      gp                  = GR_SAVE_GP  // Restore gp
1705        mov      ar.pfs              = GR_SAVE_PFS // Restore ar.pfs
1706        br.ret.sptk b0                             // Return
1707};;
1708
1709LOCAL_LIBM_END(__libm_error_region#)
1710
1711.type   __libm_error_support#,@function
1712.global __libm_error_support#
1713