1.file "atanhl.s"
2
3
4// Copyright (c) 2001 - 2003, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
25// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
28// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code,and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39//*********************************************************************
40//
41// History:
42// 09/10/01  Initial version
43// 12/11/01  Corrected .restore syntax
44// 05/20/02  Cleaned up namespace and sf0 syntax
45// 02/10/03  Reordered header: .section, .global, .proc, .align;
46//           used data8 for long double table values
47//
48//*********************************************************************
49//
50//*********************************************************************
51//
52// Function: atanhl(x) computes the principle value of the inverse
53// hyperbolic tangent of x.
54//
55//*********************************************************************
56//
57// Resources Used:
58//
59//    Floating-Point Registers: f8 (Input and Return Value)
60//                              f33-f73
61//
62//    General Purpose Registers:
63//      r32-r52
64//      r49-r52 (Used to pass arguments to error handling routine)
65//
66//    Predicate Registers:      p6-p15
67//
68//*********************************************************************
69//
70// IEEE Special Conditions:
71//
72//    atanhl(inf) = QNaN
73//    atanhl(-inf) = QNaN
74//    atanhl(+/-0) = +/-0
75//    atanhl(1) =  +inf
76//    atanhl(-1) =  -inf
77//    atanhl(|x|>1) = QNaN
78//    atanhl(SNaN) = QNaN
79//    atanhl(QNaN) = QNaN
80//
81//*********************************************************************
82//
83// Overview
84//
85// The method consists of two cases.
86//
87// If      |x| < 1/32  use case atanhl_near_zero;
88// else                 use case atanhl_regular;
89//
90// Case atanhl_near_zero:
91//
92//   atanhl(x) can be approximated by the Taylor series expansion
93//   up to order 17.
94//
95// Case atanhl_regular:
96//
97//   Here we use formula atanhl(x) = sign(x)*log1pl(2*|x|/(1-|x|))/2 and
98//   calculation is subdivided into two stages. The first stage is
99//   calculating of X = 2*|x|/(1-|x|). The second one is calculating of
100//   sign(x)*log1pl(X)/2. To obtain required accuracy we use precise division
101//   algorithm output of which is a pair of two extended precision values those
102//   approximate result of division with accuracy higher than working
103//   precision. This pair is passed to modified log1pl function.
104//
105//
106//   1. calculating of X = 2*|x|/(1-|x|)
107//   ( based on Peter Markstein's "IA-64 and Elementary Functions" book )
108//   ********************************************************************
109//
110//     a = 2*|x|
111//     b = 1 - |x|
112//     b_lo = |x| - (1 - b)
113//
114//     y = frcpa(b)         initial approximation of 1/b
115//     q = a*y              initial approximation of a/b
116//
117//     e = 1 - b*y
118//     e2 = e + e^2
119//     e1 = e^2
120//     y1 = y + y*e2 = y + y*(e+e^2)
121//
122//     e3 = e + e1^2
123//     y2 = y + y1*e3 = y + y*(e+e^2+..+e^6)
124//
125//     r = a - b*q
126//     e = 1 - b*y2
127//     X = q + r*y2         high part of a/b
128//
129//     y3 = y2 + y2*e4
130//     r1 = a - b*X
131//     r1 = r1 - b_lo*X
132//     X_lo = r1*y3         low part of a/b
133//
134//   2. special log1p algorithm overview
135//   ***********************************
136//
137//    Here we use a table lookup method. The basic idea is that in
138//    order to compute logl(Arg) = log1pl (Arg-1) for an argument Arg in [1,2),
139//    we construct a value G such that G*Arg is close to 1 and that
140//    logl(1/G) is obtainable easily from a table of values calculated
141//    beforehand. Thus
142//
143//      logl(Arg) = logl(1/G) + logl(G*Arg)
144//           = logl(1/G) + logl(1 + (G*Arg - 1))
145//
146//    Because |G*Arg - 1| is small, the second term on the right hand
147//    side can be approximated by a short polynomial. We elaborate
148//    this method in several steps.
149//
150//    Step 0: Initialization
151//    ------
152//    We need to calculate logl(X + X_lo + 1). Obtain N, S_hi such that
153//
154//      X + X_lo + 1 = 2^N * ( S_hi + S_lo )   exactly
155//
156//    where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
157//    that |S_lo| <= ulp(S_hi).
158//
159//    For the special version of log1p we add X_lo to S_lo (S_lo = S_lo + X_lo)
160//    !-----------------------------------------------------------------------!
161//
162//    Step 1: Argument Reduction
163//    ------
164//    Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
165//
166//      G := G_1 * G_2 * G_3
167//      r := (G * S_hi - 1) + G * S_lo
168//
169//    These G_j's have the property that the product is exactly
170//    representable and that |r| < 2^(-12) as a result.
171//
172//    Step 2: Approximation
173//    ------
174//    logl(1 + r) is approximated by a short polynomial poly(r).
175//
176//    Step 3: Reconstruction
177//    ------
178//    Finally, log1pl(X + X_lo) = logl(X + X_lo + 1) is given by
179//
180//    logl(X + X_lo + 1) =  logl(2^N * (S_hi + S_lo))
181//                      ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
182//                      ~=~ N*logl(2) + logl(1/G) + poly(r).
183//
184//    For detailed description see log1p1 function, regular path.
185//
186//*********************************************************************
187
188RODATA
189.align 64
190
191// ************* DO NOT CHANGE THE ORDER OF THESE TABLES *************
192
193LOCAL_OBJECT_START(Constants_TaylorSeries)
194data8  0xF0F0F0F0F0F0F0F1,0x00003FFA // C17
195data8  0x8888888888888889,0x00003FFB // C15
196data8  0x9D89D89D89D89D8A,0x00003FFB // C13
197data8  0xBA2E8BA2E8BA2E8C,0x00003FFB // C11
198data8  0xE38E38E38E38E38E,0x00003FFB // C9
199data8  0x9249249249249249,0x00003FFC // C7
200data8  0xCCCCCCCCCCCCCCCD,0x00003FFC // C5
201data8  0xAAAAAAAAAAAAAAAA,0x00003FFD // C3
202data4  0x3f000000                    // 1/2
203data4  0x00000000                    // pad
204data4  0x00000000
205data4  0x00000000
206LOCAL_OBJECT_END(Constants_TaylorSeries)
207
208LOCAL_OBJECT_START(Constants_Q)
209data4  0x00000000,0xB1721800,0x00003FFE,0x00000000 // log2_hi
210data4  0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 // log2_lo
211data4  0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 // Q4
212data4  0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 // Q3
213data4  0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 // Q2
214data4  0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 // Q1
215LOCAL_OBJECT_END(Constants_Q)
216
217
218// Z1 - 16 bit fixed
219LOCAL_OBJECT_START(Constants_Z_1)
220data4  0x00008000
221data4  0x00007879
222data4  0x000071C8
223data4  0x00006BCB
224data4  0x00006667
225data4  0x00006187
226data4  0x00005D18
227data4  0x0000590C
228data4  0x00005556
229data4  0x000051EC
230data4  0x00004EC5
231data4  0x00004BDB
232data4  0x00004925
233data4  0x0000469F
234data4  0x00004445
235data4  0x00004211
236LOCAL_OBJECT_END(Constants_Z_1)
237
238// G1 and H1 - IEEE single and h1 - IEEE double
239LOCAL_OBJECT_START(Constants_G_H_h1)
240data4  0x3F800000,0x00000000
241data8  0x0000000000000000
242data4  0x3F70F0F0,0x3D785196
243data8  0x3DA163A6617D741C
244data4  0x3F638E38,0x3DF13843
245data8  0x3E2C55E6CBD3D5BB
246data4  0x3F579430,0x3E2FF9A0
247data8  0xBE3EB0BFD86EA5E7
248data4  0x3F4CCCC8,0x3E647FD6
249data8  0x3E2E6A8C86B12760
250data4  0x3F430C30,0x3E8B3AE7
251data8  0x3E47574C5C0739BA
252data4  0x3F3A2E88,0x3EA30C68
253data8  0x3E20E30F13E8AF2F
254data4  0x3F321640,0x3EB9CEC8
255data8  0xBE42885BF2C630BD
256data4  0x3F2AAAA8,0x3ECF9927
257data8  0x3E497F3497E577C6
258data4  0x3F23D708,0x3EE47FC5
259data8  0x3E3E6A6EA6B0A5AB
260data4  0x3F1D89D8,0x3EF8947D
261data8  0xBDF43E3CD328D9BE
262data4  0x3F17B420,0x3F05F3A1
263data8  0x3E4094C30ADB090A
264data4  0x3F124920,0x3F0F4303
265data8  0xBE28FBB2FC1FE510
266data4  0x3F0D3DC8,0x3F183EBF
267data8  0x3E3A789510FDE3FA
268data4  0x3F088888,0x3F20EC80
269data8  0x3E508CE57CC8C98F
270data4  0x3F042108,0x3F29516A
271data8  0xBE534874A223106C
272LOCAL_OBJECT_END(Constants_G_H_h1)
273
274// Z2 - 16 bit fixed
275LOCAL_OBJECT_START(Constants_Z_2)
276data4  0x00008000
277data4  0x00007F81
278data4  0x00007F02
279data4  0x00007E85
280data4  0x00007E08
281data4  0x00007D8D
282data4  0x00007D12
283data4  0x00007C98
284data4  0x00007C20
285data4  0x00007BA8
286data4  0x00007B31
287data4  0x00007ABB
288data4  0x00007A45
289data4  0x000079D1
290data4  0x0000795D
291data4  0x000078EB
292LOCAL_OBJECT_END(Constants_Z_2)
293
294// G2 and H2 - IEEE single and h2 - IEEE double
295LOCAL_OBJECT_START(Constants_G_H_h2)
296data4  0x3F800000,0x00000000
297data8  0x0000000000000000
298data4  0x3F7F00F8,0x3B7F875D
299data8  0x3DB5A11622C42273
300data4  0x3F7E03F8,0x3BFF015B
301data8  0x3DE620CF21F86ED3
302data4  0x3F7D08E0,0x3C3EE393
303data8  0xBDAFA07E484F34ED
304data4  0x3F7C0FC0,0x3C7E0586
305data8  0xBDFE07F03860BCF6
306data4  0x3F7B1880,0x3C9E75D2
307data8  0x3DEA370FA78093D6
308data4  0x3F7A2328,0x3CBDC97A
309data8  0x3DFF579172A753D0
310data4  0x3F792FB0,0x3CDCFE47
311data8  0x3DFEBE6CA7EF896B
312data4  0x3F783E08,0x3CFC15D0
313data8  0x3E0CF156409ECB43
314data4  0x3F774E38,0x3D0D874D
315data8  0xBE0B6F97FFEF71DF
316data4  0x3F766038,0x3D1CF49B
317data8  0xBE0804835D59EEE8
318data4  0x3F757400,0x3D2C531D
319data8  0x3E1F91E9A9192A74
320data4  0x3F748988,0x3D3BA322
321data8  0xBE139A06BF72A8CD
322data4  0x3F73A0D0,0x3D4AE46F
323data8  0x3E1D9202F8FBA6CF
324data4  0x3F72B9D0,0x3D5A1756
325data8  0xBE1DCCC4BA796223
326data4  0x3F71D488,0x3D693B9D
327data8  0xBE049391B6B7C239
328LOCAL_OBJECT_END(Constants_G_H_h2)
329
330// G3 and H3 - IEEE single and h3 - IEEE double
331LOCAL_OBJECT_START(Constants_G_H_h3)
332data4  0x3F7FFC00,0x38800100
333data8  0x3D355595562224CD
334data4  0x3F7FF400,0x39400480
335data8  0x3D8200A206136FF6
336data4  0x3F7FEC00,0x39A00640
337data8  0x3DA4D68DE8DE9AF0
338data4  0x3F7FE400,0x39E00C41
339data8  0xBD8B4291B10238DC
340data4  0x3F7FDC00,0x3A100A21
341data8  0xBD89CCB83B1952CA
342data4  0x3F7FD400,0x3A300F22
343data8  0xBDB107071DC46826
344data4  0x3F7FCC08,0x3A4FF51C
345data8  0x3DB6FCB9F43307DB
346data4  0x3F7FC408,0x3A6FFC1D
347data8  0xBD9B7C4762DC7872
348data4  0x3F7FBC10,0x3A87F20B
349data8  0xBDC3725E3F89154A
350data4  0x3F7FB410,0x3A97F68B
351data8  0xBD93519D62B9D392
352data4  0x3F7FAC18,0x3AA7EB86
353data8  0x3DC184410F21BD9D
354data4  0x3F7FA420,0x3AB7E101
355data8  0xBDA64B952245E0A6
356data4  0x3F7F9C20,0x3AC7E701
357data8  0x3DB4B0ECAABB34B8
358data4  0x3F7F9428,0x3AD7DD7B
359data8  0x3D9923376DC40A7E
360data4  0x3F7F8C30,0x3AE7D474
361data8  0x3DC6E17B4F2083D3
362data4  0x3F7F8438,0x3AF7CBED
363data8  0x3DAE314B811D4394
364data4  0x3F7F7C40,0x3B03E1F3
365data8  0xBDD46F21B08F2DB1
366data4  0x3F7F7448,0x3B0BDE2F
367data8  0xBDDC30A46D34522B
368data4  0x3F7F6C50,0x3B13DAAA
369data8  0x3DCB0070B1F473DB
370data4  0x3F7F6458,0x3B1BD766
371data8  0xBDD65DDC6AD282FD
372data4  0x3F7F5C68,0x3B23CC5C
373data8  0xBDCDAB83F153761A
374data4  0x3F7F5470,0x3B2BC997
375data8  0xBDDADA40341D0F8F
376data4  0x3F7F4C78,0x3B33C711
377data8  0x3DCD1BD7EBC394E8
378data4  0x3F7F4488,0x3B3BBCC6
379data8  0xBDC3532B52E3E695
380data4  0x3F7F3C90,0x3B43BAC0
381data8  0xBDA3961EE846B3DE
382data4  0x3F7F34A0,0x3B4BB0F4
383data8  0xBDDADF06785778D4
384data4  0x3F7F2CA8,0x3B53AF6D
385data8  0x3DCC3ED1E55CE212
386data4  0x3F7F24B8,0x3B5BA620
387data8  0xBDBA31039E382C15
388data4  0x3F7F1CC8,0x3B639D12
389data8  0x3D635A0B5C5AF197
390data4  0x3F7F14D8,0x3B6B9444
391data8  0xBDDCCB1971D34EFC
392data4  0x3F7F0CE0,0x3B7393BC
393data8  0x3DC7450252CD7ADA
394data4  0x3F7F04F0,0x3B7B8B6D
395data8  0xBDB68F177D7F2A42
396LOCAL_OBJECT_END(Constants_G_H_h3)
397
398
399
400// Floating Point Registers
401
402FR_C17              = f50
403FR_C15              = f51
404FR_C13              = f52
405FR_C11              = f53
406FR_C9               = f54
407FR_C7               = f55
408FR_C5               = f56
409FR_C3               = f57
410FR_x2               = f58
411FR_x3               = f59
412FR_x4               = f60
413FR_x8               = f61
414
415FR_Rcp              = f61
416
417FR_A                = f33
418FR_R1               = f33
419
420FR_E1               = f34
421FR_E3               = f34
422FR_Y2               = f34
423FR_Y3               = f34
424
425FR_E2               = f35
426FR_Y1               = f35
427
428FR_B                = f36
429FR_Y0               = f37
430FR_E0               = f38
431FR_E4               = f39
432FR_Q0               = f40
433FR_R0               = f41
434FR_B_lo             = f42
435
436FR_abs_x            = f43
437FR_Bp               = f44
438FR_Bn               = f45
439FR_Yp               = f46
440FR_Yn               = f47
441
442FR_X                = f48
443FR_BB               = f48
444FR_X_lo             = f49
445
446FR_G                = f50
447FR_Y_hi             = f51
448FR_H                = f51
449FR_h                = f52
450FR_G2               = f53
451FR_H2               = f54
452FR_h2               = f55
453FR_G3               = f56
454FR_H3               = f57
455FR_h3               = f58
456
457FR_Q4               = f59
458FR_poly_lo          = f59
459FR_Y_lo             = f59
460
461FR_Q3               = f60
462FR_Q2               = f61
463
464FR_Q1               = f62
465FR_poly_hi          = f62
466
467FR_float_N          = f63
468
469FR_AA               = f64
470FR_S_lo             = f64
471
472FR_S_hi             = f65
473FR_r                = f65
474
475FR_log2_hi          = f66
476FR_log2_lo          = f67
477FR_Z                = f68
478FR_2_to_minus_N     = f69
479FR_rcub             = f70
480FR_rsq              = f71
481FR_05r              = f72
482FR_Half             = f73
483
484FR_Arg_X            = f50
485FR_Arg_Y            = f0
486FR_RESULT           = f8
487
488
489
490// General Purpose Registers
491
492GR_ad_05            = r33
493GR_Index1           = r34
494GR_ArgExp           = r34
495GR_Index2           = r35
496GR_ExpMask          = r35
497GR_NearZeroBound    = r36
498GR_signif           = r36
499GR_X_0              = r37
500GR_X_1              = r37
501GR_X_2              = r38
502GR_Index3           = r38
503GR_minus_N          = r39
504GR_Z_1              = r40
505GR_Z_2              = r40
506GR_N                = r41
507GR_Bias             = r42
508GR_M                = r43
509GR_ad_taylor        = r44
510GR_ad_taylor_2      = r45
511GR_ad2_tbl_3        = r45
512GR_ad_tbl_1         = r46
513GR_ad_tbl_2         = r47
514GR_ad_tbl_3         = r48
515GR_ad_q             = r49
516GR_ad_z_1           = r50
517GR_ad_z_2           = r51
518GR_ad_z_3           = r52
519
520//
521// Added for unwind support
522//
523GR_SAVE_PFS         = r46
524GR_SAVE_B0          = r47
525GR_SAVE_GP          = r48
526GR_Parameter_X      = r49
527GR_Parameter_Y      = r50
528GR_Parameter_RESULT = r51
529GR_Parameter_TAG    = r52
530
531
532
533.section .text
534GLOBAL_LIBM_ENTRY(atanhl)
535
536{ .mfi
537      alloc         r32 = ar.pfs,0,17,4,0
538      fnma.s1       FR_Bp = f8,f1,f1 // b = 1 - |arg| (for x>0)
539      mov           GR_ExpMask = 0x1ffff
540}
541{ .mfi
542      addl          GR_ad_taylor = @ltoff(Constants_TaylorSeries),gp
543      fma.s1        FR_Bn = f8,f1,f1 // b = 1 - |arg| (for x<0)
544      mov           GR_NearZeroBound = 0xfffa  // biased exp of 1/32
545};;
546{ .mfi
547      getf.exp      GR_ArgExp = f8
548      fcmp.lt.s1    p6,p7 = f8,f0 // is negative?
549      nop.i         0
550}
551{ .mfi
552      ld8           GR_ad_taylor = [GR_ad_taylor]
553      fmerge.s      FR_abs_x =  f1,f8
554      nop.i         0
555};;
556{ .mfi
557      nop.m         0
558      fclass.m      p8,p0 = f8,0x1C7 // is arg NaT,Q/SNaN or +/-0 ?
559      nop.i         0
560}
561{ .mfi
562      nop.m         0
563      fma.s1        FR_x2 = f8,f8,f0
564      nop.i         0
565};;
566{ .mfi
567      add           GR_ad_z_1 = 0x0F0,GR_ad_taylor
568      fclass.m      p9,p0 = f8,0x0a // is arg -denormal ?
569      add           GR_ad_taylor_2 = 0x010,GR_ad_taylor
570}
571{ .mfi
572      add           GR_ad_05 = 0x080,GR_ad_taylor
573      nop.f         0
574      nop.i         0
575};;
576{ .mfi
577      ldfe          FR_C17 = [GR_ad_taylor],32
578      fclass.m      p10,p0 = f8,0x09 // is arg +denormal ?
579      add           GR_ad_tbl_1 = 0x040,GR_ad_z_1 // point to Constants_G_H_h1
580}
581{ .mfb
582      add           GR_ad_z_2 = 0x140,GR_ad_z_1 // point to Constants_Z_2
583 (p8) fma.s0        f8 =  f8,f1,f0 // NaN or +/-0
584 (p8) br.ret.spnt   b0             // exit for Nan or +/-0
585};;
586{ .mfi
587      ldfe          FR_C15 = [GR_ad_taylor_2],32
588      fclass.m      p15,p0 = f8,0x23 // is +/-INF ?
589      add           GR_ad_tbl_2 = 0x180,GR_ad_z_1 // point to Constants_G_H_h2
590}
591{ .mfb
592      ldfe          FR_C13 = [GR_ad_taylor],32
593 (p9) fnma.s0       f8 =  f8,f8,f8 // -denormal
594 (p9) br.ret.spnt   b0             // exit for -denormal
595};;
596{ .mfi
597      ldfe          FR_C11 = [GR_ad_taylor_2],32
598      fcmp.eq.s0       p13,p0 = FR_abs_x,f1 // is |arg| = 1?
599      nop.i         0
600}
601{ .mfb
602      ldfe          FR_C9 = [GR_ad_taylor],32
603(p10) fma.s0        f8 =  f8,f8,f8 // +denormal
604(p10) br.ret.spnt   b0             // exit for +denormal
605};;
606{ .mfi
607      ldfe          FR_C7 = [GR_ad_taylor_2],32
608 (p6) frcpa.s1      FR_Yn,p11 = f1,FR_Bn // y = frcpa(b)
609      and           GR_ArgExp = GR_ArgExp,GR_ExpMask // biased exponent
610}
611{ .mfb
612      ldfe          FR_C5 = [GR_ad_taylor],32
613      fnma.s1       FR_B = FR_abs_x,f1,f1 // b = 1 - |arg|
614(p15) br.cond.spnt  atanhl_gt_one // |arg| > 1
615};;
616{ .mfb
617      cmp.gt        p14,p0 = GR_NearZeroBound,GR_ArgExp
618 (p7) frcpa.s1      FR_Yp,p12 = f1,FR_Bp // y = frcpa(b)
619(p13) br.cond.spnt  atanhl_eq_one // |arg| = 1/32
620}
621{ .mfb
622      ldfe          FR_C3 = [GR_ad_taylor_2],32
623      fma.s1        FR_A = FR_abs_x,f1,FR_abs_x // a = 2 * |arg|
624(p14) br.cond.spnt  atanhl_near_zero // |arg| < 1/32
625};;
626{ .mfi
627      nop.m         0
628      fcmp.gt.s0       p8,p0 = FR_abs_x,f1 // is |arg| > 1 ?
629      nop.i         0
630};;
631.pred.rel "mutex",p6,p7
632{ .mfi
633      nop.m         0
634 (p6) fnma.s1       FR_B_lo = FR_Bn,f1,f1 // argt = 1 - (1 - |arg|)
635      nop.i         0
636}
637{ .mfi
638      ldfs          FR_Half = [GR_ad_05]
639 (p7) fnma.s1       FR_B_lo = FR_Bp,f1,f1
640      nop.i         0
641};;
642{ .mfi
643      nop.m         0
644 (p6) fnma.s1       FR_E0 = FR_Yn,FR_Bn,f1 // e = 1-b*y
645      nop.i         0
646}
647{ .mfb
648      nop.m         0
649 (p6) fma.s1        FR_Y0 = FR_Yn,f1,f0
650 (p8) br.cond.spnt  atanhl_gt_one // |arg| > 1
651};;
652{ .mfi
653      nop.m         0
654 (p7) fnma.s1       FR_E0 = FR_Yp,FR_Bp,f1
655      nop.i         0
656}
657{ .mfi
658      nop.m         0
659 (p6) fma.s1        FR_Q0 = FR_A,FR_Yn,f0 // q = a*y
660      nop.i         0
661};;
662{ .mfi
663      nop.m         0
664 (p7) fma.s1        FR_Q0 = FR_A,FR_Yp,f0
665      nop.i         0
666}
667{ .mfi
668      nop.m         0
669 (p7) fma.s1        FR_Y0 = FR_Yp,f1,f0
670      nop.i         0
671};;
672{ .mfi
673      nop.m         0
674      fclass.nm     p10,p0 = f8,0x1FF  // test for unsupported
675      nop.i         0
676};;
677{ .mfi
678      nop.m         0
679      fma.s1        FR_E2 = FR_E0,FR_E0,FR_E0 // e2 = e+e^2
680      nop.i         0
681}
682{ .mfi
683      nop.m         0
684      fma.s1        FR_E1 = FR_E0,FR_E0,f0 // e1 = e^2
685      nop.i         0
686};;
687{ .mfb
688      nop.m         0
689//    Return generated NaN or other value for unsupported values.
690(p10) fma.s0        f8 = f8, f0, f0
691(p10) br.ret.spnt   b0
692};;
693{ .mfi
694      nop.m         0
695      fma.s1        FR_Y1 = FR_Y0,FR_E2,FR_Y0 // y1 = y+y*e2
696      nop.i         0
697}
698{ .mfi
699      nop.m         0
700      fma.s1        FR_E3 = FR_E1,FR_E1,FR_E0 // e3 = e+e1^2
701      nop.i         0
702};;
703{ .mfi
704      nop.m         0
705      fnma.s1       FR_B_lo = FR_abs_x,f1,FR_B_lo // b_lo = argt-|arg|
706      nop.i         0
707};;
708{ .mfi
709      nop.m         0
710      fma.s1        FR_Y2 = FR_Y1,FR_E3,FR_Y0 // y2 = y+y1*e3
711      nop.i         0
712}
713{ .mfi
714      nop.m         0
715      fnma.s1       FR_R0 = FR_B,FR_Q0,FR_A // r = a-b*q
716      nop.i         0
717};;
718{ .mfi
719      nop.m         0
720      fnma.s1       FR_E4 = FR_B,FR_Y2,f1 // e4 = 1-b*y2
721      nop.i         0
722}
723{ .mfi
724      nop.m         0
725      fma.s1        FR_X = FR_R0,FR_Y2,FR_Q0 // x = q+r*y2
726      nop.i         0
727};;
728{ .mfi
729      nop.m         0
730      fma.s1        FR_Z = FR_X,f1,f1 // x+1
731      nop.i         0
732};;
733{ .mfi
734      nop.m         0
735 (p6) fnma.s1       FR_Half = FR_Half,f1,f0 // sign(arg)/2
736      nop.i         0
737};;
738{ .mfi
739      nop.m         0
740      fma.s1        FR_Y3 = FR_Y2,FR_E4,FR_Y2 // y3 = y2+y2*e4
741      nop.i         0
742}
743{ .mfi
744      nop.m         0
745      fnma.s1       FR_R1 = FR_B,FR_X,FR_A // r1 = a-b*x
746      nop.i         0
747};;
748{ .mfi
749      getf.sig      GR_signif = FR_Z // get significand of x+1
750      nop.f         0
751      nop.i         0
752};;
753
754
755{ .mfi
756      add           GR_ad_q = -0x060,GR_ad_z_1
757      nop.f         0
758      extr.u        GR_Index1 = GR_signif,59,4 // get high 4 bits of signif
759}
760{ .mfi
761      add           GR_ad_tbl_3 = 0x280,GR_ad_z_1 // point to Constants_G_H_h3
762      nop.f         0
763      nop.i         0
764};;
765{ .mfi
766      shladd        GR_ad_z_1 = GR_Index1,2,GR_ad_z_1 // point to Z_1
767      nop.f         0
768      extr.u        GR_X_0 = GR_signif,49,15 // get high 15 bits of significand
769};;
770{ .mfi
771      ld4           GR_Z_1 = [GR_ad_z_1] // load Z_1
772      fmax.s1       FR_AA = FR_X,f1 // for S_lo,form AA = max(X,1.0)
773      nop.i         0
774}
775{ .mfi
776      shladd        GR_ad_tbl_1 = GR_Index1,4,GR_ad_tbl_1 // point to G_1
777      nop.f         0
778      mov           GR_Bias = 0x0FFFF // exponent bias
779};;
780{ .mfi
781      ldfps         FR_G,FR_H = [GR_ad_tbl_1],8  // load G_1,H_1
782      fmerge.se     FR_S_hi =  f1,FR_Z // form |x+1|
783      nop.i         0
784};;
785{ .mfi
786      getf.exp      GR_N =  FR_Z // get N = exponent of x+1
787      nop.f         0
788      nop.i         0
789}
790{ .mfi
791      ldfd          FR_h = [GR_ad_tbl_1] // load h_1
792      fnma.s1       FR_R1 = FR_B_lo,FR_X,FR_R1 // r1 = r1-b_lo*x
793      nop.i         0
794};;
795{ .mfi
796      ldfe          FR_log2_hi = [GR_ad_q],16 // load log2_hi
797      nop.f         0
798      pmpyshr2.u    GR_X_1 = GR_X_0,GR_Z_1,15 // get bits 30-15 of X_0 * Z_1
799};;
800//
801//    For performance,don't use result of pmpyshr2.u for 4 cycles.
802//
803{ .mfi
804      ldfe          FR_log2_lo = [GR_ad_q],16 // load log2_lo
805      nop.f         0
806      sub           GR_N = GR_N,GR_Bias
807};;
808{ .mfi
809      ldfe          FR_Q4 = [GR_ad_q],16  // load Q4
810      fms.s1        FR_S_lo = FR_AA,f1,FR_Z // form S_lo = AA - Z
811      sub           GR_minus_N = GR_Bias,GR_N // form exponent of 2^(-N)
812};;
813{ .mmf
814      ldfe          FR_Q3 = [GR_ad_q],16 // load Q3
815      // put integer N into rightmost significand
816      setf.sig      FR_float_N = GR_N
817      fmin.s1       FR_BB = FR_X,f1 // for S_lo,form BB = min(X,1.0)
818};;
819{ .mfi
820      ldfe          FR_Q2 = [GR_ad_q],16 // load Q2
821      nop.f         0
822      extr.u        GR_Index2 = GR_X_1,6,4 // extract bits 6-9 of X_1
823};;
824{ .mmi
825      ldfe          FR_Q1 = [GR_ad_q] // load Q1
826      shladd        GR_ad_z_2 = GR_Index2,2,GR_ad_z_2 // point to Z_2
827      nop.i         0
828};;
829{ .mmi
830      ld4           GR_Z_2 = [GR_ad_z_2] // load Z_2
831      shladd        GR_ad_tbl_2 = GR_Index2,4,GR_ad_tbl_2 // point to G_2
832      nop.i         0
833};;
834{ .mfi
835      ldfps         FR_G2,FR_H2 = [GR_ad_tbl_2],8 // load G_2,H_2
836      nop.f         0
837      nop.i         0
838};;
839{ .mfi
840      ldfd          FR_h2 = [GR_ad_tbl_2] // load h_2
841      fma.s1        FR_S_lo = FR_S_lo,f1,FR_BB // S_lo = S_lo + BB
842      nop.i         0
843}
844{ .mfi
845      setf.exp      FR_2_to_minus_N = GR_minus_N // form 2^(-N)
846      fma.s1        FR_X_lo = FR_R1,FR_Y3,f0 // x_lo = r1*y3
847      nop.i         0
848};;
849{ .mfi
850      nop.m         0
851      nop.f         0
852      pmpyshr2.u    GR_X_2 = GR_X_1,GR_Z_2,15 // get bits 30-15 of X_1 * Z_2
853};;
854//
855//    For performance,don't use result of pmpyshr2.u for 4 cycles
856//
857{ .mfi
858      add           GR_ad2_tbl_3 = 8,GR_ad_tbl_3
859      nop.f         0
860      nop.i         0
861}
862{ .mfi
863      nop.m         0
864      nop.f         0
865      nop.i         0
866};;
867{ .mfi
868      nop.m         0
869      nop.f         0
870      nop.i         0
871};;
872{ .mfi
873      nop.m         0
874      nop.f         0
875      nop.i         0
876};;
877
878//
879//    Now GR_X_2 can be used
880//
881{ .mfi
882      nop.m         0
883      nop.f         0
884      extr.u        GR_Index3 = GR_X_2,1,5 // extract bits 1-5 of X_2
885}
886{ .mfi
887      nop.m         0
888      fma.s1        FR_S_lo = FR_S_lo,f1,FR_X_lo // S_lo = S_lo + Arg_lo
889      nop.i         0
890};;
891
892{ .mfi
893      shladd        GR_ad_tbl_3 = GR_Index3,4,GR_ad_tbl_3 // point to G_3
894      fcvt.xf       FR_float_N = FR_float_N
895      nop.i         0
896}
897{ .mfi
898      shladd        GR_ad2_tbl_3 = GR_Index3,4,GR_ad2_tbl_3 // point to h_3
899      fma.s1        FR_Q1 = FR_Q1,FR_Half,f0 // sign(arg)*Q1/2
900      nop.i         0
901};;
902{ .mmi
903      ldfps         FR_G3,FR_H3 = [GR_ad_tbl_3],8 // load G_3,H_3
904      ldfd          FR_h3 = [GR_ad2_tbl_3] // load h_3
905      nop.i         0
906};;
907{ .mfi
908      nop.m         0
909      fmpy.s1       FR_G = FR_G,FR_G2 // G = G_1 * G_2
910      nop.i         0
911}
912{ .mfi
913      nop.m         0
914      fadd.s1       FR_H = FR_H,FR_H2 // H = H_1 + H_2
915      nop.i         0
916};;
917{ .mfi
918      nop.m         0
919      fadd.s1       FR_h = FR_h,FR_h2 // h = h_1 + h_2
920      nop.i         0
921};;
922{ .mfi
923      nop.m         0
924      // S_lo = S_lo * 2^(-N)
925      fma.s1        FR_S_lo = FR_S_lo,FR_2_to_minus_N,f0
926      nop.i         0
927};;
928{ .mfi
929      nop.m         0
930      fmpy.s1       FR_G = FR_G,FR_G3 // G = (G_1 * G_2) * G_3
931      nop.i         0
932}
933{ .mfi
934      nop.m         0
935      fadd.s1       FR_H = FR_H,FR_H3 // H = (H_1 + H_2) + H_3
936      nop.i         0
937};;
938{ .mfi
939      nop.m         0
940      fadd.s1       FR_h = FR_h,FR_h3 // h = (h_1 + h_2) + h_3
941      nop.i         0
942};;
943{ .mfi
944      nop.m         0
945      fms.s1        FR_r = FR_G,FR_S_hi,f1 // r = G * S_hi - 1
946      nop.i         0
947}
948{ .mfi
949      nop.m         0
950      // Y_hi = N * log2_hi + H
951      fma.s1        FR_Y_hi = FR_float_N,FR_log2_hi,FR_H
952      nop.i         0
953};;
954{ .mfi
955      nop.m         0
956      fma.s1        FR_h = FR_float_N,FR_log2_lo,FR_h // h = N * log2_lo + h
957      nop.i         0
958};;
959{ .mfi
960      nop.m         0
961      fma.s1        FR_r = FR_G,FR_S_lo,FR_r // r = G * S_lo + (G * S_hi - 1)
962      nop.i         0
963};;
964{ .mfi
965      nop.m         0
966      fma.s1        FR_poly_lo = FR_r,FR_Q4,FR_Q3 // poly_lo = r * Q4 + Q3
967      nop.i         0
968}
969{ .mfi
970      nop.m         0
971      fmpy.s1       FR_rsq = FR_r,FR_r // rsq = r * r
972      nop.i         0
973};;
974{ .mfi
975      nop.m         0
976      fma.s1        FR_05r = FR_r,FR_Half,f0 // sign(arg)*r/2
977      nop.i         0
978};;
979{ .mfi
980      nop.m         0
981      // poly_lo = poly_lo * r + Q2
982      fma.s1        FR_poly_lo = FR_poly_lo,FR_r,FR_Q2
983      nop.i         0
984}
985{ .mfi
986      nop.m         0
987      fma.s1        FR_rcub = FR_rsq,FR_r,f0 // rcub = r^3
988      nop.i         0
989};;
990{ .mfi
991      nop.m         0
992      // poly_hi = sing(arg)*(Q1*r^2 + r)/2
993      fma.s1        FR_poly_hi = FR_Q1,FR_rsq,FR_05r
994      nop.i         0
995};;
996{ .mfi
997      nop.m         0
998      // poly_lo = poly_lo*r^3 + h
999      fma.s1        FR_poly_lo = FR_poly_lo,FR_rcub,FR_h
1000      nop.i         0
1001};;
1002{ .mfi
1003      nop.m         0
1004      // Y_lo = poly_hi + poly_lo/2
1005      fma.s0        FR_Y_lo = FR_poly_lo,FR_Half,FR_poly_hi
1006      nop.i         0
1007};;
1008{ .mfb
1009      nop.m         0
1010     // Result = arctanh(x) = Y_hi/2 + Y_lo
1011      fma.s0        f8 = FR_Y_hi,FR_Half,FR_Y_lo
1012      br.ret.sptk   b0
1013};;
1014
1015// Taylor's series
1016atanhl_near_zero:
1017{ .mfi
1018      nop.m         0
1019      fma.s1        FR_x3 = FR_x2,f8,f0
1020      nop.i         0
1021}
1022{ .mfi
1023      nop.m         0
1024      fma.s1        FR_x4 = FR_x2,FR_x2,f0
1025      nop.i         0
1026};;
1027{ .mfi
1028      nop.m         0
1029      fma.s1        FR_C17 = FR_C17,FR_x2,FR_C15
1030      nop.i         0
1031}
1032{ .mfi
1033      nop.m         0
1034      fma.s1        FR_C13 = FR_C13,FR_x2,FR_C11
1035      nop.i         0
1036};;
1037{ .mfi
1038      nop.m         0
1039      fma.s1        FR_C9 = FR_C9,FR_x2,FR_C7
1040      nop.i         0
1041}
1042{ .mfi
1043      nop.m         0
1044      fma.s1        FR_C5 = FR_C5,FR_x2,FR_C3
1045      nop.i         0
1046};;
1047{ .mfi
1048      nop.m         0
1049      fma.s1        FR_x8 = FR_x4,FR_x4,f0
1050      nop.i         0
1051};;
1052{ .mfi
1053      nop.m         0
1054      fma.s1        FR_C17 = FR_C17,FR_x4,FR_C13
1055      nop.i         0
1056};;
1057{ .mfi
1058      nop.m         0
1059      fma.s1        FR_C9 = FR_C9,FR_x4,FR_C5
1060      nop.i         0
1061};;
1062{ .mfi
1063      nop.m         0
1064      fma.s1        FR_C17 = FR_C17,FR_x8,FR_C9
1065      nop.i         0
1066};;
1067{ .mfb
1068      nop.m         0
1069      fma.s0        f8 = FR_C17,FR_x3,f8
1070      br.ret.sptk   b0
1071};;
1072
1073atanhl_eq_one:
1074{ .mfi
1075      nop.m         0
1076      frcpa.s0      FR_Rcp,p0 = f1,f0 // get inf,and raise Z flag
1077      nop.i         0
1078}
1079{ .mfi
1080      nop.m         0
1081      fmerge.s      FR_Arg_X = f8, f8
1082      nop.i         0
1083};;
1084{ .mfb
1085      mov           GR_Parameter_TAG = 130
1086      fmerge.s      FR_RESULT = f8,FR_Rcp // result is +-inf
1087      br.cond.sptk  __libm_error_region // exit if |x| = 1.0
1088};;
1089
1090atanhl_gt_one:
1091{ .mfi
1092      nop.m         0
1093      fmerge.s      FR_Arg_X = f8, f8
1094      nop.i         0
1095};;
1096{ .mfb
1097      mov           GR_Parameter_TAG = 129
1098      frcpa.s0      FR_RESULT,p0 = f0,f0 // get QNaN,and raise invalid
1099      br.cond.sptk  __libm_error_region // exit if |x| > 1.0
1100};;
1101
1102GLOBAL_LIBM_END(atanhl)
1103libm_alias_ldouble_other (atanh, atanh)
1104
1105LOCAL_LIBM_ENTRY(__libm_error_region)
1106.prologue
1107{ .mfi
1108        add   GR_Parameter_Y=-32,sp             // Parameter 2 value
1109        nop.f 0
1110.save   ar.pfs,GR_SAVE_PFS
1111        mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
1112}
1113{ .mfi
1114.fframe 64
1115        add sp=-64,sp                           // Create new stack
1116        nop.f 0
1117        mov GR_SAVE_GP=gp                       // Save gp
1118};;
1119{ .mmi
1120        stfe [GR_Parameter_Y] = FR_Arg_Y,16     // Save Parameter 2 on stack
1121        add GR_Parameter_X = 16,sp              // Parameter 1 address
1122.save   b0,GR_SAVE_B0
1123        mov GR_SAVE_B0=b0                       // Save b0
1124};;
1125.body
1126{ .mib
1127        stfe [GR_Parameter_X] = FR_Arg_X        // Store Parameter 1 on stack
1128        add   GR_Parameter_RESULT = 0,GR_Parameter_Y
1129        nop.b 0                                 // Parameter 3 address
1130}
1131{ .mib
1132        stfe [GR_Parameter_Y] = FR_RESULT       // Store Parameter 3 on stack
1133        add   GR_Parameter_Y = -16,GR_Parameter_Y
1134        br.call.sptk b0=__libm_error_support#  // Call error handling function
1135};;
1136{ .mmi
1137        nop.m 0
1138        nop.m 0
1139        add   GR_Parameter_RESULT = 48,sp
1140};;
1141{ .mmi
1142        ldfe  f8 = [GR_Parameter_RESULT]       // Get return result off stack
1143.restore sp
1144        add   sp = 64,sp                       // Restore stack pointer
1145        mov   b0 = GR_SAVE_B0                  // Restore return address
1146};;
1147{ .mib
1148        mov   gp = GR_SAVE_GP                  // Restore gp
1149        mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
1150        br.ret.sptk     b0                     // Return
1151};;
1152
1153LOCAL_LIBM_END(__libm_error_region#)
1154
1155.type   __libm_error_support#,@function
1156.global __libm_error_support#
1157