1.file "atanh.s"
2
3
4// Copyright (c) 2000 - 2005, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39// ==============================================================
40// History
41// ==============================================================
42// 05/03/01  Initial version
43// 05/20/02  Cleaned up namespace and sf0 syntax
44// 02/06/03  Reordered header: .section, .global, .proc, .align
45// 05/26/03  Improved performance, fixed to handle unorms
46// 03/31/05  Reformatted delimiters between data tables
47//
48// API
49// ==============================================================
50// double atanh(double)
51//
52// Overview of operation
53// ==============================================================
54//
55// There are 7 paths:
56// 1. x = +/-0.0
57//    Return atanh(x) = +/-0.0
58//
59// 2. 0.0 < |x| < 1/4
60//    Return atanh(x) = Po2l(x),
61//    where Po2l(x) = (((((((((C9*x^2 + C8)*x^2 + C7)*x^2 + C6)*x^2 +
62//          C5)*x^2 + C4)*x^2 + C3)*x^2 + C2)*x^2 + C1)* x^2 + C0)*x^3 + x
63// 3. 1/4 <= |x| < 1
64//    Return atanh(x) = sign(x) * log((1 + |x|)/(1 - |x|))/2
65//    To compute (1 + |x|)/(1 - |x|), a modified Newton-Raphson method is used
66//    (3 iterations).
67//    See below for a description of the log algorithm.
68//
69// 4. |x| = 1
70//    Return atanh(x) = sign(x) * +INF
71//
72// 5. 1 < |x| <= +INF
73//    Return atanh(x) = QNaN
74//
75// 6. x = [S,Q]NaN
76//    Return atanh(x) = QNaN
77//
78// 7. x = denormal
79//    Return atanh(x) = x
80//
81//==============================================================
82// Algorithm Description for log(x) function
83// Below we are using the fact that inequality x - 1.0 > 2^(-6) is always true
84// for this atanh implementation
85//
86// Consider  x = 2^N 1.f1 f2 f3 f4...f63
87// Log(x) = log(x * frcpa(x) / frcpa(x))
88//        = log(x * frcpa(x)) + log(1/frcpa(x))
89//        = log(x * frcpa(x)) - log(frcpa(x))
90//
91// frcpa(x)       = 2^-N * frcpa(1.f1 f2 ... f63)
92//
93// -log(frcpa(x)) = -log(C)
94//                = -log(2^-N) - log(frcpa(1.f1 f2 ... f63))
95//
96// -log(frcpa(x)) = -log(C)
97//                = N*log2 - log(frcpa(1.f1 f2 ... f63))
98//
99//
100// Log(x) = log(1/frcpa(x)) + log(frcpa(x) x)
101//
102// Log(x) = N*log2 + log(1./frcpa(1.f1 f2 ... f63)) + log(x * frcpa(x))
103// Log(x) = N*log2 + T                              + log(frcpa(x) x)
104//
105// Log(x) = N*log2 + T                              + log(C * x)
106//
107// C * x = 1 + r
108//
109// Log(x) = N*log2 + T + log(1 + r)
110// Log(x) = N*log2 + T + Series(r - r^2/2 + r^3/3 - r^4/4 + ...)
111//
112// 1.f1 f2 ... f8 has 256 entries.
113// They are 1 + k/2^8, k = 0 ... 255
114// These 256 values are the table entries.
115//
116// Implementation
117//==============================================================
118// C = frcpa(x)
119// r = C * x - 1
120//
121// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4 + P4*r^5 + P5*r^6
122//
123// x = f * 2^N where f is 1.f_1f_2f_3...f_63
124// Nfloat = float(n)  where n is the true unbiased exponent
125// pre-index = f_1f_2....f_8
126// index = pre_index * 16
127// get the dxt table entry at index + offset = T
128//
129// result = (T + Nfloat * log(2)) + rseries
130//
131// The T table is calculated as follows
132// Form x_k = 1 + k/2^8 where k goes from 0... 255
133//      y_k = frcpa(x_k)
134//      log(1/y_k)  in quad and round to double-extended
135//
136//
137// Registers used
138//==============================================================
139// Floating Point registers used:
140// f8, input
141// f32 -> f77
142
143// General registers used:
144// r14 -> r27, r33 -> r39
145
146// Predicate registers used:
147// p6 -> p14
148
149// p10, p11      to indicate whether the argument is negative (p10) or not (p11)
150// p12           to filter out case when x = [Q,S]NaN or +/-0
151// p13           to filter out case when x = denormal
152// p6, p7        to filter out case when |x| >= 1
153// p8            to filter out case when |x| < 1/4
154
155// Assembly macros
156//==============================================================
157Data2Ptr              = r14
158Data3Ptr              = r15
159RcpTablePtr           = r16
160rExpbMask             = r17
161rBias                 = r18
162rNearZeroBound        = r19
163rArgSExpb             = r20
164rArgExpb              = r21
165rSExpb                = r22
166rExpb                 = r23
167rSig                  = r24
168rN                    = r25
169rInd                  = r26
170DataPtr               = r27
171
172GR_SAVE_B0            = r33
173GR_SAVE_GP            = r34
174GR_SAVE_PFS           = r35
175
176GR_Parameter_X        = r36
177GR_Parameter_Y        = r37
178GR_Parameter_RESULT   = r38
179atanh_GR_tag          = r39
180
181//==============================================================
182fAbsX                 = f32
183fOneMx                = f33
184fOnePx                = f34
185fY                    = f35
186fR                    = f36
187fR2                   = f37
188fR3                   = f38
189fRcp                  = f39
190fY4Rcp                = f40
191fRcp0                 = f41
192fRcp0n                = f42
193fRcp1                 = f43
194fRcp2                 = f44
195fRcp3                 = f45
196fN4Cvt                = f46
197fN                    = f47
198fY2                   = f48
199fLog2                 = f49
200fLogT                 = f50
201fLogT_N               = f51
202fX2                   = f52
203fX3                   = f53
204fX4                   = f54
205fX8                   = f55
206fP0                   = f56
207fP5                   = f57
208fP4                   = f58
209fP3                   = f59
210fP2                   = f60
211fP1                   = f61
212fNormX                = f62
213fC9                   = f63
214fC8                   = f64
215fC7                   = f65
216fC6                   = f66
217fC5                   = f67
218fC4                   = f68
219fC3                   = f69
220fC2                   = f70
221fC1                   = f71
222fC0                   = f72
223fP98                  = f73
224fP76                  = f74
225fP54                  = f75
226fP32                  = f76
227fP10                  = f77
228
229// Data tables
230//==============================================================
231RODATA
232.align 16
233
234LOCAL_OBJECT_START(atanh_data)
235data8 0xBFC5555DA7212371              //   P5
236data8 0x3FC999A19EEF5826              //   P4
237data8 0xBFCFFFFFFFFEF009              //   P3
238data8 0x3FD555555554ECB2              //   P2
239data8 0xBFE0000000000000              //   P1 = -0.5
240data8 0x0000000000000000              //   pad
241data8 0xb17217f7d1cf79ac , 0x00003ffd //   0.5*log(2)
242data8 0x0000000000000000 , 0x00000000 //   pad to eliminate bank conflicts
243LOCAL_OBJECT_END(atanh_data)
244
245LOCAL_OBJECT_START(atanh_data_2)
246data8 0x8649FB89D3AD51FB , 0x00003FFB //   C9
247data8 0xCC10AABEF160077A , 0x00003FFA //   C8
248data8 0xF1EDB99AC0819CE2 , 0x00003FFA //   C7
249data8 0x8881E53A809AD24D , 0x00003FFB //   C6
250data8 0x9D8A116EF212F271 , 0x00003FFB //   C5
251data8 0xBA2E8A6D1D756453 , 0x00003FFB //   C4
252data8 0xE38E38E7A0945692 , 0x00003FFB //   C3
253data8 0x924924924536891A , 0x00003FFC //   C2
254data8 0xCCCCCCCCCCD08D51 , 0x00003FFC //   C1
255data8 0xAAAAAAAAAAAAAA0C , 0x00003FFD //   C0
256LOCAL_OBJECT_END(atanh_data_2)
257
258
259LOCAL_OBJECT_START(atanh_data_3)
260data8 0x80200aaeac44ef38 , 0x00003ff5 //   log(1/frcpa(1+0/2^-8))/2
261//
262data8 0xc09090a2c35aa070 , 0x00003ff6 //   log(1/frcpa(1+1/2^-8))/2
263data8 0xa0c94fcb41977c75 , 0x00003ff7 //   log(1/frcpa(1+2/2^-8))/2
264data8 0xe18b9c263af83301 , 0x00003ff7 //   log(1/frcpa(1+3/2^-8))/2
265data8 0x8d35c8d6399c30ea , 0x00003ff8 //   log(1/frcpa(1+4/2^-8))/2
266data8 0xadd4d2ecd601cbb8 , 0x00003ff8 //   log(1/frcpa(1+5/2^-8))/2
267//
268data8 0xce95403a192f9f01 , 0x00003ff8 //   log(1/frcpa(1+6/2^-8))/2
269data8 0xeb59392cbcc01096 , 0x00003ff8 //   log(1/frcpa(1+7/2^-8))/2
270data8 0x862c7d0cefd54c5d , 0x00003ff9 //   log(1/frcpa(1+8/2^-8))/2
271data8 0x94aa63c65e70d499 , 0x00003ff9 //   log(1/frcpa(1+9/2^-8))/2
272data8 0xa54a696d4b62b382 , 0x00003ff9 //   log(1/frcpa(1+10/2^-8))/2
273//
274data8 0xb3e4a796a5dac208 , 0x00003ff9 //   log(1/frcpa(1+11/2^-8))/2
275data8 0xc28c45b1878340a9 , 0x00003ff9 //   log(1/frcpa(1+12/2^-8))/2
276data8 0xd35c55f39d7a6235 , 0x00003ff9 //   log(1/frcpa(1+13/2^-8))/2
277data8 0xe220f037b954f1f5 , 0x00003ff9 //   log(1/frcpa(1+14/2^-8))/2
278data8 0xf0f3389b036834f3 , 0x00003ff9 //   log(1/frcpa(1+15/2^-8))/2
279//
280data8 0xffd3488d5c980465 , 0x00003ff9 //   log(1/frcpa(1+16/2^-8))/2
281data8 0x87609ce2ed300490 , 0x00003ffa //   log(1/frcpa(1+17/2^-8))/2
282data8 0x8ede9321e8c85927 , 0x00003ffa //   log(1/frcpa(1+18/2^-8))/2
283data8 0x96639427f2f8e2f4 , 0x00003ffa //   log(1/frcpa(1+19/2^-8))/2
284data8 0x9defad3e8f73217b , 0x00003ffa //   log(1/frcpa(1+20/2^-8))/2
285//
286data8 0xa582ebd50097029c , 0x00003ffa //   log(1/frcpa(1+21/2^-8))/2
287data8 0xac06dbe75ab80fee , 0x00003ffa //   log(1/frcpa(1+22/2^-8))/2
288data8 0xb3a78449b2d3ccca , 0x00003ffa //   log(1/frcpa(1+23/2^-8))/2
289data8 0xbb4f79635ab46bb2 , 0x00003ffa //   log(1/frcpa(1+24/2^-8))/2
290data8 0xc2fec93a83523f3f , 0x00003ffa //   log(1/frcpa(1+25/2^-8))/2
291//
292data8 0xc99af2eaca4c4571 , 0x00003ffa //   log(1/frcpa(1+26/2^-8))/2
293data8 0xd1581106472fa653 , 0x00003ffa //   log(1/frcpa(1+27/2^-8))/2
294data8 0xd8002560d4355f2e , 0x00003ffa //   log(1/frcpa(1+28/2^-8))/2
295data8 0xdfcb43b4fe508632 , 0x00003ffa //   log(1/frcpa(1+29/2^-8))/2
296data8 0xe67f6dff709d4119 , 0x00003ffa //   log(1/frcpa(1+30/2^-8))/2
297//
298data8 0xed393b1c22351280 , 0x00003ffa //   log(1/frcpa(1+31/2^-8))/2
299data8 0xf5192bff087bcc35 , 0x00003ffa //   log(1/frcpa(1+32/2^-8))/2
300data8 0xfbdf4ff6dfef2fa3 , 0x00003ffa //   log(1/frcpa(1+33/2^-8))/2
301data8 0x81559a97f92f9cc7 , 0x00003ffb //   log(1/frcpa(1+34/2^-8))/2
302data8 0x84be72bce90266e8 , 0x00003ffb //   log(1/frcpa(1+35/2^-8))/2
303//
304data8 0x88bc74113f23def2 , 0x00003ffb //   log(1/frcpa(1+36/2^-8))/2
305data8 0x8c2ba3edf6799d11 , 0x00003ffb //   log(1/frcpa(1+37/2^-8))/2
306data8 0x8f9dc92f92ea08b1 , 0x00003ffb //   log(1/frcpa(1+38/2^-8))/2
307data8 0x9312e8f36efab5a7 , 0x00003ffb //   log(1/frcpa(1+39/2^-8))/2
308data8 0x968b08643409ceb6 , 0x00003ffb //   log(1/frcpa(1+40/2^-8))/2
309//
310data8 0x9a062cba08a1708c , 0x00003ffb //   log(1/frcpa(1+41/2^-8))/2
311data8 0x9d845b3abf95485c , 0x00003ffb //   log(1/frcpa(1+42/2^-8))/2
312data8 0xa06fd841bc001bb4 , 0x00003ffb //   log(1/frcpa(1+43/2^-8))/2
313data8 0xa3f3a74652fbe0db , 0x00003ffb //   log(1/frcpa(1+44/2^-8))/2
314data8 0xa77a8fb2336f20f5 , 0x00003ffb //   log(1/frcpa(1+45/2^-8))/2
315//
316data8 0xab0497015d28b0a0 , 0x00003ffb //   log(1/frcpa(1+46/2^-8))/2
317data8 0xae91c2be6ba6a615 , 0x00003ffb //   log(1/frcpa(1+47/2^-8))/2
318data8 0xb189d1b99aebb20b , 0x00003ffb //   log(1/frcpa(1+48/2^-8))/2
319data8 0xb51cced5de9c1b2c , 0x00003ffb //   log(1/frcpa(1+49/2^-8))/2
320data8 0xb819bee9e720d42f , 0x00003ffb //   log(1/frcpa(1+50/2^-8))/2
321//
322data8 0xbbb2a0947b093a5d , 0x00003ffb //   log(1/frcpa(1+51/2^-8))/2
323data8 0xbf4ec1505811684a , 0x00003ffb //   log(1/frcpa(1+52/2^-8))/2
324data8 0xc2535bacfa8975ff , 0x00003ffb //   log(1/frcpa(1+53/2^-8))/2
325data8 0xc55a3eafad187eb8 , 0x00003ffb //   log(1/frcpa(1+54/2^-8))/2
326data8 0xc8ff2484b2c0da74 , 0x00003ffb //   log(1/frcpa(1+55/2^-8))/2
327//
328data8 0xcc0b1a008d53ab76 , 0x00003ffb //   log(1/frcpa(1+56/2^-8))/2
329data8 0xcfb6203844b3209b , 0x00003ffb //   log(1/frcpa(1+57/2^-8))/2
330data8 0xd2c73949a47a19f5 , 0x00003ffb //   log(1/frcpa(1+58/2^-8))/2
331data8 0xd5daae18b49d6695 , 0x00003ffb //   log(1/frcpa(1+59/2^-8))/2
332data8 0xd8f08248cf7e8019 , 0x00003ffb //   log(1/frcpa(1+60/2^-8))/2
333//
334data8 0xdca7749f1b3e540e , 0x00003ffb //   log(1/frcpa(1+61/2^-8))/2
335data8 0xdfc28e033aaaf7c7 , 0x00003ffb //   log(1/frcpa(1+62/2^-8))/2
336data8 0xe2e012a5f91d2f55 , 0x00003ffb //   log(1/frcpa(1+63/2^-8))/2
337data8 0xe600064ed9e292a8 , 0x00003ffb //   log(1/frcpa(1+64/2^-8))/2
338data8 0xe9226cce42b39f60 , 0x00003ffb //   log(1/frcpa(1+65/2^-8))/2
339//
340data8 0xec4749fd97a28360 , 0x00003ffb //   log(1/frcpa(1+66/2^-8))/2
341data8 0xef6ea1bf57780495 , 0x00003ffb //   log(1/frcpa(1+67/2^-8))/2
342data8 0xf29877ff38809091 , 0x00003ffb //   log(1/frcpa(1+68/2^-8))/2
343data8 0xf5c4d0b245cb89be , 0x00003ffb //   log(1/frcpa(1+69/2^-8))/2
344data8 0xf8f3afd6fcdef3aa , 0x00003ffb //   log(1/frcpa(1+70/2^-8))/2
345//
346data8 0xfc2519756be1abc7 , 0x00003ffb //   log(1/frcpa(1+71/2^-8))/2
347data8 0xff59119f503e6832 , 0x00003ffb //   log(1/frcpa(1+72/2^-8))/2
348data8 0x8147ce381ae0e146 , 0x00003ffc //   log(1/frcpa(1+73/2^-8))/2
349data8 0x82e45f06cb1ad0f2 , 0x00003ffc //   log(1/frcpa(1+74/2^-8))/2
350data8 0x842f5c7c573cbaa2 , 0x00003ffc //   log(1/frcpa(1+75/2^-8))/2
351//
352data8 0x85ce471968c8893a , 0x00003ffc //   log(1/frcpa(1+76/2^-8))/2
353data8 0x876e8305bc04066d , 0x00003ffc //   log(1/frcpa(1+77/2^-8))/2
354data8 0x891012678031fbb3 , 0x00003ffc //   log(1/frcpa(1+78/2^-8))/2
355data8 0x8a5f1493d766a05f , 0x00003ffc //   log(1/frcpa(1+79/2^-8))/2
356data8 0x8c030c778c56fa00 , 0x00003ffc //   log(1/frcpa(1+80/2^-8))/2
357//
358data8 0x8da85df17e31d9ae , 0x00003ffc //   log(1/frcpa(1+81/2^-8))/2
359data8 0x8efa663e7921687e , 0x00003ffc //   log(1/frcpa(1+82/2^-8))/2
360data8 0x90a22b6875c6a1f8 , 0x00003ffc //   log(1/frcpa(1+83/2^-8))/2
361data8 0x91f62cc8f5d24837 , 0x00003ffc //   log(1/frcpa(1+84/2^-8))/2
362data8 0x93a06cfc3857d980 , 0x00003ffc //   log(1/frcpa(1+85/2^-8))/2
363//
364data8 0x94f66d5e6fd01ced , 0x00003ffc //   log(1/frcpa(1+86/2^-8))/2
365data8 0x96a330156e6772f2 , 0x00003ffc //   log(1/frcpa(1+87/2^-8))/2
366data8 0x97fb3582754ea25b , 0x00003ffc //   log(1/frcpa(1+88/2^-8))/2
367data8 0x99aa8259aad1bbf2 , 0x00003ffc //   log(1/frcpa(1+89/2^-8))/2
368data8 0x9b0492f6227ae4a8 , 0x00003ffc //   log(1/frcpa(1+90/2^-8))/2
369//
370data8 0x9c5f8e199bf3a7a5 , 0x00003ffc //   log(1/frcpa(1+91/2^-8))/2
371data8 0x9e1293b9998c1daa , 0x00003ffc //   log(1/frcpa(1+92/2^-8))/2
372data8 0x9f6fa31e0b41f308 , 0x00003ffc //   log(1/frcpa(1+93/2^-8))/2
373data8 0xa0cda11eaf46390e , 0x00003ffc //   log(1/frcpa(1+94/2^-8))/2
374data8 0xa22c8f029cfa45aa , 0x00003ffc //   log(1/frcpa(1+95/2^-8))/2
375//
376data8 0xa3e48badb7856b34 , 0x00003ffc //   log(1/frcpa(1+96/2^-8))/2
377data8 0xa5459a0aa95849f9 , 0x00003ffc //   log(1/frcpa(1+97/2^-8))/2
378data8 0xa6a79c84480cfebd , 0x00003ffc //   log(1/frcpa(1+98/2^-8))/2
379data8 0xa80a946d0fcb3eb2 , 0x00003ffc //   log(1/frcpa(1+99/2^-8))/2
380data8 0xa96e831a3ea7b314 , 0x00003ffc //   log(1/frcpa(1+100/2^-8))/2
381//
382data8 0xaad369e3dc544e3b , 0x00003ffc //   log(1/frcpa(1+101/2^-8))/2
383data8 0xac92e9588952c815 , 0x00003ffc //   log(1/frcpa(1+102/2^-8))/2
384data8 0xadfa035aa1ed8fdc , 0x00003ffc //   log(1/frcpa(1+103/2^-8))/2
385data8 0xaf6219eae1ad6e34 , 0x00003ffc //   log(1/frcpa(1+104/2^-8))/2
386data8 0xb0cb2e6d8160f753 , 0x00003ffc //   log(1/frcpa(1+105/2^-8))/2
387//
388data8 0xb2354249ad950f72 , 0x00003ffc //   log(1/frcpa(1+106/2^-8))/2
389data8 0xb3a056e98ef4a3b4 , 0x00003ffc //   log(1/frcpa(1+107/2^-8))/2
390data8 0xb50c6dba52c6292a , 0x00003ffc //   log(1/frcpa(1+108/2^-8))/2
391data8 0xb679882c33876165 , 0x00003ffc //   log(1/frcpa(1+109/2^-8))/2
392data8 0xb78c07429785cedc , 0x00003ffc //   log(1/frcpa(1+110/2^-8))/2
393//
394data8 0xb8faeb8dc4a77d24 , 0x00003ffc //   log(1/frcpa(1+111/2^-8))/2
395data8 0xba6ad77eb36ae0d6 , 0x00003ffc //   log(1/frcpa(1+112/2^-8))/2
396data8 0xbbdbcc915e9bee50 , 0x00003ffc //   log(1/frcpa(1+113/2^-8))/2
397data8 0xbd4dcc44f8cf12ef , 0x00003ffc //   log(1/frcpa(1+114/2^-8))/2
398data8 0xbec0d81bf5b531fa , 0x00003ffc //   log(1/frcpa(1+115/2^-8))/2
399//
400data8 0xc034f19c139186f4 , 0x00003ffc //   log(1/frcpa(1+116/2^-8))/2
401data8 0xc14cb69f7c5e55ab , 0x00003ffc //   log(1/frcpa(1+117/2^-8))/2
402data8 0xc2c2abbb6e5fd56f , 0x00003ffc //   log(1/frcpa(1+118/2^-8))/2
403data8 0xc439b2c193e6771e , 0x00003ffc //   log(1/frcpa(1+119/2^-8))/2
404data8 0xc553acb9d5c67733 , 0x00003ffc //   log(1/frcpa(1+120/2^-8))/2
405//
406data8 0xc6cc96e441272441 , 0x00003ffc //   log(1/frcpa(1+121/2^-8))/2
407data8 0xc8469753eca88c30 , 0x00003ffc //   log(1/frcpa(1+122/2^-8))/2
408data8 0xc962cf3ce072b05c , 0x00003ffc //   log(1/frcpa(1+123/2^-8))/2
409data8 0xcadeba8771f694aa , 0x00003ffc //   log(1/frcpa(1+124/2^-8))/2
410data8 0xcc5bc08d1f72da94 , 0x00003ffc //   log(1/frcpa(1+125/2^-8))/2
411//
412data8 0xcd7a3f99ea035c29 , 0x00003ffc //   log(1/frcpa(1+126/2^-8))/2
413data8 0xcef93860c8a53c35 , 0x00003ffc //   log(1/frcpa(1+127/2^-8))/2
414data8 0xd0192f68a7ed23df , 0x00003ffc //   log(1/frcpa(1+128/2^-8))/2
415data8 0xd19a201127d3c645 , 0x00003ffc //   log(1/frcpa(1+129/2^-8))/2
416data8 0xd2bb92f4061c172c , 0x00003ffc //   log(1/frcpa(1+130/2^-8))/2
417//
418data8 0xd43e80b2ee8cc8fc , 0x00003ffc //   log(1/frcpa(1+131/2^-8))/2
419data8 0xd56173601fc4ade4 , 0x00003ffc //   log(1/frcpa(1+132/2^-8))/2
420data8 0xd6e6637efb54086f , 0x00003ffc //   log(1/frcpa(1+133/2^-8))/2
421data8 0xd80ad9f58f3c8193 , 0x00003ffc //   log(1/frcpa(1+134/2^-8))/2
422data8 0xd991d1d31aca41f8 , 0x00003ffc //   log(1/frcpa(1+135/2^-8))/2
423//
424data8 0xdab7d02231484a93 , 0x00003ffc //   log(1/frcpa(1+136/2^-8))/2
425data8 0xdc40d532cde49a54 , 0x00003ffc //   log(1/frcpa(1+137/2^-8))/2
426data8 0xdd685f79ed8b265e , 0x00003ffc //   log(1/frcpa(1+138/2^-8))/2
427data8 0xde9094bbc0e17b1d , 0x00003ffc //   log(1/frcpa(1+139/2^-8))/2
428data8 0xe01c91b78440c425 , 0x00003ffc //   log(1/frcpa(1+140/2^-8))/2
429//
430data8 0xe14658f26997e729 , 0x00003ffc //   log(1/frcpa(1+141/2^-8))/2
431data8 0xe270cdc2391e0d23 , 0x00003ffc //   log(1/frcpa(1+142/2^-8))/2
432data8 0xe3ffce3a2aa64922 , 0x00003ffc //   log(1/frcpa(1+143/2^-8))/2
433data8 0xe52bdb274ed82887 , 0x00003ffc //   log(1/frcpa(1+144/2^-8))/2
434data8 0xe6589852e75d7df6 , 0x00003ffc //   log(1/frcpa(1+145/2^-8))/2
435//
436data8 0xe786068c79937a7d , 0x00003ffc //   log(1/frcpa(1+146/2^-8))/2
437data8 0xe91903adad100911 , 0x00003ffc //   log(1/frcpa(1+147/2^-8))/2
438data8 0xea481236f7d35bb0 , 0x00003ffc //   log(1/frcpa(1+148/2^-8))/2
439data8 0xeb77d48c692e6b14 , 0x00003ffc //   log(1/frcpa(1+149/2^-8))/2
440data8 0xeca84b83d7297b87 , 0x00003ffc //   log(1/frcpa(1+150/2^-8))/2
441//
442data8 0xedd977f4962aa158 , 0x00003ffc //   log(1/frcpa(1+151/2^-8))/2
443data8 0xef7179a22f257754 , 0x00003ffc //   log(1/frcpa(1+152/2^-8))/2
444data8 0xf0a450d139366ca7 , 0x00003ffc //   log(1/frcpa(1+153/2^-8))/2
445data8 0xf1d7e0524ff9ffdb , 0x00003ffc //   log(1/frcpa(1+154/2^-8))/2
446data8 0xf30c29036a8b6cae , 0x00003ffc //   log(1/frcpa(1+155/2^-8))/2
447//
448data8 0xf4412bc411ea8d92 , 0x00003ffc //   log(1/frcpa(1+156/2^-8))/2
449data8 0xf576e97564c8619d , 0x00003ffc //   log(1/frcpa(1+157/2^-8))/2
450data8 0xf6ad62fa1b5f172f , 0x00003ffc //   log(1/frcpa(1+158/2^-8))/2
451data8 0xf7e499368b55c542 , 0x00003ffc //   log(1/frcpa(1+159/2^-8))/2
452data8 0xf91c8d10abaffe22 , 0x00003ffc //   log(1/frcpa(1+160/2^-8))/2
453//
454data8 0xfa553f7018c966f3 , 0x00003ffc //   log(1/frcpa(1+161/2^-8))/2
455data8 0xfb8eb13e185d802c , 0x00003ffc //   log(1/frcpa(1+162/2^-8))/2
456data8 0xfcc8e3659d9bcbed , 0x00003ffc //   log(1/frcpa(1+163/2^-8))/2
457data8 0xfe03d6d34d487fd2 , 0x00003ffc //   log(1/frcpa(1+164/2^-8))/2
458data8 0xff3f8c7581e9f0ae , 0x00003ffc //   log(1/frcpa(1+165/2^-8))/2
459//
460data8 0x803e029e280173ae , 0x00003ffd //   log(1/frcpa(1+166/2^-8))/2
461data8 0x80dca10cc52d0757 , 0x00003ffd //   log(1/frcpa(1+167/2^-8))/2
462data8 0x817ba200632755a1 , 0x00003ffd //   log(1/frcpa(1+168/2^-8))/2
463data8 0x821b05f3b01d6774 , 0x00003ffd //   log(1/frcpa(1+169/2^-8))/2
464data8 0x82bacd623ff19d06 , 0x00003ffd //   log(1/frcpa(1+170/2^-8))/2
465//
466data8 0x835af8c88e7a8f47 , 0x00003ffd //   log(1/frcpa(1+171/2^-8))/2
467data8 0x83c5f8299e2b4091 , 0x00003ffd //   log(1/frcpa(1+172/2^-8))/2
468data8 0x8466cb43f3d87300 , 0x00003ffd //   log(1/frcpa(1+173/2^-8))/2
469data8 0x850803a67c80ca4b , 0x00003ffd //   log(1/frcpa(1+174/2^-8))/2
470data8 0x85a9a1d11a23b461 , 0x00003ffd //   log(1/frcpa(1+175/2^-8))/2
471//
472data8 0x864ba644a18e6e05 , 0x00003ffd //   log(1/frcpa(1+176/2^-8))/2
473data8 0x86ee1182dcc432f7 , 0x00003ffd //   log(1/frcpa(1+177/2^-8))/2
474data8 0x875a925d7e48c316 , 0x00003ffd //   log(1/frcpa(1+178/2^-8))/2
475data8 0x87fdaa109d23aef7 , 0x00003ffd //   log(1/frcpa(1+179/2^-8))/2
476data8 0x88a129ed4becfaf2 , 0x00003ffd //   log(1/frcpa(1+180/2^-8))/2
477//
478data8 0x89451278ecd7f9cf , 0x00003ffd //   log(1/frcpa(1+181/2^-8))/2
479data8 0x89b29295f8432617 , 0x00003ffd //   log(1/frcpa(1+182/2^-8))/2
480data8 0x8a572ac5a5496882 , 0x00003ffd //   log(1/frcpa(1+183/2^-8))/2
481data8 0x8afc2d0ce3b2dadf , 0x00003ffd //   log(1/frcpa(1+184/2^-8))/2
482data8 0x8b6a69c608cfd3af , 0x00003ffd //   log(1/frcpa(1+185/2^-8))/2
483//
484data8 0x8c101e106e899a83 , 0x00003ffd //   log(1/frcpa(1+186/2^-8))/2
485data8 0x8cb63de258f9d626 , 0x00003ffd //   log(1/frcpa(1+187/2^-8))/2
486data8 0x8d2539c5bd19e2b1 , 0x00003ffd //   log(1/frcpa(1+188/2^-8))/2
487data8 0x8dcc0e064b29e6f1 , 0x00003ffd //   log(1/frcpa(1+189/2^-8))/2
488data8 0x8e734f45d88357ae , 0x00003ffd //   log(1/frcpa(1+190/2^-8))/2
489//
490data8 0x8ee30cef034a20db , 0x00003ffd //   log(1/frcpa(1+191/2^-8))/2
491data8 0x8f8b0515686d1d06 , 0x00003ffd //   log(1/frcpa(1+192/2^-8))/2
492data8 0x90336bba039bf32f , 0x00003ffd //   log(1/frcpa(1+193/2^-8))/2
493data8 0x90a3edd23d1c9d58 , 0x00003ffd //   log(1/frcpa(1+194/2^-8))/2
494data8 0x914d0de2f5d61b32 , 0x00003ffd //   log(1/frcpa(1+195/2^-8))/2
495//
496data8 0x91be0c20d28173b5 , 0x00003ffd //   log(1/frcpa(1+196/2^-8))/2
497data8 0x9267e737c06cd34a , 0x00003ffd //   log(1/frcpa(1+197/2^-8))/2
498data8 0x92d962ae6abb1237 , 0x00003ffd //   log(1/frcpa(1+198/2^-8))/2
499data8 0x9383fa6afbe2074c , 0x00003ffd //   log(1/frcpa(1+199/2^-8))/2
500data8 0x942f0421651c1c4e , 0x00003ffd //   log(1/frcpa(1+200/2^-8))/2
501//
502data8 0x94a14a3845bb985e , 0x00003ffd //   log(1/frcpa(1+201/2^-8))/2
503data8 0x954d133857f861e7 , 0x00003ffd //   log(1/frcpa(1+202/2^-8))/2
504data8 0x95bfd96468e604c4 , 0x00003ffd //   log(1/frcpa(1+203/2^-8))/2
505data8 0x9632d31cafafa858 , 0x00003ffd //   log(1/frcpa(1+204/2^-8))/2
506data8 0x96dfaabd86fa1647 , 0x00003ffd //   log(1/frcpa(1+205/2^-8))/2
507//
508data8 0x9753261fcbb2a594 , 0x00003ffd //   log(1/frcpa(1+206/2^-8))/2
509data8 0x9800c11b426b996d , 0x00003ffd //   log(1/frcpa(1+207/2^-8))/2
510data8 0x9874bf4d45ae663c , 0x00003ffd //   log(1/frcpa(1+208/2^-8))/2
511data8 0x99231f5ee9a74f79 , 0x00003ffd //   log(1/frcpa(1+209/2^-8))/2
512data8 0x9997a18a56bcad28 , 0x00003ffd //   log(1/frcpa(1+210/2^-8))/2
513//
514data8 0x9a46c873a3267e79 , 0x00003ffd //   log(1/frcpa(1+211/2^-8))/2
515data8 0x9abbcfc621eb6cb6 , 0x00003ffd //   log(1/frcpa(1+212/2^-8))/2
516data8 0x9b310cb0d354c990 , 0x00003ffd //   log(1/frcpa(1+213/2^-8))/2
517data8 0x9be14cf9e1b3515c , 0x00003ffd //   log(1/frcpa(1+214/2^-8))/2
518data8 0x9c5710b8cbb73a43 , 0x00003ffd //   log(1/frcpa(1+215/2^-8))/2
519//
520data8 0x9ccd0abd301f399c , 0x00003ffd //   log(1/frcpa(1+216/2^-8))/2
521data8 0x9d7e67f3bdce8888 , 0x00003ffd //   log(1/frcpa(1+217/2^-8))/2
522data8 0x9df4ea81a99daa01 , 0x00003ffd //   log(1/frcpa(1+218/2^-8))/2
523data8 0x9e6ba405a54514ba , 0x00003ffd //   log(1/frcpa(1+219/2^-8))/2
524data8 0x9f1e21c8c7bb62b3 , 0x00003ffd //   log(1/frcpa(1+220/2^-8))/2
525//
526data8 0x9f956593f6b6355c , 0x00003ffd //   log(1/frcpa(1+221/2^-8))/2
527data8 0xa00ce1092e5498c3 , 0x00003ffd //   log(1/frcpa(1+222/2^-8))/2
528data8 0xa0c08309c4b912c1 , 0x00003ffd //   log(1/frcpa(1+223/2^-8))/2
529data8 0xa1388a8c6faa2afa , 0x00003ffd //   log(1/frcpa(1+224/2^-8))/2
530data8 0xa1b0ca7095b5f985 , 0x00003ffd //   log(1/frcpa(1+225/2^-8))/2
531//
532data8 0xa22942eb47534a00 , 0x00003ffd //   log(1/frcpa(1+226/2^-8))/2
533data8 0xa2de62326449d0a3 , 0x00003ffd //   log(1/frcpa(1+227/2^-8))/2
534data8 0xa357690f88bfe345 , 0x00003ffd //   log(1/frcpa(1+228/2^-8))/2
535data8 0xa3d0a93f45169a4b , 0x00003ffd //   log(1/frcpa(1+229/2^-8))/2
536data8 0xa44a22f7ffe65f30 , 0x00003ffd //   log(1/frcpa(1+230/2^-8))/2
537//
538data8 0xa500c5e5b4c1aa36 , 0x00003ffd //   log(1/frcpa(1+231/2^-8))/2
539data8 0xa57ad064eb2ebbc2 , 0x00003ffd //   log(1/frcpa(1+232/2^-8))/2
540data8 0xa5f5152dedf4384e , 0x00003ffd //   log(1/frcpa(1+233/2^-8))/2
541data8 0xa66f9478856233ec , 0x00003ffd //   log(1/frcpa(1+234/2^-8))/2
542data8 0xa6ea4e7cca02c32e , 0x00003ffd //   log(1/frcpa(1+235/2^-8))/2
543//
544data8 0xa765437325341ccf , 0x00003ffd //   log(1/frcpa(1+236/2^-8))/2
545data8 0xa81e21e6c75b4020 , 0x00003ffd //   log(1/frcpa(1+237/2^-8))/2
546data8 0xa899ab333fe2b9ca , 0x00003ffd //   log(1/frcpa(1+238/2^-8))/2
547data8 0xa9157039c51ebe71 , 0x00003ffd //   log(1/frcpa(1+239/2^-8))/2
548data8 0xa991713433c2b999 , 0x00003ffd //   log(1/frcpa(1+240/2^-8))/2
549//
550data8 0xaa0dae5cbcc048b3 , 0x00003ffd //   log(1/frcpa(1+241/2^-8))/2
551data8 0xaa8a27ede5eb13ad , 0x00003ffd //   log(1/frcpa(1+242/2^-8))/2
552data8 0xab06de228a9e3499 , 0x00003ffd //   log(1/frcpa(1+243/2^-8))/2
553data8 0xab83d135dc633301 , 0x00003ffd //   log(1/frcpa(1+244/2^-8))/2
554data8 0xac3fb076adc7fe7a , 0x00003ffd //   log(1/frcpa(1+245/2^-8))/2
555//
556data8 0xacbd3cbbe47988f1 , 0x00003ffd //   log(1/frcpa(1+246/2^-8))/2
557data8 0xad3b06b1a5dc57c3 , 0x00003ffd //   log(1/frcpa(1+247/2^-8))/2
558data8 0xadb90e94af887717 , 0x00003ffd //   log(1/frcpa(1+248/2^-8))/2
559data8 0xae3754a218f7c816 , 0x00003ffd //   log(1/frcpa(1+249/2^-8))/2
560data8 0xaeb5d9175437afa2 , 0x00003ffd //   log(1/frcpa(1+250/2^-8))/2
561//
562data8 0xaf349c322e9c7cee , 0x00003ffd //   log(1/frcpa(1+251/2^-8))/2
563data8 0xafb39e30d1768d1c , 0x00003ffd //   log(1/frcpa(1+252/2^-8))/2
564data8 0xb032df51c2c93116 , 0x00003ffd //   log(1/frcpa(1+253/2^-8))/2
565data8 0xb0b25fd3e6035ad9 , 0x00003ffd //   log(1/frcpa(1+254/2^-8))/2
566data8 0xb1321ff67cba178c , 0x00003ffd //   log(1/frcpa(1+255/2^-8))/2
567LOCAL_OBJECT_END(atanh_data_3)
568
569
570
571.section .text
572GLOBAL_LIBM_ENTRY(atanh)
573
574{ .mfi
575      getf.exp      rArgSExpb = f8                  // Must recompute if x unorm
576      fclass.m      p13,p0 = f8, 0x0b               // is arg denormal ?
577      mov           rExpbMask = 0x1ffff
578}
579{ .mfi
580      addl          DataPtr = @ltoff(atanh_data), gp
581      fnma.s1       fOneMx = f8, f1, f1             // fOneMx = 1 - x
582      mov           rBias = 0xffff
583}
584;;
585
586{ .mfi
587      mov           rNearZeroBound = 0xfffd         // biased exp of 1/4
588      fclass.m      p12,p0 = f8, 0xc7               // is arg NaN or +/-0 ?
589      nop.i         0
590}
591{ .mfi
592      ld8           DataPtr = [DataPtr]
593      fma.s1        fOnePx = f8, f1, f1             // fOnePx = 1 + x
594      nop.i         0
595}
596;;
597
598{ .mfi
599      nop.m         0
600      fcmp.lt.s1    p10,p11 = f8,f0                 // is x < 0 ?
601      nop.i         0
602}
603{ .mfb
604      nop.m         0
605      fnorm.s1      fNormX = f8                     // Normalize x
606(p13) br.cond.spnt  ATANH_UNORM                     // Branch if x=unorm
607}
608;;
609
610ATANH_COMMON:
611// Return here if x=unorm and not denorm
612{ .mfi
613      adds          Data2Ptr = 0x50, DataPtr
614      fma.s1        fX2 = f8, f8, f0                // x^2
615      nop.i         0
616}
617{ .mfb
618      adds          Data3Ptr = 0xC0, DataPtr
619(p12) fma.d.s0      f8 = f8,f1,f8                   // NaN or +/-0
620(p12) br.ret.spnt   b0                              // Exit for x Nan or zero
621}
622;;
623
624{ .mfi
625      ldfe          fC9 = [Data2Ptr], 16
626(p11) frcpa.s1      fRcp0, p0 = f1, fOneMx
627      nop.i         0
628}
629;;
630
631{ .mfi
632      ldfe          fC8 = [Data2Ptr], 16
633(p10) frcpa.s1      fRcp0n, p0 = f1, fOnePx
634      and           rArgExpb = rArgSExpb, rExpbMask // biased exponent
635}
636{ .mfi
637      nop.m         0
638(p10) fma.s1        fOneMx = fOnePx, f1, f0         // fOnePx = 1 - |x|
639      nop.i         0
640}
641;;
642
643{ .mfi
644      ldfe          fC7 = [Data2Ptr], 16
645(p10) fnma.s1       fOnePx = fNormX, f1, f1         // fOnePx = 1 + |x|
646      cmp.ge        p6,p0 = rArgExpb, rBias         // is Expb(Arg) >= Expb(1) ?
647}
648{ .mfb
649      nop.m         0
650      nop.f         0
651(p6)  br.cond.spnt  atanh_ge_one                    // Branch if |x| >=1.0
652}
653;;
654
655{ .mfi
656      ldfe          fC6 = [Data2Ptr], 16
657      nop.f         0
658      nop.i         0
659}
660;;
661
662{ .mfi
663      ldfe          fC5 = [Data2Ptr], 16
664      fma.s1        fX4 = fX2, fX2, f0              // x^4
665      cmp.gt        p8,p0 = rNearZeroBound, rArgExpb
666}
667{ .mfb
668      ldfe          fC2 = [Data3Ptr], 16
669      fma.s1        fX3 = fX2, fNormX, f0           // x^3
670(p8)  br.cond.spnt  atanh_near_zero                 // Exit if 0 < |x| < 0.25
671}
672;;
673
674// Main path: 0.25 <= |x| < 1.0
675// NR method: iteration #1
676.pred.rel "mutex",p11,p10
677{ .mfi
678      ldfpd         fP5, fP4 = [DataPtr], 16
679(p11) fnma.s1       fRcp1 = fRcp0, fOneMx, f1       // t = 1 - r0*x
680      nop.i         0
681}
682{ .mfi
683      nop.m         0
684(p10) fnma.s1       fRcp1 = fRcp0n, fOneMx, f1      // t = 1 - r0*x
685      nop.i         0
686}
687;;
688
689{ .mfi
690      ldfpd         fP3, fP2 = [DataPtr], 16
691      // r1 = r0 + r0*t = r0 + r0*(1 - r0*x)
692(p11) fma.s1        fRcp1 = fRcp0, fRcp1, fRcp0
693      nop.i         0
694}
695{ .mfi
696      nop.m         0
697      // r1 = r0 + r0*t = r0 + r0*(1 - r0*x)
698(p10) fma.s1        fRcp1 = fRcp0n, fRcp1, fRcp0n
699      nop.i         0
700}
701;;
702
703// NR method: iteration #2
704{ .mfi
705      ldfd          fP1 = [DataPtr], 16
706      fnma.s1       fRcp2 = fRcp1, fOneMx, f1       // t = 1 - r1*x
707      nop.i         0
708}
709;;
710
711{ .mfi
712      ldfe          fLog2 = [DataPtr], 16
713      // r2 = r1 + r1*t = r1 + r1*(1 - r1*x)
714      fma.s1        fRcp2 = fRcp1, fRcp2, fRcp1
715      nop.i         0
716}
717;;
718
719// NR method: iteration #3
720{ .mfi
721      adds          RcpTablePtr = 0xB0, DataPtr
722      fnma.s1       fRcp3 = fRcp2, fOneMx, f1       // t = 1 - r2*x
723      nop.i         0
724}
725{ .mfi
726      nop.m         0
727      fma.s1        fY4Rcp = fRcp2, fOnePx, f0      // fY4Rcp = r2*(1 + x)
728      nop.i         0
729}
730;;
731
732// polynomial approximation & final reconstruction
733{ .mfi
734      nop.m         0
735      frcpa.s1      fRcp, p0 = f1, fY4Rcp
736      nop.i         0
737}
738{ .mfi
739      nop.m         0
740      // y = r2 * (1 + x) + r2 * (1 + x) * t = (1 + x) * (r2 + r2*(1 - r2*x))
741      fma.s1        fY = fY4Rcp, fRcp3, fY4Rcp
742      nop.i         0
743}
744;;
745
746{ .mmi
747      getf.exp      rSExpb = fY4Rcp                 // biased exponent and sign
748;;
749      getf.sig      rSig = fY4Rcp                   // significand
750      nop.i         0
751}
752;;
753
754{ .mfi
755      nop.m         0
756      fms.s1        fR = fY, fRcp, f1               // fR = fY * fRcp - 1
757      nop.i         0
758}
759;;
760
761{ .mmi
762      and           rExpb = rSExpb, rExpbMask
763;;
764      sub           rN = rExpb, rBias               // exponent
765      extr.u        rInd = rSig,55,8                // Extract 8 bits
766}
767;;
768
769{ .mmi
770      setf.sig      fN4Cvt = rN
771      shladd        RcpTablePtr = rInd, 4, RcpTablePtr
772      nop.i         0
773}
774;;
775
776{ .mfi
777      ldfe          fLogT = [RcpTablePtr]
778      fma.s1        fR2 = fR, fR, f0                // r^2
779      nop.i         0
780}
781{
782      nop.m         0
783      fma.s1        fP54 = fP5, fR, fP4             // P5*r + P4
784      nop.i         0
785}
786;;
787
788{ .mfi
789      nop.m         0
790      fma.s1        fP32 = fP3, fR, fP2             // P3*r + P2
791      nop.i         0
792}
793;;
794
795{ .mfi
796      nop.m         0
797      fma.s1        fR3 = fR2, fR, f0               // r^3
798      nop.i         0
799}
800{ .mfi
801      nop.m         0
802      fma.s1        fP10 = fP1, fR2, fR             // P1*r^2 + r
803      nop.i         0
804}
805;;
806
807{ .mfi
808      nop.m         0
809      fcvt.xf       fN = fN4Cvt
810      nop.i         0
811}
812{ .mfi
813      nop.m         0
814      fma.s1        fP54 = fP54, fR2, fP32      // (P5*r + P4)*r^2 + P3*r + P2
815      nop.i         0
816}
817;;
818
819{ .mfi
820      nop.m         0
821      fma.s1        fLogT_N = fN, fLog2, fLogT      // N*Log2 + LogT
822      nop.i         0
823}
824{ .mfi
825      nop.m         0
826      // ((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r
827      fma.s1        fP54 = fP54, fR3, fP10
828      nop.i         0
829}
830;;
831
832.pred.rel "mutex",p11,p10
833{ .mfi
834      nop.m         0
835      // 0.5*(((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r) + 0.5*(N*Log2 + T)
836(p11) fnma.d.s0     f8 = fP54, fP1, fLogT_N
837      nop.i         0
838}
839{ .mfb
840      nop.m         0
841     // -0.5*(((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r) - 0.5*(N*Log2 + T)
842(p10) fms.d.s0      f8 = fP54, fP1, fLogT_N
843      br.ret.sptk   b0                          // Exit for 0.25 <= |x| < 1.0
844}
845;;
846
847// Here if 0 < |x| < 0.25
//
// Near-zero path, reached through the (p8) branch that is taken when
// rNearZeroBound is greater than the biased exponent of the argument.
// The result is  x + x^3 * P(x^2)  where
//   P(t) = C0 + C1*t + C2*t^2 + ... + C9*t^9.
//
// State expected on entry (produced before this label):
//   fNormX         the argument x
//   fX2, fX3, fX4  x^2, x^3, x^4
//   fC9..fC5, fC2  coefficients already loaded or in flight
//   Data2Ptr       the next two loads yield C4 then C3
//   Data3Ptr       the next two loads yield C1 then C0
//
// Evaluation order: adjacent coefficient pairs are combined with x^2, the
// pairs are merged with x^4 and x^8, and the last step multiplies by x^3
// and adds x.  All intermediate fma operations use status field s1, and only
// the final fma.d.s0 writes the return register f8.
848atanh_near_zero:
849{ .mfi
850      ldfe          fC4 = [Data2Ptr], 16           // load C4, post-increment by 16
851      fma.s1        fP98 = fC9, fX2, fC8           // C9*x^2 + C8
852      nop.i         0
853}
854{ .mfi
855      ldfe          fC1 = [Data3Ptr], 16           // load C1, post-increment by 16
856      fma.s1        fP76 = fC7, fX2, fC6           // C7*x^2 + C6
857      nop.i         0
858}
859;;
860
861{ .mfi
862      ldfe          fC3 = [Data2Ptr], 16           // load C3
863      fma.s1        fX8 = fX4, fX4, f0             // x^8
864      nop.i         0
865}
866{ .mfi
867      ldfe          fC0 = [Data3Ptr], 16           // load C0
868      nop.f         0
869      nop.i         0
870}
871;;
872
873{ .mfi
874      nop.m         0
875      fma.s1        fP98 = fP98, fX4, fP76     // C9*x^6 + C8*x^4 + C7*x^2 + C6
876      nop.i         0
877}
878;;
879
880{ .mfi
881      nop.m         0
882      fma.s1        fP54 = fC5, fX2, fC4           // C5*x^2 + C4
883      nop.i         0
884}
885;;
886
887{ .mfi
888      nop.m         0
889      fma.s1        fP32 = fC3, fX2, fC2           // C3*x^2 + C2
890      nop.i         0
891}
892;;
893
894{ .mfi
895      nop.m         0
896      fma.s1        fP10 = fC1, fX2, fC0           // C1*x^2 + C0
897      nop.i         0
898}
899;;
900
901{ .mfi
902      nop.m         0
903      fma.s1        fP54 = fP54, fX4, fP32      // C5*x^6 + C4*x^4 + C3*x^2 + C2
904      nop.i         0
905}
906;;
907
908{ .mfi
909      nop.m         0
      // Merge the upper half (C9..C6) with the middle half (C5..C2) via x^8:
910      // C9*x^14 + C8*x^12 + C7*x^10 + C6*x^8 + C5*x^6 + C4*x^4 + C3*x^2 + C2
911      fma.s1        fP98 = fP98, fX8, fP54
912      nop.i         0
913}
914;;
915
916{ .mfi
917      nop.m         0
      // Append the lowest pair (C1, C0) via x^4, giving the full P(x^2):
918      // C9*x^18 + C8*x^16 + C7*x^14 + C6*x^12 + C5*x^10 + C4*x^8 + C3*x^6 +
919      // C2*x^4 + C1*x^2 + C0
920      fma.s1        fP98 = fP98, fX4, fP10
921      nop.i         0
922}
923;;
924
925{ .mfb
926      nop.m         0
      // Final result in f8 = P(x^2)*x^3 + x, computed with the .d.s0 completers:
927      // C9*x^21 + C8*x^19 + C7*x^17 + C6*x^15 + C5*x^13 + C4*x^11 + C3*x^9 +
928      // C2*x^7 + C1*x^5 + C0*x^3 + x
929      fma.d.s0      f8 = fP98, fX3, fNormX
930      br.ret.sptk   b0                           // Exit for 0 < |x| < 0.25
931}
932;;
933
934ATANH_UNORM:
935// Here if x=unorm
//
// Side path for an unnormal argument.  Two outcomes:
//   * fNormX is not a denormal (p13 set): the exponent word rArgSExpb is
//     refreshed from fNormX and control returns to ATANH_COMMON.
//   * fNormX is a denormal: the result is formed here and the routine
//     returns directly:
//         p10 (negative x):  f8 = x - x*x
//         p11 (positive x):  f8 = x + x*x
// p10 and p11 are set before this label and are declared mutually exclusive
// by the .pred.rel directive below.
936{ .mfi
937      getf.exp      rArgSExpb = fNormX           // Recompute if x unorm
      // p13 is the second (complement) target of fclass.m, so it is set when
      // fNormX does NOT match class mask 0x0b.
938      fclass.m      p0,p13 = fNormX, 0x0b        // Test x denorm
939      nop.i         0
940}
941;;
942
943{ .mfb
944      nop.m         0
      // Compare of the original input f8 in status field s0, issued for its
      // flag side effect.  p7 is not read anywhere in this block.
945      fcmp.eq.s0    p7,p0 = f8, f0        // Dummy to set denormal flag
946(p13) br.cond.sptk  ATANH_COMMON          // Continue if x unorm and not denorm
947}
948;;
949
// Denormal argument: exactly one of the two predicated operations executes.
950.pred.rel "mutex",p10,p11
951{ .mfi
952      nop.m         0
953(p10) fnma.d.s0     f8 = f8,f8,f8                // Result x-x^2 if x=-denorm
954      nop.i         0
955}
956{ .mfb
957      nop.m         0
958(p11) fma.d.s0      f8 = f8,f8,f8                // Result x+x^2 if x=+denorm
959      br.ret.spnt   b0                           // Exit if denorm
960}
961;;
962
963// Here if |x| >= 1.0
//
// Error path, reached through the (p6) branch taken when the biased exponent
// of the argument is at least rBias.  It prepares the inputs consumed by
// __libm_error_region and then branches there:
//   f10           copy of the original argument
//   f8            tentative result (QNaN, or infinity with the sign of x)
//   atanh_GR_tag  131 when |x| != 1.0 (p7), 132 when |x| == 1.0 (p6)
//                 (presumably the code identifying the error to
//                 __libm_error_support -- confirm against that routine)
964atanh_ge_one:
965{ .mfi
966      alloc         r32 = ar.pfs,1,3,4,0    // frame: 1 in, 3 local, 4 out, 0 rotating
967      fmerge.s      fAbsX = f0, f8          // Form |x|
968      nop.i         0
969}
970;;
971
972{ .mfi
973      nop.m         0
974      fmerge.s      f10 = f8, f8            // Save input for error call
975      nop.i         0
976}
977;;
978
979{ .mfi
980      nop.m         0
      // p6 is set when |x| compares equal to 1.0, p7 otherwise.
981      fcmp.eq.s1    p6,p7 = fAbsX, f1       // Test for |x| = 1.0
982      nop.i         0
983}
984;;
985
986// Set error tag and result, and raise invalid flag if |x| > 1.0
987{ .mfi
988(p7)  mov           atanh_GR_tag = 131
989(p7)  frcpa.s0      f8, p0 = f0, f0         // Get QNaN, and raise invalid
990      nop.i         0
991}
992;;
993
994// Set error tag and result, and raise Z flag if |x| = 1.0
995{ .mfi
996      nop.m         0
997(p6)  frcpa.s0      fRcp, p0 = f1, f0       // Get inf, and raise Z flag
998      nop.i         0
999}
1000;;
1001
1002{ .mfb
1003(p6)  mov           atanh_GR_tag = 132
      // f8 still holds x on the p6 path, so the merge takes the sign from x
      // and the exponent and significand from the infinity in fRcp.
1004(p6)  fmerge.s      f8 = f8, fRcp           // result is +-inf
1005      br.cond.sptk  __libm_error_region     // Exit if |x| >= 1.0
1006}
1007;;
1008
1009GLOBAL_LIBM_END(atanh)
1010libm_alias_double_other (atanh, atanh)
1011
1012
1013LOCAL_LIBM_ENTRY(__libm_error_region)
// Common error exit.  Entered by a plain branch (not a call), so b0 still
// holds the return address that was live in the branching routine.
//
// Inputs:  f10 = value stored as Parameter 1
//          f8  = tentative result, stored in the result slot
// Output:  f8  = value reloaded from the result slot after the call
//
// A 64-byte frame is created and three doubles are stored in it.
// Offsets are relative to the new sp:
//   sp+16  Parameter 1 (f10)   addressed by GR_Parameter_X
//   sp+32  Parameter 2 (f1)    addressed by GR_Parameter_Y at the call
//   sp+48  result      (f8)    addressed by GR_Parameter_RESULT
// ar.pfs, gp and b0 are kept in GR_SAVE_PFS, GR_SAVE_GP and GR_SAVE_B0
// across the call and restored before returning.
1014.prologue
1015
1016{ .mfi
1017      add           GR_Parameter_Y=-32,sp        // Parameter 2 address: old sp - 32 (new sp + 32)
1018      nop.f         0
1019.save   ar.pfs,GR_SAVE_PFS
1020      mov           GR_SAVE_PFS=ar.pfs           // Save ar.pfs
1021}
1022{ .mfi
1023.fframe 64
1024      add sp=-64,sp                              // Create new stack
1025      nop.f 0
1026      mov GR_SAVE_GP=gp                          // Save gp
1027};;
1028
1029{ .mmi
      // The post-increment leaves GR_Parameter_Y at sp+48, the result slot.
1030      stfd [GR_Parameter_Y] = f1,16              // STORE Parameter 2 on stack
1031      add GR_Parameter_X = 16,sp                 // Parameter 1 address
1032.save   b0, GR_SAVE_B0
1033      mov GR_SAVE_B0=b0                          // Save b0
1034};;
1035
1036.body
1037{ .mib
1038      stfd [GR_Parameter_X] = f10                // STORE Parameter 1 on stack
1039      add   GR_Parameter_RESULT = 0,GR_Parameter_Y  // Parameter 3 address
1040      nop.b 0
1041}
1042{ .mib
1043      stfd [GR_Parameter_Y] = f8                 // STORE Parameter 3 on stack
1044      add   GR_Parameter_Y = -16,GR_Parameter_Y  // back to the Parameter 2 slot (sp+32)
1045      br.call.sptk b0=__libm_error_support#      // Call error handling function
1046};;
1047
// The result address is recomputed from sp after the call instead of reusing
// the value computed before it.
1048{ .mmi
1049      add   GR_Parameter_RESULT = 48,sp
1050      nop.m 0
1051      nop.i 0
1052};;
1053
1054{ .mmi
1055      ldfd  f8 = [GR_Parameter_RESULT]           // Get return result off stack
1056.restore sp
1057      add   sp = 64,sp                           // Restore stack pointer
1058      mov   b0 = GR_SAVE_B0                      // Restore return address
1059};;
1060
1061{ .mib
1062      mov   gp = GR_SAVE_GP                      // Restore gp
1063      mov   ar.pfs = GR_SAVE_PFS                 // Restore ar.pfs
1064      br.ret.sptk     b0                         // Return
1065};;
1066
1067LOCAL_LIBM_END(__libm_error_region)
1068
1069
1070.type   __libm_error_support#,@function
1071.global __libm_error_support#
1072