1.file "atanl.s"
2
3
4// Copyright (c) 2000 - 2005, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39//
40//*********************************************************************
41//
42// History
43// 02/02/00 (hand-optimized)
44// 04/04/00 Unwind support added
45// 08/15/00 Bundle added after call to __libm_error_support to properly
46//          set [the previously overwritten] GR_Parameter_RESULT.
47// 03/13/01 Fixed flags when denormal raised on intermediate result
48// 01/08/02 Improved speed.
49// 02/06/02 Corrected .section statement
50// 05/20/02 Cleaned up namespace and sf0 syntax
51// 02/10/03 Reordered header: .section, .global, .proc, .align;
52//          used data8 for long double table values
53// 03/31/05 Reformatted delimiters between data tables
54//
55//*********************************************************************
56//
57// Function:   atanl(x) = inverse tangent(x), for double extended x values
58// Function:   atan2l(y,x) = atan(y/x), for double extended y, x values
59//
60// API
61//
62//  long double atanl  (long double x)
63//  long double atan2l (long double y, long double x)
64//
65//*********************************************************************
66//
67// Resources Used:
68//
69//    Floating-Point Registers: f8 (Input and Return Value)
70//                              f9 (Input for atan2l)
71//                              f10-f15, f32-f83
72//
73//    General Purpose Registers:
74//      r32-r51
75//      r49-r52 (Arguments to error support for 0,0 case)
76//
77//    Predicate Registers:      p6-p15
78//
79//*********************************************************************
80//
81// IEEE Special Conditions:
82//
83//    Denormal fault raised on denormal inputs
84//    Underflow exceptions may occur
85//    Special error handling for the y=0 and x=0 case
86//    Inexact raised when appropriate by algorithm
87//
88//    atanl(SNaN) = QNaN
89//    atanl(QNaN) = QNaN
90//    atanl(+/-0) = +/- 0
91//    atanl(+/-Inf) = +/-pi/2
92//
93//    atan2l(Any NaN for x or y) = QNaN
94//    atan2l(+/-0,x) = +/-0 for x > 0
95//    atan2l(+/-0,x) = +/-pi for x < 0
96//    atan2l(+/-0,+0) = +/-0
97//    atan2l(+/-0,-0) = +/-pi
98//    atan2l(y,+/-0) = pi/2 y > 0
99//    atan2l(y,+/-0) = -pi/2 y < 0
100//    atan2l(+/-y, Inf) = +/-0 for finite y > 0
101//    atan2l(+/-Inf, x) = +/-pi/2 for finite x
102//    atan2l(+/-y, -Inf) = +/-pi for finite  y > 0
103//    atan2l(+/-Inf, Inf) = +/-pi/4
104//    atan2l(+/-Inf, -Inf) = +/-3pi/4
105//
106//*********************************************************************
107//
108// Mathematical Description
109// ---------------------------
110//
111// The function ATANL( Arg_Y, Arg_X ) returns the "argument"
112// or the "phase" of the complex number
113//
114//           Arg_X + i Arg_Y
115//
116// or equivalently, the angle in radians from the positive
117// x-axis to the line joining the origin and the point
118// (Arg_X,Arg_Y)
119//
120//
121//        (Arg_X, Arg_Y) x
122//                        \
123//                \
124//                 \
125//                  \
126//                   \ angle between is ATANL(Arg_Y,Arg_X)
127
128
129
130
131//                    \
132//                     ------------------> X-axis
133
134//                   Origin
135//
136// Moreover, this angle is reported in the range [-pi,pi] thus
137//
138//      -pi <= ATANL( Arg_Y, Arg_X ) <= pi.
139//
140// From the geometry, it is easy to define ATANL when one of
141// Arg_X or Arg_Y is +-0 or +-inf:
142//
143//
144//      \ Y |
145//     X \  |  +0  | -0  |  +inf |  -inf  |  finite non-zero
146//        \ |      |     |       |        |
147//    ______________________________________________________
148//          |            |       |        |
149//     +-0  |   Invalid/ |  pi/2 | -pi/2  |  sign(Y)*pi/2
150//          |    qNaN    |       |        |
151//  --------------------------------------------------------
152//          |      |     |       |        |
153//     +inf |  +0  | -0  |  pi/4 | -pi/4  |  sign(Y)*0
154//  --------------------------------------------------------
155//          |      |     |       |        |
156//     -inf |  +pi | -pi | 3pi/4 | -3pi/4 |  sign(Y)*pi
157//  --------------------------------------------------------
158//   finite |    X>0?    |  pi/2 | -pi/2  |  normal case
159//  non-zero| sign(Y)*0: |       |        |
160//       | sign(Y)*pi |       |        |
161//
162//
163// One must take note that ATANL is NOT the arctangent of the
164// value Arg_Y/Arg_X; but rather ATANL and arctan are related
165// in a slightly more complicated way as follows:
166//
167// Let U := max(|Arg_X|, |Arg_Y|);  V := min(|Arg_X|, |Arg_Y|);
168// sign_X be the sign bit of Arg_X, i.e., sign_X is 0 or 1;
169// s_X    be the sign     of Arg_X, i.e., s_X = (-1)^sign_X;
170//
171// sign_Y be the sign bit of Arg_Y, i.e., sign_Y is 0 or 1;
172// s_Y    be the sign     of Arg_Y, i.e., s_Y = (-1)^sign_Y;
173//
174// swap   be 0  if |Arg_X| >= |Arg_Y|  and 1 otherwise.
175//
176// Then, ATANL(Arg_Y, Arg_X) =
177//
178//       /    arctan(V/U)     \      sign_X = 0 & swap = 0
179//       | pi/2 - arctan(V/U) |      sign_X = 0 & swap = 1
180// s_Y * |                    |
181//       |  pi  - arctan(V/U) |      sign_X = 1 & swap = 0
182//       \ pi/2 + arctan(V/U) /      sign_X = 1 & swap = 1
183//
184//
185// This relationship also suggest that the algorithm's major
186// task is to calculate arctan(V/U) for 0 < V <= U; and the
187// final Result is given by
188//
189//      s_Y * { (P_hi + P_lo) + sigma * arctan(V/U) }
190//
191// where
192//
193//   (P_hi,P_lo) represents M(sign_X,swap)*(pi/2) accurately
194//
195//   M(sign_X,swap) = 0  for sign_X = 0 and swap = 0
196//              1  for swap   = 1
197//              2  for sign_X = 1 and swap = 0
198//
199// and
200//
201//   sigma = { (sign_X  XOR  swap) :  -1.0 : 1.0 }
202//
203//      =  (-1) ^ ( sign_X XOR swap )
204//
205// Both (P_hi,P_lo) and sigma can be stored in a table and fetched
206// using (sign_X,swap) as an index. (P_hi, P_lo) can be stored as a
207// double-precision, and single-precision pair; and sigma can
208// obviously be just a single-precision number.
209//
210// In the algorithm we propose, arctan(V/U) is calculated to high accuracy
211// as A_hi + A_lo. Consequently, the Result ATANL( Arg_Y, Arg_X ) is
212// given by
213//
214//    s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
215//
216// We now discuss the calculation of arctan(V/U) for 0 < V <= U.
217//
218// For (V/U) < 2^(-3), we use a simple polynomial of the form
219//
220//      z + z^3*(P_1 + z^2*(P_2 + z^2*(P_3 + ... + P_8)))
221//
222// where z = V/U.
223//
224// For the sake of accuracy, the first term "z" must approximate V/U to
225// extra precision. For z^3 and higher power, a working precision
226// approximation to V/U suffices. Thus, we obtain:
227//
228//      z_hi + z_lo = V/U  to extra precision and
229//      z           = V/U  to working precision
230//
231// The value arctan(V/U) is delivered as two pieces (A_hi, A_lo)
232//
233//      (A_hi,A_lo) = (z_hi, z^3*(P_1 + ... + P_8) + z_lo).
234//
235//
236// For 2^(-3) <= (V/U) <= 1, we use a table-driven approach.
237// Consider
238//
239//      (V/U) = 2^k * 1.b_1 b_2 .... b_63 b_64 b_65 ....
240//
241// Define
242//
243//       z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1
244//
245// then
246//                                            /                \
247//                                            |  (V/U) - z_hi  |
248
249//      arctan(V/U) = arctan(z_hi) + acrtan| -------------- |
250//                                            | 1 + (V/U)*z_hi |
251//                                            \                /
252//
253//                                            /                \
254//                                            |   V - z_hi*U   |
255
256//                  = arctan(z_hi) + acrtan| -------------- |
257//                                            |   U + V*z_hi   |
258//                                            \                /
259//
260//                  = arctan(z_hi) + acrtan( V' / U' )
261//
262//
263// where
264//
265//      V' = V - U*z_hi;   U' = U + V*z_hi.
266//
267// Let
268//
269//      w_hi + w_lo  = V'/U' to extra precision and
270//           w       = V'/U' to working precision
271//
272// then we can approximate arctan(V'/U') by
273//
274//      arctan(V'/U') = w_hi + w_lo
275//                     + w^3*(Q_1 + w^2*(Q_2 + w^2*(Q_3 + w^2*Q_4)))
276//
277//                       = w_hi + w_lo + poly
278//
279// Finally, arctan(z_hi) is calculated beforehand and stored in a table
280// as Tbl_hi, Tbl_lo. Thus,
281//
282//      (A_hi, A_lo) = (Tbl_hi, w_hi+(poly+(w_lo+Tbl_lo)))
283//
284// This completes the mathematical description.
285//
286//
287// Algorithm
288// -------------
289//
290// Step 0. Check for unsupported format.
291//
292//    If
293//       ( expo(Arg_X) not zero AND msb(Arg_X) = 0 ) OR
294//       ( expo(Arg_Y) not zero AND msb(Arg_Y) = 0 )
295//
296//    then one of the arguments is unsupported. Generate an
297//    invalid and return qNaN.
298//
299// Step 1. Initialize
300//
301//    Normalize Arg_X and Arg_Y and set the following
302//
303//    sign_X :=  sign_bit(Arg_X)
304//    s_Y    := (sign_bit(Arg_Y)==0? 1.0 : -1.0)
305//    swap   := (|Arg_X| >= |Arg_Y|?   0 :  1  )
306//    U      := max( |Arg_X|, |Arg_Y| )
307//    V      := min( |Arg_X|, |Arg_Y| )
308//
309//    execute: frcpa E, pred, V, U
310//    If pred is 0, go to Step 5 for special cases handling.
311//
312// Step 2. Decide on branch.
313//
314//    Q := E * V
315//    If Q < 2^(-3) go to Step 4 for simple polynomial case.
316//
317// Step 3. Table-driven algorithm.
318//
319//    Q is represented as
320//
321//      2^(-k) * 1.b_1 b_2 b_3 ... b_63; k = 0,-1,-2,-3
322//
323// and that if k = 0, b_1 = b_2 = b_3 = b_4 = 0.
324//
325// Define
326//
327//      z_hi := 2^(-k) * 1.b_1 b_2 b_3 b_4 1
328//
329// (note that there are 49 possible values of z_hi).
330//
331//      ...We now calculate V' and U'. While V' is representable
332//      ...as a 64-bit number because of cancellation, U' is
333//      ...not in general a 64-bit number. Obtaining U' accurately
334//      ...requires two working precision numbers
335//
336//      U_prime_hi := U + V * z_hi            ...WP approx. to U'
337//      U_prime_lo := ( U - U_prime_hi ) + V*z_hi ...observe order
338//      V_prime    := V - U * z_hi             ...this is exact
339//
340//         C_hi := frcpa (1.0, U_prime_hi)  ...C_hi approx 1/U'_hi
341//
342//      loop 3 times
343//         C_hi := C_hi + C_hi*(1.0 - C_hi*U_prime_hi)
344//
345//      ...at this point C_hi is (1/U_prime_hi) to roughly 64 bits
346//
347//      w_hi := V_prime * C_hi     ...w_hi is V_prime/U_prime to
348//                     ...roughly working precision
349//
350//         ...note that we want w_hi + w_lo to approximate
351//      ...V_prime/(U_prime_hi + U_prime_lo) to extra precision
352//         ...but for now, w_hi is good enough for the polynomial
353//      ...calculation.
354//
355//         wsq  := w_hi*w_hi
356//      poly := w_hi*wsq*(Q_1 + wsq*(Q_2 + wsq*(Q_3 + wsq*Q_4)))
357//
358//      Fetch
359//      (Tbl_hi, Tbl_lo) = atan(z_hi) indexed by (k,b_1,b_2,b_3,b_4)
360//      ...Tbl_hi is a double-precision number
361//      ...Tbl_lo is a single-precision number
362//
363//         (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo)
364//      ...as discussed previous. Again; the implementation can
365//      ...chose to fetch P_hi and P_lo from a table indexed by
366//      ...(sign_X, swap).
367//      ...P_hi is a double-precision number;
368//      ...P_lo is a single-precision number.
369//
370//      ...calculate w_lo so that w_hi + w_lo is V'/U' accurately
371//         w_lo := ((V_prime - w_hi*U_prime_hi) -
372//              w_hi*U_prime_lo) * C_hi     ...observe order
373//
374//
375//      ...Ready to deliver arctan(V'/U') as A_hi, A_lo
376//      A_hi := Tbl_hi
377//      A_lo := w_hi + (poly + (Tbl_lo + w_lo)) ...observe order
378//
379//      ...Deliver final Result
380//      ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
381//
382//      sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 )
383//      ...sigma can be obtained by a table lookup using
384//      ...(sign_X,swap) as index and stored as single precision
385//         ...sigma should be calculated earlier
386//
387//      P_hi := s_Y*P_hi
388//      A_hi := s_Y*A_hi
389//
390//      Res_hi := P_hi + sigma*A_hi     ...this is exact because
391//                          ...both P_hi and Tbl_hi
392//                          ...are double-precision
393//                          ...and |Tbl_hi| > 2^(-4)
394//                          ...P_hi is either 0 or
395//                          ...between (1,4)
396//
397//      Res_lo := sigma*A_lo + P_lo
398//
399//      Return Res_hi + s_Y*Res_lo in user-defined rounding control
400//
401// Step 4. Simple polynomial case.
402//
403//    ...E and Q are inherited from Step 2.
404//
405//    A_hi := Q     ...Q is inherited from Step 2 Q approx V/U
406//
407//    loop 3 times
408//       E := E + E2(1.0 - E*U1
409//    ...at this point E approximates 1/U to roughly working precision
410//
411//    z := V * E     ...z approximates V/U to roughly working precision
412//    zsq := z * z
413//    z4 := zsq * zsq; z8 := z4 * z4
414//
415//    poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
416//    poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3))
417//
418//    poly  := poly1 + z8*poly2
419//
420//    z_lo := (V - A_hi*U)*E
421//
422//    A_lo := z*poly + z_lo
423//    ...A_hi, A_lo approximate arctan(V/U) accurately
424//
425//    (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo)
426//    ...one can store the M(sign_X,swap) as single precision
427//    ...values
428//
429//    ...Deliver final Result
430//    ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
431//
432//    sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 )
433//    ...sigma can be obtained by a table lookup using
434//    ...(sign_X,swap) as index and stored as single precision
435//    ...sigma should be calculated earlier
436//
437//    P_hi := s_Y*P_hi
438//    A_hi := s_Y*A_hi
439//
440//    Res_hi := P_hi + sigma*A_hi          ...need to compute
441//                          ...P_hi + sigma*A_hi
442//                          ...exactly
443//
444//    tmp    := (P_hi - Res_hi) + sigma*A_hi
445//
446//    Res_lo := s_Y*(sigma*A_lo + P_lo) + tmp
447//
448//    Return Res_hi + Res_lo in user-defined rounding control
449//
450// Step 5. Special Cases
451//
452//    These are detected early in the function by fclass instructions.
453//
454//    We are in one of those special cases when X or Y is 0,+-inf or NaN
455//
456//    If one of X and Y is NaN, return X+Y (which will generate
457//    invalid in case one is a signaling NaN). Otherwise,
458//    return the Result as described in the table
459//
460//
461//
462//      \ Y |
463//     X \  |  +0  | -0  |  +inf |  -inf  |  finite non-zero
464//        \ |      |     |       |        |
465//    ______________________________________________________
466//          |            |       |        |
467//     +-0  |   Invalid/ |  pi/2 | -pi/2  |  sign(Y)*pi/2
468//          |    qNaN    |       |        |
469//  --------------------------------------------------------
470//          |      |     |       |        |
471//     +inf |  +0  | -0  |  pi/4 | -pi/4  |  sign(Y)*0
472//  --------------------------------------------------------
473//          |      |     |       |        |
474//     -inf |  +pi | -pi | 3pi/4 | -3pi/4 |  sign(Y)*pi
475//  --------------------------------------------------------
476//   finite |    X>0?    |  pi/2 | -pi/2  |
477//  non-zero| sign(Y)*0: |       |        |      N/A
478//       | sign(Y)*pi |       |        |
479//
480//
481
482ArgY_orig   =   f8
483Result      =   f8
484FR_RESULT   =   f8
485ArgX_orig   =   f9
486ArgX        =   f10
487FR_X        =   f10
488ArgY        =   f11
489FR_Y        =   f11
490s_Y         =   f12
491U           =   f13
492V           =   f14
493E           =   f15
494Q           =   f32
495z_hi        =   f33
496U_prime_hi  =   f34
497U_prime_lo  =   f35
498V_prime     =   f36
499C_hi        =   f37
500w_hi        =   f38
501w_lo        =   f39
502wsq         =   f40
503poly        =   f41
504Tbl_hi      =   f42
505Tbl_lo      =   f43
506P_hi        =   f44
507P_lo        =   f45
508A_hi        =   f46
509A_lo        =   f47
510sigma       =   f48
511Res_hi      =   f49
512Res_lo      =   f50
513Z           =   f52
514zsq         =   f53
515z4          =   f54
516z8          =   f54
517poly1       =   f55
518poly2       =   f56
519z_lo        =   f57
520tmp         =   f58
521P_1         =   f59
522Q_1         =   f60
523P_2         =   f61
524Q_2         =   f62
525P_3         =   f63
526Q_3         =   f64
527P_4         =   f65
528Q_4         =   f66
529P_5         =   f67
530P_6         =   f68
531P_7         =   f69
532P_8         =   f70
533U_hold      =   f71
534TWO_TO_NEG3 =   f72
535C_hi_hold   =   f73
536E_hold      =   f74
537M           =   f75
538ArgX_abs    =   f76
539ArgY_abs    =   f77
540Result_lo   =   f78
541A_temp      =   f79
542FR_temp     =   f80
543Xsq         =   f81
544Ysq         =   f82
545tmp_small   =   f83
546
547GR_SAVE_PFS   = r33
548GR_SAVE_B0    = r34
549GR_SAVE_GP    = r35
550sign_X        = r36
551sign_Y        = r37
552swap          = r38
553table_ptr1    = r39
554table_ptr2    = r40
555k             = r41
556lookup        = r42
557exp_ArgX      = r43
558exp_ArgY      = r44
559exponent_Q    = r45
560significand_Q = r46
561special       = r47
562sp_exp_Q      = r48
563sp_exp_4sig_Q = r49
564table_base    = r50
565int_temp      = r51
566
567GR_Parameter_X      = r49
568GR_Parameter_Y      = r50
569GR_Parameter_RESULT = r51
570GR_Parameter_TAG    = r52
571GR_temp             = r52
572
573RODATA
574.align 16
575
576LOCAL_OBJECT_START(Constants_atan)
577//       double pi/2
578data8 0x3FF921FB54442D18
579//       single lo_pi/2, two**(-3)
580data4 0x248D3132, 0x3E000000
581data8 0xAAAAAAAAAAAAAAA3, 0xBFFD // P_1
582data8 0xCCCCCCCCCCCC54B2, 0x3FFC // P_2
583data8 0x9249249247E4D0C2, 0xBFFC // P_3
584data8 0xE38E38E058870889, 0x3FFB // P_4
585data8 0xBA2E895B290149F8, 0xBFFB // P_5
586data8 0x9D88E6D4250F733D, 0x3FFB // P_6
587data8 0x884E51FFFB8745A0, 0xBFFB // P_7
588data8 0xE1C7412B394396BD, 0x3FFA // P_8
589data8 0xAAAAAAAAAAAAA52F, 0xBFFD // Q_1
590data8 0xCCCCCCCCC75B60D3, 0x3FFC // Q_2
591data8 0x924923AD011F1940, 0xBFFC // Q_3
592data8 0xE36F716D2A5F89BD, 0x3FFB // Q_4
593//
594//    Entries Tbl_hi  (double precision)
595//    B = 1+Index/16+1/32  Index = 0
596//    Entries Tbl_lo (single precision)
597//    B = 1+Index/16+1/32  Index = 0
598//
599data8 0x3FE9A000A935BD8E
600data4 0x23ACA08F, 0x00000000
601//
602//    Entries Tbl_hi  (double precision) Index = 0,1,...,15
603//    B = 2^(-1)*(1+Index/16+1/32)
604//    Entries Tbl_lo (single precision)
605//    Index = 0,1,...,15  B = 2^(-1)*(1+Index/16+1/32)
606//
607data8 0x3FDE77EB7F175A34
608data4 0x238729EE, 0x00000000
609data8 0x3FE0039C73C1A40B
610data4 0x249334DB, 0x00000000
611data8 0x3FE0C6145B5B43DA
612data4 0x22CBA7D1, 0x00000000
613data8 0x3FE1835A88BE7C13
614data4 0x246310E7, 0x00000000
615data8 0x3FE23B71E2CC9E6A
616data4 0x236210E5, 0x00000000
617data8 0x3FE2EE628406CBCA
618data4 0x2462EAF5, 0x00000000
619data8 0x3FE39C391CD41719
620data4 0x24B73EF3, 0x00000000
621data8 0x3FE445065B795B55
622data4 0x24C11260, 0x00000000
623data8 0x3FE4E8DE5BB6EC04
624data4 0x242519EE, 0x00000000
625data8 0x3FE587D81F732FBA
626data4 0x24D4346C, 0x00000000
627data8 0x3FE6220D115D7B8D
628data4 0x24ED487B, 0x00000000
629data8 0x3FE6B798920B3D98
630data4 0x2495FF1E, 0x00000000
631data8 0x3FE748978FBA8E0F
632data4 0x223D9531, 0x00000000
633data8 0x3FE7D528289FA093
634data4 0x242B0411, 0x00000000
635data8 0x3FE85D69576CC2C5
636data4 0x2335B374, 0x00000000
637data8 0x3FE8E17AA99CC05D
638data4 0x24C27CFB, 0x00000000
639//
640//    Entries Tbl_hi  (double precision) Index = 0,1,...,15
641//    B = 2^(-2)*(1+Index/16+1/32)
642//    Entries Tbl_lo (single precision)
643//    Index = 0,1,...,15  B = 2^(-2)*(1+Index/16+1/32)
644//
645data8 0x3FD025FA510665B5
646data4 0x24263482, 0x00000000
647data8 0x3FD1151A362431C9
648data4 0x242C8DC9, 0x00000000
649data8 0x3FD2025567E47C95
650data4 0x245CF9BA, 0x00000000
651data8 0x3FD2ED987A823CFE
652data4 0x235C892C, 0x00000000
653data8 0x3FD3D6D129271134
654data4 0x2389BE52, 0x00000000
655data8 0x3FD4BDEE586890E6
656data4 0x24436471, 0x00000000
657data8 0x3FD5A2E0175E0F4E
658data4 0x2389DBD4, 0x00000000
659data8 0x3FD685979F5FA6FD
660data4 0x2476D43F, 0x00000000
661data8 0x3FD7660752817501
662data4 0x24711774, 0x00000000
663data8 0x3FD84422B8DF95D7
664data4 0x23EBB501, 0x00000000
665data8 0x3FD91FDE7CD0C662
666data4 0x23883A0C, 0x00000000
667data8 0x3FD9F93066168001
668data4 0x240DF63F, 0x00000000
669data8 0x3FDAD00F5422058B
670data4 0x23FE261A, 0x00000000
671data8 0x3FDBA473378624A5
672data4 0x23A8CD0E, 0x00000000
673data8 0x3FDC76550AAD71F8
674data4 0x2422D1D0, 0x00000000
675data8 0x3FDD45AEC9EC862B
676data4 0x2344A109, 0x00000000
677//
678//    Entries Tbl_hi  (double precision) Index = 0,1,...,15
679//    B = 2^(-3)*(1+Index/16+1/32)
680//    Entries Tbl_lo (single precision)
681//    Index = 0,1,...,15  B = 2^(-3)*(1+Index/16+1/32)
682//
683data8 0x3FC068D584212B3D
684data4 0x239874B6, 0x00000000
685data8 0x3FC1646541060850
686data4 0x2335E774, 0x00000000
687data8 0x3FC25F6E171A535C
688data4 0x233E36BE, 0x00000000
689data8 0x3FC359E8EDEB99A3
690data4 0x239680A3, 0x00000000
691data8 0x3FC453CEC6092A9E
692data4 0x230FB29E, 0x00000000
693data8 0x3FC54D18BA11570A
694data4 0x230C1418, 0x00000000
695data8 0x3FC645BFFFB3AA73
696data4 0x23F0564A, 0x00000000
697data8 0x3FC73DBDE8A7D201
698data4 0x23D4A5E1, 0x00000000
699data8 0x3FC8350BE398EBC7
700data4 0x23D4ADDA, 0x00000000
701data8 0x3FC92BA37D050271
702data4 0x23BCB085, 0x00000000
703data8 0x3FCA217E601081A5
704data4 0x23BC841D, 0x00000000
705data8 0x3FCB1696574D780B
706data4 0x23CF4A8E, 0x00000000
707data8 0x3FCC0AE54D768466
708data4 0x23BECC90, 0x00000000
709data8 0x3FCCFE654E1D5395
710data4 0x2323DCD2, 0x00000000
711data8 0x3FCDF110864C9D9D
712data4 0x23F53F3A, 0x00000000
713data8 0x3FCEE2E1451D980C
714data4 0x23CCB11F, 0x00000000
715//
716data8 0x400921FB54442D18, 0x3CA1A62633145C07 // PI two doubles
717data8 0x3FF921FB54442D18, 0x3C91A62633145C07 // PI_by_2 two dbles
718data8 0x3FE921FB54442D18, 0x3C81A62633145C07 // PI_by_4 two dbles
719data8 0x4002D97C7F3321D2, 0x3C9A79394C9E8A0A // 3PI_by_4 two dbles
720LOCAL_OBJECT_END(Constants_atan)
721
722
723.section .text
724GLOBAL_IEEE754_ENTRY(atanl)
725
726// Use common code with atan2l after setting x=1.0
727{ .mfi
728      alloc r32 = ar.pfs, 0, 17, 4, 0
729      fma.s1 Ysq = ArgY_orig, ArgY_orig, f0          // Form y*y
730      nop.i 999
731}
732{ .mfi
733      addl table_ptr1 = @ltoff(Constants_atan#), gp  // Address of table pointer
734      fma.s1 Xsq = f1, f1, f0                        // Form x*x
735      nop.i 999
736}
737;;
738
739{ .mfi
740      ld8 table_ptr1 = [table_ptr1]                  // Get table pointer
741      fnorm.s1 ArgY = ArgY_orig
742      nop.i 999
743}
744{ .mfi
745      nop.m 999
746      fnorm.s1 ArgX = f1
747      nop.i 999
748}
749;;
750
751{ .mfi
752      getf.exp sign_X = f1               // Get signexp of x
753      fmerge.s ArgX_abs = f0, f1         // Form |x|
754      nop.i 999
755}
756{ .mfi
757      nop.m 999
758      fnorm.s1 ArgX_orig = f1
759      nop.i 999
760}
761;;
762
763{ .mfi
764      getf.exp sign_Y = ArgY_orig        // Get signexp of y
765      fmerge.s ArgY_abs = f0, ArgY_orig  // Form |y|
766      mov table_base = table_ptr1        // Save base pointer to tables
767}
768;;
769
770{ .mfi
771      ldfd P_hi = [table_ptr1],8         // Load double precision hi part of pi
772      fclass.m p8,p0 = ArgY_orig, 0x1e7  // Test y natval, nan, inf, zero
773      nop.i 999
774}
775;;
776
777{ .mfi
778      ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
779      nop.f 999
780      nop.i 999
781}
782{ .mfi
783      nop.m 999
784      fma.s1 M = f1, f1, f0              // Set M = 1.0
785      nop.i 999
786}
787;;
788
789//
790//     Check for everything - if false, then must be pseudo-zero
791//     or pseudo-nan (IA unsupporteds).
792//
793{ .mfb
794      nop.m 999
795      fclass.m p0,p12 = f1, 0x1FF        // Test x unsupported
796(p8)  br.cond.spnt ATANL_Y_SPECIAL       // Branch if y natval, nan, inf, zero
797}
798;;
799
800//     U = max(ArgX_abs,ArgY_abs)
801//     V = min(ArgX_abs,ArgY_abs)
802{ .mfi
803      nop.m 999
804      fcmp.ge.s1 p6,p7 = Xsq, Ysq        // Test for |x| >= |y| using squares
805      nop.i 999
806}
807{ .mfb
808      nop.m 999
809      fma.s1 V = ArgX_abs, f1, f0        // Set V assuming |x| < |y|
810      br.cond.sptk ATANL_COMMON          // Branch to common code
811}
812;;
813
814GLOBAL_IEEE754_END(atanl)
815libm_alias_ldouble_other (__atan, atan)
816
817GLOBAL_IEEE754_ENTRY(atan2l)
818
819{ .mfi
820      alloc r32 = ar.pfs, 0, 17, 4, 0
821      fma.s1 Ysq = ArgY_orig, ArgY_orig, f0          // Form y*y
822      nop.i 999
823}
824{ .mfi
825      addl table_ptr1 = @ltoff(Constants_atan#), gp  // Address of table pointer
826      fma.s1 Xsq = ArgX_orig, ArgX_orig, f0          // Form x*x
827      nop.i 999
828}
829;;
830
831{ .mfi
832      ld8 table_ptr1 = [table_ptr1]                  // Get table pointer
833      fnorm.s1 ArgY = ArgY_orig
834      nop.i 999
835}
836{ .mfi
837      nop.m 999
838      fnorm.s1 ArgX = ArgX_orig
839      nop.i 999
840}
841;;
842
843{ .mfi
844      getf.exp sign_X = ArgX_orig        // Get signexp of x
845      fmerge.s ArgX_abs = f0, ArgX_orig  // Form |x|
846      nop.i 999
847}
848;;
849
850{ .mfi
851      getf.exp sign_Y = ArgY_orig        // Get signexp of y
852      fmerge.s ArgY_abs = f0, ArgY_orig  // Form |y|
853      mov table_base = table_ptr1        // Save base pointer to tables
854}
855;;
856
857{ .mfi
858      ldfd P_hi = [table_ptr1],8         // Load double precision hi part of pi
859      fclass.m p8,p0 = ArgY_orig, 0x1e7  // Test y natval, nan, inf, zero
860      nop.i 999
861}
862;;
863
864{ .mfi
865      ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
866      fclass.m p9,p0 = ArgX_orig, 0x1e7  // Test x natval, nan, inf, zero
867      nop.i 999
868}
869{ .mfi
870      nop.m 999
871      fma.s1 M = f1, f1, f0              // Set M = 1.0
872      nop.i 999
873}
874;;
875
876//
877//     Check for everything - if false, then must be pseudo-zero
878//     or pseudo-nan (IA unsupporteds).
879//
880{ .mfb
881      nop.m 999
882      fclass.m p0,p12 = ArgX_orig, 0x1FF // Test x unsupported
883(p8)  br.cond.spnt ATANL_Y_SPECIAL       // Branch if y natval, nan, inf, zero
884}
885;;
886
887//     U = max(ArgX_abs,ArgY_abs)
888//     V = min(ArgX_abs,ArgY_abs)
889{ .mfi
890      nop.m 999
891      fcmp.ge.s1 p6,p7 = Xsq, Ysq        // Test for |x| >= |y| using squares
892      nop.i 999
893}
894{ .mfb
895      nop.m 999
896      fma.s1 V = ArgX_abs, f1, f0        // Set V assuming |x| < |y|
897(p9)  br.cond.spnt ATANL_X_SPECIAL       // Branch if x natval, nan, inf, zero
898}
899;;
900
901// Now common code for atanl and atan2l
902ATANL_COMMON:
903{ .mfi
904      nop.m 999
905      fclass.m p0,p13 = ArgY_orig, 0x1FF // Test y unsupported
906      shr sign_X = sign_X, 17            // Get sign bit of x
907}
908{ .mfi
909      nop.m 999
910      fma.s1 U = ArgY_abs, f1, f0        // Set U assuming |x| < |y|
911      adds table_ptr1 = 176, table_ptr1  // Point to Q4
912}
913;;
914
915{ .mfi
916(p6)  add swap = r0, r0                  // Set swap=0 if |x| >= |y|
917(p6)  frcpa.s1 E, p0 = ArgY_abs, ArgX_abs // Compute E if |x| >= |y|
918      shr sign_Y = sign_Y, 17            // Get sign bit of y
919}
920{ .mfb
921      nop.m 999
922(p6)  fma.s1 V = ArgY_abs, f1, f0        // Set V if |x| >= |y|
923(p12) br.cond.spnt ATANL_UNSUPPORTED     // Branch if x unsupported
924}
925;;
926
927// Set p8 if y >=0
928// Set p9 if y < 0
929// Set p10 if |x| >= |y| and x >=0
930// Set p11 if |x| >= |y| and x < 0
931{ .mfi
932      cmp.eq p8, p9 = 0, sign_Y          // Test for y >= 0
933(p7)  frcpa.s1 E, p0 = ArgX_abs, ArgY_abs // Compute E if |x| < |y|
934(p7)  add swap = 1, r0                   // Set swap=1 if |x| < |y|
935}
936{ .mfb
937(p6)  cmp.eq.unc p10, p11 = 0, sign_X    // If |x| >= |y|, test for x >= 0
938(p6)  fma.s1 U = ArgX_abs, f1, f0        // Set U if |x| >= |y|
939(p13) br.cond.spnt ATANL_UNSUPPORTED     // Branch if y unsupported
940}
941;;
942
943//
944//     if p8, s_Y = 1.0
945//     if p9, s_Y = -1.0
946//
947.pred.rel "mutex",p8,p9
948{ .mfi
949      nop.m 999
950(p8)  fadd.s1 s_Y = f0, f1               // If y >= 0 set s_Y = 1.0
951      nop.i 999
952}
953{ .mfi
954      nop.m 999
955(p9)  fsub.s1 s_Y = f0, f1               // If y < 0 set s_Y = -1.0
956      nop.i 999
957}
958;;
959
960.pred.rel "mutex",p10,p11
961{ .mfi
962      nop.m 999
963(p10) fsub.s1 M = M, f1                  // If |x| >= |y| and x >=0, set M=0
964      nop.i 999
965}
966{ .mfi
967      nop.m 999
968(p11) fadd.s1 M = M, f1                  // If |x| >= |y| and x < 0, set M=2.0
969      nop.i 999
970}
971;;
972
973{ .mfi
974      nop.m 999
975      fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag
976      nop.i 999
977}
978// *************************************************
979// ********************* STEP2 *********************
980// *************************************************
981//
982//     Q = E * V
983//
984{ .mfi
985      nop.m 999
986      fmpy.s1 Q = E, V
987      nop.i 999
988}
989;;
990
991{ .mfi
992      nop.m 999
993      fnma.s1 E_hold = E, U, f1           // E_hold = 1.0 - E*U (1) if POLY path
994      nop.i 999
995}
996;;
997
998// Create a single precision representation of the signexp of Q with the
999// 4 most significant bits of the significand followed by a 1 and then 18 0's
1000{ .mfi
1001      nop.m 999
1002      fmpy.s1 P_hi = M, P_hi
1003      dep.z special = 0x1, 18, 1           // Form 0x0000000000040000
1004}
1005{ .mfi
1006      nop.m 999
1007      fmpy.s1 P_lo = M, P_lo
1008      add table_ptr2 = 32, table_ptr1
1009}
1010;;
1011
1012{ .mfi
1013      nop.m 999
1014      fma.s1 A_temp = Q, f1, f0            // Set A_temp if POLY path
1015      nop.i 999
1016}
1017{ .mfi
1018      nop.m 999
1019      fma.s1 E = E, E_hold, E              // E = E + E*E_hold (1) if POLY path
1020      nop.i 999
1021}
1022;;
1023
1024//
1025//     Is Q < 2**(-3)?
1026//     swap = xor(swap,sign_X)
1027//
1028{ .mfi
1029      nop.m 999
1030      fcmp.lt.s1 p9, p0 = Q, TWO_TO_NEG3    // Test Q < 2^-3
1031      xor swap = sign_X, swap
1032}
1033;;
1034
1035//     P_hi = s_Y * P_hi
1036{ .mmf
1037      getf.exp exponent_Q =  Q              // Get signexp of Q
1038      cmp.eq.unc p7, p6 = 0x00000, swap
1039      fmpy.s1 P_hi = s_Y, P_hi
1040}
1041;;
1042
1043//
1044//     if (PR_1) sigma = -1.0
1045//     if (PR_2) sigma =  1.0
1046//
1047{ .mfi
1048      getf.sig significand_Q = Q            // Get significand of Q
1049(p6)  fsub.s1 sigma = f0, f1
1050      nop.i 999
1051}
1052{ .mfb
1053(p9)  add table_ptr1 = 128, table_base      // Point to P8 if POLY path
1054(p7)  fadd.s1 sigma = f0, f1
1055(p9)  br.cond.spnt ATANL_POLY               // Branch to POLY if 0 < Q < 2^-3
1056}
1057;;
1058
1059//
1060// *************************************************
1061// ******************** STEP3 **********************
1062// *************************************************
1063//
1064//     lookup = b_1 b_2 b_3 B_4
1065//
1066{ .mmi
1067      nop.m 999
1068      nop.m 999
1069      andcm k = 0x0003, exponent_Q  // k=0,1,2,3 for exp_Q=0,-1,-2,-3
1070}
1071;;
1072
1073//
1074//  Generate sign_exp_Q b_1 b_2 b_3 b_4 1 0 0 0 ... 0  in single precision
1075//  representation.  Note sign of Q is always 0.
1076//
1077{ .mfi
1078      cmp.eq p8, p9 = 0x0000, k             // Test k=0
1079      nop.f 999
1080      extr.u lookup = significand_Q, 59, 4  // Extract b_1 b_2 b_3 b_4 for index
1081}
1082{ .mfi
1083      sub sp_exp_Q = 0x7f, k                // Form single prec biased exp of Q
1084      nop.f 999
1085      sub k = k, r0, 1                      // Decrement k
1086}
1087;;
1088
1089//     Form pointer to B index table
1090{ .mfi
1091      ldfe Q_4 = [table_ptr1], -16          // Load Q_4
1092      nop.f 999
1093(p9)  shl k = k, 8                          // k = 0, 256, or 512
1094}
1095{ .mfi
1096(p9)  shladd table_ptr2 = lookup, 4, table_ptr2
1097      nop.f 999
1098      shladd sp_exp_4sig_Q = sp_exp_Q, 4, lookup // Shift and add in 4 high bits
1099}
1100;;
1101
1102{ .mmi
1103(p8)  add table_ptr2 = -16, table_ptr2      // Pointer if original k was 0
1104(p9)  add table_ptr2 = k, table_ptr2        // Pointer if k was 1, 2, 3
1105      dep special = sp_exp_4sig_Q, special, 19, 13 // Form z_hi as single prec
1106}
1107;;
1108
1109//     z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
1110{ .mmi
1111      ldfd Tbl_hi = [table_ptr2], 8         // Load Tbl_hi from index table
1112;;
1113      setf.s z_hi = special                 // Form z_hi
1114      nop.i 999
1115}
1116{ .mmi
1117      ldfs Tbl_lo = [table_ptr2], 8         // Load Tbl_lo from index table
1118;;
1119      ldfe Q_3 = [table_ptr1], -16          // Load Q_3
1120      nop.i 999
1121}
1122;;
1123
1124{ .mmi
1125      ldfe Q_2 = [table_ptr1], -16          // Load Q_2
1126      nop.m 999
1127      nop.i 999
1128}
1129;;
1130
1131{ .mmf
1132      ldfe Q_1 = [table_ptr1], -16          // Load Q_1
1133      nop.m 999
1134      nop.f 999
1135}
1136;;
1137
1138{ .mfi
1139      nop.m 999
1140      fma.s1 U_prime_hi = V, z_hi, U        // U_prime_hi = U + V * z_hi
1141      nop.i 999
1142}
1143{ .mfi
1144      nop.m 999
1145      fnma.s1 V_prime = U, z_hi, V          // V_prime =  V - U * z_hi
1146      nop.i 999
1147}
1148;;
1149
1150{ .mfi
1151      nop.m 999
1152      mov A_hi = Tbl_hi                     // Start with A_hi = Tbl_hi
1153      nop.i 999
1154}
1155;;
1156
1157{ .mfi
1158      nop.m 999
1159      fsub.s1 U_hold = U, U_prime_hi        // U_hold = U - U_prime_hi
1160      nop.i 999
1161}
1162;;
1163
1164{ .mfi
1165      nop.m 999
1166      frcpa.s1 C_hi, p0 = f1, U_prime_hi    // C_hi = frcpa(1,U_prime_hi)
1167      nop.i 999
1168}
1169;;
1170
1171{ .mfi
1172      nop.m 999
1173      fmpy.s1 A_hi = s_Y, A_hi              // A_hi = s_Y * A_hi
1174      nop.i 999
1175}
1176;;
1177
1178{ .mfi
1179      nop.m 999
1180      fma.s1 U_prime_lo = z_hi, V, U_hold   // U_prime_lo =  U_hold + V * z_hi
1181      nop.i 999
1182}
1183;;
1184
1185//     C_hi_hold = 1 - C_hi * U_prime_hi (1)
1186{ .mfi
1187      nop.m 999
1188      fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1189      nop.i 999
1190}
1191;;
1192
1193{ .mfi
1194      nop.m 999
1195      fma.s1 Res_hi = sigma, A_hi, P_hi   // Res_hi = P_hi + sigma * A_hi
1196      nop.i 999
1197}
1198;;
1199
1200{ .mfi
1201      nop.m 999
1202      fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (1)
1203      nop.i 999
1204}
1205;;
1206
1207//     C_hi_hold = 1 - C_hi * U_prime_hi (2)
1208{ .mfi
1209      nop.m 999
1210      fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1211      nop.i 999
1212}
1213;;
1214
1215{ .mfi
1216      nop.m 999
1217      fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (2)
1218      nop.i 999
1219}
1220;;
1221
1222//     C_hi_hold = 1 - C_hi * U_prime_hi (3)
1223{ .mfi
1224      nop.m 999
1225      fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1226      nop.i 999
1227}
1228;;
1229
1230{ .mfi
1231      nop.m 999
1232      fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (3)
1233      nop.i 999
1234}
1235;;
1236
1237{ .mfi
1238      nop.m 999
1239      fmpy.s1 w_hi = V_prime, C_hi           // w_hi = V_prime * C_hi
1240      nop.i 999
1241}
1242;;
1243
1244{ .mfi
1245      nop.m 999
1246      fmpy.s1 wsq = w_hi, w_hi               // wsq = w_hi * w_hi
1247      nop.i 999
1248}
1249{ .mfi
1250      nop.m 999
1251      fnma.s1 w_lo = w_hi, U_prime_hi, V_prime // w_lo = V_prime-w_hi*U_prime_hi
1252      nop.i 999
1253}
1254;;
1255
1256{ .mfi
1257      nop.m 999
1258      fma.s1 poly =  wsq, Q_4, Q_3           // poly = Q_3 + wsq * Q_4
1259      nop.i 999
1260}
1261{ .mfi
1262      nop.m 999
1263      fnma.s1 w_lo = w_hi, U_prime_lo, w_lo  // w_lo = w_lo - w_hi * U_prime_lo
1264      nop.i 999
1265}
1266;;
1267
1268{ .mfi
1269      nop.m 999
1270      fma.s1 poly = wsq, poly, Q_2           // poly = Q_2 + wsq * poly
1271      nop.i 999
1272}
1273{ .mfi
1274      nop.m 999
1275      fmpy.s1 w_lo = C_hi, w_lo              // w_lo =  = w_lo * C_hi
1276      nop.i 999
1277}
1278;;
1279
1280{ .mfi
1281      nop.m 999
1282      fma.s1 poly = wsq, poly, Q_1           // poly = Q_1 + wsq * poly
1283      nop.i 999
1284}
1285{ .mfi
1286      nop.m 999
1287      fadd.s1 A_lo = Tbl_lo, w_lo            // A_lo = Tbl_lo + w_lo
1288      nop.i 999
1289}
1290;;
1291
1292{ .mfi
1293      nop.m 999
1294      fmpy.s0 Q_1 =  Q_1, Q_1                // Dummy operation to raise inexact
1295      nop.i 999
1296}
1297;;
1298
1299{ .mfi
1300      nop.m 999
1301      fmpy.s1 poly = wsq, poly               // poly = wsq * poly
1302      nop.i 999
1303}
1304;;
1305
1306{ .mfi
1307      nop.m 999
1308      fmpy.s1 poly = w_hi, poly              // poly = w_hi * poly
1309      nop.i 999
1310}
1311;;
1312
1313{ .mfi
1314      nop.m 999
1315      fadd.s1 A_lo = A_lo, poly              // A_lo = A_lo + poly
1316      nop.i 999
1317}
1318;;
1319
1320{ .mfi
1321      nop.m 999
1322      fadd.s1 A_lo = A_lo, w_hi              // A_lo = A_lo + w_hi
1323      nop.i 999
1324}
1325;;
1326
1327{ .mfi
1328      nop.m 999
1329      fma.s1 Res_lo = sigma, A_lo, P_lo      // Res_lo = P_lo + sigma * A_lo
1330      nop.i 999
1331}
1332;;
1333
1334//
1335//     Result  =  Res_hi + Res_lo * s_Y  (User Supplied Rounding Mode)
1336//
1337{ .mfb
1338      nop.m 999
1339      fma.s0 Result = Res_lo, s_Y, Res_hi
1340      br.ret.sptk   b0                        // Exit table path 2^-3 <= V/U < 1
1341}
1342;;
1343
1344
1345ATANL_POLY:
1346// Here if 0 < V/U < 2^-3
1347//
1348// ***********************************************
1349// ******************** STEP4 ********************
1350// ***********************************************
1351
1352//
1353//     Following:
1354//     Iterate 3 times E = E + E*(1.0 - E*U)
1355//     Also load P_8, P_7, P_6, P_5, P_4
1356//
1357{ .mfi
1358      ldfe P_8 = [table_ptr1], -16            // Load P_8
1359      fnma.s1 z_lo = A_temp, U, V             // z_lo = V - A_temp * U
1360      nop.i 999
1361}
1362{ .mfi
1363      nop.m 999
1364      fnma.s1 E_hold = E, U, f1               // E_hold = 1.0 - E*U (2)
1365      nop.i 999
1366}
1367;;
1368
1369{ .mmi
1370      ldfe P_7 = [table_ptr1], -16            // Load P_7
1371;;
1372      ldfe P_6 = [table_ptr1], -16            // Load P_6
1373      nop.i 999
1374}
1375;;
1376
1377{ .mfi
1378      ldfe P_5 = [table_ptr1], -16            // Load P_5
1379      fma.s1 E = E, E_hold, E                 // E = E + E_hold*E (2)
1380      nop.i 999
1381}
1382;;
1383
1384{ .mmi
1385      ldfe P_4 = [table_ptr1], -16            // Load P_4
1386;;
1387      ldfe P_3 = [table_ptr1], -16            // Load P_3
1388      nop.i 999
1389}
1390;;
1391
1392{ .mfi
1393      ldfe P_2 = [table_ptr1], -16            // Load P_2
1394      fnma.s1 E_hold = E, U, f1               // E_hold = 1.0 - E*U (3)
1395      nop.i 999
1396}
1397{ .mlx
1398      nop.m 999
1399      movl         int_temp = 0x24005         // Signexp for small neg number
1400}
1401;;
1402
1403{ .mmf
1404      ldfe P_1 = [table_ptr1], -16            // Load P_1
1405      setf.exp     tmp_small = int_temp       // Form small neg number
1406      fma.s1 E = E, E_hold, E                 // E = E + E_hold*E (3)
1407}
1408;;
1409
1410//
1411//
1412// At this point E approximates 1/U to roughly working precision
1413// Z = V*E approximates V/U
1414//
1415{ .mfi
1416      nop.m 999
1417      fmpy.s1 Z = V, E                         // Z = V * E
1418      nop.i 999
1419}
1420{ .mfi
1421      nop.m 999
1422      fmpy.s1 z_lo = z_lo, E                   // z_lo = z_lo * E
1423      nop.i 999
1424}
1425;;
1426
1427//
1428//     Now what we want to do is
1429//     poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
1430//     poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
1431//
1432//
1433//     Fixup added to force inexact later -
1434//     A_hi = A_temp + z_lo
1435//     z_lo = (A_temp - A_hi) + z_lo
1436//
1437{ .mfi
1438      nop.m 999
1439      fmpy.s1 zsq = Z, Z                        // zsq = Z * Z
1440      nop.i 999
1441}
1442{ .mfi
1443      nop.m 999
1444      fadd.s1 A_hi = A_temp, z_lo               // A_hi = A_temp + z_lo
1445      nop.i 999
1446}
1447;;
1448
1449{ .mfi
1450      nop.m 999
1451      fma.s1 poly1 = zsq, P_8, P_7              // poly1 = P_7 + zsq * P_8
1452      nop.i 999
1453}
1454{ .mfi
1455      nop.m 999
1456      fma.s1 poly2 = zsq, P_3, P_2              // poly2 = P_2 + zsq * P_3
1457      nop.i 999
1458}
1459;;
1460
1461{ .mfi
1462      nop.m 999
1463      fmpy.s1 z4 = zsq, zsq                     // z4 = zsq * zsq
1464      nop.i 999
1465}
1466{ .mfi
1467      nop.m 999
1468      fsub.s1 A_temp = A_temp, A_hi             // A_temp = A_temp - A_hi
1469      nop.i 999
1470}
1471;;
1472
1473{ .mfi
1474      nop.m 999
1475      fmerge.s     tmp = A_hi, A_hi             // Copy tmp = A_hi
1476      nop.i 999
1477}
1478;;
1479
1480{ .mfi
1481      nop.m 999
1482      fma.s1 poly1 = zsq, poly1, P_6            // poly1 = P_6 + zsq * poly1
1483      nop.i 999
1484}
1485{ .mfi
1486      nop.m 999
1487      fma.s1 poly2 = zsq, poly2, P_1            // poly2 = P_2 + zsq * poly2
1488      nop.i 999
1489}
1490;;
1491
1492{ .mfi
1493      nop.m 999
1494      fmpy.s1 z8 = z4, z4                       // z8 = z4 * z4
1495      nop.i 999
1496}
1497{ .mfi
1498      nop.m 999
1499      fadd.s1 z_lo = A_temp, z_lo               // z_lo = (A_temp - A_hi) + z_lo
1500      nop.i 999
1501}
1502;;
1503
1504{ .mfi
1505      nop.m 999
1506      fma.s1 poly1 = zsq, poly1, P_5            // poly1 = P_5 + zsq * poly1
1507      nop.i 999
1508}
1509{ .mfi
1510      nop.m 999
1511      fmpy.s1 poly2 = poly2, zsq                // poly2 = zsq * poly2
1512      nop.i 999
1513}
1514;;
1515
1516//     Create small GR double in case need to raise underflow
1517{ .mfi
1518      nop.m 999
1519      fma.s1 poly1 = zsq, poly1, P_4            // poly1 = P_4 + zsq * poly1
1520      dep GR_temp = -1,r0,0,53
1521}
1522;;
1523
1524//     Create small double in case need to raise underflow
1525{ .mfi
1526      setf.d FR_temp = GR_temp
1527      fma.s1 poly = z8, poly1, poly2            // poly = poly2 + z8 * poly1
1528      nop.i 999
1529}
1530;;
1531
1532{ .mfi
1533      nop.m 999
1534      fma.s1 A_lo = Z, poly, z_lo               // A_lo = z_lo + Z * poly
1535      nop.i 999
1536}
1537;;
1538
1539{ .mfi
1540      nop.m 999
1541      fadd.s1      A_hi = tmp, A_lo             // A_hi = tmp + A_lo
1542      nop.i 999
1543}
1544;;
1545
1546{ .mfi
1547      nop.m 999
1548      fsub.s1      tmp = tmp, A_hi              // tmp = tmp - A_hi
1549      nop.i 999
1550}
1551{ .mfi
1552      nop.m 999
1553      fmpy.s1 A_hi = s_Y, A_hi                  // A_hi = s_Y * A_hi
1554      nop.i 999
1555}
1556;;
1557
1558{ .mfi
1559      nop.m 999
1560      fadd.s1      A_lo = tmp, A_lo             // A_lo = tmp + A_lo
1561      nop.i 999
1562}
1563{ .mfi
1564      nop.m 999
1565      fma.s1 Res_hi = sigma, A_hi, P_hi         // Res_hi = P_hi + sigma * A_hi
1566      nop.i 999
1567}
1568;;
1569
1570{ .mfi
1571      nop.m 999
1572      fsub.s1 tmp =  P_hi, Res_hi               // tmp = P_hi - Res_hi
1573      nop.i 999
1574}
1575;;
1576
1577//
1578//     Test if A_lo is zero
1579//
1580{ .mfi
1581      nop.m 999
1582      fclass.m p6,p0 = A_lo, 0x007              // Test A_lo = 0
1583      nop.i 999
1584}
1585;;
1586
1587{ .mfi
1588      nop.m 999
1589(p6)  mov          A_lo = tmp_small             // If A_lo zero, make very small
1590      nop.i 999
1591}
1592;;
1593
1594{ .mfi
1595      nop.m 999
1596      fma.s1 tmp = A_hi, sigma, tmp             // tmp = sigma * A_hi  + tmp
1597      nop.i 999
1598}
1599{ .mfi
1600      nop.m 999
1601      fma.s1 sigma =  A_lo, sigma, P_lo         // sigma = A_lo * sigma  + P_lo
1602      nop.i 999
1603}
1604;;
1605
1606{ .mfi
1607      nop.m 999
1608      fma.s1 Res_lo = s_Y, sigma, tmp           // Res_lo = s_Y * sigma + tmp
1609      nop.i 999
1610}
1611;;
1612
1613//
1614//     Test if Res_lo is denormal
1615//
1616{ .mfi
1617      nop.m 999
1618      fclass.m p14, p15 = Res_lo, 0x0b
1619      nop.i 999
1620}
1621;;
1622
1623//
1624//     Compute Result = Res_lo + Res_hi.  Use s3 if Res_lo is denormal.
1625//
1626{ .mfi
1627      nop.m 999
1628(p14) fadd.s3 Result = Res_lo, Res_hi     // Result for Res_lo denormal
1629      nop.i 999
1630}
1631{ .mfi
1632      nop.m 999
1633(p15) fadd.s0 Result = Res_lo, Res_hi     // Result for Res_lo normal
1634      nop.i 999
1635}
1636;;
1637
1638//
1639//     If Res_lo is denormal test if Result equals zero
1640//
1641{ .mfi
1642      nop.m 999
1643(p14) fclass.m.unc p14, p0 = Result, 0x07
1644      nop.i 999
1645}
1646;;
1647
1648//
1649//     If Res_lo is denormal and Result equals zero, raise inexact, underflow
1650//     by squaring small double
1651//
1652{ .mfb
1653      nop.m 999
1654(p14) fmpy.d.s0 FR_temp = FR_temp, FR_temp
1655      br.ret.sptk   b0                     // Exit POLY path, 0 < Q < 2^-3
1656}
1657;;
1658
1659
1660ATANL_UNSUPPORTED:
1661{ .mfb
1662      nop.m 999
1663      fmpy.s0 Result = ArgX,ArgY
1664      br.ret.sptk   b0
1665}
1666;;
1667
1668// Here if y natval, nan, inf, zero
1669ATANL_Y_SPECIAL:
1670// Here if x natval, nan, inf, zero
1671ATANL_X_SPECIAL:
1672{ .mfi
1673      nop.m 999
1674      fclass.m p13,p12 = ArgY_orig, 0x0c3  // Test y nan
1675      nop.i 999
1676}
1677;;
1678
1679{ .mfi
1680      nop.m 999
1681      fclass.m p15,p14 = ArgY_orig, 0x103  // Test y natval
1682      nop.i 999
1683}
1684;;
1685
1686{ .mfi
1687      nop.m 999
1688(p12) fclass.m p13,p0 = ArgX_orig, 0x0c3  // Test x nan
1689      nop.i 999
1690}
1691;;
1692
1693{ .mfi
1694      nop.m 999
1695(p14) fclass.m p15,p0 = ArgX_orig, 0x103  // Test x natval
1696      nop.i 999
1697}
1698;;
1699
1700{ .mfb
1701      nop.m 999
1702(p13) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result nan if x or y nan
1703(p13) br.ret.spnt b0                      // Exit if x or y nan
1704}
1705;;
1706
1707{ .mfb
1708      nop.m 999
1709(p15) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result natval if x or y natval
1710(p15) br.ret.spnt b0                      // Exit if x or y natval
1711}
1712;;
1713
1714
1715// Here if x or y inf or zero
1716ATANL_SPECIAL_HANDLING:
1717{ .mfi
1718      nop.m 999
1719      fclass.m p6, p7 = ArgY_orig, 0x007        // Test y zero
1720      mov special = 992                         // Offset to table
1721}
1722;;
1723
1724{ .mfb
1725      add table_ptr1 = table_base, special      // Point to 3pi/4
1726      fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig  // Dummy to set denormal flag
1727(p7)  br.cond.spnt ATANL_ArgY_Not_ZERO          // Branch if y not zero
1728}
1729;;
1730
1731// Here if y zero
1732{ .mmf
1733      ldfd  Result = [table_ptr1], 8            // Get pi high
1734      nop.m 999
1735      fclass.m p14, p0 = ArgX, 0x035            // Test for x>=+0
1736}
1737;;
1738
1739{ .mmf
1740      nop.m 999
1741      ldfd  Result_lo = [table_ptr1], -8        // Get pi lo
1742      fclass.m p15, p0 = ArgX, 0x036            // Test for x<=-0
1743}
1744;;
1745
1746//
1747//     Return sign_Y * 0 when  ArgX > +0
1748//
1749{ .mfi
1750      nop.m 999
1751(p14) fmerge.s Result = ArgY, f0               // If x>=+0, y=0, hi sgn(y)*0
1752      nop.i 999
1753}
1754;;
1755
1756{ .mfi
1757      nop.m 999
1758      fclass.m p13, p0 = ArgX, 0x007           // Test for x=0
1759      nop.i 999
1760}
1761;;
1762
1763{ .mfi
1764      nop.m 999
1765(p14) fmerge.s Result_lo = ArgY, f0            // If x>=+0, y=0, lo sgn(y)*0
1766      nop.i 999
1767}
1768;;
1769
1770{ .mfi
1771(p13) mov GR_Parameter_TAG = 36                // Error tag for x=0, y=0
1772      nop.f 999
1773      nop.i 999
1774}
1775;;
1776
1777//
1778//     Return sign_Y * pi when  ArgX < -0
1779//
1780{ .mfi
1781      nop.m 999
1782(p15) fmerge.s Result = ArgY, Result           // If x<0, y=0, hi=sgn(y)*pi
1783      nop.i 999
1784}
1785;;
1786
1787{ .mfi
1788      nop.m 999
1789(p15) fmerge.s Result_lo = ArgY, Result_lo     // If x<0, y=0, lo=sgn(y)*pi
1790      nop.i 999
1791}
1792;;
1793
1794//
1795//     Call error support function for atan(0,0)
1796//
1797{ .mfb
1798      nop.m 999
1799      fadd.s0 Result = Result, Result_lo
1800(p13) br.cond.spnt __libm_error_region         // Branch if atan(0,0)
1801}
1802;;
1803
1804{ .mib
1805      nop.m 999
1806      nop.i 999
1807      br.ret.sptk   b0                         // Exit for y=0, x not 0
1808}
1809;;
1810
1811// Here if y not zero
1812ATANL_ArgY_Not_ZERO:
1813{ .mfi
1814      nop.m 999
1815      fclass.m p0, p10 = ArgY, 0x023           // Test y inf
1816      nop.i 999
1817}
1818;;
1819
1820{ .mfb
1821      nop.m 999
1822      fclass.m p6, p0 = ArgX, 0x017            // Test for 0 <= |x| < inf
1823(p10) br.cond.spnt  ATANL_ArgY_Not_INF         // Branch if 0 < |y| < inf
1824}
1825;;
1826
1827// Here if y=inf
1828//
1829//     Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal
1830//     Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal
1831//     Return +PI/4 when ArgY = +Inf and ArgX = +Inf
1832//     Return -PI/4 when ArgY = -Inf and ArgX = +Inf
1833//     Return +3PI/4 when ArgY = +Inf and ArgX = -Inf
1834//     Return -3PI/4 when ArgY = -Inf and ArgX = -Inf
1835//
1836{ .mfi
1837      nop.m 999
1838      fclass.m p7, p0 = ArgX, 0x021            // Test for x=+inf
1839      nop.i 999
1840}
1841;;
1842
1843{ .mfi
1844(p6)  add table_ptr1 =  16, table_ptr1         // Point to pi/2, if x finite
1845      fclass.m p8, p0 = ArgX, 0x022            // Test for x=-inf
1846      nop.i 999
1847}
1848;;
1849
1850{ .mmi
1851(p7)  add table_ptr1 =  32, table_ptr1         // Point to pi/4 if x=+inf
1852;;
1853(p8)  add table_ptr1 =  48, table_ptr1         // Point to 3pi/4 if x=-inf
1854
1855      nop.i 999
1856}
1857;;
1858
1859{ .mmi
1860      ldfd Result = [table_ptr1], 8            // Load pi/2, pi/4, or 3pi/4 hi
1861;;
1862      ldfd Result_lo = [table_ptr1], -8        // Load pi/2, pi/4, or 3pi/4 lo
1863      nop.i 999
1864}
1865;;
1866
1867{ .mfi
1868      nop.m 999
1869      fmerge.s Result = ArgY, Result           // Merge sgn(y) in hi
1870      nop.i 999
1871}
1872;;
1873
1874{ .mfi
1875      nop.m 999
1876      fmerge.s Result_lo = ArgY, Result_lo     // Merge sgn(y) in lo
1877      nop.i 999
1878}
1879;;
1880
1881{ .mfb
1882      nop.m 999
1883      fadd.s0 Result = Result, Result_lo       // Compute complete result
1884      br.ret.sptk   b0                         // Exit for y=inf
1885}
1886;;
1887
1888// Here if y not INF, and x=0 or INF
1889ATANL_ArgY_Not_INF:
1890//
1891//     Return +PI/2 when ArgY NOT Inf, ArgY > 0 and ArgX = +/-0
1892//     Return -PI/2 when ArgY NOT Inf, ArgY < 0 and ArgX = +/-0
1893//     Return +0    when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf
1894//     Return -0    when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf
1895//     Return +PI   when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf
1896//     Return -PI   when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf
1897//
1898{ .mfi
1899      nop.m 999
1900      fclass.m p7, p9 = ArgX, 0x021            // Test for x=+inf
1901      nop.i 999
1902}
1903;;
1904
1905{ .mfi
1906      nop.m 999
1907      fclass.m p6, p0 = ArgX, 0x007            // Test for x=0
1908      nop.i 999
1909}
1910;;
1911
1912{ .mfi
1913(p6)  add table_ptr1 = 16, table_ptr1          // Point to pi/2
1914      fclass.m p8, p0 = ArgX, 0x022            // Test for x=-inf
1915      nop.i 999
1916}
1917;;
1918
1919.pred.rel "mutex",p7,p9
1920{ .mfi
1921(p9)  ldfd Result = [table_ptr1], 8           // Load pi or pi/2 hi
1922(p7)  fmerge.s Result = ArgY, f0              // If y not inf, x=+inf, sgn(y)*0
1923      nop.i 999
1924}
1925;;
1926
1927{ .mfi
1928(p9)  ldfd Result_lo = [table_ptr1], -8       // Load pi or pi/2 lo
1929(p7)  fnorm.s0 Result = Result                // If y not inf, x=+inf normalize
1930      nop.i 999
1931}
1932;;
1933
1934{ .mfi
1935      nop.m 999
1936(p9)  fmerge.s Result = ArgY, Result          // Merge sgn(y) in hi
1937      nop.i 999
1938}
1939;;
1940
1941{ .mfi
1942      nop.m 999
1943(p9)  fmerge.s Result_lo = ArgY, Result_lo    // Merge sgn(y) in lo
1944      nop.i 999
1945}
1946;;
1947
1948{ .mfb
1949      nop.m 999
1950(p9)  fadd.s0 Result = Result, Result_lo      // Compute complete result
1951      br.ret.spnt   b0                        // Exit for y not inf, x=0,inf
1952}
1953;;
1954
1955GLOBAL_IEEE754_END(atan2l)
1956libm_alias_ldouble_other (__atan2, atan2)
1957
1958LOCAL_LIBM_ENTRY(__libm_error_region)
1959.prologue
1960{ .mfi
1961        add   GR_Parameter_Y=-32,sp             // Parameter 2 value
1962        nop.f 0
1963.save   ar.pfs,GR_SAVE_PFS
1964        mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
1965}
1966{ .mfi
1967.fframe 64
1968        add sp=-64,sp                           // Create new stack
1969        nop.f 0
1970        mov GR_SAVE_GP=gp                       // Save gp
1971};;
1972{ .mmi
1973        stfe [GR_Parameter_Y] = FR_Y,16         // Save Parameter 2 on stack
1974        add GR_Parameter_X = 16,sp              // Parameter 1 address
1975.save   b0, GR_SAVE_B0
1976        mov GR_SAVE_B0=b0                       // Save b0
1977};;
1978.body
1979{ .mib
1980        stfe [GR_Parameter_X] = FR_X            // Store Parameter 1 on stack
1981        add   GR_Parameter_RESULT = 0,GR_Parameter_Y
1982        nop.b 0                                 // Parameter 3 address
1983}
1984{ .mib
1985        stfe [GR_Parameter_Y] = FR_RESULT      // Store Parameter 3 on stack
1986        add   GR_Parameter_Y = -16,GR_Parameter_Y
1987        br.call.sptk b0=__libm_error_support#  // Call error handling function
1988};;
1989{ .mmi
1990        nop.m 0
1991        nop.m 0
1992        add   GR_Parameter_RESULT = 48,sp
1993};;
1994{ .mmi
1995        ldfe  f8 = [GR_Parameter_RESULT]       // Get return result off stack
1996.restore sp
1997        add   sp = 64,sp                       // Restore stack pointer
1998        mov   b0 = GR_SAVE_B0                  // Restore return address
1999};;
2000{ .mib
2001        mov   gp = GR_SAVE_GP                  // Restore gp
2002        mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
2003        br.ret.sptk     b0                     // Return
2004};;
2005
2006LOCAL_LIBM_END(__libm_error_region#)
2007.type   __libm_error_support#,@function
2008.global __libm_error_support#
2009