1.file "tancotl.s"
2
3
4// Copyright (c) 2000 - 2004, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39//*********************************************************************
40//
41// History:
42//
43// 02/02/00 (hand-optimized)
44// 04/04/00 Unwind support added
45// 12/28/00 Fixed false invalid flags
46// 02/06/02 Improved speed
47// 05/07/02 Changed interface to __libm_pi_by_2_reduce
48// 05/30/02 Added cotl
49// 02/10/03 Reordered header: .section, .global, .proc, .align;
50//          used data8 for long double table values
51// 05/15/03 Reformatted data tables
52// 10/26/04 Avoided using r14-31 as scratch so not clobbered by dynamic loader
53//
54//*********************************************************************
55//
56// Functions:   tanl(x) = tangent(x), for double-extended precision x values
57//              cotl(x) = cotangent(x), for double-extended precision x values
58//
59//*********************************************************************
60//
61// Resources Used:
62//
63//    Floating-Point Registers: f8 (Input and Return Value)
64//                              f9-f15
65//                              f32-f121
66//
67//    General Purpose Registers:
68//      r32-r70
69//
70//    Predicate Registers:      p6-p15
71//
72//*********************************************************************
73//
74// IEEE Special Conditions for tanl:
75//
76//    Denormal  fault raised on denormal inputs
77//    Overflow exceptions do not occur
78//    Underflow exceptions raised when appropriate for tan
79//    (No specialized error handling for this routine)
80//    Inexact raised when appropriate by algorithm
81//
82//    tanl(SNaN) = QNaN
83//    tanl(QNaN) = QNaN
84//    tanl(inf) = QNaN
85//    tanl(+/-0) = +/-0
86//
87//*********************************************************************
88//
89// IEEE Special Conditions for cotl:
90//
91//    Denormal  fault raised on denormal inputs
92//    Overflow exceptions occur at zero and near zero
93//    Underflow exceptions do not occur
94//    Inexact raised when appropriate by algorithm
95//
96//    cotl(SNaN) = QNaN
97//    cotl(QNaN) = QNaN
98//    cotl(inf) = QNaN
99//    cotl(+/-0) = +/-Inf and error handling is called
100//
101//*********************************************************************
102//
103//    Below are mathematical and algorithmic descriptions for tanl.
104//    For cotl we use next identity cot(x) = -tan(x + Pi/2).
105//    So, to compute cot(x) we just need to increment N (N = N + 1)
106//    and invert sign of the computed result.
107//
108//*********************************************************************
109//
110// Mathematical Description
111//
112// We consider the computation of FPTANL of Arg. Now, given
113//
114//      Arg = N pi/2  + alpha,          |alpha| <= pi/4,
115//
116// basic mathematical relationship shows that
117//
118//      tan( Arg ) =  tan( alpha )     if N is even;
119//                 = -cot( alpha )      otherwise.
120//
121// The value of alpha is obtained by argument reduction and
122// represented by two working precision numbers r and c where
123//
124//      alpha =  r  +  c     accurately.
125//
126// The reduction method is described in a previous write up.
127// The argument reduction scheme identifies 4 cases. For Cases 2
128// and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be
129// computed very easily by 2 or 3 terms of the Taylor series
130// expansion as follows:
131//
132// Case 2:
133// -------
134//
135//      tan(r + c) = r + c + r^3/3          ...accurately
136//     -cot(r + c) = -1/(r+c) + r/3          ...accurately
137//
138// Case 4:
139// -------
140//
141//      tan(r + c) = r + c + r^3/3 + 2r^5/15     ...accurately
142//     -cot(r + c) = -1/(r+c) + r/3 + r^3/45     ...accurately
143//
144//
145// The only cases left are Cases 1 and 3 of the argument reduction
146// procedure. These two cases will be merged since after the
147// argument is reduced in either cases, we have the reduced argument
148// represented as r + c and that the magnitude |r + c| is not small
149// enough to allow the usage of a very short approximation.
150//
151// The greatest challenge of this task is that the second terms of
152// the Taylor series for tan(r) and -cot(r)
153//
154//      r + r^3/3 + 2 r^5/15 + ...
155//
156// and
157//
158//      -1/r + r/3 + r^3/45 + ...
159//
160// are not very small when |r| is close to pi/4 and the rounding
161// errors will be a concern if simple polynomial accumulation is
162// used. When |r| < 2^(-2), however, the second terms will be small
163// enough (5 bits or so of right shift) that a normal Horner
164// recurrence suffices. Hence there are two cases that we consider
165// in the accurate computation of tan(r) and cot(r), |r| <= pi/4.
166//
167// Case small_r: |r| < 2^(-2)
168// --------------------------
169//
170// Since Arg = N pi/4 + r + c accurately, we have
171//
172//      tan(Arg) =  tan(r+c)            for N even,
173//               = -cot(r+c)            otherwise.
174//
175// Here for this case, both tan(r) and -cot(r) can be approximated
176// by simple polynomials:
177//
178//      tan(r) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
179//     -cot(r) = -1/r + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
180//
181// accurately. Since |r| is relatively small, tan(r+c) and
182// -cot(r+c) can be accurately approximated by replacing r with
183// r+c only in the first two terms of the corresponding polynomials.
184//
185// Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to
186// almost 64 sig. bits, thus
187//
188//      P1_1 (r+c)^3 =  P1_1 r^3 + c * r^2     accurately.
189//
190// Hence,
191//
192//      tan(r+c) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
193//                     + c*(1 + r^2)
194//
195//        -cot(r+c) = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
196//               + Q1_1*c
197//
198//
199// Case normal_r: 2^(-2) <= |r| <= pi/4
200// ------------------------------------
201//
202// This case is more likely than the previous one if one considers
203// r to be uniformly distributed in [-pi/4 pi/4].
204//
205// The required calculation is either
206//
207//      tan(r + c)  =  tan(r)  +  correction,  or
208//     -cot(r + c)  = -cot(r)  +  correction.
209//
210// Specifically,
211//
212//      tan(r + c) =  tan(r) + c tan'(r)  + O(c^2)
213//                 =  tan(r) + c sec^2(r) + O(c^2)
214//                 =  tan(r) + c SEC_sq     ...accurately
215//                as long as SEC_sq approximates sec^2(r)
216//                to, say, 5 bits or so.
217//
218// Similarly,
219//
220//     -cot(r + c) = -cot(r) - c cot'(r)  + O(c^2)
221//                 = -cot(r) + c csc^2(r) + O(c^2)
222//                 = -cot(r) + c CSC_sq     ...accurately
223//                as long as CSC_sq approximates csc^2(r)
224//                to, say, 5 bits or so.
225//
226// We therefore concentrate on accurately calculating tan(r) and
227// cot(r) for a working-precision number r, |r| <= pi/4 to within
228// 0.1% or so.
229//
230// We will employ a table-driven approach. Let
231//
232//      r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ... b_63
233//        = sgn_r * ( B + x )
234//
235// where
236//
237//      B = 2^k * 1.b_1 b_2 ... b_5 1
238//      x = |r| - B
239//
240// Now,
241//                   tan(B)  +   tan(x)
242//      tan( B + x ) =  ------------------------
243//                   1 -  tan(B)*tan(x)
244//
245//               /                         \
246//               |   tan(B)  +   tan(x)          |
247
248//      = tan(B) +  | ------------------------ - tan(B) |
249//               |     1 -  tan(B)*tan(x)          |
250//               \                         /
251//
252//                 sec^2(B) * tan(x)
253//      = tan(B) + ------------------------
254//                 1 -  tan(B)*tan(x)
255//
256//                (1/[sin(B)*cos(B)]) * tan(x)
257//      = tan(B) + --------------------------------
258//                      cot(B)  -  tan(x)
259//
260//
261// Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
262// calculated beforehand and stored in a table. Since
263//
264//      |x| <= 2^k * 2^(-6)  <= 2^(-7)  (because k = -1, -2)
265//
266// a very short polynomial will be sufficient to approximate tan(x)
267// accurately. The details involved in computing the last expression
268// will be given in the next section on algorithm description.
269//
270//
271// Now, we turn to the case where cot( B + x ) is needed.
272//
273//
274//                   1 - tan(B)*tan(x)
275//      cot( B + x ) =  ------------------------
276//                   tan(B)  +  tan(x)
277//
278//               /                           \
279//               |   1 - tan(B)*tan(x)              |
280
281//      = cot(B) +  | ----------------------- - cot(B) |
282//               |     tan(B)  +  tan(x)            |
283//               \                           /
284//
285//               [tan(B) + cot(B)] * tan(x)
286//      = cot(B) - ----------------------------
287//                   tan(B)  +  tan(x)
288//
289//                (1/[sin(B)*cos(B)]) * tan(x)
290//      = cot(B) - --------------------------------
291//                      tan(B)  +  tan(x)
292//
293//
294// Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that
295// are needed are the same set of values needed in the previous
296// case.
297//
298// Finally, we can put all the ingredients together as follows:
299//
300//      Arg = N * pi/2 +  r + c          ...accurately
301//
302//      tan(Arg) =  tan(r) + correction    if N is even;
303//               = -cot(r) + correction    otherwise.
304//
305// For Cases 2 and 4,
306//
307//     Case 2:
308//     tan(Arg) =  tan(r + c) = r + c + r^3/3           N even
309//              = -cot(r + c) = -1/(r+c) + r/3           N odd
310//     Case 4:
311//     tan(Arg) =  tan(r + c) = r + c + r^3/3 + 2r^5/15  N even
312//              = -cot(r + c) = -1/(r+c) + r/3 + r^3/45  N odd
313//
314//
315// For Cases 1 and 3,
316//
317//     Case small_r: |r| < 2^(-2)
318//
319//      tan(Arg) =  r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
320//                     + c*(1 + r^2)               N even
321//
322//               = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
323//                     + Q1_1*c                    N odd
324//
325//     Case normal_r: 2^(-2) <= |r| <= pi/4
326//
327//      tan(Arg) =  tan(r) + c * sec^2(r)     N even
328//               = -cot(r) + c * csc^2(r)     otherwise
329//
330//     For N even,
331//
332//      tan(Arg) = tan(r) + c*sec^2(r)
333//               = tan( sgn_r * (B+x) ) + c * sec^2(|r|)
334//               = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(|r|) )
335//               = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(B) )
336//
337// since B approximates |r| to 2^(-6) in relative accuracy.
338//
339//                 /            (1/[sin(B)*cos(B)]) * tan(x)
340//    tan(Arg) = sgn_r * | tan(B) + --------------------------------
341//                 \                     cot(B)  -  tan(x)
342//                                        \
343//                       + CORR  |
344
345//                                     /
346// where
347//
348//    CORR = sgn_r*c*tan(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
349//
350// For N odd,
351//
352//      tan(Arg) = -cot(r) + c*csc^2(r)
353//               = -cot( sgn_r * (B+x) ) + c * csc^2(|r|)
354//               = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(|r|) )
355//               = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(B) )
356//
357// since B approximates |r| to 2^(-6) in relative accuracy.
358//
359//                 /            (1/[sin(B)*cos(B)]) * tan(x)
360//    tan(Arg) = sgn_r * | -cot(B) + --------------------------------
361//                 \                     tan(B)  +  tan(x)
362//                                        \
363//                       + CORR  |
364
365//                                     /
366// where
367//
368//    CORR = sgn_r*c*cot(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
369//
370//
371// The actual algorithm prescribes how all the mathematical formulas
372// are calculated.
373//
374//
375// 2. Algorithmic Description
376// ==========================
377//
378// 2.1 Computation for Cases 2 and 4.
379// ----------------------------------
380//
381// For Case 2, we use two-term polynomials.
382//
383//    For N even,
384//
385//    rsq := r * r
386//    Poly := c + r * rsq * P1_1
387//    Result := r + Poly          ...in user-defined rounding
388//
389//    For N odd,
390//    S_hi  := -frcpa(r)               ...8 bits
391//    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
392//    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
393//    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
394//    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
395//    ...S_hi + S_lo is -1/(r+c) to extra precision
396//    S_lo  := S_lo + Q1_1*r
397//
398//    Result := S_hi + S_lo     ...in user-defined rounding
399//
400// For Case 4, we use three-term polynomials
401//
402//    For N even,
403//
404//    rsq := r * r
405//    Poly := c + r * rsq * (P1_1 + rsq * P1_2)
406//    Result := r + Poly          ...in user-defined rounding
407//
408//    For N odd,
409//    S_hi  := -frcpa(r)               ...8 bits
410//    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
411//    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
412//    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
413//    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
414//    ...S_hi + S_lo is -1/(r+c) to extra precision
415//    rsq   := r * r
416//    P      := Q1_1 + rsq*Q1_2
417//    S_lo  := S_lo + r*P
418//
419//    Result := S_hi + S_lo     ...in user-defined rounding
420//
421//
422// Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are
423// the same as those used in the small_r case of Cases 1 and 3
424// below.
425//
426//
427// 2.2 Computation for Cases 1 and 3.
428// ----------------------------------
429// This is further divided into the case of small_r,
430// where |r| < 2^(-2), and the case of normal_r, where |r| lies between
431// 2^(-2) and pi/4.
432//
433// Algorithm for the case of small_r
434// ---------------------------------
435//
436// For N even,
437//      rsq   := r * r
438//      Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3))
439//      r_to_the_8    := rsq * rsq
440//      r_to_the_8    := r_to_the_8 * r_to_the_8
441//      Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9))
442//      CORR  := c * ( 1 + rsq )
443//      Poly  := Poly1 + r_to_the_8*Poly2
444//      Poly := r*Poly + CORR
445//      Result := r + Poly     ...in user-defined rounding
446//      ...note that Poly1 and r_to_the_8 can be computed in parallel
447//      ...with Poly2 (Poly1 is intentionally set to be much
448//      ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden)
449//
450// For N odd,
451//      S_hi  := -frcpa(r)               ...8 bits
452//      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
453//      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
454//      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
455//      S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
456//      ...S_hi + S_lo is -1/(r+c) to extra precision
457//      S_lo  := S_lo + Q1_1*c
458//
459//      ...S_hi and S_lo are computed in parallel with
460//      ...the following
461//      rsq := r*r
462//      P   := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7))
463//
464//      Poly :=  r*P + S_lo
465//      Result :=  S_hi  +  Poly      ...in user-defined rounding
466//
467//
468// Algorithm for the case of normal_r
469// ----------------------------------
470//
471// Here, we first consider the computation of tan( r + c ). As
472// presented in the previous section,
473//
474//      tan( r + c )  =  tan(r) + c * sec^2(r)
475//                 =  sgn_r * [ tan(B+x) + CORR ]
476//      CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)]
477//
478// because sec^2(r) = sec^(|r|), and B approximate |r| to 6.5 bits.
479//
480//      tan( r + c ) =
481//           /           (1/[sin(B)*cos(B)]) * tan(x)
482//      sgn_r * | tan(B) + --------------------------------  +
483//           \                     cot(B)  -  tan(x)
484//                                \
485//                          CORR  |
486
487//                                /
488//
489// The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
490// calculated beforehand and stored in a table. Specifically,
491// the table values are
492//
493//      tan(B)             as  T_hi  +  T_lo;
494//      cot(B)             as  C_hi  +  C_lo;
495//      1/[sin(B)*cos(B)]  as  SC_inv
496//
497// T_hi, C_hi are in  double-precision  memory format;
498// T_lo, C_lo are in  single-precision  memory format;
499// SC_inv     is  in extended-precision memory format.
500//
501// The value of tan(x) will be approximated by a short polynomial of
502// the form
503//
504//      tan(x)  as  x  +  x * P, where
505//           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
506//
507// Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x)
508// to a relative accuracy better than 2^(-20). Thus, a good
509// initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative
510// division is:
511//
512//      1/(cot(B) - tan(x))      is approximately
513//      1/(cot(B) -   x)         is
514//      tan(B)/(1 - x*tan(B))    is approximately
515//      T_hi / ( 1 - T_hi * x )  is approximately
516//
517//      T_hi * [ 1 + (Thi * x) + (T_hi * x)^2 ]
518//
519// The calculation of tan(r+c) therefore proceed as follows:
520//
521//      Tx     := T_hi * x
522//      xsq     := x * x
523//
524//      V_hi     := T_hi*(1 + Tx*(1 + Tx))
525//      P     := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
526//      ...V_hi serves as an initial guess of 1/(cot(B) - tan(x))
527//         ...good to about 20 bits of accuracy
528//
529//      tanx     := x + x*P
530//      D     := C_hi - tanx
531//      ...D is a double precision denominator: cot(B) - tan(x)
532//
533//      V_hi     := V_hi + V_hi*(1 - V_hi*D)
534//      ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits
535//
536//      V_lo     := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ]
537//                           - V_hi*C_lo )   ...observe all order
538//         ...V_hi + V_lo approximates 1/(cot(B) - tan(x))
539//      ...to extra accuracy
540//
541//      ...               SC_inv(B) * (x + x*P)
542//      ...   tan(B) +      ------------------------- + CORR
543//         ...                cot(B) - (x + x*P)
544//      ...
545//      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
546//      ...
547//
548//      Sx     := SC_inv * x
549//      CORR     := sgn_r * c * SC_inv * T_hi
550//
551//      ...put the ingredients together to compute
552//      ...               SC_inv(B) * (x + x*P)
553//      ...   tan(B) +      ------------------------- + CORR
554//         ...                cot(B) - (x + x*P)
555//      ...
556//      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
557//      ...
558//      ... = T_hi + T_lo + CORR +
559//      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
560//
561//      CORR := CORR + T_lo
562//      tail := V_lo + P*(V_hi + V_lo)
563//         tail := Sx * tail  +  CORR
564//      tail := Sx * V_hi  +  tail
565//         T_hi := sgn_r * T_hi
566//
567//         ...T_hi + sgn_r*tail  now approximate
568//      ...sgn_r*(tan(B+x) + CORR) accurately
569//
570//      Result :=  T_hi + sgn_r*tail  ...in user-defined
571//                           ...rounding control
572//      ...It is crucial that independent paths be fully
573//      ...exploited for performance's sake.
574//
575//
576// Next, we consider the computation of -cot( r + c ). As
577// presented in the previous section,
578//
579//        -cot( r + c )  =  -cot(r) + c * csc^2(r)
580//                 =  sgn_r * [ -cot(B+x) + CORR ]
581//      CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)]
582//
583// because csc^2(r) = csc^(|r|), and B approximate |r| to 6.5 bits.
584//
585//        -cot( r + c ) =
586//           /             (1/[sin(B)*cos(B)]) * tan(x)
587//      sgn_r * | -cot(B) + --------------------------------  +
588//           \                     tan(B)  +  tan(x)
589//                                \
590//                          CORR  |
591
592//                                /
593//
594// The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
595// calculated beforehand and stored in a table. Specifically,
596// the table values are
597//
598//      tan(B)             as  T_hi  +  T_lo;
599//      cot(B)             as  C_hi  +  C_lo;
600//      1/[sin(B)*cos(B)]  as  SC_inv
601//
602// T_hi, C_hi are in  double-precision  memory format;
603// T_lo, C_lo are in  single-precision  memory format;
604// SC_inv     is  in extended-precision memory format.
605//
606// The value of tan(x) will be approximated by a short polynomial of
607// the form
608//
609//      tan(x)  as  x  +  x * P, where
610//           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
611//
612// Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x)
613// to a relative accuracy better than 2^(-18). Thus, a good
614// initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative
615// division is:
616//
617//      1/(tan(B) + tan(x))      is approximately
618//      1/(tan(B) +   x)         is
619//      cot(B)/(1 + x*cot(B))    is approximately
620//      C_hi / ( 1 + C_hi * x )  is approximately
621//
622//      C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ]
623//
624// The calculation of -cot(r+c) therefore proceed as follows:
625//
626//      Cx     := C_hi * x
627//      xsq     := x * x
628//
629//      V_hi     := C_hi*(1 - Cx*(1 - Cx))
630//      P     := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
631//      ...V_hi serves as an initial guess of 1/(tan(B) + tan(x))
632//         ...good to about 18 bits of accuracy
633//
634//      tanx     := x + x*P
635//      D     := T_hi + tanx
636//      ...D is a double precision denominator: tan(B) + tan(x)
637//
638//      V_hi     := V_hi + V_hi*(1 - V_hi*D)
639//      ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits
640//
641//      V_lo     := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ]
642//                           - V_hi*T_lo )   ...observe all order
643//         ...V_hi + V_lo approximates 1/(tan(B) + tan(x))
644//      ...to extra accuracy
645//
646//      ...               SC_inv(B) * (x + x*P)
647//      ...  -cot(B) +      ------------------------- + CORR
648//         ...                tan(B) + (x + x*P)
649//      ...
650//      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
651//      ...
652//
653//      Sx     := SC_inv * x
654//      CORR     := sgn_r * c * SC_inv * C_hi
655//
656//      ...put the ingredients together to compute
657//      ...               SC_inv(B) * (x + x*P)
658//      ...  -cot(B) +      ------------------------- + CORR
659//         ...                tan(B) + (x + x*P)
660//      ...
661//      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
662//      ...
663//      ... =-C_hi - C_lo + CORR +
664//      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
665//
666//      CORR := CORR - C_lo
667//      tail := V_lo + P*(V_hi + V_lo)
668//         tail := Sx * tail  +  CORR
669//      tail := Sx * V_hi  +  tail
670//         C_hi := -sgn_r * C_hi
671//
672//         ...C_hi + sgn_r*tail now approximates
673//      ...sgn_r*(-cot(B+x) + CORR) accurately
674//
675//      Result :=  C_hi + sgn_r*tail   in user-defined rounding control
676//      ...It is crucial that independent paths be fully
677//      ...exploited for performance's sake.
678//
679// 3. Implementation Notes
680// =======================
681//
682//   Table entries T_hi, T_lo; C_hi, C_lo; SC_inv
683//
684//   Recall that 2^(-2) <= |r| <= pi/4;
685//
686//      r = sgn_r * 2^k * 1.b_1 b_2 ... b_63
687//
688//   and
689//
690//        B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1
691//
692//   Thus, for k = -2, possible values of B are
693//
694//          B = 2^(-2) * ( 1 + index/32  +  1/64 ),
695//      index ranges from 0 to 31
696//
697//   For k = -1, however, since |r| <= pi/4 = 0.78...
698//   possible values of B are
699//
700//        B = 2^(-1) * ( 1 + index/32  +  1/64 )
701//      index ranges from 0 to 19.
702//
703//
704
705RODATA
706.align 16
707
708LOCAL_OBJECT_START(TANL_BASE_CONSTANTS)
709
710tanl_table_1:
711data8    0xA2F9836E4E44152A, 0x00003FFE // two_by_pi
712data8    0xC84D32B0CE81B9F1, 0x00004016 // P_0
713data8    0xC90FDAA22168C235, 0x00003FFF // P_1
714data8    0xECE675D1FC8F8CBB, 0x0000BFBD // P_2
715data8    0xB7ED8FBBACC19C60, 0x0000BF7C // P_3
716LOCAL_OBJECT_END(TANL_BASE_CONSTANTS)
717
718LOCAL_OBJECT_START(tanl_table_2)
719data8    0xC90FDAA22168C234, 0x00003FFE // PI_BY_4
720data8    0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0
721data8    0x8D848E89DBD171A1, 0x0000BFBF // d_1
722data8    0xD5394C3618A66F8E, 0x0000BF7C // d_2
723data4    0x3E800000 // two**-2
724data4    0xBE800000 // -two**-2
725data4    0x00000000 // pad
726data4    0x00000000 // pad
727LOCAL_OBJECT_END(tanl_table_2)
728
729LOCAL_OBJECT_START(tanl_table_p1)
730data8    0xAAAAAAAAAAAAAABD, 0x00003FFD // P1_1
731data8    0x8888888888882E6A, 0x00003FFC // P1_2
732data8    0xDD0DD0DD0F0177B6, 0x00003FFA // P1_3
733data8    0xB327A440646B8C6D, 0x00003FF9 // P1_4
734data8    0x91371B251D5F7D20, 0x00003FF8 // P1_5
735data8    0xEB69A5F161C67914, 0x00003FF6 // P1_6
736data8    0xBEDD37BE019318D2, 0x00003FF5 // P1_7
737data8    0x9979B1463C794015, 0x00003FF4 // P1_8
738data8    0x8EBD21A38C6EB58A, 0x00003FF3 // P1_9
739LOCAL_OBJECT_END(tanl_table_p1)
740
741LOCAL_OBJECT_START(tanl_table_q1)
742data8    0xAAAAAAAAAAAAAAB4, 0x00003FFD // Q1_1
743data8    0xB60B60B60B5FC93E, 0x00003FF9 // Q1_2
744data8    0x8AB355E00C9BBFBF, 0x00003FF6 // Q1_3
745data8    0xDDEBBC89CBEE3D4C, 0x00003FF2 // Q1_4
746data8    0xB3548A685F80BBB6, 0x00003FEF // Q1_5
747data8    0x913625604CED5BF1, 0x00003FEC // Q1_6
748data8    0xF189D95A8EE92A83, 0x00003FE8 // Q1_7
749LOCAL_OBJECT_END(tanl_table_q1)
750
751LOCAL_OBJECT_START(tanl_table_p2)
752data8    0xAAAAAAAAAAAB362F, 0x00003FFD // P2_1
753data8    0x88888886E97A6097, 0x00003FFC // P2_2
754data8    0xDD108EE025E716A1, 0x00003FFA // P2_3
755LOCAL_OBJECT_END(tanl_table_p2)
756
757LOCAL_OBJECT_START(tanl_table_tm2)
758//
759//  Entries T_hi   double-precision memory format
760//  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
761//  Entries T_lo  single-precision memory format
762//  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
763//
764data8 0x3FD09BC362400794
765data4 0x23A05C32, 0x00000000
766data8 0x3FD124A9DFFBC074
767data4 0x240078B2, 0x00000000
768data8 0x3FD1AE235BD4920F
769data4 0x23826B8E, 0x00000000
770data8 0x3FD2383515E2701D
771data4 0x22D31154, 0x00000000
772data8 0x3FD2C2E463739C2D
773data4 0x2265C9E2, 0x00000000
774data8 0x3FD34E36AFEEA48B
775data4 0x245C05EB, 0x00000000
776data8 0x3FD3DA317DBB35D1
777data4 0x24749F2D, 0x00000000
778data8 0x3FD466DA67321619
779data4 0x2462CECE, 0x00000000
780data8 0x3FD4F4371F94A4D5
781data4 0x246D0DF1, 0x00000000
782data8 0x3FD5824D740C3E6D
783data4 0x240A85B5, 0x00000000
784data8 0x3FD611234CB1E73D
785data4 0x23F96E33, 0x00000000
786data8 0x3FD6A0BEAD9EA64B
787data4 0x247C5393, 0x00000000
788data8 0x3FD73125B804FD01
789data4 0x241F3B29, 0x00000000
790data8 0x3FD7C25EAB53EE83
791data4 0x2479989B, 0x00000000
792data8 0x3FD8546FE6640EED
793data4 0x23B343BC, 0x00000000
794data8 0x3FD8E75FE8AF1892
795data4 0x241454D1, 0x00000000
796data8 0x3FD97B3553928BDA
797data4 0x238613D9, 0x00000000
798data8 0x3FDA0FF6EB9DE4DE
799data4 0x22859FA7, 0x00000000
800data8 0x3FDAA5AB99ECF92D
801data4 0x237A6D06, 0x00000000
802data8 0x3FDB3C5A6D8F1796
803data4 0x23952F6C, 0x00000000
804data8 0x3FDBD40A9CFB8BE4
805data4 0x2280FC95, 0x00000000
806data8 0x3FDC6CC387943100
807data4 0x245D2EC0, 0x00000000
808data8 0x3FDD068CB736C500
809data4 0x23C4AD7D, 0x00000000
810data8 0x3FDDA16DE1DDBC31
811data4 0x23D076E6, 0x00000000
812data8 0x3FDE3D6EEB515A93
813data4 0x244809A6, 0x00000000
814data8 0x3FDEDA97E6E9E5F1
815data4 0x220856C8, 0x00000000
816data8 0x3FDF78F11963CE69
817data4 0x244BE993, 0x00000000
818data8 0x3FE00C417D635BCE
819data4 0x23D21799, 0x00000000
820data8 0x3FE05CAB1C302CD3
821data4 0x248A1B1D, 0x00000000
822data8 0x3FE0ADB9DB6A1FA0
823data4 0x23D53E33, 0x00000000
824data8 0x3FE0FF724A20BA81
825data4 0x24DB9ED5, 0x00000000
826data8 0x3FE151D9153FA6F5
827data4 0x24E9E451, 0x00000000
828LOCAL_OBJECT_END(tanl_table_tm2)
829
830LOCAL_OBJECT_START(tanl_table_tm1)
831//
832//  Entries T_hi   double-precision memory format
833//  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
834//  Entries T_lo  single-precision memory format
835//  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
836//
837data8 0x3FE1CEC4BA1BE39E
838data4 0x24B60F9E, 0x00000000
839data8 0x3FE277E45ABD9B2D
840data4 0x248C2474, 0x00000000
841data8 0x3FE324180272B110
842data4 0x247B8311, 0x00000000
843data8 0x3FE3D38B890E2DF0
844data4 0x24C55751, 0x00000000
845data8 0x3FE4866D46236871
846data4 0x24E5BC34, 0x00000000
847data8 0x3FE53CEE45E044B0
848data4 0x24001BA4, 0x00000000
849data8 0x3FE5F74282EC06E4
850data4 0x24B973DC, 0x00000000
851data8 0x3FE6B5A125DF43F9
852data4 0x24895440, 0x00000000
853data8 0x3FE77844CAFD348C
854data4 0x240021CA, 0x00000000
855data8 0x3FE83F6BCEED6B92
856data4 0x24C45372, 0x00000000
857data8 0x3FE90B58A34F3665
858data4 0x240DAD33, 0x00000000
859data8 0x3FE9DC522C1E56B4
860data4 0x24F846CE, 0x00000000
861data8 0x3FEAB2A427041578
862data4 0x2323FB6E, 0x00000000
863data8 0x3FEB8E9F9DD8C373
864data4 0x24B3090B, 0x00000000
865data8 0x3FEC709B65C9AA7B
866data4 0x2449F611, 0x00000000
867data8 0x3FED58F4ACCF8435
868data4 0x23616A7E, 0x00000000
869data8 0x3FEE480F97635082
870data4 0x24C2FEAE, 0x00000000
871data8 0x3FEF3E57F0ACC544
872data4 0x242CE964, 0x00000000
873data8 0x3FF01E20F7E06E4B
874data4 0x2480D3EE, 0x00000000
875data8 0x3FF0A1258A798A69
876data4 0x24DB8967, 0x00000000
877LOCAL_OBJECT_END(tanl_table_tm1)
878
879LOCAL_OBJECT_START(tanl_table_cm2)
880//
881//  Entries C_hi   double-precision memory format
882//  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
883//  Entries C_lo  single-precision memory format
884//  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
885//
886data8 0x400ED3E2E63EFBD0
887data4 0x259D94D4, 0x00000000
888data8 0x400DDDB4C515DAB5
889data4 0x245F0537, 0x00000000
890data8 0x400CF57ABE19A79F
891data4 0x25D4EA9F, 0x00000000
892data8 0x400C1A06D15298ED
893data4 0x24AE40A0, 0x00000000
894data8 0x400B4A4C164B2708
895data4 0x25A5AAB6, 0x00000000
896data8 0x400A855A5285B068
897data4 0x25524F18, 0x00000000
898data8 0x4009CA5A3FFA549F
899data4 0x24C999C0, 0x00000000
900data8 0x4009188A646AF623
901data4 0x254FD801, 0x00000000
902data8 0x40086F3C6084D0E7
903data4 0x2560F5FD, 0x00000000
904data8 0x4007CDD2A29A76EE
905data4 0x255B9D19, 0x00000000
906data8 0x400733BE6C8ECA95
907data4 0x25CB021B, 0x00000000
908data8 0x4006A07E1F8DDC52
909data4 0x24AB4722, 0x00000000
910data8 0x4006139BC298AD58
911data4 0x252764E2, 0x00000000
912data8 0x40058CABBAD7164B
913data4 0x24DAF5DB, 0x00000000
914data8 0x40050B4BAE31A5D3
915data4 0x25EA20F4, 0x00000000
916data8 0x40048F2189F85A8A
917data4 0x2583A3E8, 0x00000000
918data8 0x400417DAA862380D
919data4 0x25DCC4CC, 0x00000000
920data8 0x4003A52B1088FCFE
921data4 0x2430A492, 0x00000000
922data8 0x400336CCCD3527D5
923data4 0x255F77CF, 0x00000000
924data8 0x4002CC7F5760766D
925data4 0x25DA0BDA, 0x00000000
926data8 0x4002660711CE02E3
927data4 0x256FF4A2, 0x00000000
928data8 0x4002032CD37BBE04
929data4 0x25208AED, 0x00000000
930data8 0x4001A3BD7F050775
931data4 0x24B72DD6, 0x00000000
932data8 0x40014789A554848A
933data4 0x24AB4DAA, 0x00000000
934data8 0x4000EE65323E81B7
935data4 0x2584C440, 0x00000000
936data8 0x4000982721CF1293
937data4 0x25C9428D, 0x00000000
938data8 0x400044A93D415EEB
939data4 0x25DC8482, 0x00000000
940data8 0x3FFFE78FBD72C577
941data4 0x257F5070, 0x00000000
942data8 0x3FFF4AC375EFD28E
943data4 0x23EBBF7A, 0x00000000
944data8 0x3FFEB2AF60B52DDE
945data4 0x22EECA07, 0x00000000
946data8 0x3FFE1F1935204180
947data4 0x24191079, 0x00000000
948data8 0x3FFD8FCA54F7E60A
949data4 0x248D3058, 0x00000000
950LOCAL_OBJECT_END(tanl_table_cm2)
951
952LOCAL_OBJECT_START(tanl_table_cm1)
953//
954//  Entries C_hi   double-precision memory format
955//  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
956//  Entries C_lo  single-precision memory format
957//  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
958//
959data8 0x3FFCC06A79F6FADE
960data4 0x239C7886, 0x00000000
961data8 0x3FFBB91F891662A6
962data4 0x250BD191, 0x00000000
963data8 0x3FFABFB6529F155D
964data4 0x256CC3E6, 0x00000000
965data8 0x3FF9D3002E964AE9
966data4 0x250843E3, 0x00000000
967data8 0x3FF8F1EF89DCB383
968data4 0x2277C87E, 0x00000000
969data8 0x3FF81B937C87DBD6
970data4 0x256DA6CF, 0x00000000
971data8 0x3FF74F141042EDE4
972data4 0x2573D28A, 0x00000000
973data8 0x3FF68BAF1784B360
974data4 0x242E489A, 0x00000000
975data8 0x3FF5D0B57C923C4C
976data4 0x2532D940, 0x00000000
977data8 0x3FF51D88F418EF20
978data4 0x253C7DD6, 0x00000000
979data8 0x3FF4719A02F88DAE
980data4 0x23DB59BF, 0x00000000
981data8 0x3FF3CC6649DA0788
982data4 0x252B4756, 0x00000000
983data8 0x3FF32D770B980DB8
984data4 0x23FE585F, 0x00000000
985data8 0x3FF2945FE56C987A
986data4 0x25378A63, 0x00000000
987data8 0x3FF200BDB16523F6
988data4 0x247BB2E0, 0x00000000
989data8 0x3FF172358CE27778
990data4 0x24446538, 0x00000000
991data8 0x3FF0E873FDEFE692
992data4 0x2514638F, 0x00000000
993data8 0x3FF0632C33154062
994data4 0x24A7FC27, 0x00000000
995data8 0x3FEFC42EB3EF115F
996data4 0x248FD0FE, 0x00000000
997data8 0x3FEEC9E8135D26F6
998data4 0x2385C719, 0x00000000
999LOCAL_OBJECT_END(tanl_table_cm1)
1000
1001LOCAL_OBJECT_START(tanl_table_scim2)
1002//
1003//  Entries SC_inv in Swapped IEEE format (extended)
1004//  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
1005//
1006data8    0x839D6D4A1BF30C9E, 0x00004001
1007data8    0x80092804554B0EB0, 0x00004001
1008data8    0xF959F94CA1CF0DE9, 0x00004000
1009data8    0xF3086BA077378677, 0x00004000
1010data8    0xED154515CCD4723C, 0x00004000
1011data8    0xE77909441C27CF25, 0x00004000
1012data8    0xE22D037D8DDACB88, 0x00004000
1013data8    0xDD2B2D8A89C73522, 0x00004000
1014data8    0xD86E1A23BB2C1171, 0x00004000
1015data8    0xD3F0E288DFF5E0F9, 0x00004000
1016data8    0xCFAF16B1283BEBD5, 0x00004000
1017data8    0xCBA4AFAA0D88DD53, 0x00004000
1018data8    0xC7CE03CCCA67C43D, 0x00004000
1019data8    0xC427BC820CA0DDB0, 0x00004000
1020data8    0xC0AECD57F13D8CAB, 0x00004000
1021data8    0xBD606C3871ECE6B1, 0x00004000
1022data8    0xBA3A0A96A44C4929, 0x00004000
1023data8    0xB7394F6FE5CCCEC1, 0x00004000
1024data8    0xB45C12039637D8BC, 0x00004000
1025data8    0xB1A0552892CB051B, 0x00004000
1026data8    0xAF04432B6BA2FFD0, 0x00004000
1027data8    0xAC862A237221235F, 0x00004000
1028data8    0xAA2478AF5F00A9D1, 0x00004000
1029data8    0xA7DDBB0C81E082BF, 0x00004000
1030data8    0xA5B0987D45684FEE, 0x00004000
1031data8    0xA39BD0F5627A8F53, 0x00004000
1032data8    0xA19E3B036EC5C8B0, 0x00004000
1033data8    0x9FB6C1F091CD7C66, 0x00004000
1034data8    0x9DE464101FA3DF8A, 0x00004000
1035data8    0x9C263139A8F6B888, 0x00004000
1036data8    0x9A7B4968C27B0450, 0x00004000
1037data8    0x98E2DB7E5EE614EE, 0x00004000
1038LOCAL_OBJECT_END(tanl_table_scim2)
1039
1040LOCAL_OBJECT_START(tanl_table_scim1)
1041//
1042//  Entries SC_inv in Swapped IEEE format (extended)
1043//  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
1044//
1045data8    0x969F335C13B2B5BA, 0x00004000
1046data8    0x93D446D9D4C0F548, 0x00004000
1047data8    0x9147094F61B798AF, 0x00004000
1048data8    0x8EF317CC758787AC, 0x00004000
1049data8    0x8CD498B3B99EEFDB, 0x00004000
1050data8    0x8AE82A7DDFF8BC37, 0x00004000
1051data8    0x892AD546E3C55D42, 0x00004000
1052data8    0x8799FEA9D15573C1, 0x00004000
1053data8    0x86335F88435A4B4C, 0x00004000
1054data8    0x84F4FB6E3E93A87B, 0x00004000
1055data8    0x83DD195280A382FB, 0x00004000
1056data8    0x82EA3D7FA4CB8C9E, 0x00004000
1057data8    0x821B247C6861D0A8, 0x00004000
1058data8    0x816EBED163E8D244, 0x00004000
1059data8    0x80E42D9127E4CFC6, 0x00004000
1060data8    0x807ABF8D28E64AFD, 0x00004000
1061data8    0x8031EF26863B4FD8, 0x00004000
1062data8    0x800960ADAE8C11FD, 0x00004000
1063data8    0x8000E1475FDBEC21, 0x00004000
1064data8    0x80186650A07791FA, 0x00004000
1065LOCAL_OBJECT_END(tanl_table_scim1)
1066
1067Arg                 = f8
1068Save_Norm_Arg       = f8        // For input to reduction routine
1069Result              = f8
1070r                   = f8        // For output from reduction routine
1071c                   = f9        // For output from reduction routine
1072U_2                 = f10
1073rsq                 = f11
1074C_hi                = f12
1075C_lo                = f13
1076T_hi                = f14
1077T_lo                = f15
1078
1079d_1                 = f33
1080N_0                 = f34
1081tail                = f35
1082tanx                = f36
1083Cx                  = f37
1084Sx                  = f38
1085sgn_r               = f39
1086CORR                = f40
1087P                   = f41
1088D                   = f42
1089ArgPrime            = f43
1090P_0                 = f44
1091
1092P2_1                = f45
1093P2_2                = f46
1094P2_3                = f47
1095
1096P1_1                = f45
1097P1_2                = f46
1098P1_3                = f47
1099
1100P1_4                = f48
1101P1_5                = f49
1102P1_6                = f50
1103P1_7                = f51
1104P1_8                = f52
1105P1_9                = f53
1106
1107x                   = f56
1108xsq                 = f57
1109Tx                  = f58
1110Tx1                 = f59
1111Set                 = f60
1112poly1               = f61
1113poly2               = f62
1114Poly                = f63
1115Poly1               = f64
1116Poly2               = f65
1117r_to_the_8          = f66
1118B                   = f67
1119SC_inv              = f68
1120Pos_r               = f69
1121N_0_fix             = f70
1122d_2                 = f71
1123PI_BY_4             = f72
1124TWO_TO_NEG14        = f74
1125TWO_TO_NEG33        = f75
1126NEGTWO_TO_NEG14     = f76
1127NEGTWO_TO_NEG33     = f77
1128two_by_PI           = f78
1129N                   = f79
1130N_fix               = f80
1131P_1                 = f81
1132P_2                 = f82
1133P_3                 = f83
1134s_val               = f84
1135w                   = f85
1136B_mask1             = f86
1137B_mask2             = f87
1138w2                  = f88
1139A                   = f89
1140a                   = f90
1141t                   = f91
1142U_1                 = f92
1143NEGTWO_TO_NEG2      = f93
1144TWO_TO_NEG2         = f94
1145Q1_1                = f95
1146Q1_2                = f96
1147Q1_3                = f97
1148Q1_4                = f98
1149Q1_5                = f99
1150Q1_6                = f100
1151Q1_7                = f101
1152Q1_8                = f102
1153S_hi                = f103
1154S_lo                = f104
1155V_hi                = f105
1156V_lo                = f106
1157U_hi                = f107
1158U_lo                = f108
1159U_hiabs             = f109
1160V_hiabs             = f110
1161V                   = f111
1162Inv_P_0             = f112
1163
1164FR_inv_pi_2to63     = f113
1165FR_rshf_2to64       = f114
1166FR_2tom64           = f115
1167FR_rshf             = f116
1168Norm_Arg            = f117
1169Abs_Arg             = f118
1170TWO_TO_NEG65        = f119
1171fp_tmp              = f120
1172mOne                = f121
1173
1174GR_SAVE_B0     = r33
1175GR_SAVE_GP     = r34
1176GR_SAVE_PFS    = r35
1177table_base     = r36
1178table_ptr1     = r37
1179table_ptr2     = r38
1180table_ptr3     = r39
1181lookup         = r40
1182N_fix_gr       = r41
1183GR_exp_2tom2   = r42
1184GR_exp_2tom65  = r43
1185exp_r          = r44
1186sig_r          = r45
1187bmask1         = r46
1188table_offset   = r47
1189bmask2         = r48
1190gr_tmp         = r49
1191cot_flag       = r50
1192
1193GR_sig_inv_pi  = r51
1194GR_rshf_2to64  = r52
1195GR_exp_2tom64  = r53
1196GR_rshf        = r54
1197GR_exp_2_to_63 = r55
1198GR_exp_2_to_24 = r56
1199GR_signexp_x   = r57
1200GR_exp_x       = r58
1201GR_exp_mask    = r59
1202GR_exp_2tom14  = r60
1203GR_exp_m2tom14 = r61
1204GR_exp_2tom33  = r62
1205GR_exp_m2tom33 = r63
1206
1207GR_SAVE_B0                  = r64
1208GR_SAVE_PFS                 = r65
1209GR_SAVE_GP                  = r66
1210
1211GR_Parameter_X              = r67
1212GR_Parameter_Y              = r68
1213GR_Parameter_RESULT         = r69
1214GR_Parameter_Tag            = r70
1215
1216
1217.section .text
1218.global __libm_tanl#
1219.global __libm_cotl#
1220
1221.proc __libm_cotl#
1222__libm_cotl:
1223.endp __libm_cotl#
1224LOCAL_LIBM_ENTRY(cotl)
1225
1226{ .mlx
1227      alloc r32 = ar.pfs, 0,35,4,0
1228      movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
1229}
1230{ .mlx
1231      mov GR_exp_mask = 0x1ffff            // Exponent mask
1232      movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
1233}
1234;;
1235
1236//     Check for NatVals, Infs , NaNs, and Zeros
1237{ .mfi
1238      getf.exp GR_signexp_x = Arg          // Get sign and exponent of x
1239      fclass.m  p6,p0 = Arg, 0x1E7         // Test for natval, nan, inf, zero
1240      mov cot_flag = 0x1
1241}
1242{ .mfb
1243      addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr
1244      fnorm.s1 Norm_Arg = Arg              // Normalize x
1245      br.cond.sptk COMMON_PATH
1246};;
1247
1248LOCAL_LIBM_END(cotl)
1249
1250
1251.proc __libm_tanl#
1252__libm_tanl:
1253.endp __libm_tanl#
1254GLOBAL_IEEE754_ENTRY(tanl)
1255
1256{ .mlx
1257      alloc r32 = ar.pfs, 0,35,4,0
1258      movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
1259}
1260{ .mlx
1261      mov GR_exp_mask = 0x1ffff            // Exponent mask
1262      movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
1263}
1264;;
1265
1266//     Check for NatVals, Infs , NaNs, and Zeros
1267{ .mfi
1268      getf.exp GR_signexp_x = Arg          // Get sign and exponent of x
1269      fclass.m  p6,p0 = Arg, 0x1E7         // Test for natval, nan, inf, zero
1270      mov cot_flag = 0x0
1271}
1272{ .mfi
1273      addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr
1274      fnorm.s1 Norm_Arg = Arg              // Normalize x
1275      nop.i 0
1276};;
1277
1278// Common path for both tanl and cotl
1279COMMON_PATH:
1280{ .mfi
1281      setf.sig FR_inv_pi_2to63 = GR_sig_inv_pi // Form 1/pi * 2^63
1282      fclass.m p9, p0 = Arg, 0x0b          // Test x denormal
1283      mov GR_exp_2tom64 = 0xffff - 64      // Scaling constant to compute N
1284}
1285{ .mlx
1286      setf.d FR_rshf_2to64 = GR_rshf_2to64 // Form const 1.1000 * 2^(63+64)
1287      movl GR_rshf = 0x43e8000000000000    // Form const 1.1000 * 2^63
1288}
1289;;
1290
1291// Check for everything - if false, then must be pseudo-zero or pseudo-nan.
1292// Branch out to deal with special values.
1293{ .mfi
1294      addl gr_tmp = -1,r0
1295      fclass.nm  p7,p0 = Arg, 0x1FF        // Test x unsupported
1296      mov GR_exp_2_to_63 = 0xffff + 63     // Exponent of 2^63
1297}
1298{ .mfb
1299      ld8 table_base = [table_base]        // Get pointer to constant table
1300      fms.s1 mOne = f0, f0, f1
1301(p6)  br.cond.spnt TANL_SPECIAL            // Branch if x natval, nan, inf, zero
1302}
1303;;
1304
1305{ .mmb
1306      setf.sig fp_tmp = gr_tmp   // Make a constant so fmpy produces inexact
1307      mov GR_exp_2_to_24 = 0xffff + 24     // Exponent of 2^24
1308(p9)  br.cond.spnt TANL_DENORMAL           // Branch if x denormal
1309}
1310;;
1311
1312TANL_COMMON:
1313// Return to here if x denormal
1314//
1315// Do fcmp to generate Denormal exception
1316//  - can't do FNORM (will generate Underflow when U is unmasked!)
1317// Branch out to deal with unsupporteds values.
1318{ .mfi
1319      setf.exp FR_2tom64 = GR_exp_2tom64 // Form 2^-64 for scaling N_float
1320      fcmp.eq.s0 p0, p6 = Arg, f1        // Dummy to flag denormals
1321      add table_ptr1 = 0, table_base     // Point to tanl_table_1
1322}
1323{ .mib
1324      setf.d FR_rshf = GR_rshf           // Form right shift const 1.1000 * 2^63
1325      add table_ptr2 = 80, table_base    // Point to tanl_table_2
1326(p7)  br.cond.spnt TANL_UNSUPPORTED      // Branch if x unsupported type
1327}
1328;;
1329
1330{ .mfi
1331      and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x
1332      fmpy.s1 Save_Norm_Arg = Norm_Arg, f1     // Save x if large arg reduction
1333      dep.z bmask1 = 0x7c, 56, 8               // Form mask to get 5 msb of r
1334                                               // bmask1 = 0x7c00000000000000
1335}
1336;;
1337
1338//
1339//     Decide about the paths to take:
1340//     Set PR_6 if |Arg| >= 2**63
1341//     Set PR_9 if |Arg| < 2**24 - CASE 1 OR 2
1342//     OTHERWISE Set PR_8 - CASE 3 OR 4
1343//
1344//     Branch out if the magnitude of the input argument is >= 2^63
1345//     - do this branch before the next.
1346{ .mfi
1347      ldfe two_by_PI = [table_ptr1],16        // Load 2/pi
1348      nop.f 999
1349      dep.z bmask2 = 0x41, 57, 7              // Form mask to OR to produce B
1350                                              // bmask2 = 0x8200000000000000
1351}
1352{ .mib
1353      ldfe PI_BY_4 = [table_ptr2],16          // Load pi/4
1354      cmp.ge p6,p0 = GR_exp_x, GR_exp_2_to_63 // Is |x| >= 2^63
1355(p6)  br.cond.spnt TANL_ARG_TOO_LARGE         // Branch if |x| >= 2^63
1356}
1357;;
1358
1359{ .mmi
1360      ldfe P_0 = [table_ptr1],16              // Load P_0
1361      ldfe Inv_P_0 = [table_ptr2],16          // Load Inv_P_0
1362      nop.i 999
1363}
1364;;
1365
1366{ .mfi
1367      ldfe P_1 = [table_ptr1],16              // Load P_1
1368      fmerge.s Abs_Arg = f0, Norm_Arg         // Get |x|
1369      mov GR_exp_m2tom33 = 0x2ffff - 33       // Form signexp of -2^-33
1370}
1371{ .mfi
1372      ldfe d_1 = [table_ptr2],16              // Load d_1 for 2^24 <= |x| < 2^63
1373      nop.f 999
1374      mov GR_exp_2tom33 = 0xffff - 33         // Form signexp of 2^-33
1375}
1376;;
1377
1378{ .mmi
1379      ldfe P_2 = [table_ptr1],16              // Load P_2
1380      ldfe d_2 = [table_ptr2],16              // Load d_2 for 2^24 <= |x| < 2^63
1381      cmp.ge p8,p0 = GR_exp_x, GR_exp_2_to_24 // Is |x| >= 2^24
1382}
1383;;
1384
1385// Use special scaling to right shift so N=Arg * 2/pi is in rightmost bits
1386// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
1387{ .mfb
1388      ldfe   P_3 = [table_ptr1],16            // Load P_3
1389      fma.s1      N_fix = Norm_Arg, FR_inv_pi_2to63, FR_rshf_2to64
1390(p8)  br.cond.spnt TANL_LARGER_ARG            // Branch if 2^24 <= |x| < 2^63
1391}
1392;;
1393
1394// Here if 0 < |x| < 2^24
1395//     ARGUMENT REDUCTION CODE - CASE 1 and 2
1396//
1397{ .mmf
1398      setf.exp TWO_TO_NEG33 = GR_exp_2tom33      // Form 2^-33
1399      setf.exp NEGTWO_TO_NEG33 = GR_exp_m2tom33  // Form -2^-33
1400      fmerge.s r = Norm_Arg,Norm_Arg          // Assume r=x, ok if |x| < pi/4
1401}
1402;;
1403
1404//
1405// If |Arg| < pi/4,  set PR_8, else  pi/4 <=|Arg| < 2^24 - set PR_9.
1406//
1407//     Case 2: Convert integer N_fix back to normalized floating-point value.
1408{ .mfi
1409      getf.sig sig_r = Norm_Arg               // Get sig_r if 1/4 <= |x| < pi/4
1410      fcmp.lt.s1 p8,p9= Abs_Arg,PI_BY_4       // Test |x| < pi/4
1411      mov GR_exp_2tom2 = 0xffff - 2           // Form signexp of 2^-2
1412}
1413{ .mfi
1414      ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2] // Load 2^-2, -2^-2
1415      fms.s1 N = N_fix, FR_2tom64, FR_rshf    // Use scaling to get N floated
1416      mov N_fix_gr = r0                       // Assume N=0, ok if |x| < pi/4
1417}
1418;;
1419
1420//
1421//     Case 1: Is |r| < 2**(-2).
1422//     Arg is the same as r in this case.
1423//     r = Arg
1424//     c = 0
1425//
1426//     Case 2: Place integer part of N in GP register.
1427{ .mfi
1428(p9)  getf.sig N_fix_gr = N_fix
1429      fmerge.s c = f0, f0                     // Assume c=0, ok if |x| < pi/4
1430      cmp.lt p10, p0 = GR_exp_x, GR_exp_2tom2 // Test if |x| < 1/4
1431}
1432;;
1433
1434{ .mfi
1435      setf.sig B_mask1 = bmask1               // Form mask to get 5 msb of r
1436      nop.f 999
1437      mov exp_r = GR_exp_x                    // Get exp_r if 1/4 <= |x| < pi/4
1438}
1439{ .mbb
1440      setf.sig B_mask2 = bmask2               // Form mask to form B from r
1441(p10) br.cond.spnt TANL_SMALL_R               // Branch if 0 < |x| < 1/4
1442(p8)  br.cond.spnt TANL_NORMAL_R              // Branch if 1/4 <= |x| < pi/4
1443}
1444;;
1445
1446// Here if pi/4 <= |x| < 2^24
1447//
1448//     Case 1: PR_3 is only affected  when PR_1 is set.
1449//
1450//
1451//     Case 2: w = N * P_2
1452//     Case 2: s_val = -N * P_1  + Arg
1453//
1454
1455{ .mfi
1456      nop.m 999
1457      fnma.s1 s_val = N, P_1, Norm_Arg
1458      nop.i 999
1459}
1460{ .mfi
1461      nop.m 999
1462      fmpy.s1 w = N, P_2                     // w = N * P_2 for |s| >= 2^-33
1463      nop.i 999
1464}
1465;;
1466
1467//     Case 2_reduce: w = N * P_3 (change sign)
1468{ .mfi
1469      nop.m 999
1470      fmpy.s1 w2 = N, P_3                    // w = N * P_3 for |s| < 2^-33
1471      nop.i 999
1472}
1473;;
1474
1475//     Case 1_reduce: r = s + w (change sign)
1476{ .mfi
1477      nop.m 999
1478      fsub.s1 r = s_val, w                   // r = s_val - w for |s| >= 2^-33
1479      nop.i 999
1480}
1481;;
1482
1483//     Case 2_reduce: U_1 = N * P_2 + w
1484{ .mfi
1485      nop.m 999
1486      fma.s1  U_1 = N, P_2, w2              // U_1 = N * P_2 + w for |s| < 2^-33
1487      nop.i 999
1488}
1489;;
1490
1491//
1492//     Decide between case_1 and case_2 reduce:
1493//     Case 1_reduce:  |s| >= 2**(-33)
1494//     Case 2_reduce:  |s| < 2**(-33)
1495//
1496{ .mfi
1497      nop.m 999
1498      fcmp.lt.s1 p9, p8 = s_val, TWO_TO_NEG33
1499      nop.i 999
1500}
1501;;
1502
1503{ .mfi
1504      nop.m 999
1505(p9)  fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
1506      nop.i 999
1507}
1508;;
1509
1510//     Case 1_reduce: c = s - r
1511{ .mfi
1512      nop.m 999
1513      fsub.s1 c = s_val, r                     // c = s_val - r for |s| >= 2^-33
1514      nop.i 999
1515}
1516;;
1517
1518//     Case 2_reduce: r is complete here - continue to calculate c .
1519//     r = s - U_1
1520{ .mfi
1521      nop.m 999
1522(p9)  fsub.s1 r = s_val, U_1
1523      nop.i 999
1524}
1525{ .mfi
1526      nop.m 999
1527(p9)  fms.s1 U_2 = N, P_2, U_1
1528      nop.i 999
1529}
1530;;
1531
1532//
1533//     Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
1534//     else set PR_13.
1535//
1536
1537{ .mfi
1538      nop.m 999
1539      fand B = B_mask1, r
1540      nop.i 999
1541}
1542{ .mfi
1543      nop.m 999
1544(p8)  fcmp.lt.unc.s1 p10, p13 = r, TWO_TO_NEG2
1545      nop.i 999
1546}
1547;;
1548
1549{ .mfi
1550(p8)  getf.sig sig_r = r               // Get signif of r if |s| >= 2^-33
1551      nop.f 999
1552      nop.i 999
1553}
1554;;
1555
1556{ .mfi
1557(p8)  getf.exp exp_r = r               // Extract signexp of r if |s| >= 2^-33
1558(p10) fcmp.gt.s1 p10, p13 = r, NEGTWO_TO_NEG2
1559      nop.i 999
1560}
1561;;
1562
1563//     Case 1_reduce: c is complete here.
1564//     Case 1: Branch to SMALL_R or NORMAL_R.
1565//     c = c + w (w has not been negated.)
1566{ .mfi
1567      nop.m 999
1568(p8)  fsub.s1 c = c, w                         // c = c - w for |s| >= 2^-33
1569      nop.i 999
1570}
1571{ .mbb
1572      nop.m 999
1573(p10) br.cond.spnt TANL_SMALL_R     // Branch if pi/4 < |x| < 2^24 and |r|<1/4
1574(p13) br.cond.sptk TANL_NORMAL_R_A  // Branch if pi/4 < |x| < 2^24 and |r|>=1/4
1575}
1576;;
1577
1578
1579// Here if pi/4 < |x| < 2^24 and |s| < 2^-33
1580//
1581//     Is i_1 = lsb of N_fix_gr even or odd?
1582//     if i_1 == 0, set p11, else set p12.
1583//
1584{ .mfi
1585      nop.m 999
1586      fsub.s1 s_val = s_val, r
1587      add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
1588}
1589{ .mfi
1590      nop.m 999
1591//
1592//     Case 2_reduce:
1593//     U_2 = N * P_2 - U_1
1594//     Not needed until later.
1595//
1596      fadd.s1 U_2 = U_2, w2
1597//
1598//     Case 2_reduce:
1599//     s = s - r
1600//     U_2 = U_2 + w
1601//
1602      nop.i 999
1603}
1604;;
1605
1606//
1607//     Case 2_reduce:
1608//     c = c - U_2
1609//     c is complete here
1610//     Argument reduction ends here.
1611//
1612{ .mfi
1613      nop.m 999
1614      fmpy.s1 rsq = r, r
1615      tbit.z p11, p12 = N_fix_gr, 0 ;;    // Set p11 if N even, p12 if odd
1616}
1617
1618{ .mfi
1619      nop.m 999
1620(p12) frcpa.s1 S_hi,p0 = f1, r
1621      nop.i 999
1622}
1623{ .mfi
1624      nop.m 999
1625      fsub.s1 c = s_val, U_1
1626      nop.i 999
1627}
1628;;
1629
1630{ .mmi
1631      add table_ptr1 = 160, table_base ;;  // Point to tanl_table_p1
1632      ldfe P1_1 = [table_ptr1],144
1633      nop.i 999 ;;
1634}
1635//
1636//     Load P1_1 and point to Q1_1 .
1637//
1638{ .mfi
1639      ldfe Q1_1 = [table_ptr1]
1640//
1641//     N even: rsq = r * Z
1642//     N odd:  S_hi = frcpa(r)
1643//
1644(p12) fmerge.ns S_hi = S_hi, S_hi
1645      nop.i 999
1646}
1647{ .mfi
1648      nop.m 999
1649//
1650//     Case 2_reduce:
1651//     c = s - U_1
1652//
1653(p9)  fsub.s1 c = c, U_2
1654      nop.i 999 ;;
1655}
1656{ .mfi
1657      nop.m 999
1658(p12) fma.s1  poly1 = S_hi, r, f1
1659      nop.i 999 ;;
1660}
1661{ .mfi
1662      nop.m 999
1663//
1664//     N odd:  Change sign of S_hi
1665//
1666(p11) fmpy.s1 rsq = rsq, P1_1
1667      nop.i 999 ;;
1668}
1669{ .mfi
1670      nop.m 999
1671(p12) fma.s1 S_hi = S_hi, poly1, S_hi
1672      nop.i 999 ;;
1673}
1674{ .mfi
1675      nop.m 999
1676//
1677//     N even: rsq = rsq * P1_1
1678//     N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
1679//
1680(p11) fma.s1 Poly = r, rsq, c
1681      nop.i 999 ;;
1682}
1683{ .mfi
1684      nop.m 999
1685//
1686//     N even: Poly = c  + r * rsq
1687//     N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
1688//
1689(p12) fma.s1 poly1 = S_hi, r, f1
1690(p11) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
1691}
1692{ .mfi
1693      nop.m 999
1694//
1695//     N even: Result = Poly + r
1696//     N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
1697//
1698(p14) fadd.s0 Result = r, Poly             // for tanl
1699      nop.i 999
1700}
1701{ .mfi
1702      nop.m 999
1703(p15) fms.s0 Result = r, mOne, Poly        // for cotl
1704      nop.i 999
1705}
1706;;
1707
1708{ .mfi
1709      nop.m 999
1710(p12) fma.s1  S_hi = S_hi, poly1, S_hi
1711      nop.i 999 ;;
1712}
1713{ .mfi
1714      nop.m 999
1715//
1716//     N even: Result1 = Result + r
1717//     N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
1718//
1719(p12) fma.s1 poly1 = S_hi, r, f1
1720      nop.i 999 ;;
1721}
1722{ .mfi
1723      nop.m 999
1724//
1725//     N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
1726//
1727(p12) fma.s1 S_hi = S_hi, poly1, S_hi
1728      nop.i 999 ;;
1729}
1730{ .mfi
1731      nop.m 999
1732//
1733//     N odd:  poly1  =  S_hi * poly + 1.0    64 bits
1734//
1735(p12) fma.s1 poly1 = S_hi, r, f1
1736      nop.i 999 ;;
1737}
1738{ .mfi
1739      nop.m 999
1740//
1741//     N odd:  poly1  =  S_hi * r + 1.0
1742//
1743(p12) fma.s1 poly1 = S_hi, c, poly1
1744      nop.i 999 ;;
1745}
1746{ .mfi
1747      nop.m 999
1748//
1749//     N odd:  poly1  =  S_hi * c + poly1
1750//
1751(p12) fmpy.s1 S_lo = S_hi, poly1
1752      nop.i 999 ;;
1753}
1754{ .mfi
1755      nop.m 999
1756//
1757//     N odd:  S_lo  =  S_hi *  poly1
1758//
1759(p12) fma.s1 S_lo = Q1_1, r, S_lo
1760(p12) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
1761}
1762{ .mfi
1763      nop.m 999
1764//
1765//     N odd:  Result =  S_hi + S_lo
1766//
1767      fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
1768      nop.i 999 ;;
1769}
1770{ .mfi
1771      nop.m 999
1772//
1773//     N odd:  S_lo  =  S_lo + Q1_1 * r
1774//
1775(p14) fadd.s0 Result = S_hi, S_lo          // for tanl
1776      nop.i 999
1777}
1778{ .mfb
1779      nop.m 999
1780(p15) fms.s0 Result = S_hi, mOne, S_lo     // for cotl
1781      br.ret.sptk b0 ;;          // Exit for pi/4 <= |x| < 2^24 and |s| < 2^-33
1782}
1783
1784
1785TANL_LARGER_ARG:
1786// Here if 2^24 <= |x| < 2^63
1787//
1788// ARGUMENT REDUCTION CODE - CASE 3 and 4
1789//
1790
1791{ .mmf
1792      mov GR_exp_2tom14 = 0xffff - 14          // Form signexp of 2^-14
1793      mov GR_exp_m2tom14 = 0x2ffff - 14        // Form signexp of -2^-14
1794      fmpy.s1 N_0 = Norm_Arg, Inv_P_0
1795}
1796;;
1797
1798{ .mmi
1799      setf.exp TWO_TO_NEG14 = GR_exp_2tom14    // Form 2^-14
1800      setf.exp NEGTWO_TO_NEG14 = GR_exp_m2tom14// Form -2^-14
1801      nop.i 999
1802}
1803;;
1804
1805
1806//
1807//    Adjust table_ptr1 to beginning of table.
1808//    N_0 = Arg * Inv_P_0
1809//
1810{ .mmi
1811      add table_ptr2 = 144, table_base ;;     // Point to 2^-2
1812      ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2]
1813      nop.i 999
1814}
1815;;
1816
1817//
1818//    N_0_fix  = integer part of N_0 .
1819//
1820//
1821//    Make N_0 the integer part.
1822//
1823{ .mfi
1824      nop.m 999
1825      fcvt.fx.s1 N_0_fix = N_0
1826      nop.i 999 ;;
1827}
1828{ .mfi
1829      setf.sig B_mask1 = bmask1               // Form mask to get 5 msb of r
1830      fcvt.xf N_0 = N_0_fix
1831      nop.i 999 ;;
1832}
1833{ .mfi
1834      setf.sig B_mask2 = bmask2               // Form mask to form B from r
1835      fnma.s1 ArgPrime = N_0, P_0, Norm_Arg
1836      nop.i 999
1837}
1838{ .mfi
1839      nop.m 999
1840      fmpy.s1 w = N_0, d_1
1841      nop.i 999 ;;
1842}
1843//
1844//    ArgPrime = -N_0 * P_0 + Arg
1845//    w  = N_0 * d_1
1846//
1847//
1848//    N = ArgPrime * 2/pi
1849//
1850//      fcvt.fx.s1 N_fix = N
1851// Use special scaling to right shift so N=Arg * 2/pi is in rightmost bits
1852// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
1853{ .mfi
1854      nop.m 999
1855      fma.s1      N_fix = ArgPrime, FR_inv_pi_2to63, FR_rshf_2to64
1856
1857      nop.i 999 ;;
1858}
1859//     Convert integer N_fix back to normalized floating-point value.
1860{ .mfi
1861      nop.m 999
1862      fms.s1 N = N_fix, FR_2tom64, FR_rshf    // Use scaling to get N floated
1863      nop.i 999
1864}
1865;;
1866
1867//
1868//    N is the integer part of the reduced-reduced argument.
1869//    Put the integer in a GP register.
1870//
1871{ .mfi
1872      getf.sig N_fix_gr = N_fix
1873      nop.f 999
1874      nop.i 999
1875}
1876;;
1877
1878//
1879//    s_val = -N*P_1 + ArgPrime
1880//    w = -N*P_2 + w
1881//
1882{ .mfi
1883      nop.m 999
1884      fnma.s1 s_val = N, P_1, ArgPrime
1885      nop.i 999
1886}
1887{ .mfi
1888      nop.m 999
1889      fnma.s1 w = N, P_2, w
1890      nop.i 999
1891}
1892;;
1893
1894//    Case 4: V_hi = N * P_2
1895//    Case 4: U_hi = N_0 * d_1
1896{ .mfi
1897      nop.m 999
1898      fmpy.s1 V_hi = N, P_2               // V_hi = N * P_2 for |s| < 2^-14
1899      nop.i 999
1900}
1901{ .mfi
1902      nop.m 999
1903      fmpy.s1 U_hi = N_0, d_1             // U_hi = N_0 * d_1 for |s| < 2^-14
1904      nop.i 999
1905}
1906;;
1907
1908//    Case 3: r = s_val + w (Z complete)
1909//    Case 4: w = N * P_3
1910{ .mfi
1911      nop.m 999
1912      fadd.s1 r = s_val, w                // r = s_val + w for |s| >= 2^-14
1913      nop.i 999
1914}
1915{ .mfi
1916      nop.m 999
1917      fmpy.s1 w2 = N, P_3                 // w = N * P_3 for |s| < 2^-14
1918      nop.i 999
1919}
1920;;
1921
1922//    Case 4: A =  U_hi + V_hi
1923//    Note: Worry about switched sign of V_hi, so subtract instead of add.
1924//    Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
1925//    Note: the (-) is still missing for V_hi.
1926{ .mfi
1927      nop.m 999
1928      fsub.s1 A = U_hi, V_hi           // A = U_hi - V_hi for |s| < 2^-14
1929      nop.i 999
1930}
1931{ .mfi
1932      nop.m 999
1933      fnma.s1 V_lo = N, P_2, V_hi      // V_lo = V_hi - N * P_2 for |s| < 2^-14
1934      nop.i 999
1935}
1936;;
1937
1938//    Decide between case 3 and 4:
1939//    Case 3:  |s| >= 2**(-14)     Set p10
1940//    Case 4:  |s| <  2**(-14)     Set p11
1941//
1942//    Case 4: U_lo = N_0 * d_1 - U_hi
1943{ .mfi
1944      nop.m 999
1945      fms.s1 U_lo = N_0, d_1, U_hi     // U_lo = N_0*d_1 - U_hi for |s| < 2^-14
1946      nop.i 999
1947}
1948{ .mfi
1949      nop.m 999
1950      fcmp.lt.s1 p11, p10 = s_val, TWO_TO_NEG14
1951      nop.i 999
1952}
1953;;
1954
1955//    Case 4: We need abs of both U_hi and V_hi - dont
1956//    worry about switched sign of V_hi.
1957{ .mfi
1958      nop.m 999
1959      fabs V_hiabs = V_hi              // |V_hi| for |s| < 2^-14
1960      nop.i 999
1961}
1962{ .mfi
1963      nop.m 999
1964(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
1965      nop.i 999
1966}
1967;;
1968
1969//    Case 3: c = s_val - r
1970{ .mfi
1971      nop.m 999
1972      fabs U_hiabs = U_hi              // |U_hi| for |s| < 2^-14
1973      nop.i 999
1974}
1975{ .mfi
1976      nop.m 999
1977      fsub.s1 c = s_val, r             // c = s_val - r    for |s| >= 2^-14
1978      nop.i 999
1979}
1980;;
1981
1982// For Case 3, |s| >= 2^-14, determine if |r| < 1/4
1983//
1984//    Case 4: C_hi = s_val + A
1985//
1986{ .mfi
1987      nop.m 999
1988(p11) fadd.s1 C_hi = s_val, A              // C_hi = s_val + A for |s| < 2^-14
1989      nop.i 999
1990}
1991{ .mfi
1992      nop.m 999
1993(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
1994      nop.i 999
1995}
1996;;
1997
1998{ .mfi
1999      getf.sig sig_r = r               // Get signif of r if |s| >= 2^-33
2000      fand B = B_mask1, r
2001      nop.i 999
2002}
2003;;
2004
2005//    Case 4: t = U_lo + V_lo
2006{ .mfi
2007      getf.exp exp_r = r               // Extract signexp of r if |s| >= 2^-33
2008(p11) fadd.s1 t = U_lo, V_lo               // t = U_lo + V_lo for |s| < 2^-14
2009      nop.i 999
2010}
2011{ .mfi
2012      nop.m 999
2013(p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2
2014      nop.i 999
2015}
2016;;
2017
2018//    Case 3: c = (s - r) + w (c complete)
2019{ .mfi
2020      nop.m 999
2021(p10) fadd.s1 c = c, w              // c = c + w for |s| >= 2^-14
2022      nop.i 999
2023}
2024{ .mbb
2025      nop.m 999
2026(p14) br.cond.spnt TANL_SMALL_R     // Branch if 2^24 <= |x| < 2^63 and |r|< 1/4
2027(p15) br.cond.sptk TANL_NORMAL_R_A  // Branch if 2^24 <= |x| < 2^63 and |r|>=1/4
2028}
2029;;
2030
2031
2032// Here if 2^24 <= |x| < 2^63 and |s| < 2^-14  >>>>>>>  Case 4.
2033//
2034//    Case 4: Set P_12 if U_hiabs >= V_hiabs
2035//    Case 4: w = w + N_0 * d_2
2036//    Note: the (-) is now incorporated in w .
2037{ .mfi
2038      add table_ptr1 = 160, table_base           // Point to tanl_table_p1
2039      fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
2040      nop.i 999
2041}
2042{ .mfi
2043      nop.m 999
2044      fms.s1 w2 = N_0, d_2, w2
2045      nop.i 999
2046}
2047;;
2048
2049//    Case 4: C_lo = s_val - C_hi
2050{ .mfi
2051      ldfe P1_1 = [table_ptr1], 16               // Load P1_1
2052      fsub.s1 C_lo = s_val, C_hi
2053      nop.i 999
2054}
2055;;
2056
2057//
2058//    Case 4: a = U_hi - A
2059//            a = V_hi - A (do an add to account for missing (-) on V_hi
2060//
2061{ .mfi
2062      ldfe P1_2 = [table_ptr1], 128              // Load P1_2
2063(p12) fsub.s1 a = U_hi, A
2064      nop.i 999
2065}
2066{ .mfi
2067      nop.m 999
2068(p13) fadd.s1 a = V_hi, A
2069      nop.i 999
2070}
2071;;
2072
2073//    Case 4: t = U_lo + V_lo  + w
2074{ .mfi
2075      ldfe Q1_1 = [table_ptr1], 16               // Load Q1_1
2076      fadd.s1 t = t, w2
2077      nop.i 999
2078}
2079;;
2080
2081//    Case 4: a = (U_hi - A)  + V_hi
2082//            a = (V_hi - A)  + U_hi
2083//    In each case account for negative missing form V_hi .
2084//
2085{ .mfi
2086      ldfe Q1_2 = [table_ptr1], 16               // Load Q1_2
2087(p12) fsub.s1 a = a, V_hi
2088      nop.i 999
2089}
2090{ .mfi
2091      nop.m 999
2092(p13) fsub.s1 a = U_hi, a
2093      nop.i 999
2094}
2095;;
2096
2097//
2098//    Case 4: C_lo = (s_val - C_hi) + A
2099//
2100{ .mfi
2101      nop.m 999
2102      fadd.s1 C_lo = C_lo, A
2103      nop.i 999 ;;
2104}
2105//
2106//    Case 4: t = t + a
2107//
2108{ .mfi
2109      nop.m 999
2110      fadd.s1 t = t, a
2111      nop.i 999
2112}
2113;;
2114
2115//    Case 4: C_lo = C_lo + t
2116//    Case 4: r = C_hi + C_lo
2117{ .mfi
2118      nop.m 999
2119      fadd.s1 C_lo = C_lo, t
2120      nop.i 999
2121}
2122;;
2123
2124{ .mfi
2125      nop.m 999
2126      fadd.s1 r = C_hi, C_lo
2127      nop.i 999
2128}
2129;;
2130
2131//
2132//    Case 4: c = C_hi - r
2133//
2134{ .mfi
2135      nop.m 999
2136      fsub.s1 c = C_hi, r
2137      nop.i 999
2138}
2139{ .mfi
2140      nop.m 999
2141      fmpy.s1 rsq = r, r
2142      add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
2143}
2144;;
2145
2146//    Case 4: c = c + C_lo  finished.
2147//
2148//    Is i_1 = lsb of N_fix_gr even or odd?
2149//    if i_1 == 0, set PR_11, else set PR_12.
2150//
2151{ .mfi
2152      nop.m 999
2153      fadd.s1 c = c , C_lo
2154      tbit.z p11, p12 =  N_fix_gr, 0
2155}
2156;;
2157
2158// r and c have been computed.
2159{ .mfi
2160      nop.m 999
2161(p12) frcpa.s1 S_hi, p0 = f1, r
2162      nop.i 999
2163}
2164{ .mfi
2165      nop.m 999
2166//
2167//    N odd: Change sign of S_hi
2168//
2169(p11) fma.s1 Poly = rsq, P1_2, P1_1
2170      nop.i 999 ;;
2171}
2172{ .mfi
2173      nop.m 999
2174(p12) fma.s1 P = rsq, Q1_2, Q1_1
2175      nop.i 999
2176}
2177{ .mfi
2178      nop.m 999
2179//
2180//    N odd:  Result  =  S_hi + S_lo      (User supplied rounding mode for C1)
2181//
2182       fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2183      nop.i 999 ;;
2184}
2185{ .mfi
2186      nop.m 999
2187//
2188//    N even: rsq = r * r
2189//    N odd:  S_hi = frcpa(r)
2190//
2191(p12) fmerge.ns S_hi = S_hi, S_hi
2192      nop.i 999
2193}
2194{ .mfi
2195      nop.m 999
2196//
2197//    N even: rsq = rsq * P1_2 + P1_1
2198//    N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
2199//
2200(p11) fmpy.s1 Poly = rsq, Poly
2201      nop.i 999 ;;
2202}
2203{ .mfi
2204      nop.m 999
2205(p12) fma.s1 poly1 = S_hi, r,f1
2206(p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
2207}
2208{ .mfi
2209      nop.m 999
2210//
2211//    N even: Poly =  Poly * rsq
2212//    N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
2213//
2214(p11) fma.s1 Poly = r, Poly, c
2215      nop.i 999 ;;
2216}
2217{ .mfi
2218      nop.m 999
2219(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2220      nop.i 999
2221}
2222{ .mfi
2223      nop.m 999
2224//
2225//    N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
2226//
2227(p14) fadd.s0 Result = r, Poly          // for tanl
2228      nop.i 999 ;;
2229}
2230
2231.pred.rel "mutex",p15,p12
2232{ .mfi
2233      nop.m 999
2234(p15) fms.s0 Result = r, mOne, Poly     // for cotl
2235      nop.i 999
2236}
2237{ .mfi
2238      nop.m 999
2239(p12) fma.s1 poly1 =  S_hi, r, f1
2240      nop.i 999 ;;
2241}
2242{ .mfi
2243      nop.m 999
2244//
2245//    N even: Poly = Poly * r + c
2246//    N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
2247//
2248(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2249      nop.i 999 ;;
2250}
2251{ .mfi
2252      nop.m 999
2253(p12) fma.s1 poly1 = S_hi, r, f1
2254      nop.i 999 ;;
2255}
2256{ .mfi
2257      nop.m 999
2258//
2259//    N even: Result = Poly + r  (Rounding mode S0)
2260//    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2261//
2262(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2263      nop.i 999 ;;
2264}
2265{ .mfi
2266      nop.m 999
2267//
2268//    N odd:  poly1  =  S_hi * poly + S_hi    64 bits
2269//
2270(p12) fma.s1 poly1 = S_hi, r, f1
2271      nop.i 999 ;;
2272}
2273{ .mfi
2274      nop.m 999
2275//
2276//    N odd:  poly1  =  S_hi * r + 1.0
2277//
2278(p12) fma.s1 poly1 = S_hi, c, poly1
2279      nop.i 999 ;;
2280}
2281{ .mfi
2282      nop.m 999
2283//
2284//    N odd:  poly1  =  S_hi * c + poly1
2285//
2286(p12) fmpy.s1 S_lo = S_hi, poly1
2287      nop.i 999 ;;
2288}
2289{ .mfi
2290      nop.m 999
2291//
2292//    N odd:  S_lo  =  S_hi *  poly1
2293//
2294(p12) fma.s1 S_lo = P, r, S_lo
2295(p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
2296}
2297
2298{ .mfi
2299      nop.m 999
2300(p14) fadd.s0 Result = S_hi, S_lo           // for tanl
2301      nop.i 999
2302}
2303{ .mfb
2304      nop.m 999
2305//
2306//    N odd:  S_lo  =  S_lo + r * P
2307//
2308(p15) fms.s0 Result = S_hi, mOne, S_lo      // for cotl
2309      br.ret.sptk b0 ;;      // Exit for 2^24 <= |x| < 2^63 and |s| < 2^-14
2310}
2311
2312
2313TANL_SMALL_R:
2314// Here if |r| < 1/4
2315// r and c have been computed.
2316// *****************************************************************
2317// *****************************************************************
2318// *****************************************************************
2319//    N odd:  S_hi = frcpa(r)
2320//    Get [i_1] - lsb of N_fix_gr.  Set p11 if N even, p12 if N odd.
2321//    N even: rsq = r * r
2322{ .mfi
2323      add table_ptr1 = 160, table_base    // Point to tanl_table_p1
2324      frcpa.s1 S_hi, p0 = f1, r           // S_hi for N odd
2325      add N_fix_gr = N_fix_gr, cot_flag   // N = N + 1 (for cotl)
2326}
2327{ .mfi
2328      add table_ptr2 = 400, table_base    // Point to Q1_7
2329      fmpy.s1 rsq = r, r
2330      nop.i 999
2331}
2332;;
2333
2334{ .mmi
2335      ldfe P1_1 = [table_ptr1], 16
2336;;
2337      ldfe P1_2 = [table_ptr1], 16
2338      tbit.z p11, p12 = N_fix_gr, 0
2339}
2340;;
2341
2342
2343{ .mfi
2344      ldfe P1_3 = [table_ptr1], 96
2345      nop.f 999
2346      nop.i 999
2347}
2348;;
2349
2350{ .mfi
2351(p11) ldfe P1_9 = [table_ptr1], -16
2352(p12) fmerge.ns S_hi = S_hi, S_hi
2353      nop.i 999
2354}
2355{ .mfi
2356      nop.m 999
2357(p11) fmpy.s1 r_to_the_8 = rsq, rsq
2358      nop.i 999
2359}
2360;;
2361
2362//
2363//    N even: Poly2 = P1_7 + Poly2 * rsq
2364//    N odd:  poly2 = Q1_5 + poly2 * rsq
2365//
2366{ .mfi
2367(p11) ldfe P1_8 = [table_ptr1], -16
2368(p11) fadd.s1 CORR = rsq, f1
2369      nop.i 999
2370}
2371;;
2372
2373//
2374//    N even: Poly1 = P1_2 + P1_3 * rsq
2375//    N odd:  poly1 =  1.0 +  S_hi * r
2376//    16 bits partial  account for necessary (-1)
2377//
2378{ .mmi
2379(p11) ldfe P1_7 = [table_ptr1], -16
2380;;
2381(p11) ldfe P1_6 = [table_ptr1], -16
2382      nop.i 999
2383}
2384;;
2385
2386//
2387//    N even: Poly1 = P1_1 + Poly1 * rsq
2388//    N odd:  S_hi  =  S_hi + S_hi * poly1)     16 bits account for necessary
2389//
2390//
2391//    N even: Poly2 = P1_5 + Poly2 * rsq
2392//    N odd:  poly2 = Q1_3 + poly2 * rsq
2393//
2394{ .mfi
2395(p11) ldfe P1_5 = [table_ptr1], -16
2396(p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8
2397      nop.i 999
2398}
2399{ .mfi
2400      nop.m 999
2401(p12) fma.s1 poly1 =  S_hi, r, f1
2402      nop.i 999
2403}
2404;;
2405
2406//
2407//    N even: Poly1 =  Poly1 * rsq
2408//    N odd:  poly1  = 1.0 + S_hi * r         32 bits partial
2409//
2410
2411//
2412//    N even: CORR =  CORR * c
2413//    N odd:  S_hi  =  S_hi * poly1 + S_hi    32 bits
2414//
2415
2416//
2417//    N even: Poly2 = P1_6 + Poly2 * rsq
2418//    N odd:  poly2 = Q1_4 + poly2 * rsq
2419//
2420
2421{ .mmf
2422(p11) ldfe P1_4 = [table_ptr1], -16
2423      nop.m 999
2424(p11) fmpy.s1 CORR =  CORR, c
2425}
2426;;
2427
2428{ .mfi
2429      nop.m 999
2430(p11) fma.s1 Poly1 = P1_3, rsq, P1_2
2431      nop.i 999 ;;
2432}
2433{ .mfi
2434(p12) ldfe Q1_7 = [table_ptr2], -16
2435(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2436      nop.i 999 ;;
2437}
2438{ .mfi
2439(p12) ldfe Q1_6 = [table_ptr2], -16
2440(p11) fma.s1 Poly2 = P1_9, rsq, P1_8
2441      nop.i 999 ;;
2442}
2443{ .mmi
2444(p12) ldfe Q1_5 = [table_ptr2], -16 ;;
2445(p12) ldfe Q1_4 = [table_ptr2], -16
2446      nop.i 999 ;;
2447}
2448{ .mfi
2449(p12) ldfe Q1_3 = [table_ptr2], -16
2450//
2451//    N even: Poly2 = P1_8 + P1_9 * rsq
2452//    N odd:  poly2 = Q1_6 + Q1_7 * rsq
2453//
2454(p11) fma.s1 Poly1 = Poly1, rsq, P1_1
2455      nop.i 999 ;;
2456}
2457{ .mfi
2458(p12) ldfe Q1_2 = [table_ptr2], -16
2459(p12) fma.s1 poly1 = S_hi, r, f1
2460      nop.i 999 ;;
2461}
2462{ .mfi
2463(p12) ldfe Q1_1 = [table_ptr2], -16
2464(p11) fma.s1 Poly2 = Poly2, rsq, P1_7
2465      nop.i 999 ;;
2466}
2467{ .mfi
2468      nop.m 999
2469//
2470//    N even: CORR =  rsq + 1
2471//    N even: r_to_the_8 =  rsq * rsq
2472//
2473(p11) fmpy.s1 Poly1 = Poly1, rsq
2474      nop.i 999 ;;
2475}
2476{ .mfi
2477      nop.m 999
2478(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2479      nop.i 999
2480}
2481{ .mfi
2482      nop.m 999
2483(p12) fma.s1 poly2 = Q1_7, rsq, Q1_6
2484      nop.i 999 ;;
2485}
2486{ .mfi
2487      nop.m 999
2488(p11) fma.s1 Poly2 = Poly2, rsq, P1_6
2489      nop.i 999 ;;
2490}
2491{ .mfi
2492      nop.m 999
2493(p12) fma.s1 poly1 = S_hi, r, f1
2494      nop.i 999
2495}
2496{ .mfi
2497      nop.m 999
2498(p12) fma.s1 poly2 = poly2, rsq, Q1_5
2499      nop.i 999 ;;
2500}
2501{ .mfi
2502      nop.m 999
2503(p11) fma.s1 Poly2= Poly2, rsq, P1_5
2504      nop.i 999 ;;
2505}
2506{ .mfi
2507      nop.m 999
2508(p12) fma.s1 S_hi =  S_hi, poly1, S_hi
2509      nop.i 999
2510}
2511{ .mfi
2512      nop.m 999
2513(p12) fma.s1 poly2 = poly2, rsq, Q1_4
2514      nop.i 999 ;;
2515}
2516{ .mfi
2517      nop.m 999
2518//
2519//    N even: r_to_the_8 = r_to_the_8 * r_to_the_8
2520//    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2521//
2522(p11) fma.s1 Poly2 = Poly2, rsq, P1_4
2523      nop.i 999 ;;
2524}
2525{ .mfi
2526      nop.m 999
2527//
2528//    N even: Poly = CORR + Poly * r
2529//    N odd:  P = Q1_1 + poly2 * rsq
2530//
2531(p12) fma.s1 poly1 = S_hi, r, f1
2532      nop.i 999
2533}
2534{ .mfi
2535      nop.m 999
2536(p12) fma.s1 poly2 = poly2, rsq, Q1_3
2537      nop.i 999 ;;
2538}
2539{ .mfi
2540      nop.m 999
2541//
2542//    N even: Poly2 = P1_4 + Poly2 * rsq
2543//    N odd:  poly2 = Q1_2 + poly2 * rsq
2544//
2545(p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1
2546      nop.i 999 ;;
2547}
2548{ .mfi
2549      nop.m 999
2550(p12) fma.s1 poly1 = S_hi, c, poly1
2551      nop.i 999
2552}
2553{ .mfi
2554      nop.m 999
2555(p12) fma.s1 poly2 = poly2, rsq, Q1_2
2556      nop.i 999 ;;
2557}
2558
2559{ .mfi
2560      nop.m 999
2561//
2562//    N even: Poly = Poly1 + Poly2 * r_to_the_8
2563//    N odd:  S_hi =  S_hi * poly1 + S_hi    64 bits
2564//
2565(p11) fma.s1 Poly = Poly, r, CORR
2566      nop.i 999 ;;
2567}
2568{ .mfi
2569      nop.m 999
2570//
2571//    N even: Result =  r + Poly  (User supplied rounding mode)
2572//    N odd:  poly1  =  S_hi * c + poly1
2573//
2574(p12) fmpy.s1 S_lo = S_hi, poly1
2575(p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
2576}
2577{ .mfi
2578      nop.m 999
2579(p12) fma.s1 P = poly2, rsq, Q1_1
2580      nop.i 999 ;;
2581}
2582{ .mfi
2583      nop.m 999
2584//
2585//    N odd:  poly1  =  S_hi * r + 1.0
2586//
2587//
2588//    N odd:  S_lo  =  S_hi *  poly1
2589//
2590(p14) fadd.s0 Result = Poly, r          // for tanl
2591      nop.i 999
2592}
2593{ .mfi
2594      nop.m 999
2595(p15) fms.s0 Result = Poly, mOne, r     // for cotl
2596      nop.i 999 ;;
2597}
2598
2599{ .mfi
2600      nop.m 999
2601//
2602//    N odd:  S_lo  =  Q1_1 * c + S_lo
2603//
2604(p12) fma.s1 S_lo = Q1_1, c, S_lo
2605      nop.i 999
2606}
2607{ .mfi
2608      nop.m 999
2609      fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2610      nop.i 999 ;;
2611}
2612{ .mfi
2613      nop.m 999
2614//
2615//    N odd:  Result =  S_lo + r * P
2616//
2617(p12) fma.s1 Result = P, r, S_lo
2618(p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
2619}
2620
2621//
2622//    N odd:  Result = Result + S_hi  (user supplied rounding mode)
2623//
2624{ .mfi
2625      nop.m 999
2626(p14) fadd.s0 Result = Result, S_hi         // for tanl
2627      nop.i 999
2628}
2629{ .mfb
2630      nop.m 999
2631(p15) fms.s0 Result = Result, mOne, S_hi    // for cotl
2632      br.ret.sptk b0 ;;              // Exit |r| < 1/4 path
2633}
2634
2635
2636TANL_NORMAL_R:
2637// Here if 1/4 <= |x| < pi/4  or  if |x| >= 2^63 and |r| >= 1/4
2638// *******************************************************************
2639// *******************************************************************
2640// *******************************************************************
2641//
2642//    r and c have been computed.
2643//
2644{ .mfi
2645      nop.m 999
2646      fand B = B_mask1, r
2647      nop.i 999
2648}
2649;;
2650
2651TANL_NORMAL_R_A:
2652// Enter here if pi/4 <= |x| < 2^63 and |r| >= 1/4
2653//    Get the 5 bits or r for the lookup.   1.xxxxx ....
2654{ .mmi
2655      add table_ptr1 = 416, table_base     // Point to tanl_table_p2
2656      mov GR_exp_2tom65 = 0xffff - 65      // Scaling constant for B
2657      extr.u lookup = sig_r, 58, 5
2658}
2659;;
2660
2661{ .mmi
2662      ldfe P2_1 = [table_ptr1], 16
2663      setf.exp TWO_TO_NEG65 = GR_exp_2tom65  // 2^-65 for scaling B if exp_r=-2
2664      add N_fix_gr = N_fix_gr, cot_flag      // N = N + 1 (for cotl)
2665}
2666;;
2667
2668.pred.rel "mutex",p11,p12
2669//    B =  2^63 * 1.xxxxx 100...0
2670{ .mfi
2671      ldfe P2_2 = [table_ptr1], 16
2672      for B = B_mask2, B
2673      mov table_offset = 512               // Assume table offset is 512
2674}
2675;;
2676
2677{ .mfi
2678      ldfe P2_3 = [table_ptr1], 16
2679      fmerge.s  Pos_r = f1, r
2680      tbit.nz p8,p9 = exp_r, 0
2681}
2682;;
2683
2684//    Is  B = 2** -2 or  B= 2** -1? If 2**-1, then
2685//    we want an offset of 512 for table addressing.
2686{ .mii
2687      add table_ptr2 = 1296, table_base     // Point to tanl_table_cm2
2688(p9)  shladd table_offset = lookup, 4, table_offset
2689(p8)  shladd table_offset = lookup, 4, r0
2690}
2691;;
2692
2693{ .mmi
2694      add table_ptr1 = table_ptr1, table_offset  // Point to T_hi
2695      add table_ptr2 = table_ptr2, table_offset  // Point to C_hi
2696      add table_ptr3 = 2128, table_base     // Point to tanl_table_scim2
2697}
2698;;
2699
2700{ .mmi
2701      ldfd T_hi = [table_ptr1], 8                // Load T_hi
2702;;
2703      ldfd C_hi = [table_ptr2], 8                // Load C_hi
2704      add table_ptr3 = table_ptr3, table_offset  // Point to SC_inv
2705}
2706;;
2707
2708//
2709//    x = |r| - B
2710//
2711//   Convert B so it has the same exponent as Pos_r before subtracting
2712{ .mfi
2713      ldfs T_lo = [table_ptr1]                   // Load T_lo
2714(p9)  fnma.s1 x = B, FR_2tom64, Pos_r
2715      nop.i 999
2716}
2717{ .mfi
2718      nop.m 999
2719(p8)  fnma.s1 x = B, TWO_TO_NEG65, Pos_r
2720      nop.i 999
2721}
2722;;
2723
2724{ .mfi
2725      ldfs C_lo = [table_ptr2]                   // Load C_lo
2726      nop.f 999
2727      nop.i 999
2728}
2729;;
2730
2731{ .mfi
2732      ldfe SC_inv = [table_ptr3]                 // Load SC_inv
2733      fmerge.s  sgn_r = r, f1
2734      tbit.z p11, p12 = N_fix_gr, 0              // p11 if N even, p12 if odd
2735
2736}
2737;;
2738
2739//
2740//    xsq = x * x
2741//    N even: Tx = T_hi * x
2742//
2743//    N even: Tx1 = Tx + 1
2744//    N odd:  Cx1 = 1 - Cx
2745//
2746
2747{ .mfi
2748      nop.m 999
2749      fmpy.s1 xsq = x, x
2750      nop.i 999
2751}
2752{ .mfi
2753      nop.m 999
2754(p11) fmpy.s1 Tx = T_hi, x
2755      nop.i 999
2756}
2757;;
2758
2759//
2760//    N odd: Cx = C_hi * x
2761//
2762{ .mfi
2763      nop.m 999
2764(p12) fmpy.s1 Cx = C_hi, x
2765      nop.i 999
2766}
2767;;
2768//
2769//    N even and odd: P = P2_3 + P2_2 * xsq
2770//
2771{ .mfi
2772      nop.m 999
2773      fma.s1 P = P2_3, xsq, P2_2
2774      nop.i 999
2775}
2776{ .mfi
2777      nop.m 999
2778(p11) fadd.s1 Tx1 = Tx, f1
2779      nop.i 999 ;;
2780}
2781{ .mfi
2782      nop.m 999
2783//
2784//    N even: D = C_hi - tanx
2785//    N odd: D = T_hi + tanx
2786//
2787(p11) fmpy.s1 CORR = SC_inv, T_hi
2788      nop.i 999
2789}
2790{ .mfi
2791      nop.m 999
2792      fmpy.s1 Sx = SC_inv, x
2793      nop.i 999 ;;
2794}
2795{ .mfi
2796      nop.m 999
2797(p12) fmpy.s1 CORR = SC_inv, C_hi
2798      nop.i 999 ;;
2799}
2800{ .mfi
2801      nop.m 999
2802(p12) fsub.s1 V_hi = f1, Cx
2803      nop.i 999 ;;
2804}
2805{ .mfi
2806      nop.m 999
2807      fma.s1 P = P, xsq, P2_1
2808      nop.i 999
2809}
2810{ .mfi
2811      nop.m 999
2812//
2813//    N even and odd: P = P2_1 + P * xsq
2814//
2815(p11) fma.s1 V_hi = Tx, Tx1, f1
2816      nop.i 999 ;;
2817}
2818{ .mfi
2819      nop.m 999
2820//
2821//    N even: Result  = sgn_r * tail + T_hi (user rounding mode for C1)
2822//    N odd:  Result  = sgn_r * tail + C_hi (user rounding mode for C1)
2823//
2824      fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2825      nop.i 999 ;;
2826}
2827{ .mfi
2828      nop.m 999
2829      fmpy.s1 CORR = CORR, c
2830      nop.i 999 ;;
2831}
2832{ .mfi
2833      nop.m 999
2834(p12) fnma.s1 V_hi = Cx,V_hi,f1
2835      nop.i 999 ;;
2836}
2837{ .mfi
2838      nop.m 999
2839//
2840//    N even: V_hi = Tx * Tx1 + 1
2841//    N odd: Cx1 = 1 - Cx * Cx1
2842//
2843      fmpy.s1 P = P, xsq
2844      nop.i 999
2845}
2846{ .mfi
2847      nop.m 999
2848//
2849//    N even and odd: P = P * xsq
2850//
2851(p11) fmpy.s1 V_hi = V_hi, T_hi
2852      nop.i 999 ;;
2853}
2854{ .mfi
2855      nop.m 999
2856//
2857//    N even and odd: tail = P * tail + V_lo
2858//
2859(p11) fmpy.s1 T_hi = sgn_r, T_hi
2860      nop.i 999 ;;
2861}
2862{ .mfi
2863      nop.m 999
2864      fmpy.s1 CORR = CORR, sgn_r
2865      nop.i 999 ;;
2866}
2867{ .mfi
2868      nop.m 999
2869(p12) fmpy.s1 V_hi = V_hi,C_hi
2870      nop.i 999 ;;
2871}
2872{ .mfi
2873      nop.m 999
2874//
2875//    N even: V_hi = T_hi * V_hi
2876//    N odd: V_hi  = C_hi * V_hi
2877//
2878      fma.s1 tanx = P, x, x
2879      nop.i 999
2880}
2881{ .mfi
2882      nop.m 999
2883(p12) fnmpy.s1 C_hi = sgn_r, C_hi
2884      nop.i 999 ;;
2885}
2886{ .mfi
2887      nop.m 999
2888//
2889//    N even: V_lo = 1 - V_hi + C_hi
2890//    N odd: V_lo = 1 - V_hi + T_hi
2891//
2892(p11) fadd.s1 CORR = CORR, T_lo
2893      nop.i 999
2894}
2895{ .mfi
2896      nop.m 999
2897(p12) fsub.s1 CORR = CORR, C_lo
2898      nop.i 999 ;;
2899}
2900{ .mfi
2901      nop.m 999
2902//
2903//    N even and odd: tanx = x + x * P
2904//    N even and odd: Sx = SC_inv * x
2905//
2906(p11) fsub.s1 D = C_hi, tanx
2907      nop.i 999
2908}
2909{ .mfi
2910      nop.m 999
2911(p12) fadd.s1 D = T_hi, tanx
2912      nop.i 999 ;;
2913}
2914{ .mfi
2915      nop.m 999
2916//
2917//    N odd: CORR = SC_inv * C_hi
2918//    N even: CORR = SC_inv * T_hi
2919//
2920      fnma.s1 D = V_hi, D, f1
2921      nop.i 999 ;;
2922}
2923{ .mfi
2924      nop.m 999
2925//
2926//    N even and odd: D = 1 - V_hi * D
2927//    N even and odd: CORR = CORR * c
2928//
2929      fma.s1 V_hi = V_hi, D, V_hi
2930      nop.i 999 ;;
2931}
2932{ .mfi
2933      nop.m 999
2934//
2935//    N even and odd: V_hi = V_hi + V_hi * D
2936//    N even and odd: CORR = sgn_r * CORR
2937//
2938(p11) fnma.s1 V_lo = V_hi, C_hi, f1
2939      nop.i 999
2940}
2941{ .mfi
2942      nop.m 999
2943(p12) fnma.s1 V_lo = V_hi, T_hi, f1
2944      nop.i 999 ;;
2945}
2946{ .mfi
2947      nop.m 999
2948//
2949//    N even: CORR = COOR + T_lo
2950//    N odd: CORR = CORR - C_lo
2951//
2952(p11) fma.s1 V_lo = tanx, V_hi, V_lo
2953      tbit.nz p15, p0 = cot_flag, 0       // p15=1 if we compute cotl
2954}
2955{ .mfi
2956      nop.m 999
2957(p12) fnma.s1 V_lo = tanx, V_hi, V_lo
2958      nop.i 999 ;;
2959}
2960
2961{ .mfi
2962      nop.m 999
2963(p15) fms.s1 T_hi = f0, f0, T_hi        // to correct result's sign for cotl
2964      nop.i 999
2965}
2966{ .mfi
2967      nop.m 999
2968(p15) fms.s1 C_hi = f0, f0, C_hi        // to correct result's sign for cotl
2969      nop.i 999
2970};;
2971
2972{ .mfi
2973      nop.m 999
2974(p15) fms.s1 sgn_r = f0, f0, sgn_r      // to correct result's sign for cotl
2975      nop.i 999
2976};;
2977
2978{ .mfi
2979      nop.m 999
2980//
2981//    N even: V_lo = V_lo + V_hi * tanx
2982//    N odd: V_lo = V_lo - V_hi * tanx
2983//
2984(p11) fnma.s1 V_lo = C_lo, V_hi, V_lo
2985      nop.i 999
2986}
2987{ .mfi
2988      nop.m 999
2989(p12) fnma.s1 V_lo = T_lo, V_hi, V_lo
2990      nop.i 999 ;;
2991}
2992{ .mfi
2993      nop.m 999
2994//
2995//    N  even: V_lo = V_lo - V_hi * C_lo
2996//    N  odd: V_lo = V_lo - V_hi * T_lo
2997//
2998      fmpy.s1 V_lo = V_hi, V_lo
2999      nop.i 999 ;;
3000}
3001{ .mfi
3002      nop.m 999
3003//
3004//    N even and odd: V_lo = V_lo * V_hi
3005//
3006      fadd.s1 tail = V_hi, V_lo
3007      nop.i 999 ;;
3008}
3009{ .mfi
3010      nop.m 999
3011//
3012//    N even and odd: tail = V_hi + V_lo
3013//
3014      fma.s1 tail = tail, P, V_lo
3015      nop.i 999 ;;
3016}
3017{ .mfi
3018      nop.m 999
3019//
3020//    N even: T_hi = sgn_r * T_hi
3021//    N odd : C_hi = -sgn_r * C_hi
3022//
3023      fma.s1 tail = tail, Sx, CORR
3024      nop.i 999 ;;
3025}
3026{ .mfi
3027      nop.m 999
3028//
3029//    N even and odd: tail = Sx * tail + CORR
3030//
3031      fma.s1 tail = V_hi, Sx, tail
3032      nop.i 999 ;;
3033}
3034{ .mfi
3035      nop.m 999
3036//
3037//    N even an odd: tail = Sx * V_hi + tail
3038//
3039(p11) fma.s0 Result = sgn_r, tail, T_hi
3040      nop.i 999
3041}
3042{ .mfb
3043      nop.m 999
3044(p12) fma.s0 Result = sgn_r, tail, C_hi
3045      br.ret.sptk b0 ;;                 // Exit for 1/4 <= |r| < pi/4
3046}
3047
3048TANL_DENORMAL:
3049// Here if x denormal
3050{ .mfb
3051      getf.exp GR_signexp_x = Norm_Arg          // Get sign and exponent of x
3052      nop.f 999
3053      br.cond.sptk TANL_COMMON                  // Return to common code
3054}
3055;;
3056
3057
3058TANL_SPECIAL:
3059TANL_UNSUPPORTED:
3060//
3061//     Code for NaNs, Unsupporteds, Infs, or +/- zero ?
3062//     Invalid raised for Infs and SNaNs.
3063//
3064
3065{ .mfi
3066      nop.m 999
3067      fmerge.s  f10 = f8, f8            // Save input for error call
3068      tbit.nz p6, p7 = cot_flag, 0      // p6=1 if we compute cotl
3069}
3070;;
3071
3072{ .mfi
3073      nop.m 999
3074(p6)  fclass.m p6, p7 = f8, 0x7         // Test for zero (cotl only)
3075      nop.i 999
3076}
3077;;
3078
3079.pred.rel "mutex", p6, p7
3080{ .mfi
3081(p6)  mov GR_Parameter_Tag = 225        // (cotl)
3082(p6)  frcpa.s0  f8, p0 = f1, f8         // cotl(+-0) = +-Inf
3083      nop.i 999
3084}
3085{ .mfb
3086      nop.m 999
3087(p7)  fmpy.s0 f8 = f8, f0
3088(p7)  br.ret.sptk b0
3089}
3090;;
3091
3092GLOBAL_IEEE754_END(tanl)
3093libm_alias_ldouble_other (__tan, tan)
3094
3095
3096LOCAL_LIBM_ENTRY(__libm_error_region)
3097.prologue
3098
3099// (1)
3100{ .mfi
3101      add           GR_Parameter_Y=-32,sp        // Parameter 2 value
3102      nop.f         0
3103.save   ar.pfs,GR_SAVE_PFS
3104      mov           GR_SAVE_PFS=ar.pfs           // Save ar.pfs
3105}
3106{ .mfi
3107.fframe 64
3108      add sp=-64,sp                              // Create new stack
3109      nop.f 0
3110      mov GR_SAVE_GP=gp                          // Save gp
3111};;
3112
3113// (2)
3114{ .mmi
3115      stfe [GR_Parameter_Y] = f1,16              // STORE Parameter 2 on stack
3116      add GR_Parameter_X = 16,sp                 // Parameter 1 address
3117.save   b0, GR_SAVE_B0
3118      mov GR_SAVE_B0=b0                          // Save b0
3119};;
3120
3121.body
3122// (3)
3123{ .mib
3124      stfe [GR_Parameter_X] = f10                // STORE Parameter 1 on stack
3125      add   GR_Parameter_RESULT = 0,GR_Parameter_Y  // Parameter 3 address
3126      nop.b 0
3127}
3128{ .mib
3129      stfe [GR_Parameter_Y] = f8                 // STORE Parameter 3 on stack
3130      add   GR_Parameter_Y = -16,GR_Parameter_Y
3131      br.call.sptk b0=__libm_error_support#      // Call error handling function
3132};;
3133{ .mmi
3134      nop.m 0
3135      nop.m 0
3136      add   GR_Parameter_RESULT = 48,sp
3137};;
3138
3139// (4)
3140{ .mmi
3141      ldfe  f8 = [GR_Parameter_RESULT]           // Get return result off stack
3142.restore sp
3143      add   sp = 64,sp                           // Restore stack pointer
3144      mov   b0 = GR_SAVE_B0                      // Restore return address
3145};;
3146{ .mib
3147      mov   gp = GR_SAVE_GP                      // Restore gp
3148      mov   ar.pfs = GR_SAVE_PFS                 // Restore ar.pfs
3149      br.ret.sptk     b0                         // Return
3150};;
3151
3152LOCAL_LIBM_END(__libm_error_region)
3153
3154.type   __libm_error_support#,@function
3155.global __libm_error_support#
3156
3157
3158// *******************************************************************
3159// *******************************************************************
3160// *******************************************************************
3161//
3162//     Special Code to handle very large argument case.
3163//     Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63
3164//     The interface is custom:
3165//       On input:
3166//         (Arg or x) is in f8
3167//       On output:
3168//         r is in f8
3169//         c is in f9
3170//         N is in r8
3171//     We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127.  We
3172//     use this to eliminate save/restore of key fp registers in this calling
3173//     function.
3174//
3175// *******************************************************************
3176// *******************************************************************
3177// *******************************************************************
3178
3179LOCAL_LIBM_ENTRY(__libm_callout)
3180TANL_ARG_TOO_LARGE:
3181.prologue
3182{ .mfi
3183      add table_ptr2 = 144, table_base        // Point to 2^-2
3184      nop.f 999
3185.save   ar.pfs,GR_SAVE_PFS
3186      mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
3187}
3188;;
3189
3190//     Load 2^-2, -2^-2
3191{ .mmi
3192      ldfps  TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2]
3193      setf.sig B_mask1 = bmask1               // Form mask to get 5 msb of r
3194.save   b0, GR_SAVE_B0
3195      mov GR_SAVE_B0=b0                       // Save b0
3196};;
3197
3198.body
3199//
3200//     Call argument reduction with x in f8
3201//     Returns with N in r8, r in f8, c in f9
3202//     Assumes f71-127 are preserved across the call
3203//
3204{ .mib
3205      setf.sig B_mask2 = bmask2               // Form mask to form B from r
3206      mov GR_SAVE_GP=gp                       // Save gp
3207      br.call.sptk b0=__libm_pi_by_2_reduce#
3208}
3209;;
3210
3211//
3212//     Is |r| < 2**(-2)
3213//
3214{ .mfi
3215      getf.sig sig_r = r                     // Extract significand of r
3216      fcmp.lt.s1  p6, p0 = r, TWO_TO_NEG2
3217      mov   gp = GR_SAVE_GP                  // Restore gp
3218}
3219;;
3220
3221{ .mfi
3222      getf.exp exp_r = r                     // Extract signexp of r
3223      nop.f 999
3224      mov    b0 = GR_SAVE_B0                 // Restore return address
3225}
3226;;
3227
3228//
3229//     Get N_fix_gr
3230//
3231{ .mfi
3232      mov   N_fix_gr = r8
3233(p6)  fcmp.gt.unc.s1  p6, p0 = r, NEGTWO_TO_NEG2
3234      mov   ar.pfs = GR_SAVE_PFS             // Restore pfs
3235}
3236;;
3237
3238{ .mbb
3239      nop.m 999
3240(p6)  br.cond.spnt TANL_SMALL_R              // Branch if |r| < 1/4
3241      br.cond.sptk TANL_NORMAL_R             // Branch if 1/4 <= |r| < pi/4
3242}
3243;;
3244
3245LOCAL_LIBM_END(__libm_callout)
3246
3247.type __libm_pi_by_2_reduce#,@function
3248.global __libm_pi_by_2_reduce#
3249