1.file "libm_tan.s"
2
3// Copyright (C) 2000, 2001, Intel Corporation
4// All rights reserved.
5//
6//
7// Redistribution and use in source and binary forms, with or without
8// modification, are permitted provided that the following conditions are
9// met:
10//
11// * Redistributions of source code must retain the above copyright
12// notice, this list of conditions and the following disclaimer.
13//
14// * Redistributions in binary form must reproduce the above copyright
15// notice, this list of conditions and the following disclaimer in the
16// documentation and/or other materials provided with the distribution.
17//
18// * The name of Intel Corporation may not be used to endorse or promote
19// products derived from this software without specific prior written
20// permission.
21//
22// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
26// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
27// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
28// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
30// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
31// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33//
34// Intel Corporation is the author of this code, and requests that all
35// problem reports or change requests be submitted to it directly at
36// http://developer.intel.com/opensource.
37//
38// *********************************************************************
39//
40// History:
41// 02/02/00 Initial Version
42// 4/04/00  Unwind support added
43// 12/28/00 Fixed false invalid flags
44//
45// *********************************************************************
46//
47// Function:   tan(x) = tangent(x), for double precision x values
48//
49// *********************************************************************
50//
51// Accuracy:       Very accurate for double-precision values
52//
53// *********************************************************************
54//
55// Resources Used:
56//
57//    Floating-Point Registers: f8 (Input and Return Value)
58//                              f9-f15
59//                              f32-f112
60//
61//    General Purpose Registers:
62//      r32-r48
63//      r49-r50 (Used to pass arguments to pi_by_2 reduce routine)
64//
65//    Predicate Registers:      p6-p15
66//
67// *********************************************************************
68//
69// IEEE Special Conditions:
70//
71//    Denormal  fault raised on denormal inputs
72//    Overflow exceptions do not occur
73//    Underflow exceptions raised when appropriate for tan
74//    (No specialized error handling for this routine)
75//    Inexact raised when appropriate by algorithm
76//
77//    tan(SNaN) = QNaN
78//    tan(QNaN) = QNaN
79//    tan(inf) = QNaN
80//    tan(+/-0) = +/-0
81//
82// *********************************************************************
83//
84// Mathematical Description
85//
86// We consider the computation of FPTAN of Arg. Now, given
87//
88//      Arg = N pi/2  + alpha,          |alpha| <= pi/4,
89//
90// basic mathematical relationship shows that
91//
92//      tan( Arg ) =  tan( alpha )     if N is even;
93//                 = -cot( alpha )      otherwise.
94//
95// The value of alpha is obtained by argument reduction and
96// represented by two working precision numbers r and c where
97//
98//      alpha =  r  +  c     accurately.
99//
100// The reduction method is described in a previous write up.
101// The argument reduction scheme identifies 4 cases. For Cases 2
102// and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be
103// computed very easily by 2 or 3 terms of the Taylor series
104// expansion as follows:
105//
106// Case 2:
107// -------
108//
109//      tan(r + c) = r + c + r^3/3          ...accurately
110//        -cot(r + c) = -1/(r+c) + r/3          ...accurately
111//
112// Case 4:
113// -------
114//
115//      tan(r + c) = r + c + r^3/3 + 2r^5/15     ...accurately
116//        -cot(r + c) = -1/(r+c) + r/3 + r^3/45     ...accurately
117//
118//
119// The only cases left are Cases 1 and 3 of the argument reduction
120// procedure. These two cases will be merged since after the
121// argument is reduced in either cases, we have the reduced argument
122// represented as r + c and that the magnitude |r + c| is not small
123// enough to allow the usage of a very short approximation.
124//
125// The greatest challenge of this task is that the second terms of
126// the Taylor series for tan(r) and -cot(r)
127//
128//      r + r^3/3 + 2 r^5/15 + ...
129//
130// and
131//
132//      -1/r + r/3 + r^3/45 + ...
133//
134// are not very small when |r| is close to pi/4 and the rounding
135// errors will be a concern if simple polynomial accumulation is
136// used. When |r| < 2^(-2), however, the second terms will be small
137// enough (5 bits or so of right shift) that a normal Horner
138// recurrence suffices. Hence there are two cases that we consider
139// in the accurate computation of tan(r) and cot(r), |r| <= pi/4.
140//
141// Case small_r: |r| < 2^(-2)
142// --------------------------
143//
144// Since Arg = N pi/4 + r + c accurately, we have
145//
146//      tan(Arg) =  tan(r+c)            for N even,
147//            = -cot(r+c)          otherwise.
148//
149// Here for this case, both tan(r) and -cot(r) can be approximated
150// by simple polynomials:
151//
152//      tan(r) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
153//        -cot(r) = -1/r + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
154//
155// accurately. Since |r| is relatively small, tan(r+c) and
156// -cot(r+c) can be accurately approximated by replacing r with
157// r+c only in the first two terms of the corresponding polynomials.
158//
159// Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to
160// almost 64 sig. bits, thus
161//
162//      P1_1 (r+c)^3 =  P1_1 r^3 + c * r^2     accurately.
163//
164// Hence,
165//
166//      tan(r+c) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
167//                     + c*(1 + r^2)
168//
169//        -cot(r+c) = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
170//               + Q1_1*c
171//
172//
173// Case normal_r: 2^(-2) <= |r| <= pi/4
174// ------------------------------------
175//
176// This case is more likely than the previous one if one considers
177// r to be uniformly distributed in [-pi/4 pi/4].
178//
179// The required calculation is either
180//
181//      tan(r + c)  =  tan(r)  +  correction,  or
182//        -cot(r + c)  = -cot(r)  +  correction.
183//
184// Specifically,
185//
186//      tan(r + c) =  tan(r) + c tan'(r)  + O(c^2)
187//              =  tan(r) + c sec^2(r) + O(c^2)
188//              =  tan(r) + c SEC_sq     ...accurately
189//                as long as SEC_sq approximates sec^2(r)
190//                to, say, 5 bits or so.
191//
192// Similarly,
193//
194//        -cot(r + c) = -cot(r) - c cot'(r)  + O(c^2)
195//              = -cot(r) + c csc^2(r) + O(c^2)
196//              = -cot(r) + c CSC_sq     ...accurately
197//                as long as CSC_sq approximates csc^2(r)
198//                to, say, 5 bits or so.
199//
200// We therefore concentrate on accurately calculating tan(r) and
201// cot(r) for a working-precision number r, |r| <= pi/4 to within
202// 0.1% or so.
203//
204// We will employ a table-driven approach. Let
205//
206//      r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ... b_63
207//        = sgn_r * ( B + x )
208//
209// where
210//
211//      B = 2^k * 1.b_1 b_2 ... b_5 1
212//         x = |r| - B
213//
214// Now,
215//                   tan(B)  +   tan(x)
216//      tan( B + x ) =  ------------------------
217//                   1 -  tan(B)*tan(x)
218//
219//               /                         \
220//               |   tan(B)  +   tan(x)          |
221
222//      = tan(B) +  | ------------------------ - tan(B) |
223//               |     1 -  tan(B)*tan(x)          |
224//               \                         /
225//
226//                 sec^2(B) * tan(x)
227//      = tan(B) + ------------------------
228//                 1 -  tan(B)*tan(x)
229//
230//                (1/[sin(B)*cos(B)]) * tan(x)
231//      = tan(B) + --------------------------------
232//                      cot(B)  -  tan(x)
233//
234//
235// Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
236// calculated beforehand and stored in a table. Since
237//
238//      |x| <= 2^k * 2^(-6)  <= 2^(-7)  (because k = -1, -2)
239//
240// a very short polynomial will be sufficient to approximate tan(x)
241// accurately. The details involved in computing the last expression
242// will be given in the next section on algorithm description.
243//
244//
245// Now, we turn to the case where cot( B + x ) is needed.
246//
247//
248//                   1 - tan(B)*tan(x)
249//      cot( B + x ) =  ------------------------
250//                   tan(B)  +  tan(x)
251//
252//               /                           \
253//               |   1 - tan(B)*tan(x)              |
254
255//      = cot(B) +  | ----------------------- - cot(B) |
256//               |     tan(B)  +  tan(x)            |
257//               \                           /
258//
259//               [tan(B) + cot(B)] * tan(x)
260//      = cot(B) - ----------------------------
261//                   tan(B)  +  tan(x)
262//
263//                (1/[sin(B)*cos(B)]) * tan(x)
264//      = cot(B) - --------------------------------
265//                      tan(B)  +  tan(x)
266//
267//
268// Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that
269// are needed are the same set of values needed in the previous
270// case.
271//
272// Finally, we can put all the ingredients together as follows:
273//
274//      Arg = N * pi/2 +  r + c          ...accurately
275//
276//      tan(Arg) =  tan(r) + correction    if N is even;
277//            = -cot(r) + correction    otherwise.
278//
279// For Cases 2 and 4,
280//
281//     Case 2:
282//     tan(Arg) =  tan(r + c) = r + c + r^3/3           N even
283//              = -cot(r + c) = -1/(r+c) + r/3           N odd
284//     Case 4:
285//     tan(Arg) =  tan(r + c) = r + c + r^3/3 + 2r^5/15  N even
286//              = -cot(r + c) = -1/(r+c) + r/3 + r^3/45  N odd
287//
288//
289// For Cases 1 and 3,
290//
291//     Case small_r: |r| < 2^(-2)
292//
293//      tan(Arg) =  r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
294//                     + c*(1 + r^2)               N even
295//
296//                  = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
297//               + Q1_1*c                    N odd
298//
299//     Case normal_r: 2^(-2) <= |r| <= pi/4
300//
301//      tan(Arg) =  tan(r) + c * sec^2(r)     N even
302//               = -cot(r) + c * csc^2(r)     otherwise
303//
304//     For N even,
305//
306//      tan(Arg) = tan(r) + c*sec^2(r)
307//               = tan( sgn_r * (B+x) ) + c * sec^2(|r|)
308//                  = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(|r|) )
309//                  = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(B) )
310//
311// since B approximates |r| to 2^(-6) in relative accuracy.
312//
313//                 /            (1/[sin(B)*cos(B)]) * tan(x)
314//    tan(Arg) = sgn_r * | tan(B) + --------------------------------
315//                 \                     cot(B)  -  tan(x)
316//                                        \
317//                       + CORR  |
318
319//                                     /
320// where
321//
322//    CORR = sgn_r*c*tan(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
323//
324// For N odd,
325//
326//      tan(Arg) = -cot(r) + c*csc^2(r)
327//               = -cot( sgn_r * (B+x) ) + c * csc^2(|r|)
328//                  = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(|r|) )
329//                  = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(B) )
330//
331// since B approximates |r| to 2^(-6) in relative accuracy.
332//
333//                 /            (1/[sin(B)*cos(B)]) * tan(x)
334//    tan(Arg) = sgn_r * | -cot(B) + --------------------------------
335//                 \                     tan(B)  +  tan(x)
336//                                        \
337//                       + CORR  |
338
339//                                     /
340// where
341//
342//    CORR = sgn_r*c*cot(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
343//
344//
345// The actual algorithm prescribes how all the mathematical formulas
346// are calculated.
347//
348//
349// 2. Algorithmic Description
350// ==========================
351//
352// 2.1 Computation for Cases 2 and 4.
353// ----------------------------------
354//
355// For Case 2, we use two-term polynomials.
356//
357//    For N even,
358//
359//    rsq := r * r
360//    Result := c + r * rsq * P1_1
361//    Result := r + Result          ...in user-defined rounding
362//
363//    For N odd,
364//    S_hi  := -frcpa(r)               ...8 bits
365//    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
366//    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
367//    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
368//    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
369//    ...S_hi + S_lo is -1/(r+c) to extra precision
370//    S_lo  := S_lo + Q1_1*r
371//
372//    Result := S_hi + S_lo     ...in user-defined rounding
373//
374// For Case 4, we use three-term polynomials
375//
376//    For N even,
377//
378//    rsq := r * r
379//    Result := c + r * rsq * (P1_1 + rsq * P1_2)
380//    Result := r + Result          ...in user-defined rounding
381//
382//    For N odd,
383//    S_hi  := -frcpa(r)               ...8 bits
384//    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
385//    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
386//    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
387//    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
388//    ...S_hi + S_lo is -1/(r+c) to extra precision
389//    rsq   := r * r
390//    P      := Q1_1 + rsq*Q1_2
391//    S_lo  := S_lo + r*P
392//
393//    Result := S_hi + S_lo     ...in user-defined rounding
394//
395//
396// Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are
397// the same as those used in the small_r case of Cases 1 and 3
398// below.
399//
400//
401// 2.2 Computation for Cases 1 and 3.
402// ----------------------------------
403// This is further divided into the case of small_r,
404// where |r| < 2^(-2), and the case of normal_r, where |r| lies between
405// 2^(-2) and pi/4.
406//
407// Algorithm for the case of small_r
408// ---------------------------------
409//
410// For N even,
411//      rsq   := r * r
412//      Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3))
413//      r_to_the_8    := rsq * rsq
414//      r_to_the_8    := r_to_the_8 * r_to_the_8
415//      Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9))
416//      CORR  := c * ( 1 + rsq )
417//      Poly  := Poly1 + r_to_the_8*Poly2
418//      Result := r*Poly + CORR
419//      Result := r + Result     ...in user-defined rounding
420//      ...note that Poly1 and r_to_the_8 can be computed in parallel
421//      ...with Poly2 (Poly1 is intentionally set to be much
422//      ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden)
423//
424// For N odd,
425//      S_hi  := -frcpa(r)               ...8 bits
426//      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
427//      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
428//      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
429//      S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
430//      ...S_hi + S_lo is -1/(r+c) to extra precision
431//      S_lo  := S_lo + Q1_1*c
432//
433//      ...S_hi and S_lo are computed in parallel with
434//      ...the following
435//      rsq := r*r
436//      P   := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7))
437//
438//      Result :=  r*P + S_lo
439//      Result :=  S_hi  +  Result      ...in user-defined rounding
440//
441//
442// Algorithm for the case of normal_r
443// ----------------------------------
444//
445// Here, we first consider the computation of tan( r + c ). As
446// presented in the previous section,
447//
448//      tan( r + c )  =  tan(r) + c * sec^2(r)
449//                 =  sgn_r * [ tan(B+x) + CORR ]
450//      CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)]
451//
452// because sec^2(r) = sec^(|r|), and B approximate |r| to 6.5 bits.
453//
454//      tan( r + c ) =
455//           /           (1/[sin(B)*cos(B)]) * tan(x)
456//      sgn_r * | tan(B) + --------------------------------  +
457//           \                     cot(B)  -  tan(x)
458//                                \
459//                          CORR  |
460
461//                                /
462//
463// The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
464// calculated beforehand and stored in a table. Specifically,
465// the table values are
466//
467//      tan(B)                as  T_hi  +  T_lo;
468//      cot(B)             as  C_hi  +  C_lo;
469//      1/[sin(B)*cos(B)]  as  SC_inv
470//
471// T_hi, C_hi are in  double-precision  memory format;
472// T_lo, C_lo are in  single-precision  memory format;
473// SC_inv     is  in extended-precision memory format.
474//
475// The value of tan(x) will be approximated by a short polynomial of
476// the form
477//
478//      tan(x)  as  x  +  x * P, where
479//           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
480//
481// Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x)
482// to a relative accuracy better than 2^(-20). Thus, a good
483// initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative
484// division is:
485//
486//      1/(cot(B) - tan(x))      is approximately
487//      1/(cot(B) -   x)         is
488//      tan(B)/(1 - x*tan(B))    is approximately
489//      T_hi / ( 1 - T_hi * x )  is approximately
490//
491//      T_hi * [ 1 + (Thi * x) + (T_hi * x)^2 ]
492//
493// The calculation of tan(r+c) therefore proceed as follows:
494//
495//      Tx     := T_hi * x
496//      xsq     := x * x
497//
498//      V_hi     := T_hi*(1 + Tx*(1 + Tx))
499//      P     := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
500//      ...V_hi serves as an initial guess of 1/(cot(B) - tan(x))
501//         ...good to about 20 bits of accuracy
502//
503//      tanx     := x + x*P
504//      D     := C_hi - tanx
505//      ...D is a double precision denominator: cot(B) - tan(x)
506//
507//      V_hi     := V_hi + V_hi*(1 - V_hi*D)
508//      ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits
509//
510//      V_lo     := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ]
511//                           - V_hi*C_lo )   ...observe all order
512//         ...V_hi + V_lo approximates 1/(cot(B) - tan(x))
513//      ...to extra accuracy
514//
515//      ...               SC_inv(B) * (x + x*P)
516//      ...   tan(B) +      ------------------------- + CORR
517//         ...                cot(B) - (x + x*P)
518//      ...
519//      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
520//      ...
521//
522//      Sx     := SC_inv * x
523//      CORR     := sgn_r * c * SC_inv * T_hi
524//
525//      ...put the ingredients together to compute
526//      ...               SC_inv(B) * (x + x*P)
527//      ...   tan(B) +      ------------------------- + CORR
528//         ...                cot(B) - (x + x*P)
529//      ...
530//      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
531//      ...
532//      ... = T_hi + T_lo + CORR +
533//      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
534//
535//      CORR := CORR + T_lo
536//      tail := V_lo + P*(V_hi + V_lo)
537//         tail := Sx * tail  +  CORR
538//      tail := Sx * V_hi  +  tail
539//         T_hi := sgn_r * T_hi
540//
541//         ...T_hi + sgn_r*tail  now approximate
542//      ...sgn_r*(tan(B+x) + CORR) accurately
543//
544//      Result :=  T_hi + sgn_r*tail  ...in user-defined
545//                           ...rounding control
546//      ...It is crucial that independent paths be fully
547//      ...exploited for performance's sake.
548//
549//
550// Next, we consider the computation of -cot( r + c ). As
551// presented in the previous section,
552//
553//        -cot( r + c )  =  -cot(r) + c * csc^2(r)
554//                 =  sgn_r * [ -cot(B+x) + CORR ]
555//      CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)]
556//
557// because csc^2(r) = csc^(|r|), and B approximate |r| to 6.5 bits.
558//
559//        -cot( r + c ) =
560//           /             (1/[sin(B)*cos(B)]) * tan(x)
561//      sgn_r * | -cot(B) + --------------------------------  +
562//           \                     tan(B)  +  tan(x)
563//                                \
564//                          CORR  |
565
566//                                /
567//
568// The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
569// calculated beforehand and stored in a table. Specifically,
570// the table values are
571//
572//      tan(B)                as  T_hi  +  T_lo;
573//      cot(B)             as  C_hi  +  C_lo;
574//      1/[sin(B)*cos(B)]  as  SC_inv
575//
576// T_hi, C_hi are in  double-precision  memory format;
577// T_lo, C_lo are in  single-precision  memory format;
578// SC_inv     is  in extended-precision memory format.
579//
580// The value of tan(x) will be approximated by a short polynomial of
581// the form
582//
583//      tan(x)  as  x  +  x * P, where
584//           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
585//
586// Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x)
587// to a relative accuracy better than 2^(-18). Thus, a good
588// initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative
589// division is:
590//
591//      1/(tan(B) + tan(x))      is approximately
592//      1/(tan(B) +   x)         is
593//      cot(B)/(1 + x*cot(B))    is approximately
594//      C_hi / ( 1 + C_hi * x )  is approximately
595//
596//      C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ]
597//
598// The calculation of -cot(r+c) therefore proceed as follows:
599//
600//      Cx     := C_hi * x
601//      xsq     := x * x
602//
603//      V_hi     := C_hi*(1 - Cx*(1 - Cx))
604//      P     := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
605//      ...V_hi serves as an initial guess of 1/(tan(B) + tan(x))
606//         ...good to about 18 bits of accuracy
607//
608//      tanx     := x + x*P
609//      D     := T_hi + tanx
610//      ...D is a double precision denominator: tan(B) + tan(x)
611//
612//      V_hi     := V_hi + V_hi*(1 - V_hi*D)
613//      ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits
614//
615//      V_lo     := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ]
616//                           - V_hi*T_lo )   ...observe all order
617//         ...V_hi + V_lo approximates 1/(tan(B) + tan(x))
618//      ...to extra accuracy
619//
620//      ...               SC_inv(B) * (x + x*P)
621//      ...  -cot(B) +      ------------------------- + CORR
622//         ...                tan(B) + (x + x*P)
623//      ...
624//      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
625//      ...
626//
627//      Sx     := SC_inv * x
628//      CORR     := sgn_r * c * SC_inv * C_hi
629//
630//      ...put the ingredients together to compute
631//      ...               SC_inv(B) * (x + x*P)
632//      ...  -cot(B) +      ------------------------- + CORR
633//         ...                tan(B) + (x + x*P)
634//      ...
635//      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
636//      ...
637//      ... =-C_hi - C_lo + CORR +
638//      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
639//
640//      CORR := CORR - C_lo
641//      tail := V_lo + P*(V_hi + V_lo)
642//         tail := Sx * tail  +  CORR
643//      tail := Sx * V_hi  +  tail
644//         C_hi := -sgn_r * C_hi
645//
646//         ...C_hi + sgn_r*tail now approximates
647//      ...sgn_r*(-cot(B+x) + CORR) accurately
648//
649//      Result :=  C_hi + sgn_r*tail   in user-defined rounding control
650//      ...It is crucial that independent paths be fully
651//      ...exploited for performance's sake.
652//
653// 3. Implementation Notes
654// =======================
655//
656//   Table entries T_hi, T_lo; C_hi, C_lo; SC_inv
657//
658//   Recall that 2^(-2) <= |r| <= pi/4;
659//
660//      r = sgn_r * 2^k * 1.b_1 b_2 ... b_63
661//
662//   and
663//
664//        B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1
665//
666//   Thus, for k = -2, possible values of B are
667//
668//          B = 2^(-2) * ( 1 + index/32  +  1/64 ),
669//      index ranges from 0 to 31
670//
671//   For k = -1, however, since |r| <= pi/4 = 0.78...
672//   possible values of B are
673//
674//        B = 2^(-1) * ( 1 + index/32  +  1/64 )
675//      index ranges from 0 to 19.
676//
677//
678
679#include "libm_support.h"
680
681#ifdef _LIBC
682.rodata
683#else
684.data
685#endif
686
687.align 128
688
689TAN_BASE_CONSTANTS:
690.type TAN_BASE_CONSTANTS, @object
691data4    0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24
692                                                        // two**-14, -two**-14
693data4    0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi
694data4    0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0
695data4    0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1
696data4    0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2
697data4    0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3
698data4    0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63
699data4    0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0
700data4    0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1
701data4    0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2
702data4    0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4
703data4    0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4
704data4    0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2
705data4    0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33
706data4    0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1
707data4    0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2
708data4    0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3
709data4    0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4
710data4    0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5
711data4    0x61C67914, 0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6
712data4    0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7
713data4    0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8
714data4    0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9
715data4    0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1
716data4    0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2
717data4    0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3
718data4    0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4
719data4    0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5
720data4    0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6
721data4    0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7
722data4    0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1
723data4    0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2
724data4    0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3
725//
726//  Entries T_hi   double-precision memory format
727//  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
728//  Entries T_lo  single-precision memory format
729//  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
730//
731data4    0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000
732data4    0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000
733data4    0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000
734data4    0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000
735data4    0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000
736data4    0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000
737data4    0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000
738data4    0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000
739data4    0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000
740data4    0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000
741data4    0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000
742data4    0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000
743data4    0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000
744data4    0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000
745data4    0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000
746data4    0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000
747data4    0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000
748data4    0xEB9DE4DE, 0x3FDA0FF6, 0x22859FA7, 0x00000000
749data4    0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000
750data4    0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000
751data4    0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000
752data4    0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000
753data4    0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000
754data4    0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000
755data4    0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000
756data4    0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000
757data4    0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000
758data4    0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000
759data4    0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000
760data4    0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000
761data4    0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000
762data4    0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000
763//
764//  Entries T_hi   double-precision memory format
765//  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
766//  Entries T_lo  single-precision memory format
767//  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
768//
769data4    0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000
770data4    0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000
771data4    0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000
772data4    0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000
773data4    0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000
774data4    0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000
775data4    0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000
776data4    0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000
777data4    0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000
778data4    0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000
779data4    0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000
780data4    0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000
781data4    0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000
782data4    0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000
783data4    0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000
784data4    0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000
785data4    0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000
786data4    0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000
787data4    0xF7E06E4B, 0x3FF01E20, 0x2480D3EE, 0x00000000
788data4    0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000
789//
790//  Entries C_hi   double-precision memory format
791//  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
792//  Entries C_lo  single-precision memory format
793//  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
794//
795data4    0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000
796data4    0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000
797data4    0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000
798data4    0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000
799data4    0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000
800data4    0x5285B068, 0x400A855A, 0x25524F18, 0x00000000
801data4    0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000
802data4    0x646AF623, 0x4009188A, 0x254FD801, 0x00000000
803data4    0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000
804data4    0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000
805data4    0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000
806data4    0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000
807data4    0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000
808data4    0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000
809data4    0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000
810data4    0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000
811data4    0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000
812data4    0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000
813data4    0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000
814data4    0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000
815data4    0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000
816data4    0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000
817data4    0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000
818data4    0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000
819data4    0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000
820data4    0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000
821data4    0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000
822data4    0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000
823data4    0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000
824data4    0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000
825data4    0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000
826data4    0x54F7E60A, 0x3FFD8FCA, 0x248D3058, 0x00000000
827//
828//  Entries C_hi   double-precision memory format
829//  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
830//  Entries C_lo  single-precision memory format
831//  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
832//
833data4    0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000
834data4    0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000
835data4    0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000
836data4    0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000
837data4    0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000
838data4    0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000
839data4    0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000
840data4    0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000
841data4    0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000
842data4    0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000
843data4    0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000
844data4    0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000
845data4    0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000
846data4    0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000
847data4    0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000
848data4    0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000
849data4    0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000
850data4    0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000
851data4    0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000
852data4    0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000
853//
854//  Entries SC_inv in Swapped IEEE format (extended)
855//  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
856//
857data4    0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000
858data4    0x554B0EB0, 0x80092804, 0x00004001, 0x00000000
859data4    0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000
860data4    0x77378677, 0xF3086BA0, 0x00004000, 0x00000000
861data4    0xCCD4723C, 0xED154515, 0x00004000, 0x00000000
862data4    0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000
863data4    0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000
864data4    0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000
865data4    0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000
866data4    0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000
867data4    0x283BEBD5, 0xCFAF16B1, 0x00004000, 0x00000000
868data4    0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000
869data4    0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000
870data4    0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000
871data4    0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000
872data4    0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000
873data4    0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000
874data4    0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000
875data4    0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000
876data4    0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000
877data4    0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000
878data4    0x7221235F, 0xAC862A23, 0x00004000, 0x00000000
879data4    0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000
880data4    0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000
881data4    0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000
882data4    0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000
883data4    0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000
884data4    0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000
885data4    0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000
886data4    0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000
887data4    0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000
888data4    0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000
889//
890//  Entries SC_inv in Swapped IEEE format (extended)
891//  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
892//
893data4    0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000
894data4    0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000
895data4    0x61B798AF, 0x9147094F, 0x00004000, 0x00000000
896data4    0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000
897data4    0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000
898data4    0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000
899data4    0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000
900data4    0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000
901data4    0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000
902data4    0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000
903data4    0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000
904data4    0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000
905data4    0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000
906data4    0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000
907data4    0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000
908data4    0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000
909data4    0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000
910data4    0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000
911data4    0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000
912data4    0xA07791FA, 0x80186650, 0x00004000, 0x00000000
913
914Arg                 = f8
915Result              = f8
916fp_tmp              = f9
917U_2                 = f10
918rsq                =  f11
919C_hi                = f12
920C_lo                = f13
921T_hi                = f14
922T_lo                = f15
923
924N_0                 = f32
925d_1                 = f33
926MPI_BY_4            = f34
927tail                = f35
928tanx                = f36
929Cx                  = f37
930Sx                  = f38
931sgn_r               = f39
932CORR                = f40
933P                   = f41
934D                   = f42
935ArgPrime            = f43
936P_0                 = f44
937
938P2_1                = f45
939P2_2                = f46
940P2_3                = f47
941
942P1_1                = f45
943P1_2                = f46
944P1_3                = f47
945
946P1_4                = f48
947P1_5                = f49
948P1_6                = f50
949P1_7                = f51
950P1_8                = f52
951P1_9                = f53
952
953TWO_TO_63           = f54
954NEGTWO_TO_63        = f55
955x                   = f56
956xsq                 = f57
957Tx                  = f58
958Tx1                 = f59
959Set                 = f60
960poly1               = f61
961poly2               = f62
962Poly                = f63
963Poly1               = f64
964Poly2               = f65
965r_to_the_8          = f66
966B                   = f67
967SC_inv              = f68
968Pos_r               = f69
969N_0_fix             = f70
970PI_BY_4             = f71
971NEGTWO_TO_NEG2      = f72
972TWO_TO_24           = f73
973TWO_TO_NEG14        = f74
974TWO_TO_NEG33        = f75
975NEGTWO_TO_24        = f76
976NEGTWO_TO_NEG14     = f76
977NEGTWO_TO_NEG33     = f77
978two_by_PI           = f78
979N                   = f79
980N_fix               = f80
981P_1                 = f81
982P_2                 = f82
983P_3                 = f83
984s_val               = f84
985w                   = f85
986c                   = f86
987r                   = f87
988Z                   = f88
989A                   = f89
990a                   = f90
991t                   = f91
992U_1                 = f92
993d_2                 = f93
994TWO_TO_NEG2         = f94
995Q1_1                = f95
996Q1_2                = f96
997Q1_3                = f97
998Q1_4                = f98
999Q1_5                = f99
1000Q1_6                = f100
1001Q1_7                = f101
1002Q1_8                = f102
1003S_hi                = f103
1004S_lo                = f104
1005V_hi                = f105
1006V_lo                = f106
1007U_hi                = f107
1008U_lo                = f108
1009U_hiabs             = f109
1010V_hiabs             = f110
1011V                   = f111
1012Inv_P_0             = f112
1013
1014GR_SAVE_B0     = r33
1015GR_SAVE_GP     = r34
1016GR_SAVE_PFS    = r35
1017
1018delta1         = r36
1019table_ptr1     = r37
1020table_ptr2     = r38
1021i_0            = r39
1022i_1            = r40
1023N_fix_gr       = r41
1024N_inc          = r42
1025exp_Arg        = r43
1026exp_r          = r44
1027sig_r          = r45
1028lookup         = r46
1029table_offset   = r47
1030Create_B       = r48
1031gr_tmp         = r49
1032
1033GR_Parameter_X = r49
1034GR_Parameter_r = r50
1035
1036
1037
1038.global __libm_tan
1039.section .text
1040.proc __libm_tan
1041
1042
1043__libm_tan:
1044
1045{ .mfi
1046alloc r32 = ar.pfs, 0,17,2,0
1047(p0)   fclass.m.unc  p6,p0 = Arg, 0x1E7
1048      addl gr_tmp = -1,r0
1049}
1050;;
1051
1052{ .mfi
1053       nop.m 999
1054(p0)   fclass.nm.unc  p7,p0 = Arg, 0x1FF
1055       nop.i 999
1056}
1057;;
1058
1059{ .mfi
1060(p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
1061       nop.f 999
1062       nop.i 999
1063}
1064;;
1065
1066{ .mmi
1067      ld8 table_ptr1 = [table_ptr1]
1068      setf.sig fp_tmp = gr_tmp   // Make a constant so fmpy produces inexact
1069      nop.i 999
1070}
1071;;
1072
1073//
1074//     Check for NatVals, Infs , NaNs, and Zeros
1075//     Check for everything - if false, then must be pseudo-zero
1076//     or pseudo-nan.
1077//     Local table pointer
1078//
1079
1080{ .mbb
1081(p0)   add table_ptr2 = 96, table_ptr1
1082(p6)   br.cond.spnt __libm_TAN_SPECIAL
1083(p7)   br.cond.spnt __libm_TAN_SPECIAL ;;
1084}
1085//
1086//     Point to Inv_P_0
1087//     Branch out to deal with unsupporteds and special values.
1088//
1089
1090{ .mmf
1091(p0)   ldfs TWO_TO_24 = [table_ptr1],4
1092(p0)   ldfs TWO_TO_63 = [table_ptr2],4
1093//
1094//     Load -2**24, load -2**63.
1095//
1096(p0)   fcmp.eq.s0 p0, p6 = Arg, f1 ;;
1097}
1098
1099{ .mfi
1100(p0)   ldfs NEGTWO_TO_63 = [table_ptr2],12
1101(p0)   fnorm.s1     Arg = Arg
1102	nop.i 999
1103}
1104//
1105//     Load 2**24, Load 2**63.
1106//
1107
1108{ .mmi
1109(p0)   ldfs NEGTWO_TO_24 = [table_ptr1],12 ;;
1110//
1111//     Do fcmp to generate Denormal exception
1112//     - can't do FNORM (will generate Underflow when U is unmasked!)
1113//     Normalize input argument.
1114//
1115(p0)   ldfe two_by_PI = [table_ptr1],16
1116	nop.i 999
1117}
1118
1119{ .mmi
1120(p0)   ldfe Inv_P_0 = [table_ptr2],16 ;;
1121(p0)   ldfe d_1 = [table_ptr2],16
1122	nop.i 999
1123}
1124//
1125//     Decide about the paths to take:
1126//     PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2
1127//     OTHERWISE - CASE 3 OR 4
1128//     Load inverse of P_0 .
1129//     Set PR_6 if Arg <= -2**63
1130//     Are there any Infs, NaNs, or zeros?
1131//
1132
1133{ .mmi
1134(p0)   ldfe P_0 = [table_ptr1],16 ;;
1135(p0)   ldfe d_2 = [table_ptr2],16
1136	nop.i 999
1137}
1138//
1139//     Set PR_8 if Arg <= -2**24
1140//     Set PR_6 if Arg >=  2**63
1141//
1142
1143{ .mmi
1144(p0)   ldfe P_1 = [table_ptr1],16 ;;
1145(p0)   ldfe PI_BY_4 = [table_ptr2],16
1146	nop.i 999
1147}
1148//
1149//     Set PR_8 if Arg >= 2**24
1150//
1151
1152{ .mmi
1153(p0)   ldfe P_2 = [table_ptr1],16 ;;
1154(p0)   ldfe   MPI_BY_4 = [table_ptr2],16
1155	nop.i 999
1156}
1157//
1158//     Load  P_2 and PI_BY_4
1159//
1160
1161{ .mfi
1162(p0)   ldfe   P_3 = [table_ptr1],16
1163	nop.f 999
1164	nop.i 999 ;;
1165}
1166
1167{ .mfi
1168	nop.m 999
1169(p0)   fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63
1170	nop.i 999
1171}
1172
1173{ .mfi
1174	nop.m 999
1175(p0)   fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24
1176	nop.i 999 ;;
1177}
1178
1179{ .mfi
1180	nop.m 999
1181(p7)   fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63
1182	nop.i 999
1183}
1184
1185{ .mfi
1186	nop.m 999
1187(p9)   fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24
1188	nop.i 999 ;;
1189}
1190
1191{ .mib
1192	nop.m 999
1193	nop.i 999
1194//
1195//     Load  P_3 and -PI_BY_4
1196//
1197(p6)   br.cond.spnt TAN_ARG_TOO_LARGE ;;
1198}
1199
1200{ .mib
1201	nop.m 999
1202	nop.i 999
1203//
1204//     Load 2**(-2).
1205//     Load -2**(-2).
1206//     Branch out if we have a special argument.
1207//     Branch out if the magnitude of the input argument is too large
1208//     - do this branch before the next.
1209//
1210(p8)   br.cond.spnt TAN_LARGER_ARG ;;
1211}
1212//
1213//     Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
1214//
1215
1216{ .mfi
1217(p0)   ldfs TWO_TO_NEG2 = [table_ptr2],4
1218//     ARGUMENT REDUCTION CODE - CASE 1 and 2
1219//     Load 2**(-2).
1220//     Load -2**(-2).
1221(p0)   fmpy.s1 N = Arg,two_by_PI
1222	nop.i 999 ;;
1223}
1224
1225{ .mfi
1226(p0)   ldfs NEGTWO_TO_NEG2 = [table_ptr2],12
1227//
1228//     N = Arg * 2/pi
1229//
1230(p0)   fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4
1231	nop.i 999 ;;
1232}
1233
1234{ .mfi
1235	nop.m 999
1236//
1237//     if Arg < pi/4,  set PR_8.
1238//
1239(p8)   fcmp.gt.s1 p8,p9= Arg,MPI_BY_4
1240	nop.i 999 ;;
1241}
1242//
1243//     Case 1: Is |r| < 2**(-2).
1244//     Arg is the same as r in this case.
1245//     r = Arg
1246//     c = 0
1247//
1248
1249{ .mfi
1250(p8)   mov N_fix_gr = r0
1251//
1252//     if Arg > -pi/4, reset PR_8.
1253//     Select the case when |Arg| < pi/4 - set PR[8] = true.
1254//     Else Select the case when |Arg| >= pi/4 - set PR[9] = true.
1255//
1256(p0)   fcvt.fx.s1 N_fix = N
1257	nop.i 999 ;;
1258}
1259
1260{ .mfi
1261	nop.m 999
1262//
1263//     Grab the integer part of N .
1264//
1265(p8)   mov r = Arg
1266	nop.i 999
1267}
1268
1269{ .mfi
1270	nop.m 999
1271(p8)   mov c = f0
1272	nop.i 999 ;;
1273}
1274
1275{ .mfi
1276	nop.m 999
1277(p8)   fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2
1278	nop.i 999 ;;
1279}
1280
1281{ .mfi
1282	nop.m 999
1283(p10)  fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2
1284	nop.i 999 ;;
1285}
1286
1287{ .mfi
1288	nop.m 999
1289//
1290//     Case 2: Place integer part of N in GP register.
1291//
1292(p9)   fcvt.xf N = N_fix
1293	nop.i 999 ;;
1294}
1295
1296{ .mib
1297(p9)   getf.sig N_fix_gr = N_fix
1298	nop.i 999
1299//
1300//     Case 2: Convert integer N_fix back to normalized floating-point value.
1301//
1302(p10)  br.cond.spnt TAN_SMALL_R ;;
1303}
1304
1305{ .mib
1306	nop.m 999
1307	nop.i 999
1308(p8)   br.cond.sptk TAN_NORMAL_R ;;
1309}
1310//
1311//     Case 1: PR_3 is only affected  when PR_1 is set.
1312//
1313
1314{ .mmi
1315(p9)   ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;;
1316//
1317//     Case 2: Load 2**(-33).
1318//
1319(p9)   ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4
1320	nop.i 999 ;;
1321}
1322
1323{ .mfi
1324	nop.m 999
1325//
1326//     Case 2: Load -2**(-33).
1327//
1328(p9)   fnma.s1 s_val = N, P_1, Arg
1329	nop.i 999
1330}
1331
1332{ .mfi
1333	nop.m 999
1334(p9)   fmpy.s1 w = N, P_2
1335	nop.i 999 ;;
1336}
1337
1338{ .mfi
1339	nop.m 999
1340//
1341//     Case 2: w = N * P_2
1342//     Case 2: s_val = -N * P_1  + Arg
1343//
1344(p0)   fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33
1345	nop.i 999 ;;
1346}
1347
1348{ .mfi
1349	nop.m 999
1350//
1351//     Decide between case_1 and case_2 reduce:
1352//
1353(p9)   fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
1354	nop.i 999 ;;
1355}
1356
1357{ .mfi
1358	nop.m 999
1359//
1360//     Case 1_reduce:  s <= -2**(-33) or s >= 2**(-33)
1361//     Case 2_reduce: -2**(-33) < s < 2**(-33)
1362//
1363(p8)   fsub.s1 r = s_val, w
1364	nop.i 999
1365}
1366
1367{ .mfi
1368	nop.m 999
1369(p9)   fmpy.s1 w = N, P_3
1370	nop.i 999 ;;
1371}
1372
1373{ .mfi
1374	nop.m 999
1375(p9)   fma.s1  U_1 = N, P_2, w
1376	nop.i 999
1377}
1378
1379{ .mfi
1380	nop.m 999
1381//
1382//     Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
1383//     else set PR_11.
1384//
1385(p8)   fsub.s1 c = s_val, r
1386	nop.i 999 ;;
1387}
1388
1389{ .mfi
1390	nop.m 999
1391//
1392//     Case 1_reduce: r = s + w (change sign)
1393//     Case 2_reduce: w = N * P_3 (change sign)
1394//
1395(p8)   fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2
1396	nop.i 999 ;;
1397}
1398
1399{ .mfi
1400	nop.m 999
1401(p10)  fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2
1402	nop.i 999 ;;
1403}
1404
1405{ .mfi
1406	nop.m 999
1407(p9)   fsub.s1 r = s_val, U_1
1408	nop.i 999
1409}
1410
1411{ .mfi
1412	nop.m 999
1413//
1414//     Case 1_reduce: c is complete here.
1415//     c = c + w (w has not been negated.)
1416//     Case 2_reduce: r is complete here - continue to calculate c .
1417//     r = s - U_1
1418//
1419(p9)   fms.s1 U_2 = N, P_2, U_1
1420	nop.i 999 ;;
1421}
1422
1423{ .mfi
1424	nop.m 999
1425//
1426//     Case 1_reduce: c = s - r
1427//     Case 2_reduce: U_1 = N * P_2 + w
1428//
1429(p8)   fsub.s1 c = c, w
1430	nop.i 999 ;;
1431}
1432
1433{ .mfi
1434	nop.m 999
1435(p9)   fsub.s1 s_val = s_val, r
1436	nop.i 999
1437}
1438
1439{ .mfb
1440	nop.m 999
1441//
1442//     Case 2_reduce:
1443//     U_2 = N * P_2 - U_1
1444//     Not needed until later.
1445//
1446(p9)   fadd.s1 U_2 = U_2, w
1447//
1448//     Case 2_reduce:
1449//     s = s - r
1450//     U_2 = U_2 + w
1451//
1452(p10)  br.cond.spnt TAN_SMALL_R ;;
1453}
1454
1455{ .mib
1456	nop.m 999
1457	nop.i 999
1458(p11)  br.cond.sptk TAN_NORMAL_R ;;
1459}
1460
1461{ .mii
1462	nop.m 999
1463//
1464//     Case 2_reduce:
1465//     c = c - U_2
1466//     c is complete here
1467//     Argument reduction ends here.
1468//
1469(p9)   extr.u i_1 = N_fix_gr, 0, 1 ;;
1470(p9)   cmp.eq.unc p11, p12 = 0x0000,i_1 ;;
1471}
1472
1473{ .mfi
1474	nop.m 999
1475//
1476//     Is i_1  even or odd?
1477//     if i_1 == 0, set p11, else set p12.
1478//
1479(p11)  fmpy.s1 rsq = r, Z
1480	nop.i 999 ;;
1481}
1482
1483{ .mfi
1484	nop.m 999
1485(p12)  frcpa.s1 S_hi,p0 = f1, r
1486	nop.i 999
1487}
1488
1489//
1490//     Case 1: Branch to SMALL_R or NORMAL_R.
1491//     Case 1 is done now.
1492//
1493
1494{ .mfi
1495(p9)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
1496(p9)   fsub.s1 c = s_val, U_1
1497	nop.i 999 ;;
1498}
1499;;
1500
1501{ .mmi
1502(p9)  ld8 table_ptr1 = [table_ptr1]
1503      nop.m 999
1504      nop.i 999
1505}
1506;;
1507
1508{ .mmi
1509(p9)   add table_ptr1 = 224, table_ptr1 ;;
1510(p9)   ldfe P1_1 = [table_ptr1],144
1511	nop.i 999 ;;
1512}
1513//
1514//     Get [i_1] -  lsb of N_fix_gr .
1515//     Load P1_1 and point to Q1_1 .
1516//
1517
1518{ .mfi
1519(p9)   ldfe Q1_1 = [table_ptr1] , 0
1520//
1521//     N even: rsq = r * Z
1522//     N odd:  S_hi = frcpa(r)
1523//
1524(p12)  fmerge.ns S_hi = S_hi, S_hi
1525	nop.i 999
1526}
1527
1528{ .mfi
1529	nop.m 999
1530//
1531//     Case 2_reduce:
1532//     c = s - U_1
1533//
1534(p9)   fsub.s1 c = c, U_2
1535	nop.i 999 ;;
1536}
1537
1538{ .mfi
1539	nop.m 999
1540(p12)  fma.s1  poly1 = S_hi, r, f1
1541	nop.i 999 ;;
1542}
1543
1544{ .mfi
1545	nop.m 999
1546//
1547//     N odd:  Change sign of S_hi
1548//
1549(p11)  fmpy.s1 rsq = rsq, P1_1
1550	nop.i 999 ;;
1551}
1552
1553{ .mfi
1554	nop.m 999
1555(p12)  fma.s1 S_hi = S_hi, poly1, S_hi
1556	nop.i 999 ;;
1557}
1558
1559{ .mfi
1560	nop.m 999
1561//
1562//     N even: rsq = rsq * P1_1
1563//     N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
1564//
1565(p11)  fma.s1 Result = r, rsq, c
1566	nop.i 999 ;;
1567}
1568
1569{ .mfi
1570	nop.m 999
1571//
1572//     N even: Result = c  + r * rsq
1573//     N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
1574//
1575(p12)  fma.s1 poly1 = S_hi, r, f1
1576	nop.i 999 ;;
1577}
1578
1579{ .mfi
1580	nop.m 999
1581//
1582//     N even: Result = Result + r
1583//     N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
1584//
1585(p11)  fadd.s0 Result = r, Result
1586	nop.i 999 ;;
1587}
1588
1589{ .mfi
1590	nop.m 999
1591(p12)  fma.s1  S_hi = S_hi, poly1, S_hi
1592	nop.i 999 ;;
1593}
1594
1595{ .mfi
1596	nop.m 999
1597//
1598//     N even: Result1 = Result + r
1599//     N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
1600//
1601(p12)  fma.s1 poly1 = S_hi, r, f1
1602	nop.i 999 ;;
1603}
1604
1605{ .mfi
1606	nop.m 999
1607//
1608//     N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
1609//
1610(p12)  fma.s1 S_hi = S_hi, poly1, S_hi
1611	nop.i 999 ;;
1612}
1613
1614{ .mfi
1615	nop.m 999
1616//
1617//     N odd:  poly1  =  S_hi * poly + 1.0    64 bits
1618//
1619(p12)  fma.s1 poly1 = S_hi, r, f1
1620	nop.i 999 ;;
1621}
1622
1623{ .mfi
1624	nop.m 999
1625//
1626//     N odd:  poly1  =  S_hi * r + 1.0
1627//
1628(p12)  fma.s1 poly1 = S_hi, c, poly1
1629	nop.i 999 ;;
1630}
1631
1632{ .mfi
1633	nop.m 999
1634//
1635//     N odd:  poly1  =  S_hi * c + poly1
1636//
1637(p12)  fmpy.s1 S_lo = S_hi, poly1
1638	nop.i 999 ;;
1639}
1640
1641{ .mfi
1642	nop.m 999
1643//
1644//     N odd:  S_lo  =  S_hi *  poly1
1645//
1646(p12)  fma.s1 S_lo = Q1_1, r, S_lo
1647	nop.i 999
1648}
1649
1650{ .mfi
1651	nop.m 999
1652//
1653//     N odd:  Result =  S_hi + S_lo
1654//
1655(p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
1656	nop.i 999 ;;
1657}
1658
1659{ .mfb
1660	nop.m 999
1661//
1662//     N odd:  S_lo  =  S_lo + Q1_1 * r
1663//
1664(p12)  fadd.s0 Result = S_hi, S_lo
1665(p0)   br.ret.sptk b0 ;;
1666}
1667
1668
1669TAN_LARGER_ARG:
1670
1671{ .mmf
1672(p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
1673      nop.m 999
1674(p0)  fmpy.s1 N_0 = Arg, Inv_P_0
1675}
1676;;
1677
1678//
1679// ARGUMENT REDUCTION CODE - CASE 3 and 4
1680//
1681//
1682//    Adjust table_ptr1 to beginning of table.
1683//    N_0 = Arg * Inv_P_0
1684//
1685
1686
1687{ .mmi
1688(p0)  ld8 table_ptr1 = [table_ptr1]
1689      nop.m 999
1690      nop.i 999
1691}
1692;;
1693
1694
1695{ .mmi
1696(p0)  add table_ptr1 = 8, table_ptr1 ;;
1697//
1698//    Point to  2*-14
1699//
1700(p0)  ldfs TWO_TO_NEG14 = [table_ptr1], 4
1701	nop.i 999 ;;
1702}
1703//
1704//    Load 2**(-14).
1705//
1706
1707{ .mmi
1708(p0)  ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;;
1709//
1710//    N_0_fix  = integer part of N_0 .
1711//    Adjust table_ptr1 to beginning of table.
1712//
1713(p0)  ldfs TWO_TO_NEG2 = [table_ptr1], 4
1714	nop.i 999 ;;
1715}
1716//
1717//    Make N_0 the integer part.
1718//
1719
1720{ .mfi
1721(p0)  ldfs NEGTWO_TO_NEG2 = [table_ptr1]
1722//
1723//    Load -2**(-14).
1724//
1725(p0)  fcvt.fx.s1 N_0_fix = N_0
1726	nop.i 999 ;;
1727}
1728
1729{ .mfi
1730	nop.m 999
1731(p0)  fcvt.xf N_0 = N_0_fix
1732	nop.i 999 ;;
1733}
1734
1735{ .mfi
1736	nop.m 999
1737(p0)  fnma.s1 ArgPrime = N_0, P_0, Arg
1738	nop.i 999
1739}
1740
1741{ .mfi
1742	nop.m 999
1743(p0)  fmpy.s1 w = N_0, d_1
1744	nop.i 999 ;;
1745}
1746
1747{ .mfi
1748	nop.m 999
1749//
1750//    ArgPrime = -N_0 * P_0 + Arg
1751//    w  = N_0 * d_1
1752//
1753(p0)  fmpy.s1 N = ArgPrime, two_by_PI
1754	nop.i 999 ;;
1755}
1756
1757{ .mfi
1758	nop.m 999
1759//
1760//    N = ArgPrime * 2/pi
1761//
1762(p0)  fcvt.fx.s1 N_fix = N
1763	nop.i 999 ;;
1764}
1765
1766{ .mfi
1767	nop.m 999
1768//
1769//    N_fix is the integer part.
1770//
1771(p0)  fcvt.xf N = N_fix
1772	nop.i 999 ;;
1773}
1774
1775{ .mfi
1776(p0)  getf.sig N_fix_gr = N_fix
1777	nop.f 999
1778	nop.i 999 ;;
1779}
1780
1781{ .mfi
1782	nop.m 999
1783//
1784//    N is the integer part of the reduced-reduced argument.
1785//    Put the integer in a GP register.
1786//
1787(p0)  fnma.s1 s_val = N, P_1, ArgPrime
1788	nop.i 999
1789}
1790
1791{ .mfi
1792	nop.m 999
1793(p0)  fnma.s1 w = N, P_2, w
1794	nop.i 999 ;;
1795}
1796
1797{ .mfi
1798	nop.m 999
1799//
1800//    s_val = -N*P_1 + ArgPrime
1801//    w = -N*P_2 + w
1802//
1803(p0)  fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14
1804	nop.i 999 ;;
1805}
1806
1807{ .mfi
1808	nop.m 999
1809(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
1810	nop.i 999 ;;
1811}
1812
1813{ .mfi
1814	nop.m 999
1815//
1816//    Case 3: r = s_val + w (Z complete)
1817//    Case 4: U_hi = N_0 * d_1
1818//
1819(p10) fmpy.s1 V_hi = N, P_2
1820	nop.i 999
1821}
1822
1823{ .mfi
1824	nop.m 999
1825(p11) fmpy.s1 U_hi = N_0, d_1
1826	nop.i 999 ;;
1827}
1828
1829{ .mfi
1830	nop.m 999
1831//
1832//    Case 3: r = s_val + w (Z complete)
1833//    Case 4: U_hi = N_0 * d_1
1834//
1835(p11) fmpy.s1 V_hi = N, P_2
1836	nop.i 999
1837}
1838
1839{ .mfi
1840	nop.m 999
1841(p11) fmpy.s1 U_hi = N_0, d_1
1842	nop.i 999 ;;
1843}
1844
1845{ .mfi
1846	nop.m 999
1847//
1848//    Decide between case 3 and 4:
1849//    Case 3:  s <= -2**(-14) or s >= 2**(-14)
1850//    Case 4: -2**(-14) < s < 2**(-14)
1851//
1852(p10) fadd.s1 r = s_val, w
1853	nop.i 999
1854}
1855
1856{ .mfi
1857	nop.m 999
1858(p11) fmpy.s1 w = N, P_3
1859	nop.i 999 ;;
1860}
1861
1862{ .mfi
1863	nop.m 999
1864//
1865//    Case 4: We need abs of both U_hi and V_hi - dont
1866//    worry about switched sign of V_hi .
1867//
1868(p11) fsub.s1 A = U_hi, V_hi
1869	nop.i 999
1870}
1871
1872{ .mfi
1873	nop.m 999
1874//
1875//    Case 4: A =  U_hi + V_hi
1876//    Note: Worry about switched sign of V_hi, so subtract instead of add.
1877//
1878(p11) fnma.s1 V_lo = N, P_2, V_hi
1879	nop.i 999 ;;
1880}
1881
1882{ .mfi
1883	nop.m 999
1884(p11) fms.s1 U_lo = N_0, d_1, U_hi
1885	nop.i 999 ;;
1886}
1887
1888{ .mfi
1889	nop.m 999
1890(p11) fabs V_hiabs = V_hi
1891	nop.i 999
1892}
1893
1894{ .mfi
1895	nop.m 999
1896//
1897//    Case 4: V_hi = N * P_2
1898//            w = N * P_3
1899//    Note the product does not include the (-) as in the writeup
1900//    so (-) missing for V_hi and w .
1901(p10) fadd.s1 r = s_val, w
1902	nop.i 999 ;;
1903}
1904
1905{ .mfi
1906	nop.m 999
1907//
1908//    Case 3: c = s_val - r
1909//    Case 4: U_lo = N_0 * d_1 - U_hi
1910//
1911(p11) fabs U_hiabs = U_hi
1912	nop.i 999
1913}
1914
1915{ .mfi
1916	nop.m 999
1917(p11) fmpy.s1 w = N, P_3
1918	nop.i 999 ;;
1919}
1920
1921{ .mfi
1922	nop.m 999
1923//
1924//    Case 4: Set P_12 if U_hiabs >= V_hiabs
1925//
1926(p11) fadd.s1 C_hi = s_val, A
1927	nop.i 999 ;;
1928}
1929
1930{ .mfi
1931	nop.m 999
1932//
1933//    Case 4: C_hi = s_val + A
1934//
1935(p11) fadd.s1 t = U_lo, V_lo
1936	nop.i 999 ;;
1937}
1938
1939{ .mfi
1940	nop.m 999
1941//
1942//    Case 3: Is |r| < 2**(-2), if so set PR_7
1943//    else set PR_8.
1944//    Case 3: If PR_7 is set, prepare to branch to Small_R.
1945//    Case 3: If PR_8 is set, prepare to branch to Normal_R.
1946//
1947(p10) fsub.s1 c = s_val, r
1948	nop.i 999 ;;
1949}
1950
1951{ .mfi
1952	nop.m 999
1953//
1954//    Case 3: c = (s - r) + w (c complete)
1955//
1956(p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
1957	nop.i 999
1958}
1959
1960{ .mfi
1961	nop.m 999
1962(p11) fms.s1 w = N_0, d_2, w
1963	nop.i 999 ;;
1964}
1965
1966{ .mfi
1967	nop.m 999
1968//
1969//    Case 4: V_hi = N * P_2
1970//            w = N * P_3
1971//    Note the product does not include the (-) as in the writeup
1972//    so (-) missing for V_hi and w .
1973//
1974(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
1975	nop.i 999 ;;
1976}
1977
1978{ .mfi
1979	nop.m 999
1980(p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2
1981	nop.i 999 ;;
1982}
1983
1984{ .mfb
1985	nop.m 999
1986//
1987//    Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
1988//    Note: the (-) is still missing for V_hi .
1989//    Case 4: w = w + N_0 * d_2
1990//    Note: the (-) is now incorporated in w .
1991//
1992(p10) fadd.s1 c = c, w
1993//
1994//    Case 4: t = U_lo + V_lo
1995//    Note: remember V_lo should be (-), subtract instead of add. NO
1996//
1997(p14) br.cond.spnt TAN_SMALL_R ;;
1998}
1999
2000{ .mib
2001	nop.m 999
2002	nop.i 999
2003(p15) br.cond.spnt TAN_NORMAL_R ;;
2004}
2005
2006{ .mfi
2007	nop.m 999
2008//
2009//    Case 3: Vector off when |r| < 2**(-2).  Recall that PR_3 will be true.
2010//    The remaining stuff is for Case 4.
2011//
2012(p12) fsub.s1 a = U_hi, A
2013(p11) extr.u i_1 = N_fix_gr, 0, 1 ;;
2014}
2015
2016{ .mfi
2017	nop.m 999
2018//
2019//    Case 4: C_lo = s_val - C_hi
2020//
2021(p11) fadd.s1 t = t, w
2022	nop.i 999
2023}
2024
2025{ .mfi
2026	nop.m 999
2027(p13) fadd.s1 a = V_hi, A
2028	nop.i 999 ;;
2029}
2030
2031//
2032//    Case 4: a = U_hi - A
2033//            a = V_hi - A (do an add to account for missing (-) on V_hi
2034//
2035
2036{ .mfi
2037(p11)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
2038(p11) fsub.s1 C_lo = s_val, C_hi
2039	nop.i 999
2040}
2041;;
2042
2043{ .mmi
2044(p11) ld8 table_ptr1 = [table_ptr1]
2045      nop.m 999
2046      nop.i 999
2047}
2048;;
2049
2050//
2051//    Case 4: a = (U_hi - A)  + V_hi
2052//            a = (V_hi - A)  + U_hi
2053//    In each case account for negative missing form V_hi .
2054//
2055//
2056//    Case 4: C_lo = (s_val - C_hi) + A
2057//
2058
2059{ .mmi
2060(p11) add table_ptr1 = 224, table_ptr1 ;;
2061(p11) ldfe P1_1 = [table_ptr1], 16
2062	nop.i 999 ;;
2063}
2064
2065{ .mfi
2066(p11) ldfe P1_2 = [table_ptr1], 128
2067//
2068//    Case 4: w = U_lo + V_lo  + w
2069//
2070(p12) fsub.s1 a = a, V_hi
2071	nop.i 999 ;;
2072}
2073//
2074//    Case 4: r = C_hi + C_lo
2075//
2076
2077{ .mfi
2078(p11) ldfe Q1_1 = [table_ptr1], 16
2079(p11) fadd.s1 C_lo = C_lo, A
2080	nop.i 999 ;;
2081}
2082//
2083//    Case 4: c = C_hi - r
2084//    Get [i_1] - lsb of N_fix_gr.
2085//
2086
2087{ .mfi
2088(p11) ldfe Q1_2 = [table_ptr1], 16
2089	nop.f 999
2090	nop.i 999 ;;
2091}
2092
2093{ .mfi
2094	nop.m 999
2095(p13) fsub.s1 a = U_hi, a
2096	nop.i 999 ;;
2097}
2098
2099{ .mfi
2100	nop.m 999
2101(p11) fadd.s1 t = t, a
2102	nop.i 999 ;;
2103}
2104
2105{ .mfi
2106	nop.m 999
2107//
2108//    Case 4: t = t + a
2109//
2110(p11) fadd.s1 C_lo = C_lo, t
2111	nop.i 999 ;;
2112}
2113
2114{ .mfi
2115	nop.m 999
2116//
2117//    Case 4: C_lo = C_lo + t
2118//
2119(p11) fadd.s1 r = C_hi, C_lo
2120	nop.i 999 ;;
2121}
2122
2123{ .mfi
2124	nop.m 999
2125(p11) fsub.s1 c = C_hi, r
2126	nop.i 999
2127}
2128
2129{ .mfi
2130	nop.m 999
2131//
2132//    Case 4: c = c + C_lo  finished.
2133//    Is i_1  even or odd?
2134//    if i_1 == 0, set PR_4, else set PR_5.
2135//
2136// r and c have been computed.
2137// We known whether this is the sine or cosine routine.
2138// Make sure ftz mode is set - should be automatic when using wre
2139(p0)  fmpy.s1 rsq = r, r
2140	nop.i 999 ;;
2141}
2142
2143{ .mfi
2144	nop.m 999
2145(p11) fadd.s1 c = c , C_lo
2146(p11) cmp.eq.unc p11, p12 =  0x0000, i_1 ;;
2147}
2148
2149{ .mfi
2150	nop.m 999
2151(p12) frcpa.s1 S_hi, p0 = f1, r
2152	nop.i 999
2153}
2154
2155{ .mfi
2156	nop.m 999
2157//
2158//    N odd: Change sign of S_hi
2159//
2160(p11) fma.s1 Result = rsq, P1_2, P1_1
2161	nop.i 999 ;;
2162}
2163
2164{ .mfi
2165	nop.m 999
2166(p12) fma.s1 P = rsq, Q1_2, Q1_1
2167	nop.i 999
2168}
2169
2170{ .mfi
2171	nop.m 999
2172//
2173//    N odd:  Result  =  S_hi + S_lo      (User supplied rounding mode for C1)
2174//
2175(p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2176	nop.i 999 ;;
2177}
2178
2179{ .mfi
2180	nop.m 999
2181//
2182//    N even: rsq = r * r
2183//    N odd:  S_hi = frcpa(r)
2184//
2185(p12) fmerge.ns S_hi = S_hi, S_hi
2186	nop.i 999
2187}
2188
2189{ .mfi
2190	nop.m 999
2191//
2192//    N even: rsq = rsq * P1_2 + P1_1
2193//    N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
2194//
2195(p11) fmpy.s1 Result = rsq, Result
2196	nop.i 999 ;;
2197}
2198
2199{ .mfi
2200	nop.m 999
2201(p12) fma.s1 poly1 = S_hi, r,f1
2202	nop.i 999
2203}
2204
2205{ .mfi
2206	nop.m 999
2207//
2208//    N even: Result =  Result * rsq
2209//    N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
2210//
2211(p11) fma.s1 Result = r, Result, c
2212	nop.i 999 ;;
2213}
2214
2215{ .mfi
2216	nop.m 999
2217(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2218	nop.i 999
2219}
2220
2221{ .mfi
2222	nop.m 999
2223//
2224//    N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
2225//
2226(p11) fadd.s0 Result= r, Result
2227	nop.i 999 ;;
2228}
2229
2230{ .mfi
2231	nop.m 999
2232(p12) fma.s1 poly1 =  S_hi, r, f1
2233	nop.i 999 ;;
2234}
2235
2236{ .mfi
2237	nop.m 999
2238//
2239//    N even: Result = Result * r + c
2240//    N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
2241//
2242(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2243	nop.i 999 ;;
2244}
2245
2246{ .mfi
2247	nop.m 999
2248(p12) fma.s1 poly1 = S_hi, r, f1
2249	nop.i 999 ;;
2250}
2251
2252{ .mfi
2253	nop.m 999
2254//
2255//    N even: Result1 = Result + r  (Rounding mode S0)
2256//    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2257//
2258(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2259	nop.i 999 ;;
2260}
2261
2262{ .mfi
2263	nop.m 999
2264//
2265//    N odd:  poly1  =  S_hi * poly + S_hi    64 bits
2266//
2267(p12) fma.s1 poly1 = S_hi, r, f1
2268	nop.i 999 ;;
2269}
2270
2271{ .mfi
2272	nop.m 999
2273//
2274//    N odd:  poly1  =  S_hi * r + 1.0
2275//
2276(p12) fma.s1 poly1 = S_hi, c, poly1
2277	nop.i 999 ;;
2278}
2279
2280{ .mfi
2281	nop.m 999
2282//
2283//    N odd:  poly1  =  S_hi * c + poly1
2284//
2285(p12) fmpy.s1 S_lo = S_hi, poly1
2286	nop.i 999 ;;
2287}
2288
2289{ .mfi
2290	nop.m 999
2291//
2292//    N odd:  S_lo  =  S_hi *  poly1
2293//
2294(p12) fma.s1 S_lo = P, r, S_lo
2295	nop.i 999 ;;
2296}
2297
2298{ .mfb
2299	nop.m 999
2300//
2301//    N odd:  S_lo  =  S_lo + r * P
2302//
2303(p12) fadd.s0 Result = S_hi, S_lo
2304(p0)   br.ret.sptk b0 ;;
2305}
2306
2307
2308TAN_SMALL_R:
2309
2310{ .mii
2311	nop.m 999
2312(p0)  extr.u i_1 = N_fix_gr, 0, 1 ;;
2313(p0)  cmp.eq.unc p11, p12 = 0x0000, i_1
2314}
2315
2316{ .mfi
2317	nop.m 999
2318(p0)  fmpy.s1 rsq = r, r
2319	nop.i 999 ;;
2320}
2321
2322{ .mfi
2323	nop.m 999
2324(p12) frcpa.s1 S_hi, p0 = f1, r
2325	nop.i 999
2326}
2327
2328{ .mfi
2329(p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
2330        nop.f 999
2331        nop.i 999
2332}
2333;;
2334
2335{ .mmi
2336(p0)  ld8 table_ptr1 = [table_ptr1]
2337      nop.m 999
2338      nop.i 999
2339}
2340;;
2341
2342// *****************************************************************
2343// *****************************************************************
2344// *****************************************************************
2345
2346{ .mmi
2347(p0)  add table_ptr1 = 224, table_ptr1 ;;
2348(p0)  ldfe P1_1 = [table_ptr1], 16
2349	nop.i 999 ;;
2350}
2351//    r and c have been computed.
2352//    We known whether this is the sine or cosine routine.
2353//    Make sure ftz mode is set - should be automatic when using wre
2354//    |r| < 2**(-2)
2355
2356{ .mfi
2357(p0)  ldfe P1_2 = [table_ptr1], 16
2358(p11) fmpy.s1 r_to_the_8 = rsq, rsq
2359	nop.i 999 ;;
2360}
2361//
2362//    Set table_ptr1 to beginning of constant table.
2363//    Get [i_1] - lsb of N_fix_gr.
2364//
2365
2366{ .mfi
2367(p0)  ldfe P1_3 = [table_ptr1], 96
2368//
2369//    N even: rsq = r * r
2370//    N odd:  S_hi = frcpa(r)
2371//
2372(p12) fmerge.ns S_hi = S_hi, S_hi
2373	nop.i 999 ;;
2374}
2375//
2376//    Is i_1  even or odd?
2377//    if i_1 == 0, set PR_11.
2378//    if i_1 != 0, set PR_12.
2379//
2380
2381{ .mfi
2382(p11) ldfe P1_9 = [table_ptr1], -16
2383//
2384//    N even: Poly2 = P1_7 + Poly2 * rsq
2385//    N odd:  poly2 = Q1_5 + poly2 * rsq
2386//
2387(p11) fadd.s1 CORR = rsq, f1
2388	nop.i 999 ;;
2389}
2390
2391{ .mmi
2392(p11) ldfe P1_8 = [table_ptr1], -16 ;;
2393//
2394//    N even: Poly1 = P1_2 + P1_3 * rsq
2395//    N odd:  poly1 =  1.0 +  S_hi * r
2396//    16 bits partial  account for necessary (-1)
2397//
2398(p11) ldfe P1_7 = [table_ptr1], -16
2399	nop.i 999 ;;
2400}
2401//
2402//    N even: Poly1 = P1_1 + Poly1 * rsq
2403//    N odd:  S_hi  =  S_hi + S_hi * poly1)     16 bits account for necessary
2404//
2405
2406{ .mfi
2407(p11) ldfe P1_6 = [table_ptr1], -16
2408//
2409//    N even: Poly2 = P1_5 + Poly2 * rsq
2410//    N odd:  poly2 = Q1_3 + poly2 * rsq
2411//
2412(p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8
2413	nop.i 999 ;;
2414}
2415//
2416//    N even: Poly1 =  Poly1 * rsq
2417//    N odd:  poly1  = 1.0 + S_hi * r         32 bits partial
2418//
2419
2420{ .mfi
2421(p11) ldfe P1_5 = [table_ptr1], -16
2422(p12) fma.s1 poly1 =  S_hi, r, f1
2423	nop.i 999 ;;
2424}
2425//
2426//    N even: CORR =  CORR * c
2427//    N odd:  S_hi  =  S_hi * poly1 + S_hi    32 bits
2428//
2429
2430//
2431//    N even: Poly2 = P1_6 + Poly2 * rsq
2432//    N odd:  poly2 = Q1_4 + poly2 * rsq
2433//
2434{ .mmf
2435(p0)  addl           table_ptr2   = @ltoff(TAN_BASE_CONSTANTS), gp
2436(p11) ldfe P1_4 = [table_ptr1], -16
2437(p11) fmpy.s1 CORR =  CORR, c
2438}
2439;;
2440
2441
2442{ .mmi
2443(p0)  ld8 table_ptr2 = [table_ptr2]
2444      nop.m 999
2445      nop.i 999
2446}
2447;;
2448
2449
2450{ .mii
2451(p0)  add table_ptr2 = 464, table_ptr2
2452	nop.i 999 ;;
2453	nop.i 999
2454}
2455
2456{ .mfi
2457	nop.m 999
2458(p11) fma.s1 Poly1 = P1_3, rsq, P1_2
2459	nop.i 999 ;;
2460}
2461
2462{ .mfi
2463(p0)  ldfe Q1_7 = [table_ptr2], -16
2464(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2465	nop.i 999 ;;
2466}
2467
2468{ .mfi
2469(p0)  ldfe Q1_6 = [table_ptr2], -16
2470(p11) fma.s1 Poly2 = P1_9, rsq, P1_8
2471	nop.i 999 ;;
2472}
2473
2474{ .mmi
2475(p0)  ldfe Q1_5 = [table_ptr2], -16 ;;
2476(p12) ldfe Q1_4 = [table_ptr2], -16
2477	nop.i 999 ;;
2478}
2479
2480{ .mfi
2481(p12) ldfe Q1_3 = [table_ptr2], -16
2482//
2483//    N even: Poly2 = P1_8 + P1_9 * rsq
2484//    N odd:  poly2 = Q1_6 + Q1_7 * rsq
2485//
2486(p11) fma.s1 Poly1 = Poly1, rsq, P1_1
2487	nop.i 999 ;;
2488}
2489
2490{ .mfi
2491(p12) ldfe Q1_2 = [table_ptr2], -16
2492(p12) fma.s1 poly1 = S_hi, r, f1
2493	nop.i 999 ;;
2494}
2495
2496{ .mfi
2497(p12) ldfe Q1_1 = [table_ptr2], -16
2498(p11) fma.s1 Poly2 = Poly2, rsq, P1_7
2499	nop.i 999 ;;
2500}
2501
2502{ .mfi
2503	nop.m 999
2504//
2505//    N even: CORR =  rsq + 1
2506//    N even: r_to_the_8 =  rsq * rsq
2507//
2508(p11) fmpy.s1 Poly1 = Poly1, rsq
2509	nop.i 999 ;;
2510}
2511
2512{ .mfi
2513	nop.m 999
2514(p12) fma.s1 S_hi = S_hi, poly1, S_hi
2515	nop.i 999
2516}
2517
2518{ .mfi
2519	nop.m 999
2520(p12) fma.s1 poly2 = Q1_7, rsq, Q1_6
2521	nop.i 999 ;;
2522}
2523
2524{ .mfi
2525	nop.m 999
2526(p11) fma.s1 Poly2 = Poly2, rsq, P1_6
2527	nop.i 999 ;;
2528}
2529
2530{ .mfi
2531	nop.m 999
2532(p12) fma.s1 poly1 = S_hi, r, f1
2533	nop.i 999
2534}
2535
2536{ .mfi
2537	nop.m 999
2538(p12) fma.s1 poly2 = poly2, rsq, Q1_5
2539	nop.i 999 ;;
2540}
2541
2542{ .mfi
2543	nop.m 999
2544(p11) fma.s1 Poly2= Poly2, rsq, P1_5
2545	nop.i 999 ;;
2546}
2547
2548{ .mfi
2549	nop.m 999
2550(p12) fma.s1 S_hi =  S_hi, poly1, S_hi
2551	nop.i 999
2552}
2553
2554{ .mfi
2555	nop.m 999
2556(p12) fma.s1 poly2 = poly2, rsq, Q1_4
2557	nop.i 999 ;;
2558}
2559
2560{ .mfi
2561	nop.m 999
2562//
2563//    N even: r_to_the_8 = r_to_the_8 * r_to_the_8
2564//    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2565//
2566(p11) fma.s1 Poly2 = Poly2, rsq, P1_4
2567	nop.i 999 ;;
2568}
2569
2570{ .mfi
2571	nop.m 999
2572//
2573//    N even: Result = CORR + Poly * r
2574//    N odd:  P = Q1_1 + poly2 * rsq
2575//
2576(p12) fma.s1 poly1 = S_hi, r, f1
2577	nop.i 999
2578}
2579
2580{ .mfi
2581	nop.m 999
2582(p12) fma.s1 poly2 = poly2, rsq, Q1_3
2583	nop.i 999 ;;
2584}
2585
2586{ .mfi
2587	nop.m 999
2588//
2589//    N even: Poly2 = P1_4 + Poly2 * rsq
2590//    N odd:  poly2 = Q1_2 + poly2 * rsq
2591//
2592(p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1
2593	nop.i 999 ;;
2594}
2595
2596{ .mfi
2597	nop.m 999
2598(p12) fma.s1 poly1 = S_hi, c, poly1
2599	nop.i 999
2600}
2601
2602{ .mfi
2603	nop.m 999
2604(p12) fma.s1 poly2 = poly2, rsq, Q1_2
2605	nop.i 999 ;;
2606}
2607
2608{ .mfi
2609	nop.m 999
2610//
2611//    N even: Poly = Poly1 + Poly2 * r_to_the_8
2612//    N odd:  S_hi =  S_hi * poly1 + S_hi    64 bits
2613//
2614(p11) fma.s1 Result = Poly, r, CORR
2615	nop.i 999 ;;
2616}
2617
2618{ .mfi
2619	nop.m 999
2620//
2621//    N even: Result =  r + Result  (User supplied rounding mode)
2622//    N odd:  poly1  =  S_hi * c + poly1
2623//
2624(p12) fmpy.s1 S_lo = S_hi, poly1
2625	nop.i 999
2626}
2627
2628{ .mfi
2629	nop.m 999
2630(p12) fma.s1 P = poly2, rsq, Q1_1
2631	nop.i 999 ;;
2632}
2633
2634{ .mfi
2635	nop.m 999
2636//
2637//    N odd:  poly1  =  S_hi * r + 1.0
2638//
2639(p11) fadd.s0 Result = Result, r
2640	nop.i 999 ;;
2641}
2642
2643{ .mfi
2644	nop.m 999
2645//
2646//    N odd:  S_lo  =  S_hi *  poly1
2647//
2648(p12) fma.s1 S_lo = Q1_1, c, S_lo
2649	nop.i 999
2650}
2651
2652{ .mfi
2653	nop.m 999
2654//
2655//    N odd:  Result = Result + S_hi  (user supplied rounding mode)
2656//
2657(p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2658	nop.i 999 ;;
2659}
2660
2661{ .mfi
2662	nop.m 999
2663//
2664//    N odd:  S_lo  =  Q1_1 * c + S_lo
2665//
2666(p12) fma.s1 Result = P, r, S_lo
2667	nop.i 999 ;;
2668}
2669
2670{ .mfb
2671	nop.m 999
2672//
2673//    N odd:  Result =  S_lo + r * P
2674//
2675(p12) fadd.s0 Result = Result, S_hi
2676(p0)   br.ret.sptk b0 ;;
2677}
2678
2679
2680TAN_NORMAL_R:
2681
2682{ .mfi
2683(p0)  getf.sig sig_r = r
2684// *******************************************************************
2685// *******************************************************************
2686// *******************************************************************
2687//
2688//    r and c have been computed.
2689//    Make sure ftz mode is set - should be automatic when using wre
2690//
2691//
2692//    Get [i_1] -  lsb of N_fix_gr alone.
2693//
2694(p0)  fmerge.s  Pos_r = f1, r
2695(p0)  extr.u i_1 = N_fix_gr, 0, 1 ;;
2696}
2697
2698{ .mfi
2699	nop.m 999
2700(p0)  fmerge.s  sgn_r =  r, f1
2701(p0)  cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
2702}
2703
2704{ .mfi
2705	nop.m 999
2706	nop.f 999
2707(p0)  extr.u lookup = sig_r, 58, 5
2708}
2709
2710{ .mlx
2711	nop.m 999
2712(p0)  movl Create_B = 0x8200000000000000 ;;
2713}
2714
2715{ .mfi
2716(p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
2717	nop.f 999
2718(p0)  dep Create_B = lookup, Create_B, 58, 5
2719}
2720;;
2721
2722//
2723//    Get [i_1] -  lsb of N_fix_gr alone.
2724//    Pos_r = abs (r)
2725//
2726
2727
2728{ .mmi
2729      ld8 table_ptr1 = [table_ptr1]
2730      nop.m 999
2731      nop.i 999
2732}
2733;;
2734
2735
2736{ .mmi
2737	nop.m 999
2738(p0)  setf.sig B = Create_B
2739//
2740//    Set table_ptr1 and table_ptr2 to base address of
2741//    constant table.
2742//
2743(p0)  add table_ptr1 = 480, table_ptr1 ;;
2744}
2745
2746{ .mmb
2747	nop.m 999
2748//
2749//    Is i_1 or i_0  == 0 ?
2750//    Create the constant  1 00000 1000000000000000000000...
2751//
2752(p0)  ldfe P2_1 = [table_ptr1], 16
2753	nop.b 999
2754}
2755
2756{ .mmi
2757	nop.m 999 ;;
2758(p0)  getf.exp exp_r = Pos_r
2759	nop.i 999
2760}
2761//
2762//    Get r's exponent
2763//    Get r's significand
2764//
2765
2766{ .mmi
2767(p0)  ldfe P2_2 = [table_ptr1], 16 ;;
2768//
2769//    Get the 5 bits or r for the lookup.   1.xxxxx ....
2770//    from sig_r.
2771//    Grab  lsb of exp of B
2772//
2773(p0)  ldfe P2_3 = [table_ptr1], 16
2774	nop.i 999 ;;
2775}
2776
2777{ .mii
2778	nop.m 999
2779(p0)  andcm table_offset = 0x0001, exp_r ;;
2780(p0)  shl table_offset = table_offset, 9 ;;
2781}
2782
2783{ .mii
2784	nop.m 999
2785//
2786//    Deposit   0 00000 1000000000000000000000... on
2787//              1 xxxxx yyyyyyyyyyyyyyyyyyyyyy...,
2788//    getting rid of the ys.
2789//    Is  B = 2** -2 or  B= 2** -1? If 2**-1, then
2790//    we want an offset of 512 for table addressing.
2791//
2792(p0)  shladd table_offset = lookup, 4, table_offset ;;
2793//
2794//    B =  ........ 1xxxxx 1000000000000000000...
2795//
2796(p0)  add table_ptr1 = table_ptr1, table_offset ;;
2797}
2798
2799{ .mmb
2800	nop.m 999
2801//
2802//   B =  ........ 1xxxxx 1000000000000000000...
2803//   Convert B so it has the same exponent as Pos_r
2804//
2805(p0)  ldfd T_hi = [table_ptr1], 8
2806	nop.b 999 ;;
2807}
2808
2809//
2810//    x = |r| - B
2811//    Load T_hi.
2812//    Load C_hi.
2813//
2814
2815{ .mmf
2816(p0)  addl           table_ptr2   = @ltoff(TAN_BASE_CONSTANTS), gp
2817(p0)  ldfs T_lo = [table_ptr1]
2818(p0)  fmerge.se B = Pos_r, B
2819}
2820;;
2821
2822{ .mmi
2823      ld8 table_ptr2 = [table_ptr2]
2824      nop.m 999
2825      nop.i 999
2826}
2827;;
2828
2829{ .mii
2830(p0)  add table_ptr2 = 1360, table_ptr2
2831	nop.i 999 ;;
2832(p0)  add table_ptr2 = table_ptr2, table_offset ;;
2833}
2834
2835{ .mfi
2836(p0)  ldfd C_hi = [table_ptr2], 8
2837(p0)  fsub.s1 x = Pos_r, B
2838	nop.i 999 ;;
2839}
2840
2841{ .mii
2842(p0)  ldfs C_lo = [table_ptr2],255
2843	nop.i 999 ;;
2844//
2845//    xsq = x * x
2846//    N even: Tx = T_hi * x
2847//    Load T_lo.
2848//    Load C_lo - increment pointer to get SC_inv
2849//    - cant get all the way, do an add later.
2850//
2851(p0)  add table_ptr2 = 569, table_ptr2 ;;
2852}
2853//
2854//    N even: Tx1 = Tx + 1
2855//    N odd:  Cx1 = 1 - Cx
2856//
2857
2858{ .mfi
2859(p0)  ldfe SC_inv = [table_ptr2], 0
2860	nop.f 999
2861	nop.i 999 ;;
2862}
2863
2864{ .mfi
2865	nop.m 999
2866(p0)  fmpy.s1 xsq = x, x
2867	nop.i 999
2868}
2869
2870{ .mfi
2871	nop.m 999
2872(p11) fmpy.s1 Tx = T_hi, x
2873	nop.i 999 ;;
2874}
2875
2876{ .mfi
2877	nop.m 999
2878(p12) fmpy.s1 Cx = C_hi, x
2879	nop.i 999 ;;
2880}
2881
2882{ .mfi
2883	nop.m 999
2884//
2885//    N odd: Cx = C_hi * x
2886//
2887(p0)  fma.s1 P = P2_3, xsq, P2_2
2888	nop.i 999
2889}
2890
2891{ .mfi
2892	nop.m 999
2893//
2894//    N even and odd: P = P2_3 + P2_2 * xsq
2895//
2896(p11) fadd.s1 Tx1 = Tx, f1
2897	nop.i 999 ;;
2898}
2899
2900{ .mfi
2901	nop.m 999
2902//
2903//    N even: D = C_hi - tanx
2904//    N odd: D = T_hi + tanx
2905//
2906(p11) fmpy.s1 CORR = SC_inv, T_hi
2907	nop.i 999
2908}
2909
2910{ .mfi
2911	nop.m 999
2912(p0)  fmpy.s1 Sx = SC_inv, x
2913	nop.i 999 ;;
2914}
2915
2916{ .mfi
2917	nop.m 999
2918(p12) fmpy.s1 CORR = SC_inv, C_hi
2919	nop.i 999 ;;
2920}
2921
2922{ .mfi
2923	nop.m 999
2924(p12) fsub.s1 V_hi = f1, Cx
2925	nop.i 999 ;;
2926}
2927
2928{ .mfi
2929	nop.m 999
2930(p0)  fma.s1 P = P, xsq, P2_1
2931	nop.i 999
2932}
2933
2934{ .mfi
2935	nop.m 999
2936//
2937//    N even and odd: P = P2_1 + P * xsq
2938//
2939(p11) fma.s1 V_hi = Tx, Tx1, f1
2940	nop.i 999 ;;
2941}
2942
2943{ .mfi
2944	nop.m 999
2945//
2946//    N even: Result  = sgn_r * tail + T_hi (user rounding mode for C1)
2947//    N odd:  Result  = sgn_r * tail + C_hi (user rounding mode for C1)
2948//
2949(p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2950	nop.i 999 ;;
2951}
2952
2953{ .mfi
2954	nop.m 999
2955(p0)  fmpy.s1 CORR = CORR, c
2956	nop.i 999 ;;
2957}
2958
2959{ .mfi
2960	nop.m 999
2961(p12) fnma.s1 V_hi = Cx,V_hi,f1
2962	nop.i 999 ;;
2963}
2964
2965{ .mfi
2966	nop.m 999
2967//
2968//    N even: V_hi = Tx * Tx1 + 1
2969//    N odd: Cx1 = 1 - Cx * Cx1
2970//
2971(p0)  fmpy.s1 P = P, xsq
2972	nop.i 999
2973}
2974
2975{ .mfi
2976	nop.m 999
2977//
2978//    N even and odd: P = P * xsq
2979//
2980(p11) fmpy.s1 V_hi = V_hi, T_hi
2981	nop.i 999 ;;
2982}
2983
2984{ .mfi
2985	nop.m 999
2986//
2987//    N even and odd: tail = P * tail + V_lo
2988//
2989(p11) fmpy.s1 T_hi = sgn_r, T_hi
2990	nop.i 999 ;;
2991}
2992
2993{ .mfi
2994	nop.m 999
2995(p0)  fmpy.s1 CORR = CORR, sgn_r
2996	nop.i 999 ;;
2997}
2998
2999{ .mfi
3000	nop.m 999
3001(p12) fmpy.s1 V_hi = V_hi,C_hi
3002	nop.i 999 ;;
3003}
3004
3005{ .mfi
3006	nop.m 999
3007//
3008//    N even: V_hi = T_hi * V_hi
3009//    N odd: V_hi  = C_hi * V_hi
3010//
3011(p0)  fma.s1 tanx = P, x, x
3012	nop.i 999
3013}
3014
3015{ .mfi
3016	nop.m 999
3017(p12) fnmpy.s1 C_hi = sgn_r, C_hi
3018	nop.i 999 ;;
3019}
3020
3021{ .mfi
3022	nop.m 999
3023//
3024//    N even: V_lo = 1 - V_hi + C_hi
3025//    N odd: V_lo = 1 - V_hi + T_hi
3026//
3027(p11) fadd.s1 CORR = CORR, T_lo
3028	nop.i 999
3029}
3030
3031{ .mfi
3032	nop.m 999
3033(p12) fsub.s1 CORR = CORR, C_lo
3034	nop.i 999 ;;
3035}
3036
3037{ .mfi
3038	nop.m 999
3039//
3040//    N even and odd: tanx = x + x * P
3041//    N even and odd: Sx = SC_inv * x
3042//
3043(p11) fsub.s1 D = C_hi, tanx
3044	nop.i 999
3045}
3046
3047{ .mfi
3048	nop.m 999
3049(p12) fadd.s1 D = T_hi, tanx
3050	nop.i 999 ;;
3051}
3052
3053{ .mfi
3054	nop.m 999
3055//
3056//    N odd: CORR = SC_inv * C_hi
3057//    N even: CORR = SC_inv * T_hi
3058//
3059(p0)  fnma.s1 D = V_hi, D, f1
3060	nop.i 999 ;;
3061}
3062
3063{ .mfi
3064	nop.m 999
3065//
3066//    N even and odd: D = 1 - V_hi * D
3067//    N even and odd: CORR = CORR * c
3068//
3069(p0)  fma.s1 V_hi = V_hi, D, V_hi
3070	nop.i 999 ;;
3071}
3072
3073{ .mfi
3074	nop.m 999
3075//
3076//    N even and odd: V_hi = V_hi + V_hi * D
3077//    N even and odd: CORR = sgn_r * CORR
3078//
3079(p11) fnma.s1 V_lo = V_hi, C_hi, f1
3080	nop.i 999
3081}
3082
3083{ .mfi
3084	nop.m 999
3085(p12) fnma.s1 V_lo = V_hi, T_hi, f1
3086	nop.i 999 ;;
3087}
3088
3089{ .mfi
3090	nop.m 999
3091//
3092//    N even: CORR = COOR + T_lo
3093//    N odd: CORR = CORR - C_lo
3094//
3095(p11) fma.s1 V_lo = tanx, V_hi, V_lo
3096	nop.i 999
3097}
3098
3099{ .mfi
3100	nop.m 999
3101(p12) fnma.s1 V_lo = tanx, V_hi, V_lo
3102	nop.i 999 ;;
3103}
3104
3105{ .mfi
3106	nop.m 999
3107//
3108//    N even: V_lo = V_lo + V_hi * tanx
3109//    N odd: V_lo = V_lo - V_hi * tanx
3110//
3111(p11) fnma.s1 V_lo = C_lo, V_hi, V_lo
3112	nop.i 999
3113}
3114
3115{ .mfi
3116	nop.m 999
3117(p12) fnma.s1 V_lo = T_lo, V_hi, V_lo
3118	nop.i 999 ;;
3119}
3120
3121{ .mfi
3122	nop.m 999
3123//
3124//    N  even: V_lo = V_lo - V_hi * C_lo
3125//    N  odd: V_lo = V_lo - V_hi * T_lo
3126//
3127(p0)  fmpy.s1 V_lo = V_hi, V_lo
3128	nop.i 999 ;;
3129}
3130
3131{ .mfi
3132	nop.m 999
3133//
3134//    N even and odd: V_lo = V_lo * V_hi
3135//
3136(p0)  fadd.s1 tail = V_hi, V_lo
3137	nop.i 999 ;;
3138}
3139
3140{ .mfi
3141	nop.m 999
3142//
3143//    N even and odd: tail = V_hi + V_lo
3144//
3145(p0)  fma.s1 tail = tail, P, V_lo
3146	nop.i 999 ;;
3147}
3148
3149{ .mfi
3150	nop.m 999
3151//
3152//    N even: T_hi = sgn_r * T_hi
3153//    N odd : C_hi = -sgn_r * C_hi
3154//
3155(p0)  fma.s1 tail = tail, Sx, CORR
3156	nop.i 999 ;;
3157}
3158
3159{ .mfi
3160	nop.m 999
3161//
3162//    N even and odd: tail = Sx * tail + CORR
3163//
3164(p0)  fma.s1 tail = V_hi, Sx, tail
3165	nop.i 999 ;;
3166}
3167
3168{ .mfi
3169	nop.m 999
3170//
3171//    N even an odd: tail = Sx * V_hi + tail
3172//
3173(p11) fma.s0 Result = sgn_r, tail, T_hi
3174	nop.i 999
3175}
3176
3177{ .mfb
3178	nop.m 999
3179(p12) fma.s0 Result = sgn_r, tail, C_hi
3180(p0)   br.ret.sptk b0 ;;
3181}
3182
3183.endp __libm_tan
3184ASM_SIZE_DIRECTIVE(__libm_tan)
3185
3186
3187
3188// *******************************************************************
3189// *******************************************************************
3190// *******************************************************************
3191//
3192//     Special Code to handle very large argument case.
3193//     Call int pi_by_2_reduce(&x,&r)
3194//     for |arguments| >= 2**63
3195//     (Arg or x) is in f8
3196//     Address to save r and c as double
3197
3198//                 (1)                    (2)                 (3) (call)         (4)
3199//            sp -> +               psp -> +            psp -> +           sp ->  +
3200//                  |                      |                   |                  |
3201//                  |                r50 ->| <- r50      f0  ->|           r50 -> | -> c
3202//                  |                      |                   |                  |
3203//         sp-32 -> | <- r50          f0 ->|             f0  ->| <- r50    r49 -> | -> r
3204//                  |                      |                   |                  |
3205//                  |               r49  ->| <- r49     Arg  ->| <- r49           | -> x
3206//                  |                      |                   |                  |
3207//         sp -64 ->|             sp -64 ->|          sp -64 ->|                  |
3208//
3209//            save pfs           save b0                                     restore gp
3210//            save gp                                                        restore b0
3211//                                                                           restore pfs
3212
3213
3214
3215.proc __libm_callout
3216__libm_callout:
3217TAN_ARG_TOO_LARGE:
3218.prologue
3219// (1)
3220{ .mfi
3221        add   GR_Parameter_r =-32,sp                        // Parameter: r address
3222        nop.f 0
3223.save   ar.pfs,GR_SAVE_PFS
3224        mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
3225}
3226{ .mfi
3227.fframe 64
3228        add sp=-64,sp                           // Create new stack
3229        nop.f 0
3230        mov GR_SAVE_GP=gp                       // Save gp
3231};;
3232
3233// (2)
3234{ .mmi
3235        stfe [GR_Parameter_r ] = f0,16                      // Clear Parameter r on stack
3236        add  GR_Parameter_X = 16,sp                        // Parameter x address
3237.save   b0, GR_SAVE_B0
3238        mov GR_SAVE_B0=b0                       // Save b0
3239};;
3240
3241// (3)
3242.body
3243{ .mib
3244        stfe [GR_Parameter_r ] = f0,-16                     // Clear Parameter c on stack
3245        nop.i 0
3246        nop.b 0
3247}
3248{ .mib
3249        stfe [GR_Parameter_X] = Arg                        // Store Parameter x on stack
3250        nop.i 0
3251(p0)    br.call.sptk b0=__libm_pi_by_2_reduce#
3252}
3253;;
3254
3255
3256// (4)
3257{ .mmi
3258        mov   gp = GR_SAVE_GP                  // Restore gp
3259(p0)    mov   N_fix_gr = r8
3260        nop.i 999
3261}
3262;;
3263
3264{ .mmi
3265(p0)    ldfe  Arg        =[GR_Parameter_X],16
3266(p0)    ldfs  TWO_TO_NEG2 = [table_ptr2],4
3267        nop.i 999
3268}
3269;;
3270
3271
3272{ .mmb
3273(p0)    ldfe  r =[GR_Parameter_r ],16
3274(p0)    ldfs  NEGTWO_TO_NEG2 = [table_ptr2],4
3275        nop.b 999 ;;
3276}
3277
3278{ .mfi
3279(p0)    ldfe  c =[GR_Parameter_r ]
3280        nop.f 999
3281        nop.i 999 ;;
3282}
3283
3284{ .mfi
3285        nop.m 999
3286//
3287//     Is |r| < 2**(-2)
3288//
3289(p0)   fcmp.lt.unc.s1  p6, p0 = r, TWO_TO_NEG2
3290        mov   b0 = GR_SAVE_B0                  // Restore return address
3291}
3292;;
3293
3294{ .mfi
3295       nop.m 999
3296(p6)   fcmp.gt.unc.s1  p6, p0 = r, NEGTWO_TO_NEG2
3297       mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
3298}
3299;;
3300
3301{ .mbb
3302.restore sp
3303        add   sp = 64,sp                       // Restore stack pointer
3304(p6)   br.cond.spnt TAN_SMALL_R
3305(p0)   br.cond.sptk TAN_NORMAL_R
3306}
3307;;
3308.endp __libm_callout
3309ASM_SIZE_DIRECTIVE(__libm_callout)
3310
3311
3312.proc __libm_TAN_SPECIAL
3313__libm_TAN_SPECIAL:
3314
3315//
3316//     Code for NaNs, Unsupporteds, Infs, or +/- zero ?
3317//     Invalid raised for Infs and SNaNs.
3318//
3319
3320{ .mfb
3321	nop.m 999
3322(p0)   fmpy.s0 Arg = Arg, f0
3323(p0)   br.ret.sptk b0
3324}
3325.endp __libm_TAN_SPECIAL
3326ASM_SIZE_DIRECTIVE(__libm_TAN_SPECIAL)
3327
3328
3329.type __libm_pi_by_2_reduce#,@function
3330.global __libm_pi_by_2_reduce#
3331