1.file "libm_sincos_large.s"
2
3
4// Copyright (c) 2002 - 2003, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39// History
40//==============================================================
41// 02/15/02 Initial version
42// 05/13/02 Changed interface to __libm_pi_by_2_reduce
43// 02/10/03 Reordered header: .section, .global, .proc, .align;
44//          used data8 for long double table values
45// 05/15/03 Reformatted data tables
46//
47//
48// Overview of operation
49//==============================================================
50//
51// These functions calculate the sin and cos for inputs
52// greater than 2^10
53//
54// __libm_sin_large#
55// __libm_cos_large#
56// They accept argument in f8
57// and return result in f8 without final rounding
58//
59// __libm_sincos_large#
60// It accepts argument in f8
61// and returns cos in f8 and sin in f9 without final rounding
62//
63//
64//*********************************************************************
65//
66// Accuracy:       Within .7 ulps for 80-bit floating point values
67//                 Very accurate for double precision values
68//
69//*********************************************************************
70//
71// Resources Used:
72//
73//    Floating-Point Registers: f8 as Input Value, f8 and f9 as Return Values
74//                              f32-f103
75//
76//    General Purpose Registers:
77//      r32-r43
78//      r44-r45 (Used to pass arguments to pi_by_2 reduce routine)
79//
80//    Predicate Registers:      p6-p13
81//
82//*********************************************************************
83//
84//  IEEE Special Conditions:
85//
86//    Denormal  fault raised on denormal inputs
87//    Overflow exceptions do not occur
88//    Underflow exceptions raised when appropriate for sin
89//    (No specialized error handling for this routine)
90//    Inexact raised when appropriate by algorithm
91//
92//    sin(SNaN) = QNaN
93//    sin(QNaN) = QNaN
94//    sin(inf) = QNaN
95//    sin(+/-0) = +/-0
96//    cos(inf) = QNaN
97//    cos(SNaN) = QNaN
98//    cos(QNaN) = QNaN
99//    cos(0) = 1
100//
101//*********************************************************************
102//
103//  Mathematical Description
104//  ========================
105//
106//  The computation of FSIN and FCOS is best handled in one piece of
107//  code. The main reason is that given any argument Arg, computation
108//  of trigonometric functions first calculate N and an approximation
109//  to alpha where
110//
111//  Arg = N pi/2 + alpha, |alpha| <= pi/4.
112//
113//  Since
114//
115//  cos( Arg ) = sin( (N+1) pi/2 + alpha ),
116//
117//  therefore, the code for computing sine will produce cosine as long
118//  as 1 is added to N immediately after the argument reduction
119//  process.
120//
121//  Let M = N if sine
122//      N+1 if cosine.
123//
124//  Now, given
125//
126//  Arg = M pi/2  + alpha, |alpha| <= pi/4,
127//
128//  let I = M mod 4, or I be the two lsb of M when M is represented
129//  as 2's complement. I = [i_0 i_1]. Then
130//
131//  sin( Arg ) = (-1)^i_0  sin( alpha ) if i_1 = 0,
132//             = (-1)^i_0  cos( alpha )     if i_1 = 1.
133//
134//  For example:
135//       if M = -1, I = 11
136//         sin ((-pi/2 + alpha) = (-1) cos (alpha)
137//       if M = 0, I = 00
138//         sin (alpha) = sin (alpha)
139//       if M = 1, I = 01
140//         sin (pi/2 + alpha) = cos (alpha)
141//       if M = 2, I = 10
142//         sin (pi + alpha) = (-1) sin (alpha)
143//       if M = 3, I = 11
144//         sin ((3/2)pi + alpha) = (-1) cos (alpha)
145//
146//  The value of alpha is obtained by argument reduction and
147//  represented by two working precision numbers r and c where
148//
149//  alpha =  r  +  c     accurately.
150//
151//  The reduction method is described in a previous write up.
152//  The argument reduction scheme identifies 4 cases. For Cases 2
153//  and 4, because |alpha| is small, sin(r+c) and cos(r+c) can be
154//  computed very easily by 2 or 3 terms of the Taylor series
155//  expansion as follows:
156//
157//  Case 2:
158//  -------
159//
160//  sin(r + c) = r + c - r^3/6  accurately
161//  cos(r + c) = 1 - 2^(-67)    accurately
162//
163//  Case 4:
164//  -------
165//
166//  sin(r + c) = r + c - r^3/6 + r^5/120    accurately
167//  cos(r + c) = 1 - r^2/2 + r^4/24     accurately
168//
169//  The only cases left are Cases 1 and 3 of the argument reduction
170//  procedure. These two cases will be merged since after the
171//  argument is reduced in either cases, we have the reduced argument
172//  represented as r + c and that the magnitude |r + c| is not small
173//  enough to allow the usage of a very short approximation.
174//
175//  The required calculation is either
176//
177//  sin(r + c)  =  sin(r)  +  correction,  or
178//  cos(r + c)  =  cos(r)  +  correction.
179//
180//  Specifically,
181//
182//  sin(r + c) = sin(r) + c sin'(r) + O(c^2)
183//         = sin(r) + c cos (r) + O(c^2)
184//         = sin(r) + c(1 - r^2/2)  accurately.
185//  Similarly,
186//
187//  cos(r + c) = cos(r) - c sin(r) + O(c^2)
188//         = cos(r) - c(r - r^3/6)  accurately.
189//
190//  We therefore concentrate on accurately calculating sin(r) and
191//  cos(r) for a working-precision number r, |r| <= pi/4 to within
192//  0.1% or so.
193//
194//  The greatest challenge of this task is that the second terms of
195//  the Taylor series
196//
197//  r - r^3/3! + r^r/5! - ...
198//
199//  and
200//
201//  1 - r^2/2! + r^4/4! - ...
202//
203//  are not very small when |r| is close to pi/4 and the rounding
204//  errors will be a concern if simple polynomial accumulation is
205//  used. When |r| < 2^-3, however, the second terms will be small
206//  enough (6 bits or so of right shift) that a normal Horner
207//  recurrence suffices. Hence there are two cases that we consider
208//  in the accurate computation of sin(r) and cos(r), |r| <= pi/4.
209//
210//  Case small_r: |r| < 2^(-3)
211//  --------------------------
212//
213//  Since Arg = M pi/4 + r + c accurately, and M mod 4 is [i_0 i_1],
214//  we have
215//
216//  sin(Arg) = (-1)^i_0 * sin(r + c)    if i_1 = 0
217//       = (-1)^i_0 * cos(r + c)    if i_1 = 1
218//
219//  can be accurately approximated by
220//
221//  sin(Arg) = (-1)^i_0 * [sin(r) + c]  if i_1 = 0
222//           = (-1)^i_0 * [cos(r) - c*r] if i_1 = 1
223//
224//  because |r| is small and thus the second terms in the correction
225//  are unnecessary.
226//
227//  Finally, sin(r) and cos(r) are approximated by polynomials of
228//  moderate lengths.
229//
230//  sin(r) =  r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11
231//  cos(r) =  1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10
232//
233//  We can make use of predicates to selectively calculate
234//  sin(r) or cos(r) based on i_1.
235//
236//  Case normal_r: 2^(-3) <= |r| <= pi/4
237//  ------------------------------------
238//
239//  This case is more likely than the previous one if one considers
240//  r to be uniformly distributed in [-pi/4 pi/4]. Again,
241//
242//  sin(Arg) = (-1)^i_0 * sin(r + c)    if i_1 = 0
243//           = (-1)^i_0 * cos(r + c)    if i_1 = 1.
244//
245//  Because |r| is now larger, we need one extra term in the
246//  correction. sin(Arg) can be accurately approximated by
247//
248//  sin(Arg) = (-1)^i_0 * [sin(r) + c(1-r^2/2)]      if i_1 = 0
249//           = (-1)^i_0 * [cos(r) - c*r*(1 - r^2/6)]    i_1 = 1.
250//
251//  Finally, sin(r) and cos(r) are approximated by polynomials of
252//  moderate lengths.
253//
254//  sin(r) =  r + PP_1_hi r^3 + PP_1_lo r^3 +
255//                PP_2 r^5 + ... + PP_8 r^17
256//
257//  cos(r) =  1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16
258//
259//  where PP_1_hi is only about 16 bits long and QQ_1 is -1/2.
260//  The crux in accurate computation is to calculate
261//
262//  r + PP_1_hi r^3   or  1 + QQ_1 r^2
263//
264//  accurately as two pieces: U_hi and U_lo. The way to achieve this
265//  is to obtain r_hi as a 10 sig. bit number that approximates r to
266//  roughly 8 bits or so of accuracy. (One convenient way is
267//
268//  r_hi := frcpa( frcpa( r ) ).)
269//
270//  This way,
271//
272//  r + PP_1_hi r^3 =  r + PP_1_hi r_hi^3 +
273//                          PP_1_hi (r^3 - r_hi^3)
274//              =  [r + PP_1_hi r_hi^3]  +
275//             [PP_1_hi (r - r_hi)
276//                (r^2 + r_hi r + r_hi^2) ]
277//              =  U_hi  +  U_lo
278//
279//  Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long,
280//  PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed
281//  exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign
282//  and that there is no more than 8 bit shift off between r and
283//  PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus
284//  calculated without any error. Finally, the fact that
285//
286//  |U_lo| <= 2^(-8) |U_hi|
287//
288//  says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly
289//  8 extra bits of accuracy.
290//
291//  Similarly,
292//
293//  1 + QQ_1 r^2  =  [1 + QQ_1 r_hi^2]  +
294//                      [QQ_1 (r - r_hi)(r + r_hi)]
295//            =  U_hi  +  U_lo.
296//
297//  Summarizing, we calculate r_hi = frcpa( frcpa( r ) ).
298//
299//  If i_1 = 0, then
300//
301//    U_hi := r + PP_1_hi * r_hi^3
302//    U_lo := PP_1_hi * (r - r_hi) * (r^2 + r*r_hi + r_hi^2)
303//    poly := PP_1_lo r^3 + PP_2 r^5 + ... + PP_8 r^17
304//    correction := c * ( 1 + C_1 r^2 )
305//
306//  Else ...i_1 = 1
307//
308//    U_hi := 1 + QQ_1 * r_hi * r_hi
309//    U_lo := QQ_1 * (r - r_hi) * (r + r_hi)
310//    poly := QQ_2 * r^4 + QQ_3 * r^6 + ... + QQ_8 r^16
311//    correction := -c * r * (1 + S_1 * r^2)
312//
313//  End
314//
315//  Finally,
316//
317//  V := poly + ( U_lo + correction )
318//
319//                 /    U_hi  +  V         if i_0 = 0
320//  result := |
321//                 \  (-U_hi) -  V         if i_0 = 1
322//
323//  It is important that in the last step, negation of U_hi is
324//  performed prior to the subtraction which is to be performed in
325//  the user-set rounding mode.
326//
327//
328//  Algorithmic Description
329//  =======================
330//
331//  The argument reduction algorithm is tightly integrated into FSIN
332//  and FCOS which share the same code. The following is complete and
333//  self-contained. The argument reduction description given
334//  previously is repeated below.
335//
336//
337//  Step 0. Initialization.
338//
339//   If FSIN is invoked, set N_inc := 0; else if FCOS is invoked,
340//   set N_inc := 1.
341//
342//  Step 1. Check for exceptional and special cases.
343//
344//   * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special
345//     handling.
346//   * If |Arg| < 2^24, go to Step 2 for reduction of moderate
347//     arguments. This is the most likely case.
348//   * If |Arg| < 2^63, go to Step 8 for pre-reduction of large
349//     arguments.
350//   * If |Arg| >= 2^63, go to Step 10 for special handling.
351//
352//  Step 2. Reduction of moderate arguments.
353//
354//  If |Arg| < pi/4     ...quick branch
355//     N_fix := N_inc   (integer)
356//     r     := Arg
357//     c     := 0.0
358//     Branch to Step 4, Case_1_complete
359//  Else        ...cf. argument reduction
360//     N     := Arg * two_by_PI (fp)
361//     N_fix := fcvt.fx( N )    (int)
362//     N     := fcvt.xf( N_fix )
363//     N_fix := N_fix + N_inc
364//     s     := Arg - N * P_1   (first piece of pi/2)
365//     w     := -N * P_2    (second piece of pi/2)
366//
367//     If |s| >= 2^(-33)
368//        go to Step 3, Case_1_reduce
369//     Else
370//        go to Step 7, Case_2_reduce
371//     Endif
372//  Endif
373//
374//  Step 3. Case_1_reduce.
375//
376//  r := s + w
377//  c := (s - r) + w    ...observe order
378//
379//  Step 4. Case_1_complete
380//
381//  ...At this point, the reduced argument alpha is
382//  ...accurately represented as r + c.
383//  If |r| < 2^(-3), go to Step 6, small_r.
384//
385//  Step 5. Normal_r.
386//
387//  Let [i_0 i_1] by the 2 lsb of N_fix.
388//  FR_rsq  := r * r
389//  r_hi := frcpa( frcpa( r ) )
390//  r_lo := r - r_hi
391//
392//  If i_1 = 0, then
393//    poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... FR_rsq*PP_8))
394//    U_hi := r + PP_1_hi*r_hi*r_hi*r_hi    ...any order
395//    U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi)
396//    correction := c + c*C_1*FR_rsq        ...any order
397//  Else
398//    poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... + FR_rsq*QQ_8))
399//    U_hi := 1 + QQ_1 * r_hi * r_hi        ...any order
400//    U_lo := QQ_1 * r_lo * (r + r_hi)
401//    correction := -c*(r + S_1*FR_rsq*r)   ...any order
402//  Endif
403//
404//  V := poly + (U_lo + correction) ...observe order
405//
406//  result := (i_0 == 0?   1.0 : -1.0)
407//
408//  Last instruction in user-set rounding mode
409//
410//  result := (i_0 == 0?   result*U_hi + V :
411//                        result*U_hi - V)
412//
413//  Return
414//
415//  Step 6. Small_r.
416//
417//  ...Use flush to zero mode without causing exception
418//    Let [i_0 i_1] be the two lsb of N_fix.
419//
420//  FR_rsq := r * r
421//
422//  If i_1 = 0 then
423//     z := FR_rsq*FR_rsq; z := FR_rsq*z *r
424//     poly_lo := S_3 + FR_rsq*(S_4 + FR_rsq*S_5)
425//     poly_hi := r*FR_rsq*(S_1 + FR_rsq*S_2)
426//     correction := c
427//     result := r
428//  Else
429//     z := FR_rsq*FR_rsq; z := FR_rsq*z
430//     poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5)
431//     poly_hi := FR_rsq*(C_1 + FR_rsq*C_2)
432//     correction := -c*r
433//     result := 1
434//  Endif
435//
436//  poly := poly_hi + (z * poly_lo + correction)
437//
438//  If i_0 = 1, result := -result
439//
440//  Last operation. Perform in user-set rounding mode
441//
442//  result := (i_0 == 0?     result + poly :
443//                          result - poly )
444//  Return
445//
446//  Step 7. Case_2_reduce.
447//
448//  ...Refer to the write up for argument reduction for
449//  ...rationale. The reduction algorithm below is taken from
450//  ...argument reduction description and integrated this.
451//
452//  w := N*P_3
453//  U_1 := N*P_2 + w        ...FMA
454//  U_2 := (N*P_2 - U_1) + w    ...2 FMA
455//  ...U_1 + U_2 is  N*(P_2+P_3) accurately
456//
457//  r := s - U_1
458//  c := ( (s - r) - U_1 ) - U_2
459//
460//  ...The mathematical sum r + c approximates the reduced
461//  ...argument accurately. Note that although compared to
462//  ...Case 1, this case requires much more work to reduce
463//  ...the argument, the subsequent calculation needed for
464//  ...any of the trigonometric function is very little because
465//  ...|alpha| < 1.01*2^(-33) and thus two terms of the
466//  ...Taylor series expansion suffices.
467//
468//  If i_1 = 0 then
469//     poly := c + S_1 * r * r * r  ...any order
470//     result := r
471//  Else
472//     poly := -2^(-67)
473//     result := 1.0
474//  Endif
475//
476//  If i_0 = 1, result := -result
477//
478//  Last operation. Perform in user-set rounding mode
479//
480//  result := (i_0 == 0?     result + poly :
481//                           result - poly )
482//
483//  Return
484//
485//
486//  Step 8. Pre-reduction of large arguments.
487//
488//  ...Again, the following reduction procedure was described
489//  ...in the separate write up for argument reduction, which
490//  ...is tightly integrated here.
491
492//  N_0 := Arg * Inv_P_0
493//  N_0_fix := fcvt.fx( N_0 )
494//  N_0 := fcvt.xf( N_0_fix)
495
496//  Arg' := Arg - N_0 * P_0
497//  w := N_0 * d_1
498//  N := Arg' * two_by_PI
499//  N_fix := fcvt.fx( N )
500//  N := fcvt.xf( N_fix )
501//  N_fix := N_fix + N_inc
502//
503//  s := Arg' - N * P_1
504//  w := w - N * P_2
505//
506//  If |s| >= 2^(-14)
507//     go to Step 3
508//  Else
509//     go to Step 9
510//  Endif
511//
512//  Step 9. Case_4_reduce.
513//
514//    ...first obtain N_0*d_1 and -N*P_2 accurately
515//   U_hi := N_0 * d_1      V_hi := -N*P_2
516//   U_lo := N_0 * d_1 - U_hi   V_lo := -N*P_2 - U_hi   ...FMAs
517//
518//   ...compute the contribution from N_0*d_1 and -N*P_3
519//   w := -N*P_3
520//   w := w + N_0*d_2
521//   t := U_lo + V_lo + w       ...any order
522//
523//   ...at this point, the mathematical value
524//   ...s + U_hi + V_hi  + t approximates the true reduced argument
525//   ...accurately. Just need to compute this accurately.
526//
527//   ...Calculate U_hi + V_hi accurately:
528//   A := U_hi + V_hi
529//   if |U_hi| >= |V_hi| then
530//      a := (U_hi - A) + V_hi
531//   else
532//      a := (V_hi - A) + U_hi
533//   endif
534//   ...order in computing "a" must be observed. This branch is
535//   ...best implemented by predicates.
536//   ...A + a  is U_hi + V_hi accurately. Moreover, "a" is
537//   ...much smaller than A: |a| <= (1/2)ulp(A).
538//
539//   ...Just need to calculate   s + A + a + t
540//   C_hi := s + A      t := t + a
541//   C_lo := (s - C_hi) + A
542//   C_lo := C_lo + t
543//
544//   ...Final steps for reduction
545//   r := C_hi + C_lo
546//   c := (C_hi - r) + C_lo
547//
548//   ...At this point, we have r and c
549//   ...And all we need is a couple of terms of the corresponding
550//   ...Taylor series.
551//
552//   If i_1 = 0
553//      poly := c + r*FR_rsq*(S_1 + FR_rsq*S_2)
554//      result := r
555//   Else
556//      poly := FR_rsq*(C_1 + FR_rsq*C_2)
557//      result := 1
558//   Endif
559//
560//   If i_0 = 1, result := -result
561//
562//   Last operation. Perform in user-set rounding mode
563//
564//   result := (i_0 == 0?     result + poly :
565//                            result - poly )
566//   Return
567//
568//   Large Arguments: For arguments above 2**63, a Payne-Hanek
569//   style argument reduction is used and pi_by_2 reduce is called.
570//
571
572
573RODATA
574.align 16
575
576LOCAL_OBJECT_START(FSINCOS_CONSTANTS)
577
578data4 0x4B800000 // two**24
579data4 0xCB800000 // -two**24
580data4 0x00000000 // pad
581data4 0x00000000 // pad
582data8 0xA2F9836E4E44152A, 0x00003FFE // Inv_pi_by_2
583data8 0xC84D32B0CE81B9F1, 0x00004016 // P_0
584data8 0xC90FDAA22168C235, 0x00003FFF // P_1
585data8 0xECE675D1FC8F8CBB, 0x0000BFBD // P_2
586data8 0xB7ED8FBBACC19C60, 0x0000BF7C // P_3
587data4 0x5F000000 // two**63
588data4 0xDF000000 // -two**63
589data4 0x00000000 // pad
590data4 0x00000000 // pad
591data8 0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0
592data8 0x8D848E89DBD171A1, 0x0000BFBF // d_1
593data8 0xD5394C3618A66F8E, 0x0000BF7C // d_2
594data8 0xC90FDAA22168C234, 0x00003FFE // pi_by_4
595data8 0xC90FDAA22168C234, 0x0000BFFE // neg_pi_by_4
596data4 0x3E000000 // two**-3
597data4 0xBE000000 // -two**-3
598data4 0x00000000 // pad
599data4 0x00000000 // pad
600data4 0x2F000000 // two**-33
601data4 0xAF000000 // -two**-33
602data4 0x9E000000 // -two**-67
603data4 0x00000000 // pad
604data8 0xCC8ABEBCA21C0BC9, 0x00003FCE // PP_8
605data8 0xD7468A05720221DA, 0x0000BFD6 // PP_7
606data8 0xB092382F640AD517, 0x00003FDE // PP_6
607data8 0xD7322B47D1EB75A4, 0x0000BFE5 // PP_5
608data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1
609data8 0xAAAA000000000000, 0x0000BFFC // PP_1_hi
610data8 0xB8EF1D2ABAF69EEA, 0x00003FEC // PP_4
611data8 0xD00D00D00D03BB69, 0x0000BFF2 // PP_3
612data8 0x8888888888888962, 0x00003FF8 // PP_2
613data8 0xAAAAAAAAAAAB0000, 0x0000BFEC // PP_1_lo
614data8 0xD56232EFC2B0FE52, 0x00003FD2 // QQ_8
615data8 0xC9C99ABA2B48DCA6, 0x0000BFDA // QQ_7
616data8 0x8F76C6509C716658, 0x00003FE2 // QQ_6
617data8 0x93F27DBAFDA8D0FC, 0x0000BFE9 // QQ_5
618data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1
619data8 0x8000000000000000, 0x0000BFFE // QQ_1
620data8 0xD00D00D00C6E5041, 0x00003FEF // QQ_4
621data8 0xB60B60B60B607F60, 0x0000BFF5 // QQ_3
622data8 0xAAAAAAAAAAAAAA9B, 0x00003FFA // QQ_2
623data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1
624data8 0xAAAAAAAAAAAA719F, 0x00003FFA // C_2
625data8 0xB60B60B60356F994, 0x0000BFF5 // C_3
626data8 0xD00CFFD5B2385EA9, 0x00003FEF // C_4
627data8 0x93E4BD18292A14CD, 0x0000BFE9 // C_5
628data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1
629data8 0x88888888888868DB, 0x00003FF8 // S_2
630data8 0xD00D00D0055EFD4B, 0x0000BFF2 // S_3
631data8 0xB8EF1C5D839730B9, 0x00003FEC // S_4
632data8 0xD71EA3A4E5B3F492, 0x0000BFE5 // S_5
633data4 0x38800000 // two**-14
634data4 0xB8800000 // -two**-14
635LOCAL_OBJECT_END(FSINCOS_CONSTANTS)
636
637// sin and cos registers
638
639// FR
640FR_Input_X        = f8
641
642FR_r              = f8
643FR_c              = f9
644
645FR_Two_to_63      = f32
646FR_Two_to_24      = f33
647FR_Pi_by_4        = f33
648FR_Two_to_M14     = f34
649FR_Two_to_M33     = f35
650FR_Neg_Two_to_24  = f36
651FR_Neg_Pi_by_4    = f36
652FR_Neg_Two_to_M14 = f37
653FR_Neg_Two_to_M33 = f38
654FR_Neg_Two_to_M67 = f39
655FR_Inv_pi_by_2    = f40
656FR_N_float        = f41
657FR_N_fix          = f42
658FR_P_1            = f43
659FR_P_2            = f44
660FR_P_3            = f45
661FR_s              = f46
662FR_w              = f47
663FR_d_2            = f48
664FR_prelim         = f49
665FR_Z              = f50
666FR_A              = f51
667FR_a              = f52
668FR_t              = f53
669FR_U_1            = f54
670FR_U_2            = f55
671FR_C_1            = f56
672FR_C_2            = f57
673FR_C_3            = f58
674FR_C_4            = f59
675FR_C_5            = f60
676FR_S_1            = f61
677FR_S_2            = f62
678FR_S_3            = f63
679FR_S_4            = f64
680FR_S_5            = f65
681FR_poly_hi        = f66
682FR_poly_lo        = f67
683FR_r_hi           = f68
684FR_r_lo           = f69
685FR_rsq            = f70
686FR_r_cubed        = f71
687FR_C_hi           = f72
688FR_N_0            = f73
689FR_d_1            = f74
690FR_V              = f75
691FR_V_hi           = f75
692FR_V_lo           = f76
693FR_U_hi           = f77
694FR_U_lo           = f78
695FR_U_hiabs        = f79
696FR_V_hiabs        = f80
697FR_PP_8           = f81
698FR_QQ_8           = f81
699FR_PP_7           = f82
700FR_QQ_7           = f82
701FR_PP_6           = f83
702FR_QQ_6           = f83
703FR_PP_5           = f84
704FR_QQ_5           = f84
705FR_PP_4           = f85
706FR_QQ_4           = f85
707FR_PP_3           = f86
708FR_QQ_3           = f86
709FR_PP_2           = f87
710FR_QQ_2           = f87
711FR_QQ_1           = f88
712FR_N_0_fix        = f89
713FR_Inv_P_0        = f90
714FR_corr           = f91
715FR_poly           = f92
716FR_Neg_Two_to_M3  = f93
717FR_Two_to_M3      = f94
718FR_Neg_Two_to_63  = f94
719FR_P_0            = f95
720FR_C_lo           = f96
721FR_PP_1           = f97
722FR_PP_1_lo        = f98
723FR_ArgPrime       = f99
724
725// GR
726GR_Table_Base     = r32
727GR_Table_Base1    = r33
728GR_i_0            = r34
729GR_i_1            = r35
730GR_N_Inc          = r36
731GR_Sin_or_Cos     = r37
732
733GR_SAVE_B0        = r39
734GR_SAVE_GP        = r40
735GR_SAVE_PFS       = r41
736
737// sincos combined routine registers
738
739// GR
740GR_SINCOS_SAVE_PFS    = r32
741GR_SINCOS_SAVE_B0     = r33
742GR_SINCOS_SAVE_GP     = r34
743
744// FR
745FR_SINCOS_ARG         = f100
746FR_SINCOS_RES_SIN     = f101
747
748
749.section .text
750
751
752GLOBAL_LIBM_ENTRY(__libm_sincos_large)
753
754{ .mfi
755        alloc GR_SINCOS_SAVE_PFS = ar.pfs,0,3,0,0
756        fma.s1 FR_SINCOS_ARG     = f8, f1, f0  // Save argument for sin and cos
757        mov GR_SINCOS_SAVE_B0    = b0
758};;
759
760{ .mfb
761        mov GR_SINCOS_SAVE_GP    = gp
762        nop.f  0
763        br.call.sptk b0          = __libm_sin_large // Call sin
764};;
765
766{ .mfi
767        nop.m  0
768        fma.s1 FR_SINCOS_RES_SIN = f8, f1, f0 // Save sin result
769        nop.i  0
770};;
771
772{ .mfb
773        nop.m  0
774        fma.s1 f8                = FR_SINCOS_ARG, f1, f0 // Arg for cos
775        br.call.sptk b0          = __libm_cos_large // Call cos
776};;
777
778{ .mfi
779        mov    gp                = GR_SINCOS_SAVE_GP
780        fma.s1 f9                = FR_SINCOS_RES_SIN, f1, f0 // Out sin result
781        mov    b0                = GR_SINCOS_SAVE_B0
782};;
783
784{ .mib
785        nop.m  0
786        mov ar.pfs               = GR_SINCOS_SAVE_PFS
787        br.ret.sptk                b0 // sincos_large exit
788};;
789
790GLOBAL_LIBM_END(__libm_sincos_large)
791
792
793
794
795GLOBAL_LIBM_ENTRY(__libm_sin_large)
796
797{ .mlx
798alloc GR_Table_Base = ar.pfs,0,12,2,0
799       movl GR_Sin_or_Cos = 0x0 ;;
800}
801
802{ .mmi
803      nop.m 999
804      addl           GR_Table_Base   = @ltoff(FSINCOS_CONSTANTS#), gp
805      nop.i 999
806}
807;;
808
809{ .mmi
810      ld8 GR_Table_Base = [GR_Table_Base]
811      nop.m 999
812      nop.i 999
813}
814;;
815
816
817{ .mib
818      nop.m 999
819      nop.i 999
820       br.cond.sptk SINCOS_CONTINUE ;;
821}
822
823GLOBAL_LIBM_END(__libm_sin_large)
824
825GLOBAL_LIBM_ENTRY(__libm_cos_large)
826
827{ .mlx
828alloc GR_Table_Base= ar.pfs,0,12,2,0
829       movl GR_Sin_or_Cos = 0x1 ;;
830}
831
832{ .mmi
833      nop.m 999
834      addl           GR_Table_Base   = @ltoff(FSINCOS_CONSTANTS#), gp
835      nop.i 999
836}
837;;
838
839{ .mmi
840      ld8 GR_Table_Base = [GR_Table_Base]
841      nop.m 999
842      nop.i 999
843}
844;;
845
846//
847//     Load Table Address
848//
849SINCOS_CONTINUE:
850
851{ .mmi
852       add GR_Table_Base1 = 96, GR_Table_Base
853       ldfs FR_Two_to_24 = [GR_Table_Base], 4
854       nop.i 999
855}
856;;
857
858{ .mmi
859      nop.m 999
860//
861//     Load 2**24, load 2**63.
862//
863       ldfs FR_Neg_Two_to_24 = [GR_Table_Base], 12
864       mov   r41 = ar.pfs ;;
865}
866
867{ .mfi
868       ldfs FR_Two_to_63 = [GR_Table_Base1], 4
869//
870//     Check for unnormals - unsupported operands. We do not want
871//     to generate denormal exception
872//     Check for NatVals, QNaNs, SNaNs, +/-Infs
873//     Check for EM unsupporteds
874//     Check for Zero
875//
876       fclass.m.unc  p6, p8 =  FR_Input_X, 0x1E3
877       mov   r40 = gp ;;
878}
879
880{ .mfi
881      nop.m 999
882       fclass.nm.unc p8, p0 =  FR_Input_X, 0x1FF
883// GR_Sin_or_Cos denotes
884       mov   r39 = b0
885}
886
887{ .mfb
888       ldfs FR_Neg_Two_to_63 = [GR_Table_Base1], 12
889       fclass.m.unc p10, p0 = FR_Input_X, 0x007
890(p6)   br.cond.spnt SINCOS_SPECIAL ;;
891}
892
893{ .mib
894      nop.m 999
895      nop.i 999
896(p8)   br.cond.spnt SINCOS_SPECIAL ;;
897}
898
899{ .mib
900      nop.m 999
901      nop.i 999
902//
903//     Branch if +/- NaN, Inf.
904//     Load -2**24, load -2**63.
905//
906(p10)  br.cond.spnt SINCOS_ZERO ;;
907}
908
909{ .mmb
910       ldfe FR_Inv_pi_by_2 = [GR_Table_Base], 16
911       ldfe FR_Inv_P_0 = [GR_Table_Base1], 16
912      nop.b 999 ;;
913}
914
915{ .mmb
916      nop.m 999
917       ldfe     FR_d_1 = [GR_Table_Base1], 16
918      nop.b 999 ;;
919}
920//
921//     Raise possible denormal operand flag with useful fcmp
922//     Is x <= -2**63
923//     Load Inv_P_0 for pre-reduction
924//     Load Inv_pi_by_2
925//
926
927{ .mmb
928       ldfe     FR_P_0 = [GR_Table_Base], 16
929       ldfe FR_d_2 = [GR_Table_Base1], 16
930      nop.b 999 ;;
931}
932//
933//     Load P_0
934//     Load d_1
935//     Is x >= 2**63
936//     Is x <= -2**24?
937//
938
939{ .mmi
940       ldfe FR_P_1 = [GR_Table_Base], 16 ;;
941//
942//     Load P_1
943//     Load d_2
944//     Is x >= 2**24?
945//
946       ldfe FR_P_2 = [GR_Table_Base], 16
947      nop.i 999 ;;
948}
949
950{ .mmf
951      nop.m 999
952       ldfe FR_P_3 = [GR_Table_Base], 16
953       fcmp.le.unc.s1   p7, p8 = FR_Input_X, FR_Neg_Two_to_24
954}
955
956{ .mfi
957      nop.m 999
958//
959//     Branch if +/- zero.
960//     Decide about the paths to take:
961//     If -2**24 < FR_Input_X < 2**24 - CASE 1 OR 2
962//     OTHERWISE - CASE 3 OR 4
963//
964       fcmp.le.unc.s1   p10, p11 = FR_Input_X, FR_Neg_Two_to_63
965      nop.i 999 ;;
966}
967
968{ .mfi
969      nop.m 999
970(p8)   fcmp.ge.s1 p7, p0 = FR_Input_X, FR_Two_to_24
971      nop.i 999
972}
973
974{ .mfi
975       ldfe FR_Pi_by_4 = [GR_Table_Base1], 16
976(p11)  fcmp.ge.s1   p10, p0 = FR_Input_X, FR_Two_to_63
977      nop.i 999 ;;
978}
979
980{ .mmi
981       ldfe FR_Neg_Pi_by_4 = [GR_Table_Base1], 16 ;;
982       ldfs FR_Two_to_M3 = [GR_Table_Base1], 4
983      nop.i 999 ;;
984}
985
986{ .mib
987       ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1], 12
988      nop.i 999
989//
990//     Load P_2
991//     Load P_3
992//     Load pi_by_4
993//     Load neg_pi_by_4
994//     Load 2**(-3)
995//     Load -2**(-3).
996//
997(p10)  br.cond.spnt SINCOS_ARG_TOO_LARGE ;;
998}
999
1000{ .mib
1001      nop.m 999
1002      nop.i 999
1003//
1004//     Branch out if x >= 2**63. Use Payne-Hanek Reduction
1005//
1006(p7)   br.cond.spnt SINCOS_LARGER_ARG ;;
1007}
1008
1009{ .mfi
1010      nop.m 999
1011//
1012//     Branch if Arg <= -2**24 or Arg >= 2**24 and use pre-reduction.
1013//
1014       fma.s1   FR_N_float = FR_Input_X, FR_Inv_pi_by_2, f0
1015      nop.i 999 ;;
1016}
1017
1018{ .mfi
1019      nop.m 999
1020       fcmp.lt.unc.s1   p6, p7 = FR_Input_X, FR_Pi_by_4
1021      nop.i 999 ;;
1022}
1023
1024{ .mfi
1025      nop.m 999
1026//
1027//     Select the case when |Arg| < pi/4
1028//     Else Select the case when |Arg| >= pi/4
1029//
1030       fcvt.fx.s1 FR_N_fix = FR_N_float
1031      nop.i 999 ;;
1032}
1033
1034{ .mfi
1035      nop.m 999
1036//
1037//     N  = Arg * 2/pi
1038//     Check if Arg < pi/4
1039//
1040(p6)   fcmp.gt.s1 p6, p7 = FR_Input_X, FR_Neg_Pi_by_4
1041      nop.i 999 ;;
1042}
1043//
1044//     Case 2: Convert integer N_fix back to normalized floating-point value.
1045//     Case 1: p8 is only affected  when p6 is set
1046//
1047
1048{ .mfi
1049(p7)   ldfs FR_Two_to_M33 = [GR_Table_Base1], 4
1050//
1051//     Grab the integer part of N and call it N_fix
1052//
1053(p6)   fmerge.se FR_r = FR_Input_X, FR_Input_X
1054//     If |x| < pi/4, r = x and c = 0
1055//     lf |x| < pi/4, is x < 2**(-3).
1056//     r = Arg
1057//     c = 0
1058(p6)   mov GR_N_Inc = GR_Sin_or_Cos ;;
1059}
1060
1061{ .mmf
1062      nop.m 999
1063(p7)   ldfs FR_Neg_Two_to_M33 = [GR_Table_Base1], 4
1064(p6)   fmerge.se FR_c = f0, f0
1065}
1066
1067{ .mfi
1068      nop.m 999
1069(p6)   fcmp.lt.unc.s1   p8, p9 = FR_Input_X, FR_Two_to_M3
1070      nop.i 999 ;;
1071}
1072
1073{ .mfi
1074      nop.m 999
1075//
1076//     lf |x| < pi/4, is -2**(-3)< x < 2**(-3) - set p8.
1077//     If |x| >= pi/4,
1078//     Create the right N for |x| < pi/4 and otherwise
1079//     Case 2: Place integer part of N in GP register
1080//
1081(p7)   fcvt.xf FR_N_float = FR_N_fix
1082      nop.i 999 ;;
1083}
1084
1085{ .mmf
1086      nop.m 999
1087(p7)   getf.sig GR_N_Inc = FR_N_fix
1088(p8)   fcmp.gt.s1 p8, p0 = FR_Input_X, FR_Neg_Two_to_M3 ;;
1089}
1090
1091{ .mib
1092      nop.m 999
1093      nop.i 999
1094//
1095//     Load 2**(-33), -2**(-33)
1096//
1097(p8)   br.cond.spnt SINCOS_SMALL_R ;;
1098}
1099
1100{ .mib
1101      nop.m 999
1102      nop.i 999
1103(p6)   br.cond.sptk SINCOS_NORMAL_R ;;
1104}
1105//
1106//     if |x| < pi/4, branch based on |x| < 2**(-3) or otherwise.
1107//
1108//
1109//     In this branch, |x| >= pi/4.
1110//
1111
1112{ .mfi
1113       ldfs FR_Neg_Two_to_M67 = [GR_Table_Base1], 8
1114//
1115//     Load -2**(-67)
1116//
1117       fnma.s1  FR_s = FR_N_float, FR_P_1, FR_Input_X
1118//
1119//     w = N * P_2
1120//     s = -N * P_1  + Arg
1121//
1122       add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos
1123}
1124
1125{ .mfi
1126      nop.m 999
1127       fma.s1   FR_w = FR_N_float, FR_P_2, f0
1128      nop.i 999 ;;
1129}
1130
1131{ .mfi
1132      nop.m 999
1133//
1134//     Adjust N_fix by N_inc to determine whether sine or
1135//     cosine is being calculated
1136//
1137       fcmp.lt.unc.s1 p7, p6 = FR_s, FR_Two_to_M33
1138      nop.i 999 ;;
1139}
1140
1141{ .mfi
1142      nop.m 999
1143(p7)   fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33
1144      nop.i 999 ;;
1145}
1146
1147{ .mfi
1148      nop.m 999
1149//     Remember x >= pi/4.
1150//     Is s <= -2**(-33) or s >= 2**(-33) (p6)
1151//     or -2**(-33) < s < 2**(-33) (p7)
1152(p6)   fms.s1 FR_r = FR_s, f1, FR_w
1153      nop.i 999
1154}
1155
1156{ .mfi
1157      nop.m 999
1158(p7)   fma.s1 FR_w = FR_N_float, FR_P_3, f0
1159      nop.i 999 ;;
1160}
1161
1162{ .mfi
1163      nop.m 999
1164(p7)   fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w
1165      nop.i 999
1166}
1167
1168{ .mfi
1169      nop.m 999
1170(p6)   fms.s1 FR_c = FR_s, f1, FR_r
1171      nop.i 999 ;;
1172}
1173
1174{ .mfi
1175      nop.m 999
1176//
1177//     For big s: r = s - w: No futher reduction is necessary
1178//     For small s: w = N * P_3 (change sign) More reduction
1179//
1180(p6)   fcmp.lt.unc.s1 p8, p9 = FR_r, FR_Two_to_M3
1181      nop.i 999 ;;
1182}
1183
1184{ .mfi
1185      nop.m 999
1186(p8)   fcmp.gt.s1 p8, p9 = FR_r, FR_Neg_Two_to_M3
1187      nop.i 999 ;;
1188}
1189
1190{ .mfi
1191      nop.m 999
1192(p7)   fms.s1 FR_r = FR_s, f1, FR_U_1
1193      nop.i 999
1194}
1195
1196{ .mfb
1197      nop.m 999
1198//
1199//     For big s: Is |r| < 2**(-3)?
1200//     For big s: c = S - r
1201//     For small s: U_1 = N * P_2 + w
1202//
1203//     If p8 is set, prepare to branch to Small_R.
1204//     If p9 is set, prepare to branch to Normal_R.
1205//     For big s,  r is complete here.
1206//
1207(p6)   fms.s1 FR_c = FR_c, f1, FR_w
1208//
1209//     For big s: c = c + w (w has not been negated.)
1210//     For small s: r = S - U_1
1211//
1212(p8)   br.cond.spnt SINCOS_SMALL_R ;;
1213}
1214
1215{ .mib
1216      nop.m 999
1217      nop.i 999
1218(p9)   br.cond.sptk SINCOS_NORMAL_R ;;
1219}
1220
1221{ .mfi
1222(p7)   add GR_Table_Base1 = 224, GR_Table_Base1
1223//
1224//     Branch to SINCOS_SMALL_R or SINCOS_NORMAL_R
1225//
1226(p7)   fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1
1227//
1228//     c = S - U_1
1229//     r = S_1 * r
1230//
1231//
1232(p7)   extr.u   GR_i_1 = GR_N_Inc, 0, 1
1233}
1234
1235{ .mmi
1236      nop.m 999 ;;
1237//
1238//     Get [i_0,i_1] - two lsb of N_fix_gr.
1239//     Do dummy fmpy so inexact is always set.
1240//
1241(p7)   cmp.eq.unc p9, p10 = 0x0, GR_i_1
1242(p7)   extr.u   GR_i_0 = GR_N_Inc, 1, 1 ;;
1243}
1244//
1245//     For small s: U_2 = N * P_2 - U_1
1246//     S_1 stored constant - grab the one stored with the
1247//     coefficients.
1248//
1249
1250{ .mfi
1251(p7)   ldfe FR_S_1 = [GR_Table_Base1], 16
1252//
1253//     Check if i_1 and i_0  != 0
1254//
1255(p10)  fma.s1   FR_poly = f0, f1, FR_Neg_Two_to_M67
1256(p7)   cmp.eq.unc p11, p12 = 0x0, GR_i_0 ;;
1257}
1258
1259{ .mfi
1260      nop.m 999
1261(p7)   fms.s1   FR_s = FR_s, f1, FR_r
1262      nop.i 999
1263}
1264
1265{ .mfi
1266      nop.m 999
1267//
1268//     S = S - r
1269//     U_2 = U_2 + w
1270//     load S_1
1271//
1272(p7)   fma.s1   FR_rsq = FR_r, FR_r, f0
1273      nop.i 999 ;;
1274}
1275
1276{ .mfi
1277      nop.m 999
1278(p7)   fma.s1   FR_U_2 = FR_U_2, f1, FR_w
1279      nop.i 999
1280}
1281
1282{ .mfi
1283      nop.m 999
1284//(p7)   fmerge.se FR_Input_X = FR_r, FR_r
1285(p7)   fmerge.se FR_prelim = FR_r, FR_r
1286      nop.i 999 ;;
1287}
1288
1289{ .mfi
1290      nop.m 999
1291//(p10)  fma.s1 FR_Input_X = f0, f1, f1
1292(p10)  fma.s1 FR_prelim = f0, f1, f1
1293      nop.i 999 ;;
1294}
1295
1296{ .mfi
1297      nop.m 999
1298//
1299//     FR_rsq = r * r
1300//     Save r as the result.
1301//
1302(p7)   fms.s1   FR_c = FR_s, f1, FR_U_1
1303      nop.i 999 ;;
1304}
1305
1306{ .mfi
1307      nop.m 999
1308//
1309//     if ( i_1 ==0) poly = c + S_1*r*r*r
1310//     else Result = 1
1311//
1312//(p12)  fnma.s1 FR_Input_X = FR_Input_X, f1, f0
1313(p12)  fnma.s1 FR_prelim = FR_prelim, f1, f0
1314      nop.i 999
1315}
1316
1317{ .mfi
1318      nop.m 999
1319(p7)   fma.s1   FR_r = FR_S_1, FR_r, f0
1320      nop.i 999 ;;
1321}
1322
1323{ .mfi
1324      nop.m 999
1325(p7)   fma.d.s1 FR_S_1 = FR_S_1, FR_S_1, f0
1326      nop.i 999 ;;
1327}
1328
1329{ .mfi
1330      nop.m 999
1331//
1332//     If i_1 != 0, poly = 2**(-67)
1333//
1334(p7)   fms.s1 FR_c = FR_c, f1, FR_U_2
1335      nop.i 999 ;;
1336}
1337
1338{ .mfi
1339      nop.m 999
1340//
1341//     c = c - U_2
1342//
1343(p9)   fma.s1 FR_poly = FR_r, FR_rsq, FR_c
1344      nop.i 999 ;;
1345}
1346
1347{ .mfi
1348      nop.m 999
1349//
1350//     i_0 != 0, so Result = -Result
1351//
1352(p11)  fma.s1 FR_Input_X = FR_prelim, f1, FR_poly
1353      nop.i 999 ;;
1354}
1355
1356{ .mfb
1357      nop.m 999
1358(p12)  fms.s1 FR_Input_X = FR_prelim, f1, FR_poly
1359//
1360//     if (i_0 == 0),  Result = Result + poly
1361//     else            Result = Result - poly
1362//
1363       br.ret.sptk   b0 ;;
1364}
1365SINCOS_LARGER_ARG:
1366
1367{ .mfi
1368      nop.m 999
1369       fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0
1370      nop.i 999
1371}
1372;;
1373
1374//     This path for argument > 2*24
1375//     Adjust table_ptr1 to beginning of table.
1376//
1377
1378{ .mmi
1379      nop.m 999
1380      addl           GR_Table_Base   = @ltoff(FSINCOS_CONSTANTS#), gp
1381      nop.i 999
1382}
1383;;
1384
1385{ .mmi
1386      ld8 GR_Table_Base = [GR_Table_Base]
1387      nop.m 999
1388      nop.i 999
1389}
1390;;
1391
1392
1393//
1394//     Point to  2*-14
1395//     N_0 = Arg * Inv_P_0
1396//
1397
1398{ .mmi
1399       add GR_Table_Base = 688, GR_Table_Base ;;
1400       ldfs FR_Two_to_M14 = [GR_Table_Base], 4
1401      nop.i 999 ;;
1402}
1403
1404{ .mfi
1405       ldfs FR_Neg_Two_to_M14 = [GR_Table_Base], 0
1406      nop.f 999
1407      nop.i 999 ;;
1408}
1409
1410{ .mfi
1411      nop.m 999
1412//
1413//     Load values 2**(-14) and -2**(-14)
1414//
1415       fcvt.fx.s1 FR_N_0_fix = FR_N_0
1416      nop.i 999 ;;
1417}
1418
1419{ .mfi
1420      nop.m 999
1421//
1422//     N_0_fix  = integer part of N_0
1423//
1424       fcvt.xf FR_N_0 = FR_N_0_fix
1425      nop.i 999 ;;
1426}
1427
1428{ .mfi
1429      nop.m 999
1430//
1431//     Make N_0 the integer part
1432//
1433       fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X
1434      nop.i 999
1435}
1436
1437{ .mfi
1438      nop.m 999
1439       fma.s1 FR_w = FR_N_0, FR_d_1, f0
1440      nop.i 999 ;;
1441}
1442
1443{ .mfi
1444      nop.m 999
1445//
1446//     Arg' = -N_0 * P_0 + Arg
1447//     w  = N_0 * d_1
1448//
1449       fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0
1450      nop.i 999 ;;
1451}
1452
1453{ .mfi
1454      nop.m 999
1455//
1456//     N = A' * 2/pi
1457//
1458       fcvt.fx.s1 FR_N_fix = FR_N_float
1459      nop.i 999 ;;
1460}
1461
1462{ .mfi
1463      nop.m 999
1464//
1465//     N_fix is the integer part
1466//
1467       fcvt.xf FR_N_float = FR_N_fix
1468      nop.i 999 ;;
1469}
1470
1471{ .mfi
1472       getf.sig GR_N_Inc = FR_N_fix
1473      nop.f 999
1474      nop.i 999 ;;
1475}
1476
1477{ .mii
1478      nop.m 999
1479      nop.i 999 ;;
1480       add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;;
1481}
1482
1483{ .mfi
1484      nop.m 999
1485//
1486//     N is the integer part of the reduced-reduced argument.
1487//     Put the integer in a GP register
1488//
1489       fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime
1490      nop.i 999
1491}
1492
1493{ .mfi
1494      nop.m 999
1495       fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w
1496      nop.i 999 ;;
1497}
1498
1499{ .mfi
1500      nop.m 999
1501//
1502//     s = -N*P_1 + Arg'
1503//     w = -N*P_2 + w
1504//     N_fix_gr = N_fix_gr + N_inc
1505//
1506       fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14
1507      nop.i 999 ;;
1508}
1509
1510{ .mfi
1511      nop.m 999
1512(p9)   fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14
1513      nop.i 999 ;;
1514}
1515
1516{ .mfi
1517      nop.m 999
1518//
1519//     For |s|  > 2**(-14) r = S + w (r complete)
1520//     Else       U_hi = N_0 * d_1
1521//
1522(p9)   fma.s1 FR_V_hi = FR_N_float, FR_P_2, f0
1523      nop.i 999
1524}
1525
1526{ .mfi
1527      nop.m 999
1528(p9)   fma.s1 FR_U_hi = FR_N_0, FR_d_1, f0
1529      nop.i 999 ;;
1530}
1531
1532{ .mfi
1533      nop.m 999
1534//
1535//     Either S <= -2**(-14) or S >= 2**(-14)
1536//     or -2**(-14) < s < 2**(-14)
1537//
1538(p8)   fma.s1 FR_r = FR_s, f1, FR_w
1539      nop.i 999
1540}
1541
1542{ .mfi
1543      nop.m 999
1544(p9)   fma.s1 FR_w = FR_N_float, FR_P_3, f0
1545      nop.i 999 ;;
1546}
1547
1548{ .mfi
1549      nop.m 999
1550//
1551//     We need abs of both U_hi and V_hi - don't
1552//     worry about switched sign of V_hi.
1553//
1554(p9)   fms.s1 FR_A = FR_U_hi, f1, FR_V_hi
1555      nop.i 999
1556}
1557
1558{ .mfi
1559      nop.m 999
1560//
1561//     Big s: finish up c = (S - r) + w (c complete)
1562//     Case 4: A =  U_hi + V_hi
1563//     Note: Worry about switched sign of V_hi, so subtract instead of add.
1564//
1565(p9)   fnma.s1 FR_V_lo = FR_N_float, FR_P_2, FR_V_hi
1566      nop.i 999 ;;
1567}
1568
1569{ .mfi
1570      nop.m 999
1571(p9)   fms.s1 FR_U_lo = FR_N_0, FR_d_1, FR_U_hi
1572      nop.i 999 ;;
1573}
1574
1575{ .mfi
1576      nop.m 999
1577(p9)   fmerge.s FR_V_hiabs = f0, FR_V_hi
1578      nop.i 999
1579}
1580
1581{ .mfi
1582      nop.m 999
1583//     For big s: c = S - r
1584//     For small s do more work: U_lo = N_0 * d_1 - U_hi
1585//
1586(p9)   fmerge.s FR_U_hiabs = f0, FR_U_hi
1587      nop.i 999 ;;
1588}
1589
1590{ .mfi
1591      nop.m 999
1592//
1593//     For big s: Is |r| < 2**(-3)
1594//     For big s: if p12 set, prepare to branch to Small_R.
1595//     For big s: If p13 set, prepare to branch to Normal_R.
1596//
1597(p8)   fms.s1 FR_c = FR_s, f1, FR_r
1598      nop.i 999
1599}
1600
1601{ .mfi
1602      nop.m 999
1603//
1604//     For small S: V_hi = N * P_2
1605//                  w = N * P_3
1606//     Note the product does not include the (-) as in the writeup
1607//     so (-) missing for V_hi and w.
1608//
1609(p8)   fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3
1610      nop.i 999 ;;
1611}
1612
1613{ .mfi
1614      nop.m 999
1615(p12)  fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3
1616      nop.i 999 ;;
1617}
1618
1619{ .mfi
1620      nop.m 999
1621(p8)   fma.s1 FR_c = FR_c, f1, FR_w
1622      nop.i 999
1623}
1624
1625{ .mfb
1626      nop.m 999
1627(p9)   fms.s1 FR_w = FR_N_0, FR_d_2, FR_w
1628(p12)  br.cond.spnt SINCOS_SMALL_R ;;
1629}
1630
1631{ .mib
1632      nop.m 999
1633      nop.i 999
1634(p13)  br.cond.sptk SINCOS_NORMAL_R ;;
1635}
1636
1637{ .mfi
1638      nop.m 999
1639//
1640//     Big s: Vector off when |r| < 2**(-3).  Recall that p8 will be true.
1641//     The remaining stuff is for Case 4.
1642//     Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup)
1643//     Note: the (-) is still missing for V_lo.
1644//     Small s: w = w + N_0 * d_2
1645//     Note: the (-) is now incorporated in w.
1646//
1647(p9)   fcmp.ge.unc.s1 p10, p11 = FR_U_hiabs, FR_V_hiabs
1648       extr.u   GR_i_1 = GR_N_Inc, 0, 1 ;;
1649}
1650
1651{ .mfi
1652      nop.m 999
1653//
1654//     C_hi = S + A
1655//
1656(p9)   fma.s1 FR_t = FR_U_lo, f1, FR_V_lo
1657       extr.u   GR_i_0 = GR_N_Inc, 1, 1 ;;
1658}
1659
1660{ .mfi
1661      nop.m 999
1662//
1663//     t = U_lo + V_lo
1664//
1665//
1666(p10)  fms.s1 FR_a = FR_U_hi, f1, FR_A
1667      nop.i 999 ;;
1668}
1669
1670{ .mfi
1671      nop.m 999
1672(p11)  fma.s1 FR_a = FR_V_hi, f1, FR_A
1673      nop.i 999
1674}
1675;;
1676
1677{ .mmi
1678      nop.m 999
1679      addl           GR_Table_Base   = @ltoff(FSINCOS_CONSTANTS#), gp
1680      nop.i 999
1681}
1682;;
1683
1684{ .mmi
1685      ld8 GR_Table_Base = [GR_Table_Base]
1686      nop.m 999
1687      nop.i 999
1688}
1689;;
1690
1691
1692{ .mfi
1693       add GR_Table_Base = 528, GR_Table_Base
1694//
1695//     Is U_hiabs >= V_hiabs?
1696//
1697(p9)   fma.s1 FR_C_hi = FR_s, f1, FR_A
1698      nop.i 999 ;;
1699}
1700
1701{ .mmi
1702       ldfe FR_C_1 = [GR_Table_Base], 16 ;;
1703       ldfe FR_C_2 = [GR_Table_Base], 64
1704      nop.i 999 ;;
1705}
1706
1707{ .mmf
1708      nop.m 999
1709//
1710//     c = c + C_lo  finished.
1711//     Load  C_2
1712//
1713       ldfe FR_S_1 = [GR_Table_Base], 16
1714//
1715//     C_lo = S - C_hi
1716//
1717       fma.s1 FR_t = FR_t, f1, FR_w ;;
1718}
1719//
1720//     r and c have been computed.
1721//     Make sure ftz mode is set - should be automatic when using wre
1722//     |r| < 2**(-3)
1723//     Get [i_0,i_1] - two lsb of N_fix.
1724//     Load S_1
1725//
1726
1727{ .mfi
1728       ldfe FR_S_2 = [GR_Table_Base], 64
1729//
1730//     t = t + w
1731//
1732(p10)  fms.s1 FR_a = FR_a, f1, FR_V_hi
1733       cmp.eq.unc p9, p10 = 0x0, GR_i_0
1734}
1735
1736{ .mfi
1737      nop.m 999
1738//
1739//     For larger u than v: a = U_hi - A
1740//     Else a = V_hi - A (do an add to account for missing (-) on V_hi
1741//
1742       fms.s1 FR_C_lo = FR_s, f1, FR_C_hi
1743      nop.i 999 ;;
1744}
1745
1746{ .mfi
1747      nop.m 999
1748(p11)  fms.s1 FR_a = FR_U_hi, f1, FR_a
1749       cmp.eq.unc p11, p12 = 0x0, GR_i_1
1750}
1751
1752{ .mfi
1753      nop.m 999
1754//
1755//     If u > v: a = (U_hi - A)  + V_hi
1756//     Else      a = (V_hi - A)  + U_hi
1757//     In each case account for negative missing from V_hi.
1758//
1759       fma.s1 FR_C_lo = FR_C_lo, f1, FR_A
1760      nop.i 999 ;;
1761}
1762
1763{ .mfi
1764      nop.m 999
1765//
1766//     C_lo = (S - C_hi) + A
1767//
1768       fma.s1 FR_t = FR_t, f1, FR_a
1769      nop.i 999 ;;
1770}
1771
1772{ .mfi
1773      nop.m 999
1774//
1775//     t = t + a
1776//
1777       fma.s1 FR_C_lo = FR_C_lo, f1, FR_t
1778      nop.i 999 ;;
1779}
1780
1781{ .mfi
1782      nop.m 999
1783//
1784//     C_lo = C_lo + t
1785//     Adjust Table_Base to beginning of table
1786//
1787       fma.s1 FR_r = FR_C_hi, f1, FR_C_lo
1788      nop.i 999 ;;
1789}
1790
1791{ .mfi
1792      nop.m 999
1793//
1794//     Load S_2
1795//
1796       fma.s1 FR_rsq = FR_r, FR_r, f0
1797      nop.i 999
1798}
1799
1800{ .mfi
1801      nop.m 999
1802//
1803//     Table_Base points to C_1
1804//     r = C_hi + C_lo
1805//
1806       fms.s1 FR_c = FR_C_hi, f1, FR_r
1807      nop.i 999 ;;
1808}
1809
1810{ .mfi
1811      nop.m 999
1812//
1813//     if i_1 ==0: poly = S_2 * FR_rsq + S_1
1814//     else        poly = C_2 * FR_rsq + C_1
1815//
1816//(p11)  fma.s1 FR_Input_X = f0, f1, FR_r
1817(p11)  fma.s1 FR_prelim = f0, f1, FR_r
1818      nop.i 999 ;;
1819}
1820
1821{ .mfi
1822      nop.m 999
1823//(p12)  fma.s1 FR_Input_X = f0, f1, f1
1824(p12)  fma.s1 FR_prelim = f0, f1, f1
1825      nop.i 999 ;;
1826}
1827
1828{ .mfi
1829      nop.m 999
1830//
1831//     Compute r_cube = FR_rsq * r
1832//
1833(p11)  fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1
1834      nop.i 999 ;;
1835}
1836
1837{ .mfi
1838      nop.m 999
1839(p12)  fma.s1 FR_poly = FR_rsq, FR_C_2, FR_C_1
1840      nop.i 999
1841}
1842
1843{ .mfi
1844      nop.m 999
1845//
1846//     Compute FR_rsq = r * r
1847//     Is i_1 == 0 ?
1848//
1849       fma.s1 FR_r_cubed = FR_rsq, FR_r, f0
1850      nop.i 999 ;;
1851}
1852
1853{ .mfi
1854      nop.m 999
1855//
1856//     c = C_hi - r
1857//     Load  C_1
1858//
1859       fma.s1 FR_c = FR_c, f1, FR_C_lo
1860      nop.i 999
1861}
1862
1863{ .mfi
1864      nop.m 999
1865//
1866//     if i_1 ==0: poly = r_cube * poly + c
1867//     else        poly = FR_rsq * poly
1868//
1869//(p10)  fms.s1 FR_Input_X = f0, f1, FR_Input_X
1870(p10)  fms.s1 FR_prelim = f0, f1, FR_prelim
1871      nop.i 999 ;;
1872}
1873
1874{ .mfi
1875      nop.m 999
1876//
1877//     if i_1 ==0: Result = r
1878//     else        Result = 1.0
1879//
1880(p11)  fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c
1881      nop.i 999 ;;
1882}
1883
1884{ .mfi
1885      nop.m 999
1886(p12)  fma.s1 FR_poly = FR_rsq, FR_poly, f0
1887      nop.i 999 ;;
1888}
1889
1890{ .mfi
1891      nop.m 999
1892//
1893//     if i_0 !=0: Result = -Result
1894//
1895(p9)   fma.s1 FR_Input_X = FR_prelim, f1, FR_poly
1896      nop.i 999 ;;
1897}
1898
1899{ .mfb
1900      nop.m 999
1901(p10)  fms.s1 FR_Input_X = FR_prelim, f1, FR_poly
1902//
1903//     if i_0 == 0: Result = Result + poly
1904//     else         Result = Result - poly
1905//
1906       br.ret.sptk   b0 ;;
1907}
1908SINCOS_SMALL_R:
1909
1910{ .mii
1911      nop.m 999
1912        extr.u  GR_i_1 = GR_N_Inc, 0, 1 ;;
1913//
1914//
1915//      Compare both i_1 and i_0 with 0.
1916//      if i_1 == 0, set p9.
1917//      if i_0 == 0, set p11.
1918//
1919        cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;
1920}
1921
1922{ .mfi
1923      nop.m 999
1924        fma.s1 FR_rsq = FR_r, FR_r, f0
1925        extr.u  GR_i_0 = GR_N_Inc, 1, 1 ;;
1926}
1927
1928{ .mfi
1929      nop.m 999
1930//
1931//  Z = Z * FR_rsq
1932//
1933(p10)   fnma.s1 FR_c = FR_c, FR_r, f0
1934        cmp.eq.unc p11, p12 = 0x0, GR_i_0
1935}
1936;;
1937
1938// ******************************************************************
1939// ******************************************************************
1940// ******************************************************************
1941//      r and c have been computed.
1942//      We know whether this is the sine or cosine routine.
1943//      Make sure ftz mode is set - should be automatic when using wre
1944//      |r| < 2**(-3)
1945//
1946//      Set table_ptr1 to beginning of constant table.
1947//      Get [i_0,i_1] - two lsb of N_fix_gr.
1948//
1949
1950{ .mmi
1951      nop.m 999
1952      addl           GR_Table_Base   = @ltoff(FSINCOS_CONSTANTS#), gp
1953      nop.i 999
1954}
1955;;
1956
1957{ .mmi
1958      ld8 GR_Table_Base = [GR_Table_Base]
1959      nop.m 999
1960      nop.i 999
1961}
1962;;
1963
1964
1965//
1966//      Set table_ptr1 to point to S_5.
1967//      Set table_ptr1 to point to C_5.
1968//      Compute FR_rsq = r * r
1969//
1970
1971{ .mfi
1972(p9)    add GR_Table_Base = 672, GR_Table_Base
1973(p10)   fmerge.s FR_r = f1, f1
1974(p10)   add GR_Table_Base = 592, GR_Table_Base ;;
1975}
1976//
1977//      Set table_ptr1 to point to S_5.
1978//      Set table_ptr1 to point to C_5.
1979//
1980
1981{ .mmi
1982(p9)    ldfe FR_S_5 = [GR_Table_Base], -16 ;;
1983//
1984//      if (i_1 == 0) load S_5
1985//      if (i_1 != 0) load C_5
1986//
1987(p9)    ldfe FR_S_4 = [GR_Table_Base], -16
1988      nop.i 999 ;;
1989}
1990
1991{ .mmf
1992(p10)   ldfe FR_C_5 = [GR_Table_Base], -16
1993//
1994//      Z = FR_rsq * FR_rsq
1995//
1996(p9)    ldfe FR_S_3 = [GR_Table_Base], -16
1997//
1998//      Compute FR_rsq = r * r
1999//      if (i_1 == 0) load S_4
2000//      if (i_1 != 0) load C_4
2001//
2002        fma.s1 FR_Z = FR_rsq, FR_rsq, f0 ;;
2003}
2004//
2005//      if (i_1 == 0) load S_3
2006//      if (i_1 != 0) load C_3
2007//
2008
2009{ .mmi
2010(p9)    ldfe FR_S_2 = [GR_Table_Base], -16 ;;
2011//
2012//      if (i_1 == 0) load S_2
2013//      if (i_1 != 0) load C_2
2014//
2015(p9)    ldfe FR_S_1 = [GR_Table_Base], -16
2016      nop.i 999
2017}
2018
2019{ .mmi
2020(p10)   ldfe FR_C_4 = [GR_Table_Base], -16 ;;
2021(p10)   ldfe FR_C_3 = [GR_Table_Base], -16
2022      nop.i 999 ;;
2023}
2024
2025{ .mmi
2026(p10)   ldfe FR_C_2 = [GR_Table_Base], -16 ;;
2027(p10)   ldfe FR_C_1 = [GR_Table_Base], -16
2028      nop.i 999
2029}
2030
2031{ .mfi
2032      nop.m 999
2033//
2034//      if (i_1 != 0):
2035//      poly_lo = FR_rsq * C_5 + C_4
2036//      poly_hi = FR_rsq * C_2 + C_1
2037//
2038(p9)    fma.s1 FR_Z = FR_Z, FR_r, f0
2039      nop.i 999 ;;
2040}
2041
2042{ .mfi
2043      nop.m 999
2044//
2045//      if (i_1 == 0) load S_1
2046//      if (i_1 != 0) load C_1
2047//
2048(p9)    fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4
2049      nop.i 999
2050}
2051
2052{ .mfi
2053      nop.m 999
2054//
2055//      c = -c * r
2056//      dummy fmpy's to flag inexact.
2057//
2058(p9)    fma.d.s1 FR_S_4 = FR_S_4, FR_S_4, f0
2059      nop.i 999 ;;
2060}
2061
2062{ .mfi
2063      nop.m 999
2064//
2065//      poly_lo = FR_rsq * poly_lo + C_3
2066//      poly_hi = FR_rsq * poly_hi
2067//
2068        fma.s1  FR_Z = FR_Z, FR_rsq, f0
2069      nop.i 999 ;;
2070}
2071
2072{ .mfi
2073      nop.m 999
2074(p9)    fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1
2075      nop.i 999
2076}
2077
2078{ .mfi
2079      nop.m 999
2080//
2081//      if (i_1 == 0):
2082//      poly_lo = FR_rsq * S_5 + S_4
2083//      poly_hi = FR_rsq * S_2 + S_1
2084//
2085(p10)   fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4
2086      nop.i 999 ;;
2087}
2088
2089{ .mfi
2090      nop.m 999
2091//
2092//      if (i_1 == 0):
2093//      Z = Z * r  for only one of the small r cases - not there
2094//      in original implementation notes.
2095//
2096(p9)    fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3
2097      nop.i 999 ;;
2098}
2099
2100{ .mfi
2101      nop.m 999
2102(p10)   fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1
2103      nop.i 999
2104}
2105
2106{ .mfi
2107      nop.m 999
2108(p10)   fma.d.s1 FR_C_1 = FR_C_1, FR_C_1, f0
2109      nop.i 999 ;;
2110}
2111
2112{ .mfi
2113      nop.m 999
2114(p9)    fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
2115      nop.i 999
2116}
2117
2118{ .mfi
2119      nop.m 999
2120//
2121//      poly_lo = FR_rsq * poly_lo + S_3
2122//      poly_hi = FR_rsq * poly_hi
2123//
2124(p10)   fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3
2125      nop.i 999 ;;
2126}
2127
2128{ .mfi
2129      nop.m 999
2130(p10)   fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
2131      nop.i 999 ;;
2132}
2133
2134{ .mfi
2135      nop.m 999
2136//
2137//  if (i_1 == 0): dummy fmpy's to flag inexact
2138//  r = 1
2139//
2140(p9)    fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0
2141      nop.i 999
2142}
2143
2144{ .mfi
2145      nop.m 999
2146//
2147//  poly_hi = r * poly_hi
2148//
2149        fma.s1  FR_poly = FR_Z, FR_poly_lo, FR_c
2150      nop.i 999 ;;
2151}
2152
2153{ .mfi
2154      nop.m 999
2155(p12)   fms.s1  FR_r = f0, f1, FR_r
2156      nop.i 999 ;;
2157}
2158
2159{ .mfi
2160      nop.m 999
2161//
2162//      poly_hi = Z * poly_lo + c
2163//  if i_0 == 1: r = -r
2164//
2165        fma.s1  FR_poly = FR_poly, f1, FR_poly_hi
2166      nop.i 999 ;;
2167}
2168
2169{ .mfi
2170      nop.m 999
2171(p12)   fms.s1 FR_Input_X = FR_r, f1, FR_poly
2172      nop.i 999
2173}
2174
2175{ .mfb
2176      nop.m 999
2177//
2178//      poly = poly + poly_hi
2179//
2180(p11)   fma.s1 FR_Input_X = FR_r, f1, FR_poly
2181//
2182//      if (i_0 == 0) Result = r + poly
2183//      if (i_0 != 0) Result = r - poly
2184//
2185       br.ret.sptk   b0 ;;
2186}
2187SINCOS_NORMAL_R:
2188
2189{ .mii
2190      nop.m 999
2191        extr.u  GR_i_1 = GR_N_Inc, 0, 1 ;;
2192//
2193//      Set table_ptr1 and table_ptr2 to base address of
2194//      constant table.
2195        cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;
2196}
2197
2198{ .mfi
2199      nop.m 999
2200        fma.s1  FR_rsq = FR_r, FR_r, f0
2201        extr.u  GR_i_0 = GR_N_Inc, 1, 1 ;;
2202}
2203
2204{ .mfi
2205      nop.m 999
2206        frcpa.s1 FR_r_hi, p6 = f1, FR_r
2207        cmp.eq.unc p11, p12 = 0x0, GR_i_0
2208}
2209;;
2210
2211// ******************************************************************
2212// ******************************************************************
2213// ******************************************************************
2214//
2215//      r and c have been computed.
2216//      We known whether this is the sine or cosine routine.
2217//      Make sure ftz mode is set - should be automatic when using wre
2218//      Get [i_0,i_1] - two lsb of N_fix_gr alone.
2219//
2220
2221{ .mmi
2222      nop.m 999
2223      addl           GR_Table_Base   = @ltoff(FSINCOS_CONSTANTS#), gp
2224      nop.i 999
2225}
2226;;
2227
2228{ .mmi
2229      ld8 GR_Table_Base = [GR_Table_Base]
2230      nop.m 999
2231      nop.i 999
2232}
2233;;
2234
2235
2236{ .mfi
2237(p10)   add GR_Table_Base = 384, GR_Table_Base
2238//(p12)   fms.s1 FR_Input_X = f0, f1, f1
2239(p12)   fms.s1 FR_prelim = f0, f1, f1
2240(p9)    add GR_Table_Base = 224, GR_Table_Base ;;
2241}
2242
2243{ .mmf
2244      nop.m 999
2245(p10)   ldfe FR_QQ_8 = [GR_Table_Base], 16
2246//
2247//      if (i_1==0) poly = poly * FR_rsq + PP_1_lo
2248//      else        poly = FR_rsq * poly
2249//
2250//(p11)   fma.s1 FR_Input_X = f0, f1, f1 ;;
2251(p11)   fma.s1 FR_prelim = f0, f1, f1 ;;
2252}
2253
2254{ .mmf
2255(p10)   ldfe FR_QQ_7 = [GR_Table_Base], 16
2256//
2257//  Adjust table pointers based on i_0
2258//      Compute rsq = r * r
2259//
2260(p9)    ldfe FR_PP_8 = [GR_Table_Base], 16
2261        fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 ;;
2262}
2263
2264{ .mmf
2265(p9)    ldfe FR_PP_7 = [GR_Table_Base], 16
2266(p10)   ldfe FR_QQ_6 = [GR_Table_Base], 16
2267//
2268//      Load PP_8 and QQ_8; PP_7 and QQ_7
2269//
2270        frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;;
2271}
2272//
2273//      if (i_1==0) poly =   PP_7 + FR_rsq * PP_8.
2274//      else        poly =   QQ_7 + FR_rsq * QQ_8.
2275//
2276
2277{ .mmb
2278(p9)    ldfe FR_PP_6 = [GR_Table_Base], 16
2279(p10)   ldfe FR_QQ_5 = [GR_Table_Base], 16
2280      nop.b 999 ;;
2281}
2282
2283{ .mmb
2284(p9)    ldfe FR_PP_5 = [GR_Table_Base], 16
2285(p10)   ldfe FR_S_1 = [GR_Table_Base], 16
2286      nop.b 999 ;;
2287}
2288
2289{ .mmb
2290(p10)   ldfe FR_QQ_1 = [GR_Table_Base], 16
2291(p9)    ldfe FR_C_1 = [GR_Table_Base], 16
2292      nop.b 999 ;;
2293}
2294
2295{ .mmi
2296(p10)   ldfe FR_QQ_4 = [GR_Table_Base], 16 ;;
2297(p9)    ldfe FR_PP_1 = [GR_Table_Base], 16
2298      nop.i 999 ;;
2299}
2300
2301{ .mmf
2302(p10)   ldfe FR_QQ_3 = [GR_Table_Base], 16
2303//
2304//      if (i_1=0) corr = corr + c*c
2305//      else       corr = corr * c
2306//
2307(p9)    ldfe FR_PP_4 = [GR_Table_Base], 16
2308(p10)   fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 ;;
2309}
2310//
2311//      if (i_1=0) poly = rsq * poly + PP_5
2312//      else       poly = rsq * poly + QQ_5
2313//      Load PP_4 or QQ_4
2314//
2315
2316{ .mmf
2317(p9)    ldfe FR_PP_3 = [GR_Table_Base], 16
2318(p10)   ldfe FR_QQ_2 = [GR_Table_Base], 16
2319//
2320//      r_hi =   frcpa(frcpa(r)).
2321//      r_cube = r * FR_rsq.
2322//
2323(p9)    fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 ;;
2324}
2325//
2326//      Do dummy multiplies so inexact is always set.
2327//
2328
2329{ .mfi
2330(p9)    ldfe FR_PP_2 = [GR_Table_Base], 16
2331//
2332//      r_lo = r - r_hi
2333//
2334(p9)    fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0
2335      nop.i 999 ;;
2336}
2337
2338{ .mmf
2339      nop.m 999
2340(p9)    ldfe FR_PP_1_lo = [GR_Table_Base], 16
2341(p10)   fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r
2342}
2343
2344{ .mfi
2345      nop.m 999
2346(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6
2347      nop.i 999 ;;
2348}
2349
2350{ .mfi
2351      nop.m 999
2352//
2353//      if (i_1=0) U_lo = r_hi * r_hi
2354//      else       U_lo = r_hi + r
2355//
2356(p9)    fma.s1 FR_corr = FR_C_1, FR_rsq, f0
2357      nop.i 999 ;;
2358}
2359
2360{ .mfi
2361      nop.m 999
2362//
2363//      if (i_1=0) corr = C_1 * rsq
2364//      else       corr = S_1 * r_cubed + r
2365//
2366(p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6
2367      nop.i 999
2368}
2369
2370{ .mfi
2371      nop.m 999
2372(p10)   fma.s1 FR_U_lo = FR_r_hi, f1, FR_r
2373      nop.i 999 ;;
2374}
2375
2376{ .mfi
2377      nop.m 999
2378//
2379//      if (i_1=0) U_hi = r_hi + U_hi
2380//      else       U_hi = QQ_1 * U_hi + 1
2381//
2382(p9)    fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo
2383      nop.i 999
2384}
2385
2386{ .mfi
2387      nop.m 999
2388//
2389//      U_hi = r_hi * r_hi
2390//
2391        fms.s1 FR_r_lo = FR_r, f1, FR_r_hi
2392      nop.i 999 ;;
2393}
2394
2395{ .mfi
2396      nop.m 999
2397//
2398//      Load PP_1, PP_6, PP_5, and C_1
2399//      Load QQ_1, QQ_6, QQ_5, and S_1
2400//
2401        fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0
2402      nop.i 999 ;;
2403}
2404
2405{ .mfi
2406      nop.m 999
2407(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5
2408      nop.i 999
2409}
2410
2411{ .mfi
2412      nop.m 999
2413(p10)   fnma.s1 FR_corr = FR_corr, FR_c, f0
2414      nop.i 999 ;;
2415}
2416
2417{ .mfi
2418      nop.m 999
2419//
2420//      if (i_1=0) U_lo = r * r_hi + U_lo
2421//      else       U_lo = r_lo * U_lo
2422//
2423(p9)    fma.s1 FR_corr = FR_corr, FR_c, FR_c
2424      nop.i 999 ;;
2425}
2426
2427{ .mfi
2428      nop.m 999
2429(p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5
2430      nop.i 999
2431}
2432
2433{ .mfi
2434      nop.m 999
2435//
2436//      if (i_1 =0) U_hi = r + U_hi
2437//      if (i_1 =0) U_lo = r_lo * U_lo
2438//
2439//
2440(p9)    fma.d.s1 FR_PP_5 = FR_PP_5, FR_PP_4, f0
2441      nop.i 999 ;;
2442}
2443
2444{ .mfi
2445      nop.m 999
2446(p9)    fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo
2447      nop.i 999
2448}
2449
2450{ .mfi
2451      nop.m 999
2452(p10)   fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
2453      nop.i 999 ;;
2454}
2455
2456{ .mfi
2457      nop.m 999
2458//
2459//      if (i_1=0) poly = poly * rsq + PP_6
2460//      else       poly = poly * rsq + QQ_6
2461//
2462(p9)    fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0
2463      nop.i 999 ;;
2464}
2465
2466{ .mfi
2467      nop.m 999
2468(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4
2469      nop.i 999
2470}
2471
2472{ .mfi
2473      nop.m 999
2474(p10)   fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1
2475      nop.i 999 ;;
2476}
2477
2478{ .mfi
2479      nop.m 999
2480(p10)   fma.d.s1 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0
2481      nop.i 999 ;;
2482}
2483
2484{ .mfi
2485      nop.m 999
2486//
2487//      if (i_1!=0) U_hi = PP_1 * U_hi
2488//      if (i_1!=0) U_lo = r * r  + U_lo
2489//      Load PP_3 or QQ_3
2490//
2491(p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4
2492      nop.i 999 ;;
2493}
2494
2495{ .mfi
2496      nop.m 999
2497(p9)    fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
2498      nop.i 999
2499}
2500
2501{ .mfi
2502      nop.m 999
2503(p10)   fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0
2504      nop.i 999 ;;
2505}
2506
2507{ .mfi
2508      nop.m 999
2509(p9)    fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0
2510      nop.i 999 ;;
2511}
2512
2513{ .mfi
2514      nop.m 999
2515(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3
2516      nop.i 999 ;;
2517}
2518
2519{ .mfi
2520      nop.m 999
2521//
2522//      Load PP_2, QQ_2
2523//
2524(p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3
2525      nop.i 999 ;;
2526}
2527
2528{ .mfi
2529      nop.m 999
2530//
2531//      if (i_1==0) poly = FR_rsq * poly  + PP_3
2532//      else        poly = FR_rsq * poly  + QQ_3
2533//      Load PP_1_lo
2534//
2535(p9)    fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0
2536      nop.i 999 ;;
2537}
2538
2539{ .mfi
2540      nop.m 999
2541//
2542//      if (i_1 =0) poly = poly * rsq + pp_r4
2543//      else        poly = poly * rsq + qq_r4
2544//
2545(p9)    fma.s1 FR_U_hi = FR_r, f1, FR_U_hi
2546      nop.i 999 ;;
2547}
2548
2549{ .mfi
2550      nop.m 999
2551(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2
2552      nop.i 999 ;;
2553}
2554
2555{ .mfi
2556      nop.m 999
2557//
2558//      if (i_1==0) U_lo =  PP_1_hi * U_lo
2559//      else        U_lo =  QQ_1 * U_lo
2560//
2561(p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2
2562      nop.i 999 ;;
2563}
2564
2565{ .mfi
2566      nop.m 999
2567//
2568//      if (i_0==0)  Result = 1
2569//      else         Result = -1
2570//
2571        fma.s1 FR_V = FR_U_lo, f1, FR_corr
2572      nop.i 999 ;;
2573}
2574
2575{ .mfi
2576      nop.m 999
2577(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, f0
2578      nop.i 999 ;;
2579}
2580
2581{ .mfi
2582      nop.m 999
2583//
2584//      if (i_1==0) poly =  FR_rsq * poly + PP_2
2585//      else poly =  FR_rsq * poly + QQ_2
2586//
2587(p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo
2588      nop.i 999 ;;
2589}
2590
2591{ .mfi
2592      nop.m 999
2593(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, f0
2594      nop.i 999 ;;
2595}
2596
2597{ .mfi
2598      nop.m 999
2599//
2600//      V = U_lo + corr
2601//
2602(p9)    fma.s1 FR_poly = FR_r_cubed, FR_poly, f0
2603      nop.i 999 ;;
2604}
2605
2606{ .mfi
2607      nop.m 999
2608//
2609//      if (i_1==0) poly = r_cube * poly
2610//      else        poly = FR_rsq * poly
2611//
2612        fma.s1  FR_V = FR_poly, f1, FR_V
2613      nop.i 999 ;;
2614}
2615
2616{ .mfi
2617      nop.m 999
2618//(p12)   fms.s1 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
2619(p12)   fms.s1 FR_Input_X = FR_prelim, FR_U_hi, FR_V
2620      nop.i 999
2621}
2622
2623{ .mfb
2624      nop.m 999
2625//
2626//      V = V + poly
2627//
2628//(p11)   fma.s1 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
2629(p11)   fma.s1 FR_Input_X = FR_prelim, FR_U_hi, FR_V
2630//
2631//      if (i_0==0) Result = Result * U_hi + V
2632//      else        Result = Result * U_hi - V
2633//
2634       br.ret.sptk   b0 ;;
2635}
2636
2637//
2638//      If cosine, FR_Input_X = 1
2639//      If sine, FR_Input_X = +/-Zero (Input FR_Input_X)
2640//      Results are exact, no exceptions
2641//
2642SINCOS_ZERO:
2643
2644{ .mmb
2645        cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos
2646      nop.m 999
2647      nop.b 999 ;;
2648}
2649
2650{ .mfi
2651      nop.m 999
2652(p7)    fmerge.s FR_Input_X = FR_Input_X, FR_Input_X
2653      nop.i 999
2654}
2655
2656{ .mfb
2657      nop.m 999
2658(p6)    fmerge.s FR_Input_X = f1, f1
2659       br.ret.sptk   b0 ;;
2660}
2661
2662SINCOS_SPECIAL:
2663
2664//
2665//      Path for Arg = +/- QNaN, SNaN, Inf
2666//      Invalid can be raised. SNaNs
2667//      become QNaNs
2668//
2669
2670{ .mfb
2671      nop.m 999
2672        fmpy.s1 FR_Input_X = FR_Input_X, f0
2673        br.ret.sptk   b0 ;;
2674}
2675GLOBAL_LIBM_END(__libm_cos_large)
2676
2677
2678// *******************************************************************
2679// *******************************************************************
2680// *******************************************************************
2681//
2682//     Special Code to handle very large argument case.
2683//     Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63
2684//     The interface is custom:
2685//       On input:
2686//         (Arg or x) is in f8
2687//       On output:
2688//         r is in f8
2689//         c is in f9
2690//         N is in r8
2691//     Be sure to allocate at least 2 GP registers as output registers for
2692//     __libm_pi_by_2_reduce.  This routine uses r49-50. These are used as
2693//     scratch registers within the __libm_pi_by_2_reduce routine (for speed).
2694//
2695//     We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127.  We
2696//     use this to eliminate save/restore of key fp registers in this calling
2697//     function.
2698//
2699// *******************************************************************
2700// *******************************************************************
2701// *******************************************************************
2702
2703LOCAL_LIBM_ENTRY(__libm_callout_2)
2704SINCOS_ARG_TOO_LARGE:
2705
2706.prologue
2707//      Readjust Table ptr
2708{ .mfi
2709        adds  GR_Table_Base1 = -16, GR_Table_Base1
2710        nop.f 999
2711.save   ar.pfs,GR_SAVE_PFS
2712        mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
2713};;
2714
2715{ .mmi
2716        ldfs FR_Two_to_M3 = [GR_Table_Base1],4
2717        mov GR_SAVE_GP=gp                       // Save gp
2718.save   b0, GR_SAVE_B0
2719        mov GR_SAVE_B0=b0                       // Save b0
2720};;
2721
2722.body
2723//
2724//     Call argument reduction with x in f8
2725//     Returns with N in r8, r in f8, c in f9
2726//     Assumes f71-127 are preserved across the call
2727//
2728{ .mib
2729        ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1],0
2730        nop.i 0
2731        br.call.sptk b0=__libm_pi_by_2_reduce#
2732};;
2733
2734{ .mfi
2735        add   GR_N_Inc = GR_Sin_or_Cos,r8
2736        fcmp.lt.unc.s1  p6, p0 = FR_r, FR_Two_to_M3
2737        mov   b0 = GR_SAVE_B0                  // Restore return address
2738};;
2739
2740{ .mfi
2741        mov   gp = GR_SAVE_GP                  // Restore gp
2742(p6)    fcmp.gt.unc.s1  p6, p0 = FR_r, FR_Neg_Two_to_M3
2743        mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
2744};;
2745
2746{ .mbb
2747        nop.m 999
2748(p6)    br.cond.spnt SINCOS_SMALL_R            // Branch if |r| < 1/4
2749        br.cond.sptk SINCOS_NORMAL_R ;;        // Branch if 1/4 <= |r| < pi/4
2750}
2751
2752LOCAL_LIBM_END(__libm_callout_2)
2753
2754.type   __libm_pi_by_2_reduce#,@function
2755.global __libm_pi_by_2_reduce#
2756