1.file "libm_sincos.s"
2
3
4// Copyright (c) 2002 - 2005, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39// History
40//==============================================================
41// 02/01/02 Initial version
42// 02/18/02 Large arguments processing routine is excluded.
43//          External interface entry points are added
44// 03/13/02 Corrected restore of predicate registers
45// 03/19/02 Added stack unwind around call to __libm_cis_large
46// 09/05/02 Work range is widened by reduction strengthen (3 parts of Pi/16)
47// 02/10/03 Reordered header: .section, .global, .proc, .align
48// 08/08/03 Improved performance
49// 02/11/04 cis is moved to the separate file.
50// 03/31/05 Reformatted delimiters between data tables
51//
52// API
53//==============================================================
54// 1) void sincos(double, double*s, double*c)
55// 2) __libm_sincos - internal LIBM function, that accepts
56//    argument in f8 and returns cosine through f8, sine through f9
57//
58// Overview of operation
59//==============================================================
60//
61// Step 1
62// ======
63// Reduce x to region -1/2*pi/2^k ===== 0 ===== +1/2*pi/2^k  where k=4
64//    divide x by pi/2^k.
65//    Multiply by 2^k/pi.
66//    nfloat = Round result to integer (round-to-nearest)
67//
68// r = x -  nfloat * pi/2^k
69//    Do this as ((((x -  nfloat * HIGH(pi/2^k))) -
70//                        nfloat * LOW(pi/2^k)) -
71//                        nfloat * LOWEST(pi/2^k) for increased accuracy.
72//    pi/2^k is stored as two numbers that when added make pi/2^k.
73//       pi/2^k = HIGH(pi/2^k) + LOW(pi/2^k)
74//    HIGH and LOW parts are rounded to zero values,
75//    and LOWEST is rounded to nearest one.
76//
77// x = (nfloat * pi/2^k) + r
78//    r is small enough that we can use a polynomial approximation
79//    and is referred to as the reduced argument.
80//
81// Step 3
82// ======
83// Take the unreduced part and remove the multiples of 2pi.
84// So nfloat = nfloat (with lower k+1 bits cleared) + lower k+1 bits
85//
86//    nfloat (with lower k+1 bits cleared) is a multiple of 2^(k+1)
87//    N * 2^(k+1)
88//    nfloat * pi/2^k = N * 2^(k+1) * pi/2^k + (lower k+1 bits) * pi/2^k
89//    nfloat * pi/2^k = N * 2 * pi + (lower k+1 bits) * pi/2^k
90//    nfloat * pi/2^k = N2pi + M * pi/2^k
91//
92//
93// Sin(x) = Sin((nfloat * pi/2^k) + r)
94//        = Sin(nfloat * pi/2^k) * Cos(r) + Cos(nfloat * pi/2^k) * Sin(r)
95//
96//          Sin(nfloat * pi/2^k) = Sin(N2pi + Mpi/2^k)
97//                               = Sin(N2pi)Cos(Mpi/2^k) + Cos(N2pi)Sin(Mpi/2^k)
98//                               = Sin(Mpi/2^k)
99//
100//          Cos(nfloat * pi/2^k) = Cos(N2pi + Mpi/2^k)
101//                               = Cos(N2pi)Cos(Mpi/2^k) + Sin(N2pi)Sin(Mpi/2^k)
102//                               = Cos(Mpi/2^k)
103//
104// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r)
105//
106//
107// Step 4
108// ======
109// 0 <= M < 2^(k+1)
110// There are 2^(k+1) Sin entries in a table.
111// There are 2^(k+1) Cos entries in a table.
112//
113// Get Sin(Mpi/2^k) and Cos(Mpi/2^k) by table lookup.
114//
115//
116// Step 5
117// ======
118// Calculate Cos(r) and Sin(r) by polynomial approximation.
119//
120// Cos(r) = 1 + r^2 q1  + r^4 q2 + r^6 q3 + ... = Series for Cos
121// Sin(r) = r + r^3 p1  + r^5 p2 + r^7 p3 + ... = Series for Sin
122//
123// and the coefficients q1, q2, ... and p1, p2, ... are stored in a table
124//
125//
126// Calculate
127// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r)
128//
129// as follows
130//
131//    S[m] = Sin(Mpi/2^k) and C[m] = Cos(Mpi/2^k)
132//    rsq = r*r
133//
134//
135//    P = p1 + r^2p2 + r^4p3 + r^6p4
136//    Q = q1 + r^2q2 + r^4q3 + r^6q4
137//
138//       rcub = r * rsq
139//       Sin(r) = r + rcub * P
140//              = r + r^3p1  + r^5p2 + r^7p3 + r^9p4 + ... = Sin(r)
141//
142//            The coefficients are not exactly these values, but almost.
143//
144//            p1 = -1/6  = -1/3!
145//            p2 = 1/120 =  1/5!
146//            p3 = -1/5040 = -1/7!
147//            p4 = 1/362889 = 1/9!
148//
149//       P =  r + rcub * P
150//
151//    Answer = S[m] Cos(r) + C[m] P
152//
153//       Cos(r) = 1 + rsq Q
154//       Cos(r) = 1 + r^2 Q
155//       Cos(r) = 1 + r^2 (q1 + r^2q2 + r^4q3 + r^6q4)
156//       Cos(r) = 1 + r^2q1 + r^4q2 + r^6q3 + r^8q4 + ...
157//
158//       S[m] Cos(r) = S[m](1 + rsq Q)
159//       S[m] Cos(r) = S[m] + S[m] rsq Q
160//       S[m] Cos(r) = S[m] + s_rsq Q
161//       Q           = S[m] + s_rsq Q
162//
163// Then,
164//
165//    Answer = Q + C[m] P
166
167// Registers used
168//==============================================================
169// general input registers:
170// r14 -> r39
171
172// predicate registers used:
173// p6 -> p14
174//
175// floating-point registers used
176// f9 -> f15
177// f32 -> f67
178
179// Assembly macros
180//==============================================================
181
182cis_Arg                     = f8
183
184cis_Sin_res                 = f9
185cis_Cos_res                 = f8
186
187cis_NORM_f8                 = f10
188cis_W                       = f11
189cis_int_Nfloat              = f12
190cis_Nfloat                  = f13
191
192cis_r                       = f14
193cis_rsq                     = f15
194cis_rcub                    = f32
195
196cis_Inv_Pi_by_16            = f33
197cis_Pi_by_16_hi             = f34
198cis_Pi_by_16_lo             = f35
199
200cis_Inv_Pi_by_64            = f36
201cis_Pi_by_16_lowest         = f37
202cis_r_exact                 = f38
203
204
205cis_P1                      = f39
206cis_Q1                      = f40
207cis_P2                      = f41
208cis_Q2                      = f42
209cis_P3                      = f43
210cis_Q3                      = f44
211cis_P4                      = f45
212cis_Q4                      = f46
213
214cis_P_temp1                 = f47
215cis_P_temp2                 = f48
216
217cis_Q_temp1                 = f49
218cis_Q_temp2                 = f50
219
220cis_P                       = f51
221
222cis_SIG_INV_PI_BY_16_2TO61  = f52
223cis_RSHF_2TO61              = f53
224cis_RSHF                    = f54
225cis_2TOM61                  = f55
226cis_NFLOAT                  = f56
227cis_W_2TO61_RSH             = f57
228
229cis_tmp                     = f58
230
231cis_Sm_sin                  = f59
232cis_Cm_sin                  = f60
233
234cis_Sm_cos                  = f61
235cis_Cm_cos                  = f62
236
237cis_srsq_sin                = f63
238cis_srsq_cos                = f64
239
240cis_Q_sin                   = f65
241cis_Q_cos                   = f66
242cis_Q                       = f67
243
244/////////////////////////////////////////////////////////////
245
246cis_pResSin                 = r33
247cis_pResCos                 = r34
248
249cis_GR_sig_inv_pi_by_16     = r14
250cis_GR_rshf_2to61           = r15
251cis_GR_rshf                 = r16
252cis_GR_exp_2tom61           = r17
253cis_GR_n                    = r18
254cis_GR_n_sin                = r19
255cis_exp_limit               = r20
256cis_r_signexp               = r21
257cis_AD_1                    = r22
258cis_r_sincos                = r23
259cis_r_exp                   = r24
260cis_r_17_ones               = r25
261cis_GR_m_sin                = r26
262cis_GR_32m_sin              = r26
263cis_GR_n_cos                = r27
264cis_GR_m_cos                = r28
265cis_GR_32m_cos              = r28
266cis_AD_2_sin                = r29
267cis_AD_2_cos                = r30
268cis_gr_tmp                  = r31
269
270GR_SAVE_B0                  = r35
271GR_SAVE_GP                  = r36
272rB0_SAVED                   = r37
273GR_SAVE_PFS                 = r38
274GR_SAVE_PR                  = r39
275
276RODATA
277
278.align 16
279// Pi/16 parts
280LOCAL_OBJECT_START(double_cis_pi)
281   data8 0xC90FDAA22168C234, 0x00003FFC // pi/16 1st part
282   data8 0xC4C6628B80DC1CD1, 0x00003FBC // pi/16 2nd part
283   data8 0xA4093822299F31D0, 0x00003F7A // pi/16 3rd part
284LOCAL_OBJECT_END(double_cis_pi)
285
286// Coefficients for polynomials
287LOCAL_OBJECT_START(double_cis_pq_k4)
288   data8 0x3EC71C963717C63A // P4
289   data8 0x3EF9FFBA8F191AE6 // Q4
290   data8 0xBF2A01A00F4E11A8 // P3
291   data8 0xBF56C16C05AC77BF // Q3
292   data8 0x3F8111111110F167 // P2
293   data8 0x3FA555555554DD45 // Q2
294   data8 0xBFC5555555555555 // P1
295   data8 0xBFDFFFFFFFFFFFFC // Q1
296LOCAL_OBJECT_END(double_cis_pq_k4)
297
298// Sincos table (S[m], C[m])
299LOCAL_OBJECT_START(double_sin_cos_beta_k4)
300data8 0x0000000000000000 , 0x00000000 // sin( 0 pi/16)  S0
301data8 0x8000000000000000 , 0x00003fff // cos( 0 pi/16)  C0
302//
303data8 0xc7c5c1e34d3055b3 , 0x00003ffc // sin( 1 pi/16)  S1
304data8 0xfb14be7fbae58157 , 0x00003ffe // cos( 1 pi/16)  C1
305//
306data8 0xc3ef1535754b168e , 0x00003ffd // sin( 2 pi/16)  S2
307data8 0xec835e79946a3146 , 0x00003ffe // cos( 2 pi/16)  C2
308//
309data8 0x8e39d9cd73464364 , 0x00003ffe // sin( 3 pi/16)  S3
310data8 0xd4db3148750d181a , 0x00003ffe // cos( 3 pi/16)  C3
311//
312data8 0xb504f333f9de6484 , 0x00003ffe // sin( 4 pi/16)  S4
313data8 0xb504f333f9de6484 , 0x00003ffe // cos( 4 pi/16)  C4
314//
315data8 0xd4db3148750d181a , 0x00003ffe // sin( 5 pi/16)  C3
316data8 0x8e39d9cd73464364 , 0x00003ffe // cos( 5 pi/16)  S3
317//
318data8 0xec835e79946a3146 , 0x00003ffe // sin( 6 pi/16)  C2
319data8 0xc3ef1535754b168e , 0x00003ffd // cos( 6 pi/16)  S2
320//
321data8 0xfb14be7fbae58157 , 0x00003ffe // sin( 7 pi/16)  C1
322data8 0xc7c5c1e34d3055b3 , 0x00003ffc // cos( 7 pi/16)  S1
323//
324data8 0x8000000000000000 , 0x00003fff // sin( 8 pi/16)  C0
325data8 0x0000000000000000 , 0x00000000 // cos( 8 pi/16)  S0
326//
327data8 0xfb14be7fbae58157 , 0x00003ffe // sin( 9 pi/16)  C1
328data8 0xc7c5c1e34d3055b3 , 0x0000bffc // cos( 9 pi/16)  -S1
329//
330data8 0xec835e79946a3146 , 0x00003ffe // sin(10 pi/16)  C2
331data8 0xc3ef1535754b168e , 0x0000bffd // cos(10 pi/16)  -S2
332//
333data8 0xd4db3148750d181a , 0x00003ffe // sin(11 pi/16)  C3
334data8 0x8e39d9cd73464364 , 0x0000bffe // cos(11 pi/16)  -S3
335//
336data8 0xb504f333f9de6484 , 0x00003ffe // sin(12 pi/16)  S4
337data8 0xb504f333f9de6484 , 0x0000bffe // cos(12 pi/16)  -S4
338//
339data8 0x8e39d9cd73464364 , 0x00003ffe // sin(13 pi/16) S3
340data8 0xd4db3148750d181a , 0x0000bffe // cos(13 pi/16) -C3
341//
342data8 0xc3ef1535754b168e , 0x00003ffd // sin(14 pi/16) S2
343data8 0xec835e79946a3146 , 0x0000bffe // cos(14 pi/16) -C2
344//
345data8 0xc7c5c1e34d3055b3 , 0x00003ffc // sin(15 pi/16) S1
346data8 0xfb14be7fbae58157 , 0x0000bffe // cos(15 pi/16) -C1
347//
348data8 0x0000000000000000 , 0x00000000 // sin(16 pi/16) S0
349data8 0x8000000000000000 , 0x0000bfff // cos(16 pi/16) -C0
350//
351data8 0xc7c5c1e34d3055b3 , 0x0000bffc // sin(17 pi/16) -S1
352data8 0xfb14be7fbae58157 , 0x0000bffe // cos(17 pi/16) -C1
353//
354data8 0xc3ef1535754b168e , 0x0000bffd // sin(18 pi/16) -S2
355data8 0xec835e79946a3146 , 0x0000bffe // cos(18 pi/16) -C2
356//
357data8 0x8e39d9cd73464364 , 0x0000bffe // sin(19 pi/16) -S3
358data8 0xd4db3148750d181a , 0x0000bffe // cos(19 pi/16) -C3
359//
360data8 0xb504f333f9de6484 , 0x0000bffe // sin(20 pi/16) -S4
361data8 0xb504f333f9de6484 , 0x0000bffe // cos(20 pi/16) -S4
362//
363data8 0xd4db3148750d181a , 0x0000bffe // sin(21 pi/16) -C3
364data8 0x8e39d9cd73464364 , 0x0000bffe // cos(21 pi/16) -S3
365//
366data8 0xec835e79946a3146 , 0x0000bffe // sin(22 pi/16) -C2
367data8 0xc3ef1535754b168e , 0x0000bffd // cos(22 pi/16) -S2
368//
369data8 0xfb14be7fbae58157 , 0x0000bffe // sin(23 pi/16) -C1
370data8 0xc7c5c1e34d3055b3 , 0x0000bffc // cos(23 pi/16) -S1
371//
372data8 0x8000000000000000 , 0x0000bfff // sin(24 pi/16) -C0
373data8 0x0000000000000000 , 0x00000000 // cos(24 pi/16) S0
374//
375data8 0xfb14be7fbae58157 , 0x0000bffe // sin(25 pi/16) -C1
376data8 0xc7c5c1e34d3055b3 , 0x00003ffc // cos(25 pi/16) S1
377//
378data8 0xec835e79946a3146 , 0x0000bffe // sin(26 pi/16) -C2
379data8 0xc3ef1535754b168e , 0x00003ffd // cos(26 pi/16) S2
380//
381data8 0xd4db3148750d181a , 0x0000bffe // sin(27 pi/16) -C3
382data8 0x8e39d9cd73464364 , 0x00003ffe // cos(27 pi/16) S3
383//
384data8 0xb504f333f9de6484 , 0x0000bffe // sin(28 pi/16) -S4
385data8 0xb504f333f9de6484 , 0x00003ffe // cos(28 pi/16) S4
386//
387data8 0x8e39d9cd73464364 , 0x0000bffe // sin(29 pi/16) -S3
388data8 0xd4db3148750d181a , 0x00003ffe // cos(29 pi/16) C3
389//
390data8 0xc3ef1535754b168e , 0x0000bffd // sin(30 pi/16) -S2
391data8 0xec835e79946a3146 , 0x00003ffe // cos(30 pi/16) C2
392//
393data8 0xc7c5c1e34d3055b3 , 0x0000bffc // sin(31 pi/16) -S1
394data8 0xfb14be7fbae58157 , 0x00003ffe // cos(31 pi/16) C1
395//
396data8 0x0000000000000000 , 0x00000000 // sin(32 pi/16) S0
397data8 0x8000000000000000 , 0x00003fff // cos(32 pi/16) C0
398LOCAL_OBJECT_END(double_sin_cos_beta_k4)
399
400.section .text
401
402GLOBAL_IEEE754_ENTRY(sincos)
403// cis_GR_sig_inv_pi_by_16 = significand of 16/pi
404{ .mlx
405      getf.exp      cis_r_signexp       = cis_Arg
406      movl          cis_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A
407
408}
409// cis_GR_rshf_2to61 = 1.1000 2^(63+63-2)
410{ .mlx
411      addl          cis_AD_1                = @ltoff(double_cis_pi), gp
412      movl          cis_GR_rshf_2to61       = 0x47b8000000000000
413};;
414
415{ .mfi
416      ld8           cis_AD_1            = [cis_AD_1]
417      fnorm.s1      cis_NORM_f8         = cis_Arg
418      cmp.eq        p13, p14            = r0, r0 // p13 set for sincos
419}
420// cis_GR_exp_2tom61 = exponent of scaling factor 2^-61
421{ .mib
422      mov           cis_GR_exp_2tom61   = 0xffff-61
423      nop.i         0
424      br.cond.sptk  _CIS_COMMON
425};;
426GLOBAL_IEEE754_END(sincos)
427libm_alias_double_other (__sincos, sincos)
428
429GLOBAL_LIBM_ENTRY(__libm_sincos)
430// cis_GR_sig_inv_pi_by_16 = significand of 16/pi
431{ .mlx
432      getf.exp      cis_r_signexp       = cis_Arg
433      movl          cis_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A
434}
435// cis_GR_rshf_2to61 = 1.1000 2^(63+63-2)
436{ .mlx
437      addl          cis_AD_1            = @ltoff(double_cis_pi), gp
438      movl          cis_GR_rshf_2to61   = 0x47b8000000000000
439};;
440
441// p14 set for __libm_sincos and cis
442{ .mfi
443      ld8           cis_AD_1            = [cis_AD_1]
444      fnorm.s1      cis_NORM_f8         = cis_Arg
445      cmp.eq        p14, p13            = r0, r0
446}
447// cis_GR_exp_2tom61 = exponent of scaling factor 2^-61
448{ .mib
449      mov           cis_GR_exp_2tom61   = 0xffff-61
450      nop.i         0
451      nop.b         0
452};;
453
454_CIS_COMMON:
455//  Form two constants we need
456//  16/pi * 2^-2 * 2^63, scaled by 2^61 since we just loaded the significand
457//  1.1000...000 * 2^(63+63-2) to right shift int(W) into the low significand
458//  fcmp used to set denormal, and invalid on snans
459{ .mfi
460      setf.sig      cis_SIG_INV_PI_BY_16_2TO61 = cis_GR_sig_inv_pi_by_16
461      fclass.m      p6,p0                      = cis_Arg, 0xe7 // if x=0,inf,nan
462      addl          cis_gr_tmp                 = -1, r0
463}
464// 1.1000 2^63 for right shift
465{ .mlx
466      setf.d        cis_RSHF_2TO61             = cis_GR_rshf_2to61
467      movl          cis_GR_rshf                = 0x43e8000000000000
468};;
469
470//  Form another constant
471//  2^-61 for scaling Nfloat
472//  0x1001a is register_bias + 27.
473//  So if f8 >= 2^27, go to large arguments routine
474{ .mfi
475      alloc         GR_SAVE_PFS         = ar.pfs, 3, 5, 0, 0
476      fclass.m      p11,p0              = cis_Arg, 0x0b // Test for x=unorm
477      mov           cis_exp_limit       = 0x1001a
478}
479{ .mib
480      setf.exp      cis_2TOM61          = cis_GR_exp_2tom61
481      nop.i         0
482(p6)  br.cond.spnt  _CIS_SPECIAL_ARGS
483};;
484
485//  Load the two pieces of pi/16
486//  Form another constant
487//  1.1000...000 * 2^63, the right shift constant
488{ .mmb
489      ldfe          cis_Pi_by_16_hi     = [cis_AD_1],16
490      setf.d        cis_RSHF            = cis_GR_rshf
491(p11) br.cond.spnt  _CIS_UNORM          // Branch if x=unorm
492};;
493
494_CIS_COMMON2:
495// Return here if x=unorm
496// Create constant inexact set
497{ .mmi
498      ldfe          cis_Pi_by_16_lo     = [cis_AD_1],16
499      setf.sig      cis_tmp             = cis_gr_tmp
500      nop.i         0
501};;
502
503// Select exponent (17 lsb)
504{ .mfi
505      ldfe          cis_Pi_by_16_lowest = [cis_AD_1],16
506      nop.f         0
507      dep.z         cis_r_exp           = cis_r_signexp, 0, 17
508};;
509
510// Start loading P, Q coefficients
511// p10 is true if we must call routines to handle larger arguments
512// p10 is true if f8 exp is > 0x1001a
513{ .mmb
514      ldfpd         cis_P4,cis_Q4       = [cis_AD_1],16
515      cmp.ge        p10, p0             = cis_r_exp, cis_exp_limit
516(p10) br.cond.spnt  _CIS_LARGE_ARGS // go to |x| >= 2^27 path
517};;
518
519// cis_W = x * cis_Inv_Pi_by_16
520// Multiply x by scaled 16/pi and add large const to shift integer part of W to
521// rightmost bits of significand
522{ .mfi
523      ldfpd         cis_P3,cis_Q3       = [cis_AD_1],16
524      fma.s1 cis_W_2TO61_RSH = cis_NORM_f8,cis_SIG_INV_PI_BY_16_2TO61,cis_RSHF_2TO61
525      nop.i  0
526};;
527
528// get N = (int)cis_int_Nfloat
529// cis_NFLOAT = Round_Int_Nearest(cis_W)
530{ .mmf
531      getf.sig      cis_GR_n            = cis_W_2TO61_RSH
532      ldfpd  cis_P2,cis_Q2   = [cis_AD_1],16
533      fms.s1        cis_NFLOAT          = cis_W_2TO61_RSH,cis_2TOM61,cis_RSHF
534};;
535
536// cis_r = -cis_Nfloat * cis_Pi_by_16_hi + x
537{ .mfi
538      ldfpd         cis_P1,cis_Q1       = [cis_AD_1], 16
539      fnma.s1       cis_r               = cis_NFLOAT,cis_Pi_by_16_hi,cis_NORM_f8
540      nop.i         0
541};;
542
543// Add 2^(k-1) (which is in cis_r_sincos) to N
544{ .mmi
545      add           cis_GR_n_cos        = 0x8, cis_GR_n
546;;
547//Get M (least k+1 bits of N)
548      and           cis_GR_m_sin        = 0x1f,cis_GR_n
549      and           cis_GR_m_cos        = 0x1f,cis_GR_n_cos
550};;
551
552{ .mmi
553      nop.m         0
554      nop.m         0
555      shl           cis_GR_32m_sin      = cis_GR_m_sin,5
556};;
557
558// Add 32*M to address of sin_cos_beta table
559// cis_r =  cis_r -cis_Nfloat * cis_Pi_by_16_lo
560{ .mfi
561      add           cis_AD_2_sin        = cis_GR_32m_sin, cis_AD_1
562      fnma.s1       cis_r               = cis_NFLOAT, cis_Pi_by_16_lo,  cis_r
563      shl           cis_GR_32m_cos      = cis_GR_m_cos,5
564};;
565
566// Add 32*M to address of sin_cos_beta table
567{ .mmf
568      ldfe          cis_Sm_sin          = [cis_AD_2_sin],16
569      add           cis_AD_2_cos        = cis_GR_32m_cos, cis_AD_1
570      fclass.m.unc  p10,p0              = cis_Arg,0x0b  // den. input - uflow
571};;
572
573{ .mfi
574      ldfe          cis_Sm_cos          = [cis_AD_2_cos], 16
575      nop.i         0
576};;
577
578{ .mfi
579      ldfe          cis_Cm_sin          = [cis_AD_2_sin]
580      fma.s1        cis_rsq             = cis_r, cis_r,   f0  // get r^2
581      nop.i         0
582}
583// fmpy forces inexact flag
584{ .mfi
585      nop.m         0
586      fmpy.s0       cis_tmp             = cis_tmp,cis_tmp
587      nop.i         0
588};;
589
590{ .mfi
591      nop.m         0
592      fnma.s1       cis_r_exact         = cis_NFLOAT, cis_Pi_by_16_lowest, cis_r
593      nop.i         0
594};;
595
596{ .mfi
597      ldfe          cis_Cm_cos          = [cis_AD_2_cos]
598      fma.s1        cis_P_temp1         = cis_rsq, cis_P4, cis_P3
599      nop.i         0
600}
601
602{ .mfi
603      nop.m         0
604      fma.s1        cis_Q_temp1         = cis_rsq, cis_Q4, cis_Q3
605      nop.i         0
606};;
607
608{ .mfi
609      nop.m         0
610      fmpy.s1       cis_srsq_sin        = cis_Sm_sin, cis_rsq
611      nop.i         0
612}
613{ .mfi
614      nop.m         0
615      fmpy.s1       cis_srsq_cos        = cis_Sm_cos,cis_rsq
616      nop.i         0
617};;
618
619{ .mfi
620      nop.m         0
621      fma.s1        cis_Q_temp2         = cis_rsq, cis_Q_temp1, cis_Q2
622      nop.i         0
623}
624{ .mfi
625      nop.m         0
626      fma.s1        cis_P_temp2         = cis_rsq, cis_P_temp1, cis_P2
627      nop.i         0
628};;
629
630{ .mfi
631      nop.m         0
632      fmpy.s1       cis_rcub            = cis_r_exact, cis_rsq // get r^3
633      nop.i         0
634};;
635
636{ .mfi
637      nop.m         0
638      fma.s1        cis_Q               = cis_rsq, cis_Q_temp2, cis_Q1
639      nop.i         0
640}
641{ .mfi
642      nop.m         0
643      fma.s1        cis_P               = cis_rsq, cis_P_temp2, cis_P1
644      nop.i         0
645};;
646
647{ .mfi
648      nop.m         0
649      fma.s1        cis_Q_sin           = cis_srsq_sin,cis_Q, cis_Sm_sin
650      nop.i         0
651}
652{ .mfi
653      nop.m         0
654      fma.s1        cis_Q_cos           = cis_srsq_cos,cis_Q, cis_Sm_cos
655      nop.i         0
656};;
657
658{ .mfi
659      nop.m         0
660      fma.s1        cis_P               = cis_rcub,cis_P, cis_r_exact // final P
661      nop.i         0
662};;
663
664// If den. arg, force underflow to be set
665{ .mfi
666      nop.m         0
667(p10) fmpy.d.s0     cis_tmp             = cis_Arg,cis_Arg
668      nop.i         0
669};;
670
671{ .mfi
672      nop.m         0
673      fma.d.s0      cis_Sin_res         = cis_Cm_sin,cis_P,cis_Q_sin//Final sin
674      nop.i         0
675}
676{ .mfb
677      nop.m         0
678      fma.d.s0      cis_Cos_res         = cis_Cm_cos,cis_P,cis_Q_cos//Final cos
679(p14) br.ret.sptk   b0  // common exit for __libm_sincos and cis main path
680};;
681
682{ .mmb
683      stfd          [cis_pResSin]       = cis_Sin_res
684      stfd          [cis_pResCos]       = cis_Cos_res
685      br.ret.sptk   b0 // common exit for sincos main path
686};;
687
688_CIS_SPECIAL_ARGS:
689// sin(+/-0) = +/-0
690// sin(Inf)  = NaN
691// sin(NaN)  = NaN
692{ .mfi
693      nop.m         999
694      fma.d.s0      cis_Sin_res          = cis_Arg, f0, f0 // sinf(+/-0,NaN,Inf)
695      nop.i         999
696};;
697// cos(+/-0) = 1.0
698// cos(Inf)  = NaN
699// cos(NaN)  = NaN
700{ .mfb
701      nop.m         999
702      fma.d.s0      cis_Cos_res          = cis_Arg, f0, f1 // cosf(+/-0,NaN,Inf)
703(p14) br.ret.sptk   b0 //spec exit for __libm_sincos and cis main path
704};;
705
706{ .mmb
707      stfd          [cis_pResSin]       = cis_Sin_res
708      stfd          [cis_pResCos]       = cis_Cos_res
709      br.ret.sptk   b0 // common exit for sincos main path
710};;
711
712_CIS_UNORM:
713// Here if x=unorm
714{ .mfb
715      getf.exp      cis_r_signexp       = cis_NORM_f8 // Get signexp of x
716      fcmp.eq.s0    p11,p0              = cis_Arg, f0 // Dummy op to set denorm
717      br.cond.sptk  _CIS_COMMON2        // Return to main path
718};;
719
720GLOBAL_LIBM_END(__libm_sincos)
721
722////  |x| > 2^27 path  ///////
723.proc _CIS_LARGE_ARGS
724_CIS_LARGE_ARGS:
725.prologue
726{ .mfi
727      nop.m         0
728      nop.f         0
729.save ar.pfs, GR_SAVE_PFS
730      mov           GR_SAVE_PFS         = ar.pfs
731}
732;;
733
734{ .mfi
735      mov           GR_SAVE_GP          = gp
736      nop.f         0
737.save b0, GR_SAVE_B0
738      mov           GR_SAVE_B0          = b0
739};;
740
741.body
742// Call of huge arguments sincos
743{ .mib
744      nop.m         0
745      mov           GR_SAVE_PR          = pr
746      br.call.sptk  b0                  = __libm_sincos_large
747};;
748
749{ .mfi
750      mov           gp                  = GR_SAVE_GP
751      nop.f         0
752      mov           pr                  = GR_SAVE_PR, 0x1fffe
753}
754;;
755
756{ .mfi
757      nop.m         0
758      nop.f         0
759      mov           b0                  = GR_SAVE_B0
760}
761;;
762
763{ .mfi
764      nop.m         0
765      fma.d.s0      cis_Cos_res         = cis_Cos_res, f1, f0
766      mov           ar.pfs              = GR_SAVE_PFS
767}
768{ .mfb
769      nop.m         0
770      fma.d.s0      cis_Sin_res         = cis_Sin_res, f1, f0
771(p14) br.ret.sptk   b0  // exit for |x| > 2^27 path (__libm_sincos and cis)
772};;
773
774{ .mmb
775      stfd          [cis_pResSin]       = cis_Sin_res
776      stfd          [cis_pResCos]       = cis_Cos_res
777      br.ret.sptk   b0 // exit for sincos |x| > 2^27 path
778};;
779.endp _CIS_LARGE_ARGS
780
781.type __libm_sincos_large#,@function
782.global __libm_sincos_large#
783