1.file "libm_reduce.s"
2
3
4// Copyright (c) 2000 - 2003, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39// History:
40// 02/02/00 Initial Version
41// 05/13/02 Rescheduled for speed, changed interface to pass
42//          parameters in fp registers
43// 02/10/03 Reordered header: .section, .global, .proc, .align;
44//          used data8 for long double data storage
45//
46//*********************************************************************
47//*********************************************************************
48//
49// Function:   __libm_pi_by_two_reduce(x) return r, c, and N where
50//             x = N * pi/4 + (r+c) , where |r+c| <= pi/4.
51//             This function is not designed to be used by the
52//             general user.
53//
54//*********************************************************************
55//
56// Accuracy:       Returns double-precision values
57//
58//*********************************************************************
59//
60// Resources Used:
61//
62//    Floating-Point Registers:
63//      f8  = Input x, return value r
64//      f9  = return value c
65//      f32-f70
66//
67//    General Purpose Registers:
68//      r8  = return value N
69//      r34-r64
70//
71//    Predicate Registers:      p6-p14
72//
73//*********************************************************************
74//
75// IEEE Special Conditions:
76//
77//    No conditions should be raised.
78//
79//*********************************************************************
80//
81// I. Introduction
82// ===============
83//
84// For the forward trigonometric functions sin, cos, sincos, and
85// tan, the original algorithms for IA 64 handle arguments up to
86// 1 ulp less than 2^63 in magnitude. For double-extended arguments x,
87// |x| >= 2^63, this routine returns N and r_hi, r_lo where
88//
89//    x  is accurately approximated by
90//    2*K*pi  +  N * pi/2  +  r_hi + r_lo,  |r_hi+r_lo| <= pi/4.
91//    CASE = 1 or 2.
92//    CASE is 1 unless |r_hi + r_lo| < 2^(-33).
93//
94// The exact value of K is not determined, but that information is
95// not required in trigonometric function computations.
96//
97// We first assume the argument x in question satisfies x >= 2^(63).
98// In particular, it is positive. Negative x can be handled by symmetry:
99//
100//   -x  is accurately approximated by
101//         -2*K*pi  +  (-N) * pi/2  -  (r_hi + r_lo),  |r_hi+r_lo| <= pi/4.
102//
103// The idea of the reduction is that
104//
105//       x  *  2/pi   =   N_big  +  N  +  f,      |f| <= 1/2
106//
107// Moreover, for double extended x, |f| >= 2^(-75). (This is an
108// non-obvious fact found by enumeration using a special algorithm
109// involving continued fraction.) The algorithm described below
110// calculates N and an accurate approximation of f.
111//
112// Roughly speaking, an appropriate 256-bit (4 X 64) portion of
113// 2/pi is multiplied with x to give the desired information.
114//
115// II. Representation of 2/PI
116// ==========================
117//
118// The value of 2/pi in binary fixed-point is
119//
120//            .101000101111100110......
121//
122// We store 2/pi in a table, starting at the position corresponding
123// to bit position 63
124//
125//   bit position  63 62 ... 0   -1 -2 -3 -4 -5 -6 -7  ....  -16576
126//
127//              0  0  ... 0  . 1  0  1  0  1  0  1  ....    X
128//
129//                              ^
130//                               |__ implied binary pt
131//
132// III. Algorithm
133// ==============
134//
135// This describes the algorithm in the most natural way using
136// unsigned interger multiplication. The implementation section
137// describes how the integer arithmetic is simulated.
138//
139// STEP 0. Initialization
140// ----------------------
141//
142// Let the input argument x be
143//
144//     x = 2^m * ( 1. b_1 b_2 b_3 ... b_63 ),  63 <= m <= 16383.
145//
146// The first crucial step is to fetch four 64-bit portions of 2/pi.
147// To fulfill this goal, we calculate the bit position L of the
148// beginning of these 256-bit quantity by
149//
150//     L :=  62 - m.
151//
152// Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that
153// the storage of 2/pi is adequate.
154//
155// Fetch P_1, P_2, P_3, P_4 beginning at bit position L thus:
156//
157//      bit position  L  L-1  L-2    ...  L-63
158//
159//      P_1    =      b   b    b     ...    b
160//
161// each b can be 0 or 1. Also, let P_0 be the two bits correspoding to
162// bit positions L+2 and L+1. So, when each of the P_j is interpreted
163// with appropriate scaling, we have
164//
165//      2/pi  =  P_big  + P_0 + (P_1 + P_2 + P_3 + P_4)  +  P_small
166//
167// Note that P_big and P_small can be ignored. The reasons are as follow.
168// First, consider P_big. If P_big = 0, we can certainly ignore it.
169// Otherwise, P_big >= 2^(L+3). Now,
170//
171//        P_big * ulp(x) >=  2^(L+3) * 2^(m-63)
172//                   >=  2^(65-m  +  m-63 )
173//                   >=  2^2
174//
175// Thus, P_big * x is an integer of the form 4*K. So
176//
177//       x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2)
178//                + x*P_small*(pi/2).
179//
180// Hence, P_big*x corresponds to information that can be ignored for
181// trigonometic function evaluation.
182//
183// Next, we must estimate the effect of ignoring P_small. The absolute
184// error made by ignoring P_small is bounded by
185//
186//       |P_small * x|  <=  ulp(P_4) * x
187//                  <=  2^(L-255) * 2^(m+1)
188//                  <=  2^(62-m-255 + m + 1)
189//                  <=  2^(-192)
190//
191// Since for double-extended precision, x * 2/pi = integer + f,
192// 0.5 >= |f| >= 2^(-75), the relative error introduced by ignoring
193// P_small is bounded by 2^(-192+75) <= 2^(-117), which is acceptable.
194//
195// Further note that if x is split into x_hi + x_lo where x_lo is the
196// two bits corresponding to bit positions 2^(m-62) and 2^(m-63); then
197//
198//       P_0 * x_hi
199//
200// is also an integer of the form 4*K; and thus can also be ignored.
201// Let M := P_0 * x_lo which is a small integer. The main part of the
202// calculation is really the multiplication of x with the four pieces
203// P_1, P_2, P_3, and P_4.
204//
205// Unless the reduced argument is extremely small in magnitude, it
206// suffices to carry out the multiplication of x with P_1, P_2, and
207// P_3. x*P_4 will be carried out and added on as a correction only
208// when it is found to be needed. Note also that x*P_4 need not be
209// computed exactly. A straightforward multiplication suffices since
210// the rounding error thus produced would be bounded by 2^(-3*64),
211// that is 2^(-192) which is small enough as the reduced argument
212// is bounded from below by 2^(-75).
213//
214// Now that we have four 64-bit data representing 2/pi and a
215// 64-bit x. We first need to calculate a highly accurate product
216// of x and P_1, P_2, P_3. This is best understood as integer
217// multiplication.
218//
219//
220// STEP 1. Multiplication
221// ----------------------
222//
223//
224//                     ---------   ---------   ---------
225//                    |  P_1  |   |  P_2  |   |  P_3  |
226//                    ---------   ---------   ---------
227//
228//                                            ---------
229//             X                              |   X   |
230//                                            ---------
231//      ----------------------------------------------------
232//
233//                                 ---------   ---------
234//                               |  A_hi |   |  A_lo |
235//                               ---------   ---------
236//
237//
238//                    ---------   ---------
239//                   |  B_hi |   |  B_lo |
240//                   ---------   ---------
241//
242//
243//        ---------   ---------
244//       |  C_hi |   |  C_lo |
245//       ---------   ---------
246//
247//      ====================================================
248//       ---------   ---------   ---------   ---------
249//       |  S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
250//       ---------   ---------   ---------   ---------
251//
252//
253//
254// STEP 2. Get N and f
255// -------------------
256//
257// Conceptually, after the individual pieces S_0, S_1, ..., are obtained,
258// we have to sum them and obtain an integer part, N, and a fraction, f.
259// Here, |f| <= 1/2, and N is an integer. Note also that N need only to
260// be known to module 2^k, k >= 2. In the case when |f| is small enough,
261// we would need to add in the value x*P_4.
262//
263//
264// STEP 3. Get reduced argument
265// ----------------------------
266//
267// The value f is not yet the reduced argument that we seek. The
268// equation
269//
270//       x * 2/pi = 4K  + N  + f
271//
272// says that
273//
274//         x   =  2*K*pi  + N * pi/2  +  f * (pi/2).
275//
276// Thus, the reduced argument is given by
277//
278//       reduced argument =  f * pi/2.
279//
280// This multiplication must be performed to extra precision.
281//
282// IV. Implementation
283// ==================
284//
285// Step 0. Initialization
286// ----------------------
287//
288// Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
289//
290// In memory, 2/pi is stored contiguously as
291//
292//  0x00000000 0x00000000 0xA2F....
293//                       ^
294//                       |__ implied binary bit
295//
296// Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. Thus
297// -1 <= L <= -16321. We fetch from memory 5 integer pieces of data.
298//
299// P_0 is the two bits corresponding to bit positions L+2 and L+1
300// P_1 is the 64-bit starting at bit position  L
301// P_2 is the 64-bit starting at bit position  L-64
302// P_3 is the 64-bit starting at bit position  L-128
303// P_4 is the 64-bit starting at bit position  L-192
304//
305// For example, if m = 63, P_0 would be 0 and P_1 would look like
306// 0xA2F...
307//
308// If m = 65, P_0 would be the two msb of 0xA, thus, P_0 is 10 in binary.
309// P_1 in binary would be  1 0 0 0 1 0 1 1 1 1 ....
310//
311// Step 1. Multiplication
312// ----------------------
313//
314// At this point, P_1, P_2, P_3, P_4 are integers. They are
315// supposed to be interpreted as
316//
317//  2^(L-63)     * P_1;
318//  2^(L-63-64)  * P_2;
319//  2^(L-63-128) * P_3;
320// 2^(L-63-192) * P_4;
321//
322// Since each of them need to be multiplied to x, we would scale
323// both x and the P_j's by some convenient factors: scale each
324// of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
325//
326//   p_1 := fcvt.xf ( P_1 )
327//   p_2 := fcvt.xf ( P_2 ) * 2^(-64)
328//   p_3 := fcvt.xf ( P_3 ) * 2^(-128)
329//   p_4 := fcvt.xf ( P_4 ) * 2^(-192)
330//   x   := replace exponent of x by -1
331//          because 2^m    * 1.xxxx...xxx  * 2^(L-63)
332//          is      2^(-1) * 1.xxxx...xxx
333//
334// We are now faced with the task of computing the following
335//
336//                     ---------   ---------   ---------
337//                    |  P_1  |   |  P_2  |   |  P_3  |
338//                    ---------   ---------   ---------
339//
340//                                             ---------
341//             X                              |   X   |
342//                                            ---------
343//       ----------------------------------------------------
344//
345//                                 ---------   ---------
346//                                |  A_hi |   |  A_lo |
347//                                ---------   ---------
348//
349//                     ---------   ---------
350//                    |  B_hi |   |  B_lo |
351//                    ---------   ---------
352//
353//         ---------   ---------
354//        |  C_hi |   |  C_lo |
355//        ---------   ---------
356//
357//      ====================================================
358//       -----------   ---------   ---------   ---------
359//       |    S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
360//       -----------   ---------   ---------   ---------
361//        ^          ^
362//        |          |___ binary point
363//        |
364//        |___ possibly one more bit
365//
366// Let FPSR3 be set to round towards zero with widest precision
367// and exponent range. Unless an explicit FPSR is given,
368// round-to-nearest with widest precision and exponent range is
369// used.
370//
371// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_C := 2^(-65).
372//
373// Tmp_C := fmpy.fpsr3( x, p_1 );
374// If Tmp_C >= sigma_C then
375//    C_hi := Tmp_C;
376//    C_lo := x*p_1 - C_hi ...fma, exact
377// Else
378//    C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
379//                   ...subtraction is exact, regardless
380//                   ...of rounding direction
381//    C_lo := x*p_1 - C_hi ...fma, exact
382// End If
383//
384// Tmp_B := fmpy.fpsr3( x, p_2 );
385// If Tmp_B >= sigma_B then
386//    B_hi := Tmp_B;
387//    B_lo := x*p_2 - B_hi ...fma, exact
388// Else
389//    B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
390//                   ...subtraction is exact, regardless
391//                   ...of rounding direction
392//    B_lo := x*p_2 - B_hi ...fma, exact
393// End If
394//
395// Tmp_A := fmpy.fpsr3( x, p_3 );
396// If Tmp_A >= sigma_A then
397//    A_hi := Tmp_A;
398//    A_lo := x*p_3 - A_hi ...fma, exact
399// Else
400//    A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
401//                   ...subtraction is exact, regardless
402//                   ...of rounding direction
403//    A_lo := x*p_3 - A_hi ...fma, exact
404// End If
405//
406// ...Note that C_hi is of integer value. We need only the
407// ...last few bits. Thus we can ensure C_hi is never a big
408// ...integer, freeing us from overflow worry.
409//
410// Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
411// ...Tmp_C is the upper portion of C_hi
412// C_hi := C_hi - Tmp_C
413// ...0 <= C_hi < 2^7
414//
415// Step 2. Get N and f
416// -------------------
417//
418// At this point, we have all the components to obtain
419// S_0, S_1, S_2, S_3 and thus N and f. We start by adding
420// C_lo and B_hi. This sum together with C_hi gives a good
421// estimation of N and f.
422//
423// A := fadd.fpsr3( B_hi, C_lo )
424// B := max( B_hi, C_lo )
425// b := min( B_hi, C_lo )
426//
427// a := (B - A) + b      ...exact. Note that a is either 0
428//                   ...or 2^(-64).
429//
430// N := round_to_nearest_integer_value( A );
431// f := A - N;            ...exact because lsb(A) >= 2^(-64)
432//                   ...and |f| <= 1/2.
433//
434// f := f + a            ...exact because a is 0 or 2^(-64);
435//                   ...the msb of the sum is <= 1/2
436//                   ...lsb >= 2^(-64).
437//
438// N := convert to integer format( C_hi + N );
439// M := P_0 * x_lo;
440// N := N + M;
441//
442// If sgn_x == 1 (that is original x was negative)
443// N := 2^10 - N
444// ...this maintains N to be non-negative, but still
445// ...equivalent to the (negated N) mod 4.
446// End If
447//
448// If |f| >= 2^(-33)
449//
450// ...Case 1
451// CASE := 1
452// g := A_hi + B_lo;
453// s_hi := f + g;
454// s_lo := (f - s_hi) + g;
455//
456// Else
457//
458// ...Case 2
459// CASE := 2
460// A := fadd.fpsr3( A_hi, B_lo )
461// B := max( A_hi, B_lo )
462// b := min( A_hi, B_lo )
463//
464// a := (B - A) + b      ...exact. Note that a is either 0
465//                   ...or 2^(-128).
466//
467// f_hi := A + f;
468// f_lo := (f - f_hi) + A;
469// ...this is exact.
470// ...f-f_hi is exact because either |f| >= |A|, in which
471// ...case f-f_hi is clearly exact; or otherwise, 0<|f|<|A|
472// ...means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64).
473// ...If f = 2^(-64), f-f_hi involves cancellation and is
474// ...exact. If f = -2^(-64), then A + f is exact. Hence
475// ...f-f_hi is -A exactly, giving f_lo = 0.
476//
477// f_lo := f_lo + a;
478//
479// If |f| >= 2^(-50) then
480//    s_hi := f_hi;
481//    s_lo := f_lo;
482// Else
483//    f_lo := (f_lo + A_lo) + x*p_4
484//    s_hi := f_hi + f_lo
485//    s_lo := (f_hi - s_hi) + f_lo
486// End If
487//
488// End If
489//
490// Step 3. Get reduced argument
491// ----------------------------
492//
493// If sgn_x == 0 (that is original x is positive)
494//
495// D_hi := Pi_by_2_hi
496// D_lo := Pi_by_2_lo
497// ...load from table
498//
499// Else
500//
501// D_hi := neg_Pi_by_2_hi
502// D_lo := neg_Pi_by_2_lo
503// ...load from table
504// End If
505//
506// r_hi :=  s_hi*D_hi
507// r_lo :=  s_hi*D_hi - r_hi         ...fma
508// r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
509//
510// Return  N, r_hi, r_lo
511//
512FR_input_X = f8
513FR_r_hi    = f8
514FR_r_lo    = f9
515
516FR_X       = f32
517FR_N       = f33
518FR_p_1     = f34
519FR_TWOM33  = f35
520FR_TWOM50  = f36
521FR_g       = f37
522FR_p_2     = f38
523FR_f       = f39
524FR_s_lo    = f40
525FR_p_3     = f41
526FR_f_abs   = f42
527FR_D_lo    = f43
528FR_p_4     = f44
529FR_D_hi    = f45
530FR_Tmp2_C  = f46
531FR_s_hi    = f47
532FR_sigma_A = f48
533FR_A       = f49
534FR_sigma_B = f50
535FR_B       = f51
536FR_sigma_C = f52
537FR_b       = f53
538FR_ScaleP2 = f54
539FR_ScaleP3 = f55
540FR_ScaleP4 = f56
541FR_Tmp_A   = f57
542FR_Tmp_B   = f58
543FR_Tmp_C   = f59
544FR_A_hi    = f60
545FR_f_hi    = f61
546FR_RSHF    = f62
547FR_A_lo    = f63
548FR_B_hi    = f64
549FR_a       = f65
550FR_B_lo    = f66
551FR_f_lo    = f67
552FR_N_fix   = f68
553FR_C_hi    = f69
554FR_C_lo    = f70
555
556GR_N       = r8
557GR_Exp_x   = r36
558GR_Temp    = r37
559GR_BIASL63 = r38
560GR_CASE    = r39
561GR_x_lo    = r40
562GR_sgn_x   = r41
563GR_M       = r42
564GR_BASE    = r43
565GR_LENGTH1 = r44
566GR_LENGTH2 = r45
567GR_ASUB    = r46
568GR_P_0     = r47
569GR_P_1     = r48
570GR_P_2     = r49
571GR_P_3     = r50
572GR_P_4     = r51
573GR_START   = r52
574GR_SEGMENT = r53
575GR_A       = r54
576GR_B       = r55
577GR_C       = r56
578GR_D       = r57
579GR_E       = r58
580GR_TEMP1   = r59
581GR_TEMP2   = r60
582GR_TEMP3   = r61
583GR_TEMP4   = r62
584GR_TEMP5   = r63
585GR_TEMP6   = r64
586GR_rshf    = r64
587
588RODATA
589.align 64
590
591LOCAL_OBJECT_START(Constants_Bits_of_2_by_pi)
592data8 0x0000000000000000,0xA2F9836E4E441529
593data8 0xFC2757D1F534DDC0,0xDB6295993C439041
594data8 0xFE5163ABDEBBC561,0xB7246E3A424DD2E0
595data8 0x06492EEA09D1921C,0xFE1DEB1CB129A73E
596data8 0xE88235F52EBB4484,0xE99C7026B45F7E41
597data8 0x3991D639835339F4,0x9C845F8BBDF9283B
598data8 0x1FF897FFDE05980F,0xEF2F118B5A0A6D1F
599data8 0x6D367ECF27CB09B7,0x4F463F669E5FEA2D
600data8 0x7527BAC7EBE5F17B,0x3D0739F78A5292EA
601data8 0x6BFB5FB11F8D5D08,0x56033046FC7B6BAB
602data8 0xF0CFBC209AF4361D,0xA9E391615EE61B08
603data8 0x6599855F14A06840,0x8DFFD8804D732731
604data8 0x06061556CA73A8C9,0x60E27BC08C6B47C4
605data8 0x19C367CDDCE8092A,0x8359C4768B961CA6
606data8 0xDDAF44D15719053E,0xA5FF07053F7E33E8
607data8 0x32C2DE4F98327DBB,0xC33D26EF6B1E5EF8
608data8 0x9F3A1F35CAF27F1D,0x87F121907C7C246A
609data8 0xFA6ED5772D30433B,0x15C614B59D19C3C2
610data8 0xC4AD414D2C5D000C,0x467D862D71E39AC6
611data8 0x9B0062337CD2B497,0xA7B4D55537F63ED7
612data8 0x1810A3FC764D2A9D,0x64ABD770F87C6357
613data8 0xB07AE715175649C0,0xD9D63B3884A7CB23
614data8 0x24778AD623545AB9,0x1F001B0AF1DFCE19
615data8 0xFF319F6A1E666157,0x9947FBACD87F7EB7
616data8 0x652289E83260BFE6,0xCDC4EF09366CD43F
617data8 0x5DD7DE16DE3B5892,0x9BDE2822D2E88628
618data8 0x4D58E232CAC616E3,0x08CB7DE050C017A7
619data8 0x1DF35BE01834132E,0x6212830148835B8E
620data8 0xF57FB0ADF2E91E43,0x4A48D36710D8DDAA
621data8 0x425FAECE616AA428,0x0AB499D3F2A6067F
622data8 0x775C83C2A3883C61,0x78738A5A8CAFBDD7
623data8 0x6F63A62DCBBFF4EF,0x818D67C12645CA55
624data8 0x36D9CAD2A8288D61,0xC277C9121426049B
625data8 0x4612C459C444C5C8,0x91B24DF31700AD43
626data8 0xD4E5492910D5FDFC,0xBE00CC941EEECE70
627data8 0xF53E1380F1ECC3E7,0xB328F8C79405933E
628data8 0x71C1B3092EF3450B,0x9C12887B20AB9FB5
629data8 0x2EC292472F327B6D,0x550C90A7721FE76B
630data8 0x96CB314A1679E279,0x4189DFF49794E884
631data8 0xE6E29731996BED88,0x365F5F0EFDBBB49A
632data8 0x486CA46742727132,0x5D8DB8159F09E5BC
633data8 0x25318D3974F71C05,0x30010C0D68084B58
634data8 0xEE2C90AA4702E774,0x24D6BDA67DF77248
635data8 0x6EEF169FA6948EF6,0x91B45153D1F20ACF
636data8 0x3398207E4BF56863,0xB25F3EDD035D407F
637data8 0x8985295255C06437,0x10D86D324832754C
638data8 0x5BD4714E6E5445C1,0x090B69F52AD56614
639data8 0x9D072750045DDB3B,0xB4C576EA17F9877D
640data8 0x6B49BA271D296996,0xACCCC65414AD6AE2
641data8 0x9089D98850722CBE,0xA4049407777030F3
642data8 0x27FC00A871EA49C2,0x663DE06483DD9797
643data8 0x3FA3FD94438C860D,0xDE41319D39928C70
644data8 0xDDE7B7173BDF082B,0x3715A0805C93805A
645data8 0x921110D8E80FAF80,0x6C4BFFDB0F903876
646data8 0x185915A562BBCB61,0xB989C7BD401004F2
647data8 0xD2277549F6B6EBBB,0x22DBAA140A2F2689
648data8 0x768364333B091A94,0x0EAA3A51C2A31DAE
649data8 0xEDAF12265C4DC26D,0x9C7A2D9756C0833F
650data8 0x03F6F0098C402B99,0x316D07B43915200C
651data8 0x5BC3D8C492F54BAD,0xC6A5CA4ECD37A736
652data8 0xA9E69492AB6842DD,0xDE6319EF8C76528B
653data8 0x6837DBFCABA1AE31,0x15DFA1AE00DAFB0C
654data8 0x664D64B705ED3065,0x29BF56573AFF47B9
655data8 0xF96AF3BE75DF9328,0x3080ABF68C6615CB
656data8 0x040622FA1DE4D9A4,0xB33D8F1B5709CD36
657data8 0xE9424EA4BE13B523,0x331AAAF0A8654FA5
658data8 0xC1D20F3F0BCD785B,0x76F923048B7B7217
659data8 0x8953A6C6E26E6F00,0xEBEF584A9BB7DAC4
660data8 0xBA66AACFCF761D02,0xD12DF1B1C1998C77
661data8 0xADC3DA4886A05DF7,0xF480C62FF0AC9AEC
662data8 0xDDBC5C3F6DDED01F,0xC790B6DB2A3A25A3
663data8 0x9AAF009353AD0457,0xB6B42D297E804BA7
664data8 0x07DA0EAA76A1597B,0x2A12162DB7DCFDE5
665data8 0xFAFEDB89FDBE896C,0x76E4FCA90670803E
666data8 0x156E85FF87FD073E,0x2833676186182AEA
667data8 0xBD4DAFE7B36E6D8F,0x3967955BBF3148D7
668data8 0x8416DF30432DC735,0x6125CE70C9B8CB30
669data8 0xFD6CBFA200A4E46C,0x05A0DD5A476F21D2
670data8 0x1262845CB9496170,0xE0566B0152993755
671data8 0x50B7D51EC4F1335F,0x6E13E4305DA92E85
672data8 0xC3B21D3632A1A4B7,0x08D4B1EA21F716E4
673data8 0x698F77FF2780030C,0x2D408DA0CD4F99A5
674data8 0x20D3A2B30A5D2F42,0xF9B4CBDA11D0BE7D
675data8 0xC1DB9BBD17AB81A2,0xCA5C6A0817552E55
676data8 0x0027F0147F8607E1,0x640B148D4196DEBE
677data8 0x872AFDDAB6256B34,0x897BFEF3059EBFB9
678data8 0x4F6A68A82A4A5AC4,0x4FBCF82D985AD795
679data8 0xC7F48D4D0DA63A20,0x5F57A4B13F149538
680data8 0x800120CC86DD71B6,0xDEC9F560BF11654D
681data8 0x6B0701ACB08CD0C0,0xB24855510EFB1EC3
682data8 0x72953B06A33540C0,0x7BDC06CC45E0FA29
683data8 0x4EC8CAD641F3E8DE,0x647CD8649B31BED9
684data8 0xC397A4D45877C5E3,0x6913DAF03C3ABA46
685data8 0x18465F7555F5BDD2,0xC6926E5D2EACED44
686data8 0x0E423E1C87C461E9,0xFD29F3D6E7CA7C22
687data8 0x35916FC5E0088DD7,0xFFE26A6EC6FDB0C1
688data8 0x0893745D7CB2AD6B,0x9D6ECD7B723E6A11
689data8 0xC6A9CFF7DF7329BA,0xC9B55100B70DB2E2
690data8 0x24BA74607DE58AD8,0x742C150D0C188194
691data8 0x667E162901767A9F,0xBEFDFDEF4556367E
692data8 0xD913D9ECB9BA8BFC,0x97C427A831C36EF1
693data8 0x36C59456A8D8B5A8,0xB40ECCCF2D891234
694data8 0x576F89562CE3CE99,0xB920D6AA5E6B9C2A
695data8 0x3ECC5F114A0BFDFB,0xF4E16D3B8E2C86E2
696data8 0x84D4E9A9B4FCD1EE,0xEFC9352E61392F44
697data8 0x2138C8D91B0AFC81,0x6A4AFBD81C2F84B4
698data8 0x538C994ECC2254DC,0x552AD6C6C096190B
699data8 0xB8701A649569605A,0x26EE523F0F117F11
700data8 0xB5F4F5CBFC2DBC34,0xEEBC34CC5DE8605E
701data8 0xDD9B8E67EF3392B8,0x17C99B5861BC57E1
702data8 0xC68351103ED84871,0xDDDD1C2DA118AF46
703data8 0x2C21D7F359987AD9,0xC0549EFA864FFC06
704data8 0x56AE79E536228922,0xAD38DC9367AAE855
705data8 0x3826829BE7CAA40D,0x51B133990ED7A948
706data8 0x0569F0B265A7887F,0x974C8836D1F9B392
707data8 0x214A827B21CF98DC,0x9F405547DC3A74E1
708data8 0x42EB67DF9DFE5FD4,0x5EA4677B7AACBAA2
709data8 0xF65523882B55BA41,0x086E59862A218347
710data8 0x39E6E389D49EE540,0xFB49E956FFCA0F1C
711data8 0x8A59C52BFA94C5C1,0xD3CFC50FAE5ADB86
712data8 0xC5476243853B8621,0x94792C8761107B4C
713data8 0x2A1A2C8012BF4390,0x2688893C78E4C4A8
714data8 0x7BDBE5C23AC4EAF4,0x268A67F7BF920D2B
715data8 0xA365B1933D0B7CBD,0xDC51A463DD27DDE1
716data8 0x6919949A9529A828,0xCE68B4ED09209F44
717data8 0xCA984E638270237C,0x7E32B90F8EF5A7E7
718data8 0x561408F1212A9DB5,0x4D7E6F5119A5ABF9
719data8 0xB5D6DF8261DD9602,0x36169F3AC4A1A283
720data8 0x6DED727A8D39A9B8,0x825C326B5B2746ED
721data8 0x34007700D255F4FC,0x4D59018071E0E13F
722data8 0x89B295F364A8F1AE,0xA74B38FC4CEAB2BB
723LOCAL_OBJECT_END(Constants_Bits_of_2_by_pi)
724
725LOCAL_OBJECT_START(Constants_Bits_of_pi_by_2)
726data8 0xC90FDAA22168C234,0x00003FFF
727data8 0xC4C6628B80DC1CD1,0x00003FBF
728LOCAL_OBJECT_END(Constants_Bits_of_pi_by_2)
729
730.section .text
731.global __libm_pi_by_2_reduce#
732.proc __libm_pi_by_2_reduce#
733.align 32
734
735__libm_pi_by_2_reduce:
736
737//    X is in f8
738//    Place the two-piece result r (r_hi) in f8 and c (r_lo) in f9
739//    N is returned in r8
740
741{ .mfi
742      alloc  r34 = ar.pfs,2,34,0,0
743      fsetc.s3 0x00,0x7F     // Set sf3 to round to zero, 82-bit prec, td, ftz
744      nop.i 999
745}
746{ .mfi
747      addl           GR_BASE   = @ltoff(Constants_Bits_of_2_by_pi#), gp
748      nop.f 999
749      mov GR_BIASL63 = 0x1003E
750}
751;;
752
753
754//    L         -1-2-3-4
755//    0 0 0 0 0. 1 0 1 0
756//    M          0 1 2 .... 63, 64 65 ... 127, 128
757//     ---------------------------------------------
758//    Segment 0.        1     ,      2       ,    3
759//    START = M - 63                        M = 128 becomes 65
760//    LENGTH1  = START & 0x3F               65 become position 1
761//    SEGMENT  = shr(START,6) + 1      0 maps to 1,   64 maps to 2,
762//    LENGTH2  = 64 - LENGTH1
763//    Address_BASE = shladd(SEGMENT,3) + BASE
764
765
766{ .mmi
767      getf.exp GR_Exp_x = FR_input_X
768      ld8 GR_BASE = [GR_BASE]
769      mov GR_TEMP5 = 0x0FFFE
770}
771;;
772
773//    Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65).
774{ .mmi
775      getf.sig GR_x_lo = FR_input_X
776      mov GR_TEMP6 = 0x0FFBE
777      nop.i 999
778}
779;;
780
781//    Special Code for testing DE arguments
782//          movl GR_BIASL63 = 0x0000000000013FFE
783//          movl GR_x_lo = 0xFFFFFFFFFFFFFFFF
784//          setf.exp FR_X = GR_BIASL63
785//          setf.sig FR_ScaleP3 = GR_x_lo
786//          fmerge.se FR_X = FR_X,FR_ScaleP3
787//    Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
788//    2/pi is stored contiguously as
789//    0x00000000 0x00000000.0xA2F....
790//    M = EXP - BIAS  ( M >= 63)
791//    Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m.
792//    Thus -1 <= L <= -16321.
793{ .mmi
794      setf.exp FR_sigma_B = GR_TEMP5
795      setf.exp FR_sigma_A = GR_TEMP6
796      extr.u GR_M = GR_Exp_x,0,17
797}
798;;
799
800{ .mii
801      and  GR_x_lo = 0x03,GR_x_lo
802      sub  GR_START = GR_M,GR_BIASL63
803      add  GR_BASE = 8,GR_BASE           // To effectively add 1 to SEGMENT
804}
805;;
806
807{ .mii
808      and  GR_LENGTH1 = 0x3F,GR_START
809      shr.u  GR_SEGMENT = GR_START,6
810      nop.i 999
811}
812;;
813
814{ .mmi
815      shladd GR_BASE = GR_SEGMENT,3,GR_BASE
816      sub  GR_LENGTH2 = 0x40,GR_LENGTH1
817      cmp.le p6,p7 = 0x2,GR_LENGTH1
818}
819;;
820
821//    P_0 is the two bits corresponding to bit positions L+2 and L+1
822//    P_1 is the 64-bit starting at bit position  L
823//    P_2 is the 64-bit starting at bit position  L-64
824//    P_3 is the 64-bit starting at bit position  L-128
825//    P_4 is the 64-bit starting at bit position  L-192
826//    P_1 is made up of Alo and Bhi
827//    P_1 = deposit Alo, position 0, length2  into P_1,position length1
828//          deposit Bhi, position length2, length1 into P_1, position 0
829//    P_2 is made up of Blo and Chi
830//    P_2 = deposit Blo, position 0, length2  into P_2, position length1
831//          deposit Chi, position length2, length1 into P_2, position 0
832//    P_3 is made up of Clo and Dhi
833//    P_3 = deposit Clo, position 0, length2  into P_3, position length1
834//          deposit Dhi, position length2, length1 into P_3, position 0
835//    P_4 is made up of Clo and Dhi
836//    P_4 = deposit Dlo, position 0, length2  into P_4, position length1
837//          deposit Ehi, position length2, length1 into P_4, position 0
838{ .mfi
839      ld8 GR_A = [GR_BASE],8
840      fabs FR_X = FR_input_X
841(p7)  cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1
842}
843;;
844
845//    ld_64 A at Base and increment Base by 8
846//    ld_64 B at Base and increment Base by 8
847//    ld_64 C at Base and increment Base by 8
848//    ld_64 D at Base and increment Base by 8
849//    ld_64 E at Base and increment Base by 8
850//                                          A/B/C/D
851//                                    ---------------------
852//    A, B, C, D, and E look like    | length1 | length2   |
853//                                    ---------------------
854//                                       hi        lo
855{ .mlx
856      ld8 GR_B = [GR_BASE],8
857      movl GR_rshf = 0x43e8000000000000   // 1.10000 2^63 for right shift N_fix
858}
859;;
860
861{ .mmi
862      ld8 GR_C = [GR_BASE],8
863      nop.m 999
864(p8)  extr.u GR_Temp = GR_A,63,1
865}
866;;
867
868//    If length1 >= 2,
869//       P_0 = deposit Ahi, position length2, 2 bit into P_0 at position 0.
870{ .mii
871      ld8 GR_D = [GR_BASE],8
872      shl GR_TEMP1 = GR_A,GR_LENGTH1   // MM instruction
873(p6)  shr.u GR_P_0 = GR_A,GR_LENGTH2   // MM instruction
874}
875;;
876
877{ .mii
878      ld8 GR_E = [GR_BASE],-40
879      shl GR_TEMP2 = GR_B,GR_LENGTH1   // MM instruction
880      shr.u GR_P_1 = GR_B,GR_LENGTH2   // MM instruction
881}
882;;
883
884//    Else
885//       Load 16 bit of ASUB from (Base_Address_of_A - 2)
886//       P_0 = ASUB & 0x3
887//       If length1 == 0,
888//          P_0 complete
889//       Else
890//          Deposit element 63 from Ahi and place in element 0 of P_0.
891//       Endif
892//    Endif
893
894{ .mii
895(p7)  ld2 GR_ASUB = [GR_BASE],8
896      shl GR_TEMP3 = GR_C,GR_LENGTH1   // MM instruction
897      shr.u GR_P_2 = GR_C,GR_LENGTH2   // MM instruction
898}
899;;
900
901{ .mii
902      setf.d FR_RSHF = GR_rshf         // Form right shift const 1.100 * 2^63
903      shl GR_TEMP4 = GR_D,GR_LENGTH1   // MM instruction
904      shr.u GR_P_3 = GR_D,GR_LENGTH2   // MM instruction
905}
906;;
907
908{ .mmi
909(p7)  and GR_P_0 = 0x03,GR_ASUB
910(p6)  and GR_P_0 = 0x03,GR_P_0
911      shr.u GR_P_4 = GR_E,GR_LENGTH2   // MM instruction
912}
913;;
914
915{ .mmi
916      nop.m 999
917      or GR_P_1 = GR_P_1,GR_TEMP1
918(p8)  and GR_P_0 = 0x1,GR_P_0
919}
920;;
921
922{ .mmi
923      setf.sig FR_p_1 = GR_P_1
924      or GR_P_2 = GR_P_2,GR_TEMP2
925(p8)  shladd GR_P_0 = GR_P_0,1,GR_Temp
926}
927;;
928
929{ .mmf
930      setf.sig FR_p_2 = GR_P_2
931      or GR_P_3 = GR_P_3,GR_TEMP3
932      fmerge.se FR_X = FR_sigma_B,FR_X
933}
934;;
935
936{ .mmi
937      setf.sig FR_p_3 = GR_P_3
938      or GR_P_4 = GR_P_4,GR_TEMP4
939      pmpy2.r GR_M = GR_P_0,GR_x_lo
940}
941;;
942
943//    P_1, P_2, P_3, P_4 are integers. They should be
944//    2^(L-63)     * P_1;
945//    2^(L-63-64)  * P_2;
946//    2^(L-63-128) * P_3;
947//    2^(L-63-192) * P_4;
948//    Since each of them need to be multiplied to x, we would scale
949//    both x and the P_j's by some convenient factors: scale each
950//    of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
951//    p_1 := fcvt.xf ( P_1 )
952//    p_2 := fcvt.xf ( P_2 ) * 2^(-64)
953//    p_3 := fcvt.xf ( P_3 ) * 2^(-128)
954//    p_4 := fcvt.xf ( P_4 ) * 2^(-192)
955//    x= Set x's exp to -1 because 2^m*1.x...x *2^(L-63)=2^(-1)*1.x...xxx
956//             ---------   ---------   ---------
957//             |  P_1  |   |  P_2  |   |  P_3  |
958//             ---------   ---------   ---------
959//                                           ---------
960//            X                              |   X   |
961//                                           ---------
962//      ----------------------------------------------------
963//                               ---------   ---------
964//                               |  A_hi |   |  A_lo |
965//                               ---------   ---------
966//                   ---------   ---------
967//                   |  B_hi |   |  B_lo |
968//                   ---------   ---------
969//       ---------   ---------
970//       |  C_hi |   |  C_lo |
971//       ---------   ---------
972//     ====================================================
973//    -----------   ---------   ---------   ---------
974//    |    S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
975//    -----------   ---------   ---------   ---------
976//    |            |___ binary point
977//    |___ possibly one more bit
978//
979//    Let FPSR3 be set to round towards zero with widest precision
980//    and exponent range. Unless an explicit FPSR is given,
981//    round-to-nearest with widest precision and exponent range is
982//    used.
983{ .mmi
984      setf.sig FR_p_4 = GR_P_4
985      mov GR_TEMP1 = 0x0FFBF
986      nop.i 999
987}
988;;
989
990{ .mmi
991      setf.exp FR_ScaleP2 = GR_TEMP1
992      mov GR_TEMP2 = 0x0FF7F
993      nop.i 999
994}
995;;
996
997{ .mmi
998      setf.exp FR_ScaleP3 = GR_TEMP2
999      mov GR_TEMP4 = 0x1003E
1000      nop.i 999
1001}
1002;;
1003
1004{ .mmf
1005      setf.exp FR_sigma_C = GR_TEMP4
1006      mov GR_Temp = 0x0FFDE
1007      fcvt.xuf.s1 FR_p_1 = FR_p_1
1008}
1009;;
1010
1011{ .mfi
1012      setf.exp FR_TWOM33 = GR_Temp
1013      fcvt.xuf.s1 FR_p_2 = FR_p_2
1014      nop.i 999
1015}
1016;;
1017
1018{ .mfi
1019      nop.m 999
1020      fcvt.xuf.s1 FR_p_3 = FR_p_3
1021      nop.i 999
1022}
1023;;
1024
1025{ .mfi
1026      nop.m 999
1027      fcvt.xuf.s1 FR_p_4 = FR_p_4
1028      nop.i 999
1029}
1030;;
1031
1032//    Tmp_C := fmpy.fpsr3( x, p_1 );
1033//    Tmp_B := fmpy.fpsr3( x, p_2 );
1034//    Tmp_A := fmpy.fpsr3( x, p_3 );
1035//    If Tmp_C >= sigma_C then
1036//      C_hi := Tmp_C;
1037//      C_lo := x*p_1 - C_hi ...fma, exact
1038//    Else
1039//      C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
1040//      C_lo := x*p_1 - C_hi ...fma, exact
1041//    End If
1042//    If Tmp_B >= sigma_B then
1043//      B_hi := Tmp_B;
1044//      B_lo := x*p_2 - B_hi ...fma, exact
1045//    Else
1046//      B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
1047//      B_lo := x*p_2 - B_hi ...fma, exact
1048//    End If
1049//    If Tmp_A >= sigma_A then
1050//      A_hi := Tmp_A;
1051//      A_lo := x*p_3 - A_hi ...fma, exact
1052//    Else
1053//      A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
1054//      Exact, regardless ...of rounding direction
1055//      A_lo := x*p_3 - A_hi ...fma, exact
1056//    Endif
1057{ .mfi
1058      nop.m 999
1059      fmpy.s3 FR_Tmp_C = FR_X,FR_p_1
1060      nop.i 999
1061}
1062;;
1063
1064{ .mfi
1065      mov GR_TEMP3 = 0x0FF3F
1066      fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2
1067      nop.i 999
1068}
1069;;
1070
1071{ .mmf
1072      setf.exp FR_ScaleP4 = GR_TEMP3
1073      mov GR_TEMP4 = 0x10045
1074      fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3
1075}
1076;;
1077
1078{ .mfi
1079      nop.m 999
1080      fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C   // For Tmp_C < sigma_C case
1081      nop.i 999
1082}
1083;;
1084
1085{ .mmf
1086      setf.exp FR_Tmp2_C = GR_TEMP4
1087      nop.m 999
1088      fmpy.s3 FR_Tmp_B = FR_X,FR_p_2
1089}
1090;;
1091
1092{ .mfi
1093      addl           GR_BASE   = @ltoff(Constants_Bits_of_pi_by_2#), gp
1094      fcmp.ge.s1 p12,  p9 = FR_Tmp_C,FR_sigma_C
1095      nop.i 999
1096}
1097{ .mfi
1098      nop.m 999
1099      fmpy.s3 FR_Tmp_A = FR_X,FR_p_3
1100      nop.i 99
1101}
1102;;
1103
1104{ .mfi
1105      ld8 GR_BASE = [GR_BASE]
1106(p12) mov FR_C_hi = FR_Tmp_C
1107      nop.i 999
1108}
1109{ .mfi
1110      nop.m 999
1111(p9)  fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C
1112      nop.i 999
1113}
1114;;
1115
1116
1117
1118//   End If
1119//   Step 3. Get reduced argument
1120//   If sgn_x == 0 (that is original x is positive)
1121//      D_hi := Pi_by_2_hi
1122//      D_lo := Pi_by_2_lo
1123//      Load from table
1124//   Else
1125//      D_hi := neg_Pi_by_2_hi
1126//      D_lo := neg_Pi_by_2_lo
1127//      Load from table
1128//   End If
1129
1130{ .mfi
1131      nop.m 999
1132      fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4
1133      nop.i 999
1134}
1135{ .mfi
1136      nop.m 999
1137      fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B     // For Tmp_B < sigma_B case
1138      nop.i 999
1139}
1140;;
1141
1142{ .mfi
1143      nop.m 999
1144      fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A     // For Tmp_A < sigma_A case
1145      nop.i 999
1146}
1147;;
1148
1149{ .mfi
1150      nop.m 999
1151      fcmp.ge.s1 p13, p10 = FR_Tmp_B,FR_sigma_B
1152      nop.i 999
1153}
1154{ .mfi
1155      nop.m 999
1156      fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
1157      nop.i 999
1158}
1159;;
1160
1161{ .mfi
1162      ldfe FR_D_hi = [GR_BASE],16
1163      fcmp.ge.s1 p14, p11 = FR_Tmp_A,FR_sigma_A
1164      nop.i 999
1165}
1166;;
1167
1168{ .mfi
1169      ldfe FR_D_lo = [GR_BASE]
1170(p13) mov FR_B_hi = FR_Tmp_B
1171      nop.i 999
1172}
1173{ .mfi
1174      nop.m 999
1175(p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B
1176      nop.i 999
1177}
1178;;
1179
1180{ .mfi
1181      nop.m 999
1182(p14) mov FR_A_hi = FR_Tmp_A
1183      nop.i 999
1184}
1185{ .mfi
1186      nop.m 999
1187(p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A
1188      nop.i 999
1189}
1190;;
1191
1192//    Note that C_hi is of integer value. We need only the
1193//    last few bits. Thus we can ensure C_hi is never a big
1194//    integer, freeing us from overflow worry.
1195//    Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
1196//    Tmp_C is the upper portion of C_hi
1197{ .mfi
1198      nop.m 999
1199      fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C
1200      tbit.z p12,p9 = GR_Exp_x, 17
1201}
1202;;
1203
1204{ .mfi
1205      nop.m 999
1206      fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
1207      nop.i 999
1208}
1209{ .mfi
1210      nop.m 999
1211      fadd.s3 FR_A = FR_B_hi,FR_C_lo
1212      nop.i 999
1213}
1214;;
1215
1216{ .mfi
1217      nop.m 999
1218      fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
1219      nop.i 999
1220}
1221;;
1222
1223{ .mfi
1224      nop.m 999
1225      fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C
1226      nop.i 999
1227}
1228;;
1229
1230//    *******************
1231//    Step 2. Get N and f
1232//    *******************
1233//    We have all the components to obtain
1234//    S_0, S_1, S_2, S_3 and thus N and f. We start by adding
1235//    C_lo and B_hi. This sum together with C_hi estimates
1236//    N and f well.
1237//    A := fadd.fpsr3( B_hi, C_lo )
1238//    B := max( B_hi, C_lo )
1239//    b := min( B_hi, C_lo )
1240{ .mfi
1241      nop.m 999
1242      fmax.s1 FR_B = FR_B_hi,FR_C_lo
1243      nop.i 999
1244}
1245;;
1246
1247// We use a right-shift trick to get the integer part of A into the rightmost
1248// bits of the significand by adding 1.1000..00 * 2^63.  This operation is good
1249// if |A| < 2^61, which it is in this case.  We are doing this to save a few
1250// cycles over using fcvt.fx followed by fnorm.  The second step of the trick
1251// is to subtract the same constant to float the rounded integer into a fp reg.
1252
1253{ .mfi
1254      nop.m 999
1255//    N := round_to_nearest_integer_value( A );
1256      fma.s1 FR_N_fix = FR_A, f1, FR_RSHF
1257      nop.i 999
1258}
1259;;
1260
1261{ .mfi
1262      nop.m 999
1263      fmin.s1 FR_b = FR_B_hi,FR_C_lo
1264      nop.i 999
1265}
1266{ .mfi
1267      nop.m 999
1268//    C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7
1269      fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C
1270      nop.i 999
1271}
1272;;
1273
1274{ .mfi
1275      nop.m 999
1276//    a := (B - A) + b: Exact - note that a is either 0 or 2^(-64).
1277      fsub.s1 FR_a = FR_B,FR_A
1278      nop.i 999
1279}
1280;;
1281
1282{ .mfi
1283      nop.m 999
1284      fms.s1 FR_N = FR_N_fix, f1, FR_RSHF
1285      nop.i 999
1286}
1287;;
1288
1289{ .mfi
1290      nop.m 999
1291      fadd.s1 FR_a = FR_a,FR_b
1292      nop.i 999
1293}
1294;;
1295
1296//    f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2.
1297//    N := convert to integer format( C_hi + N );
1298//    M := P_0 * x_lo;
1299//    N := N + M;
1300{ .mfi
1301      nop.m 999
1302      fsub.s1 FR_f = FR_A,FR_N
1303      nop.i 999
1304}
1305{ .mfi
1306      nop.m 999
1307      fadd.s1 FR_N = FR_N,FR_C_hi
1308      nop.i 999
1309}
1310;;
1311
1312{ .mfi
1313      nop.m 999
1314(p9)  fsub.s1 FR_D_hi = f0, FR_D_hi
1315      nop.i 999
1316}
1317{ .mfi
1318      nop.m 999
1319(p9)  fsub.s1 FR_D_lo = f0, FR_D_lo
1320      nop.i 999
1321}
1322;;
1323
1324{ .mfi
1325      nop.m 999
1326      fadd.s1 FR_g = FR_A_hi,FR_B_lo          // For Case 1, g=A_hi+B_lo
1327      nop.i 999
1328}
1329{ .mfi
1330      nop.m 999
1331      fadd.s3 FR_A = FR_A_hi,FR_B_lo          // For Case 2, A=A_hi+B_lo w/ sf3
1332      nop.i 999
1333}
1334;;
1335
1336{ .mfi
1337      mov GR_Temp = 0x0FFCD                   // For Case 2, exponent of 2^-50
1338      fmax.s1 FR_B = FR_A_hi,FR_B_lo          // For Case 2, B=max(A_hi,B_lo)
1339      nop.i 999
1340}
1341;;
1342
1343//    f = f + a      Exact because a is 0 or 2^(-64);
1344//    the msb of the sum is <= 1/2 and lsb >= 2^(-64).
1345{ .mfi
1346      setf.exp FR_TWOM50 = GR_Temp            // For Case 2, form 2^-50
1347      fcvt.fx.s1 FR_N = FR_N
1348      nop.i 999
1349}
1350{ .mfi
1351      nop.m 999
1352      fadd.s1 FR_f = FR_f,FR_a
1353      nop.i 999
1354}
1355;;
1356
1357{ .mfi
1358      nop.m 999
1359      fmin.s1 FR_b = FR_A_hi,FR_B_lo          // For Case 2, b=min(A_hi,B_lo)
1360      nop.i 999
1361}
1362;;
1363
1364{ .mfi
1365      nop.m 999
1366      fsub.s1 FR_a = FR_B,FR_A                // For Case 2, a=B-A
1367      nop.i 999
1368}
1369;;
1370
1371{ .mfi
1372      nop.m 999
1373      fadd.s1 FR_s_hi = FR_f,FR_g             // For Case 1, s_hi=f+g
1374      nop.i 999
1375}
1376{ .mfi
1377      nop.m 999
1378      fadd.s1 FR_f_hi = FR_A,FR_f             // For Case 2, f_hi=A+f
1379      nop.i 999
1380}
1381;;
1382
1383{ .mfi
1384      nop.m 999
1385      fabs FR_f_abs = FR_f
1386      nop.i 999
1387}
1388;;
1389
1390{ .mfi
1391      getf.sig GR_N = FR_N
1392      fsetc.s3 0x7F,0x40                 // Reset sf3 to user settings + td
1393      nop.i 999
1394}
1395;;
1396
1397{ .mfi
1398      nop.m 999
1399      fsub.s1 FR_s_lo = FR_f,FR_s_hi          // For Case 1, s_lo=f-s_hi
1400      nop.i 999
1401}
1402{ .mfi
1403      nop.m 999
1404      fsub.s1 FR_f_lo = FR_f,FR_f_hi          // For Case 2, f_lo=f-f_hi
1405      nop.i 999
1406}
1407;;
1408
1409{ .mfi
1410      nop.m 999
1411      fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi       // For Case 1, r_hi=s_hi*D_hi
1412      nop.i 999
1413}
1414{ .mfi
1415      nop.m 999
1416      fadd.s1 FR_a = FR_a,FR_b                // For Case 2, a=a+b
1417      nop.i 999
1418}
1419;;
1420
1421
1422//    If sgn_x == 1 (that is original x was negative)
1423//       N := 2^10 - N
1424//       this maintains N to be non-negative, but still
1425//       equivalent to the (negated N) mod 4.
1426//    End If
1427{ .mfi
1428      add GR_N = GR_N,GR_M
1429      fcmp.ge.s1 p13, p10 = FR_f_abs,FR_TWOM33
1430      mov GR_Temp = 0x00400
1431}
1432;;
1433
1434{ .mfi
1435(p9)  sub GR_N = GR_Temp,GR_N
1436      fadd.s1 FR_s_lo = FR_s_lo,FR_g           // For Case 1, s_lo=s_lo+g
1437      nop.i 999
1438}
1439{ .mfi
1440      nop.m 999
1441      fadd.s1 FR_f_lo = FR_f_lo,FR_A           // For Case 2, f_lo=f_lo+A
1442      nop.i 999
1443}
1444;;
1445
1446//       a := (B - A) + b      Exact.
1447//       Note that a is either 0 or 2^(-128).
1448//       f_hi := A + f;
1449//       f_lo := (f - f_hi) + A
1450//       f_lo=f-f_hi is exact because either |f| >= |A|, in which
1451//       case f-f_hi is clearly exact; or otherwise, 0<|f|<|A|
1452//       means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64).
1453//       If f = 2^(-64), f-f_hi involves cancellation and is
1454//       exact. If f = -2^(-64), then A + f is exact. Hence
1455//       f-f_hi is -A exactly, giving f_lo = 0.
1456//       f_lo := f_lo + a;
1457
1458//    If |f| >= 2^(-33)
1459//       Case 1
1460//       CASE := 1
1461//       g := A_hi + B_lo;
1462//       s_hi := f + g;
1463//       s_lo := (f - s_hi) + g;
1464//   Else
1465//       Case 2
1466//       CASE := 2
1467//       A := fadd.fpsr3( A_hi, B_lo )
1468//       B := max( A_hi, B_lo )
1469//       b := min( A_hi, B_lo )
1470
1471{ .mfi
1472      nop.m 999
1473(p10) fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50
1474      nop.i 999
1475}
1476{ .mfi
1477      nop.m 999
1478(p13) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi //For Case 1, r_lo=s_hi*D_hi+r_hi
1479      nop.i 999
1480}
1481;;
1482
1483//       If |f| >= 2^(-50) then
1484//          s_hi := f_hi;
1485//          s_lo := f_lo;
1486//       Else
1487//          f_lo := (f_lo + A_lo) + x*p_4
1488//          s_hi := f_hi + f_lo
1489//          s_lo := (f_hi - s_hi) + f_lo
1490//       End If
1491{ .mfi
1492      nop.m 999
1493(p14) mov FR_s_hi = FR_f_hi
1494      nop.i 999
1495}
1496{ .mfi
1497      nop.m 999
1498(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_a
1499      nop.i 999
1500}
1501;;
1502
1503{ .mfi
1504      nop.m 999
1505(p14) mov FR_s_lo = FR_f_lo
1506      nop.i 999
1507}
1508{ .mfi
1509      nop.m 999
1510(p11) fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo
1511      nop.i 999
1512}
1513;;
1514
1515{ .mfi
1516      nop.m 999
1517(p11) fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo
1518      nop.i 999
1519}
1520;;
1521
1522{ .mfi
1523      nop.m 999
1524(p13) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo //For Case 1, r_lo=s_hi*D_lo+r_lo
1525      nop.i 999
1526}
1527{ .mfi
1528      nop.m 999
1529(p11) fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo
1530      nop.i 999
1531}
1532;;
1533
1534//   r_hi :=  s_hi*D_hi
1535//   r_lo :=  s_hi*D_hi - r_hi  with fma
1536//   r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
1537{ .mfi
1538      nop.m 999
1539(p10) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi
1540      nop.i 999
1541}
1542{ .mfi
1543      nop.m 999
1544(p11) fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi
1545      nop.i 999
1546}
1547;;
1548
1549{ .mfi
1550      nop.m 999
1551(p10) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi
1552      nop.i 999
1553}
1554{ .mfi
1555      nop.m 999
1556(p11) fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo
1557      nop.i 999
1558}
1559;;
1560
1561{ .mfi
1562      nop.m 999
1563(p10) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo
1564      nop.i 999
1565}
1566;;
1567
1568//   Return  N, r_hi, r_lo
1569//   We do not return CASE
1570{ .mfb
1571      nop.m 999
1572      fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo
1573      br.ret.sptk   b0
1574}
1575;;
1576
1577.endp __libm_pi_by_2_reduce#
1578