1.file "sqrtl.s"
2
3
4// Copyright (c) 2000 - 2003, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39//********************************************************************
40//
41// History:
42// 02/02/00 (hand-optimized)
43// 04/04/00 Unwind support added
44// 08/15/00 Bundle added after call to __libm_error_support to properly
45//          set [the previously overwritten] GR_Parameter_RESULT.
46// 05/20/02 Cleaned up namespace and sf0 syntax
47// 02/10/03 Reordered header: .section, .global, .proc, .align
48//
49//********************************************************************
50//
51// Function:   Combined sqrtl(x), where
52//                         _
53//             sqrtl(x) = |x, for double-extended precision x values
54//
55//********************************************************************
56//
57// Resources Used:
58//
59//    Floating-Point Registers: f8  (Input and Return Value)
60//                              f7 -f14
61//
62//    General Purpose Registers:
63//      r32-r36 (Locals)
64//      r37-r40 (Used to pass arguments to error handling routine)
65//
66//    Predicate Registers:      p6, p7, p8
67//
68//********************************************************************
69//
70// IEEE Special Conditions:
71//
72//    All faults and exceptions should be raised correctly.
73//    sqrtl(QNaN) = QNaN
74//    sqrtl(SNaN) = QNaN
75//    sqrtl(+/-0) = +/-0
76//    sqrtl(negative) = QNaN and error handling is called
77//
78//********************************************************************
79//
80// Implementation:
81//
82//  Modified Newton-Raphson Algorithm
83//
84//********************************************************************
85
86GR_SAVE_PFS         = r33
87GR_SAVE_B0          = r34
88GR_SAVE_GP          = r35
89GR_Parameter_X      = r37
90GR_Parameter_Y      = r38
91GR_Parameter_RESULT = r39
92GR_Parameter_TAG    = r40
93
94FR_X                = f15
95FR_Y                = f0
96FR_RESULT           = f8
97
98.section .text
99GLOBAL_IEEE754_ENTRY(sqrtl)
100{ .mlx
101alloc r32= ar.pfs,0,5,4,0
102  // exponent of +1/2 in r2
103  movl r2 = 0x0fffe;;
104} { .mfi
105  // +1/2 in f10
106  setf.exp f12 = r2
107  // Step (1)
108  // y0 = 1/sqrt(a) in f7
109  frsqrta.s0 f7,p6=f8
110  nop.i 0;;
111} { .mfi
112  nop.m 0
113  // Step (2)
114  // H0 = +1/2 * y0 in f9
115  (p6) fma.s1 f9=f12,f7,f0
116  nop.i 0
117} { .mfi
118  nop.m 0
119  // Step (3)
120  // S0 = a * y0 in f7
121  (p6) fma.s1 f7=f8,f7,f0
122  nop.i 0;;
123} { .mfi
124  nop.m 0
125  // Make copy input x
126  mov f13=f8
127  nop.i 0
128} { .mfi
129  nop.m 0
130  fclass.m.unc p7,p8 = f8,0x3A
131  nop.i 0;;
132} { .mfi
133  nop.m 0
134  // Step (4)
135  // d0 = 1/2 - S0 * H0 in f10
136  (p6) fnma.s1 f10=f7,f9,f12
137  nop.i 0;;
138}
139{ .mfi
140  nop.m 0
141       mov f15=f8
142  nop.i 0;;
143} { .mfi
144  nop.m 0
145  // Step (5)
146  // H1 = H0 + d0 * H0 in f9
147  (p6) fma.s1 f9=f10,f9,f9
148  nop.i 0
149} { .mfi
150  nop.m 0
151  // Step (6)
152  // S1 = S0 + d0 * S0 in f7
153  (p6) fma.s1 f7=f10,f7,f7
154  nop.i 0;;
155} { .mfi
156  nop.m 0
157  // Step (7)
158  // d1 = 1/2 - S1 * H1 in f10
159  (p6) fnma.s1 f10=f7,f9,f12
160  nop.i 0;;
161} { .mfi
162  nop.m 0
163  // Step (8)
164  // H2 = H1 + d1 * H1 in f9
165  (p6) fma.s1 f9=f10,f9,f9
166  nop.i 0
167} { .mfi
168  nop.m 0
169  // Step (9)
170  // S2 = S1 + d1 * S1 in f7
171  (p6) fma.s1 f7=f10,f7,f7
172  nop.i 0;;
173} { .mfi
174  nop.m 0
175  // Step (10)
176  // d2 = 1/2 - S2 * H2 in f10
177  (p6) fnma.s1 f10=f7,f9,f12
178  nop.i 0
179} { .mfi
180  nop.m 0
181  // Step (11)
182  // e2 = a - S2 * S2 in f12
183  (p6) fnma.s1 f12=f7,f7,f8
184  nop.i 0;;
185} { .mfi
186  nop.m 0
187  // Step (12)
188  // S3 = S2 + d2 * S2 in f7
189  (p6) fma.s1 f7=f12,f9,f7
190  nop.i 0
191} { .mfi
192  nop.m 0
193  // Step (13)
194  // H3 = H2 + d2 * H2 in f9
195  (p6) fma.s1 f9=f10,f9,f9
196  nop.i 0;;
197} { .mfi
198  nop.m 0
199  // Step (14)
200  // e3 = a - S3 * S3 in f12
201  (p6) fnma.s1 f12=f7,f7,f8
202  nop.i 0;;
203} { .mfb
204  nop.m 0
205  // Step (15)
206  // S = S3 + e3 * H3 in f7
207  (p6) fma.s0 f8=f12,f9,f7
208  (p6) br.ret.sptk b0 ;;
209}
210{ .mfb
211       mov GR_Parameter_TAG    = 48
212       mov   f8 = f7
213  (p8) br.ret.sptk b0 ;;
214}
215//
216// This branch includes all those special values that are not negative,
217// with the result equal to frcpa(x)
218//
219
220
221// END DOUBLE EXTENDED PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM
222GLOBAL_IEEE754_END(sqrtl)
223libm_alias_ldouble_other (__sqrt, sqrt)
224
225LOCAL_LIBM_ENTRY(__libm_error_region)
226.prologue
227{ .mfi
228        add   GR_Parameter_Y=-32,sp             // Parameter 2 value
229        nop.f 0
230.save   ar.pfs,GR_SAVE_PFS
231        mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
232}
233{ .mfi
234.fframe 64
235        add sp=-64,sp                           // Create new stack
236        nop.f 0
237        mov GR_SAVE_GP=gp                       // Save gp
238};;
239{ .mmi
240        stfe [GR_Parameter_Y] = FR_Y,16         // Save Parameter 2 on stack
241        add GR_Parameter_X = 16,sp              // Parameter 1 address
242.save   b0, GR_SAVE_B0
243        mov GR_SAVE_B0=b0                       // Save b0
244};;
245.body
246{ .mib
247        stfe [GR_Parameter_X] = FR_X            // Store Parameter 1 on stack
248        add   GR_Parameter_RESULT = 0,GR_Parameter_Y
249        nop.b 0                                 // Parameter 3 address
250}
251{ .mib
252        stfe [GR_Parameter_Y] = FR_RESULT      // Store Parameter 3 on stack
253        add   GR_Parameter_Y = -16,GR_Parameter_Y
254        br.call.sptk b0=__libm_error_support#  // Call error handling function
255};;
256{ .mmi
257        nop.m 0
258        nop.m 0
259        add   GR_Parameter_RESULT = 48,sp
260};;
261{ .mmi
262        ldfe  f8 = [GR_Parameter_RESULT]       // Get return result off stack
263.restore sp
264        add   sp = 64,sp                       // Restore stack pointer
265        mov   b0 = GR_SAVE_B0                  // Restore return address
266};;
267{ .mib
268        mov   gp = GR_SAVE_GP                  // Restore gp
269        mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
270        br.ret.sptk     b0                     // Return
271};;
272
273LOCAL_LIBM_END(__libm_error_region#)
274.type   __libm_error_support#,@function
275.global __libm_error_support#
276