1.file "sqrtf.s"
2
3
4// Copyright (c) 2000 - 2003, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39//*********************************************************************
40// History:
41//
42// 02/02/00 Initial version
43// 04/04/00 Unwind support added
44// 08/15/00 Bundle added after call to __libm_error_support to properly
45//          set [the previously overwritten] GR_Parameter_RESULT.
46// 05/20/02 Cleaned up namespace and sf0 syntax
47// 02/10/03 Reordered header: .section, .global, .proc, .align
48//
49//*********************************************************************
50//
51// Function:   Combined sqrtf(x), where
52//                         _
53//             sqrtf(x) = |x, for single precision x values
54//
55//********************************************************************
56//
57// Accuracy:       Correctly Rounded
58//
59//********************************************************************
60//
61// Resources Used:
62//
63//    Floating-Point Registers: f8  (Input and Return Value)
64//                              f7 -f14
65//
66//    General Purpose Registers:
67//      r32-r36 (Locals)
68//      r37-r40 (Used to pass arguments to error handling routine)
69//
70//    Predicate Registers:      p6, p7, p8
71//
72//********************************************************************
73//
74// IEEE Special Conditions:
75//
76//    All faults and exceptions should be raised correctly.
77//    sqrtf(QNaN) = QNaN
78//    sqrtf(SNaN) = QNaN
79//    sqrtf(+/-0) = +/-0
80//    sqrtf(negative) = QNaN and error handling is called
81//
82//********************************************************************
83//
84// Implementation:
85//
86//  Modified Newton-Raphson Algorithm
87//
88//********************************************************************
89
90
91GR_SAVE_B0                    = r34
92GR_SAVE_PFS                   = r33
93GR_SAVE_GP                    = r35
94
95GR_Parameter_X                = r37
96GR_Parameter_Y                = r38
97GR_Parameter_RESULT           = r39
98GR_Parameter_TAG              = r40
99
100FR_X             = f13
101FR_Y             = f0
102FR_RESULT        = f8
103
104
105.section .text
106GLOBAL_IEEE754_ENTRY(sqrtf)
107{ .mlx
108  // BEGIN SINGLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM
109  alloc r32= ar.pfs,0,5,4,0
110  // exponent of +1/2 in r2
111  movl r2 = 0x0fffe
112} { .mfi
113  // +1/2 in f12
114  nop.m 0
115  frsqrta.s0 f7,p6=f8
116  nop.i 0;;
117} { .mfi
118  setf.exp f12 = r2
119  // Step (1)
120  // y0 = 1/sqrt(a) in f7
121  fclass.m.unc p7,p8 = f8,0x3A
122  nop.i 0
123} { .mfi
124  nop.m 0
125  // Make a copy of x just in case
126  mov f13 = f8
127  nop.i 0;;
128} { .mfi
129  nop.m 0
130  // Step (2)
131  // H0 = 1/2 * y0 in f9
132  (p6) fma.s1 f9=f12,f7,f0
133  nop.i 0
134} { .mfi
135  nop.m 0
136  // Step (3)
137  // S0 = a * y0 in f7
138  (p6) fma.s1 f7=f8,f7,f0
139  nop.i 0;;
140} { .mfi
141  nop.m 0
142  // Step (4)
143  // d = 1/2 - S0 * H0 in f10
144  (p6) fnma.s1 f10=f7,f9,f12
145  nop.i 0
146} { .mfi
147  nop.m 0
148  // Step (0'')
149  // 3/2 = 1 + 1/2 in f12
150  (p6) fma.s1 f12=f12,f1,f1
151  nop.i 0;;
152} { .mfi
153  nop.m 0
154  // Step (5)
155  // e = 1 + 3/2 * d in f12
156  (p6) fma.s1 f12=f12,f10,f1
157  nop.i 0
158} { .mfi
159  nop.m 0
160  // Step (6)
161  // T0 = d * S0 in f11
162  (p6) fma.s1 f11=f10,f7,f0
163  nop.i 0;;
164} { .mfi
165  nop.m 0
166  // Step (7)
167  // G0 = d * H0 in f10
168  (p6) fma.s1 f10=f10,f9,f0
169  nop.i 0;;
170} { .mfi
171  nop.m 0
172  // Step (8)
173  // S1 = S0 + e * T0 in f7
174  (p6) fma.s.s1 f7=f12,f11,f7
175  nop.i 0;;
176} { .mfi
177  nop.m 0
178  // Step (9)
179  // H1 = H0 + e * G0 in f12
180  (p6) fma.s1 f12=f12,f10,f9
181  nop.i 0;;
182} { .mfi
183  nop.m 0
184  // Step (10)
185  // d1 = a - S1 * S1 in f9
186  (p6) fnma.s1 f9=f7,f7,f8
187  nop.i 0;;
188} { .mfb
189  nop.m 0
190  // Step (11)
191  // S = S1 + d1 * H1 in f7
192  (p6) fma.s.s0 f8=f9,f12,f7
193  (p6) br.ret.sptk   b0 ;;
194// END SINGLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM
195} { .mfb
196  nop.m 0
197       mov   f8 = f7
198  (p8) br.ret.sptk b0 ;;
199}
200//
201// This branch includes all those special values that are not negative,
202// with the result equal to frcpa(x)
203//
204GLOBAL_IEEE754_END(sqrtf)
205libm_alias_float_other (__sqrt, sqrt)
206
207
208LOCAL_LIBM_ENTRY(__libm_error_region)
209.prologue
210{ .mii
211        add   GR_Parameter_Y=-32,sp             // Parameter 2 value
212        mov   GR_Parameter_TAG = 50
213.save   ar.pfs,GR_SAVE_PFS
214        mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
215}
216{ .mfi
217.fframe 64
218        add sp=-64,sp                           // Create new stack
219        nop.f 0
220        mov GR_SAVE_GP=gp                       // Save gp
221};;
222{ .mmi
223        stfs [GR_Parameter_Y] = FR_Y,16         // Store Parameter 2 on stack
224        add GR_Parameter_X = 16,sp              // Parameter 1 address
225.save   b0, GR_SAVE_B0
226        mov GR_SAVE_B0=b0                       // Save b0
227};;
228.body
229{ .mib
230        stfs [GR_Parameter_X] = FR_X            // Store Parameter 1 on stack
231        add   GR_Parameter_RESULT = 0,GR_Parameter_Y
232        nop.b 0                                 // Parameter 3 address
233}
234{ .mib
235        stfs [GR_Parameter_Y] = FR_RESULT       // Store Parameter 3 on stack
236        add   GR_Parameter_Y = -16,GR_Parameter_Y
237        br.call.sptk b0=__libm_error_support#   // Call error handling function
238};;
239{ .mmi
240        nop.m 0
241        nop.m 0
242        add   GR_Parameter_RESULT = 48,sp
243};;
244{ .mmi
245        ldfs  f8 = [GR_Parameter_RESULT]       // Get return result off stack
246.restore sp
247        add   sp = 64,sp                       // Restore stack pointer
248        mov   b0 = GR_SAVE_B0                  // Restore return address
249};;
250{ .mib
251        mov   gp = GR_SAVE_GP                  // Restore gp
252        mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
253        br.ret.sptk     b0                     // Return
254};;
255
256LOCAL_LIBM_END(__libm_error_region)
257
258
259.type   __libm_error_support#,@function
260.global __libm_error_support#
261