1.file "sqrtl.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39//******************************************************************** 40// 41// History: 42// 02/02/00 (hand-optimized) 43// 04/04/00 Unwind support added 44// 08/15/00 Bundle added after call to __libm_error_support to properly 45// set [the previously overwritten] GR_Parameter_RESULT. 46// 05/20/02 Cleaned up namespace and sf0 syntax 47// 02/10/03 Reordered header: .section, .global, .proc, .align 48// 49//******************************************************************** 50// 51// Function: Combined sqrtl(x), where 52// _ 53// sqrtl(x) = |x, for double-extended precision x values 54// 55//******************************************************************** 56// 57// Resources Used: 58// 59// Floating-Point Registers: f8 (Input and Return Value) 60// f7 -f14 61// 62// General Purpose Registers: 63// r32-r36 (Locals) 64// r37-r40 (Used to pass arguments to error handling routine) 65// 66// Predicate Registers: p6, p7, p8 67// 68//******************************************************************** 69// 70// IEEE Special Conditions: 71// 72// All faults and exceptions should be raised correctly. 73// sqrtl(QNaN) = QNaN 74// sqrtl(SNaN) = QNaN 75// sqrtl(+/-0) = +/-0 76// sqrtl(negative) = QNaN and error handling is called 77// 78//******************************************************************** 79// 80// Implementation: 81// 82// Modified Newton-Raphson Algorithm 83// 84//******************************************************************** 85 86GR_SAVE_PFS = r33 87GR_SAVE_B0 = r34 88GR_SAVE_GP = r35 89GR_Parameter_X = r37 90GR_Parameter_Y = r38 91GR_Parameter_RESULT = r39 92GR_Parameter_TAG = r40 93 94FR_X = f15 95FR_Y = f0 96FR_RESULT = f8 97 98.section .text 99GLOBAL_IEEE754_ENTRY(sqrtl) 100{ .mlx 101alloc r32= ar.pfs,0,5,4,0 102 // exponent of +1/2 in r2 103 movl r2 = 0x0fffe;; 104} { .mfi 105 // +1/2 in f10 106 setf.exp f12 = r2 107 // Step (1) 108 // y0 = 1/sqrt(a) in f7 109 frsqrta.s0 f7,p6=f8 110 nop.i 0;; 111} { .mfi 112 nop.m 0 113 // Step (2) 114 // H0 = +1/2 * y0 in f9 115 (p6) fma.s1 f9=f12,f7,f0 116 nop.i 0 117} { .mfi 118 nop.m 0 119 // Step (3) 120 // S0 = a * y0 in f7 121 (p6) fma.s1 f7=f8,f7,f0 122 nop.i 0;; 123} { .mfi 124 nop.m 0 125 // Make copy input x 126 mov f13=f8 127 nop.i 0 128} { .mfi 129 nop.m 0 130 fclass.m.unc p7,p8 = f8,0x3A 131 nop.i 0;; 132} { .mfi 133 nop.m 0 134 // Step (4) 135 // d0 = 1/2 - S0 * H0 in f10 136 (p6) fnma.s1 f10=f7,f9,f12 137 nop.i 0;; 138} 139{ .mfi 140 nop.m 0 141 mov f15=f8 142 nop.i 0;; 143} { .mfi 144 nop.m 0 145 // Step (5) 146 // H1 = H0 + d0 * H0 in f9 147 (p6) fma.s1 f9=f10,f9,f9 148 nop.i 0 149} { .mfi 150 nop.m 0 151 // Step (6) 152 // S1 = S0 + d0 * S0 in f7 153 (p6) fma.s1 f7=f10,f7,f7 154 nop.i 0;; 155} { .mfi 156 nop.m 0 157 // Step (7) 158 // d1 = 1/2 - S1 * H1 in f10 159 (p6) fnma.s1 f10=f7,f9,f12 160 nop.i 0;; 161} { .mfi 162 nop.m 0 163 // Step (8) 164 // H2 = H1 + d1 * H1 in f9 165 (p6) fma.s1 f9=f10,f9,f9 166 nop.i 0 167} { .mfi 168 nop.m 0 169 // Step (9) 170 // S2 = S1 + d1 * S1 in f7 171 (p6) fma.s1 f7=f10,f7,f7 172 nop.i 0;; 173} { .mfi 174 nop.m 0 175 // Step (10) 176 // d2 = 1/2 - S2 * H2 in f10 177 (p6) fnma.s1 f10=f7,f9,f12 178 nop.i 0 179} { .mfi 180 nop.m 0 181 // Step (11) 182 // e2 = a - S2 * S2 in f12 183 (p6) fnma.s1 f12=f7,f7,f8 184 nop.i 0;; 185} { .mfi 186 nop.m 0 187 // Step (12) 188 // S3 = S2 + d2 * S2 in f7 189 (p6) fma.s1 f7=f12,f9,f7 190 nop.i 0 191} { .mfi 192 nop.m 0 193 // Step (13) 194 // H3 = H2 + d2 * H2 in f9 195 (p6) fma.s1 f9=f10,f9,f9 196 nop.i 0;; 197} { .mfi 198 nop.m 0 199 // Step (14) 200 // e3 = a - S3 * S3 in f12 201 (p6) fnma.s1 f12=f7,f7,f8 202 nop.i 0;; 203} { .mfb 204 nop.m 0 205 // Step (15) 206 // S = S3 + e3 * H3 in f7 207 (p6) fma.s0 f8=f12,f9,f7 208 (p6) br.ret.sptk b0 ;; 209} 210{ .mfb 211 mov GR_Parameter_TAG = 48 212 mov f8 = f7 213 (p8) br.ret.sptk b0 ;; 214} 215// 216// This branch includes all those special values that are not negative, 217// with the result equal to frcpa(x) 218// 219 220 221// END DOUBLE EXTENDED PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM 222GLOBAL_IEEE754_END(sqrtl) 223libm_alias_ldouble_other (__sqrt, sqrt) 224 225LOCAL_LIBM_ENTRY(__libm_error_region) 226.prologue 227{ .mfi 228 add GR_Parameter_Y=-32,sp // Parameter 2 value 229 nop.f 0 230.save ar.pfs,GR_SAVE_PFS 231 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 232} 233{ .mfi 234.fframe 64 235 add sp=-64,sp // Create new stack 236 nop.f 0 237 mov GR_SAVE_GP=gp // Save gp 238};; 239{ .mmi 240 stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack 241 add GR_Parameter_X = 16,sp // Parameter 1 address 242.save b0, GR_SAVE_B0 243 mov GR_SAVE_B0=b0 // Save b0 244};; 245.body 246{ .mib 247 stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack 248 add GR_Parameter_RESULT = 0,GR_Parameter_Y 249 nop.b 0 // Parameter 3 address 250} 251{ .mib 252 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack 253 add GR_Parameter_Y = -16,GR_Parameter_Y 254 br.call.sptk b0=__libm_error_support# // Call error handling function 255};; 256{ .mmi 257 nop.m 0 258 nop.m 0 259 add GR_Parameter_RESULT = 48,sp 260};; 261{ .mmi 262 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack 263.restore sp 264 add sp = 64,sp // Restore stack pointer 265 mov b0 = GR_SAVE_B0 // Restore return address 266};; 267{ .mib 268 mov gp = GR_SAVE_GP // Restore gp 269 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 270 br.ret.sptk b0 // Return 271};; 272 273LOCAL_LIBM_END(__libm_error_region#) 274.type __libm_error_support#,@function 275.global __libm_error_support# 276