1.file "hypotl.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39//********************************************************************* 40// 41// History: 42// 02/02/00 hand-optimized 43// 04/04/00 Unwind support added 44// 06/20/00 new version 45// 08/15/00 Bundle added after call to __libm_error_support to properly 46// set [the previously overwritten] GR_Parameter_RESULT. 47// 05/20/02 Cleaned up namespace and sf0 syntax 48// 02/10/03 Reordered header: .section, .global, .proc, .align 49// 50//********************************************************************* 51// ___________ 52// Function: hypotl(x,y) = |(x^2 + y^2) = for double extended values 53// x and y 54// Also provides cabsl functionality. 55// 56//********************************************************************* 57// 58// Resources Used: 59// 60// Floating-Point Registers: f8 (Input and Return Value) 61// f9 (Input) 62// f6 -f15, f32-f34 63// 64// General Purpose Registers: 65// r2-r3 (Scratch) 66// r32-r36 (Locals) 67// r37-r40 (Used to pass arguments to error handling routine) 68// 69// Predicate Registers: p6 - p10 70// 71//********************************************************************* 72// 73// IEEE Special Conditions: 74// 75// All faults and exceptions should be raised correctly. 76// Overflow can occur. 77// hypotl(Infinity and anything) = +Infinity 78// hypotl(QNaN and anything) = QNaN 79// hypotl(SNaN and anything ) = QNaN 80// 81//********************************************************************* 82// 83// Implementation: 84// x2 = x * x in double-extended 85// y2 = y * y in double-extended 86// temp = x2 + y2 in double-extended 87// sqrt(temp) rounded to double extended 88// 89//********************************************************************* 90 91GR_SAVE_PFS = r33 92GR_SAVE_B0 = r34 93GR_SAVE_GP = r35 94GR_Parameter_X = r36 95GR_Parameter_Y = r37 96GR_Parameter_RESULT = r38 97GR_Parameter_TAG = r39 98 99FR_X = f32 100FR_Y = f33 101FR_RESULT = f8 102 103.section .text 104 105LOCAL_LIBM_ENTRY(cabsl) 106LOCAL_LIBM_END(cabsl) 107 108GLOBAL_IEEE754_ENTRY(hypotl) 109{.mfi 110 alloc r32= ar.pfs,0,4,4,0 111 // Compute x*x 112 fma.s1 f10=f8,f8,f0 113 // r2=bias-1 114 mov r2=0xfffe 115} 116{.mfi 117 nop.m 0 118 // y*y 119 fma.s1 f11=f9,f9,f0 120 nop.i 0;; 121} 122 123{ .mfi 124 nop.m 0 125// Check if x is an Inf - if so return Inf even 126// if y is a NaN (C9X) 127 fclass.m.unc p7, p6 = f8, 0x023 128 nop.i 0 129} 130{.mfi 131 nop.m 0 132 // if possible overflow, copy f8 to f32 133 // set Denormal, if necessary 134 // (p8) 135 fma.s0 f32=f8,f1,f0 136 nop.i 0;; 137} 138{ .mfi 139 nop.m 0 140// Check if y is an Inf - if so return Inf even 141// if x is a NaN (C9X) 142 fclass.m.unc p8, p9 = f9, 0x023 143 nop.i 0 144} 145{ .mfi 146 nop.m 999 147// For x=inf, multiply y by 1 to raise invalid on y an SNaN 148// (p7) fma.s0 f9=f9,f1,f0 149 // copy f9 to f33; set Denormal, if necessary 150 fma.s0 f33=f9,f1,f0 151 nop.i 0;; 152} 153{.mfi 154 nop.m 0 155 // is y Zero ? 156 (p6) fclass.m p6,p0=f9,0x7 157 nop.i 0;; 158} 159 160{.mfi 161 // f7=0.5 162 setf.exp f7=r2 163 // a=x2+y2 164 fma.s1 f12=f10,f1,f11 165 nop.i 0 166} 167{.mfi 168 mov r2=0x408c //0000 169 // dx=x*x-x2 170 fms.s1 f13=f8,f8,f10 171 nop.i 0;; 172} 173{.mfi 174 nop.m 0 175 // is x Zero ? 176 (p9) fclass.m p9,p0=f8,0x7 177 shl r2=r2,16 178} 179{.mfi 180 nop.m 0 181 // dy=y*y-y2 182 fms.s1 f14=f9,f9,f11 183 nop.i 0;; 184} 185 186{.mfi 187 nop.m 0 188 // x not NaN ? 189 (p6) fclass.m p7,p0=f8,0x3f 190 nop.i 0 191} 192{.mfi 193 nop.m 0 194 // f6=2 195 fma.s1 f6=f1,f1,f1 196 nop.i 0;; 197} 198 199{.mfi 200 nop.m 0 201 // f34=min(x2,y2) 202 famin.s1 f34=f10,f11 203 nop.i 0 204} 205{.mfb 206 nop.m 0 207 // f10=max(x2,y2) 208 famax.s1 f10=f11,f10 209 nop.b 0;; // 210} 211 212{.mfi 213 nop.m 0 214 // y not NaN ? 215 (p9) fclass.m p8,p0=f9,0x3f 216 nop.i 0;; 217} 218{.mfb 219 // f9=35/8 220 setf.s f9=r2 221 // if f8=Infinity or f9=Zero, return |f8| 222 (p7) fmerge.s f8=f0,f32 223 (p7) br.ret.spnt b0;; 224} 225 226 227{.mfi 228 nop.m 0 229 // z0=frsqrta(a) 230 frsqrta.s1 f8,p6=f12 231 nop.i 0;; 232} 233{ .mfi 234 nop.m 0 235// Identify Natvals, Infs, NaNs, and Zeros 236// and return result 237 fclass.m.unc p7, p0 = f12, 0x1E7 238 nop.i 0 239} 240{.mfi 241 // get exponent of x^2+y^2 242 getf.exp r3=f12 243 // dxy=dx+dy 244 fma.s1 f13=f13,f1,f14 245 nop.i 0;; 246} 247 248{.mfb 249 // 2*emax-2 250 mov r2=0x17ffb 251 // if f9=Infinity or f8=Zero, return |f9| 252 (p8) fmerge.s f8=f0,f33 253 (p8) br.ret.spnt b0 254} 255{.mfi 256 nop.m 0 257 // dd=a-max(x2,y2) 258 fnma.s1 f10=f10,f1,f12 259 nop.i 0;; 260} 261 262{.mfi 263 nop.m 0 264 // S0=a*z0 265 (p6) fma.s1 f14=f12,f8,f0 266 nop.i 0 267} 268{.mfi 269 nop.m 0 270 // H0=0.5*z0 271 (p6) fma.s1 f15=f8,f7,f0 272 nop.i 0;; 273} 274 275{.mfb 276 nop.m 0 277 // if special case, set f8 278 (p7) mov f8=f12 279 (p7) br.ret.spnt b0 280} 281{.mfi 282 nop.m 0 283 // da=min(x2,y2)-dd 284 fnma.s1 f10=f10,f1,f34 285 nop.i 0;; 286} 287{.mfi 288 nop.m 0 289 // f6=5/2 290 fma.s1 f6=f7,f1,f6 291 nop.i 0 292} 293{.mfi 294 nop.m 0 295 // f11=3/2 296 fma.s1 f11=f7,f1,f1 297 nop.i 0;; 298} 299 300{.mfi 301 nop.m 0 302 // d=0.5-S0*H0 303 (p6) fnma.s1 f7=f14,f15,f7 304 nop.i 0;; 305} 306 307{.mfi 308 nop.m 0 309 // P1=3/2*d+1 310 (p6) fma.s1 f11=f11,f7,f1 311 nop.i 0 312} 313{.mfi 314 nop.m 0 315 // P2=35/8*d+5/2 316 (p6) fma.s1 f9=f9,f7,f6 317 nop.i 0;; 318} 319{.mfi 320 nop.m 0 321 // d2=d*d 322 (p6) fma.s1 f34=f7,f7,f0 323 nop.i 0;; 324} 325 326{.mfi 327 nop.m 0 328 // T0=d*S0 329 (p6) fma.s1 f6=f7,f14,f0 330 nop.i 0 331} 332{.mfi 333 nop.m 0 334 // G0=d*H0 335 (p6) fma.s1 f7=f7,f15,f0 336 nop.i 0;; 337} 338{.mfi 339 nop.m 0 340 // P=d2*P2+P1 341 (p6) fma.s1 f11=f34,f9,f11 342 nop.i 0;; 343} 344 345{.mfi 346 nop.m 0 347 // S1=p*T0+S0 348 (p6) fma.s1 f14=f11,f6,f14 349 nop.i 0 350} 351{.mfi 352 nop.m 0 353 // H1=p*G0+H0 354 (p6) fma.s1 f15=f11,f7,f15 355 nop.i 0;; 356} 357 358 359{.mfi 360 nop.m 0 361 // e1=a-S1*S1 362 (p6) fnma.s1 f7=f14,f14,f12 363 nop.i 0 364} 365{.mfi 366 // Is x^2 + y^2 well less than the overflow 367 // threshold? 368 (p6) cmp.lt.unc p7, p8 = r3,r2 369 // c=dxy+da 370 (p6) fma.s1 f13=f13,f1,f10 371 nop.i 0;; 372} 373 374{.mfi 375 nop.m 0 376 // e=e1+c 377 (p6) fma.s1 f13=f7,f1,f13 378 nop.i 0;; 379} 380 381{.mfb 382 nop.m 0 383 // S=e*H1+S1 384 fma.s0 f8=f13,f15,f14 385 // No overflow in this case 386 (p7) br.ret.sptk b0;; 387} 388 389{ .mfi 390 nop.m 0 391(p8) fsetc.s2 0x7F,0x42 392 // Possible overflow path, must detect by 393 // Setting widest range exponent with prevailing 394 // rounding mode. 395 nop.i 0 ;; 396} 397 398 399{ .mfi 400 // bias+0x4000 (bias+EMAX+1) 401 (p8) mov r2=0x13fff 402 // S=e*H1+S1 403 (p8) fma.s2 f12=f13,f15,f14 404 nop.i 0 ;; 405} 406{ .mfi 407(p8) setf.exp f11 = r2 408(p8) fsetc.s2 0x7F,0x40 409// Restore Original Mode in S2 410 nop.i 0 ;; 411} 412{ .mfi 413 nop.m 0 414(p8) fcmp.lt.unc.s1 p9, p10 = f12, f11 415 nop.i 0 ;; 416} 417{ .mib 418 nop.m 0 419 mov GR_Parameter_TAG = 45; 420 // No overflow 421(p9) br.ret.sptk b0;; 422} 423GLOBAL_IEEE754_END(hypotl) 424libm_alias_ldouble_other (__hypot, hypot) 425 426LOCAL_LIBM_ENTRY(__libm_error_region) 427.prologue 428{ .mfi 429 add GR_Parameter_Y=-32,sp // Parameter 2 value 430 nop.f 0 431.save ar.pfs,GR_SAVE_PFS 432 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 433} 434{ .mfi 435.fframe 64 436 add sp=-64,sp // Create new stack 437 nop.f 0 438 mov GR_SAVE_GP=gp // Save gp 439};; 440{ .mmi 441 stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack 442 add GR_Parameter_X = 16,sp // Parameter 1 address 443.save b0, GR_SAVE_B0 444 mov GR_SAVE_B0=b0 // Save b0 445};; 446.body 447{ .mib 448 stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack 449 add GR_Parameter_RESULT = 0,GR_Parameter_Y 450 nop.b 0 // Parameter 3 address 451} 452{ .mib 453 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack 454 add GR_Parameter_Y = -16,GR_Parameter_Y 455 br.call.sptk b0=__libm_error_support# // Call error handling function 456};; 457{ .mmi 458 nop.m 0 459 nop.m 0 460 add GR_Parameter_RESULT = 48,sp 461};; 462{ .mmi 463 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack 464.restore sp 465 add sp = 64,sp // Restore stack pointer 466 mov b0 = GR_SAVE_B0 // Restore return address 467};; 468{ .mib 469 mov gp = GR_SAVE_GP // Restore gp 470 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 471 br.ret.sptk b0 // Return 472};; 473LOCAL_LIBM_END(__libm_error_region#) 474.type __libm_error_support#,@function 475.global __libm_error_support# 476