1.file "libm_sincosf.s" 2 3 4// Copyright (c) 2002 - 2005, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//============================================================== 41// 02/01/02 Initial version 42// 02/18/02 Large arguments processing routine is excluded. 43// External interface entry points are added 44// 02/26/02 Added temporary return of results in r8, r9 45// 03/13/02 Corrected restore of predicate registers 46// 03/19/02 Added stack unwind around call to __libm_cisf_large 47// 09/05/02 Work range is widened by reduction strengthen (2 parts of Pi/16) 48// 02/10/03 Reordered header: .section, .global, .proc, .align 49// 02/11/04 cisf is moved to the separate file. 50// 03/31/05 Reformatted delimiters between data tables 51 52// API 53//============================================================== 54// 1) void sincosf(float, float*s, float*c) 55// 2) __libm_sincosf - internal LIBM function, that accepts 56// argument in f8 and returns cosine through f8, sine through f9 57 58// 59// Overview of operation 60//============================================================== 61// 62// Step 1 63// ====== 64// Reduce x to region -1/2*pi/2^k ===== 0 ===== +1/2*pi/2^k where k=4 65// divide x by pi/2^k. 66// Multiply by 2^k/pi. 67// nfloat = Round result to integer (round-to-nearest) 68// 69// r = x - nfloat * pi/2^k 70// Do this as (x - nfloat * HIGH(pi/2^k)) - nfloat * LOW(pi/2^k) for increased accuracy. 71// pi/2^k is stored as two numbers that when added make pi/2^k. 72// pi/2^k = HIGH(pi/2^k) + LOW(pi/2^k) 73// HIGH part is rounded to zero, LOW - to nearest 74// 75// x = (nfloat * pi/2^k) + r 76// r is small enough that we can use a polynomial approximation 77// and is referred to as the reduced argument. 78// 79// Step 3 80// ====== 81// Take the unreduced part and remove the multiples of 2pi. 82// So nfloat = nfloat (with lower k+1 bits cleared) + lower k+1 bits 83// 84// nfloat (with lower k+1 bits cleared) is a multiple of 2^(k+1) 85// N * 2^(k+1) 86// nfloat * pi/2^k = N * 2^(k+1) * pi/2^k + (lower k+1 bits) * pi/2^k 87// nfloat * pi/2^k = N * 2 * pi + (lower k+1 bits) * pi/2^k 88// nfloat * pi/2^k = N2pi + M * pi/2^k 89// 90// 91// Sin(x) = Sin((nfloat * pi/2^k) + r) 92// = Sin(nfloat * pi/2^k) * Cos(r) + Cos(nfloat * pi/2^k) * Sin(r) 93// 94// Sin(nfloat * pi/2^k) = Sin(N2pi + Mpi/2^k) 95// = Sin(N2pi)Cos(Mpi/2^k) + Cos(N2pi)Sin(Mpi/2^k) 96// = Sin(Mpi/2^k) 97// 98// Cos(nfloat * pi/2^k) = Cos(N2pi + Mpi/2^k) 99// = Cos(N2pi)Cos(Mpi/2^k) + Sin(N2pi)Sin(Mpi/2^k) 100// = Cos(Mpi/2^k) 101// 102// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r) 103// 104// 105// Step 4 106// ====== 107// 0 <= M < 2^(k+1) 108// There are 2^(k+1) Sin entries in a table. 109// There are 2^(k+1) Cos entries in a table. 110// 111// Get Sin(Mpi/2^k) and Cos(Mpi/2^k) by table lookup. 112// 113// 114// Step 5 115// ====== 116// Calculate Cos(r) and Sin(r) by polynomial approximation. 117// 118// Cos(r) = 1 + r^2 q1 + r^4 q2 = Series for Cos 119// Sin(r) = r + r^3 p1 + r^5 p2 = Series for Sin 120// 121// and the coefficients q1, q2 and p1, p2 are stored in a table 122// 123// 124// Calculate 125// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r) 126// 127// as follows 128// 129// S[m] = Sin(Mpi/2^k) and C[m] = Cos(Mpi/2^k) 130// rsq = r*r 131// 132// 133// P = p1 + r^2p2 134// Q = q1 + r^2q2 135// 136// rcub = r * rsq 137// Sin(r) = r + rcub * P 138// = r + r^3p1 + r^5p2 = Sin(r) 139// 140// P = r + rcub * P 141// 142// Answer = S[m] Cos(r) + C[m] P 143// 144// Cos(r) = 1 + rsq Q 145// Cos(r) = 1 + r^2 Q 146// Cos(r) = 1 + r^2 (q1 + r^2q2) 147// Cos(r) = 1 + r^2q1 + r^4q2 148// 149// S[m] Cos(r) = S[m](1 + rsq Q) 150// S[m] Cos(r) = S[m] + S[m] rsq Q 151// S[m] Cos(r) = S[m] + s_rsq Q 152// Q = S[m] + s_rsq Q 153// 154// Then, 155// 156// Answer = Q + C[m] P 157 158 159// Registers used 160//============================================================== 161// general input registers: 162// r14 -> r19 163// r32 -> r49 164 165// predicate registers used: 166// p6 -> p14 167 168// floating-point registers used 169// f9 -> f15 170// f32 -> f100 171 172// Assembly macros 173//============================================================== 174 175cisf_Arg = f8 176 177cisf_Sin_res = f9 178cisf_Cos_res = f8 179 180 181cisf_NORM_f8 = f10 182cisf_W = f11 183cisf_int_Nfloat = f12 184cisf_Nfloat = f13 185 186cisf_r = f14 187cisf_r_exact = f68 188cisf_rsq = f15 189cisf_rcub = f32 190 191cisf_Inv_Pi_by_16 = f33 192cisf_Pi_by_16_hi = f34 193cisf_Pi_by_16_lo = f35 194 195cisf_Inv_Pi_by_64 = f36 196cisf_Pi_by_64_hi = f37 197cisf_Pi_by_64_lo = f38 198 199 200cisf_P1 = f39 201cisf_Q1 = f40 202cisf_P2 = f41 203cisf_Q2 = f42 204cisf_P3 = f43 205cisf_Q3 = f44 206cisf_P4 = f45 207cisf_Q4 = f46 208 209cisf_P_temp1 = f47 210cisf_P_temp2 = f48 211 212cisf_Q_temp1 = f49 213cisf_Q_temp2 = f50 214 215cisf_P = f51 216 217cisf_SIG_INV_PI_BY_16_2TO61 = f52 218cisf_RSHF_2TO61 = f53 219cisf_RSHF = f54 220cisf_2TOM61 = f55 221cisf_NFLOAT = f56 222cisf_W_2TO61_RSH = f57 223 224cisf_tmp = f58 225 226cisf_Sm_sin = f59 227cisf_Cm_sin = f60 228 229cisf_Sm_cos = f61 230cisf_Cm_cos = f62 231 232cisf_srsq_sin = f63 233cisf_srsq_cos = f64 234 235cisf_Q_sin = f65 236cisf_Q_cos = f66 237cisf_Q = f67 238 239///////////////////////////////////////////////////////////// 240 241cisf_pResSin = r33 242cisf_pResCos = r34 243 244cisf_exp_limit = r35 245cisf_r_signexp = r36 246cisf_AD_beta_table = r37 247cisf_r_sincos = r38 248 249cisf_r_exp = r39 250cisf_r_17_ones = r40 251 252cisf_GR_sig_inv_pi_by_16 = r14 253cisf_GR_rshf_2to61 = r15 254cisf_GR_rshf = r16 255cisf_GR_exp_2tom61 = r17 256cisf_GR_n = r18 257 258cisf_GR_n_sin = r19 259cisf_GR_m_sin = r41 260cisf_GR_32m_sin = r41 261 262cisf_GR_n_cos = r42 263cisf_GR_m_cos = r43 264cisf_GR_32m_cos = r43 265 266cisf_AD_2_sin = r44 267cisf_AD_2_cos = r45 268 269cisf_gr_tmp = r46 270GR_SAVE_B0 = r47 271GR_SAVE_GP = r48 272rB0_SAVED = r49 273GR_SAVE_PFS = r50 274GR_SAVE_PR = r51 275cisf_AD_1 = r52 276 277RODATA 278 279.align 16 280// Pi/16 parts 281LOCAL_OBJECT_START(double_cisf_pi) 282 data8 0xC90FDAA22168C234, 0x00003FFC // pi/16 1st part 283 data8 0xC4C6628B80DC1CD1, 0x00003FBC // pi/16 2nd part 284LOCAL_OBJECT_END(double_cisf_pi) 285 286// Coefficients for polynomials 287LOCAL_OBJECT_START(double_cisf_pq_k4) 288 data8 0x3F810FABB668E9A2 // P2 289 data8 0x3FA552E3D6DE75C9 // Q2 290 data8 0xBFC555554447BC7F // P1 291 data8 0xBFDFFFFFC447610A // Q1 292LOCAL_OBJECT_END(double_cisf_pq_k4) 293 294// Sincos table (S[m], C[m]) 295LOCAL_OBJECT_START(double_sin_cos_beta_k4) 296 data8 0x0000000000000000 // sin ( 0 Pi / 16 ) 297 data8 0x3FF0000000000000 // cos ( 0 Pi / 16 ) 298// 299 data8 0x3FC8F8B83C69A60B // sin ( 1 Pi / 16 ) 300 data8 0x3FEF6297CFF75CB0 // cos ( 1 Pi / 16 ) 301// 302 data8 0x3FD87DE2A6AEA963 // sin ( 2 Pi / 16 ) 303 data8 0x3FED906BCF328D46 // cos ( 2 Pi / 16 ) 304// 305 data8 0x3FE1C73B39AE68C8 // sin ( 3 Pi / 16 ) 306 data8 0x3FEA9B66290EA1A3 // cos ( 3 Pi / 16 ) 307// 308 data8 0x3FE6A09E667F3BCD // sin ( 4 Pi / 16 ) 309 data8 0x3FE6A09E667F3BCD // cos ( 4 Pi / 16 ) 310// 311 data8 0x3FEA9B66290EA1A3 // sin ( 5 Pi / 16 ) 312 data8 0x3FE1C73B39AE68C8 // cos ( 5 Pi / 16 ) 313// 314 data8 0x3FED906BCF328D46 // sin ( 6 Pi / 16 ) 315 data8 0x3FD87DE2A6AEA963 // cos ( 6 Pi / 16 ) 316// 317 data8 0x3FEF6297CFF75CB0 // sin ( 7 Pi / 16 ) 318 data8 0x3FC8F8B83C69A60B // cos ( 7 Pi / 16 ) 319// 320 data8 0x3FF0000000000000 // sin ( 8 Pi / 16 ) 321 data8 0x0000000000000000 // cos ( 8 Pi / 16 ) 322// 323 data8 0x3FEF6297CFF75CB0 // sin ( 9 Pi / 16 ) 324 data8 0xBFC8F8B83C69A60B // cos ( 9 Pi / 16 ) 325// 326 data8 0x3FED906BCF328D46 // sin ( 10 Pi / 16 ) 327 data8 0xBFD87DE2A6AEA963 // cos ( 10 Pi / 16 ) 328// 329 data8 0x3FEA9B66290EA1A3 // sin ( 11 Pi / 16 ) 330 data8 0xBFE1C73B39AE68C8 // cos ( 11 Pi / 16 ) 331// 332 data8 0x3FE6A09E667F3BCD // sin ( 12 Pi / 16 ) 333 data8 0xBFE6A09E667F3BCD // cos ( 12 Pi / 16 ) 334// 335 data8 0x3FE1C73B39AE68C8 // sin ( 13 Pi / 16 ) 336 data8 0xBFEA9B66290EA1A3 // cos ( 13 Pi / 16 ) 337// 338 data8 0x3FD87DE2A6AEA963 // sin ( 14 Pi / 16 ) 339 data8 0xBFED906BCF328D46 // cos ( 14 Pi / 16 ) 340// 341 data8 0x3FC8F8B83C69A60B // sin ( 15 Pi / 16 ) 342 data8 0xBFEF6297CFF75CB0 // cos ( 15 Pi / 16 ) 343// 344 data8 0x0000000000000000 // sin ( 16 Pi / 16 ) 345 data8 0xBFF0000000000000 // cos ( 16 Pi / 16 ) 346// 347 data8 0xBFC8F8B83C69A60B // sin ( 17 Pi / 16 ) 348 data8 0xBFEF6297CFF75CB0 // cos ( 17 Pi / 16 ) 349// 350 data8 0xBFD87DE2A6AEA963 // sin ( 18 Pi / 16 ) 351 data8 0xBFED906BCF328D46 // cos ( 18 Pi / 16 ) 352// 353 data8 0xBFE1C73B39AE68C8 // sin ( 19 Pi / 16 ) 354 data8 0xBFEA9B66290EA1A3 // cos ( 19 Pi / 16 ) 355// 356 data8 0xBFE6A09E667F3BCD // sin ( 20 Pi / 16 ) 357 data8 0xBFE6A09E667F3BCD // cos ( 20 Pi / 16 ) 358// 359 data8 0xBFEA9B66290EA1A3 // sin ( 21 Pi / 16 ) 360 data8 0xBFE1C73B39AE68C8 // cos ( 21 Pi / 16 ) 361// 362 data8 0xBFED906BCF328D46 // sin ( 22 Pi / 16 ) 363 data8 0xBFD87DE2A6AEA963 // cos ( 22 Pi / 16 ) 364// 365 data8 0xBFEF6297CFF75CB0 // sin ( 23 Pi / 16 ) 366 data8 0xBFC8F8B83C69A60B // cos ( 23 Pi / 16 ) 367// 368 data8 0xBFF0000000000000 // sin ( 24 Pi / 16 ) 369 data8 0x0000000000000000 // cos ( 24 Pi / 16 ) 370// 371 data8 0xBFEF6297CFF75CB0 // sin ( 25 Pi / 16 ) 372 data8 0x3FC8F8B83C69A60B // cos ( 25 Pi / 16 ) 373// 374 data8 0xBFED906BCF328D46 // sin ( 26 Pi / 16 ) 375 data8 0x3FD87DE2A6AEA963 // cos ( 26 Pi / 16 ) 376// 377 data8 0xBFEA9B66290EA1A3 // sin ( 27 Pi / 16 ) 378 data8 0x3FE1C73B39AE68C8 // cos ( 27 Pi / 16 ) 379// 380 data8 0xBFE6A09E667F3BCD // sin ( 28 Pi / 16 ) 381 data8 0x3FE6A09E667F3BCD // cos ( 28 Pi / 16 ) 382// 383 data8 0xBFE1C73B39AE68C8 // sin ( 29 Pi / 16 ) 384 data8 0x3FEA9B66290EA1A3 // cos ( 29 Pi / 16 ) 385// 386 data8 0xBFD87DE2A6AEA963 // sin ( 30 Pi / 16 ) 387 data8 0x3FED906BCF328D46 // cos ( 30 Pi / 16 ) 388// 389 data8 0xBFC8F8B83C69A60B // sin ( 31 Pi / 16 ) 390 data8 0x3FEF6297CFF75CB0 // cos ( 31 Pi / 16 ) 391// 392 data8 0x0000000000000000 // sin ( 32 Pi / 16 ) 393 data8 0x3FF0000000000000 // cos ( 32 Pi / 16 ) 394LOCAL_OBJECT_END(double_sin_cos_beta_k4) 395 396.section .text 397 398GLOBAL_IEEE754_ENTRY(sincosf) 399// cis_GR_sig_inv_pi_by_16 = significand of 16/pi 400{ .mlx 401 alloc GR_SAVE_PFS = ar.pfs, 0, 21, 0, 0 402 movl cisf_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // 16/pi signd 403 404} 405// cis_GR_rshf_2to61 = 1.1000 2^(63+63-2) 406{ .mlx 407 addl cisf_AD_1 = @ltoff(double_cisf_pi), gp 408 movl cisf_GR_rshf_2to61 = 0x47b8000000000000 // 1.1 2^(63+63-2) 409};; 410 411{ .mfi 412 ld8 cisf_AD_1 = [cisf_AD_1] 413 fnorm.s1 cisf_NORM_f8 = cisf_Arg 414 cmp.eq p13, p14 = r0, r0 // p13 set for sincos 415} 416// cis_GR_exp_2tom61 = exponent of scaling factor 2^-61 417{ .mib 418 mov cisf_GR_exp_2tom61 = 0xffff-61 419 nop.i 0 420 br.cond.sptk _CISF_COMMON 421};; 422GLOBAL_IEEE754_END(sincosf) 423libm_alias_float_other (__sincos, sincos) 424 425GLOBAL_LIBM_ENTRY(__libm_sincosf) 426{ .mlx 427// cisf_GR_sig_inv_pi_by_16 = significand of 16/pi 428 alloc GR_SAVE_PFS = ar.pfs,0,21,0,0 429 movl cisf_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A 430} 431// cisf_GR_rshf_2to61 = 1.1000 2^(63+63-2) 432{ .mlx 433 addl cisf_AD_1 = @ltoff(double_cisf_pi), gp 434 movl cisf_GR_rshf_2to61 = 0x47b8000000000000 435};; 436 437// p14 set for __libm_sincos and cis 438{ .mfi 439 ld8 cisf_AD_1 = [cisf_AD_1] 440 fnorm.s1 cisf_NORM_f8 = cisf_Arg 441 cmp.eq p14, p13 = r0, r0 442} 443// cisf_GR_exp_2tom61 = exponent of scaling factor 2^-61 444{ .mib 445 mov cisf_GR_exp_2tom61 = 0xffff-61 446 nop.i 0 447 nop.b 0 448};; 449 450_CISF_COMMON: 451// Form two constants we need 452// 16/pi * 2^-2 * 2^63, scaled by 2^61 since we just loaded the significand 453// 1.1000...000 * 2^(63+63-2) to right shift int(W) into the low significand 454// fcmp used to set denormal, and invalid on snans 455{ .mfi 456 setf.sig cisf_SIG_INV_PI_BY_16_2TO61 = cisf_GR_sig_inv_pi_by_16 457 fclass.m p6,p0 = cisf_Arg, 0xe7//if x=0,inf,nan 458 addl cisf_gr_tmp = -1, r0 459} 460// cisf_GR_rshf = 1.1000 2^63 for right shift 461{ .mlx 462 setf.d cisf_RSHF_2TO61 = cisf_GR_rshf_2to61 463 movl cisf_GR_rshf = 0x43e8000000000000 464};; 465 466// Form another constant 467// 2^-61 for scaling Nfloat 468// 0x10017 is register_bias + 24. 469// So if f8 >= 2^24, go to large args routine 470{ .mmi 471 getf.exp cisf_r_signexp = cisf_Arg 472 setf.exp cisf_2TOM61 = cisf_GR_exp_2tom61 473 mov cisf_exp_limit = 0x10017 474};; 475 476// Load the two pieces of pi/16 477// Form another constant 478// 1.1000...000 * 2^63, the right shift constant 479{ .mmb 480 ldfe cisf_Pi_by_16_hi = [cisf_AD_1],16 481 setf.d cisf_RSHF = cisf_GR_rshf 482(p6) br.cond.spnt _CISF_SPECIAL_ARGS 483};; 484 485{ .mmi 486 ldfe cisf_Pi_by_16_lo = [cisf_AD_1],16 487 setf.sig cisf_tmp = cisf_gr_tmp //constant for inexact set 488 nop.i 0 489};; 490 491// Start loading P, Q coefficients 492{ .mmi 493 ldfpd cisf_P2,cisf_Q2 = [cisf_AD_1],16 494 nop.m 0 495 dep.z cisf_r_exp = cisf_r_signexp, 0, 17 496};; 497 498// p10 is true if we must call routines to handle larger arguments 499// p10 is true if f8 exp is >= 0x10017 500{ .mmb 501 ldfpd cisf_P1,cisf_Q1 = [cisf_AD_1], 16 502 cmp.ge p10, p0 = cisf_r_exp, cisf_exp_limit 503(p10) br.cond.spnt _CISF_LARGE_ARGS // go to |x| >= 2^24 path 504};; 505 506// cisf_W = x * cisf_Inv_Pi_by_16 507// Multiply x by scaled 16/pi and add large const to shift integer part of W to 508// rightmost bits of significand 509{ .mfi 510 nop.m 0 511 fma.s1 cisf_W_2TO61_RSH = cisf_NORM_f8,cisf_SIG_INV_PI_BY_16_2TO61,cisf_RSHF_2TO61 512 nop.i 0 513};; 514 515// cisf_NFLOAT = Round_Int_Nearest(cisf_W) 516{ .mfi 517 nop.m 0 518 fms.s1 cisf_NFLOAT = cisf_W_2TO61_RSH,cisf_2TOM61,cisf_RSHF 519 nop.i 0 520};; 521 522// N = (int)cisf_int_Nfloat 523{ .mfi 524 getf.sig cisf_GR_n = cisf_W_2TO61_RSH 525 nop.f 0 526 nop.i 0 527};; 528 529// Add 2^(k-1) (which is in cisf_r_sincos) to N 530// cisf_r = -cisf_Nfloat * cisf_Pi_by_16_hi + x 531// cisf_r = cisf_r -cisf_Nfloat * cisf_Pi_by_16_lo 532{ .mfi 533 add cisf_GR_n_cos = 0x8, cisf_GR_n 534 fnma.s1 cisf_r = cisf_NFLOAT, cisf_Pi_by_16_hi, cisf_NORM_f8 535 nop.i 0 536};; 537 538//Get M (least k+1 bits of N) 539{ .mmi 540 and cisf_GR_m_sin = 0x1f,cisf_GR_n 541 and cisf_GR_m_cos = 0x1f,cisf_GR_n_cos 542 nop.i 0 543};; 544 545{ .mmi 546 shladd cisf_AD_2_cos = cisf_GR_m_cos,4, cisf_AD_1 547 shladd cisf_AD_2_sin = cisf_GR_m_sin,4, cisf_AD_1 548 nop.i 0 549};; 550 551// den. input to set uflow 552{ .mmf 553 ldfpd cisf_Sm_sin, cisf_Cm_sin = [cisf_AD_2_sin] 554 ldfpd cisf_Sm_cos, cisf_Cm_cos = [cisf_AD_2_cos] 555 fclass.m.unc p10,p0 = cisf_Arg,0x0b 556};; 557 558{ .mfi 559 nop.m 0 560 fma.s1 cisf_rsq = cisf_r, cisf_r, f0 // get r^2 561 nop.i 0 562} 563{ .mfi 564 nop.m 0 565 fmpy.s0 cisf_tmp = cisf_tmp,cisf_tmp // inexact flag 566 nop.i 0 567};; 568 569{ .mmf 570 nop.m 0 571 nop.m 0 572 fnma.s1 cisf_r_exact = cisf_NFLOAT, cisf_Pi_by_16_lo, cisf_r 573};; 574 575{ .mfi 576 nop.m 0 577 fma.s1 cisf_P = cisf_rsq, cisf_P2, cisf_P1 578 nop.i 0 579} 580{ .mfi 581 nop.m 0 582 fma.s1 cisf_Q = cisf_rsq, cisf_Q2, cisf_Q1 583 nop.i 0 584};; 585 586{ .mfi 587 nop.m 0 588 fmpy.s1 cisf_rcub = cisf_r_exact, cisf_rsq // get r^3 589 nop.i 0 590};; 591 592{ .mfi 593 nop.m 0 594 fmpy.s1 cisf_srsq_sin = cisf_Sm_sin,cisf_rsq 595 nop.i 0 596} 597{ .mfi 598 nop.m 0 599 fmpy.s1 cisf_srsq_cos = cisf_Sm_cos,cisf_rsq 600 nop.i 0 601};; 602 603{ .mfi 604 nop.m 0 605 fma.s1 cisf_P = cisf_rcub,cisf_P,cisf_r_exact 606 nop.i 0 607};; 608 609{ .mfi 610 nop.m 0 611 fma.s1 cisf_Q_sin = cisf_srsq_sin,cisf_Q, cisf_Sm_sin 612 nop.i 0 613} 614{ .mfi 615 nop.m 0 616 fma.s1 cisf_Q_cos = cisf_srsq_cos,cisf_Q, cisf_Sm_cos 617 nop.i 0 618};; 619 620// If den. arg, force underflow to be set 621{ .mfi 622 nop.m 0 623(p10) fmpy.s.s0 cisf_tmp = cisf_Arg,cisf_Arg 624 nop.i 0 625};; 626 627//Final sin 628{ .mfi 629 nop.m 0 630 fma.s.s0 cisf_Sin_res = cisf_Cm_sin, cisf_P, cisf_Q_sin 631 nop.i 0 632} 633//Final cos 634{ .mfb 635 nop.m 0 636 fma.s.s0 cisf_Cos_res = cisf_Cm_cos, cisf_P, cisf_Q_cos 637(p14) br.cond.sptk _CISF_RETURN //com. exit for __libm_sincos and cis main path 638};; 639 640{ .mmb 641 stfs [cisf_pResSin] = cisf_Sin_res 642 stfs [cisf_pResCos] = cisf_Cos_res 643 br.ret.sptk b0 // common exit for sincos main path 644};; 645 646_CISF_SPECIAL_ARGS: 647// sinf(+/-0) = +/-0 648// sinf(Inf) = NaN 649// sinf(NaN) = NaN 650{ .mfi 651 nop.m 999 652 fma.s.s0 cisf_Sin_res = cisf_Arg, f0, f0 // sinf(+/-0,NaN,Inf) 653 nop.i 999 654};; 655 656// cosf(+/-0) = 1.0 657// cosf(Inf) = NaN 658// cosf(NaN) = NaN 659{ .mfb 660 nop.m 999 661 fma.s.s0 cisf_Cos_res = cisf_Arg, f0, f1 // cosf(+/-0,NaN,Inf) 662(p14) br.cond.sptk _CISF_RETURN //spec exit for __libm_sincos and cis main path 663};; 664 665{ .mmb 666 stfs [cisf_pResSin] = cisf_Sin_res 667 stfs [cisf_pResCos] = cisf_Cos_res 668 br.ret.sptk b0 // special exit for sincos main path 669};; 670 671 // exit for sincos 672 // NOTE! r8 and r9 used only because of compiler issue 673 // connected with float point complex function arguments pass 674 // After fix of this issue this operations can be deleted 675_CISF_RETURN: 676{ .mmb 677 getf.s r8 = cisf_Cos_res 678 getf.s r9 = cisf_Sin_res 679 br.ret.sptk b0 // exit for sincos 680};; 681GLOBAL_LIBM_END(__libm_sincosf) 682 683//// |x| > 2^24 path /////// 684.proc _CISF_LARGE_ARGS 685_CISF_LARGE_ARGS: 686.prologue 687{ .mfi 688 nop.m 0 689 nop.f 0 690.save ar.pfs, GR_SAVE_PFS 691 mov GR_SAVE_PFS = ar.pfs 692};; 693 694{ .mfi 695 mov GR_SAVE_GP = gp 696 nop.f 0 697.save b0, GR_SAVE_B0 698 mov GR_SAVE_B0 = b0 699};; 700 701.body 702// Call of huge arguments sincos 703{ .mib 704 nop.m 0 705 mov GR_SAVE_PR = pr 706 br.call.sptk b0 = __libm_sincos_large 707};; 708 709{ .mfi 710 mov gp = GR_SAVE_GP 711 nop.f 0 712 mov pr = GR_SAVE_PR, 0x1fffe 713} 714;; 715 716{ .mfi 717 nop.m 0 718 nop.f 0 719 mov b0 = GR_SAVE_B0 720} 721;; 722 723{ .mfi 724 nop.m 0 725 fma.s.s0 cisf_Cos_res = cisf_Cos_res, f1, f0 726 mov ar.pfs = GR_SAVE_PFS 727} 728// exit for |x| > 2^24 path (__libm_sincos and cis) 729{ .mfb 730 nop.m 0 731 fma.s.s0 cisf_Sin_res = cisf_Sin_res, f1, f0 732(p14) br.cond.sptk _CISF_RETURN 733};; 734 735{ .mmb 736 stfs [cisf_pResSin] = cisf_Sin_res 737 stfs [cisf_pResCos] = cisf_Cos_res 738 br.ret.sptk b0 // exit for sincos |x| > 2^24 path 739};; 740 741.endp _CISF_LARGE_ARGS 742 743.type __libm_sincos_large#,@function 744.global __libm_sincos_large# 745