1.file "libm_sincos_large.s" 2 3 4// Copyright (c) 2002 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History 40//============================================================== 41// 02/15/02 Initial version 42// 05/13/02 Changed interface to __libm_pi_by_2_reduce 43// 02/10/03 Reordered header: .section, .global, .proc, .align; 44// used data8 for long double table values 45// 05/15/03 Reformatted data tables 46// 47// 48// Overview of operation 49//============================================================== 50// 51// These functions calculate the sin and cos for inputs 52// greater than 2^10 53// 54// __libm_sin_large# 55// __libm_cos_large# 56// They accept argument in f8 57// and return result in f8 without final rounding 58// 59// __libm_sincos_large# 60// It accepts argument in f8 61// and returns cos in f8 and sin in f9 without final rounding 62// 63// 64//********************************************************************* 65// 66// Accuracy: Within .7 ulps for 80-bit floating point values 67// Very accurate for double precision values 68// 69//********************************************************************* 70// 71// Resources Used: 72// 73// Floating-Point Registers: f8 as Input Value, f8 and f9 as Return Values 74// f32-f103 75// 76// General Purpose Registers: 77// r32-r43 78// r44-r45 (Used to pass arguments to pi_by_2 reduce routine) 79// 80// Predicate Registers: p6-p13 81// 82//********************************************************************* 83// 84// IEEE Special Conditions: 85// 86// Denormal fault raised on denormal inputs 87// Overflow exceptions do not occur 88// Underflow exceptions raised when appropriate for sin 89// (No specialized error handling for this routine) 90// Inexact raised when appropriate by algorithm 91// 92// sin(SNaN) = QNaN 93// sin(QNaN) = QNaN 94// sin(inf) = QNaN 95// sin(+/-0) = +/-0 96// cos(inf) = QNaN 97// cos(SNaN) = QNaN 98// cos(QNaN) = QNaN 99// cos(0) = 1 100// 101//********************************************************************* 102// 103// Mathematical Description 104// ======================== 105// 106// The computation of FSIN and FCOS is best handled in one piece of 107// code. The main reason is that given any argument Arg, computation 108// of trigonometric functions first calculate N and an approximation 109// to alpha where 110// 111// Arg = N pi/2 + alpha, |alpha| <= pi/4. 112// 113// Since 114// 115// cos( Arg ) = sin( (N+1) pi/2 + alpha ), 116// 117// therefore, the code for computing sine will produce cosine as long 118// as 1 is added to N immediately after the argument reduction 119// process. 120// 121// Let M = N if sine 122// N+1 if cosine. 123// 124// Now, given 125// 126// Arg = M pi/2 + alpha, |alpha| <= pi/4, 127// 128// let I = M mod 4, or I be the two lsb of M when M is represented 129// as 2's complement. I = [i_0 i_1]. Then 130// 131// sin( Arg ) = (-1)^i_0 sin( alpha ) if i_1 = 0, 132// = (-1)^i_0 cos( alpha ) if i_1 = 1. 133// 134// For example: 135// if M = -1, I = 11 136// sin ((-pi/2 + alpha) = (-1) cos (alpha) 137// if M = 0, I = 00 138// sin (alpha) = sin (alpha) 139// if M = 1, I = 01 140// sin (pi/2 + alpha) = cos (alpha) 141// if M = 2, I = 10 142// sin (pi + alpha) = (-1) sin (alpha) 143// if M = 3, I = 11 144// sin ((3/2)pi + alpha) = (-1) cos (alpha) 145// 146// The value of alpha is obtained by argument reduction and 147// represented by two working precision numbers r and c where 148// 149// alpha = r + c accurately. 150// 151// The reduction method is described in a previous write up. 152// The argument reduction scheme identifies 4 cases. For Cases 2 153// and 4, because |alpha| is small, sin(r+c) and cos(r+c) can be 154// computed very easily by 2 or 3 terms of the Taylor series 155// expansion as follows: 156// 157// Case 2: 158// ------- 159// 160// sin(r + c) = r + c - r^3/6 accurately 161// cos(r + c) = 1 - 2^(-67) accurately 162// 163// Case 4: 164// ------- 165// 166// sin(r + c) = r + c - r^3/6 + r^5/120 accurately 167// cos(r + c) = 1 - r^2/2 + r^4/24 accurately 168// 169// The only cases left are Cases 1 and 3 of the argument reduction 170// procedure. These two cases will be merged since after the 171// argument is reduced in either cases, we have the reduced argument 172// represented as r + c and that the magnitude |r + c| is not small 173// enough to allow the usage of a very short approximation. 174// 175// The required calculation is either 176// 177// sin(r + c) = sin(r) + correction, or 178// cos(r + c) = cos(r) + correction. 179// 180// Specifically, 181// 182// sin(r + c) = sin(r) + c sin'(r) + O(c^2) 183// = sin(r) + c cos (r) + O(c^2) 184// = sin(r) + c(1 - r^2/2) accurately. 185// Similarly, 186// 187// cos(r + c) = cos(r) - c sin(r) + O(c^2) 188// = cos(r) - c(r - r^3/6) accurately. 189// 190// We therefore concentrate on accurately calculating sin(r) and 191// cos(r) for a working-precision number r, |r| <= pi/4 to within 192// 0.1% or so. 193// 194// The greatest challenge of this task is that the second terms of 195// the Taylor series 196// 197// r - r^3/3! + r^r/5! - ... 198// 199// and 200// 201// 1 - r^2/2! + r^4/4! - ... 202// 203// are not very small when |r| is close to pi/4 and the rounding 204// errors will be a concern if simple polynomial accumulation is 205// used. When |r| < 2^-3, however, the second terms will be small 206// enough (6 bits or so of right shift) that a normal Horner 207// recurrence suffices. Hence there are two cases that we consider 208// in the accurate computation of sin(r) and cos(r), |r| <= pi/4. 209// 210// Case small_r: |r| < 2^(-3) 211// -------------------------- 212// 213// Since Arg = M pi/4 + r + c accurately, and M mod 4 is [i_0 i_1], 214// we have 215// 216// sin(Arg) = (-1)^i_0 * sin(r + c) if i_1 = 0 217// = (-1)^i_0 * cos(r + c) if i_1 = 1 218// 219// can be accurately approximated by 220// 221// sin(Arg) = (-1)^i_0 * [sin(r) + c] if i_1 = 0 222// = (-1)^i_0 * [cos(r) - c*r] if i_1 = 1 223// 224// because |r| is small and thus the second terms in the correction 225// are unnecessary. 226// 227// Finally, sin(r) and cos(r) are approximated by polynomials of 228// moderate lengths. 229// 230// sin(r) = r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11 231// cos(r) = 1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10 232// 233// We can make use of predicates to selectively calculate 234// sin(r) or cos(r) based on i_1. 235// 236// Case normal_r: 2^(-3) <= |r| <= pi/4 237// ------------------------------------ 238// 239// This case is more likely than the previous one if one considers 240// r to be uniformly distributed in [-pi/4 pi/4]. Again, 241// 242// sin(Arg) = (-1)^i_0 * sin(r + c) if i_1 = 0 243// = (-1)^i_0 * cos(r + c) if i_1 = 1. 244// 245// Because |r| is now larger, we need one extra term in the 246// correction. sin(Arg) can be accurately approximated by 247// 248// sin(Arg) = (-1)^i_0 * [sin(r) + c(1-r^2/2)] if i_1 = 0 249// = (-1)^i_0 * [cos(r) - c*r*(1 - r^2/6)] i_1 = 1. 250// 251// Finally, sin(r) and cos(r) are approximated by polynomials of 252// moderate lengths. 253// 254// sin(r) = r + PP_1_hi r^3 + PP_1_lo r^3 + 255// PP_2 r^5 + ... + PP_8 r^17 256// 257// cos(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16 258// 259// where PP_1_hi is only about 16 bits long and QQ_1 is -1/2. 260// The crux in accurate computation is to calculate 261// 262// r + PP_1_hi r^3 or 1 + QQ_1 r^2 263// 264// accurately as two pieces: U_hi and U_lo. The way to achieve this 265// is to obtain r_hi as a 10 sig. bit number that approximates r to 266// roughly 8 bits or so of accuracy. (One convenient way is 267// 268// r_hi := frcpa( frcpa( r ) ).) 269// 270// This way, 271// 272// r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 + 273// PP_1_hi (r^3 - r_hi^3) 274// = [r + PP_1_hi r_hi^3] + 275// [PP_1_hi (r - r_hi) 276// (r^2 + r_hi r + r_hi^2) ] 277// = U_hi + U_lo 278// 279// Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long, 280// PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed 281// exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign 282// and that there is no more than 8 bit shift off between r and 283// PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus 284// calculated without any error. Finally, the fact that 285// 286// |U_lo| <= 2^(-8) |U_hi| 287// 288// says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly 289// 8 extra bits of accuracy. 290// 291// Similarly, 292// 293// 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] + 294// [QQ_1 (r - r_hi)(r + r_hi)] 295// = U_hi + U_lo. 296// 297// Summarizing, we calculate r_hi = frcpa( frcpa( r ) ). 298// 299// If i_1 = 0, then 300// 301// U_hi := r + PP_1_hi * r_hi^3 302// U_lo := PP_1_hi * (r - r_hi) * (r^2 + r*r_hi + r_hi^2) 303// poly := PP_1_lo r^3 + PP_2 r^5 + ... + PP_8 r^17 304// correction := c * ( 1 + C_1 r^2 ) 305// 306// Else ...i_1 = 1 307// 308// U_hi := 1 + QQ_1 * r_hi * r_hi 309// U_lo := QQ_1 * (r - r_hi) * (r + r_hi) 310// poly := QQ_2 * r^4 + QQ_3 * r^6 + ... + QQ_8 r^16 311// correction := -c * r * (1 + S_1 * r^2) 312// 313// End 314// 315// Finally, 316// 317// V := poly + ( U_lo + correction ) 318// 319// / U_hi + V if i_0 = 0 320// result := | 321// \ (-U_hi) - V if i_0 = 1 322// 323// It is important that in the last step, negation of U_hi is 324// performed prior to the subtraction which is to be performed in 325// the user-set rounding mode. 326// 327// 328// Algorithmic Description 329// ======================= 330// 331// The argument reduction algorithm is tightly integrated into FSIN 332// and FCOS which share the same code. The following is complete and 333// self-contained. The argument reduction description given 334// previously is repeated below. 335// 336// 337// Step 0. Initialization. 338// 339// If FSIN is invoked, set N_inc := 0; else if FCOS is invoked, 340// set N_inc := 1. 341// 342// Step 1. Check for exceptional and special cases. 343// 344// * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special 345// handling. 346// * If |Arg| < 2^24, go to Step 2 for reduction of moderate 347// arguments. This is the most likely case. 348// * If |Arg| < 2^63, go to Step 8 for pre-reduction of large 349// arguments. 350// * If |Arg| >= 2^63, go to Step 10 for special handling. 351// 352// Step 2. Reduction of moderate arguments. 353// 354// If |Arg| < pi/4 ...quick branch 355// N_fix := N_inc (integer) 356// r := Arg 357// c := 0.0 358// Branch to Step 4, Case_1_complete 359// Else ...cf. argument reduction 360// N := Arg * two_by_PI (fp) 361// N_fix := fcvt.fx( N ) (int) 362// N := fcvt.xf( N_fix ) 363// N_fix := N_fix + N_inc 364// s := Arg - N * P_1 (first piece of pi/2) 365// w := -N * P_2 (second piece of pi/2) 366// 367// If |s| >= 2^(-33) 368// go to Step 3, Case_1_reduce 369// Else 370// go to Step 7, Case_2_reduce 371// Endif 372// Endif 373// 374// Step 3. Case_1_reduce. 375// 376// r := s + w 377// c := (s - r) + w ...observe order 378// 379// Step 4. Case_1_complete 380// 381// ...At this point, the reduced argument alpha is 382// ...accurately represented as r + c. 383// If |r| < 2^(-3), go to Step 6, small_r. 384// 385// Step 5. Normal_r. 386// 387// Let [i_0 i_1] by the 2 lsb of N_fix. 388// FR_rsq := r * r 389// r_hi := frcpa( frcpa( r ) ) 390// r_lo := r - r_hi 391// 392// If i_1 = 0, then 393// poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... FR_rsq*PP_8)) 394// U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order 395// U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi) 396// correction := c + c*C_1*FR_rsq ...any order 397// Else 398// poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... + FR_rsq*QQ_8)) 399// U_hi := 1 + QQ_1 * r_hi * r_hi ...any order 400// U_lo := QQ_1 * r_lo * (r + r_hi) 401// correction := -c*(r + S_1*FR_rsq*r) ...any order 402// Endif 403// 404// V := poly + (U_lo + correction) ...observe order 405// 406// result := (i_0 == 0? 1.0 : -1.0) 407// 408// Last instruction in user-set rounding mode 409// 410// result := (i_0 == 0? result*U_hi + V : 411// result*U_hi - V) 412// 413// Return 414// 415// Step 6. Small_r. 416// 417// ...Use flush to zero mode without causing exception 418// Let [i_0 i_1] be the two lsb of N_fix. 419// 420// FR_rsq := r * r 421// 422// If i_1 = 0 then 423// z := FR_rsq*FR_rsq; z := FR_rsq*z *r 424// poly_lo := S_3 + FR_rsq*(S_4 + FR_rsq*S_5) 425// poly_hi := r*FR_rsq*(S_1 + FR_rsq*S_2) 426// correction := c 427// result := r 428// Else 429// z := FR_rsq*FR_rsq; z := FR_rsq*z 430// poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5) 431// poly_hi := FR_rsq*(C_1 + FR_rsq*C_2) 432// correction := -c*r 433// result := 1 434// Endif 435// 436// poly := poly_hi + (z * poly_lo + correction) 437// 438// If i_0 = 1, result := -result 439// 440// Last operation. Perform in user-set rounding mode 441// 442// result := (i_0 == 0? result + poly : 443// result - poly ) 444// Return 445// 446// Step 7. Case_2_reduce. 447// 448// ...Refer to the write up for argument reduction for 449// ...rationale. The reduction algorithm below is taken from 450// ...argument reduction description and integrated this. 451// 452// w := N*P_3 453// U_1 := N*P_2 + w ...FMA 454// U_2 := (N*P_2 - U_1) + w ...2 FMA 455// ...U_1 + U_2 is N*(P_2+P_3) accurately 456// 457// r := s - U_1 458// c := ( (s - r) - U_1 ) - U_2 459// 460// ...The mathematical sum r + c approximates the reduced 461// ...argument accurately. Note that although compared to 462// ...Case 1, this case requires much more work to reduce 463// ...the argument, the subsequent calculation needed for 464// ...any of the trigonometric function is very little because 465// ...|alpha| < 1.01*2^(-33) and thus two terms of the 466// ...Taylor series expansion suffices. 467// 468// If i_1 = 0 then 469// poly := c + S_1 * r * r * r ...any order 470// result := r 471// Else 472// poly := -2^(-67) 473// result := 1.0 474// Endif 475// 476// If i_0 = 1, result := -result 477// 478// Last operation. Perform in user-set rounding mode 479// 480// result := (i_0 == 0? result + poly : 481// result - poly ) 482// 483// Return 484// 485// 486// Step 8. Pre-reduction of large arguments. 487// 488// ...Again, the following reduction procedure was described 489// ...in the separate write up for argument reduction, which 490// ...is tightly integrated here. 491 492// N_0 := Arg * Inv_P_0 493// N_0_fix := fcvt.fx( N_0 ) 494// N_0 := fcvt.xf( N_0_fix) 495 496// Arg' := Arg - N_0 * P_0 497// w := N_0 * d_1 498// N := Arg' * two_by_PI 499// N_fix := fcvt.fx( N ) 500// N := fcvt.xf( N_fix ) 501// N_fix := N_fix + N_inc 502// 503// s := Arg' - N * P_1 504// w := w - N * P_2 505// 506// If |s| >= 2^(-14) 507// go to Step 3 508// Else 509// go to Step 9 510// Endif 511// 512// Step 9. Case_4_reduce. 513// 514// ...first obtain N_0*d_1 and -N*P_2 accurately 515// U_hi := N_0 * d_1 V_hi := -N*P_2 516// U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - U_hi ...FMAs 517// 518// ...compute the contribution from N_0*d_1 and -N*P_3 519// w := -N*P_3 520// w := w + N_0*d_2 521// t := U_lo + V_lo + w ...any order 522// 523// ...at this point, the mathematical value 524// ...s + U_hi + V_hi + t approximates the true reduced argument 525// ...accurately. Just need to compute this accurately. 526// 527// ...Calculate U_hi + V_hi accurately: 528// A := U_hi + V_hi 529// if |U_hi| >= |V_hi| then 530// a := (U_hi - A) + V_hi 531// else 532// a := (V_hi - A) + U_hi 533// endif 534// ...order in computing "a" must be observed. This branch is 535// ...best implemented by predicates. 536// ...A + a is U_hi + V_hi accurately. Moreover, "a" is 537// ...much smaller than A: |a| <= (1/2)ulp(A). 538// 539// ...Just need to calculate s + A + a + t 540// C_hi := s + A t := t + a 541// C_lo := (s - C_hi) + A 542// C_lo := C_lo + t 543// 544// ...Final steps for reduction 545// r := C_hi + C_lo 546// c := (C_hi - r) + C_lo 547// 548// ...At this point, we have r and c 549// ...And all we need is a couple of terms of the corresponding 550// ...Taylor series. 551// 552// If i_1 = 0 553// poly := c + r*FR_rsq*(S_1 + FR_rsq*S_2) 554// result := r 555// Else 556// poly := FR_rsq*(C_1 + FR_rsq*C_2) 557// result := 1 558// Endif 559// 560// If i_0 = 1, result := -result 561// 562// Last operation. Perform in user-set rounding mode 563// 564// result := (i_0 == 0? result + poly : 565// result - poly ) 566// Return 567// 568// Large Arguments: For arguments above 2**63, a Payne-Hanek 569// style argument reduction is used and pi_by_2 reduce is called. 570// 571 572 573RODATA 574.align 16 575 576LOCAL_OBJECT_START(FSINCOS_CONSTANTS) 577 578data4 0x4B800000 // two**24 579data4 0xCB800000 // -two**24 580data4 0x00000000 // pad 581data4 0x00000000 // pad 582data8 0xA2F9836E4E44152A, 0x00003FFE // Inv_pi_by_2 583data8 0xC84D32B0CE81B9F1, 0x00004016 // P_0 584data8 0xC90FDAA22168C235, 0x00003FFF // P_1 585data8 0xECE675D1FC8F8CBB, 0x0000BFBD // P_2 586data8 0xB7ED8FBBACC19C60, 0x0000BF7C // P_3 587data4 0x5F000000 // two**63 588data4 0xDF000000 // -two**63 589data4 0x00000000 // pad 590data4 0x00000000 // pad 591data8 0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0 592data8 0x8D848E89DBD171A1, 0x0000BFBF // d_1 593data8 0xD5394C3618A66F8E, 0x0000BF7C // d_2 594data8 0xC90FDAA22168C234, 0x00003FFE // pi_by_4 595data8 0xC90FDAA22168C234, 0x0000BFFE // neg_pi_by_4 596data4 0x3E000000 // two**-3 597data4 0xBE000000 // -two**-3 598data4 0x00000000 // pad 599data4 0x00000000 // pad 600data4 0x2F000000 // two**-33 601data4 0xAF000000 // -two**-33 602data4 0x9E000000 // -two**-67 603data4 0x00000000 // pad 604data8 0xCC8ABEBCA21C0BC9, 0x00003FCE // PP_8 605data8 0xD7468A05720221DA, 0x0000BFD6 // PP_7 606data8 0xB092382F640AD517, 0x00003FDE // PP_6 607data8 0xD7322B47D1EB75A4, 0x0000BFE5 // PP_5 608data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1 609data8 0xAAAA000000000000, 0x0000BFFC // PP_1_hi 610data8 0xB8EF1D2ABAF69EEA, 0x00003FEC // PP_4 611data8 0xD00D00D00D03BB69, 0x0000BFF2 // PP_3 612data8 0x8888888888888962, 0x00003FF8 // PP_2 613data8 0xAAAAAAAAAAAB0000, 0x0000BFEC // PP_1_lo 614data8 0xD56232EFC2B0FE52, 0x00003FD2 // QQ_8 615data8 0xC9C99ABA2B48DCA6, 0x0000BFDA // QQ_7 616data8 0x8F76C6509C716658, 0x00003FE2 // QQ_6 617data8 0x93F27DBAFDA8D0FC, 0x0000BFE9 // QQ_5 618data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1 619data8 0x8000000000000000, 0x0000BFFE // QQ_1 620data8 0xD00D00D00C6E5041, 0x00003FEF // QQ_4 621data8 0xB60B60B60B607F60, 0x0000BFF5 // QQ_3 622data8 0xAAAAAAAAAAAAAA9B, 0x00003FFA // QQ_2 623data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1 624data8 0xAAAAAAAAAAAA719F, 0x00003FFA // C_2 625data8 0xB60B60B60356F994, 0x0000BFF5 // C_3 626data8 0xD00CFFD5B2385EA9, 0x00003FEF // C_4 627data8 0x93E4BD18292A14CD, 0x0000BFE9 // C_5 628data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1 629data8 0x88888888888868DB, 0x00003FF8 // S_2 630data8 0xD00D00D0055EFD4B, 0x0000BFF2 // S_3 631data8 0xB8EF1C5D839730B9, 0x00003FEC // S_4 632data8 0xD71EA3A4E5B3F492, 0x0000BFE5 // S_5 633data4 0x38800000 // two**-14 634data4 0xB8800000 // -two**-14 635LOCAL_OBJECT_END(FSINCOS_CONSTANTS) 636 637// sin and cos registers 638 639// FR 640FR_Input_X = f8 641 642FR_r = f8 643FR_c = f9 644 645FR_Two_to_63 = f32 646FR_Two_to_24 = f33 647FR_Pi_by_4 = f33 648FR_Two_to_M14 = f34 649FR_Two_to_M33 = f35 650FR_Neg_Two_to_24 = f36 651FR_Neg_Pi_by_4 = f36 652FR_Neg_Two_to_M14 = f37 653FR_Neg_Two_to_M33 = f38 654FR_Neg_Two_to_M67 = f39 655FR_Inv_pi_by_2 = f40 656FR_N_float = f41 657FR_N_fix = f42 658FR_P_1 = f43 659FR_P_2 = f44 660FR_P_3 = f45 661FR_s = f46 662FR_w = f47 663FR_d_2 = f48 664FR_prelim = f49 665FR_Z = f50 666FR_A = f51 667FR_a = f52 668FR_t = f53 669FR_U_1 = f54 670FR_U_2 = f55 671FR_C_1 = f56 672FR_C_2 = f57 673FR_C_3 = f58 674FR_C_4 = f59 675FR_C_5 = f60 676FR_S_1 = f61 677FR_S_2 = f62 678FR_S_3 = f63 679FR_S_4 = f64 680FR_S_5 = f65 681FR_poly_hi = f66 682FR_poly_lo = f67 683FR_r_hi = f68 684FR_r_lo = f69 685FR_rsq = f70 686FR_r_cubed = f71 687FR_C_hi = f72 688FR_N_0 = f73 689FR_d_1 = f74 690FR_V = f75 691FR_V_hi = f75 692FR_V_lo = f76 693FR_U_hi = f77 694FR_U_lo = f78 695FR_U_hiabs = f79 696FR_V_hiabs = f80 697FR_PP_8 = f81 698FR_QQ_8 = f81 699FR_PP_7 = f82 700FR_QQ_7 = f82 701FR_PP_6 = f83 702FR_QQ_6 = f83 703FR_PP_5 = f84 704FR_QQ_5 = f84 705FR_PP_4 = f85 706FR_QQ_4 = f85 707FR_PP_3 = f86 708FR_QQ_3 = f86 709FR_PP_2 = f87 710FR_QQ_2 = f87 711FR_QQ_1 = f88 712FR_N_0_fix = f89 713FR_Inv_P_0 = f90 714FR_corr = f91 715FR_poly = f92 716FR_Neg_Two_to_M3 = f93 717FR_Two_to_M3 = f94 718FR_Neg_Two_to_63 = f94 719FR_P_0 = f95 720FR_C_lo = f96 721FR_PP_1 = f97 722FR_PP_1_lo = f98 723FR_ArgPrime = f99 724 725// GR 726GR_Table_Base = r32 727GR_Table_Base1 = r33 728GR_i_0 = r34 729GR_i_1 = r35 730GR_N_Inc = r36 731GR_Sin_or_Cos = r37 732 733GR_SAVE_B0 = r39 734GR_SAVE_GP = r40 735GR_SAVE_PFS = r41 736 737// sincos combined routine registers 738 739// GR 740GR_SINCOS_SAVE_PFS = r32 741GR_SINCOS_SAVE_B0 = r33 742GR_SINCOS_SAVE_GP = r34 743 744// FR 745FR_SINCOS_ARG = f100 746FR_SINCOS_RES_SIN = f101 747 748 749.section .text 750 751 752GLOBAL_LIBM_ENTRY(__libm_sincos_large) 753 754{ .mfi 755 alloc GR_SINCOS_SAVE_PFS = ar.pfs,0,3,0,0 756 fma.s1 FR_SINCOS_ARG = f8, f1, f0 // Save argument for sin and cos 757 mov GR_SINCOS_SAVE_B0 = b0 758};; 759 760{ .mfb 761 mov GR_SINCOS_SAVE_GP = gp 762 nop.f 0 763 br.call.sptk b0 = __libm_sin_large // Call sin 764};; 765 766{ .mfi 767 nop.m 0 768 fma.s1 FR_SINCOS_RES_SIN = f8, f1, f0 // Save sin result 769 nop.i 0 770};; 771 772{ .mfb 773 nop.m 0 774 fma.s1 f8 = FR_SINCOS_ARG, f1, f0 // Arg for cos 775 br.call.sptk b0 = __libm_cos_large // Call cos 776};; 777 778{ .mfi 779 mov gp = GR_SINCOS_SAVE_GP 780 fma.s1 f9 = FR_SINCOS_RES_SIN, f1, f0 // Out sin result 781 mov b0 = GR_SINCOS_SAVE_B0 782};; 783 784{ .mib 785 nop.m 0 786 mov ar.pfs = GR_SINCOS_SAVE_PFS 787 br.ret.sptk b0 // sincos_large exit 788};; 789 790GLOBAL_LIBM_END(__libm_sincos_large) 791 792 793 794 795GLOBAL_LIBM_ENTRY(__libm_sin_large) 796 797{ .mlx 798alloc GR_Table_Base = ar.pfs,0,12,2,0 799 movl GR_Sin_or_Cos = 0x0 ;; 800} 801 802{ .mmi 803 nop.m 999 804 addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp 805 nop.i 999 806} 807;; 808 809{ .mmi 810 ld8 GR_Table_Base = [GR_Table_Base] 811 nop.m 999 812 nop.i 999 813} 814;; 815 816 817{ .mib 818 nop.m 999 819 nop.i 999 820 br.cond.sptk SINCOS_CONTINUE ;; 821} 822 823GLOBAL_LIBM_END(__libm_sin_large) 824 825GLOBAL_LIBM_ENTRY(__libm_cos_large) 826 827{ .mlx 828alloc GR_Table_Base= ar.pfs,0,12,2,0 829 movl GR_Sin_or_Cos = 0x1 ;; 830} 831 832{ .mmi 833 nop.m 999 834 addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp 835 nop.i 999 836} 837;; 838 839{ .mmi 840 ld8 GR_Table_Base = [GR_Table_Base] 841 nop.m 999 842 nop.i 999 843} 844;; 845 846// 847// Load Table Address 848// 849SINCOS_CONTINUE: 850 851{ .mmi 852 add GR_Table_Base1 = 96, GR_Table_Base 853 ldfs FR_Two_to_24 = [GR_Table_Base], 4 854 nop.i 999 855} 856;; 857 858{ .mmi 859 nop.m 999 860// 861// Load 2**24, load 2**63. 862// 863 ldfs FR_Neg_Two_to_24 = [GR_Table_Base], 12 864 mov r41 = ar.pfs ;; 865} 866 867{ .mfi 868 ldfs FR_Two_to_63 = [GR_Table_Base1], 4 869// 870// Check for unnormals - unsupported operands. We do not want 871// to generate denormal exception 872// Check for NatVals, QNaNs, SNaNs, +/-Infs 873// Check for EM unsupporteds 874// Check for Zero 875// 876 fclass.m.unc p6, p8 = FR_Input_X, 0x1E3 877 mov r40 = gp ;; 878} 879 880{ .mfi 881 nop.m 999 882 fclass.nm.unc p8, p0 = FR_Input_X, 0x1FF 883// GR_Sin_or_Cos denotes 884 mov r39 = b0 885} 886 887{ .mfb 888 ldfs FR_Neg_Two_to_63 = [GR_Table_Base1], 12 889 fclass.m.unc p10, p0 = FR_Input_X, 0x007 890(p6) br.cond.spnt SINCOS_SPECIAL ;; 891} 892 893{ .mib 894 nop.m 999 895 nop.i 999 896(p8) br.cond.spnt SINCOS_SPECIAL ;; 897} 898 899{ .mib 900 nop.m 999 901 nop.i 999 902// 903// Branch if +/- NaN, Inf. 904// Load -2**24, load -2**63. 905// 906(p10) br.cond.spnt SINCOS_ZERO ;; 907} 908 909{ .mmb 910 ldfe FR_Inv_pi_by_2 = [GR_Table_Base], 16 911 ldfe FR_Inv_P_0 = [GR_Table_Base1], 16 912 nop.b 999 ;; 913} 914 915{ .mmb 916 nop.m 999 917 ldfe FR_d_1 = [GR_Table_Base1], 16 918 nop.b 999 ;; 919} 920// 921// Raise possible denormal operand flag with useful fcmp 922// Is x <= -2**63 923// Load Inv_P_0 for pre-reduction 924// Load Inv_pi_by_2 925// 926 927{ .mmb 928 ldfe FR_P_0 = [GR_Table_Base], 16 929 ldfe FR_d_2 = [GR_Table_Base1], 16 930 nop.b 999 ;; 931} 932// 933// Load P_0 934// Load d_1 935// Is x >= 2**63 936// Is x <= -2**24? 937// 938 939{ .mmi 940 ldfe FR_P_1 = [GR_Table_Base], 16 ;; 941// 942// Load P_1 943// Load d_2 944// Is x >= 2**24? 945// 946 ldfe FR_P_2 = [GR_Table_Base], 16 947 nop.i 999 ;; 948} 949 950{ .mmf 951 nop.m 999 952 ldfe FR_P_3 = [GR_Table_Base], 16 953 fcmp.le.unc.s1 p7, p8 = FR_Input_X, FR_Neg_Two_to_24 954} 955 956{ .mfi 957 nop.m 999 958// 959// Branch if +/- zero. 960// Decide about the paths to take: 961// If -2**24 < FR_Input_X < 2**24 - CASE 1 OR 2 962// OTHERWISE - CASE 3 OR 4 963// 964 fcmp.le.unc.s1 p10, p11 = FR_Input_X, FR_Neg_Two_to_63 965 nop.i 999 ;; 966} 967 968{ .mfi 969 nop.m 999 970(p8) fcmp.ge.s1 p7, p0 = FR_Input_X, FR_Two_to_24 971 nop.i 999 972} 973 974{ .mfi 975 ldfe FR_Pi_by_4 = [GR_Table_Base1], 16 976(p11) fcmp.ge.s1 p10, p0 = FR_Input_X, FR_Two_to_63 977 nop.i 999 ;; 978} 979 980{ .mmi 981 ldfe FR_Neg_Pi_by_4 = [GR_Table_Base1], 16 ;; 982 ldfs FR_Two_to_M3 = [GR_Table_Base1], 4 983 nop.i 999 ;; 984} 985 986{ .mib 987 ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1], 12 988 nop.i 999 989// 990// Load P_2 991// Load P_3 992// Load pi_by_4 993// Load neg_pi_by_4 994// Load 2**(-3) 995// Load -2**(-3). 996// 997(p10) br.cond.spnt SINCOS_ARG_TOO_LARGE ;; 998} 999 1000{ .mib 1001 nop.m 999 1002 nop.i 999 1003// 1004// Branch out if x >= 2**63. Use Payne-Hanek Reduction 1005// 1006(p7) br.cond.spnt SINCOS_LARGER_ARG ;; 1007} 1008 1009{ .mfi 1010 nop.m 999 1011// 1012// Branch if Arg <= -2**24 or Arg >= 2**24 and use pre-reduction. 1013// 1014 fma.s1 FR_N_float = FR_Input_X, FR_Inv_pi_by_2, f0 1015 nop.i 999 ;; 1016} 1017 1018{ .mfi 1019 nop.m 999 1020 fcmp.lt.unc.s1 p6, p7 = FR_Input_X, FR_Pi_by_4 1021 nop.i 999 ;; 1022} 1023 1024{ .mfi 1025 nop.m 999 1026// 1027// Select the case when |Arg| < pi/4 1028// Else Select the case when |Arg| >= pi/4 1029// 1030 fcvt.fx.s1 FR_N_fix = FR_N_float 1031 nop.i 999 ;; 1032} 1033 1034{ .mfi 1035 nop.m 999 1036// 1037// N = Arg * 2/pi 1038// Check if Arg < pi/4 1039// 1040(p6) fcmp.gt.s1 p6, p7 = FR_Input_X, FR_Neg_Pi_by_4 1041 nop.i 999 ;; 1042} 1043// 1044// Case 2: Convert integer N_fix back to normalized floating-point value. 1045// Case 1: p8 is only affected when p6 is set 1046// 1047 1048{ .mfi 1049(p7) ldfs FR_Two_to_M33 = [GR_Table_Base1], 4 1050// 1051// Grab the integer part of N and call it N_fix 1052// 1053(p6) fmerge.se FR_r = FR_Input_X, FR_Input_X 1054// If |x| < pi/4, r = x and c = 0 1055// lf |x| < pi/4, is x < 2**(-3). 1056// r = Arg 1057// c = 0 1058(p6) mov GR_N_Inc = GR_Sin_or_Cos ;; 1059} 1060 1061{ .mmf 1062 nop.m 999 1063(p7) ldfs FR_Neg_Two_to_M33 = [GR_Table_Base1], 4 1064(p6) fmerge.se FR_c = f0, f0 1065} 1066 1067{ .mfi 1068 nop.m 999 1069(p6) fcmp.lt.unc.s1 p8, p9 = FR_Input_X, FR_Two_to_M3 1070 nop.i 999 ;; 1071} 1072 1073{ .mfi 1074 nop.m 999 1075// 1076// lf |x| < pi/4, is -2**(-3)< x < 2**(-3) - set p8. 1077// If |x| >= pi/4, 1078// Create the right N for |x| < pi/4 and otherwise 1079// Case 2: Place integer part of N in GP register 1080// 1081(p7) fcvt.xf FR_N_float = FR_N_fix 1082 nop.i 999 ;; 1083} 1084 1085{ .mmf 1086 nop.m 999 1087(p7) getf.sig GR_N_Inc = FR_N_fix 1088(p8) fcmp.gt.s1 p8, p0 = FR_Input_X, FR_Neg_Two_to_M3 ;; 1089} 1090 1091{ .mib 1092 nop.m 999 1093 nop.i 999 1094// 1095// Load 2**(-33), -2**(-33) 1096// 1097(p8) br.cond.spnt SINCOS_SMALL_R ;; 1098} 1099 1100{ .mib 1101 nop.m 999 1102 nop.i 999 1103(p6) br.cond.sptk SINCOS_NORMAL_R ;; 1104} 1105// 1106// if |x| < pi/4, branch based on |x| < 2**(-3) or otherwise. 1107// 1108// 1109// In this branch, |x| >= pi/4. 1110// 1111 1112{ .mfi 1113 ldfs FR_Neg_Two_to_M67 = [GR_Table_Base1], 8 1114// 1115// Load -2**(-67) 1116// 1117 fnma.s1 FR_s = FR_N_float, FR_P_1, FR_Input_X 1118// 1119// w = N * P_2 1120// s = -N * P_1 + Arg 1121// 1122 add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos 1123} 1124 1125{ .mfi 1126 nop.m 999 1127 fma.s1 FR_w = FR_N_float, FR_P_2, f0 1128 nop.i 999 ;; 1129} 1130 1131{ .mfi 1132 nop.m 999 1133// 1134// Adjust N_fix by N_inc to determine whether sine or 1135// cosine is being calculated 1136// 1137 fcmp.lt.unc.s1 p7, p6 = FR_s, FR_Two_to_M33 1138 nop.i 999 ;; 1139} 1140 1141{ .mfi 1142 nop.m 999 1143(p7) fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33 1144 nop.i 999 ;; 1145} 1146 1147{ .mfi 1148 nop.m 999 1149// Remember x >= pi/4. 1150// Is s <= -2**(-33) or s >= 2**(-33) (p6) 1151// or -2**(-33) < s < 2**(-33) (p7) 1152(p6) fms.s1 FR_r = FR_s, f1, FR_w 1153 nop.i 999 1154} 1155 1156{ .mfi 1157 nop.m 999 1158(p7) fma.s1 FR_w = FR_N_float, FR_P_3, f0 1159 nop.i 999 ;; 1160} 1161 1162{ .mfi 1163 nop.m 999 1164(p7) fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w 1165 nop.i 999 1166} 1167 1168{ .mfi 1169 nop.m 999 1170(p6) fms.s1 FR_c = FR_s, f1, FR_r 1171 nop.i 999 ;; 1172} 1173 1174{ .mfi 1175 nop.m 999 1176// 1177// For big s: r = s - w: No futher reduction is necessary 1178// For small s: w = N * P_3 (change sign) More reduction 1179// 1180(p6) fcmp.lt.unc.s1 p8, p9 = FR_r, FR_Two_to_M3 1181 nop.i 999 ;; 1182} 1183 1184{ .mfi 1185 nop.m 999 1186(p8) fcmp.gt.s1 p8, p9 = FR_r, FR_Neg_Two_to_M3 1187 nop.i 999 ;; 1188} 1189 1190{ .mfi 1191 nop.m 999 1192(p7) fms.s1 FR_r = FR_s, f1, FR_U_1 1193 nop.i 999 1194} 1195 1196{ .mfb 1197 nop.m 999 1198// 1199// For big s: Is |r| < 2**(-3)? 1200// For big s: c = S - r 1201// For small s: U_1 = N * P_2 + w 1202// 1203// If p8 is set, prepare to branch to Small_R. 1204// If p9 is set, prepare to branch to Normal_R. 1205// For big s, r is complete here. 1206// 1207(p6) fms.s1 FR_c = FR_c, f1, FR_w 1208// 1209// For big s: c = c + w (w has not been negated.) 1210// For small s: r = S - U_1 1211// 1212(p8) br.cond.spnt SINCOS_SMALL_R ;; 1213} 1214 1215{ .mib 1216 nop.m 999 1217 nop.i 999 1218(p9) br.cond.sptk SINCOS_NORMAL_R ;; 1219} 1220 1221{ .mfi 1222(p7) add GR_Table_Base1 = 224, GR_Table_Base1 1223// 1224// Branch to SINCOS_SMALL_R or SINCOS_NORMAL_R 1225// 1226(p7) fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1 1227// 1228// c = S - U_1 1229// r = S_1 * r 1230// 1231// 1232(p7) extr.u GR_i_1 = GR_N_Inc, 0, 1 1233} 1234 1235{ .mmi 1236 nop.m 999 ;; 1237// 1238// Get [i_0,i_1] - two lsb of N_fix_gr. 1239// Do dummy fmpy so inexact is always set. 1240// 1241(p7) cmp.eq.unc p9, p10 = 0x0, GR_i_1 1242(p7) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; 1243} 1244// 1245// For small s: U_2 = N * P_2 - U_1 1246// S_1 stored constant - grab the one stored with the 1247// coefficients. 1248// 1249 1250{ .mfi 1251(p7) ldfe FR_S_1 = [GR_Table_Base1], 16 1252// 1253// Check if i_1 and i_0 != 0 1254// 1255(p10) fma.s1 FR_poly = f0, f1, FR_Neg_Two_to_M67 1256(p7) cmp.eq.unc p11, p12 = 0x0, GR_i_0 ;; 1257} 1258 1259{ .mfi 1260 nop.m 999 1261(p7) fms.s1 FR_s = FR_s, f1, FR_r 1262 nop.i 999 1263} 1264 1265{ .mfi 1266 nop.m 999 1267// 1268// S = S - r 1269// U_2 = U_2 + w 1270// load S_1 1271// 1272(p7) fma.s1 FR_rsq = FR_r, FR_r, f0 1273 nop.i 999 ;; 1274} 1275 1276{ .mfi 1277 nop.m 999 1278(p7) fma.s1 FR_U_2 = FR_U_2, f1, FR_w 1279 nop.i 999 1280} 1281 1282{ .mfi 1283 nop.m 999 1284//(p7) fmerge.se FR_Input_X = FR_r, FR_r 1285(p7) fmerge.se FR_prelim = FR_r, FR_r 1286 nop.i 999 ;; 1287} 1288 1289{ .mfi 1290 nop.m 999 1291//(p10) fma.s1 FR_Input_X = f0, f1, f1 1292(p10) fma.s1 FR_prelim = f0, f1, f1 1293 nop.i 999 ;; 1294} 1295 1296{ .mfi 1297 nop.m 999 1298// 1299// FR_rsq = r * r 1300// Save r as the result. 1301// 1302(p7) fms.s1 FR_c = FR_s, f1, FR_U_1 1303 nop.i 999 ;; 1304} 1305 1306{ .mfi 1307 nop.m 999 1308// 1309// if ( i_1 ==0) poly = c + S_1*r*r*r 1310// else Result = 1 1311// 1312//(p12) fnma.s1 FR_Input_X = FR_Input_X, f1, f0 1313(p12) fnma.s1 FR_prelim = FR_prelim, f1, f0 1314 nop.i 999 1315} 1316 1317{ .mfi 1318 nop.m 999 1319(p7) fma.s1 FR_r = FR_S_1, FR_r, f0 1320 nop.i 999 ;; 1321} 1322 1323{ .mfi 1324 nop.m 999 1325(p7) fma.d.s1 FR_S_1 = FR_S_1, FR_S_1, f0 1326 nop.i 999 ;; 1327} 1328 1329{ .mfi 1330 nop.m 999 1331// 1332// If i_1 != 0, poly = 2**(-67) 1333// 1334(p7) fms.s1 FR_c = FR_c, f1, FR_U_2 1335 nop.i 999 ;; 1336} 1337 1338{ .mfi 1339 nop.m 999 1340// 1341// c = c - U_2 1342// 1343(p9) fma.s1 FR_poly = FR_r, FR_rsq, FR_c 1344 nop.i 999 ;; 1345} 1346 1347{ .mfi 1348 nop.m 999 1349// 1350// i_0 != 0, so Result = -Result 1351// 1352(p11) fma.s1 FR_Input_X = FR_prelim, f1, FR_poly 1353 nop.i 999 ;; 1354} 1355 1356{ .mfb 1357 nop.m 999 1358(p12) fms.s1 FR_Input_X = FR_prelim, f1, FR_poly 1359// 1360// if (i_0 == 0), Result = Result + poly 1361// else Result = Result - poly 1362// 1363 br.ret.sptk b0 ;; 1364} 1365SINCOS_LARGER_ARG: 1366 1367{ .mfi 1368 nop.m 999 1369 fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0 1370 nop.i 999 1371} 1372;; 1373 1374// This path for argument > 2*24 1375// Adjust table_ptr1 to beginning of table. 1376// 1377 1378{ .mmi 1379 nop.m 999 1380 addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp 1381 nop.i 999 1382} 1383;; 1384 1385{ .mmi 1386 ld8 GR_Table_Base = [GR_Table_Base] 1387 nop.m 999 1388 nop.i 999 1389} 1390;; 1391 1392 1393// 1394// Point to 2*-14 1395// N_0 = Arg * Inv_P_0 1396// 1397 1398{ .mmi 1399 add GR_Table_Base = 688, GR_Table_Base ;; 1400 ldfs FR_Two_to_M14 = [GR_Table_Base], 4 1401 nop.i 999 ;; 1402} 1403 1404{ .mfi 1405 ldfs FR_Neg_Two_to_M14 = [GR_Table_Base], 0 1406 nop.f 999 1407 nop.i 999 ;; 1408} 1409 1410{ .mfi 1411 nop.m 999 1412// 1413// Load values 2**(-14) and -2**(-14) 1414// 1415 fcvt.fx.s1 FR_N_0_fix = FR_N_0 1416 nop.i 999 ;; 1417} 1418 1419{ .mfi 1420 nop.m 999 1421// 1422// N_0_fix = integer part of N_0 1423// 1424 fcvt.xf FR_N_0 = FR_N_0_fix 1425 nop.i 999 ;; 1426} 1427 1428{ .mfi 1429 nop.m 999 1430// 1431// Make N_0 the integer part 1432// 1433 fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X 1434 nop.i 999 1435} 1436 1437{ .mfi 1438 nop.m 999 1439 fma.s1 FR_w = FR_N_0, FR_d_1, f0 1440 nop.i 999 ;; 1441} 1442 1443{ .mfi 1444 nop.m 999 1445// 1446// Arg' = -N_0 * P_0 + Arg 1447// w = N_0 * d_1 1448// 1449 fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0 1450 nop.i 999 ;; 1451} 1452 1453{ .mfi 1454 nop.m 999 1455// 1456// N = A' * 2/pi 1457// 1458 fcvt.fx.s1 FR_N_fix = FR_N_float 1459 nop.i 999 ;; 1460} 1461 1462{ .mfi 1463 nop.m 999 1464// 1465// N_fix is the integer part 1466// 1467 fcvt.xf FR_N_float = FR_N_fix 1468 nop.i 999 ;; 1469} 1470 1471{ .mfi 1472 getf.sig GR_N_Inc = FR_N_fix 1473 nop.f 999 1474 nop.i 999 ;; 1475} 1476 1477{ .mii 1478 nop.m 999 1479 nop.i 999 ;; 1480 add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;; 1481} 1482 1483{ .mfi 1484 nop.m 999 1485// 1486// N is the integer part of the reduced-reduced argument. 1487// Put the integer in a GP register 1488// 1489 fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime 1490 nop.i 999 1491} 1492 1493{ .mfi 1494 nop.m 999 1495 fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w 1496 nop.i 999 ;; 1497} 1498 1499{ .mfi 1500 nop.m 999 1501// 1502// s = -N*P_1 + Arg' 1503// w = -N*P_2 + w 1504// N_fix_gr = N_fix_gr + N_inc 1505// 1506 fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14 1507 nop.i 999 ;; 1508} 1509 1510{ .mfi 1511 nop.m 999 1512(p9) fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14 1513 nop.i 999 ;; 1514} 1515 1516{ .mfi 1517 nop.m 999 1518// 1519// For |s| > 2**(-14) r = S + w (r complete) 1520// Else U_hi = N_0 * d_1 1521// 1522(p9) fma.s1 FR_V_hi = FR_N_float, FR_P_2, f0 1523 nop.i 999 1524} 1525 1526{ .mfi 1527 nop.m 999 1528(p9) fma.s1 FR_U_hi = FR_N_0, FR_d_1, f0 1529 nop.i 999 ;; 1530} 1531 1532{ .mfi 1533 nop.m 999 1534// 1535// Either S <= -2**(-14) or S >= 2**(-14) 1536// or -2**(-14) < s < 2**(-14) 1537// 1538(p8) fma.s1 FR_r = FR_s, f1, FR_w 1539 nop.i 999 1540} 1541 1542{ .mfi 1543 nop.m 999 1544(p9) fma.s1 FR_w = FR_N_float, FR_P_3, f0 1545 nop.i 999 ;; 1546} 1547 1548{ .mfi 1549 nop.m 999 1550// 1551// We need abs of both U_hi and V_hi - don't 1552// worry about switched sign of V_hi. 1553// 1554(p9) fms.s1 FR_A = FR_U_hi, f1, FR_V_hi 1555 nop.i 999 1556} 1557 1558{ .mfi 1559 nop.m 999 1560// 1561// Big s: finish up c = (S - r) + w (c complete) 1562// Case 4: A = U_hi + V_hi 1563// Note: Worry about switched sign of V_hi, so subtract instead of add. 1564// 1565(p9) fnma.s1 FR_V_lo = FR_N_float, FR_P_2, FR_V_hi 1566 nop.i 999 ;; 1567} 1568 1569{ .mfi 1570 nop.m 999 1571(p9) fms.s1 FR_U_lo = FR_N_0, FR_d_1, FR_U_hi 1572 nop.i 999 ;; 1573} 1574 1575{ .mfi 1576 nop.m 999 1577(p9) fmerge.s FR_V_hiabs = f0, FR_V_hi 1578 nop.i 999 1579} 1580 1581{ .mfi 1582 nop.m 999 1583// For big s: c = S - r 1584// For small s do more work: U_lo = N_0 * d_1 - U_hi 1585// 1586(p9) fmerge.s FR_U_hiabs = f0, FR_U_hi 1587 nop.i 999 ;; 1588} 1589 1590{ .mfi 1591 nop.m 999 1592// 1593// For big s: Is |r| < 2**(-3) 1594// For big s: if p12 set, prepare to branch to Small_R. 1595// For big s: If p13 set, prepare to branch to Normal_R. 1596// 1597(p8) fms.s1 FR_c = FR_s, f1, FR_r 1598 nop.i 999 1599} 1600 1601{ .mfi 1602 nop.m 999 1603// 1604// For small S: V_hi = N * P_2 1605// w = N * P_3 1606// Note the product does not include the (-) as in the writeup 1607// so (-) missing for V_hi and w. 1608// 1609(p8) fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3 1610 nop.i 999 ;; 1611} 1612 1613{ .mfi 1614 nop.m 999 1615(p12) fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3 1616 nop.i 999 ;; 1617} 1618 1619{ .mfi 1620 nop.m 999 1621(p8) fma.s1 FR_c = FR_c, f1, FR_w 1622 nop.i 999 1623} 1624 1625{ .mfb 1626 nop.m 999 1627(p9) fms.s1 FR_w = FR_N_0, FR_d_2, FR_w 1628(p12) br.cond.spnt SINCOS_SMALL_R ;; 1629} 1630 1631{ .mib 1632 nop.m 999 1633 nop.i 999 1634(p13) br.cond.sptk SINCOS_NORMAL_R ;; 1635} 1636 1637{ .mfi 1638 nop.m 999 1639// 1640// Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true. 1641// The remaining stuff is for Case 4. 1642// Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup) 1643// Note: the (-) is still missing for V_lo. 1644// Small s: w = w + N_0 * d_2 1645// Note: the (-) is now incorporated in w. 1646// 1647(p9) fcmp.ge.unc.s1 p10, p11 = FR_U_hiabs, FR_V_hiabs 1648 extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; 1649} 1650 1651{ .mfi 1652 nop.m 999 1653// 1654// C_hi = S + A 1655// 1656(p9) fma.s1 FR_t = FR_U_lo, f1, FR_V_lo 1657 extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; 1658} 1659 1660{ .mfi 1661 nop.m 999 1662// 1663// t = U_lo + V_lo 1664// 1665// 1666(p10) fms.s1 FR_a = FR_U_hi, f1, FR_A 1667 nop.i 999 ;; 1668} 1669 1670{ .mfi 1671 nop.m 999 1672(p11) fma.s1 FR_a = FR_V_hi, f1, FR_A 1673 nop.i 999 1674} 1675;; 1676 1677{ .mmi 1678 nop.m 999 1679 addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp 1680 nop.i 999 1681} 1682;; 1683 1684{ .mmi 1685 ld8 GR_Table_Base = [GR_Table_Base] 1686 nop.m 999 1687 nop.i 999 1688} 1689;; 1690 1691 1692{ .mfi 1693 add GR_Table_Base = 528, GR_Table_Base 1694// 1695// Is U_hiabs >= V_hiabs? 1696// 1697(p9) fma.s1 FR_C_hi = FR_s, f1, FR_A 1698 nop.i 999 ;; 1699} 1700 1701{ .mmi 1702 ldfe FR_C_1 = [GR_Table_Base], 16 ;; 1703 ldfe FR_C_2 = [GR_Table_Base], 64 1704 nop.i 999 ;; 1705} 1706 1707{ .mmf 1708 nop.m 999 1709// 1710// c = c + C_lo finished. 1711// Load C_2 1712// 1713 ldfe FR_S_1 = [GR_Table_Base], 16 1714// 1715// C_lo = S - C_hi 1716// 1717 fma.s1 FR_t = FR_t, f1, FR_w ;; 1718} 1719// 1720// r and c have been computed. 1721// Make sure ftz mode is set - should be automatic when using wre 1722// |r| < 2**(-3) 1723// Get [i_0,i_1] - two lsb of N_fix. 1724// Load S_1 1725// 1726 1727{ .mfi 1728 ldfe FR_S_2 = [GR_Table_Base], 64 1729// 1730// t = t + w 1731// 1732(p10) fms.s1 FR_a = FR_a, f1, FR_V_hi 1733 cmp.eq.unc p9, p10 = 0x0, GR_i_0 1734} 1735 1736{ .mfi 1737 nop.m 999 1738// 1739// For larger u than v: a = U_hi - A 1740// Else a = V_hi - A (do an add to account for missing (-) on V_hi 1741// 1742 fms.s1 FR_C_lo = FR_s, f1, FR_C_hi 1743 nop.i 999 ;; 1744} 1745 1746{ .mfi 1747 nop.m 999 1748(p11) fms.s1 FR_a = FR_U_hi, f1, FR_a 1749 cmp.eq.unc p11, p12 = 0x0, GR_i_1 1750} 1751 1752{ .mfi 1753 nop.m 999 1754// 1755// If u > v: a = (U_hi - A) + V_hi 1756// Else a = (V_hi - A) + U_hi 1757// In each case account for negative missing from V_hi. 1758// 1759 fma.s1 FR_C_lo = FR_C_lo, f1, FR_A 1760 nop.i 999 ;; 1761} 1762 1763{ .mfi 1764 nop.m 999 1765// 1766// C_lo = (S - C_hi) + A 1767// 1768 fma.s1 FR_t = FR_t, f1, FR_a 1769 nop.i 999 ;; 1770} 1771 1772{ .mfi 1773 nop.m 999 1774// 1775// t = t + a 1776// 1777 fma.s1 FR_C_lo = FR_C_lo, f1, FR_t 1778 nop.i 999 ;; 1779} 1780 1781{ .mfi 1782 nop.m 999 1783// 1784// C_lo = C_lo + t 1785// Adjust Table_Base to beginning of table 1786// 1787 fma.s1 FR_r = FR_C_hi, f1, FR_C_lo 1788 nop.i 999 ;; 1789} 1790 1791{ .mfi 1792 nop.m 999 1793// 1794// Load S_2 1795// 1796 fma.s1 FR_rsq = FR_r, FR_r, f0 1797 nop.i 999 1798} 1799 1800{ .mfi 1801 nop.m 999 1802// 1803// Table_Base points to C_1 1804// r = C_hi + C_lo 1805// 1806 fms.s1 FR_c = FR_C_hi, f1, FR_r 1807 nop.i 999 ;; 1808} 1809 1810{ .mfi 1811 nop.m 999 1812// 1813// if i_1 ==0: poly = S_2 * FR_rsq + S_1 1814// else poly = C_2 * FR_rsq + C_1 1815// 1816//(p11) fma.s1 FR_Input_X = f0, f1, FR_r 1817(p11) fma.s1 FR_prelim = f0, f1, FR_r 1818 nop.i 999 ;; 1819} 1820 1821{ .mfi 1822 nop.m 999 1823//(p12) fma.s1 FR_Input_X = f0, f1, f1 1824(p12) fma.s1 FR_prelim = f0, f1, f1 1825 nop.i 999 ;; 1826} 1827 1828{ .mfi 1829 nop.m 999 1830// 1831// Compute r_cube = FR_rsq * r 1832// 1833(p11) fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1 1834 nop.i 999 ;; 1835} 1836 1837{ .mfi 1838 nop.m 999 1839(p12) fma.s1 FR_poly = FR_rsq, FR_C_2, FR_C_1 1840 nop.i 999 1841} 1842 1843{ .mfi 1844 nop.m 999 1845// 1846// Compute FR_rsq = r * r 1847// Is i_1 == 0 ? 1848// 1849 fma.s1 FR_r_cubed = FR_rsq, FR_r, f0 1850 nop.i 999 ;; 1851} 1852 1853{ .mfi 1854 nop.m 999 1855// 1856// c = C_hi - r 1857// Load C_1 1858// 1859 fma.s1 FR_c = FR_c, f1, FR_C_lo 1860 nop.i 999 1861} 1862 1863{ .mfi 1864 nop.m 999 1865// 1866// if i_1 ==0: poly = r_cube * poly + c 1867// else poly = FR_rsq * poly 1868// 1869//(p10) fms.s1 FR_Input_X = f0, f1, FR_Input_X 1870(p10) fms.s1 FR_prelim = f0, f1, FR_prelim 1871 nop.i 999 ;; 1872} 1873 1874{ .mfi 1875 nop.m 999 1876// 1877// if i_1 ==0: Result = r 1878// else Result = 1.0 1879// 1880(p11) fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c 1881 nop.i 999 ;; 1882} 1883 1884{ .mfi 1885 nop.m 999 1886(p12) fma.s1 FR_poly = FR_rsq, FR_poly, f0 1887 nop.i 999 ;; 1888} 1889 1890{ .mfi 1891 nop.m 999 1892// 1893// if i_0 !=0: Result = -Result 1894// 1895(p9) fma.s1 FR_Input_X = FR_prelim, f1, FR_poly 1896 nop.i 999 ;; 1897} 1898 1899{ .mfb 1900 nop.m 999 1901(p10) fms.s1 FR_Input_X = FR_prelim, f1, FR_poly 1902// 1903// if i_0 == 0: Result = Result + poly 1904// else Result = Result - poly 1905// 1906 br.ret.sptk b0 ;; 1907} 1908SINCOS_SMALL_R: 1909 1910{ .mii 1911 nop.m 999 1912 extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; 1913// 1914// 1915// Compare both i_1 and i_0 with 0. 1916// if i_1 == 0, set p9. 1917// if i_0 == 0, set p11. 1918// 1919 cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;; 1920} 1921 1922{ .mfi 1923 nop.m 999 1924 fma.s1 FR_rsq = FR_r, FR_r, f0 1925 extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; 1926} 1927 1928{ .mfi 1929 nop.m 999 1930// 1931// Z = Z * FR_rsq 1932// 1933(p10) fnma.s1 FR_c = FR_c, FR_r, f0 1934 cmp.eq.unc p11, p12 = 0x0, GR_i_0 1935} 1936;; 1937 1938// ****************************************************************** 1939// ****************************************************************** 1940// ****************************************************************** 1941// r and c have been computed. 1942// We know whether this is the sine or cosine routine. 1943// Make sure ftz mode is set - should be automatic when using wre 1944// |r| < 2**(-3) 1945// 1946// Set table_ptr1 to beginning of constant table. 1947// Get [i_0,i_1] - two lsb of N_fix_gr. 1948// 1949 1950{ .mmi 1951 nop.m 999 1952 addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp 1953 nop.i 999 1954} 1955;; 1956 1957{ .mmi 1958 ld8 GR_Table_Base = [GR_Table_Base] 1959 nop.m 999 1960 nop.i 999 1961} 1962;; 1963 1964 1965// 1966// Set table_ptr1 to point to S_5. 1967// Set table_ptr1 to point to C_5. 1968// Compute FR_rsq = r * r 1969// 1970 1971{ .mfi 1972(p9) add GR_Table_Base = 672, GR_Table_Base 1973(p10) fmerge.s FR_r = f1, f1 1974(p10) add GR_Table_Base = 592, GR_Table_Base ;; 1975} 1976// 1977// Set table_ptr1 to point to S_5. 1978// Set table_ptr1 to point to C_5. 1979// 1980 1981{ .mmi 1982(p9) ldfe FR_S_5 = [GR_Table_Base], -16 ;; 1983// 1984// if (i_1 == 0) load S_5 1985// if (i_1 != 0) load C_5 1986// 1987(p9) ldfe FR_S_4 = [GR_Table_Base], -16 1988 nop.i 999 ;; 1989} 1990 1991{ .mmf 1992(p10) ldfe FR_C_5 = [GR_Table_Base], -16 1993// 1994// Z = FR_rsq * FR_rsq 1995// 1996(p9) ldfe FR_S_3 = [GR_Table_Base], -16 1997// 1998// Compute FR_rsq = r * r 1999// if (i_1 == 0) load S_4 2000// if (i_1 != 0) load C_4 2001// 2002 fma.s1 FR_Z = FR_rsq, FR_rsq, f0 ;; 2003} 2004// 2005// if (i_1 == 0) load S_3 2006// if (i_1 != 0) load C_3 2007// 2008 2009{ .mmi 2010(p9) ldfe FR_S_2 = [GR_Table_Base], -16 ;; 2011// 2012// if (i_1 == 0) load S_2 2013// if (i_1 != 0) load C_2 2014// 2015(p9) ldfe FR_S_1 = [GR_Table_Base], -16 2016 nop.i 999 2017} 2018 2019{ .mmi 2020(p10) ldfe FR_C_4 = [GR_Table_Base], -16 ;; 2021(p10) ldfe FR_C_3 = [GR_Table_Base], -16 2022 nop.i 999 ;; 2023} 2024 2025{ .mmi 2026(p10) ldfe FR_C_2 = [GR_Table_Base], -16 ;; 2027(p10) ldfe FR_C_1 = [GR_Table_Base], -16 2028 nop.i 999 2029} 2030 2031{ .mfi 2032 nop.m 999 2033// 2034// if (i_1 != 0): 2035// poly_lo = FR_rsq * C_5 + C_4 2036// poly_hi = FR_rsq * C_2 + C_1 2037// 2038(p9) fma.s1 FR_Z = FR_Z, FR_r, f0 2039 nop.i 999 ;; 2040} 2041 2042{ .mfi 2043 nop.m 999 2044// 2045// if (i_1 == 0) load S_1 2046// if (i_1 != 0) load C_1 2047// 2048(p9) fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4 2049 nop.i 999 2050} 2051 2052{ .mfi 2053 nop.m 999 2054// 2055// c = -c * r 2056// dummy fmpy's to flag inexact. 2057// 2058(p9) fma.d.s1 FR_S_4 = FR_S_4, FR_S_4, f0 2059 nop.i 999 ;; 2060} 2061 2062{ .mfi 2063 nop.m 999 2064// 2065// poly_lo = FR_rsq * poly_lo + C_3 2066// poly_hi = FR_rsq * poly_hi 2067// 2068 fma.s1 FR_Z = FR_Z, FR_rsq, f0 2069 nop.i 999 ;; 2070} 2071 2072{ .mfi 2073 nop.m 999 2074(p9) fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1 2075 nop.i 999 2076} 2077 2078{ .mfi 2079 nop.m 999 2080// 2081// if (i_1 == 0): 2082// poly_lo = FR_rsq * S_5 + S_4 2083// poly_hi = FR_rsq * S_2 + S_1 2084// 2085(p10) fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4 2086 nop.i 999 ;; 2087} 2088 2089{ .mfi 2090 nop.m 999 2091// 2092// if (i_1 == 0): 2093// Z = Z * r for only one of the small r cases - not there 2094// in original implementation notes. 2095// 2096(p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3 2097 nop.i 999 ;; 2098} 2099 2100{ .mfi 2101 nop.m 999 2102(p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1 2103 nop.i 999 2104} 2105 2106{ .mfi 2107 nop.m 999 2108(p10) fma.d.s1 FR_C_1 = FR_C_1, FR_C_1, f0 2109 nop.i 999 ;; 2110} 2111 2112{ .mfi 2113 nop.m 999 2114(p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 2115 nop.i 999 2116} 2117 2118{ .mfi 2119 nop.m 999 2120// 2121// poly_lo = FR_rsq * poly_lo + S_3 2122// poly_hi = FR_rsq * poly_hi 2123// 2124(p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3 2125 nop.i 999 ;; 2126} 2127 2128{ .mfi 2129 nop.m 999 2130(p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 2131 nop.i 999 ;; 2132} 2133 2134{ .mfi 2135 nop.m 999 2136// 2137// if (i_1 == 0): dummy fmpy's to flag inexact 2138// r = 1 2139// 2140(p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0 2141 nop.i 999 2142} 2143 2144{ .mfi 2145 nop.m 999 2146// 2147// poly_hi = r * poly_hi 2148// 2149 fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c 2150 nop.i 999 ;; 2151} 2152 2153{ .mfi 2154 nop.m 999 2155(p12) fms.s1 FR_r = f0, f1, FR_r 2156 nop.i 999 ;; 2157} 2158 2159{ .mfi 2160 nop.m 999 2161// 2162// poly_hi = Z * poly_lo + c 2163// if i_0 == 1: r = -r 2164// 2165 fma.s1 FR_poly = FR_poly, f1, FR_poly_hi 2166 nop.i 999 ;; 2167} 2168 2169{ .mfi 2170 nop.m 999 2171(p12) fms.s1 FR_Input_X = FR_r, f1, FR_poly 2172 nop.i 999 2173} 2174 2175{ .mfb 2176 nop.m 999 2177// 2178// poly = poly + poly_hi 2179// 2180(p11) fma.s1 FR_Input_X = FR_r, f1, FR_poly 2181// 2182// if (i_0 == 0) Result = r + poly 2183// if (i_0 != 0) Result = r - poly 2184// 2185 br.ret.sptk b0 ;; 2186} 2187SINCOS_NORMAL_R: 2188 2189{ .mii 2190 nop.m 999 2191 extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; 2192// 2193// Set table_ptr1 and table_ptr2 to base address of 2194// constant table. 2195 cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;; 2196} 2197 2198{ .mfi 2199 nop.m 999 2200 fma.s1 FR_rsq = FR_r, FR_r, f0 2201 extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; 2202} 2203 2204{ .mfi 2205 nop.m 999 2206 frcpa.s1 FR_r_hi, p6 = f1, FR_r 2207 cmp.eq.unc p11, p12 = 0x0, GR_i_0 2208} 2209;; 2210 2211// ****************************************************************** 2212// ****************************************************************** 2213// ****************************************************************** 2214// 2215// r and c have been computed. 2216// We known whether this is the sine or cosine routine. 2217// Make sure ftz mode is set - should be automatic when using wre 2218// Get [i_0,i_1] - two lsb of N_fix_gr alone. 2219// 2220 2221{ .mmi 2222 nop.m 999 2223 addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp 2224 nop.i 999 2225} 2226;; 2227 2228{ .mmi 2229 ld8 GR_Table_Base = [GR_Table_Base] 2230 nop.m 999 2231 nop.i 999 2232} 2233;; 2234 2235 2236{ .mfi 2237(p10) add GR_Table_Base = 384, GR_Table_Base 2238//(p12) fms.s1 FR_Input_X = f0, f1, f1 2239(p12) fms.s1 FR_prelim = f0, f1, f1 2240(p9) add GR_Table_Base = 224, GR_Table_Base ;; 2241} 2242 2243{ .mmf 2244 nop.m 999 2245(p10) ldfe FR_QQ_8 = [GR_Table_Base], 16 2246// 2247// if (i_1==0) poly = poly * FR_rsq + PP_1_lo 2248// else poly = FR_rsq * poly 2249// 2250//(p11) fma.s1 FR_Input_X = f0, f1, f1 ;; 2251(p11) fma.s1 FR_prelim = f0, f1, f1 ;; 2252} 2253 2254{ .mmf 2255(p10) ldfe FR_QQ_7 = [GR_Table_Base], 16 2256// 2257// Adjust table pointers based on i_0 2258// Compute rsq = r * r 2259// 2260(p9) ldfe FR_PP_8 = [GR_Table_Base], 16 2261 fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 ;; 2262} 2263 2264{ .mmf 2265(p9) ldfe FR_PP_7 = [GR_Table_Base], 16 2266(p10) ldfe FR_QQ_6 = [GR_Table_Base], 16 2267// 2268// Load PP_8 and QQ_8; PP_7 and QQ_7 2269// 2270 frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;; 2271} 2272// 2273// if (i_1==0) poly = PP_7 + FR_rsq * PP_8. 2274// else poly = QQ_7 + FR_rsq * QQ_8. 2275// 2276 2277{ .mmb 2278(p9) ldfe FR_PP_6 = [GR_Table_Base], 16 2279(p10) ldfe FR_QQ_5 = [GR_Table_Base], 16 2280 nop.b 999 ;; 2281} 2282 2283{ .mmb 2284(p9) ldfe FR_PP_5 = [GR_Table_Base], 16 2285(p10) ldfe FR_S_1 = [GR_Table_Base], 16 2286 nop.b 999 ;; 2287} 2288 2289{ .mmb 2290(p10) ldfe FR_QQ_1 = [GR_Table_Base], 16 2291(p9) ldfe FR_C_1 = [GR_Table_Base], 16 2292 nop.b 999 ;; 2293} 2294 2295{ .mmi 2296(p10) ldfe FR_QQ_4 = [GR_Table_Base], 16 ;; 2297(p9) ldfe FR_PP_1 = [GR_Table_Base], 16 2298 nop.i 999 ;; 2299} 2300 2301{ .mmf 2302(p10) ldfe FR_QQ_3 = [GR_Table_Base], 16 2303// 2304// if (i_1=0) corr = corr + c*c 2305// else corr = corr * c 2306// 2307(p9) ldfe FR_PP_4 = [GR_Table_Base], 16 2308(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 ;; 2309} 2310// 2311// if (i_1=0) poly = rsq * poly + PP_5 2312// else poly = rsq * poly + QQ_5 2313// Load PP_4 or QQ_4 2314// 2315 2316{ .mmf 2317(p9) ldfe FR_PP_3 = [GR_Table_Base], 16 2318(p10) ldfe FR_QQ_2 = [GR_Table_Base], 16 2319// 2320// r_hi = frcpa(frcpa(r)). 2321// r_cube = r * FR_rsq. 2322// 2323(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 ;; 2324} 2325// 2326// Do dummy multiplies so inexact is always set. 2327// 2328 2329{ .mfi 2330(p9) ldfe FR_PP_2 = [GR_Table_Base], 16 2331// 2332// r_lo = r - r_hi 2333// 2334(p9) fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0 2335 nop.i 999 ;; 2336} 2337 2338{ .mmf 2339 nop.m 999 2340(p9) ldfe FR_PP_1_lo = [GR_Table_Base], 16 2341(p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r 2342} 2343 2344{ .mfi 2345 nop.m 999 2346(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6 2347 nop.i 999 ;; 2348} 2349 2350{ .mfi 2351 nop.m 999 2352// 2353// if (i_1=0) U_lo = r_hi * r_hi 2354// else U_lo = r_hi + r 2355// 2356(p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0 2357 nop.i 999 ;; 2358} 2359 2360{ .mfi 2361 nop.m 999 2362// 2363// if (i_1=0) corr = C_1 * rsq 2364// else corr = S_1 * r_cubed + r 2365// 2366(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6 2367 nop.i 999 2368} 2369 2370{ .mfi 2371 nop.m 999 2372(p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r 2373 nop.i 999 ;; 2374} 2375 2376{ .mfi 2377 nop.m 999 2378// 2379// if (i_1=0) U_hi = r_hi + U_hi 2380// else U_hi = QQ_1 * U_hi + 1 2381// 2382(p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo 2383 nop.i 999 2384} 2385 2386{ .mfi 2387 nop.m 999 2388// 2389// U_hi = r_hi * r_hi 2390// 2391 fms.s1 FR_r_lo = FR_r, f1, FR_r_hi 2392 nop.i 999 ;; 2393} 2394 2395{ .mfi 2396 nop.m 999 2397// 2398// Load PP_1, PP_6, PP_5, and C_1 2399// Load QQ_1, QQ_6, QQ_5, and S_1 2400// 2401 fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0 2402 nop.i 999 ;; 2403} 2404 2405{ .mfi 2406 nop.m 999 2407(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5 2408 nop.i 999 2409} 2410 2411{ .mfi 2412 nop.m 999 2413(p10) fnma.s1 FR_corr = FR_corr, FR_c, f0 2414 nop.i 999 ;; 2415} 2416 2417{ .mfi 2418 nop.m 999 2419// 2420// if (i_1=0) U_lo = r * r_hi + U_lo 2421// else U_lo = r_lo * U_lo 2422// 2423(p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c 2424 nop.i 999 ;; 2425} 2426 2427{ .mfi 2428 nop.m 999 2429(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5 2430 nop.i 999 2431} 2432 2433{ .mfi 2434 nop.m 999 2435// 2436// if (i_1 =0) U_hi = r + U_hi 2437// if (i_1 =0) U_lo = r_lo * U_lo 2438// 2439// 2440(p9) fma.d.s1 FR_PP_5 = FR_PP_5, FR_PP_4, f0 2441 nop.i 999 ;; 2442} 2443 2444{ .mfi 2445 nop.m 999 2446(p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo 2447 nop.i 999 2448} 2449 2450{ .mfi 2451 nop.m 999 2452(p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 2453 nop.i 999 ;; 2454} 2455 2456{ .mfi 2457 nop.m 999 2458// 2459// if (i_1=0) poly = poly * rsq + PP_6 2460// else poly = poly * rsq + QQ_6 2461// 2462(p9) fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0 2463 nop.i 999 ;; 2464} 2465 2466{ .mfi 2467 nop.m 999 2468(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4 2469 nop.i 999 2470} 2471 2472{ .mfi 2473 nop.m 999 2474(p10) fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1 2475 nop.i 999 ;; 2476} 2477 2478{ .mfi 2479 nop.m 999 2480(p10) fma.d.s1 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0 2481 nop.i 999 ;; 2482} 2483 2484{ .mfi 2485 nop.m 999 2486// 2487// if (i_1!=0) U_hi = PP_1 * U_hi 2488// if (i_1!=0) U_lo = r * r + U_lo 2489// Load PP_3 or QQ_3 2490// 2491(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4 2492 nop.i 999 ;; 2493} 2494 2495{ .mfi 2496 nop.m 999 2497(p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 2498 nop.i 999 2499} 2500 2501{ .mfi 2502 nop.m 999 2503(p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0 2504 nop.i 999 ;; 2505} 2506 2507{ .mfi 2508 nop.m 999 2509(p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0 2510 nop.i 999 ;; 2511} 2512 2513{ .mfi 2514 nop.m 999 2515(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3 2516 nop.i 999 ;; 2517} 2518 2519{ .mfi 2520 nop.m 999 2521// 2522// Load PP_2, QQ_2 2523// 2524(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3 2525 nop.i 999 ;; 2526} 2527 2528{ .mfi 2529 nop.m 999 2530// 2531// if (i_1==0) poly = FR_rsq * poly + PP_3 2532// else poly = FR_rsq * poly + QQ_3 2533// Load PP_1_lo 2534// 2535(p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0 2536 nop.i 999 ;; 2537} 2538 2539{ .mfi 2540 nop.m 999 2541// 2542// if (i_1 =0) poly = poly * rsq + pp_r4 2543// else poly = poly * rsq + qq_r4 2544// 2545(p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi 2546 nop.i 999 ;; 2547} 2548 2549{ .mfi 2550 nop.m 999 2551(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2 2552 nop.i 999 ;; 2553} 2554 2555{ .mfi 2556 nop.m 999 2557// 2558// if (i_1==0) U_lo = PP_1_hi * U_lo 2559// else U_lo = QQ_1 * U_lo 2560// 2561(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2 2562 nop.i 999 ;; 2563} 2564 2565{ .mfi 2566 nop.m 999 2567// 2568// if (i_0==0) Result = 1 2569// else Result = -1 2570// 2571 fma.s1 FR_V = FR_U_lo, f1, FR_corr 2572 nop.i 999 ;; 2573} 2574 2575{ .mfi 2576 nop.m 999 2577(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 2578 nop.i 999 ;; 2579} 2580 2581{ .mfi 2582 nop.m 999 2583// 2584// if (i_1==0) poly = FR_rsq * poly + PP_2 2585// else poly = FR_rsq * poly + QQ_2 2586// 2587(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo 2588 nop.i 999 ;; 2589} 2590 2591{ .mfi 2592 nop.m 999 2593(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 2594 nop.i 999 ;; 2595} 2596 2597{ .mfi 2598 nop.m 999 2599// 2600// V = U_lo + corr 2601// 2602(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0 2603 nop.i 999 ;; 2604} 2605 2606{ .mfi 2607 nop.m 999 2608// 2609// if (i_1==0) poly = r_cube * poly 2610// else poly = FR_rsq * poly 2611// 2612 fma.s1 FR_V = FR_poly, f1, FR_V 2613 nop.i 999 ;; 2614} 2615 2616{ .mfi 2617 nop.m 999 2618//(p12) fms.s1 FR_Input_X = FR_Input_X, FR_U_hi, FR_V 2619(p12) fms.s1 FR_Input_X = FR_prelim, FR_U_hi, FR_V 2620 nop.i 999 2621} 2622 2623{ .mfb 2624 nop.m 999 2625// 2626// V = V + poly 2627// 2628//(p11) fma.s1 FR_Input_X = FR_Input_X, FR_U_hi, FR_V 2629(p11) fma.s1 FR_Input_X = FR_prelim, FR_U_hi, FR_V 2630// 2631// if (i_0==0) Result = Result * U_hi + V 2632// else Result = Result * U_hi - V 2633// 2634 br.ret.sptk b0 ;; 2635} 2636 2637// 2638// If cosine, FR_Input_X = 1 2639// If sine, FR_Input_X = +/-Zero (Input FR_Input_X) 2640// Results are exact, no exceptions 2641// 2642SINCOS_ZERO: 2643 2644{ .mmb 2645 cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos 2646 nop.m 999 2647 nop.b 999 ;; 2648} 2649 2650{ .mfi 2651 nop.m 999 2652(p7) fmerge.s FR_Input_X = FR_Input_X, FR_Input_X 2653 nop.i 999 2654} 2655 2656{ .mfb 2657 nop.m 999 2658(p6) fmerge.s FR_Input_X = f1, f1 2659 br.ret.sptk b0 ;; 2660} 2661 2662SINCOS_SPECIAL: 2663 2664// 2665// Path for Arg = +/- QNaN, SNaN, Inf 2666// Invalid can be raised. SNaNs 2667// become QNaNs 2668// 2669 2670{ .mfb 2671 nop.m 999 2672 fmpy.s1 FR_Input_X = FR_Input_X, f0 2673 br.ret.sptk b0 ;; 2674} 2675GLOBAL_LIBM_END(__libm_cos_large) 2676 2677 2678// ******************************************************************* 2679// ******************************************************************* 2680// ******************************************************************* 2681// 2682// Special Code to handle very large argument case. 2683// Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63 2684// The interface is custom: 2685// On input: 2686// (Arg or x) is in f8 2687// On output: 2688// r is in f8 2689// c is in f9 2690// N is in r8 2691// Be sure to allocate at least 2 GP registers as output registers for 2692// __libm_pi_by_2_reduce. This routine uses r49-50. These are used as 2693// scratch registers within the __libm_pi_by_2_reduce routine (for speed). 2694// 2695// We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127. We 2696// use this to eliminate save/restore of key fp registers in this calling 2697// function. 2698// 2699// ******************************************************************* 2700// ******************************************************************* 2701// ******************************************************************* 2702 2703LOCAL_LIBM_ENTRY(__libm_callout_2) 2704SINCOS_ARG_TOO_LARGE: 2705 2706.prologue 2707// Readjust Table ptr 2708{ .mfi 2709 adds GR_Table_Base1 = -16, GR_Table_Base1 2710 nop.f 999 2711.save ar.pfs,GR_SAVE_PFS 2712 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 2713};; 2714 2715{ .mmi 2716 ldfs FR_Two_to_M3 = [GR_Table_Base1],4 2717 mov GR_SAVE_GP=gp // Save gp 2718.save b0, GR_SAVE_B0 2719 mov GR_SAVE_B0=b0 // Save b0 2720};; 2721 2722.body 2723// 2724// Call argument reduction with x in f8 2725// Returns with N in r8, r in f8, c in f9 2726// Assumes f71-127 are preserved across the call 2727// 2728{ .mib 2729 ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1],0 2730 nop.i 0 2731 br.call.sptk b0=__libm_pi_by_2_reduce# 2732};; 2733 2734{ .mfi 2735 add GR_N_Inc = GR_Sin_or_Cos,r8 2736 fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3 2737 mov b0 = GR_SAVE_B0 // Restore return address 2738};; 2739 2740{ .mfi 2741 mov gp = GR_SAVE_GP // Restore gp 2742(p6) fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3 2743 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 2744};; 2745 2746{ .mbb 2747 nop.m 999 2748(p6) br.cond.spnt SINCOS_SMALL_R // Branch if |r| < 1/4 2749 br.cond.sptk SINCOS_NORMAL_R ;; // Branch if 1/4 <= |r| < pi/4 2750} 2751 2752LOCAL_LIBM_END(__libm_callout_2) 2753 2754.type __libm_pi_by_2_reduce#,@function 2755.global __libm_pi_by_2_reduce# 2756