1.file "atanl.s" 2 3 4// Copyright (c) 2000 - 2005, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// 40//********************************************************************* 41// 42// History 43// 02/02/00 (hand-optimized) 44// 04/04/00 Unwind support added 45// 08/15/00 Bundle added after call to __libm_error_support to properly 46// set [the previously overwritten] GR_Parameter_RESULT. 47// 03/13/01 Fixed flags when denormal raised on intermediate result 48// 01/08/02 Improved speed. 49// 02/06/02 Corrected .section statement 50// 05/20/02 Cleaned up namespace and sf0 syntax 51// 02/10/03 Reordered header: .section, .global, .proc, .align; 52// used data8 for long double table values 53// 03/31/05 Reformatted delimiters between data tables 54// 55//********************************************************************* 56// 57// Function: atanl(x) = inverse tangent(x), for double extended x values 58// Function: atan2l(y,x) = atan(y/x), for double extended y, x values 59// 60// API 61// 62// long double atanl (long double x) 63// long double atan2l (long double y, long double x) 64// 65//********************************************************************* 66// 67// Resources Used: 68// 69// Floating-Point Registers: f8 (Input and Return Value) 70// f9 (Input for atan2l) 71// f10-f15, f32-f83 72// 73// General Purpose Registers: 74// r32-r51 75// r49-r52 (Arguments to error support for 0,0 case) 76// 77// Predicate Registers: p6-p15 78// 79//********************************************************************* 80// 81// IEEE Special Conditions: 82// 83// Denormal fault raised on denormal inputs 84// Underflow exceptions may occur 85// Special error handling for the y=0 and x=0 case 86// Inexact raised when appropriate by algorithm 87// 88// atanl(SNaN) = QNaN 89// atanl(QNaN) = QNaN 90// atanl(+/-0) = +/- 0 91// atanl(+/-Inf) = +/-pi/2 92// 93// atan2l(Any NaN for x or y) = QNaN 94// atan2l(+/-0,x) = +/-0 for x > 0 95// atan2l(+/-0,x) = +/-pi for x < 0 96// atan2l(+/-0,+0) = +/-0 97// atan2l(+/-0,-0) = +/-pi 98// atan2l(y,+/-0) = pi/2 y > 0 99// atan2l(y,+/-0) = -pi/2 y < 0 100// atan2l(+/-y, Inf) = +/-0 for finite y > 0 101// atan2l(+/-Inf, x) = +/-pi/2 for finite x 102// atan2l(+/-y, -Inf) = +/-pi for finite y > 0 103// atan2l(+/-Inf, Inf) = +/-pi/4 104// atan2l(+/-Inf, -Inf) = +/-3pi/4 105// 106//********************************************************************* 107// 108// Mathematical Description 109// --------------------------- 110// 111// The function ATANL( Arg_Y, Arg_X ) returns the "argument" 112// or the "phase" of the complex number 113// 114// Arg_X + i Arg_Y 115// 116// or equivalently, the angle in radians from the positive 117// x-axis to the line joining the origin and the point 118// (Arg_X,Arg_Y) 119// 120// 121// (Arg_X, Arg_Y) x 122// \ 123// \ 124// \ 125// \ 126// \ angle between is ATANL(Arg_Y,Arg_X) 127 128 129 130 131// \ 132// ------------------> X-axis 133 134// Origin 135// 136// Moreover, this angle is reported in the range [-pi,pi] thus 137// 138// -pi <= ATANL( Arg_Y, Arg_X ) <= pi. 139// 140// From the geometry, it is easy to define ATANL when one of 141// Arg_X or Arg_Y is +-0 or +-inf: 142// 143// 144// \ Y | 145// X \ | +0 | -0 | +inf | -inf | finite non-zero 146// \ | | | | | 147// ______________________________________________________ 148// | | | | 149// +-0 | Invalid/ | pi/2 | -pi/2 | sign(Y)*pi/2 150// | qNaN | | | 151// -------------------------------------------------------- 152// | | | | | 153// +inf | +0 | -0 | pi/4 | -pi/4 | sign(Y)*0 154// -------------------------------------------------------- 155// | | | | | 156// -inf | +pi | -pi | 3pi/4 | -3pi/4 | sign(Y)*pi 157// -------------------------------------------------------- 158// finite | X>0? | pi/2 | -pi/2 | normal case 159// non-zero| sign(Y)*0: | | | 160// | sign(Y)*pi | | | 161// 162// 163// One must take note that ATANL is NOT the arctangent of the 164// value Arg_Y/Arg_X; but rather ATANL and arctan are related 165// in a slightly more complicated way as follows: 166// 167// Let U := max(|Arg_X|, |Arg_Y|); V := min(|Arg_X|, |Arg_Y|); 168// sign_X be the sign bit of Arg_X, i.e., sign_X is 0 or 1; 169// s_X be the sign of Arg_X, i.e., s_X = (-1)^sign_X; 170// 171// sign_Y be the sign bit of Arg_Y, i.e., sign_Y is 0 or 1; 172// s_Y be the sign of Arg_Y, i.e., s_Y = (-1)^sign_Y; 173// 174// swap be 0 if |Arg_X| >= |Arg_Y| and 1 otherwise. 175// 176// Then, ATANL(Arg_Y, Arg_X) = 177// 178// / arctan(V/U) \ sign_X = 0 & swap = 0 179// | pi/2 - arctan(V/U) | sign_X = 0 & swap = 1 180// s_Y * | | 181// | pi - arctan(V/U) | sign_X = 1 & swap = 0 182// \ pi/2 + arctan(V/U) / sign_X = 1 & swap = 1 183// 184// 185// This relationship also suggest that the algorithm's major 186// task is to calculate arctan(V/U) for 0 < V <= U; and the 187// final Result is given by 188// 189// s_Y * { (P_hi + P_lo) + sigma * arctan(V/U) } 190// 191// where 192// 193// (P_hi,P_lo) represents M(sign_X,swap)*(pi/2) accurately 194// 195// M(sign_X,swap) = 0 for sign_X = 0 and swap = 0 196// 1 for swap = 1 197// 2 for sign_X = 1 and swap = 0 198// 199// and 200// 201// sigma = { (sign_X XOR swap) : -1.0 : 1.0 } 202// 203// = (-1) ^ ( sign_X XOR swap ) 204// 205// Both (P_hi,P_lo) and sigma can be stored in a table and fetched 206// using (sign_X,swap) as an index. (P_hi, P_lo) can be stored as a 207// double-precision, and single-precision pair; and sigma can 208// obviously be just a single-precision number. 209// 210// In the algorithm we propose, arctan(V/U) is calculated to high accuracy 211// as A_hi + A_lo. Consequently, the Result ATANL( Arg_Y, Arg_X ) is 212// given by 213// 214// s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo) 215// 216// We now discuss the calculation of arctan(V/U) for 0 < V <= U. 217// 218// For (V/U) < 2^(-3), we use a simple polynomial of the form 219// 220// z + z^3*(P_1 + z^2*(P_2 + z^2*(P_3 + ... + P_8))) 221// 222// where z = V/U. 223// 224// For the sake of accuracy, the first term "z" must approximate V/U to 225// extra precision. For z^3 and higher power, a working precision 226// approximation to V/U suffices. Thus, we obtain: 227// 228// z_hi + z_lo = V/U to extra precision and 229// z = V/U to working precision 230// 231// The value arctan(V/U) is delivered as two pieces (A_hi, A_lo) 232// 233// (A_hi,A_lo) = (z_hi, z^3*(P_1 + ... + P_8) + z_lo). 234// 235// 236// For 2^(-3) <= (V/U) <= 1, we use a table-driven approach. 237// Consider 238// 239// (V/U) = 2^k * 1.b_1 b_2 .... b_63 b_64 b_65 .... 240// 241// Define 242// 243// z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1 244// 245// then 246// / \ 247// | (V/U) - z_hi | 248 249// arctan(V/U) = arctan(z_hi) + acrtan| -------------- | 250// | 1 + (V/U)*z_hi | 251// \ / 252// 253// / \ 254// | V - z_hi*U | 255 256// = arctan(z_hi) + acrtan| -------------- | 257// | U + V*z_hi | 258// \ / 259// 260// = arctan(z_hi) + acrtan( V' / U' ) 261// 262// 263// where 264// 265// V' = V - U*z_hi; U' = U + V*z_hi. 266// 267// Let 268// 269// w_hi + w_lo = V'/U' to extra precision and 270// w = V'/U' to working precision 271// 272// then we can approximate arctan(V'/U') by 273// 274// arctan(V'/U') = w_hi + w_lo 275// + w^3*(Q_1 + w^2*(Q_2 + w^2*(Q_3 + w^2*Q_4))) 276// 277// = w_hi + w_lo + poly 278// 279// Finally, arctan(z_hi) is calculated beforehand and stored in a table 280// as Tbl_hi, Tbl_lo. Thus, 281// 282// (A_hi, A_lo) = (Tbl_hi, w_hi+(poly+(w_lo+Tbl_lo))) 283// 284// This completes the mathematical description. 285// 286// 287// Algorithm 288// ------------- 289// 290// Step 0. Check for unsupported format. 291// 292// If 293// ( expo(Arg_X) not zero AND msb(Arg_X) = 0 ) OR 294// ( expo(Arg_Y) not zero AND msb(Arg_Y) = 0 ) 295// 296// then one of the arguments is unsupported. Generate an 297// invalid and return qNaN. 298// 299// Step 1. Initialize 300// 301// Normalize Arg_X and Arg_Y and set the following 302// 303// sign_X := sign_bit(Arg_X) 304// s_Y := (sign_bit(Arg_Y)==0? 1.0 : -1.0) 305// swap := (|Arg_X| >= |Arg_Y|? 0 : 1 ) 306// U := max( |Arg_X|, |Arg_Y| ) 307// V := min( |Arg_X|, |Arg_Y| ) 308// 309// execute: frcpa E, pred, V, U 310// If pred is 0, go to Step 5 for special cases handling. 311// 312// Step 2. Decide on branch. 313// 314// Q := E * V 315// If Q < 2^(-3) go to Step 4 for simple polynomial case. 316// 317// Step 3. Table-driven algorithm. 318// 319// Q is represented as 320// 321// 2^(-k) * 1.b_1 b_2 b_3 ... b_63; k = 0,-1,-2,-3 322// 323// and that if k = 0, b_1 = b_2 = b_3 = b_4 = 0. 324// 325// Define 326// 327// z_hi := 2^(-k) * 1.b_1 b_2 b_3 b_4 1 328// 329// (note that there are 49 possible values of z_hi). 330// 331// ...We now calculate V' and U'. While V' is representable 332// ...as a 64-bit number because of cancellation, U' is 333// ...not in general a 64-bit number. Obtaining U' accurately 334// ...requires two working precision numbers 335// 336// U_prime_hi := U + V * z_hi ...WP approx. to U' 337// U_prime_lo := ( U - U_prime_hi ) + V*z_hi ...observe order 338// V_prime := V - U * z_hi ...this is exact 339// 340// C_hi := frcpa (1.0, U_prime_hi) ...C_hi approx 1/U'_hi 341// 342// loop 3 times 343// C_hi := C_hi + C_hi*(1.0 - C_hi*U_prime_hi) 344// 345// ...at this point C_hi is (1/U_prime_hi) to roughly 64 bits 346// 347// w_hi := V_prime * C_hi ...w_hi is V_prime/U_prime to 348// ...roughly working precision 349// 350// ...note that we want w_hi + w_lo to approximate 351// ...V_prime/(U_prime_hi + U_prime_lo) to extra precision 352// ...but for now, w_hi is good enough for the polynomial 353// ...calculation. 354// 355// wsq := w_hi*w_hi 356// poly := w_hi*wsq*(Q_1 + wsq*(Q_2 + wsq*(Q_3 + wsq*Q_4))) 357// 358// Fetch 359// (Tbl_hi, Tbl_lo) = atan(z_hi) indexed by (k,b_1,b_2,b_3,b_4) 360// ...Tbl_hi is a double-precision number 361// ...Tbl_lo is a single-precision number 362// 363// (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo) 364// ...as discussed previous. Again; the implementation can 365// ...chose to fetch P_hi and P_lo from a table indexed by 366// ...(sign_X, swap). 367// ...P_hi is a double-precision number; 368// ...P_lo is a single-precision number. 369// 370// ...calculate w_lo so that w_hi + w_lo is V'/U' accurately 371// w_lo := ((V_prime - w_hi*U_prime_hi) - 372// w_hi*U_prime_lo) * C_hi ...observe order 373// 374// 375// ...Ready to deliver arctan(V'/U') as A_hi, A_lo 376// A_hi := Tbl_hi 377// A_lo := w_hi + (poly + (Tbl_lo + w_lo)) ...observe order 378// 379// ...Deliver final Result 380// ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo) 381// 382// sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 ) 383// ...sigma can be obtained by a table lookup using 384// ...(sign_X,swap) as index and stored as single precision 385// ...sigma should be calculated earlier 386// 387// P_hi := s_Y*P_hi 388// A_hi := s_Y*A_hi 389// 390// Res_hi := P_hi + sigma*A_hi ...this is exact because 391// ...both P_hi and Tbl_hi 392// ...are double-precision 393// ...and |Tbl_hi| > 2^(-4) 394// ...P_hi is either 0 or 395// ...between (1,4) 396// 397// Res_lo := sigma*A_lo + P_lo 398// 399// Return Res_hi + s_Y*Res_lo in user-defined rounding control 400// 401// Step 4. Simple polynomial case. 402// 403// ...E and Q are inherited from Step 2. 404// 405// A_hi := Q ...Q is inherited from Step 2 Q approx V/U 406// 407// loop 3 times 408// E := E + E2(1.0 - E*U1 409// ...at this point E approximates 1/U to roughly working precision 410// 411// z := V * E ...z approximates V/U to roughly working precision 412// zsq := z * z 413// z4 := zsq * zsq; z8 := z4 * z4 414// 415// poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) 416// poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3)) 417// 418// poly := poly1 + z8*poly2 419// 420// z_lo := (V - A_hi*U)*E 421// 422// A_lo := z*poly + z_lo 423// ...A_hi, A_lo approximate arctan(V/U) accurately 424// 425// (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo) 426// ...one can store the M(sign_X,swap) as single precision 427// ...values 428// 429// ...Deliver final Result 430// ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo) 431// 432// sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 ) 433// ...sigma can be obtained by a table lookup using 434// ...(sign_X,swap) as index and stored as single precision 435// ...sigma should be calculated earlier 436// 437// P_hi := s_Y*P_hi 438// A_hi := s_Y*A_hi 439// 440// Res_hi := P_hi + sigma*A_hi ...need to compute 441// ...P_hi + sigma*A_hi 442// ...exactly 443// 444// tmp := (P_hi - Res_hi) + sigma*A_hi 445// 446// Res_lo := s_Y*(sigma*A_lo + P_lo) + tmp 447// 448// Return Res_hi + Res_lo in user-defined rounding control 449// 450// Step 5. Special Cases 451// 452// These are detected early in the function by fclass instructions. 453// 454// We are in one of those special cases when X or Y is 0,+-inf or NaN 455// 456// If one of X and Y is NaN, return X+Y (which will generate 457// invalid in case one is a signaling NaN). Otherwise, 458// return the Result as described in the table 459// 460// 461// 462// \ Y | 463// X \ | +0 | -0 | +inf | -inf | finite non-zero 464// \ | | | | | 465// ______________________________________________________ 466// | | | | 467// +-0 | Invalid/ | pi/2 | -pi/2 | sign(Y)*pi/2 468// | qNaN | | | 469// -------------------------------------------------------- 470// | | | | | 471// +inf | +0 | -0 | pi/4 | -pi/4 | sign(Y)*0 472// -------------------------------------------------------- 473// | | | | | 474// -inf | +pi | -pi | 3pi/4 | -3pi/4 | sign(Y)*pi 475// -------------------------------------------------------- 476// finite | X>0? | pi/2 | -pi/2 | 477// non-zero| sign(Y)*0: | | | N/A 478// | sign(Y)*pi | | | 479// 480// 481 482ArgY_orig = f8 483Result = f8 484FR_RESULT = f8 485ArgX_orig = f9 486ArgX = f10 487FR_X = f10 488ArgY = f11 489FR_Y = f11 490s_Y = f12 491U = f13 492V = f14 493E = f15 494Q = f32 495z_hi = f33 496U_prime_hi = f34 497U_prime_lo = f35 498V_prime = f36 499C_hi = f37 500w_hi = f38 501w_lo = f39 502wsq = f40 503poly = f41 504Tbl_hi = f42 505Tbl_lo = f43 506P_hi = f44 507P_lo = f45 508A_hi = f46 509A_lo = f47 510sigma = f48 511Res_hi = f49 512Res_lo = f50 513Z = f52 514zsq = f53 515z4 = f54 516z8 = f54 517poly1 = f55 518poly2 = f56 519z_lo = f57 520tmp = f58 521P_1 = f59 522Q_1 = f60 523P_2 = f61 524Q_2 = f62 525P_3 = f63 526Q_3 = f64 527P_4 = f65 528Q_4 = f66 529P_5 = f67 530P_6 = f68 531P_7 = f69 532P_8 = f70 533U_hold = f71 534TWO_TO_NEG3 = f72 535C_hi_hold = f73 536E_hold = f74 537M = f75 538ArgX_abs = f76 539ArgY_abs = f77 540Result_lo = f78 541A_temp = f79 542FR_temp = f80 543Xsq = f81 544Ysq = f82 545tmp_small = f83 546 547GR_SAVE_PFS = r33 548GR_SAVE_B0 = r34 549GR_SAVE_GP = r35 550sign_X = r36 551sign_Y = r37 552swap = r38 553table_ptr1 = r39 554table_ptr2 = r40 555k = r41 556lookup = r42 557exp_ArgX = r43 558exp_ArgY = r44 559exponent_Q = r45 560significand_Q = r46 561special = r47 562sp_exp_Q = r48 563sp_exp_4sig_Q = r49 564table_base = r50 565int_temp = r51 566 567GR_Parameter_X = r49 568GR_Parameter_Y = r50 569GR_Parameter_RESULT = r51 570GR_Parameter_TAG = r52 571GR_temp = r52 572 573RODATA 574.align 16 575 576LOCAL_OBJECT_START(Constants_atan) 577// double pi/2 578data8 0x3FF921FB54442D18 579// single lo_pi/2, two**(-3) 580data4 0x248D3132, 0x3E000000 581data8 0xAAAAAAAAAAAAAAA3, 0xBFFD // P_1 582data8 0xCCCCCCCCCCCC54B2, 0x3FFC // P_2 583data8 0x9249249247E4D0C2, 0xBFFC // P_3 584data8 0xE38E38E058870889, 0x3FFB // P_4 585data8 0xBA2E895B290149F8, 0xBFFB // P_5 586data8 0x9D88E6D4250F733D, 0x3FFB // P_6 587data8 0x884E51FFFB8745A0, 0xBFFB // P_7 588data8 0xE1C7412B394396BD, 0x3FFA // P_8 589data8 0xAAAAAAAAAAAAA52F, 0xBFFD // Q_1 590data8 0xCCCCCCCCC75B60D3, 0x3FFC // Q_2 591data8 0x924923AD011F1940, 0xBFFC // Q_3 592data8 0xE36F716D2A5F89BD, 0x3FFB // Q_4 593// 594// Entries Tbl_hi (double precision) 595// B = 1+Index/16+1/32 Index = 0 596// Entries Tbl_lo (single precision) 597// B = 1+Index/16+1/32 Index = 0 598// 599data8 0x3FE9A000A935BD8E 600data4 0x23ACA08F, 0x00000000 601// 602// Entries Tbl_hi (double precision) Index = 0,1,...,15 603// B = 2^(-1)*(1+Index/16+1/32) 604// Entries Tbl_lo (single precision) 605// Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32) 606// 607data8 0x3FDE77EB7F175A34 608data4 0x238729EE, 0x00000000 609data8 0x3FE0039C73C1A40B 610data4 0x249334DB, 0x00000000 611data8 0x3FE0C6145B5B43DA 612data4 0x22CBA7D1, 0x00000000 613data8 0x3FE1835A88BE7C13 614data4 0x246310E7, 0x00000000 615data8 0x3FE23B71E2CC9E6A 616data4 0x236210E5, 0x00000000 617data8 0x3FE2EE628406CBCA 618data4 0x2462EAF5, 0x00000000 619data8 0x3FE39C391CD41719 620data4 0x24B73EF3, 0x00000000 621data8 0x3FE445065B795B55 622data4 0x24C11260, 0x00000000 623data8 0x3FE4E8DE5BB6EC04 624data4 0x242519EE, 0x00000000 625data8 0x3FE587D81F732FBA 626data4 0x24D4346C, 0x00000000 627data8 0x3FE6220D115D7B8D 628data4 0x24ED487B, 0x00000000 629data8 0x3FE6B798920B3D98 630data4 0x2495FF1E, 0x00000000 631data8 0x3FE748978FBA8E0F 632data4 0x223D9531, 0x00000000 633data8 0x3FE7D528289FA093 634data4 0x242B0411, 0x00000000 635data8 0x3FE85D69576CC2C5 636data4 0x2335B374, 0x00000000 637data8 0x3FE8E17AA99CC05D 638data4 0x24C27CFB, 0x00000000 639// 640// Entries Tbl_hi (double precision) Index = 0,1,...,15 641// B = 2^(-2)*(1+Index/16+1/32) 642// Entries Tbl_lo (single precision) 643// Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32) 644// 645data8 0x3FD025FA510665B5 646data4 0x24263482, 0x00000000 647data8 0x3FD1151A362431C9 648data4 0x242C8DC9, 0x00000000 649data8 0x3FD2025567E47C95 650data4 0x245CF9BA, 0x00000000 651data8 0x3FD2ED987A823CFE 652data4 0x235C892C, 0x00000000 653data8 0x3FD3D6D129271134 654data4 0x2389BE52, 0x00000000 655data8 0x3FD4BDEE586890E6 656data4 0x24436471, 0x00000000 657data8 0x3FD5A2E0175E0F4E 658data4 0x2389DBD4, 0x00000000 659data8 0x3FD685979F5FA6FD 660data4 0x2476D43F, 0x00000000 661data8 0x3FD7660752817501 662data4 0x24711774, 0x00000000 663data8 0x3FD84422B8DF95D7 664data4 0x23EBB501, 0x00000000 665data8 0x3FD91FDE7CD0C662 666data4 0x23883A0C, 0x00000000 667data8 0x3FD9F93066168001 668data4 0x240DF63F, 0x00000000 669data8 0x3FDAD00F5422058B 670data4 0x23FE261A, 0x00000000 671data8 0x3FDBA473378624A5 672data4 0x23A8CD0E, 0x00000000 673data8 0x3FDC76550AAD71F8 674data4 0x2422D1D0, 0x00000000 675data8 0x3FDD45AEC9EC862B 676data4 0x2344A109, 0x00000000 677// 678// Entries Tbl_hi (double precision) Index = 0,1,...,15 679// B = 2^(-3)*(1+Index/16+1/32) 680// Entries Tbl_lo (single precision) 681// Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32) 682// 683data8 0x3FC068D584212B3D 684data4 0x239874B6, 0x00000000 685data8 0x3FC1646541060850 686data4 0x2335E774, 0x00000000 687data8 0x3FC25F6E171A535C 688data4 0x233E36BE, 0x00000000 689data8 0x3FC359E8EDEB99A3 690data4 0x239680A3, 0x00000000 691data8 0x3FC453CEC6092A9E 692data4 0x230FB29E, 0x00000000 693data8 0x3FC54D18BA11570A 694data4 0x230C1418, 0x00000000 695data8 0x3FC645BFFFB3AA73 696data4 0x23F0564A, 0x00000000 697data8 0x3FC73DBDE8A7D201 698data4 0x23D4A5E1, 0x00000000 699data8 0x3FC8350BE398EBC7 700data4 0x23D4ADDA, 0x00000000 701data8 0x3FC92BA37D050271 702data4 0x23BCB085, 0x00000000 703data8 0x3FCA217E601081A5 704data4 0x23BC841D, 0x00000000 705data8 0x3FCB1696574D780B 706data4 0x23CF4A8E, 0x00000000 707data8 0x3FCC0AE54D768466 708data4 0x23BECC90, 0x00000000 709data8 0x3FCCFE654E1D5395 710data4 0x2323DCD2, 0x00000000 711data8 0x3FCDF110864C9D9D 712data4 0x23F53F3A, 0x00000000 713data8 0x3FCEE2E1451D980C 714data4 0x23CCB11F, 0x00000000 715// 716data8 0x400921FB54442D18, 0x3CA1A62633145C07 // PI two doubles 717data8 0x3FF921FB54442D18, 0x3C91A62633145C07 // PI_by_2 two dbles 718data8 0x3FE921FB54442D18, 0x3C81A62633145C07 // PI_by_4 two dbles 719data8 0x4002D97C7F3321D2, 0x3C9A79394C9E8A0A // 3PI_by_4 two dbles 720LOCAL_OBJECT_END(Constants_atan) 721 722 723.section .text 724GLOBAL_IEEE754_ENTRY(atanl) 725 726// Use common code with atan2l after setting x=1.0 727{ .mfi 728 alloc r32 = ar.pfs, 0, 17, 4, 0 729 fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y 730 nop.i 999 731} 732{ .mfi 733 addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer 734 fma.s1 Xsq = f1, f1, f0 // Form x*x 735 nop.i 999 736} 737;; 738 739{ .mfi 740 ld8 table_ptr1 = [table_ptr1] // Get table pointer 741 fnorm.s1 ArgY = ArgY_orig 742 nop.i 999 743} 744{ .mfi 745 nop.m 999 746 fnorm.s1 ArgX = f1 747 nop.i 999 748} 749;; 750 751{ .mfi 752 getf.exp sign_X = f1 // Get signexp of x 753 fmerge.s ArgX_abs = f0, f1 // Form |x| 754 nop.i 999 755} 756{ .mfi 757 nop.m 999 758 fnorm.s1 ArgX_orig = f1 759 nop.i 999 760} 761;; 762 763{ .mfi 764 getf.exp sign_Y = ArgY_orig // Get signexp of y 765 fmerge.s ArgY_abs = f0, ArgY_orig // Form |y| 766 mov table_base = table_ptr1 // Save base pointer to tables 767} 768;; 769 770{ .mfi 771 ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi 772 fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero 773 nop.i 999 774} 775;; 776 777{ .mfi 778 ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3 779 nop.f 999 780 nop.i 999 781} 782{ .mfi 783 nop.m 999 784 fma.s1 M = f1, f1, f0 // Set M = 1.0 785 nop.i 999 786} 787;; 788 789// 790// Check for everything - if false, then must be pseudo-zero 791// or pseudo-nan (IA unsupporteds). 792// 793{ .mfb 794 nop.m 999 795 fclass.m p0,p12 = f1, 0x1FF // Test x unsupported 796(p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero 797} 798;; 799 800// U = max(ArgX_abs,ArgY_abs) 801// V = min(ArgX_abs,ArgY_abs) 802{ .mfi 803 nop.m 999 804 fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares 805 nop.i 999 806} 807{ .mfb 808 nop.m 999 809 fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y| 810 br.cond.sptk ATANL_COMMON // Branch to common code 811} 812;; 813 814GLOBAL_IEEE754_END(atanl) 815libm_alias_ldouble_other (__atan, atan) 816 817GLOBAL_IEEE754_ENTRY(atan2l) 818 819{ .mfi 820 alloc r32 = ar.pfs, 0, 17, 4, 0 821 fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y 822 nop.i 999 823} 824{ .mfi 825 addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer 826 fma.s1 Xsq = ArgX_orig, ArgX_orig, f0 // Form x*x 827 nop.i 999 828} 829;; 830 831{ .mfi 832 ld8 table_ptr1 = [table_ptr1] // Get table pointer 833 fnorm.s1 ArgY = ArgY_orig 834 nop.i 999 835} 836{ .mfi 837 nop.m 999 838 fnorm.s1 ArgX = ArgX_orig 839 nop.i 999 840} 841;; 842 843{ .mfi 844 getf.exp sign_X = ArgX_orig // Get signexp of x 845 fmerge.s ArgX_abs = f0, ArgX_orig // Form |x| 846 nop.i 999 847} 848;; 849 850{ .mfi 851 getf.exp sign_Y = ArgY_orig // Get signexp of y 852 fmerge.s ArgY_abs = f0, ArgY_orig // Form |y| 853 mov table_base = table_ptr1 // Save base pointer to tables 854} 855;; 856 857{ .mfi 858 ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi 859 fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero 860 nop.i 999 861} 862;; 863 864{ .mfi 865 ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3 866 fclass.m p9,p0 = ArgX_orig, 0x1e7 // Test x natval, nan, inf, zero 867 nop.i 999 868} 869{ .mfi 870 nop.m 999 871 fma.s1 M = f1, f1, f0 // Set M = 1.0 872 nop.i 999 873} 874;; 875 876// 877// Check for everything - if false, then must be pseudo-zero 878// or pseudo-nan (IA unsupporteds). 879// 880{ .mfb 881 nop.m 999 882 fclass.m p0,p12 = ArgX_orig, 0x1FF // Test x unsupported 883(p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero 884} 885;; 886 887// U = max(ArgX_abs,ArgY_abs) 888// V = min(ArgX_abs,ArgY_abs) 889{ .mfi 890 nop.m 999 891 fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares 892 nop.i 999 893} 894{ .mfb 895 nop.m 999 896 fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y| 897(p9) br.cond.spnt ATANL_X_SPECIAL // Branch if x natval, nan, inf, zero 898} 899;; 900 901// Now common code for atanl and atan2l 902ATANL_COMMON: 903{ .mfi 904 nop.m 999 905 fclass.m p0,p13 = ArgY_orig, 0x1FF // Test y unsupported 906 shr sign_X = sign_X, 17 // Get sign bit of x 907} 908{ .mfi 909 nop.m 999 910 fma.s1 U = ArgY_abs, f1, f0 // Set U assuming |x| < |y| 911 adds table_ptr1 = 176, table_ptr1 // Point to Q4 912} 913;; 914 915{ .mfi 916(p6) add swap = r0, r0 // Set swap=0 if |x| >= |y| 917(p6) frcpa.s1 E, p0 = ArgY_abs, ArgX_abs // Compute E if |x| >= |y| 918 shr sign_Y = sign_Y, 17 // Get sign bit of y 919} 920{ .mfb 921 nop.m 999 922(p6) fma.s1 V = ArgY_abs, f1, f0 // Set V if |x| >= |y| 923(p12) br.cond.spnt ATANL_UNSUPPORTED // Branch if x unsupported 924} 925;; 926 927// Set p8 if y >=0 928// Set p9 if y < 0 929// Set p10 if |x| >= |y| and x >=0 930// Set p11 if |x| >= |y| and x < 0 931{ .mfi 932 cmp.eq p8, p9 = 0, sign_Y // Test for y >= 0 933(p7) frcpa.s1 E, p0 = ArgX_abs, ArgY_abs // Compute E if |x| < |y| 934(p7) add swap = 1, r0 // Set swap=1 if |x| < |y| 935} 936{ .mfb 937(p6) cmp.eq.unc p10, p11 = 0, sign_X // If |x| >= |y|, test for x >= 0 938(p6) fma.s1 U = ArgX_abs, f1, f0 // Set U if |x| >= |y| 939(p13) br.cond.spnt ATANL_UNSUPPORTED // Branch if y unsupported 940} 941;; 942 943// 944// if p8, s_Y = 1.0 945// if p9, s_Y = -1.0 946// 947.pred.rel "mutex",p8,p9 948{ .mfi 949 nop.m 999 950(p8) fadd.s1 s_Y = f0, f1 // If y >= 0 set s_Y = 1.0 951 nop.i 999 952} 953{ .mfi 954 nop.m 999 955(p9) fsub.s1 s_Y = f0, f1 // If y < 0 set s_Y = -1.0 956 nop.i 999 957} 958;; 959 960.pred.rel "mutex",p10,p11 961{ .mfi 962 nop.m 999 963(p10) fsub.s1 M = M, f1 // If |x| >= |y| and x >=0, set M=0 964 nop.i 999 965} 966{ .mfi 967 nop.m 999 968(p11) fadd.s1 M = M, f1 // If |x| >= |y| and x < 0, set M=2.0 969 nop.i 999 970} 971;; 972 973{ .mfi 974 nop.m 999 975 fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag 976 nop.i 999 977} 978// ************************************************* 979// ********************* STEP2 ********************* 980// ************************************************* 981// 982// Q = E * V 983// 984{ .mfi 985 nop.m 999 986 fmpy.s1 Q = E, V 987 nop.i 999 988} 989;; 990 991{ .mfi 992 nop.m 999 993 fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (1) if POLY path 994 nop.i 999 995} 996;; 997 998// Create a single precision representation of the signexp of Q with the 999// 4 most significant bits of the significand followed by a 1 and then 18 0's 1000{ .mfi 1001 nop.m 999 1002 fmpy.s1 P_hi = M, P_hi 1003 dep.z special = 0x1, 18, 1 // Form 0x0000000000040000 1004} 1005{ .mfi 1006 nop.m 999 1007 fmpy.s1 P_lo = M, P_lo 1008 add table_ptr2 = 32, table_ptr1 1009} 1010;; 1011 1012{ .mfi 1013 nop.m 999 1014 fma.s1 A_temp = Q, f1, f0 // Set A_temp if POLY path 1015 nop.i 999 1016} 1017{ .mfi 1018 nop.m 999 1019 fma.s1 E = E, E_hold, E // E = E + E*E_hold (1) if POLY path 1020 nop.i 999 1021} 1022;; 1023 1024// 1025// Is Q < 2**(-3)? 1026// swap = xor(swap,sign_X) 1027// 1028{ .mfi 1029 nop.m 999 1030 fcmp.lt.s1 p9, p0 = Q, TWO_TO_NEG3 // Test Q < 2^-3 1031 xor swap = sign_X, swap 1032} 1033;; 1034 1035// P_hi = s_Y * P_hi 1036{ .mmf 1037 getf.exp exponent_Q = Q // Get signexp of Q 1038 cmp.eq.unc p7, p6 = 0x00000, swap 1039 fmpy.s1 P_hi = s_Y, P_hi 1040} 1041;; 1042 1043// 1044// if (PR_1) sigma = -1.0 1045// if (PR_2) sigma = 1.0 1046// 1047{ .mfi 1048 getf.sig significand_Q = Q // Get significand of Q 1049(p6) fsub.s1 sigma = f0, f1 1050 nop.i 999 1051} 1052{ .mfb 1053(p9) add table_ptr1 = 128, table_base // Point to P8 if POLY path 1054(p7) fadd.s1 sigma = f0, f1 1055(p9) br.cond.spnt ATANL_POLY // Branch to POLY if 0 < Q < 2^-3 1056} 1057;; 1058 1059// 1060// ************************************************* 1061// ******************** STEP3 ********************** 1062// ************************************************* 1063// 1064// lookup = b_1 b_2 b_3 B_4 1065// 1066{ .mmi 1067 nop.m 999 1068 nop.m 999 1069 andcm k = 0x0003, exponent_Q // k=0,1,2,3 for exp_Q=0,-1,-2,-3 1070} 1071;; 1072 1073// 1074// Generate sign_exp_Q b_1 b_2 b_3 b_4 1 0 0 0 ... 0 in single precision 1075// representation. Note sign of Q is always 0. 1076// 1077{ .mfi 1078 cmp.eq p8, p9 = 0x0000, k // Test k=0 1079 nop.f 999 1080 extr.u lookup = significand_Q, 59, 4 // Extract b_1 b_2 b_3 b_4 for index 1081} 1082{ .mfi 1083 sub sp_exp_Q = 0x7f, k // Form single prec biased exp of Q 1084 nop.f 999 1085 sub k = k, r0, 1 // Decrement k 1086} 1087;; 1088 1089// Form pointer to B index table 1090{ .mfi 1091 ldfe Q_4 = [table_ptr1], -16 // Load Q_4 1092 nop.f 999 1093(p9) shl k = k, 8 // k = 0, 256, or 512 1094} 1095{ .mfi 1096(p9) shladd table_ptr2 = lookup, 4, table_ptr2 1097 nop.f 999 1098 shladd sp_exp_4sig_Q = sp_exp_Q, 4, lookup // Shift and add in 4 high bits 1099} 1100;; 1101 1102{ .mmi 1103(p8) add table_ptr2 = -16, table_ptr2 // Pointer if original k was 0 1104(p9) add table_ptr2 = k, table_ptr2 // Pointer if k was 1, 2, 3 1105 dep special = sp_exp_4sig_Q, special, 19, 13 // Form z_hi as single prec 1106} 1107;; 1108 1109// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0 1110{ .mmi 1111 ldfd Tbl_hi = [table_ptr2], 8 // Load Tbl_hi from index table 1112;; 1113 setf.s z_hi = special // Form z_hi 1114 nop.i 999 1115} 1116{ .mmi 1117 ldfs Tbl_lo = [table_ptr2], 8 // Load Tbl_lo from index table 1118;; 1119 ldfe Q_3 = [table_ptr1], -16 // Load Q_3 1120 nop.i 999 1121} 1122;; 1123 1124{ .mmi 1125 ldfe Q_2 = [table_ptr1], -16 // Load Q_2 1126 nop.m 999 1127 nop.i 999 1128} 1129;; 1130 1131{ .mmf 1132 ldfe Q_1 = [table_ptr1], -16 // Load Q_1 1133 nop.m 999 1134 nop.f 999 1135} 1136;; 1137 1138{ .mfi 1139 nop.m 999 1140 fma.s1 U_prime_hi = V, z_hi, U // U_prime_hi = U + V * z_hi 1141 nop.i 999 1142} 1143{ .mfi 1144 nop.m 999 1145 fnma.s1 V_prime = U, z_hi, V // V_prime = V - U * z_hi 1146 nop.i 999 1147} 1148;; 1149 1150{ .mfi 1151 nop.m 999 1152 mov A_hi = Tbl_hi // Start with A_hi = Tbl_hi 1153 nop.i 999 1154} 1155;; 1156 1157{ .mfi 1158 nop.m 999 1159 fsub.s1 U_hold = U, U_prime_hi // U_hold = U - U_prime_hi 1160 nop.i 999 1161} 1162;; 1163 1164{ .mfi 1165 nop.m 999 1166 frcpa.s1 C_hi, p0 = f1, U_prime_hi // C_hi = frcpa(1,U_prime_hi) 1167 nop.i 999 1168} 1169;; 1170 1171{ .mfi 1172 nop.m 999 1173 fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi 1174 nop.i 999 1175} 1176;; 1177 1178{ .mfi 1179 nop.m 999 1180 fma.s1 U_prime_lo = z_hi, V, U_hold // U_prime_lo = U_hold + V * z_hi 1181 nop.i 999 1182} 1183;; 1184 1185// C_hi_hold = 1 - C_hi * U_prime_hi (1) 1186{ .mfi 1187 nop.m 999 1188 fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 1189 nop.i 999 1190} 1191;; 1192 1193{ .mfi 1194 nop.m 999 1195 fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi 1196 nop.i 999 1197} 1198;; 1199 1200{ .mfi 1201 nop.m 999 1202 fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (1) 1203 nop.i 999 1204} 1205;; 1206 1207// C_hi_hold = 1 - C_hi * U_prime_hi (2) 1208{ .mfi 1209 nop.m 999 1210 fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 1211 nop.i 999 1212} 1213;; 1214 1215{ .mfi 1216 nop.m 999 1217 fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (2) 1218 nop.i 999 1219} 1220;; 1221 1222// C_hi_hold = 1 - C_hi * U_prime_hi (3) 1223{ .mfi 1224 nop.m 999 1225 fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 1226 nop.i 999 1227} 1228;; 1229 1230{ .mfi 1231 nop.m 999 1232 fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (3) 1233 nop.i 999 1234} 1235;; 1236 1237{ .mfi 1238 nop.m 999 1239 fmpy.s1 w_hi = V_prime, C_hi // w_hi = V_prime * C_hi 1240 nop.i 999 1241} 1242;; 1243 1244{ .mfi 1245 nop.m 999 1246 fmpy.s1 wsq = w_hi, w_hi // wsq = w_hi * w_hi 1247 nop.i 999 1248} 1249{ .mfi 1250 nop.m 999 1251 fnma.s1 w_lo = w_hi, U_prime_hi, V_prime // w_lo = V_prime-w_hi*U_prime_hi 1252 nop.i 999 1253} 1254;; 1255 1256{ .mfi 1257 nop.m 999 1258 fma.s1 poly = wsq, Q_4, Q_3 // poly = Q_3 + wsq * Q_4 1259 nop.i 999 1260} 1261{ .mfi 1262 nop.m 999 1263 fnma.s1 w_lo = w_hi, U_prime_lo, w_lo // w_lo = w_lo - w_hi * U_prime_lo 1264 nop.i 999 1265} 1266;; 1267 1268{ .mfi 1269 nop.m 999 1270 fma.s1 poly = wsq, poly, Q_2 // poly = Q_2 + wsq * poly 1271 nop.i 999 1272} 1273{ .mfi 1274 nop.m 999 1275 fmpy.s1 w_lo = C_hi, w_lo // w_lo = = w_lo * C_hi 1276 nop.i 999 1277} 1278;; 1279 1280{ .mfi 1281 nop.m 999 1282 fma.s1 poly = wsq, poly, Q_1 // poly = Q_1 + wsq * poly 1283 nop.i 999 1284} 1285{ .mfi 1286 nop.m 999 1287 fadd.s1 A_lo = Tbl_lo, w_lo // A_lo = Tbl_lo + w_lo 1288 nop.i 999 1289} 1290;; 1291 1292{ .mfi 1293 nop.m 999 1294 fmpy.s0 Q_1 = Q_1, Q_1 // Dummy operation to raise inexact 1295 nop.i 999 1296} 1297;; 1298 1299{ .mfi 1300 nop.m 999 1301 fmpy.s1 poly = wsq, poly // poly = wsq * poly 1302 nop.i 999 1303} 1304;; 1305 1306{ .mfi 1307 nop.m 999 1308 fmpy.s1 poly = w_hi, poly // poly = w_hi * poly 1309 nop.i 999 1310} 1311;; 1312 1313{ .mfi 1314 nop.m 999 1315 fadd.s1 A_lo = A_lo, poly // A_lo = A_lo + poly 1316 nop.i 999 1317} 1318;; 1319 1320{ .mfi 1321 nop.m 999 1322 fadd.s1 A_lo = A_lo, w_hi // A_lo = A_lo + w_hi 1323 nop.i 999 1324} 1325;; 1326 1327{ .mfi 1328 nop.m 999 1329 fma.s1 Res_lo = sigma, A_lo, P_lo // Res_lo = P_lo + sigma * A_lo 1330 nop.i 999 1331} 1332;; 1333 1334// 1335// Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode) 1336// 1337{ .mfb 1338 nop.m 999 1339 fma.s0 Result = Res_lo, s_Y, Res_hi 1340 br.ret.sptk b0 // Exit table path 2^-3 <= V/U < 1 1341} 1342;; 1343 1344 1345ATANL_POLY: 1346// Here if 0 < V/U < 2^-3 1347// 1348// *********************************************** 1349// ******************** STEP4 ******************** 1350// *********************************************** 1351 1352// 1353// Following: 1354// Iterate 3 times E = E + E*(1.0 - E*U) 1355// Also load P_8, P_7, P_6, P_5, P_4 1356// 1357{ .mfi 1358 ldfe P_8 = [table_ptr1], -16 // Load P_8 1359 fnma.s1 z_lo = A_temp, U, V // z_lo = V - A_temp * U 1360 nop.i 999 1361} 1362{ .mfi 1363 nop.m 999 1364 fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (2) 1365 nop.i 999 1366} 1367;; 1368 1369{ .mmi 1370 ldfe P_7 = [table_ptr1], -16 // Load P_7 1371;; 1372 ldfe P_6 = [table_ptr1], -16 // Load P_6 1373 nop.i 999 1374} 1375;; 1376 1377{ .mfi 1378 ldfe P_5 = [table_ptr1], -16 // Load P_5 1379 fma.s1 E = E, E_hold, E // E = E + E_hold*E (2) 1380 nop.i 999 1381} 1382;; 1383 1384{ .mmi 1385 ldfe P_4 = [table_ptr1], -16 // Load P_4 1386;; 1387 ldfe P_3 = [table_ptr1], -16 // Load P_3 1388 nop.i 999 1389} 1390;; 1391 1392{ .mfi 1393 ldfe P_2 = [table_ptr1], -16 // Load P_2 1394 fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (3) 1395 nop.i 999 1396} 1397{ .mlx 1398 nop.m 999 1399 movl int_temp = 0x24005 // Signexp for small neg number 1400} 1401;; 1402 1403{ .mmf 1404 ldfe P_1 = [table_ptr1], -16 // Load P_1 1405 setf.exp tmp_small = int_temp // Form small neg number 1406 fma.s1 E = E, E_hold, E // E = E + E_hold*E (3) 1407} 1408;; 1409 1410// 1411// 1412// At this point E approximates 1/U to roughly working precision 1413// Z = V*E approximates V/U 1414// 1415{ .mfi 1416 nop.m 999 1417 fmpy.s1 Z = V, E // Z = V * E 1418 nop.i 999 1419} 1420{ .mfi 1421 nop.m 999 1422 fmpy.s1 z_lo = z_lo, E // z_lo = z_lo * E 1423 nop.i 999 1424} 1425;; 1426 1427// 1428// Now what we want to do is 1429// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) 1430// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3)) 1431// 1432// 1433// Fixup added to force inexact later - 1434// A_hi = A_temp + z_lo 1435// z_lo = (A_temp - A_hi) + z_lo 1436// 1437{ .mfi 1438 nop.m 999 1439 fmpy.s1 zsq = Z, Z // zsq = Z * Z 1440 nop.i 999 1441} 1442{ .mfi 1443 nop.m 999 1444 fadd.s1 A_hi = A_temp, z_lo // A_hi = A_temp + z_lo 1445 nop.i 999 1446} 1447;; 1448 1449{ .mfi 1450 nop.m 999 1451 fma.s1 poly1 = zsq, P_8, P_7 // poly1 = P_7 + zsq * P_8 1452 nop.i 999 1453} 1454{ .mfi 1455 nop.m 999 1456 fma.s1 poly2 = zsq, P_3, P_2 // poly2 = P_2 + zsq * P_3 1457 nop.i 999 1458} 1459;; 1460 1461{ .mfi 1462 nop.m 999 1463 fmpy.s1 z4 = zsq, zsq // z4 = zsq * zsq 1464 nop.i 999 1465} 1466{ .mfi 1467 nop.m 999 1468 fsub.s1 A_temp = A_temp, A_hi // A_temp = A_temp - A_hi 1469 nop.i 999 1470} 1471;; 1472 1473{ .mfi 1474 nop.m 999 1475 fmerge.s tmp = A_hi, A_hi // Copy tmp = A_hi 1476 nop.i 999 1477} 1478;; 1479 1480{ .mfi 1481 nop.m 999 1482 fma.s1 poly1 = zsq, poly1, P_6 // poly1 = P_6 + zsq * poly1 1483 nop.i 999 1484} 1485{ .mfi 1486 nop.m 999 1487 fma.s1 poly2 = zsq, poly2, P_1 // poly2 = P_2 + zsq * poly2 1488 nop.i 999 1489} 1490;; 1491 1492{ .mfi 1493 nop.m 999 1494 fmpy.s1 z8 = z4, z4 // z8 = z4 * z4 1495 nop.i 999 1496} 1497{ .mfi 1498 nop.m 999 1499 fadd.s1 z_lo = A_temp, z_lo // z_lo = (A_temp - A_hi) + z_lo 1500 nop.i 999 1501} 1502;; 1503 1504{ .mfi 1505 nop.m 999 1506 fma.s1 poly1 = zsq, poly1, P_5 // poly1 = P_5 + zsq * poly1 1507 nop.i 999 1508} 1509{ .mfi 1510 nop.m 999 1511 fmpy.s1 poly2 = poly2, zsq // poly2 = zsq * poly2 1512 nop.i 999 1513} 1514;; 1515 1516// Create small GR double in case need to raise underflow 1517{ .mfi 1518 nop.m 999 1519 fma.s1 poly1 = zsq, poly1, P_4 // poly1 = P_4 + zsq * poly1 1520 dep GR_temp = -1,r0,0,53 1521} 1522;; 1523 1524// Create small double in case need to raise underflow 1525{ .mfi 1526 setf.d FR_temp = GR_temp 1527 fma.s1 poly = z8, poly1, poly2 // poly = poly2 + z8 * poly1 1528 nop.i 999 1529} 1530;; 1531 1532{ .mfi 1533 nop.m 999 1534 fma.s1 A_lo = Z, poly, z_lo // A_lo = z_lo + Z * poly 1535 nop.i 999 1536} 1537;; 1538 1539{ .mfi 1540 nop.m 999 1541 fadd.s1 A_hi = tmp, A_lo // A_hi = tmp + A_lo 1542 nop.i 999 1543} 1544;; 1545 1546{ .mfi 1547 nop.m 999 1548 fsub.s1 tmp = tmp, A_hi // tmp = tmp - A_hi 1549 nop.i 999 1550} 1551{ .mfi 1552 nop.m 999 1553 fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi 1554 nop.i 999 1555} 1556;; 1557 1558{ .mfi 1559 nop.m 999 1560 fadd.s1 A_lo = tmp, A_lo // A_lo = tmp + A_lo 1561 nop.i 999 1562} 1563{ .mfi 1564 nop.m 999 1565 fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi 1566 nop.i 999 1567} 1568;; 1569 1570{ .mfi 1571 nop.m 999 1572 fsub.s1 tmp = P_hi, Res_hi // tmp = P_hi - Res_hi 1573 nop.i 999 1574} 1575;; 1576 1577// 1578// Test if A_lo is zero 1579// 1580{ .mfi 1581 nop.m 999 1582 fclass.m p6,p0 = A_lo, 0x007 // Test A_lo = 0 1583 nop.i 999 1584} 1585;; 1586 1587{ .mfi 1588 nop.m 999 1589(p6) mov A_lo = tmp_small // If A_lo zero, make very small 1590 nop.i 999 1591} 1592;; 1593 1594{ .mfi 1595 nop.m 999 1596 fma.s1 tmp = A_hi, sigma, tmp // tmp = sigma * A_hi + tmp 1597 nop.i 999 1598} 1599{ .mfi 1600 nop.m 999 1601 fma.s1 sigma = A_lo, sigma, P_lo // sigma = A_lo * sigma + P_lo 1602 nop.i 999 1603} 1604;; 1605 1606{ .mfi 1607 nop.m 999 1608 fma.s1 Res_lo = s_Y, sigma, tmp // Res_lo = s_Y * sigma + tmp 1609 nop.i 999 1610} 1611;; 1612 1613// 1614// Test if Res_lo is denormal 1615// 1616{ .mfi 1617 nop.m 999 1618 fclass.m p14, p15 = Res_lo, 0x0b 1619 nop.i 999 1620} 1621;; 1622 1623// 1624// Compute Result = Res_lo + Res_hi. Use s3 if Res_lo is denormal. 1625// 1626{ .mfi 1627 nop.m 999 1628(p14) fadd.s3 Result = Res_lo, Res_hi // Result for Res_lo denormal 1629 nop.i 999 1630} 1631{ .mfi 1632 nop.m 999 1633(p15) fadd.s0 Result = Res_lo, Res_hi // Result for Res_lo normal 1634 nop.i 999 1635} 1636;; 1637 1638// 1639// If Res_lo is denormal test if Result equals zero 1640// 1641{ .mfi 1642 nop.m 999 1643(p14) fclass.m.unc p14, p0 = Result, 0x07 1644 nop.i 999 1645} 1646;; 1647 1648// 1649// If Res_lo is denormal and Result equals zero, raise inexact, underflow 1650// by squaring small double 1651// 1652{ .mfb 1653 nop.m 999 1654(p14) fmpy.d.s0 FR_temp = FR_temp, FR_temp 1655 br.ret.sptk b0 // Exit POLY path, 0 < Q < 2^-3 1656} 1657;; 1658 1659 1660ATANL_UNSUPPORTED: 1661{ .mfb 1662 nop.m 999 1663 fmpy.s0 Result = ArgX,ArgY 1664 br.ret.sptk b0 1665} 1666;; 1667 1668// Here if y natval, nan, inf, zero 1669ATANL_Y_SPECIAL: 1670// Here if x natval, nan, inf, zero 1671ATANL_X_SPECIAL: 1672{ .mfi 1673 nop.m 999 1674 fclass.m p13,p12 = ArgY_orig, 0x0c3 // Test y nan 1675 nop.i 999 1676} 1677;; 1678 1679{ .mfi 1680 nop.m 999 1681 fclass.m p15,p14 = ArgY_orig, 0x103 // Test y natval 1682 nop.i 999 1683} 1684;; 1685 1686{ .mfi 1687 nop.m 999 1688(p12) fclass.m p13,p0 = ArgX_orig, 0x0c3 // Test x nan 1689 nop.i 999 1690} 1691;; 1692 1693{ .mfi 1694 nop.m 999 1695(p14) fclass.m p15,p0 = ArgX_orig, 0x103 // Test x natval 1696 nop.i 999 1697} 1698;; 1699 1700{ .mfb 1701 nop.m 999 1702(p13) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result nan if x or y nan 1703(p13) br.ret.spnt b0 // Exit if x or y nan 1704} 1705;; 1706 1707{ .mfb 1708 nop.m 999 1709(p15) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result natval if x or y natval 1710(p15) br.ret.spnt b0 // Exit if x or y natval 1711} 1712;; 1713 1714 1715// Here if x or y inf or zero 1716ATANL_SPECIAL_HANDLING: 1717{ .mfi 1718 nop.m 999 1719 fclass.m p6, p7 = ArgY_orig, 0x007 // Test y zero 1720 mov special = 992 // Offset to table 1721} 1722;; 1723 1724{ .mfb 1725 add table_ptr1 = table_base, special // Point to 3pi/4 1726 fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag 1727(p7) br.cond.spnt ATANL_ArgY_Not_ZERO // Branch if y not zero 1728} 1729;; 1730 1731// Here if y zero 1732{ .mmf 1733 ldfd Result = [table_ptr1], 8 // Get pi high 1734 nop.m 999 1735 fclass.m p14, p0 = ArgX, 0x035 // Test for x>=+0 1736} 1737;; 1738 1739{ .mmf 1740 nop.m 999 1741 ldfd Result_lo = [table_ptr1], -8 // Get pi lo 1742 fclass.m p15, p0 = ArgX, 0x036 // Test for x<=-0 1743} 1744;; 1745 1746// 1747// Return sign_Y * 0 when ArgX > +0 1748// 1749{ .mfi 1750 nop.m 999 1751(p14) fmerge.s Result = ArgY, f0 // If x>=+0, y=0, hi sgn(y)*0 1752 nop.i 999 1753} 1754;; 1755 1756{ .mfi 1757 nop.m 999 1758 fclass.m p13, p0 = ArgX, 0x007 // Test for x=0 1759 nop.i 999 1760} 1761;; 1762 1763{ .mfi 1764 nop.m 999 1765(p14) fmerge.s Result_lo = ArgY, f0 // If x>=+0, y=0, lo sgn(y)*0 1766 nop.i 999 1767} 1768;; 1769 1770{ .mfi 1771(p13) mov GR_Parameter_TAG = 36 // Error tag for x=0, y=0 1772 nop.f 999 1773 nop.i 999 1774} 1775;; 1776 1777// 1778// Return sign_Y * pi when ArgX < -0 1779// 1780{ .mfi 1781 nop.m 999 1782(p15) fmerge.s Result = ArgY, Result // If x<0, y=0, hi=sgn(y)*pi 1783 nop.i 999 1784} 1785;; 1786 1787{ .mfi 1788 nop.m 999 1789(p15) fmerge.s Result_lo = ArgY, Result_lo // If x<0, y=0, lo=sgn(y)*pi 1790 nop.i 999 1791} 1792;; 1793 1794// 1795// Call error support function for atan(0,0) 1796// 1797{ .mfb 1798 nop.m 999 1799 fadd.s0 Result = Result, Result_lo 1800(p13) br.cond.spnt __libm_error_region // Branch if atan(0,0) 1801} 1802;; 1803 1804{ .mib 1805 nop.m 999 1806 nop.i 999 1807 br.ret.sptk b0 // Exit for y=0, x not 0 1808} 1809;; 1810 1811// Here if y not zero 1812ATANL_ArgY_Not_ZERO: 1813{ .mfi 1814 nop.m 999 1815 fclass.m p0, p10 = ArgY, 0x023 // Test y inf 1816 nop.i 999 1817} 1818;; 1819 1820{ .mfb 1821 nop.m 999 1822 fclass.m p6, p0 = ArgX, 0x017 // Test for 0 <= |x| < inf 1823(p10) br.cond.spnt ATANL_ArgY_Not_INF // Branch if 0 < |y| < inf 1824} 1825;; 1826 1827// Here if y=inf 1828// 1829// Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal 1830// Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal 1831// Return +PI/4 when ArgY = +Inf and ArgX = +Inf 1832// Return -PI/4 when ArgY = -Inf and ArgX = +Inf 1833// Return +3PI/4 when ArgY = +Inf and ArgX = -Inf 1834// Return -3PI/4 when ArgY = -Inf and ArgX = -Inf 1835// 1836{ .mfi 1837 nop.m 999 1838 fclass.m p7, p0 = ArgX, 0x021 // Test for x=+inf 1839 nop.i 999 1840} 1841;; 1842 1843{ .mfi 1844(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2, if x finite 1845 fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf 1846 nop.i 999 1847} 1848;; 1849 1850{ .mmi 1851(p7) add table_ptr1 = 32, table_ptr1 // Point to pi/4 if x=+inf 1852;; 1853(p8) add table_ptr1 = 48, table_ptr1 // Point to 3pi/4 if x=-inf 1854 1855 nop.i 999 1856} 1857;; 1858 1859{ .mmi 1860 ldfd Result = [table_ptr1], 8 // Load pi/2, pi/4, or 3pi/4 hi 1861;; 1862 ldfd Result_lo = [table_ptr1], -8 // Load pi/2, pi/4, or 3pi/4 lo 1863 nop.i 999 1864} 1865;; 1866 1867{ .mfi 1868 nop.m 999 1869 fmerge.s Result = ArgY, Result // Merge sgn(y) in hi 1870 nop.i 999 1871} 1872;; 1873 1874{ .mfi 1875 nop.m 999 1876 fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo 1877 nop.i 999 1878} 1879;; 1880 1881{ .mfb 1882 nop.m 999 1883 fadd.s0 Result = Result, Result_lo // Compute complete result 1884 br.ret.sptk b0 // Exit for y=inf 1885} 1886;; 1887 1888// Here if y not INF, and x=0 or INF 1889ATANL_ArgY_Not_INF: 1890// 1891// Return +PI/2 when ArgY NOT Inf, ArgY > 0 and ArgX = +/-0 1892// Return -PI/2 when ArgY NOT Inf, ArgY < 0 and ArgX = +/-0 1893// Return +0 when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf 1894// Return -0 when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf 1895// Return +PI when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf 1896// Return -PI when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf 1897// 1898{ .mfi 1899 nop.m 999 1900 fclass.m p7, p9 = ArgX, 0x021 // Test for x=+inf 1901 nop.i 999 1902} 1903;; 1904 1905{ .mfi 1906 nop.m 999 1907 fclass.m p6, p0 = ArgX, 0x007 // Test for x=0 1908 nop.i 999 1909} 1910;; 1911 1912{ .mfi 1913(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2 1914 fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf 1915 nop.i 999 1916} 1917;; 1918 1919.pred.rel "mutex",p7,p9 1920{ .mfi 1921(p9) ldfd Result = [table_ptr1], 8 // Load pi or pi/2 hi 1922(p7) fmerge.s Result = ArgY, f0 // If y not inf, x=+inf, sgn(y)*0 1923 nop.i 999 1924} 1925;; 1926 1927{ .mfi 1928(p9) ldfd Result_lo = [table_ptr1], -8 // Load pi or pi/2 lo 1929(p7) fnorm.s0 Result = Result // If y not inf, x=+inf normalize 1930 nop.i 999 1931} 1932;; 1933 1934{ .mfi 1935 nop.m 999 1936(p9) fmerge.s Result = ArgY, Result // Merge sgn(y) in hi 1937 nop.i 999 1938} 1939;; 1940 1941{ .mfi 1942 nop.m 999 1943(p9) fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo 1944 nop.i 999 1945} 1946;; 1947 1948{ .mfb 1949 nop.m 999 1950(p9) fadd.s0 Result = Result, Result_lo // Compute complete result 1951 br.ret.spnt b0 // Exit for y not inf, x=0,inf 1952} 1953;; 1954 1955GLOBAL_IEEE754_END(atan2l) 1956libm_alias_ldouble_other (__atan2, atan2) 1957 1958LOCAL_LIBM_ENTRY(__libm_error_region) 1959.prologue 1960{ .mfi 1961 add GR_Parameter_Y=-32,sp // Parameter 2 value 1962 nop.f 0 1963.save ar.pfs,GR_SAVE_PFS 1964 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 1965} 1966{ .mfi 1967.fframe 64 1968 add sp=-64,sp // Create new stack 1969 nop.f 0 1970 mov GR_SAVE_GP=gp // Save gp 1971};; 1972{ .mmi 1973 stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack 1974 add GR_Parameter_X = 16,sp // Parameter 1 address 1975.save b0, GR_SAVE_B0 1976 mov GR_SAVE_B0=b0 // Save b0 1977};; 1978.body 1979{ .mib 1980 stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack 1981 add GR_Parameter_RESULT = 0,GR_Parameter_Y 1982 nop.b 0 // Parameter 3 address 1983} 1984{ .mib 1985 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack 1986 add GR_Parameter_Y = -16,GR_Parameter_Y 1987 br.call.sptk b0=__libm_error_support# // Call error handling function 1988};; 1989{ .mmi 1990 nop.m 0 1991 nop.m 0 1992 add GR_Parameter_RESULT = 48,sp 1993};; 1994{ .mmi 1995 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack 1996.restore sp 1997 add sp = 64,sp // Restore stack pointer 1998 mov b0 = GR_SAVE_B0 // Restore return address 1999};; 2000{ .mib 2001 mov gp = GR_SAVE_GP // Restore gp 2002 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 2003 br.ret.sptk b0 // Return 2004};; 2005 2006LOCAL_LIBM_END(__libm_error_region#) 2007.type __libm_error_support#,@function 2008.global __libm_error_support# 2009