1.file "libm_tan.s" 2 3// Copyright (C) 2000, 2001, Intel Corporation 4// All rights reserved. 5// 6// 7// Redistribution and use in source and binary forms, with or without 8// modification, are permitted provided that the following conditions are 9// met: 10// 11// * Redistributions of source code must retain the above copyright 12// notice, this list of conditions and the following disclaimer. 13// 14// * Redistributions in binary form must reproduce the above copyright 15// notice, this list of conditions and the following disclaimer in the 16// documentation and/or other materials provided with the distribution. 17// 18// * The name of Intel Corporation may not be used to endorse or promote 19// products derived from this software without specific prior written 20// permission. 21// 22// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 26// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 27// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 28// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 29// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 30// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 31// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33// 34// Intel Corporation is the author of this code, and requests that all 35// problem reports or change requests be submitted to it directly at 36// http://developer.intel.com/opensource. 37// 38// ********************************************************************* 39// 40// History: 41// 02/02/00 Initial Version 42// 4/04/00 Unwind support added 43// 12/28/00 Fixed false invalid flags 44// 45// ********************************************************************* 46// 47// Function: tan(x) = tangent(x), for double precision x values 48// 49// ********************************************************************* 50// 51// Accuracy: Very accurate for double-precision values 52// 53// ********************************************************************* 54// 55// Resources Used: 56// 57// Floating-Point Registers: f8 (Input and Return Value) 58// f9-f15 59// f32-f112 60// 61// General Purpose Registers: 62// r32-r48 63// r49-r50 (Used to pass arguments to pi_by_2 reduce routine) 64// 65// Predicate Registers: p6-p15 66// 67// ********************************************************************* 68// 69// IEEE Special Conditions: 70// 71// Denormal fault raised on denormal inputs 72// Overflow exceptions do not occur 73// Underflow exceptions raised when appropriate for tan 74// (No specialized error handling for this routine) 75// Inexact raised when appropriate by algorithm 76// 77// tan(SNaN) = QNaN 78// tan(QNaN) = QNaN 79// tan(inf) = QNaN 80// tan(+/-0) = +/-0 81// 82// ********************************************************************* 83// 84// Mathematical Description 85// 86// We consider the computation of FPTAN of Arg. Now, given 87// 88// Arg = N pi/2 + alpha, |alpha| <= pi/4, 89// 90// basic mathematical relationship shows that 91// 92// tan( Arg ) = tan( alpha ) if N is even; 93// = -cot( alpha ) otherwise. 94// 95// The value of alpha is obtained by argument reduction and 96// represented by two working precision numbers r and c where 97// 98// alpha = r + c accurately. 99// 100// The reduction method is described in a previous write up. 101// The argument reduction scheme identifies 4 cases. For Cases 2 102// and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be 103// computed very easily by 2 or 3 terms of the Taylor series 104// expansion as follows: 105// 106// Case 2: 107// ------- 108// 109// tan(r + c) = r + c + r^3/3 ...accurately 110// -cot(r + c) = -1/(r+c) + r/3 ...accurately 111// 112// Case 4: 113// ------- 114// 115// tan(r + c) = r + c + r^3/3 + 2r^5/15 ...accurately 116// -cot(r + c) = -1/(r+c) + r/3 + r^3/45 ...accurately 117// 118// 119// The only cases left are Cases 1 and 3 of the argument reduction 120// procedure. These two cases will be merged since after the 121// argument is reduced in either cases, we have the reduced argument 122// represented as r + c and that the magnitude |r + c| is not small 123// enough to allow the usage of a very short approximation. 124// 125// The greatest challenge of this task is that the second terms of 126// the Taylor series for tan(r) and -cot(r) 127// 128// r + r^3/3 + 2 r^5/15 + ... 129// 130// and 131// 132// -1/r + r/3 + r^3/45 + ... 133// 134// are not very small when |r| is close to pi/4 and the rounding 135// errors will be a concern if simple polynomial accumulation is 136// used. When |r| < 2^(-2), however, the second terms will be small 137// enough (5 bits or so of right shift) that a normal Horner 138// recurrence suffices. Hence there are two cases that we consider 139// in the accurate computation of tan(r) and cot(r), |r| <= pi/4. 140// 141// Case small_r: |r| < 2^(-2) 142// -------------------------- 143// 144// Since Arg = N pi/4 + r + c accurately, we have 145// 146// tan(Arg) = tan(r+c) for N even, 147// = -cot(r+c) otherwise. 148// 149// Here for this case, both tan(r) and -cot(r) can be approximated 150// by simple polynomials: 151// 152// tan(r) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19 153// -cot(r) = -1/r + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 154// 155// accurately. Since |r| is relatively small, tan(r+c) and 156// -cot(r+c) can be accurately approximated by replacing r with 157// r+c only in the first two terms of the corresponding polynomials. 158// 159// Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to 160// almost 64 sig. bits, thus 161// 162// P1_1 (r+c)^3 = P1_1 r^3 + c * r^2 accurately. 163// 164// Hence, 165// 166// tan(r+c) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19 167// + c*(1 + r^2) 168// 169// -cot(r+c) = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 170// + Q1_1*c 171// 172// 173// Case normal_r: 2^(-2) <= |r| <= pi/4 174// ------------------------------------ 175// 176// This case is more likely than the previous one if one considers 177// r to be uniformly distributed in [-pi/4 pi/4]. 178// 179// The required calculation is either 180// 181// tan(r + c) = tan(r) + correction, or 182// -cot(r + c) = -cot(r) + correction. 183// 184// Specifically, 185// 186// tan(r + c) = tan(r) + c tan'(r) + O(c^2) 187// = tan(r) + c sec^2(r) + O(c^2) 188// = tan(r) + c SEC_sq ...accurately 189// as long as SEC_sq approximates sec^2(r) 190// to, say, 5 bits or so. 191// 192// Similarly, 193// 194// -cot(r + c) = -cot(r) - c cot'(r) + O(c^2) 195// = -cot(r) + c csc^2(r) + O(c^2) 196// = -cot(r) + c CSC_sq ...accurately 197// as long as CSC_sq approximates csc^2(r) 198// to, say, 5 bits or so. 199// 200// We therefore concentrate on accurately calculating tan(r) and 201// cot(r) for a working-precision number r, |r| <= pi/4 to within 202// 0.1% or so. 203// 204// We will employ a table-driven approach. Let 205// 206// r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ... b_63 207// = sgn_r * ( B + x ) 208// 209// where 210// 211// B = 2^k * 1.b_1 b_2 ... b_5 1 212// x = |r| - B 213// 214// Now, 215// tan(B) + tan(x) 216// tan( B + x ) = ------------------------ 217// 1 - tan(B)*tan(x) 218// 219// / \ 220// | tan(B) + tan(x) | 221 222// = tan(B) + | ------------------------ - tan(B) | 223// | 1 - tan(B)*tan(x) | 224// \ / 225// 226// sec^2(B) * tan(x) 227// = tan(B) + ------------------------ 228// 1 - tan(B)*tan(x) 229// 230// (1/[sin(B)*cos(B)]) * tan(x) 231// = tan(B) + -------------------------------- 232// cot(B) - tan(x) 233// 234// 235// Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are 236// calculated beforehand and stored in a table. Since 237// 238// |x| <= 2^k * 2^(-6) <= 2^(-7) (because k = -1, -2) 239// 240// a very short polynomial will be sufficient to approximate tan(x) 241// accurately. The details involved in computing the last expression 242// will be given in the next section on algorithm description. 243// 244// 245// Now, we turn to the case where cot( B + x ) is needed. 246// 247// 248// 1 - tan(B)*tan(x) 249// cot( B + x ) = ------------------------ 250// tan(B) + tan(x) 251// 252// / \ 253// | 1 - tan(B)*tan(x) | 254 255// = cot(B) + | ----------------------- - cot(B) | 256// | tan(B) + tan(x) | 257// \ / 258// 259// [tan(B) + cot(B)] * tan(x) 260// = cot(B) - ---------------------------- 261// tan(B) + tan(x) 262// 263// (1/[sin(B)*cos(B)]) * tan(x) 264// = cot(B) - -------------------------------- 265// tan(B) + tan(x) 266// 267// 268// Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that 269// are needed are the same set of values needed in the previous 270// case. 271// 272// Finally, we can put all the ingredients together as follows: 273// 274// Arg = N * pi/2 + r + c ...accurately 275// 276// tan(Arg) = tan(r) + correction if N is even; 277// = -cot(r) + correction otherwise. 278// 279// For Cases 2 and 4, 280// 281// Case 2: 282// tan(Arg) = tan(r + c) = r + c + r^3/3 N even 283// = -cot(r + c) = -1/(r+c) + r/3 N odd 284// Case 4: 285// tan(Arg) = tan(r + c) = r + c + r^3/3 + 2r^5/15 N even 286// = -cot(r + c) = -1/(r+c) + r/3 + r^3/45 N odd 287// 288// 289// For Cases 1 and 3, 290// 291// Case small_r: |r| < 2^(-2) 292// 293// tan(Arg) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19 294// + c*(1 + r^2) N even 295// 296// = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 297// + Q1_1*c N odd 298// 299// Case normal_r: 2^(-2) <= |r| <= pi/4 300// 301// tan(Arg) = tan(r) + c * sec^2(r) N even 302// = -cot(r) + c * csc^2(r) otherwise 303// 304// For N even, 305// 306// tan(Arg) = tan(r) + c*sec^2(r) 307// = tan( sgn_r * (B+x) ) + c * sec^2(|r|) 308// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(|r|) ) 309// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(B) ) 310// 311// since B approximates |r| to 2^(-6) in relative accuracy. 312// 313// / (1/[sin(B)*cos(B)]) * tan(x) 314// tan(Arg) = sgn_r * | tan(B) + -------------------------------- 315// \ cot(B) - tan(x) 316// \ 317// + CORR | 318 319// / 320// where 321// 322// CORR = sgn_r*c*tan(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)). 323// 324// For N odd, 325// 326// tan(Arg) = -cot(r) + c*csc^2(r) 327// = -cot( sgn_r * (B+x) ) + c * csc^2(|r|) 328// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(|r|) ) 329// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(B) ) 330// 331// since B approximates |r| to 2^(-6) in relative accuracy. 332// 333// / (1/[sin(B)*cos(B)]) * tan(x) 334// tan(Arg) = sgn_r * | -cot(B) + -------------------------------- 335// \ tan(B) + tan(x) 336// \ 337// + CORR | 338 339// / 340// where 341// 342// CORR = sgn_r*c*cot(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)). 343// 344// 345// The actual algorithm prescribes how all the mathematical formulas 346// are calculated. 347// 348// 349// 2. Algorithmic Description 350// ========================== 351// 352// 2.1 Computation for Cases 2 and 4. 353// ---------------------------------- 354// 355// For Case 2, we use two-term polynomials. 356// 357// For N even, 358// 359// rsq := r * r 360// Result := c + r * rsq * P1_1 361// Result := r + Result ...in user-defined rounding 362// 363// For N odd, 364// S_hi := -frcpa(r) ...8 bits 365// S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits 366// S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits 367// S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits 368// S_lo := S_hi*( (1 + S_hi*r) + S_hi*c ) 369// ...S_hi + S_lo is -1/(r+c) to extra precision 370// S_lo := S_lo + Q1_1*r 371// 372// Result := S_hi + S_lo ...in user-defined rounding 373// 374// For Case 4, we use three-term polynomials 375// 376// For N even, 377// 378// rsq := r * r 379// Result := c + r * rsq * (P1_1 + rsq * P1_2) 380// Result := r + Result ...in user-defined rounding 381// 382// For N odd, 383// S_hi := -frcpa(r) ...8 bits 384// S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits 385// S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits 386// S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits 387// S_lo := S_hi*( (1 + S_hi*r) + S_hi*c ) 388// ...S_hi + S_lo is -1/(r+c) to extra precision 389// rsq := r * r 390// P := Q1_1 + rsq*Q1_2 391// S_lo := S_lo + r*P 392// 393// Result := S_hi + S_lo ...in user-defined rounding 394// 395// 396// Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are 397// the same as those used in the small_r case of Cases 1 and 3 398// below. 399// 400// 401// 2.2 Computation for Cases 1 and 3. 402// ---------------------------------- 403// This is further divided into the case of small_r, 404// where |r| < 2^(-2), and the case of normal_r, where |r| lies between 405// 2^(-2) and pi/4. 406// 407// Algorithm for the case of small_r 408// --------------------------------- 409// 410// For N even, 411// rsq := r * r 412// Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3)) 413// r_to_the_8 := rsq * rsq 414// r_to_the_8 := r_to_the_8 * r_to_the_8 415// Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9)) 416// CORR := c * ( 1 + rsq ) 417// Poly := Poly1 + r_to_the_8*Poly2 418// Result := r*Poly + CORR 419// Result := r + Result ...in user-defined rounding 420// ...note that Poly1 and r_to_the_8 can be computed in parallel 421// ...with Poly2 (Poly1 is intentionally set to be much 422// ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden) 423// 424// For N odd, 425// S_hi := -frcpa(r) ...8 bits 426// S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits 427// S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits 428// S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits 429// S_lo := S_hi*( (1 + S_hi*r) + S_hi*c ) 430// ...S_hi + S_lo is -1/(r+c) to extra precision 431// S_lo := S_lo + Q1_1*c 432// 433// ...S_hi and S_lo are computed in parallel with 434// ...the following 435// rsq := r*r 436// P := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7)) 437// 438// Result := r*P + S_lo 439// Result := S_hi + Result ...in user-defined rounding 440// 441// 442// Algorithm for the case of normal_r 443// ---------------------------------- 444// 445// Here, we first consider the computation of tan( r + c ). As 446// presented in the previous section, 447// 448// tan( r + c ) = tan(r) + c * sec^2(r) 449// = sgn_r * [ tan(B+x) + CORR ] 450// CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)] 451// 452// because sec^2(r) = sec^(|r|), and B approximate |r| to 6.5 bits. 453// 454// tan( r + c ) = 455// / (1/[sin(B)*cos(B)]) * tan(x) 456// sgn_r * | tan(B) + -------------------------------- + 457// \ cot(B) - tan(x) 458// \ 459// CORR | 460 461// / 462// 463// The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are 464// calculated beforehand and stored in a table. Specifically, 465// the table values are 466// 467// tan(B) as T_hi + T_lo; 468// cot(B) as C_hi + C_lo; 469// 1/[sin(B)*cos(B)] as SC_inv 470// 471// T_hi, C_hi are in double-precision memory format; 472// T_lo, C_lo are in single-precision memory format; 473// SC_inv is in extended-precision memory format. 474// 475// The value of tan(x) will be approximated by a short polynomial of 476// the form 477// 478// tan(x) as x + x * P, where 479// P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3)) 480// 481// Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x) 482// to a relative accuracy better than 2^(-20). Thus, a good 483// initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative 484// division is: 485// 486// 1/(cot(B) - tan(x)) is approximately 487// 1/(cot(B) - x) is 488// tan(B)/(1 - x*tan(B)) is approximately 489// T_hi / ( 1 - T_hi * x ) is approximately 490// 491// T_hi * [ 1 + (Thi * x) + (T_hi * x)^2 ] 492// 493// The calculation of tan(r+c) therefore proceed as follows: 494// 495// Tx := T_hi * x 496// xsq := x * x 497// 498// V_hi := T_hi*(1 + Tx*(1 + Tx)) 499// P := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3)) 500// ...V_hi serves as an initial guess of 1/(cot(B) - tan(x)) 501// ...good to about 20 bits of accuracy 502// 503// tanx := x + x*P 504// D := C_hi - tanx 505// ...D is a double precision denominator: cot(B) - tan(x) 506// 507// V_hi := V_hi + V_hi*(1 - V_hi*D) 508// ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits 509// 510// V_lo := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ] 511// - V_hi*C_lo ) ...observe all order 512// ...V_hi + V_lo approximates 1/(cot(B) - tan(x)) 513// ...to extra accuracy 514// 515// ... SC_inv(B) * (x + x*P) 516// ... tan(B) + ------------------------- + CORR 517// ... cot(B) - (x + x*P) 518// ... 519// ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR 520// ... 521// 522// Sx := SC_inv * x 523// CORR := sgn_r * c * SC_inv * T_hi 524// 525// ...put the ingredients together to compute 526// ... SC_inv(B) * (x + x*P) 527// ... tan(B) + ------------------------- + CORR 528// ... cot(B) - (x + x*P) 529// ... 530// ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR 531// ... 532// ... = T_hi + T_lo + CORR + 533// ... Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo) 534// 535// CORR := CORR + T_lo 536// tail := V_lo + P*(V_hi + V_lo) 537// tail := Sx * tail + CORR 538// tail := Sx * V_hi + tail 539// T_hi := sgn_r * T_hi 540// 541// ...T_hi + sgn_r*tail now approximate 542// ...sgn_r*(tan(B+x) + CORR) accurately 543// 544// Result := T_hi + sgn_r*tail ...in user-defined 545// ...rounding control 546// ...It is crucial that independent paths be fully 547// ...exploited for performance's sake. 548// 549// 550// Next, we consider the computation of -cot( r + c ). As 551// presented in the previous section, 552// 553// -cot( r + c ) = -cot(r) + c * csc^2(r) 554// = sgn_r * [ -cot(B+x) + CORR ] 555// CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)] 556// 557// because csc^2(r) = csc^(|r|), and B approximate |r| to 6.5 bits. 558// 559// -cot( r + c ) = 560// / (1/[sin(B)*cos(B)]) * tan(x) 561// sgn_r * | -cot(B) + -------------------------------- + 562// \ tan(B) + tan(x) 563// \ 564// CORR | 565 566// / 567// 568// The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are 569// calculated beforehand and stored in a table. Specifically, 570// the table values are 571// 572// tan(B) as T_hi + T_lo; 573// cot(B) as C_hi + C_lo; 574// 1/[sin(B)*cos(B)] as SC_inv 575// 576// T_hi, C_hi are in double-precision memory format; 577// T_lo, C_lo are in single-precision memory format; 578// SC_inv is in extended-precision memory format. 579// 580// The value of tan(x) will be approximated by a short polynomial of 581// the form 582// 583// tan(x) as x + x * P, where 584// P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3)) 585// 586// Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x) 587// to a relative accuracy better than 2^(-18). Thus, a good 588// initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative 589// division is: 590// 591// 1/(tan(B) + tan(x)) is approximately 592// 1/(tan(B) + x) is 593// cot(B)/(1 + x*cot(B)) is approximately 594// C_hi / ( 1 + C_hi * x ) is approximately 595// 596// C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ] 597// 598// The calculation of -cot(r+c) therefore proceed as follows: 599// 600// Cx := C_hi * x 601// xsq := x * x 602// 603// V_hi := C_hi*(1 - Cx*(1 - Cx)) 604// P := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3)) 605// ...V_hi serves as an initial guess of 1/(tan(B) + tan(x)) 606// ...good to about 18 bits of accuracy 607// 608// tanx := x + x*P 609// D := T_hi + tanx 610// ...D is a double precision denominator: tan(B) + tan(x) 611// 612// V_hi := V_hi + V_hi*(1 - V_hi*D) 613// ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits 614// 615// V_lo := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ] 616// - V_hi*T_lo ) ...observe all order 617// ...V_hi + V_lo approximates 1/(tan(B) + tan(x)) 618// ...to extra accuracy 619// 620// ... SC_inv(B) * (x + x*P) 621// ... -cot(B) + ------------------------- + CORR 622// ... tan(B) + (x + x*P) 623// ... 624// ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR 625// ... 626// 627// Sx := SC_inv * x 628// CORR := sgn_r * c * SC_inv * C_hi 629// 630// ...put the ingredients together to compute 631// ... SC_inv(B) * (x + x*P) 632// ... -cot(B) + ------------------------- + CORR 633// ... tan(B) + (x + x*P) 634// ... 635// ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR 636// ... 637// ... =-C_hi - C_lo + CORR + 638// ... Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo) 639// 640// CORR := CORR - C_lo 641// tail := V_lo + P*(V_hi + V_lo) 642// tail := Sx * tail + CORR 643// tail := Sx * V_hi + tail 644// C_hi := -sgn_r * C_hi 645// 646// ...C_hi + sgn_r*tail now approximates 647// ...sgn_r*(-cot(B+x) + CORR) accurately 648// 649// Result := C_hi + sgn_r*tail in user-defined rounding control 650// ...It is crucial that independent paths be fully 651// ...exploited for performance's sake. 652// 653// 3. Implementation Notes 654// ======================= 655// 656// Table entries T_hi, T_lo; C_hi, C_lo; SC_inv 657// 658// Recall that 2^(-2) <= |r| <= pi/4; 659// 660// r = sgn_r * 2^k * 1.b_1 b_2 ... b_63 661// 662// and 663// 664// B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1 665// 666// Thus, for k = -2, possible values of B are 667// 668// B = 2^(-2) * ( 1 + index/32 + 1/64 ), 669// index ranges from 0 to 31 670// 671// For k = -1, however, since |r| <= pi/4 = 0.78... 672// possible values of B are 673// 674// B = 2^(-1) * ( 1 + index/32 + 1/64 ) 675// index ranges from 0 to 19. 676// 677// 678 679#include "libm_support.h" 680 681#ifdef _LIBC 682.rodata 683#else 684.data 685#endif 686 687.align 128 688 689TAN_BASE_CONSTANTS: 690.type TAN_BASE_CONSTANTS, @object 691data4 0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24 692 // two**-14, -two**-14 693data4 0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi 694data4 0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0 695data4 0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1 696data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2 697data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3 698data4 0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63 699data4 0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0 700data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1 701data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2 702data4 0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4 703data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4 704data4 0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2 705data4 0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33 706data4 0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1 707data4 0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2 708data4 0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3 709data4 0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4 710data4 0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5 711data4 0x61C67914, 0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6 712data4 0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7 713data4 0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8 714data4 0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9 715data4 0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1 716data4 0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2 717data4 0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3 718data4 0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4 719data4 0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5 720data4 0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6 721data4 0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7 722data4 0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1 723data4 0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2 724data4 0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3 725// 726// Entries T_hi double-precision memory format 727// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) 728// Entries T_lo single-precision memory format 729// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) 730// 731data4 0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000 732data4 0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000 733data4 0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000 734data4 0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000 735data4 0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000 736data4 0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000 737data4 0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000 738data4 0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000 739data4 0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000 740data4 0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000 741data4 0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000 742data4 0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000 743data4 0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000 744data4 0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000 745data4 0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000 746data4 0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000 747data4 0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000 748data4 0xEB9DE4DE, 0x3FDA0FF6, 0x22859FA7, 0x00000000 749data4 0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000 750data4 0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000 751data4 0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000 752data4 0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000 753data4 0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000 754data4 0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000 755data4 0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000 756data4 0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000 757data4 0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000 758data4 0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000 759data4 0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000 760data4 0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000 761data4 0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000 762data4 0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000 763// 764// Entries T_hi double-precision memory format 765// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) 766// Entries T_lo single-precision memory format 767// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) 768// 769data4 0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000 770data4 0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000 771data4 0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000 772data4 0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000 773data4 0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000 774data4 0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000 775data4 0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000 776data4 0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000 777data4 0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000 778data4 0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000 779data4 0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000 780data4 0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000 781data4 0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000 782data4 0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000 783data4 0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000 784data4 0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000 785data4 0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000 786data4 0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000 787data4 0xF7E06E4B, 0x3FF01E20, 0x2480D3EE, 0x00000000 788data4 0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000 789// 790// Entries C_hi double-precision memory format 791// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) 792// Entries C_lo single-precision memory format 793// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) 794// 795data4 0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000 796data4 0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000 797data4 0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000 798data4 0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000 799data4 0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000 800data4 0x5285B068, 0x400A855A, 0x25524F18, 0x00000000 801data4 0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000 802data4 0x646AF623, 0x4009188A, 0x254FD801, 0x00000000 803data4 0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000 804data4 0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000 805data4 0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000 806data4 0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000 807data4 0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000 808data4 0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000 809data4 0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000 810data4 0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000 811data4 0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000 812data4 0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000 813data4 0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000 814data4 0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000 815data4 0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000 816data4 0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000 817data4 0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000 818data4 0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000 819data4 0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000 820data4 0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000 821data4 0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000 822data4 0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000 823data4 0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000 824data4 0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000 825data4 0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000 826data4 0x54F7E60A, 0x3FFD8FCA, 0x248D3058, 0x00000000 827// 828// Entries C_hi double-precision memory format 829// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) 830// Entries C_lo single-precision memory format 831// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) 832// 833data4 0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000 834data4 0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000 835data4 0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000 836data4 0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000 837data4 0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000 838data4 0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000 839data4 0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000 840data4 0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000 841data4 0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000 842data4 0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000 843data4 0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000 844data4 0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000 845data4 0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000 846data4 0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000 847data4 0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000 848data4 0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000 849data4 0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000 850data4 0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000 851data4 0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000 852data4 0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000 853// 854// Entries SC_inv in Swapped IEEE format (extended) 855// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) 856// 857data4 0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000 858data4 0x554B0EB0, 0x80092804, 0x00004001, 0x00000000 859data4 0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000 860data4 0x77378677, 0xF3086BA0, 0x00004000, 0x00000000 861data4 0xCCD4723C, 0xED154515, 0x00004000, 0x00000000 862data4 0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000 863data4 0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000 864data4 0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000 865data4 0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000 866data4 0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000 867data4 0x283BEBD5, 0xCFAF16B1, 0x00004000, 0x00000000 868data4 0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000 869data4 0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000 870data4 0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000 871data4 0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000 872data4 0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000 873data4 0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000 874data4 0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000 875data4 0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000 876data4 0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000 877data4 0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000 878data4 0x7221235F, 0xAC862A23, 0x00004000, 0x00000000 879data4 0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000 880data4 0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000 881data4 0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000 882data4 0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000 883data4 0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000 884data4 0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000 885data4 0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000 886data4 0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000 887data4 0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000 888data4 0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000 889// 890// Entries SC_inv in Swapped IEEE format (extended) 891// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) 892// 893data4 0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000 894data4 0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000 895data4 0x61B798AF, 0x9147094F, 0x00004000, 0x00000000 896data4 0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000 897data4 0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000 898data4 0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000 899data4 0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000 900data4 0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000 901data4 0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000 902data4 0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000 903data4 0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000 904data4 0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000 905data4 0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000 906data4 0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000 907data4 0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000 908data4 0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000 909data4 0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000 910data4 0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000 911data4 0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000 912data4 0xA07791FA, 0x80186650, 0x00004000, 0x00000000 913 914Arg = f8 915Result = f8 916fp_tmp = f9 917U_2 = f10 918rsq = f11 919C_hi = f12 920C_lo = f13 921T_hi = f14 922T_lo = f15 923 924N_0 = f32 925d_1 = f33 926MPI_BY_4 = f34 927tail = f35 928tanx = f36 929Cx = f37 930Sx = f38 931sgn_r = f39 932CORR = f40 933P = f41 934D = f42 935ArgPrime = f43 936P_0 = f44 937 938P2_1 = f45 939P2_2 = f46 940P2_3 = f47 941 942P1_1 = f45 943P1_2 = f46 944P1_3 = f47 945 946P1_4 = f48 947P1_5 = f49 948P1_6 = f50 949P1_7 = f51 950P1_8 = f52 951P1_9 = f53 952 953TWO_TO_63 = f54 954NEGTWO_TO_63 = f55 955x = f56 956xsq = f57 957Tx = f58 958Tx1 = f59 959Set = f60 960poly1 = f61 961poly2 = f62 962Poly = f63 963Poly1 = f64 964Poly2 = f65 965r_to_the_8 = f66 966B = f67 967SC_inv = f68 968Pos_r = f69 969N_0_fix = f70 970PI_BY_4 = f71 971NEGTWO_TO_NEG2 = f72 972TWO_TO_24 = f73 973TWO_TO_NEG14 = f74 974TWO_TO_NEG33 = f75 975NEGTWO_TO_24 = f76 976NEGTWO_TO_NEG14 = f76 977NEGTWO_TO_NEG33 = f77 978two_by_PI = f78 979N = f79 980N_fix = f80 981P_1 = f81 982P_2 = f82 983P_3 = f83 984s_val = f84 985w = f85 986c = f86 987r = f87 988Z = f88 989A = f89 990a = f90 991t = f91 992U_1 = f92 993d_2 = f93 994TWO_TO_NEG2 = f94 995Q1_1 = f95 996Q1_2 = f96 997Q1_3 = f97 998Q1_4 = f98 999Q1_5 = f99 1000Q1_6 = f100 1001Q1_7 = f101 1002Q1_8 = f102 1003S_hi = f103 1004S_lo = f104 1005V_hi = f105 1006V_lo = f106 1007U_hi = f107 1008U_lo = f108 1009U_hiabs = f109 1010V_hiabs = f110 1011V = f111 1012Inv_P_0 = f112 1013 1014GR_SAVE_B0 = r33 1015GR_SAVE_GP = r34 1016GR_SAVE_PFS = r35 1017 1018delta1 = r36 1019table_ptr1 = r37 1020table_ptr2 = r38 1021i_0 = r39 1022i_1 = r40 1023N_fix_gr = r41 1024N_inc = r42 1025exp_Arg = r43 1026exp_r = r44 1027sig_r = r45 1028lookup = r46 1029table_offset = r47 1030Create_B = r48 1031gr_tmp = r49 1032 1033GR_Parameter_X = r49 1034GR_Parameter_r = r50 1035 1036 1037 1038.global __libm_tan 1039.section .text 1040.proc __libm_tan 1041 1042 1043__libm_tan: 1044 1045{ .mfi 1046alloc r32 = ar.pfs, 0,17,2,0 1047(p0) fclass.m.unc p6,p0 = Arg, 0x1E7 1048 addl gr_tmp = -1,r0 1049} 1050;; 1051 1052{ .mfi 1053 nop.m 999 1054(p0) fclass.nm.unc p7,p0 = Arg, 0x1FF 1055 nop.i 999 1056} 1057;; 1058 1059{ .mfi 1060(p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp 1061 nop.f 999 1062 nop.i 999 1063} 1064;; 1065 1066{ .mmi 1067 ld8 table_ptr1 = [table_ptr1] 1068 setf.sig fp_tmp = gr_tmp // Make a constant so fmpy produces inexact 1069 nop.i 999 1070} 1071;; 1072 1073// 1074// Check for NatVals, Infs , NaNs, and Zeros 1075// Check for everything - if false, then must be pseudo-zero 1076// or pseudo-nan. 1077// Local table pointer 1078// 1079 1080{ .mbb 1081(p0) add table_ptr2 = 96, table_ptr1 1082(p6) br.cond.spnt __libm_TAN_SPECIAL 1083(p7) br.cond.spnt __libm_TAN_SPECIAL ;; 1084} 1085// 1086// Point to Inv_P_0 1087// Branch out to deal with unsupporteds and special values. 1088// 1089 1090{ .mmf 1091(p0) ldfs TWO_TO_24 = [table_ptr1],4 1092(p0) ldfs TWO_TO_63 = [table_ptr2],4 1093// 1094// Load -2**24, load -2**63. 1095// 1096(p0) fcmp.eq.s0 p0, p6 = Arg, f1 ;; 1097} 1098 1099{ .mfi 1100(p0) ldfs NEGTWO_TO_63 = [table_ptr2],12 1101(p0) fnorm.s1 Arg = Arg 1102 nop.i 999 1103} 1104// 1105// Load 2**24, Load 2**63. 1106// 1107 1108{ .mmi 1109(p0) ldfs NEGTWO_TO_24 = [table_ptr1],12 ;; 1110// 1111// Do fcmp to generate Denormal exception 1112// - can't do FNORM (will generate Underflow when U is unmasked!) 1113// Normalize input argument. 1114// 1115(p0) ldfe two_by_PI = [table_ptr1],16 1116 nop.i 999 1117} 1118 1119{ .mmi 1120(p0) ldfe Inv_P_0 = [table_ptr2],16 ;; 1121(p0) ldfe d_1 = [table_ptr2],16 1122 nop.i 999 1123} 1124// 1125// Decide about the paths to take: 1126// PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2 1127// OTHERWISE - CASE 3 OR 4 1128// Load inverse of P_0 . 1129// Set PR_6 if Arg <= -2**63 1130// Are there any Infs, NaNs, or zeros? 1131// 1132 1133{ .mmi 1134(p0) ldfe P_0 = [table_ptr1],16 ;; 1135(p0) ldfe d_2 = [table_ptr2],16 1136 nop.i 999 1137} 1138// 1139// Set PR_8 if Arg <= -2**24 1140// Set PR_6 if Arg >= 2**63 1141// 1142 1143{ .mmi 1144(p0) ldfe P_1 = [table_ptr1],16 ;; 1145(p0) ldfe PI_BY_4 = [table_ptr2],16 1146 nop.i 999 1147} 1148// 1149// Set PR_8 if Arg >= 2**24 1150// 1151 1152{ .mmi 1153(p0) ldfe P_2 = [table_ptr1],16 ;; 1154(p0) ldfe MPI_BY_4 = [table_ptr2],16 1155 nop.i 999 1156} 1157// 1158// Load P_2 and PI_BY_4 1159// 1160 1161{ .mfi 1162(p0) ldfe P_3 = [table_ptr1],16 1163 nop.f 999 1164 nop.i 999 ;; 1165} 1166 1167{ .mfi 1168 nop.m 999 1169(p0) fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63 1170 nop.i 999 1171} 1172 1173{ .mfi 1174 nop.m 999 1175(p0) fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24 1176 nop.i 999 ;; 1177} 1178 1179{ .mfi 1180 nop.m 999 1181(p7) fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63 1182 nop.i 999 1183} 1184 1185{ .mfi 1186 nop.m 999 1187(p9) fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24 1188 nop.i 999 ;; 1189} 1190 1191{ .mib 1192 nop.m 999 1193 nop.i 999 1194// 1195// Load P_3 and -PI_BY_4 1196// 1197(p6) br.cond.spnt TAN_ARG_TOO_LARGE ;; 1198} 1199 1200{ .mib 1201 nop.m 999 1202 nop.i 999 1203// 1204// Load 2**(-2). 1205// Load -2**(-2). 1206// Branch out if we have a special argument. 1207// Branch out if the magnitude of the input argument is too large 1208// - do this branch before the next. 1209// 1210(p8) br.cond.spnt TAN_LARGER_ARG ;; 1211} 1212// 1213// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24 1214// 1215 1216{ .mfi 1217(p0) ldfs TWO_TO_NEG2 = [table_ptr2],4 1218// ARGUMENT REDUCTION CODE - CASE 1 and 2 1219// Load 2**(-2). 1220// Load -2**(-2). 1221(p0) fmpy.s1 N = Arg,two_by_PI 1222 nop.i 999 ;; 1223} 1224 1225{ .mfi 1226(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],12 1227// 1228// N = Arg * 2/pi 1229// 1230(p0) fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4 1231 nop.i 999 ;; 1232} 1233 1234{ .mfi 1235 nop.m 999 1236// 1237// if Arg < pi/4, set PR_8. 1238// 1239(p8) fcmp.gt.s1 p8,p9= Arg,MPI_BY_4 1240 nop.i 999 ;; 1241} 1242// 1243// Case 1: Is |r| < 2**(-2). 1244// Arg is the same as r in this case. 1245// r = Arg 1246// c = 0 1247// 1248 1249{ .mfi 1250(p8) mov N_fix_gr = r0 1251// 1252// if Arg > -pi/4, reset PR_8. 1253// Select the case when |Arg| < pi/4 - set PR[8] = true. 1254// Else Select the case when |Arg| >= pi/4 - set PR[9] = true. 1255// 1256(p0) fcvt.fx.s1 N_fix = N 1257 nop.i 999 ;; 1258} 1259 1260{ .mfi 1261 nop.m 999 1262// 1263// Grab the integer part of N . 1264// 1265(p8) mov r = Arg 1266 nop.i 999 1267} 1268 1269{ .mfi 1270 nop.m 999 1271(p8) mov c = f0 1272 nop.i 999 ;; 1273} 1274 1275{ .mfi 1276 nop.m 999 1277(p8) fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2 1278 nop.i 999 ;; 1279} 1280 1281{ .mfi 1282 nop.m 999 1283(p10) fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2 1284 nop.i 999 ;; 1285} 1286 1287{ .mfi 1288 nop.m 999 1289// 1290// Case 2: Place integer part of N in GP register. 1291// 1292(p9) fcvt.xf N = N_fix 1293 nop.i 999 ;; 1294} 1295 1296{ .mib 1297(p9) getf.sig N_fix_gr = N_fix 1298 nop.i 999 1299// 1300// Case 2: Convert integer N_fix back to normalized floating-point value. 1301// 1302(p10) br.cond.spnt TAN_SMALL_R ;; 1303} 1304 1305{ .mib 1306 nop.m 999 1307 nop.i 999 1308(p8) br.cond.sptk TAN_NORMAL_R ;; 1309} 1310// 1311// Case 1: PR_3 is only affected when PR_1 is set. 1312// 1313 1314{ .mmi 1315(p9) ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;; 1316// 1317// Case 2: Load 2**(-33). 1318// 1319(p9) ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4 1320 nop.i 999 ;; 1321} 1322 1323{ .mfi 1324 nop.m 999 1325// 1326// Case 2: Load -2**(-33). 1327// 1328(p9) fnma.s1 s_val = N, P_1, Arg 1329 nop.i 999 1330} 1331 1332{ .mfi 1333 nop.m 999 1334(p9) fmpy.s1 w = N, P_2 1335 nop.i 999 ;; 1336} 1337 1338{ .mfi 1339 nop.m 999 1340// 1341// Case 2: w = N * P_2 1342// Case 2: s_val = -N * P_1 + Arg 1343// 1344(p0) fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33 1345 nop.i 999 ;; 1346} 1347 1348{ .mfi 1349 nop.m 999 1350// 1351// Decide between case_1 and case_2 reduce: 1352// 1353(p9) fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33 1354 nop.i 999 ;; 1355} 1356 1357{ .mfi 1358 nop.m 999 1359// 1360// Case 1_reduce: s <= -2**(-33) or s >= 2**(-33) 1361// Case 2_reduce: -2**(-33) < s < 2**(-33) 1362// 1363(p8) fsub.s1 r = s_val, w 1364 nop.i 999 1365} 1366 1367{ .mfi 1368 nop.m 999 1369(p9) fmpy.s1 w = N, P_3 1370 nop.i 999 ;; 1371} 1372 1373{ .mfi 1374 nop.m 999 1375(p9) fma.s1 U_1 = N, P_2, w 1376 nop.i 999 1377} 1378 1379{ .mfi 1380 nop.m 999 1381// 1382// Case 1_reduce: Is |r| < 2**(-2), if so set PR_10 1383// else set PR_11. 1384// 1385(p8) fsub.s1 c = s_val, r 1386 nop.i 999 ;; 1387} 1388 1389{ .mfi 1390 nop.m 999 1391// 1392// Case 1_reduce: r = s + w (change sign) 1393// Case 2_reduce: w = N * P_3 (change sign) 1394// 1395(p8) fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2 1396 nop.i 999 ;; 1397} 1398 1399{ .mfi 1400 nop.m 999 1401(p10) fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2 1402 nop.i 999 ;; 1403} 1404 1405{ .mfi 1406 nop.m 999 1407(p9) fsub.s1 r = s_val, U_1 1408 nop.i 999 1409} 1410 1411{ .mfi 1412 nop.m 999 1413// 1414// Case 1_reduce: c is complete here. 1415// c = c + w (w has not been negated.) 1416// Case 2_reduce: r is complete here - continue to calculate c . 1417// r = s - U_1 1418// 1419(p9) fms.s1 U_2 = N, P_2, U_1 1420 nop.i 999 ;; 1421} 1422 1423{ .mfi 1424 nop.m 999 1425// 1426// Case 1_reduce: c = s - r 1427// Case 2_reduce: U_1 = N * P_2 + w 1428// 1429(p8) fsub.s1 c = c, w 1430 nop.i 999 ;; 1431} 1432 1433{ .mfi 1434 nop.m 999 1435(p9) fsub.s1 s_val = s_val, r 1436 nop.i 999 1437} 1438 1439{ .mfb 1440 nop.m 999 1441// 1442// Case 2_reduce: 1443// U_2 = N * P_2 - U_1 1444// Not needed until later. 1445// 1446(p9) fadd.s1 U_2 = U_2, w 1447// 1448// Case 2_reduce: 1449// s = s - r 1450// U_2 = U_2 + w 1451// 1452(p10) br.cond.spnt TAN_SMALL_R ;; 1453} 1454 1455{ .mib 1456 nop.m 999 1457 nop.i 999 1458(p11) br.cond.sptk TAN_NORMAL_R ;; 1459} 1460 1461{ .mii 1462 nop.m 999 1463// 1464// Case 2_reduce: 1465// c = c - U_2 1466// c is complete here 1467// Argument reduction ends here. 1468// 1469(p9) extr.u i_1 = N_fix_gr, 0, 1 ;; 1470(p9) cmp.eq.unc p11, p12 = 0x0000,i_1 ;; 1471} 1472 1473{ .mfi 1474 nop.m 999 1475// 1476// Is i_1 even or odd? 1477// if i_1 == 0, set p11, else set p12. 1478// 1479(p11) fmpy.s1 rsq = r, Z 1480 nop.i 999 ;; 1481} 1482 1483{ .mfi 1484 nop.m 999 1485(p12) frcpa.s1 S_hi,p0 = f1, r 1486 nop.i 999 1487} 1488 1489// 1490// Case 1: Branch to SMALL_R or NORMAL_R. 1491// Case 1 is done now. 1492// 1493 1494{ .mfi 1495(p9) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp 1496(p9) fsub.s1 c = s_val, U_1 1497 nop.i 999 ;; 1498} 1499;; 1500 1501{ .mmi 1502(p9) ld8 table_ptr1 = [table_ptr1] 1503 nop.m 999 1504 nop.i 999 1505} 1506;; 1507 1508{ .mmi 1509(p9) add table_ptr1 = 224, table_ptr1 ;; 1510(p9) ldfe P1_1 = [table_ptr1],144 1511 nop.i 999 ;; 1512} 1513// 1514// Get [i_1] - lsb of N_fix_gr . 1515// Load P1_1 and point to Q1_1 . 1516// 1517 1518{ .mfi 1519(p9) ldfe Q1_1 = [table_ptr1] , 0 1520// 1521// N even: rsq = r * Z 1522// N odd: S_hi = frcpa(r) 1523// 1524(p12) fmerge.ns S_hi = S_hi, S_hi 1525 nop.i 999 1526} 1527 1528{ .mfi 1529 nop.m 999 1530// 1531// Case 2_reduce: 1532// c = s - U_1 1533// 1534(p9) fsub.s1 c = c, U_2 1535 nop.i 999 ;; 1536} 1537 1538{ .mfi 1539 nop.m 999 1540(p12) fma.s1 poly1 = S_hi, r, f1 1541 nop.i 999 ;; 1542} 1543 1544{ .mfi 1545 nop.m 999 1546// 1547// N odd: Change sign of S_hi 1548// 1549(p11) fmpy.s1 rsq = rsq, P1_1 1550 nop.i 999 ;; 1551} 1552 1553{ .mfi 1554 nop.m 999 1555(p12) fma.s1 S_hi = S_hi, poly1, S_hi 1556 nop.i 999 ;; 1557} 1558 1559{ .mfi 1560 nop.m 999 1561// 1562// N even: rsq = rsq * P1_1 1563// N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary 1564// 1565(p11) fma.s1 Result = r, rsq, c 1566 nop.i 999 ;; 1567} 1568 1569{ .mfi 1570 nop.m 999 1571// 1572// N even: Result = c + r * rsq 1573// N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary 1574// 1575(p12) fma.s1 poly1 = S_hi, r, f1 1576 nop.i 999 ;; 1577} 1578 1579{ .mfi 1580 nop.m 999 1581// 1582// N even: Result = Result + r 1583// N odd: poly1 = 1.0 + S_hi * r 32 bits partial 1584// 1585(p11) fadd.s0 Result = r, Result 1586 nop.i 999 ;; 1587} 1588 1589{ .mfi 1590 nop.m 999 1591(p12) fma.s1 S_hi = S_hi, poly1, S_hi 1592 nop.i 999 ;; 1593} 1594 1595{ .mfi 1596 nop.m 999 1597// 1598// N even: Result1 = Result + r 1599// N odd: S_hi = S_hi * poly1 + S_hi 32 bits 1600// 1601(p12) fma.s1 poly1 = S_hi, r, f1 1602 nop.i 999 ;; 1603} 1604 1605{ .mfi 1606 nop.m 999 1607// 1608// N odd: poly1 = S_hi * r + 1.0 64 bits partial 1609// 1610(p12) fma.s1 S_hi = S_hi, poly1, S_hi 1611 nop.i 999 ;; 1612} 1613 1614{ .mfi 1615 nop.m 999 1616// 1617// N odd: poly1 = S_hi * poly + 1.0 64 bits 1618// 1619(p12) fma.s1 poly1 = S_hi, r, f1 1620 nop.i 999 ;; 1621} 1622 1623{ .mfi 1624 nop.m 999 1625// 1626// N odd: poly1 = S_hi * r + 1.0 1627// 1628(p12) fma.s1 poly1 = S_hi, c, poly1 1629 nop.i 999 ;; 1630} 1631 1632{ .mfi 1633 nop.m 999 1634// 1635// N odd: poly1 = S_hi * c + poly1 1636// 1637(p12) fmpy.s1 S_lo = S_hi, poly1 1638 nop.i 999 ;; 1639} 1640 1641{ .mfi 1642 nop.m 999 1643// 1644// N odd: S_lo = S_hi * poly1 1645// 1646(p12) fma.s1 S_lo = Q1_1, r, S_lo 1647 nop.i 999 1648} 1649 1650{ .mfi 1651 nop.m 999 1652// 1653// N odd: Result = S_hi + S_lo 1654// 1655(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact 1656 nop.i 999 ;; 1657} 1658 1659{ .mfb 1660 nop.m 999 1661// 1662// N odd: S_lo = S_lo + Q1_1 * r 1663// 1664(p12) fadd.s0 Result = S_hi, S_lo 1665(p0) br.ret.sptk b0 ;; 1666} 1667 1668 1669TAN_LARGER_ARG: 1670 1671{ .mmf 1672(p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp 1673 nop.m 999 1674(p0) fmpy.s1 N_0 = Arg, Inv_P_0 1675} 1676;; 1677 1678// 1679// ARGUMENT REDUCTION CODE - CASE 3 and 4 1680// 1681// 1682// Adjust table_ptr1 to beginning of table. 1683// N_0 = Arg * Inv_P_0 1684// 1685 1686 1687{ .mmi 1688(p0) ld8 table_ptr1 = [table_ptr1] 1689 nop.m 999 1690 nop.i 999 1691} 1692;; 1693 1694 1695{ .mmi 1696(p0) add table_ptr1 = 8, table_ptr1 ;; 1697// 1698// Point to 2*-14 1699// 1700(p0) ldfs TWO_TO_NEG14 = [table_ptr1], 4 1701 nop.i 999 ;; 1702} 1703// 1704// Load 2**(-14). 1705// 1706 1707{ .mmi 1708(p0) ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;; 1709// 1710// N_0_fix = integer part of N_0 . 1711// Adjust table_ptr1 to beginning of table. 1712// 1713(p0) ldfs TWO_TO_NEG2 = [table_ptr1], 4 1714 nop.i 999 ;; 1715} 1716// 1717// Make N_0 the integer part. 1718// 1719 1720{ .mfi 1721(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr1] 1722// 1723// Load -2**(-14). 1724// 1725(p0) fcvt.fx.s1 N_0_fix = N_0 1726 nop.i 999 ;; 1727} 1728 1729{ .mfi 1730 nop.m 999 1731(p0) fcvt.xf N_0 = N_0_fix 1732 nop.i 999 ;; 1733} 1734 1735{ .mfi 1736 nop.m 999 1737(p0) fnma.s1 ArgPrime = N_0, P_0, Arg 1738 nop.i 999 1739} 1740 1741{ .mfi 1742 nop.m 999 1743(p0) fmpy.s1 w = N_0, d_1 1744 nop.i 999 ;; 1745} 1746 1747{ .mfi 1748 nop.m 999 1749// 1750// ArgPrime = -N_0 * P_0 + Arg 1751// w = N_0 * d_1 1752// 1753(p0) fmpy.s1 N = ArgPrime, two_by_PI 1754 nop.i 999 ;; 1755} 1756 1757{ .mfi 1758 nop.m 999 1759// 1760// N = ArgPrime * 2/pi 1761// 1762(p0) fcvt.fx.s1 N_fix = N 1763 nop.i 999 ;; 1764} 1765 1766{ .mfi 1767 nop.m 999 1768// 1769// N_fix is the integer part. 1770// 1771(p0) fcvt.xf N = N_fix 1772 nop.i 999 ;; 1773} 1774 1775{ .mfi 1776(p0) getf.sig N_fix_gr = N_fix 1777 nop.f 999 1778 nop.i 999 ;; 1779} 1780 1781{ .mfi 1782 nop.m 999 1783// 1784// N is the integer part of the reduced-reduced argument. 1785// Put the integer in a GP register. 1786// 1787(p0) fnma.s1 s_val = N, P_1, ArgPrime 1788 nop.i 999 1789} 1790 1791{ .mfi 1792 nop.m 999 1793(p0) fnma.s1 w = N, P_2, w 1794 nop.i 999 ;; 1795} 1796 1797{ .mfi 1798 nop.m 999 1799// 1800// s_val = -N*P_1 + ArgPrime 1801// w = -N*P_2 + w 1802// 1803(p0) fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14 1804 nop.i 999 ;; 1805} 1806 1807{ .mfi 1808 nop.m 999 1809(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14 1810 nop.i 999 ;; 1811} 1812 1813{ .mfi 1814 nop.m 999 1815// 1816// Case 3: r = s_val + w (Z complete) 1817// Case 4: U_hi = N_0 * d_1 1818// 1819(p10) fmpy.s1 V_hi = N, P_2 1820 nop.i 999 1821} 1822 1823{ .mfi 1824 nop.m 999 1825(p11) fmpy.s1 U_hi = N_0, d_1 1826 nop.i 999 ;; 1827} 1828 1829{ .mfi 1830 nop.m 999 1831// 1832// Case 3: r = s_val + w (Z complete) 1833// Case 4: U_hi = N_0 * d_1 1834// 1835(p11) fmpy.s1 V_hi = N, P_2 1836 nop.i 999 1837} 1838 1839{ .mfi 1840 nop.m 999 1841(p11) fmpy.s1 U_hi = N_0, d_1 1842 nop.i 999 ;; 1843} 1844 1845{ .mfi 1846 nop.m 999 1847// 1848// Decide between case 3 and 4: 1849// Case 3: s <= -2**(-14) or s >= 2**(-14) 1850// Case 4: -2**(-14) < s < 2**(-14) 1851// 1852(p10) fadd.s1 r = s_val, w 1853 nop.i 999 1854} 1855 1856{ .mfi 1857 nop.m 999 1858(p11) fmpy.s1 w = N, P_3 1859 nop.i 999 ;; 1860} 1861 1862{ .mfi 1863 nop.m 999 1864// 1865// Case 4: We need abs of both U_hi and V_hi - dont 1866// worry about switched sign of V_hi . 1867// 1868(p11) fsub.s1 A = U_hi, V_hi 1869 nop.i 999 1870} 1871 1872{ .mfi 1873 nop.m 999 1874// 1875// Case 4: A = U_hi + V_hi 1876// Note: Worry about switched sign of V_hi, so subtract instead of add. 1877// 1878(p11) fnma.s1 V_lo = N, P_2, V_hi 1879 nop.i 999 ;; 1880} 1881 1882{ .mfi 1883 nop.m 999 1884(p11) fms.s1 U_lo = N_0, d_1, U_hi 1885 nop.i 999 ;; 1886} 1887 1888{ .mfi 1889 nop.m 999 1890(p11) fabs V_hiabs = V_hi 1891 nop.i 999 1892} 1893 1894{ .mfi 1895 nop.m 999 1896// 1897// Case 4: V_hi = N * P_2 1898// w = N * P_3 1899// Note the product does not include the (-) as in the writeup 1900// so (-) missing for V_hi and w . 1901(p10) fadd.s1 r = s_val, w 1902 nop.i 999 ;; 1903} 1904 1905{ .mfi 1906 nop.m 999 1907// 1908// Case 3: c = s_val - r 1909// Case 4: U_lo = N_0 * d_1 - U_hi 1910// 1911(p11) fabs U_hiabs = U_hi 1912 nop.i 999 1913} 1914 1915{ .mfi 1916 nop.m 999 1917(p11) fmpy.s1 w = N, P_3 1918 nop.i 999 ;; 1919} 1920 1921{ .mfi 1922 nop.m 999 1923// 1924// Case 4: Set P_12 if U_hiabs >= V_hiabs 1925// 1926(p11) fadd.s1 C_hi = s_val, A 1927 nop.i 999 ;; 1928} 1929 1930{ .mfi 1931 nop.m 999 1932// 1933// Case 4: C_hi = s_val + A 1934// 1935(p11) fadd.s1 t = U_lo, V_lo 1936 nop.i 999 ;; 1937} 1938 1939{ .mfi 1940 nop.m 999 1941// 1942// Case 3: Is |r| < 2**(-2), if so set PR_7 1943// else set PR_8. 1944// Case 3: If PR_7 is set, prepare to branch to Small_R. 1945// Case 3: If PR_8 is set, prepare to branch to Normal_R. 1946// 1947(p10) fsub.s1 c = s_val, r 1948 nop.i 999 ;; 1949} 1950 1951{ .mfi 1952 nop.m 999 1953// 1954// Case 3: c = (s - r) + w (c complete) 1955// 1956(p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs 1957 nop.i 999 1958} 1959 1960{ .mfi 1961 nop.m 999 1962(p11) fms.s1 w = N_0, d_2, w 1963 nop.i 999 ;; 1964} 1965 1966{ .mfi 1967 nop.m 999 1968// 1969// Case 4: V_hi = N * P_2 1970// w = N * P_3 1971// Note the product does not include the (-) as in the writeup 1972// so (-) missing for V_hi and w . 1973// 1974(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2 1975 nop.i 999 ;; 1976} 1977 1978{ .mfi 1979 nop.m 999 1980(p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2 1981 nop.i 999 ;; 1982} 1983 1984{ .mfb 1985 nop.m 999 1986// 1987// Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup) 1988// Note: the (-) is still missing for V_hi . 1989// Case 4: w = w + N_0 * d_2 1990// Note: the (-) is now incorporated in w . 1991// 1992(p10) fadd.s1 c = c, w 1993// 1994// Case 4: t = U_lo + V_lo 1995// Note: remember V_lo should be (-), subtract instead of add. NO 1996// 1997(p14) br.cond.spnt TAN_SMALL_R ;; 1998} 1999 2000{ .mib 2001 nop.m 999 2002 nop.i 999 2003(p15) br.cond.spnt TAN_NORMAL_R ;; 2004} 2005 2006{ .mfi 2007 nop.m 999 2008// 2009// Case 3: Vector off when |r| < 2**(-2). Recall that PR_3 will be true. 2010// The remaining stuff is for Case 4. 2011// 2012(p12) fsub.s1 a = U_hi, A 2013(p11) extr.u i_1 = N_fix_gr, 0, 1 ;; 2014} 2015 2016{ .mfi 2017 nop.m 999 2018// 2019// Case 4: C_lo = s_val - C_hi 2020// 2021(p11) fadd.s1 t = t, w 2022 nop.i 999 2023} 2024 2025{ .mfi 2026 nop.m 999 2027(p13) fadd.s1 a = V_hi, A 2028 nop.i 999 ;; 2029} 2030 2031// 2032// Case 4: a = U_hi - A 2033// a = V_hi - A (do an add to account for missing (-) on V_hi 2034// 2035 2036{ .mfi 2037(p11) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp 2038(p11) fsub.s1 C_lo = s_val, C_hi 2039 nop.i 999 2040} 2041;; 2042 2043{ .mmi 2044(p11) ld8 table_ptr1 = [table_ptr1] 2045 nop.m 999 2046 nop.i 999 2047} 2048;; 2049 2050// 2051// Case 4: a = (U_hi - A) + V_hi 2052// a = (V_hi - A) + U_hi 2053// In each case account for negative missing form V_hi . 2054// 2055// 2056// Case 4: C_lo = (s_val - C_hi) + A 2057// 2058 2059{ .mmi 2060(p11) add table_ptr1 = 224, table_ptr1 ;; 2061(p11) ldfe P1_1 = [table_ptr1], 16 2062 nop.i 999 ;; 2063} 2064 2065{ .mfi 2066(p11) ldfe P1_2 = [table_ptr1], 128 2067// 2068// Case 4: w = U_lo + V_lo + w 2069// 2070(p12) fsub.s1 a = a, V_hi 2071 nop.i 999 ;; 2072} 2073// 2074// Case 4: r = C_hi + C_lo 2075// 2076 2077{ .mfi 2078(p11) ldfe Q1_1 = [table_ptr1], 16 2079(p11) fadd.s1 C_lo = C_lo, A 2080 nop.i 999 ;; 2081} 2082// 2083// Case 4: c = C_hi - r 2084// Get [i_1] - lsb of N_fix_gr. 2085// 2086 2087{ .mfi 2088(p11) ldfe Q1_2 = [table_ptr1], 16 2089 nop.f 999 2090 nop.i 999 ;; 2091} 2092 2093{ .mfi 2094 nop.m 999 2095(p13) fsub.s1 a = U_hi, a 2096 nop.i 999 ;; 2097} 2098 2099{ .mfi 2100 nop.m 999 2101(p11) fadd.s1 t = t, a 2102 nop.i 999 ;; 2103} 2104 2105{ .mfi 2106 nop.m 999 2107// 2108// Case 4: t = t + a 2109// 2110(p11) fadd.s1 C_lo = C_lo, t 2111 nop.i 999 ;; 2112} 2113 2114{ .mfi 2115 nop.m 999 2116// 2117// Case 4: C_lo = C_lo + t 2118// 2119(p11) fadd.s1 r = C_hi, C_lo 2120 nop.i 999 ;; 2121} 2122 2123{ .mfi 2124 nop.m 999 2125(p11) fsub.s1 c = C_hi, r 2126 nop.i 999 2127} 2128 2129{ .mfi 2130 nop.m 999 2131// 2132// Case 4: c = c + C_lo finished. 2133// Is i_1 even or odd? 2134// if i_1 == 0, set PR_4, else set PR_5. 2135// 2136// r and c have been computed. 2137// We known whether this is the sine or cosine routine. 2138// Make sure ftz mode is set - should be automatic when using wre 2139(p0) fmpy.s1 rsq = r, r 2140 nop.i 999 ;; 2141} 2142 2143{ .mfi 2144 nop.m 999 2145(p11) fadd.s1 c = c , C_lo 2146(p11) cmp.eq.unc p11, p12 = 0x0000, i_1 ;; 2147} 2148 2149{ .mfi 2150 nop.m 999 2151(p12) frcpa.s1 S_hi, p0 = f1, r 2152 nop.i 999 2153} 2154 2155{ .mfi 2156 nop.m 999 2157// 2158// N odd: Change sign of S_hi 2159// 2160(p11) fma.s1 Result = rsq, P1_2, P1_1 2161 nop.i 999 ;; 2162} 2163 2164{ .mfi 2165 nop.m 999 2166(p12) fma.s1 P = rsq, Q1_2, Q1_1 2167 nop.i 999 2168} 2169 2170{ .mfi 2171 nop.m 999 2172// 2173// N odd: Result = S_hi + S_lo (User supplied rounding mode for C1) 2174// 2175(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact 2176 nop.i 999 ;; 2177} 2178 2179{ .mfi 2180 nop.m 999 2181// 2182// N even: rsq = r * r 2183// N odd: S_hi = frcpa(r) 2184// 2185(p12) fmerge.ns S_hi = S_hi, S_hi 2186 nop.i 999 2187} 2188 2189{ .mfi 2190 nop.m 999 2191// 2192// N even: rsq = rsq * P1_2 + P1_1 2193// N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary 2194// 2195(p11) fmpy.s1 Result = rsq, Result 2196 nop.i 999 ;; 2197} 2198 2199{ .mfi 2200 nop.m 999 2201(p12) fma.s1 poly1 = S_hi, r,f1 2202 nop.i 999 2203} 2204 2205{ .mfi 2206 nop.m 999 2207// 2208// N even: Result = Result * rsq 2209// N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary 2210// 2211(p11) fma.s1 Result = r, Result, c 2212 nop.i 999 ;; 2213} 2214 2215{ .mfi 2216 nop.m 999 2217(p12) fma.s1 S_hi = S_hi, poly1, S_hi 2218 nop.i 999 2219} 2220 2221{ .mfi 2222 nop.m 999 2223// 2224// N odd: S_hi = S_hi * poly1 + S_hi 32 bits 2225// 2226(p11) fadd.s0 Result= r, Result 2227 nop.i 999 ;; 2228} 2229 2230{ .mfi 2231 nop.m 999 2232(p12) fma.s1 poly1 = S_hi, r, f1 2233 nop.i 999 ;; 2234} 2235 2236{ .mfi 2237 nop.m 999 2238// 2239// N even: Result = Result * r + c 2240// N odd: poly1 = 1.0 + S_hi * r 32 bits partial 2241// 2242(p12) fma.s1 S_hi = S_hi, poly1, S_hi 2243 nop.i 999 ;; 2244} 2245 2246{ .mfi 2247 nop.m 999 2248(p12) fma.s1 poly1 = S_hi, r, f1 2249 nop.i 999 ;; 2250} 2251 2252{ .mfi 2253 nop.m 999 2254// 2255// N even: Result1 = Result + r (Rounding mode S0) 2256// N odd: poly1 = S_hi * r + 1.0 64 bits partial 2257// 2258(p12) fma.s1 S_hi = S_hi, poly1, S_hi 2259 nop.i 999 ;; 2260} 2261 2262{ .mfi 2263 nop.m 999 2264// 2265// N odd: poly1 = S_hi * poly + S_hi 64 bits 2266// 2267(p12) fma.s1 poly1 = S_hi, r, f1 2268 nop.i 999 ;; 2269} 2270 2271{ .mfi 2272 nop.m 999 2273// 2274// N odd: poly1 = S_hi * r + 1.0 2275// 2276(p12) fma.s1 poly1 = S_hi, c, poly1 2277 nop.i 999 ;; 2278} 2279 2280{ .mfi 2281 nop.m 999 2282// 2283// N odd: poly1 = S_hi * c + poly1 2284// 2285(p12) fmpy.s1 S_lo = S_hi, poly1 2286 nop.i 999 ;; 2287} 2288 2289{ .mfi 2290 nop.m 999 2291// 2292// N odd: S_lo = S_hi * poly1 2293// 2294(p12) fma.s1 S_lo = P, r, S_lo 2295 nop.i 999 ;; 2296} 2297 2298{ .mfb 2299 nop.m 999 2300// 2301// N odd: S_lo = S_lo + r * P 2302// 2303(p12) fadd.s0 Result = S_hi, S_lo 2304(p0) br.ret.sptk b0 ;; 2305} 2306 2307 2308TAN_SMALL_R: 2309 2310{ .mii 2311 nop.m 999 2312(p0) extr.u i_1 = N_fix_gr, 0, 1 ;; 2313(p0) cmp.eq.unc p11, p12 = 0x0000, i_1 2314} 2315 2316{ .mfi 2317 nop.m 999 2318(p0) fmpy.s1 rsq = r, r 2319 nop.i 999 ;; 2320} 2321 2322{ .mfi 2323 nop.m 999 2324(p12) frcpa.s1 S_hi, p0 = f1, r 2325 nop.i 999 2326} 2327 2328{ .mfi 2329(p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp 2330 nop.f 999 2331 nop.i 999 2332} 2333;; 2334 2335{ .mmi 2336(p0) ld8 table_ptr1 = [table_ptr1] 2337 nop.m 999 2338 nop.i 999 2339} 2340;; 2341 2342// ***************************************************************** 2343// ***************************************************************** 2344// ***************************************************************** 2345 2346{ .mmi 2347(p0) add table_ptr1 = 224, table_ptr1 ;; 2348(p0) ldfe P1_1 = [table_ptr1], 16 2349 nop.i 999 ;; 2350} 2351// r and c have been computed. 2352// We known whether this is the sine or cosine routine. 2353// Make sure ftz mode is set - should be automatic when using wre 2354// |r| < 2**(-2) 2355 2356{ .mfi 2357(p0) ldfe P1_2 = [table_ptr1], 16 2358(p11) fmpy.s1 r_to_the_8 = rsq, rsq 2359 nop.i 999 ;; 2360} 2361// 2362// Set table_ptr1 to beginning of constant table. 2363// Get [i_1] - lsb of N_fix_gr. 2364// 2365 2366{ .mfi 2367(p0) ldfe P1_3 = [table_ptr1], 96 2368// 2369// N even: rsq = r * r 2370// N odd: S_hi = frcpa(r) 2371// 2372(p12) fmerge.ns S_hi = S_hi, S_hi 2373 nop.i 999 ;; 2374} 2375// 2376// Is i_1 even or odd? 2377// if i_1 == 0, set PR_11. 2378// if i_1 != 0, set PR_12. 2379// 2380 2381{ .mfi 2382(p11) ldfe P1_9 = [table_ptr1], -16 2383// 2384// N even: Poly2 = P1_7 + Poly2 * rsq 2385// N odd: poly2 = Q1_5 + poly2 * rsq 2386// 2387(p11) fadd.s1 CORR = rsq, f1 2388 nop.i 999 ;; 2389} 2390 2391{ .mmi 2392(p11) ldfe P1_8 = [table_ptr1], -16 ;; 2393// 2394// N even: Poly1 = P1_2 + P1_3 * rsq 2395// N odd: poly1 = 1.0 + S_hi * r 2396// 16 bits partial account for necessary (-1) 2397// 2398(p11) ldfe P1_7 = [table_ptr1], -16 2399 nop.i 999 ;; 2400} 2401// 2402// N even: Poly1 = P1_1 + Poly1 * rsq 2403// N odd: S_hi = S_hi + S_hi * poly1) 16 bits account for necessary 2404// 2405 2406{ .mfi 2407(p11) ldfe P1_6 = [table_ptr1], -16 2408// 2409// N even: Poly2 = P1_5 + Poly2 * rsq 2410// N odd: poly2 = Q1_3 + poly2 * rsq 2411// 2412(p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8 2413 nop.i 999 ;; 2414} 2415// 2416// N even: Poly1 = Poly1 * rsq 2417// N odd: poly1 = 1.0 + S_hi * r 32 bits partial 2418// 2419 2420{ .mfi 2421(p11) ldfe P1_5 = [table_ptr1], -16 2422(p12) fma.s1 poly1 = S_hi, r, f1 2423 nop.i 999 ;; 2424} 2425// 2426// N even: CORR = CORR * c 2427// N odd: S_hi = S_hi * poly1 + S_hi 32 bits 2428// 2429 2430// 2431// N even: Poly2 = P1_6 + Poly2 * rsq 2432// N odd: poly2 = Q1_4 + poly2 * rsq 2433// 2434{ .mmf 2435(p0) addl table_ptr2 = @ltoff(TAN_BASE_CONSTANTS), gp 2436(p11) ldfe P1_4 = [table_ptr1], -16 2437(p11) fmpy.s1 CORR = CORR, c 2438} 2439;; 2440 2441 2442{ .mmi 2443(p0) ld8 table_ptr2 = [table_ptr2] 2444 nop.m 999 2445 nop.i 999 2446} 2447;; 2448 2449 2450{ .mii 2451(p0) add table_ptr2 = 464, table_ptr2 2452 nop.i 999 ;; 2453 nop.i 999 2454} 2455 2456{ .mfi 2457 nop.m 999 2458(p11) fma.s1 Poly1 = P1_3, rsq, P1_2 2459 nop.i 999 ;; 2460} 2461 2462{ .mfi 2463(p0) ldfe Q1_7 = [table_ptr2], -16 2464(p12) fma.s1 S_hi = S_hi, poly1, S_hi 2465 nop.i 999 ;; 2466} 2467 2468{ .mfi 2469(p0) ldfe Q1_6 = [table_ptr2], -16 2470(p11) fma.s1 Poly2 = P1_9, rsq, P1_8 2471 nop.i 999 ;; 2472} 2473 2474{ .mmi 2475(p0) ldfe Q1_5 = [table_ptr2], -16 ;; 2476(p12) ldfe Q1_4 = [table_ptr2], -16 2477 nop.i 999 ;; 2478} 2479 2480{ .mfi 2481(p12) ldfe Q1_3 = [table_ptr2], -16 2482// 2483// N even: Poly2 = P1_8 + P1_9 * rsq 2484// N odd: poly2 = Q1_6 + Q1_7 * rsq 2485// 2486(p11) fma.s1 Poly1 = Poly1, rsq, P1_1 2487 nop.i 999 ;; 2488} 2489 2490{ .mfi 2491(p12) ldfe Q1_2 = [table_ptr2], -16 2492(p12) fma.s1 poly1 = S_hi, r, f1 2493 nop.i 999 ;; 2494} 2495 2496{ .mfi 2497(p12) ldfe Q1_1 = [table_ptr2], -16 2498(p11) fma.s1 Poly2 = Poly2, rsq, P1_7 2499 nop.i 999 ;; 2500} 2501 2502{ .mfi 2503 nop.m 999 2504// 2505// N even: CORR = rsq + 1 2506// N even: r_to_the_8 = rsq * rsq 2507// 2508(p11) fmpy.s1 Poly1 = Poly1, rsq 2509 nop.i 999 ;; 2510} 2511 2512{ .mfi 2513 nop.m 999 2514(p12) fma.s1 S_hi = S_hi, poly1, S_hi 2515 nop.i 999 2516} 2517 2518{ .mfi 2519 nop.m 999 2520(p12) fma.s1 poly2 = Q1_7, rsq, Q1_6 2521 nop.i 999 ;; 2522} 2523 2524{ .mfi 2525 nop.m 999 2526(p11) fma.s1 Poly2 = Poly2, rsq, P1_6 2527 nop.i 999 ;; 2528} 2529 2530{ .mfi 2531 nop.m 999 2532(p12) fma.s1 poly1 = S_hi, r, f1 2533 nop.i 999 2534} 2535 2536{ .mfi 2537 nop.m 999 2538(p12) fma.s1 poly2 = poly2, rsq, Q1_5 2539 nop.i 999 ;; 2540} 2541 2542{ .mfi 2543 nop.m 999 2544(p11) fma.s1 Poly2= Poly2, rsq, P1_5 2545 nop.i 999 ;; 2546} 2547 2548{ .mfi 2549 nop.m 999 2550(p12) fma.s1 S_hi = S_hi, poly1, S_hi 2551 nop.i 999 2552} 2553 2554{ .mfi 2555 nop.m 999 2556(p12) fma.s1 poly2 = poly2, rsq, Q1_4 2557 nop.i 999 ;; 2558} 2559 2560{ .mfi 2561 nop.m 999 2562// 2563// N even: r_to_the_8 = r_to_the_8 * r_to_the_8 2564// N odd: poly1 = S_hi * r + 1.0 64 bits partial 2565// 2566(p11) fma.s1 Poly2 = Poly2, rsq, P1_4 2567 nop.i 999 ;; 2568} 2569 2570{ .mfi 2571 nop.m 999 2572// 2573// N even: Result = CORR + Poly * r 2574// N odd: P = Q1_1 + poly2 * rsq 2575// 2576(p12) fma.s1 poly1 = S_hi, r, f1 2577 nop.i 999 2578} 2579 2580{ .mfi 2581 nop.m 999 2582(p12) fma.s1 poly2 = poly2, rsq, Q1_3 2583 nop.i 999 ;; 2584} 2585 2586{ .mfi 2587 nop.m 999 2588// 2589// N even: Poly2 = P1_4 + Poly2 * rsq 2590// N odd: poly2 = Q1_2 + poly2 * rsq 2591// 2592(p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1 2593 nop.i 999 ;; 2594} 2595 2596{ .mfi 2597 nop.m 999 2598(p12) fma.s1 poly1 = S_hi, c, poly1 2599 nop.i 999 2600} 2601 2602{ .mfi 2603 nop.m 999 2604(p12) fma.s1 poly2 = poly2, rsq, Q1_2 2605 nop.i 999 ;; 2606} 2607 2608{ .mfi 2609 nop.m 999 2610// 2611// N even: Poly = Poly1 + Poly2 * r_to_the_8 2612// N odd: S_hi = S_hi * poly1 + S_hi 64 bits 2613// 2614(p11) fma.s1 Result = Poly, r, CORR 2615 nop.i 999 ;; 2616} 2617 2618{ .mfi 2619 nop.m 999 2620// 2621// N even: Result = r + Result (User supplied rounding mode) 2622// N odd: poly1 = S_hi * c + poly1 2623// 2624(p12) fmpy.s1 S_lo = S_hi, poly1 2625 nop.i 999 2626} 2627 2628{ .mfi 2629 nop.m 999 2630(p12) fma.s1 P = poly2, rsq, Q1_1 2631 nop.i 999 ;; 2632} 2633 2634{ .mfi 2635 nop.m 999 2636// 2637// N odd: poly1 = S_hi * r + 1.0 2638// 2639(p11) fadd.s0 Result = Result, r 2640 nop.i 999 ;; 2641} 2642 2643{ .mfi 2644 nop.m 999 2645// 2646// N odd: S_lo = S_hi * poly1 2647// 2648(p12) fma.s1 S_lo = Q1_1, c, S_lo 2649 nop.i 999 2650} 2651 2652{ .mfi 2653 nop.m 999 2654// 2655// N odd: Result = Result + S_hi (user supplied rounding mode) 2656// 2657(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact 2658 nop.i 999 ;; 2659} 2660 2661{ .mfi 2662 nop.m 999 2663// 2664// N odd: S_lo = Q1_1 * c + S_lo 2665// 2666(p12) fma.s1 Result = P, r, S_lo 2667 nop.i 999 ;; 2668} 2669 2670{ .mfb 2671 nop.m 999 2672// 2673// N odd: Result = S_lo + r * P 2674// 2675(p12) fadd.s0 Result = Result, S_hi 2676(p0) br.ret.sptk b0 ;; 2677} 2678 2679 2680TAN_NORMAL_R: 2681 2682{ .mfi 2683(p0) getf.sig sig_r = r 2684// ******************************************************************* 2685// ******************************************************************* 2686// ******************************************************************* 2687// 2688// r and c have been computed. 2689// Make sure ftz mode is set - should be automatic when using wre 2690// 2691// 2692// Get [i_1] - lsb of N_fix_gr alone. 2693// 2694(p0) fmerge.s Pos_r = f1, r 2695(p0) extr.u i_1 = N_fix_gr, 0, 1 ;; 2696} 2697 2698{ .mfi 2699 nop.m 999 2700(p0) fmerge.s sgn_r = r, f1 2701(p0) cmp.eq.unc p11, p12 = 0x0000, i_1 ;; 2702} 2703 2704{ .mfi 2705 nop.m 999 2706 nop.f 999 2707(p0) extr.u lookup = sig_r, 58, 5 2708} 2709 2710{ .mlx 2711 nop.m 999 2712(p0) movl Create_B = 0x8200000000000000 ;; 2713} 2714 2715{ .mfi 2716(p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp 2717 nop.f 999 2718(p0) dep Create_B = lookup, Create_B, 58, 5 2719} 2720;; 2721 2722// 2723// Get [i_1] - lsb of N_fix_gr alone. 2724// Pos_r = abs (r) 2725// 2726 2727 2728{ .mmi 2729 ld8 table_ptr1 = [table_ptr1] 2730 nop.m 999 2731 nop.i 999 2732} 2733;; 2734 2735 2736{ .mmi 2737 nop.m 999 2738(p0) setf.sig B = Create_B 2739// 2740// Set table_ptr1 and table_ptr2 to base address of 2741// constant table. 2742// 2743(p0) add table_ptr1 = 480, table_ptr1 ;; 2744} 2745 2746{ .mmb 2747 nop.m 999 2748// 2749// Is i_1 or i_0 == 0 ? 2750// Create the constant 1 00000 1000000000000000000000... 2751// 2752(p0) ldfe P2_1 = [table_ptr1], 16 2753 nop.b 999 2754} 2755 2756{ .mmi 2757 nop.m 999 ;; 2758(p0) getf.exp exp_r = Pos_r 2759 nop.i 999 2760} 2761// 2762// Get r's exponent 2763// Get r's significand 2764// 2765 2766{ .mmi 2767(p0) ldfe P2_2 = [table_ptr1], 16 ;; 2768// 2769// Get the 5 bits or r for the lookup. 1.xxxxx .... 2770// from sig_r. 2771// Grab lsb of exp of B 2772// 2773(p0) ldfe P2_3 = [table_ptr1], 16 2774 nop.i 999 ;; 2775} 2776 2777{ .mii 2778 nop.m 999 2779(p0) andcm table_offset = 0x0001, exp_r ;; 2780(p0) shl table_offset = table_offset, 9 ;; 2781} 2782 2783{ .mii 2784 nop.m 999 2785// 2786// Deposit 0 00000 1000000000000000000000... on 2787// 1 xxxxx yyyyyyyyyyyyyyyyyyyyyy..., 2788// getting rid of the ys. 2789// Is B = 2** -2 or B= 2** -1? If 2**-1, then 2790// we want an offset of 512 for table addressing. 2791// 2792(p0) shladd table_offset = lookup, 4, table_offset ;; 2793// 2794// B = ........ 1xxxxx 1000000000000000000... 2795// 2796(p0) add table_ptr1 = table_ptr1, table_offset ;; 2797} 2798 2799{ .mmb 2800 nop.m 999 2801// 2802// B = ........ 1xxxxx 1000000000000000000... 2803// Convert B so it has the same exponent as Pos_r 2804// 2805(p0) ldfd T_hi = [table_ptr1], 8 2806 nop.b 999 ;; 2807} 2808 2809// 2810// x = |r| - B 2811// Load T_hi. 2812// Load C_hi. 2813// 2814 2815{ .mmf 2816(p0) addl table_ptr2 = @ltoff(TAN_BASE_CONSTANTS), gp 2817(p0) ldfs T_lo = [table_ptr1] 2818(p0) fmerge.se B = Pos_r, B 2819} 2820;; 2821 2822{ .mmi 2823 ld8 table_ptr2 = [table_ptr2] 2824 nop.m 999 2825 nop.i 999 2826} 2827;; 2828 2829{ .mii 2830(p0) add table_ptr2 = 1360, table_ptr2 2831 nop.i 999 ;; 2832(p0) add table_ptr2 = table_ptr2, table_offset ;; 2833} 2834 2835{ .mfi 2836(p0) ldfd C_hi = [table_ptr2], 8 2837(p0) fsub.s1 x = Pos_r, B 2838 nop.i 999 ;; 2839} 2840 2841{ .mii 2842(p0) ldfs C_lo = [table_ptr2],255 2843 nop.i 999 ;; 2844// 2845// xsq = x * x 2846// N even: Tx = T_hi * x 2847// Load T_lo. 2848// Load C_lo - increment pointer to get SC_inv 2849// - cant get all the way, do an add later. 2850// 2851(p0) add table_ptr2 = 569, table_ptr2 ;; 2852} 2853// 2854// N even: Tx1 = Tx + 1 2855// N odd: Cx1 = 1 - Cx 2856// 2857 2858{ .mfi 2859(p0) ldfe SC_inv = [table_ptr2], 0 2860 nop.f 999 2861 nop.i 999 ;; 2862} 2863 2864{ .mfi 2865 nop.m 999 2866(p0) fmpy.s1 xsq = x, x 2867 nop.i 999 2868} 2869 2870{ .mfi 2871 nop.m 999 2872(p11) fmpy.s1 Tx = T_hi, x 2873 nop.i 999 ;; 2874} 2875 2876{ .mfi 2877 nop.m 999 2878(p12) fmpy.s1 Cx = C_hi, x 2879 nop.i 999 ;; 2880} 2881 2882{ .mfi 2883 nop.m 999 2884// 2885// N odd: Cx = C_hi * x 2886// 2887(p0) fma.s1 P = P2_3, xsq, P2_2 2888 nop.i 999 2889} 2890 2891{ .mfi 2892 nop.m 999 2893// 2894// N even and odd: P = P2_3 + P2_2 * xsq 2895// 2896(p11) fadd.s1 Tx1 = Tx, f1 2897 nop.i 999 ;; 2898} 2899 2900{ .mfi 2901 nop.m 999 2902// 2903// N even: D = C_hi - tanx 2904// N odd: D = T_hi + tanx 2905// 2906(p11) fmpy.s1 CORR = SC_inv, T_hi 2907 nop.i 999 2908} 2909 2910{ .mfi 2911 nop.m 999 2912(p0) fmpy.s1 Sx = SC_inv, x 2913 nop.i 999 ;; 2914} 2915 2916{ .mfi 2917 nop.m 999 2918(p12) fmpy.s1 CORR = SC_inv, C_hi 2919 nop.i 999 ;; 2920} 2921 2922{ .mfi 2923 nop.m 999 2924(p12) fsub.s1 V_hi = f1, Cx 2925 nop.i 999 ;; 2926} 2927 2928{ .mfi 2929 nop.m 999 2930(p0) fma.s1 P = P, xsq, P2_1 2931 nop.i 999 2932} 2933 2934{ .mfi 2935 nop.m 999 2936// 2937// N even and odd: P = P2_1 + P * xsq 2938// 2939(p11) fma.s1 V_hi = Tx, Tx1, f1 2940 nop.i 999 ;; 2941} 2942 2943{ .mfi 2944 nop.m 999 2945// 2946// N even: Result = sgn_r * tail + T_hi (user rounding mode for C1) 2947// N odd: Result = sgn_r * tail + C_hi (user rounding mode for C1) 2948// 2949(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact 2950 nop.i 999 ;; 2951} 2952 2953{ .mfi 2954 nop.m 999 2955(p0) fmpy.s1 CORR = CORR, c 2956 nop.i 999 ;; 2957} 2958 2959{ .mfi 2960 nop.m 999 2961(p12) fnma.s1 V_hi = Cx,V_hi,f1 2962 nop.i 999 ;; 2963} 2964 2965{ .mfi 2966 nop.m 999 2967// 2968// N even: V_hi = Tx * Tx1 + 1 2969// N odd: Cx1 = 1 - Cx * Cx1 2970// 2971(p0) fmpy.s1 P = P, xsq 2972 nop.i 999 2973} 2974 2975{ .mfi 2976 nop.m 999 2977// 2978// N even and odd: P = P * xsq 2979// 2980(p11) fmpy.s1 V_hi = V_hi, T_hi 2981 nop.i 999 ;; 2982} 2983 2984{ .mfi 2985 nop.m 999 2986// 2987// N even and odd: tail = P * tail + V_lo 2988// 2989(p11) fmpy.s1 T_hi = sgn_r, T_hi 2990 nop.i 999 ;; 2991} 2992 2993{ .mfi 2994 nop.m 999 2995(p0) fmpy.s1 CORR = CORR, sgn_r 2996 nop.i 999 ;; 2997} 2998 2999{ .mfi 3000 nop.m 999 3001(p12) fmpy.s1 V_hi = V_hi,C_hi 3002 nop.i 999 ;; 3003} 3004 3005{ .mfi 3006 nop.m 999 3007// 3008// N even: V_hi = T_hi * V_hi 3009// N odd: V_hi = C_hi * V_hi 3010// 3011(p0) fma.s1 tanx = P, x, x 3012 nop.i 999 3013} 3014 3015{ .mfi 3016 nop.m 999 3017(p12) fnmpy.s1 C_hi = sgn_r, C_hi 3018 nop.i 999 ;; 3019} 3020 3021{ .mfi 3022 nop.m 999 3023// 3024// N even: V_lo = 1 - V_hi + C_hi 3025// N odd: V_lo = 1 - V_hi + T_hi 3026// 3027(p11) fadd.s1 CORR = CORR, T_lo 3028 nop.i 999 3029} 3030 3031{ .mfi 3032 nop.m 999 3033(p12) fsub.s1 CORR = CORR, C_lo 3034 nop.i 999 ;; 3035} 3036 3037{ .mfi 3038 nop.m 999 3039// 3040// N even and odd: tanx = x + x * P 3041// N even and odd: Sx = SC_inv * x 3042// 3043(p11) fsub.s1 D = C_hi, tanx 3044 nop.i 999 3045} 3046 3047{ .mfi 3048 nop.m 999 3049(p12) fadd.s1 D = T_hi, tanx 3050 nop.i 999 ;; 3051} 3052 3053{ .mfi 3054 nop.m 999 3055// 3056// N odd: CORR = SC_inv * C_hi 3057// N even: CORR = SC_inv * T_hi 3058// 3059(p0) fnma.s1 D = V_hi, D, f1 3060 nop.i 999 ;; 3061} 3062 3063{ .mfi 3064 nop.m 999 3065// 3066// N even and odd: D = 1 - V_hi * D 3067// N even and odd: CORR = CORR * c 3068// 3069(p0) fma.s1 V_hi = V_hi, D, V_hi 3070 nop.i 999 ;; 3071} 3072 3073{ .mfi 3074 nop.m 999 3075// 3076// N even and odd: V_hi = V_hi + V_hi * D 3077// N even and odd: CORR = sgn_r * CORR 3078// 3079(p11) fnma.s1 V_lo = V_hi, C_hi, f1 3080 nop.i 999 3081} 3082 3083{ .mfi 3084 nop.m 999 3085(p12) fnma.s1 V_lo = V_hi, T_hi, f1 3086 nop.i 999 ;; 3087} 3088 3089{ .mfi 3090 nop.m 999 3091// 3092// N even: CORR = COOR + T_lo 3093// N odd: CORR = CORR - C_lo 3094// 3095(p11) fma.s1 V_lo = tanx, V_hi, V_lo 3096 nop.i 999 3097} 3098 3099{ .mfi 3100 nop.m 999 3101(p12) fnma.s1 V_lo = tanx, V_hi, V_lo 3102 nop.i 999 ;; 3103} 3104 3105{ .mfi 3106 nop.m 999 3107// 3108// N even: V_lo = V_lo + V_hi * tanx 3109// N odd: V_lo = V_lo - V_hi * tanx 3110// 3111(p11) fnma.s1 V_lo = C_lo, V_hi, V_lo 3112 nop.i 999 3113} 3114 3115{ .mfi 3116 nop.m 999 3117(p12) fnma.s1 V_lo = T_lo, V_hi, V_lo 3118 nop.i 999 ;; 3119} 3120 3121{ .mfi 3122 nop.m 999 3123// 3124// N even: V_lo = V_lo - V_hi * C_lo 3125// N odd: V_lo = V_lo - V_hi * T_lo 3126// 3127(p0) fmpy.s1 V_lo = V_hi, V_lo 3128 nop.i 999 ;; 3129} 3130 3131{ .mfi 3132 nop.m 999 3133// 3134// N even and odd: V_lo = V_lo * V_hi 3135// 3136(p0) fadd.s1 tail = V_hi, V_lo 3137 nop.i 999 ;; 3138} 3139 3140{ .mfi 3141 nop.m 999 3142// 3143// N even and odd: tail = V_hi + V_lo 3144// 3145(p0) fma.s1 tail = tail, P, V_lo 3146 nop.i 999 ;; 3147} 3148 3149{ .mfi 3150 nop.m 999 3151// 3152// N even: T_hi = sgn_r * T_hi 3153// N odd : C_hi = -sgn_r * C_hi 3154// 3155(p0) fma.s1 tail = tail, Sx, CORR 3156 nop.i 999 ;; 3157} 3158 3159{ .mfi 3160 nop.m 999 3161// 3162// N even and odd: tail = Sx * tail + CORR 3163// 3164(p0) fma.s1 tail = V_hi, Sx, tail 3165 nop.i 999 ;; 3166} 3167 3168{ .mfi 3169 nop.m 999 3170// 3171// N even an odd: tail = Sx * V_hi + tail 3172// 3173(p11) fma.s0 Result = sgn_r, tail, T_hi 3174 nop.i 999 3175} 3176 3177{ .mfb 3178 nop.m 999 3179(p12) fma.s0 Result = sgn_r, tail, C_hi 3180(p0) br.ret.sptk b0 ;; 3181} 3182 3183.endp __libm_tan 3184ASM_SIZE_DIRECTIVE(__libm_tan) 3185 3186 3187 3188// ******************************************************************* 3189// ******************************************************************* 3190// ******************************************************************* 3191// 3192// Special Code to handle very large argument case. 3193// Call int pi_by_2_reduce(&x,&r) 3194// for |arguments| >= 2**63 3195// (Arg or x) is in f8 3196// Address to save r and c as double 3197 3198// (1) (2) (3) (call) (4) 3199// sp -> + psp -> + psp -> + sp -> + 3200// | | | | 3201// | r50 ->| <- r50 f0 ->| r50 -> | -> c 3202// | | | | 3203// sp-32 -> | <- r50 f0 ->| f0 ->| <- r50 r49 -> | -> r 3204// | | | | 3205// | r49 ->| <- r49 Arg ->| <- r49 | -> x 3206// | | | | 3207// sp -64 ->| sp -64 ->| sp -64 ->| | 3208// 3209// save pfs save b0 restore gp 3210// save gp restore b0 3211// restore pfs 3212 3213 3214 3215.proc __libm_callout 3216__libm_callout: 3217TAN_ARG_TOO_LARGE: 3218.prologue 3219// (1) 3220{ .mfi 3221 add GR_Parameter_r =-32,sp // Parameter: r address 3222 nop.f 0 3223.save ar.pfs,GR_SAVE_PFS 3224 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 3225} 3226{ .mfi 3227.fframe 64 3228 add sp=-64,sp // Create new stack 3229 nop.f 0 3230 mov GR_SAVE_GP=gp // Save gp 3231};; 3232 3233// (2) 3234{ .mmi 3235 stfe [GR_Parameter_r ] = f0,16 // Clear Parameter r on stack 3236 add GR_Parameter_X = 16,sp // Parameter x address 3237.save b0, GR_SAVE_B0 3238 mov GR_SAVE_B0=b0 // Save b0 3239};; 3240 3241// (3) 3242.body 3243{ .mib 3244 stfe [GR_Parameter_r ] = f0,-16 // Clear Parameter c on stack 3245 nop.i 0 3246 nop.b 0 3247} 3248{ .mib 3249 stfe [GR_Parameter_X] = Arg // Store Parameter x on stack 3250 nop.i 0 3251(p0) br.call.sptk b0=__libm_pi_by_2_reduce# 3252} 3253;; 3254 3255 3256// (4) 3257{ .mmi 3258 mov gp = GR_SAVE_GP // Restore gp 3259(p0) mov N_fix_gr = r8 3260 nop.i 999 3261} 3262;; 3263 3264{ .mmi 3265(p0) ldfe Arg =[GR_Parameter_X],16 3266(p0) ldfs TWO_TO_NEG2 = [table_ptr2],4 3267 nop.i 999 3268} 3269;; 3270 3271 3272{ .mmb 3273(p0) ldfe r =[GR_Parameter_r ],16 3274(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],4 3275 nop.b 999 ;; 3276} 3277 3278{ .mfi 3279(p0) ldfe c =[GR_Parameter_r ] 3280 nop.f 999 3281 nop.i 999 ;; 3282} 3283 3284{ .mfi 3285 nop.m 999 3286// 3287// Is |r| < 2**(-2) 3288// 3289(p0) fcmp.lt.unc.s1 p6, p0 = r, TWO_TO_NEG2 3290 mov b0 = GR_SAVE_B0 // Restore return address 3291} 3292;; 3293 3294{ .mfi 3295 nop.m 999 3296(p6) fcmp.gt.unc.s1 p6, p0 = r, NEGTWO_TO_NEG2 3297 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 3298} 3299;; 3300 3301{ .mbb 3302.restore sp 3303 add sp = 64,sp // Restore stack pointer 3304(p6) br.cond.spnt TAN_SMALL_R 3305(p0) br.cond.sptk TAN_NORMAL_R 3306} 3307;; 3308.endp __libm_callout 3309ASM_SIZE_DIRECTIVE(__libm_callout) 3310 3311 3312.proc __libm_TAN_SPECIAL 3313__libm_TAN_SPECIAL: 3314 3315// 3316// Code for NaNs, Unsupporteds, Infs, or +/- zero ? 3317// Invalid raised for Infs and SNaNs. 3318// 3319 3320{ .mfb 3321 nop.m 999 3322(p0) fmpy.s0 Arg = Arg, f0 3323(p0) br.ret.sptk b0 3324} 3325.endp __libm_TAN_SPECIAL 3326ASM_SIZE_DIRECTIVE(__libm_TAN_SPECIAL) 3327 3328 3329.type __libm_pi_by_2_reduce#,@function 3330.global __libm_pi_by_2_reduce# 3331