1.file "libm_reduce.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39// History: 40// 02/02/00 Initial Version 41// 05/13/02 Rescheduled for speed, changed interface to pass 42// parameters in fp registers 43// 02/10/03 Reordered header: .section, .global, .proc, .align; 44// used data8 for long double data storage 45// 46//********************************************************************* 47//********************************************************************* 48// 49// Function: __libm_pi_by_two_reduce(x) return r, c, and N where 50// x = N * pi/4 + (r+c) , where |r+c| <= pi/4. 51// This function is not designed to be used by the 52// general user. 53// 54//********************************************************************* 55// 56// Accuracy: Returns double-precision values 57// 58//********************************************************************* 59// 60// Resources Used: 61// 62// Floating-Point Registers: 63// f8 = Input x, return value r 64// f9 = return value c 65// f32-f70 66// 67// General Purpose Registers: 68// r8 = return value N 69// r34-r64 70// 71// Predicate Registers: p6-p14 72// 73//********************************************************************* 74// 75// IEEE Special Conditions: 76// 77// No conditions should be raised. 78// 79//********************************************************************* 80// 81// I. Introduction 82// =============== 83// 84// For the forward trigonometric functions sin, cos, sincos, and 85// tan, the original algorithms for IA 64 handle arguments up to 86// 1 ulp less than 2^63 in magnitude. For double-extended arguments x, 87// |x| >= 2^63, this routine returns N and r_hi, r_lo where 88// 89// x is accurately approximated by 90// 2*K*pi + N * pi/2 + r_hi + r_lo, |r_hi+r_lo| <= pi/4. 91// CASE = 1 or 2. 92// CASE is 1 unless |r_hi + r_lo| < 2^(-33). 93// 94// The exact value of K is not determined, but that information is 95// not required in trigonometric function computations. 96// 97// We first assume the argument x in question satisfies x >= 2^(63). 98// In particular, it is positive. Negative x can be handled by symmetry: 99// 100// -x is accurately approximated by 101// -2*K*pi + (-N) * pi/2 - (r_hi + r_lo), |r_hi+r_lo| <= pi/4. 102// 103// The idea of the reduction is that 104// 105// x * 2/pi = N_big + N + f, |f| <= 1/2 106// 107// Moreover, for double extended x, |f| >= 2^(-75). (This is an 108// non-obvious fact found by enumeration using a special algorithm 109// involving continued fraction.) The algorithm described below 110// calculates N and an accurate approximation of f. 111// 112// Roughly speaking, an appropriate 256-bit (4 X 64) portion of 113// 2/pi is multiplied with x to give the desired information. 114// 115// II. Representation of 2/PI 116// ========================== 117// 118// The value of 2/pi in binary fixed-point is 119// 120// .101000101111100110...... 121// 122// We store 2/pi in a table, starting at the position corresponding 123// to bit position 63 124// 125// bit position 63 62 ... 0 -1 -2 -3 -4 -5 -6 -7 .... -16576 126// 127// 0 0 ... 0 . 1 0 1 0 1 0 1 .... X 128// 129// ^ 130// |__ implied binary pt 131// 132// III. Algorithm 133// ============== 134// 135// This describes the algorithm in the most natural way using 136// unsigned interger multiplication. The implementation section 137// describes how the integer arithmetic is simulated. 138// 139// STEP 0. Initialization 140// ---------------------- 141// 142// Let the input argument x be 143// 144// x = 2^m * ( 1. b_1 b_2 b_3 ... b_63 ), 63 <= m <= 16383. 145// 146// The first crucial step is to fetch four 64-bit portions of 2/pi. 147// To fulfill this goal, we calculate the bit position L of the 148// beginning of these 256-bit quantity by 149// 150// L := 62 - m. 151// 152// Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that 153// the storage of 2/pi is adequate. 154// 155// Fetch P_1, P_2, P_3, P_4 beginning at bit position L thus: 156// 157// bit position L L-1 L-2 ... L-63 158// 159// P_1 = b b b ... b 160// 161// each b can be 0 or 1. Also, let P_0 be the two bits correspoding to 162// bit positions L+2 and L+1. So, when each of the P_j is interpreted 163// with appropriate scaling, we have 164// 165// 2/pi = P_big + P_0 + (P_1 + P_2 + P_3 + P_4) + P_small 166// 167// Note that P_big and P_small can be ignored. The reasons are as follow. 168// First, consider P_big. If P_big = 0, we can certainly ignore it. 169// Otherwise, P_big >= 2^(L+3). Now, 170// 171// P_big * ulp(x) >= 2^(L+3) * 2^(m-63) 172// >= 2^(65-m + m-63 ) 173// >= 2^2 174// 175// Thus, P_big * x is an integer of the form 4*K. So 176// 177// x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2) 178// + x*P_small*(pi/2). 179// 180// Hence, P_big*x corresponds to information that can be ignored for 181// trigonometic function evaluation. 182// 183// Next, we must estimate the effect of ignoring P_small. The absolute 184// error made by ignoring P_small is bounded by 185// 186// |P_small * x| <= ulp(P_4) * x 187// <= 2^(L-255) * 2^(m+1) 188// <= 2^(62-m-255 + m + 1) 189// <= 2^(-192) 190// 191// Since for double-extended precision, x * 2/pi = integer + f, 192// 0.5 >= |f| >= 2^(-75), the relative error introduced by ignoring 193// P_small is bounded by 2^(-192+75) <= 2^(-117), which is acceptable. 194// 195// Further note that if x is split into x_hi + x_lo where x_lo is the 196// two bits corresponding to bit positions 2^(m-62) and 2^(m-63); then 197// 198// P_0 * x_hi 199// 200// is also an integer of the form 4*K; and thus can also be ignored. 201// Let M := P_0 * x_lo which is a small integer. The main part of the 202// calculation is really the multiplication of x with the four pieces 203// P_1, P_2, P_3, and P_4. 204// 205// Unless the reduced argument is extremely small in magnitude, it 206// suffices to carry out the multiplication of x with P_1, P_2, and 207// P_3. x*P_4 will be carried out and added on as a correction only 208// when it is found to be needed. Note also that x*P_4 need not be 209// computed exactly. A straightforward multiplication suffices since 210// the rounding error thus produced would be bounded by 2^(-3*64), 211// that is 2^(-192) which is small enough as the reduced argument 212// is bounded from below by 2^(-75). 213// 214// Now that we have four 64-bit data representing 2/pi and a 215// 64-bit x. We first need to calculate a highly accurate product 216// of x and P_1, P_2, P_3. This is best understood as integer 217// multiplication. 218// 219// 220// STEP 1. Multiplication 221// ---------------------- 222// 223// 224// --------- --------- --------- 225// | P_1 | | P_2 | | P_3 | 226// --------- --------- --------- 227// 228// --------- 229// X | X | 230// --------- 231// ---------------------------------------------------- 232// 233// --------- --------- 234// | A_hi | | A_lo | 235// --------- --------- 236// 237// 238// --------- --------- 239// | B_hi | | B_lo | 240// --------- --------- 241// 242// 243// --------- --------- 244// | C_hi | | C_lo | 245// --------- --------- 246// 247// ==================================================== 248// --------- --------- --------- --------- 249// | S_0 | | S_1 | | S_2 | | S_3 | 250// --------- --------- --------- --------- 251// 252// 253// 254// STEP 2. Get N and f 255// ------------------- 256// 257// Conceptually, after the individual pieces S_0, S_1, ..., are obtained, 258// we have to sum them and obtain an integer part, N, and a fraction, f. 259// Here, |f| <= 1/2, and N is an integer. Note also that N need only to 260// be known to module 2^k, k >= 2. In the case when |f| is small enough, 261// we would need to add in the value x*P_4. 262// 263// 264// STEP 3. Get reduced argument 265// ---------------------------- 266// 267// The value f is not yet the reduced argument that we seek. The 268// equation 269// 270// x * 2/pi = 4K + N + f 271// 272// says that 273// 274// x = 2*K*pi + N * pi/2 + f * (pi/2). 275// 276// Thus, the reduced argument is given by 277// 278// reduced argument = f * pi/2. 279// 280// This multiplication must be performed to extra precision. 281// 282// IV. Implementation 283// ================== 284// 285// Step 0. Initialization 286// ---------------------- 287// 288// Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x. 289// 290// In memory, 2/pi is stored contiguously as 291// 292// 0x00000000 0x00000000 0xA2F.... 293// ^ 294// |__ implied binary bit 295// 296// Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. Thus 297// -1 <= L <= -16321. We fetch from memory 5 integer pieces of data. 298// 299// P_0 is the two bits corresponding to bit positions L+2 and L+1 300// P_1 is the 64-bit starting at bit position L 301// P_2 is the 64-bit starting at bit position L-64 302// P_3 is the 64-bit starting at bit position L-128 303// P_4 is the 64-bit starting at bit position L-192 304// 305// For example, if m = 63, P_0 would be 0 and P_1 would look like 306// 0xA2F... 307// 308// If m = 65, P_0 would be the two msb of 0xA, thus, P_0 is 10 in binary. 309// P_1 in binary would be 1 0 0 0 1 0 1 1 1 1 .... 310// 311// Step 1. Multiplication 312// ---------------------- 313// 314// At this point, P_1, P_2, P_3, P_4 are integers. They are 315// supposed to be interpreted as 316// 317// 2^(L-63) * P_1; 318// 2^(L-63-64) * P_2; 319// 2^(L-63-128) * P_3; 320// 2^(L-63-192) * P_4; 321// 322// Since each of them need to be multiplied to x, we would scale 323// both x and the P_j's by some convenient factors: scale each 324// of P_j's up by 2^(63-L), and scale x down by 2^(L-63). 325// 326// p_1 := fcvt.xf ( P_1 ) 327// p_2 := fcvt.xf ( P_2 ) * 2^(-64) 328// p_3 := fcvt.xf ( P_3 ) * 2^(-128) 329// p_4 := fcvt.xf ( P_4 ) * 2^(-192) 330// x := replace exponent of x by -1 331// because 2^m * 1.xxxx...xxx * 2^(L-63) 332// is 2^(-1) * 1.xxxx...xxx 333// 334// We are now faced with the task of computing the following 335// 336// --------- --------- --------- 337// | P_1 | | P_2 | | P_3 | 338// --------- --------- --------- 339// 340// --------- 341// X | X | 342// --------- 343// ---------------------------------------------------- 344// 345// --------- --------- 346// | A_hi | | A_lo | 347// --------- --------- 348// 349// --------- --------- 350// | B_hi | | B_lo | 351// --------- --------- 352// 353// --------- --------- 354// | C_hi | | C_lo | 355// --------- --------- 356// 357// ==================================================== 358// ----------- --------- --------- --------- 359// | S_0 | | S_1 | | S_2 | | S_3 | 360// ----------- --------- --------- --------- 361// ^ ^ 362// | |___ binary point 363// | 364// |___ possibly one more bit 365// 366// Let FPSR3 be set to round towards zero with widest precision 367// and exponent range. Unless an explicit FPSR is given, 368// round-to-nearest with widest precision and exponent range is 369// used. 370// 371// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_C := 2^(-65). 372// 373// Tmp_C := fmpy.fpsr3( x, p_1 ); 374// If Tmp_C >= sigma_C then 375// C_hi := Tmp_C; 376// C_lo := x*p_1 - C_hi ...fma, exact 377// Else 378// C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C 379// ...subtraction is exact, regardless 380// ...of rounding direction 381// C_lo := x*p_1 - C_hi ...fma, exact 382// End If 383// 384// Tmp_B := fmpy.fpsr3( x, p_2 ); 385// If Tmp_B >= sigma_B then 386// B_hi := Tmp_B; 387// B_lo := x*p_2 - B_hi ...fma, exact 388// Else 389// B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B 390// ...subtraction is exact, regardless 391// ...of rounding direction 392// B_lo := x*p_2 - B_hi ...fma, exact 393// End If 394// 395// Tmp_A := fmpy.fpsr3( x, p_3 ); 396// If Tmp_A >= sigma_A then 397// A_hi := Tmp_A; 398// A_lo := x*p_3 - A_hi ...fma, exact 399// Else 400// A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A 401// ...subtraction is exact, regardless 402// ...of rounding direction 403// A_lo := x*p_3 - A_hi ...fma, exact 404// End If 405// 406// ...Note that C_hi is of integer value. We need only the 407// ...last few bits. Thus we can ensure C_hi is never a big 408// ...integer, freeing us from overflow worry. 409// 410// Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70); 411// ...Tmp_C is the upper portion of C_hi 412// C_hi := C_hi - Tmp_C 413// ...0 <= C_hi < 2^7 414// 415// Step 2. Get N and f 416// ------------------- 417// 418// At this point, we have all the components to obtain 419// S_0, S_1, S_2, S_3 and thus N and f. We start by adding 420// C_lo and B_hi. This sum together with C_hi gives a good 421// estimation of N and f. 422// 423// A := fadd.fpsr3( B_hi, C_lo ) 424// B := max( B_hi, C_lo ) 425// b := min( B_hi, C_lo ) 426// 427// a := (B - A) + b ...exact. Note that a is either 0 428// ...or 2^(-64). 429// 430// N := round_to_nearest_integer_value( A ); 431// f := A - N; ...exact because lsb(A) >= 2^(-64) 432// ...and |f| <= 1/2. 433// 434// f := f + a ...exact because a is 0 or 2^(-64); 435// ...the msb of the sum is <= 1/2 436// ...lsb >= 2^(-64). 437// 438// N := convert to integer format( C_hi + N ); 439// M := P_0 * x_lo; 440// N := N + M; 441// 442// If sgn_x == 1 (that is original x was negative) 443// N := 2^10 - N 444// ...this maintains N to be non-negative, but still 445// ...equivalent to the (negated N) mod 4. 446// End If 447// 448// If |f| >= 2^(-33) 449// 450// ...Case 1 451// CASE := 1 452// g := A_hi + B_lo; 453// s_hi := f + g; 454// s_lo := (f - s_hi) + g; 455// 456// Else 457// 458// ...Case 2 459// CASE := 2 460// A := fadd.fpsr3( A_hi, B_lo ) 461// B := max( A_hi, B_lo ) 462// b := min( A_hi, B_lo ) 463// 464// a := (B - A) + b ...exact. Note that a is either 0 465// ...or 2^(-128). 466// 467// f_hi := A + f; 468// f_lo := (f - f_hi) + A; 469// ...this is exact. 470// ...f-f_hi is exact because either |f| >= |A|, in which 471// ...case f-f_hi is clearly exact; or otherwise, 0<|f|<|A| 472// ...means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64). 473// ...If f = 2^(-64), f-f_hi involves cancellation and is 474// ...exact. If f = -2^(-64), then A + f is exact. Hence 475// ...f-f_hi is -A exactly, giving f_lo = 0. 476// 477// f_lo := f_lo + a; 478// 479// If |f| >= 2^(-50) then 480// s_hi := f_hi; 481// s_lo := f_lo; 482// Else 483// f_lo := (f_lo + A_lo) + x*p_4 484// s_hi := f_hi + f_lo 485// s_lo := (f_hi - s_hi) + f_lo 486// End If 487// 488// End If 489// 490// Step 3. Get reduced argument 491// ---------------------------- 492// 493// If sgn_x == 0 (that is original x is positive) 494// 495// D_hi := Pi_by_2_hi 496// D_lo := Pi_by_2_lo 497// ...load from table 498// 499// Else 500// 501// D_hi := neg_Pi_by_2_hi 502// D_lo := neg_Pi_by_2_lo 503// ...load from table 504// End If 505// 506// r_hi := s_hi*D_hi 507// r_lo := s_hi*D_hi - r_hi ...fma 508// r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi 509// 510// Return N, r_hi, r_lo 511// 512FR_input_X = f8 513FR_r_hi = f8 514FR_r_lo = f9 515 516FR_X = f32 517FR_N = f33 518FR_p_1 = f34 519FR_TWOM33 = f35 520FR_TWOM50 = f36 521FR_g = f37 522FR_p_2 = f38 523FR_f = f39 524FR_s_lo = f40 525FR_p_3 = f41 526FR_f_abs = f42 527FR_D_lo = f43 528FR_p_4 = f44 529FR_D_hi = f45 530FR_Tmp2_C = f46 531FR_s_hi = f47 532FR_sigma_A = f48 533FR_A = f49 534FR_sigma_B = f50 535FR_B = f51 536FR_sigma_C = f52 537FR_b = f53 538FR_ScaleP2 = f54 539FR_ScaleP3 = f55 540FR_ScaleP4 = f56 541FR_Tmp_A = f57 542FR_Tmp_B = f58 543FR_Tmp_C = f59 544FR_A_hi = f60 545FR_f_hi = f61 546FR_RSHF = f62 547FR_A_lo = f63 548FR_B_hi = f64 549FR_a = f65 550FR_B_lo = f66 551FR_f_lo = f67 552FR_N_fix = f68 553FR_C_hi = f69 554FR_C_lo = f70 555 556GR_N = r8 557GR_Exp_x = r36 558GR_Temp = r37 559GR_BIASL63 = r38 560GR_CASE = r39 561GR_x_lo = r40 562GR_sgn_x = r41 563GR_M = r42 564GR_BASE = r43 565GR_LENGTH1 = r44 566GR_LENGTH2 = r45 567GR_ASUB = r46 568GR_P_0 = r47 569GR_P_1 = r48 570GR_P_2 = r49 571GR_P_3 = r50 572GR_P_4 = r51 573GR_START = r52 574GR_SEGMENT = r53 575GR_A = r54 576GR_B = r55 577GR_C = r56 578GR_D = r57 579GR_E = r58 580GR_TEMP1 = r59 581GR_TEMP2 = r60 582GR_TEMP3 = r61 583GR_TEMP4 = r62 584GR_TEMP5 = r63 585GR_TEMP6 = r64 586GR_rshf = r64 587 588RODATA 589.align 64 590 591LOCAL_OBJECT_START(Constants_Bits_of_2_by_pi) 592data8 0x0000000000000000,0xA2F9836E4E441529 593data8 0xFC2757D1F534DDC0,0xDB6295993C439041 594data8 0xFE5163ABDEBBC561,0xB7246E3A424DD2E0 595data8 0x06492EEA09D1921C,0xFE1DEB1CB129A73E 596data8 0xE88235F52EBB4484,0xE99C7026B45F7E41 597data8 0x3991D639835339F4,0x9C845F8BBDF9283B 598data8 0x1FF897FFDE05980F,0xEF2F118B5A0A6D1F 599data8 0x6D367ECF27CB09B7,0x4F463F669E5FEA2D 600data8 0x7527BAC7EBE5F17B,0x3D0739F78A5292EA 601data8 0x6BFB5FB11F8D5D08,0x56033046FC7B6BAB 602data8 0xF0CFBC209AF4361D,0xA9E391615EE61B08 603data8 0x6599855F14A06840,0x8DFFD8804D732731 604data8 0x06061556CA73A8C9,0x60E27BC08C6B47C4 605data8 0x19C367CDDCE8092A,0x8359C4768B961CA6 606data8 0xDDAF44D15719053E,0xA5FF07053F7E33E8 607data8 0x32C2DE4F98327DBB,0xC33D26EF6B1E5EF8 608data8 0x9F3A1F35CAF27F1D,0x87F121907C7C246A 609data8 0xFA6ED5772D30433B,0x15C614B59D19C3C2 610data8 0xC4AD414D2C5D000C,0x467D862D71E39AC6 611data8 0x9B0062337CD2B497,0xA7B4D55537F63ED7 612data8 0x1810A3FC764D2A9D,0x64ABD770F87C6357 613data8 0xB07AE715175649C0,0xD9D63B3884A7CB23 614data8 0x24778AD623545AB9,0x1F001B0AF1DFCE19 615data8 0xFF319F6A1E666157,0x9947FBACD87F7EB7 616data8 0x652289E83260BFE6,0xCDC4EF09366CD43F 617data8 0x5DD7DE16DE3B5892,0x9BDE2822D2E88628 618data8 0x4D58E232CAC616E3,0x08CB7DE050C017A7 619data8 0x1DF35BE01834132E,0x6212830148835B8E 620data8 0xF57FB0ADF2E91E43,0x4A48D36710D8DDAA 621data8 0x425FAECE616AA428,0x0AB499D3F2A6067F 622data8 0x775C83C2A3883C61,0x78738A5A8CAFBDD7 623data8 0x6F63A62DCBBFF4EF,0x818D67C12645CA55 624data8 0x36D9CAD2A8288D61,0xC277C9121426049B 625data8 0x4612C459C444C5C8,0x91B24DF31700AD43 626data8 0xD4E5492910D5FDFC,0xBE00CC941EEECE70 627data8 0xF53E1380F1ECC3E7,0xB328F8C79405933E 628data8 0x71C1B3092EF3450B,0x9C12887B20AB9FB5 629data8 0x2EC292472F327B6D,0x550C90A7721FE76B 630data8 0x96CB314A1679E279,0x4189DFF49794E884 631data8 0xE6E29731996BED88,0x365F5F0EFDBBB49A 632data8 0x486CA46742727132,0x5D8DB8159F09E5BC 633data8 0x25318D3974F71C05,0x30010C0D68084B58 634data8 0xEE2C90AA4702E774,0x24D6BDA67DF77248 635data8 0x6EEF169FA6948EF6,0x91B45153D1F20ACF 636data8 0x3398207E4BF56863,0xB25F3EDD035D407F 637data8 0x8985295255C06437,0x10D86D324832754C 638data8 0x5BD4714E6E5445C1,0x090B69F52AD56614 639data8 0x9D072750045DDB3B,0xB4C576EA17F9877D 640data8 0x6B49BA271D296996,0xACCCC65414AD6AE2 641data8 0x9089D98850722CBE,0xA4049407777030F3 642data8 0x27FC00A871EA49C2,0x663DE06483DD9797 643data8 0x3FA3FD94438C860D,0xDE41319D39928C70 644data8 0xDDE7B7173BDF082B,0x3715A0805C93805A 645data8 0x921110D8E80FAF80,0x6C4BFFDB0F903876 646data8 0x185915A562BBCB61,0xB989C7BD401004F2 647data8 0xD2277549F6B6EBBB,0x22DBAA140A2F2689 648data8 0x768364333B091A94,0x0EAA3A51C2A31DAE 649data8 0xEDAF12265C4DC26D,0x9C7A2D9756C0833F 650data8 0x03F6F0098C402B99,0x316D07B43915200C 651data8 0x5BC3D8C492F54BAD,0xC6A5CA4ECD37A736 652data8 0xA9E69492AB6842DD,0xDE6319EF8C76528B 653data8 0x6837DBFCABA1AE31,0x15DFA1AE00DAFB0C 654data8 0x664D64B705ED3065,0x29BF56573AFF47B9 655data8 0xF96AF3BE75DF9328,0x3080ABF68C6615CB 656data8 0x040622FA1DE4D9A4,0xB33D8F1B5709CD36 657data8 0xE9424EA4BE13B523,0x331AAAF0A8654FA5 658data8 0xC1D20F3F0BCD785B,0x76F923048B7B7217 659data8 0x8953A6C6E26E6F00,0xEBEF584A9BB7DAC4 660data8 0xBA66AACFCF761D02,0xD12DF1B1C1998C77 661data8 0xADC3DA4886A05DF7,0xF480C62FF0AC9AEC 662data8 0xDDBC5C3F6DDED01F,0xC790B6DB2A3A25A3 663data8 0x9AAF009353AD0457,0xB6B42D297E804BA7 664data8 0x07DA0EAA76A1597B,0x2A12162DB7DCFDE5 665data8 0xFAFEDB89FDBE896C,0x76E4FCA90670803E 666data8 0x156E85FF87FD073E,0x2833676186182AEA 667data8 0xBD4DAFE7B36E6D8F,0x3967955BBF3148D7 668data8 0x8416DF30432DC735,0x6125CE70C9B8CB30 669data8 0xFD6CBFA200A4E46C,0x05A0DD5A476F21D2 670data8 0x1262845CB9496170,0xE0566B0152993755 671data8 0x50B7D51EC4F1335F,0x6E13E4305DA92E85 672data8 0xC3B21D3632A1A4B7,0x08D4B1EA21F716E4 673data8 0x698F77FF2780030C,0x2D408DA0CD4F99A5 674data8 0x20D3A2B30A5D2F42,0xF9B4CBDA11D0BE7D 675data8 0xC1DB9BBD17AB81A2,0xCA5C6A0817552E55 676data8 0x0027F0147F8607E1,0x640B148D4196DEBE 677data8 0x872AFDDAB6256B34,0x897BFEF3059EBFB9 678data8 0x4F6A68A82A4A5AC4,0x4FBCF82D985AD795 679data8 0xC7F48D4D0DA63A20,0x5F57A4B13F149538 680data8 0x800120CC86DD71B6,0xDEC9F560BF11654D 681data8 0x6B0701ACB08CD0C0,0xB24855510EFB1EC3 682data8 0x72953B06A33540C0,0x7BDC06CC45E0FA29 683data8 0x4EC8CAD641F3E8DE,0x647CD8649B31BED9 684data8 0xC397A4D45877C5E3,0x6913DAF03C3ABA46 685data8 0x18465F7555F5BDD2,0xC6926E5D2EACED44 686data8 0x0E423E1C87C461E9,0xFD29F3D6E7CA7C22 687data8 0x35916FC5E0088DD7,0xFFE26A6EC6FDB0C1 688data8 0x0893745D7CB2AD6B,0x9D6ECD7B723E6A11 689data8 0xC6A9CFF7DF7329BA,0xC9B55100B70DB2E2 690data8 0x24BA74607DE58AD8,0x742C150D0C188194 691data8 0x667E162901767A9F,0xBEFDFDEF4556367E 692data8 0xD913D9ECB9BA8BFC,0x97C427A831C36EF1 693data8 0x36C59456A8D8B5A8,0xB40ECCCF2D891234 694data8 0x576F89562CE3CE99,0xB920D6AA5E6B9C2A 695data8 0x3ECC5F114A0BFDFB,0xF4E16D3B8E2C86E2 696data8 0x84D4E9A9B4FCD1EE,0xEFC9352E61392F44 697data8 0x2138C8D91B0AFC81,0x6A4AFBD81C2F84B4 698data8 0x538C994ECC2254DC,0x552AD6C6C096190B 699data8 0xB8701A649569605A,0x26EE523F0F117F11 700data8 0xB5F4F5CBFC2DBC34,0xEEBC34CC5DE8605E 701data8 0xDD9B8E67EF3392B8,0x17C99B5861BC57E1 702data8 0xC68351103ED84871,0xDDDD1C2DA118AF46 703data8 0x2C21D7F359987AD9,0xC0549EFA864FFC06 704data8 0x56AE79E536228922,0xAD38DC9367AAE855 705data8 0x3826829BE7CAA40D,0x51B133990ED7A948 706data8 0x0569F0B265A7887F,0x974C8836D1F9B392 707data8 0x214A827B21CF98DC,0x9F405547DC3A74E1 708data8 0x42EB67DF9DFE5FD4,0x5EA4677B7AACBAA2 709data8 0xF65523882B55BA41,0x086E59862A218347 710data8 0x39E6E389D49EE540,0xFB49E956FFCA0F1C 711data8 0x8A59C52BFA94C5C1,0xD3CFC50FAE5ADB86 712data8 0xC5476243853B8621,0x94792C8761107B4C 713data8 0x2A1A2C8012BF4390,0x2688893C78E4C4A8 714data8 0x7BDBE5C23AC4EAF4,0x268A67F7BF920D2B 715data8 0xA365B1933D0B7CBD,0xDC51A463DD27DDE1 716data8 0x6919949A9529A828,0xCE68B4ED09209F44 717data8 0xCA984E638270237C,0x7E32B90F8EF5A7E7 718data8 0x561408F1212A9DB5,0x4D7E6F5119A5ABF9 719data8 0xB5D6DF8261DD9602,0x36169F3AC4A1A283 720data8 0x6DED727A8D39A9B8,0x825C326B5B2746ED 721data8 0x34007700D255F4FC,0x4D59018071E0E13F 722data8 0x89B295F364A8F1AE,0xA74B38FC4CEAB2BB 723LOCAL_OBJECT_END(Constants_Bits_of_2_by_pi) 724 725LOCAL_OBJECT_START(Constants_Bits_of_pi_by_2) 726data8 0xC90FDAA22168C234,0x00003FFF 727data8 0xC4C6628B80DC1CD1,0x00003FBF 728LOCAL_OBJECT_END(Constants_Bits_of_pi_by_2) 729 730.section .text 731.global __libm_pi_by_2_reduce# 732.proc __libm_pi_by_2_reduce# 733.align 32 734 735__libm_pi_by_2_reduce: 736 737// X is in f8 738// Place the two-piece result r (r_hi) in f8 and c (r_lo) in f9 739// N is returned in r8 740 741{ .mfi 742 alloc r34 = ar.pfs,2,34,0,0 743 fsetc.s3 0x00,0x7F // Set sf3 to round to zero, 82-bit prec, td, ftz 744 nop.i 999 745} 746{ .mfi 747 addl GR_BASE = @ltoff(Constants_Bits_of_2_by_pi#), gp 748 nop.f 999 749 mov GR_BIASL63 = 0x1003E 750} 751;; 752 753 754// L -1-2-3-4 755// 0 0 0 0 0. 1 0 1 0 756// M 0 1 2 .... 63, 64 65 ... 127, 128 757// --------------------------------------------- 758// Segment 0. 1 , 2 , 3 759// START = M - 63 M = 128 becomes 65 760// LENGTH1 = START & 0x3F 65 become position 1 761// SEGMENT = shr(START,6) + 1 0 maps to 1, 64 maps to 2, 762// LENGTH2 = 64 - LENGTH1 763// Address_BASE = shladd(SEGMENT,3) + BASE 764 765 766{ .mmi 767 getf.exp GR_Exp_x = FR_input_X 768 ld8 GR_BASE = [GR_BASE] 769 mov GR_TEMP5 = 0x0FFFE 770} 771;; 772 773// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65). 774{ .mmi 775 getf.sig GR_x_lo = FR_input_X 776 mov GR_TEMP6 = 0x0FFBE 777 nop.i 999 778} 779;; 780 781// Special Code for testing DE arguments 782// movl GR_BIASL63 = 0x0000000000013FFE 783// movl GR_x_lo = 0xFFFFFFFFFFFFFFFF 784// setf.exp FR_X = GR_BIASL63 785// setf.sig FR_ScaleP3 = GR_x_lo 786// fmerge.se FR_X = FR_X,FR_ScaleP3 787// Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x. 788// 2/pi is stored contiguously as 789// 0x00000000 0x00000000.0xA2F.... 790// M = EXP - BIAS ( M >= 63) 791// Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. 792// Thus -1 <= L <= -16321. 793{ .mmi 794 setf.exp FR_sigma_B = GR_TEMP5 795 setf.exp FR_sigma_A = GR_TEMP6 796 extr.u GR_M = GR_Exp_x,0,17 797} 798;; 799 800{ .mii 801 and GR_x_lo = 0x03,GR_x_lo 802 sub GR_START = GR_M,GR_BIASL63 803 add GR_BASE = 8,GR_BASE // To effectively add 1 to SEGMENT 804} 805;; 806 807{ .mii 808 and GR_LENGTH1 = 0x3F,GR_START 809 shr.u GR_SEGMENT = GR_START,6 810 nop.i 999 811} 812;; 813 814{ .mmi 815 shladd GR_BASE = GR_SEGMENT,3,GR_BASE 816 sub GR_LENGTH2 = 0x40,GR_LENGTH1 817 cmp.le p6,p7 = 0x2,GR_LENGTH1 818} 819;; 820 821// P_0 is the two bits corresponding to bit positions L+2 and L+1 822// P_1 is the 64-bit starting at bit position L 823// P_2 is the 64-bit starting at bit position L-64 824// P_3 is the 64-bit starting at bit position L-128 825// P_4 is the 64-bit starting at bit position L-192 826// P_1 is made up of Alo and Bhi 827// P_1 = deposit Alo, position 0, length2 into P_1,position length1 828// deposit Bhi, position length2, length1 into P_1, position 0 829// P_2 is made up of Blo and Chi 830// P_2 = deposit Blo, position 0, length2 into P_2, position length1 831// deposit Chi, position length2, length1 into P_2, position 0 832// P_3 is made up of Clo and Dhi 833// P_3 = deposit Clo, position 0, length2 into P_3, position length1 834// deposit Dhi, position length2, length1 into P_3, position 0 835// P_4 is made up of Clo and Dhi 836// P_4 = deposit Dlo, position 0, length2 into P_4, position length1 837// deposit Ehi, position length2, length1 into P_4, position 0 838{ .mfi 839 ld8 GR_A = [GR_BASE],8 840 fabs FR_X = FR_input_X 841(p7) cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1 842} 843;; 844 845// ld_64 A at Base and increment Base by 8 846// ld_64 B at Base and increment Base by 8 847// ld_64 C at Base and increment Base by 8 848// ld_64 D at Base and increment Base by 8 849// ld_64 E at Base and increment Base by 8 850// A/B/C/D 851// --------------------- 852// A, B, C, D, and E look like | length1 | length2 | 853// --------------------- 854// hi lo 855{ .mlx 856 ld8 GR_B = [GR_BASE],8 857 movl GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift N_fix 858} 859;; 860 861{ .mmi 862 ld8 GR_C = [GR_BASE],8 863 nop.m 999 864(p8) extr.u GR_Temp = GR_A,63,1 865} 866;; 867 868// If length1 >= 2, 869// P_0 = deposit Ahi, position length2, 2 bit into P_0 at position 0. 870{ .mii 871 ld8 GR_D = [GR_BASE],8 872 shl GR_TEMP1 = GR_A,GR_LENGTH1 // MM instruction 873(p6) shr.u GR_P_0 = GR_A,GR_LENGTH2 // MM instruction 874} 875;; 876 877{ .mii 878 ld8 GR_E = [GR_BASE],-40 879 shl GR_TEMP2 = GR_B,GR_LENGTH1 // MM instruction 880 shr.u GR_P_1 = GR_B,GR_LENGTH2 // MM instruction 881} 882;; 883 884// Else 885// Load 16 bit of ASUB from (Base_Address_of_A - 2) 886// P_0 = ASUB & 0x3 887// If length1 == 0, 888// P_0 complete 889// Else 890// Deposit element 63 from Ahi and place in element 0 of P_0. 891// Endif 892// Endif 893 894{ .mii 895(p7) ld2 GR_ASUB = [GR_BASE],8 896 shl GR_TEMP3 = GR_C,GR_LENGTH1 // MM instruction 897 shr.u GR_P_2 = GR_C,GR_LENGTH2 // MM instruction 898} 899;; 900 901{ .mii 902 setf.d FR_RSHF = GR_rshf // Form right shift const 1.100 * 2^63 903 shl GR_TEMP4 = GR_D,GR_LENGTH1 // MM instruction 904 shr.u GR_P_3 = GR_D,GR_LENGTH2 // MM instruction 905} 906;; 907 908{ .mmi 909(p7) and GR_P_0 = 0x03,GR_ASUB 910(p6) and GR_P_0 = 0x03,GR_P_0 911 shr.u GR_P_4 = GR_E,GR_LENGTH2 // MM instruction 912} 913;; 914 915{ .mmi 916 nop.m 999 917 or GR_P_1 = GR_P_1,GR_TEMP1 918(p8) and GR_P_0 = 0x1,GR_P_0 919} 920;; 921 922{ .mmi 923 setf.sig FR_p_1 = GR_P_1 924 or GR_P_2 = GR_P_2,GR_TEMP2 925(p8) shladd GR_P_0 = GR_P_0,1,GR_Temp 926} 927;; 928 929{ .mmf 930 setf.sig FR_p_2 = GR_P_2 931 or GR_P_3 = GR_P_3,GR_TEMP3 932 fmerge.se FR_X = FR_sigma_B,FR_X 933} 934;; 935 936{ .mmi 937 setf.sig FR_p_3 = GR_P_3 938 or GR_P_4 = GR_P_4,GR_TEMP4 939 pmpy2.r GR_M = GR_P_0,GR_x_lo 940} 941;; 942 943// P_1, P_2, P_3, P_4 are integers. They should be 944// 2^(L-63) * P_1; 945// 2^(L-63-64) * P_2; 946// 2^(L-63-128) * P_3; 947// 2^(L-63-192) * P_4; 948// Since each of them need to be multiplied to x, we would scale 949// both x and the P_j's by some convenient factors: scale each 950// of P_j's up by 2^(63-L), and scale x down by 2^(L-63). 951// p_1 := fcvt.xf ( P_1 ) 952// p_2 := fcvt.xf ( P_2 ) * 2^(-64) 953// p_3 := fcvt.xf ( P_3 ) * 2^(-128) 954// p_4 := fcvt.xf ( P_4 ) * 2^(-192) 955// x= Set x's exp to -1 because 2^m*1.x...x *2^(L-63)=2^(-1)*1.x...xxx 956// --------- --------- --------- 957// | P_1 | | P_2 | | P_3 | 958// --------- --------- --------- 959// --------- 960// X | X | 961// --------- 962// ---------------------------------------------------- 963// --------- --------- 964// | A_hi | | A_lo | 965// --------- --------- 966// --------- --------- 967// | B_hi | | B_lo | 968// --------- --------- 969// --------- --------- 970// | C_hi | | C_lo | 971// --------- --------- 972// ==================================================== 973// ----------- --------- --------- --------- 974// | S_0 | | S_1 | | S_2 | | S_3 | 975// ----------- --------- --------- --------- 976// | |___ binary point 977// |___ possibly one more bit 978// 979// Let FPSR3 be set to round towards zero with widest precision 980// and exponent range. Unless an explicit FPSR is given, 981// round-to-nearest with widest precision and exponent range is 982// used. 983{ .mmi 984 setf.sig FR_p_4 = GR_P_4 985 mov GR_TEMP1 = 0x0FFBF 986 nop.i 999 987} 988;; 989 990{ .mmi 991 setf.exp FR_ScaleP2 = GR_TEMP1 992 mov GR_TEMP2 = 0x0FF7F 993 nop.i 999 994} 995;; 996 997{ .mmi 998 setf.exp FR_ScaleP3 = GR_TEMP2 999 mov GR_TEMP4 = 0x1003E 1000 nop.i 999 1001} 1002;; 1003 1004{ .mmf 1005 setf.exp FR_sigma_C = GR_TEMP4 1006 mov GR_Temp = 0x0FFDE 1007 fcvt.xuf.s1 FR_p_1 = FR_p_1 1008} 1009;; 1010 1011{ .mfi 1012 setf.exp FR_TWOM33 = GR_Temp 1013 fcvt.xuf.s1 FR_p_2 = FR_p_2 1014 nop.i 999 1015} 1016;; 1017 1018{ .mfi 1019 nop.m 999 1020 fcvt.xuf.s1 FR_p_3 = FR_p_3 1021 nop.i 999 1022} 1023;; 1024 1025{ .mfi 1026 nop.m 999 1027 fcvt.xuf.s1 FR_p_4 = FR_p_4 1028 nop.i 999 1029} 1030;; 1031 1032// Tmp_C := fmpy.fpsr3( x, p_1 ); 1033// Tmp_B := fmpy.fpsr3( x, p_2 ); 1034// Tmp_A := fmpy.fpsr3( x, p_3 ); 1035// If Tmp_C >= sigma_C then 1036// C_hi := Tmp_C; 1037// C_lo := x*p_1 - C_hi ...fma, exact 1038// Else 1039// C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C 1040// C_lo := x*p_1 - C_hi ...fma, exact 1041// End If 1042// If Tmp_B >= sigma_B then 1043// B_hi := Tmp_B; 1044// B_lo := x*p_2 - B_hi ...fma, exact 1045// Else 1046// B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B 1047// B_lo := x*p_2 - B_hi ...fma, exact 1048// End If 1049// If Tmp_A >= sigma_A then 1050// A_hi := Tmp_A; 1051// A_lo := x*p_3 - A_hi ...fma, exact 1052// Else 1053// A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A 1054// Exact, regardless ...of rounding direction 1055// A_lo := x*p_3 - A_hi ...fma, exact 1056// Endif 1057{ .mfi 1058 nop.m 999 1059 fmpy.s3 FR_Tmp_C = FR_X,FR_p_1 1060 nop.i 999 1061} 1062;; 1063 1064{ .mfi 1065 mov GR_TEMP3 = 0x0FF3F 1066 fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2 1067 nop.i 999 1068} 1069;; 1070 1071{ .mmf 1072 setf.exp FR_ScaleP4 = GR_TEMP3 1073 mov GR_TEMP4 = 0x10045 1074 fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3 1075} 1076;; 1077 1078{ .mfi 1079 nop.m 999 1080 fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C // For Tmp_C < sigma_C case 1081 nop.i 999 1082} 1083;; 1084 1085{ .mmf 1086 setf.exp FR_Tmp2_C = GR_TEMP4 1087 nop.m 999 1088 fmpy.s3 FR_Tmp_B = FR_X,FR_p_2 1089} 1090;; 1091 1092{ .mfi 1093 addl GR_BASE = @ltoff(Constants_Bits_of_pi_by_2#), gp 1094 fcmp.ge.s1 p12, p9 = FR_Tmp_C,FR_sigma_C 1095 nop.i 999 1096} 1097{ .mfi 1098 nop.m 999 1099 fmpy.s3 FR_Tmp_A = FR_X,FR_p_3 1100 nop.i 99 1101} 1102;; 1103 1104{ .mfi 1105 ld8 GR_BASE = [GR_BASE] 1106(p12) mov FR_C_hi = FR_Tmp_C 1107 nop.i 999 1108} 1109{ .mfi 1110 nop.m 999 1111(p9) fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C 1112 nop.i 999 1113} 1114;; 1115 1116 1117 1118// End If 1119// Step 3. Get reduced argument 1120// If sgn_x == 0 (that is original x is positive) 1121// D_hi := Pi_by_2_hi 1122// D_lo := Pi_by_2_lo 1123// Load from table 1124// Else 1125// D_hi := neg_Pi_by_2_hi 1126// D_lo := neg_Pi_by_2_lo 1127// Load from table 1128// End If 1129 1130{ .mfi 1131 nop.m 999 1132 fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4 1133 nop.i 999 1134} 1135{ .mfi 1136 nop.m 999 1137 fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B // For Tmp_B < sigma_B case 1138 nop.i 999 1139} 1140;; 1141 1142{ .mfi 1143 nop.m 999 1144 fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A // For Tmp_A < sigma_A case 1145 nop.i 999 1146} 1147;; 1148 1149{ .mfi 1150 nop.m 999 1151 fcmp.ge.s1 p13, p10 = FR_Tmp_B,FR_sigma_B 1152 nop.i 999 1153} 1154{ .mfi 1155 nop.m 999 1156 fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi 1157 nop.i 999 1158} 1159;; 1160 1161{ .mfi 1162 ldfe FR_D_hi = [GR_BASE],16 1163 fcmp.ge.s1 p14, p11 = FR_Tmp_A,FR_sigma_A 1164 nop.i 999 1165} 1166;; 1167 1168{ .mfi 1169 ldfe FR_D_lo = [GR_BASE] 1170(p13) mov FR_B_hi = FR_Tmp_B 1171 nop.i 999 1172} 1173{ .mfi 1174 nop.m 999 1175(p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B 1176 nop.i 999 1177} 1178;; 1179 1180{ .mfi 1181 nop.m 999 1182(p14) mov FR_A_hi = FR_Tmp_A 1183 nop.i 999 1184} 1185{ .mfi 1186 nop.m 999 1187(p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A 1188 nop.i 999 1189} 1190;; 1191 1192// Note that C_hi is of integer value. We need only the 1193// last few bits. Thus we can ensure C_hi is never a big 1194// integer, freeing us from overflow worry. 1195// Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70); 1196// Tmp_C is the upper portion of C_hi 1197{ .mfi 1198 nop.m 999 1199 fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C 1200 tbit.z p12,p9 = GR_Exp_x, 17 1201} 1202;; 1203 1204{ .mfi 1205 nop.m 999 1206 fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi 1207 nop.i 999 1208} 1209{ .mfi 1210 nop.m 999 1211 fadd.s3 FR_A = FR_B_hi,FR_C_lo 1212 nop.i 999 1213} 1214;; 1215 1216{ .mfi 1217 nop.m 999 1218 fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi 1219 nop.i 999 1220} 1221;; 1222 1223{ .mfi 1224 nop.m 999 1225 fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C 1226 nop.i 999 1227} 1228;; 1229 1230// ******************* 1231// Step 2. Get N and f 1232// ******************* 1233// We have all the components to obtain 1234// S_0, S_1, S_2, S_3 and thus N and f. We start by adding 1235// C_lo and B_hi. This sum together with C_hi estimates 1236// N and f well. 1237// A := fadd.fpsr3( B_hi, C_lo ) 1238// B := max( B_hi, C_lo ) 1239// b := min( B_hi, C_lo ) 1240{ .mfi 1241 nop.m 999 1242 fmax.s1 FR_B = FR_B_hi,FR_C_lo 1243 nop.i 999 1244} 1245;; 1246 1247// We use a right-shift trick to get the integer part of A into the rightmost 1248// bits of the significand by adding 1.1000..00 * 2^63. This operation is good 1249// if |A| < 2^61, which it is in this case. We are doing this to save a few 1250// cycles over using fcvt.fx followed by fnorm. The second step of the trick 1251// is to subtract the same constant to float the rounded integer into a fp reg. 1252 1253{ .mfi 1254 nop.m 999 1255// N := round_to_nearest_integer_value( A ); 1256 fma.s1 FR_N_fix = FR_A, f1, FR_RSHF 1257 nop.i 999 1258} 1259;; 1260 1261{ .mfi 1262 nop.m 999 1263 fmin.s1 FR_b = FR_B_hi,FR_C_lo 1264 nop.i 999 1265} 1266{ .mfi 1267 nop.m 999 1268// C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7 1269 fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C 1270 nop.i 999 1271} 1272;; 1273 1274{ .mfi 1275 nop.m 999 1276// a := (B - A) + b: Exact - note that a is either 0 or 2^(-64). 1277 fsub.s1 FR_a = FR_B,FR_A 1278 nop.i 999 1279} 1280;; 1281 1282{ .mfi 1283 nop.m 999 1284 fms.s1 FR_N = FR_N_fix, f1, FR_RSHF 1285 nop.i 999 1286} 1287;; 1288 1289{ .mfi 1290 nop.m 999 1291 fadd.s1 FR_a = FR_a,FR_b 1292 nop.i 999 1293} 1294;; 1295 1296// f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2. 1297// N := convert to integer format( C_hi + N ); 1298// M := P_0 * x_lo; 1299// N := N + M; 1300{ .mfi 1301 nop.m 999 1302 fsub.s1 FR_f = FR_A,FR_N 1303 nop.i 999 1304} 1305{ .mfi 1306 nop.m 999 1307 fadd.s1 FR_N = FR_N,FR_C_hi 1308 nop.i 999 1309} 1310;; 1311 1312{ .mfi 1313 nop.m 999 1314(p9) fsub.s1 FR_D_hi = f0, FR_D_hi 1315 nop.i 999 1316} 1317{ .mfi 1318 nop.m 999 1319(p9) fsub.s1 FR_D_lo = f0, FR_D_lo 1320 nop.i 999 1321} 1322;; 1323 1324{ .mfi 1325 nop.m 999 1326 fadd.s1 FR_g = FR_A_hi,FR_B_lo // For Case 1, g=A_hi+B_lo 1327 nop.i 999 1328} 1329{ .mfi 1330 nop.m 999 1331 fadd.s3 FR_A = FR_A_hi,FR_B_lo // For Case 2, A=A_hi+B_lo w/ sf3 1332 nop.i 999 1333} 1334;; 1335 1336{ .mfi 1337 mov GR_Temp = 0x0FFCD // For Case 2, exponent of 2^-50 1338 fmax.s1 FR_B = FR_A_hi,FR_B_lo // For Case 2, B=max(A_hi,B_lo) 1339 nop.i 999 1340} 1341;; 1342 1343// f = f + a Exact because a is 0 or 2^(-64); 1344// the msb of the sum is <= 1/2 and lsb >= 2^(-64). 1345{ .mfi 1346 setf.exp FR_TWOM50 = GR_Temp // For Case 2, form 2^-50 1347 fcvt.fx.s1 FR_N = FR_N 1348 nop.i 999 1349} 1350{ .mfi 1351 nop.m 999 1352 fadd.s1 FR_f = FR_f,FR_a 1353 nop.i 999 1354} 1355;; 1356 1357{ .mfi 1358 nop.m 999 1359 fmin.s1 FR_b = FR_A_hi,FR_B_lo // For Case 2, b=min(A_hi,B_lo) 1360 nop.i 999 1361} 1362;; 1363 1364{ .mfi 1365 nop.m 999 1366 fsub.s1 FR_a = FR_B,FR_A // For Case 2, a=B-A 1367 nop.i 999 1368} 1369;; 1370 1371{ .mfi 1372 nop.m 999 1373 fadd.s1 FR_s_hi = FR_f,FR_g // For Case 1, s_hi=f+g 1374 nop.i 999 1375} 1376{ .mfi 1377 nop.m 999 1378 fadd.s1 FR_f_hi = FR_A,FR_f // For Case 2, f_hi=A+f 1379 nop.i 999 1380} 1381;; 1382 1383{ .mfi 1384 nop.m 999 1385 fabs FR_f_abs = FR_f 1386 nop.i 999 1387} 1388;; 1389 1390{ .mfi 1391 getf.sig GR_N = FR_N 1392 fsetc.s3 0x7F,0x40 // Reset sf3 to user settings + td 1393 nop.i 999 1394} 1395;; 1396 1397{ .mfi 1398 nop.m 999 1399 fsub.s1 FR_s_lo = FR_f,FR_s_hi // For Case 1, s_lo=f-s_hi 1400 nop.i 999 1401} 1402{ .mfi 1403 nop.m 999 1404 fsub.s1 FR_f_lo = FR_f,FR_f_hi // For Case 2, f_lo=f-f_hi 1405 nop.i 999 1406} 1407;; 1408 1409{ .mfi 1410 nop.m 999 1411 fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi // For Case 1, r_hi=s_hi*D_hi 1412 nop.i 999 1413} 1414{ .mfi 1415 nop.m 999 1416 fadd.s1 FR_a = FR_a,FR_b // For Case 2, a=a+b 1417 nop.i 999 1418} 1419;; 1420 1421 1422// If sgn_x == 1 (that is original x was negative) 1423// N := 2^10 - N 1424// this maintains N to be non-negative, but still 1425// equivalent to the (negated N) mod 4. 1426// End If 1427{ .mfi 1428 add GR_N = GR_N,GR_M 1429 fcmp.ge.s1 p13, p10 = FR_f_abs,FR_TWOM33 1430 mov GR_Temp = 0x00400 1431} 1432;; 1433 1434{ .mfi 1435(p9) sub GR_N = GR_Temp,GR_N 1436 fadd.s1 FR_s_lo = FR_s_lo,FR_g // For Case 1, s_lo=s_lo+g 1437 nop.i 999 1438} 1439{ .mfi 1440 nop.m 999 1441 fadd.s1 FR_f_lo = FR_f_lo,FR_A // For Case 2, f_lo=f_lo+A 1442 nop.i 999 1443} 1444;; 1445 1446// a := (B - A) + b Exact. 1447// Note that a is either 0 or 2^(-128). 1448// f_hi := A + f; 1449// f_lo := (f - f_hi) + A 1450// f_lo=f-f_hi is exact because either |f| >= |A|, in which 1451// case f-f_hi is clearly exact; or otherwise, 0<|f|<|A| 1452// means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64). 1453// If f = 2^(-64), f-f_hi involves cancellation and is 1454// exact. If f = -2^(-64), then A + f is exact. Hence 1455// f-f_hi is -A exactly, giving f_lo = 0. 1456// f_lo := f_lo + a; 1457 1458// If |f| >= 2^(-33) 1459// Case 1 1460// CASE := 1 1461// g := A_hi + B_lo; 1462// s_hi := f + g; 1463// s_lo := (f - s_hi) + g; 1464// Else 1465// Case 2 1466// CASE := 2 1467// A := fadd.fpsr3( A_hi, B_lo ) 1468// B := max( A_hi, B_lo ) 1469// b := min( A_hi, B_lo ) 1470 1471{ .mfi 1472 nop.m 999 1473(p10) fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50 1474 nop.i 999 1475} 1476{ .mfi 1477 nop.m 999 1478(p13) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi //For Case 1, r_lo=s_hi*D_hi+r_hi 1479 nop.i 999 1480} 1481;; 1482 1483// If |f| >= 2^(-50) then 1484// s_hi := f_hi; 1485// s_lo := f_lo; 1486// Else 1487// f_lo := (f_lo + A_lo) + x*p_4 1488// s_hi := f_hi + f_lo 1489// s_lo := (f_hi - s_hi) + f_lo 1490// End If 1491{ .mfi 1492 nop.m 999 1493(p14) mov FR_s_hi = FR_f_hi 1494 nop.i 999 1495} 1496{ .mfi 1497 nop.m 999 1498(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_a 1499 nop.i 999 1500} 1501;; 1502 1503{ .mfi 1504 nop.m 999 1505(p14) mov FR_s_lo = FR_f_lo 1506 nop.i 999 1507} 1508{ .mfi 1509 nop.m 999 1510(p11) fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo 1511 nop.i 999 1512} 1513;; 1514 1515{ .mfi 1516 nop.m 999 1517(p11) fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo 1518 nop.i 999 1519} 1520;; 1521 1522{ .mfi 1523 nop.m 999 1524(p13) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo //For Case 1, r_lo=s_hi*D_lo+r_lo 1525 nop.i 999 1526} 1527{ .mfi 1528 nop.m 999 1529(p11) fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo 1530 nop.i 999 1531} 1532;; 1533 1534// r_hi := s_hi*D_hi 1535// r_lo := s_hi*D_hi - r_hi with fma 1536// r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi 1537{ .mfi 1538 nop.m 999 1539(p10) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi 1540 nop.i 999 1541} 1542{ .mfi 1543 nop.m 999 1544(p11) fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi 1545 nop.i 999 1546} 1547;; 1548 1549{ .mfi 1550 nop.m 999 1551(p10) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi 1552 nop.i 999 1553} 1554{ .mfi 1555 nop.m 999 1556(p11) fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo 1557 nop.i 999 1558} 1559;; 1560 1561{ .mfi 1562 nop.m 999 1563(p10) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo 1564 nop.i 999 1565} 1566;; 1567 1568// Return N, r_hi, r_lo 1569// We do not return CASE 1570{ .mfb 1571 nop.m 999 1572 fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo 1573 br.ret.sptk b0 1574} 1575;; 1576 1577.endp __libm_pi_by_2_reduce# 1578