1.file "log1pl.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39//********************************************************************* 40// 41// History: 42// 02/02/00 Initial version 43// 04/04/00 Unwind support added 44// 08/15/00 Bundle added after call to __libm_error_support to properly 45// set [the previously overwritten] GR_Parameter_RESULT. 46// 05/21/01 Removed logl and log10l, putting them in a separate file 47// 06/29/01 Improved speed of all paths 48// 05/20/02 Cleaned up namespace and sf0 syntax 49// 02/10/03 Reordered header: .section, .global, .proc, .align; 50// used data8 for long double table values 51// 52//********************************************************************* 53// 54//********************************************************************* 55// 56// Function: log1pl(x) = ln(x+1), for double-extended precision x values 57// 58//********************************************************************* 59// 60// Resources Used: 61// 62// Floating-Point Registers: f8 (Input and Return Value) 63// f34-f82 64// 65// General Purpose Registers: 66// r32-r56 67// r53-r56 (Used to pass arguments to error handling routine) 68// 69// Predicate Registers: p6-p13 70// 71//********************************************************************* 72// 73// IEEE Special Conditions: 74// 75// Denormal fault raised on denormal inputs 76// Overflow exceptions cannot occur 77// Underflow exceptions raised when appropriate for log1p 78// Inexact raised when appropriate by algorithm 79// 80// log1pl(inf) = inf 81// log1pl(-inf) = QNaN 82// log1pl(+/-0) = +/-0 83// log1pl(-1) = -inf 84// log1pl(SNaN) = QNaN 85// log1pl(QNaN) = QNaN 86// log1pl(EM_special Values) = QNaN 87// 88//********************************************************************* 89// 90// Overview 91// 92// The method consists of three cases. 93// 94// If |X| < 2^(-80) use case log1p_small; 95// else |X| < 2^(-7) use case log_near1; 96// else use case log_regular; 97// 98// Case log1p_small: 99// 100// log1pl( X ) = logl( X+1 ) can be approximated by X 101// 102// Case log_near1: 103// 104// log1pl( X ) = log( X+1 ) can be approximated by a simple polynomial 105// in W = X. This polynomial resembles the truncated Taylor 106// series W - W^/2 + W^3/3 - ... 107// 108// Case log_regular: 109// 110// Here we use a table lookup method. The basic idea is that in 111// order to compute logl(Arg) = log1pl (Arg-1) for an argument Arg in [1,2), 112// we construct a value G such that G*Arg is close to 1 and that 113// logl(1/G) is obtainable easily from a table of values calculated 114// beforehand. Thus 115// 116// logl(Arg) = logl(1/G) + logl(G*Arg) 117// = logl(1/G) + logl(1 + (G*Arg - 1)) 118// 119// Because |G*Arg - 1| is small, the second term on the right hand 120// side can be approximated by a short polynomial. We elaborate 121// this method in four steps. 122// 123// Step 0: Initialization 124// 125// We need to calculate logl( X+1 ). Obtain N, S_hi such that 126// 127// X+1 = 2^N * ( S_hi + S_lo ) exactly 128// 129// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense 130// that |S_lo| <= ulp(S_hi). 131// 132// Step 1: Argument Reduction 133// 134// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate 135// 136// G := G_1 * G_2 * G_3 137// r := (G * S_hi - 1) + G * S_lo 138// 139// These G_j's have the property that the product is exactly 140// representable and that |r| < 2^(-12) as a result. 141// 142// Step 2: Approximation 143// 144// 145// logl(1 + r) is approximated by a short polynomial poly(r). 146// 147// Step 3: Reconstruction 148// 149// 150// Finally, log1pl( X ) = logl( X+1 ) is given by 151// 152// logl( X+1 ) = logl( 2^N * (S_hi + S_lo) ) 153// ~=~ N*logl(2) + logl(1/G) + logl(1 + r) 154// ~=~ N*logl(2) + logl(1/G) + poly(r). 155// 156// **** Algorithm **** 157// 158// Case log1p_small: 159// 160// Although log1pl(X) is basically X, we would like to preserve the inexactness 161// nature as well as consistent behavior under different rounding modes. 162// We can do this by computing the result as 163// 164// log1pl(X) = X - X*X 165// 166// 167// Case log_near1: 168// 169// Here we compute a simple polynomial. To exploit parallelism, we split 170// the polynomial into two portions. 171// 172// W := X 173// Wsq := W * W 174// W4 := Wsq*Wsq 175// W6 := W4*Wsq 176// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4)) 177// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8))) 178// 179// Case log_regular: 180// 181// We present the algorithm in four steps. 182// 183// Step 0. Initialization 184// ---------------------- 185// 186// Z := X + 1 187// N := unbaised exponent of Z 188// S_hi := 2^(-N) * Z 189// S_lo := 2^(-N) * { (max(X,1)-Z) + min(X,1) } 190// 191// Step 1. Argument Reduction 192// -------------------------- 193// 194// Let 195// 196// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63 197// 198// We obtain G_1, G_2, G_3 by the following steps. 199// 200// 201// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted 202// from S_hi. 203// 204// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated 205// to lsb = 2^(-4). 206// 207// Define index_1 := [ d_1 d_2 d_3 d_4 ]. 208// 209// Fetch Z_1 := (1/A_1) rounded UP in fixed point with 210// fixed point lsb = 2^(-15). 211// Z_1 looks like z_0.z_1 z_2 ... z_15 212// Note that the fetching is done using index_1. 213// A_1 is actually not needed in the implementation 214// and is used here only to explain how is the value 215// Z_1 defined. 216// 217// Fetch G_1 := (1/A_1) truncated to 21 sig. bits. 218// floating pt. Again, fetching is done using index_1. A_1 219// explains how G_1 is defined. 220// 221// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14) 222// = 1.0 0 0 0 d_5 ... d_14 223// This is accomplished by integer multiplication. 224// It is proved that X_1 indeed always begin 225// with 1.0000 in fixed point. 226// 227// 228// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1 229// truncated to lsb = 2^(-8). Similar to A_1, 230// A_2 is not needed in actual implementation. It 231// helps explain how some of the values are defined. 232// 233// Define index_2 := [ d_5 d_6 d_7 d_8 ]. 234// 235// Fetch Z_2 := (1/A_2) rounded UP in fixed point with 236// fixed point lsb = 2^(-15). Fetch done using index_2. 237// Z_2 looks like z_0.z_1 z_2 ... z_15 238// 239// Fetch G_2 := (1/A_2) truncated to 21 sig. bits. 240// floating pt. 241// 242// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14) 243// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14 244// This is accomplished by integer multiplication. 245// It is proved that X_2 indeed always begin 246// with 1.00000000 in fixed point. 247// 248// 249// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1. 250// This is 2^(-14) + X_2 truncated to lsb = 2^(-13). 251// 252// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ]. 253// 254// Fetch G_3 := (1/A_3) truncated to 21 sig. bits. 255// floating pt. Fetch is done using index_3. 256// 257// Compute G := G_1 * G_2 * G_3. 258// 259// This is done exactly since each of G_j only has 21 sig. bits. 260// 261// Compute 262// 263// r := (G*S_hi - 1) + G*S_lo using 2 FMA operations. 264// 265// Thus r approximates G*(S_hi + S_lo) - 1 to within a couple of 266// rounding errors. 267// 268// 269// Step 2. Approximation 270// --------------------- 271// 272// This step computes an approximation to logl( 1 + r ) where r is the 273// reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13); 274// thus logl(1+r) can be approximated by a short polynomial: 275// 276// logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5 277// 278// 279// Step 3. Reconstruction 280// ---------------------- 281// 282// This step computes the desired result of logl(X+1): 283// 284// logl(X+1) = logl( 2^N * (S_hi + S_lo) ) 285// = N*logl(2) + logl( S_hi + S_lo) ) 286// = N*logl(2) + logl(1/G) + 287// logl(1 + G * ( S_hi + S_lo ) - 1 ) 288// 289// logl(2), logl(1/G_j) are stored as pairs of (single,double) numbers: 290// log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are 291// single-precision numbers and the low parts are double precision 292// numbers. These have the property that 293// 294// N*log2_hi + SUM ( log1byGj_hi ) 295// 296// is computable exactly in double-extended precision (64 sig. bits). 297// Finally 298// 299// Y_hi := N*log2_hi + SUM ( log1byGj_hi ) 300// Y_lo := poly_hi + [ poly_lo + 301// ( SUM ( log1byGj_lo ) + N*log2_lo ) ] 302// 303 304RODATA 305.align 64 306 307// ************* DO NOT CHANGE THE ORDER OF THESE TABLES ************* 308 309// P_8, P_7, P_6, P_5, P_4, P_3, P_2, and P_1 310 311LOCAL_OBJECT_START(Constants_P) 312//data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000 313//data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000 314//data4 0x73282DB0,0x9249248C,0x00003FFC,0x00000000 315//data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000 316//data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000 317//data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000 318//data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000 319//data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000 320data8 0xE3936754EFD62B15,0x00003FFB 321data8 0x8003B271A5E56381,0x0000BFFC 322data8 0x9249248C73282DB0,0x00003FFC 323data8 0xAAAAAA9F47305052,0x0000BFFC 324data8 0xCCCCCCCCCCD17FC9,0x00003FFC 325data8 0x8000000000067ED5,0x0000BFFD 326data8 0xAAAAAAAAAAAAAAAA,0x00003FFD 327data8 0xFFFFFFFFFFFFFFFE,0x0000BFFD 328LOCAL_OBJECT_END(Constants_P) 329 330// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1 331 332LOCAL_OBJECT_START(Constants_Q) 333//data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 334//data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 335//data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 336//data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 337//data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 338//data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 339data8 0xB172180000000000,0x00003FFE 340data8 0x82E308654361C4C6,0x0000BFE2 341data8 0xCCCCCAF2328833CB,0x00003FFC 342data8 0x80000077A9D4BAFB,0x0000BFFD 343data8 0xAAAAAAAAAAABE3D2,0x00003FFD 344data8 0xFFFFFFFFFFFFDAB7,0x0000BFFD 345LOCAL_OBJECT_END(Constants_Q) 346 347// 1/ln10_hi, 1/ln10_lo 348 349LOCAL_OBJECT_START(Constants_1_by_LN10) 350//data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000 351//data4 0xACCF70C8,0xD56EAABE,0x00003FBB,0x00000000 352data8 0xDE5BD8A937287195,0x00003FFD 353data8 0xD56EAABEACCF70C8,0x00003FBB 354LOCAL_OBJECT_END(Constants_1_by_LN10) 355 356 357// Z1 - 16 bit fixed 358 359LOCAL_OBJECT_START(Constants_Z_1) 360data4 0x00008000 361data4 0x00007879 362data4 0x000071C8 363data4 0x00006BCB 364data4 0x00006667 365data4 0x00006187 366data4 0x00005D18 367data4 0x0000590C 368data4 0x00005556 369data4 0x000051EC 370data4 0x00004EC5 371data4 0x00004BDB 372data4 0x00004925 373data4 0x0000469F 374data4 0x00004445 375data4 0x00004211 376LOCAL_OBJECT_END(Constants_Z_1) 377 378// G1 and H1 - IEEE single and h1 - IEEE double 379 380LOCAL_OBJECT_START(Constants_G_H_h1) 381data4 0x3F800000,0x00000000 382data8 0x0000000000000000 383data4 0x3F70F0F0,0x3D785196 384data8 0x3DA163A6617D741C 385data4 0x3F638E38,0x3DF13843 386data8 0x3E2C55E6CBD3D5BB 387data4 0x3F579430,0x3E2FF9A0 388data8 0xBE3EB0BFD86EA5E7 389data4 0x3F4CCCC8,0x3E647FD6 390data8 0x3E2E6A8C86B12760 391data4 0x3F430C30,0x3E8B3AE7 392data8 0x3E47574C5C0739BA 393data4 0x3F3A2E88,0x3EA30C68 394data8 0x3E20E30F13E8AF2F 395data4 0x3F321640,0x3EB9CEC8 396data8 0xBE42885BF2C630BD 397data4 0x3F2AAAA8,0x3ECF9927 398data8 0x3E497F3497E577C6 399data4 0x3F23D708,0x3EE47FC5 400data8 0x3E3E6A6EA6B0A5AB 401data4 0x3F1D89D8,0x3EF8947D 402data8 0xBDF43E3CD328D9BE 403data4 0x3F17B420,0x3F05F3A1 404data8 0x3E4094C30ADB090A 405data4 0x3F124920,0x3F0F4303 406data8 0xBE28FBB2FC1FE510 407data4 0x3F0D3DC8,0x3F183EBF 408data8 0x3E3A789510FDE3FA 409data4 0x3F088888,0x3F20EC80 410data8 0x3E508CE57CC8C98F 411data4 0x3F042108,0x3F29516A 412data8 0xBE534874A223106C 413LOCAL_OBJECT_END(Constants_G_H_h1) 414 415// Z2 - 16 bit fixed 416 417LOCAL_OBJECT_START(Constants_Z_2) 418data4 0x00008000 419data4 0x00007F81 420data4 0x00007F02 421data4 0x00007E85 422data4 0x00007E08 423data4 0x00007D8D 424data4 0x00007D12 425data4 0x00007C98 426data4 0x00007C20 427data4 0x00007BA8 428data4 0x00007B31 429data4 0x00007ABB 430data4 0x00007A45 431data4 0x000079D1 432data4 0x0000795D 433data4 0x000078EB 434LOCAL_OBJECT_END(Constants_Z_2) 435 436// G2 and H2 - IEEE single and h2 - IEEE double 437 438LOCAL_OBJECT_START(Constants_G_H_h2) 439data4 0x3F800000,0x00000000 440data8 0x0000000000000000 441data4 0x3F7F00F8,0x3B7F875D 442data8 0x3DB5A11622C42273 443data4 0x3F7E03F8,0x3BFF015B 444data8 0x3DE620CF21F86ED3 445data4 0x3F7D08E0,0x3C3EE393 446data8 0xBDAFA07E484F34ED 447data4 0x3F7C0FC0,0x3C7E0586 448data8 0xBDFE07F03860BCF6 449data4 0x3F7B1880,0x3C9E75D2 450data8 0x3DEA370FA78093D6 451data4 0x3F7A2328,0x3CBDC97A 452data8 0x3DFF579172A753D0 453data4 0x3F792FB0,0x3CDCFE47 454data8 0x3DFEBE6CA7EF896B 455data4 0x3F783E08,0x3CFC15D0 456data8 0x3E0CF156409ECB43 457data4 0x3F774E38,0x3D0D874D 458data8 0xBE0B6F97FFEF71DF 459data4 0x3F766038,0x3D1CF49B 460data8 0xBE0804835D59EEE8 461data4 0x3F757400,0x3D2C531D 462data8 0x3E1F91E9A9192A74 463data4 0x3F748988,0x3D3BA322 464data8 0xBE139A06BF72A8CD 465data4 0x3F73A0D0,0x3D4AE46F 466data8 0x3E1D9202F8FBA6CF 467data4 0x3F72B9D0,0x3D5A1756 468data8 0xBE1DCCC4BA796223 469data4 0x3F71D488,0x3D693B9D 470data8 0xBE049391B6B7C239 471LOCAL_OBJECT_END(Constants_G_H_h2) 472 473// G3 and H3 - IEEE single and h3 - IEEE double 474 475LOCAL_OBJECT_START(Constants_G_H_h3) 476data4 0x3F7FFC00,0x38800100 477data8 0x3D355595562224CD 478data4 0x3F7FF400,0x39400480 479data8 0x3D8200A206136FF6 480data4 0x3F7FEC00,0x39A00640 481data8 0x3DA4D68DE8DE9AF0 482data4 0x3F7FE400,0x39E00C41 483data8 0xBD8B4291B10238DC 484data4 0x3F7FDC00,0x3A100A21 485data8 0xBD89CCB83B1952CA 486data4 0x3F7FD400,0x3A300F22 487data8 0xBDB107071DC46826 488data4 0x3F7FCC08,0x3A4FF51C 489data8 0x3DB6FCB9F43307DB 490data4 0x3F7FC408,0x3A6FFC1D 491data8 0xBD9B7C4762DC7872 492data4 0x3F7FBC10,0x3A87F20B 493data8 0xBDC3725E3F89154A 494data4 0x3F7FB410,0x3A97F68B 495data8 0xBD93519D62B9D392 496data4 0x3F7FAC18,0x3AA7EB86 497data8 0x3DC184410F21BD9D 498data4 0x3F7FA420,0x3AB7E101 499data8 0xBDA64B952245E0A6 500data4 0x3F7F9C20,0x3AC7E701 501data8 0x3DB4B0ECAABB34B8 502data4 0x3F7F9428,0x3AD7DD7B 503data8 0x3D9923376DC40A7E 504data4 0x3F7F8C30,0x3AE7D474 505data8 0x3DC6E17B4F2083D3 506data4 0x3F7F8438,0x3AF7CBED 507data8 0x3DAE314B811D4394 508data4 0x3F7F7C40,0x3B03E1F3 509data8 0xBDD46F21B08F2DB1 510data4 0x3F7F7448,0x3B0BDE2F 511data8 0xBDDC30A46D34522B 512data4 0x3F7F6C50,0x3B13DAAA 513data8 0x3DCB0070B1F473DB 514data4 0x3F7F6458,0x3B1BD766 515data8 0xBDD65DDC6AD282FD 516data4 0x3F7F5C68,0x3B23CC5C 517data8 0xBDCDAB83F153761A 518data4 0x3F7F5470,0x3B2BC997 519data8 0xBDDADA40341D0F8F 520data4 0x3F7F4C78,0x3B33C711 521data8 0x3DCD1BD7EBC394E8 522data4 0x3F7F4488,0x3B3BBCC6 523data8 0xBDC3532B52E3E695 524data4 0x3F7F3C90,0x3B43BAC0 525data8 0xBDA3961EE846B3DE 526data4 0x3F7F34A0,0x3B4BB0F4 527data8 0xBDDADF06785778D4 528data4 0x3F7F2CA8,0x3B53AF6D 529data8 0x3DCC3ED1E55CE212 530data4 0x3F7F24B8,0x3B5BA620 531data8 0xBDBA31039E382C15 532data4 0x3F7F1CC8,0x3B639D12 533data8 0x3D635A0B5C5AF197 534data4 0x3F7F14D8,0x3B6B9444 535data8 0xBDDCCB1971D34EFC 536data4 0x3F7F0CE0,0x3B7393BC 537data8 0x3DC7450252CD7ADA 538data4 0x3F7F04F0,0x3B7B8B6D 539data8 0xBDB68F177D7F2A42 540LOCAL_OBJECT_END(Constants_G_H_h3) 541 542 543// Floating Point Registers 544 545FR_Input_X = f8 546 547FR_Y_hi = f34 548FR_Y_lo = f35 549 550FR_Scale = f36 551FR_X_Prime = f37 552FR_S_hi = f38 553FR_W = f39 554FR_G = f40 555 556FR_H = f41 557FR_wsq = f42 558FR_w4 = f43 559FR_h = f44 560FR_w6 = f45 561 562FR_G2 = f46 563FR_H2 = f47 564FR_poly_lo = f48 565FR_P8 = f49 566FR_poly_hi = f50 567 568FR_P7 = f51 569FR_h2 = f52 570FR_rsq = f53 571FR_P6 = f54 572FR_r = f55 573 574FR_log2_hi = f56 575FR_log2_lo = f57 576FR_p87 = f58 577FR_p876 = f58 578FR_p8765 = f58 579FR_float_N = f59 580FR_Q4 = f60 581 582FR_p43 = f61 583FR_p432 = f61 584FR_p4321 = f61 585FR_P4 = f62 586FR_G3 = f63 587FR_H3 = f64 588FR_h3 = f65 589 590FR_Q3 = f66 591FR_P3 = f67 592FR_Q2 = f68 593FR_P2 = f69 594FR_1LN10_hi = f70 595 596FR_Q1 = f71 597FR_P1 = f72 598FR_1LN10_lo = f73 599FR_P5 = f74 600FR_rcub = f75 601 602FR_Output_X_tmp = f76 603FR_Neg_One = f77 604FR_Z = f78 605FR_AA = f79 606FR_BB = f80 607FR_S_lo = f81 608FR_2_to_minus_N = f82 609 610FR_X = f8 611FR_Y = f0 612FR_RESULT = f76 613 614 615// General Purpose Registers 616 617GR_ad_p = r33 618GR_Index1 = r34 619GR_Index2 = r35 620GR_signif = r36 621GR_X_0 = r37 622GR_X_1 = r38 623GR_X_2 = r39 624GR_minus_N = r39 625GR_Z_1 = r40 626GR_Z_2 = r41 627GR_N = r42 628GR_Bias = r43 629GR_M = r44 630GR_Index3 = r45 631GR_exp_2tom80 = r45 632GR_ad_p2 = r46 633GR_exp_mask = r47 634GR_exp_2tom7 = r48 635GR_ad_ln10 = r49 636GR_ad_tbl_1 = r50 637GR_ad_tbl_2 = r51 638GR_ad_tbl_3 = r52 639GR_ad_q = r53 640GR_ad_z_1 = r54 641GR_ad_z_2 = r55 642GR_ad_z_3 = r56 643GR_minus_N = r39 644 645// 646// Added for unwind support 647// 648 649GR_SAVE_PFS = r50 650GR_SAVE_B0 = r51 651GR_SAVE_GP = r52 652GR_Parameter_X = r53 653GR_Parameter_Y = r54 654GR_Parameter_RESULT = r55 655GR_Parameter_TAG = r56 656 657.section .text 658GLOBAL_IEEE754_ENTRY(log1pl) 659{ .mfi 660 alloc r32 = ar.pfs,0,21,4,0 661 fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test for natval, nan, inf 662 nop.i 999 663} 664{ .mfi 665 addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp 666 fma.s1 FR_Z = FR_Input_X, f1, f1 // x+1 667 nop.i 999 668} 669;; 670 671{ .mfi 672 nop.m 999 673 fmerge.ns FR_Neg_One = f1, f1 // Form -1.0 674 nop.i 999 675} 676{ .mfi 677 nop.m 999 678 fnorm.s1 FR_X_Prime = FR_Input_X // Normalize x 679 nop.i 999 680} 681;; 682 683{ .mfi 684 ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1 685 nop.f 999 686 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7 687} 688;; 689 690{ .mfb 691 getf.sig GR_signif = FR_Z // Get significand of x+1 692 fcmp.eq.s1 p9, p0 = FR_Input_X, f0 // Test for x=0 693(p6) br.cond.spnt LOG1P_special // Branch for nan, inf, natval 694} 695;; 696 697{ .mfi 698 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1 699 fcmp.lt.s1 p13, p0 = FR_X_Prime, FR_Neg_One // Test for x<-1 700 add GR_ad_p = -0x100, GR_ad_z_1 // Point to Constants_P 701} 702{ .mfi 703 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2 704 nop.f 999 705 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2 706} 707;; 708 709{ .mfi 710 add GR_ad_q = 0x080, GR_ad_p // Point to Constants_Q 711 fcmp.eq.s1 p8, p0 = FR_X_Prime, FR_Neg_One // Test for x=-1 712 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif 713} 714{ .mfb 715 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3 716 nop.f 999 717(p9) br.ret.spnt b0 // Exit if x=0, return input 718} 719;; 720 721{ .mfi 722 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1 723 fclass.nm p10, p0 = FR_Input_X, 0x1FF // Test for unsupported 724 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of significand 725} 726{ .mfi 727 ldfe FR_P8 = [GR_ad_p],16 // Load P_8 for near1 path 728 fsub.s1 FR_W = FR_X_Prime, f0 // W = x 729 add GR_ad_ln10 = 0x060, GR_ad_q // Point to Constants_1_by_LN10 730} 731;; 732 733{ .mfi 734 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1 735 fmax.s1 FR_AA = FR_X_Prime, f1 // For S_lo, form AA = max(X,1.0) 736 mov GR_exp_mask = 0x1FFFF // Create exponent mask 737} 738{ .mib 739 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1 740 mov GR_Bias = 0x0FFFF // Create exponent bias 741(p13) br.cond.spnt LOG1P_LT_Minus_1 // Branch if x<-1 742} 743;; 744 745{ .mfb 746 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1 747 fmerge.se FR_S_hi = f1,FR_Z // Form |x+1| 748(p8) br.cond.spnt LOG1P_EQ_Minus_1 // Branch if x=-1 749} 750;; 751 752{ .mmb 753 getf.exp GR_N = FR_Z // Get N = exponent of x+1 754 ldfd FR_h = [GR_ad_tbl_1] // Load h_1 755(p10) br.cond.spnt LOG1P_unsupported // Branch for unsupported type 756} 757;; 758 759{ .mfi 760 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi 761 fcmp.eq.s0 p8, p0 = FR_Input_X, f0 // Dummy op to flag denormals 762 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1 763} 764;; 765 766// 767// For performance, don't use result of pmpyshr2.u for 4 cycles. 768// 769{ .mmi 770 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo 771 sub GR_N = GR_N, GR_Bias 772 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80 773} 774;; 775 776{ .mfi 777 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4 778 fms.s1 FR_S_lo = FR_AA, f1, FR_Z // Form S_lo = AA - Z 779 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N) 780} 781;; 782 783{ .mmf 784 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3 785 setf.sig FR_float_N = GR_N // Put integer N into rightmost significand 786 fmin.s1 FR_BB = FR_X_Prime, f1 // For S_lo, form BB = min(X,1.0) 787} 788;; 789 790{ .mmi 791 getf.exp GR_M = FR_W // Get signexp of w = x 792 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2 793 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 794} 795;; 796 797{ .mmi 798 ldfe FR_Q1 = [GR_ad_q] // Load Q1 799 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2 800 add GR_ad_p2 = 0x30,GR_ad_p // Point to P_4 801} 802;; 803 804{ .mmi 805 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 806 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2 807 and GR_M = GR_exp_mask, GR_M // Get exponent of w = x 808} 809;; 810 811{ .mmi 812 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2 813 cmp.lt p8, p9 = GR_M, GR_exp_2tom7 // Test |x| < 2^-7 814 cmp.lt p7, p0 = GR_M, GR_exp_2tom80 // Test |x| < 2^-80 815} 816;; 817 818// Small path is separate code 819// p7 is for the small path: |x| < 2^-80 820// near1 and regular paths are merged. 821// p8 is for the near1 path: |x| < 2^-7 822// p9 is for regular path: |x| >= 2^-7 823 824{ .mfi 825 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2 826 nop.f 999 827 nop.i 999 828} 829{ .mfb 830(p9) setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N) 831(p7) fnma.s0 f8 = FR_X_Prime, FR_X_Prime, FR_X_Prime // Result x - x*x 832(p7) br.ret.spnt b0 // Branch if |x| < 2^-80 833} 834;; 835 836{ .mmi 837(p8) ldfe FR_P7 = [GR_ad_p],16 // Load P_7 for near1 path 838(p8) ldfe FR_P4 = [GR_ad_p2],16 // Load P_4 for near1 path 839(p9) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2 840} 841;; 842 843// 844// For performance, don't use result of pmpyshr2.u for 4 cycles. 845// 846{ .mmf 847(p8) ldfe FR_P6 = [GR_ad_p],16 // Load P_6 for near1 path 848(p8) ldfe FR_P3 = [GR_ad_p2],16 // Load P_3 for near1 path 849(p9) fma.s1 FR_S_lo = FR_S_lo, f1, FR_BB // S_lo = S_lo + BB 850} 851;; 852 853{ .mmf 854(p8) ldfe FR_P5 = [GR_ad_p],16 // Load P_5 for near1 path 855(p8) ldfe FR_P2 = [GR_ad_p2],16 // Load P_2 for near1 path 856(p8) fmpy.s1 FR_wsq = FR_W, FR_W // wsq = w * w for near1 path 857} 858;; 859 860{ .mmi 861(p8) ldfe FR_P1 = [GR_ad_p2],16 ;; // Load P_1 for near1 path 862 nop.m 999 863(p9) extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 864} 865;; 866 867{ .mfi 868(p9) shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3 869(p9) fcvt.xf FR_float_N = FR_float_N 870 nop.i 999 871} 872;; 873 874{ .mfi 875(p9) ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3 876 nop.f 999 877 nop.i 999 878} 879;; 880 881{ .mfi 882(p9) ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3 883(p9) fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 884 nop.i 999 885} 886{ .mfi 887 nop.m 999 888(p9) fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 889 nop.i 999 890} 891;; 892 893{ .mmf 894 nop.m 999 895 nop.m 999 896(p9) fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 897} 898;; 899 900{ .mfi 901 nop.m 999 902(p8) fmpy.s1 FR_w4 = FR_wsq, FR_wsq // w4 = w^4 for near1 path 903 nop.i 999 904} 905{ .mfi 906 nop.m 999 907(p8) fma.s1 FR_p87 = FR_W, FR_P8, FR_P7 // p87 = w * P8 + P7 908 nop.i 999 909} 910;; 911 912{ .mfi 913 nop.m 999 914(p9) fma.s1 FR_S_lo = FR_S_lo, FR_2_to_minus_N, f0 // S_lo = S_lo * 2^(-N) 915 nop.i 999 916} 917{ .mfi 918 nop.m 999 919(p8) fma.s1 FR_p43 = FR_W, FR_P4, FR_P3 // p43 = w * P4 + P3 920 nop.i 999 921} 922;; 923 924{ .mfi 925 nop.m 999 926(p9) fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 927 nop.i 999 928} 929{ .mfi 930 nop.m 999 931(p9) fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 932 nop.i 999 933} 934;; 935 936{ .mfi 937 nop.m 999 938(p9) fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 939 nop.i 999 940} 941{ .mfi 942 nop.m 999 943(p8) fmpy.s1 FR_w6 = FR_w4, FR_wsq // w6 = w^6 for near1 path 944 nop.i 999 945} 946;; 947 948{ .mfi 949 nop.m 999 950(p8) fma.s1 FR_p432 = FR_W, FR_p43, FR_P2 // p432 = w * p43 + P2 951 nop.i 999 952} 953{ .mfi 954 nop.m 999 955(p8) fma.s1 FR_p876 = FR_W, FR_p87, FR_P6 // p876 = w * p87 + P6 956 nop.i 999 957} 958;; 959 960{ .mfi 961 nop.m 999 962(p9) fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1 963 nop.i 999 964} 965{ .mfi 966 nop.m 999 967(p9) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi = N * log2_hi + H 968 nop.i 999 969} 970;; 971 972{ .mfi 973 nop.m 999 974(p9) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h = N * log2_lo + h 975 nop.i 999 976} 977;; 978 979{ .mfi 980 nop.m 999 981(p9) fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r = G * S_lo + (G * S_hi - 1) 982 nop.i 999 983} 984;; 985 986{ .mfi 987 nop.m 999 988(p8) fma.s1 FR_p4321 = FR_W, FR_p432, FR_P1 // p4321 = w * p432 + P1 989 nop.i 999 990} 991{ .mfi 992 nop.m 999 993(p8) fma.s1 FR_p8765 = FR_W, FR_p876, FR_P5 // p8765 = w * p876 + P5 994 nop.i 999 995} 996;; 997 998{ .mfi 999 nop.m 999 1000(p9) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3 1001 nop.i 999 1002} 1003{ .mfi 1004 nop.m 999 1005(p9) fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r 1006 nop.i 999 1007} 1008;; 1009 1010{ .mfi 1011 nop.m 999 1012(p8) fma.s1 FR_Y_lo = FR_wsq, FR_p4321, f0 // Y_lo = wsq * p4321 1013 nop.i 999 1014} 1015{ .mfi 1016 nop.m 999 1017(p8) fma.s1 FR_Y_hi = FR_W, f1, f0 // Y_hi = w for near1 path 1018 nop.i 999 1019} 1020;; 1021 1022{ .mfi 1023 nop.m 999 1024(p9) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo = poly_lo * r + Q2 1025 nop.i 999 1026} 1027{ .mfi 1028 nop.m 999 1029(p9) fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 1030 nop.i 999 1031} 1032;; 1033 1034{ .mfi 1035 nop.m 999 1036(p8) fma.s1 FR_Y_lo = FR_w6, FR_p8765,FR_Y_lo // Y_lo = w6 * p8765 + w2 * p4321 1037 nop.i 999 1038} 1039;; 1040 1041{ .mfi 1042 nop.m 999 1043(p9) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1 * rsq + r 1044 nop.i 999 1045} 1046;; 1047 1048{ .mfi 1049 nop.m 999 1050(p9) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h // poly_lo = poly_lo*r^3 + h 1051 nop.i 999 1052} 1053;; 1054 1055{ .mfi 1056 nop.m 999 1057(p9) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo = poly_hi + poly_lo 1058 nop.i 999 1059} 1060;; 1061 1062// Remainder of code is common for near1 and regular paths 1063{ .mfb 1064 nop.m 999 1065 fadd.s0 f8 = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi 1066 br.ret.sptk b0 // Common exit for 2^-80 < x < inf 1067} 1068;; 1069 1070 1071// Here if x=-1 1072LOG1P_EQ_Minus_1: 1073// 1074// If x=-1 raise divide by zero and return -inf 1075// 1076{ .mfi 1077 mov GR_Parameter_TAG = 138 1078 fsub.s1 FR_Output_X_tmp = f0, f1 1079 nop.i 999 1080} 1081;; 1082 1083{ .mfb 1084 nop.m 999 1085 frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0 1086 br.cond.sptk __libm_error_region 1087} 1088;; 1089 1090LOG1P_special: 1091{ .mfi 1092 nop.m 999 1093 fclass.m.unc p8, p0 = FR_Input_X, 0x1E1 // Test for natval, nan, +inf 1094 nop.i 999 1095} 1096;; 1097 1098// 1099// For SNaN raise invalid and return QNaN. 1100// For QNaN raise invalid and return QNaN. 1101// For +Inf return +Inf. 1102// 1103{ .mfb 1104 nop.m 999 1105(p8) fmpy.s0 f8 = FR_Input_X, f1 1106(p8) br.ret.sptk b0 // Return for natval, nan, +inf 1107} 1108;; 1109 1110// 1111// For -Inf raise invalid and return QNaN. 1112// 1113{ .mfb 1114 mov GR_Parameter_TAG = 139 1115 fmpy.s0 FR_Output_X_tmp = FR_Input_X, f0 1116 br.cond.sptk __libm_error_region 1117} 1118;; 1119 1120 1121LOG1P_unsupported: 1122// 1123// Return generated NaN or other value. 1124// 1125{ .mfb 1126 nop.m 999 1127 fmpy.s0 f8 = FR_Input_X, f0 1128 br.ret.sptk b0 1129} 1130;; 1131 1132// Here if -inf < x < -1 1133LOG1P_LT_Minus_1: 1134// 1135// Deal with x < -1 in a special way - raise 1136// invalid and produce QNaN indefinite. 1137// 1138{ .mfb 1139 mov GR_Parameter_TAG = 139 1140 frcpa.s0 FR_Output_X_tmp, p8 = f0, f0 1141 br.cond.sptk __libm_error_region 1142} 1143;; 1144 1145 1146GLOBAL_IEEE754_END(log1pl) 1147libm_alias_ldouble_other (__log1p, log1p) 1148 1149LOCAL_LIBM_ENTRY(__libm_error_region) 1150.prologue 1151{ .mfi 1152 add GR_Parameter_Y=-32,sp // Parameter 2 value 1153 nop.f 0 1154.save ar.pfs,GR_SAVE_PFS 1155 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 1156} 1157{ .mfi 1158.fframe 64 1159 add sp=-64,sp // Create new stack 1160 nop.f 0 1161 mov GR_SAVE_GP=gp // Save gp 1162};; 1163{ .mmi 1164 stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack 1165 add GR_Parameter_X = 16,sp // Parameter 1 address 1166.save b0, GR_SAVE_B0 1167 mov GR_SAVE_B0=b0 // Save b0 1168};; 1169.body 1170{ .mib 1171 stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack 1172 add GR_Parameter_RESULT = 0,GR_Parameter_Y 1173 nop.b 0 // Parameter 3 address 1174} 1175{ .mib 1176 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack 1177 add GR_Parameter_Y = -16,GR_Parameter_Y 1178 br.call.sptk b0=__libm_error_support# // Call error handling function 1179};; 1180{ .mmi 1181 nop.m 999 1182 nop.m 999 1183 add GR_Parameter_RESULT = 48,sp 1184};; 1185{ .mmi 1186 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack 1187.restore sp 1188 add sp = 64,sp // Restore stack pointer 1189 mov b0 = GR_SAVE_B0 // Restore return address 1190};; 1191{ .mib 1192 mov gp = GR_SAVE_GP // Restore gp 1193 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 1194 br.ret.sptk b0 // Return 1195};; 1196 1197LOCAL_LIBM_END(__libm_error_region#) 1198 1199.type __libm_error_support#,@function 1200.global __libm_error_support# 1201