1.file "logl.s" 2 3 4// Copyright (c) 2000 - 2003, Intel Corporation 5// All rights reserved. 6// 7// 8// Redistribution and use in source and binary forms, with or without 9// modification, are permitted provided that the following conditions are 10// met: 11// 12// * Redistributions of source code must retain the above copyright 13// notice, this list of conditions and the following disclaimer. 14// 15// * Redistributions in binary form must reproduce the above copyright 16// notice, this list of conditions and the following disclaimer in the 17// documentation and/or other materials provided with the distribution. 18// 19// * The name of Intel Corporation may not be used to endorse or promote 20// products derived from this software without specific prior written 21// permission. 22 23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34// 35// Intel Corporation is the author of this code, and requests that all 36// problem reports or change requests be submitted to it directly at 37// http://www.intel.com/software/products/opensource/libraries/num.htm. 38// 39//********************************************************************* 40// 41// History: 42// 05/21/01 Extracted logl and log10l from log1pl.s file, and optimized 43// all paths. 44// 06/20/01 Fixed error tag for x=-inf. 45// 05/20/02 Cleaned up namespace and sf0 syntax 46// 02/10/03 Reordered header: .section, .global, .proc, .align; 47// used data8 for long double table values 48// 49//********************************************************************* 50// 51//********************************************************************* 52// 53// Function: Combined logl(x) and log10l(x) where 54// logl(x) = ln(x), for double-extended precision x values 55// log10l(x) = log (x), for double-extended precision x values 56// 10 57// 58//********************************************************************* 59// 60// Resources Used: 61// 62// Floating-Point Registers: f8 (Input and Return Value) 63// f34-f76 64// 65// General Purpose Registers: 66// r32-r56 67// r53-r56 (Used to pass arguments to error handling routine) 68// 69// Predicate Registers: p6-p14 70// 71//********************************************************************* 72// 73// IEEE Special Conditions: 74// 75// Denormal fault raised on denormal inputs 76// Overflow exceptions cannot occur 77// Underflow exceptions raised when appropriate for log1p 78// (Error Handling Routine called for underflow) 79// Inexact raised when appropriate by algorithm 80// 81// logl(inf) = inf 82// logl(-inf) = QNaN 83// logl(+/-0) = -inf 84// logl(SNaN) = QNaN 85// logl(QNaN) = QNaN 86// logl(EM_special Values) = QNaN 87// log10l(inf) = inf 88// log10l(-inf) = QNaN 89// log10l(+/-0) = -inf 90// log10l(SNaN) = QNaN 91// log10l(QNaN) = QNaN 92// log10l(EM_special Values) = QNaN 93// 94//********************************************************************* 95// 96// Overview 97// 98// The method consists of two cases. 99// 100// If |X-1| < 2^(-7) use case log_near1; 101// else use case log_regular; 102// 103// Case log_near1: 104// 105// logl( 1 + X ) can be approximated by a simple polynomial 106// in W = X-1. This polynomial resembles the truncated Taylor 107// series W - W^/2 + W^3/3 - ... 108// 109// Case log_regular: 110// 111// Here we use a table lookup method. The basic idea is that in 112// order to compute logl(Arg) for an argument Arg in [1,2), we 113// construct a value G such that G*Arg is close to 1 and that 114// logl(1/G) is obtainable easily from a table of values calculated 115// beforehand. Thus 116// 117// logl(Arg) = logl(1/G) + logl(G*Arg) 118// = logl(1/G) + logl(1 + (G*Arg - 1)) 119// 120// Because |G*Arg - 1| is small, the second term on the right hand 121// side can be approximated by a short polynomial. We elaborate 122// this method in four steps. 123// 124// Step 0: Initialization 125// 126// We need to calculate logl( X ). Obtain N, S_hi such that 127// 128// X = 2^N * S_hi exactly 129// 130// where S_hi in [1,2) 131// 132// Step 1: Argument Reduction 133// 134// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate 135// 136// G := G_1 * G_2 * G_3 137// r := (G * S_hi - 1) 138// 139// These G_j's have the property that the product is exactly 140// representable and that |r| < 2^(-12) as a result. 141// 142// Step 2: Approximation 143// 144// 145// logl(1 + r) is approximated by a short polynomial poly(r). 146// 147// Step 3: Reconstruction 148// 149// 150// Finally, logl( X ) is given by 151// 152// logl( X ) = logl( 2^N * S_hi ) 153// ~=~ N*logl(2) + logl(1/G) + logl(1 + r) 154// ~=~ N*logl(2) + logl(1/G) + poly(r). 155// 156// **** Algorithm **** 157// 158// Case log_near1: 159// 160// Here we compute a simple polynomial. To exploit parallelism, we split 161// the polynomial into two portions. 162// 163// W := X - 1 164// Wsq := W * W 165// W4 := Wsq*Wsq 166// W6 := W4*Wsq 167// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4)) 168// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8))) 169// 170// Case log_regular: 171// 172// We present the algorithm in four steps. 173// 174// Step 0. Initialization 175// ---------------------- 176// 177// Z := X 178// N := unbaised exponent of Z 179// S_hi := 2^(-N) * Z 180// 181// Step 1. Argument Reduction 182// -------------------------- 183// 184// Let 185// 186// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63 187// 188// We obtain G_1, G_2, G_3 by the following steps. 189// 190// 191// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted 192// from S_hi. 193// 194// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated 195// to lsb = 2^(-4). 196// 197// Define index_1 := [ d_1 d_2 d_3 d_4 ]. 198// 199// Fetch Z_1 := (1/A_1) rounded UP in fixed point with 200// fixed point lsb = 2^(-15). 201// Z_1 looks like z_0.z_1 z_2 ... z_15 202// Note that the fetching is done using index_1. 203// A_1 is actually not needed in the implementation 204// and is used here only to explain how is the value 205// Z_1 defined. 206// 207// Fetch G_1 := (1/A_1) truncated to 21 sig. bits. 208// floating pt. Again, fetching is done using index_1. A_1 209// explains how G_1 is defined. 210// 211// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14) 212// = 1.0 0 0 0 d_5 ... d_14 213// This is accomplished by integer multiplication. 214// It is proved that X_1 indeed always begin 215// with 1.0000 in fixed point. 216// 217// 218// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1 219// truncated to lsb = 2^(-8). Similar to A_1, 220// A_2 is not needed in actual implementation. It 221// helps explain how some of the values are defined. 222// 223// Define index_2 := [ d_5 d_6 d_7 d_8 ]. 224// 225// Fetch Z_2 := (1/A_2) rounded UP in fixed point with 226// fixed point lsb = 2^(-15). Fetch done using index_2. 227// Z_2 looks like z_0.z_1 z_2 ... z_15 228// 229// Fetch G_2 := (1/A_2) truncated to 21 sig. bits. 230// floating pt. 231// 232// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14) 233// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14 234// This is accomplished by integer multiplication. 235// It is proved that X_2 indeed always begin 236// with 1.00000000 in fixed point. 237// 238// 239// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1. 240// This is 2^(-14) + X_2 truncated to lsb = 2^(-13). 241// 242// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ]. 243// 244// Fetch G_3 := (1/A_3) truncated to 21 sig. bits. 245// floating pt. Fetch is done using index_3. 246// 247// Compute G := G_1 * G_2 * G_3. 248// 249// This is done exactly since each of G_j only has 21 sig. bits. 250// 251// Compute 252// 253// r := (G*S_hi - 1) 254// 255// 256// Step 2. Approximation 257// --------------------- 258// 259// This step computes an approximation to logl( 1 + r ) where r is the 260// reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13); 261// thus logl(1+r) can be approximated by a short polynomial: 262// 263// logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5 264// 265// 266// Step 3. Reconstruction 267// ---------------------- 268// 269// This step computes the desired result of logl(X): 270// 271// logl(X) = logl( 2^N * S_hi ) 272// = N*logl(2) + logl( S_hi ) 273// = N*logl(2) + logl(1/G) + 274// logl(1 + G*S_hi - 1 ) 275// 276// logl(2), logl(1/G_j) are stored as pairs of (single,double) numbers: 277// log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are 278// single-precision numbers and the low parts are double precision 279// numbers. These have the property that 280// 281// N*log2_hi + SUM ( log1byGj_hi ) 282// 283// is computable exactly in double-extended precision (64 sig. bits). 284// Finally 285// 286// Y_hi := N*log2_hi + SUM ( log1byGj_hi ) 287// Y_lo := poly_hi + [ poly_lo + 288// ( SUM ( log1byGj_lo ) + N*log2_lo ) ] 289// 290 291RODATA 292.align 64 293 294// ************* DO NOT CHANGE THE ORDER OF THESE TABLES ************* 295 296// P_8, P_7, P_6, P_5, P_4, P_3, P_2, and P_1 297 298LOCAL_OBJECT_START(Constants_P) 299data8 0xE3936754EFD62B15,0x00003FFB 300data8 0x8003B271A5E56381,0x0000BFFC 301data8 0x9249248C73282DB0,0x00003FFC 302data8 0xAAAAAA9F47305052,0x0000BFFC 303data8 0xCCCCCCCCCCD17FC9,0x00003FFC 304data8 0x8000000000067ED5,0x0000BFFD 305data8 0xAAAAAAAAAAAAAAAA,0x00003FFD 306data8 0xFFFFFFFFFFFFFFFE,0x0000BFFD 307LOCAL_OBJECT_END(Constants_P) 308 309// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1 310 311LOCAL_OBJECT_START(Constants_Q) 312data8 0xB172180000000000,0x00003FFE 313data8 0x82E308654361C4C6,0x0000BFE2 314data8 0xCCCCCAF2328833CB,0x00003FFC 315data8 0x80000077A9D4BAFB,0x0000BFFD 316data8 0xAAAAAAAAAAABE3D2,0x00003FFD 317data8 0xFFFFFFFFFFFFDAB7,0x0000BFFD 318LOCAL_OBJECT_END(Constants_Q) 319 320// 1/ln10_hi, 1/ln10_lo 321 322LOCAL_OBJECT_START(Constants_1_by_LN10) 323data8 0xDE5BD8A937287195,0x00003FFD 324data8 0xD56EAABEACCF70C8,0x00003FBB 325LOCAL_OBJECT_END(Constants_1_by_LN10) 326 327 328// Z1 - 16 bit fixed 329 330LOCAL_OBJECT_START(Constants_Z_1) 331data4 0x00008000 332data4 0x00007879 333data4 0x000071C8 334data4 0x00006BCB 335data4 0x00006667 336data4 0x00006187 337data4 0x00005D18 338data4 0x0000590C 339data4 0x00005556 340data4 0x000051EC 341data4 0x00004EC5 342data4 0x00004BDB 343data4 0x00004925 344data4 0x0000469F 345data4 0x00004445 346data4 0x00004211 347LOCAL_OBJECT_END(Constants_Z_1) 348 349// G1 and H1 - IEEE single and h1 - IEEE double 350 351LOCAL_OBJECT_START(Constants_G_H_h1) 352data4 0x3F800000,0x00000000 353data8 0x0000000000000000 354data4 0x3F70F0F0,0x3D785196 355data8 0x3DA163A6617D741C 356data4 0x3F638E38,0x3DF13843 357data8 0x3E2C55E6CBD3D5BB 358data4 0x3F579430,0x3E2FF9A0 359data8 0xBE3EB0BFD86EA5E7 360data4 0x3F4CCCC8,0x3E647FD6 361data8 0x3E2E6A8C86B12760 362data4 0x3F430C30,0x3E8B3AE7 363data8 0x3E47574C5C0739BA 364data4 0x3F3A2E88,0x3EA30C68 365data8 0x3E20E30F13E8AF2F 366data4 0x3F321640,0x3EB9CEC8 367data8 0xBE42885BF2C630BD 368data4 0x3F2AAAA8,0x3ECF9927 369data8 0x3E497F3497E577C6 370data4 0x3F23D708,0x3EE47FC5 371data8 0x3E3E6A6EA6B0A5AB 372data4 0x3F1D89D8,0x3EF8947D 373data8 0xBDF43E3CD328D9BE 374data4 0x3F17B420,0x3F05F3A1 375data8 0x3E4094C30ADB090A 376data4 0x3F124920,0x3F0F4303 377data8 0xBE28FBB2FC1FE510 378data4 0x3F0D3DC8,0x3F183EBF 379data8 0x3E3A789510FDE3FA 380data4 0x3F088888,0x3F20EC80 381data8 0x3E508CE57CC8C98F 382data4 0x3F042108,0x3F29516A 383data8 0xBE534874A223106C 384LOCAL_OBJECT_END(Constants_G_H_h1) 385 386// Z2 - 16 bit fixed 387 388LOCAL_OBJECT_START(Constants_Z_2) 389data4 0x00008000 390data4 0x00007F81 391data4 0x00007F02 392data4 0x00007E85 393data4 0x00007E08 394data4 0x00007D8D 395data4 0x00007D12 396data4 0x00007C98 397data4 0x00007C20 398data4 0x00007BA8 399data4 0x00007B31 400data4 0x00007ABB 401data4 0x00007A45 402data4 0x000079D1 403data4 0x0000795D 404data4 0x000078EB 405LOCAL_OBJECT_END(Constants_Z_2) 406 407// G2 and H2 - IEEE single and h2 - IEEE double 408 409LOCAL_OBJECT_START(Constants_G_H_h2) 410data4 0x3F800000,0x00000000 411data8 0x0000000000000000 412data4 0x3F7F00F8,0x3B7F875D 413data8 0x3DB5A11622C42273 414data4 0x3F7E03F8,0x3BFF015B 415data8 0x3DE620CF21F86ED3 416data4 0x3F7D08E0,0x3C3EE393 417data8 0xBDAFA07E484F34ED 418data4 0x3F7C0FC0,0x3C7E0586 419data8 0xBDFE07F03860BCF6 420data4 0x3F7B1880,0x3C9E75D2 421data8 0x3DEA370FA78093D6 422data4 0x3F7A2328,0x3CBDC97A 423data8 0x3DFF579172A753D0 424data4 0x3F792FB0,0x3CDCFE47 425data8 0x3DFEBE6CA7EF896B 426data4 0x3F783E08,0x3CFC15D0 427data8 0x3E0CF156409ECB43 428data4 0x3F774E38,0x3D0D874D 429data8 0xBE0B6F97FFEF71DF 430data4 0x3F766038,0x3D1CF49B 431data8 0xBE0804835D59EEE8 432data4 0x3F757400,0x3D2C531D 433data8 0x3E1F91E9A9192A74 434data4 0x3F748988,0x3D3BA322 435data8 0xBE139A06BF72A8CD 436data4 0x3F73A0D0,0x3D4AE46F 437data8 0x3E1D9202F8FBA6CF 438data4 0x3F72B9D0,0x3D5A1756 439data8 0xBE1DCCC4BA796223 440data4 0x3F71D488,0x3D693B9D 441data8 0xBE049391B6B7C239 442LOCAL_OBJECT_END(Constants_G_H_h2) 443 444// G3 and H3 - IEEE single and h3 - IEEE double 445 446LOCAL_OBJECT_START(Constants_G_H_h3) 447data4 0x3F7FFC00,0x38800100 448data8 0x3D355595562224CD 449data4 0x3F7FF400,0x39400480 450data8 0x3D8200A206136FF6 451data4 0x3F7FEC00,0x39A00640 452data8 0x3DA4D68DE8DE9AF0 453data4 0x3F7FE400,0x39E00C41 454data8 0xBD8B4291B10238DC 455data4 0x3F7FDC00,0x3A100A21 456data8 0xBD89CCB83B1952CA 457data4 0x3F7FD400,0x3A300F22 458data8 0xBDB107071DC46826 459data4 0x3F7FCC08,0x3A4FF51C 460data8 0x3DB6FCB9F43307DB 461data4 0x3F7FC408,0x3A6FFC1D 462data8 0xBD9B7C4762DC7872 463data4 0x3F7FBC10,0x3A87F20B 464data8 0xBDC3725E3F89154A 465data4 0x3F7FB410,0x3A97F68B 466data8 0xBD93519D62B9D392 467data4 0x3F7FAC18,0x3AA7EB86 468data8 0x3DC184410F21BD9D 469data4 0x3F7FA420,0x3AB7E101 470data8 0xBDA64B952245E0A6 471data4 0x3F7F9C20,0x3AC7E701 472data8 0x3DB4B0ECAABB34B8 473data4 0x3F7F9428,0x3AD7DD7B 474data8 0x3D9923376DC40A7E 475data4 0x3F7F8C30,0x3AE7D474 476data8 0x3DC6E17B4F2083D3 477data4 0x3F7F8438,0x3AF7CBED 478data8 0x3DAE314B811D4394 479data4 0x3F7F7C40,0x3B03E1F3 480data8 0xBDD46F21B08F2DB1 481data4 0x3F7F7448,0x3B0BDE2F 482data8 0xBDDC30A46D34522B 483data4 0x3F7F6C50,0x3B13DAAA 484data8 0x3DCB0070B1F473DB 485data4 0x3F7F6458,0x3B1BD766 486data8 0xBDD65DDC6AD282FD 487data4 0x3F7F5C68,0x3B23CC5C 488data8 0xBDCDAB83F153761A 489data4 0x3F7F5470,0x3B2BC997 490data8 0xBDDADA40341D0F8F 491data4 0x3F7F4C78,0x3B33C711 492data8 0x3DCD1BD7EBC394E8 493data4 0x3F7F4488,0x3B3BBCC6 494data8 0xBDC3532B52E3E695 495data4 0x3F7F3C90,0x3B43BAC0 496data8 0xBDA3961EE846B3DE 497data4 0x3F7F34A0,0x3B4BB0F4 498data8 0xBDDADF06785778D4 499data4 0x3F7F2CA8,0x3B53AF6D 500data8 0x3DCC3ED1E55CE212 501data4 0x3F7F24B8,0x3B5BA620 502data8 0xBDBA31039E382C15 503data4 0x3F7F1CC8,0x3B639D12 504data8 0x3D635A0B5C5AF197 505data4 0x3F7F14D8,0x3B6B9444 506data8 0xBDDCCB1971D34EFC 507data4 0x3F7F0CE0,0x3B7393BC 508data8 0x3DC7450252CD7ADA 509data4 0x3F7F04F0,0x3B7B8B6D 510data8 0xBDB68F177D7F2A42 511LOCAL_OBJECT_END(Constants_G_H_h3) 512 513 514// Floating Point Registers 515 516FR_Input_X = f8 517 518FR_Y_hi = f34 519FR_Y_lo = f35 520 521FR_Scale = f36 522FR_X_Prime = f37 523FR_S_hi = f38 524FR_W = f39 525FR_G = f40 526 527FR_H = f41 528FR_wsq = f42 529FR_w4 = f43 530FR_h = f44 531FR_w6 = f45 532 533FR_G2 = f46 534FR_H2 = f47 535FR_poly_lo = f48 536FR_P8 = f49 537FR_poly_hi = f50 538 539FR_P7 = f51 540FR_h2 = f52 541FR_rsq = f53 542FR_P6 = f54 543FR_r = f55 544 545FR_log2_hi = f56 546FR_log2_lo = f57 547FR_p87 = f58 548FR_p876 = f58 549FR_p8765 = f58 550FR_float_N = f59 551FR_Q4 = f60 552 553FR_p43 = f61 554FR_p432 = f61 555FR_p4321 = f61 556FR_P4 = f62 557FR_G3 = f63 558FR_H3 = f64 559FR_h3 = f65 560 561FR_Q3 = f66 562FR_P3 = f67 563FR_Q2 = f68 564FR_P2 = f69 565FR_1LN10_hi = f70 566 567FR_Q1 = f71 568FR_P1 = f72 569FR_1LN10_lo = f73 570FR_P5 = f74 571FR_rcub = f75 572 573FR_Output_X_tmp = f76 574 575FR_X = f8 576FR_Y = f0 577FR_RESULT = f76 578 579 580// General Purpose Registers 581 582GR_ad_p = r33 583GR_Index1 = r34 584GR_Index2 = r35 585GR_signif = r36 586GR_X_0 = r37 587GR_X_1 = r38 588GR_X_2 = r39 589GR_Z_1 = r40 590GR_Z_2 = r41 591GR_N = r42 592GR_Bias = r43 593GR_M = r44 594GR_Index3 = r45 595GR_ad_p2 = r46 596GR_exp_mask = r47 597GR_exp_2tom7 = r48 598GR_ad_ln10 = r49 599GR_ad_tbl_1 = r50 600GR_ad_tbl_2 = r51 601GR_ad_tbl_3 = r52 602GR_ad_q = r53 603GR_ad_z_1 = r54 604GR_ad_z_2 = r55 605GR_ad_z_3 = r56 606 607// 608// Added for unwind support 609// 610 611GR_SAVE_PFS = r50 612GR_SAVE_B0 = r51 613GR_SAVE_GP = r52 614GR_Parameter_X = r53 615GR_Parameter_Y = r54 616GR_Parameter_RESULT = r55 617GR_Parameter_TAG = r56 618 619.section .text 620 621GLOBAL_IEEE754_ENTRY(logl) 622{ .mfi 623 alloc r32 = ar.pfs,0,21,4,0 624 fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test for natval, nan, inf 625 cmp.eq p7, p14 = r0, r0 // Set p7 if logl 626} 627{ .mfb 628 addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp 629 fnorm.s1 FR_X_Prime = FR_Input_X // Normalize x 630 br.cond.sptk LOGL_BEGIN 631} 632;; 633 634GLOBAL_IEEE754_END(logl) 635libm_alias_ldouble_other (__log, log) 636 637 638GLOBAL_IEEE754_ENTRY(log10l) 639{ .mfi 640 alloc r32 = ar.pfs,0,21,4,0 641 fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test for natval, nan, inf 642 cmp.ne p7, p14 = r0, r0 // Set p14 if log10l 643} 644{ .mfb 645 addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp 646 fnorm.s1 FR_X_Prime = FR_Input_X // Normalize x 647 nop.b 999 648} 649;; 650 651 652// Common code for logl and log10 653LOGL_BEGIN: 654{ .mfi 655 ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1 656 fclass.m p10, p0 = FR_Input_X, 0x0b // Test for denormal 657 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7 658} 659;; 660 661{ .mfb 662 getf.sig GR_signif = FR_Input_X // Get significand of x 663 fcmp.eq.s1 p9, p0 = FR_Input_X, f1 // Test for x=1.0 664(p6) br.cond.spnt LOGL_64_special // Branch for nan, inf, natval 665} 666;; 667 668{ .mfi 669 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1 670 fcmp.lt.s1 p13, p0 = FR_Input_X, f0 // Test for x<0 671 add GR_ad_p = -0x100, GR_ad_z_1 // Point to Constants_P 672} 673{ .mib 674 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2 675 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2 676(p10) br.cond.spnt LOGL_64_denormal // Branch for denormal 677} 678;; 679 680LOGL_64_COMMON: 681{ .mfi 682 add GR_ad_q = 0x080, GR_ad_p // Point to Constants_Q 683 fcmp.eq.s1 p8, p0 = FR_Input_X, f0 // Test for x=0 684 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif 685} 686{ .mfb 687 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3 688(p9) fma.s0 f8 = FR_Input_X, f0, f0 // If x=1, return +0.0 689(p9) br.ret.spnt b0 // Exit if x=1 690} 691;; 692 693{ .mfi 694 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1 695 fclass.nm p10, p0 = FR_Input_X, 0x1FF // Test for unsupported 696 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of significand 697} 698{ .mfi 699 ldfe FR_P8 = [GR_ad_p],16 // Load P_8 for near1 path 700 fsub.s1 FR_W = FR_X_Prime, f1 // W = x - 1 701 add GR_ad_ln10 = 0x060, GR_ad_q // Point to Constants_1_by_LN10 702} 703;; 704 705{ .mfi 706 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1 707 nop.f 999 708 mov GR_exp_mask = 0x1FFFF // Create exponent mask 709} 710{ .mib 711 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1 712 mov GR_Bias = 0x0FFFF // Create exponent bias 713(p13) br.cond.spnt LOGL_64_negative // Branch if x<0 714} 715;; 716 717{ .mfb 718 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1 719 fmerge.se FR_S_hi = f1,FR_X_Prime // Form |x| 720(p8) br.cond.spnt LOGL_64_zero // Branch if x=0 721} 722;; 723 724{ .mmb 725 getf.exp GR_N = FR_X_Prime // Get N = exponent of x 726 ldfd FR_h = [GR_ad_tbl_1] // Load h_1 727(p10) br.cond.spnt LOGL_64_unsupported // Branch for unsupported type 728} 729;; 730 731{ .mfi 732 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi 733 fcmp.eq.s0 p8, p0 = FR_Input_X, f0 // Dummy op to flag denormals 734 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1 735} 736;; 737 738// 739// For performance, don't use result of pmpyshr2.u for 4 cycles. 740// 741{ .mmi 742 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo 743(p14) ldfe FR_1LN10_hi = [GR_ad_ln10],16 // If log10l, load 1/ln10_hi 744 sub GR_N = GR_N, GR_Bias 745} 746;; 747 748{ .mmi 749 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4 750(p14) ldfe FR_1LN10_lo = [GR_ad_ln10] // If log10l, load 1/ln10_lo 751 nop.i 999 752} 753;; 754 755{ .mmi 756 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3 757 setf.sig FR_float_N = GR_N // Put integer N into rightmost significand 758 nop.i 999 759} 760;; 761 762{ .mmi 763 getf.exp GR_M = FR_W // Get signexp of w = x - 1 764 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2 765 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 766} 767;; 768 769{ .mmi 770 ldfe FR_Q1 = [GR_ad_q] // Load Q1 771 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2 772 add GR_ad_p2 = 0x30,GR_ad_p // Point to P_4 773} 774;; 775 776{ .mmi 777 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 778 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2 779 and GR_M = GR_exp_mask, GR_M // Get exponent of w = x - 1 780} 781;; 782 783{ .mmi 784 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2 785 cmp.lt p8, p9 = GR_M, GR_exp_2tom7 // Test |x-1| < 2^-7 786 nop.i 999 787} 788;; 789 790// Paths are merged. 791// p8 is for the near1 path: |x-1| < 2^-7 792// p9 is for regular path: |x-1| >= 2^-7 793 794{ .mmi 795 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2 796 nop.m 999 797 nop.i 999 798} 799;; 800 801{ .mmi 802(p8) ldfe FR_P7 = [GR_ad_p],16 // Load P_7 for near1 path 803(p8) ldfe FR_P4 = [GR_ad_p2],16 // Load P_4 for near1 path 804(p9) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2 805} 806;; 807 808// 809// For performance, don't use result of pmpyshr2.u for 4 cycles. 810// 811{ .mmi 812(p8) ldfe FR_P6 = [GR_ad_p],16 // Load P_6 for near1 path 813(p8) ldfe FR_P3 = [GR_ad_p2],16 // Load P_3 for near1 path 814 nop.i 999 815} 816;; 817 818{ .mmf 819(p8) ldfe FR_P5 = [GR_ad_p],16 // Load P_5 for near1 path 820(p8) ldfe FR_P2 = [GR_ad_p2],16 // Load P_2 for near1 path 821(p8) fmpy.s1 FR_wsq = FR_W, FR_W // wsq = w * w for near1 path 822} 823;; 824 825{ .mmi 826(p8) ldfe FR_P1 = [GR_ad_p2],16 ;; // Load P_1 for near1 path 827 nop.m 999 828(p9) extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 829} 830;; 831 832{ .mfi 833(p9) shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3 834(p9) fcvt.xf FR_float_N = FR_float_N 835 nop.i 999 836} 837;; 838 839{ .mfi 840(p9) ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3 841 nop.f 999 842 nop.i 999 843} 844;; 845 846{ .mfi 847(p9) ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3 848(p9) fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 849 nop.i 999 850} 851{ .mfi 852 nop.m 999 853(p9) fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 854 nop.i 999 855} 856;; 857 858{ .mmf 859 nop.m 999 860 nop.m 999 861(p9) fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 862} 863;; 864 865{ .mfi 866 nop.m 999 867(p8) fmpy.s1 FR_w4 = FR_wsq, FR_wsq // w4 = w^4 for near1 path 868 nop.i 999 869} 870{ .mfi 871 nop.m 999 872(p8) fma.s1 FR_p87 = FR_W, FR_P8, FR_P7 // p87 = w * P8 + P7 873 nop.i 999 874} 875;; 876 877{ .mfi 878 nop.m 999 879(p8) fma.s1 FR_p43 = FR_W, FR_P4, FR_P3 // p43 = w * P4 + P3 880 nop.i 999 881} 882;; 883 884{ .mfi 885 nop.m 999 886(p9) fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 887 nop.i 999 888} 889{ .mfi 890 nop.m 999 891(p9) fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 892 nop.i 999 893} 894;; 895 896{ .mfi 897 nop.m 999 898(p9) fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 899 nop.i 999 900} 901{ .mfi 902 nop.m 999 903(p8) fmpy.s1 FR_w6 = FR_w4, FR_wsq // w6 = w^6 for near1 path 904 nop.i 999 905} 906;; 907 908{ .mfi 909 nop.m 999 910(p8) fma.s1 FR_p432 = FR_W, FR_p43, FR_P2 // p432 = w * p43 + P2 911 nop.i 999 912} 913{ .mfi 914 nop.m 999 915(p8) fma.s1 FR_p876 = FR_W, FR_p87, FR_P6 // p876 = w * p87 + P6 916 nop.i 999 917} 918;; 919 920{ .mfi 921 nop.m 999 922(p9) fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1 923 nop.i 999 924} 925{ .mfi 926 nop.m 999 927(p9) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi = N * log2_hi + H 928 nop.i 999 929} 930;; 931 932{ .mfi 933 nop.m 999 934(p9) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h = N * log2_lo + h 935 nop.i 999 936} 937;; 938 939{ .mfi 940 nop.m 999 941(p8) fma.s1 FR_p4321 = FR_W, FR_p432, FR_P1 // p4321 = w * p432 + P1 942 nop.i 999 943} 944{ .mfi 945 nop.m 999 946(p8) fma.s1 FR_p8765 = FR_W, FR_p876, FR_P5 // p8765 = w * p876 + P5 947 nop.i 999 948} 949;; 950 951{ .mfi 952 nop.m 999 953(p9) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3 954 nop.i 999 955} 956{ .mfi 957 nop.m 999 958(p9) fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r 959 nop.i 999 960} 961;; 962 963{ .mfi 964 nop.m 999 965(p8) fma.s1 FR_Y_lo = FR_wsq, FR_p4321, f0 // Y_lo = wsq * p4321 966 nop.i 999 967} 968{ .mfi 969 nop.m 999 970(p8) fma.s1 FR_Y_hi = FR_W, f1, f0 // Y_hi = w for near1 path 971 nop.i 999 972} 973;; 974 975{ .mfi 976 nop.m 999 977(p9) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo = poly_lo * r + Q2 978 nop.i 999 979} 980{ .mfi 981 nop.m 999 982(p9) fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 983 nop.i 999 984} 985;; 986 987{ .mfi 988 nop.m 999 989(p8) fma.s1 FR_Y_lo = FR_w6, FR_p8765,FR_Y_lo // Y_lo = w6 * p8765 + w2 * p4321 990 nop.i 999 991} 992;; 993 994{ .mfi 995 nop.m 999 996(p9) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1 * rsq + r 997 nop.i 999 998} 999;; 1000 1001{ .mfi 1002 nop.m 999 1003(p9) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h // poly_lo = poly_lo*r^3 + h 1004 nop.i 999 1005} 1006;; 1007 1008{ .mfi 1009 nop.m 999 1010(p9) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo = poly_hi + poly_lo 1011 nop.i 999 1012} 1013;; 1014 1015// Remainder of code is common for near1 and regular paths 1016{ .mfi 1017 nop.m 999 1018(p7) fadd.s0 f8 = FR_Y_lo,FR_Y_hi // If logl, result=Y_lo+Y_hi 1019 nop.i 999 1020} 1021{ .mfi 1022 nop.m 999 1023(p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi 1024 nop.i 999 1025} 1026;; 1027 1028{ .mfi 1029 nop.m 999 1030(p14) fma.s1 FR_Output_X_tmp = FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp 1031 nop.i 999 1032} 1033;; 1034 1035{ .mfb 1036 nop.m 999 1037(p14) fma.s0 f8 = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp 1038 br.ret.sptk b0 // Common exit for 0 < x < inf 1039} 1040;; 1041 1042 1043// Here if x=+-0 1044LOGL_64_zero: 1045// 1046// If x=+-0 raise divide by zero and return -inf 1047// 1048{ .mfi 1049(p7) mov GR_Parameter_TAG = 0 1050 fsub.s1 FR_Output_X_tmp = f0, f1 1051 nop.i 999 1052} 1053;; 1054 1055{ .mfb 1056(p14) mov GR_Parameter_TAG = 6 1057 frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0 1058 br.cond.sptk __libm_error_region 1059} 1060;; 1061 1062LOGL_64_special: 1063{ .mfi 1064 nop.m 999 1065 fclass.m.unc p8, p0 = FR_Input_X, 0x1E1 // Test for natval, nan, +inf 1066 nop.i 999 1067} 1068;; 1069 1070// 1071// For SNaN raise invalid and return QNaN. 1072// For QNaN raise invalid and return QNaN. 1073// For +Inf return +Inf. 1074// 1075{ .mfb 1076 nop.m 999 1077(p8) fmpy.s0 f8 = FR_Input_X, f1 1078(p8) br.ret.sptk b0 // Return for natval, nan, +inf 1079} 1080;; 1081 1082// 1083// For -Inf raise invalid and return QNaN. 1084// 1085{ .mmi 1086(p7) mov GR_Parameter_TAG = 1 1087 nop.m 999 1088 nop.i 999 1089} 1090;; 1091 1092{ .mfb 1093(p14) mov GR_Parameter_TAG = 7 1094 fmpy.s0 FR_Output_X_tmp = FR_Input_X, f0 1095 br.cond.sptk __libm_error_region 1096} 1097;; 1098 1099// Here if x denormal or unnormal 1100LOGL_64_denormal: 1101{ .mmi 1102 getf.sig GR_signif = FR_X_Prime // Get significand of normalized input 1103 nop.m 999 1104 nop.i 999 1105} 1106;; 1107 1108{ .mmb 1109 getf.exp GR_N = FR_X_Prime // Get exponent of normalized input 1110 nop.m 999 1111 br.cond.sptk LOGL_64_COMMON // Branch back to common code 1112} 1113;; 1114 1115LOGL_64_unsupported: 1116// 1117// Return generated NaN or other value. 1118// 1119{ .mfb 1120 nop.m 999 1121 fmpy.s0 f8 = FR_Input_X, f0 1122 br.ret.sptk b0 1123} 1124;; 1125 1126// Here if -inf < x < 0 1127LOGL_64_negative: 1128// 1129// Deal with x < 0 in a special way - raise 1130// invalid and produce QNaN indefinite. 1131// 1132{ .mfi 1133(p7) mov GR_Parameter_TAG = 1 1134 frcpa.s0 FR_Output_X_tmp, p8 = f0, f0 1135 nop.i 999 1136} 1137;; 1138 1139{ .mib 1140(p14) mov GR_Parameter_TAG = 7 1141 nop.i 999 1142 br.cond.sptk __libm_error_region 1143} 1144;; 1145 1146 1147GLOBAL_IEEE754_END(log10l) 1148libm_alias_ldouble_other (__log10, log10) 1149 1150LOCAL_LIBM_ENTRY(__libm_error_region) 1151.prologue 1152{ .mfi 1153 add GR_Parameter_Y=-32,sp // Parameter 2 value 1154 nop.f 0 1155.save ar.pfs,GR_SAVE_PFS 1156 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs 1157} 1158{ .mfi 1159.fframe 64 1160 add sp=-64,sp // Create new stack 1161 nop.f 0 1162 mov GR_SAVE_GP=gp // Save gp 1163};; 1164{ .mmi 1165 stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack 1166 add GR_Parameter_X = 16,sp // Parameter 1 address 1167.save b0, GR_SAVE_B0 1168 mov GR_SAVE_B0=b0 // Save b0 1169};; 1170.body 1171{ .mib 1172 stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack 1173 add GR_Parameter_RESULT = 0,GR_Parameter_Y 1174 nop.b 0 // Parameter 3 address 1175} 1176{ .mib 1177 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack 1178 add GR_Parameter_Y = -16,GR_Parameter_Y 1179 br.call.sptk b0=__libm_error_support# // Call error handling function 1180};; 1181{ .mmi 1182 nop.m 999 1183 nop.m 999 1184 add GR_Parameter_RESULT = 48,sp 1185};; 1186{ .mmi 1187 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack 1188.restore sp 1189 add sp = 64,sp // Restore stack pointer 1190 mov b0 = GR_SAVE_B0 // Restore return address 1191};; 1192{ .mib 1193 mov gp = GR_SAVE_GP // Restore gp 1194 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs 1195 br.ret.sptk b0 // Return 1196};; 1197 1198LOCAL_LIBM_END(__libm_error_region#) 1199 1200.type __libm_error_support#,@function 1201.global __libm_error_support# 1202