.file "log2l.s"


// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.

// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 09/25/00 Initial version
// 11/22/00 Fixed accuracy bug (for mantissas near 1, 2)
// 12/07/00 Fixed C_1l constant, eliminated rounding errors in
//          reduced argument (x*frcpa(x)-1)
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
// long double log2l(long double)
//
// Overview of operation
//==============================================================
// Let x = 2^l * m, where     m=1.b1 b2 ... b8 b9 ... b52
//     y=frcpa(m),   r=m*y-1, f=b1 b2 .. b8
// T_high (T_table below, 256 entries) stores the 24 most significant
// bits of log2(1/y) in single precision format.
// T_low (256 entries) stores (log2(1/y)-T_high), rounded to double
// precision.
//
// f is used as the index into both tables; T_high[255]=T_low[255]=0
//
// If f=0 and b9=0, r is set to 2^{-8}* 0.b9 b10 ... b52 = m-1 (the
// fractional part of m), and 0 is used instead of T_high[0], T_low[0]
//     (polynomial evaluation only, for m=1+r, 0<=r<2^{-9})
// If f=255, r is set to (m-2)/2 and 1 is added to l (T[255]=0, so only
// polynomial evaluation is used for m=2(1-r'), 0<=r'<2^{-9})
//
// C1r, the rounded high part of C_1*r, is formed in one of two ways:
//   - mantissa in one of the two polynomial-only ranges above AND l=0
//     (x itself is near 1): C1r=C_1*r (rounded to 64 significant bits)
//   - every other input:    C1r=(2^{16}+C_1*r)-2^{16}
// When the tables are used (neither polynomial-only range applies) let
//     E=((RN(m*y)-1)-r)+(m*y-RN(m*y))
// otherwise E is taken as 0.
//
// Let D=C_1*r-C1r
//
// log2l(x) is approximated as
// (l+T_high[f]+C1r) +
//     (D+r*(C_1l+C_2*r+C_3*r^2...+C_8*r^7)+(T_low[f]+C_1*E))
//
// If the significand of x is exactly 1.0 (x is a power of two) only the
// exponent l is returned; x=1 returns +0 through an early exit.
//


// Special values
//==============================================================
//  log2l(0)=-inf, raises Divide by Zero
//  log2l(+inf)=inf
//  log2l(x)=NaN, raises Invalid if x<0
//


// Registers used
//==============================================================
//   f6-f8, f10-f15, f32-f36, f41-f43
//   r2-r3, r23-r29, r32-r40 (stacked, see alloc and the names below)
//   p6, p7, p8, p10, p12, p13
//


// Stacked registers used by __libm_error_region
GR_SAVE_B0           = r33
GR_SAVE_PFS          = r34
GR_SAVE_GP           = r35 // This reg. can safely be used
GR_SAVE_SP           = r36

// Outgoing arguments of __libm_error_support
GR_Parameter_X       = r37
GR_Parameter_Y       = r38
GR_Parameter_RESULT  = r39
GR_Parameter_TAG     = r40

FR_X                 = f10
FR_Y                 = f1
FR_RESULT            = f8




// Data tables
//==============================================================

RODATA

.align 16

// Byte offsets inside poly_coeffs, as consumed by the loads in log2l:
//    0: C_1 (extended)   16: C_7,C_8   32: C_5,C_6   48: C_3,C_4
//   64: C_1l (extended)  80: C_2 (extended)
// T_table follows directly at offset 96.
LOCAL_OBJECT_START(poly_coeffs)

data8 0xb8aa3b295c17f0bc, 0x00003fff // C_1
data8 0x3fca61762a7aded9, 0xbfc71547652b82fe // C_7, C_8
data8 0x3fd2776c50ef9bfe, 0xbfcec709dc3a03fd // C_5, C_6
data8 0x3fdec709dc3a03fd, 0xbfd71547652b82fe // C_3, C_4
// superseded C_1l value, kept for reference:
//data8 0xd871319ff0342580, 0x0000bfbd // C_1l (low part of C1)
data8 0x82f0025f2dc582ee, 0x0000bfbe // C_1l (low part of C1)
data8 0xb8aa3b295c17f0bc, 0x0000bffe // C_2
LOCAL_OBJECT_END(poly_coeffs)




// T_high: 256 single precision entries (1024 bytes), indexed by f
LOCAL_OBJECT_START(T_table)

data4 0x3b38d875, 0x3c0ae7f4, 0x3c67f738, 0x3ca2b253
data4 0x3ccbb91d, 0x3cfac91e, 0x3d1504a5, 0x3d29c4a0
data4 0x3d419264, 0x3d567aa6, 0x3d6e76ca, 0x3d81c3f7
data4 0x3d8c5630, 0x3d9876e9, 0x3da31e0a, 0x3dadcf09
data4 0x3db889f9, 0x3dc34eec, 0x3dce1df5, 0x3dd8f726
data4 0x3de3da94, 0x3deec851, 0x3df82ea4, 0x3e0197dd
data4 0x3e071dad, 0x3e0ca8ca, 0x3e116d6e, 0x3e170281
data4 0x3e1bcfbc, 0x3e216ee9, 0x3e2644dc, 0x3e2b1ee1
data4 0x3e30cd12, 0x3e35affd, 0x3e3a970f, 0x3e3f824f
data4 0x3e4544c0, 0x3e4a3926, 0x3e4f31d1, 0x3e542ec7
data4 0x3e593012, 0x3e5e35b7, 0x3e633fbf, 0x3e677625
data4 0x3e6c884b, 0x3e719eea, 0x3e76ba0a, 0x3e7bd9b2
data4 0x3e80111d, 0x3e82a523, 0x3e84ccec, 0x3e876533
data4 0x3e89ffd1, 0x3e8c2d22, 0x3e8e5c18, 0x3e90fd0a
data4 0x3e932fa9, 0x3e95d506, 0x3e980b5a, 0x3e9a4361
data4 0x3e9c7d1f, 0x3e9f2b16, 0x3ea168a0, 0x3ea3a7ea
data4 0x3ea5e8f5, 0x3ea82bc4, 0x3eaa705b, 0x3eacb6bb
data4 0x3eaefee7, 0x3eb148e3, 0x3eb394b1, 0x3eb5e255
data4 0x3eb831d0, 0x3eba8327, 0x3ebcd65c, 0x3ebeb3e0
data4 0x3ec10a7a, 0x3ec362f9, 0x3ec5bd63, 0x3ec7a0b3
data4 0x3ec9fe96, 0x3ecc5e6c, 0x3ece4619, 0x3ed0a978
data4 0x3ed293fe, 0x3ed4faf1, 0x3ed6e859, 0x3ed952eb
data4 0x3edb433c, 0x3eddb178, 0x3edfa4bc, 0x3ee19953
data4 0x3ee40cee, 0x3ee60484, 0x3ee7fd73, 0x3ee9f7bb
data4 0x3eec7280, 0x3eee6fda, 0x3ef06e94, 0x3ef26eb1
data4 0x3ef47031, 0x3ef67317, 0x3ef8f8b2, 0x3efafec5
data4 0x3efd0644, 0x3eff0f32, 0x3f008cc8, 0x3f0192b0
data4 0x3f029952, 0x3f03a0b0, 0x3f0466b2, 0x3f056f5a
data4 0x3f0678c0, 0x3f0782e6, 0x3f088dcc, 0x3f099973
data4 0x3f0aa5dd, 0x3f0b6fac, 0x3f0c7d6d, 0x3f0d8bf4
data4 0x3f0e575b, 0x3f0f673e, 0x3f1077e9, 0x3f1144ef
data4 0x3f1256fc, 0x3f1369d6, 0x3f143880, 0x3f154cc1
data4 0x3f161c7a, 0x3f173227, 0x3f1802f2, 0x3f191a0f
data4 0x3f19ebee, 0x3f1b047e, 0x3f1bd775, 0x3f1cf17b
data4 0x3f1dc58e, 0x3f1ee10f, 0x3f1fb63f, 0x3f208bea
data4 0x3f21a98f, 0x3f22805c, 0x3f2357a7, 0x3f247778
data4 0x3f254fe9, 0x3f2628d9, 0x3f270249, 0x3f2824fb
data4 0x3f28ff97, 0x3f29dab4, 0x3f2ab654, 0x3f2b9277
data4 0x3f2cb8c8, 0x3f2d961e, 0x3f2e73fa, 0x3f2f525b
data4 0x3f303143, 0x3f3110b1, 0x3f31f0a7, 0x3f32d125
data4 0x3f33b22b, 0x3f3493bc, 0x3f3575d6, 0x3f36587b
data4 0x3f373bab, 0x3f381f68, 0x3f3903b1, 0x3f39e888
data4 0x3f3acdec, 0x3f3bb3e0, 0x3f3c9a63, 0x3f3d8177
data4 0x3f3e1bd4, 0x3f3f03d9, 0x3f3fec71, 0x3f40d59b
data4 0x3f41bf59, 0x3f42a9ab, 0x3f434635, 0x3f443180
data4 0x3f451d61, 0x3f4609d9, 0x3f46a7d3, 0x3f479549
data4 0x3f488357, 0x3f492261, 0x3f4a1171, 0x3f4b011c
data4 0x3f4ba139, 0x3f4c91e8, 0x3f4d8334, 0x3f4e246a
data4 0x3f4f16be, 0x3f5009b1, 0x3f50ac02, 0x3f51a001
data4 0x3f524305, 0x3f533812, 0x3f53dbca, 0x3f54d1e7
data4 0x3f55c8a8, 0x3f566d85, 0x3f57655b, 0x3f580af0
data4 0x3f58b0d0, 0x3f59aa2c, 0x3f5a50c7, 0x3f5b4b3c
data4 0x3f5bf294, 0x3f5cee26, 0x3f5d963c, 0x3f5e92ed
data4 0x3f5f3bc3, 0x3f5fe4e7, 0x3f60e32d, 0x3f618d13
data4 0x3f623748, 0x3f63372a, 0x3f63e223, 0x3f648d6b
data4 0x3f658eee, 0x3f663afe, 0x3f66e75e, 0x3f67ea86
data4 0x3f6897b0, 0x3f69452c, 0x3f69f2f9, 0x3f6af847
data4 0x3f6ba6e2, 0x3f6c55d0, 0x3f6d0510, 0x3f6e0c8d
data4 0x3f6ebc9f, 0x3f6f6d04, 0x3f701dbe, 0x3f70cecd
data4 0x3f718030, 0x3f728ae6, 0x3f733d20, 0x3f73efaf
data4 0x3f74a296, 0x3f7555d3, 0x3f760967, 0x3f76bd53
data4 0x3f777197, 0x3f7880a1, 0x3f7935c2, 0x3f79eb3c
data4 0x3f7aa10f, 0x3f7b573b, 0x3f7c0dc2, 0x3f7cc4a3
data4 0x3f7d7bdf, 0x3f7e3376, 0x3f7eeb68, 0x00000000
LOCAL_OBJECT_END(T_table)



// T_low: 256 double precision entries, indexed by f
LOCAL_OBJECT_START(T_low)


data8 0x3dc0b97f689876ef, 0x3dfd5d906028ac01
data8 0x3df8b9cbb8d7240b, 0x3de0c941a2f220cd
data8 0x3e09c6aecba15936, 0x3dfa6d528241827c
data8 0x3dd0bad25714903c, 0x3e2776b01dc036a2
data8 0x3e2b914bc77f158b, 0x3e1c0fafd29dc74a
data8 0x3e28dadc119cd3de, 0x3e3bca869da085be
data8 0x3e19d1e700f2200a, 0x3e3e13530cc37504
data8 0x3e3936464d9c41ee, 0x3e3c3fa21c9499d0
data8 0x3e3259e079b6c6e8, 0x3e2a364069c4f7f3
data8 0x3e1274c84f6c6364, 0x3e3796170159f454
data8 0x3e26e1e389f4364e, 0x3e28cedda8c7f658
data8 0x3e376c2028433268, 0x3e4aee6d650c82e1
data8 0x3e33e65094fbeeb4, 0x3e4c7d125aa92c5d
data8 0x3e1559a4b69691d8, 0x3e18efabeb7d7221
data8 0x3e4c2b255abaa8de, 0x3e37436952a4538b
data8 0x3e4e6807f4ba00b8, 0x3e33ff5964190e42
data8 0x3e4f5d798cead43c, 0x3e4f3676443bf453
data8 0x3e4660f8d5bc1bf5, 0x3e2d4f9f3ab04f36
data8 0x3e357f7a64ccd537, 0x3e394caf7c9b05af
data8 0x3e225c7d17ab29b0, 0x3e4eb202f6d55a12
data8 0x3e32faa68b19bcd2, 0x3e45ee1c9b566a8b
data8 0x3e4770a67de054ff, 0x3e42234fb9de6d6b
data8 0x3e4ad139825c6e19, 0x3e47f3d334814a93
data8 0x3e2af1ec402867b6, 0x3e2bfbda0c956e3d
data8 0x3e4287b831e77ff2, 0x3e54bf0eb77f7b89
data8 0x3e5b9259a1029607, 0x3e4a764b015e699d
data8 0x3e4d0b68ea883ab5, 0x3e33e829ecdadf46
data8 0x3e52f27efef3031b, 0x3e3073979e4af89e
data8 0x3e3b980f2cd6c253, 0x3e2a5f0f5f7f66a9
data8 0x3e37788738117b02, 0x3e58aa29a784d52f
data8 0x3e4f5504c4ff2466, 0x3e002d40340fa647
data8 0x3e5f53b64592f4c3, 0x3e543f222c526802
data8 0x3e5680e547a872fa, 0x3e5e234bd1154450
data8 0x3e3000edc18b6d21, 0x3e1c3c1f000942a8
data8 0x3e51eeae0e442d6e, 0x3e4fb265376623f2
data8 0x3e57b5941782d830, 0x3e3a4b83f24ae52c
data8 0x3e5a5fb4f23978de, 0x3e51ed071563fb02
data8 0x3e49e2071f51a7a8, 0x3e5e43ae5b924234
data8 0x3dfa2be9aedf374a, 0x3e56dea3dbba67d5
data8 0x3e3375fe732b3c3e, 0x3e5a0c6f91f2e77e
data8 0x3e55e1bf1c969e41, 0x3e30a5a5166b8eee
data8 0x3e53e6e9a539d46c, 0x3e542981b3d7b0e6
data8 0x3e595fd8ff36ad64, 0x3e5edeb9e65cbbb4
data8 0x3e46aeab4d3434c1, 0x3e4ea3ff0564b010
data8 0x3e59b00be2e3c25a, 0x3e5b887cd7b0821f
data8 0x3e5f666668547b4d, 0x3e4d0733a805273f
data8 0x3e26a2ff21c4aec5, 0x3e4c336f7a3a78f3
data8 0x3e11ad12b628e2d0, 0x3e56d43ff3f0ea64
data8 0x3e238809433cccd2, 0x3e40d9734147d40f
data8 0x3e54245fe3e24e06, 0x3e251441fce4d48c
data8 0x3e517114efc5d1f9, 0x3e5e9a99154b0d82
data8 0x3e442a71337970f8, 0x3e420c7c69211fdf
data8 0x3e537e7d5d43c6a7, 0x3e4376c66ad9ad8b
data8 0x3e49054d678a4f1c, 0x3e5d23cb3bc19f18
data8 0x3e6ebcd449dcab2b, 0x3e67f5fc2849c88a
data8 0x3e63f388395d3e84, 0x3e65c1103b0ad7e9
data8 0x3e6d5d1dd031f353, 0x3e5a159dae75c4d0
data8 0x3e4d5e22aa75f71d, 0x3e5e379ee62e1e35
data8 0x3e4df082213cb2dc, 0x3e6bfa06c156f521
data8 0x3e66e2d3c19b517b, 0x3e426b7098590071
data8 0x3e541bd027e9854e, 0x3e5061dd924b0ac0
data8 0x3e6dae01df373a03, 0x3e3baec80b207b0b
data8 0x3e6b6a6fe06bebac, 0x3e61aebcfc3ab5d1
data8 0x3e584ee3e7c79d83, 0x3e6b3c1b2840cb40
data8 0x3e6c842085d6befd, 0x3e6ac04fd7b141e0
data8 0x3e6c48250474141d, 0x3e2d889b86125f69
data8 0x3e6e74740225dad0, 0x3e45940d31d50a7c
data8 0x3e695476a6c39ddc, 0x3e6d9a6d857a060a
data8 0x3e4a3e9bb4b69337, 0x3e484f3ce4707ed6
data8 0x3e39dd125d25fc27, 0x3e563fb400de8732
data8 0x3e5fdd6d0ee28b48, 0x3e669d15b869bb07
data8 0x3e40687cfad7964d, 0x3e69317990d43957
data8 0x3e633d57e24ae1bd, 0x3e618bf03710eabb
data8 0x3e4b4df6fccd1160, 0x3e3fb26ddaa1ec45
data8 0x3e3810a5e1817fd4, 0x3e6857373642fa5c
data8 0x3e673db6193add31, 0x3e63200c8acbc9c3
data8 0x3e3d2dee448ebb62, 0x3e6a19723a80db6a
data8 0x3e5e7cdab8fd3e6a, 0x3e671855cd660672
data8 0x3e473c3c78a85ecd, 0x3e5f5e23056a7cf2
data8 0x3e52538519527367, 0x3e4b573bcf2580e9
data8 0x3e6d6f856fe90c60, 0x3e2d932a8487642e
data8 0x3e5236fc78b6174c, 0x3e50cb91d406db50
data8 0x3e650e8bd562aa57, 0x3e424ee3d9a82f2e
data8 0x3e59363960e1e3d9, 0x3e379604c1150a3e
data8 0x3e6d914f6c2ac258, 0x3e62967a451a7b48
data8 0x3e684b5f01139cb2, 0x3e448bbfbf6d292c
data8 0x3e6227e7fb487e73, 0x3e6d39d50290f458
data8 0x3e58368342b4b668, 0x3e65dc0c25bd1763
data8 0x3e61b7dc362e22b5, 0x3e671691f094bb80
data8 0x3e5011642d5123f2, 0x3e4c4eb7f11e41be
data8 0x3e5dcee36ca242cf, 0x3e6791cefff688f1
data8 0x3e60e23c8dda4ecd, 0x3e48e6a22fe78cfe
data8 0x3e6d703f244adc86, 0x3e6a281a85a5049d
data8 0x3e570f20e6403d9e, 0x3e2211518a12956f
data8 0x3e6737d1e54d71df, 0x3e66b1881476f5e9
data8 0x3e6e1bbeef085376, 0x3e47cad4944a32be
data8 0x3e527f2c738e7ee9, 0x3e699883a4b9fb29
data8 0x3e5c17d1108740d9, 0x3e5d4a9c79a43389
data8 0x3e49fdc24462ba3b, 0x3e24dbb3a60cceb2
data8 0x3e5c5bf618780748, 0x3e5c38005b0c778c
data8 0x3e6be168dd6dd3fe, 0x3e633ab9370693b0
data8 0x3dd290556b0ae339, 0x3e607c317927096a
data8 0x3e59651353b3d90e, 0x3e4d8751e5e0ae0d
data8 0x3e46c81023272a85, 0x3e6b23c988f391b2
data8 0x3e608741d215209c, 0x3e60b8ba506d758f
data8 0x3e62ddbe74803297, 0x3e5dbb8b5087587d
data8 0x3e642aa529048131, 0x3e3dcbda6835dcf4
data8 0x3e6db503ce854d2a, 0x3e6dd00b49bc6849
data8 0x3e4db2f11243bc84, 0x3e3b9848efc2ea97
data8 0x3e58f18e17c82609, 0x3e6ed8645e16c312
data8 0x3e4065bdb60a5dd4, 0x3e490453c6e6c30a
data8 0x3e62373994aa31ba, 0x3e56305f0e6b2a95
data8 0x3e68c1601a6614ee, 0x3e614e204f19d93f
data8 0x3e6e5037ca773299, 0x3e693f98892561a6
data8 0x3e639de4f4bf700d, 0x3e416c071e93fd97
data8 0x3e65466991b415ef, 0x3e6896a324afac9d
data8 0x3e44f64802e2f11c, 0x3e64d7d747e2191a
data8 0x3e6174b7581de84c, 0x3e44c7b946e1d43c
data8 0x3e6a3bcbe30512ec, 0x3e5d3ed411c95ce4
data8 0x3e3e5b5735cfaf8e, 0x3e6e538ab34efb51
data8 0x3e514e204f19d93f, 0x3e5a88e6550c89a4
data8 0x3e66b97a5d9dfd8b, 0x3e5f46b1e14ebaf3
data8 0x3e357665f6893f5d, 0x3e6bbf633078d1d5
data8 0x3e5e7337a212c417, 0x3e3570fde15fc8cc
data8 0x3e21119402da92b4, 0x3e6566e830d1ff3b
data8 0x3e558883e480e220, 0x3e589ca3a68da411
data8 0x3e44eb66df73d648, 0x3e1a0a629b1b7e68
data8 0x3e54cc207b8c1116, 0x0000000000000000
LOCAL_OBJECT_END(T_low)


//==============================================================
// long double log2l(long double x)
//   In:   f8 = x
//   Out:  f8 = log2l(x)
//
// Register roles on the main path (several registers are reused):
//   f6   r                         f7   normalized x -> m -> T_high
//   f41  y -> x*y-RN(x*y)          f43  RN(x*y) -> RN(x*y)-1 -> (..)-r
//   f14  C_1      f34  C_1l -> P12       f35  C_2
//   f15  C_3 -> P34 -> P14         f42  C_4
//   f12  C_5 -> D -> D+T_low       f13  C_6 -> P56 -> P58 -> P18 -> low sum
//   f10  C_7 -> P78                f11  C_8 -> r^2 -> r^4
//   f32  2^16 -> C1r               f33  0, or 2^16+C_1*r
//   f36  0.5 -> T_low (plus corrections)
//   r2   coefficient pointer -> T_high entry address
//   r3   coefficient pointer -> T_low entry address
//   r25  significand   r28  table index f   r29  exponent -> l
//   r23, r26, r27      constants (see the individual instructions)
//   r24  written with bias-8; no later read appears in this file
//==============================================================
.section .text
GLOBAL_IEEE754_ENTRY(log2l)

{ .mfi
      alloc         r32=ar.pfs,1,4,4,0          // 1 input, 4 locals, 4 outputs
      frcpa.s1      f41,p0=f1,f8                // f41 = y = frcpa(x)
      mov           r26=0xfffe                  // r26 = bias-1
}
{ .mfi
      mov           r23=0xffff+16               // r23 = bias+16
      fma.s1        f7=f8,f1,f0                 // f7 = x*1+0 (normalized x)
      addl          r2 = @ltoff(poly_coeffs), gp;; // r2 = linkage-table slot
}
{ .mfi
      getf.sig      r25=f8                      // r25 = significand of x
      fclass.m      p8,p10=f8,0x9               // p8: x is a positive denormal
      mov           r24=0xffff-8;;              // r24 = bias-8
}
{ .mfi
      setf.exp      f36=r26                     // f36 = 2^{-1}
      nop.f         0
      mov           r27=0xffff;;                // r27 = bias
}

{ .mmf
      getf.exp      r29=f8                      // r29 = biased exponent of x
      ld8           r2=[r2]                     // r2 = address of poly_coeffs
      // p12 = input is NOT a positive normal/unnormal number
      fclass.m.unc  p0,p12 = f8, 0x19;;
}


.pred.rel "mutex",p8,p10
{ .mfi
(p8)  getf.sig      r25=f7                      // denormal: redo on normalized x
      fcmp.eq.s0    p6,p0=f8,f1                 // p6: x=1 ?
(p10) shr.u         r28=r25,63-8                // leading 1 plus 8 index bits
}
{ .mfi
      setf.exp      f32=r23                     // f32 = 2^16
      nop.f         0
      mov           r26=0x804;;
}

{ .mfi
(p8)  getf.exp      r29=f7                      // denormal: redo on normalized x
      mov           f33=f0                      // f33 = 0
      // r26 = 0x80400...0: smallest significand with one of b1..b9 set
      shl           r26=r26,64-12;;
}

{ .mfb
      add           r3=16,r2                    // r3 -> C_7, C_8
      fms.s1        f6=f41,f8,f1                // r = x*y-1
(p12) br.cond.spnt  SPECIAL_log2l               // 0, negative, infinity, NaN
}
{ .mfi
      ldfe          f14=[r2],48                 // f14 = C_1;  r2 -> C_3, C_4
      fma.s1        f43=f41,f8,f0               // f43 = RN(x*y)
      mov           r23=0xff;;                  // r23 = 8-bit index mask
}

{ .mmi
      ldfpd         f10,f11=[r3],16             // C_7, C_8;   r3 -> C_5, C_6
      ldfpd         f15,f42=[r2],16             // C_3, C_4;   r2 -> C_1l
(p8)  shr.u         r28=r25,63-8;;              // denormal: index from new r25
}


{ .mfi
      ldfpd         f12,f13=[r3]                // C_5, C_6
      fcmp.eq.s0    p7,p0=f7,f0                 // p7: pseudo-zero ?
      // p8: b1..b9 all zero (significand below threshold); p12: otherwise
      cmp.ltu       p8,p12=r25,r26
}
{ .mfi
      ldfe          f34=[r2],16                 // f34 = C_1l; r2 -> C_2
      fmerge.se     f7=f1,f7                    // f7 = m (exponent of 1.0)
      and           r28=r28,r23;;               // r28 = f = b1..b8
}
{ .mfi
      sub           r29=r29,r27                 // r29 = exponent-bias = l
      // b1..b9 all zero: polynomial only, with r = m-1
(p8)  fms.s1        f6=f7,f1,f1
      add           r3=1024+16,r2               // r3 = start address of T_low
}
{ .mfi
      ldfe          f35=[r2],16                 // f35 = C_2;  r2 -> T_table
(p6)  fma.s0        f8=f0,f0,f0                 // x=1: result is 0
      cmp.eq        p10,p0=r23,r28;;            // p10: b1..b8 all ones ?
}

{ .mfb
      // b1..b8 all ones: polynomial only; add 1 to the exponent term
(p10) add           r29=1,r29
      nop.f         0
(p7)  br.cond.spnt  LOG2_PSEUDO_ZERO
}
{ .mfi
      shladd        r3=r28,3,r3                 // r3 = &T_low[f]
(p10) fms.s1        f6=f7,f36,f1                // r = m/2-1
(p10) cmp.eq        p8,p12=r0,r0;;              // p10 --> p8=1, p12=0
}

{ .mfi
      shladd        r2=r28,2,r2                 // r2 = &T_high[f]
      fms.s1        f41=f41,f8,f43              // L(x*y) = x*y-RN(x*y)
      nop.i         0
}
{ .mfi
      // p13 = p12: the table path, which also gets the E correction
(p12) cmp.eq.unc    p13,p0=r0,r0
      // RtH = RN(x*y)-1 (will eliminate rounding errors in r)
      fms.s1        f43=f43,f1,f1
      nop.i         0;;
}

.pred.rel "mutex",p8,p12
{ .mfb
(p12) ldfs          f7=[r2]                     // table path: f7 = T_high[f]
(p8)  fma.s1        f7=f0,f0,f0                 // polynomial only: T_high = 0
(p6)  br.ret.spnt   b0                          // x=1: return
}
.pred.rel "mutex",p8,p12
{ .mfi
(p12) ldfd          f36=[r3]                    // table path: f36 = T_low[f]
(p8)  fma.s1        f36=f0,f0,f0                // polynomial only: T_low = 0
      // polynomial only: keep p8 only when l=0, otherwise switch to p12
(p8)  cmp.eq        p8,p12=r29,r0;;
}

.pred.rel "mutex",p8,p12
{ .mfi
      setf.sig      f8=r29                      // f8 = l (as integer)
(p12) fma.s1        f33=f6,f14,f32              // p12: 2^{16}+C_1*r
      nop.i         0
}
{ .mfi
      mov           r26=1
(p8)  fma.s1        f32=f6,f14,f33              // p8: 0+C_1*r (f33 holds 0)
      nop.i         0;;
}

{ .mfi
      nop.m         0
      fma.s1        f10=f11,f6,f10              // P78 = C_7+C_8*r
      shl           r26=r26,63                  // r26 = 2^{63}
}
{ .mfi
      nop.m         0
      fma.s1        f15=f42,f6,f15              // P34 = C_3+r*C_4
      nop.i         0;;
}
{ .mfi
      nop.m         0
      fma.s1        f11=f6,f6,f0                // r2 = r*r
      nop.i         0
}
{ .mfi
      nop.m         0
      fma.s1        f13=f13,f6,f12              // P56 = C_5+C_6*r
      nop.i         0;;
}

{ .mfi
      nop.m         0
(p13) fms.s1        f43=f43,f1,f6               // RtH-r
      nop.i         0
}
{ .mfi
      // p6: significand(x) != 2^{63}, i.e. x is not an exact power of two
      cmp.eq        p0,p6=r25,r26
      fma.s1        f34=f35,f6,f34              // P12 = C_1l+C_2*r
      nop.i         0;;
}

.pred.rel "mutex",p8,p12
{ .mfi
      nop.m         0
(p12) fms.s1        f32=f33,f1,f32              // p12: C1r=(2^{16}+C_1*r)-2^{16}
      nop.i         0
}
{ .mfi
      nop.m         0
(p8)  fms.s1        f32=f32,f1,f33              // p8: C1r=C_1*r (f33 holds 0)
      nop.i         0;;
}

{ .mfi
      nop.m         0
(p13) fma.s1        f36=f41,f14,f36             // L(x*y)*C_1+T_low
      nop.i         0
}
{ .mfi
      nop.m         0
      fma.s1        f13=f11,f10,f13             // P58 = P56+r2*P78
      nop.i         0;;
}
{ .mfi
      nop.m         0
      fma.s1        f15=f15,f11,f34             // P14 = P12+r2*P34
      nop.i         0
}
{ .mfi
      nop.m         0
      fma.s1        f11=f11,f11,f0              // r4 = r2*r2
      nop.i         0;;
}

{ .mfi
      nop.m         0
      fcvt.xf       f8=f8                       // l converted to floating point
      nop.i         0;;
}


{ .mfi
      nop.m         0
(p6)  fms.s1        f12=f14,f6,f32              // D = C_1*r-C1r
      nop.i         0;;
}

{ .mfi
      nop.m         0
      // T_low' = (RtH-r)*C_1+(L(x*y)*C_1+T_low)
(p13) fma.s1        f36=f43,f14,f36
      nop.i         0;;
}
{ .mfi
      nop.m         0
(p6)  fma.s1        f13=f11,f13,f15             // P18 = P14+r4*P58
      nop.i         0;;
}

{ .mfi
      nop.m         0
(p6)  fma.s1        f8=f8,f1,f7                 // T_high+l
      nop.i         0;;
}


{ .mfi
      nop.m         0
(p6)  fma.s1        f12=f12,f1,f36              // D+T_low
      nop.i         0;;
}


{ .mfi
      nop.m         0
(p6)  fma.s1        f8=f8,f1,f32                // (T_high+l)+C1r
      nop.i         0
}
{ .mfi
      nop.m         0
(p6)  fma.s1        f13=f13,f6,f12              // (D+T_low)+r*P18
      nop.i         0;;
}


{ .mfb
      nop.m         0
      // result = ((T_high+l)+C1r)+((D+T_low)+r*P18);
      // when p6 is clear f8 already holds l and is returned unchanged
(p6)  fma.s0        f8=f13,f1,f8
      br.ret.sptk   b0;;                        // return
}


// Inputs that are not positive normal/unnormal numbers
SPECIAL_log2l:
{ .mfi
      nop.m         0
      mov           FR_X=f8                     // keep x for the error handler
      nop.i         0
}
{ .mfi
      nop.m         0
      fclass.m      p7,p0=f8,0x21               // x=+Infinity ?
      nop.i         0;;
}
{ .mfi
      nop.m         0
      fclass.m      p8,p0=f7,0x7                // x=+/-Zero ? (tests normalized x)
      nop.i         0;;
}
{ .mfi
      nop.m         0
      fclass.m      p6,p0=f8,0x3a               // x=-Infinity, -normal, -denormal ?
      nop.i         0;;
}
{ .mfb
      nop.m         0
      nop.f         0
(p7)  br.ret.spnt   b0;;                        // log2l(+Infinity)=+Infinity
}
{ .mfi
(p8)  mov           GR_Parameter_TAG = 168
      // log2l(+/-0)=-infinity, raises Divide by Zero
(p8)  fmerge.ns     f8=f0,f8                    // set f8=-0
      nop.i         0;;
}
{ .mfb
      nop.m         0
(p8)  frcpa.s0      f8,p0=f1,f8                 // 1/(-0)
(p8)  br.cond.sptk  __libm_error_region;;
}
{ .mfb
(p6)  mov           GR_Parameter_TAG = 169
      // x<0: return NaN, raise Invalid
(p6)  frcpa.s0      f8,p0=f0,f0                 // 0/0
(p6)  br.cond.sptk  __libm_error_region;;
}


{ .mfb
      nop.m         0
      fma.s0        f8=f8,f1,f0                 // Remaining cases: NaNs
      br.ret.sptk   b0;;
}

// Normalized x compares equal to 0 although x passed the class test above
LOG2_PSEUDO_ZERO:

{ .mfi
      nop.m         0
      mov           FR_X=f8                     // keep x for the error handler
      nop.i         0
}
{ .mfi
      mov           GR_Parameter_TAG = 168
      // log2l(+/-0)=-infinity, raises Divide by Zero
      fmerge.ns     f8=f0,f8                    // set f8=-0
      nop.i         0;;
}
{ .mfb
      nop.m         0
      frcpa.s0      f8,p0=f1,f8
      br.cond.sptk  __libm_error_region;;
}


GLOBAL_IEEE754_END(log2l)
libm_alias_ldouble_other (__log2, log2)


//==============================================================
// __libm_error_region
//   Entered by branch from log2l with FR_X = input, FR_RESULT (f8) =
//   default result and GR_Parameter_TAG = error tag.
//   Creates a 64-byte frame holding FR_X at sp+16, FR_Y at sp+32 and
//   FR_RESULT at sp+48, passes the three addresses and the tag to
//   __libm_error_support, reloads f8 from sp+48 and returns to the
//   caller of log2l.
//==============================================================
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
      add           GR_Parameter_Y=-32,sp       // Parameter 2 address (new sp+32)
      nop.f         0
.save   ar.pfs,GR_SAVE_PFS
      mov           GR_SAVE_PFS=ar.pfs          // Save ar.pfs
}
{ .mfi
.fframe 64
      add           sp=-64,sp                   // Create new stack frame
      nop.f         0
      mov           GR_SAVE_GP=gp               // Save gp
};;
{ .mmi
      stfe          [GR_Parameter_Y] = FR_Y,16  // Store parameter 2; pointer -> sp+48
      add           GR_Parameter_X = 16,sp      // Parameter 1 address
.save   b0, GR_SAVE_B0
      mov           GR_SAVE_B0=b0               // Save b0
};;
.body
{ .mib
      stfe          [GR_Parameter_X] = FR_X     // Store parameter 1 on stack
      add           GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
      nop.b         0
}
{ .mib
      stfe          [GR_Parameter_Y] = FR_RESULT // Store parameter 3 on stack
      add           GR_Parameter_Y = -16,GR_Parameter_Y // back to parameter 2
      br.call.sptk  b0=__libm_error_support#    // Call error handling function
};;
{ .mmi
      nop.m         0
      nop.m         0
      add           GR_Parameter_RESULT = 48,sp
};;
{ .mmi
      ldfe          f8 = [GR_Parameter_RESULT]  // Get return result off stack
.restore sp
      add           sp = 64,sp                  // Restore stack pointer
      mov           b0 = GR_SAVE_B0             // Restore return address
};;
{ .mib
      mov           gp = GR_SAVE_GP             // Restore gp
      mov           ar.pfs = GR_SAVE_PFS        // Restore ar.pfs
      br.ret.sptk   b0                          // Return
};;

LOCAL_LIBM_END(__libm_error_region)
.type   __libm_error_support#,@function
.global __libm_error_support#