.file "tanh.s"


// Copyright (c) 2001 - 2005, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.

// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================================
// 05/30/01 Initial version
// 12/04/01 Rewritten version with erf-like algorithm.
//          Performance improved.
// 05/20/02 Cleaned up namespace and sf0 syntax
// 08/14/02 Changed mli templates to mlx
// 02/10/03 Reordered header: .section, .global, .proc, .align
// 03/31/05 Reformatted delimiters between data tables
//
// API
//==============================================================================
// double tanh(double)
//
// Overview of operation
//==============================================================================
//
// Algorithm description
// ---------------------
//
// There are 4 paths:
//
// 1. Special path: x = 0, Inf, NaNs, denormals
//       Return tanh(x) = +/-0.0 for zeros
//       Return tanh(x) = QNaN for NaNs
//       Return tanh(x) = sign(x)*1.0 for Inf
//       Return tanh(x) = x + x^2 for - denormals
//       Return tanh(x) = x - x^2 for + denormals
//
// 2. Near zero path: 0.0 < |x| < 0.25
//       Return tanh(x) = x + x^3*A3 + ... + x^19*A19
//
// 3. Main path: 0.25 <= |x| < 19.0625
//       For several ranges of 0.25 <= |x| < 19.0625
//       Return tanh(x) = sign(x)*(A0 + y*A1 + y^2*A2 +
//                                    + y^3*A3 + ... + y^19*A19)
//       where y = (|x|/a) - b
//
//       For each range there is a particular set of coefficients.
//       Below is the list of ranges:
//        1/4  <= |x| < 1/2       a = 0.25, b = 1.0
//        1/2  <= |x| < 1.0       a = 0.5,  b = 1.0
//        1.0  <= |x| < 2.0       a = 1.0,  b = 1.0
//        2.0  <= |x| < 3.25      a = 2.0,  b = 1.0
//        3.25 <= |x| < 4.0       a = 2.0,  b = 2.0
//        4.0  <= |x| < 6.5       a = 4.0,  b = 1.0
//        6.5  <= |x| < 8.0       a = 4.0,  b = 2.0
//        8.0  <= |x| < 13.0      a = 8.0,  b = 1.0
//        13.0 <= |x| < 16.0      a = 8.0,  b = 2.0
//        16.0 <= |x| < 19.0625   a = 16.0, b = 1.0
//       ( the [3.25;4.0], [6.5;8.0], [13.0;16.0] subranges are separated
//         out to resolve monotonicity issues )
//
// 4. Saturation path: 19.0625 <= |x| < +INF
//       Return tanh(x) = sign(x)*(1.0 - tiny_value)
//       (tiny_value ~ 2^(-63))
//
// Registers used
//==============================================================================
// Floating Point registers used:
// f8 = input, output
// f32 -> f64
//
// General registers used:
// r32 -> r51, r2, r3
//
// Predicate registers used:
// p6, p7, p8, p10, p11, p12, p14, p15
// p6   arg is zero, denormal or special IEEE
//      (reused in the special path to flag infinities)
// p7   arg is NaN or zero (special path only)
// p8   to filter out case when signd(x) > 1.625
// p10  to filter out case when |x| < 0.25
// p11  to filter out case when signd(x) <= 1.625
// p12  to filter out case when |x| >= 19.0625
// p14  set to 1 for positive x
// p15  set to 1 for negative x

// Assembly macros
//==============================================================================
rDataPtr           = r2
rDataPtr1          = r3

rBias              = r33
rCoeffAddr3        = r34
rThreeAndQ         = r35
rCoeffAddr2        = r36
rMask              = r37
rArg               = r38
rSignBit           = r39
rAbsArg            = r40
rSaturation        = r41
rIndex             = r42
rCoeffAddr1        = r43
rCoeffAddr4        = r44
rShiftedArg        = r45
rShiftedArgMasked  = r46
rBiasedExpOf4      = r47
rShiftedAbsArg     = r48
rArgSgnd           = r49
r1625Sgnd          = r50
rTwo               = r51

//==============================================================================
fA0                = f32
fA1                = f33
fA2                = f34
fA3                = f35
fA4                = f36
fA5                = f37
fA6                = f38
fA7                = f39
fA8                = f40
fA9                = f41
fA10               = f42
fA11               = f43
fA12               = f44
fA13               = f45
fA14               = f46
fA15               = f47
fA16               = f48
fA17               = f49
fA18               = f50
fA19               = f51
fArgSqr            = f52
fArgAbsNorm        = f53
fSignumX           = f54
fRes               = f55
fThreeAndQ         = f56
fArgAbs            = f57
fTSqr              = f58
fTQuadr            = f59
fTDeg3             = f60
fTDeg7             = f61
fArgAbsNormSgn     = f62
fTQuadrSgn         = f63
fTwo               = f64

// Data tables
//==============================================================================
RODATA

.align 16

LOCAL_OBJECT_START(tanh_data)
// CAUTION: The order of these table coefficients shouldn't be changed!
//
// Layout (byte offsets from tanh_data; the code adds these as literals):
//   0x000..0x9FF  ten "main" tables, 16 entries (A19..A4) of 16 bytes each
//   0xA00..0xC7F  ten "tail" tables,  4 entries (A3..A0)  of 16 bytes each
//   0xC80..0xCCF  near-zero path coefficients
//   0xCD0         saturation constant
// Each two-word entry is a 64-bit significand followed by sign/exponent,
// read with ldfe; the single-word near-zero entries are doubles read in
// pairs with ldfpd.

// Main path coefficients:
// Coefficients ##0..15 ("main" coefficient tables)
// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5   (offset 0x000)
data8 0xE9D218BC9A3FB55A, 0x00003FC7 //A19
data8 0xC8C0D38687F36EBA, 0x00003FCE //A18
data8 0xA2663E519FAC8A43, 0x0000BFD2 //A17
data8 0xD913F0490674B0DF, 0x00003FD3 //A16
data8 0xF75D84789DE0AE52, 0x00003FD6 //A15
data8 0xACB3C40EEF3A06F0, 0x0000BFD9 //A14
data8 0xEBD7F5DC02CFD5BA, 0x0000BFDB //A13
data8 0x8B52CDF66D709E2A, 0x00003FDF //A12
data8 0x9EC21F28E05C4A3E, 0x00003FE0 //A11
data8 0xC412B44D0176F3ED, 0x0000BFE4 //A10
data8 0x97BF35A34DD1EA4C, 0x0000BFE0 //A9
data8 0xF89F5B39E3A3AA36, 0x00003FE9 //A8
data8 0xF2BA654BCEEBA433, 0x0000BFEA //A7
data8 0x8E1C15876AA589AD, 0x0000BFEF //A6
data8 0x942226246A8C2A86, 0x00003FF1 //A5
data8 0x8F06D9FF7DB47261, 0x00003FF4 //A4
//
// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0    (offset 0x100)
data8 0xC4A7B8FB672A8520, 0x00003FDC //A19
data8 0xA20724B847E13499, 0x0000BFE0 //A18
data8 0xE17DB53F02E4D340, 0x00003FE2 //A17
data8 0x90264A1012F4CA6F, 0x0000BFE4 //A16
data8 0xEBEC9F776F0BF415, 0x0000BFE0 //A15
data8 0x89AF912B305B45A4, 0x00003FE7 //A14
data8 0xB4A960B81F5EC36A, 0x0000BFE7 //A13
data8 0x969A4E95B2DA86B5, 0x0000BFEA //A12
data8 0x8A3FC0EC082305CB, 0x00003FEC //A11
data8 0x83D7795BCBE24373, 0x00003FEC //A10
data8 0xDCBF42AEB82932EC, 0x0000BFEF //A9
data8 0x83318E61ECAFD804, 0x00003FF0 //A8
data8 0xEA4DE5746975A914, 0x00003FF2 //A7
data8 0xCE63E8FA6B96480B, 0x0000BFF4 //A6
data8 0xDF017BE0D4FE45D8, 0x0000BFF4 //A5
data8 0xA8A0C6E2226DF3CD, 0x00003FF8 //A4
//
// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0    (offset 0x200)
data8 0x8E89D2EBFDAA160B, 0x00003FE9 //A19
data8 0xDD9226310A272046, 0x0000BFEC //A18
data8 0xA038042D28B0D665, 0x00003FEF //A17
data8 0x8C04796F03516306, 0x0000BFF1 //A16
data8 0x9CD6A9CB4E90A2FD, 0x00003FF2 //A15
data8 0xC8980E166F5A84FD, 0x0000BFF2 //A14
data8 0x9ADFE65F56B7BCFD, 0x00003FED //A13
data8 0x8B11FDFB5D0A7B96, 0x00003FF4 //A12
data8 0x8209A125E829CBFA, 0x0000BFF5 //A11
data8 0xCF38AAC17B85BD76, 0x00003FF1 //A10
data8 0xD5C2E248D8AB99AB, 0x00003FF6 //A9
data8 0xE12BE2785727F2D6, 0x0000BFF7 //A8
data8 0x9FC9EF90F87BF1E2, 0x00003FF6 //A7
data8 0x9B02FE0DAF42C08F, 0x00003FF9 //A6
data8 0xBDACE06F531D9491, 0x0000BFFA //A5
data8 0xE3048AD1DB2F648C, 0x00003FF9 //A4
//
// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25   (offset 0x300)
data8 0x856EC3B0330A385A, 0x00003FEB //A19
data8 0xC641D69DAE2D429C, 0x0000BFF2 //A18
data8 0xC683EB0BE1343FFF, 0x00003FF5 //A17
data8 0xC358954224E4E823, 0x0000BFF7 //A16
data8 0xF813A8D6D396BC5F, 0x00003FF8 //A15
data8 0xE0ECDFED078D37D6, 0x0000BFF9 //A14
data8 0x950E4E619855E316, 0x00003FFA //A13
data8 0x8453B8F93370FB58, 0x0000BFFA //A12
data8 0xFDBA28430AEC95BA, 0x00003FF7 //A11
data8 0x9371AAC1FDB1E664, 0x00003FFA //A10
data8 0xAC972DA97782D88A, 0x0000BFFB //A9
data8 0xE18F47B10B9CE1BC, 0x00003FFB //A8
data8 0xAB7C81230BF13BC6, 0x0000BFFB //A7
data8 0xA6CAAD4A3E31A7D5, 0x0000BFF8 //A6
data8 0x9CABD76D1D5C3878, 0x00003FFC //A5
data8 0x92906D077941CAA9, 0x0000BFFD //A4
//
// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5    (offset 0x400)
data8 0x9232D19F71709AC9, 0x0000BFF5 //A19
data8 0x819E31323F5DD3F8, 0x00003FF8 //A18
data8 0xDA8E1CDB8D23DC29, 0x0000BFF9 //A17
data8 0xE97C7CD8FC0486D8, 0x00003FFA //A16
data8 0xB0C4AD234D88C9F2, 0x0000BFFB //A15
data8 0xC5989BFB28FDE267, 0x00003FFB //A14
data8 0x9B26520EC4EFEE8E, 0x0000BFFB //A13
data8 0xC4B6F758AD21E574, 0x00003FF9 //A12
data8 0xCC36E3FFA10D2CFF, 0x00003FFA //A11
data8 0x8738696FB06A5CED, 0x0000BFFC //A10
data8 0xD31981825BF39228, 0x00003FFC //A9
data8 0x82C58FB9BEE43992, 0x0000BFFD //A8
data8 0x88D5AAE49164B6F3, 0x00003FFD //A7
data8 0xF4CA0B968AF2DDE2, 0x0000BFFC //A6
data8 0xB99874B482BD17EE, 0x00003FFC //A5
data8 0xE93FB2F99431DC1D, 0x0000BFFB //A4
//
// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0   (offset 0x500)
data8 0xAAA9EB7EADA85CEC, 0x00003FF5 //A19
data8 0x980C80EE05A6BE78, 0x0000BFF8 //A18
data8 0x818DA9F5396390A5, 0x00003FFA //A17
data8 0x8D8CC21E23D8A6A2, 0x0000BFFB //A16
data8 0xE0EC19E55A886765, 0x00003FFB //A15
data8 0x8C11197A7E6244C5, 0x0000BFFC //A14
data8 0x901D2BF203C2F7F3, 0x00003FFC //A13
data8 0xFEACAEE66EE803E5, 0x0000BFFB //A12
data8 0xC684E4925E318C3F, 0x00003FFB //A11
data8 0x8A9D8A970565F28D, 0x0000BFFB //A10
data8 0xAE34C61DE5CEA4D4, 0x00003FFA //A9
data8 0xC44C5714BD6208A0, 0x0000BFF9 //A8
data8 0xC4612F7D6C8BDB79, 0x00003FF8 //A7
data8 0xABD91DCE40D5EECB, 0x0000BFF7 //A6
data8 0x80E375C1B847B72F, 0x00003FF6 //A5
data8 0xA11C7DD978CF700A, 0x0000BFF4 //A4
//
// Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625 (offset 0x600)
data8 0xE29D17C510F86F6B, 0x00003FF3 //A19
data8 0x88FE52EB39A3A98C, 0x0000BFF5 //A18
data8 0xA406547E50360693, 0x00003FF5 //A17
data8 0x83E6260B71C6D7DE, 0x0000BFF5 //A16
data8 0xA36AB5B0CBC97B85, 0x00003FF4 //A15
data8 0xA94931E0B7BA6C14, 0x0000BFF3 //A14
data8 0x9A4596DAF350AD63, 0x00003FF2 //A13
data8 0xFE47643F375AECA5, 0x0000BFF0 //A12
data8 0xBF8433C5ABEE63B1, 0x00003FEF //A11
data8 0x83CEE05D7AE90A0A, 0x0000BFEE //A10
data8 0xA4CC45480BCEB02D, 0x00003FEC //A9
data8 0xB967CBDCBC16CB10, 0x0000BFEA //A8
data8 0xB9681B214EDC098D, 0x00003FE8 //A7
data8 0xA23B20D87B80DFA8, 0x0000BFE6 //A6
data8 0xF358B2C46F10CBAF, 0x00003FE3 //A5
data8 0x98176FD06229A385, 0x0000BFE1 //A4
//
// Binary subranges (reached by adding 0x400 to the table index)
// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0   (offset 0x700)
data8 0xEF2EE841288F6706, 0x00003FE9 //A19
data8 0xE65D5B74B85F82A6, 0x00003FEB //A18
data8 0xE495FC21E42A79FF, 0x00003FEA //A17
data8 0xF99B267A913CF3E5, 0x00003FEC //A16
data8 0xFE3D700F4A0A0FDE, 0x0000BFEC //A15
data8 0x8F91BB4EE4E4EA52, 0x00003FEE //A14
data8 0xBCA9F41A5C6EF8BA, 0x0000BFEE //A13
data8 0xF93E00884027A9CF, 0x00003FED //A12
data8 0xC4D4036A61BABC2F, 0x00003FEF //A11
data8 0x86CC2AD1AD47C7D5, 0x0000BFF2 //A10
data8 0xD3065DEF4CE9AD32, 0x00003FF3 //A9
data8 0x82C44125F568D54E, 0x0000BFF5 //A8
data8 0x88D588729BAF14CA, 0x00003FF6 //A7
data8 0xF4CA0661307243C7, 0x0000BFF6 //A6
data8 0xB998746D57061F74, 0x00003FF7 //A5
data8 0xE93FB2F482327C19, 0x0000BFF7 //A4
//
// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0    (offset 0x800)
data8 0xEB189B71ADC40BE2, 0x00003FEA //A19
data8 0xA60B46F9FF6DC2DF, 0x00003FEA //A18
data8 0xBB061CDD9F368B9D, 0x00003FEC //A17
data8 0x841E08BDF5429991, 0x0000BFEC //A16
data8 0xDD33990B433F25BE, 0x00003FED //A15
data8 0xBA5DE6B870F0A2BB, 0x0000BFEE //A14
data8 0xA71D489AAA6DACF0, 0x00003FEF //A13
data8 0x874CCB2B8F3FBC0E, 0x0000BFF0 //A12
data8 0xCB1D2E9754EA534A, 0x00003FF0 //A11
data8 0x8BA5ABB53BA6ABCF, 0x0000BFF1 //A10
data8 0xAE91FD1C2391A32B, 0x00003FF1 //A9
data8 0xC465A74B798E5761, 0x0000BFF1 //A8
data8 0xC4666152397D15C1, 0x00003FF1 //A7
data8 0xABD9E63CA575B950, 0x0000BFF1 //A6
data8 0x80E38B18E8D0F460, 0x00003FF1 //A5
data8 0xA11C80E20AAFDD3C, 0x0000BFF0 //A4
//
// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0  (offset 0x900)
data8 0xBECD0AF7E22E5594, 0x00003FE9 //A19
data8 0xE2834E2D68C1128C, 0x00003FEA //A18
data8 0x97B117611B317379, 0x00003FEB //A17
data8 0xEE91A0D39A772F6B, 0x00003FEA //A16
data8 0x92F6EC377DCADA4F, 0x00003FEA //A15
data8 0xD8FCCD6A3277FAB7, 0x00003FE8 //A14
data8 0xC15AB9CB0C3DCFE0, 0x00003FE7 //A13
data8 0xC3C659704A7147CD, 0x00003FE2 //A12
data8 0xFA17F09D27C97912, 0x00003FE4 //A11
data8 0xF664147182B94788, 0x0000BFE3 //A10
data8 0xA6C89FA741464DA1, 0x00003FE3 //A9
data8 0xB90FE464A825EFA8, 0x0000BFE2 //A8
data8 0xB973AE0FD86EC024, 0x00003FE1 //A7
data8 0xA23A087F96846951, 0x0000BFE0 //A6
data8 0xF358D8A7FC012D5D, 0x00003FDE //A5
data8 0x98176E2309B7C73A, 0x0000BFDD //A4
//
// Coefficients ##16..19 ("tail" coefficient tables)
// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5   (offset 0xA00)
data8 0x838F209ABB9BA7B3, 0x0000BFF7 //A3
data8 0xEBC0AC78DA4FC500, 0x0000BFF8 //A2
data8 0xF0A4D02960B60E69, 0x00003FFC //A1
data8 0xFACBF534D0E42F8A, 0x00003FFC //A0
//
// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0    (offset 0xA40)
data8 0xC0ECBDC0A0D133A6, 0x0000BFF8 //A3
data8 0xBA13A076BF8E812F, 0x0000BFFB //A2
data8 0xC954A37D1A1CA070, 0x00003FFD //A1
data8 0xEC9A9EBAB4579B29, 0x00003FFD //A0
//
// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0    (offset 0xA80)
data8 0xD42E9175A6EA1397, 0x00003FFB //A3
data8 0xA3C361378A55CF56, 0x0000BFFD //A2
data8 0xD706E07CC8622983, 0x00003FFD //A1
data8 0xC2F7D5A8A79CA2AC, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25   (offset 0xAC0)
data8 0xAC7A7F8776817C7E, 0x00003FFD //A3
data8 0x8B7CE95E69FCFE9A, 0x0000BFFD //A2
data8 0x90B161317028D995, 0x00003FFC //A1
data8 0xF6CA82F0DE1E9E9A, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5    (offset 0xB00)
data8 0xE9E072407BC22DC6, 0x00003FFA //A3
data8 0xAFA4A913D8E6BB4A, 0x0000BFF9 //A2
data8 0xAFC2D6A885BAA875, 0x00003FF7 //A1
data8 0xFFD40B84505A10B2, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0   (offset 0xB40)
data8 0xA11C8A1FED168CD5, 0x00003FF2 //A3
data8 0xF1AAD6B02063A5F5, 0x0000BFEF //A2
data8 0xF1AADA46AD341C34, 0x00003FEC //A1
data8 0xFFFFFC39548FC34B, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625 (offset 0xB80)
data8 0x98176FD1F0950C16, 0x00003FDE //A3
data8 0xE42327BB09C8B2A5, 0x0000BFDA //A2
data8 0xE42327BB0B154F13, 0x00003FD6 //A1
data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0
//
// Binary subranges
// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0   (offset 0xBC0)
data8 0xE9E072404329293B, 0x00003FF7 //A3
data8 0xAFA4A913D798300B, 0x0000BFF7 //A2
data8 0xAFC2D6A885B48567, 0x00003FF6 //A1
data8 0xFFD40B84505A10B4, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0    (offset 0xC00)
data8 0xA11C8A63815F7A28, 0x00003FEF //A3
data8 0xF1AAD6B65B0EBF53, 0x0000BFED //A2
data8 0xF1AADA46E799831F, 0x00003FEB //A1
data8 0xFFFFFC39548FC348, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0  (offset 0xC40)
data8 0x98176FE982140A59, 0x00003FDB //A3
data8 0xE42327B9B0D7202F, 0x0000BFD8 //A2
data8 0xE42327BB13076BD6, 0x00003FD5 //A1
data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.25   (offset 0xC80)
// ('tanh_near_zero' path)
// Labels follow the register names used by the code, where Ak multiplies
// (x^2)^k.  The entries are interleaved so that two pointers (0xC80 and
// 0xCB0) can each walk forward in 16-byte steps.
data8 0xBF2BA5D26E479D0C             //A9   (0xC80)
data8 0x3F4336D96F81EE26             //A8
data8 0xBF8226E34AE197B0             //A5   (0xC90)
data8 0x3F9664F488148657             //A4
data8 0xAAAAAAAAAAAAAA99, 0x0000BFFD //A1   (0xCA0, extended format)
data8 0xBF57D91925BB5EE2             //A7   (0xCB0)
data8 0x3F6D6D36C3D5B7A1             //A6
data8 0xBFABA1BA1BA19D32             //A3   (0xCC0)
data8 0x3FC1111111111108             //A2
//
// Saturation constant                                          (offset 0xCD0)
// ('tanh_saturation' path)
// All-ones 64-bit significand with exponent 0x3FFE, i.e. 1.0 - 2^(-64):
// the largest register-format value below 1.0.
data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE
LOCAL_OBJECT_END(tanh_data)

// CAUTION: The order of table coefficients shouldn't be changed!
.section .text
//==============================================================================
// double tanh(double)
//
// In:   f8 = x
// Out:  f8 = tanh(x), rounded to double
// Uses: r2, r32-r51 (alloc'ed locals), f32-f64,
//       p6, p7, p8, p10, p11, p12, p14, p15
//
// Dispatch:
//   fclass 0xEF (zero, unnormal, Inf, NaN)  -> _tanh_spec
//   |x| <  0.25                             -> tanh_near_zero
//   |x| >= 19.0625                          -> tanh_saturation
//   otherwise                               -> degree-19 polynomial in y,
//                                              y = significand(|x|) - 1.0 or
//                                              - 2.0 (binary subranges)
//
// NOTE(review): fArgAbs (main entry) and fTQuadrSgn (near-zero path) are
// computed but never read afterwards in this file.  They are left in place
// because the bundles are hand-scheduled.
//==============================================================================
GLOBAL_LIBM_ENTRY(tanh)
{ .mfi
      alloc          r32         = ar.pfs, 0, 20, 0, 0 // 20 locals: r32..r51
      fmerge.se      fArgAbsNorm = f1, f8      // significand of x with the
                                               // sign/exponent of 1.0: [1;2)
      adds           rSignBit    = 0x1, r0     // Bit for sign removing
}
{ .mfi
      addl           rDataPtr    = @ltoff(tanh_data), gp // Data pointer
      fma.s1         fTwo        = f1, f1, f1            // 2.0 construct
      addl           rArgSgnd    = 0xfff, r0   // mask for sign + exponent
};;

{ .mfi
      getf.d         rArg     = f8               // x in GR (double bit image)
      fclass.m       p6,p0    = f8, 0xEF // Filter 0, denormals and specials
                            // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
      shl            rArgSgnd = rArgSgnd, 52     // mask for sign + exponent
}
{ .mlx
      ld8            rDataPtr  = [rDataPtr]      // Real data pointer
      movl           r1625Sgnd = 0xA000000000000 // 1.625 signd
      // 1.625 significand used to filter values greater than 3.25, 6.5, 13.0
      // to enter binary subranges
};;

{ .mfi
      addl           rBias    = 0x3FD00, r0  // biased exponent of 0.25 << 8
      fma.s1         fArgSqr  = f8, f8, f0   // x^2
      shl            rSignBit = rSignBit, 63 // mask for sign bit
}
{ .mlx
      addl           rMask = 0x7FF00, r0         // Mask for index bits
      movl           rTwo  = 0x4000000000000000  // 2.0
};;

{ .mfi
      andcm          rArgSgnd    = rArg, rArgSgnd // Remove sign + exponent
      nop.f          0
      shr.u          rShiftedArg = rArg, 44 // Select only necessary bits of arg
}
{ .mfb
      andcm          rAbsArg = rArg, rSignBit // Remove sign
      nop.f          0
(p6)  br.cond.spnt   _tanh_spec               // Branch to zero, denorm & specs
};;

{ .mfi
      and            rShiftedArgMasked = rShiftedArg, rMask // biased exponent
                                                            // of x << 8
      fmerge.s       fArgAbs           = f1, f8             // |x| (not read
                                                            // again below)
      shr            rShiftedAbsArg    = rAbsArg, 44 // Select only necessary
                                                     // bits of absolute arg
}
{ .mfi
      cmp.gt         p8, p11 = rArgSgnd, r1625Sgnd // p8 = 1 if
      // signd(x) > 1.625 - to filter values greater than 3.25, 6.5, 13.0
      nop.f          0
      nop.i          0
};;

{ .mfi
      sub            rIndex  = rShiftedArgMasked, rBias // table index << 8
                                       // (0x100 bytes per "main" table)
      nop.f          0
      cmp.lt         p10, p0 = rShiftedArgMasked, rBias // p10=1 if |x|<0.25
}
{ .mfb
(p8)  cmp.gt         p8, p11 = rAbsArg, rTwo // Is |x| greater than 2.0?
                                      // (then we should use binary subranges)
      nop.f          0
(p10) br.cond.spnt   tanh_near_zero   // branch out if |x| < 0.25
};;

.pred.rel "mutex",p8,p11
{ .mfi
(p8)  add            rIndex      = 0x400, rIndex // Make pointer to binary
                                                 // subranges
(p11) fms.s1         fArgAbsNorm = fArgAbsNorm, f1, f1   // y = |x|/a - 1.0
      addl           rSaturation = 0x40331, r0 // shifted bits of 19.0625
}
{ .mfi
      nop.m          0
(p8)  fms.s1         fArgAbsNorm = fArgAbsNorm, f1, fTwo // y = |x|/a - 2.0
      // this is only for binary subranges [3.25;4], [6.5;8], [13.0;16]
      nop.i          0
}
;;

{ .mfi
      add            rCoeffAddr1 = rDataPtr, rIndex // coeff. ##0,2,..14
      nop.f          0
      nop.i          0
};;

{ .mfi
      adds           rCoeffAddr2 = 16, rCoeffAddr1 // coeff. ##1,3,..15
      fmerge.s       fSignumX    = f8, f1          // signum(x)
      nop.i          0
}
{ .mfb
      cmp.le         p12, p0 = rSaturation, rShiftedAbsArg // |x|>=19.0625?
      nop.f          0
(p12) br.cond.spnt   tanh_saturation  // branch out if |x| >= 19.0625
};;

// Both pointers advance by 32 so that together they walk the sixteen
// 16-byte "main" entries A19..A4 of the selected table.
{ .mfi
      ldfe           fA19 = [rCoeffAddr1], 32 // Load A19
      nop.f          0
      nop.i          0
}
{ .mfi
      ldfe           fA18 = [rCoeffAddr2], 32 // Load A18
      nop.f          0
      adds           rCoeffAddr3 = 0xA00, rDataPtr // Pointer to "tail"
                                                   // coefficients tables
};;

{ .mfi
      ldfe           fA17 = [rCoeffAddr1], 32 // Load A17
      nop.f          0
      nop.i          0
}
{ .mfi
      ldfe           fA16 = [rCoeffAddr2], 32 // Load A16
      nop.f          0
      nop.i          0
};;

{ .mfi
      ldfe           fA15 = [rCoeffAddr1], 32 // Load A15
      fma.s1         fTSqr = fArgAbsNorm, fArgAbsNorm, f0 // y^2
      shr.u          rIndex = rIndex, 2 // Index for "tail" tables
                                        // (0x40 bytes per table)
}
{ .mfi
      ldfe           fA14 = [rCoeffAddr2], 32 // Load A14
      nop.f          0
      adds           rCoeffAddr4 = 16, r0     // 16-byte offset for the
                                              // second "tail" pointer
};;

{ .mfi
      ldfe           fA13 = [rCoeffAddr1], 32 // Load A13
      nop.f          0
      add            rCoeffAddr3 = rCoeffAddr3, rIndex // "tail" coeffs A3, A1
}
{ .mfi
      ldfe           fA12 = [rCoeffAddr2], 32 // Load A12
      nop.f          0
      cmp.lt         p15, p14 = rArg, r0      // Arg positive (p14)
                                              // or negative (p15)?
};;

{ .mfi
      ldfe           fA11 = [rCoeffAddr1], 32 // Load A11
      nop.f          0
      add            rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4 // "tail" coeffs
                                                            // A2, A0
}
{ .mfi
      ldfe           fA10 = [rCoeffAddr2], 32 // Load A10
      nop.f          0
      nop.i          0
};;

{ .mfi
      ldfe           fA9 = [rCoeffAddr1], 32 // Load A9
      nop.f          0
      nop.i          0
}
{ .mfi
      ldfe           fA8 = [rCoeffAddr2], 32 // Load A8
      nop.f          0
      nop.i          0
};;

{ .mfi
      ldfe           fA7 = [rCoeffAddr1], 32 // Load A7
      nop.f          0
      nop.i          0
}
{ .mfi
      ldfe           fA6 = [rCoeffAddr2], 32 // Load A6
      nop.f          0
      nop.i          0
};;

{ .mfi
      ldfe           fA5 = [rCoeffAddr1], 32 // Load A5
      fma.s1         fTDeg3 = fArgAbsNorm, fTSqr, f0 // y^3
      nop.i          0
}
{ .mfi
      ldfe           fA4 = [rCoeffAddr2], 32 // Load A4
      fma.s1         fTQuadr = fTSqr, fTSqr, f0      // y^4
      nop.i          0
};;

// Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm
{ .mfi
      ldfe           fA3 = [rCoeffAddr3], 32 // Load A3
      fma.s1         fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0 // sign(x)*y
      nop.i          0
}
{ .mfi
      ldfe           fA2 = [rCoeffAddr4], 32 // Load A2
      nop.f          0
      nop.i          0
};;

{ .mfi
      ldfe           fA1 = [rCoeffAddr3], 32 // Load A1
      fma.s1         fRes = fA19, fArgAbsNorm, fA18 // A19*y + A18
      nop.i          0
}
{ .mfi
      ldfe           fA0 = [rCoeffAddr4], 32 // Load A0
      nop.f          0
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA17 = fA17, fArgAbsNorm, fA16 // A17*y + A16
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA15 = fA15, fArgAbsNorm, fA14 // A15*y + A14
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fTDeg7 = fTDeg3, fTQuadr, f0   // y^7
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fA13 = fA13, fArgAbsNorm, fA12 // A13*y + A12
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA11 = fA11, fArgAbsNorm, fA10 // A11*y + A10
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA9 = fA9, fArgAbsNorm, fA8    // A9*y + A8
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTSqr, fA17       // A19*y^3 + .. + A16
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fA7 = fA7, fArgAbsNorm, fA6    // A7*y + A6
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA5 = fA5, fArgAbsNorm, f0     // A5*y
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA15 = fA15, fTSqr, fA13       // A15*y^3 + .. + A12
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fA4 = fA4, fArgAbsNorm, fA3    // A4*y + A3
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA2 = fA2, fArgAbsNorm, fA1    // A2*y + A1
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA11 = fA11, fTSqr, fA9        // A11*y^3 + .. + A8
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA7 = fA7, fTSqr, fA5          // A7*y^3 + A6*y^2 + A5*y
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTQuadr, fA15     // A19*y^7 + .. + A12
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA4 = fA4, fTSqr, fA2          // A4*y^3 + .. + A1
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTQuadr, fA11     // A19*y^11 + .. + A8
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA4 = fA7, fTDeg3, fA4         // A7*y^6 + .. + A1
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTDeg7, fA4       // A19*y^18 + .. + A1
      nop.i          0
};;

{ .mfi
      nop.m          0
      // result for negative argument: -(fRes*y) - A0
(p15) fms.d.s0       f8 = fRes, fArgAbsNormSgn, fA0
      nop.i          0
}
{ .mfb
      nop.m          0
      // result for positive argument: fRes*y + A0
(p14) fma.d.s0       f8 = fRes, fArgAbsNormSgn, fA0
      br.ret.sptk    b0
};;


// |x| < 0.25 Path /////////////////////////////////////////////////////////////
// tanh(x) = x + x*P(x^2), P(t) = A1*t + A2*t^2 + ... + A9*t^9
.align 32
tanh_near_zero:
{ .mfi
      adds           rCoeffAddr1 = 0xC80, rDataPtr // address of A9
      fma.s0         fTSqr = fArgSqr, fArgSqr, f0  // x^4
      nop.i          0
}
{ .mfi
      adds           rCoeffAddr2 = 0xCB0, rDataPtr // address of A7
      nop.f          0
      nop.i          0
};;

{ .mfi
      ldfpd          fA9, fA8 = [rCoeffAddr1], 16 // Load A9, A8
      nop.f          0
      nop.i          0
}
{ .mfi
      ldfpd          fA7, fA6 = [rCoeffAddr2], 16 // Load A7, A6
      nop.f          0
      nop.i          0
};;

{ .mfi
      ldfpd          fA5, fA4 = [rCoeffAddr1], 16 // Load A5, A4
      nop.f          0
      nop.i          0
}
{ .mfi
      ldfpd          fA3, fA2 = [rCoeffAddr2], 16 // Load A3, A2
      nop.f          0
      nop.i          0
};;

{ .mfi
      ldfe           fA1 = [rCoeffAddr1]          // Load A1
      nop.f          0
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fTQuadr = fTSqr, fTSqr, f0   // x^8
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fRes = fA9, fArgSqr, fA8     // A9*x^2 + A8
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fA7 = fA7, fArgSqr, fA6      // A7*x^2 + A6
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA3 = fA3, fArgSqr, fA2      // A3*x^2 + A2
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fA5 = fA5, fArgSqr, fA4      // A5*x^2 + A4
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA1 = fA1, fArgSqr, f0       // A1*x^2
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fTQuadrSgn = fTQuadr, f8, f0 // x^8 * x (not read below)
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTSqr, fA7      // A9*x^6 + .. + A6
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fA1 = fA3, fTSqr, fA1        // A3*x^6 + A2*x^4 + A1*x^2
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTSqr, fA5      // A9*x^10 + .. + A4
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTQuadr, fA1    // A9*x^18 + .. + A1*x^2
      nop.i          0
};;

{ .mfb
      nop.m          0
      fma.d.s0       f8 = fRes, f8, f8            // x+x*Polynomial
      br.ret.sptk    b0                           // Exit for |x| < 0.25
};;





// 19.0625 <= |x| < +inf Saturation path ///////////////////////////////////////
.align 32
tanh_saturation:
{ .mfi
      adds           rDataPtr = 0xCD0, rDataPtr // address of the saturation
                                                // constant
      nop.f          0
      nop.i          0
};;

{ .mfi
      ldfe           fA0 = [rDataPtr]           // Load 1.0 - 2^(-64)
      nop.f          0
      nop.i          0
};;

{ .mfb
      nop.m          0
      fma.d.s0       f8 = fA0, fSignumX, f0     // sign(x)*(1.0 - tiny_value),
                                                // rounded to double
      br.ret.sptk    b0                         // Exit for 19.0625 <=|x|< +inf
};;





// 0, denormals and special IEEE numbers path /////////////////////////////////
_tanh_spec:

{ .mfi
      cmp.lt         p15, p14 = rArg, r0        // Is arg negative (p15)
                                                // or positive (p14)
      fclass.m       p6,p0 = f8, 0x23           // To filter infinities
                                                // 0x23 = @pos|@neg|@inf
      nop.i          0
};;

{ .mfi
      nop.m          0
      fclass.m       p7,p0 = f8, 0xC7           // To filter NaNs & Zeros
                                 // 0xC7 = @pos|@neg|@zero|@qnan|@snan
      nop.i          0
};;

{ .mfb
      nop.m          0
(p6)  fmerge.s       f8 = f8, f1                // +/-1 for INF args
(p6)  br.ret.spnt    b0                         // exit for x = INF
};;

{ .mfb
      nop.m          0
(p7)  fma.d.s0       f8 = f8, f1, f8            // +/-0 for 0 args
                                                // and NaNs for NaNs
(p7)  br.ret.spnt    b0                         // exit for x = NaN or +/-0
};;

// Only denormal/unnormal arguments reach this point.
{ .mfi
      nop.m          0
      fnorm.s0       f8 = f8                    // Normalize arg
      nop.i          0
};;

.pred.rel "mutex",p14,p15
{ .mfi
      nop.m          0
(p14) fnma.d.s0      f8 = f8, f8, f8            // res = r-r^2
      nop.i          0
}
{ .mfb
      nop.m          0
(p15) fma.d.s0       f8 = f8, f8, f8            // res = r+r^2
      br.ret.sptk    b0                         // 0, denormals, specials return
};;

GLOBAL_LIBM_END(tanh)
libm_alias_double_other (tanh, tanh)