1.file "tanh.s"
2
3
4// Copyright (c) 2001 - 2005, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39// History
40//==============================================================================
41// 05/30/01  Initial version
42// 12/04/01  Rewritten version with erf-like algorithm.
43//           Performance improved.
44// 05/20/02  Cleaned up namespace and sf0 syntax
45// 08/14/02  Changed mli templates to mlx
46// 02/10/03  Reordered header: .section, .global, .proc, .align
47// 03/31/05  Reformatted delimiters between data tables
48//
49// API
50//==============================================================================
51// double tanh(double)
52//
53// Overview of operation
54//==============================================================================
55//
56// Algorithm description
57// ---------------------
58//
59// There are 4 paths:
60//
61// 1. Special path: x = 0, Inf, NaNs, denormals
62//    Return tanh(x) = +/-0.0 for zeros
63//    Return tanh(x) = QNaN for NaNs
64//    Return tanh(x) = sign(x)*1.0 for Inf
65//    Return tanh(x) = x + x^2   for - denormals
66//    Return tanh(x) = x - x^2   for + denormals
67//
68// 2. Near zero path: 0.0 < |x| < 0.25
69//    Return tanh(x) = x + x^3*A3 + ... + x^19*A19
70//
71// 3. Main path: 0.25 <= |x| < 19.0625
72//    For several ranges of 0.25 <= |x| < 19.0625
73//    Return tanh(x) = sign(x)*(A0 + y*A1 + y^2*A2 +
74//                                       + y^3*A3 + ... + y^19*A19)
75//    where y = (|x|/a) - b
76//
77//    For each range there is particular set of coefficients.
78//    Below is the list of ranges:
79//    1/4  <= |x| < 1/2     a = 0.25, b = 1.0
80//    1/2  <= |x| < 1.0     a = 0.5,  b = 1.0
81//    1.0  <= |x| < 2.0     a = 1.0,  b = 1.0
82//    2.0  <= |x| < 3.25    a = 2.0,  b = 1.0
83//    3.25 <= |x| < 4.0     a = 2.0,  b = 2.0
84//    4.0  <= |x| < 6.5     a = 4.0,  b = 1.0
85//    6.5  <= |x| < 8.0     a = 4.0,  b = 2.0
86//    8.0  <= |x| < 13.0    a = 8.0,  b = 1.0
87//    13.0 <= |x| < 16.0    a = 8.0,  b = 2.0
88//    16.0 <= |x| < 19.0625 a = 16.0, b = 1.0
89//    ( the [3.25;4.0], [6.5;8.0] and [13.0;16.0] subranges are split out
90//      into separate tables to resolve monotonicity issues )
91//
92// 4. Saturation path: 19.0625 <= |x| < +INF
93//    Return tanh(x) = sign(x)*(1.0 - tiny_value)
94//    (tiny_value ~ 2^(-63))
95//
96// Registers used
97//==============================================================================
98// Floating Point registers used:
99// f8 = input, output
100// f32 -> f64
101//
102// General registers used:
103// r32 -> r51, r2, r3
104//
105// Predicate registers used:
106// p6, p7, p8, p10, p11, p12, p14, p15
107// p6           arg is zero, denormal or special IEEE
// p7           arg is NaN or zero (special path only)
108// p8           set when signd(x) > 1.625 and |x| > 2.0 (binary subranges)
109// p10          to filter out case when |x| < 0.25
110// p11          complement of p8 (signd(x) <= 1.625 or |x| <= 2.0)
111// p12          to filter out case when |x| >= 19.0625
112// p14          set to 1 for positive x
113// p15          set to 1 for negative x
114
115// Assembly macros
116//==============================================================================
// General-register aliases. The notes describe how each alias is used by
// the code in this file; aliases marked "not referenced" have no use in
// the visible source.
117rDataPtr           = r2  // address of tanh_data (GOT entry, then the pointer)
118rDataPtr1          = r3  // not referenced in the visible source
119
120rBias              = r33 // 0x3FD00: biased exponent of 0.25, shifted left 8
121rCoeffAddr3        = r34 // pointer into the "tail" tables (loads A3, A1)
122rThreeAndQ         = r35 // not referenced in the visible source
123rCoeffAddr2        = r36 // second coefficient pointer (A18, A16, ..., A4)
124rMask              = r37 // 0x7FF00: selects the exponent field of rShiftedArg
125rArg               = r38 // raw bits of x as moved out by getf.d
126rSignBit           = r39 // 1 << 63
127rAbsArg            = r40 // rArg with the sign bit cleared
128rSaturation        = r41 // 0x40331: bits of 19.0625 shifted right 44
129rIndex             = r42 // byte offset of the selected coefficient table
130rCoeffAddr1        = r43 // first coefficient pointer (A19, A17, ..., A5)
131rCoeffAddr4        = r44 // pointer into the "tail" tables (loads A2, A0)
132rShiftedArg        = r45 // rArg >> 44 (logical)
133rShiftedArgMasked  = r46 // exponent field of x, shifted left 8
134rBiasedExpOf4      = r47 // not referenced in the visible source
135rShiftedAbsArg     = r48 // rAbsArg >> 44, compared against rSaturation
136rArgSgnd           = r49 // sign/exponent mask first, then fraction bits of x
137r1625Sgnd          = r50 // 0xA000000000000: fraction bits of 1.625
138rTwo               = r51 // 0x4000000000000000: bit pattern of 2.0
139
140//==============================================================================
// Floating-point register aliases.
// fA0..fA19 hold polynomial coefficients and, later, partial sums.
141fA0                = f32
142fA1                = f33
143fA2                = f34
144fA3                = f35
145fA4                = f36
146fA5                = f37
147fA6                = f38
148fA7                = f39
149fA8                = f40
150fA9                = f41
151fA10               = f42
152fA11               = f43
153fA12               = f44
154fA13               = f45
155fA14               = f46
156fA15               = f47
157fA16               = f48
158fA17               = f49
159fA18               = f50
160fA19               = f51
161fArgSqr            = f52 // x^2
162fArgAbsNorm        = f53 // significand of x scaled into [1,2), then y
163fSignumX           = f54 // 1.0 carrying the sign of x
164fRes               = f55 // polynomial accumulator
165fThreeAndQ         = f56 // not referenced in the visible source
166fArgAbs            = f57 // |x|; written but never read in the visible source
167fTSqr              = f58 // y^2 on the main path, x^4 on the near-zero path
168fTQuadr            = f59 // y^4 on the main path, x^8 on the near-zero path
169fTDeg3             = f60 // y^3
170fTDeg7             = f61 // y^7
171fArgAbsNormSgn     = f62 // sign(x) * y
172fTQuadrSgn         = f63 // never read in the visible source
173fTwo               = f64 // 2.0, built as 1.0*1.0 + 1.0
174
175// Data tables
176//==============================================================================
177RODATA
178
179.align 16
180
// tanh_data layout (byte offsets as used by the code in this file).
// Each "data8 significand, sign/exponent" pair fills one 16-byte slot that
// the code reads with ldfe; each lone data8 is an 8-byte double read in
// pairs with ldfpd.
//   0x000  ten "main" tables, 16 slots each (A19 down to A4), 0x100 apart
//   0xA00  ten "tail" tables,  4 slots each (A3 down to A0),  0x40 apart
//   0xC80  near-zero coefficients: A9,A8,A5,A4 | A1 (16-byte slot) | A7,A6,A3,A2
//   0xCD0  saturation constant
// Both table groups are ordered by range start: 0.25, 0.5, 1, 2, 4, 8, 16,
// followed by the binary subranges 3.25, 6.5, 13.
181LOCAL_OBJECT_START(tanh_data)
182// CAUTION: The order of these table coefficients shouldn't be changed!
183
184// Main path coefficients:
185// Coefficients ##0..15 ("main" coefficient tables)
186// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5
187data8 0xE9D218BC9A3FB55A, 0x00003FC7 //A19
188data8 0xC8C0D38687F36EBA, 0x00003FCE //A18
189data8 0xA2663E519FAC8A43, 0x0000BFD2 //A17
190data8 0xD913F0490674B0DF, 0x00003FD3 //A16
191data8 0xF75D84789DE0AE52, 0x00003FD6 //A15
192data8 0xACB3C40EEF3A06F0, 0x0000BFD9 //A14
193data8 0xEBD7F5DC02CFD5BA, 0x0000BFDB //A13
194data8 0x8B52CDF66D709E2A, 0x00003FDF //A12
195data8 0x9EC21F28E05C4A3E, 0x00003FE0 //A11
196data8 0xC412B44D0176F3ED, 0x0000BFE4 //A10
197data8 0x97BF35A34DD1EA4C, 0x0000BFE0 //A9
198data8 0xF89F5B39E3A3AA36, 0x00003FE9 //A8
199data8 0xF2BA654BCEEBA433, 0x0000BFEA //A7
200data8 0x8E1C15876AA589AD, 0x0000BFEF //A6
201data8 0x942226246A8C2A86, 0x00003FF1 //A5
202data8 0x8F06D9FF7DB47261, 0x00003FF4 //A4
203//
204// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
205data8 0xC4A7B8FB672A8520, 0x00003FDC //A19
206data8 0xA20724B847E13499, 0x0000BFE0 //A18
207data8 0xE17DB53F02E4D340, 0x00003FE2 //A17
208data8 0x90264A1012F4CA6F, 0x0000BFE4 //A16
209data8 0xEBEC9F776F0BF415, 0x0000BFE0 //A15
210data8 0x89AF912B305B45A4, 0x00003FE7 //A14
211data8 0xB4A960B81F5EC36A, 0x0000BFE7 //A13
212data8 0x969A4E95B2DA86B5, 0x0000BFEA //A12
213data8 0x8A3FC0EC082305CB, 0x00003FEC //A11
214data8 0x83D7795BCBE24373, 0x00003FEC //A10
215data8 0xDCBF42AEB82932EC, 0x0000BFEF //A9
216data8 0x83318E61ECAFD804, 0x00003FF0 //A8
217data8 0xEA4DE5746975A914, 0x00003FF2 //A7
218data8 0xCE63E8FA6B96480B, 0x0000BFF4 //A6
219data8 0xDF017BE0D4FE45D8, 0x0000BFF4 //A5
220data8 0xA8A0C6E2226DF3CD, 0x00003FF8 //A4
221//
222// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
223data8 0x8E89D2EBFDAA160B, 0x00003FE9 //A19
224data8 0xDD9226310A272046, 0x0000BFEC //A18
225data8 0xA038042D28B0D665, 0x00003FEF //A17
226data8 0x8C04796F03516306, 0x0000BFF1 //A16
227data8 0x9CD6A9CB4E90A2FD, 0x00003FF2 //A15
228data8 0xC8980E166F5A84FD, 0x0000BFF2 //A14
229data8 0x9ADFE65F56B7BCFD, 0x00003FED //A13
230data8 0x8B11FDFB5D0A7B96, 0x00003FF4 //A12
231data8 0x8209A125E829CBFA, 0x0000BFF5 //A11
232data8 0xCF38AAC17B85BD76, 0x00003FF1 //A10
233data8 0xD5C2E248D8AB99AB, 0x00003FF6 //A9
234data8 0xE12BE2785727F2D6, 0x0000BFF7 //A8
235data8 0x9FC9EF90F87BF1E2, 0x00003FF6 //A7
236data8 0x9B02FE0DAF42C08F, 0x00003FF9 //A6
237data8 0xBDACE06F531D9491, 0x0000BFFA //A5
238data8 0xE3048AD1DB2F648C, 0x00003FF9 //A4
239//
240// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25
241data8 0x856EC3B0330A385A, 0x00003FEB //A19
242data8 0xC641D69DAE2D429C, 0x0000BFF2 //A18
243data8 0xC683EB0BE1343FFF, 0x00003FF5 //A17
244data8 0xC358954224E4E823, 0x0000BFF7 //A16
245data8 0xF813A8D6D396BC5F, 0x00003FF8 //A15
246data8 0xE0ECDFED078D37D6, 0x0000BFF9 //A14
247data8 0x950E4E619855E316, 0x00003FFA //A13
248data8 0x8453B8F93370FB58, 0x0000BFFA //A12
249data8 0xFDBA28430AEC95BA, 0x00003FF7 //A11
250data8 0x9371AAC1FDB1E664, 0x00003FFA //A10
251data8 0xAC972DA97782D88A, 0x0000BFFB //A9
252data8 0xE18F47B10B9CE1BC, 0x00003FFB //A8
253data8 0xAB7C81230BF13BC6, 0x0000BFFB //A7
254data8 0xA6CAAD4A3E31A7D5, 0x0000BFF8 //A6
255data8 0x9CABD76D1D5C3878, 0x00003FFC //A5
256data8 0x92906D077941CAA9, 0x0000BFFD //A4
257//
258// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5
259data8 0x9232D19F71709AC9, 0x0000BFF5 //A19
260data8 0x819E31323F5DD3F8, 0x00003FF8 //A18
261data8 0xDA8E1CDB8D23DC29, 0x0000BFF9 //A17
262data8 0xE97C7CD8FC0486D8, 0x00003FFA //A16
263data8 0xB0C4AD234D88C9F2, 0x0000BFFB //A15
264data8 0xC5989BFB28FDE267, 0x00003FFB //A14
265data8 0x9B26520EC4EFEE8E, 0x0000BFFB //A13
266data8 0xC4B6F758AD21E574, 0x00003FF9 //A12
267data8 0xCC36E3FFA10D2CFF, 0x00003FFA //A11
268data8 0x8738696FB06A5CED, 0x0000BFFC //A10
269data8 0xD31981825BF39228, 0x00003FFC //A9
270data8 0x82C58FB9BEE43992, 0x0000BFFD //A8
271data8 0x88D5AAE49164B6F3, 0x00003FFD //A7
272data8 0xF4CA0B968AF2DDE2, 0x0000BFFC //A6
273data8 0xB99874B482BD17EE, 0x00003FFC //A5
274data8 0xE93FB2F99431DC1D, 0x0000BFFB //A4
275//
276// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0
277data8 0xAAA9EB7EADA85CEC, 0x00003FF5 //A19
278data8 0x980C80EE05A6BE78, 0x0000BFF8 //A18
279data8 0x818DA9F5396390A5, 0x00003FFA //A17
280data8 0x8D8CC21E23D8A6A2, 0x0000BFFB //A16
281data8 0xE0EC19E55A886765, 0x00003FFB //A15
282data8 0x8C11197A7E6244C5, 0x0000BFFC //A14
283data8 0x901D2BF203C2F7F3, 0x00003FFC //A13
284data8 0xFEACAEE66EE803E5, 0x0000BFFB //A12
285data8 0xC684E4925E318C3F, 0x00003FFB //A11
286data8 0x8A9D8A970565F28D, 0x0000BFFB //A10
287data8 0xAE34C61DE5CEA4D4, 0x00003FFA //A9
288data8 0xC44C5714BD6208A0, 0x0000BFF9 //A8
289data8 0xC4612F7D6C8BDB79, 0x00003FF8 //A7
290data8 0xABD91DCE40D5EECB, 0x0000BFF7 //A6
291data8 0x80E375C1B847B72F, 0x00003FF6 //A5
292data8 0xA11C7DD978CF700A, 0x0000BFF4 //A4
293//
294// Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625
295data8 0xE29D17C510F86F6B, 0x00003FF3 //A19
296data8 0x88FE52EB39A3A98C, 0x0000BFF5 //A18
297data8 0xA406547E50360693, 0x00003FF5 //A17
298data8 0x83E6260B71C6D7DE, 0x0000BFF5 //A16
299data8 0xA36AB5B0CBC97B85, 0x00003FF4 //A15
300data8 0xA94931E0B7BA6C14, 0x0000BFF3 //A14
301data8 0x9A4596DAF350AD63, 0x00003FF2 //A13
302data8 0xFE47643F375AECA5, 0x0000BFF0 //A12
303data8 0xBF8433C5ABEE63B1, 0x00003FEF //A11
304data8 0x83CEE05D7AE90A0A, 0x0000BFEE //A10
305data8 0xA4CC45480BCEB02D, 0x00003FEC //A9
306data8 0xB967CBDCBC16CB10, 0x0000BFEA //A8
307data8 0xB9681B214EDC098D, 0x00003FE8 //A7
308data8 0xA23B20D87B80DFA8, 0x0000BFE6 //A6
309data8 0xF358B2C46F10CBAF, 0x00003FE3 //A5
310data8 0x98176FD06229A385, 0x0000BFE1 //A4
311//
312// Binary subranges
313// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0
314data8 0xEF2EE841288F6706, 0x00003FE9 //A19
315data8 0xE65D5B74B85F82A6, 0x00003FEB //A18
316data8 0xE495FC21E42A79FF, 0x00003FEA //A17
317data8 0xF99B267A913CF3E5, 0x00003FEC //A16
318data8 0xFE3D700F4A0A0FDE, 0x0000BFEC //A15
319data8 0x8F91BB4EE4E4EA52, 0x00003FEE //A14
320data8 0xBCA9F41A5C6EF8BA, 0x0000BFEE //A13
321data8 0xF93E00884027A9CF, 0x00003FED //A12
322data8 0xC4D4036A61BABC2F, 0x00003FEF //A11
323data8 0x86CC2AD1AD47C7D5, 0x0000BFF2 //A10
324data8 0xD3065DEF4CE9AD32, 0x00003FF3 //A9
325data8 0x82C44125F568D54E, 0x0000BFF5 //A8
326data8 0x88D588729BAF14CA, 0x00003FF6 //A7
327data8 0xF4CA0661307243C7, 0x0000BFF6 //A6
328data8 0xB998746D57061F74, 0x00003FF7 //A5
329data8 0xE93FB2F482327C19, 0x0000BFF7 //A4
330//
331// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
332data8 0xEB189B71ADC40BE2, 0x00003FEA //A19
333data8 0xA60B46F9FF6DC2DF, 0x00003FEA //A18
334data8 0xBB061CDD9F368B9D, 0x00003FEC //A17
335data8 0x841E08BDF5429991, 0x0000BFEC //A16
336data8 0xDD33990B433F25BE, 0x00003FED //A15
337data8 0xBA5DE6B870F0A2BB, 0x0000BFEE //A14
338data8 0xA71D489AAA6DACF0, 0x00003FEF //A13
339data8 0x874CCB2B8F3FBC0E, 0x0000BFF0 //A12
340data8 0xCB1D2E9754EA534A, 0x00003FF0 //A11
341data8 0x8BA5ABB53BA6ABCF, 0x0000BFF1 //A10
342data8 0xAE91FD1C2391A32B, 0x00003FF1 //A9
343data8 0xC465A74B798E5761, 0x0000BFF1 //A8
344data8 0xC4666152397D15C1, 0x00003FF1 //A7
345data8 0xABD9E63CA575B950, 0x0000BFF1 //A6
346data8 0x80E38B18E8D0F460, 0x00003FF1 //A5
347data8 0xA11C80E20AAFDD3C, 0x0000BFF0 //A4
348//
349// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0
350data8 0xBECD0AF7E22E5594, 0x00003FE9 //A19
351data8 0xE2834E2D68C1128C, 0x00003FEA //A18
352data8 0x97B117611B317379, 0x00003FEB //A17
353data8 0xEE91A0D39A772F6B, 0x00003FEA //A16
354data8 0x92F6EC377DCADA4F, 0x00003FEA //A15
355data8 0xD8FCCD6A3277FAB7, 0x00003FE8 //A14
356data8 0xC15AB9CB0C3DCFE0, 0x00003FE7 //A13
357data8 0xC3C659704A7147CD, 0x00003FE2 //A12
358data8 0xFA17F09D27C97912, 0x00003FE4 //A11
359data8 0xF664147182B94788, 0x0000BFE3 //A10
360data8 0xA6C89FA741464DA1, 0x00003FE3 //A9
361data8 0xB90FE464A825EFA8, 0x0000BFE2 //A8
362data8 0xB973AE0FD86EC024, 0x00003FE1 //A7
363data8 0xA23A087F96846951, 0x0000BFE0 //A6
364data8 0xF358D8A7FC012D5D, 0x00003FDE //A5
365data8 0x98176E2309B7C73A, 0x0000BFDD //A4
366//
367// Coefficients ##16..19 ("tail" coefficient tables)
368// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5
369data8 0x838F209ABB9BA7B3, 0x0000BFF7 //A3
370data8 0xEBC0AC78DA4FC500, 0x0000BFF8 //A2
371data8 0xF0A4D02960B60E69, 0x00003FFC //A1
372data8 0xFACBF534D0E42F8A, 0x00003FFC //A0
373//
374// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
375data8 0xC0ECBDC0A0D133A6, 0x0000BFF8 //A3
376data8 0xBA13A076BF8E812F, 0x0000BFFB //A2
377data8 0xC954A37D1A1CA070, 0x00003FFD //A1
378data8 0xEC9A9EBAB4579B29, 0x00003FFD //A0
379//
380// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
381data8 0xD42E9175A6EA1397, 0x00003FFB //A3
382data8 0xA3C361378A55CF56, 0x0000BFFD //A2
383data8 0xD706E07CC8622983, 0x00003FFD //A1
384data8 0xC2F7D5A8A79CA2AC, 0x00003FFE //A0
385//
386// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25
387data8 0xAC7A7F8776817C7E, 0x00003FFD //A3
388data8 0x8B7CE95E69FCFE9A, 0x0000BFFD //A2
389data8 0x90B161317028D995, 0x00003FFC //A1
390data8 0xF6CA82F0DE1E9E9A, 0x00003FFE //A0
391//
392// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5
393data8 0xE9E072407BC22DC6, 0x00003FFA //A3
394data8 0xAFA4A913D8E6BB4A, 0x0000BFF9 //A2
395data8 0xAFC2D6A885BAA875, 0x00003FF7 //A1
396data8 0xFFD40B84505A10B2, 0x00003FFE //A0
397//
398// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0
399data8 0xA11C8A1FED168CD5, 0x00003FF2 //A3
400data8 0xF1AAD6B02063A5F5, 0x0000BFEF //A2
401data8 0xF1AADA46AD341C34, 0x00003FEC //A1
402data8 0xFFFFFC39548FC34B, 0x00003FFE //A0
403//
404// Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625
405data8 0x98176FD1F0950C16, 0x00003FDE //A3
406data8 0xE42327BB09C8B2A5, 0x0000BFDA //A2
407data8 0xE42327BB0B154F13, 0x00003FD6 //A1
408data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0
409//
410// Binary subranges
411// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0
412data8 0xE9E072404329293B, 0x00003FF7 //A3
413data8 0xAFA4A913D798300B, 0x0000BFF7 //A2
414data8 0xAFC2D6A885B48567, 0x00003FF6 //A1
415data8 0xFFD40B84505A10B4, 0x00003FFE //A0
416//
417// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
418data8 0xA11C8A63815F7A28, 0x00003FEF //A3
419data8 0xF1AAD6B65B0EBF53, 0x0000BFED //A2
420data8 0xF1AADA46E799831F, 0x00003FEB //A1
421data8 0xFFFFFC39548FC348, 0x00003FFE //A0
422//
423// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0
424data8 0x98176FE982140A59, 0x00003FDB //A3
425data8 0xE42327B9B0D7202F, 0x0000BFD8 //A2
426data8 0xE42327BB13076BD6, 0x00003FD5 //A1
427data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0
428//
429// Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.25
430// ('tanh_near_zero' path)
// Here Ak is the coefficient of (x^2)^k in tanh(x) ~ x + x*P(x^2), so it
// corresponds to the x^(2k+1) term of the series. The storage order below
// matches the two interleaved load pointers used by the code
// (0xC80: A9,A8 / A5,A4 / A1 and 0xCB0: A7,A6 / A3,A2).
431data8 0xBF2BA5D26E479D0C //A9
432data8 0x3F4336D96F81EE26 //A8
433data8 0xBF8226E34AE197B0 //A5
434data8 0x3F9664F488148657 //A4
435data8 0xAAAAAAAAAAAAAA99, 0x0000BFFD //A1
436data8 0xBF57D91925BB5EE2 //A7
437data8 0x3F6D6D36C3D5B7A1 //A6
438data8 0xBFABA1BA1BA19D32 //A3
439data8 0x3FC1111111111108 //A2
440//
441// Largest double-extended value below 1.0: significand of all ones with
// biased exponent 0x3FFE, i.e. 1.0 - 2^(-64)
442// ('tanh_saturation' path)
443data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE
444LOCAL_OBJECT_END(tanh_data)
445
446// CAUTION: The order of table coefficients shouldn't be changed!
447
448
449.section .text
// double tanh(double x) -- entry code and main path (0.25 <= |x| < 19.0625).
// In:  f8 = x            Out: f8 = tanh(x)
// Flow:
//   1. Classify x; zero, denormal, Inf and NaN go to _tanh_spec (p6).
//   2. Turn the exponent field of x into a byte offset into tanh_data.
//   3. |x| < 0.25 goes to tanh_near_zero (p10); |x| >= 19.0625 goes to
//      tanh_saturation (p12).
//   4. Otherwise load A19..A0 for the selected range and evaluate
//      sign(x)*(A0 + y*A1 + ... + y^19*A19), y = significand(x) - 1.0 or
//      significand(x) - 2.0 (binary subranges).
450GLOBAL_LIBM_ENTRY(tanh)
451{ .mfi
452      alloc          r32         = ar.pfs, 0, 20, 0, 0 // 20 locals: r32..r51
453      fmerge.se      fArgAbsNorm = f1, f8         // normalized x
                            // (sign and exponent of 1.0, significand of x)
454      adds           rSignBit    = 0x1, r0        // Bit for sign removing
455}
456{ .mfi
457      addl           rDataPtr    = @ltoff(tanh_data), gp // Data pointer
458      fma.s1         fTwo        = f1, f1, f1            // 2.0 construct
459      addl           rArgSgnd    = 0xfff, r0             // mask for exponent
460};;
461
462{ .mfi
463      getf.d         rArg        = f8       // x in GR
464      fclass.m       p6,p0       = f8, 0xEF // Filter 0, denormals and specials
465                            // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
466      shl            rArgSgnd    = rArgSgnd, 52  // mask for sign and exponent
467}
468{ .mlx
469      ld8            rDataPtr    = [rDataPtr]        // Real data pointer
470      movl           r1625Sgnd   = 0xA000000000000   // 1.625 signd
471      // 1.625 significand used to filter values greater than 3.25, 6.5, 13.0
472      // to enter binary subranges
473};;
474
475{ .mfi
476      addl           rBias       = 0x3FD00, r0       // bias of 0.25 << 8
477      fma.s1         fArgSqr     = f8, f8, f0        // x^2
478      shl            rSignBit    = rSignBit, 63      // mask for sign bit
479}
480{ .mlx
481      addl           rMask       = 0x7FF00, r0          // Mask for index bits
482      movl           rTwo        = 0x4000000000000000   // 2.0
483};;
484
485{ .mfi
486      andcm          rArgSgnd    = rArg, rArgSgnd // Remove sign and exponent,
                                                    // leaving the fraction bits
487      nop.f          0
488      shr.u          rShiftedArg = rArg, 44 // Select only necessary bits of arg
489}
490{ .mfb
491      andcm          rAbsArg     = rArg, rSignBit     // Remove sign
492      nop.f          0
493(p6)  br.cond.spnt   _tanh_spec    // Branch to zero, denorm & specs
494};;
495
496{ .mfi
497      and            rShiftedArgMasked = rShiftedArg, rMask // bias of x << 8
      // NOTE(review): fArgAbs is written here but never read in the visible
      // source.
498      fmerge.s       fArgAbs     = f1, f8                   // |x|
499      shr            rShiftedAbsArg    = rAbsArg, 44 // Select only necessary
500                                                     // bits of absolute arg
501}
502{ .mfi
503      cmp.gt         p8, p11     = rArgSgnd, r1625Sgnd // p8 = 1 if
504      // signd(x) > 1.625 - to filter values greater than 3.25, 6.5, 13.0
505      nop.f          0
506      nop.i          0
507};;
508
509{ .mfi
      // One main table is 16 slots of 16 bytes, so (exponent - exponent of
      // 0.25) << 8 is directly the byte offset of the table for this range.
510      sub            rIndex      = rShiftedArgMasked, rBias // index << 8
511      nop.f          0
512      cmp.lt         p10, p0     = rShiftedArgMasked, rBias // p10=1 if |x|<0.25
513}
514{ .mfb
      // Executed only when p8 is already set, so afterwards p8 means
      // signd(x) > 1.625 AND |x| > 2.0; otherwise p8/p11 keep their values.
515(p8)  cmp.gt         p8, p11     = rAbsArg, rTwo // If arg is greater than 2.0?
516                                       // (then we should use binary subranges)
517      nop.f          0
518(p10) br.cond.spnt   tanh_near_zero    // branch out if |x| < 0.25
519};;
520
521.pred.rel "mutex",p8,p11
522{ .mfi
523(p8)  add            rIndex      = 0x400, rIndex // Make pointer to binary
524                                                 // subranges (4 tables ahead)
525(p11) fms.s1         fArgAbsNorm = fArgAbsNorm, f1, f1     // y = |x|/a - 1.0
526      addl           rSaturation = 0x40331, r0 // shifted bits of 19.0625
527}
528{ .mfi
529      nop.m          0
530(p8)  fms.s1         fArgAbsNorm = fArgAbsNorm, f1, fTwo // y = |x|/a - 2.0
531       // this is only for binary subranges [3.25;4], [6.5;8], [13.0;16]
532      nop.i          0
533}
534;;
535
536{ .mfi
537      add            rCoeffAddr1 = rDataPtr, rIndex// coeff. A19, A17, .. A5
538      nop.f          0
539      nop.i          0
540};;
541
542{ .mfi
543      adds           rCoeffAddr2 = 16, rCoeffAddr1 // Shifted pointer to coeffs
                                                   // (A18, A16, .. A4)
544      fmerge.s       fSignumX    = f8, f1          // signum(x)
545      nop.i          0
546}
547{ .mfb
548      cmp.le         p12, p0     = rSaturation, rShiftedAbsArg // |x|>=19.0625?
549      nop.f          0
550(p12) br.cond.spnt   tanh_saturation          // branch out if |x| >= 19.0625
551};;
552
// Coefficient loads: the two pointers walk the 16-slot main table in
// alternation, each stepping by 32 bytes (two slots) per load.
553{.mfi
554      ldfe           fA19        = [rCoeffAddr1], 32 // Load A19
555      nop.f          0
556      nop.i          0
557}
558{.mfi
559      ldfe           fA18        = [rCoeffAddr2], 32 // Load A18
560      nop.f          0
561      adds           rCoeffAddr3 = 0xA00, rDataPtr   // Pointer to "tail"
562                                                     // coefficients tables
563};;
564
565{.mfi
566      ldfe           fA17        = [rCoeffAddr1], 32 // Load A17
567      nop.f          0
568      nop.i          0
569}
570{.mfi
571      ldfe           fA16        = [rCoeffAddr2], 32 // Load A16
572      nop.f          0
573      nop.i          0
574};;
575
576{.mfi
577      ldfe           fA15        = [rCoeffAddr1], 32 // Load A15
578      fma.s1         fTSqr       = fArgAbsNorm, fArgAbsNorm, f0 // y^2
      // Tail tables are 4 slots (0x40 bytes) each, a quarter of a main table.
579      shr.u          rIndex      = rIndex, 2 // Index for "tail" tables
580}
581{.mfi
582      ldfe           fA14        = [rCoeffAddr2], 32 // Load A14
583      nop.f          0
584      adds           rCoeffAddr4 = 16, r0            // Shifter pointer
585                                                     // to "tail" tables
586};;
587
588{.mfi
589      ldfe           fA13        = [rCoeffAddr1], 32   // Load A13
590      nop.f          0
591      add            rCoeffAddr3 = rCoeffAddr3, rIndex // "tail" coeffs to load
592                                                       // (A3, A1)
593}
594{.mfi
595      ldfe           fA12        = [rCoeffAddr2], 32 // Load A12
596      nop.f          0
597      cmp.lt         p15, p14    = rArg, r0          // Arg positive (p14)
598                                                     // or negative (p15)?
599};;
600
601{.mfi
602      ldfe           fA11        = [rCoeffAddr1], 32        // Load A11
603      nop.f          0
604      add            rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4 // shifted "tail"
605                                                            // coeffs (A2, A0)
606}
607{.mfi
608      ldfe           fA10        = [rCoeffAddr2], 32 // Load A10
609      nop.f          0
610      nop.i          0
611};;
612
613{.mfi
614      ldfe           fA9         = [rCoeffAddr1], 32 // Load A9
615      nop.f          0
616      nop.i          0
617}
618{.mfi
619      ldfe           fA8         = [rCoeffAddr2], 32 // Load A8
620      nop.f          0
621      nop.i          0
622};;
623
624{.mfi
625      ldfe           fA7         = [rCoeffAddr1], 32 // Load A7
626      nop.f          0
627      nop.i          0
628}
629{.mfi
630      ldfe           fA6         = [rCoeffAddr2], 32 // Load A6
631      nop.f          0
632      nop.i          0
633};;
634
635{.mfi
636      ldfe           fA5         = [rCoeffAddr1], 32 // Load A5
637      fma.s1         fTDeg3      = fArgAbsNorm, fTSqr, f0 // y^3
638      nop.i          0
639}
640{.mfi
641      ldfe           fA4         = [rCoeffAddr2], 32 // Load A4
642      fma.s1         fTQuadr     = fTSqr, fTSqr, f0  // y^4
643      nop.i          0
644};;
645
646// Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm
// The fA* registers are reused as partial sums; fRes is built up to
// A1 + A2*y + ... + A19*y^18 and the last instruction forms
// sign(x)*(A0 + y*fRes).
647{.mfi
648      ldfe           fA3         = [rCoeffAddr3], 32            // Load A3
649      fma.s1         fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0 // sign(x)*y
650      nop.i          0
651}
652{.mfi
653      ldfe           fA2         = [rCoeffAddr4], 32            // Load A2
654      nop.f          0
655      nop.i          0
656};;
657
658{.mfi
659      ldfe           fA1         = [rCoeffAddr3], 32       // Load A1
660      fma.s1         fRes        = fA19, fArgAbsNorm, fA18 // A19*y + A18
661      nop.i          0
662}
663{.mfi
664      ldfe           fA0         = [rCoeffAddr4], 32       // Load A0
665      nop.f          0
666      nop.i          0
667};;
668
669{ .mfi
670      nop.m          0
671      fma.s1         fA17        = fA17, fArgAbsNorm, fA16  // A17*y + A16
672      nop.i          0
673};;
674
675{ .mfi
676      nop.m          0
677      fma.s1         fA15        = fA15, fArgAbsNorm, fA14  // A15*y + A14
678      nop.i          0
679};;
680
681{ .mfi
682      nop.m          0
683      fma.s1         fTDeg7      = fTDeg3, fTQuadr, f0     // y^7
684      nop.i          0
685}
686{ .mfi
687      nop.m          0
688      fma.s1         fA13        = fA13, fArgAbsNorm, fA12 // A13*y + A12
689      nop.i          0
690};;
691
692{ .mfi
693      nop.m          0
694      fma.s1         fA11        = fA11, fArgAbsNorm, fA10 // A11*y + A10
695      nop.i          0
696};;
697
698{ .mfi
699      nop.m          0
700      fma.s1         fA9         = fA9, fArgAbsNorm, fA8   // A9*y + A8
701      nop.i          0
702};;
703
704{ .mfi
705      nop.m          0
706      fma.s1         fRes        = fRes, fTSqr, fA17       // terms A19..A16
707      nop.i          0
708}
709{ .mfi
710      nop.m          0
711      fma.s1         fA7         = fA7, fArgAbsNorm, fA6 // A7*y + A6
712      nop.i          0
713};;
714
715{ .mfi
716      nop.m          0
      // A4 is paired with A3 below, so A5 stands alone here (addend is 0)
717      fma.s1         fA5         = fA5, fArgAbsNorm, f0  // A5*y
718      nop.i          0
719};;
720
721{ .mfi
722      nop.m          0
723      fma.s1         fA15        = fA15, fTSqr, fA13     // terms A15..A12
724      nop.i          0
725}
726{ .mfi
727      nop.m          0
728      fma.s1         fA4         = fA4, fArgAbsNorm, fA3 // A4*y + A3
729      nop.i          0
730};;
731
732{ .mfi
733      nop.m          0
734      fma.s1         fA2         = fA2, fArgAbsNorm, fA1 // A2*y + A1
735      nop.i          0
736};;
737
738{ .mfi
739      nop.m          0
740      fma.s1         fA11        = fA11, fTSqr, fA9 // terms A11..A8
741      nop.i          0
742};;
743
744{ .mfi
745      nop.m          0
746      fma.s1         fA7         = fA7, fTSqr, fA5  // A7*y^3 + A6*y^2 + A5*y
747      nop.i          0
748};;
749
750{ .mfi
751      nop.m          0
752      fma.s1         fRes        = fRes, fTQuadr, fA15 // terms A19..A12
753      nop.i          0
754};;
755
756{ .mfi
757      nop.m          0
758      fma.s1         fA4         = fA4, fTSqr, fA2     // terms A4..A1
759      nop.i          0
760};;
761
762{ .mfi
763      nop.m          0
764      fma.s1         fRes        = fRes, fTQuadr, fA11 // terms A19..A8
765      nop.i          0
766};;
767
768{ .mfi
769      nop.m          0
770      fma.s1         fA4         = fA7, fTDeg3, fA4    // terms A7..A1
771      nop.i          0
772};;
773
774{ .mfi
775      nop.m          0
776      fma.s1         fRes        = fRes,  fTDeg7, fA4  // terms A19..A1
777      nop.i          0
778};;
779
780{ .mfi
781      nop.m          0
782      // result for negative argument: (-y)*fRes - A0
783(p15) fms.d.s0       f8          = fRes, fArgAbsNormSgn, fA0 // Polynomial
784      nop.i          0
785}
786{ .mfb
787      nop.m          0
788      // result for positive argument: y*fRes + A0
789(p14) fma.d.s0       f8          = fRes, fArgAbsNormSgn, fA0 // Polynomial
790      br.ret.sptk    b0
791};;
792
793
794// |x| < 0.25 Path /////////////////////////////////////////////////////////////
// Entered from the main path with p10 set.
// In:  f8 = x, fArgSqr = x^2, rDataPtr = address of tanh_data.
// Out: f8 = x + x*(A1*s + A2*s^2 + ... + A9*s^9), s = x^2.
// Coefficients are loaded from tanh_data+0xC80 (A9,A8 / A5,A4 / A1) and
// tanh_data+0xCB0 (A7,A6 / A3,A2); A1 is a 16-byte slot read with ldfe,
// the others are doubles read in pairs with ldfpd.
// The product x^8 * x that used to be formed into fTQuadrSgn was never
// read, so that bundle has been dropped; the result is unchanged.
795.align 32
796tanh_near_zero:
797{ .mfi
798      adds           rCoeffAddr1 = 0xC80, rDataPtr      // address of A9
      // NOTE(review): this is the only intermediate of this path computed
      // in status field s0 rather than s1 -- confirm it is intentional.
799      fma.s0         fTSqr       = fArgSqr, fArgSqr, f0 // x^4
800      nop.i          0
801}
802{ .mfi
803      adds           rCoeffAddr2 = 0xCB0, rDataPtr      // address of A7
804      nop.f          0
805      nop.i          0
806};;
807
808{ .mfi
809      ldfpd          fA9, fA8    = [rCoeffAddr1], 16 // Load A9, A8
810      nop.f          0
811      nop.i          0
812}
813{ .mfi
814      ldfpd          fA7, fA6    = [rCoeffAddr2], 16 // Load A7, A6
815      nop.f          0
816      nop.i          0
817};;
818
819{ .mfi
820      ldfpd          fA5, fA4    = [rCoeffAddr1], 16 // Load A5, A4
821      nop.f          0
822      nop.i          0
823}
824{ .mfi
825      ldfpd          fA3, fA2    = [rCoeffAddr2], 16 // Load A3, A2
826      nop.f          0
827      nop.i          0
828};;
829
830{ .mfi
831      ldfe           fA1         = [rCoeffAddr1] // Load A1
832      nop.f          0
833      nop.i          0
834};;
835
836{ .mfi
837      nop.m          0
838      fma.s1         fTQuadr     = fTSqr, fTSqr, f0 // x^8
839      nop.i          0
840};;
841
842{ .mfi
843      nop.m          0
844      fma.s1         fRes        = fA9, fArgSqr, fA8 // A9*s + A8
845      nop.i          0
846}
847{ .mfi
848      nop.m          0
849      fma.s1         fA7         = fA7, fArgSqr, fA6 // A7*s + A6
850      nop.i          0
851};;
852
853{ .mfi
854      nop.m          0
855      fma.s1         fA3         = fA3, fArgSqr, fA2 // A3*s + A2
856      nop.i          0
857}
858{ .mfi
859      nop.m          0
860      fma.s1         fA5         = fA5, fArgSqr, fA4 // A5*s + A4
861      nop.i          0
862};;
863
864{ .mfi
865      nop.m          0
866      fma.s1         fA1         = fA1, fArgSqr, f0 // A1*s
867      nop.i          0
868};;
874
875{ .mfi
876      nop.m          0
877      fma.s1         fRes        = fRes, fTSqr, fA7 // terms A9..A6
878      nop.i          0
879};;
880
881{ .mfi
882      nop.m          0
883      fma.s1         fA1         = fA3, fTSqr, fA1 // A3*s^3 + A2*s^2 + A1*s
884      nop.i          0
885};;
886
887{ .mfi
888      nop.m          0
889      fma.s1         fRes        = fRes, fTSqr, fA5 // terms A9..A4
890      nop.i          0
891};;
892
893{ .mfi
894      nop.m          0
895      fma.s1         fRes        = fRes, fTQuadr, fA1 // A9*s^9 + ... + A1*s
896      nop.i          0
897};;
898
899{ .mfb
900      nop.m          0
901      fma.d.s0       f8          = fRes, f8, f8 // x+x*Polynomial
902      br.ret.sptk    b0                         // Exit for |x| < 0.25
903};;
904
905
906
907
908
909// 19.0625 <= |x| < +inf Saturation path ///////////////////////////////////////
// Entered from the main path with p12 set.
// In:  fSignumX = 1.0 with the sign of x, rDataPtr = address of tanh_data.
// Out: f8 = sign(x) * C, where C is the last entry of tanh_data (the
//      just-below-1.0 constant), rounded to double by the final fma.d.
910.align 32
911tanh_saturation:
912{ .mfi
913      adds           rDataPtr    = 0xCD0, rDataPtr  // address of the constant
914      nop.f          0
915      nop.i          0
916};;
917
918{ .mfi
919      ldfe           fA0         = [rDataPtr]       // Load the constant just
                                                    // below 1.0 into fA0
920      nop.f          0
921      nop.i          0
922};;
923
924{ .mfb
925      nop.m          0
926      fma.d.s0       f8          = fA0, fSignumX, f0 // sign(x)*(1.0-tiny)
927      br.ret.sptk    b0                       // Exit for 19.0625 <=|x|< +inf
928};;
929
930
931
932
933
934//  0, denormals and special IEEE numbers path /////////////////////////////////
// Entered from the entry code with p6 set (fclass mask 0xEF).
// In:  f8 = x, rArg = raw bits of x.
// Out: f8 = +/-1.0 for infinities, 2*x for zeros and NaNs (keeps +/-0 and
//      yields a NaN), and x -/+ x^2 for the remaining (denormal) inputs.
935_tanh_spec:
936
937{ .mfi
938      cmp.lt         p15, p14    = rArg, r0 // Is arg negative (p15)
939                                            // or positive (p14)
940      fclass.m       p6,p0       = f8, 0x23 // To filter infinities
941                                          // 0x23 = @pos|@neg|@inf
942      nop.i          0
943};;
944
945{ .mfi
946      nop.m          0
947      fclass.m       p7,p0       = f8, 0xC7 // To filter NaNs & Zeros
948                                 // 0xC7 = @pos|@neg|@zero|@qnan|@snan
949      nop.i          0
950};;
951
952{ .mfb
953      nop.m          0
      // sign taken from x, exponent and significand taken from 1.0
954(p6)  fmerge.s       f8          = f8, f1     // +/-1 for INF args
955(p6)  br.ret.spnt    b0                       // exit for x = INF
956};;
957
958{ .mfb
959      nop.m          0
960(p7)  fma.d.s0       f8          = f8, f1, f8    // +/-0 for 0 args
961                                                 // and NaNs for NaNs
962(p7)  br.ret.spnt    b0                          // exit for x = NaN or +/-0
963};;
964
// Only denormal inputs reach this point.
965{ .mfi
966      nop.m          0
967      fnorm.s0       f8          = f8            // Normalize arg
968      nop.i          0
969};;
970
971.pred.rel "mutex",p14,p15
972{ .mfi
973      nop.m          0
974(p14) fnma.d.s0      f8          = f8, f8, f8  // res = r-r^2 (positive arg)
975      nop.i          0
976}
977{ .mfb
978      nop.m          0
979(p15) fma.d.s0       f8          = f8, f8, f8  // res = r+r^2 (negative arg)
980      br.ret.sptk    b0          // 0, denormals, specials return
981};;
982
983GLOBAL_LIBM_END(tanh)
984libm_alias_double_other (tanh, tanh)
985