1.file "tanhf.s"
2
3
4// Copyright (c) 2001 - 2005, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39// History
40//==============================================================
41// 05/30/01 Initial version
42// 05/20/02 Cleaned up namespace and sf0 syntax
43// 02/10/03 Reordered header: .section, .global, .proc, .align
44// 03/31/05 Reformatted delimiters between data tables
45//
46// API
47//==============================================================
48// float tanhf(float)
49//
50// Overview of operation
51//==============================================================
52// Background
53//
54//
55// There are 9 paths:
56// 1. x = +/-0.0
57//    Return tanhf(x) = +/-0.0
58//
59// 2. 0.0 < |x| < 0.3125
60//    Return tanhf(x) = x + x^3*Pol3(x^2),
61//    where Pol3(x^2) = C3*x^6 + C2*x^4 + C1*x^2 + C0
62//
63// 3. 0.3125 <= |x| < 8.0
64//    Return tanhf(x) = sign(x)*PolD(x)*PolC(|x|) + sign(x)*PolA(|x|),
65//    where sign(x)*PolD(x) = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4),
66//          PolC(|x|) = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0,
67//          PolA(|x|) = A3|x|^3 + A2*x^2 + A1*|x| + A0
68//
69//    The range 0.3125 <= |x| < 8.0 is actually split into 5 subranges,
70//    each with its own set of coefficients.
71//    Below is the list of subranges:
72//    3.1 0.3125 <= |x| < 0.5
73//    3.2 0.5 <= |x| < 1.0
74//    3.3 1.0 <= |x| < 2.0
75//    3.4 2.0 <= |x| < 4.0
76//    3.5 4.0 <= |x| < 8.0
77//
78// 4. 8.0 <= |x| < 9.125
79//    Return tanhf(x) = sign(x)*(A3|x|^3 + A2*x^2 + A1*|x| + A0)
80//
81// 5. 9.125 <= |x| < +INF
82//    Return tanhf(x) = sign(x)*(1.0d - 2^(-53))
83//
84// 6. |x| = INF
85//    Return tanhf(x) = sign(x) * 1.0
86//
87// 7. x = [S,Q]NaN
88//    Return tanhf(x) = QNaN
89//
90// 8. x is positive denormal
91//    Return tanhf(x) = x - x^2
92//
93// 9. x is negative denormal
94//    Return tanhf(x) = x + x^2
95//
96// Registers used
97//==============================================================
98// Floating Point registers used:
99// f8, input
100// f32 -> f59
101
102// General registers used:
103// r32 -> r46, r2, r3
104
105// Predicate registers used:
106// p0, p6 -> p15
107
108// p6           to filter out case when x = [Q,S]NaN or +/-0
109// p7           to filter out case when x = denormal
110// p8           set if |x| >= 0.3125, used also to process denormal input
111// p9           to filter out case when |x| = inf
112// p10          to filter out case when |x| < 0.3125
113// p11          to filter out case when 0.3125 <= |x| < 9.125
114// p12          to filter out case when |x| >= 9.125
115// p13          to filter out case when 8.0 <= |x| < 9.125
116// p14          set to 1 for positive x
117// p15          set to 1 for negative x
118
119// Assembly macros
120//==============================================================
// Symbolic names for the general registers used by tanhf.
// r32..r46 are the stacked registers obtained by the alloc at function entry
// (1 input + 14 locals); r2 and r3 are static registers.
121rDataPtr           = r2   // address of tanhf_data (loaded through @ltoff)
122rDataPtr1          = r3   // not referenced in the code visible in this file
123
124rBias              = r33  // coefficient-set index: 0,4,8,12,16 or 0x14
125rCoeffAddr3        = r34  // pointer used for the C2/C3 and then D0/D1 loads
126rNearSaturation    = r35  // constant 0x14, compared against rBias to set p13
127rCoeffAddr1        = r36  // pointer used for the C0/C1 and then D2/B0 loads
128rCoeffAddr2        = r37  // pointer used for the A0/A1 and then A2/A3 loads
129rOffset2           = r38  // (bits of x with rMask bits cleared) >> 21
130rBias2             = r39  // constant 0x1F4, subtracted from rOffset2
131rMask              = r40  // 0x806 << 20: sign bit + 2 top significand bits
132rArg               = r41  // single-precision bit pattern of x
133rBound             = r42  // 0x3EA << 20: bit pattern of 0.3125f
134rSignBit           = r43  // 1 << 31: single-precision sign-bit mask
135rAbsArg            = r44  // single-precision bit pattern of |x|
136rDataPtr2          = r45  // not referenced in the code visible in this file
137rSaturation        = r46  // 0x4112 << 16: bit pattern of 9.125f
138
139//==============================================================
// Symbolic names for the floating-point registers used by tanhf.
// fA*/fC*/fD*/fB0 receive the coefficients loaded from tanhf_data; the rest
// hold powers of the argument and partial polynomial sums.
140fA0                = f32  // A0 (also receives the saturation constant)
141fA1                = f33  // A1
142fA2                = f34  // A2
143fA3                = f35  // A3
144fC0                = f36  // C0 (near-zero path: first coefficient of Pol3)
145fC1                = f37  // C1
146fC2                = f38  // C2
147fC3                = f39  // C3
148fD0                = f40  // D0
149fD1                = f41  // D1
150fD2                = f42  // D2
151fB0                = f43  // B0, later overwritten with B0*x^4
152fArgSqr            = f44  // x^2
153fAbsArg            = f45  // |x|
154fSignumX           = f46  // 1.0 carrying the sign of x
155fArg4              = f47  // x^4
156fArg4Sgn           = f48  // sign(x)*x^4
157fArg3              = f49  // |x|^3
158fArg3Sgn           = f50  // sign(x)*|x|^3
159fArg7Sgn           = f51  // sign(x)*|x|^7
160fArg6Sgn           = f52  // sign(x)*x^6
161fPolC              = f53  // PolC accumulator
162fPolCTmp           = f54  // C1*|x| + C0 (near-zero path: C1*x^2 + C0)
163fPolA              = f55  // PolA accumulator
164fPolATmp           = f56  // partial sum of PolA
165fPolD              = f57  // PolD accumulator
166fPolDTmp           = f58  // sign(x)*(|x|^7 + D2*x^6)
167fArgSqrSgn         = f59  // sign(x)*x^2
168
169// Data tables
170//==============================================================
171
172RODATA
173
174.align 16
175
// Coefficient table for tanhf. Every entry is an 8-byte value that the code
// loads as a double (ldfpd/ldfd).
//
// Byte offsets from tanhf_data, as the code addresses them:
//     0..319  five 64-byte sets {C0,C1,C2,C3,D0,D1,D2,B0}, one per subrange
//             of 0.3125 <= |x| < 8.0 (set offset = rBias*16)
//   320..351  A0..A3 for |x| < 0.3125 (the code loads these into fC0..fC3)
//   352..511  five 32-byte sets {A0,A1,A2,A3}, same subrange order
//             (set offset = 352 + rBias*8)
//   512..543  A0..A3 for 8.0 <= |x| < 9.125
//   544       saturation constant
// The order and size of the entries must not change: the code uses the
// literal offsets 16, 48, 352 and 544.
176LOCAL_OBJECT_START(tanhf_data)
177// Polynomial coefficients for the tanh(x), 0.3125 <= |x| < 0.5
178data8 0x3F9BEEDFDD177D7B // C0
179data8 0x3F970D10C7F32458 // C1
180data8 0x3F766D6B051F3A38 // C2
181data8 0xBF732F2001B23402 // C3
182data8 0xBF854BE1CE1ED499 // D0
183data8 0x4013C944F3999A16 // D1
184data8 0xC01106C6975222C0 // D2
185data8 0x3F783D5ACCF9EBE8 // B0
186// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
187data8 0xBF5D631440786869 // C0
188data8 0xBF575D79A0D52069 // C1
189data8 0xBF7E2237B7EFC705 // C2
190data8 0x3F6A7ACBC273041F // C3
191data8 0xC040E32EA52D91EB // D0
192data8 0x403D19463E5DB4D7 // D1
193data8 0xC02216F61F759F39 // D2
194data8 0xBF55B4EA0B844BE7 // B0
195// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
196data8 0x3F8637DBE5B3E690 // C0
197data8 0xBF7F7FEC158C07F5 // C1
198data8 0x3F711C586706838A // C2
199data8 0xBF50EF7EF605554E // C3
200data8 0xC054D45448354E25 // D0
201data8 0x404ADFEEA282E730 // D1
202data8 0xC028AEE456D59549 // D2
203data8 0x3F25232D1BED59A8 // B0
204// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 4.0
205data8 0xBF52602285F2D06C // C0
206data8 0x3F2E57C298FFE1E0 // C1
207data8 0xBF15ED575DB3C811 // C2
208data8 0x3EE428878A08525C // C3
209data8 0xC0895A26849039C1 // D0
210data8 0x406E3C60BBFBB575 // D1
211data8 0xC03A06F62867C75A // D2
212data8 0xBEB114C70F1C723E // B0
213// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 8.0
214data8 0x3EF4B22BD17039A3 // C0
215data8 0xBEB704ADC040C57F // C1
216data8 0x3E937A98288AFE1A // C2
217data8 0xBE4F33B2C9FFE7E7 // C3
218data8 0xC0BE48CFADE2431E // D0
219data8 0x4090E74249760FDD // D1
220data8 0xC04B6F537FCF2F1E // D2
221data8 0x3E0DCD879C91ADEA // B0
222// Polynomial coefficients for the tanh(x), -0.3125 < x < 0.3125
// (tanhf_near_zero uses them as C0..C3 of x + x^3*Pol3(x^2))
223data8 0xBFD555551E8245B7 // A0
224data8 0x3FC110E63F52E689 // A1
225data8 0xBFAB8CD6A5B7BAFA // A2
226data8 0x3F945D467FCEB553 // A3
227// Polynomial coefficients for the tanh(x), 0.3125 <= |x| < 0.5
228data8 0xBE3DCC92FCAECBB6 // A0
229data8 0x3FF0000043B7D267 // A1
230data8 0xBED18BF28ACFC4B1 // A2
231data8 0xBFD554A56F82837E // A3
232// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
233data8 0x3EFD6054758539F9 // A0
234data8 0x3FEFFBFC77198EBE // A1
235data8 0x3F700327CA98D237 // A2
236data8 0xBFD68955F5BB2FA1 // A3
237// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
238data8 0xBF71A53F229DF01B // A0
239data8 0x3FF0AECFD730DE50 // A1
240data8 0xBFC882F88E5DF3BA // A2
241data8 0x3FC6EDF212CA2A8D // A3
242// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 4.0
243data8 0xBFAF0B712E9EDA47 // A0
244data8 0x3FF1C208080BEA64 // A1
245data8 0x3FC3D29B20C8946E // A2
246data8 0xBFF04514ED900A6A // A3
247// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 8.0
248data8 0xBFB1DEA49A831CBC // A0
249data8 0x3FFA729FC7085674 // A1
250data8 0xBFF2F44D923A8FA4 // A2
251data8 0x3FE092FC5712227E // A3
252// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 9.125
253data8 0x3FEFFF5769EE3041 // A0
254data8 0x3EFBBF148D850891 // A1
255data8 0xBEC86BCEF0F5C2FE // A2
256data8 0x3E7CBA4F3A885A5C // A3
257// Saturation value returned (with the sign of x) for 9.125 <= |x| < +inf
258data8 0x3FEFFFFFFFFFFFFF // largest double below 1.0, i.e. 1.0 - 2^(-53)
259LOCAL_OBJECT_END(tanhf_data)
260
// float tanhf(float) -- entry point, argument classification and the main
// path for 0.3125 <= |x| < 8.0.
//
// In:   f8 = x
// Out:  f8 = tanhf(x)
//
// Dispatch performed below (see the predicate list at the top of the file):
//   p7   x is denormal            -> branch to tanhf_denormal
//   p6   x is NaN or +/-0         -> return x*1.0 + x
//   p9   x is +/-inf              -> return 1.0 with the sign of x
//   p10  |x| < 0.3125             -> branch to tanhf_near_zero
//   p12  |x| >= 9.125             -> branch to tanhf_saturation
//   p13  8.0 <= |x| < 9.125       -> branch to tanhf_close_to_saturation
//   otherwise fall through and return
//        sign(x)*PolD(x)*PolC(|x|) + sign(x)*PolA(|x|)
//
// Coefficient selection: the single-precision bits of x, with the sign bit
// and the two top significand bits cleared, are shifted right by 21 and
// rebased by 0x1F4. This yields rBias = 0, 4, 8, 12, 16 for the five
// subranges of [0.3125, 8.0) and 0x14 for [8.0, 9.125); rBias is forced to
// 0x14 when |x| < 0.3125. rBias*16 indexes the 64-byte C/D/B sets from the
// start of tanhf_data, and rBias*8 indexes the 32-byte A sets from offset 352.
//
// The range tests are done as integer compares on the bit pattern of |x|.
// The code is hand-scheduled: instruction order, bundle templates and stop
// bits (;;) must be kept as they are.
261.section .text
262GLOBAL_LIBM_ENTRY(tanhf)
263
264{ .mfi
265      alloc          r32 = ar.pfs, 1, 14, 0, 0
266      fmerge.s       fAbsArg = f1, f8             // |x|
267      addl           rMask = 0x806, r0
268}
269{ .mfi
270      addl           rDataPtr = @ltoff(tanhf_data), gp
271      fma.s1         fArgSqr = f8, f8, f0         // x^2
272      adds           rSignBit = 0x1, r0
273}
274;;
275
276{ .mfi
277      getf.s         rArg = f8                    // x in GR
278      fclass.m       p7,p0 = f8, 0x0b             // is x denormal ?
279      // sign bit and 2 most bits in significand
280      shl            rMask = rMask, 20
281}
282{ .mfi
283      ld8            rDataPtr = [rDataPtr]
284      nop.f          0
285      adds           rBias2 = 0x1F4, r0
286}
287;;
288
289{ .mfi
290      adds           rNearSaturation = 0x14, r0
291      fmerge.s       fSignumX = f8, f1            // signum(x)
292      shl            rSignBit = rSignBit, 31      // mask for sign bit
293}
294{ .mfi
295      adds           rBound = 0x3EA, r0
296      nop.f          0
297      addl           rSaturation = 0x4112, r0
298}
299;;
300
301{ .mfi
302      andcm          rOffset2 = rArg, rMask
303      fclass.m       p6,p0 = f8, 0xc7             // is x [S,Q]NaN or +/-0 ?
304      shl            rBound = rBound, 20          // 0.3125f in GR
305}
306{ .mfb
307      andcm          rAbsArg = rArg, rSignBit     // |x| in GR
308      nop.f          0
309(p7)  br.cond.spnt   tanhf_denormal               // branch out if x is denormal
310}
311;;
312
313{ .mfi
      // offset 352 = start of the A sets for 0.3125 <= |x|
314      adds           rCoeffAddr2 = 352, rDataPtr
315      fclass.m       p9,p0 = f8, 0x23            // is x +/- inf?
316      shr            rOffset2 = rOffset2, 21
317}
318{ .mfi
319      cmp.lt         p10, p8 = rAbsArg, rBound   // |x| < 0.3125?
320      nop.f          0
      // offset 16 = C2 within a C/D/B set
321      adds           rCoeffAddr3 = 16, rDataPtr
322}
323;;
324
325{ .mfi
326(p8)  sub            rBias = rOffset2, rBias2
327      fma.s1         fArg4 = fArgSqr, fArgSqr, f0 // x^4
      // 0x4112 << 16 = 9.125f in GR
328      shl            rSaturation = rSaturation, 16
329}
330{ .mfb
      // |x| < 0.3125: select the near-zero coefficients at offset 0x14*16 = 320
331(p10) adds           rBias = 0x14, r0
332(p6)  fma.s.s0       f8 = f8,f1,f8                // NaN or +/-0
333(p6)  br.ret.spnt    b0                           // exit for x = NaN or +/-0
334}
335;;
336
337{ .mfi
338      shladd         rCoeffAddr1 = rBias, 4, rDataPtr
339      fma.s1         fArg3Sgn = fArgSqr, f8, f0  // sign(x)*|x|^3
340      // is |x| < 9.125?
341      cmp.lt         p11, p12 = rAbsArg, rSaturation
342}
343{ .mfi
344      shladd         rCoeffAddr3 = rBias, 4, rCoeffAddr3
345      fma.s1         fArg3 = fArgSqr, fAbsArg, f0 // |x|^3
346      shladd         rCoeffAddr2 = rBias, 3, rCoeffAddr2
347}
348;;
349
350{ .mfi
351(p11) ldfpd          fC0, fC1 = [rCoeffAddr1]
352(p9)  fmerge.s       f8 = f8,f1                   // +/- inf
      // offset 544 = saturation constant
353(p12) adds           rDataPtr = 544, rDataPtr
354}
355{ .mfb
356(p11) ldfpd          fC2, fC3 = [rCoeffAddr3], 16
357      nop.f          0
358(p9)  br.ret.spnt    b0                           // exit for x = +/- inf
359}
360;;
361
362{ .mfi
363(p11) ldfpd          fA0, fA1 = [rCoeffAddr2], 16
364      nop.f          0
      // p13 = (|x| >= 0.3125 and rBias == 0x14), i.e. 8.0 <= |x| < 9.125
      // on the paths that use it
365(p8)  cmp.eq.unc     p13, p0 = rBias, rNearSaturation
366}
367{ .mfi
      // advance to D2/B0 (offset 48 within the set)
368      add            rCoeffAddr1 = 48, rCoeffAddr1
369      nop.f          0
370      nop.i          0
371}
372;;
373
374{ .mfi
375(p11) ldfpd          fD0, fD1 = [rCoeffAddr3]
376      nop.f          0
377      nop.i          0
378}
379{ .mfb
380(p11) ldfpd          fD2, fB0 = [rCoeffAddr1]
381      // sign(x)*|x|^2
382      fma.s1         fArgSqrSgn = fArgSqr, fSignumX, f0
383(p10) br.cond.spnt   tanhf_near_zero
384}
385;;
386
387{ .mfi
388(p11) ldfpd          fA2, fA3 = [rCoeffAddr2], 16
      // p15 = x < 0, p14 = its complement; selects the final fma/fms
389      fcmp.lt.s1     p15, p14 = f8,f0
390      nop.i          0
391}
392{ .mfb
393(p12) ldfd           fA0 = [rDataPtr]
394      fma.s1         fArg4Sgn = fArg4, fSignumX, f0 // sign(x)*|x|^4
395(p12) br.cond.spnt   tanhf_saturation
396}
397;;
398{ .mfi
399      nop.m          0
400      fma.s1         fArg7Sgn = fArg4, fArg3Sgn, f0  // sign(x)*|x|^7
401      nop.i          0
402}
403{ .mfb
404      nop.m          0
405      fma.s1         fArg6Sgn = fArg3, fArg3Sgn, f0  // sign(x)*|x|^6
406(p13) br.cond.spnt   tanhf_close_to_saturation
407}
408;;
409
// Main path: 0.3125 <= |x| < 8.0
410{ .mfi
411      nop.m          0
412      fma.s1         fPolC = fC3, fAbsArg, fC2    // C3*|x| + C2
413      nop.i          0
414}
415{ .mfi
416      nop.m          0
417      fma.s1         fPolCTmp = fC1, fAbsArg, fC0 // C1*|x| + C0
418      nop.i          0
419};;
420
421{ .mfi
422      nop.m          0
423      fma.s1         fPolA = fA1, fAbsArg, fA0    // A1*|x| + A0
424      nop.i          0
425}
426;;
427
428{ .mfi
429      nop.m          0
430      fma.s1         fPolD = fD1, fAbsArg, fD0    // D1*|x| + D0
431      nop.i          0
432}
433{ .mfi
434      nop.m          0
435      // sign(x)*(|x|^7 + D2*x^6)
436      fma.s1         fPolDTmp = fArg6Sgn, fD2, fArg7Sgn
437      nop.i          0
438};;
439
440{ .mfi
441      nop.m          0
442      fma.s1         fPolATmp = fA3, fAbsArg, fA2  // A3*|x| + A2
443      nop.i          0
444}
445{ .mfi
446      nop.m          0
447      fma.s1         fB0 = fB0, fArg4, f0          // B0*x^4
448      nop.i          0
449};;
450
451{ .mfi
452      nop.m          0
453      // C3*|x|^3 + C2*x^2 + C1*|x| + C0
454      fma.s1         fPolC = fPolC, fArgSqr, fPolCTmp
455      nop.i          0
456}
457;;
458
459{ .mfi
460      nop.m          0
461      // PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4)
462      fma.d.s1       fPolD = fPolD, fArg4Sgn, fPolDTmp
463      nop.i          0
464}
465;;
466
467{ .mfi
468      nop.m          0
469      // PolA = A3|x|^3 + A2*x^2 + A1*|x| + A0
470      fma.d.s1       fPolA = fPolATmp, fArgSqr, fPolA
471      nop.i          0
472}
473;;
474
475{ .mfi
476      nop.m          0
477      // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
478      fma.d.s1       fPolC = fPolC, f1, fB0
479      nop.i          0
480}
481;;
482
// fPolD already carries sign(x), so only PolA needs its sign applied:
// x > 0:  PolC*PolD + PolA      x < 0:  PolC*PolD - PolA
483{ .mfi
484      nop.m          0
485(p14) fma.s.s0       f8 = fPolC, fPolD, fPolA     // for positive x
486      nop.i          0
487}
488{ .mfb
489      nop.m          0
490(p15) fms.s.s0       f8 = fPolC, fPolD, fPolA     // for negative x
491      br.ret.sptk    b0                           // Exit for 0.3125 <=|x|< 8.0
492};;
493
494
495// Here if |x| < 0.3125
// Returns x + x^3*(C3*x^6 + C2*x^4 + C1*x^2 + C0).
// On this path rBias was forced to 0x14, so fC0..fC3 hold the four
// coefficients stored at byte offset 320 of tanhf_data (labelled A0..A3 for
// -0.3125 < x < 0.3125 in the table).
// Inputs set up by the caller path: f8 = x, fArgSqr = x^2, fArg4 = x^4,
// fArg3Sgn = x^2 * x.
496tanhf_near_zero:
497{ .mfi
498      nop.m          0
499      fma.s1         fPolC = fC3, fArgSqr, fC2    // C3*x^2 + C2
500      nop.i          0
501}
502{ .mfi
503      nop.m          0
504      fma.s1         fPolCTmp = fC1, fArgSqr, fC0  // C1*x^2 + C0
505      nop.i          0
506};;
507
508{ .mfi
509      nop.m          0
510      fma.s1         fPolC = fPolC, fArg4, fPolCTmp // C3*x^6 + C2*x^4 + C1*x^2 + C0
511      nop.i          0
512};;
513
514{ .mfb
515      nop.m          0
516      // x + x^3*(C3*x^6 + C2*x^4 + C1*x^2 + C0)
517      fma.s.s0       f8 = fPolC, fArg3Sgn, f8
518      br.ret.sptk    b0                           // Exit for |x| < 0.3125
519};;
520
521// Here if 9.125 <= |x| < +inf
// fA0 holds the constant stored at byte offset 544 of tanhf_data
// (0x3FEFFFFFFFFFFFFF, the largest double below 1.0); fSignumX is 1.0 with
// the sign of x. The product is rounded to single precision by fma.s.
// NOTE(review): the constant is presumably chosen just below 1.0 so that the
// final rounding is inexact -- confirm against the libm design notes.
522tanhf_saturation:
523{ .mfb
524      nop.m          0
525      fma.s.s0       f8 = fA0, fSignumX, f0       // sign(x)*(1.0d - 2^(-53))
526      // Result is rounded to single precision here
527      br.ret.sptk    b0                           // Exit for 9.125 <=|x|< +inf
528}
529;;
530
531// Here if  8.0 <= |x| < 9.125
// Returns sign(x)*(A3*|x|^3 + A2*x^2 + A1*|x| + A0), with fA0..fA3 loaded
// from the set at byte offset 512 of tanhf_data (352 + 0x14*8).
// Inputs set up by the caller path: fAbsArg = |x|, fArgSqr = x^2,
// fArgSqrSgn = sign(x)*x^2, p14/p15 = x >= 0 / x < 0.
532tanhf_close_to_saturation:
533{ .mfi
534      nop.m          0
535      fma.s1         fPolATmp = fA1, fAbsArg, fA0 // A1*|x| + A0
536      nop.i          0
537}
538{ .mfi
539      nop.m          0
540      fma.s1         fPolA = fA3, fAbsArg, fA2    // A3*|x| + A2
541      nop.i          0
542}
543;;
544
// Tell the assembler that p14 and p15 are never both set, so the two
// predicated writes to f8 below may share an instruction group.
545.pred.rel "mutex", p14, p15
546{ .mfi
547      nop.m          0
548      // for positive x: (A3*|x| + A2)*x^2 + (A1*|x| + A0)
549(p14) fma.s.s0       f8 = fPolA, fArgSqr, fPolATmp
550      nop.i          0
551}
552{ .mfb
553      nop.m          0
554      // for negative x: (A3*|x| + A2)*(-x^2) - (A1*|x| + A0)
555(p15) fms.s.s0       f8 = fPolA, fArgSqrSgn, fPolATmp
556      br.ret.sptk    b0                           // Exit for 8.0 <=|x|< 9.125
557};;
558
559// Here if x is single precision denormal
// Returns x - x^2 for a positive denormal and x + x^2 for a negative one,
// i.e. the result is computed from x by an actual arithmetic operation
// rather than by copying the input.
// p7/p8 are reused here: p7 = x is a negative denormal, p8 = its complement.
560tanhf_denormal:
561{ .mfi
562      nop.m          0
563      fclass.m       p7,p8 = f8, 0x0a             // is x -denormal ?
564      nop.i          0
565}
566;;
567
568{ .mfi
569      nop.m          0
570(p7)  fma.s.s0       f8 = f8,f8,f8                // -denormal: x*x + x
571      nop.i          0
572}
573{ .mfb
574      nop.m          0
575(p8)  fnma.s.s0      f8 = f8,f8,f8                // +denormal: -(x*x) + x
576      br.ret.sptk    b0                           // Exit for denormal
577}
578;;
579
// GLOBAL_LIBM_END and libm_alias_float_other are macros defined outside this
// file; presumably they close the tanhf symbol opened by GLOBAL_LIBM_ENTRY
// and emit the public alias names -- confirm against the macro definitions.
580GLOBAL_LIBM_END(tanhf)
581libm_alias_float_other (tanh, tanh)
582