1.file "sinhl.s"
2
3
4// Copyright (c) 2000 - 2002, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39// History
40//==============================================================
41// 02/02/00 Initial version
42// 04/04/00 Unwind support added
43// 08/15/00 Bundle added after call to __libm_error_support to properly
44//          set [the previously overwritten] GR_Parameter_RESULT.
45// 10/12/00 Update to set denormal operand and underflow flags
46// 01/22/01 Fixed to set inexact flag for small args.  Fixed incorrect
47//          call to __libm_error_support for 710.476 < x < 11357.2166.
48// 05/02/01 Reworked to improve speed of all paths
49// 05/20/02 Cleaned up namespace and sf0 syntax
50// 12/04/02 Improved performance
51//
52// API
53//==============================================================
54// long double = sinhl(long double)
55// input  floating point f8
56// output floating point f8
57//
58// Registers used
59//==============================================================
60// general registers:
61// r14 -> r40
62// predicate registers used:
63// p6 -> p11
64// floating-point registers used:
65// f9 -> f15; f32 -> f90;
66// f8 has input, then output
67//
68// Overview of operation
69//==============================================================
70// There are seven paths
71// 1. 0 < |x| < 0.25          SINH_BY_POLY
72// 2. 0.25 <=|x| < 32         SINH_BY_TBL
73// 3. 32 <= |x| < 11357.21655 SINH_BY_EXP (merged path with SINH_BY_TBL)
74// 4. |x| >= 11357.21655      SINH_HUGE
75// 5. x=0                     Done with early exit
76// 6. x=inf,nan               Done with early exit
77// 7. x=denormal              SINH_DENORM
78//
79// For double extended we get overflow for x >= 400c b174 ddc0 31ae c0ea
80//                                           >= 11357.21655
81//
82//
83// 1. SINH_BY_POLY   0 < |x| < 0.25
84// ===============
85// Evaluate sinh(x) by a 13th order polynomial
86// Care is take for the order of multiplication; and P_1 is not exactly 1/3!,
87// P_2 is not exactly 1/5!, etc.
88// sinh(x) = sign * (series(e^x) - series(e^-x))/2
89//         = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11!
90//                        + ax^13/13!)
91//         = sign * (ax   + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) )
92//                        + ax * ( ax^4 * (1/5! + ax^4 * (1/9! + ax^4*1/13!)) ))
93//         = sign * (ax   + ax*p_odd + (ax*p_even))
94//         = sign * (ax   + Y_lo)
95// sinh(x) = sign * (Y_hi + Y_lo)
96// Note that ax = |x|
97//
98// 2. SINH_BY_TBL   0.25 <= |x| < 32.0
99// =============
100// sinh(x) = sinh(B+R)
101//         = sinh(B)cosh(R) + cosh(B)sinh(R)
102//
103// ax = |x| = M*log2/64 + R
104// B = M*log2/64
105// M = 64*N + j
106//   We will calculate M and get N as (M-j)/64
107//   The division is a shift.
108// exp(B)  = exp(N*log2 + j*log2/64)
109//         = 2^N * 2^(j*log2/64)
110// sinh(B) = 1/2(e^B -e^-B)
111//         = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64))
112// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64))
113// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64))
114// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32
115// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit)
116//
117// R = ax - M*log2/64
118// R = ax - M*log2_by_64_hi - M*log2_by_64_lo
119// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...)
120//        = 1 + p_odd + p_even
121//        where the p_even uses the A coefficients and the p_even uses
122//        the B coefficients
123//
124// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd
125//    cosh(R) = 1 + p_even
126//    sinh(B) = S_hi + S_lo
127//    cosh(B) = C_hi
128// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R)
129//
130// 3. SINH_BY_EXP   32.0 <= |x| < 11357.21655  ( 400c b174 ddc0 31ae c0ea )
131// ==============
132// Can approximate result by exp(x)/2 in this region.
133// Y_hi = Tjhi
134// Y_lo = Tjhi * (p_odd + p_even) + Tjlo
135// sinh(x) = Y_hi + Y_lo
136//
137// 4. SINH_HUGE     |x| >= 11357.21655  ( 400c b174 ddc0 31ae c0ea )
138// ============
139// Set error tag and call error support
140//
141//
142// Assembly macros
143//==============================================================
144r_ad5                 = r14
145r_rshf_2to57          = r15
146r_exp_denorm          = r15
147r_ad_mJ_lo            = r15
148r_ad_J_lo             = r16
149r_2Nm1                = r17
150r_2mNm1               = r18
151r_exp_x               = r18
152r_ad_J_hi             = r19
153r_ad2o                = r19
154r_ad_mJ_hi            = r20
155r_mj                  = r21
156r_ad2e                = r22
157r_ad3                 = r23
158r_ad1                 = r24
159r_Mmj                 = r24
160r_rshf                = r25
161r_M                   = r25
162r_N                   = r25
163r_jshf                = r26
164r_exp_2tom57          = r26
165r_j                   = r26
166r_exp_mask            = r27
167r_signexp_x           = r28
168r_signexp_sgnx_0_5    = r28
169r_exp_0_25            = r29
170r_sig_inv_ln2         = r30
171r_exp_32              = r30
172r_exp_huge            = r30
173r_ad4                 = r31
174
175GR_SAVE_PFS           = r34
176GR_SAVE_B0            = r35
177GR_SAVE_GP            = r36
178
179GR_Parameter_X        = r37
180GR_Parameter_Y        = r38
181GR_Parameter_RESULT   = r39
182GR_Parameter_TAG      = r40
183
184
185f_ABS_X               = f9
186f_X2                  = f10
187f_X4                  = f11
188f_tmp                 = f14
189f_RSHF                = f15
190
191f_Inv_log2by64        = f32
192f_log2by64_lo         = f33
193f_log2by64_hi         = f34
194f_A1                  = f35
195
196f_A2                  = f36
197f_A3                  = f37
198f_Rcub                = f38
199f_M_temp              = f39
200f_R_temp              = f40
201
202f_Rsq                 = f41
203f_R                   = f42
204f_M                   = f43
205f_B1                  = f44
206f_B2                  = f45
207
208f_B3                  = f46
209f_peven_temp1         = f47
210f_peven_temp2         = f48
211f_peven               = f49
212f_podd_temp1          = f50
213
214f_podd_temp2          = f51
215f_podd                = f52
216f_poly65              = f53
217f_poly6543            = f53
218f_poly6to1            = f53
219f_poly43              = f54
220f_poly21              = f55
221
222f_X3                  = f56
223f_INV_LN2_2TO63       = f57
224f_RSHF_2TO57          = f58
225f_2TOM57              = f59
226f_smlst_oflow_input   = f60
227
228f_pre_result          = f61
229f_huge                = f62
230f_spos                = f63
231f_sneg                = f64
232f_Tjhi                = f65
233
234f_Tjlo                = f66
235f_Tmjhi               = f67
236f_Tmjlo               = f68
237f_S_hi                = f69
238f_SC_hi_temp          = f70
239
240f_S_lo_temp1          = f71
241f_S_lo_temp2          = f72
242f_S_lo_temp3          = f73
243f_S_lo_temp4          = f73
244f_S_lo                = f74
245f_C_hi                = f75
246
247f_Y_hi                = f77
248f_Y_lo_temp           = f78
249f_Y_lo                = f79
250f_NORM_X              = f80
251
252f_P1                  = f81
253f_P2                  = f82
254f_P3                  = f83
255f_P4                  = f84
256f_P5                  = f85
257
258f_P6                  = f86
259f_Tjhi_spos           = f87
260f_Tjlo_spos           = f88
261f_huge                = f89
262f_signed_hi_lo        = f90
263
264
265// Data tables
266//==============================================================
267
268// DO NOT CHANGE ORDER OF THESE TABLES
269RODATA
270
271.align 16
272LOCAL_OBJECT_START(sinh_arg_reduction)
273//   data8 0xB8AA3B295C17F0BC, 0x00004005  // 64/log2 -- signif loaded with setf
274   data8 0xB17217F7D1000000, 0x00003FF8  // log2/64 high part
275   data8 0xCF79ABC9E3B39804, 0x00003FD0  // log2/64 low part
276   data8 0xb174ddc031aec0ea, 0x0000400c  // Smallest x to overflow (11357.21655)
277LOCAL_OBJECT_END(sinh_arg_reduction)
278
279LOCAL_OBJECT_START(sinh_p_table)
280   data8 0xB08AF9AE78C1239F, 0x00003FDE  // P6
281   data8 0xB8EF1D28926D8891, 0x00003FEC  // P4
282   data8 0x8888888888888412, 0x00003FF8  // P2
283   data8 0xD732377688025BE9, 0x00003FE5  // P5
284   data8 0xD00D00D00D4D39F2, 0x00003FF2  // P3
285   data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC  // P1
286LOCAL_OBJECT_END(sinh_p_table)
287
288LOCAL_OBJECT_START(sinh_ab_table)
289   data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC  // A1
290   data8 0x88888888884ECDD5, 0x00003FF8  // A2
291   data8 0xD00D0C6DCC26A86B, 0x00003FF2  // A3
292   data8 0x8000000000000002, 0x00003FFE  // B1
293   data8 0xAAAAAAAAAA402C77, 0x00003FFA  // B2
294   data8 0xB60B6CC96BDB144D, 0x00003FF5  // B3
295LOCAL_OBJECT_END(sinh_ab_table)
296
297LOCAL_OBJECT_START(sinh_j_hi_table)
298   data8 0xB504F333F9DE6484, 0x00003FFE
299   data8 0xB6FD91E328D17791, 0x00003FFE
300   data8 0xB8FBAF4762FB9EE9, 0x00003FFE
301   data8 0xBAFF5AB2133E45FB, 0x00003FFE
302   data8 0xBD08A39F580C36BF, 0x00003FFE
303   data8 0xBF1799B67A731083, 0x00003FFE
304   data8 0xC12C4CCA66709456, 0x00003FFE
305   data8 0xC346CCDA24976407, 0x00003FFE
306   data8 0xC5672A115506DADD, 0x00003FFE
307   data8 0xC78D74C8ABB9B15D, 0x00003FFE
308   data8 0xC9B9BD866E2F27A3, 0x00003FFE
309   data8 0xCBEC14FEF2727C5D, 0x00003FFE
310   data8 0xCE248C151F8480E4, 0x00003FFE
311   data8 0xD06333DAEF2B2595, 0x00003FFE
312   data8 0xD2A81D91F12AE45A, 0x00003FFE
313   data8 0xD4F35AABCFEDFA1F, 0x00003FFE
314   data8 0xD744FCCAD69D6AF4, 0x00003FFE
315   data8 0xD99D15C278AFD7B6, 0x00003FFE
316   data8 0xDBFBB797DAF23755, 0x00003FFE
317   data8 0xDE60F4825E0E9124, 0x00003FFE
318   data8 0xE0CCDEEC2A94E111, 0x00003FFE
319   data8 0xE33F8972BE8A5A51, 0x00003FFE
320   data8 0xE5B906E77C8348A8, 0x00003FFE
321   data8 0xE8396A503C4BDC68, 0x00003FFE
322   data8 0xEAC0C6E7DD24392F, 0x00003FFE
323   data8 0xED4F301ED9942B84, 0x00003FFE
324   data8 0xEFE4B99BDCDAF5CB, 0x00003FFE
325   data8 0xF281773C59FFB13A, 0x00003FFE
326   data8 0xF5257D152486CC2C, 0x00003FFE
327   data8 0xF7D0DF730AD13BB9, 0x00003FFE
328   data8 0xFA83B2DB722A033A, 0x00003FFE
329   data8 0xFD3E0C0CF486C175, 0x00003FFE
330   data8 0x8000000000000000, 0x00003FFF // Center of table
331   data8 0x8164D1F3BC030773, 0x00003FFF
332   data8 0x82CD8698AC2BA1D7, 0x00003FFF
333   data8 0x843A28C3ACDE4046, 0x00003FFF
334   data8 0x85AAC367CC487B15, 0x00003FFF
335   data8 0x871F61969E8D1010, 0x00003FFF
336   data8 0x88980E8092DA8527, 0x00003FFF
337   data8 0x8A14D575496EFD9A, 0x00003FFF
338   data8 0x8B95C1E3EA8BD6E7, 0x00003FFF
339   data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF
340   data8 0x8EA4398B45CD53C0, 0x00003FFF
341   data8 0x9031DC431466B1DC, 0x00003FFF
342   data8 0x91C3D373AB11C336, 0x00003FFF
343   data8 0x935A2B2F13E6E92C, 0x00003FFF
344   data8 0x94F4EFA8FEF70961, 0x00003FFF
345   data8 0x96942D3720185A00, 0x00003FFF
346   data8 0x9837F0518DB8A96F, 0x00003FFF
347   data8 0x99E0459320B7FA65, 0x00003FFF
348   data8 0x9B8D39B9D54E5539, 0x00003FFF
349   data8 0x9D3ED9A72CFFB751, 0x00003FFF
350   data8 0x9EF5326091A111AE, 0x00003FFF
351   data8 0xA0B0510FB9714FC2, 0x00003FFF
352   data8 0xA27043030C496819, 0x00003FFF
353   data8 0xA43515AE09E6809E, 0x00003FFF
354   data8 0xA5FED6A9B15138EA, 0x00003FFF
355   data8 0xA7CD93B4E965356A, 0x00003FFF
356   data8 0xA9A15AB4EA7C0EF8, 0x00003FFF
357   data8 0xAB7A39B5A93ED337, 0x00003FFF
358   data8 0xAD583EEA42A14AC6, 0x00003FFF
359   data8 0xAF3B78AD690A4375, 0x00003FFF
360   data8 0xB123F581D2AC2590, 0x00003FFF
361   data8 0xB311C412A9112489, 0x00003FFF
362   data8 0xB504F333F9DE6484, 0x00003FFF
363LOCAL_OBJECT_END(sinh_j_hi_table)
364
365LOCAL_OBJECT_START(sinh_j_lo_table)
366   data4 0x1EB2FB13
367   data4 0x1CE2CBE2
368   data4 0x1DDC3CBC
369   data4 0x1EE9AA34
370   data4 0x9EAEFDC1
371   data4 0x9DBF517B
372   data4 0x1EF88AFB
373   data4 0x1E03B216
374   data4 0x1E78AB43
375   data4 0x9E7B1747
376   data4 0x9EFE3C0E
377   data4 0x9D36F837
378   data4 0x9DEE53E4
379   data4 0x9E24AE8E
380   data4 0x1D912473
381   data4 0x1EB243BE
382   data4 0x1E669A2F
383   data4 0x9BBC610A
384   data4 0x1E761035
385   data4 0x9E0BE175
386   data4 0x1CCB12A1
387   data4 0x1D1BFE90
388   data4 0x1DF2F47A
389   data4 0x1EF22F22
390   data4 0x9E3F4A29
391   data4 0x1EC01A5B
392   data4 0x1E8CAC3A
393   data4 0x9DBB3FAB
394   data4 0x1EF73A19
395   data4 0x9BB795B5
396   data4 0x1EF84B76
397   data4 0x9EF5818B
398   data4 0x00000000 // Center of table
399   data4 0x1F77CACA
400   data4 0x1EF8A91D
401   data4 0x1E57C976
402   data4 0x9EE8DA92
403   data4 0x1EE85C9F
404   data4 0x1F3BF1AF
405   data4 0x1D80CA1E
406   data4 0x9D0373AF
407   data4 0x9F167097
408   data4 0x1EB70051
409   data4 0x1F6EB029
410   data4 0x1DFD6D8E
411   data4 0x9EB319B0
412   data4 0x1EBA2BEB
413   data4 0x1F11D537
414   data4 0x1F0D5A46
415   data4 0x9E5E7BCA
416   data4 0x9F3AAFD1
417   data4 0x9E86DACC
418   data4 0x9F3EDDC2
419   data4 0x1E496E3D
420   data4 0x9F490BF6
421   data4 0x1DD1DB48
422   data4 0x1E65EBFB
423   data4 0x9F427496
424   data4 0x1F283C4A
425   data4 0x1F4B0047
426   data4 0x1F130152
427   data4 0x9E8367C0
428   data4 0x9F705F90
429   data4 0x1EFB3C53
430   data4 0x1F32FB13
431LOCAL_OBJECT_END(sinh_j_lo_table)
432
433
434.section .text
435GLOBAL_IEEE754_ENTRY(sinhl)
436
437{ .mlx
438      getf.exp        r_signexp_x = f8   // Get signexp of x, must redo if unorm
439      movl            r_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
440}
441{ .mlx
442      addl            r_ad1 = @ltoff(sinh_arg_reduction), gp
443      movl            r_rshf_2to57 = 0x4778000000000000 // 1.10000 2^(63+57)
444}
445;;
446
447{ .mfi
448      ld8             r_ad1 = [r_ad1]
449      fmerge.s        f_ABS_X    = f0,f8
450      mov             r_exp_0_25 = 0x0fffd    // Form exponent for 0.25
451}
452{ .mfi
453      nop.m           0
454      fnorm.s1        f_NORM_X = f8
455      mov             r_exp_2tom57 = 0xffff-57
456}
457;;
458
459{ .mfi
460      setf.d          f_RSHF_2TO57 = r_rshf_2to57 // Form const 1.100 * 2^120
461      fclass.m        p10,p0 = f8, 0x0b           // Test for denorm
462      mov             r_exp_mask = 0x1ffff
463}
464{ .mlx
465      setf.sig        f_INV_LN2_2TO63 = r_sig_inv_ln2 // Form 1/ln2 * 2^63
466      movl            r_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift
467}
468;;
469
470{ .mfi
471      nop.m           0
472      fclass.m        p7,p0 = f8, 0x07  // Test if x=0
473      nop.i           0
474}
475{ .mfi
476      setf.exp        f_2TOM57 = r_exp_2tom57 // Form 2^-57 for scaling
477      nop.f           0
478      add             r_ad3 = 0x90, r_ad1  // Point to ab_table
479}
480;;
481
482{ .mfi
483      setf.d          f_RSHF = r_rshf     // Form right shift const 1.100 * 2^63
484      fclass.m        p6,p0 = f8, 0xe3     // Test if x nan, inf
485      add             r_ad4 = 0x2f0, r_ad1 // Point to j_hi_table midpoint
486}
487{ .mib
488      add             r_ad2e = 0x20, r_ad1 // Point to p_table
489      nop.i           0
490(p10) br.cond.spnt    SINH_DENORM          // Branch if x denorm
491}
492;;
493
494// Common path -- return here from SINH_DENORM if x is unnorm
495SINH_COMMON:
496{ .mfi
497      ldfe            f_smlst_oflow_input = [r_ad2e],16
498      nop.f           0
499      add             r_ad5 = 0x580, r_ad1 // Point to j_lo_table midpoint
500}
501{ .mib
502      ldfe            f_log2by64_hi  = [r_ad1],16
503      and             r_exp_x = r_exp_mask, r_signexp_x
504(p7)  br.ret.spnt     b0                  // Exit if x=0
505}
506;;
507
508// Get the A coefficients for SINH_BY_TBL
509{ .mfi
510      ldfe            f_A1 = [r_ad3],16
511      fcmp.lt.s1      p8,p9 = f8,f0           // Test for x<0
512      cmp.lt          p7,p0 = r_exp_x, r_exp_0_25  // Test x < 0.25
513}
514{ .mfb
515      add             r_ad2o = 0x30, r_ad2e  // Point to p_table odd coeffs
516(p6)  fma.s0          f8 = f8,f1,f0          // Result for x nan, inf
517(p6)  br.ret.spnt     b0                     // Exit for x nan, inf
518}
519;;
520
521// Calculate X2 = ax*ax for SINH_BY_POLY
522{ .mfi
523      ldfe            f_log2by64_lo  = [r_ad1],16
524      nop.f           0
525      nop.i           0
526}
527{ .mfb
528      ldfe            f_A2 = [r_ad3],16
529      fma.s1          f_X2 = f_NORM_X, f_NORM_X, f0
530(p7)  br.cond.spnt    SINH_BY_POLY
531}
532;;
533
534// Here if |x| >= 0.25
535SINH_BY_TBL:
536// ******************************************************
537// STEP 1 (TBL and EXP) - Argument reduction
538// ******************************************************
539// Get the following constants.
540// Inv_log2by64
541// log2by64_hi
542// log2by64_lo
543
544
545// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and
546// put them in an exponent.
547// f_spos = 2^(N-1) and f_sneg = 2^(-N-1)
548// 0xffff + (N-1)  = 0xffff +N -1
549// 0xffff - (N +1) = 0xffff -N -1
550
551
552// Calculate M and keep it as integer and floating point.
553// M = round-to-integer(x*Inv_log2by64)
554// f_M = M = truncate(ax/(log2/64))
555// Put the integer representation of M in r_M
556//    and the floating point representation of M in f_M
557
558// Get the remaining A,B coefficients
559{ .mmi
560      ldfe            f_A3 = [r_ad3],16
561      nop.m           0
562      nop.i           0
563}
564;;
565
566.pred.rel "mutex",p8,p9
567// Use constant (1.100*2^(63-6)) to get rounded M into rightmost significand
568// |x| * 64 * 1/ln2 * 2^(63-6) + 1.1000 * 2^(63+(63-6))
569{ .mfi
570(p8)  mov             r_signexp_sgnx_0_5 = 0x2fffe // signexp of -0.5
571      fma.s1          f_M_temp = f_ABS_X, f_INV_LN2_2TO63, f_RSHF_2TO57
572(p9)  mov             r_signexp_sgnx_0_5 = 0x0fffe // signexp of +0.5
573}
574;;
575
576// Test for |x| >= overflow limit
577{ .mfi
578      ldfe            f_B1 = [r_ad3],16
579      fcmp.ge.s1      p6,p0 = f_ABS_X, f_smlst_oflow_input
580      nop.i           0
581}
582;;
583
584{ .mfi
585      ldfe            f_B2 = [r_ad3],16
586      nop.f           0
587      mov             r_exp_32 = 0x10004
588}
589;;
590
591// Subtract RSHF constant to get rounded M as a floating point value
592// M_temp * 2^(63-6) - 2^63
593{ .mfb
594      ldfe            f_B3 = [r_ad3],16
595      fms.s1          f_M = f_M_temp, f_2TOM57, f_RSHF
596(p6)  br.cond.spnt    SINH_HUGE  // Branch if result will overflow
597}
598;;
599
600{ .mfi
601      getf.sig        r_M = f_M_temp
602      nop.f           0
603      cmp.ge          p7,p6 = r_exp_x, r_exp_32 // Test if x >= 32
604}
605;;
606
607// Calculate j. j is the signed extension of the six lsb of M. It
608// has a range of -32 thru 31.
609
610// Calculate R
611// ax - M*log2by64_hi
612// R = (ax - M*log2by64_hi) - M*log2by64_lo
613
614{ .mfi
615      nop.m           0
616      fnma.s1         f_R_temp = f_M, f_log2by64_hi, f_ABS_X
617      and             r_j = 0x3f, r_M
618}
619;;
620
621{ .mii
622      nop.m           0
623      shl             r_jshf = r_j, 0x2 // Shift j so can sign extend it
624;;
625      sxt1            r_jshf = r_jshf
626}
627;;
628
629{ .mii
630      nop.m           0
631      shr             r_j = r_jshf, 0x2    // Now j has range -32 to 31
632      nop.i           0
633}
634;;
635
636{ .mmi
637      shladd          r_ad_J_hi = r_j, 4, r_ad4 // pointer to Tjhi
638      sub             r_Mmj = r_M, r_j          // M-j
639      sub             r_mj = r0, r_j            // Form -j
640}
641;;
642
643// The TBL and EXP branches are merged and predicated
644// If TBL, p6 true, 0.25 <= |x| < 32
645// If EXP, p7 true, 32 <= |x| < overflow_limit
646//
647// N = (M-j)/64
648{ .mfi
649      ldfe            f_Tjhi = [r_ad_J_hi]
650      fnma.s1         f_R = f_M, f_log2by64_lo, f_R_temp
651      shr             r_N = r_Mmj, 0x6            // N = (M-j)/64
652}
653{ .mfi
654      shladd          r_ad_mJ_hi = r_mj, 4, r_ad4 // pointer to Tmjhi
655      nop.f           0
656      shladd          r_ad_mJ_lo = r_mj, 2, r_ad5 // pointer to Tmjlo
657}
658;;
659
660{ .mfi
661      sub             r_2mNm1 = r_signexp_sgnx_0_5, r_N // signexp sgnx*2^(-N-1)
662      nop.f           0
663      shladd          r_ad_J_lo = r_j, 2, r_ad5   // pointer to Tjlo
664}
665{ .mfi
666      ldfe            f_Tmjhi = [r_ad_mJ_hi]
667      nop.f           0
668      add             r_2Nm1 = r_signexp_sgnx_0_5, r_N // signexp sgnx*2^(N-1)
669}
670;;
671
672{ .mmf
673      ldfs            f_Tmjlo = [r_ad_mJ_lo]
674      setf.exp        f_sneg = r_2mNm1            // Form sgnx * 2^(-N-1)
675      nop.f           0
676}
677;;
678
679{ .mmf
680      ldfs            f_Tjlo  = [r_ad_J_lo]
681      setf.exp        f_spos = r_2Nm1             // Form sgnx * 2^(N-1)
682      nop.f           0
683}
684;;
685
686// ******************************************************
687// STEP 2 (TBL and EXP)
688// ******************************************************
689// Calculate Rsquared and Rcubed in preparation for p_even and p_odd
690
691{ .mmf
692      nop.m           0
693      nop.m           0
694      fma.s1          f_Rsq  = f_R, f_R, f0
695}
696;;
697
698
699// Calculate p_even
700// B_2 + Rsq *B_3
701// B_1 + Rsq * (B_2 + Rsq *B_3)
702// p_even = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
703{ .mfi
704      nop.m           0
705      fma.s1          f_peven_temp1 = f_Rsq, f_B3, f_B2
706      nop.i           0
707}
708// Calculate p_odd
709// A_2 + Rsq *A_3
710// A_1 + Rsq * (A_2 + Rsq *A_3)
711// podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
712{ .mfi
713      nop.m           0
714      fma.s1          f_podd_temp1 = f_Rsq, f_A3, f_A2
715      nop.i           0
716}
717;;
718
719{ .mfi
720      nop.m           0
721      fma.s1          f_Rcub = f_Rsq, f_R, f0
722      nop.i           0
723}
724;;
725
726//
727// If TBL,
728// Calculate S_hi and S_lo, and C_hi
729// SC_hi_temp = sneg * Tmjhi
730// S_hi = spos * Tjhi - SC_hi_temp
731// S_hi = spos * Tjhi - (sneg * Tmjhi)
732// C_hi = spos * Tjhi + SC_hi_temp
733// C_hi = spos * Tjhi + (sneg * Tmjhi)
734
735{ .mfi
736      nop.m           0
737(p6)  fma.s1          f_SC_hi_temp = f_sneg, f_Tmjhi, f0
738      nop.i           0
739}
740;;
741
742// If TBL,
743// S_lo_temp3 = sneg * Tmjlo
744// S_lo_temp4 = spos * Tjlo - S_lo_temp3
745// S_lo_temp4 = spos * Tjlo -(sneg * Tmjlo)
746{ .mfi
747      nop.m           0
748(p6)  fma.s1          f_S_lo_temp3 =  f_sneg, f_Tmjlo, f0
749      nop.i           0
750}
751;;
752
753{ .mfi
754      nop.m           0
755      fma.s1          f_peven_temp2 = f_Rsq, f_peven_temp1, f_B1
756      nop.i           0
757}
758{ .mfi
759      nop.m           0
760      fma.s1          f_podd_temp2 = f_Rsq, f_podd_temp1, f_A1
761      nop.i           0
762}
763;;
764
765// If EXP,
766// Compute sgnx * 2^(N-1) * Tjhi and sgnx * 2^(N-1) * Tjlo
767{ .mfi
768      nop.m           0
769(p7)  fma.s1          f_Tjhi_spos = f_Tjhi, f_spos, f0
770      nop.i           0
771}
772{ .mfi
773      nop.m           0
774(p7)  fma.s1          f_Tjlo_spos = f_Tjlo, f_spos, f0
775      nop.i           0
776}
777;;
778
779{ .mfi
780      nop.m           0
781(p6)  fms.s1          f_S_hi = f_spos, f_Tjhi, f_SC_hi_temp
782      nop.i           0
783}
784;;
785
786{ .mfi
787      nop.m           0
788(p6)  fma.s1          f_C_hi = f_spos, f_Tjhi, f_SC_hi_temp
789      nop.i           0
790}
791{ .mfi
792      nop.m           0
793(p6)  fms.s1          f_S_lo_temp4 = f_spos, f_Tjlo, f_S_lo_temp3
794      nop.i           0
795}
796;;
797
798{ .mfi
799      nop.m           0
800      fma.s1          f_peven = f_Rsq, f_peven_temp2, f0
801      nop.i           0
802}
803{ .mfi
804      nop.m           0
805      fma.s1          f_podd = f_podd_temp2, f_Rcub, f_R
806      nop.i           0
807}
808;;
809
810// If TBL,
811// S_lo_temp1 =  spos * Tjhi - S_hi
812// S_lo_temp2 = -sneg * Tmjlo + S_lo_temp1
813// S_lo_temp2 = -sneg * Tmjlo + (spos * Tjhi - S_hi)
814
815{ .mfi
816      nop.m           0
817(p6)  fms.s1          f_S_lo_temp1 =  f_spos, f_Tjhi,  f_S_hi
818      nop.i           0
819}
820;;
821
822{ .mfi
823      nop.m           0
824(p6)  fnma.s1         f_S_lo_temp2 = f_sneg, f_Tmjhi, f_S_lo_temp1
825      nop.i           0
826}
827;;
828
829// If EXP,
830// Y_hi = sgnx * 2^(N-1) * Tjhi
831// Y_lo = sgnx * 2^(N-1) * Tjhi * (p_odd + p_even) + sgnx * 2^(N-1) * Tjlo
832{ .mfi
833      nop.m           0
834(p7)  fma.s1          f_Y_lo_temp =  f_peven, f1, f_podd
835      nop.i           0
836}
837;;
838
839// If TBL,
840// S_lo = S_lo_temp4 + S_lo_temp2
841{ .mfi
842      nop.m           0
843(p6)  fma.s1          f_S_lo = f_S_lo_temp4, f1, f_S_lo_temp2
844      nop.i           0
845}
846;;
847
848// If TBL,
849// Y_hi = S_hi
850// Y_lo = C_hi*p_odd + (S_hi*p_even + S_lo)
851{ .mfi
852      nop.m           0
853(p6)  fma.s1          f_Y_lo_temp = f_S_hi, f_peven, f_S_lo
854      nop.i           0
855}
856;;
857
858{ .mfi
859      nop.m           0
860(p7)  fma.s1          f_Y_lo = f_Tjhi_spos, f_Y_lo_temp, f_Tjlo_spos
861      nop.i           0
862}
863;;
864
865// Dummy multiply to generate inexact
866{ .mfi
867      nop.m           0
868      fmpy.s0         f_tmp = f_B2, f_B2
869      nop.i           0
870}
871{ .mfi
872      nop.m           0
873(p6)  fma.s1          f_Y_lo = f_C_hi, f_podd, f_Y_lo_temp
874      nop.i           0
875}
876;;
877
878// f8 = answer = Y_hi + Y_lo
879{ .mfi
880      nop.m           0
881(p7)  fma.s0          f8 = f_Y_lo,  f1, f_Tjhi_spos
882      nop.i           0
883}
884;;
885
886// f8 = answer = Y_hi + Y_lo
887{ .mfb
888      nop.m           0
889(p6)  fma.s0          f8 = f_Y_lo, f1, f_S_hi
890      br.ret.sptk     b0      // Exit for SINH_BY_TBL and SINH_BY_EXP
891}
892;;
893
894
895// Here if 0 < |x| < 0.25
896SINH_BY_POLY:
897{ .mmf
898      ldfe            f_P6 = [r_ad2e],16
899      ldfe            f_P5 = [r_ad2o],16
900      nop.f           0
901}
902;;
903
904{ .mmi
905      ldfe            f_P4 = [r_ad2e],16
906      ldfe            f_P3 = [r_ad2o],16
907      nop.i           0
908}
909;;
910
911{ .mmi
912      ldfe            f_P2 = [r_ad2e],16
913      ldfe            f_P1 = [r_ad2o],16
914      nop.i           0
915}
916;;
917
918{ .mfi
919      nop.m           0
920      fma.s1          f_X3 = f_NORM_X, f_X2, f0
921      nop.i           0
922}
923{ .mfi
924      nop.m           0
925      fma.s1          f_X4 = f_X2, f_X2, f0
926      nop.i           0
927}
928;;
929
930{ .mfi
931      nop.m           0
932      fma.s1          f_poly65 = f_X2, f_P6, f_P5
933      nop.i           0
934}
935{ .mfi
936      nop.m           0
937      fma.s1          f_poly43 = f_X2, f_P4, f_P3
938      nop.i           0
939}
940;;
941
942{ .mfi
943      nop.m           0
944      fma.s1          f_poly21 = f_X2, f_P2, f_P1
945      nop.i           0
946}
947;;
948
949{ .mfi
950      nop.m           0
951      fma.s1          f_poly6543 = f_X4, f_poly65, f_poly43
952      nop.i           0
953}
954;;
955
956{ .mfi
957      nop.m           0
958      fma.s1          f_poly6to1 = f_X4, f_poly6543, f_poly21
959      nop.i           0
960}
961;;
962
963// Dummy multiply to generate inexact
964{ .mfi
965      nop.m           0
966      fmpy.s0         f_tmp = f_P6, f_P6
967      nop.i           0
968}
969{ .mfb
970      nop.m           0
971      fma.s0          f8 = f_poly6to1, f_X3, f_NORM_X
972      br.ret.sptk     b0                // Exit SINH_BY_POLY
973}
974;;
975
976
977// Here if x denorm or unorm
978SINH_DENORM:
979// Determine if x really a denorm and not a unorm
980{ .mmf
981      getf.exp        r_signexp_x = f_NORM_X
982      mov             r_exp_denorm = 0x0c001   // Real denorms have exp < this
983      fmerge.s        f_ABS_X = f0, f_NORM_X
984}
985;;
986
987{ .mfi
988      nop.m           0
989      fcmp.eq.s0      p10,p0 = f8, f0  // Set denorm flag
990      nop.i           0
991}
992;;
993
994// Set p8 if really a denorm
995{ .mmi
996      and             r_exp_x = r_exp_mask, r_signexp_x
997;;
998      cmp.lt          p8,p9 = r_exp_x, r_exp_denorm
999      nop.i           0
1000}
1001;;
1002
1003// Identify denormal operands.
1004{ .mfb
1005      nop.m           0
1006(p8)  fcmp.ge.unc.s1  p6,p7 = f8, f0   // Test sign of denorm
1007(p9)  br.cond.sptk    SINH_COMMON    // Return to main path if x unorm
1008}
1009;;
1010
1011{ .mfi
1012      nop.m           0
1013(p6)  fma.s0          f8 =  f8,f8,f8  // If x +denorm, result=x+x^2
1014      nop.i           0
1015}
1016{ .mfb
1017      nop.m           0
1018(p7)  fnma.s0         f8 =  f8,f8,f8  // If x -denorm, result=x-x^2
1019      br.ret.sptk     b0            // Exit if x denorm
1020}
1021;;
1022
1023
1024// Here if |x| >= overflow limit
1025SINH_HUGE:
1026// for SINH_HUGE, put 24000 in exponent; take sign from input
1027{ .mmi
1028      mov             r_exp_huge = 0x15dbf
1029;;
1030      setf.exp        f_huge  = r_exp_huge
1031      nop.i           0
1032}
1033;;
1034
1035.pred.rel "mutex",p8,p9
1036{ .mfi
1037      alloc           r32 = ar.pfs,0,5,4,0
1038(p8)  fnma.s1         f_signed_hi_lo = f_huge, f1, f1
1039      nop.i           0
1040}
1041{ .mfi
1042      nop.m           0
1043(p9)  fma.s1          f_signed_hi_lo = f_huge, f1, f1
1044      nop.i           0
1045}
1046;;
1047
1048{ .mfi
1049      nop.m           0
1050      fma.s0          f_pre_result = f_signed_hi_lo, f_huge, f0
1051      mov             GR_Parameter_TAG = 126
1052}
1053;;
1054
1055GLOBAL_IEEE754_END(sinhl)
1056libm_alias_ldouble_other (__sinh, sinh)
1057
1058
1059LOCAL_LIBM_ENTRY(__libm_error_region)
1060.prologue
1061
1062{ .mfi
1063        add   GR_Parameter_Y=-32,sp              // Parameter 2 value
1064        nop.f 0
1065.save   ar.pfs,GR_SAVE_PFS
1066        mov  GR_SAVE_PFS=ar.pfs                  // Save ar.pfs
1067}
1068{ .mfi
1069.fframe 64
1070        add sp=-64,sp                            // Create new stack
1071        nop.f 0
1072        mov GR_SAVE_GP=gp                        // Save gp
1073};;
1074
1075{ .mmi
1076        stfe [GR_Parameter_Y] = f0,16            // STORE Parameter 2 on stack
1077        add GR_Parameter_X = 16,sp               // Parameter 1 address
1078.save   b0, GR_SAVE_B0
1079        mov GR_SAVE_B0=b0                        // Save b0
1080};;
1081
1082.body
1083{ .mib
1084        stfe [GR_Parameter_X] = f8               // STORE Parameter 1 on stack
1085        add   GR_Parameter_RESULT = 0,GR_Parameter_Y   // Parameter 3 address
1086        nop.b 0
1087}
1088{ .mib
1089        stfe [GR_Parameter_Y] = f_pre_result     // STORE Parameter 3 on stack
1090        add   GR_Parameter_Y = -16,GR_Parameter_Y
1091        br.call.sptk b0=__libm_error_support#    // Call error handling function
1092};;
1093
1094{ .mmi
1095        add   GR_Parameter_RESULT = 48,sp
1096        nop.m 0
1097        nop.i 0
1098};;
1099
1100{ .mmi
1101        ldfe  f8 = [GR_Parameter_RESULT]         // Get return result off stack
1102.restore sp
1103        add   sp = 64,sp                         // Restore stack pointer
1104        mov   b0 = GR_SAVE_B0                    // Restore return address
1105};;
1106
1107{ .mib
1108        mov   gp = GR_SAVE_GP                    // Restore gp
1109        mov   ar.pfs = GR_SAVE_PFS               // Restore ar.pfs
1110        br.ret.sptk     b0                       // Return
1111};;
1112
1113LOCAL_LIBM_END(__libm_error_region)
1114
1115
1116.type   __libm_error_support#,@function
1117.global __libm_error_support#
1118