1/* Function sinhf vectorized with SSE4.
2   Copyright (C) 2021-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   https://www.gnu.org/licenses/.  */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 *   Compute sinh(x) as (exp(x)-exp(-x))/2,
23 *   where exp is calculated as
24 *   exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
25 *
26 *   Special cases:
27 *
28 *   sinh(NaN) = quiet NaN, and raise invalid exception
29 *   sinh(INF) = that INF
30 *   sinh(x)   = x for subnormals
31 *   sinh(x) overflows for big x and returns MAXLOG+log(2)
32 *
33 */
34
35/* Offsets for data table __svml_ssinh_data_internal
36 */
37#define _sInvLn2			0
38#define _sLn2hi				16
39#define _sLn2lo				32
40#define _sSign				48
41#define _sShifter			64
42#define _iDomainRange			80
43#define _sPC1				96
44#define _sPC2				112
45#define _sPC3				128
46#define _sPC4				144
47#define _sPC5				160
48#define _sPC6				176
49#define _iHalf				192
50
51#include <sysdep.h>
52
53	.section .text.sse4, "ax", @progbits
54ENTRY(_ZGVbN4v_sinhf_sse4)
55	subq	$72, %rsp
56	cfi_def_cfa_offset(80)
57
58	/*
59	 *  Implementation
60	 *  Abs argument
61	 */
62	movups	_sSign+__svml_ssinh_data_internal(%rip), %xmm14
63	andps	%xmm0, %xmm14
64	movaps	%xmm14, %xmm10
65
66	/*
67	 *  Load argument
68	 * dM = x/log(2) + RShifter
69	 */
70	movups	_sInvLn2+__svml_ssinh_data_internal(%rip), %xmm7
71	pxor	%xmm0, %xmm10
72	mulps	%xmm10, %xmm7
73
74	/*
75	 * Check for overflow\underflow
76	 * MORE faster than GE?
77	 */
78	movaps	%xmm10, %xmm1
79	movups	_sShifter+__svml_ssinh_data_internal(%rip), %xmm2
80
81	/* sR = sX - sN*Log2_hi */
82	movups	_sLn2hi+__svml_ssinh_data_internal(%rip), %xmm3
83	addps	%xmm2, %xmm7
84
85	/*
86	 *  R
87	 * sN = sM - RShifter
88	 */
89	movaps	%xmm7, %xmm4
90
91	/*
92	 *  G1, G2 2^N, 2^(-N)
93	 * iM now is an EXP(2^N)
94	 */
95	pslld	$23, %xmm7
96
97	/* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
98	movups	_sLn2lo+__svml_ssinh_data_internal(%rip), %xmm5
99	subps	%xmm2, %xmm4
100	mulps	%xmm4, %xmm3
101	mulps	%xmm4, %xmm5
102	subps	%xmm3, %xmm10
103
104	/*
105	 * sinh(r) = r*((a1=1)+r^2*(a3+r^2*(a5+{v1 r^2*a7})))) = r + r*(r^2*(a3+r^2*(a5+r^2*a7))) ....
106	 * sSinh_r = (a3+r^2*a5)
107	 */
108	movups	_sPC5+__svml_ssinh_data_internal(%rip), %xmm8
109	subps	%xmm5, %xmm10
110
111	/* sR2 = sR^2 */
112	movaps	%xmm10, %xmm12
113	mulps	%xmm10, %xmm12
114
115	/*
116	 * sinh(X) = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)
117	 * sOut = (a4 +a6*sR2)
118	 */
119	movups	_sPC6+__svml_ssinh_data_internal(%rip), %xmm9
120	mulps	%xmm12, %xmm8
121	mulps	%xmm12, %xmm9
122	addps	_sPC3+__svml_ssinh_data_internal(%rip), %xmm8
123	addps	_sPC4+__svml_ssinh_data_internal(%rip), %xmm9
124
125	/* sSinh_r = r^2*(a3+r^2*a5) */
126	mulps	%xmm12, %xmm8
127
128	/* sOut = a2+sR2*(a4+a6*sR2) */
129	mulps	%xmm12, %xmm9
130
131	/* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */
132	mulps	%xmm10, %xmm8
133	addps	_sPC2+__svml_ssinh_data_internal(%rip), %xmm9
134	addps	%xmm8, %xmm10
135
136	/* sOut = sR2*(a2+sR2*(a4+a6*sR2) */
137	mulps	%xmm9, %xmm12
138	movdqu	_iHalf+__svml_ssinh_data_internal(%rip), %xmm6
139	movdqa	%xmm6, %xmm13
140	psubd	%xmm7, %xmm6
141	paddd	%xmm7, %xmm13
142
143	/* sG1 = 2^(N-1)+2^(-N-1) */
144	movdqa	%xmm13, %xmm11
145
146	/* sG2 = 2^(N-1)-2^(-N-1) */
147	subps	%xmm6, %xmm13
148	addps	%xmm6, %xmm11
149
150	/* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2) */
151	mulps	%xmm13, %xmm12
152
153	/* sOut = sG1*sinh(dR)+sG2*sR2*(a2+sR2*(a4+a6*sR2) */
154	mulps	%xmm10, %xmm11
155	pcmpgtd	_iDomainRange+__svml_ssinh_data_internal(%rip), %xmm1
156	addps	%xmm11, %xmm12
157	movmskps %xmm1, %edx
158
159	/* sOut = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2) */
160	addps	%xmm12, %xmm13
161
162	/*  Ret H  */
163	orps	%xmm13, %xmm14
164	testl	%edx, %edx
165
166	/* Go to special inputs processing branch */
167	jne	L(SPECIAL_VALUES_BRANCH)
168	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm14
169
170	/* Restore registers
171	 * and exit the function
172	 */
173
174L(EXIT):
175	movaps	%xmm14, %xmm0
176	addq	$72, %rsp
177	cfi_def_cfa_offset(8)
178	ret
179	cfi_def_cfa_offset(80)
180
181	/* Branch to process
182	 * special inputs
183	 */
184
185L(SPECIAL_VALUES_BRANCH):
186	movups	%xmm0, 32(%rsp)
187	movups	%xmm14, 48(%rsp)
188	# LOE rbx rbp r12 r13 r14 r15 edx
189
190	xorl	%eax, %eax
191	movq	%r12, 16(%rsp)
192	cfi_offset(12, -64)
193	movl	%eax, %r12d
194	movq	%r13, 8(%rsp)
195	cfi_offset(13, -72)
196	movl	%edx, %r13d
197	movq	%r14, (%rsp)
198	cfi_offset(14, -80)
199	# LOE rbx rbp r15 r12d r13d
200
201	/* Range mask
202	 * bits check
203	 */
204
205L(RANGEMASK_CHECK):
206	btl	%r12d, %r13d
207
208	/* Call scalar math function */
209	jc	L(SCALAR_MATH_CALL)
210	# LOE rbx rbp r15 r12d r13d
211
212	/* Special inputs
213	 * processing loop
214	 */
215
216L(SPECIAL_VALUES_LOOP):
217	incl	%r12d
218	cmpl	$4, %r12d
219
220	/* Check bits in range mask */
221	jl	L(RANGEMASK_CHECK)
222	# LOE rbx rbp r15 r12d r13d
223
224	movq	16(%rsp), %r12
225	cfi_restore(12)
226	movq	8(%rsp), %r13
227	cfi_restore(13)
228	movq	(%rsp), %r14
229	cfi_restore(14)
230	movups	48(%rsp), %xmm14
231
232	/* Go to exit */
233	jmp	L(EXIT)
234	cfi_offset(12, -64)
235	cfi_offset(13, -72)
236	cfi_offset(14, -80)
237	# LOE rbx rbp r12 r13 r14 r15 xmm14
238
239	/* Scalar math fucntion call
240	 * to process special input
241	 */
242
243L(SCALAR_MATH_CALL):
244	movl	%r12d, %r14d
245	movss	32(%rsp, %r14, 4), %xmm0
246	call	sinhf@PLT
247	# LOE rbx rbp r14 r15 r12d r13d xmm0
248
249	movss	%xmm0, 48(%rsp, %r14, 4)
250
251	/* Process special inputs in loop */
252	jmp	L(SPECIAL_VALUES_LOOP)
253	# LOE rbx rbp r15 r12d r13d
254END(_ZGVbN4v_sinhf_sse4)
255
256	.section .rodata, "a"
257	.align	16
258
259#ifdef __svml_ssinh_data_internal_typedef
260typedef unsigned int VUINT32;
261typedef struct {
262	__declspec(align(16)) VUINT32 _sInvLn2[4][1];
263	__declspec(align(16)) VUINT32 _sLn2hi[4][1];
264	__declspec(align(16)) VUINT32 _sLn2lo[4][1];
265	__declspec(align(16)) VUINT32 _sSign[4][1];
266	__declspec(align(16)) VUINT32 _sShifter[4][1];
267	__declspec(align(16)) VUINT32 _iDomainRange[4][1];
268	__declspec(align(16)) VUINT32 _sPC1[4][1];
269	__declspec(align(16)) VUINT32 _sPC2[4][1];
270	__declspec(align(16)) VUINT32 _sPC3[4][1];
271	__declspec(align(16)) VUINT32 _sPC4[4][1];
272	__declspec(align(16)) VUINT32 _sPC5[4][1];
273	__declspec(align(16)) VUINT32 _sPC6[4][1];
274	__declspec(align(16)) VUINT32 _iHalf[4][1];
275} __svml_ssinh_data_internal;
276#endif
277__svml_ssinh_data_internal:
278	.long	0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ // k=0
279	.align	16
280	.long	0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
281	.align	16
282	.long	0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4 /* _sLn2lo */
283	.align	16
284	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
285	.align	16
286	.long	0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
287	.align	16
288	.long	0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
289	.align	16
290	.long	0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
291	.align	16
292	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
293	.align	16
294	.long	0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
295	.align	16
296	.long	0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 */
297	.align	16
298	.long	0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 */
299	.align	16
300	.long	0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 */
301	// Integer constants
302	.align	16
303	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf */
304	.align	16
305	.type	__svml_ssinh_data_internal, @object
306	.size	__svml_ssinh_data_internal, .-__svml_ssinh_data_internal
307