1/* Function coshf vectorized with AVX-512.
2   Copyright (C) 2021-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   https://www.gnu.org/licenses/.  */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 *   Compute cosh(x) as (exp(x)+exp(-x))/2,
23 *   where exp is calculated as
24 *   exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
25 *
26 *   Special cases:
27 *
28 *   cosh(NaN) = quiet NaN, and raise invalid exception
29 *   cosh(INF) = that INF
30 *   cosh(0)   = 1
31 *   cosh(x) overflows for big x and returns MAXLOG+log(2)
32 *
33 */
34
35/* Offsets for data table __svml_scosh_data_internal
36 */
37#define _sExp_tbl_PH			0
38#define _sExp_tbl_NH			128
39#define _sShifter_UISA			256
40#define _iDomainRange_UISA		320
41#define _sPC1_UISA			384
42#define _sPC2_UISA			448
43#define _sPC3_UISA			512
44#define _sInvLn2			576
45#define _sLn2hi				640
46#define _sLn2lo				704
47#define _sSign				768
48#define _iExpMask			832
49#define _sShifter			896
50#define _iDomainRange			960
51#define _sPC1				1024
52#define _sPC2				1088
53#define _sPC3				1152
54
55#include <sysdep.h>
56
57	.section .text.exex512, "ax", @progbits
58ENTRY(_ZGVeN16v_coshf_skx)
59	pushq	%rbp
60	cfi_def_cfa_offset(16)
61	movq	%rsp, %rbp
62	cfi_def_cfa(6, 16)
63	cfi_offset(6, -16)
64	andq	$-64, %rsp
65	subq	$192, %rsp
66	vmovups	_sSign+__svml_scosh_data_internal(%rip), %zmm4
67	vmovups	_sShifter_UISA+__svml_scosh_data_internal(%rip), %zmm6
68
69	/*
70	 *  Load argument
71	 * dM = x/log(2) + RShifter
72	 */
73	vmovups	_sInvLn2+__svml_scosh_data_internal(%rip), %zmm10
74	vmovups	_sLn2hi+__svml_scosh_data_internal(%rip), %zmm7
75	vmovups	_sLn2lo+__svml_scosh_data_internal(%rip), %zmm9
76
77	/*  */
78	vmovups	_sPC3_UISA+__svml_scosh_data_internal(%rip), %zmm2
79
80	/* x^2 */
81	vmovups	_sPC2_UISA+__svml_scosh_data_internal(%rip), %zmm3
82
83	/*  G1, G2 2^N, 2^(-N)  */
84	vmovups	__svml_scosh_data_internal(%rip), %zmm12
85	vmovups	_sExp_tbl_NH+__svml_scosh_data_internal(%rip), %zmm13
86
87	/*
88	 *  Implementation
89	 *  Abs argument
90	 */
91	vandnps	%zmm0, %zmm4, %zmm1
92
93	/* Check for overflow\underflow  */
94	vpternlogd $255, %zmm5, %zmm5, %zmm5
95	vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm10
96	vpcmpd	$1, _iDomainRange_UISA+__svml_scosh_data_internal(%rip), %zmm1, %k1
97
98	/* iM now is an EXP(2^N) */
99	vpslld	$18, %zmm10, %zmm11
100
101	/*
102	 *  R
103	 * sN = sM - RShifter
104	 */
105	vsubps	{rn-sae}, %zmm6, %zmm10, %zmm8
106	vpermt2ps _sExp_tbl_PH+64+__svml_scosh_data_internal(%rip), %zmm10, %zmm12
107	vpermt2ps _sExp_tbl_NH+64+__svml_scosh_data_internal(%rip), %zmm10, %zmm13
108	vpandnd	%zmm1, %zmm1, %zmm5{%k1}
109
110	/* sR = sX - sN*Log2_hi */
111	vfnmadd231ps {rn-sae}, %zmm7, %zmm8, %zmm1
112	vptestmd %zmm5, %zmm5, %k0
113
114	/* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
115	vfnmadd231ps {rn-sae}, %zmm9, %zmm8, %zmm1
116	kmovw	%k0, %edx
117	vmulps	{rn-sae}, %zmm1, %zmm1, %zmm4
118	vmulps	{rn-sae}, %zmm4, %zmm2, %zmm2
119
120	/* sSinh_r = r + r*(r^2*(a3)) */
121	vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm2
122
123	/* sOut = r^2*(a2) */
124	vmulps	{rn-sae}, %zmm4, %zmm3, %zmm1
125	vpandd	_iExpMask+__svml_scosh_data_internal(%rip), %zmm11, %zmm14
126	vpaddd	%zmm14, %zmm12, %zmm15
127	vpsubd	%zmm14, %zmm13, %zmm10
128
129	/* sG2 = 2^N*Th + 2^(-N)*T_h */
130	vaddps	{rn-sae}, %zmm10, %zmm15, %zmm5
131
132	/* sG1 = 2^N*Th - 2^(-N)*T_h */
133	vsubps	{rn-sae}, %zmm10, %zmm15, %zmm6
134
135	/* res = sG1*(r + r*(r^2*(a3))) + sG2*(1+r^2*(a2)) */
136	vfmadd213ps {rn-sae}, %zmm5, %zmm5, %zmm1
137	vfmadd213ps {rn-sae}, %zmm1, %zmm2, %zmm6
138	testl	%edx, %edx
139
140	/* Go to special inputs processing branch */
141	jne	L(SPECIAL_VALUES_BRANCH)
142	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm6
143
144	/* Restore registers
145	 * and exit the function
146	 */
147
148L(EXIT):
149	vmovaps	%zmm6, %zmm0
150	movq	%rbp, %rsp
151	popq	%rbp
152	cfi_def_cfa(7, 8)
153	cfi_restore(6)
154	ret
155	cfi_def_cfa(6, 16)
156	cfi_offset(6, -16)
157
158	/* Branch to process
159	 * special inputs
160	 */
161
162L(SPECIAL_VALUES_BRANCH):
163	vmovups	%zmm0, 64(%rsp)
164	vmovups	%zmm6, 128(%rsp)
165	# LOE rbx r12 r13 r14 r15 edx zmm6
166
167	xorl	%eax, %eax
168	# LOE rbx r12 r13 r14 r15 eax edx
169
170	vzeroupper
171	movq	%r12, 16(%rsp)
172	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
173	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
174	movl	%eax, %r12d
175	movq	%r13, 8(%rsp)
176	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
177	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
178	movl	%edx, %r13d
179	movq	%r14, (%rsp)
180	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
181	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
182	# LOE rbx r15 r12d r13d
183
184	/* Range mask
185	 * bits check
186	 */
187
188L(RANGEMASK_CHECK):
189	btl	%r12d, %r13d
190
191	/* Call scalar math function */
192	jc	L(SCALAR_MATH_CALL)
193	# LOE rbx r15 r12d r13d
194
195	/* Special inputs
196	 * processing loop
197	 */
198
199L(SPECIAL_VALUES_LOOP):
200	incl	%r12d
201	cmpl	$16, %r12d
202
203	/* Check bits in range mask */
204	jl	L(RANGEMASK_CHECK)
205	# LOE rbx r15 r12d r13d
206
207	movq	16(%rsp), %r12
208	cfi_restore(12)
209	movq	8(%rsp), %r13
210	cfi_restore(13)
211	movq	(%rsp), %r14
212	cfi_restore(14)
213	vmovups	128(%rsp), %zmm6
214
215	/* Go to exit */
216	jmp	L(EXIT)
217	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
218	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
219	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
220	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
221	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
222	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
223	# LOE rbx r12 r13 r14 r15 zmm6
224
225	/* Scalar math fucntion call
226	 * to process special input
227	 */
228
229L(SCALAR_MATH_CALL):
230	movl	%r12d, %r14d
231	vmovss	64(%rsp, %r14, 4), %xmm0
232	call	coshf@PLT
233	# LOE rbx r14 r15 r12d r13d xmm0
234
235	vmovss	%xmm0, 128(%rsp, %r14, 4)
236
237	/* Process special inputs in loop */
238	jmp	L(SPECIAL_VALUES_LOOP)
239	# LOE rbx r15 r12d r13d
240END(_ZGVeN16v_coshf_skx)
241
242	.section .rodata, "a"
243	.align	64
244
245#ifdef __svml_scosh_data_internal_typedef
246typedef unsigned int VUINT32;
247typedef struct {
248	__declspec(align(64)) VUINT32 _sExp_tbl_PH[32][1];
249	__declspec(align(64)) VUINT32 _sExp_tbl_NH[32][1];
250	__declspec(align(64)) VUINT32 _sShifter_UISA[16][1];
251	__declspec(align(64)) VUINT32 _iDomainRange_UISA[16][1];
252	__declspec(align(64)) VUINT32 _sPC1_UISA[16][1];
253	__declspec(align(64)) VUINT32 _sPC2_UISA[16][1];
254	__declspec(align(64)) VUINT32 _sPC3_UISA[16][1];
255	__declspec(align(64)) VUINT32 _sInvLn2[16][1];
256	__declspec(align(64)) VUINT32 _sLn2hi[16][1];
257	__declspec(align(64)) VUINT32 _sLn2lo[16][1];
258	__declspec(align(64)) VUINT32 _sSign[16][1];
259	__declspec(align(64)) VUINT32 _iExpMask[16][1];
260	__declspec(align(64)) VUINT32 _sShifter[16][1];
261	__declspec(align(64)) VUINT32 _iDomainRange[16][1];
262	__declspec(align(64)) VUINT32 _sPC1[16][1];
263	__declspec(align(64)) VUINT32 _sPC2[16][1];
264	__declspec(align(64)) VUINT32 _sPC3[16][1];
265} __svml_scosh_data_internal;
266#endif
267__svml_scosh_data_internal:
268	/* _sExp_tbl_PH 2^(i/32-1), i=0..31 */
269	.long	0x3f000000, 0x3f02cd87, 0x3f05aac3, 0x3f08980f
270	.long	0x3f0b95c2, 0x3f0ea43a, 0x3f11c3d3, 0x3f14f4f0
271	.long	0x3f1837f0, 0x3f1b8d3a, 0x3f1ef532, 0x3f227043
272	.long	0x3f25fed7, 0x3f29a15b, 0x3f2d583f, 0x3f3123f6
273	.long	0x3f3504f3, 0x3f38fbaf, 0x3f3d08a4, 0x3f412c4d
274	.long	0x3f45672a, 0x3f49b9be, 0x3f4e248c, 0x3f52a81e
275	.long	0x3f5744fd, 0x3f5bfbb8, 0x3f60ccdf, 0x3f65b907
276	.long	0x3f6ac0c7, 0x3f6fe4ba, 0x3f75257d, 0x3f7a83b3
277	/* _sExp_tbl_NH 2^(-i/32-1), i=0..31 */
278	.align	64
279	.long	0x3f000000, 0x3efa83b3, 0x3ef5257d, 0x3eefe4ba
280	.long	0x3eeac0c7, 0x3ee5b907, 0x3ee0ccdf, 0x3edbfbb8
281	.long	0x3ed744fd, 0x3ed2a81e, 0x3ece248c, 0x3ec9b9be
282	.long	0x3ec5672a, 0x3ec12c4d, 0x3ebd08a4, 0x3eb8fbaf
283	.long	0x3eb504f3, 0x3eb123f6, 0x3ead583f, 0x3ea9a15b
284	.long	0x3ea5fed7, 0x3ea27043, 0x3e9ef532, 0x3e9b8d3a
285	.long	0x3e9837f0, 0x3e94f4f0, 0x3e91c3d3, 0x3e8ea43a
286	.long	0x3e8b95c2, 0x3e88980f, 0x3e85aac3, 0x3e82cd87
287	.align	64
288	.long	0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000 /* 1.5*2^18 _sShifter_UISA */
289	.align	64
290	.long	0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange_UISA */
291	.align	64
292	.long	0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1_UISA=1 */
293	.align	64
294	.long	0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f /* _sPC2_UISA */
295	.align	64
296	.long	0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd /* _sPC3_UISA */
297	.align	64
298	.long	0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ // k=0
299	.align	64
300	.long	0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
301	.align	64
302	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo */
303	.align	64
304	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
305	.align	64
306	.long	0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 /* _iExpMask */
307	.align	64
308	.long	0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
309	.align	64
310	.long	0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
311	.align	64
312	.long	0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
313	.align	64
314	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
315	.align	64
316	.long	0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
317	.align	64
318	.type	__svml_scosh_data_internal, @object
319	.size	__svml_scosh_data_internal, .-__svml_scosh_data_internal
320