/* Function atanh vectorized with AVX-512.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute atanh(x) as 0.5 * log((1 + x)/(1 - x)),
 *   using a small lookup table that maps to AVX-512 permute instructions.
 *
 *   Special cases:
 *
 *   atanh(0)  = 0
 *   atanh(+1) = +INF
 *   atanh(-1) = -INF
 *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
 *
 */
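
/*
 * For orientation, a minimal scalar C sketch of the same identity
 * (illustrative only, not part of the build; atanh_ref is a
 * hypothetical name, and it does not use the table-based log
 * reduction implemented below):
 *
 *	#include <math.h>
 *
 *	static double
 *	atanh_ref (double x)
 *	{
 *	  double y = fabs (x);
 *	  // 0.5 * log((1 + y)/(1 - y)) == 0.5 * (log1p(y) - log1p(-y));
 *	  // y == 1 gives +-INF and y > 1 or NaN gives NaN, matching the
 *	  // special cases above.
 *	  return copysign (0.5 * (log1p (y) - log1p (-y)), x);
 *	}
 */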

/* Offsets for data table __svml_datanh_data_internal_avx512
 */
#define Log_tbl_H			0
#define Log_tbl_L			128
#define One				256
#define AbsMask				320
#define AddB5				384
#define RcpBitMask			448
#define poly_coeff8			512
#define poly_coeff7			576
#define poly_coeff6			640
#define poly_coeff5			704
#define poly_coeff4			768
#define poly_coeff3			832
#define poly_coeff2			896
#define poly_coeff1			960
#define poly_coeff0			1024
#define Half				1088
#define L2H				1152
#define L2L				1216
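
/* All offsets are byte offsets into __svml_datanh_data_internal_avx512.
   Each field is one 64-byte row (a single zmm load), except the two
   16-entry log tables Log_tbl_H and Log_tbl_L, which are 128 bytes
   each; hence consecutive offsets step by 64 or 128.  */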

#include <sysdep.h>

	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_atanh_skx)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-64, %rsp
	subq	$192, %rsp
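
	/* Frame note: the stack is now 64-byte aligned with 192 bytes
	   reserved; 64(%rsp) and 128(%rsp) spill the input and result
	   vectors on the special-case path below, and (%rsp)..16(%rsp)
	   save %r14, %r13 and %r12 there.  */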
	vmovups	One+__svml_datanh_data_internal_avx512(%rip), %zmm15

	/* round reciprocals to 1+4b mantissas */
	vmovups	AddB5+__svml_datanh_data_internal_avx512(%rip), %zmm6
	vmovups	RcpBitMask+__svml_datanh_data_internal_avx512(%rip), %zmm9
	vmovaps	%zmm0, %zmm2
	vandpd	AbsMask+__svml_datanh_data_internal_avx512(%rip), %zmm2, %zmm13

	/* 1+y */
	vaddpd	{rn-sae}, %zmm15, %zmm13, %zmm0

	/* 1-y */
	vsubpd	{rn-sae}, %zmm13, %zmm15, %zmm4
	vxorpd	%zmm13, %zmm2, %zmm1

	/* Yp_high */
	vsubpd	{rn-sae}, %zmm15, %zmm0, %zmm7

	/* -Ym_high */
	vsubpd	{rn-sae}, %zmm15, %zmm4, %zmm12

	/* RcpP ~ 1/Yp */
	vrcp14pd %zmm0, %zmm3

	/* RcpM ~ 1/Ym */
	vrcp14pd %zmm4, %zmm5

	/* input outside (-1, 1), i.e. |x| >= 1 or NaN?  (predicate 21 = NLT_UQ) */
	vcmppd	$21, {sae}, %zmm15, %zmm13, %k0
	vpaddq	%zmm6, %zmm3, %zmm11
	vpaddq	%zmm6, %zmm5, %zmm10

	/* Yp_low */
	vsubpd	{rn-sae}, %zmm7, %zmm13, %zmm8
	vandpd	%zmm9, %zmm11, %zmm14
	vandpd	%zmm9, %zmm10, %zmm3
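
	/* In scalar terms the rounding above is plain bit arithmetic on
	   the double representation (sketch, using the AddB5 and
	   RcpBitMask constants from the data table):

		uint64_t b;
		memcpy (&b, &rcp, 8);
		b += UINT64_C (0x0000800000000000);	// half-ulp at the 4-bit boundary
		b &= UINT64_C (0xffff000000000000);	// keep sign|exponent|top 4 bits
		memcpy (&rcp, &b, 8);

	   Each reciprocal becomes 1.b1b2b3b4 * 2^k, and mantissa bits
	   51:48 later provide the 4-bit table index (vpsrlq $48).  */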

	/* Ym_low */
	vaddpd	{rn-sae}, %zmm12, %zmm13, %zmm12

	/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
	vfmsub213pd {rn-sae}, %zmm15, %zmm14, %zmm0

	/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
	vfmsub231pd {rn-sae}, %zmm3, %zmm4, %zmm15

	/* exponents */
	vgetexppd {sae}, %zmm14, %zmm5
	vgetexppd {sae}, %zmm3, %zmm4

	/* Table lookups */
	vmovups	__svml_datanh_data_internal_avx512(%rip), %zmm9
	vmovups	Log_tbl_H+64+__svml_datanh_data_internal_avx512(%rip), %zmm13
	vmovups	Log_tbl_L+__svml_datanh_data_internal_avx512(%rip), %zmm7
	vfmadd231pd {rn-sae}, %zmm14, %zmm8, %zmm0
	vfnmadd231pd {rn-sae}, %zmm3, %zmm12, %zmm15

	/* Prepare table index */
	vpsrlq	$48, %zmm14, %zmm11
	vpsrlq	$48, %zmm3, %zmm8
	vmovups	Log_tbl_L+64+__svml_datanh_data_internal_avx512(%rip), %zmm14

	/* polynomials */
	vmovups	poly_coeff8+__svml_datanh_data_internal_avx512(%rip), %zmm3

	/* Km-Kp */
	vsubpd	{rn-sae}, %zmm5, %zmm4, %zmm5
	vmovups	poly_coeff7+__svml_datanh_data_internal_avx512(%rip), %zmm4
	kmovw	%k0, %edx
	vmovaps	%zmm11, %zmm10
	vmovaps	%zmm4, %zmm6
	vpermi2pd %zmm13, %zmm9, %zmm10
	vpermi2pd %zmm14, %zmm7, %zmm11
	vpermt2pd %zmm13, %zmm8, %zmm9
	vpermt2pd %zmm14, %zmm8, %zmm7
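
	/* Each vpermi2pd/vpermt2pd pair above performs a 16-entry table
	   lookup: index bits 2:0 select a qword lane and index bit 3
	   selects between the two zmm halves of Log_tbl_H (resp.
	   Log_tbl_L); higher index bits are ignored.  */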
	vmovups	poly_coeff6+__svml_datanh_data_internal_avx512(%rip), %zmm8
	vfmadd231pd {rn-sae}, %zmm0, %zmm3, %zmm6
	vfmadd231pd {rn-sae}, %zmm15, %zmm3, %zmm4
	vmovups	poly_coeff3+__svml_datanh_data_internal_avx512(%rip), %zmm13
	vmovups	poly_coeff2+__svml_datanh_data_internal_avx512(%rip), %zmm14
	vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm6
	vfmadd213pd {rn-sae}, %zmm8, %zmm15, %zmm4
	vmovups	poly_coeff0+__svml_datanh_data_internal_avx512(%rip), %zmm8
	vsubpd	{rn-sae}, %zmm11, %zmm7, %zmm12

	/* table values */
	vsubpd	{rn-sae}, %zmm10, %zmm9, %zmm3
	vmovups	poly_coeff5+__svml_datanh_data_internal_avx512(%rip), %zmm7
	vmovups	poly_coeff4+__svml_datanh_data_internal_avx512(%rip), %zmm9

	/* K*L2H + Th */
	vmovups	L2H+__svml_datanh_data_internal_avx512(%rip), %zmm10

	/* K*L2L + Tl */
	vmovups	L2L+__svml_datanh_data_internal_avx512(%rip), %zmm11
	vfmadd213pd {rn-sae}, %zmm7, %zmm0, %zmm6
	vfmadd213pd {rn-sae}, %zmm7, %zmm15, %zmm4
	vmovups	poly_coeff1+__svml_datanh_data_internal_avx512(%rip), %zmm7
	vfmadd231pd {rn-sae}, %zmm5, %zmm10, %zmm3
	vfmadd213pd {rn-sae}, %zmm12, %zmm11, %zmm5
	vfmadd213pd {rn-sae}, %zmm9, %zmm0, %zmm6
	vfmadd213pd {rn-sae}, %zmm9, %zmm15, %zmm4
	vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm6
	vfmadd213pd {rn-sae}, %zmm13, %zmm15, %zmm4
	vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm6
	vfmadd213pd {rn-sae}, %zmm14, %zmm15, %zmm4
	vfmadd213pd {rn-sae}, %zmm7, %zmm0, %zmm6
	vfmadd213pd {rn-sae}, %zmm7, %zmm15, %zmm4
	vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm6
	vfmadd213pd {rn-sae}, %zmm8, %zmm15, %zmm4
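
	/* The interleaved FMA chains above are two Horner evaluations of
	   the same degree-8 polynomial, at Rp (zmm0 -> zmm6) and at Rm
	   (zmm15 -> zmm4):
	     Poly(R) = ((...(c8*R + c7)*R + ...)*R + c1)*R + c0
	   with c0 = 1, c1 ~ -1/2, c2 ~ 1/3, ..., so R*Poly(R)
	   approximates log(1+R) on the reduced-argument range.  */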

	/* (K*L2L + Tl) + Rp*PolyP */
	vfmadd213pd {rn-sae}, %zmm5, %zmm0, %zmm6
	vorpd	Half+__svml_datanh_data_internal_avx512(%rip), %zmm1, %zmm0

	/* (K*L2L + Tl) + Rp*PolyP - Rm*PolyM */
	vfnmadd213pd {rn-sae}, %zmm6, %zmm15, %zmm4
	vaddpd	{rn-sae}, %zmm4, %zmm3, %zmm1
	vmulpd	{rn-sae}, %zmm0, %zmm1, %zmm0
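
	/* The vmulpd above multiplies the assembled log difference
	   (zmm1 = log(1+|x|) - log(1-|x|), built from the table, exponent
	   and polynomial parts) by +-0.5 (Half OR-ed with the sign of x),
	   yielding atanh(x) = sign(x)*0.5*log((1+|x|)/(1-|x|)).  */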
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm2

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	vmovups	%zmm2, 64(%rsp)
	vmovups	%zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

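	/* In C terms the special-input handling does roughly the
	   following (sketch; 'mask' is the vcmppd result copied to
	   %r13d, and 'in'/'out' name the 64(%rsp) and 128(%rsp) spill
	   areas):

		for (int i = 0; i < 8; i++)
		  if (mask & (1 << i))
		    out[i] = atanh (in[i]);
	 */
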
L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm0

	/* Go to exit */
	jmp	L(EXIT)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	vmovsd	64(%rsp, %r14, 8), %xmm0
	call	atanh@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovsd	%xmm0, 128(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN8v_atanh_skx)

	.section .rodata, "a"
	.align	64

#ifdef __svml_datanh_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 Log_tbl_H[16][2];
	__declspec(align(64)) VUINT32 Log_tbl_L[16][2];
	__declspec(align(64)) VUINT32 One[8][2];
	__declspec(align(64)) VUINT32 AbsMask[8][2];
	__declspec(align(64)) VUINT32 AddB5[8][2];
	__declspec(align(64)) VUINT32 RcpBitMask[8][2];
	__declspec(align(64)) VUINT32 poly_coeff8[8][2];
	__declspec(align(64)) VUINT32 poly_coeff7[8][2];
	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
	__declspec(align(64)) VUINT32 poly_coeff1[8][2];
	__declspec(align(64)) VUINT32 poly_coeff0[8][2];
	__declspec(align(64)) VUINT32 Half[8][2];
	__declspec(align(64)) VUINT32 L2H[8][2];
	__declspec(align(64)) VUINT32 L2L[8][2];
} __svml_datanh_data_internal_avx512;
#endif
__svml_datanh_data_internal_avx512:
	/* Log_tbl_H */
	.quad	0x0000000000000000
	.quad	0x3faf0a30c0100000
	.quad	0x3fbe27076e2a0000
	.quad	0x3fc5ff3070a80000
	.quad	0x3fcc8ff7c79b0000
	.quad	0x3fd1675cabab8000
	.quad	0x3fd4618bc21c8000
	.quad	0x3fd739d7f6bc0000
	.quad	0x3fd9f323ecbf8000
	.quad	0x3fdc8ff7c79a8000
	.quad	0x3fdf128f5faf0000
	.quad	0x3fe0be72e4254000
	.quad	0x3fe1e85f5e704000
	.quad	0x3fe307d7334f0000
	.quad	0x3fe41d8fe8468000
	.quad	0x3fe52a2d265bc000
	/* Log_tbl_L */
	.align	64
	.quad	0x0000000000000000
	.quad	0x3d662a6617cc9717
	.quad	0x3d6e5cbd3d50fffc
	.quad	0xbd6b0b0de3077d7e
	.quad	0xbd697794f689f843
	.quad	0x3d630701ce63eab9
	.quad	0xbd609ec17a426426
	.quad	0xbd67fcb18ed9d603
	.quad	0x3d584bf2b68d766f
	.quad	0x3d5a21ac25d81ef3
	.quad	0x3d3bb2cd720ec44c
	.quad	0xbd657d49676844cc
	.quad	0x3d1a07bd8b34be7c
	.quad	0x3d60be1fb590a1f5
	.quad	0xbd5aa33736867a17
	.quad	0x3d46abb9df22bc57
	/* One */
	.align	64
	.quad	0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* AbsMask */
	.align	64
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
	/* AddB5 */
	.align	64
	.quad	0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000
	/* RcpBitMask */
	.align	64
	.quad	0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000
	/* poly_coeff8 */
	.align	64
	.quad	0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142
	/* poly_coeff7 */
	.align	64
	.quad	0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70
	/* poly_coeff6 */
	.align	64
	.quad	0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8
	/* poly_coeff5 */
	.align	64
	.quad	0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5
	/* poly_coeff4 */
	.align	64
	.quad	0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a
	/* poly_coeff3 */
	.align	64
	.quad	0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01
	/* poly_coeff2 */
	.align	64
	.quad	0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462
	/* poly_coeff1 */
	.align	64
	.quad	0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5
	/* poly_coeff0 */
	.align	64
	.quad	0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* Half */
	.align	64
	.quad	0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
	/* L2H = log(2)_high */
	.align	64
	.quad	0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000
	/* L2L = log(2)_low */
	.align	64
	.quad	0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000
	.align	64
	.type	__svml_datanh_data_internal_avx512, @object
	.size	__svml_datanh_data_internal_avx512, .-__svml_datanh_data_internal_avx512