1/* Function atanhf vectorized with SSE4.
2   Copyright (C) 2021-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   https://www.gnu.org/licenses/.  */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 *   Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
23 *
24 *   Special cases:
25 *
26 *   atanh(0)  = 0
27 *   atanh(+1) = +INF
28 *   atanh(-1) = -INF
29 *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
30 *
31 */
32
/* Offsets for data table __svml_satanh_data_internal. Ordered
   by use in the function. On cold-starts this might help the
   prefetcher. Possibly a better idea is to interleave start/end so
   that the prefetcher is less likely to detect a stream and pull
   irrelevant lines into cache.  */
38#define sOne				0
39#define SgnMask				16
40#define sTopMask12			32
41#define iBrkValue			48
42#define iOffExpoMask			64
43#define sPoly				80
44#define sLn2				208
45#define TinyRange			224
46
47#include <sysdep.h>
48#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal)
49
50	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_atanhf_sse4)
	/* Vector atanh(x) for 4 packed floats in xmm0 (SysV vector ABI).
	   Keep the signed input in xmm5: it is needed later for the sign
	   bit and for the tiny-|x| result.  */
	movaps	%xmm0, %xmm5

	/* Load constants including One = 1 */
	movups	ATANHF_DATA(sOne)(%rip), %xmm4
	movaps	%xmm5, %xmm3

	/* Strip off the sign, so treat X as positive until right at the end */
	movups	ATANHF_DATA(SgnMask)(%rip), %xmm1
	movaps	%xmm4, %xmm2
	andps	%xmm1, %xmm0
	movaps	%xmm4, %xmm10
	movups	ATANHF_DATA(sTopMask12)(%rip), %xmm11
	movaps	%xmm4, %xmm14
	movaps	%xmm11, %xmm9


	/*
	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
	 * the upper part UHi being <= 12 bits long. Then we have
	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
	 */
	movaps	%xmm0, %xmm6
	mulps	%xmm5, %xmm3		/* xmm3 = x * x (tiny-|x| path).  */
	subps	%xmm0, %xmm2		/* xmm2 = 1 - |x| (rounded).  */
	addps	%xmm0, %xmm6		/* xmm6 = V = |x| + |x|.  */
	subps	%xmm2, %xmm10		/* xmm10 = 1 - (1 - |x|).  */
	addps	%xmm5, %xmm3		/* xmm3 = x + x*x, ~= x when |x| is tiny.  */
	subps	%xmm0, %xmm10		/* xmm10 = rounding error of 1 - |x|.  */
	andps	%xmm2, %xmm9		/* xmm9 = UHi, 12-bit-significand part.  */


	/* Approximate reciprocal R of UHi, truncated to a 12-bit
	   significand so products with 12-bit values stay exact.  */
	rcpps	%xmm9, %xmm7
	subps	%xmm9, %xmm2		/* xmm2 = (1 - |x|) - UHi (low bits).  */
	andps	%xmm11, %xmm7		/* xmm7 = R, truncated.  */


	/*
	 * Split V as well into upper 12 bits and lower part, so that we can get
	 * a preliminary quotient estimate without rounding error.
	 */
	andps	%xmm6, %xmm11		/* xmm11 = VHi.  */
	mulps	%xmm7, %xmm9		/* xmm9 = R * UHi (exact).  */
	addps	%xmm2, %xmm10		/* xmm10 = ULo, full low part of 1 - |x|.  */
	subps	%xmm11, %xmm6		/* xmm6 = VLo = V - VHi.  */

	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
	mulps	%xmm7, %xmm11		/* xmm11 = QHi.  */
	mulps	%xmm7, %xmm10		/* xmm10 = R * ULo.  */
	subps	%xmm9, %xmm14		/* xmm14 = 1 - R * UHi.  */
	mulps	%xmm6, %xmm7		/* xmm7 = QLo.  */
	subps	%xmm10, %xmm14		/* xmm14 = E = 1 - R * (UHi + ULo).  */

	/* Compute D = E + E^2 */
	movaps	%xmm14, %xmm13
	movaps	%xmm4, %xmm8
	mulps	%xmm14, %xmm13

	/* reduction: compute r,n */
	movdqu	ATANHF_DATA(iBrkValue)(%rip), %xmm9
	addps	%xmm13, %xmm14		/* xmm14 = D.  */

	/*
	 * Compute R * (VHi + VLo) * (1 + E + E^2)
	 * = R *  (VHi + VLo) * (1 + D)
	 * = QHi + (QHi * D + QLo + QLo * D)
	 */
	movaps	%xmm14, %xmm2
	mulps	%xmm7, %xmm14		/* xmm14 = QLo * D.  */
	mulps	%xmm11, %xmm2		/* xmm2 = QHi * D.  */
	addps	%xmm14, %xmm7		/* xmm7 = QLo + QLo * D.  */
	movdqu	ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
	movaps	%xmm4, %xmm14		/* Reload 1.0 for exponent reconstruction.  */

	addps	%xmm7, %xmm2		/* xmm2 = L = QHi * D + QLo + QLo * D.  */


	/*
	 * Now finally accumulate the high and low parts of the
	 * argument to log1p, H + L, with a final compensated summation.
	 */
	movaps	%xmm2, %xmm6
	/* Record the sign for eventual reincorporation:
	   xmm1 = x & ~SgnMask = just the sign bit of x.  */
	andnps	%xmm5, %xmm1
	movaps	%xmm4, %xmm7
	addps	%xmm11, %xmm6		/* xmm6 = S = QHi + L (rounded).  */
	/* 2Sum of S with 1.0 (xmm7 = xmm8 = 1.0 here): sum in xmm10,
	   error term recovered via (max - sum) + min.  */
	maxps	%xmm6, %xmm7
	minps	%xmm6, %xmm8
	subps	%xmm6, %xmm11		/* xmm11 = QHi - S (compensation).  */
	movaps	%xmm7, %xmm10
	addps	%xmm8, %xmm10		/* xmm10 = 1 + S, the log1p argument.  */
	addps	%xmm11, %xmm2
	subps	%xmm10, %xmm7
	psubd	%xmm9, %xmm10		/* Offset by iBrkValue to split exp/mantissa.  */
	addps	%xmm8, %xmm7		/* xmm7 = 2Sum error term.  */
	pand	%xmm10, %xmm12		/* xmm12 = offset significand bits.  */
	psrad	$23, %xmm10		/* xmm10 = exponent n (integer).  */
	cvtdq2ps %xmm10, %xmm13		/* xmm13 = float(n).  */
	addps	%xmm7, %xmm2		/* xmm2 = total low-order correction.  */

	/* final reconstruction */
	pslld	$23, %xmm10
	paddd	%xmm9, %xmm12		/* Reduced mantissa m, centered near 2/3.  */
	psubd	%xmm10, %xmm14		/* xmm14 = 2^-n (bit arithmetic on 1.0).  */

	/* polynomial evaluation */
	subps	%xmm4, %xmm12		/* xmm12 = r = m - 1.  */
	mulps	%xmm14, %xmm2		/* Scale the correction by 2^-n.  */
	movups	ATANHF_DATA(sPoly+0)(%rip), %xmm7
	addps	%xmm12, %xmm2		/* xmm2 = full reduced argument.  */
	mulps	%xmm2, %xmm7

	/* Horner evaluation of sPoly[P7..P0].  */
	addps	ATANHF_DATA(sPoly+16)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+32)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+48)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+64)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+80)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+96)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	movaps	ATANHF_DATA(sPoly+112)(%rip), %xmm6
	addps	%xmm6, %xmm7
	mulps	%xmm2, %xmm7
	mulps	%xmm2, %xmm7
	mulps	ATANHF_DATA(sLn2)(%rip), %xmm13	/* xmm13 = n * ln(2).  */
	/* We can build `sHalf` with `sPoly & sOne` (sPoly[P0] = -0.5).  */
	andps	%xmm4, %xmm6		/* xmm6 = 0.5.  */
	/* Or the sign bit in with the tiny result to handle atanh(-0)
	   correctly.  */
	orps	%xmm1, %xmm3
	/* Finally, halve the result and reincorporate the sign.  */
	xorps	%xmm6, %xmm1		/* xmm1 = +-0.5 with the sign of x.  */

	addps	%xmm2, %xmm7		/* xmm7 = log1p(...) result.  */
	addps	%xmm13, %xmm7		/* + n * ln(2).  */
	mulps	%xmm7, %xmm1		/* xmm1 = main-path atanh(x).  */

	/*
	 * Check whether |X| < 1, in which case we use the main function.
	 * Otherwise set the rangemask so that the callout will get used.
	 * Note that this will also use the callout for NaNs since not(NaN < 1).
	 * Finish check of NaNs.
	 */
	cmpleps	%xmm0, %xmm4		/* Lanes where 1.0 <= |x| or x is NaN.  */
	movmskps %xmm4, %edx
	cmpltps	ATANHF_DATA(TinyRange)(%rip), %xmm0	/* Lanes where |x| is tiny.  */

	/* Blend: tiny result (xmm3) where |x| < TinyRange, else xmm1.  */
	andps	%xmm0, %xmm3
	andnps	%xmm1, %xmm0
	orps	%xmm3, %xmm0

	testl	%edx, %edx
	/* Go to special inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 xmm0
	/* No registers to restore on fast path.  */
	ret


	/* Cold case. edx has 1s where there was a special value that
	   needs to be handled by an atanhf call. Optimize for code size
	   more so than speed here. */
L(SPECIAL_VALUES_BRANCH):
	# LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
	/* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
	   call entry will be 16-byte aligned. */
	subq	$56, %rsp
	cfi_def_cfa_offset(64)
	movups	%xmm0, 24(%rsp)		/* Spill fast-path results.  */
	movups	%xmm5, 40(%rsp)		/* Spill original signed inputs.  */

	/* Use rbx/rbp for callee save registers as they get short
	   encoding for many instructions (as compared with r12/r13). */
	movq	%rbx, (%rsp)
	cfi_offset(rbx, -64)
	movq	%rbp, 8(%rsp)
	cfi_offset(rbp, -56)
	/* edx has 1s where there was a special value that needs to be handled
	   by an atanhf call.  */
	movl	%edx, %ebx
L(SPECIAL_VALUES_LOOP):
	# LOE rbx rbp r12 r13 r14 r15
	/* use rbp as index for special value that is saved across calls to
	   atanhf. We technically don't need a callee save register here as offset
	   to rsp is always [0, 12] so we can restore rsp by realigning to 64.
	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
	   in the loop.  */
	xorl	%ebp, %ebp
	bsfl	%ebx, %ebp		/* rbp = index of next lane to fix up.  */

	/* Scalar math function call to process special input.  */
	movss	40(%rsp, %rbp, 4), %xmm0
	call	atanhf@PLT
	/* No good way to avoid the store-forwarding fault this will cause on
	   return. `lfence` avoids the SF fault but at greater cost as it
	   serialized stack/callee save restoration.  */
	movss	%xmm0, 24(%rsp, %rbp, 4)

	/* Clear the lowest set bit of the lane mask; loop while any remain.  */
	leal	-1(%rbx), %eax
	andl	%eax, %ebx
	jnz	L(SPECIAL_VALUES_LOOP)
	# LOE r12 r13 r14 r15
	/* All results have been written to 24(%rsp).  */
	movups	24(%rsp), %xmm0
	movq	(%rsp), %rbx
	cfi_restore(rbx)
	movq	8(%rsp), %rbp
	cfi_restore(rbp)
	addq	$56, %rsp
	cfi_def_cfa_offset(8)
	ret
END(_ZGVbN4v_atanhf_sse4)
267
	.section .rodata, "a"
	.align	16

#ifdef __svml_satanh_data_internal_typedef
/* NOTE(review): this guard macro appears never to be defined; the
   typedef serves as layout documentation mirroring the offset
   #defines at the top of the file — confirm before relying on it.  */
typedef unsigned int VUINT32;
typedef struct{
	__declspec(align(16)) VUINT32 sOne[4][1];
	__declspec(align(16)) VUINT32 SgnMask[4][1];
	__declspec(align(16)) VUINT32 sTopMask12[4][1];
	__declspec(align(16)) VUINT32 iBrkValue[4][1];
	__declspec(align(16)) VUINT32 iOffExpoMask[4][1];
	__declspec(align(16)) VUINT32 sPoly[8][4][1];
	__declspec(align(16)) VUINT32 sLn2[4][1];
	__declspec(align(16)) VUINT32 TinyRange[4][1];
} __svml_satanh_data_internal;
#endif

__svml_satanh_data_internal:
	/* sOne = SP 1.0 */
	.align	16
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* SgnMask = all bits but the sign (|x| mask); 16-byte aligned by
	   the preceding entry.  */
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	/* sTopMask12: clears the low 12 significand bits, leaving a
	   12-bit significand.  */
	.align	16
	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
	/* iBrkValue = SP 2/3 */
	.align	16
	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
	/* iOffExpoMask = SP significand mask.  */
	.align	16
	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff

	/* sPoly[] = SP polynomial, highest-order coefficient first.  */
	.align	16
	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */

	/* sLn2 = SP ln(2) */
	.align	16
	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
	/* TinyRange = 2^-103; below this atanh(x) is computed as x + x*x.  */
	.align	16
	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
	.align	16
	.type	__svml_satanh_data_internal, @object
	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal
321