/* Function tanhf vectorized with AVX2.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   NOTE: Since the hyperbolic tangent function is odd
 *         (tanh(x) = -tanh(-x)), the algorithm below deals with the absolute
 *         value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
 *
 *   We use a table lookup method to compute tanh(|x|).
 *   The basic idea is to split the input range into a number of subintervals
 *   and to approximate tanh(.) with a polynomial on each of them.
 *
 *   IEEE SPECIAL CONDITIONS:
 *   x = [+, -]0, r = [+, -]0
 *   x = +Inf,   r = +1
 *   x = -Inf,   r = -1
 *   x = QNaN,   r = QNaN
 *   x = SNaN,   r = QNaN
 *
 *
 *   ALGORITHM DETAILS
 *   We handle special values in a callout function, apart from the main path
 *   computations. "Special" for this algorithm are:
 *   INF, NAN, |x| > HUGE_THRESHOLD
 *
 *
 *   Main path computations are organized as follows:
 *   we split the interval [0, SATURATION_THRESHOLD)
 *   into a number of subintervals.  On each subinterval we approximate tanh(.)
 *   with a minimax polynomial of pre-defined degree.  The polynomial
 *   coefficients are computed beforehand and stored in a table.  We also use
 *
 *       y := |x| + B,
 *
 *   where B depends on the subinterval and is used to bring the argument
 *   closer to zero.
 *   We also add a large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
 *   where the coefficients of 1.0 + 0.0*y + 0.0*y^2 + ... are stored - just to
 *   preserve the main path computation logic while returning 1.0 for all
 *   arguments in that range.
 *
 *   Hence the reconstruction looks as follows:
 *   we extract the polynomial and range reduction coefficients
 *        (Pj and B) corresponding to the subinterval to which |x| belongs,
 *        and return
 *
 *       r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
 *
 *   (an illustrative scalar sketch of this reconstruction follows this
 *   comment).
 *
 *   NOTE: we use a multiprecision technique to multiply and sum the first
 *         K terms of the polynomial.  So Pj, j = 0..K, are each stored in the
 *         table as a pair of target-precision numbers (Pj and PLj) to
 *         achieve wider-than-target precision.
 *
 *
 */
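
/* Illustrative scalar sketch of the main-path reconstruction (an assumption
   for exposition only, not part of the build).  The names lookup_table,
   POLY_DEGREE, interval_index and tanhf_sketch are hypothetical;
   interval_index stands in for the _iExpMantMask/_iMinIdxOfsMask index
   computation below, and any per-interval shift B is assumed to be folded
   into the stored coefficients.  POLY_DEGREE of 3 matches the three FMA
   steps used per half in the code below.

       #include <math.h>

       #define POLY_DEGREE 3
       extern const double lookup_table[][POLY_DEGREE + 1];
       extern unsigned int interval_index (float ax);

       float
       tanhf_sketch (float x)
       {
         float ax = fabsf (x);
         // Special inputs (Inf, NaN, huge |x|) are assumed to have been
         // filtered out to the callout already.
         const double *c = lookup_table[interval_index (ax)];
         double y = (double) ax;
         // Horner evaluation of the per-interval polynomial.
         double p = c[POLY_DEGREE];
         for (int j = POLY_DEGREE - 1; j >= 0; j--)
           p = p * y + c[j];
         // Reattach the sign of the original argument.
         return copysignf ((float) p, x);
       }
 */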

#include <sysdep.h>

/* The tanhf data tables for the avx2 and sse4 implementations are defined
   here.  */
#include "svml_s_tanhf_rodata.S"

	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_tanhf_avx2)
	/* Here huge arguments, INF and NaNs are filtered out to the callout.  */
	vpand	TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
	vpsubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2

	/* Clamp the offset exponent/mantissa bits to the range
	   [0, 0x04280000] in ymm2, so every lane selects a valid table
	   interval.  */
	vpxor	%ymm3, %ymm3, %ymm3
	vpmaxsd	%ymm3, %ymm2, %ymm2
	vpminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2

	/*
	 *  Small-table specific variables and constant loading.
	 *  The shift by 14 turns each clamped value into a byte offset into
	 *  the lookup table.
	 */
	vpsrld	$14, %ymm2, %ymm1

	/* We are splitting ymm1 into 8 GPRs. This may be faster to do with
	   store/load as we can take advantage of store-forwarding.  */
	vmovq	%xmm1, %r8
	/* We have eliminated all negative values for ymm1 so no need to sign
	   extend.  */
	movl	%r8d, %r9d
	shrq	$32, %r8

	/* Store base of lookup table in rax.  */
	leaq	TANHF_DATA(_lookupTable)(%rip), %rax
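
	/* Each table entry is 32 bytes: four double-precision polynomial
	   coefficients.  The 16-byte halves of each entry are loaded at
	   offsets 0 and 16 below and deinterleaved into per-coefficient
	   vectors.  */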

	/* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
	   with a memory operand.  This helps alleviate the bottleneck on p5.  */
	vmovupd	16(%r9, %rax), %xmm5

	vpextrq	$1, %xmm1, %rsi
	movl	%esi, %edi
	shrq	$32, %rsi

	vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5

	vextracti128 $1, %ymm1, %xmm2
	vmovq	%xmm2, %rdx
	movl	%edx, %ecx
	shrq	$32, %rdx

	vmovupd	(%rcx, %rax), %xmm6

	vpextrq	$1, %xmm2, %r10
	movl	%r10d, %r11d
	shrq	$32, %r10

	vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6

	vmovupd	16(%r8, %rax), %xmm1
	vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
	vmovupd	(%rdx, %rax), %xmm3
	vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3

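	/* Deinterleave the 16-byte entry halves gathered above: each
	   vunpcklpd/vunpckhpd pair below produces one ymm holding the same
	   polynomial coefficient for four lanes.  */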
	vunpcklpd %ymm3, %ymm6, %ymm7
	vunpckhpd %ymm3, %ymm6, %ymm6

	vunpcklpd %ymm1, %ymm5, %ymm3
	vunpckhpd %ymm1, %ymm5, %ymm1

	vmovaps	TANHF_DATA(_sAbsMask)(%rip), %ymm11
	/* Store the special-case mask (Inf, NaN, huge |x|) in ymm15.  */
	vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15

	vandps	%ymm11, %ymm0, %ymm4

	vcvtps2pd %xmm4, %ymm5

	vextractf128 $1, %ymm4, %xmm4
	vcvtps2pd %xmm4, %ymm4

	vmovupd	16(%rcx, %rax), %xmm2
	vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2

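	/* Evaluate the degree-3 polynomial for each half in double precision
	   with a Horner-style FMA chain: r = ((c3 * y + c2) * y + c1) * y + c0,
	   where y is |x| widened to double (ymm5 for the low half, ymm4 for
	   the high half).  */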
	vfmadd213pd %ymm3, %ymm5, %ymm1

	vmovupd	16(%rdx, %rax), %xmm3
	vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3

	vunpcklpd %ymm3, %ymm2, %ymm10
	vunpckhpd %ymm3, %ymm2, %ymm2

	vfmadd213pd %ymm10, %ymm4, %ymm2
	vfmadd213pd %ymm6, %ymm4, %ymm2
	vfmadd213pd %ymm7, %ymm4, %ymm2
	vcvtpd2ps %ymm2, %xmm2

	vmovupd	(%r9, %rax), %xmm7
	vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7

	vmovupd	(%r8, %rax), %xmm3
	vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3

	vunpckhpd %ymm3, %ymm7, %ymm4
	vunpcklpd %ymm3, %ymm7, %ymm7

	vfmadd213pd %ymm4, %ymm5, %ymm1
	vfmadd213pd %ymm7, %ymm5, %ymm1

	vcvtpd2ps %ymm1, %xmm1
	vinsertf128 $1, %xmm2, %ymm1, %ymm1
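	/* ymm1 now holds the tanh(|x|) approximation for all 8 lanes, back in
	   single precision; only the sign of x still needs to be applied.  */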

	vmovmskps %ymm15, %edx
	vandnps	%ymm0, %ymm11, %ymm2
	testl	%edx, %edx
	/* Go to the special-inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
	/* Wait until after the branch to write over ymm0.  */
	vorps	%ymm2, %ymm1, %ymm0
	/* No stack restoration on the fastpath.  */
	ret


	/* Cold case.  edx has 1s where there was a special value that needs
	   to be handled by a tanhf call.  Optimize for code size more so than
	   speed here.  */
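	/* Roughly, in C, the loop below does the following (the names
	   fixup_special_lanes, mask, input and result are illustrative only;
	   the actual buffers are the two ymm-sized stack slots saved below):

	       void
	       fixup_special_lanes (unsigned int mask, const float *input,
	                            float *result)
	       {
	         while (mask != 0)
	           {
	             int i = __builtin_ctz (mask);   // tzcntl
	             result[i] = tanhf (input[i]);   // one scalar call per lane.
	             mask &= mask - 1;               // blsrl clears the lowest set bit.
	           }
	       }
	 */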
L(SPECIAL_VALUES_BRANCH):
	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
	/* Use r13 to save/restore the stack.  This allows us to use rbp as a
	   callee-saved register, saving code size.  */
	pushq	%r13
	cfi_adjust_cfa_offset(8)
	cfi_offset(r13, -16)
	/* We need to save callee-saved registers to preserve state across the
	   tanhf calls.  */
	pushq	%rbx
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbx, -24)
	pushq	%rbp
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbp, -32)
	movq	%rsp, %r13
	cfi_def_cfa_register(r13)

	/* Align the stack and make room for 2x ymm vectors.  */
	andq	$-32, %rsp
	addq	$-64, %rsp

	/* Save the already-computed results for all lanes.  */
	vorps	%ymm2, %ymm1, %ymm1
	vmovaps	%ymm1, (%rsp)
	/* Save the original input (ymm0 is unchanged up to this point).  */
	vmovaps	%ymm0, 32(%rsp)

	vzeroupper

	/* edx has 1s where there was a special value that needs to be handled
	   by a tanhf call.  */
	movl	%edx, %ebx
L(SPECIAL_VALUES_LOOP):
	# LOE rbx rbp r12 r13 r14 r15
	/* Use rbp as the index of the special value that is preserved across
	   the calls to tanhf.  We technically don't need a callee-saved
	   register here, as the offset to rsp is always in [0, 28], so we
	   could restore rsp by realigning to 64.  Essentially the tradeoff is
	   1 extra save/restore vs 2 extra instructions in the loop.
	   Realigning also costs more code size.  */
	xorl	%ebp, %ebp
	tzcntl	%ebx, %ebp

	/* Scalar math function call to process the special input.  */
	vmovss	32(%rsp, %rbp, 4), %xmm0
	call	tanhf@PLT

	/* No good way to avoid the store-forwarding fault this will cause on
	   return.  `lfence` avoids the SF fault but at greater cost as it
	   serializes the stack/callee-save restoration.  */
	vmovss	%xmm0, (%rsp, %rbp, 4)

	blsrl	%ebx, %ebx
	jnz	L(SPECIAL_VALUES_LOOP)
	# LOE r12 r13 r14 r15

	/* All results have been written to (%rsp).  */
	vmovups	(%rsp), %ymm0
	/* Restore rsp.  */
	movq	%r13, %rsp
	cfi_def_cfa_register(rsp)
	/* Restore callee-saved registers.  */
	popq	%rbp
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbp)
	popq	%rbx
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbx)
	popq	%r13
	cfi_adjust_cfa_offset(-8)
	cfi_restore(r13)
	ret
END(_ZGVdN8v_tanhf_avx2)