1/* Function atan vectorized with SSE4.
2   Copyright (C) 2021-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   https://www.gnu.org/licenses/.  */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
23 *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
24 *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
25 *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
26 *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
27 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
28 *
29 */
30
31/* Byte offsets of the fields of data table __svml_datan_data_internal_avx512.
   Each offset must stay in step with the .quad layout under the label of
   the same name in .rodata.  Scalar constants occupy 16 bytes (the value is
   stored once per 64-bit lane); Tbl_H and Tbl_L are 32 doubles (256 bytes)
   each; coeff is six 16-byte entries.
32 */
33#define AbsMask				0	/* all bits set except the sign bit */
34#define Shifter				16	/* 0x4318000000000000 = 1.5 * 2^50 */
35#define MaxThreshold			32	/* 0x401f800000000000 = 7.875 */
36#define MOne				48	/* -1.0 */
37#define One				64	/* +1.0 */
38#define LargeX				80	/* 0x47f0000000000000 = 2^128 */
39#define Zero				96	/* 0.0; not referenced by _ZGVbN2v_atan_sse4 */
40#define Tbl_H				112	/* 32 doubles, 112 + 256 = 368 */
41#define Tbl_L				368	/* 32 doubles, 368 + 256 = 624; not referenced by _ZGVbN2v_atan_sse4 */
42#define dIndexMed			624	/* Shifter bit pattern + 0x10 */
43#define Pi2				640	/* 0x3ff921fb54442d18, pi/2 */
44#define Pi2_low				656	/* not referenced by _ZGVbN2v_atan_sse4 */
45#define coeff				672	/* 6 entries of 16 bytes, highest order first */
46
47#include <sysdep.h>
48
49	.section .text.sse4, "ax", @progbits
/*
 * _ZGVbN2v_atan_sse4
 *
 * In:   %xmm0 = two packed doubles x (lane 0 and lane 1).
 * Out:  %xmm0 = result for each lane.
 *
 * Per-lane outline of what the instructions below compute:
 *   ax = x & AbsMask
 *   S  = Shifter + ax		(low mantissa bits of S hold an integer index k)
 *   B  = S - Shifter		(ax rounded to the granularity of Shifter;
 *				 Shifter = 1.5 * 2^50 has an ulp of 0.25)
 *   if (MaxThreshold <= ax)	R = -1.0 / min (ax, LargeX),  base = Pi2
 *   else			R = (ax - B) / (1.0 + B * ax),
 *				base = Tbl_H[(k & 15) + (dIndexMed <= S ? 16 : 0)]
 *   P  = ((((c0*R^2 + c1)*R^2 + c2)*R^2 + c3)*R^2 + c4)*R^2 + c5
 *	  (evaluated in a split form, see below; c0..c5 = coeff[0..5])
 *   result = (x ^ ax) ^ (base + (R + R^3 * P))
 *
 * The code is branch-free and does not touch the stack.
 * Registers written: %rax, %rcx, %rdx, %xmm0-%xmm15.
 */
50ENTRY(_ZGVbN2v_atan_sse4)
51	lea	Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rcx	/* rcx = &Tbl_H[16]; entries 0..15 are reached with a -128 displacement */
52	movups	__svml_datan_data_internal_avx512(%rip), %xmm4	/* xmm4 = AbsMask (offset 0) */
53	movups	Shifter+__svml_datan_data_internal_avx512(%rip), %xmm3	/* xmm3 = Shifter */
54	andps	%xmm0, %xmm4	/* xmm4 = ax = |x| */
55	movaps	%xmm3, %xmm12	/* xmm12 = Shifter */
56	movaps	%xmm4, %xmm5	/* xmm5 = ax */
57	addpd	%xmm4, %xmm12	/* xmm12 = S = Shifter + ax; kept until the dIndexMed compare */
58	movaps	%xmm12, %xmm7	/* xmm7 = S */
59
60	/*
61	 * table lookup sequence
62	 * VPERMUTE not available
63	 */
64	movaps	%xmm12, %xmm10	/* xmm10 = S, used below to extract the table offset */
65	subpd	%xmm3, %xmm7	/* xmm7 = B = S - Shifter */
66	subpd	%xmm7, %xmm5	/* xmm5 = DiffX = ax - B */
67	mulpd	%xmm4, %xmm7	/* xmm7 = B * ax */
68	movups	MaxThreshold+__svml_datan_data_internal_avx512(%rip), %xmm2	/* xmm2 = MaxThreshold */
69	psllq	$3, %xmm10	/* shift each 64-bit lane left by 3: index -> byte offset of an 8-byte entry */
70
71	/* saturate X range */
72	movups	LargeX+__svml_datan_data_internal_avx512(%rip), %xmm8	/* xmm8 = LargeX */
73	pxor	%xmm4, %xmm0	/* xmm0 = x ^ ax: only the sign bit of x remains */
74	cmplepd	%xmm4, %xmm2	/* xmm2 = all-ones where MaxThreshold <= ax ("large" mask) */
75	addpd	One+__svml_datan_data_internal_avx512(%rip), %xmm7	/* xmm7 = Y = 1.0 + B * ax */
76	minpd	%xmm4, %xmm8	/* xmm8 = min (LargeX, ax) */
77	movups	MOne+__svml_datan_data_internal_avx512(%rip), %xmm6	/* xmm6 = -1.0 */
78	movaps	%xmm2, %xmm1	/* xmm1 = large mask */
79	movaps	%xmm2, %xmm9	/* xmm9 = large mask */
80	andnps	%xmm5, %xmm1	/* xmm1 = DiffX where not large, else 0 */
81	andps	%xmm2, %xmm6	/* xmm6 = -1.0 where large, else 0 */
82	andnps	%xmm7, %xmm9	/* xmm9 = Y where not large, else 0 */
83	andps	%xmm2, %xmm8	/* xmm8 = min (LargeX, ax) where large, else 0 */
84	orps	%xmm6, %xmm1	/* xmm1 = numerator: large ? -1.0 : DiffX */
85	orps	%xmm8, %xmm9	/* xmm9 = denominator: large ? min (LargeX, ax) : Y */
86
87	/* R+Rl = DiffX/Y */
88	divpd	%xmm9, %xmm1	/* xmm1 = R = numerator / denominator */
89	pand	.FLT_11(%rip), %xmm10	/* keep bits 3..6 of each lane: byte offset (k & 15) * 8 */
90
91	/* set table value to Pi/2 for large X */
92	movups	Pi2+__svml_datan_data_internal_avx512(%rip), %xmm4	/* xmm4 = Pi2 (ax is no longer needed) */
93	movd	%xmm10, %eax	/* eax = table offset for lane 0 */
94	andps	%xmm2, %xmm4	/* xmm4 = Pi2 where large, else 0 */
95	pshufd	$2, %xmm10, %xmm11	/* dword 0 of xmm11 = dword 2 of xmm10 (low half of lane 1) */
96	movaps	%xmm2, %xmm10	/* xmm10 = large mask, saved because xmm2 is reused next */
97
98	/* polynomial evaluation */
99	movaps	%xmm1, %xmm2	/* xmm2 = R */
100	mulpd	%xmm1, %xmm2	/* xmm2 = R^2 */
101	movd	%xmm11, %edx	/* edx = table offset for lane 1 */
102	movups	coeff+__svml_datan_data_internal_avx512(%rip), %xmm5	/* xmm5 = c0 */
103	movaps	%xmm2, %xmm7	/* xmm7 = R^2 */
104	movups	coeff+32+__svml_datan_data_internal_avx512(%rip), %xmm6	/* xmm6 = c2 */
105	movaps	%xmm2, %xmm9	/* xmm9 = R^2 */
106	mulpd	%xmm2, %xmm5	/* xmm5 = c0*R^2 */
107	mulpd	%xmm2, %xmm7	/* xmm7 = R^4 */
108	addpd	coeff+16+__svml_datan_data_internal_avx512(%rip), %xmm5	/* xmm5 = c0*R^2 + c1 */
109	mulpd	%xmm2, %xmm6	/* xmm6 = c2*R^2 */
110	mulpd	%xmm7, %xmm5	/* xmm5 = (c0*R^2 + c1) * R^4 */
111	addpd	coeff+48+__svml_datan_data_internal_avx512(%rip), %xmm6	/* xmm6 = c2*R^2 + c3 */
112	mulpd	%xmm1, %xmm9	/* xmm9 = R^3 */
113	addpd	%xmm5, %xmm6	/* xmm6 = (c0*R^2 + c1)*R^4 + c2*R^2 + c3 */
114	movups	coeff+64+__svml_datan_data_internal_avx512(%rip), %xmm8	/* xmm8 = c4 */
115	mulpd	%xmm2, %xmm8	/* xmm8 = c4*R^2 */
116	mulpd	%xmm6, %xmm7	/* xmm7 = R^4 * (upper four terms) */
117	addpd	coeff+80+__svml_datan_data_internal_avx512(%rip), %xmm8	/* xmm8 = c4*R^2 + c5 */
118	addpd	%xmm7, %xmm8	/* xmm8 = P(R^2), all six terms */
119	mulpd	%xmm8, %xmm9	/* xmm9 = R^3 * P(R^2) */
120	movups	dIndexMed+__svml_datan_data_internal_avx512(%rip), %xmm14	/* xmm14 = dIndexMed */
121	cmplepd	%xmm12, %xmm14	/* xmm14 = all-ones where dIndexMed <= S (use Tbl_H entries 16..31) */
122	addpd	%xmm9, %xmm1	/* xmm1 = R + R^3 * P(R^2) */
123	movslq	%eax, %rax	/* sign-extend lane 0 offset to 64 bits for addressing */
124	movaps	%xmm14, %xmm3	/* xmm3 = upper-half mask */
125	movslq	%edx, %rdx	/* sign-extend lane 1 offset to 64 bits for addressing */
126	movsd	-128(%rax, %rcx), %xmm13	/* xmm13 lane 0 = Tbl_H[k0 & 15] (lower half) */
127	movsd	(%rcx, %rax), %xmm15	/* xmm15 lane 0 = Tbl_H[16 + (k0 & 15)] (upper half) */
128	movhpd	-128(%rdx, %rcx), %xmm13	/* xmm13 lane 1 = Tbl_H[k1 & 15] */
129	movhpd	(%rcx, %rdx), %xmm15	/* xmm15 lane 1 = Tbl_H[16 + (k1 & 15)] */
130	andnps	%xmm13, %xmm3	/* xmm3 = lower-half entry where the upper-half mask is clear */
131	andps	%xmm14, %xmm15	/* xmm15 = upper-half entry where the upper-half mask is set */
132	orps	%xmm15, %xmm3	/* xmm3 = selected Tbl_H entry */
133	andnps	%xmm3, %xmm10	/* xmm10 = Tbl_H entry where not large, else 0 */
134	orps	%xmm4, %xmm10	/* xmm10 = base: large ? Pi2 : Tbl_H entry */
135	addpd	%xmm1, %xmm10	/* xmm10 = base + R + R^3 * P(R^2) */
136	pxor	%xmm10, %xmm0	/* xmm0 = saved sign bit of x ^ magnitude result */
137	ret
138
139END(_ZGVbN2v_atan_sse4)
140
/*
 * Constant data for _ZGVbN2v_atan_sse4.  Fields are addressed from the code
 * through the byte-offset macros defined near the top of this file, so the
 * order and size of every field below must match those macros.  Each field
 * starts on a 16-byte boundary; scalar constants are stored twice, once per
 * 64-bit lane.
 */
141	.section .rodata, "a"
142	.align	16
143
/* C view of the table layout; only compiled when the guard macro is defined.  */
144#ifdef __svml_datan_data_internal_avx512_typedef
145typedef unsigned int VUINT32;
146typedef struct {
147	__declspec(align(16)) VUINT32 AbsMask[2][2];
148	__declspec(align(16)) VUINT32 Shifter[2][2];
149	__declspec(align(16)) VUINT32 MaxThreshold[2][2];
150	__declspec(align(16)) VUINT32 MOne[2][2];
151	__declspec(align(16)) VUINT32 One[2][2];
152	__declspec(align(16)) VUINT32 LargeX[2][2];
153	__declspec(align(16)) VUINT32 Zero[2][2];
154	__declspec(align(16)) VUINT32 Tbl_H[32][2];
155	__declspec(align(16)) VUINT32 Tbl_L[32][2];
156	__declspec(align(16)) VUINT32 dIndexMed[2][2];
157	__declspec(align(16)) VUINT32 Pi2[2][2];
158	__declspec(align(16)) VUINT32 Pi2_low[2][2];
159	__declspec(align(16)) VUINT32 coeff[6][2][2];
160} __svml_datan_data_internal_avx512;
161#endif
162__svml_datan_data_internal_avx512:
163	/* AbsMask: every bit except the sign bit; ANDed with x to form |x| */
164	.quad	0x7fffffffffffffff, 0x7fffffffffffffff
165	/* Shifter: 1.5 * 2^50 (ulp 0.25); adding it leaves a rounded index in the low mantissa bits */
166	.align	16
167	.quad	0x4318000000000000, 0x4318000000000000
168	/* MaxThreshold: 7.875; lanes with |x| at or above this take the -1/x path */
169	.align	16
170	.quad	0x401f800000000000, 0x401f800000000000
171	/* MOne: -1.0, numerator of the large-|x| path */
172	.align	16
173	.quad	0xbff0000000000000, 0xbff0000000000000
174	/* One: +1.0 */
175	.align	16
176	.quad	0x3ff0000000000000, 0x3ff0000000000000
177	/* LargeX: 2^128, upper clamp applied to |x| with minpd */
178	.align	16
179	.quad	0x47f0000000000000, 0x47f0000000000000
180	/* Zero: 0.0 (not referenced by _ZGVbN2v_atan_sse4) */
181	.align	16
182	.quad	0x0000000000000000, 0x0000000000000000
183	/* Tbl_H: 32 doubles indexed by the rounded value of 4*|x|; the entries
	   match atan (k / 4), e.g. entry 0 is 0.0 and entry 4 is pi/4 */
184	.align	16
185	.quad	0x0000000000000000, 0x3fcf5b75f92c80dd
186	.quad	0x3fddac670561bb4f, 0x3fe4978fa3269ee1
187	.quad	0x3fe921fb54442d18, 0x3fecac7c57846f9e
188	.quad	0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
189	.quad	0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
190	.quad	0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
191	.quad	0x3ff3fc176b7a8560, 0x3ff45b54837351a0
192	.quad	0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
193	.quad	0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
194	.quad	0x3ff5a25052114e60, 0x3ff5d013c41adabd
195	.quad	0x3ff5f97315254857, 0x3ff61f06c6a92b89
196	.quad	0x3ff6414d44094c7c, 0x3ff660b02c736a06
197	.quad	0x3ff67d8863bc99bd, 0x3ff698213a9d5053
198	.quad	0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
199	.quad	0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
200	.quad	0x3ff7030cf9403197, 0x3ff7145eac2088a4
201	/* Tbl_L: 32 doubles, presumably the low-order parts paired with Tbl_H
	   (not referenced by _ZGVbN2v_atan_sse4) */
202	.align	16
203	.quad	0x0000000000000000, 0x3c68ab6e3cf7afbd
204	.quad	0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
205	.quad	0x3c81a62633145c07, 0x3c80dae13ad18a6b
206	.quad	0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
207	.quad	0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
208	.quad	0x3c96254cb03bb199, 0xbc812c77e8a80f5c
209	.quad	0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
210	.quad	0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
211	.quad	0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
212	.quad	0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
213	.quad	0xbc831151a43b51ca, 0xbc8487d50bceb1a5
214	.quad	0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
215	.quad	0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
216	.quad	0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
217	.quad	0xbc929c86447928e7, 0xbc8957a7170df016
218	.quad	0xbc7cbe1896221608, 0xbc9fda5797b32a0b
219	/* dIndexMed: Shifter bit pattern + 0x10; Shifter + |x| at or above this
	   selects Tbl_H entries 16..31 */
220	.align	16
221	.quad	0x4318000000000010, 0x4318000000000010
222	/* Pi2: pi/2, used as the table value on the large-|x| path */
223	.align	16
224	.quad	0x3ff921fb54442d18, 0x3ff921fb54442d18
225	/* Pi2_low: presumably the low-order part of pi/2 (not referenced by
	   _ZGVbN2v_atan_sse4) */
226	.align	16
227	.quad	0x3c91a62633145c07, 0x3c91a62633145c07
228	/* coeff: six polynomial coefficients c0..c5, highest order first, each
	   stored once per lane; c1..c5 are close to -1/11, +1/9, -1/7, +1/5, -1/3 */
229	.align	16
230	.quad	0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
231	.quad	0xbfb74257c46790cc, 0xbfb74257c46790cc
232	.quad	0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
233	.quad	0xbfc249248eef04da, 0xbfc249248eef04da
234	.quad	0x3fc999999998741e, 0x3fc999999998741e
235	.quad	0xbfd555555555554d, 0xbfd555555555554d
236	.align	16
237	.type	__svml_datan_data_internal_avx512, @object
238	.size	__svml_datan_data_internal_avx512, .-__svml_datan_data_internal_avx512
239	.align	16
240
/* .FLT_11: per-lane mask 0x78 (bits 3..6 of the low dword); applied to the
   index after it is shifted left by 3, it yields a byte offset 0..120 into
   one 16-entry half of Tbl_H.  */
241.FLT_11:
242	.long	0x00000078, 0x00000000, 0x00000078, 0x00000000
243	.type	.FLT_11, @object
244	.size	.FLT_11, 16
245