1/* Function atanf vectorized with AVX-512.
2   Copyright (C) 2021-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   https://www.gnu.org/licenses/.  */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
23 *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
24 *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
25 *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
26 *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
28 *
29 */
30
/* Byte offsets of the fields of __svml_satan_data_internal_avx512.
   Every field starts on a 64-byte boundary.  A field is one 64-byte row
   (sixteen 32-bit words) unless noted otherwise.  */
#define AbsMask		0	/* 0x7fffffff: clears the sign bit */
#define Shifter		64	/* 0x4a000000 = 2^21 */
#define MaxThreshold	128	/* 0x40F80000 = 7.75 */
#define MOne		192	/* 0xbf800000 = -1.0 */
#define One		256	/* 0x3f800000 = 1.0 */
#define LargeX		320	/* 0x4f800000 = 2^32 */
#define Zero		384	/* 0x00000000 = 0.0 */
#define Tbl_H		448	/* 32 words = two rows (128 bytes) */
#define Pi2		576	/* 0x3fc90FDB ~= Pi/2 */
#define coeff_1		640	/* first of three consecutive rows */
#define coeff_2		704
#define coeff_3		768
45
46#include <sysdep.h>
47
48	.section .text.exex512, "ax", @progbits
49ENTRY(_ZGVeN16v_atanf_skx)
50	vandps	__svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
51	vmovups	MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
52	vmovups	One+__svml_satan_data_internal_avx512(%rip), %zmm8
53
54	/* round to 2 bits after binary point */
55	vreduceps $40, {sae}, %zmm7, %zmm5
56
57	/* saturate X range */
58	vmovups	LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
59	vmovups	Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
60	vcmpps	$29, {sae}, %zmm3, %zmm7, %k1
61
62	/* table lookup sequence */
63	vmovups	Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
64	vsubps	{rn-sae}, %zmm5, %zmm7, %zmm4
65	vaddps	{rn-sae}, %zmm2, %zmm7, %zmm1
66	vxorps	%zmm0, %zmm7, %zmm0
67	vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
68	vmovups	coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
69
70	/* if|X|>=MaxThreshold, set DiffX=-1 */
71	vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
72	vmovups	coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
73
74	/* if|X|>=MaxThreshold, set Y=X */
75	vminps	{sae}, %zmm7, %zmm6, %zmm8{%k1}
76
77	/* R+Rl = DiffX/Y */
78	vgetmantps $0, {sae}, %zmm9, %zmm12
79	vgetexpps {sae}, %zmm9, %zmm10
80	vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
81	vgetmantps $0, {sae}, %zmm8, %zmm15
82	vgetexpps {sae}, %zmm8, %zmm11
83	vmovups	coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
84
85	/* set table value to Pi/2 for large X */
86	vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
87	vrcp14ps %zmm15, %zmm13
88	vsubps	{rn-sae}, %zmm11, %zmm10, %zmm2
89	vmulps	{rn-sae}, %zmm13, %zmm12, %zmm14
90	vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
91	vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
92	vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7
93
94	/* polynomial evaluation */
95	vmulps	{rn-sae}, %zmm7, %zmm7, %zmm8
96	vmulps	{rn-sae}, %zmm7, %zmm8, %zmm6
97	vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
98	vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
99	vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
100	vaddps	{rn-sae}, %zmm9, %zmm8, %zmm10
101	vxorps	%zmm0, %zmm10, %zmm0
102	ret
103
104END(_ZGVeN16v_atanf_skx)
105
106	.section .rodata, "a"
107	.align	64
108
109#ifdef __svml_satan_data_internal_avx512_typedef
110typedef unsigned int VUINT32;
111typedef struct {
112	__declspec(align(64)) VUINT32 AbsMask[16][1];
113	__declspec(align(64)) VUINT32 Shifter[16][1];
114	__declspec(align(64)) VUINT32 MaxThreshold[16][1];
115	__declspec(align(64)) VUINT32 MOne[16][1];
116	__declspec(align(64)) VUINT32 One[16][1];
117	__declspec(align(64)) VUINT32 LargeX[16][1];
118	__declspec(align(64)) VUINT32 Zero[16][1];
119	__declspec(align(64)) VUINT32 Tbl_H[32][1];
120	__declspec(align(64)) VUINT32 Pi2[16][1];
121	__declspec(align(64)) VUINT32 coeff[3][16][1];
122} __svml_satan_data_internal_avx512;
123#endif
124__svml_satan_data_internal_avx512:
125	/* AbsMask */
126	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
127	/* Shifter */
128	.align	64
129	.long	0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
130	/* MaxThreshold */
131	.align	64
132	.long	0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
133	/* MOne */
134	.align	64
135	.long	0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
136	/* One */
137	.align	64
138	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
139	/* LargeX */
140	.align	64
141	.long	0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
142	/* Zero */
143	.align	64
144	.long	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
145	/* Tbl_H */
146	.align	64
147	.long	0x00000000, 0x3e7adbb0
148	.long	0x3eed6338, 0x3f24bc7d
149	.long	0x3f490fdb, 0x3f6563e3
150	.long	0x3f7b985f, 0x3f869c79
151	.long	0x3f8db70d, 0x3f93877b
152	.long	0x3f985b6c, 0x3f9c6b53
153	.long	0x3f9fe0bb, 0x3fa2daa4
154	.long	0x3fa57088, 0x3fa7b46f
155	.long	0x3fa9b465, 0x3fab7b7a
156	.long	0x3fad1283, 0x3fae809e
157	.long	0x3fafcb99, 0x3fb0f836
158	.long	0x3fb20a6a, 0x3fb30581
159	.long	0x3fb3ec43, 0x3fb4c10a
160	.long	0x3fb585d7, 0x3fb63c64
161	.long	0x3fb6e62c, 0x3fb78478
162	.long	0x3fb81868, 0x3fb8a2f5
163	/* Pi2 */
164	.align	64
165	.long	0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
166	/* coeff3 */
167	.align	64
168	.long	0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
169	.long	0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
170	.long	0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
171	.align	64
172	.type	__svml_satan_data_internal_avx512, @object
173	.size	__svml_satan_data_internal_avx512, .-__svml_satan_data_internal_avx512
174