1/* Function atanf vectorized with AVX2.
2   Copyright (C) 2021-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   https://www.gnu.org/licenses/.  */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
23 *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
24 *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
25 *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
26 *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
28 *
29 */
30
/* Byte offsets of the rows of __svml_satan_data_internal.
   Every row is 32 bytes wide: one 32-bit constant replicated across
   the 8 lanes of a YMM register, so consecutive offsets differ by 32.
   The offsets are kept as bare literals because they are pasted in
   front of a symbol reference (OFFSET+symbol(%rip)).  */
#define _sSIGN_MASK	0	/* 0x80000000: sign bit.  */
#define _sABS_MASK	32	/* 0x7FFFFFFF: everything but the sign bit.  */
#define _sONE		64	/* 0x3f800000: 1.0f.  */
#define _sPIO2		96	/* 0x3FC90FDB: Pi/2.  */
#define _sPC8		128	/* Polynomial coefficient 8 (highest).  */
#define _sPC7		160	/* Polynomial coefficient 7.  */
#define _sPC6		192	/* Polynomial coefficient 6.  */
#define _sPC5		224	/* Polynomial coefficient 5.  */
#define _sPC4		256	/* Polynomial coefficient 4.  */
#define _sPC3		288	/* Polynomial coefficient 3.  */
#define _sPC2		320	/* Polynomial coefficient 2.  */
#define _sPC1		352	/* Polynomial coefficient 1.  */
#define _sPC0		384	/* Polynomial coefficient 0 (0x3f800000: 1.0f).  */
46
47#include <sysdep.h>
48
49	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_atanf_avx2)
	/*
	 * The argument is read from %ymm0 and the result is left in
	 * %ymm0; %ymm1-%ymm15 are all overwritten.  Constants come from
	 * __svml_satan_data_internal.
	 *
	 * Argument reduction uses a single threshold at |x| = 1:
	 *   |x| <= 1:  r = x,     offset = 0
	 *   x   >  1:  r = -1/x,  offset = +Pi/2
	 *   x   < -1:  r = -1/x,  offset = -Pi/2
	 * and the result is  offset + r * P(r^2),  where P is the
	 * degree-8 polynomial with coefficients _sPC0 .. _sPC8.
	 */

	/* ymm2 = 1.0, ymm3 = |x|.  */
	vmovups	_sONE+__svml_satan_data_internal(%rip), %ymm2
	vandps	_sABS_MASK+__svml_satan_data_internal(%rip), %ymm0, %ymm3

	/* ymm11 = min(1, |x|) / max(1, |x|): |x| when |x| <= 1,
	   1/|x| when |x| > 1.  */
	vminps	%ymm3, %ymm2, %ymm4
	vmaxps	%ymm3, %ymm2, %ymm5
	vdivps	%ymm5, %ymm4, %ymm11

	/* ymm6 = all-ones in the lanes where |x| <= 1.  The compare is
	   ordered, so the mask is clear for NaN lanes.  */
	vcmple_oqps %ymm2, %ymm3, %ymm6

	/* ymm9  = sign bit of x.
	   ymm8  = sign bit set only in the lanes where the mask is clear.
	   ymm10 = sign of r: the sign of x, flipped where the reciprocal
	           was taken.
	   ymm15 = r.  */
	vmovups	_sSIGN_MASK+__svml_satan_data_internal(%rip), %ymm7
	vandps	%ymm7, %ymm0, %ymm9
	vandnps	%ymm7, %ymm6, %ymm8
	vxorps	%ymm9, %ymm8, %ymm10
	vxorps	%ymm11, %ymm10, %ymm15

	/* ymm1 = offset: Pi/2 where the mask is clear and 0 elsewhere,
	   then given the sign bit of x.  */
	vandnps	_sPIO2+__svml_satan_data_internal(%rip), %ymm6, %ymm1
	vxorps	%ymm9, %ymm1, %ymm1

	/* Polynomial.  ymm14 = r^2, ymm12 = r^4.  */
	vmulps	%ymm15, %ymm15, %ymm14
	vmulps	%ymm14, %ymm14, %ymm12

	/* Two independent Horner chains in r^4:
	   ymm0  accumulates the even-numbered coefficients PC8, PC6, PC4, PC2;
	   ymm13 accumulates the odd-numbered coefficients  PC7, PC5, PC3, PC1.
	   From here on %ymm0 no longer holds x.  */
	vmovups	_sPC8+__svml_satan_data_internal(%rip), %ymm0
	vmovups	_sPC7+__svml_satan_data_internal(%rip), %ymm13
	vfmadd213ps _sPC6+__svml_satan_data_internal(%rip), %ymm12, %ymm0
	vfmadd213ps _sPC5+__svml_satan_data_internal(%rip), %ymm12, %ymm13
	vfmadd213ps _sPC4+__svml_satan_data_internal(%rip), %ymm12, %ymm0
	vfmadd213ps _sPC3+__svml_satan_data_internal(%rip), %ymm12, %ymm13
	vfmadd213ps _sPC2+__svml_satan_data_internal(%rip), %ymm12, %ymm0
	vfmadd213ps _sPC1+__svml_satan_data_internal(%rip), %ymm12, %ymm13

	/* Merge the chains: ymm0 = ymm13 + r^2 * ymm0, followed by
	   ymm0 = PC0 + r^2 * ymm0, which is P(r^2).  */
	vfmadd213ps %ymm13, %ymm14, %ymm0
	vfmadd213ps _sPC0+__svml_satan_data_internal(%rip), %ymm14, %ymm0

	/* Reconstruction: result = offset + r * P(r^2).  */
	vfmadd213ps %ymm1, %ymm15, %ymm0
	ret

END(_ZGVdN8v_atanf_avx2)
97
98	.section .rodata, "a"
99	.align	32
100
/* C view of the table layout.  It is only seen by the assembler if
   __svml_satan_data_internal_typedef is defined, and this file itself
   never defines it.  */
#ifdef __svml_satan_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 _sSIGN_MASK[8][1];
	__declspec(align(32)) VUINT32 _sABS_MASK[8][1];
	__declspec(align(32)) VUINT32 _sONE[8][1];
	__declspec(align(32)) VUINT32 _sPIO2[8][1];
	__declspec(align(32)) VUINT32 _sPC8[8][1];
	__declspec(align(32)) VUINT32 _sPC7[8][1];
	__declspec(align(32)) VUINT32 _sPC6[8][1];
	__declspec(align(32)) VUINT32 _sPC5[8][1];
	__declspec(align(32)) VUINT32 _sPC4[8][1];
	__declspec(align(32)) VUINT32 _sPC3[8][1];
	__declspec(align(32)) VUINT32 _sPC2[8][1];
	__declspec(align(32)) VUINT32 _sPC1[8][1];
	__declspec(align(32)) VUINT32 _sPC0[8][1];
} __svml_satan_data_internal;
#endif

/* Constant table.  Each row is one 32-bit pattern repeated 8 times
   (".fill 8, 4, value" emits eight 4-byte copies of value), giving
   32 bytes per row.  The rows must stay in the order of the _s*
   offset macros at the top of the file.  */
__svml_satan_data_internal:
	/* _sSIGN_MASK */
	.fill	8, 4, 0x80000000
	.align	32
	/* _sABS_MASK */
	.fill	8, 4, 0x7FFFFFFF
	.align	32
	/* _sONE */
	.fill	8, 4, 0x3f800000
	.align	32
	/* _sPIO2 */
	.fill	8, 4, 0x3FC90FDB
	.align	32
	/* _sPC8 */
	.fill	8, 4, 0x3B322CC0
	.align	32
	/* _sPC7 */
	.fill	8, 4, 0xBC7F2631
	.align	32
	/* _sPC6 */
	.fill	8, 4, 0x3D2BC384
	.align	32
	/* _sPC5 */
	.fill	8, 4, 0xBD987629
	.align	32
	/* _sPC4 */
	.fill	8, 4, 0x3DD96474
	.align	32
	/* _sPC3 */
	.fill	8, 4, 0xBE1161F8
	.align	32
	/* _sPC2 */
	.fill	8, 4, 0x3E4CB79F
	.align	32
	/* _sPC1 */
	.fill	8, 4, 0xBEAAAA49
	.align	32
	/* _sPC0 */
	.fill	8, 4, 0x3f800000
	.align	32
	.type	__svml_satan_data_internal, @object
	.size	__svml_satan_data_internal, .-__svml_satan_data_internal
148