1/* Function acos vectorized with AVX-512.
2   Copyright (C) 2021-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   https://www.gnu.org/licenses/.  */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 *      SelMask = (|x| >= 0.5) ? 1 : 0;
23 *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
24 *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
25 *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
26 *
27 */
28
29/* Offsets for data table __svml_dacos_data_internal
   These are byte offsets added to the table's address in the vmovups /
   vorpd memory operands of the function below.  Consecutive offsets
   differ by 64, matching the table's rows of eight .quad values.  The
   _1 .. _N suffixes count rows in storage order.
30 */
31#define SgnBit				0
32#define OneHalf				64
33#define SmallNorm			128
34#define MOne				192
35#define Two				256
36#define sqrt_coeff_1			320
37#define sqrt_coeff_2			384
38#define sqrt_coeff_3			448
39#define sqrt_coeff_4			512
40#define poly_coeff_1			576
41#define poly_coeff_2			640
42#define poly_coeff_3			704
43#define poly_coeff_4			768
44#define poly_coeff_5			832
45#define poly_coeff_6			896
46#define poly_coeff_7			960
47#define poly_coeff_8			1024
48#define poly_coeff_9			1088
49#define poly_coeff_10			1152
50#define poly_coeff_11			1216
51#define poly_coeff_12			1280
52#define PiH				1344
53#define Pi2H				1408
54
55#include <sysdep.h>
56
57	.section .text.evex512, "ax", @progbits
/* _ZGVeN8v_acos_skx
   In:   %zmm0 = argument vector (copied to %zmm6, which the fallback
         path later spills to the stack).
   Out:  %zmm0 = acos of each lane.
   The main path runs without branches up to the final test of %edx.
   Lanes where -|arg| < -1.0 are flagged in %edx and recomputed one at
   a time with the scalar acos.  */
58ENTRY(_ZGVeN8v_acos_skx)
59	pushq	%rbp
60	cfi_def_cfa_offset(16)
61	movq	%rsp, %rbp
62	cfi_def_cfa(6, 16)
63	cfi_offset(6, -16)
	/* Align %rsp down to 64 bytes and reserve 192 bytes of scratch.
	   Only the special-values branch stores to it: saved %r12-%r14
	   at 0..23, the input vector at 64, the result vector at 128.  */
64	andq	$-64, %rsp
65	subq	$192, %rsp
	/* zmm7 = SgnBit (table offset 0), zmm8 = OneHalf.  */
66	vmovups	__svml_dacos_data_internal(%rip), %zmm7
67	vmovups	OneHalf+__svml_dacos_data_internal(%rip), %zmm8
68
69	/* S ~ 2*sqrt(Y) */
70	vmovups	SmallNorm+__svml_dacos_data_internal(%rip), %zmm11
71	vmovups	Two+__svml_dacos_data_internal(%rip), %zmm14
72	vmovups	sqrt_coeff_1+__svml_dacos_data_internal(%rip), %zmm15
73	vmovups	sqrt_coeff_2+__svml_dacos_data_internal(%rip), %zmm2
74	vmovups	sqrt_coeff_3+__svml_dacos_data_internal(%rip), %zmm1
75	vmovups	MOne+__svml_dacos_data_internal(%rip), %zmm10
	/* Keep the original argument in zmm6; zmm0 is reused as scratch.  */
76	vmovaps	%zmm0, %zmm6
77
78	/* x = -|arg| (zmm5 = arg | SgnBit); zmm4 = sign bit of arg */
79	vorpd	%zmm6, %zmm7, %zmm5
80	vandpd	%zmm6, %zmm7, %zmm4
81
82	/* Y = 0.5 + 0.5*x = 0.5 - 0.5*|arg|  (x is already -|arg|) */
83	vfmadd231pd {rn-sae}, %zmm5, %zmm8, %zmm8
84
85	/* x^2 */
86	vmulpd	{rn-sae}, %zmm5, %zmm5, %zmm9
	/* zmm12 ~ 1/sqrt(Y) (approximate reciprocal square root).  */
87	vrsqrt14pd %zmm8, %zmm12
	/* Predicate 17 is LT_OQ: k1 = (Y < SmallNorm),
	   k0 = (-|arg| < -1.0), i.e. lanes needing the scalar fallback.  */
88	vcmppd	$17, {sae}, %zmm11, %zmm8, %k1
89	vcmppd	$17, {sae}, %zmm10, %zmm5, %k0
90	vmovups	poly_coeff_5+__svml_dacos_data_internal(%rip), %zmm10
91	vmovups	poly_coeff_7+__svml_dacos_data_internal(%rip), %zmm11
	/* zmm3 = min(x^2, Y): the polynomial argument for both paths.  */
92	vminpd	{sae}, %zmm8, %zmm9, %zmm3
93	vmovups	poly_coeff_3+__svml_dacos_data_internal(%rip), %zmm9
	/* Zero the rsqrt estimate in lanes where Y < SmallNorm.  */
94	vxorpd	%zmm12, %zmm12, %zmm12{%k1}
	/* zmm0 = 2*Y */
95	vaddpd	{rn-sae}, %zmm8, %zmm8, %zmm0
	/* Predicate 21 is NLT_UQ: k4 = !(min(x^2, Y) < Y), set in lanes
	   that take the sqrt path (SelMask in the algorithm description).  */
96	vcmppd	$21, {sae}, %zmm8, %zmm3, %k4
97
98	/* X<X^2 iff X<0 */
	/* k2 = (arg < min(x^2, Y)), used as the "argument is negative" mask.  */
99	vcmppd	$17, {sae}, %zmm3, %zmm6, %k2
	/* zmm13 = rsqrt^2, zmm7 = 2*Y*rsqrt: first estimate of 2*sqrt(Y).  */
100	vmulpd	{rn-sae}, %zmm12, %zmm12, %zmm13
101	vmulpd	{rn-sae}, %zmm12, %zmm0, %zmm7
102	vmovups	poly_coeff_4+__svml_dacos_data_internal(%rip), %zmm12
103
104	/* polynomial */
	/* The polynomial in zmm3 is built up in partial sums, interleaved
	   with the sqrt refinement below.  */
105	vmovups	poly_coeff_1+__svml_dacos_data_internal(%rip), %zmm8
	/* zmm0 = E = 2*Y*rsqrt^2 - 2, the error term of the estimate.  */
106	vfmsub213pd {rn-sae}, %zmm14, %zmm13, %zmm0
107	vmovups	sqrt_coeff_4+__svml_dacos_data_internal(%rip), %zmm13
108	vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm12
109	vmovups	poly_coeff_11+__svml_dacos_data_internal(%rip), %zmm9
	/* zmm2 accumulates the sqrt correction polynomial in E (Horner,
	   rows sqrt_coeff_1 .. sqrt_coeff_4).  */
110	vfmadd231pd {rn-sae}, %zmm0, %zmm15, %zmm2
111	vmovups	poly_coeff_9+__svml_dacos_data_internal(%rip), %zmm15
	/* zmm14 = E * (2*Y*rsqrt) */
112	vmulpd	{rn-sae}, %zmm0, %zmm7, %zmm14
113	vfmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm2
114	vmovups	poly_coeff_2+__svml_dacos_data_internal(%rip), %zmm1
	/* Move the special-lane mask to %edx for the final test.  */
115	kmovw	%k0, %edx
116	vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm2
117	vfmadd231pd {rn-sae}, %zmm3, %zmm8, %zmm1
118	vmovups	poly_coeff_10+__svml_dacos_data_internal(%rip), %zmm8
	/* zmm0 is reused from here on as the square of zmm3.  */
119	vmulpd	{rn-sae}, %zmm3, %zmm3, %zmm0
	/* zmm2 = S = estimate - zmm14 * correction: refined 2*sqrt(Y).  */
120	vfnmadd213pd {rn-sae}, %zmm7, %zmm14, %zmm2
121	vmovups	poly_coeff_6+__svml_dacos_data_internal(%rip), %zmm7
122	vfmadd231pd {rn-sae}, %zmm3, %zmm15, %zmm8
123	vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm1
	/* zmm2 = k4 ? S : -|arg| */
124	vblendmpd %zmm2, %zmm5, %zmm2{%k4}
125	vfmadd231pd {rn-sae}, %zmm3, %zmm10, %zmm7
126	vmovups	poly_coeff_8+__svml_dacos_data_internal(%rip), %zmm10
127	vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
128	vmovups	poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
	/* k3 = sqrt path AND negative argument: lanes whose base is PiH.  */
129	kandw	%k4, %k2, %k3
130	vfmadd213pd {rn-sae}, %zmm10, %zmm0, %zmm7
131	vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm11
132	vmulpd	{rn-sae}, %zmm0, %zmm0, %zmm10
133	vfmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm1
134	vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm1
135	vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
	/* Base term: Pi2H by default, 0 in sqrt-path lanes (k4), then
	   PiH OR-ed into the zeroed lanes selected by k3.  */
136	vmovups	Pi2H+__svml_dacos_data_internal(%rip), %zmm0
	/* zmm1 = zmm3 * Poly(zmm3) */
137	vmulpd	{rn-sae}, %zmm3, %zmm1, %zmm1
	/* zmm3 = R with the argument's sign bit XOR-ed in.  */
138	vxorpd	%zmm4, %zmm2, %zmm3
139	vxorpd	%zmm0, %zmm0, %zmm0{%k4}
	/* zmm1 = R + R*zmm1 */
140	vfmadd213pd {rn-sae}, %zmm3, %zmm3, %zmm1
141	vorpd	PiH+__svml_dacos_data_internal(%rip), %zmm0, %zmm0{%k3}
	/* Result = base + zmm1 */
142	vaddpd	{rn-sae}, %zmm1, %zmm0, %zmm0
143	testl	%edx, %edx
144
145	/* Go to special inputs processing branch */
146	jne	L(SPECIAL_VALUES_BRANCH)
147	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm6
148
149	/* Restore registers
150	 * and exit the function
151	 */
152
153L(EXIT):
154	movq	%rbp, %rsp
155	popq	%rbp
156	cfi_def_cfa(7, 8)
157	cfi_restore(6)
158	ret
	/* CFI state for the code after the ret, which still runs with the
	   %rbp-based frame.  */
159	cfi_def_cfa(6, 16)
160	cfi_offset(6, -16)
161
162	/* Branch to process
163	 * special inputs
164	 */
165
166L(SPECIAL_VALUES_BRANCH):
	/* Spill the original arguments and the vector result so single
	   lanes can be read and patched through memory.  */
167	vmovups	%zmm6, 64(%rsp)
168	vmovups	%zmm0, 128(%rsp)
169	# LOE rbx r12 r13 r14 r15 edx zmm0
170
	/* %eax = 0: initial lane index.  */
171	xorl	%eax, %eax
172	# LOE rbx r12 r13 r14 r15 eax edx
173
174	vzeroupper
	/* Save %r12-%r14 in the scratch area, then reuse them:
	   %r12d = lane index, %r13d = special-lane mask,
	   %r14 = index used to address the spilled vectors.  */
175	movq	%r12, 16(%rsp)
176	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
177	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
178	movl	%eax, %r12d
179	movq	%r13, 8(%rsp)
180	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
181	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
182	movl	%edx, %r13d
183	movq	%r14, (%rsp)
184	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
185	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
186	# LOE rbx r15 r12d r13d
187
188	/* Range mask
189	 * bits check
190	 */
191
192L(RANGEMASK_CHECK):
	/* CF = bit %r12d of the mask: set when this lane is special.  */
193	btl	%r12d, %r13d
194
195	/* Call scalar math function */
196	jc	L(SCALAR_MATH_CALL)
197	# LOE rbx r15 r12d r13d
198
199	/* Special inputs
200	 * processing loop
201	 */
202
203L(SPECIAL_VALUES_LOOP):
	/* Advance to the next lane; lanes 0..7 are visited.  */
204	incl	%r12d
205	cmpl	$8, %r12d
206
207	/* Check bits in range mask */
208	jl	L(RANGEMASK_CHECK)
209	# LOE rbx r15 r12d r13d
210
	/* All lanes done: restore the saved registers and reload the
	   patched result vector.  */
211	movq	16(%rsp), %r12
212	cfi_restore(12)
213	movq	8(%rsp), %r13
214	cfi_restore(13)
215	movq	(%rsp), %r14
216	cfi_restore(14)
217	vmovups	128(%rsp), %zmm0
218
219	/* Go to exit */
220	jmp	L(EXIT)
	/* The code below is reached only from inside the loop, where
	   %r12-%r14 are still saved on the stack, so their CFI
	   expressions are declared again.  */
221	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
222	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
223	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
224	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
225	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
226	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
227	# LOE rbx r12 r13 r14 r15 zmm0
228
229	/* Scalar math function call
230	 * to process special input
231	 */
232
233L(SCALAR_MATH_CALL):
	/* Zero-extend the lane index into %r14 for use in the addresses.  */
234	movl	%r12d, %r14d
	/* Load this lane's original argument from the spilled input.  */
235	vmovsd	64(%rsp, %r14, 8), %xmm0
236	call	acos@PLT
237	# LOE rbx r14 r15 r12d r13d xmm0
238
	/* Overwrite this lane of the spilled result with the scalar value.  */
239	vmovsd	%xmm0, 128(%rsp, %r14, 8)
240
241	/* Process special inputs in loop */
242	jmp	L(SPECIAL_VALUES_LOOP)
243	# LOE rbx r15 r12d r13d
244END(_ZGVeN8v_acos_skx)
245
246	.section .rodata, "a"
247	.align	64
248
	/* Constant table for _ZGVeN8v_acos_skx.  Every row holds the same
	   64-bit value repeated eight times (64 bytes), so a row can be
	   loaded as one full ZMM vector.  Rows are addressed through the
	   byte-offset macros defined at the top of this file.  The typedef
	   below is compiled only when __svml_dacos_data_internal_typedef
	   is defined and mirrors the row layout as a C structure.  */
249#ifdef __svml_dacos_data_internal_typedef
250typedef unsigned int VUINT32;
251typedef struct {
252	__declspec(align(64)) VUINT32 SgnBit[8][2];
253	__declspec(align(64)) VUINT32 OneHalf[8][2];
254	__declspec(align(64)) VUINT32 SmallNorm[8][2];
255	__declspec(align(64)) VUINT32 MOne[8][2];
256	__declspec(align(64)) VUINT32 Two[8][2];
257	__declspec(align(64)) VUINT32 sqrt_coeff[4][8][2];
258	__declspec(align(64)) VUINT32 poly_coeff[12][8][2];
259	__declspec(align(64)) VUINT32 PiH[8][2];
260	__declspec(align(64)) VUINT32 Pi2H[8][2];
261} __svml_dacos_data_internal;
262#endif
263__svml_dacos_data_internal:
264	/* SgnBit: only the sign bit set */
265	.quad	0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
266	/* OneHalf: 0.5 */
267	.align	64
268	.quad	0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
269	/* SmallNorm: 2^-255; lanes with Y below this get a zeroed rsqrt estimate */
270	.align	64
271	.quad	0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
272	/* MOne: -1.0; compared against -|arg| to find special lanes */
273	.align	64
274	.quad	0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
275	/* Two: 2.0 */
276	.align	64
277	.quad	0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
278	/* sqrt_coeff[4]: four consecutive 64-byte rows.  The offset macros
	   sqrt_coeff_1 .. sqrt_coeff_4 follow storage order, so sqrt_coeff_1
	   addresses the first row below even though its trailing comment
	   reads sqrt_coeff4.  */
279	.align	64
280	.quad	0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
281	.quad	0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
282	.quad	0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
283	.quad	0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
284	/* poly_coeff[12]: twelve consecutive 64-byte rows.  As above, the
	   offset macros follow storage order: poly_coeff_1 addresses the
	   first row (trailing comment poly_coeff12) and poly_coeff_12 the
	   last row (trailing comment poly_coeff1).  */
285	.align	64
286	.quad	0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
287	.quad	0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
288	.quad	0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
289	.quad	0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
290	.quad	0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
291	.quad	0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
292	.quad	0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
293	.quad	0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
294	.quad	0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
295	.quad	0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
296	.quad	0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
297	.quad	0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
298	/* PiH: double nearest to pi */
299	.align	64
300	.quad	0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
301	/* Pi2H: double nearest to pi/2 */
302	.align	64
303	.quad	0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
304	.align	64
305	.type	__svml_dacos_data_internal, @object
306	.size	__svml_dacos_data_internal, .-__svml_dacos_data_internal
307