/* Function acosf vectorized with SSE4.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      SelMask = (|x| >= 0.5) ? 1 : 0;
 *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
 *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
 *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
 *
 *
 */

/* Byte offsets into the data table __svml_sacos_data_internal.
 *
 * Every row of the table is four 32-bit words (16 bytes), so consecutive
 * single-row entries are 16 bytes apart.  sqrt_coeff spans two rows
 * (80..111) and poly_coeff spans five rows (112..191); individual rows
 * are addressed as <name>+16*k.
 */
#define SgnBit				0
#define OneHalf				16
#define SmallNorm			32
#define MOne				48
#define Two				64
#define sqrt_coeff			80
#define poly_coeff			112
#define Pi2H				192
#define PiH				208

#include <sysdep.h>

	.section .text.sse4, "ax", @progbits
/* acosf on four packed single-precision lanes.
 *
 * In:   %xmm0 = the four arguments
 * Out:  %xmm0 = the four results
 * Uses: %xmm1-%xmm15, %eax, %edx and flags as scratch.  %r12-%r14 are
 *       touched only on the special-value path, where they are spilled
 *       to the frame and reloaded before returning.
 *
 * Frame (72 bytes below the return address):
 *    0(%rsp)  saved %r14
 *    8(%rsp)  saved %r13
 *   16(%rsp)  saved %r12
 *   32(%rsp)  copy of the input vector
 *   48(%rsp)  result vector, patched one lane at a time
 *
 * Fast path: every lane is evaluated branch-free with masks.  Lanes for
 * which -1 <= -|arg| does not hold (|arg| > 1, or an unordered compare)
 * are then recomputed one by one through the scalar acosf.
 */
ENTRY(_ZGVbN4v_acosf_sse4)
	subq	$72, %rsp
	cfi_def_cfa_offset(80)

	/* Keep the raw argument: X < X^2 iff X < 0, which is how the
	   negative-argument mask is formed further down.  */
	movaps	%xmm0, %xmm14

	/* xmm3 = SgnBit, xmm5 = 0.5 */
	movups	SgnBit+__svml_sacos_data_internal(%rip), %xmm3
	movups	OneHalf+__svml_sacos_data_internal(%rip), %xmm5

	/* xmm4 = x = -|arg| (sign bit forced on) */
	movaps	%xmm3, %xmm4
	orps	%xmm0, %xmm4

	/* xmm6 = 0.5*x; together with the addps below: Y = 0.5 + 0.5*x */
	movaps	%xmm5, %xmm6
	mulps	%xmm4, %xmm6

	/* xmm13 = x^2, xmm5 = Y */
	movaps	%xmm4, %xmm13
	mulps	%xmm4, %xmm13
	addps	%xmm6, %xmm5

	/*
	 * Square root of Y by reciprocal-sqrt estimate plus one correction:
	 *   r  = rsqrt(Y), forced to 0 where Y < SmallNorm
	 *   Sh = 2*Y*r            (~ 2*sqrt(Y))
	 *   E  = 2*Y*r^2 - 2
	 *   Sl = (sqrt_coeff2*E + sqrt_coeff1) * E * Sh
	 *   2*sqrt(Y) ~ Sh - Sl
	 * The steps are interleaved with the polynomial evaluation.
	 */
	/* xmm8 = rsqrt estimate of Y */
	rsqrtps	%xmm5, %xmm8
	/* xmm13 = R2 = min(x^2, Y): the squared polynomial argument */
	minps	%xmm5, %xmm13
	movaps	%xmm5, %xmm2
	movaps	%xmm13, %xmm1
	/* xmm2 = mask (Y < SmallNorm) */
	cmpltps	SmallNorm+__svml_sacos_data_internal(%rip), %xmm2
	/* xmm1 = SelMask = !(R2 < Y): lanes that take the sqrt path */
	cmpnltps %xmm5, %xmm1
	/* xmm14 = mask (arg < R2): the negative-argument lanes */
	cmpltps	%xmm13, %xmm14
	/* xmm5 = 2*Y */
	addps	%xmm5, %xmm5
	/* xmm2 = r (estimate cleared where Y < SmallNorm) */
	andnps	%xmm8, %xmm2
	movaps	%xmm13, %xmm11
	movaps	%xmm2, %xmm9
	movaps	%xmm1, %xmm6
	/* xmm9 = r^2 */
	mulps	%xmm2, %xmm9
	/* xmm6 = x in the lanes where SelMask is clear, 0 elsewhere */
	andnps	%xmm4, %xmm6
	/* xmm2 = Sh */
	mulps	%xmm5, %xmm2
	/* xmm11 = R2^2 */
	mulps	%xmm13, %xmm11
	/* xmm5 = 2*Y*r^2 */
	mulps	%xmm9, %xmm5
	movups	sqrt_coeff+__svml_sacos_data_internal(%rip), %xmm10
	/* xmm3 = sign bit of the original argument */
	andps	%xmm0, %xmm3

	/*
	 * Polynomial in R2, with c5..c1 = poly_coeff5..poly_coeff1:
	 *   P = R2 * (c1 + R2 * ((c3*R2 + c2) + R2^2 * (c5*R2 + c4)))
	 */
	movups	poly_coeff+__svml_sacos_data_internal(%rip), %xmm12
	movaps	%xmm1, %xmm15
	mulps	%xmm13, %xmm12
	/* xmm5 = E */
	subps	Two+__svml_sacos_data_internal(%rip), %xmm5
	mulps	%xmm5, %xmm10
	/* xmm12 = c5*R2 + c4 */
	addps	poly_coeff+16+__svml_sacos_data_internal(%rip), %xmm12
	/* xmm5 = E*Sh */
	mulps	%xmm2, %xmm5
	/* xmm12 = (c5*R2 + c4) * R2^2 */
	mulps	%xmm11, %xmm12
	/* xmm10 = sqrt_coeff2*E + sqrt_coeff1 */
	addps	sqrt_coeff+16+__svml_sacos_data_internal(%rip), %xmm10
	/* xmm10 = Sl */
	mulps	%xmm5, %xmm10
	movups	poly_coeff+32+__svml_sacos_data_internal(%rip), %xmm5
	/* xmm2 = Sh - Sl */
	subps	%xmm10, %xmm2
	mulps	%xmm13, %xmm5
	movups	MOne+__svml_sacos_data_internal(%rip), %xmm7
	/* Keep the sqrt result only in the SelMask lanes */
	andps	%xmm1, %xmm2
	/* xmm7 = special-lane mask = !(-1 <= x) */
	cmpnleps %xmm4, %xmm7
	/* xmm5 = c3*R2 + c2 */
	addps	poly_coeff+48+__svml_sacos_data_internal(%rip), %xmm5
	/* edx = one bit per special lane */
	movmskps %xmm7, %edx
	/* xmm6 = SelMask ? (Sh - Sl) : x */
	orps	%xmm2, %xmm6
	addps	%xmm12, %xmm5
	mulps	%xmm13, %xmm5
	/* Flip the sign of xmm6 in the lanes whose argument has the sign
	   bit set */
	pxor	%xmm3, %xmm6
	/* xmm7 = PiH in the lanes with SelMask set and arg negative,
	   0 elsewhere (the second mask is applied by the andps below) */
	movups	PiH+__svml_sacos_data_internal(%rip), %xmm7
	andps	%xmm1, %xmm7
	addps	poly_coeff+64+__svml_sacos_data_internal(%rip), %xmm5
	/* xmm5 = P */
	mulps	%xmm13, %xmm5
	andps	%xmm14, %xmm7
	/* xmm5 = P * xmm6 */
	mulps	%xmm6, %xmm5
	/* xmm15 = Pi2H in the lanes where SelMask is clear, 0 elsewhere */
	andnps	Pi2H+__svml_sacos_data_internal(%rip), %xmm15
	/* xmm6 = xmm6 * (1 + P) */
	addps	%xmm5, %xmm6
	/* xmm7 = additive constant selected for each lane */
	addps	%xmm15, %xmm7
	/* xmm7 = result */
	addps	%xmm6, %xmm7
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm7

	/* Common exit: move the result into the return register and
	 * release the frame
	 */

L(EXIT):
	movaps	%xmm7, %xmm0
	addq	$72, %rsp
	cfi_def_cfa_offset(8)
	ret
	cfi_def_cfa_offset(80)

	/* Special inputs: spill the input and the fast-path result so
	 * single lanes can be read and patched through memory
	 */

L(SPECIAL_VALUES_BRANCH):
	movups	%xmm0, 32(%rsp)
	movups	%xmm7, 48(%rsp)
	# LOE rbx rbp r12 r13 r14 r15 edx

	/* r12d = lane index (starts at 0), r13d = special-lane bit mask.
	   r12-r14 are saved here because they are reused below and must
	   hold their values across the scalar call.  */
	xorl	%eax, %eax
	movq	%r12, 16(%rsp)
	cfi_offset(12, -64)
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	cfi_offset(13, -72)
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	cfi_offset(14, -80)
	# LOE rbx rbp r15 r12d r13d

	/* Test the mask bit of the current lane (btl copies it to CF) */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d

	/* Advance to the next lane; the loop covers lanes 0..3 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$4, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d

	/* All lanes done: reload the saved registers and the patched
	   result vector */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	movups	48(%rsp), %xmm7

	/* Go to exit */
	jmp	L(EXIT)
	cfi_offset(12, -64)
	cfi_offset(13, -72)
	cfi_offset(14, -80)
	# LOE rbx rbp r12 r13 r14 r15 xmm7

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* The 32-bit move zero-extends, so r14 is usable as a 64-bit index */
	movl	%r12d, %r14d
	/* Load lane r14 of the saved input */
	movss	32(%rsp, %r14, 4), %xmm0
	call	acosf@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0

	/* Overwrite lane r14 of the saved result */
	movss	%xmm0, 48(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_acosf_sse4)

/* Constant table for the vector acosf kernel.
 *
 * Each row holds one 32-bit IEEE-754 single-precision bit pattern
 * repeated four times (one copy per vector lane) and starts on a
 * 16-byte boundary.  Row order and size must stay in sync with the
 * offset macros (SgnBit ... PiH) defined at the top of this file.
 */
	.section .rodata, "a"
	.align	16

	/* C-style description of the table layout; only seen by the
	   preprocessor when __svml_sacos_data_internal_typedef is defined.  */
#ifdef __svml_sacos_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 SgnBit[4][1];
	__declspec(align(16)) VUINT32 OneHalf[4][1];
	__declspec(align(16)) VUINT32 SmallNorm[4][1];
	__declspec(align(16)) VUINT32 MOne[4][1];
	__declspec(align(16)) VUINT32 Two[4][1];
	__declspec(align(16)) VUINT32 sqrt_coeff[2][4][1];
	__declspec(align(16)) VUINT32 poly_coeff[5][4][1];
	__declspec(align(16)) VUINT32 Pi2H[4][1];
	__declspec(align(16)) VUINT32 PiH[4][1];
} __svml_sacos_data_internal;
#endif
__svml_sacos_data_internal:
	/* SgnBit: sign-bit mask */
	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
	/* OneHalf: 0.5 */
	.align	16
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
	/* SmallNorm: 2^-32 */
	.align	16
	.long	0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
	/* MOne: -1.0 */
	.align	16
	.long	0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
	/* Two: 2.0 */
	.align	16
	.long	0x40000000, 0x40000000, 0x40000000, 0x40000000
	/* sqrt_coeff[2]: about -0.09375 and 0.25 */
	.align	16
	.long	0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
	.long	0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
	/* poly_coeff[5]: stored highest degree first */
	.align	16
	.long	0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
	.long	0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
	.long	0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
	.long	0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
	.long	0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
	/* Pi2H: Pi/2 rounded to single precision */
	.align	16
	.long	0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
	/* PiH: Pi rounded to single precision */
	.align	16
	.long	0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
	.align	16
	.type	__svml_sacos_data_internal, @object
	.size	__svml_sacos_data_internal, .-__svml_sacos_data_internal
