1/* Function asinf vectorized with SSE4.
2   Copyright (C) 2021-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   https://www.gnu.org/licenses/.  */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 *      SelMask = (|x| >= 0.5) ? 1 : 0;
23 *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
24 *      asin(x) = (SelMask ? (Pi/2 - 2*Poly(R)) : Poly(R))*(-1)^sign(x)
25 *
26 *
27 */
28
29/* Offsets for data table __svml_sasin_data_internal
 * Each value is a byte offset from the start of the table to one
 * 16-byte (4 x 32-bit) vector constant.  sqrt_coeff spans two
 * consecutive vectors (80..111) and poly_coeff spans five (112..191);
 * individual coefficients are addressed as <name>+16*k.
 * These numbers must stay in sync with the order of the entries
 * emitted under the __svml_sasin_data_internal label.
30 */
31#define AbsMask				0
32#define OneHalf				16
33#define SmallNorm			32
34#define One				48
35#define Two				64
36#define sqrt_coeff			80
37#define poly_coeff			112
38#define Pi2H				192
39
40#include <sysdep.h>
41
42	.section .text.sse4, "ax", @progbits
/* _ZGVbN4v_asinf_sse4 -- asinf on four packed single-precision lanes.
 *
 * In:    %xmm0 = four float arguments.
 * Out:   %xmm0 = four float results.
 *
 * Fast path (all lanes, branch-free), with t = min(x^2, Y) and
 * Y = 0.5 - 0.5*|x|:
 *   SelMask = !(|x| < 0.5)
 *   R       = SelMask ? ~(-2*sqrt(Y)) : |x|
 *   result  = ((SelMask ? Pi2H : 0) + R + R*t*P(t)) ^ sign(x)
 * where P is the degree-4 polynomial built from poly_coeff1..5 and
 * sqrt(Y) comes from an rsqrtps estimate refined with sqrt_coeff1..2.
 *
 * Slow path: if any lane has 1.0 < |x|, each such lane is recomputed
 * by calling scalar asinf and patched into the result vector.
 *
 * Stack frame (72 bytes; with the return address this makes CFA = rsp+80):
 *    0(%rsp)  saved %r14
 *    8(%rsp)  saved %r13
 *   16(%rsp)  saved %r12
 *   32(%rsp)  copy of the input vector   (slow path only)
 *   48(%rsp)  copy of the result vector  (slow path only)
 * Under the SysV ABI %rsp is 8 mod 16 at entry, so subtracting 72
 * leaves it 16-byte aligned for the call to asinf.
 */
43ENTRY(_ZGVbN4v_asinf_sse4)
44	subq	$72, %rsp
45	cfi_def_cfa_offset(80)
46	movaps	%xmm0, %xmm2	/* xmm2 = original arguments (kept for sign and slow path) */
47	movups	__svml_sasin_data_internal(%rip), %xmm1	/* xmm1 = AbsMask (table offset 0) */
48	movups	OneHalf+__svml_sasin_data_internal(%rip), %xmm5	/* xmm5 = OneHalf */
49
50	/* x = |arg| */
51	movaps	%xmm1, %xmm0
52	andps	%xmm2, %xmm0	/* xmm0 = |x| */
53
54	/* Y = 0.5 - 0.5*x */
55	movaps	%xmm5, %xmm3
56	mulps	%xmm0, %xmm3	/* xmm3 = 0.5*|x| */
57	movaps	%xmm5, %xmm8
58
59	/* x^2 */
60	movaps	%xmm0, %xmm14
61	movaps	%xmm0, %xmm15
62	mulps	%xmm0, %xmm14	/* xmm14 = x^2 */
63	subps	%xmm3, %xmm8	/* xmm8 = Y */
64	cmpnltps %xmm5, %xmm15	/* xmm15 = SelMask: all-ones where !(|x| < 0.5) */
65
66	/* SQ ~ -2*sqrt(Y) */
67	rsqrtps	%xmm8, %xmm6	/* xmm6 ~ 1/sqrt(Y) (hardware estimate) */
68	minps	%xmm8, %xmm14	/* xmm14 = t = min(x^2, Y), the polynomial argument */
69	movaps	%xmm8, %xmm9
70	movaps	%xmm14, %xmm10
71	cmpltps	SmallNorm+__svml_sasin_data_internal(%rip), %xmm9	/* xmm9 = mask (Y < SmallNorm) */
72	mulps	%xmm14, %xmm10	/* xmm10 = t^2 */
73	addps	%xmm8, %xmm8	/* xmm8 = 2*Y */
74	andnps	%xmm6, %xmm9	/* xmm9 = rsqrt estimate, forced to 0 where Y < SmallNorm */
75	movaps	%xmm15, %xmm3
76	movaps	%xmm9, %xmm7
77	andnps	%xmm0, %xmm3	/* xmm3 = |x| in lanes with SelMask clear, 0 elsewhere */
78	mulps	%xmm9, %xmm7	/* xmm7 = rsqrt^2 */
79	andnps	%xmm2, %xmm1	/* xmm1 = arg & ~AbsMask: the bits AbsMask clears (sign) */
80	mulps	%xmm8, %xmm9	/* xmm9 = 2*Y*rsqrt ~ 2*sqrt(Y) */
81	mulps	%xmm7, %xmm8	/* xmm8 = 2*Y*rsqrt^2 */
82
83	/* polynomial */
84	movups	poly_coeff+__svml_sasin_data_internal(%rip), %xmm11
85	mulps	%xmm14, %xmm11	/* xmm11 = poly_coeff5*t */
86	subps	Two+__svml_sasin_data_internal(%rip), %xmm8	/* xmm8 = E = 2*Y*rsqrt^2 - 2 (estimate residual) */
87	movups	poly_coeff+32+__svml_sasin_data_internal(%rip), %xmm12
88	mulps	%xmm14, %xmm12	/* xmm12 = poly_coeff3*t */
89	addps	poly_coeff+16+__svml_sasin_data_internal(%rip), %xmm11	/* + poly_coeff4 */
90	mulps	%xmm10, %xmm11	/* xmm11 = (poly_coeff5*t + poly_coeff4)*t^2 */
91	addps	poly_coeff+48+__svml_sasin_data_internal(%rip), %xmm12	/* + poly_coeff2 */
92	movups	sqrt_coeff+__svml_sasin_data_internal(%rip), %xmm13	/* xmm13 = sqrt_coeff2 */
93	addps	%xmm11, %xmm12	/* combine high and low halves of the polynomial */
94	mulps	%xmm8, %xmm13	/* xmm13 = sqrt_coeff2*E */
95	mulps	%xmm9, %xmm8	/* xmm8 = E * (2*Y*rsqrt) */
96	mulps	%xmm14, %xmm12
97	addps	sqrt_coeff+16+__svml_sasin_data_internal(%rip), %xmm13	/* xmm13 = sqrt_coeff2*E + sqrt_coeff1 */
98	addps	poly_coeff+64+__svml_sasin_data_internal(%rip), %xmm12	/* xmm12 = P(t), constant term poly_coeff1 */
99	mulps	%xmm8, %xmm13	/* xmm13 = correction term for the sqrt estimate */
100	mulps	%xmm12, %xmm14	/* xmm14 = t*P(t) */
101	subps	%xmm9, %xmm13	/* xmm13 = correction - 2*Y*rsqrt ~ -2*sqrt(Y) */
102	andps	%xmm15, %xmm13	/* keep it only in SelMask lanes */
103	orps	%xmm13, %xmm3	/* xmm3 = R: |x| or ~ -2*sqrt(Y) per lane */
104	mulps	%xmm3, %xmm14	/* xmm14 = R*t*P(t) */
105	movups	One+__svml_sasin_data_internal(%rip), %xmm4
106	addps	%xmm14, %xmm3	/* xmm3 = R + R*t*P(t) */
107	cmpltps	%xmm0, %xmm4	/* xmm4 = mask (1.0 < |x|): lanes needing the scalar fallback */
108	movups	Pi2H+__svml_sasin_data_internal(%rip), %xmm0
109	andps	%xmm15, %xmm0	/* xmm0 = Pi2H in SelMask lanes, 0 elsewhere */
110	movmskps %xmm4, %edx	/* edx = one bit per lane of the fallback mask */
111	addps	%xmm3, %xmm0
112	pxor	%xmm1, %xmm0	/* xor the argument's sign bits into the result */
113	testl	%edx, %edx
114
115	/* Go to special inputs processing branch */
116	jne	L(SPECIAL_VALUES_BRANCH)
117	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm2
118
119	/* Restore registers
120	 * and exit the function
121	 */
122
123L(EXIT):
124	addq	$72, %rsp
125	cfi_def_cfa_offset(8)
126	ret
127	cfi_def_cfa_offset(80)	/* code below still runs with the 72-byte frame allocated */
128
129	/* Branch to process
130	 * special inputs
131	 */
132
133L(SPECIAL_VALUES_BRANCH):
134	movups	%xmm2, 32(%rsp)	/* spill inputs so single lanes can be reloaded */
135	movups	%xmm0, 48(%rsp)	/* spill fast-path results; fallback lanes get overwritten */
136	# LOE rbx rbp r12 r13 r14 r15 edx
137
138	xorl	%eax, %eax
139	movq	%r12, 16(%rsp)
140	cfi_offset(12, -64)
141	movl	%eax, %r12d	/* r12d = lane index, starts at 0 */
142	movq	%r13, 8(%rsp)
143	cfi_offset(13, -72)
144	movl	%edx, %r13d	/* r13d = fallback lane mask */
145	movq	%r14, (%rsp)
146	cfi_offset(14, -80)
147	# LOE rbx rbp r15 r12d r13d
148
149	/* Range mask
150	 * bits check
151	 */
152
153L(RANGEMASK_CHECK):
154	btl	%r12d, %r13d	/* CF = mask bit for the current lane */
155
156	/* Call scalar math function */
157	jc	L(SCALAR_MATH_CALL)
158	# LOE rbx rbp r15 r12d r13d
159
160	/* Special inputs
161	 * processing loop
162	 */
163
164L(SPECIAL_VALUES_LOOP):
165	incl	%r12d
166	cmpl	$4, %r12d	/* four lanes per vector */
167
168	/* Check bits in range mask */
169	jl	L(RANGEMASK_CHECK)
170	# LOE rbx rbp r15 r12d r13d
171
172	movq	16(%rsp), %r12
173	cfi_restore(12)
174	movq	8(%rsp), %r13
175	cfi_restore(13)
176	movq	(%rsp), %r14
177	cfi_restore(14)
178	movups	48(%rsp), %xmm0	/* reload the patched result vector as the return value */
179
180	/* Go to exit */
181	jmp	L(EXIT)
182	cfi_offset(12, -64)
183	cfi_offset(13, -72)
184	cfi_offset(14, -80)
185	# LOE rbx rbp r12 r13 r14 r15 xmm0
186
187	/* Scalar math function call
188	 * to process special input
189	 */
190
191L(SCALAR_MATH_CALL):
192	movl	%r12d, %r14d	/* r14 = lane index, zero-extended for use in addressing */
193	movss	32(%rsp, %r14, 4), %xmm0	/* load this lane's original argument */
194	call	asinf@PLT
195	# LOE rbx rbp r14 r15 r12d r13d xmm0
196
197	movss	%xmm0, 48(%rsp, %r14, 4)	/* overwrite this lane of the spilled result */
198
199	/* Process special inputs in loop */
200	jmp	L(SPECIAL_VALUES_LOOP)
201	# LOE rbx rbp r15 r12d r13d
202END(_ZGVbN4v_asinf_sse4)
203
204	.section .rodata, "a"
205	.align	16
206
/* Constant table for the vector asinf kernel.  Every entry is one
 * 32-bit value replicated across four lanes (16 bytes), and every entry
 * starts on a 16-byte boundary.  The code addresses entries by fixed
 * byte offset from the table label, so entries must not be reordered,
 * added or removed without updating those offsets.
 * The typedef under the #ifdef below only describes the layout; it is
 * compiled out unless __svml_sasin_data_internal_typedef is defined.
 */
207#ifdef __svml_sasin_data_internal_typedef
208typedef unsigned int VUINT32;
209typedef struct {
210	__declspec(align(16)) VUINT32 AbsMask[4][1];
211	__declspec(align(16)) VUINT32 OneHalf[4][1];
212	__declspec(align(16)) VUINT32 SmallNorm[4][1];
213	__declspec(align(16)) VUINT32 One[4][1];
214	__declspec(align(16)) VUINT32 Two[4][1];
215	__declspec(align(16)) VUINT32 sqrt_coeff[2][4][1];
216	__declspec(align(16)) VUINT32 poly_coeff[5][4][1];
217	__declspec(align(16)) VUINT32 Pi2H[4][1];
218} __svml_sasin_data_internal;
219#endif
220__svml_sasin_data_internal:
221	/* AbsMask */
	/* All bits set except the IEEE-754 single-precision sign bit.  */
222	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
223	/* OneHalf */
	/* Bit pattern of 0.5f.  */
224	.align	16
225	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
226	/* SmallNorm */
	/* Bit pattern of 2^-32.  */
227	.align	16
228	.long	0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
229	/* One */
	/* Bit pattern of 1.0f.  */
230	.align	16
231	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
232	/* Two */
	/* Bit pattern of 2.0f.  */
233	.align	16
234	.long	0x40000000, 0x40000000, 0x40000000, 0x40000000
235	/* sqrt_coeff[2] */
	/* Stored highest-numbered coefficient first: ~ -0.09375, then ~ 0.25.  */
236	.align	16
237	.long	0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
238	.long	0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
239	/* poly_coeff[5] */
	/* Stored highest-numbered coefficient first; poly_coeff1 is ~ 1/6.  */
240	.align	16
241	.long	0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
242	.long	0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
243	.long	0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
244	.long	0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
245	.long	0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
246	/* Pi2H */
	/* Bit pattern of ~ 1.5707964f (pi/2 rounded to single precision).  */
247	.align	16
248	.long	0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
249	.align	16
250	.type	__svml_sasin_data_internal, @object
251	.size	__svml_sasin_data_internal, .-__svml_sasin_data_internal
252