1/* Function asin vectorized with SSE4.
2   Copyright (C) 2021-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   https://www.gnu.org/licenses/.  */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 *      SelMask = (|x| >= 0.5) ? 1 : 0;
23 *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
24 *      asin(x) = (SelMask ? (Pi/2 - 2*Poly(R)) : Poly(R))*(-1)^sign(x)
25 *
26 */
27
/* Offsets for data table __svml_dasin_data_internal
 *
 * Byte offsets from the start of the table.  Every table entry is one
 * 16-byte vector (two .quad values), so:
 *   AbsMask, OneHalf, SmallNorm, One, Two  - one entry each (0..79)
 *   sqrt_coeff                             - 4 entries  (80..143)
 *   poly_coeff                             - 12 entries (144..335)
 *   Pi2H                                   - one entry  (336..351)
 * These values must be kept in sync with the order and size of the
 * entries emitted under the __svml_dasin_data_internal label.
 */
#define AbsMask				0
#define OneHalf				16
#define SmallNorm			32
#define One				48
#define Two				64
#define sqrt_coeff			80
#define poly_coeff			144
#define Pi2H				336
38
39#include <sysdep.h>
40
41	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN2v_asin_sse4)
	/* asin for two packed doubles.
	 *
	 * In:   xmm0 = two double arguments
	 * Out:  xmm0 = two double results
	 *
	 * Main path (all lanes, branch free):
	 *   xmm5  = original argument
	 *   xmm4  = |x|
	 *   xmm3  = AbsMask, later the sign bits of the argument
	 *   xmm1  = SelMask, all-ones where !(|x| < 0.5)
	 *   xmm2  = R^2 = min(x^2, Y) with Y = 0.5 - 0.5*|x|
	 *   xmm15 = 2*Y*rsqrt(Y), the initial estimate of 2*sqrt(Y)
	 *   edx   = bit mask of lanes with |x| > 1
	 *
	 * Lanes flagged in edx are recomputed one at a time with the
	 * scalar asin and overwrite the vector result for that lane.
	 *
	 * Stack frame (72 bytes, used only on the special path):
	 *    0(%rsp)  saved r14
	 *    8(%rsp)  saved r13
	 *   16(%rsp)  saved r12
	 *   32(%rsp)  copy of the two input doubles
	 *   48(%rsp)  the two result doubles
	 * The return address plus 72 bytes gives a CFA offset of 80, which
	 * is a multiple of 16 as required at the scalar call below.
	 */
	subq	$72, %rsp
	cfi_def_cfa_offset(80)
	/* Keep the original argument; xmm0 is reused as a temporary.  */
	movaps	%xmm0, %xmm5
	/* xmm3 = AbsMask (table offset 0), xmm8 = 0.5 */
	movups	__svml_dasin_data_internal(%rip), %xmm3
	movups	OneHalf+__svml_dasin_data_internal(%rip), %xmm8

	/* x = |arg| */
	movaps	%xmm3, %xmm4
	andps	%xmm5, %xmm4

	/* Y = 0.5 - 0.5*x */
	movaps	%xmm8, %xmm6
	mulpd	%xmm4, %xmm6
	movaps	%xmm8, %xmm14

	/* x^2 */
	movaps	%xmm4, %xmm2
	subpd	%xmm6, %xmm14
	mulpd	%xmm4, %xmm2

	/* S ~ -2*sqrt(Y) */
	/* Single-precision reciprocal square root of Y is used as the
	 * starting estimate; it is widened back to double below.  */
	cvtpd2ps %xmm14, %xmm9
	/* xmm2 = min(x^2, Y): the squared polynomial argument R^2.  */
	minpd	%xmm14, %xmm2
	movlhps	%xmm9, %xmm9
	movaps	%xmm14, %xmm15
	rsqrtps	%xmm9, %xmm10
	/* xmm15 = mask of lanes with Y < SmallNorm.  */
	cmpltpd	SmallNorm+__svml_dasin_data_internal(%rip), %xmm15
	/* xmm14 = 2*Y */
	addpd	%xmm14, %xmm14
	cvtps2pd %xmm10, %xmm11
	/* xmm15 = rsqrt estimate r, forced to zero where Y < SmallNorm
	 * (presumably so a tiny or zero Y cannot produce Inf/NaN in the
	 * products below - confirm against the algorithm authors).  */
	andnps	%xmm11, %xmm15
	movaps	%xmm4, %xmm1
	movaps	%xmm15, %xmm12
	/* xmm3 = arg & ~AbsMask = sign bits of the argument.  */
	andnps	%xmm5, %xmm3
	/* xmm12 = r^2 */
	mulpd	%xmm15, %xmm12
	/* xmm15 = 2*Y*r, the estimate of 2*sqrt(Y) */
	mulpd	%xmm14, %xmm15
	/* xmm14 = 2*Y*r^2 */
	mulpd	%xmm12, %xmm14
	/* xmm1 = SelMask = !(|x| < 0.5) */
	cmpnltpd %xmm8, %xmm1
	/* xmm14 = E = 2*Y*r^2 - 2, the error term of the estimate.  */
	subpd	Two+__svml_dasin_data_internal(%rip), %xmm14

	/* polynomial */
	/* Two independent chains are interleaved from here on:
	 *  - Poly(R^2): degree-11 polynomial in xmm2 = R^2 using
	 *    poly_coeff12 (offset +0) down to poly_coeff1 (offset +176),
	 *    evaluated in pairs combined with R^4 (xmm12) and R^8 (xmm8);
	 *  - the sqrt correction: ((sc4*E + sc3)*E + sc2)*E + sc1 using
	 *    sqrt_coeff4 (offset +0) down to sqrt_coeff1 (offset +48).
	 */
	movups	poly_coeff+__svml_dasin_data_internal(%rip), %xmm6
	movaps	%xmm2, %xmm12
	mulpd	%xmm2, %xmm6
	/* xmm12 = R^4 */
	mulpd	%xmm2, %xmm12
	/* xmm6 = c12*R^2 + c11 */
	addpd	poly_coeff+16+__svml_dasin_data_internal(%rip), %xmm6
	movups	One+__svml_dasin_data_internal(%rip), %xmm7
	movaps	%xmm12, %xmm8
	/* xmm7 = mask of lanes with 1 < |x| (outside the asin domain).  */
	cmpltpd	%xmm4, %xmm7
	mulpd	%xmm12, %xmm6
	/* edx = one bit per lane needing the scalar fallback.  */
	movmskpd %xmm7, %edx
	movups	poly_coeff+32+__svml_dasin_data_internal(%rip), %xmm9
	/* xmm0 = E (start of the sqrt correction chain).  */
	movaps	%xmm14, %xmm0
	movups	poly_coeff+64+__svml_dasin_data_internal(%rip), %xmm7
	mulpd	%xmm2, %xmm9
	mulpd	%xmm2, %xmm7
	/* xmm9 = c10*R^2 + c9, xmm7 = c8*R^2 + c7 */
	addpd	poly_coeff+48+__svml_dasin_data_internal(%rip), %xmm9
	addpd	poly_coeff+80+__svml_dasin_data_internal(%rip), %xmm7
	/* xmm8 = R^8 */
	mulpd	%xmm12, %xmm8
	mulpd	%xmm12, %xmm7
	addpd	%xmm6, %xmm9
	/* xmm0 = E * (2*Y*r) */
	mulpd	%xmm15, %xmm0
	mulpd	%xmm8, %xmm9
	movups	poly_coeff+96+__svml_dasin_data_internal(%rip), %xmm10
	mulpd	%xmm2, %xmm10
	movups	sqrt_coeff+__svml_dasin_data_internal(%rip), %xmm13
	mulpd	%xmm14, %xmm13
	/* xmm10 = c6*R^2 + c5, xmm13 = sc4*E + sc3 */
	addpd	poly_coeff+112+__svml_dasin_data_internal(%rip), %xmm10
	addpd	sqrt_coeff+16+__svml_dasin_data_internal(%rip), %xmm13
	addpd	%xmm7, %xmm10
	mulpd	%xmm14, %xmm13
	addpd	%xmm9, %xmm10
	addpd	sqrt_coeff+32+__svml_dasin_data_internal(%rip), %xmm13
	mulpd	%xmm12, %xmm10
	/* xmm14 is consumed here: it becomes the sqrt correction
	 * polynomial once sqrt_coeff1 is added below.  */
	mulpd	%xmm13, %xmm14
	movups	poly_coeff+128+__svml_dasin_data_internal(%rip), %xmm11
	mulpd	%xmm2, %xmm11
	addpd	sqrt_coeff+48+__svml_dasin_data_internal(%rip), %xmm14
	/* xmm11 = c4*R^2 + c3 */
	addpd	poly_coeff+144+__svml_dasin_data_internal(%rip), %xmm11
	mulpd	%xmm14, %xmm0
	addpd	%xmm10, %xmm11
	/* xmm0 = (2*Y*r)*E*P(E) - 2*Y*r, i.e. the refined S ~ -2*sqrt(Y).  */
	subpd	%xmm15, %xmm0
	mulpd	%xmm11, %xmm12
	movups	poly_coeff+160+__svml_dasin_data_internal(%rip), %xmm13
	/* xmm14 is reused once more, now as a copy of SelMask.  */
	movaps	%xmm1, %xmm14
	mulpd	%xmm2, %xmm13
	/* xmm13 = c2*R^2 + c1 + R^4*(higher terms) = Poly(R^2) */
	addpd	poly_coeff+176+__svml_dasin_data_internal(%rip), %xmm13
	addpd	%xmm12, %xmm13
	/* xmm2 = R^2 * Poly(R^2) */
	mulpd	%xmm13, %xmm2
	/* Select R per lane: |x| where SelMask is clear, S where set.  */
	andnps	%xmm4, %xmm14
	andps	%xmm1, %xmm0
	orps	%xmm0, %xmm14
	/* xmm14 = R + R * R^2 * Poly(R^2) */
	mulpd	%xmm14, %xmm2
	addpd	%xmm2, %xmm14
	/* Add Pi2H only in lanes where SelMask is set.  */
	movups	Pi2H+__svml_dasin_data_internal(%rip), %xmm0
	andps	%xmm1, %xmm0
	addpd	%xmm14, %xmm0
	/* Apply the sign of the original argument.  */
	pxor	%xmm3, %xmm0
	/* Any lane with |x| > 1?  */
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm5

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	addq	$72, %rsp
	cfi_def_cfa_offset(8)
	ret
	/* The code below is reached by the jne above, before the frame
	 * is released, so the CFA offset is 80 again from here on.  */
	cfi_def_cfa_offset(80)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill the inputs and the vector results so individual lanes
	 * can be read and patched through memory.  */
	movups	%xmm5, 32(%rsp)
	movups	%xmm0, 48(%rsp)
	# LOE rbx rbp r12 r13 r14 r15 edx

	/* r12, r13 and r14 are saved because they are used as loop
	 * state across the scalar call:
	 *   r12d = current lane index (starts at 0)
	 *   r13d = lane mask taken from edx
	 *   r14  = lane index used for addressing around the call
	 */
	xorl	%eax, %eax
	movq	%r12, 16(%rsp)
	cfi_offset(12, -64)
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	cfi_offset(13, -72)
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	cfi_offset(14, -80)
	# LOE rbx rbp r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = bit r12d of the lane mask.  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	/* Advance to the next lane; there are two lanes in total.  */
	incl	%r12d
	cmpl	$2, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d

	/* All lanes visited: restore the saved registers and reload the
	 * (possibly patched) result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	movups	48(%rsp), %xmm0

	/* Go to exit */
	jmp	L(EXIT)
	/* The scalar call block below runs while r12-r14 are still saved
	 * in the frame, so their unwind locations are declared again.  */
	cfi_offset(12, -64)
	cfi_offset(13, -72)
	cfi_offset(14, -80)
	# LOE rbx rbp r12 r13 r14 r15 xmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* The 32-bit move zero-extends, so r14 is usable as a 64-bit
	 * index; element size is 8 bytes.  */
	movl	%r12d, %r14d
	/* Load the original input of this lane and call scalar asin.  */
	movsd	32(%rsp, %r14, 8), %xmm0
	call	asin@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0

	/* Overwrite this lane of the stored result vector.  */
	movsd	%xmm0, 48(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx rbp r15 r12d r13d
END(_ZGVbN2v_asin_sse4)
230
	/* Read-only constant table for the asin kernel above.  Every
	 * entry is a 16-byte vector holding the same double bit pattern
	 * in both lanes.  The order and size of the entries must match
	 * the offset macros (AbsMask ... Pi2H) defined at the top of
	 * this file.  */
	.section .rodata, "a"
	.align	16

	/* C view of the table layout.  It is only seen by the
	 * preprocessor when __svml_dasin_data_internal_typedef is
	 * defined; nothing in this file defines that macro, so it
	 * appears to serve as documentation only.  */
#ifdef __svml_dasin_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 AbsMask[2][2];
	__declspec(align(16)) VUINT32 OneHalf[2][2];
	__declspec(align(16)) VUINT32 SmallNorm[2][2];
	__declspec(align(16)) VUINT32 One[2][2];
	__declspec(align(16)) VUINT32 Two[2][2];
	__declspec(align(16)) VUINT32 sqrt_coeff[4][2][2];
	__declspec(align(16)) VUINT32 poly_coeff[12][2][2];
	__declspec(align(16)) VUINT32 Pi2H[2][2];
} __svml_dasin_data_internal;
#endif
__svml_dasin_data_internal:
	/* AbsMask */
	/* All bits except the sign bit: x & AbsMask = |x|.  */
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff
	/* OneHalf */
	/* 0.5 */
	.align	16
	.quad	0x3fe0000000000000, 0x3fe0000000000000
	/* SmallNorm */
	/* 2^-255: the rsqrt estimate is zeroed for Y below this value.  */
	.align	16
	.quad	0x3000000000000000, 0x3000000000000000
	/* One */
	/* 1.0: lanes with |x| above this take the scalar fallback.  */
	.align	16
	.quad	0x3ff0000000000000, 0x3ff0000000000000
	/* Two */
	/* 2.0 */
	.align	16
	.quad	0x4000000000000000, 0x4000000000000000
	/* sqrt_coeff[4] */
	/* Stored highest-order coefficient first, in the order the
	 * Horner evaluation of the sqrt correction consumes them.  */
	.align	16
	.quad	0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
	.quad	0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
	.quad	0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
	.quad	0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
	/* poly_coeff[12] */
	/* Stored highest-order coefficient first: poly_coeff12 is at
	 * poly_coeff+0 and poly_coeff1 at poly_coeff+176.  */
	.align	16
	.quad	0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
	.quad	0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
	.quad	0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
	.quad	0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
	.quad	0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
	.quad	0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
	.quad	0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
	.quad	0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
	.quad	0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
	.quad	0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
	.quad	0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
	.quad	0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
	/* Pi2H */
	/* pi/2 rounded to double; added only in lanes where SelMask
	 * is set.  */
	.align	16
	.quad	0x3ff921fb54442d18, 0x3ff921fb54442d18
	.align	16
	/* Record symbol type and size for the object file.  */
	.type	__svml_dasin_data_internal, @object
	.size	__svml_dasin_data_internal, .-__svml_dasin_data_internal
288