1/* Function cbrt vectorized with AVX-512.
2   Copyright (C) 2021-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   https://www.gnu.org/licenses/.  */
18
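19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 *   x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b52
23 *   Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)* rcp[b1 b2 ..b5],
24 *   where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in double precision
25 *   cbrt(2^j * 1. b1 b2 .. b5 1) is approximated as T[j][b1..b5]+D[j][b1..b5]
26 *   (T stores the high 53 bits, D stores the low order bits)
27 *   Result=2^k*T+(2^k*T*r)*P+2^k*D
28 *   where P=p1+p2*r+..+p8*r^7
29 *
 *   NOTE(review): the description above appears to be the generic scheme.
 *   The code in this file differs in detail: it forms
 *   r = Mantissa*DblRcp - 1 with DblRcp a rounded vrcp14pd estimate,
 *   looks up Th in cbrt_tbl_H by the top bits of DblRcp and Sh/Sl in
 *   etbl_H/etbl_L by j, evaluates a polynomial with ten coefficients
 *   (poly_coeff1..poly_coeff10), and returns
 *   (Th*2^k)*(Sh+Sl+Sh*r*P).  Confirm before relying on the text above.
 *
30 */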
31
32/* Offsets for data table __svml_dcbrt_data_internal_avx512
33 */
/* Each value is a byte offset from the table label and is used below as
   OFFSET+__svml_dcbrt_data_internal_avx512(%rip).  Every field is one
   64-byte vector (eight .quad entries) except cbrt_tbl_H, which holds
   sixteen entries (128 bytes), hence the step from 128 to 256.  The
   values must stay in sync with the order and size of the fields in the
   table definition.  */
34#define etbl_H				0
35#define etbl_L				64
36#define cbrt_tbl_H			128
37#define BiasL				256
38#define SZero				320
39#define OneThird			384
40#define Bias3				448
41#define Three				512
42#define One				576
/* Polynomial coefficients are stored from the highest-order one
   (poly_coeff10) down to the lowest (poly_coeff1).  */
43#define poly_coeff10			640
44#define poly_coeff9			704
45#define poly_coeff8			768
46#define poly_coeff7			832
47#define poly_coeff6			896
48#define poly_coeff5			960
49#define poly_coeff4			1024
50#define poly_coeff3			1088
51#define poly_coeff2			1152
52#define poly_coeff1			1216
53
54#include <sysdep.h>
55
56	.section .text.evex512, "ax", @progbits
/* _ZGVeN8v_cbrt_skx: cube root of the eight packed doubles in %zmm0.

   In:       %zmm0 = x.
   Out:      %zmm0 = result, with the sign bit of x OR-ed back in.
   Clobbers: %zmm1-%zmm15.
   Straight-line code: no branches, no calls, no stack accesses.  All
   constants are loaded %rip-relative from
   __svml_dcbrt_data_internal_avx512.

   What the instruction stream below computes:
     R      = Mantissa*DblRcp - 1, DblRcp = rounded vrcp14pd estimate
     k      = floor(exponent/3),   j = exponent - 3*k
     Sh, Sl = etbl_H[j], etbl_L[j]
     Th     = cbrt_tbl_H[index taken from the top bits of DblRcp]
     Poly   = c1 + c2*R + ... + c10*R^9
     result = (Th*2^k) * (Sh + Sl + Sh*R*Poly), then OR sign(x).

   Independent computations are interleaved, so several registers are
   reused for unrelated values; the trailing comments track the current
   contents.  */
57ENTRY(_ZGVeN8v_cbrt_skx)
58	vgetmantpd $0, {sae}, %zmm0, %zmm14	/* zmm14 = Mantissa of x (imm8 = 0) */
59
60	/* GetExp(x) */
61	vgetexppd {sae}, %zmm0, %zmm7	/* zmm7 = exponent of x, as a double */
62	vmovups	BiasL+__svml_dcbrt_data_internal_avx512(%rip), %zmm8
63
64	/* exponent/3 */
65	vmovups	OneThird+__svml_dcbrt_data_internal_avx512(%rip), %zmm9
66	vmovups	Bias3+__svml_dcbrt_data_internal_avx512(%rip), %zmm10
67
68	/* Reduced argument: R = DblRcp*Mantissa - 1 */
69	vmovups	One+__svml_dcbrt_data_internal_avx512(%rip), %zmm2
70
71	/* exponent%3 (to be used as index) */
72	vmovups	Three+__svml_dcbrt_data_internal_avx512(%rip), %zmm11
73
74	/* DblRcp ~ 1/Mantissa */
75	vrcp14pd %zmm14, %zmm13
76	vaddpd	{rn-sae}, %zmm8, %zmm7, %zmm12	/* zmm12 = exponent + BiasL */
77	vandpd	SZero+__svml_dcbrt_data_internal_avx512(%rip), %zmm0, %zmm6	/* zmm6 = sign bit of x, OR-ed into the result at the end */
78
79	/* round DblRcp to 3 fractional bits (RN mode, no Precision exception) */
	/* NOTE(review): imm8 = 72 = 0x48 encodes a scale field of 4 (bits 7:4)
	   with bit 3 set; presumably "3 fractional bits" counts mantissa bits
	   of a reciprocal lying in [0.5, 1] - confirm against the VRNDSCALEPD
	   definition.  */
80	vrndscalepd $72, {sae}, %zmm13, %zmm15	/* zmm15 = rounded DblRcp */
81	vfmsub231pd {rn-sae}, %zmm12, %zmm9, %zmm10	/* zmm10 = OneThird*(exponent + BiasL) - Bias3; Bias3 (2^51) is BiasL (1.5*2^52) / 3 */
82
83	/* polynomial */
	/* x itself is dead from here on (its mantissa, exponent and sign are
	   held in zmm14, zmm12 and zmm6), so zmm0 is reused for
	   coefficients.  */
84	vmovups	poly_coeff10+__svml_dcbrt_data_internal_avx512(%rip), %zmm0
85	vmovups	poly_coeff8+__svml_dcbrt_data_internal_avx512(%rip), %zmm7
86	vmovups	poly_coeff7+__svml_dcbrt_data_internal_avx512(%rip), %zmm9
87	vfmsub231pd {rn-sae}, %zmm15, %zmm14, %zmm2	/* zmm2 = R = Mantissa*DblRcp - One */
88	vrndscalepd $9, {sae}, %zmm10, %zmm5	/* zmm5 = k: exponent/3 rounded to an integer (imm8 = 9: round down, no Precision exception) */
89
90	/* Table lookup */
91	vmovups	cbrt_tbl_H+__svml_dcbrt_data_internal_avx512(%rip), %zmm10	/* zmm10 = first 8 entries of cbrt_tbl_H */
92	vmovups	poly_coeff6+__svml_dcbrt_data_internal_avx512(%rip), %zmm8
93	vmovups	poly_coeff3+__svml_dcbrt_data_internal_avx512(%rip), %zmm13
94	vfmadd231pd {rn-sae}, %zmm2, %zmm7, %zmm9	/* zmm9 = c8*R + c7 */
95	vfnmadd231pd {rn-sae}, %zmm5, %zmm11, %zmm12	/* zmm12 = (exponent + BiasL) - Three*k; used below as the permute index j */
96	vmovups	poly_coeff5+__svml_dcbrt_data_internal_avx512(%rip), %zmm11
97	vmovups	poly_coeff1+__svml_dcbrt_data_internal_avx512(%rip), %zmm14
98
99	/* Prepare table index */
100	vpsrlq	$49, %zmm15, %zmm1	/* zmm1 = bits 63:49 of rounded DblRcp, shifted down to bit 0 */
101
102	/* Table lookup: 2^(exponent%3) */
103	vpermpd	__svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm4	/* zmm4 = Sh = etbl_H[j] (etbl_H is at offset 0) */
104	vpermpd	etbl_L+__svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm3	/* zmm3 = Sl = etbl_L[j] */
105	vpermt2pd cbrt_tbl_H+64+__svml_dcbrt_data_internal_avx512(%rip), %zmm1, %zmm10	/* zmm10 = Th: two-table lookup over zmm10 (entries 0-7) and memory (entries 8-15), indexed by zmm1 */
106	vmovups	poly_coeff9+__svml_dcbrt_data_internal_avx512(%rip), %zmm1
107	vfmadd231pd {rn-sae}, %zmm2, %zmm8, %zmm11	/* zmm11 = c6*R + c5 */
108	vmovups	poly_coeff2+__svml_dcbrt_data_internal_avx512(%rip), %zmm12
109	vscalefpd {rn-sae}, %zmm5, %zmm10, %zmm15	/* zmm15 = scaled_Th = Th * 2^k */
110	vfmadd231pd {rn-sae}, %zmm2, %zmm0, %zmm1	/* zmm1 = c10*R + c9 */
111	vmovups	poly_coeff4+__svml_dcbrt_data_internal_avx512(%rip), %zmm5
112	vfmadd231pd {rn-sae}, %zmm2, %zmm12, %zmm14	/* zmm14 = c2*R + c1 */
113	vmulpd	{rn-sae}, %zmm2, %zmm2, %zmm0	/* zmm0 = R^2 */
114	vfmadd231pd {rn-sae}, %zmm2, %zmm5, %zmm13	/* zmm13 = c4*R + c3 */
115
116	/* Sh*R */
117	vmulpd	{rn-sae}, %zmm2, %zmm4, %zmm2	/* zmm2 = Sh*R (R alone is no longer needed) */
	/* Combine the five coefficient pairs with a Horner chain in R^2;
	   after the last step zmm1 = Poly = c1 + c2*R + ... + c10*R^9.  */
118	vfmadd213pd {rn-sae}, %zmm9, %zmm0, %zmm1	/* zmm1 = R^2*zmm1 + (c8*R + c7) */
119	vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1	/* zmm1 = R^2*zmm1 + (c6*R + c5) */
120	vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm1	/* zmm1 = R^2*zmm1 + (c4*R + c3) */
121	vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm1	/* zmm1 = R^2*zmm1 + (c2*R + c1) */
122
123	/* Sl + (Sh*R)*Poly */
124	vfmadd213pd {rn-sae}, %zmm3, %zmm1, %zmm2
125
126	/*
127	 * branch-free
128	 * scaled_Th*(Sh+Sl+Sh*R*Poly)
129	 */
130	vaddpd	{rn-sae}, %zmm4, %zmm2, %zmm3	/* zmm3 = Sh + (Sl + Sh*R*Poly) */
131	vmulpd	{rn-sae}, %zmm15, %zmm3, %zmm4	/* zmm4 = scaled_Th * zmm3 */
132	vorpd	%zmm6, %zmm4, %zmm0	/* zmm0 = zmm4 with the sign bit of x OR-ed in */
133	ret
134
135END(_ZGVeN8v_cbrt_skx)
136
	/* Read-only constants for _ZGVeN8v_cbrt_skx, aligned to 64 bytes
	   (the size of one zmm register).  */
137	.section .rodata, "a"
138	.align	64
139
/* C-style description of the table layout.  It is only seen by the
   preprocessor's consumer when __svml_dcbrt_data_internal_avx512_typedef
   is defined; nothing in this file defines that macro, so here it serves
   as layout documentation.  Each VUINT32 [n][2] field is n 64-bit
   entries written as pairs of 32-bit words, in the same order as the
   offset macros and the .quad data that follow.  */
140#ifdef __svml_dcbrt_data_internal_avx512_typedef
141typedef unsigned int VUINT32;
142typedef struct {
143	__declspec(align(64)) VUINT32 etbl_H[8][2];
144	__declspec(align(64)) VUINT32 etbl_L[8][2];
145	__declspec(align(64)) VUINT32 cbrt_tbl_H[16][2];
146	__declspec(align(64)) VUINT32 BiasL[8][2];
147	__declspec(align(64)) VUINT32 SZero[8][2];
148	__declspec(align(64)) VUINT32 OneThird[8][2];
149	__declspec(align(64)) VUINT32 Bias3[8][2];
150	__declspec(align(64)) VUINT32 Three[8][2];
151	__declspec(align(64)) VUINT32 One[8][2];
152	__declspec(align(64)) VUINT32 poly_coeff10[8][2];
153	__declspec(align(64)) VUINT32 poly_coeff9[8][2];
154	__declspec(align(64)) VUINT32 poly_coeff8[8][2];
155	__declspec(align(64)) VUINT32 poly_coeff7[8][2];
156	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
157	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
158	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
159	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
160	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
161	__declspec(align(64)) VUINT32 poly_coeff1[8][2];
162} __svml_dcbrt_data_internal_avx512;
163#endif
/* Constant table for _ZGVeN8v_cbrt_skx.  All entries are IEEE-754 binary64
   bit patterns.  Field order and sizes must match the offset macros
   defined near the top of this file.  Apart from the three lookup tables,
   every field is one value replicated across eight lanes.  */
164__svml_dcbrt_data_internal_avx512:
165	/* etbl_H: Sh, looked up with vpermpd by j = exponent - 3*k.
	   Entries 0..2 are 1.0, ~cbrt(2) and ~cbrt(4); entries 4..6 are the
	   same values negated; entries 3 and 7 are zero.  */
166	.quad	0x3ff0000000000000
167	.quad	0x3ff428a2f98d728b
168	.quad	0x3ff965fea53d6e3d
169	.quad	0x0000000000000000
170	.quad	0xbff0000000000000
171	.quad	0xbff428a2f98d728b
172	.quad	0xbff965fea53d6e3d
173	.quad	0x0000000000000000
174	/* etbl_L: Sl, the low-order companion of etbl_H, looked up with
	   the same index and added to Sh in the final sum.  */
175	.align	64
176	.quad	0x0000000000000000
177	.quad	0xbc7ddc22548ea41e
178	.quad	0xbc9f53e999952f09
179	.quad	0x0000000000000000
180	.quad	0x0000000000000000
181	.quad	0x3c7ddc22548ea41e
182	.quad	0x3c9f53e999952f09
183	.quad	0x0000000000000000
184	/* cbrt_tbl_H: Th, sixteen entries looked up with vpermt2pd by the
	   top bits of the rounded reciprocal.  Entries 0..8 run from
	   ~cbrt(2) down to 1.0; entries 9..15 are zero.
	   NOTE(review): entry i (0..8) appears to be cbrt(1/(0.5 + i/16)) -
	   confirm against the table generator.  */
185	.align	64
186	.quad	0x3ff428a2f98d728b
187	.quad	0x3ff361f35ca116ff
188	.quad	0x3ff2b6b5edf6b54a
189	.quad	0x3ff220e6dd675180
190	.quad	0x3ff19c3b38e975a8
191	.quad	0x3ff12589c21fb842
192	.quad	0x3ff0ba6ee5f9aad4
193	.quad	0x3ff059123d3a9848
194	.quad	0x3ff0000000000000
195	.quad	0x0000000000000000
196	.quad	0x0000000000000000
197	.quad	0x0000000000000000
198	.quad	0x0000000000000000
199	.quad	0x0000000000000000
200	.quad	0x0000000000000000
201	.quad	0x0000000000000000
202	/* BiasL: 1.5*2^52, added to the exponent before the divide-by-3 step */
203	.align	64
204	.quad	0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000
205	/* SZero: sign-bit mask (only bit 63 set), ANDed with x to extract its sign */
206	.align	64
207	.quad	0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
208	/* OneThird: ~1/3, multiplies (exponent + BiasL) */
209	.align	64
210	.quad	0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556
211	/* Bias3: 2^51 (= BiasL/3), subtracted after the multiply by OneThird */
212	.align	64
213	.quad	0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000
214	/* Three: 3.0, used to form exponent - 3*k */
215	.align	64
216	.quad	0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000
217	/* One: 1.0, subtracted to form R = Mantissa*DblRcp - 1 */
218	.align	64
219	.quad	0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* Polynomial coefficients c10 down to c1 for
	   Poly = c1 + c2*R + ... + c10*R^9; signs alternate, and c1 is
	   ~1/3.  */
220	/* poly_coeff10 */
221	.align	64
222	.quad	0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62
223	/* poly_coeff9 */
224	.align	64
225	.quad	0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875
226	/* poly_coeff8 */
227	.align	64
228	.quad	0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f
229	/* poly_coeff7 */
230	.align	64
231	.quad	0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914
232	/* poly_coeff6 */
233	.align	64
234	.quad	0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e
235	/* poly_coeff5 */
236	.align	64
237	.quad	0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569
238	/* poly_coeff4 */
239	.align	64
240	.quad	0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e
241	/* poly_coeff3 */
242	.align	64
243	.quad	0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31
244	/* poly_coeff2 */
245	.align	64
246	.quad	0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741
247	/* poly_coeff1 */
248	.align	64
249	.quad	0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557
250	.align	64
	/* Record the symbol as a data object whose size is everything emitted
	   since the table label.  */
251	.type	__svml_dcbrt_data_internal_avx512, @object
252	.size	__svml_dcbrt_data_internal_avx512, .-__svml_dcbrt_data_internal_avx512
253