1/* Function exp10 vectorized with AVX-512.
2   Copyright (C) 2021-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   https://www.gnu.org/licenses/.  */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *   Typical exp10() implementation, except that:
22 *    - tables are small (16 elements), allowing for fast gathers
23 *    - all arguments processed in the main path
24 *        - final VSCALEF assists branch-free design (correct overflow/underflow and special case responses)
25 *        - a VAND is used to ensure the reduced argument |R|<2, even for large inputs
26 *        - RZ mode used to avoid oveflow to +/-Inf for x*log2(e); helps with special case handling
27 *        - SAE used to avoid spurious flag settings
28 *
29 */
30
31/* Offsets for data table __svml_dexp10_data_internal_avx512
32 */
33#define Exp_tbl_H			0
34#define L2E				128
35#define Shifter				192
36#define L2H				256
37#define L2L				320
38#define EMask				384
39#define poly_coeff6			448
40#define poly_coeff5			512
41#define poly_coeff4			576
42#define poly_coeff3			640
43#define poly_coeff2			704
44#define poly_coeff1			768
45#define AbsMask				832
46#define Threshold			896
47
48#include <sysdep.h>
49
50	.section .text.evex512, "ax", @progbits
51ENTRY(_ZGVeN8v_exp10_skx)
52	pushq	%rbp
53	cfi_def_cfa_offset(16)
54	movq	%rsp, %rbp
55	cfi_def_cfa(6, 16)
56	cfi_offset(6, -16)
57	andq	$-64, %rsp
58	subq	$192, %rsp
59	vmovups	L2E+__svml_dexp10_data_internal_avx512(%rip), %zmm4
60	vmovups	Shifter+__svml_dexp10_data_internal_avx512(%rip), %zmm2
61	vmovups	L2H+__svml_dexp10_data_internal_avx512(%rip), %zmm5
62	vmovups	L2L+__svml_dexp10_data_internal_avx512(%rip), %zmm3
63
64	/* polynomial */
65	vmovups	poly_coeff6+__svml_dexp10_data_internal_avx512(%rip), %zmm6
66	vmovups	poly_coeff4+__svml_dexp10_data_internal_avx512(%rip), %zmm7
67	vmovups	poly_coeff3+__svml_dexp10_data_internal_avx512(%rip), %zmm9
68	vmovups	poly_coeff2+__svml_dexp10_data_internal_avx512(%rip), %zmm8
69	vmovups	poly_coeff1+__svml_dexp10_data_internal_avx512(%rip), %zmm11
70	vmovups	Threshold+__svml_dexp10_data_internal_avx512(%rip), %zmm14
71	vmovaps	%zmm0, %zmm1
72
73	/* 2^(52-4)*1.5 + x * log2(e) */
74	vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm4
75	vandpd	AbsMask+__svml_dexp10_data_internal_avx512(%rip), %zmm1, %zmm13
76
77	/* Z0 ~ x*log2(e), rounded down to 4 fractional bits */
78	vsubpd	{rn-sae}, %zmm2, %zmm4, %zmm0
79
80	/* Table lookup: Th */
81	vmovups	__svml_dexp10_data_internal_avx512(%rip), %zmm2
82	vcmppd	$29, {sae}, %zmm14, %zmm13, %k0
83
84	/* R = x - Z0*log(2) */
85	vfnmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm5
86	vpermt2pd Exp_tbl_H+64+__svml_dexp10_data_internal_avx512(%rip), %zmm4, %zmm2
87	kmovw	%k0, %edx
88	vfnmadd231pd {rn-sae}, %zmm0, %zmm3, %zmm5
89	vmovups	poly_coeff5+__svml_dexp10_data_internal_avx512(%rip), %zmm3
90
91	/* ensure |R|<2 even for special cases */
92	vandpd	EMask+__svml_dexp10_data_internal_avx512(%rip), %zmm5, %zmm12
93	vmulpd	{rn-sae}, %zmm12, %zmm12, %zmm10
94	vmulpd	{rn-sae}, %zmm12, %zmm2, %zmm15
95	vfmadd231pd {rn-sae}, %zmm12, %zmm6, %zmm3
96	vfmadd231pd {rn-sae}, %zmm12, %zmm7, %zmm9
97	vfmadd231pd {rn-sae}, %zmm12, %zmm8, %zmm11
98	vfmadd213pd {rn-sae}, %zmm9, %zmm10, %zmm3
99	vfmadd213pd {rn-sae}, %zmm11, %zmm10, %zmm3
100	vfmadd213pd {rn-sae}, %zmm2, %zmm15, %zmm3
101	vscalefpd {rn-sae}, %zmm0, %zmm3, %zmm0
102	testl	%edx, %edx
103
104	/* Go to special inputs processing branch */
105	jne	L(SPECIAL_VALUES_BRANCH)
106	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
107
108	/* Restore registers
109	 * and exit the function
110	 */
111
112L(EXIT):
113	movq	%rbp, %rsp
114	popq	%rbp
115	cfi_def_cfa(7, 8)
116	cfi_restore(6)
117	ret
118	cfi_def_cfa(6, 16)
119	cfi_offset(6, -16)
120
121	/* Branch to process
122	 * special inputs
123	 */
124
125L(SPECIAL_VALUES_BRANCH):
126	vmovups	%zmm1, 64(%rsp)
127	vmovups	%zmm0, 128(%rsp)
128	# LOE rbx r12 r13 r14 r15 edx zmm0
129
130	xorl	%eax, %eax
131	# LOE rbx r12 r13 r14 r15 eax edx
132
133	vzeroupper
134	movq	%r12, 16(%rsp)
135	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
136	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
137	movl	%eax, %r12d
138	movq	%r13, 8(%rsp)
139	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
140	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
141	movl	%edx, %r13d
142	movq	%r14, (%rsp)
143	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
144	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
145	# LOE rbx r15 r12d r13d
146
147	/* Range mask
148	 * bits check
149	 */
150
151L(RANGEMASK_CHECK):
152	btl	%r12d, %r13d
153
154	/* Call scalar math function */
155	jc	L(SCALAR_MATH_CALL)
156	# LOE rbx r15 r12d r13d
157
158	/* Special inputs
159	 * processing loop
160	 */
161
162L(SPECIAL_VALUES_LOOP):
163	incl	%r12d
164	cmpl	$8, %r12d
165
166	/* Check bits in range mask */
167	jl	L(RANGEMASK_CHECK)
168	# LOE rbx r15 r12d r13d
169
170	movq	16(%rsp), %r12
171	cfi_restore(12)
172	movq	8(%rsp), %r13
173	cfi_restore(13)
174	movq	(%rsp), %r14
175	cfi_restore(14)
176	vmovups	128(%rsp), %zmm0
177
178	/* Go to exit */
179	jmp	L(EXIT)
180	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
181	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
182	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
183	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
184	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
185	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
186	# LOE rbx r12 r13 r14 r15 zmm0
187
188	/* Scalar math fucntion call
189	 * to process special input
190	 */
191
192L(SCALAR_MATH_CALL):
193	movl	%r12d, %r14d
194	vmovsd	64(%rsp, %r14, 8), %xmm0
195	call	exp10@PLT
196	# LOE rbx r14 r15 r12d r13d xmm0
197
198	vmovsd	%xmm0, 128(%rsp, %r14, 8)
199
200	/* Process special inputs in loop */
201	jmp	L(SPECIAL_VALUES_LOOP)
202	# LOE rbx r15 r12d r13d
203END(_ZGVeN8v_exp10_skx)
204
205	.section .rodata, "a"
206	.align	64
207
208#ifdef __svml_dexp10_data_internal_avx512_typedef
209typedef unsigned int VUINT32;
210typedef struct {
211	__declspec(align(64)) VUINT32 Exp_tbl_H[16][2];
212	__declspec(align(64)) VUINT32 L2E[8][2];
213	__declspec(align(64)) VUINT32 Shifter[8][2];
214	__declspec(align(64)) VUINT32 L2H[8][2];
215	__declspec(align(64)) VUINT32 L2L[8][2];
216	__declspec(align(64)) VUINT32 EMask[8][2];
217	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
218	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
219	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
220	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
221	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
222	__declspec(align(64)) VUINT32 poly_coeff1[8][2];
223	__declspec(align(64)) VUINT32 AbsMask[8][2];
224	__declspec(align(64)) VUINT32 Threshold[8][2];
225} __svml_dexp10_data_internal_avx512;
226#endif
227__svml_dexp10_data_internal_avx512:
228	/* Exp_tbl_H */
229	.quad	0x3ff0000000000000
230	.quad	0x3ff0b5586cf9890f
231	.quad	0x3ff172b83c7d517b
232	.quad	0x3ff2387a6e756238
233	.quad	0x3ff306fe0a31b715
234	.quad	0x3ff3dea64c123422
235	.quad	0x3ff4bfdad5362a27
236	.quad	0x3ff5ab07dd485429
237	.quad	0x3ff6a09e667f3bcd
238	.quad	0x3ff7a11473eb0187
239	.quad	0x3ff8ace5422aa0db
240	.quad	0x3ff9c49182a3f090
241	.quad	0x3ffae89f995ad3ad
242	.quad	0x3ffc199bdd85529c
243	.quad	0x3ffd5818dcfba487
244	.quad	0x3ffea4afa2a490da
245	/* log2(e) */
246	.align	64
247	.quad	0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371
248	/* Shifter=2^(52-4)*1.5 */
249	.align	64
250	.quad	0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0
251	/* L2H = log(2)_high */
252	.align	64
253	.quad	0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff
254	/* L2L = log(2)_low */
255	.align	64
256	.quad	0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21
257	/* EMask */
258	.align	64
259	.quad	0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff
260	/* poly_coeff6 */
261	.align	64
262	.quad	0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020
263	/* poly_coeff5 */
264	.align	64
265	.quad	0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424
266	/* poly_coeff4 */
267	.align	64
268	.quad	0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d
269	/* poly_coeff3 */
270	.align	64
271	.quad	0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8
272	/* poly_coeff2 */
273	.align	64
274	.quad	0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25
275	/* poly_coeff1 */
276	.align	64
277	.quad	0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2
278	/* AbsMask */
279	.align	64
280	.quad	0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
281	/* Threshold */
282	.align	64
283	.quad	0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41
284	.align	64
285	.type	__svml_dexp10_data_internal_avx512, @object
286	.size	__svml_dexp10_data_internal_avx512, .-__svml_dexp10_data_internal_avx512
287