/* Function exp2f vectorized with SSE4.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   exp2(x)  = 2^n * T[j] * (1 + P(y))
 *   where
 *        x = m*(1/K) + y,    y in [-1/K..1/K]
 *        m = n*K + j,        m, n, j - signed integers, j in [-K/2..K/2]
 *
 *        values of 2^j/K are tabulated
 *
 *        P(y) is a minimax polynomial approximation of exp2(y)-1
 *        on the small interval [-1/K..1/K]
 *
 *  Special cases:
 *
 *   exp2(NaN)  = NaN
 *   exp2(+INF) = +INF
 *   exp2(-INF) = 0
 *   exp2(x)    = 1 for subnormals
 *   For IEEE float
 *     if x >= 128.0 then exp2f(x) overflows
 *     if x < -151.0 then exp2f(x) underflows
 *
 */
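
/* A minimal C sketch (illustrative only, not part of the build) of the
 * per-lane computation performed by the SSE4 code below, assuming
 * round-to-nearest mode and an input inside the fast-path domain
 * (|x| <= 126).  The helper from_bits and the constants simply decode
 * the entries of __svml_sexp2_data_internal:
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   static float from_bits (uint32_t u) { float f; memcpy (&f, &u, 4); return f; }
 *
 *   static float exp2f_sketch (float x)
 *   {
 *     float Shifter = from_bits (0x4b400000);   // 1.5 * 2^23
 *     float s = x + Shifter;   // low mantissa bits of s now hold round(x)
 *     float N = s - Shifter;   // x rounded to the nearest integer
 *     float R = x - N;         // reduced argument, |R| <= 0.5
 *
 *     // Degree-6 polynomial for 2^R, evaluated by Horner's scheme
 *     float p = from_bits (0x39213c8c);           // _sPC6
 *     p = p * R + from_bits (0x3aaf7a51);         // _sPC5
 *     p = p * R + from_bits (0x3c1d962c);         // _sPC4
 *     p = p * R + from_bits (0x3d6357cf);         // _sPC3
 *     p = p * R + from_bits (0x3e75fdef);         // _sPC2
 *     p = p * R + from_bits (0x3f317218);         // _sPC1 ~= ln 2
 *     p = p * R + 1.0f;                           // _sPC0
 *
 *     // Scale by 2^N: add N to the exponent field of p, exactly as the
 *     // pslld $23 / paddd pair does below (the bits of Shifter shifted
 *     // left by 23 vanish mod 2^32, leaving N << 23)
 *     uint32_t sbits, pbits;
 *     memcpy (&sbits, &s, 4);
 *     memcpy (&pbits, &p, 4);
 *     pbits += sbits << 23;
 *     memcpy (&p, &pbits, 4);
 *     return p;
 *   }
 */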

/* Offsets for data table __svml_sexp2_data_internal
 */
#define _sShifter			0
#define _sPC0				16
#define _sPC1				32
#define _sPC2				48
#define _sPC3				64
#define _sPC4				80
#define _sPC5				96
#define _sPC6				112
#define _iAbsMask			128
#define _iDomainRange			144

#include <sysdep.h>

	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_exp2f_sse4)
	subq	$72, %rsp
	cfi_def_cfa_offset(80)

	/* Check for overflow/underflow  */
	movups	__svml_sexp2_data_internal(%rip), %xmm1

	/*  Implementation  */
	movaps	%xmm1, %xmm5

	/*  Polynomial  */
	movups	_sPC6+__svml_sexp2_data_internal(%rip), %xmm4
	addps	%xmm0, %xmm5
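	/* xmm5 = x + Shifter (1.5*2^23): with round-to-nearest, the low
	   mantissa bits of this sum now hold round(x) = N  */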
	movaps	%xmm5, %xmm3

	/*  2^N  */
	pslld	$23, %xmm5

	/* Check for overflow/underflow  */
	movdqu	_iAbsMask+__svml_sexp2_data_internal(%rip), %xmm2
	subps	%xmm1, %xmm3

	/*  R  */
	movaps	%xmm0, %xmm1
	pand	%xmm0, %xmm2
	pcmpgtd	_iDomainRange+__svml_sexp2_data_internal(%rip), %xmm2
	subps	%xmm3, %xmm1
	movmskps %xmm2, %edx
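	/* edx is a 4-bit lane mask: bit i is set when the bit pattern of
	   |x[i]| compares greater (as a signed integer) than that of 126.0
	   (_iDomainRange), which also catches INF and NaN; those lanes are
	   recomputed on the special-inputs path  */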
	mulps	%xmm1, %xmm4
	addps	_sPC5+__svml_sexp2_data_internal(%rip), %xmm4
	mulps	%xmm1, %xmm4
	addps	_sPC4+__svml_sexp2_data_internal(%rip), %xmm4
	mulps	%xmm1, %xmm4
	addps	_sPC3+__svml_sexp2_data_internal(%rip), %xmm4
	mulps	%xmm1, %xmm4
	addps	_sPC2+__svml_sexp2_data_internal(%rip), %xmm4
	mulps	%xmm1, %xmm4
	addps	_sPC1+__svml_sexp2_data_internal(%rip), %xmm4
	mulps	%xmm4, %xmm1
	addps	_sPC0+__svml_sexp2_data_internal(%rip), %xmm1

	/*  Reconstruction  */
	paddd	%xmm5, %xmm1
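	/* xmm5 holds (N << 23), so this integer add bumps the exponent of
	   1 + P(R) by N, i.e. multiplies the result by 2^N  */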
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movaps	%xmm1, %xmm0
	addq	$72, %rsp
	cfi_def_cfa_offset(8)
	ret
	cfi_def_cfa_offset(80)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	movups	%xmm0, 32(%rsp)
	movups	%xmm1, 48(%rsp)
	# LOE rbx rbp r12 r13 r14 r15 edx

	xorl	%eax, %eax
	movq	%r12, 16(%rsp)
	cfi_offset(12, -64)
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	cfi_offset(13, -72)
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	cfi_offset(14, -80)
	# LOE rbx rbp r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d
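	/* CF = bit %r12d of the range mask saved in %r13d; if set, lane
	   %r12d is a special input and is recomputed by scalar exp2f  */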

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$4, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	movups	48(%rsp), %xmm1

	/* Go to exit */
	jmp	L(EXIT)
	cfi_offset(12, -64)
	cfi_offset(13, -72)
	cfi_offset(14, -80)
	# LOE rbx rbp r12 r13 r14 r15 xmm1

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	movss	32(%rsp, %r14, 4), %xmm0
	call	exp2f@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0

	movss	%xmm0, 48(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_exp2f_sse4)

	.section .rodata, "a"
	.align	16

#ifdef __svml_sexp2_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 _sShifter[4][1];
	__declspec(align(16)) VUINT32 _sPC0[4][1];
	__declspec(align(16)) VUINT32 _sPC1[4][1];
	__declspec(align(16)) VUINT32 _sPC2[4][1];
	__declspec(align(16)) VUINT32 _sPC3[4][1];
	__declspec(align(16)) VUINT32 _sPC4[4][1];
	__declspec(align(16)) VUINT32 _sPC5[4][1];
	__declspec(align(16)) VUINT32 _sPC6[4][1];
	__declspec(align(16)) VUINT32 _iAbsMask[4][1];
	__declspec(align(16)) VUINT32 _iDomainRange[4][1];
} __svml_sexp2_data_internal;
#endif
__svml_sexp2_data_internal:
	.long	0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
	.align	16
	.long	0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC0 */
	.align	16
	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218 /* _sPC1 */
	.align	16
	.long	0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef /* _sPC2 */
	.align	16
	.long	0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf /* _sPC3 */
	.align	16
	.long	0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c /* _sPC4 */
	.align	16
	.long	0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51 /* _sPC5 */
	.align	16
	.long	0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c /* _sPC6 */
	//common
	.align	16
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _iAbsMask */
	.align	16
	.long	0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000 /* _iDomainRange=126.0 */
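	/* For reference, the decoded polynomial coefficients are close to the
	   Taylor coefficients ln(2)^k/k! of 2^R, e.g. _sPC1 ~= 0.6931472
	   (ln 2), and _sShifter decodes to 1.5*2^23 = 12582912.0f  */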
	.align	16
	.type	__svml_sexp2_data_internal, @object
	.size	__svml_sexp2_data_internal, .-__svml_sexp2_data_internal