1/* Function log1pf vectorized with AVX-512.
2   Copyright (C) 2021-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   https://www.gnu.org/licenses/.  */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 *    1+x = 2^k*(xh + xl) is computed in high-low parts; xh in [1, 2)
23 *    Get short reciprocal approximation Rcp ~ 1/xh
24 *    R = (Rcp*xh - 1.0) + Rcp*xl
25 *    log1p(x) = k*log(2.0) - log(Rcp) + poly(R)
26 *       log(Rcp) is tabulated
27 *
28 *
29 */
30
/* Offsets for data table __svml_slog1p_data_internal.
   Every field of the table is one 512-bit zmm vector (64 bytes,
   16 x 32-bit lanes), so consecutive offsets step by 64.  */
#define SgnMask				0
#define sOne				64
#define sPoly_1				128
#define sPoly_2				192
#define sPoly_3				256
#define sPoly_4				320
#define sPoly_5				384
#define sPoly_6				448
#define sPoly_7				512
#define sPoly_8				576
#define iHiDelta			640
#define iLoRange			704
#define iBrkValue			768
#define iOffExpoMask			832
#define sLn2				896
48
49#include <sysdep.h>
50
51	.section .text.exex512, "ax", @progbits
52ENTRY(_ZGVeN16v_log1pf_skx)
53	pushq	%rbp
54	cfi_def_cfa_offset(16)
55	movq	%rsp, %rbp
56	cfi_def_cfa(6, 16)
57	cfi_offset(6, -16)
58	andq	$-64, %rsp
59	subq	$192, %rsp
60	vmovups	sOne+__svml_slog1p_data_internal(%rip), %zmm2
61
62	/* reduction: compute r, n */
63	vmovups	iBrkValue+__svml_slog1p_data_internal(%rip), %zmm12
64	vmovups	SgnMask+__svml_slog1p_data_internal(%rip), %zmm4
65	vmovaps	%zmm0, %zmm3
66
67	/* compute 1+x as high, low parts */
68	vmaxps	{sae}, %zmm3, %zmm2, %zmm5
69	vminps	{sae}, %zmm3, %zmm2, %zmm7
70	vandnps	%zmm3, %zmm4, %zmm1
71	vpternlogd $255, %zmm4, %zmm4, %zmm4
72	vaddps	{rn-sae}, %zmm7, %zmm5, %zmm9
73	vpsubd	%zmm12, %zmm9, %zmm10
74	vsubps	{rn-sae}, %zmm9, %zmm5, %zmm6
75
76	/* check argument value ranges */
77	vpaddd	iHiDelta+__svml_slog1p_data_internal(%rip), %zmm9, %zmm8
78	vpsrad	$23, %zmm10, %zmm13
79	vmovups	sPoly_5+__svml_slog1p_data_internal(%rip), %zmm9
80	vpcmpd	$5, iLoRange+__svml_slog1p_data_internal(%rip), %zmm8, %k1
81	vpslld	$23, %zmm13, %zmm14
82	vaddps	{rn-sae}, %zmm7, %zmm6, %zmm15
83	vcvtdq2ps {rn-sae}, %zmm13, %zmm0
84	vpsubd	%zmm14, %zmm2, %zmm13
85	vmovups	sPoly_8+__svml_slog1p_data_internal(%rip), %zmm7
86	vmovups	sPoly_1+__svml_slog1p_data_internal(%rip), %zmm14
87	vmulps	{rn-sae}, %zmm13, %zmm15, %zmm6
88	vpandd	iOffExpoMask+__svml_slog1p_data_internal(%rip), %zmm10, %zmm11
89	vpaddd	%zmm12, %zmm11, %zmm5
90	vmovups	sPoly_4+__svml_slog1p_data_internal(%rip), %zmm10
91	vmovups	sPoly_3+__svml_slog1p_data_internal(%rip), %zmm11
92	vmovups	sPoly_2+__svml_slog1p_data_internal(%rip), %zmm12
93
94	/* polynomial evaluation */
95	vsubps	{rn-sae}, %zmm2, %zmm5, %zmm2
96	vaddps	{rn-sae}, %zmm6, %zmm2, %zmm15
97	vmovups	sPoly_7+__svml_slog1p_data_internal(%rip), %zmm2
98	vfmadd231ps {rn-sae}, %zmm15, %zmm7, %zmm2
99	vpandnd	%zmm8, %zmm8, %zmm4{%k1}
100	vmovups	sPoly_6+__svml_slog1p_data_internal(%rip), %zmm8
101
102	/* combine and get argument value range mask */
103	vptestmd %zmm4, %zmm4, %k0
104	vfmadd213ps {rn-sae}, %zmm8, %zmm15, %zmm2
105	kmovw	%k0, %edx
106	vfmadd213ps {rn-sae}, %zmm9, %zmm15, %zmm2
107	vfmadd213ps {rn-sae}, %zmm10, %zmm15, %zmm2
108	vfmadd213ps {rn-sae}, %zmm11, %zmm15, %zmm2
109	vfmadd213ps {rn-sae}, %zmm12, %zmm15, %zmm2
110	vfmadd213ps {rn-sae}, %zmm14, %zmm15, %zmm2
111	vmulps	{rn-sae}, %zmm15, %zmm2, %zmm4
112	vfmadd213ps {rn-sae}, %zmm15, %zmm15, %zmm4
113
114	/* final reconstruction */
115	vmovups	sLn2+__svml_slog1p_data_internal(%rip), %zmm15
116	vfmadd213ps {rn-sae}, %zmm4, %zmm15, %zmm0
117	vorps	%zmm1, %zmm0, %zmm0
118	testl	%edx, %edx
119
120	/* Go to special inputs processing branch */
121	jne	L(SPECIAL_VALUES_BRANCH)
122	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm3
123
124	/* Restore registers
125	 * and exit the function
126	 */
127
128L(EXIT):
129	movq	%rbp, %rsp
130	popq	%rbp
131	cfi_def_cfa(7, 8)
132	cfi_restore(6)
133	ret
134	cfi_def_cfa(6, 16)
135	cfi_offset(6, -16)
136
137	/* Branch to process
138	 * special inputs
139	 */
140
141L(SPECIAL_VALUES_BRANCH):
142	vmovups	%zmm3, 64(%rsp)
143	vmovups	%zmm0, 128(%rsp)
144	# LOE rbx r12 r13 r14 r15 edx zmm0
145
146	xorl	%eax, %eax
147	# LOE rbx r12 r13 r14 r15 eax edx
148
149	vzeroupper
150	movq	%r12, 16(%rsp)
151	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
152	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
153	movl	%eax, %r12d
154	movq	%r13, 8(%rsp)
155	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
156	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
157	movl	%edx, %r13d
158	movq	%r14, (%rsp)
159	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
160	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
161	# LOE rbx r15 r12d r13d
162
163	/* Range mask
164	 * bits check
165	 */
166
167L(RANGEMASK_CHECK):
168	btl	%r12d, %r13d
169
170	/* Call scalar math function */
171	jc	L(SCALAR_MATH_CALL)
172	# LOE rbx r15 r12d r13d
173
174	/* Special inputs
175	 * processing loop
176	 */
177
178L(SPECIAL_VALUES_LOOP):
179	incl	%r12d
180	cmpl	$16, %r12d
181
182	/* Check bits in range mask */
183	jl	L(RANGEMASK_CHECK)
184	# LOE rbx r15 r12d r13d
185
186	movq	16(%rsp), %r12
187	cfi_restore(12)
188	movq	8(%rsp), %r13
189	cfi_restore(13)
190	movq	(%rsp), %r14
191	cfi_restore(14)
192	vmovups	128(%rsp), %zmm0
193
194	/* Go to exit */
195	jmp	L(EXIT)
196	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
197	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
198	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
199	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
200	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
201	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
202	# LOE rbx r12 r13 r14 r15 zmm0
203
204	/* Scalar math fucntion call
205	 * to process special input
206	 */
207
208L(SCALAR_MATH_CALL):
209	movl	%r12d, %r14d
210	vmovss	64(%rsp, %r14, 4), %xmm0
211	call	log1pf@PLT
212	# LOE rbx r14 r15 r12d r13d xmm0
213
214	vmovss	%xmm0, 128(%rsp, %r14, 4)
215
216	/* Process special inputs in loop */
217	jmp	L(SPECIAL_VALUES_LOOP)
218	# LOE rbx r15 r12d r13d
219END(_ZGVeN16v_log1pf_skx)
220
	.section .rodata, "a"
	.align	64

#ifdef __svml_slog1p_data_internal_typedef
/* This guard macro is never defined when assembling; the typedef is
   documentation only, describing the layout of the table below
   (each field = 16 lanes x 32 bits = one 64-byte zmm vector).  */
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 SgnMask[16][1];
	__declspec(align(64)) VUINT32 sOne[16][1];
	__declspec(align(64)) VUINT32 sPoly[8][16][1];
	__declspec(align(64)) VUINT32 iHiDelta[16][1];
	__declspec(align(64)) VUINT32 iLoRange[16][1];
	__declspec(align(64)) VUINT32 iBrkValue[16][1];
	__declspec(align(64)) VUINT32 iOffExpoMask[16][1];
	__declspec(align(64)) VUINT32 sLn2[16][1];
} __svml_slog1p_data_internal;
#endif
__svml_slog1p_data_internal:
	/* SgnMask: 0x7fffffff = all bits except the float sign bit */
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	/* sOne = SP 1.0 */
	.align	64
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* sPoly[] = SP polynomial coefficients P0..P7 (one vector each) */
	.align	64
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
	/* iHiDelta = SP 80000000-7f000000 */
	.align	64
	.long	0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000
	/* iLoRange = SP 00800000+iHiDelta */
	.align	64
	.long	0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000
	/* iBrkValue = SP 2/3 (break point of the reduction interval) */
	.align	64
	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
	/* iOffExpoMask = SP significand (mantissa) mask */
	.align	64
	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
	/* sLn2 = SP ln(2) */
	.align	64
	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
	.align	64
	.type	__svml_slog1p_data_internal, @object
	.size	__svml_slog1p_data_internal, .-__svml_slog1p_data_internal
271