1/* Compute cubic root of long double value.
2   Copyright (C) 1997-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <libm-alias-ldouble.h>
20#include <machine/asm.h>
21
22        .section .rodata
23
24        .align ALIGNARG(4)
25        .type f8,@object
26f8:	.tfloat 0.161617097923756032
27	ASM_SIZE_DIRECTIVE(f8)
28        .align ALIGNARG(4)
29        .type f7,@object
30f7:	.tfloat -0.988553671195413709
31	ASM_SIZE_DIRECTIVE(f7)
32        .align ALIGNARG(4)
33        .type f6,@object
34f6:	.tfloat 2.65298938441952296
35	ASM_SIZE_DIRECTIVE(f6)
36        .align ALIGNARG(4)
37        .type f5,@object
38f5:	.tfloat -4.11151425200350531
39	ASM_SIZE_DIRECTIVE(f5)
40        .align ALIGNARG(4)
41        .type f4,@object
42f4:	.tfloat 4.09559907378707839
43	ASM_SIZE_DIRECTIVE(f4)
44        .align ALIGNARG(4)
45        .type f3,@object
46f3:	.tfloat -2.82414939754975962
47	ASM_SIZE_DIRECTIVE(f3)
48        .align ALIGNARG(4)
49        .type f2,@object
50f2:	.tfloat 1.67595307700780102
51	ASM_SIZE_DIRECTIVE(f2)
52        .align ALIGNARG(4)
53        .type f1,@object
54f1:	.tfloat 0.338058687610520237
55	ASM_SIZE_DIRECTIVE(f1)
56
57#define CBRT2		1.2599210498948731648
58#define ONE_CBRT2	0.793700525984099737355196796584
59#define SQR_CBRT2	1.5874010519681994748
60#define ONE_SQR_CBRT2	0.629960524947436582364439673883
61
62	/* We make the entries in the following table all 16 bytes
63	   wide to avoid having to implement a multiplication by 10.  */
64	.type factor,@object
65        .align ALIGNARG(4)
66factor:	.tfloat ONE_SQR_CBRT2
67	.byte 0, 0, 0, 0, 0, 0
68	.tfloat ONE_CBRT2
69	.byte 0, 0, 0, 0, 0, 0
70	.tfloat 1.0
71	.byte 0, 0, 0, 0, 0, 0
72	.tfloat CBRT2
73	.byte 0, 0, 0, 0, 0, 0
74	.tfloat SQR_CBRT2
75	ASM_SIZE_DIRECTIVE(factor)
76
77        .type two64,@object
78        .align ALIGNARG(4)
79two64:  .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
80        ASM_SIZE_DIRECTIVE(two64)
81
82#ifdef PIC
83#define MO(op) op##@GOTOFF(%ebx)
84#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
85#else
86#define MO(op) op
87#define MOX(op,x) op(x)
88#endif
89
90	.text
91ENTRY(__cbrtl)
92	movl	4(%esp), %ecx
93	movl	12(%esp), %eax
94	orl	8(%esp), %ecx
95	movl	%eax, %edx
96	andl	$0x7fff, %eax
97	orl	%eax, %ecx
98	jz	1f
99	xorl	%ecx, %ecx
100	cmpl	$0x7fff, %eax
101	je	1f
102
103#ifdef PIC
104	pushl	%ebx
105	cfi_adjust_cfa_offset (4)
106	cfi_rel_offset (ebx, 0)
107	LOAD_PIC_REG (bx)
108#endif
109
110	cmpl	$0, %eax
111	jne	2f
112
113#ifdef PIC
114	fldt	8(%esp)
115#else
116	fldt	4(%esp)
117#endif
118	fmull	MO(two64)
119	movl	$-64, %ecx
120#ifdef PIC
121	fstpt	8(%esp)
122	movl	16(%esp), %eax
123#else
124	fstpt	4(%esp)
125	movl	12(%esp), %eax
126#endif
127	movl	%eax, %edx
128	andl	$0x7fff, %eax
129
1302:	andl	$0x8000, %edx
131	subl	$16382, %eax
132	orl	$0x3ffe, %edx
133	addl	%eax, %ecx
134#ifdef PIC
135	movl	%edx, 16(%esp)
136
137	fldt	8(%esp)			/* xm */
138#else
139	movl	%edx, 12(%esp)
140
141	fldt	4(%esp)			/* xm */
142#endif
143	fabs
144
145	/* The following code has two tracks:
146	    a) compute the normalized cbrt value
147	    b) compute xe/3 and xe%3
148	   The right track computes the value for b) and this is done
149	   in an optimized way by avoiding division.
150
151	   But why two tracks at all?  Very easy: efficiency.  Some FP
152	   instruction can overlap with a certain amount of integer (and
153	   FP) instructions.  So we get (except for the imull) all
154	   instructions for free.  */
155
156	fldt	MO(f8)			/* f8 : xm */
157	fmul	%st(1)			/* f8*xm : xm */
158
159	fldt	MO(f7)
160	faddp				/* f7+f8*xm : xm */
161	fmul	%st(1)			/* (f7+f8*xm)*xm : xm */
162			movl	$1431655766, %eax
163	fldt	MO(f6)
164	faddp				/* f6+(f7+f8*xm)*xm : xm */
165			imull	%ecx
166	fmul	%st(1)			/* (f6+(f7+f8*xm)*xm)*xm : xm */
167			movl	%ecx, %eax
168	fldt	MO(f5)
169	faddp				/* f5+(f6+(f7+f8*xm)*xm)*xm : xm */
170			sarl	$31, %eax
171	fmul	%st(1)			/* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
172			subl	%eax, %edx
173	fldt	MO(f4)
174	faddp				/* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
175	fmul	%st(1)			/* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
176	fldt	MO(f3)
177	faddp				/* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
178	fmul	%st(1)			/* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
179	fldt	MO(f2)
180	faddp				/* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
181	fmul	%st(1)			/* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
182	fldt	MO(f1)
183	faddp				/* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
184
185	fld	%st			/* u : u : xm */
186	fmul	%st(1)			/* u*u : u : xm */
187	fld	%st(2)			/* xm : u*u : u : xm */
188	fadd	%st			/* 2*xm : u*u : u : xm */
189	fxch	%st(1)			/* u*u : 2*xm : u : xm */
190	fmul	%st(2)			/* t2:=u*u*u : 2*xm : u : xm */
191			movl	%edx, %eax
192	fadd	%st, %st(1)		/* t2 : t2+2*xm : u : xm */
193			leal	(%edx,%edx,2),%edx
194	fadd	%st(0)			/* 2*t2 : t2+2*xm : u : xm */
195			subl	%edx, %ecx
196	faddp	%st, %st(3)		/* t2+2*xm : u : 2*t2+xm */
197			shll	$4, %ecx
198	fmulp				/* u*(t2+2*xm) : 2*t2+xm */
199	fdivp	%st, %st(1)		/* u*(t2+2*xm)/(2*t2+xm) */
200	fldt	MOX(32+factor,%ecx)
201	fmulp				/* u*(t2+2*xm)/(2*t2+xm)*FACT */
202	pushl	%eax
203	cfi_adjust_cfa_offset (4)
204	fildl	(%esp)			/* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
205	fxch				/* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
206	fscale				/* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
207	popl	%edx
208	cfi_adjust_cfa_offset (-4)
209#ifdef PIC
210	movl	16(%esp), %eax
211	popl	%ebx
212	cfi_adjust_cfa_offset (-4)
213	cfi_restore (ebx)
214#else
215	movl	12(%esp), %eax
216#endif
217	testl	$0x8000, %eax
218	fstp	%st(1)
219	jz	4f
220	fchs
2214:	ret
222
223	/* Return the argument.  */
2241:	fldt	4(%esp)
225	fadd	%st
226	ret
227END(__cbrtl)
228libm_alias_ldouble (__cbrt, cbrt)
229