1/* Compute cubic root of float value.
2   Copyright (C) 1997-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <machine/asm.h>
20#include <libm-alias-float.h>
21
22        .section .rodata
23
24        .align ALIGNARG(4)
25        .type f3,@object
26f3:	.double 0.191502161678719066
27        ASM_SIZE_DIRECTIVE(f3)
28        .type f2,@object
29f2:	.double 0.697570460207922770
30        ASM_SIZE_DIRECTIVE(f2)
31        .type f1,@object
32f1:	.double 0.492659620528969547
33        ASM_SIZE_DIRECTIVE(f1)
34
35#define CBRT2		1.2599210498948731648
36#define ONE_CBRT2	0.793700525984099737355196796584
37#define SQR_CBRT2	1.5874010519681994748
38#define ONE_SQR_CBRT2	0.629960524947436582364439673883
39
40	.type factor,@object
41        .align ALIGNARG(4)
42factor:	.double ONE_SQR_CBRT2
43	.double ONE_CBRT2
44	.double 1.0
45	.double CBRT2
46	.double SQR_CBRT2
47	ASM_SIZE_DIRECTIVE(factor)
48
49        .type two25,@object
50two25:	.byte 0, 0, 0, 0x4c
51        ASM_SIZE_DIRECTIVE(two25)
52
53#ifdef PIC
54#define MO(op) op##@GOTOFF(%ebx)
55#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
56#else
57#define MO(op) op
58#define MOX(op,x) op(x)
59#endif
60
61	.text
62ENTRY(__cbrtf)
63	movl	4(%esp), %eax
64	xorl	%ecx, %ecx
65	movl	%eax, %edx
66	andl	$0x7fffffff, %eax
67	jz	1f
68	cmpl	$0x7f800000, %eax
69	jae	1f
70
71#ifdef PIC
72	pushl	%ebx
73	cfi_adjust_cfa_offset (4)
74	cfi_rel_offset (ebx, 0)
75	LOAD_PIC_REG (bx)
76#endif
77
78	cmpl	$0x00800000, %eax
79	jae	2f
80
81#ifdef PIC
82	flds	8(%esp)
83#else
84	flds	4(%esp)
85#endif
86	fmuls	MO(two25)
87	movl	$-25, %ecx
88#ifdef PIC
89	fstps	8(%esp)
90	movl	8(%esp), %eax
91#else
92	fstps	4(%esp)
93	movl	4(%esp), %eax
94#endif
95	movl	%eax, %edx
96	andl	$0x7fffffff, %eax
97
982:	shrl	$23, %eax
99	andl	$0x807fffff, %edx
100	subl	$126, %eax
101	orl	$0x3f000000, %edx
102	addl	%eax, %ecx
103#ifdef PIC
104	movl	%edx, 8(%esp)
105
106	flds	8(%esp)			/* xm */
107#else
108	movl	%edx, 4(%esp)
109
110	flds	4(%esp)			/* xm */
111#endif
112	fabs
113
114	/* The following code has two tracks:
115	    a) compute the normalized cbrt value
116	    b) compute xe/3 and xe%3
117	   The right track computes the value for b) and this is done
118	   in an optimized way by avoiding division.
119
120	   But why two tracks at all?  Very easy: efficiency.  Some FP
121	   instruction can overlap with a certain amount of integer (and
122	   FP) instructions.  So we get (except for the imull) all
123	   instructions for free.  */
124
125	fld	%st(0)			/* xm : xm */
126	fmull	MO(f3)			/* f3*xm : xm */
127			movl	$1431655766, %eax
128	fsubrl	MO(f2)			/* f2-f3*xm : xm */
129			imull	%ecx
130	fmul	%st(1)			/* (f2-f3*xm)*xm : xm */
131			movl	%ecx, %eax
132	faddl	MO(f1)			/* u:=f1+(f2-f3*xm)*xm : xm */
133			sarl	$31, %eax
134	fld	%st			/* u : u : xm */
135			subl	%eax, %edx
136	fmul	%st(1)			/* u*u : u : xm */
137	fld	%st(2)			/* xm : u*u : u : xm */
138	fadd	%st			/* 2*xm : u*u : u : xm */
139	fxch	%st(1)			/* u*u : 2*xm : u : xm */
140	fmul	%st(2)			/* t2:=u*u*u : 2*xm : u : xm */
141			movl	%edx, %eax
142	fadd	%st, %st(1)		/* t2 : t2+2*xm : u : xm */
143			leal	(%edx,%edx,2),%edx
144	fadd	%st(0)			/* 2*t2 : t2+2*xm : u : xm */
145			subl	%edx, %ecx
146	faddp	%st, %st(3)		/* t2+2*xm : u : 2*t2+xm */
147			shll	$3, %ecx
148	fmulp				/* u*(t2+2*xm) : 2*t2+xm */
149	fdivp	%st, %st(1)		/* u*(t2+2*xm)/(2*t2+xm) */
150	fmull	MOX(16+factor,%ecx)	/* u*(t2+2*xm)/(2*t2+xm)*FACT */
151	pushl	%eax
152	cfi_adjust_cfa_offset (4)
153	fildl	(%esp)			/* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
154	fxch				/* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
155	fscale				/* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
156	popl	%edx
157	cfi_adjust_cfa_offset (-4)
158#ifdef PIC
159	movl	8(%esp), %eax
160	popl	%ebx
161	cfi_adjust_cfa_offset (-4)
162	cfi_restore (ebx)
163#else
164	movl	4(%esp), %eax
165#endif
166	testl	%eax, %eax
167	fstp	%st(1)
168	jns	4f
169	fchs
1704:	ret
171
172	/* Return the argument.  */
1731:	flds	4(%esp)
174	ret
175END(__cbrtf)
176libm_alias_float (__cbrt, cbrt)
177