/* Copyright (C) 2004-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */
17
#include "div_libc.h"


/* 64-bit signed long divide.  These are not normal C functions.  Argument
   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
   be clobbered.

   Theory of operation here is that we can use the FPU divider for virtually
   all operands that we see: all dividend values between -2**53 and 2**53-1
   can be computed directly.  Note that divisor values need not be checked
   against that range because the rounded fp value will be close enough such
   that the quotient is < 1, which will properly be truncated to zero when we
   convert back to integer.

   When the dividend is outside the range for which we can compute exact
   results, we use the fp quotient as an estimate from which we begin refining
   an exact integral value.  This reduces the number of iterations in the
   shift-and-subtract loop significantly.

   The FPCR save/restore is due to the fact that the EV6 _will_ set FPCR_INE
   for cvttq/c even without /sui being set.  It will not, however, properly
   raise the exception, so we don't have to worry about FPCR_INED being clear
   and so dying by SIGFPE.

   NOTE(review): X (dividend, t10), Y (divisor, t11), RV (result, t12),
   FRAME, CALL_MCOUNT, _ITOFT2, _FTOIT, DIVBYZERO and DO_DIVBYZERO are
   presumably macros/defines from div_libc.h -- confirm against that
   header.  */

	.text
	.align	4
	.globl	__divq
	.type	__divq, @funcnoplt
	.usepv	__divq, no

	cfi_startproc
	cfi_return_column (RA)
__divq:
	lda	sp, -FRAME(sp)		/* Allocate scratch frame for fp/int spills.  */
	cfi_def_cfa_offset (FRAME)
	CALL_MCOUNT

	/* Get the fp divide insn issued as quickly as possible.  After
	   that's done, we have at least 22 cycles until its results are
	   ready -- all the time in the world to figure out how we're
	   going to use the results.  */
	stt	$f0, 0(sp)		/* spill $f0 -- we clobber it below */
	excb				/* exception barrier before FPCR access */
	beq	Y, DIVBYZERO		/* divisor == 0: raise the trap */

	stt	$f1, 8(sp)		/* spill the other fp temps we use */
	stt	$f3, 48(sp)
	cfi_rel_offset ($f0, 0)
	cfi_rel_offset ($f1, 8)
	cfi_rel_offset ($f3, 48)
	mf_fpcr	$f3			/* save FPCR; restored before return */

	/* Move X and Y to fp registers (via the 16/24 stack slots) and
	   compute the double-precision quotient estimate.  */
	_ITOFT2	X, $f0, 16, Y, $f1, 24
	cvtqt	$f0, $f0		/* int64 -> double */
	cvtqt	$f1, $f1
	divt/c	$f0, $f1, $f0		/* chopped (round-to-zero) divide */

	/* Check to see if X fit in the double as an exact value.  */
	sll	X, (64-53), AT		/* keep only the low 53 bits... */
	ldt	$f1, 8(sp)		/* (reload $f1 early; fills a slot) */
	sra	AT, (64-53), AT		/* ...sign-extended back to 64 bits */
	cmpeq	X, AT, AT		/* round-trip unchanged => exact */
	beq	AT, $x_big

	/* If we get here, we're expecting exact results from the division.
	   Do nothing else besides convert and clean up.  */
	cvttq/c	$f0, $f0		/* double -> int64, chopped */
	excb
	mt_fpcr	$f3			/* restore FPCR (clears spurious INE) */
	_FTOIT	$f0, RV, 16		/* move quotient into the result reg */

	ldt	$f0, 0(sp)
	ldt	$f3, 48(sp)
	cfi_restore ($f1)
	cfi_remember_state
	cfi_restore ($f0)
	cfi_restore ($f3)
	cfi_def_cfa_offset (0)
	lda	sp, FRAME(sp)		/* pop the frame */
	ret	$31, (RA), 1

	.align	4
	cfi_restore_state
$x_big:
	/* If we get here, X is large enough that we don't expect exact
	   results, and neither X nor Y got mis-translated for the fp
	   division.  Our task is to take the fp result, figure out how
	   far it's off from the correct result and compute a fixup.  */
	stq	t0, 16(sp)		/* spill the integer temps the fixup uses */
	stq	t1, 24(sp)
	stq	t2, 32(sp)
	stq	t5, 40(sp)
	cfi_rel_offset (t0, 16)
	cfi_rel_offset (t1, 24)
	cfi_rel_offset (t2, 32)
	cfi_rel_offset (t5, 40)

#define Q	RV		/* quotient */
#define R	t0		/* remainder */
#define SY	t1		/* scaled Y */
#define S	t2		/* scalar */
#define QY	t3		/* Q*Y */

	/* The fixup code below can only handle unsigned values.  */
	or	X, Y, AT		/* sign bit set in either operand?  */
	mov	$31, t5			/* t5 = 0: no sign adjustments yet */
	blt	AT, $fix_sign_in	/* negate operands, record it in t5 */
$fix_sign_in_ret1:
	cvttq/c	$f0, $f0		/* estimate quotient -> integer */

	_FTOIT	$f0, Q, 8
	.align	3
$fix_sign_in_ret2:
	ldt	$f0, 0(sp)		/* $f0 done; reuse slot 0 for t3 */
	stq	t3, 0(sp)
	cfi_restore ($f0)
	cfi_rel_offset (t3, 0)

	mulq	Q, Y, QY		/* QY = estimate * divisor */
	excb
	stq	t4, 8(sp)
	mt_fpcr	$f3			/* restore FPCR */
	cfi_rel_offset (t4, 8)

	/* If Q*Y > X the estimate is too high; shrink Q first.  */
	subq	QY, X, R
	mov	Y, SY
	mov	1, S
	bgt	R, $q_high

$q_high_ret:
	/* Now X >= Q*Y; if the remainder is still positive the estimate
	   was too low -- refine upward.  */
	subq	X, QY, R
	mov	Y, SY
	mov	1, S
	bgt	R, $q_low

$q_low_ret:
	/* Q is now the exact unsigned quotient; restore temps and undo
	   any sign normalization recorded in t5.  */
	ldq	t0, 16(sp)
	ldq	t1, 24(sp)
	ldq	t2, 32(sp)
	bne	t5, $fix_sign_out

$fix_sign_out_ret:
	ldq	t3, 0(sp)
	ldq	t4, 8(sp)
	ldq	t5, 40(sp)
	ldt	$f3, 48(sp)
	lda	sp, FRAME(sp)
	cfi_remember_state
	cfi_restore (t0)
	cfi_restore (t1)
	cfi_restore (t2)
	cfi_restore (t3)
	cfi_restore (t4)
	cfi_restore (t5)
	cfi_restore ($f3)
	cfi_def_cfa_offset (0)
	ret	$31, (RA), 1

	.align	4
	cfi_restore_state
	/* The quotient that we computed was too large.  We need to reduce
	   it by S such that Y*S >= R.  Obviously the closer we get to the
	   correct value the better, but overshooting high is ok, as we'll
	   fix that up later.  */
0:
	addq	SY, SY, SY		/* scale Y and S up together... */
	addq	S, S, S
$q_high:
	cmpult	SY, R, AT		/* ...until Y*S >= overshoot R */
	bne	AT, 0b

	subq	Q, S, Q			/* back Q off by S (may undershoot) */
	unop
	subq	QY, SY, QY		/* keep QY = Q*Y in sync */
	br	$q_high_ret

	.align	4
	/* The quotient that we computed was too small.  Divide Y by the
	   current remainder (R) and add that to the existing quotient (Q).
	   The expectation, of course, is that R is much smaller than X.  */
	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
	   already have a copy of Y in SY and the value 1 in S.  */
0:
	addq	SY, SY, SY
	addq	S, S, S
$q_low:
	cmpult	SY, R, AT
	bne	AT, 0b

	/* Shift-down and subtract loop.  Each iteration compares our scaled
	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
	   Y's scalar (S) so add it to the quotient (Q).  */
2:	addq	Q, S, t3		/* tentative Q += S */
	srl	S, 1, S
	cmpule	SY, R, AT		/* does SY still fit in R?  */
	subq	R, SY, t4		/* tentative R -= SY */

	cmovne	AT, t3, Q		/* commit both only if it fit */
	cmovne	AT, t4, R
	srl	SY, 1, SY
	bne	S, 2b			/* done once S shifts down to 0 */

	br	$q_low_ret

	.align	4
$fix_sign_in:
	/* If we got here, then X|Y is negative.  Need to adjust everything
	   such that we're doing unsigned division in the fixup loop.  */
	/* T5 records the changes we had to make:
		bit 0:	set if result should be negative.
		bit 2:	set if X was negated.
		bit 3:	set if Y was negated.
	*/
	xor	X, Y, AT		/* signs differ => negative result */
	cmplt	AT, 0, t5		/* t5 bit 0 */
	cmplt	X, 0, AT
	negq	X, t0

	s4addq	AT, t5, t5		/* t5 bit 2: X was negated */
	cmovne	AT, t0, X		/* X = |X| */
	cmplt	Y, 0, AT
	negq	Y, t0

	s8addq	AT, t5, t5		/* t5 bit 3: Y was negated */
	cmovne	AT, t0, Y		/* Y = |Y| */
	unop
	blbc	t5, $fix_sign_in_ret1	/* result positive: convert as-is */

	/* Result must be negative: convert the fp estimate and negate it
	   so the fixup loop refines a negated-magnitude quotient.  */
	cvttq/c	$f0, $f0
	_FTOIT	$f0, Q, 8
	.align	3
	negq	Q, Q
	br	$fix_sign_in_ret2

	.align	4
$fix_sign_out:
	/* Now we get to undo what we did above.  */
	/* ??? Is this really faster than just increasing the size of
	   the stack frame and storing X and Y in memory?  */
	and	t5, 8, AT		/* bit 3: restore Y's sign */
	negq	Y, t4
	cmovne	AT, t4, Y

	and	t5, 4, AT		/* bit 2: restore X's sign */
	negq	X, t4
	cmovne	AT, t4, X

	negq	RV, t4			/* bit 0: negate the result */
	cmovlbs	t5, t4, RV

	br	$fix_sign_out_ret

	cfi_endproc
	.size	__divq, .-__divq
272
	/* Emit the divide-by-zero stub that the `beq Y, DIVBYZERO' above
	   branches to.  NOTE(review): macro from div_libc.h; presumably
	   raises the integer-division trap -- confirm against that
	   header.  */
	DO_DIVBYZERO
274