1/* Copyright (C) 2004-2022 Free Software Foundation, Inc.
2   This file is part of the GNU C Library.
3
4   The GNU C Library is free software; you can redistribute it and/or
5   modify it under the terms of the GNU Lesser General Public
6   License as published by the Free Software Foundation; either
7   version 2.1 of the License, or (at your option) any later version.
8
9   The GNU C Library is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   Lesser General Public License for more details.
13
14   You should have received a copy of the GNU Lesser General Public
15   License along with the GNU C Library.  If not, see
16   <https://www.gnu.org/licenses/>.  */
17
18#include "div_libc.h"
19
20
21/* 64-bit signed long remainder.  These are not normal C functions.  Argument
22   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
23   be clobbered.
24
25   Theory of operation here is that we can use the FPU divider for virtually
26   all operands that we see: all dividend values between -2**53 and 2**53-1
27   can be computed directly.  Note that divisor values need not be checked
28   against that range because the rounded fp value will be close enough such
29   that the quotient is < 1, which will properly be truncated to zero when we
30   convert back to integer.
31
32   When the dividend is outside the range for which we can compute exact
33   results, we use the fp quotent as an estimate from which we begin refining
34   an exact integral value.  This reduces the number of iterations in the
35   shift-and-subtract loop significantly.
36
37   The FPCR save/restore is due to the fact that the EV6 _will_ set FPCR_INE
38   for cvttq/c even without /sui being set.  It will not, however, properly
39   raise the exception, so we don't have to worry about FPCR_INED being clear
40   and so dying by SIGFPE.  */
41
42	.text
43	.align	4
44	.globl	__remq
45	.type	__remq, @funcnoplt
46	.usepv	__remq, no
47
48	cfi_startproc
49	cfi_return_column (RA)
50__remq:
51	lda	sp, -FRAME(sp)
52	cfi_def_cfa_offset (FRAME)
53	CALL_MCOUNT
54
55	/* Get the fp divide insn issued as quickly as possible.  After
56	   that's done, we have at least 22 cycles until its results are
57	   ready -- all the time in the world to figure out how we're
58	   going to use the results.  */
59	stt	$f0, 0(sp)
60	excb
61	beq	Y, DIVBYZERO
62
63	stt	$f1, 8(sp)
64	stt	$f3, 48(sp)
65	cfi_rel_offset ($f0, 0)
66	cfi_rel_offset ($f1, 8)
67	cfi_rel_offset ($f3, 48)
68	mf_fpcr	$f3
69
70	_ITOFT2	X, $f0, 16, Y, $f1, 24
71	cvtqt	$f0, $f0
72	cvtqt	$f1, $f1
73	divt/c	$f0, $f1, $f0
74
75	/* Check to see if X fit in the double as an exact value.  */
76	sll	X, (64-53), AT
77	ldt	$f1, 8(sp)
78	sra	AT, (64-53), AT
79	cmpeq	X, AT, AT
80	beq	AT, $x_big
81
82	/* If we get here, we're expecting exact results from the division.
83	   Do nothing else besides convert, compute remainder, clean up.  */
84	cvttq/c	$f0, $f0
85	excb
86	mt_fpcr	$f3
87	_FTOIT	$f0, AT, 16
88	mulq	AT, Y, AT
89	ldt	$f0, 0(sp)
90	ldt	$f3, 48(sp)
91	cfi_restore ($f1)
92	cfi_remember_state
93	cfi_restore ($f0)
94	cfi_restore ($f3)
95	cfi_def_cfa_offset (0)
96	lda	sp, FRAME(sp)
97	subq	X, AT, RV
98	ret	$31, (RA), 1
99
100	.align	4
101	cfi_restore_state
102$x_big:
103	/* If we get here, X is large enough that we don't expect exact
104	   results, and neither X nor Y got mis-translated for the fp
105	   division.  Our task is to take the fp result, figure out how
106	   far it's off from the correct result and compute a fixup.  */
107	stq	t0, 16(sp)
108	stq	t1, 24(sp)
109	stq	t2, 32(sp)
110	stq	t5, 40(sp)
111	cfi_rel_offset (t0, 16)
112	cfi_rel_offset (t1, 24)
113	cfi_rel_offset (t2, 32)
114	cfi_rel_offset (t5, 40)
115
116#define Q	t0		/* quotient */
117#define R	RV		/* remainder */
118#define SY	t1		/* scaled Y */
119#define S	t2		/* scalar */
120#define QY	t3		/* Q*Y */
121
122	/* The fixup code below can only handle unsigned values.  */
123	or	X, Y, AT
124	mov	$31, t5
125	blt	AT, $fix_sign_in
126$fix_sign_in_ret1:
127	cvttq/c	$f0, $f0
128
129	_FTOIT	$f0, Q, 8
130	.align	3
131$fix_sign_in_ret2:
132	ldt	$f0, 0(sp)
133	stq	t3, 0(sp)
134	cfi_restore ($f0)
135	cfi_rel_offset (t3, 0)
136
137	mulq	Q, Y, QY
138	excb
139	stq	t4, 8(sp)
140	mt_fpcr	$f3
141	cfi_rel_offset (t4, 8)
142
143	subq	QY, X, R
144	mov	Y, SY
145	mov	1, S
146	bgt	R, $q_high
147
148$q_high_ret:
149	subq	X, QY, R
150	mov	Y, SY
151	mov	1, S
152	bgt	R, $q_low
153
154$q_low_ret:
155	ldq	t0, 16(sp)
156	ldq	t1, 24(sp)
157	ldq	t2, 32(sp)
158	bne	t5, $fix_sign_out
159
160$fix_sign_out_ret:
161	ldq	t3, 0(sp)
162	ldq	t4, 8(sp)
163	ldq	t5, 40(sp)
164	ldt	$f3, 48(sp)
165	lda	sp, FRAME(sp)
166	cfi_remember_state
167	cfi_restore (t0)
168	cfi_restore (t1)
169	cfi_restore (t2)
170	cfi_restore (t3)
171	cfi_restore (t4)
172	cfi_restore (t5)
173	cfi_restore ($f3)
174	cfi_def_cfa_offset (0)
175	ret	$31, (RA), 1
176
177	.align	4
178	cfi_restore_state
179	/* The quotient that we computed was too large.  We need to reduce
180	   it by S such that Y*S >= R.  Obviously the closer we get to the
181	   correct value the better, but overshooting high is ok, as we'll
182	   fix that up later.  */
1830:
184	addq	SY, SY, SY
185	addq	S, S, S
186$q_high:
187	cmpult	SY, R, AT
188	bne	AT, 0b
189
190	subq	Q, S, Q
191	unop
192	subq	QY, SY, QY
193	br	$q_high_ret
194
195	.align	4
196	/* The quotient that we computed was too small.  Divide Y by the
197	   current remainder (R) and add that to the existing quotient (Q).
198	   The expectation, of course, is that R is much smaller than X.  */
199	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
200	   already have a copy of Y in SY and the value 1 in S.  */
2010:
202	addq	SY, SY, SY
203	addq	S, S, S
204$q_low:
205	cmpult	SY, R, AT
206	bne	AT, 0b
207
208	/* Shift-down and subtract loop.  Each iteration compares our scaled
209	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
210	   Y's scalar (S) so add it to the quotient (Q).  */
2112:	addq	Q, S, t3
212	srl	S, 1, S
213	cmpule	SY, R, AT
214	subq	R, SY, t4
215
216	cmovne	AT, t3, Q
217	cmovne	AT, t4, R
218	srl	SY, 1, SY
219	bne	S, 2b
220
221	br	$q_low_ret
222
223	.align	4
224$fix_sign_in:
225	/* If we got here, then X|Y is negative.  Need to adjust everything
226	   such that we're doing unsigned division in the fixup loop.  */
227	/* T5 records the changes we had to make:
228		bit 0:	set if X was negated.  Note that the sign of the
229			remainder follows the sign of the divisor.
230		bit 2:	set if Y was negated.
231	*/
232	xor	X, Y, t1
233	cmplt	X, 0, t5
234	negq	X, t0
235	cmovne	t5, t0, X
236
237	cmplt	Y, 0, AT
238	negq	Y, t0
239	s4addq	AT, t5, t5
240	cmovne	AT, t0, Y
241
242	bge	t1, $fix_sign_in_ret1
243	cvttq/c	$f0, $f0
244	_FTOIT	$f0, Q, 8
245	.align	3
246	negq	Q, Q
247	br	$fix_sign_in_ret2
248
249	.align	4
250$fix_sign_out:
251	/* Now we get to undo what we did above.  */
252	/* ??? Is this really faster than just increasing the size of
253	   the stack frame and storing X and Y in memory?  */
254	and	t5, 4, AT
255	negq	Y, t4
256	cmovne	AT, t4, Y
257
258	negq	X, t4
259	cmovlbs	t5, t4, X
260	negq	RV, t4
261	cmovlbs	t5, t4, RV
262
263	br	$fix_sign_out_ret
264
265	cfi_endproc
266	.size	__remq, .-__remq
267
268	DO_DIVBYZERO
269