/* The floating-point and integer division sequences below are taken from
   the Intel IA-64 Optimization Guide; where the guide offers several
   alternatives, the minimum latency one is used.  */

#include <sysdep.h>
/* NOTE(review): presumably required so that the `ret' token inside the
   `br.ret.sptk' instructions below is not expanded as a preprocessor
   macro -- confirm against sysdep.h.  */
#undef ret

#include <shlib-compat.h>

/* Every routine in this file is bound to a versioned name with
   `.symver ...@GLIBC_2.2'.
   NOTE(review): SHLIB_COMPAT comes from shlib-compat.h; it appears to
   enable this code only when compatibility with symbol versions
   GLIBC_2.2 up to GLIBC_2.2.6 is needed -- confirm.  */
#if SHLIB_COMPAT(libc, GLIBC_2_2, GLIBC_2_2_6)

/* __divtf3
   Compute an 80-bit IEEE double-extended quotient.
   farg0 holds the dividend (a).  farg1 holds the divisor (b).
   The quotient is returned in fret0.

   frcpa writes f10 and p6.  When p6 is set, f10 is an initial
   approximation y0 of 1/b and the p6-predicated sequence refines it
   into the quotient.  When p6 is clear, p7 stays set and f10 is
   returned as written by frcpa.

   Registers written: f10-f14, fret0, p6, p7.  */

ENTRY(___divtf3)
	/* p7 = 1 (r0 == r0 always holds).  */
	cmp.eq p7, p0 = r0, r0
	/* f10 = y0, p6 = refinement required.  */
	frcpa.s0 f10, p6 = farg0, farg1
	;;
	/* If p6 is set, clear p7 so that exactly one of p6/p7 is true.  */
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	/* f11 = e = 1 - b * y0;  f12 = q0 = a * y0.  */
(p6)	fnma.s1 f11 = farg1, f10, f1
(p6)	fma.s1 f12 = farg0, f10, f0
	;;
	/* f13 = e^2;  f14 = e + e^2.  */
(p6)	fma.s1 f13 = f11, f11, f0
(p6)	fma.s1 f14 = f11, f11, f11
	;;
	/* f11 = e + e^4;  f13 = y1 = y0 + y0 * (e + e^2).  */
(p6)	fma.s1 f11 = f13, f13, f11
(p6)	fma.s1 f13 = f14, f10, f10
	;;
	/* f10 = y2 = y0 + y1 * (e + e^4);  f11 = r0 = a - b * q0.  */
(p6)	fma.s1 f10 = f13, f11, f10
(p6)	fnma.s1 f11 = farg1, f12, farg0
	;;
	/* f11 = q1 = q0 + r0 * y2;  f12 = e2 = 1 - b * y2.  */
(p6)	fma.s1 f11 = f11, f10, f12
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
	/* f10 = y3 = y2 + e2 * y2;  f12 = r1 = a - b * q1.  */
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fnma.s1 f12 = farg1, f11, farg0
	;;
	/* Final step in status field 0: fret0 = q1 + r1 * y3.  */
(p6)	fma.s0 fret0 = f12, f10, f11
	/* No refinement was requested: return what frcpa produced.  */
(p7)	mov fret0 = f10
	br.ret.sptk rp
END(___divtf3)
	.symver ___divtf3, __divtf3@GLIBC_2.2

/* __divdf3
   Compute a 64-bit IEEE double quotient.
   farg0 holds the dividend (a).  farg1 holds the divisor (b).
   The quotient is returned in fret0.

   frcpa writes f10 and p6.  When p6 is set, f10 is an initial
   approximation y0 of 1/b and the p6-predicated sequence refines both
   the reciprocal and the quotient.  When p6 is clear, p7 stays set and
   f10 is returned as written by frcpa.

   Registers written: f8, f10-f13, fret0, p6, p7.  */

ENTRY(___divdf3)
	/* p7 = 1 (r0 == r0 always holds).  */
	cmp.eq p7, p0 = r0, r0
	/* f10 = y0, p6 = refinement required.  */
	frcpa.s0 f10, p6 = farg0, farg1
	;;
	/* If p6 is set, clear p7 so that exactly one of p6/p7 is true.  */
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	/* f11 = q0 = a * y0;  f12 = e = 1 - b * y0.  */
(p6)	fmpy.s1 f11 = farg0, f10
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
	/* f11 = q1 = q0 + e * q0;  f13 = e^2.  */
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fmpy.s1 f13 = f12, f12
	;;
	/* f10 = y1 = y0 + e * y0;  f11 = q2 = q1 + e^2 * q1.  */
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
	/* f12 = e^4;  f10 = y2 = y1 + e^2 * y1.  */
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
	;;
	/* f11 = q3 = q2 + e^4 * q2, computed with the .d (double) completer;
	   f10 = y3 = y2 + e^4 * y2.  */
(p6)	fma.d.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
	/* f8 = r = a - b * q3.  */
(p6)	fnma.d.s1 f8 = farg1, f11, farg0
	;;
	/* Final step, written without a status-field completer:
	   fret0 = q3 + r * y3.  */
(p6)	fma.d fret0 = f8, f10, f11
	/* No refinement was requested: return what frcpa produced.  */
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
END(___divdf3)
	.symver	___divdf3, __divdf3@GLIBC_2.2

/* __divsf3
   Compute a 32-bit IEEE float quotient.
   farg0 holds the dividend (a).  farg1 holds the divisor (b).
   The quotient is returned in fret0.

   frcpa writes f10 and p6.  When p6 is set, f10 is an initial
   approximation y0 of 1/b and the p6-predicated sequence refines the
   quotient.  When p6 is clear, p7 stays set and f10 is returned as
   written by frcpa.

   Registers written: f8, f9, f10, fret0, p6, p7.
   NOTE(review): farg0/farg1 are not read again once f8/f9 have been
   written, presumably because they alias f8/f9 -- confirm in sysdep.h.  */

ENTRY(___divsf3)
	/* p7 = 1 (r0 == r0 always holds).  */
	cmp.eq p7, p0 = r0, r0
	/* f10 = y0, p6 = refinement required.  */
	frcpa.s0 f10, p6 = farg0, farg1
	;;
	/* If p6 is set, clear p7 so that exactly one of p6/p7 is true.  */
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	/* f8 = q0 = a * y0;  f9 = e = 1 - b * y0.  */
(p6)	fmpy.s1 f8 = farg0, f10
(p6)	fnma.s1 f9 = farg1, f10, f1
	;;
	/* f8 = q1 = q0 + e * q0;  f9 = e^2.  */
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
	/* f8 = q2 = q1 + e^2 * q1;  f9 = e^4.  */
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
	/* f10 = q3 = q2 + e^4 * q2, computed with the .d (double)
	   completer.  */
(p6)	fma.d.s1 f10 = f9, f8, f8
	;;
	/* Final step in status field 0: normalize q3 with the .s (single)
	   completer into the return register.  */
(p6)	fnorm.s.s0 fret0 = f10
	/* No refinement was requested: return what frcpa produced.  */
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
END(___divsf3)
	.symver	___divsf3, __divsf3@GLIBC_2.2

/* __divdi3
   Compute a 64-bit signed integer quotient.
   in0 holds the dividend (a).  in1 holds the divisor (b).
   The quotient is returned in ret0.

   Both operands are converted to floating point, the quotient is
   computed by refining the frcpa reciprocal approximation y0, and the
   result is truncated back to an integer.  When frcpa clears p6 the
   refinement is skipped and f10 is converted as frcpa wrote it.

   Registers written: f8-f13, ret0, p6.  */

ENTRY(___divdi3)
	.regstk 2,0,0,0
	/* Transfer inputs to FP registers.  */
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	/* Convert the inputs to FP, so that they won't be treated as
	   unsigned.  */
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	/* Compute the reciprocal approximation: f10 = y0.  */
	frcpa.s1 f10, p6 = f8, f9
	;;
	/* 3 Newton-Raphson iterations.  */
	/* f11 = e = 1 - b * y0;  f12 = q0 = a * y0.  */
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
	/* f13 = e^2;  f12 = q1 = q0 + e * q0.  */
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
	/* f10 = y1 = y0 + e * y0;  f11 = q2 = q1 + e^2 * q1.  */
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	/* f10 = y2 = y1 + e^2 * y1;  f12 = r = a - b * q2.  */
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	/* f10 = q3 = q2 + r * y2.  */
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	/* Truncate quotient to a signed integer.  */
	fcvt.fx.trunc.s1 f10 = f10
	;;
	/* Transfer result to GP registers.  */
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
END(___divdi3)
	.symver	___divdi3, __divdi3@GLIBC_2.2

/* __moddi3
   Compute a 64-bit signed integer modulus.
   in0 holds the dividend (a).  in1 holds the divisor (b).
   The remainder is returned in ret0.

   The truncated quotient q is computed as in __divdi3; the remainder
   is then formed with one integer multiply-add as q * (-b) + a.
   f14 keeps the dividend in integer form for that last step.  in1 is
   overwritten with -b.

   Registers written: f8-f14, in1, ret0, p6.  */

ENTRY(___moddi3)
	.regstk 2,0,0,0
	/* Transfer inputs to FP registers.  */
	setf.sig f14 = in0
	setf.sig f9 = in1
	;;
	/* Convert the inputs to FP, so that they won't be treated as
	   unsigned.  f14 is left untouched.  */
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
	;;
	/* Compute the reciprocal approximation: f10 = y0.  */
	frcpa.s1 f10, p6 = f8, f9
	;;
	/* 3 Newton-Raphson iterations.  */
	/* f12 = q0 = a * y0;  f11 = e = 1 - b * y0.  */
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
	/* f12 = q1 = q0 + e * q0;  f13 = e^2.  */
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
	/* f10 = y1 = y0 + e * y0;  f11 = q2 = q1 + e^2 * q1.  */
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	/* in1 = -b, needed for the final multiply-add.  */
	sub in1 = r0, in1
	/* f10 = y2 = y1 + e^2 * y1;  f12 = r = a - b * q2.  */
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	/* The FP copy of b in f9 is no longer read; reuse f9 for the
	   integer -b.  */
	setf.sig f9 = in1
	/* f10 = q3 = q2 + r * y2.  */
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	/* Truncate quotient to a signed integer.  */
	fcvt.fx.trunc.s1 f10 = f10
	;;
	/* r = q * (-b) + a  */
	xma.l f10 = f10, f9, f14
	;;
	/* Transfer result to GP registers.  */
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
END(___moddi3)
	.symver ___moddi3, __moddi3@GLIBC_2.2

/* __udivdi3
   Compute a 64-bit unsigned integer quotient.
   in0 holds the dividend (a).  in1 holds the divisor (b).
   The quotient is returned in ret0.

   Same scheme as __divdi3, using the unsigned conversions
   (fcvt.xuf / fcvt.fxu.trunc) in place of the signed ones.  When frcpa
   clears p6 the refinement is skipped and f10 is converted as frcpa
   wrote it.

   Registers written: f8-f13, ret0, p6.  */

ENTRY(___udivdi3)
	.regstk 2,0,0,0
	/* Transfer inputs to FP registers.  */
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	/* Convert the inputs to FP, to avoid FP software-assist faults.  */
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	;;
	/* Compute the reciprocal approximation: f10 = y0.  */
	frcpa.s1 f10, p6 = f8, f9
	;;
	/* 3 Newton-Raphson iterations.  */
	/* f11 = e = 1 - b * y0;  f12 = q0 = a * y0.  */
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
	/* f13 = e^2;  f12 = q1 = q0 + e * q0.  */
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
	/* f10 = y1 = y0 + e * y0;  f11 = q2 = q1 + e^2 * q1.  */
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	/* f10 = y2 = y1 + e^2 * y1;  f12 = r = a - b * q2.  */
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	/* f10 = q3 = q2 + r * y2.  */
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	/* Truncate quotient to an unsigned integer.  */
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	/* Transfer result to GP registers.  */
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
END(___udivdi3)
	.symver	___udivdi3, __udivdi3@GLIBC_2.2

/* __umoddi3
   Compute a 64-bit unsigned integer modulus.
   in0 holds the dividend (a).  in1 holds the divisor (b).
   The remainder is returned in ret0.

   The truncated quotient q is computed as in __udivdi3; the remainder
   is then formed with one integer multiply-add as q * (-b) + a.
   f14 keeps the dividend in integer form for that last step.  in1 is
   overwritten with -b.

   Registers written: f8-f14, in1, ret0, p6.  */

ENTRY(___umoddi3)
	.regstk 2,0,0,0
	/* Transfer inputs to FP registers.  */
	setf.sig f14 = in0
	setf.sig f9 = in1
	;;
	/* Convert the inputs to FP, to avoid FP software assist faults.
	   f14 is left untouched.  */
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
	;;
	/* Compute the reciprocal approximation: f10 = y0.  */
	frcpa.s1 f10, p6 = f8, f9
	;;
	/* 3 Newton-Raphson iterations.  */
	/* f12 = q0 = a * y0;  f11 = e = 1 - b * y0.  */
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
	/* f12 = q1 = q0 + e * q0;  f13 = e^2.  */
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
	/* f10 = y1 = y0 + e * y0;  f11 = q2 = q1 + e^2 * q1.  */
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	/* in1 = -b, needed for the final multiply-add.  */
	sub in1 = r0, in1
	/* f10 = y2 = y1 + e^2 * y1;  f12 = r = a - b * q2.  */
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	/* The FP copy of b in f9 is no longer read; reuse f9 for the
	   integer -b.  */
	setf.sig f9 = in1
	/* f10 = q3 = q2 + r * y2.  */
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	/* Truncate quotient to an unsigned integer.  */
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	/* r = q * (-b) + a  */
	xma.l f10 = f10, f9, f14
	;;
	/* Transfer result to GP registers.  */
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
END(___umoddi3)
	.symver	___umoddi3, __umoddi3@GLIBC_2.2

/* __multi3
   Compute a 128-bit multiply of 128-bit multiplicands.
   in0/in1 hold the low/high 64 bits of one multiplicand (a),
   in2/in3 hold the low/high 64 bits of the other one (b).
   The low 64 bits of the product are returned in ret0, the next
   64 bits in ret1.

   Notation: a_lo = in0 is split into 32-bit halves al (low) and ah
   (high); b_lo = in2 likewise into bl and bh.  Then

     a_lo * b_lo = al*bl + ((al*bh + ah*bl) << 32) + ((ah*bh) << 64)

   and the cross terms in0*in3 + in1*in2 only affect the high word.

   Registers written: f6-f11, r14-r22, ret0, ret1, p6, p7.  */

ENTRY(___multi3)
	.regstk 4,0,0,0
	setf.sig f6 = in1
	/* r19 = mask selecting the low 32 bits.  */
	movl r19 = 0xffffffff
	setf.sig f7 = in2
	;;
	/* r14 = al.  */
	and r14 = r19, in0
	;;
	/* f10 = al;  r14 = bl;  f9 = low 64 bits of in1 * in2.  */
	setf.sig f10 = r14
	and r14 = r19, in2
	xmpy.l f9 = f6, f7
	;;
	/* f6 = bl;  r14 = ah.  */
	setf.sig f6 = r14
	shr.u r14 = in0, 32
	;;
	/* f7 = ah;  r14 = bh.  */
	setf.sig f7 = r14
	shr.u r14 = in2, 32
	;;
	/* f8 = bh;  f11 = al*bl;  f6 = ah*bl.  */
	setf.sig f8 = r14
	xmpy.l f11 = f10, f6
	xmpy.l f6 = f7, f6
	;;
	/* r16 = al*bl;  f7 = ah*bh.  */
	getf.sig r16 = f11
	xmpy.l f7 = f7, f8
	;;
	/* r14 = high half of al*bl;  r16 = low half of al*bl;
	   r17 = ah*bl;  f6 = in0 for the cross term.  */
	shr.u r14 = r16, 32
	and r16 = r19, r16
	getf.sig r17 = f6
	setf.sig f6 = in0
	;;
	/* f11 = high half of al*bl;  r21 = ah*bh;  f7 = in3.  */
	setf.sig f11 = r14
	getf.sig r21 = f7
	setf.sig f7 = in3
	;;
	/* f11 = al*bh + (al*bl >> 32);  f6 = in0*in3 + in1*in2.  */
	xma.l f11 = f10, f8, f11
	xma.l f6 = f6, f7, f9
	;;
	getf.sig r18 = f11
	;;
	/* r18 = middle sum: al*bh + ah*bl + (al*bl >> 32), modulo 2^64.  */
	add r18 = r18, r17
	;;
	/* r15 = low half of the middle sum;  p7 = the addition above
	   wrapped around (carry out of bit 63).  */
	and r15 = r19, r18
	cmp.ltu p7, p6 = r18, r17
	;;
	/* r22 = cross terms;  on carry, r14 = 1 + 0xffffffff = 2^32.  */
	getf.sig r22 = f6
(p7)	adds r14 = 1, r19
	;;
	/* On carry add 2^32 to ah*bh (the add reads r14 before the shr.u in
	   the same group rewrites it);  r14 = high half of the middle sum;
	   r15 = low half of the middle sum moved to bits 32..63.  */
(p7)	add r21 = r21, r14
	shr.u r14 = r18, 32
	shl r15 = r15, 32
	;;
	/* r20 = ah*bh (+ carry) + high half of the middle sum.  */
	add r20 = r21, r14
	;;
	/* ret0 = low word;  ret1 = high word including the cross terms.  */
	add ret0 = r15, r16
	add ret1 = r22, r20
	br.ret.sptk rp
	;;
END(___multi3)
	.symver	___multi3, __multi3@GLIBC_2.2

#endif /* SHLIB_COMPAT(libc, GLIBC_2_2, GLIBC_2_2_6) */
351