1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2    Copyright (C) 1991-2022 Free Software Foundation, Inc.
3 
4    This file is part of the GNU C Library.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    In addition to the permissions in the GNU Lesser General Public
12    License, the Free Software Foundation gives you unlimited
13    permission to link the compiled version of this file into
14    combinations with other programs, and to distribute those
15    combinations without any restriction coming from the use of this
16    file.  (The Lesser General Public License restrictions do apply in
17    other respects; for example, they cover modification of the file,
18    and distribution when not linked into a combine executable.)
19 
20    The GNU C Library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24 
25    You should have received a copy of the GNU Lesser General Public
26    License along with the GNU C Library; if not, see
27    <https://www.gnu.org/licenses/>.  */
28 
29 /* You have to define the following before including this file:
30 
31    UWtype -- An unsigned type, default type for operations (typically a "word")
32    UHWtype -- An unsigned type, at least half the size of UWtype.
33    UDWtype -- An unsigned type, at least twice as large as UWtype
34    W_TYPE_SIZE -- size in bits of UWtype
35 
36    UQItype -- Unsigned 8 bit type.
37    SItype, USItype -- Signed and unsigned 32 bit types.
38    DItype, UDItype -- Signed and unsigned 64 bit types.
39 
40    On a 32 bit machine UWtype should typically be USItype;
41    on a 64 bit machine, UWtype should typically be UDItype.  */
42 
/* Half-word helpers.  __BITS4 is a quarter of the word width in bits and
   __ll_B is 2 raised to half the word width.  __ll_lowpart and
   __ll_highpart convert T to UWtype and return its low and high halves
   respectively.  W_TYPE_SIZE and UWtype are expanded where these macros
   are used, not where they are defined.  */
43 #define __BITS4 (W_TYPE_SIZE / 4)
44 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
45 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
46 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
47 
/* Default configuration, used only when the includer has not defined
   W_TYPE_SIZE: a 32-bit word (USItype) with a 64-bit double word
   (UDItype).  The half-word type also defaults to USItype here.  */
48 #ifndef W_TYPE_SIZE
49 #define W_TYPE_SIZE	32
50 #define UWtype		USItype
51 #define UHWtype		USItype
52 #define UDWtype		UDItype
53 #endif
54 
55 /* Used in glibc only; expands to nothing unless already defined.  */
56 #ifndef attribute_hidden
57 #define attribute_hidden
58 #endif
59
/* 256-entry byte table, declared here and defined outside this file.
   The Alpha table-driven count_leading_zeros below indexes it.  */
60 extern const UQItype __clz_tab[256] attribute_hidden;
61 
62 /* Define auxiliary asm macros.
63 
64    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
65    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
66    word product in HIGH_PROD and LOW_PROD.
67 
68    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
69    UDWtype product.  This is just a variant of umul_ppmm.
70 
71    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
72    denominator) divides a UDWtype, composed by the UWtype integers
73    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
74    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
75    than DENOMINATOR for correct operation.  If, in addition, the most
76    significant bit of DENOMINATOR must be 1, then the pre-processor symbol
77    UDIV_NEEDS_NORMALIZATION is defined to 1.
78 
79    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
80    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
81    is rounded towards 0.
82 
83    5) count_leading_zeros(count, x) counts the number of zero-bits from the
84    msb to the first nonzero bit in the UWtype X.  This is the number of
85    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
86    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
87 
88    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
89    from the least significant end.
90 
91    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
92    high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
93    by HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
94    respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
95    (i.e. carry out) is not stored anywhere, and is lost.
96 
97    8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
98    high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
99    composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
100    LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
101    and LOW_DIFFERENCE.  Overflow (i.e. borrow out) is not stored anywhere,
102    and is lost.
103 
104    If any of these macros are left undefined for a particular CPU,
105    C macros are used.  */
106 
107 /* The CPUs come in alphabetical order below.
108 
109    Please add support for more CPUs here, or improve the current support
110    for the CPUs below!
111    (E.g. WE32100, IBM360.)  */
112 
113 #if defined (__GNUC__) && !defined (NO_ASM)
114 
115 /* We sometimes need to clobber "cc" with gcc2, but that would not be
116    understood by gcc1.  Use cpp to avoid major code duplication.
   __CLOBBER_CC supplies a complete clobber section (including the leading
   colon) and is placed directly after the last input operand.
   __AND_CLOBBER_CC supplies a leading comma instead, for appending "cc"
   to a clobber list that already has entries.  Both are empty for
   gcc1.  */
117 #if __GNUC__ < 2
118 #define __CLOBBER_CC
119 #define __AND_CLOBBER_CC
120 #else /* __GNUC__ >= 2 */
121 #define __CLOBBER_CC : "cc"
122 #define __AND_CLOBBER_CC , "cc"
123 #endif /* __GNUC__ < 2 */
124 
/* AArch64: bit counting is left to the compiler builtins.  The int-sized
   builtins are used for a 32-bit word and the long long builtins for a
   64-bit word; COUNT_LEADING_ZEROS_0 is set to the word size in each
   case.  Nothing is defined here for other word sizes.  */
125 #if defined (__aarch64__)
126
127 #if W_TYPE_SIZE == 32
128 #define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clz (X))
129 #define count_trailing_zeros(COUNT, X)   ((COUNT) = __builtin_ctz (X))
130 #define COUNT_LEADING_ZEROS_0 32
131 #endif /* W_TYPE_SIZE == 32 */
132
133 #if W_TYPE_SIZE == 64
134 #define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clzll (X))
135 #define count_trailing_zeros(COUNT, X)   ((COUNT) = __builtin_ctzll (X))
136 #define COUNT_LEADING_ZEROS_0 64
137 #endif /* W_TYPE_SIZE == 64 */
138
139 #endif /* __aarch64__ */
140 
/* Alpha, 64-bit word only.  */
141 #if defined (__alpha) && W_TYPE_SIZE == 64
142 /* There is a bug in g++ before version 5 that
143    errors on __builtin_alpha_umulh.  */
144 #if !defined(__cplusplus) || __GNUC__ >= 5
/* umul_ppmm: (ph) receives __builtin_alpha_umulh of the two operands and
   (pl) their ordinary UDItype product.  The operands are copied into
   locals first, so each macro argument is evaluated once.  */
145 #define umul_ppmm(ph, pl, m0, m1) \
146   do {									\
147     UDItype __m0 = (m0), __m1 = (m1);					\
148     (ph) = __builtin_alpha_umulh (__m0, __m1);				\
149     (pl) = __m0 * __m1;							\
150   } while (0)
151 #define UMUL_TIME 46
152 #endif /* !c++ */
/* udiv_qrnnd: delegates to the external function __udiv_qrnnd, which
   returns the quotient and stores the remainder through its first
   argument.  Not provided when LONGLONG_STANDALONE is defined.  */
153 #ifndef LONGLONG_STANDALONE
154 #define udiv_qrnnd(q, r, n1, n0, d) \
155   do { UDItype __r;							\
156     (q) = __udiv_qrnnd (&__r, (n1), (n0), (d));				\
157     (r) = __r;								\
158   } while (0)
159 extern UDItype __udiv_qrnnd (UDItype *, UDItype, UDItype, UDItype);
160 #define UDIV_TIME 220
161 #endif /* LONGLONG_STANDALONE */
/* Bit counting: with __alpha_cix__ the long builtins are used directly;
   otherwise the byte-oriented fallbacks below are used.  */
162 #ifdef __alpha_cix__
163 #define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clzl (X))
164 #define count_trailing_zeros(COUNT,X)	((COUNT) = __builtin_ctzl (X))
165 #define COUNT_LEADING_ZEROS_0 64
166 #else
/* Fallback count_leading_zeros.  The result of __builtin_alpha_cmpbge
   (0, X) is inverted and looked up in __clz_tab to choose a byte index
   __a; __builtin_alpha_extbl extracts that byte, and a second __clz_tab
   lookup plus __a*8 is subtracted from 64.
   NOTE(review): presumably cmpbge (0, X) flags the zero bytes of X, so
   __a is the index of the most significant nonzero byte -- confirm
   against the Alpha builtin documentation.  */
167 #define count_leading_zeros(COUNT,X) \
168   do {									\
169     UDItype __xr = (X), __t, __a;					\
170     __t = __builtin_alpha_cmpbge (0, __xr);				\
171     __a = __clz_tab[__t ^ 0xff] - 1;					\
172     __t = __builtin_alpha_extbl (__xr, __a);				\
173     (COUNT) = 64 - (__clz_tab[__t] + __a*8);				\
174   } while (0)
/* Fallback count_trailing_zeros.  "~__t & -~__t" keeps only the lowest
   set bit of ~__t; the 0xCC / 0xF0 / 0xAA tests then convert that
   single-bit value into its bit index (0..7), which selects the byte to
   extract.  After scaling the index by 8, the same lowest-set-bit and
   mask sequence is applied to the extracted byte to add the bit offset
   within it.  */
175 #define count_trailing_zeros(COUNT,X) \
176   do {									\
177     UDItype __xr = (X), __t, __a;					\
178     __t = __builtin_alpha_cmpbge (0, __xr);				\
179     __t = ~__t & -~__t;							\
180     __a = ((__t & 0xCC) != 0) * 2;					\
181     __a += ((__t & 0xF0) != 0) * 4;					\
182     __a += ((__t & 0xAA) != 0);						\
183     __t = __builtin_alpha_extbl (__xr, __a);				\
184     __a <<= 3;								\
185     __t &= -__t;							\
186     __a += ((__t & 0xCC) != 0) * 2;					\
187     __a += ((__t & 0xF0) != 0) * 4;					\
188     __a += ((__t & 0xAA) != 0);						\
189     (COUNT) = __a;							\
190   } while (0)
191 #endif /* __alpha_cix__ */
192 #endif /* __alpha */
193 
194 #if defined (__arc__) && W_TYPE_SIZE == 32
/* ARC add_ssaaaa: "add.f" forms the low word (%1) from AL and BL, then
   "adc" forms the high word (%0) from AH and BH.  The low result is
   early-clobber ("=&r") because it is written before the high-word inputs
   are read.  The "%" on AH and AL marks each addend pair as commutative.
   The condition codes are declared clobbered.  */
195 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
196   __asm__ ("add.f	%1, %4, %5\n\tadc	%0, %2, %3"		\
197 	   : "=r" ((USItype) (sh)),					\
198 	     "=&r" ((USItype) (sl))					\
199 	   : "%r" ((USItype) (ah)),					\
200 	     "rICal" ((USItype) (bh)),					\
201 	     "%r" ((USItype) (al)),					\
202 	     "rICal" ((USItype) (bl))					\
203 	   : "cc")
/* ARC sub_ddmmss: same shape as add_ssaaaa, using "sub.f" for the low
   word and "sbc" for the high word.  The operands are not marked
   commutative.  */
204 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
205   __asm__ ("sub.f	%1, %4, %5\n\tsbc	%0, %2, %3"		\
206 	   : "=r" ((USItype) (sh)),					\
207 	     "=&r" ((USItype) (sl))					\
208 	   : "r" ((USItype) (ah)),					\
209 	     "rICal" ((USItype) (bh)),					\
210 	     "r" ((USItype) (al)),					\
211 	     "rICal" ((USItype) (bl))					\
212 	   : "cc")
213 
/* ARC __umulsidi3: widening 32x32->64 multiply written in C.  Both
   arguments are parenthesized so that an expression argument such as
   "a + b" is converted and multiplied as a whole.  */
214 #define __umulsidi3(u,v) ((UDItype)(USItype)(u)*(USItype)(v))
/* ARC count_leading_zeros, only when __ARC_NORM__ is defined.  The asm
   runs "norm.f" on X and then conditionally ("mov.mi") replaces the
   result with -1; COUNT is that value plus one.
   NOTE(review): presumably the mov.mi path handles inputs with the top
   bit set, giving a count of 0 -- confirm against the ARC ISA manual.  */
215 #ifdef __ARC_NORM__
216 #define count_leading_zeros(count, x) \
217   do									\
218     {									\
219       SItype c_;							\
220 									\
221       __asm__ ("norm.f\t%0,%1\n\tmov.mi\t%0,-1" : "=r" (c_) : "r" (x) : "cc");\
222       (count) = c_ + 1;							\
223     }									\
224   while (0)
225 #define COUNT_LEADING_ZEROS_0 32
226 #endif /* __ARC_NORM__ */
227 #endif /* __arc__ */
228 
229 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
230  && W_TYPE_SIZE == 32
/* ARM (ARM state or Thumb-2 only), 32-bit word.
   add_ssaaaa: "adds" forms the low word and "adc" the high word.  The low
   result is early-clobber because it is written before the high-word
   inputs are read.  The clobber list comes from __CLOBBER_CC.  */
231 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
232   __asm__ ("adds	%1, %4, %5\n\tadc	%0, %2, %3"		\
233 	   : "=r" ((USItype) (sh)),					\
234 	     "=&r" ((USItype) (sl))					\
235 	   : "%r" ((USItype) (ah)),					\
236 	     "rI" ((USItype) (bh)),					\
237 	     "%r" ((USItype) (al)),					\
238 	     "rI" ((USItype) (bl)) __CLOBBER_CC)
/* sub_ddmmss: "subs" for the low word, "sbc" for the high word.  */
239 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
240   __asm__ ("subs	%1, %4, %5\n\tsbc	%0, %2, %3"		\
241 	   : "=r" ((USItype) (sh)),					\
242 	     "=&r" ((USItype) (sl))					\
243 	   : "r" ((USItype) (ah)),					\
244 	     "rI" ((USItype) (bh)),					\
245 	     "r" ((USItype) (al)),					\
246 	     "rI" ((USItype) (bl)) __CLOBBER_CC)
/* umul_ppmm for architecture versions 2, 2A and 3: each operand is split
   into 16-bit halves (lsr #16 / bic), the four partial products are
   formed with "mul", and the cross terms are folded in with
   adds/addcs/adc.  __t0..__t2 are scratch registers.  */
247 # if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \
248      || defined(__ARM_ARCH_3__)
249 #  define umul_ppmm(xh, xl, a, b)					\
250   do {									\
251     register USItype __t0, __t1, __t2;					\
252     __asm__ ("%@ Inlined umul_ppmm\n"					\
253 	   "	mov	%2, %5, lsr #16\n"				\
254 	   "	mov	%0, %6, lsr #16\n"				\
255 	   "	bic	%3, %5, %2, lsl #16\n"				\
256 	   "	bic	%4, %6, %0, lsl #16\n"				\
257 	   "	mul	%1, %3, %4\n"					\
258 	   "	mul	%4, %2, %4\n"					\
259 	   "	mul	%3, %0, %3\n"					\
260 	   "	mul	%0, %2, %0\n"					\
261 	   "	adds	%3, %4, %3\n"					\
262 	   "	addcs	%0, %0, #65536\n"				\
263 	   "	adds	%1, %1, %3, lsl #16\n"				\
264 	   "	adc	%0, %0, %3, lsr #16"				\
265 	   : "=&r" ((USItype) (xh)),					\
266 	     "=r" ((USItype) (xl)),					\
267 	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
268 	   : "r" ((USItype) (a)),					\
269 	     "r" ((USItype) (b)) __CLOBBER_CC );			\
270   } while (0)
271 #  define UMUL_TIME 20
272 # else
/* umul_ppmm for all other ARM variants: a plain C 32x32->64 multiply
   whose halves are stored to (xl) and (xh).  */
273 #  define umul_ppmm(xh, xl, a, b)					\
274   do {									\
275     /* Generate umull, under compiler control.  */			\
276     register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b);	\
277     (xl) = (USItype)__t0;						\
278     (xh) = (USItype)(__t0 >> 32);					\
279   } while (0)
280 #  define UMUL_TIME 3
281 # endif
282 # define UDIV_TIME 100
283 #endif /* __arm__ */
284 
/* Bit counting for every ARM configuration.  Unlike the arithmetic
   macros for ARM, this block is not conditional on Thumb mode or on
   W_TYPE_SIZE.  */
285 #if defined(__arm__)
286 /* Let gcc decide how best to implement count_leading_zeros.  */
287 #define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
288 #define count_trailing_zeros(COUNT,X)   ((COUNT) = __builtin_ctz (X))
289 #define COUNT_LEADING_ZEROS_0 32
290 #endif
291 
/* AVR: bit counting through compiler builtins, with the builtin chosen
   by word size: the int variants for 16 bits, the long variants for 32
   bits and the long long variants for 64 bits.  COUNT_LEADING_ZEROS_0 is
   set to the word size in each case.  */
292 #if defined (__AVR__)
293
294 #if W_TYPE_SIZE == 16
295 #define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clz (X))
296 #define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctz (X))
297 #define COUNT_LEADING_ZEROS_0 16
298 #endif /* W_TYPE_SIZE == 16 */
299
300 #if W_TYPE_SIZE == 32
301 #define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clzl (X))
302 #define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctzl (X))
303 #define COUNT_LEADING_ZEROS_0 32
304 #endif /* W_TYPE_SIZE == 32 */
305
306 #if W_TYPE_SIZE == 64
307 #define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clzll (X))
308 #define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctzll (X))
309 #define COUNT_LEADING_ZEROS_0 64
310 #endif /* W_TYPE_SIZE == 64 */
311
312 #endif /* defined (__AVR__) */
313 
/* CRIS.  Which macros are provided depends on __CRIS_arch_version:
   count_leading_zeros from version 3, count_trailing_zeros from version
   8, and an inline C __umulsidi3 from version 10.  Older versions declare
   __umulsidi3 as an external function instead.  */
314 #if defined (__CRIS__)
315
316 #if __CRIS_arch_version >= 3
317 #define count_leading_zeros(COUNT, X) ((COUNT) = __builtin_clz (X))
318 #define COUNT_LEADING_ZEROS_0 32
319 #endif /* __CRIS_arch_version >= 3 */
320
321 #if __CRIS_arch_version >= 8
322 #define count_trailing_zeros(COUNT, X) ((COUNT) = __builtin_ctz (X))
323 #endif /* __CRIS_arch_version >= 8 */
324
325 #if __CRIS_arch_version >= 10
326 #define __umulsidi3(u,v) ((UDItype)(USItype) (u) * (UDItype)(USItype) (v))
327 #else
/* The self-referential define makes "#ifdef __umulsidi3" tests succeed
   while calls still resolve to the external function.  */
328 #define __umulsidi3 __umulsidi3
329 extern UDItype __umulsidi3 (USItype, USItype);
330 #endif /* __CRIS_arch_version >= 10 */
331
/* umul_ppmm: computes the 64-bit product with __umulsidi3 and splits it
   into its low (w0) and high (w1) 32-bit halves.  */
332 #define umul_ppmm(w1, w0, u, v)		\
333   do {					\
334     UDItype __x = __umulsidi3 (u, v);	\
335     (w0) = (USItype) (__x);		\
336     (w1) = (USItype) (__x >> 32);	\
337   } while (0)
338
339 /* FIXME: defining add_ssaaaa and sub_ddmmss should be advantageous for
340    DFmode ("double" intrinsics, avoiding two of the three insns handling
341    carry), but defining them as open-code C composing and doing the
342    operation in DImode (UDImode) shows that the DImode needs work:
343    register pressure from requiring neighboring registers and the
344    traffic to and from them come to dominate, in the 4.7 series.  */
345
346 #endif /* defined (__CRIS__) */
347 
/* HP PA-RISC, 32-bit word.  */
348 #if defined (__hppa) && W_TYPE_SIZE == 32
/* add_ssaaaa: "add" forms the low word (%1), then "addc" forms the high
   word (%0).  The low result is early-clobber because it is written
   before the high-word inputs are read.  No clobbers are declared.  */
349 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
350   __asm__ ("add %4,%5,%1\n\taddc %2,%3,%0"				\
351 	   : "=r" ((USItype) (sh)),					\
352 	     "=&r" ((USItype) (sl))					\
353 	   : "%rM" ((USItype) (ah)),					\
354 	     "rM" ((USItype) (bh)),					\
355 	     "%rM" ((USItype) (al)),					\
356 	     "rM" ((USItype) (bl)))
/* sub_ddmmss: "sub" for the low word, "subb" for the high word.  */
357 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
358   __asm__ ("sub %4,%5,%1\n\tsubb %2,%3,%0"				\
359 	   : "=r" ((USItype) (sh)),					\
360 	     "=&r" ((USItype) (sl))					\
361 	   : "rM" ((USItype) (ah)),					\
362 	     "rM" ((USItype) (bh)),					\
363 	     "rM" ((USItype) (al)),					\
364 	     "rM" ((USItype) (bl)))
/* umul_ppmm, only with _PA_RISC1_1: "xmpyu" writes a 64-bit result into
   the UDItype member of a union, with all operands in "x"-class
   registers.  The two 32-bit halves are then read back through the
   struct view, whose first member is taken as the high word.  */
365 #if defined (_PA_RISC1_1)
366 #define umul_ppmm(w1, w0, u, v) \
367   do {									\
368     union								\
369       {									\
370 	UDItype __f;							\
371 	struct {USItype __w1, __w0;} __w1w0;				\
372       } __t;								\
373     __asm__ ("xmpyu %1,%2,%0"						\
374 	     : "=x" (__t.__f)						\
375 	     : "x" ((USItype) (u)),					\
376 	       "x" ((USItype) (v)));					\
377     (w1) = __t.__w1w0.__w1;						\
378     (w0) = __t.__w1w0.__w0;						\
379      } while (0)
380 #define UMUL_TIME 8
381 #else
382 #define UMUL_TIME 30
383 #endif
384 #define UDIV_TIME 40
/* count_leading_zeros: the count starts at 1 and the asm tests
   successively narrower fields (16, 8, 4, then 2 bits), as described by
   the comments inside the asm text.  The final two instructions extract
   one more bit and subtract it from the count.  X is tied to the scratch
   output __tmp (constraint "1"), so the asm shifts a copy and not the
   caller's operand.  */
385 #define count_leading_zeros(count, x) \
386   do {									\
387     USItype __tmp;							\
388     __asm__ (								\
389        "ldi		1,%0\n"						\
390 "	extru,=		%1,15,16,%%r0		; Bits 31..16 zero?\n"	\
391 "	extru,tr	%1,15,16,%1		; No.  Shift down, skip add.\n"\
392 "	ldo		16(%0),%0		; Yes.  Perform add.\n"	\
393 "	extru,=		%1,23,8,%%r0		; Bits 15..8 zero?\n"	\
394 "	extru,tr	%1,23,8,%1		; No.  Shift down, skip add.\n"\
395 "	ldo		8(%0),%0		; Yes.  Perform add.\n"	\
396 "	extru,=		%1,27,4,%%r0		; Bits 7..4 zero?\n"	\
397 "	extru,tr	%1,27,4,%1		; No.  Shift down, skip add.\n"\
398 "	ldo		4(%0),%0		; Yes.  Perform add.\n"	\
399 "	extru,=		%1,29,2,%%r0		; Bits 3..2 zero?\n"	\
400 "	extru,tr	%1,29,2,%1		; No.  Shift down, skip add.\n"\
401 "	ldo		2(%0),%0		; Yes.  Perform add.\n"	\
402 "	extru		%1,30,1,%1		; Extract bit 1.\n"	\
403 "	sub		%0,%1,%0		; Subtract it.\n"	\
404 	: "=r" (count), "=r" (__tmp) : "1" (x));			\
405   } while (0)
406 #endif
407 
/* i370 / s390 / MVS, 32-bit word.  Only the signed operations smul_ppmm
   and sdiv_qrnnd are provided, in two variants selected by __zarch__.  */
408 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
409 #if !defined (__zarch__)
/* Without __zarch__: the asm works on a DItype operand viewed through a
   union; the struct view lists the high word (__h) before the low word
   (__l).  smul_ppmm copies M0 into %N0 of the result and then issues
   "mr" with M1.  */
410 #define smul_ppmm(xh, xl, m0, m1) \
411   do {									\
412     union {DItype __ll;							\
413 	   struct {USItype __h, __l;} __i;				\
414 	  } __x;							\
415     __asm__ ("lr %N0,%1\n\tmr %0,%2"					\
416 	     : "=&r" (__x.__ll)						\
417 	     : "r" (m0), "r" (m1));					\
418     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
419   } while (0)
/* sdiv_qrnnd: loads N1:N0 into the union, issues "dr", then reads the
   quotient from the low word and the remainder from the high word.  */
420 #define sdiv_qrnnd(q, r, n1, n0, d) \
421   do {									\
422     union {DItype __ll;							\
423 	   struct {USItype __h, __l;} __i;				\
424 	  } __x;							\
425     __x.__i.__h = n1; __x.__i.__l = n0;					\
426     __asm__ ("dr %0,%2"							\
427 	     : "=r" (__x.__ll)						\
428 	     : "0" (__x.__ll), "r" (d));				\
429     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
430   } while (0)
431 #else
/* With __zarch__: the operands are pinned to registers 0 and 1 with
   explicit register variables, and the asm names %r0 directly.
   smul_ppmm places M0 in register 1 and reads the high word from register
   0 and the low word from register 1 afterwards.  */
432 #define smul_ppmm(xh, xl, m0, m1) \
433   do {                                                                  \
434     register SItype __r0 __asm__ ("0");					\
435     register SItype __r1 __asm__ ("1") = (m0);				\
436 									\
437     __asm__ ("mr\t%%r0,%3"                                              \
438 	     : "=r" (__r0), "=r" (__r1)					\
439 	     : "r"  (__r1),  "r" (m1));					\
440     (xh) = __r0; (xl) = __r1;						\
441   } while (0)
442
/* sdiv_qrnnd: N1 goes in register 0 and N0 in register 1; after "dr" the
   quotient is read from register 1 and the remainder from register 0.  */
443 #define sdiv_qrnnd(q, r, n1, n0, d) \
444   do {									\
445     register SItype __r0 __asm__ ("0") = (n1);				\
446     register SItype __r1 __asm__ ("1") = (n0);				\
447 									\
448     __asm__ ("dr\t%%r0,%4"                                              \
449 	     : "=r" (__r0), "=r" (__r1)					\
450 	     : "r" (__r0), "r" (__r1), "r" (d));			\
451     (q) = __r1; (r) = __r0;						\
452   } while (0)
453 #endif /* __zarch__ */
454 #endif
455 
/* 80x86, 32-bit word.  The "{l}" and "{a|b}" groups in the templates are
   GCC assembler-dialect alternatives: the first choice is emitted for
   AT&T syntax and the second for Intel syntax.  */
456 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
/* add_ssaaaa: AH and AL are tied to the two outputs ("%0", "%1"), so the
   "add" / "adc" pair accumulates BL and BH into them in place.  */
457 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
458   __asm__ ("add{l} {%5,%1|%1,%5}\n\tadc{l} {%3,%0|%0,%3}"		\
459 	   : "=r" ((USItype) (sh)),					\
460 	     "=&r" ((USItype) (sl))					\
461 	   : "%0" ((USItype) (ah)),					\
462 	     "g" ((USItype) (bh)),					\
463 	     "%1" ((USItype) (al)),					\
464 	     "g" ((USItype) (bl)))
/* sub_ddmmss: same in-place scheme with "sub" / "sbb"; the minuend words
   are tied to the outputs without the commutative marker.  */
465 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
466   __asm__ ("sub{l} {%5,%1|%1,%5}\n\tsbb{l} {%3,%0|%0,%3}"		\
467 	   : "=r" ((USItype) (sh)),					\
468 	     "=&r" ((USItype) (sl))					\
469 	   : "0" ((USItype) (ah)),					\
470 	     "g" ((USItype) (bh)),					\
471 	     "1" ((USItype) (al)),					\
472 	     "g" ((USItype) (bl)))
/* umul_ppmm: "mul" with U tied to the "a" register; the low product word
   is returned in "a" (w0) and the high word in "d" (w1).  */
473 #define umul_ppmm(w1, w0, u, v) \
474   __asm__ ("mul{l} %3"							\
475 	   : "=a" ((USItype) (w0)),					\
476 	     "=d" ((USItype) (w1))					\
477 	   : "%0" ((USItype) (u)),					\
478 	     "rm" ((USItype) (v)))
/* udiv_qrnnd: "div" with N0 in "a" and N1 in "d"; the quotient comes back
   in "a" and the remainder in "d".  */
479 #define udiv_qrnnd(q, r, n1, n0, dv) \
480   __asm__ ("div{l} %4"							\
481 	   : "=a" ((USItype) (q)),					\
482 	     "=d" ((USItype) (r))					\
483 	   : "0" ((USItype) (n0)),					\
484 	     "1" ((USItype) (n1)),					\
485 	     "rm" ((USItype) (dv)))
/* Bit counting via builtins.  COUNT_LEADING_ZEROS_0 is not defined in
   this block.  */
486 #define count_leading_zeros(count, x)	((count) = __builtin_clz (x))
487 #define count_trailing_zeros(count, x)	((count) = __builtin_ctz (x))
488 #define UMUL_TIME 40
489 #define UDIV_TIME 40
490 #endif /* 80x86 */
491 
/* x86-64, 64-bit word.  These mirror the 80x86 macros with "q"-suffixed
   instructions and UDItype operands.  The "{a|b}" groups are GCC
   assembler-dialect alternatives (AT&T first, Intel second).  */
492 #if defined (__x86_64__) && W_TYPE_SIZE == 64
/* add_ssaaaa: AH and AL are tied to the outputs; "add" / "adc" accumulate
   BL and BH into them in place.  */
493 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
494   __asm__ ("add{q} {%5,%1|%1,%5}\n\tadc{q} {%3,%0|%0,%3}"		\
495 	   : "=r" ((UDItype) (sh)),					\
496 	     "=&r" ((UDItype) (sl))					\
497 	   : "%0" ((UDItype) (ah)),					\
498 	     "rme" ((UDItype) (bh)),					\
499 	     "%1" ((UDItype) (al)),					\
500 	     "rme" ((UDItype) (bl)))
/* sub_ddmmss: in-place "sub" / "sbb" on the tied minuend words.  */
501 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
502   __asm__ ("sub{q} {%5,%1|%1,%5}\n\tsbb{q} {%3,%0|%0,%3}"		\
503 	   : "=r" ((UDItype) (sh)),					\
504 	     "=&r" ((UDItype) (sl))					\
505 	   : "0" ((UDItype) (ah)),					\
506 	     "rme" ((UDItype) (bh)),					\
507 	     "1" ((UDItype) (al)),					\
508 	     "rme" ((UDItype) (bl)))
/* umul_ppmm: "mul" with U tied to "a"; low word returned in "a" (w0),
   high word in "d" (w1).  */
509 #define umul_ppmm(w1, w0, u, v) \
510   __asm__ ("mul{q} %3"							\
511 	   : "=a" ((UDItype) (w0)),					\
512 	     "=d" ((UDItype) (w1))					\
513 	   : "%0" ((UDItype) (u)),					\
514 	     "rm" ((UDItype) (v)))
/* udiv_qrnnd: "div" with N0 in "a" and N1 in "d"; quotient returned in
   "a", remainder in "d".  */
515 #define udiv_qrnnd(q, r, n1, n0, dv) \
516   __asm__ ("div{q} %4"							\
517 	   : "=a" ((UDItype) (q)),					\
518 	     "=d" ((UDItype) (r))					\
519 	   : "0" ((UDItype) (n0)),					\
520 	     "1" ((UDItype) (n1)),					\
521 	     "rm" ((UDItype) (dv)))
/* Bit counting via the long long builtins.  COUNT_LEADING_ZEROS_0 is not
   defined in this block.  */
522 #define count_leading_zeros(count, x)	((count) = __builtin_clzll (x))
523 #define count_trailing_zeros(count, x)	((count) = __builtin_ctzll (x))
524 #define UMUL_TIME 40
525 #define UDIV_TIME 40
526 #endif /* x86_64 */
527 
/* Intel i960, 32-bit word.  Both macros are GNU statement expressions
   built around the "emul" instruction.  */
528 #if defined (__i960__) && W_TYPE_SIZE == 32
/* umul_ppmm: "emul" writes a 64-bit result into the UDItype member of a
   union; the halves are read back through the struct view, which lists
   the low word (__l) before the high word (__h).  */
529 #define umul_ppmm(w1, w0, u, v) \
530   ({union {UDItype __ll;						\
531 	   struct {USItype __l, __h;} __i;				\
532 	  } __xx;							\
533   __asm__ ("emul	%2,%1,%0"					\
534 	   : "=d" (__xx.__ll)						\
535 	   : "%dI" ((USItype) (u)),					\
536 	     "dI" ((USItype) (v)));					\
537   (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;})
/* __umulsidi3: same instruction, yielding the whole UDItype product as
   the value of the statement expression.  */
538 #define __umulsidi3(u, v) \
539   ({UDItype __w;							\
540     __asm__ ("emul	%2,%1,%0"					\
541 	     : "=d" (__w)						\
542 	     : "%dI" ((USItype) (u)),					\
543 	       "dI" ((USItype) (v)));					\
544     __w; })
545 #endif /* __i960__ */
546 
/* IA-64, 64-bit word.  */
547 #if defined (__ia64) && W_TYPE_SIZE == 64
548 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
549    "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
550    code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
551    register, which takes an extra cycle.
   The low difference is held in a local and stored to (sl) last, after
   (sh) has been computed from the original operands.  Note that (al) and
   (bl) are each evaluated twice, so arguments should be free of side
   effects.  */
552 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
553   do {									\
554     UWtype __x;								\
555     __x = (al) - (bl);							\
556     if ((al) < (bl))							\
557       (sh) = (ah) - (bh) - 1;						\
558     else								\
559       (sh) = (ah) - (bh);						\
560     (sl) = __x;								\
561   } while (0)
562
563 /* Do both product parts in assembly, since that gives better code with
564    all gcc versions.  Some callers will just use the upper part, and in
565    that situation we waste an instruction, but not any cycles.
   "xma.hu" produces the high word and "xma.l" the low word; all operands
   use the "f" constraint.  The high result is early-clobber because it
   is written before the second instruction reads the inputs.  */
566 #define umul_ppmm(ph, pl, m0, m1)					\
567   __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
568 	   : "=&f" (ph), "=f" (pl)					\
569 	   : "f" (m0), "f" (m1))
/* count_leading_zeros: two asm steps ("mux1 ... @rev" then "czx1.l")
   derive _a, from which a shift amount _c that is a multiple of 8 is
   formed.  X is shifted right by _c and the remaining value is narrowed
   in steps of 4, 2 and 1 bits, accumulating into _c.  The result is
   W_TYPE_SIZE - 1 - _c.
   NOTE(review): presumably the asm pair locates the most significant
   nonzero byte of X -- confirm against the IA-64 instruction
   reference.  */
570 #define count_leading_zeros(count, x)					\
571   do {									\
572     UWtype _x = (x), _y, _a, _c;					\
573     __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
574     __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
575     _c = (_a - 1) << 3;							\
576     _x >>= _c;								\
577     if (_x >= 1 << 4)							\
578       _x >>= 4, _c += 4;						\
579     if (_x >= 1 << 2)							\
580       _x >>= 2, _c += 2;						\
581     _c += _x >> 1;							\
582     (count) =  W_TYPE_SIZE - 1 - _c;					\
583   } while (0)
584 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
585    based, and we don't need a special case for x==0 here.
   (x - 1) & ~x has a one in exactly the positions of the trailing zero
   bits of x, so "popcnt" of that value is the trailing-zero count.  */
586 #define count_trailing_zeros(count, x)					\
587   do {									\
588     UWtype __ctz_x = (x);						\
589     __asm__ ("popcnt %0 = %1"						\
590 	     : "=r" (count)						\
591 	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
592   } while (0)
593 #define UMUL_TIME 14
594 #endif
595 
/* M32R, 32-bit word.  Both macros tie AH and AL to the outputs and work
   in place: the initial "cmp" clears the condition bit (per the comment
   in each macro), then two "addx" / "subx" instructions process the low
   and high words in turn.  "cbit" is declared clobbered.  */
596 #if defined (__M32R__) && W_TYPE_SIZE == 32
597 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
598   /* The cmp clears the condition bit.  */ \
599   __asm__ ("cmp %0,%0\n\taddx %1,%5\n\taddx %0,%3"			\
600 	   : "=r" ((USItype) (sh)),					\
601 	     "=&r" ((USItype) (sl))					\
602 	   : "0" ((USItype) (ah)),					\
603 	     "r" ((USItype) (bh)),					\
604 	     "1" ((USItype) (al)),					\
605 	     "r" ((USItype) (bl))					\
606 	   : "cbit")
607 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
608   /* The cmp clears the condition bit.  */ \
609   __asm__ ("cmp %0,%0\n\tsubx %1,%5\n\tsubx %0,%3"			\
610 	   : "=r" ((USItype) (sh)),					\
611 	     "=&r" ((USItype) (sl))					\
612 	   : "0" ((USItype) (ah)),					\
613 	     "r" ((USItype) (bh)),					\
614 	     "1" ((USItype) (al)),					\
615 	     "r" ((USItype) (bl))					\
616 	   : "cbit")
617 #endif /* __M32R__ */
618 
619 #if defined (__mc68000__) && W_TYPE_SIZE == 32
/* m68k add_ssaaaa: AH and AL are tied to the data-register outputs;
   "add" accumulates BL into the low word and "addx" accumulates BH into
   the high word.  */
620 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
621   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
622 	   : "=d" ((USItype) (sh)),					\
623 	     "=&d" ((USItype) (sl))					\
624 	   : "%0" ((USItype) (ah)),					\
625 	     "d" ((USItype) (bh)),					\
626 	     "%1" ((USItype) (al)),					\
627 	     "g" ((USItype) (bl)))
/* m68k sub_ddmmss: same in-place scheme with "sub" / "subx".  */
628 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
629   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
630 	   : "=d" ((USItype) (sh)),					\
631 	     "=&d" ((USItype) (sl))					\
632 	   : "0" ((USItype) (ah)),					\
633 	     "d" ((USItype) (bh)),					\
634 	     "1" ((USItype) (al)),					\
635 	     "g" ((USItype) (bl)))
636 
637 /* The '020, '030, '040, '060 and CPU32 have 32x32->64 and 64/32->32q-32r.
   The first branch below is taken only when __mc68020__ is defined and
   __mc68060__ is not; it uses the long multiply and divide instructions
   directly.  */
638 #if (defined (__mc68020__) && !defined (__mc68060__))
639 #define umul_ppmm(w1, w0, u, v) \
640   __asm__ ("mulu%.l %3,%1:%0"						\
641 	   : "=d" ((USItype) (w0)),					\
642 	     "=d" ((USItype) (w1))					\
643 	   : "%0" ((USItype) (u)),					\
644 	     "dmi" ((USItype) (v)))
645 #define UMUL_TIME 45
/* udiv_qrnnd / sdiv_qrnnd: N0 is tied to the quotient register and N1 to
   the remainder register.  */
646 #define udiv_qrnnd(q, r, n1, n0, d) \
647   __asm__ ("divu%.l %4,%1:%0"						\
648 	   : "=d" ((USItype) (q)),					\
649 	     "=d" ((USItype) (r))					\
650 	   : "0" ((USItype) (n0)),					\
651 	     "1" ((USItype) (n1)),					\
652 	     "dmi" ((USItype) (d)))
653 #define UDIV_TIME 90
654 #define sdiv_qrnnd(q, r, n1, n0, d) \
655   __asm__ ("divs%.l %4,%1:%0"						\
656 	   : "=d" ((USItype) (q)),					\
657 	     "=d" ((USItype) (r))					\
658 	   : "0" ((USItype) (n0)),					\
659 	     "1" ((USItype) (n1)),					\
660 	     "dmi" ((USItype) (d)))
661
662 #elif defined (__mcoldfire__) /* not mc68020 */
663
/* ColdFire umul_ppmm: the product is assembled from four 16x16 "mulu"
   partial products using the fixed scratch registers d0-d4, which are
   listed as clobbers.  This differs from the generic m68k variant below
   in one instruction: "clr%.w" is used here where that variant uses
   "eor%.w".  */
664 #define umul_ppmm(xh, xl, a, b) \
665   __asm__ ("| Inlined umul_ppmm\n"					\
666 	   "	move%.l	%2,%/d0\n"					\
667 	   "	move%.l	%3,%/d1\n"					\
668 	   "	move%.l	%/d0,%/d2\n"					\
669 	   "	swap	%/d0\n"						\
670 	   "	move%.l	%/d1,%/d3\n"					\
671 	   "	swap	%/d1\n"						\
672 	   "	move%.w	%/d2,%/d4\n"					\
673 	   "	mulu	%/d3,%/d4\n"					\
674 	   "	mulu	%/d1,%/d2\n"					\
675 	   "	mulu	%/d0,%/d3\n"					\
676 	   "	mulu	%/d0,%/d1\n"					\
677 	   "	move%.l	%/d4,%/d0\n"					\
678 	   "	clr%.w	%/d0\n"						\
679 	   "	swap	%/d0\n"						\
680 	   "	add%.l	%/d0,%/d2\n"					\
681 	   "	add%.l	%/d3,%/d2\n"					\
682 	   "	jcc	1f\n"						\
683 	   "	add%.l	%#65536,%/d1\n"					\
684 	   "1:	swap	%/d2\n"						\
685 	   "	moveq	%#0,%/d0\n"					\
686 	   "	move%.w	%/d2,%/d0\n"					\
687 	   "	move%.w	%/d4,%/d2\n"					\
688 	   "	move%.l	%/d2,%1\n"					\
689 	   "	add%.l	%/d1,%/d0\n"					\
690 	   "	move%.l	%/d0,%0"					\
691 	   : "=g" ((USItype) (xh)),					\
692 	     "=g" ((USItype) (xl))					\
693 	   : "g" ((USItype) (a)),					\
694 	     "g" ((USItype) (b))					\
695 	   : "d0", "d1", "d2", "d3", "d4")
696 #define UMUL_TIME 100
697 #define UDIV_TIME 400
698 #else /* not ColdFire */
699 /* %/ inserts REGISTER_PREFIX, %# inserts IMMEDIATE_PREFIX.
   Generic m68k umul_ppmm: four 16x16 "mulu" partial products combined in
   the fixed scratch registers d0-d4, which are listed as clobbers.  */
700 #define umul_ppmm(xh, xl, a, b) \
701   __asm__ ("| Inlined umul_ppmm\n"					\
702 	   "	move%.l	%2,%/d0\n"					\
703 	   "	move%.l	%3,%/d1\n"					\
704 	   "	move%.l	%/d0,%/d2\n"					\
705 	   "	swap	%/d0\n"						\
706 	   "	move%.l	%/d1,%/d3\n"					\
707 	   "	swap	%/d1\n"						\
708 	   "	move%.w	%/d2,%/d4\n"					\
709 	   "	mulu	%/d3,%/d4\n"					\
710 	   "	mulu	%/d1,%/d2\n"					\
711 	   "	mulu	%/d0,%/d3\n"					\
712 	   "	mulu	%/d0,%/d1\n"					\
713 	   "	move%.l	%/d4,%/d0\n"					\
714 	   "	eor%.w	%/d0,%/d0\n"					\
715 	   "	swap	%/d0\n"						\
716 	   "	add%.l	%/d0,%/d2\n"					\
717 	   "	add%.l	%/d3,%/d2\n"					\
718 	   "	jcc	1f\n"						\
719 	   "	add%.l	%#65536,%/d1\n"					\
720 	   "1:	swap	%/d2\n"						\
721 	   "	moveq	%#0,%/d0\n"					\
722 	   "	move%.w	%/d2,%/d0\n"					\
723 	   "	move%.w	%/d4,%/d2\n"					\
724 	   "	move%.l	%/d2,%1\n"					\
725 	   "	add%.l	%/d1,%/d0\n"					\
726 	   "	move%.l	%/d0,%0"					\
727 	   : "=g" ((USItype) (xh)),					\
728 	     "=g" ((USItype) (xl))					\
729 	   : "g" ((USItype) (a)),					\
730 	     "g" ((USItype) (b))					\
731 	   : "d0", "d1", "d2", "d3", "d4")
732 #define UMUL_TIME 100
733 #define UDIV_TIME 400
734
735 #endif /* not mc68020 */
736 
737 /* The '020, '030, '040 and '060 have bitfield insns.
738    cpu32 disguises as a 68020, but lacks them.
   The "bfffo" variant passes the constant 0 as operand 2 and uses it for
   both bitfield parameters.  COUNT_LEADING_ZEROS_0 is defined only for
   the builtin-based ColdFire variant.  */
739 #if defined (__mc68020__) && !defined (__mcpu32__)
740 #define count_leading_zeros(count, x) \
741   __asm__ ("bfffo %1{%b2:%b2},%0"					\
742 	   : "=d" ((USItype) (count))					\
743 	   : "od" ((USItype) (x)), "n" (0))
744 /* Some ColdFire architectures have a ff1 instruction supported via
745    __builtin_clz. */
746 #elif defined (__mcfisaaplus__) || defined (__mcfisac__)
747 #define count_leading_zeros(count,x) ((count) = __builtin_clz (x))
748 #define COUNT_LEADING_ZEROS_0 32
749 #endif
750 #endif /* mc68000 */
751 
/* Motorola 88000, 32-bit word.  */
752 #if defined (__m88000__) && W_TYPE_SIZE == 32
/* add_ssaaaa: "addu.co" forms the low word, then "addu.ci" forms the high
   word.  The low result is early-clobber because it is written before
   the high-word inputs are read.  */
753 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
754   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
755 	   : "=r" ((USItype) (sh)),					\
756 	     "=&r" ((USItype) (sl))					\
757 	   : "%rJ" ((USItype) (ah)),					\
758 	     "rJ" ((USItype) (bh)),					\
759 	     "%rJ" ((USItype) (al)),					\
760 	     "rJ" ((USItype) (bl)))
/* sub_ddmmss: "subu.co" for the low word, "subu.ci" for the high word.  */
761 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
762   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
763 	   : "=r" ((USItype) (sh)),					\
764 	     "=&r" ((USItype) (sl))					\
765 	   : "rJ" ((USItype) (ah)),					\
766 	     "rJ" ((USItype) (bh)),					\
767 	     "rJ" ((USItype) (al)),					\
768 	     "rJ" ((USItype) (bl)))
/* count_leading_zeros: the result of "ff1" is XORed with 31 to give the
   count.  COUNT_LEADING_ZEROS_0 is deliberately 63 here (see "sic").  */
769 #define count_leading_zeros(count, x) \
770   do {									\
771     USItype __cbtmp;							\
772     __asm__ ("ff1 %0,%1"						\
773 	     : "=r" (__cbtmp)						\
774 	     : "r" ((USItype) (x)));					\
775     (count) = __cbtmp ^ 31;						\
776   } while (0)
777 #define COUNT_LEADING_ZEROS_0 63 /* sic */
778 #if defined (__mc88110__)
/* umul_ppmm (88110 only): "mulu.d" writes a 64-bit result into a union
   whose struct view lists the high word (__h) before the low word
   (__l).  */
779 #define umul_ppmm(wh, wl, u, v) \
780   do {									\
781     union {UDItype __ll;						\
782 	   struct {USItype __h, __l;} __i;				\
783 	  } __xx;							\
784     __asm__ ("mulu.d	%0,%1,%2"					\
785 	     : "=r" (__xx.__ll)						\
786 	     : "r" ((USItype) (u)),					\
787 	       "r" ((USItype) (v)));					\
788     (wh) = __xx.__i.__h;						\
789     (wl) = __xx.__i.__l;						\
790   } while (0)
/* udiv_qrnnd (88110 only): "divu.d" yields only the quotient; the
   remainder is reconstructed in C as N0 - quotient * D.  Note that (n0)
   and (d) are each evaluated twice, so arguments should be free of side
   effects.  */
791 #define udiv_qrnnd(q, r, n1, n0, d) \
792   ({union {UDItype __ll;						\
793 	   struct {USItype __h, __l;} __i;				\
794 	  } __xx;							\
795   USItype __q;								\
796   __xx.__i.__h = (n1); __xx.__i.__l = (n0);				\
797   __asm__ ("divu.d %0,%1,%2"						\
798 	   : "=r" (__q)							\
799 	   : "r" (__xx.__ll),						\
800 	     "r" ((USItype) (d)));					\
801   (r) = (n0) - __q * (d); (q) = __q; })
802 #define UMUL_TIME 5
803 #define UDIV_TIME 25
804 #else
805 #define UMUL_TIME 17
806 #define UDIV_TIME 150
807 #endif /* __mc88110__ */
808 #endif /* __m88000__ */
809 
810 #if defined (__mn10300__)
811 # if defined (__AM33__)
812 #  define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
813 #  define umul_ppmm(w1, w0, u, v)		\
814     asm("mulu %3,%2,%1,%0" : "=r"(w0), "=r"(w1) : "r"(u), "r"(v))
815 #  define smul_ppmm(w1, w0, u, v)		\
816     asm("mul %3,%2,%1,%0" : "=r"(w0), "=r"(w1) : "r"(u), "r"(v))
817 # else
/* Unsigned ("mulu") and signed ("mul") multiplication of U by V using
   the two-operand multiply.  U is tied to the W0 output register
   ("%0", marked commutative with V); W1 is taken from the "z" register
   class.  Two nops are emitted before the multiply; the reason is not
   visible in this file (NOTE(review): likely a pipeline requirement --
   confirm).  */
#  define umul_ppmm(w1, w0, u, v)		\
    asm("nop; nop; mulu %3,%0" : "=d"(w0), "=z"(w1) : "%0"(u), "d"(v))
#  define smul_ppmm(w1, w0, u, v)		\
    asm("nop; nop; mul %3,%0" : "=d"(w0), "=z"(w1) : "%0"(u), "d"(v))
822 # endif
/* Two-word addition (SH:SL) = (AH:AL) + (BH:BL).  Both operands are
   packed into DWunion temporaries and added as one double-word value;
   the sum is then split back into its low and high words.  */
# define add_ssaaaa(sh, sl, ah, al, bh, bl)	\
  do {						\
    DWunion __lhs, __rhs, __sum;		\
    __lhs.s.low = (al);				\
    __lhs.s.high = (ah);			\
    __rhs.s.low = (bl);				\
    __rhs.s.high = (bh);			\
    __sum.ll = __lhs.ll + __rhs.ll;		\
    (sl) = __sum.s.low;				\
    (sh) = __sum.s.high;			\
  } while (0)
/* Two-word subtraction (SH:SL) = (AH:AL) - (BH:BL).  Both operands are
   packed into DWunion temporaries and subtracted as one double-word
   value; the difference is then split back into low and high words.  */
# define sub_ddmmss(sh, sl, ah, al, bh, bl)	\
  do {						\
    DWunion __lhs, __rhs, __diff;		\
    __lhs.s.low = (al);				\
    __lhs.s.high = (ah);			\
    __rhs.s.low = (bl);				\
    __rhs.s.high = (bh);			\
    __diff.ll = __lhs.ll - __rhs.ll;		\
    (sl) = __diff.s.low;			\
    (sh) = __diff.s.high;			\
  } while (0)
/* Unsigned ("divu") and signed ("div") division of the two-word value
   NH:NL by D.  NL is tied to the Q output register ("0") and NH to the
   R output register ("1", register class "z"); D and Q use class "D".  */
# define udiv_qrnnd(q, r, nh, nl, d)		\
  asm("divu %2,%0" : "=D"(q), "=z"(r) : "D"(d), "0"(nl), "1"(nh))
# define sdiv_qrnnd(q, r, nh, nl, d)		\
  asm("div %2,%0" : "=D"(q), "=z"(r) : "D"(d), "0"(nl), "1"(nh))
843 # define UMUL_TIME 3
844 # define UDIV_TIME 38
845 #endif
846 
847 #if defined (__mips__) && W_TYPE_SIZE == 32
/* 32x32->64 unsigned multiply written in plain C: both operands are
   narrowed to USItype, the product is formed in UDItype, and W1/W0
   receive its upper and lower 32 bits.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UDItype __prod = (UDItype) (USItype) (u);				\
    __prod *= (USItype) (v);						\
    (w1) = (USItype) (__prod >> 32);					\
    (w0) = (USItype) __prod;						\
  } while (0)
854 #define UMUL_TIME 10
855 #define UDIV_TIME 100
856 
857 #if (__mips == 32 || __mips == 64) && ! defined (__mips16)
/* Store the leading-zero count of X in COUNT via the compiler builtin.  */
#define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
859 #define COUNT_LEADING_ZEROS_0 32
860 #endif
861 #endif /* __mips__ */
862 
863 /* FIXME: We should test _IBMR2 here when we add assembly support for the
864    system vendor compilers.
865    FIXME: What's needed for gcc PowerPC VxWorks?  __vxworks__ is not good
866    enough, since that hits ARM and m68k too.  */
867 #if (defined (_ARCH_PPC)	/* AIX */				\
868      || defined (__powerpc__)	/* gcc */				\
869      || defined (__POWERPC__)	/* BEOS */				\
870      || defined (__ppc__)	/* Darwin */				\
871      || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */    \
872      || (defined (PPC) && defined (CPU_FAMILY)    /* VxWorks */               \
873 	 && CPU_FAMILY == PPC)                                                \
874      ) && W_TYPE_SIZE == 32
/* Two-word addition (SH:SL) = (AH:AL) + (BH:BL).  The low words are
   added with a carry-setting add; the high word then uses:
     - addze on AH alone when BH is a compile-time constant 0,
     - addme on AH alone when BH is a compile-time constant of all ones,
     - adde on AH and BH otherwise.
   BL may be an immediate satisfying the "I" constraint (the %I operand
   modifier presumably selects the immediate mnemonic -- confirm against
   the GCC rs6000 documentation).  SL is early-clobbered because it is
   written before AH/BH are read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bh) && (bh) == 0)				\
      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else								\
      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"		\
	     : "=r" (sh), "=&r" (sl)					\
	     : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
  } while (0)
/* Two-word subtraction (SH:SL) = (AH:AL) - (BH:BL).  The low words are
   subtracted with a carry-setting subf; the high word then uses a
   shorter single-operand form when AH or BH is a compile-time constant
   0 or all ones:
     - AH == 0:   subfze on BH,
     - AH == ~0:  subfme on BH,
     - BH == 0:   addme on AH,
     - BH == ~0:  addze on AH,
   and the general subfe on AH and BH otherwise.  AL may be an
   immediate satisfying the "I" constraint.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (ah) && (ah) == 0)				\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0)			\
      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else								\
      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"	\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
  } while (0)
/* Store the leading-zero count of the word X in COUNT using "cntlzw".  */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
909 #define COUNT_LEADING_ZEROS_0 32
910 #if defined (_ARCH_PPC) || defined (__powerpc__) || defined (__POWERPC__) \
911   || defined (__ppc__)                                                    \
912   || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */       \
913   || (defined (PPC) && defined (CPU_FAMILY)    /* VxWorks */                  \
914 	 && CPU_FAMILY == PPC)
/* Unsigned 32x32->64 multiply: PH receives the high word (via
   "mulhwu") and PL the low word (ordinary C multiply).
   M0 and M1 are captured once in __m0/__m1 and only those copies are
   used afterwards, so each argument is evaluated exactly once even when
   it has side effects.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
921 #define UMUL_TIME 15
/* Signed 32x32->64 multiply: PH receives the high word (via "mulhw")
   and PL the low word (ordinary C multiply).
   M0 and M1 are captured once in __m0/__m1 and only those copies are
   used afterwards, so each argument is evaluated exactly once even when
   it has side effects.  */
#define smul_ppmm(ph, pl, m0, m1) \
  do {									\
    SItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
928 #define SMUL_TIME 14
929 #define UDIV_TIME 120
930 #endif
931 #endif /* 32-bit POWER architecture variants.  */
932 
933 /* We should test _IBMR2 here when we add assembly support for the system
934    vendor compilers.  */
935 #if (defined (_ARCH_PPC64) || defined (__powerpc64__)) && W_TYPE_SIZE == 64
/* Two-word (64-bit words) addition (SH:SL) = (AH:AL) + (BH:BL).  The
   low words are added with a carry-setting add; the high word then
   uses:
     - addze on AH alone when BH is a compile-time constant 0,
     - addme on AH alone when BH is a compile-time constant of all ones,
     - adde on AH and BH otherwise.
   BL may be an immediate satisfying the "I" constraint.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bh) && (bh) == 0)				\
      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else								\
      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"		\
	     : "=r" (sh), "=&r" (sl)					\
	     : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
  } while (0)
/* Two-word (64-bit words) subtraction (SH:SL) = (AH:AL) - (BH:BL).
   The low words are subtracted with a carry-setting subf; the high
   word then uses a shorter single-operand form when AH or BH is a
   compile-time constant 0 or all ones:
     - AH == 0:   subfze on BH,
     - AH == ~0:  subfme on BH,
     - BH == 0:   addme on AH,
     - BH == ~0:  addze on AH,
   and the general subfe on AH and BH otherwise.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (ah) && (ah) == 0)				\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0)			\
      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else								\
      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"	\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
  } while (0)
/* Store the leading-zero count of the doubleword X in COUNT using
   "cntlzd".  */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
970 #define COUNT_LEADING_ZEROS_0 64
/* Unsigned 64x64->128 multiply: PH receives the high doubleword (via
   "mulhdu") and PL the low doubleword (ordinary C multiply).
   M0 and M1 are captured once in __m0/__m1 and only those copies are
   used afterwards, so each argument is evaluated exactly once even when
   it has side effects.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
977 #define UMUL_TIME 15
/* Signed 64x64->128 multiply: PH receives the high doubleword (via
   "mulhd") and PL the low doubleword (ordinary C multiply).
   M0 and M1 are captured once in __m0/__m1 and only those copies are
   used afterwards, so each argument is evaluated exactly once even when
   it has side effects.  */
#define smul_ppmm(ph, pl, m0, m1) \
  do {									\
    DItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
984 #define SMUL_TIME 14  /* ??? */
985 #define UDIV_TIME 120 /* ??? */
986 #endif /* 64-bit PowerPC.  */
987 
988 #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
/* Two-word addition (SH:SL) = (AH:AL) + (BH:BL) using "a" on the low
   words followed by "ae" on the high words.  AH and AL are tied to the
   SH and SL output registers.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("a %1,%5\n\tae %0,%3"					\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%0" ((USItype) (ah)),					\
	     "r" ((USItype) (bh)),					\
	     "%1" ((USItype) (al)),					\
	     "r" ((USItype) (bl)))
/* Two-word subtraction (SH:SL) = (AH:AL) - (BH:BL) using "s" on the
   low words followed by "se" on the high words.  AH and AL are tied to
   the SH and SL output registers.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("s %1,%5\n\tse %0,%3"					\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "r" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "r" ((USItype) (bl)))
1005 #define umul_ppmm(ph, pl, m0, m1) \
1006   do {									\
1007     USItype __m0 = (m0), __m1 = (m1);					\
1008     __asm__ (								\
1009        "s	r2,r2\n"						\
1010 "	mts	r10,%2\n"						\
1011 "	m	r2,%3\n"						\
1012 "	m	r2,%3\n"						\
1013 "	m	r2,%3\n"						\
1014 "	m	r2,%3\n"						\
1015 "	m	r2,%3\n"						\
1016 "	m	r2,%3\n"						\
1017 "	m	r2,%3\n"						\
1018 "	m	r2,%3\n"						\
1019 "	m	r2,%3\n"						\
1020 "	m	r2,%3\n"						\
1021 "	m	r2,%3\n"						\
1022 "	m	r2,%3\n"						\
1023 "	m	r2,%3\n"						\
1024 "	m	r2,%3\n"						\
1025 "	m	r2,%3\n"						\
1026 "	m	r2,%3\n"						\
1027 "	cas	%0,r2,r0\n"						\
1028 "	mfs	r10,%1"							\
1029 	     : "=r" ((USItype) (ph)),					\
1030 	       "=r" ((USItype) (pl))					\
1031 	     : "%r" (__m0),						\
1032 		"r" (__m1)						\
1033 	     : "r2");							\
1034     (ph) += ((((SItype) __m0 >> 31) & __m1)				\
1035 	     + (((SItype) __m1 >> 31) & __m0));				\
1036   } while (0)
1037 #define UMUL_TIME 20
1038 #define UDIV_TIME 200
/* Store the leading-zero count of the 32-bit value X in COUNT.  The
   "clz" instruction is applied to one 16-bit half at a time: to the
   upper half (X >> 16) when X >= 0x10000, otherwise to X itself with 16
   added to the result.
   NOTE: X is expanded more than once, so it should be free of side
   effects.  */
#define count_leading_zeros(count, x) \
  do {									\
    if ((x) >= 0x10000)							\
      __asm__ ("clz	%0,%1"						\
	       : "=r" ((USItype) (count))				\
	       : "r" ((USItype) (x) >> 16));				\
    else								\
      {									\
	__asm__ ("clz	%0,%1"						\
		 : "=r" ((USItype) (count))				\
		 : "r" ((USItype) (x)));					\
	(count) += 16;							\
      }									\
  } while (0)
1053 #endif
1054 
1055 #if defined(__riscv)
1056 #ifdef __riscv_mul
/* With the hardware multiplier available (__riscv_mul) both helpers
   are plain C multiplies: __umulsidi3 widens U to UDWtype first so the
   full double-word product is kept, while __muluw3 multiplies in
   UWtype.  */
#define __umulsidi3(u,v) ((UDWtype)(UWtype)(u) * (UWtype)(v))
#define __muluw3(a, b) ((UWtype)(a) * (UWtype)(b))
1059 #else
1060 #if __riscv_xlen == 32
1061   #define MULUW3 "call __mulsi3"
1062 #elif __riscv_xlen == 64
1063   #define MULUW3 "call __muldi3"
1064 #else
1065 #error unsupport xlen
1066 #endif /* __riscv_xlen */
/* We rely on the fact that MULUW3 doesn't clobber the t-registers;
   this lets the compiler achieve better register allocation.  */
/* Multiply A by B by calling the library routine named by MULUW3.  The
   operands are placed in a0 and a1 through explicit register
   variables, both are marked read-write, and ra, a2 and a3 are
   declared clobbered.  The value of the statement expression is the
   content of a0 after the call.  */
#define __muluw3(a, b) \
  ({ \
    register UWtype __op0 asm ("a0") = a; \
    register UWtype __op1 asm ("a1") = b; \
    asm volatile (MULUW3 \
                  : "+r" (__op0), "+r" (__op1) \
                  : \
                  : "ra", "a2", "a3"); \
    __op0; \
  })
1079 #endif /* __riscv_mul */
/* Full-word multiply (W1:W0) = U * V built from four half-word
   products computed with __muluw3.  The two cross products are summed
   on top of the upper half of the low product; a carry out of that sum
   is folded into the high product at weight __ll_B.  */
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UHWtype __u_lo = __ll_lowpart (u); \
    UHWtype __u_hi = __ll_highpart (u); \
    UHWtype __v_lo = __ll_lowpart (v); \
    UHWtype __v_hi = __ll_highpart (v); \
 \
    UWtype __p_ll = __muluw3 (__u_lo, __v_lo); \
    UWtype __p_lh = __muluw3 (__u_lo, __v_hi); \
    UWtype __p_hl = __muluw3 (__u_hi, __v_lo); \
    UWtype __p_hh = __muluw3 (__u_hi, __v_hi); \
 \
    /* Adding the upper half of the low product cannot overflow.  */ \
    __p_lh += __ll_highpart (__p_ll); \
    /* Adding the second cross product can; detect the wrap-around.  */ \
    __p_lh += __p_hl; \
    if (__p_lh < __p_hl) \
      __p_hh += __ll_B; \
 \
    (w1) = __p_hh + __ll_highpart (__p_lh); \
    (w0) = __ll_lowpart (__p_lh) * __ll_B + __ll_lowpart (__p_ll); \
  } while (0)
1103 #endif /* __riscv */
1104 
1105 #if defined(__sh__) && W_TYPE_SIZE == 32
1106 #ifndef __sh1__
/* Multiply U by V with "dmulu.l", then copy the macl register to W0
   and the mach register to W1 with "sts".  Both macl and mach are
   declared clobbered.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ (								\
       "dmulu.l	%2,%3\n\tsts%M1	macl,%1\n\tsts%M0	mach,%0"	\
	   : "=r<" ((USItype)(w1)),					\
	     "=r<" ((USItype)(w0))					\
	   : "r" ((USItype)(u)),					\
	     "r" ((USItype)(v))						\
	   : "macl", "mach")
1115 #define UMUL_TIME 5
1116 #endif
1117 
1118 /* This is the same algorithm as __udiv_qrnnd_c.  */
1119 #define UDIV_NEEDS_NORMALIZATION 1
1120 
1121 #ifdef __FDPIC__
1122 /* FDPIC needs a special version of the asm fragment to extract the
1123    code address from the function descriptor. __udiv_qrnnd_16 is
1124    assumed to be local and not to use the GOT, so loading r12 is
1125    not needed. */
/* FDPIC variant of udiv_qrnnd: divide N1:N0 by D, producing quotient Q
   and remainder R, by calling the hidden helper __udiv_qrnnd_16 twice.
   Before each call the code address is loaded into r2 from the
   location operand 5 points to ("mov.l @%5,r2") and the call goes
   through r2.  The two results left in r1 are combined into Q with
   swap.w/or.  N1 is tied to the R output (class "z"); r1, r2, r4, r5,
   r6, pr and t are declared clobbered.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    extern UWtype __udiv_qrnnd_16 (UWtype, UWtype)			\
			__attribute__ ((visibility ("hidden")));	\
    /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */	\
    __asm__ (								\
	"mov%M4	%4,r5\n"						\
"	swap.w	%3,r4\n"						\
"	swap.w	r5,r6\n"						\
"	mov.l	@%5,r2\n"						\
"	jsr	@r2\n"							\
"	shll16	r6\n"							\
"	swap.w	r4,r4\n"						\
"	mov.l	@%5,r2\n"						\
"	jsr	@r2\n"							\
"	swap.w	r1,%0\n"						\
"	or	r1,%0"							\
	: "=r" (q), "=&z" (r)						\
	: "1" (n1), "r" (n0), "rm" (d), "r" (&__udiv_qrnnd_16)		\
	: "r1", "r2", "r4", "r5", "r6", "pr", "t");			\
  } while (0)
1147 #else
/* Divide N1:N0 by D, producing quotient Q and remainder R, by calling
   the hidden helper __udiv_qrnnd_16 twice through its address
   ("jsr @%5").  The two results left in r1 are combined into Q with
   swap.w/or.  N1 is tied to the R output (class "z"); r1, r2, r4, r5,
   r6, pr and t are declared clobbered.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    extern UWtype __udiv_qrnnd_16 (UWtype, UWtype)			\
			__attribute__ ((visibility ("hidden")));	\
    /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */	\
    __asm__ (								\
	"mov%M4 %4,r5\n"						\
"	swap.w %3,r4\n"							\
"	swap.w r5,r6\n"							\
"	jsr @%5\n"							\
"	shll16 r6\n"							\
"	swap.w r4,r4\n"							\
"	jsr @%5\n"							\
"	swap.w r1,%0\n"							\
"	or r1,%0"							\
	: "=r" (q), "=&z" (r)						\
	: "1" (n1), "r" (n0), "rm" (d), "r" (&__udiv_qrnnd_16)		\
	: "r1", "r2", "r4", "r5", "r6", "pr", "t");			\
  } while (0)
1167 #endif /* __FDPIC__  */
1168 
1169 #define UDIV_TIME 80
1170 
/* Two-word subtraction (SH:SL) = (AH:AL) - (BH:BL): clear the T bit
   with "clrt", then chain two "subc" instructions, low word first.  AH
   and AL are tied to the SH and SL output registers; T is declared
   clobbered.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
  __asm__ ("clrt;subc %5,%1; subc %4,%0"				\
	   : "=r" (sh), "=r" (sl)					\
	   : "0" (ah), "1" (al), "r" (bh), "r" (bl) : "t")
1175 
1176 #endif /* __sh__ */
1177 
1178 #if defined (__sparc__) && !defined (__arch64__) && !defined (__sparcv9) \
1179     && W_TYPE_SIZE == 32
/* Two-word addition (SH:SL) = (AH:AL) + (BH:BL): "addcc" on the low
   words followed by "addx" on the high words.  AH/AL are marked
   commutative with BH/BL ("%"), which is valid for addition.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%rJ" ((USItype) (ah)),					\
	     "rI" ((USItype) (bh)),					\
	     "%rJ" ((USItype) (al)),					\
	     "rI" ((USItype) (bl))					\
	   __CLOBBER_CC)
/* Two-word subtraction (SH:SL) = (AH:AL) - (BH:BL): "subcc" on the low
   words followed by "subx" on the high words.  The minuends carry no
   "%" modifier, so operand order is fixed.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "rJ" ((USItype) (ah)),					\
	     "rI" ((USItype) (bh)),					\
	     "rJ" ((USItype) (al)),					\
	     "rI" ((USItype) (bl))					\
	   __CLOBBER_CC)
1198 #if defined (__sparc_v9__)
/* Multiply U by V with "umul", placing the product in a temporary
   bound to register g1; W1 is obtained by shifting that register right
   by 32 with "srlx", and W0 is assigned from the temporary afterwards.  */
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    register USItype __g1 asm ("g1");					\
    __asm__ ("umul\t%2,%3,%1\n\t"					\
	     "srlx\t%1, 32, %0"						\
	     : "=r" ((USItype) (w1)),					\
	       "=r" (__g1)						\
	     : "r" ((USItype) (u)),					\
	       "r" ((USItype) (v)));					\
    (w0) = __g1;							\
  } while (0)
/* Divide __N1:__N0 by __D: __N1 is moved into the %y register, "udiv"
   produces the quotient __Q, and the remainder __R is formed as
   __N0 - __Q * __D with "umul" and "sub".  Both outputs are
   early-clobbered.  */
#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
  __asm__ ("mov\t%2,%%y\n\t"						\
	   "udiv\t%3,%4,%0\n\t"						\
	   "umul\t%0,%4,%1\n\t"						\
	   "sub\t%3,%1,%1"						\
	   : "=&r" ((USItype) (__q)),					\
	     "=&r" ((USItype) (__r))					\
	   : "r" ((USItype) (__n1)),					\
	     "r" ((USItype) (__n0)),					\
	     "r" ((USItype) (__d)))
1220 #else
1221 #if defined (__sparc_v8__)
/* Multiply U by V with "umul": W0 is the instruction's destination and
   W1 is read from the %y register afterwards.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0"					\
	   : "=r" ((USItype) (w1)),					\
	     "=r" ((USItype) (w0))					\
	   : "r" ((USItype) (u)),					\
	     "r" ((USItype) (v)))
/* Divide __N1:__N0 by __D: __N1 is moved into %y, three nops follow
   (presumably to cover the write delay to %y -- confirm against the V8
   manual), "udiv" produces __Q, and __R is formed as __N0 - __Q * __D
   with "umul" and "sub".  */
#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
  __asm__ ("mov %2,%%y;nop;nop;nop;udiv %3,%4,%0;umul %0,%4,%1;sub %3,%1,%1"\
	   : "=&r" ((USItype) (__q)),					\
	     "=&r" ((USItype) (__r))					\
	   : "r" ((USItype) (__n1)),					\
	     "r" ((USItype) (__n0)),					\
	     "r" ((USItype) (__d)))
1235 #else
1236 #if defined (__sparclite__)
1237 /* This has hardware multiply but not divide.  It also has two additional
1238    instructions scan (ffs from high bit) and divscc.  */
/* Multiply U by V with "umul": W0 is the instruction's destination and
   W1 is read from the %y register afterwards.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0"					\
	   : "=r" ((USItype) (w1)),					\
	     "=r" ((USItype) (w0))					\
	   : "r" ((USItype) (u)),					\
	     "r" ((USItype) (v)))
/* Divide N1:N0 by D with a fully unrolled sequence of 32 "divscc"
   divide steps.  N1 is written to %y, the condition codes are
   initialised with "tst %g0", and the steps accumulate in g1 with the
   last one writing Q.  R is then read from %y and, if the "bl" branch
   is taken, D is added to it in the annulled delay slot.  g1 and the
   condition codes are declared clobbered.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n"					\
"	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
"	tst	%%g0\n"							\
"	divscc	%3,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%0\n"						\
"	rd	%%y,%1\n"						\
"	bl,a 1f\n"							\
"	add	%1,%4,%1\n"						\
"1:	! End of inline udiv_qrnnd"					\
	   : "=r" ((USItype) (q)),					\
	     "=r" ((USItype) (r))					\
	   : "r" ((USItype) (n1)),					\
	     "r" ((USItype) (n0)),					\
	     "rI" ((USItype) (d))					\
	   : "g1" __AND_CLOBBER_CC)
1291 #define UDIV_TIME 37
/* Store the leading-zero count of X in COUNT using the sparclite
   "scan" instruction with a second operand of 1.  */
#define count_leading_zeros(count, x) \
  do {                                                                  \
  __asm__ ("scan %1,1,%0"                                               \
	   : "=r" ((USItype) (count))                                   \
	   : "r" ((USItype) (x)));					\
  } while (0)
1298 /* Early sparclites return 63 for an argument of 0, but they warn that future
1299    implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
1300    undefined.  */
1301 #else
1302 /* SPARC without integer multiplication and divide instructions.
1303    (i.e. at least Sun4/20,40,60,65,75,110,260,280,330,360,380,470,490) */
/* Multiply U by V with a fully unrolled sequence of "mulscc" multiply
   steps: 32 steps against V followed by one final step with operand 0.
   U is written to %y first; o5 is set to U masked by the sign of V
   (sra/and) and added to g1 to form W1, and W0 is read from %y.  g1,
   o5 and the condition codes are declared clobbered.  The leading
   instructions are order-sensitive, as their inline comments state.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n"					\
"	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n"\
"	sra	%3,31,%%o5	! Don't move this insn\n"		\
"	and	%2,%%o5,%%o5	! Don't move this insn\n"		\
"	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,0,%%g1\n"						\
"	add	%%g1,%%o5,%0\n"						\
"	rd	%%y,%1"							\
	   : "=r" ((USItype) (w1)),					\
	     "=r" ((USItype) (w0))					\
	   : "%rI" ((USItype) (u)),					\
	     "r" ((USItype) (v))						\
	   : "g1", "o5" __AND_CLOBBER_CC)
1350 #define UMUL_TIME 39		/* 39 instructions */
1351 /* It's quite necessary to add this much assembler for the sparc.
1352    The default udiv_qrnnd (in C) is more than 10 times slower!  */
/* Divide __N1:__N0 by __D with a software shift-and-subtract loop.
   __N0 is tied to the __Q register and __N1 to the __R register; g1 is
   loaded with 32 and decremented once per iteration.  Each iteration
   shifts a quotient bit into the low end of operand 0; the final
   "xnor %0,0,%0" complements the accumulated bits.  g1 and the
   condition codes are declared clobbered.  */
#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
  __asm__ ("! Inlined udiv_qrnnd\n"					\
"	mov	32,%%g1\n"						\
"	subcc	%1,%2,%%g0\n"						\
"1:	bcs	5f\n"							\
"	 addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n"	\
"	sub	%1,%2,%1	! this kills msb of n\n"		\
"	addx	%1,%1,%1	! so this can't give carry\n"		\
"	subcc	%%g1,1,%%g1\n"						\
"2:	bne	1b\n"							\
"	 subcc	%1,%2,%%g0\n"						\
"	bcs	3f\n"							\
"	 addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n"	\
"	b	3f\n"							\
"	 sub	%1,%2,%1	! this kills msb of n\n"		\
"4:	sub	%1,%2,%1\n"						\
"5:	addxcc	%1,%1,%1\n"						\
"	bcc	2b\n"							\
"	 subcc	%%g1,1,%%g1\n"						\
"! Got carry from n.  Subtract next step to cancel this carry.\n"	\
"	bne	4b\n"							\
"	 addcc	%0,%0,%0	! shift n1n0 and a 0-bit in lsb\n"	\
"	sub	%1,%2,%1\n"						\
"3:	xnor	%0,0,%0\n"						\
"	! End of inline udiv_qrnnd"					\
	   : "=&r" ((USItype) (__q)),					\
	     "=&r" ((USItype) (__r))					\
	   : "r" ((USItype) (__d)),					\
	     "1" ((USItype) (__n1)),					\
	     "0" ((USItype) (__n0)) : "g1" __AND_CLOBBER_CC)
1383 #define UDIV_TIME (3+7*32)	/* 7 instructions/iteration. 32 iterations.  */
1384 #endif /* __sparclite__ */
1385 #endif /* __sparc_v8__ */
1386 #endif /* __sparc_v9__ */
1387 #endif /* sparc32 */
1388 
1389 #if ((defined (__sparc__) && defined (__arch64__)) || defined (__sparcv9)) \
1390     && W_TYPE_SIZE == 64
/* Two-word (64-bit words) addition (SH:SL) = (AH:AL) + (BH:BL).  The
   low words are added with "addcc" and the high words with a plain
   "add"; the carry of the low addition is then materialised by setting
   __carry (initially 0) to 1 with "movcs" on %xcc, and added to the
   high word.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
  do {									\
    UDItype __carry = 0;						\
    __asm__ ("addcc\t%r5,%6,%1\n\t"					\
	     "add\t%r3,%4,%0\n\t"					\
	     "movcs\t%%xcc, 1, %2\n\t"					\
	     "add\t%0, %2, %0"						\
	     : "=r" ((UDItype)(sh)),				      	\
	       "=&r" ((UDItype)(sl)),				      	\
	       "+r" (__carry)				      		\
	     : "%rJ" ((UDItype)(ah)),				     	\
	       "rI" ((UDItype)(bh)),				      	\
	       "%rJ" ((UDItype)(al)),				     	\
	       "rI" ((UDItype)(bl))				       	\
	     __CLOBBER_CC);						\
  } while (0)
1407 
/* Two-word (64-bit words) subtraction (SH:SL) = (AH:AL) - (BH:BL).
   The low words are subtracted with "subcc" and the high words with a
   plain "sub"; the borrow of the low subtraction is then materialised
   by setting __carry (initially 0) to 1 with "movcs" on %xcc, and
   subtracted from the high word.
   The minuends AH and AL must NOT carry the "%" constraint modifier:
   "%" declares an operand commutative with the following one, which
   would allow the compiler to exchange minuend and subtrahend, and
   subtraction is not commutative.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
  do {									\
    UDItype __carry = 0;						\
    __asm__ ("subcc\t%r5,%6,%1\n\t"					\
	     "sub\t%r3,%4,%0\n\t"					\
	     "movcs\t%%xcc, 1, %2\n\t"					\
	     "sub\t%0, %2, %0"						\
	     : "=r" ((UDItype)(sh)),				      	\
	       "=&r" ((UDItype)(sl)),				      	\
	       "+r" (__carry)				      		\
	     : "rJ" ((UDItype)(ah)),				     	\
	       "rI" ((UDItype)(bh)),				      	\
	       "rJ" ((UDItype)(al)),				     	\
	       "rI" ((UDItype)(bl))				       	\
	     __CLOBBER_CC);						\
  } while (0)
1424 
/* Multiply the 64-bit values U and V, leaving the high word of the
   product in WH and the low word in WL.  The asm splits the operands
   into 32-bit halves ("srl ...,0" / "srlx ...,32"), forms partial
   products with "mulx", and recombines them with shifts and
   carry-tracking adds.  Four scratch temporaries are used as
   early-clobbered outputs and the condition codes are clobbered.  */
#define umul_ppmm(wh, wl, u, v)						\
  do {									\
	  UDItype tmp1, tmp2, tmp3, tmp4;				\
	  __asm__ __volatile__ (					\
		   "srl %7,0,%3\n\t"					\
		   "mulx %3,%6,%1\n\t"					\
		   "srlx %6,32,%2\n\t"					\
		   "mulx %2,%3,%4\n\t"					\
		   "sllx %4,32,%5\n\t"					\
		   "srl %6,0,%3\n\t"					\
		   "sub %1,%5,%5\n\t"					\
		   "srlx %5,32,%5\n\t"					\
		   "addcc %4,%5,%4\n\t"					\
		   "srlx %7,32,%5\n\t"					\
		   "mulx %3,%5,%3\n\t"					\
		   "mulx %2,%5,%5\n\t"					\
		   "sethi %%hi(0x80000000),%2\n\t"			\
		   "addcc %4,%3,%4\n\t"					\
		   "srlx %4,32,%4\n\t"					\
		   "add %2,%2,%2\n\t"					\
		   "movcc %%xcc,%%g0,%2\n\t"				\
		   "addcc %5,%4,%5\n\t"					\
		   "sllx %3,32,%3\n\t"					\
		   "add %1,%3,%1\n\t"					\
		   "add %5,%2,%0"					\
	   : "=r" ((UDItype)(wh)),					\
	     "=&r" ((UDItype)(wl)),					\
	     "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4)	\
	   : "r" ((UDItype)(u)),					\
	     "r" ((UDItype)(v))						\
	   __CLOBBER_CC);						\
  } while (0)
1457 #define UMUL_TIME 96
1458 #define UDIV_TIME 230
1459 #endif /* sparc64 */
1460 
1461 #if defined (__vax__) && W_TYPE_SIZE == 32
/* Two-word addition (SH:SL) = (AH:AL) + (BH:BL): "addl2" on the low
   words followed by "adwc" on the high words.  AH and AL are tied to
   the SH and SL outputs.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
	   : "=g" ((USItype) (sh)),					\
	     "=&g" ((USItype) (sl))					\
	   : "%0" ((USItype) (ah)),					\
	     "g" ((USItype) (bh)),					\
	     "%1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
/* Two-word subtraction (SH:SL) = (AH:AL) - (BH:BL): "subl2" on the low
   words followed by "sbwc" on the high words.  AH and AL are tied to
   the SH and SL outputs.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
	   : "=g" ((USItype) (sh)),					\
	     "=&g" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "g" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
/* Multiply M0 by M1 with "emul" (addend operand $0), reading the
   64-bit result through a union whose struct lists __l before __h.
   XH is then corrected by adding M1 when M0 has its top bit set and M0
   when M1 has its top bit set (presumably converting a signed product
   into the unsigned one -- confirm against the VAX manual).  */
#define umul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {								\
	UDItype __ll;							\
	struct {USItype __l, __h;} __i;					\
      } __xx;								\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("emul %1,%2,$0,%0"						\
	     : "=r" (__xx.__ll)						\
	     : "g" (__m0),						\
	       "g" (__m1));						\
    (xh) = __xx.__i.__h;						\
    (xl) = __xx.__i.__l;						\
    (xh) += ((((SItype) __m0 >> 31) & __m1)				\
	     + (((SItype) __m1 >> 31) & __m0));				\
  } while (0)
/* Signed division of the two-word value N1:N0 (packed into a DItype
   through a union) by D using "ediv"; Q and R are the two outputs of
   the instruction.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    union {DItype __ll;							\
	   struct {SItype __l, __h;} __i;				\
	  } __xx;							\
    __xx.__i.__h = n1; __xx.__i.__l = n0;				\
    __asm__ ("ediv %3,%2,%0,%1"						\
	     : "=g" (q), "=g" (r)					\
	     : "g" (__xx.__ll), "g" (d));				\
  } while (0)
1504 #endif /* __vax__ */
1505 
1506 #ifdef _TMS320C6X
/* Two-word addition (SH:SL) = (AH:AL) + (BH:BL).  "addu" adds the low
   words into a 64-bit (UDItype) result; SL takes its low 32 bits and
   the bits above them are added to AH + BH to form SH.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do									\
    {									\
      UDItype __ll;							\
      __asm__ ("addu .l1 %1, %2, %0"					\
	       : "=a" (__ll) : "a" (al), "a" (bl));			\
      (sl) = (USItype)__ll;						\
      (sh) = ((USItype)(__ll >> 32)) + (ah) + (bh);			\
    }									\
  while (0)
1517 
1518 #ifdef _TMS320C6400_PLUS
/* 32x32->64 unsigned multiply as a C expression.  Both arguments are
   parenthesized so that an expression argument such as `a + b' is
   narrowed and multiplied as a whole.  */
#define __umulsidi3(u,v) ((UDItype)(USItype)(u)*(USItype)(v))
/* 32x32->64 unsigned multiply written in plain C: both operands are
   narrowed to USItype, the product is formed in UDItype, and W1/W0
   receive its upper and lower 32 bits.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UDItype __prod = (UDItype) (USItype) (u);				\
    __prod *= (USItype) (v);						\
    (w1) = (USItype) (__prod >> 32);					\
    (w0) = (USItype) __prod;						\
  } while (0)
1526 #endif  /* _TMS320C6400_PLUS */
1527 
/* Leading-zero count via the compiler builtin; the trailing-zero
   counterpart is only provided when _TMS320C6400 is defined.  */
#define count_leading_zeros(count, x)	((count) = __builtin_clz (x))
#ifdef _TMS320C6400
#define count_trailing_zeros(count, x)	((count) = __builtin_ctz (x))
#endif
1532 #define UMUL_TIME 4
1533 #define UDIV_TIME 40
1534 #endif /* _TMS320C6X */
1535 
1536 #if defined (__xtensa__) && W_TYPE_SIZE == 32
1537 /* This code is not Xtensa-configuration-specific, so rely on the compiler
1538    to expand builtin functions depending on what configuration features
1539    are available.  This avoids library calls when the operation can be
1540    performed in-line.  */
/* Multiply U by V through __builtin_umulsidi3 and split the
   double-word result into its high word W1 and low word W0.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    DWunion __prod;							\
    __prod.ll = __builtin_umulsidi3 (u, v);				\
    (w1) = __prod.s.high;						\
    (w0) = __prod.s.low;						\
  } while (0)
/* These three operations are delegated directly to compiler builtins,
   in line with the section comment above: the compiler decides how to
   expand them for the configured Xtensa features.  */
#define __umulsidi3(u, v)		__builtin_umulsidi3 (u, v)
#define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clz (X))
#define count_trailing_zeros(COUNT, X)	((COUNT) = __builtin_ctz (X))
1551 #endif /* __xtensa__ */
1552 
1553 #if defined xstormy16
1554 extern UHItype __stormy16_count_leading_zeros (UHItype);
/* Store the leading-zero count of X in COUNT by scanning X in 16-bit
   chunks from the most significant end.  Each chunk's count comes from
   __clzhi2; scanning stops at the first chunk that is not all zeros
   (count != 16).  An all-zero X yields W_TYPE_SIZE.
   The macro's own temporaries use reserved `__' names so that caller
   arguments named `size' or `c' are not captured.
   NOTE(review): the extern declaration preceding this macro names
   __stormy16_count_leading_zeros while the macro calls __clzhi2 --
   confirm which helper is intended.
   X is expanded once per iteration, so it should be free of side
   effects.  */
#define count_leading_zeros(count, x)					\
  do									\
    {									\
      UHItype __size;							\
									\
      /* We assume that W_TYPE_SIZE is a multiple of 16...  */		\
      for ((count) = 0, __size = W_TYPE_SIZE; __size; __size -= 16)	\
	{								\
	  UHItype __c;							\
									\
	  __c = __clzhi2 ((x) >> (__size - 16));			\
	  (count) += __c;						\
	  if (__c != 16)						\
	    break;							\
	}								\
    }									\
  while (0)
1572 #define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
1573 #endif
1574 
1575 #if defined (__z8000__) && W_TYPE_SIZE == 16
/* Two-word (16-bit words) addition (SH:SL) = (AH:AL) + (BH:BL): "add"
   on the low words followed by "adc" on the high words.  AH and AL are
   tied to the SH and SL output registers.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
	   : "=r" ((unsigned int)(sh)),					\
	     "=&r" ((unsigned int)(sl))					\
	   : "%0" ((unsigned int)(ah)),					\
	     "r" ((unsigned int)(bh)),					\
	     "%1" ((unsigned int)(al)),					\
	     "rQR" ((unsigned int)(bl)))
/* Two-word (16-bit words) subtraction (SH:SL) = (AH:AL) - (BH:BL):
   "sub" on the low words followed by "sbc" on the high words.  AH and
   AL are tied to the SH and SL output registers.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
	   : "=r" ((unsigned int)(sh)),					\
	     "=&r" ((unsigned int)(sl))					\
	   : "0" ((unsigned int)(ah)),					\
	     "r" ((unsigned int)(bh)),					\
	     "1" ((unsigned int)(al)),					\
	     "rQR" ((unsigned int)(bl)))
/* Multiply M0 by M1 with "mult", collecting the two result words in a
   union's __h and __l fields (M0 is tied to the __l output).  XH is
   then corrected by adding M1 when M0 has its top bit (bit 15) set and
   M0 when M1 has its top bit set (presumably converting a signed
   product into the unsigned one -- confirm against the Z8000 manual).  */
#define umul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {long int __ll;						\
	   struct {unsigned int __h, __l;} __i;				\
	  } __xx;							\
    unsigned int __m0 = (m0), __m1 = (m1);				\
    __asm__ ("mult	%S0,%H3"					\
	     : "=r" (__xx.__i.__h),					\
	       "=r" (__xx.__i.__l)					\
	     : "%1" (__m0),						\
	       "rQR" (__m1));						\
    (xh) = __xx.__i.__h; (xl) = __xx.__i.__l;				\
    (xh) += ((((signed int) __m0 >> 15) & __m1)				\
	     + (((signed int) __m1 >> 15) & __m0));			\
  } while (0)
1607 #endif /* __z8000__ */
1608 
1609 #endif /* __GNUC__ */
1610 
1611 /* If this machine has no inline assembler, use C macros.  */
1612 
1613 #if !defined (add_ssaaaa)
/* Portable two-word addition (SH:SL) = (AH:AL) + (BH:BL).  The low
   words are added first; the sum having wrapped below AL signals a
   carry, which is added into the high word.  SH is stored before SL so
   that AL is still intact when it is compared.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    UWtype __lo_sum = (al) + (bl);					\
    (sh) = (ah) + (bh) + (__lo_sum < (al));				\
    (sl) = __lo_sum;							\
  } while (0)
1621 #endif
1622 
1623 #if !defined (sub_ddmmss)
/* Portable two-word subtraction (SH:SL) = (AH:AL) - (BH:BL).  The low
   words are subtracted first; the difference exceeding AL signals a
   borrow, which is taken from the high word.  SH is stored before SL so
   that AL is still intact when it is compared.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    UWtype __lo_diff = (al) - (bl);					\
    (sh) = (ah) - (bh) - (__lo_diff > (al));				\
    (sl) = __lo_diff;							\
  } while (0)
1631 #endif
1632 
1633 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
1634    smul_ppmm.  */
1635 #if !defined (umul_ppmm) && defined (smul_ppmm)
/* Unsigned multiply built on the signed smul_ppmm.  The signed high
   word is corrected by adding V when U has its top bit set and U when
   V has its top bit set; each correction is selected with an all-ones
   or all-zeros mask derived from the operand's top bit.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __signed_hi;							\
    UWtype __op_a = (u), __op_b = (v);					\
    UWtype __mask_a, __mask_b;						\
    smul_ppmm (__signed_hi, w0, __op_a, __op_b);			\
    __mask_a = -(__op_a >> (W_TYPE_SIZE - 1));				\
    __mask_b = -(__op_b >> (W_TYPE_SIZE - 1));				\
    (w1) = __signed_hi + (__mask_a & __op_b) + (__mask_b & __op_a);	\
  } while (0)
1644 #endif
1645 
1646 /* If we still don't have umul_ppmm, define it using plain C.  */
#if !defined (umul_ppmm)
/* umul_ppmm (w1, w0, u, v) in plain C: (w1,w0) = u * v.

   U and V are split into half-words with __ll_lowpart/__ll_highpart and
   the four half-word products are formed in full UWtype width.  The two
   middle products are summed together with the high half of the low
   product; a wrap-around of that sum is worth __ll_B in the high word.
   U and V are each expanded twice (once per half), in the order
   u, u, v, v.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UHWtype __u_lo = __ll_lowpart (u);					\
    UHWtype __u_hi = __ll_highpart (u);					\
    UHWtype __v_lo = __ll_lowpart (v);					\
    UHWtype __v_hi = __ll_highpart (v);					\
    UWtype __p_ll = (UWtype) __u_lo * __v_lo;				\
    UWtype __p_lh = (UWtype) __u_lo * __v_hi;				\
    UWtype __p_hl = (UWtype) __u_hi * __v_lo;				\
    UWtype __p_hh = (UWtype) __u_hi * __v_hi;				\
									\
    /* Accumulate the middle column in __p_lh.  */			\
    __p_lh += __ll_highpart (__p_ll);	/* cannot carry */		\
    __p_lh += __p_hl;			/* may wrap around */		\
    if (__p_lh < __p_hl)		/* it wrapped */		\
      __p_hh += __ll_B;			/* carry into the high word */	\
									\
    (w1) = __p_hh + __ll_highpart (__p_lh);				\
    (w0) = __ll_lowpart (__p_ll) + __ll_lowpart (__p_lh) * __ll_B;	\
  } while (0)
#endif
1672 
1673 #if !defined (__umulsidi3)
/* __umulsidi3 (u, v) -- product of U and V as a single expression.
   umul_ppmm stores the high and low words of the product into the
   s.high and s.low members of a local DWunion, and the macro's value is
   that union's ll member.  U and V are forwarded to umul_ppmm as written
   (unparenthesized).
   NOTE(review): the ({ ... }) form is a GNU C statement expression, yet
   this definition sits after the "#endif" that closes the __GNUC__
   section -- confirm every compiler that reaches this point accepts
   it.  */
1674 #define __umulsidi3(u, v) \
1675   ({DWunion __w;							\
1676     umul_ppmm (__w.s.high, __w.s.low, u, v);				\
1677     __w.ll; })
1678 #endif
1679 
/* __udiv_qrnnd_c (q, r, n1, n0, d) -- plain C division of the two-word
   value (n1,n0) by D, storing the quotient in Q and the remainder in R.
   It is defined unconditionally so that it can be used for debugging.

   The quotient is produced one half-word digit at a time.  Each digit is
   first estimated by dividing the running remainder by the high half of
   D; the estimate is then lowered, at most twice, while its product with
   the low half of D exceeds the partial remainder.

   D is expanded several times, N1 and N0 twice each.
   NOTE(review): the code divides by the high half of D, so that half
   must be nonzero; callers are presumably required to pass a normalized
   D with N1 < D (see UDIV_NEEDS_NORMALIZATION) -- confirm against the
   documentation at the top of this header.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {									\
    UWtype __dv_hi = __ll_highpart (d);					\
    UWtype __dv_lo = __ll_lowpart (d);					\
    UWtype __rem_hi = (n1) % __dv_hi;					\
    UWtype __quo_hi = (n1) / __dv_hi;					\
    UWtype __qd = (UWtype) __quo_hi * __dv_lo;				\
    UWtype __rem_lo, __quo_lo;						\
									\
    /* High quotient digit.  */						\
    __rem_hi = __rem_hi * __ll_B | __ll_highpart (n0);			\
    if (__rem_hi < __qd)						\
      {									\
	__quo_hi--;							\
	__rem_hi += (d);						\
	/* Adjust again only if that addition did not wrap.  */		\
	if (__rem_hi >= (d) && __rem_hi < __qd)				\
	  {								\
	    __quo_hi--;							\
	    __rem_hi += (d);						\
	  }								\
      }									\
    __rem_hi -= __qd;							\
									\
    /* Low quotient digit: same scheme on the new partial remainder.  */\
    __rem_lo = __rem_hi % __dv_hi;					\
    __quo_lo = __rem_hi / __dv_hi;					\
    __qd = (UWtype) __quo_lo * __dv_lo;					\
    __rem_lo = __rem_lo * __ll_B | __ll_lowpart (n0);			\
    if (__rem_lo < __qd)						\
      {									\
	__quo_lo--;							\
	__rem_lo += (d);						\
	if (__rem_lo >= (d) && __rem_lo < __qd)				\
	  {								\
	    __quo_lo--;							\
	    __rem_lo += (d);						\
	  }								\
      }									\
    __rem_lo -= __qd;							\
									\
    (q) = (UWtype) __quo_hi * __ll_B | __quo_lo;			\
    (r) = __rem_lo;							\
  } while (0)
1717 
1718 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
1719    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
/* udiv_qrnnd (q, r, nh, nl, d) for machines that provide sdiv_qrnnd but
   no udiv_qrnnd of their own.  The work is handed to the out-of-line
   helper __udiv_w_sdiv, whose return value becomes the quotient Q and
   which stores the remainder through its first argument; that remainder
   is then copied to R.  */
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {									\
    extern UWtype __udiv_w_sdiv (UWtype *, UWtype, UWtype, UWtype);	\
    UWtype __udiv_rem;							\
    (q) = __udiv_w_sdiv (&__udiv_rem, nh, nl, d);			\
    (r) = __udiv_rem;							\
  } while (0)
#endif
1729 
1730 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
1731 #if !defined (udiv_qrnnd)
/* Nothing above supplied udiv_qrnnd: alias it to the portable
   __udiv_qrnnd_c and set UDIV_NEEDS_NORMALIZATION to 1 for users of
   udiv_qrnnd.
   NOTE(review): "normalization" presumably means the divisor must have
   its most significant bit set before the call -- confirm against the
   documentation at the top of this header.  */
1732 #define UDIV_NEEDS_NORMALIZATION 1
1733 #define udiv_qrnnd __udiv_qrnnd_c
1734 #endif
1735 
#if !defined (count_leading_zeros)
/* count_leading_zeros (count, x) -- table-driven C fallback.

   A shift amount is chosen so that X shifted right by it fits the
   __clz_tab index range.  For words of at most 32 bits the shift is
   picked from the four quarter-word boundaries (multiples of __BITS4);
   for wider words it is found by stepping down one byte at a time from
   the top until a nonzero byte is met or the shift reaches 0.  The
   result is W_TYPE_SIZE minus the sum of the table entry and the shift.
   X is evaluated once.
   NOTE(review): __clz_tab is declared elsewhere; this presumably relies
   on __clz_tab[i] being the bit length of i, which also makes
   COUNT_LEADING_ZEROS_0 the count produced for x == 0 -- confirm.  */
#define count_leading_zeros(count, x) \
  do {									\
    UWtype __clz_x = (x);						\
    UWtype __clz_sh;							\
									\
    if (W_TYPE_SIZE > 32)						\
      {									\
	/* Walk down a byte at a time to the topmost nonzero byte.  */	\
	__clz_sh = W_TYPE_SIZE - 8;					\
	while (__clz_sh > 0 && ((__clz_x >> __clz_sh) & 0xff) == 0)	\
	  __clz_sh -= 8;						\
      }									\
    else if (__clz_x < ((UWtype) 1 << 2 * __BITS4))			\
      __clz_sh = __clz_x < ((UWtype) 1 << __BITS4) ? 0 : __BITS4;	\
    else								\
      __clz_sh = (__clz_x < ((UWtype) 1 << 3 * __BITS4)		\
		  ? 2 * __BITS4 : 3 * __BITS4);				\
									\
    (count) = W_TYPE_SIZE - (__clz_tab[__clz_x >> __clz_sh] + __clz_sh);\
  } while (0)
#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
#endif
1759 
#if !defined (count_trailing_zeros)
/* count_trailing_zeros (count, x), expressed through count_leading_zeros
   (which may be an asm version; otherwise the C version above serves).
   x & -x keeps only the lowest set bit of X; if that bit has L leading
   zeros, it sits at bit position W_TYPE_SIZE - 1 - L, which is the
   number of trailing zeros.  X is evaluated once.  */
#define count_trailing_zeros(count, x) \
  do {									\
    UWtype __ctz_val = (x);						\
    UWtype __ctz_lsb = __ctz_val & -__ctz_val;				\
    UWtype __ctz_lz;							\
    count_leading_zeros (__ctz_lz, __ctz_lsb);				\
    (count) = (W_TYPE_SIZE - 1) - __ctz_lz;				\
  } while (0)
#endif
1771 
/* Default for every udiv_qrnnd selected earlier that did not define
   UDIV_NEEDS_NORMALIZATION itself: give it the value 0, so the symbol is
   always defined from here on.  */
1772 #ifndef UDIV_NEEDS_NORMALIZATION
1773 #define UDIV_NEEDS_NORMALIZATION 0
1774 #endif
1775