/* multi_arith.h: multi-precision integer arithmetic functions, needed
   to do extended-precision floating point.

   (c) 1998 David Huggins-Daines.

   Somewhat based on arch/alpha/math-emu/ieee-math.c, which is (c)
   David Mosberger-Tang.

   You may copy, modify, and redistribute this file under the terms of
   the GNU General Public License, version 2, or any later version, at
   your convenience. */

/* Note:

   These are not general multi-precision math routines.  Rather, they
   implement the subset of integer arithmetic that we need in order to
   multiply, divide, and normalize 128-bit unsigned mantissae.  */

#ifndef MULTI_ARITH_H
#define MULTI_ARITH_H

#if 0	/* old code... */

/* Unsigned only, because we don't need signs to multiply and divide. */
typedef unsigned int int128[4];

/* Word order */
enum {
	MSW128,
	NMSW128,
	NLSW128,
	LSW128
};

/* big-endian */
#define LO_WORD(ll) (((unsigned int *) &ll)[1])
#define HI_WORD(ll) (((unsigned int *) &ll)[0])

/* Convenience functions to stuff various integer values into int128s */

static inline void zero128(int128 a)
{
	a[LSW128] = a[NLSW128] = a[NMSW128] = a[MSW128] = 0;
}

/* Human-readable word order in the arguments */
static inline void set128(unsigned int i3, unsigned int i2, unsigned int i1,
			  unsigned int i0, int128 a)
{
	a[LSW128] = i0;
	a[NLSW128] = i1;
	a[NMSW128] = i2;
	a[MSW128] = i3;
}

/* Convenience functions (for testing as well) */
static inline void int64_to_128(unsigned long long src, int128 dest)
{
	dest[LSW128] = (unsigned int) src;
	dest[NLSW128] = src >> 32;
	dest[NMSW128] = dest[MSW128] = 0;
}

static inline void int128_to_64(const int128 src, unsigned long long *dest)
{
	*dest = src[LSW128] | (long long) src[NLSW128] << 32;
}

static inline void put_i128(const int128 a)
{
	printk("%08x %08x %08x %08x\n", a[MSW128], a[NMSW128],
	       a[NLSW128], a[LSW128]);
}

/* Internal shifters:

   Note that these are only good for 0 < count < 32.
 */

static inline void _lsl128(unsigned int count, int128 a)
{
	a[MSW128] = (a[MSW128] << count) | (a[NMSW128] >> (32 - count));
	a[NMSW128] = (a[NMSW128] << count) | (a[NLSW128] >> (32 - count));
	a[NLSW128] = (a[NLSW128] << count) | (a[LSW128] >> (32 - count));
	a[LSW128] <<= count;
}

static inline void _lsr128(unsigned int count, int128 a)
{
	a[LSW128] = (a[LSW128] >> count) | (a[NLSW128] << (32 - count));
	a[NLSW128] = (a[NLSW128] >> count) | (a[NMSW128] << (32 - count));
	a[NMSW128] = (a[NMSW128] >> count) | (a[MSW128] << (32 - count));
	a[MSW128] >>= count;
}

/* Should be faster, one would hope */

static inline void lslone128(int128 a)
{
	asm volatile ("lsl.l #1,%0\n"
		      "roxl.l #1,%1\n"
		      "roxl.l #1,%2\n"
		      "roxl.l #1,%3\n"
		      :
		      "=d" (a[LSW128]),
		      "=d"(a[NLSW128]),
		      "=d"(a[NMSW128]),
		      "=d"(a[MSW128])
		      :
		      "0"(a[LSW128]),
		      "1"(a[NLSW128]),
		      "2"(a[NMSW128]),
		      "3"(a[MSW128]));
}

static inline void lsrone128(int128 a)
{
	asm volatile ("lsr.l #1,%0\n"
		      "roxr.l #1,%1\n"
		      "roxr.l #1,%2\n"
		      "roxr.l #1,%3\n"
		      :
		      "=d" (a[MSW128]),
		      "=d"(a[NMSW128]),
		      "=d"(a[NLSW128]),
		      "=d"(a[LSW128])
		      :
		      "0"(a[MSW128]),
		      "1"(a[NMSW128]),
		      "2"(a[NLSW128]),
		      "3"(a[LSW128]));
}

/* Generalized 128-bit shifters:

   These bit-shift to a multiple of 32, then move whole longwords.  */

static inline void lsl128(unsigned int count, int128 a)
{
	int wordcount, i;

	if (count % 32)
		_lsl128(count % 32, a);

	if (0 == (wordcount = count / 32))
		return;

	/* argh, gak, endian-sensitive */
	for (i = 0; i < 4 - wordcount; i++) {
		a[i] = a[i + wordcount];
	}
	for (i = 3; i >= 4 - wordcount; --i) {
		a[i] = 0;
	}
}

static inline void lsr128(unsigned int count, int128 a)
{
	int wordcount, i;

	if (count % 32)
		_lsr128(count % 32, a);

	if (0 == (wordcount = count / 32))
		return;

	for (i = 3; i >= wordcount; --i) {
		a[i] = a[i - wordcount];
	}
	for (i = 0; i < wordcount; i++) {
		a[i] = 0;
	}
}

static inline void orl128(int a, int128 b)
{
	b[LSW128] |= a;
}

static inline int btsthi128(const int128 a)
{
	return a[MSW128] & 0x80000000;
}

/* test bits (numbered from 0 = LSB) up to and including "top" */
static inline int bftestlo128(int top, const int128 a)
{
	int r = 0;

	if (top > 31)
		r |= a[LSW128];
	if (top > 63)
		r |= a[NLSW128];
	if (top > 95)
		r |= a[NMSW128];

	r |= a[3 - (top / 32)] & ((1 << (top % 32 + 1)) - 1);

	return (r != 0);
}

/* Aargh.  We need these because GCC is broken */
/* FIXME: do them in assembly, for goodness' sake! */
static inline void mask64(int pos, unsigned long long *mask)
{
	*mask = 0;

	if (pos < 32) {
		LO_WORD(*mask) = (1 << pos) - 1;
		return;
	}
	LO_WORD(*mask) = -1;
	HI_WORD(*mask) = (1 << (pos - 32)) - 1;
}

static inline void bset64(int pos, unsigned long long *dest)
{
	/* This conditional will be optimized away.  Thanks, GCC! */
	if (pos < 32)
		asm volatile ("bset %1,%0":"=m"
			      (LO_WORD(*dest)):"id"(pos));
	else
		asm volatile ("bset %1,%0":"=m"
			      (HI_WORD(*dest)):"id"(pos - 32));
}

static inline int btst64(int pos, unsigned long long dest)
{
	if (pos < 32)
		return (0 != (LO_WORD(dest) & (1 << pos)));
	else
		return (0 != (HI_WORD(dest) & (1 << (pos - 32))));
}

static inline void lsl64(int count, unsigned long long *dest)
{
	if (count < 32) {
		HI_WORD(*dest) = (HI_WORD(*dest) << count)
		    | (LO_WORD(*dest) >> (32 - count));
		LO_WORD(*dest) <<= count;
		return;
	}
	count -= 32;
	HI_WORD(*dest) = LO_WORD(*dest) << count;
	LO_WORD(*dest) = 0;
}

static inline void lsr64(int count, unsigned long long *dest)
{
	if (count < 32) {
		LO_WORD(*dest) = (LO_WORD(*dest) >> count)
		    | (HI_WORD(*dest) << (32 - count));
		HI_WORD(*dest) >>= count;
		return;
	}
	count -= 32;
	LO_WORD(*dest) = HI_WORD(*dest) >> count;
	HI_WORD(*dest) = 0;
}
#endif

static inline void fp_denormalize(struct fp_ext *reg, unsigned int cnt)
{
	reg->exp += cnt;

	switch (cnt) {
	case 0 ... 8:
		reg->lowmant = reg->mant.m32[1] << (8 - cnt);
		reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
				   (reg->mant.m32[0] << (32 - cnt));
		reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
		break;
	case 9 ... 32:
		reg->lowmant = reg->mant.m32[1] >> (cnt - 8);
		if (reg->mant.m32[1] << (40 - cnt))
			reg->lowmant |= 1;
		reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
				   (reg->mant.m32[0] << (32 - cnt));
		reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
		break;
	case 33 ... 39:
		asm volatile ("bfextu %1{%2,#8},%0" : "=d" (reg->lowmant)
			: "m" (reg->mant.m32[0]), "d" (64 - cnt));
		if (reg->mant.m32[1] << (40 - cnt))
			reg->lowmant |= 1;
		reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
		reg->mant.m32[0] = 0;
		break;
	case 40 ... 71:
		reg->lowmant = reg->mant.m32[0] >> (cnt - 40);
		if ((reg->mant.m32[0] << (72 - cnt)) || reg->mant.m32[1])
			reg->lowmant |= 1;
		reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
		reg->mant.m32[0] = 0;
		break;
	default:
		reg->lowmant = reg->mant.m32[0] || reg->mant.m32[1];
		reg->mant.m32[0] = 0;
		reg->mant.m32[1] = 0;
		break;
	}
}
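
/* For reference: fp_denormalize() above shifts the 64-bit mantissa right by
 * cnt, moving the eight bits just below the new LSB into the lowmant guard
 * byte and collapsing anything below those into lowmant's sticky bit.  The
 * helper below is an illustrative, portable sketch of that operation on a
 * plain 64-bit value; the name and parameters are made up and it is not
 * used by the emulator (which relies on the m68k bitfield insn above).
 */
#if 0
static inline void demo_denormalize(unsigned long long *mant,
				    unsigned char *low, unsigned int cnt)
{
	if (cnt == 0) {
		*low = 0;
		return;
	}
	if (cnt < 64) {
		/* the low cnt bits are shifted out; keep them left-aligned */
		unsigned long long lost = *mant << (64 - cnt);

		*low = lost >> 56;		/* top 8 lost bits -> guard byte */
		if (lost << 8)			/* anything below them? */
			*low |= 1;		/* sticky */
		*mant >>= cnt;
	} else if (cnt < 72) {
		*low = *mant >> (cnt - 8);	/* salvage what fits in the byte */
		if (*mant << (72 - cnt))
			*low |= 1;
		*mant = 0;
	} else {
		*low = (*mant != 0);		/* everything is sticky */
		*mant = 0;
	}
}
#endif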

static inline int fp_overnormalize(struct fp_ext *reg)
{
	int shift;

	if (reg->mant.m32[0]) {
		asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[0]));
		reg->mant.m32[0] = (reg->mant.m32[0] << shift) | (reg->mant.m32[1] >> (32 - shift));
		reg->mant.m32[1] = (reg->mant.m32[1] << shift);
	} else {
		asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[1]));
		reg->mant.m32[0] = (reg->mant.m32[1] << shift);
		reg->mant.m32[1] = 0;
		shift += 32;
	}

	return shift;
}
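
/* For reference: fp_overnormalize() above uses the m68k bfffo instruction to
 * find the leading one bit of the 64-bit mantissa and shifts it up into the
 * top bit, returning the shift count.  The loop below is an illustrative,
 * portable sketch of the same idea (made-up name, not used by the emulator);
 * like the real function it assumes the mantissa is nonzero.
 */
#if 0
static inline int demo_overnormalize(unsigned long long *mant)
{
	int shift = 0;

	while (!(*mant & 0x8000000000000000ULL)) {
		*mant <<= 1;
		shift++;
	}
	return shift;
}
#endif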

static inline int fp_addmant(struct fp_ext *dest, struct fp_ext *src)
{
	int carry;

	/* we assume here that gcc only inserts a move and a clr instruction */
	asm volatile ("add.b %1,%0" : "=d,g" (dest->lowmant)
		: "g,d" (src->lowmant), "0,0" (dest->lowmant));
	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[1])
		: "d" (src->mant.m32[1]), "0" (dest->mant.m32[1]));
	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[0])
		: "d" (src->mant.m32[0]), "0" (dest->mant.m32[0]));
	asm volatile ("addx.l %0,%0" : "=d" (carry) : "0" (0));

	return carry;
}
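
/* For reference: fp_addmant() above adds the two 72-bit mantissas (the 8-bit
 * lowmant guard byte plus the 64-bit mantissa) with an add/addx/addx chain
 * and returns the carry out of the top word.  The function below is an
 * illustrative, portable sketch of the same carry propagation using plain
 * variables instead of struct fp_ext; it is not used by the emulator.
 */
#if 0
static inline int demo_addmant(unsigned long long *dmant, unsigned char *dlow,
			       unsigned long long smant, unsigned char slow)
{
	unsigned int lowsum = (unsigned int) *dlow + slow;
	unsigned int cin = lowsum >> 8;			/* carry out of lowmant */
	unsigned long long sum = *dmant + smant;
	int carry = sum < smant;			/* carry out of the 64-bit add */

	sum += cin;
	if (cin && sum == 0)				/* adding the carry wrapped */
		carry = 1;

	*dlow = (unsigned char) lowsum;
	*dmant = sum;
	return carry;
}
#endif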

static inline int fp_addcarry(struct fp_ext *reg)
{
	if (++reg->exp == 0x7fff) {
		if (reg->mant.m64)
			fp_set_sr(FPSR_EXC_INEX2);
		reg->mant.m64 = 0;
		fp_set_sr(FPSR_EXC_OVFL);
		return 0;
	}
	reg->lowmant = (reg->mant.m32[1] << 7) | (reg->lowmant ? 1 : 0);
	reg->mant.m32[1] = (reg->mant.m32[1] >> 1) |
			   (reg->mant.m32[0] << 31);
	reg->mant.m32[0] = (reg->mant.m32[0] >> 1) | 0x80000000;

	return 1;
}
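
/* For reference: when fp_addmant() reports a carry, fp_addcarry() above bumps
 * the exponent and shifts the mantissa/lowmant pair right by one bit,
 * re-inserting the carry as the new top bit; if the exponent saturates at
 * 0x7fff it raises the overflow (and possibly inexact) status bits instead.
 * The sketch below shows only the shift-and-reinsert step on plain variables
 * (illustrative name, status-flag handling omitted, not used by the
 * emulator).
 */
#if 0
static inline void demo_addcarry(unsigned long long *mant, unsigned char *low)
{
	/* the old LSB of the mantissa becomes bit 7 of the guard byte,
	   the old guard byte collapses into the sticky bit */
	*low = (unsigned char) ((*mant << 7) | (*low ? 1 : 0));
	*mant = (*mant >> 1) | 0x8000000000000000ULL;	/* carry becomes MSB */
}
#endif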

static inline void fp_submant(struct fp_ext *dest, struct fp_ext *src1,
			      struct fp_ext *src2)
{
	/* we assume here that gcc only inserts a move and a clr instruction */
	asm volatile ("sub.b %1,%0" : "=d,g" (dest->lowmant)
		: "g,d" (src2->lowmant), "0,0" (src1->lowmant));
	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[1])
		: "d" (src2->mant.m32[1]), "0" (src1->mant.m32[1]));
	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[0])
		: "d" (src2->mant.m32[0]), "0" (src1->mant.m32[0]));
}

#define fp_mul64(desth, destl, src1, src2) ({				\
	asm ("mulu.l %2,%1:%0" : "=d" (destl), "=d" (desth)		\
		: "dm" (src1), "0" (src2));				\
})
#define fp_div64(quot, rem, srch, srcl, div)				\
	asm ("divu.l %2,%1:%0" : "=d" (quot), "=d" (rem)		\
		: "dm" (div), "1" (srch), "0" (srcl))
#define fp_add64(dest1, dest2, src1, src2) ({				\
	asm ("add.l %1,%0" : "=d,dm" (dest2)				\
		: "dm,d" (src2), "0,0" (dest2));			\
	asm ("addx.l %1,%0" : "=d" (dest1)				\
		: "d" (src1), "0" (dest1));				\
})
#define fp_addx96(dest, src) ({						\
	/* we assume here that gcc only inserts a move and a clr instruction */ \
	asm volatile ("add.l %1,%0" : "=d,g" (dest->m32[2])		\
		: "g,d" (temp.m32[1]), "0,0" (dest->m32[2]));		\
	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[1])		\
		: "d" (temp.m32[0]), "0" (dest->m32[1]));		\
	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[0])		\
		: "d" (0), "0" (dest->m32[0]));				\
})
#define fp_sub64(dest, src) ({						\
	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[1])			\
		: "dm,d" (src.m32[1]), "0,0" (dest.m32[1]));		\
	asm ("subx.l %1,%0" : "=d" (dest.m32[0])			\
		: "d" (src.m32[0]), "0" (dest.m32[0]));			\
})
#define fp_sub96c(dest, srch, srcm, srcl) ({				\
	char carry;							\
	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[2])			\
		: "dm,d" (srcl), "0,0" (dest.m32[2]));			\
	asm ("subx.l %1,%0" : "=d" (dest.m32[1])			\
		: "d" (srcm), "0" (dest.m32[1]));			\
	asm ("subx.l %2,%1; scs %0" : "=d" (carry), "=d" (dest.m32[0])	\
		: "d" (srch), "1" (dest.m32[0]));			\
	carry;								\
})
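
/* For reference: the macros above are thin wrappers around single m68k
 * instructions.  fp_mul64() is a 32x32->64 unsigned multiply (mulu.l),
 * fp_div64() a 64/32->32 unsigned divide with remainder (divu.l), and the
 * remaining macros are add/addx and sub/subx carry chains.  The two helpers
 * below are illustrative, portable equivalents of the multiply and divide
 * (made-up names, not used by the emulator).
 */
#if 0
static inline void demo_mul64(unsigned int *desth, unsigned int *destl,
			      unsigned int src1, unsigned int src2)
{
	unsigned long long prod = (unsigned long long) src1 * src2;

	*desth = prod >> 32;
	*destl = (unsigned int) prod;
}

static inline void demo_div64(unsigned int *quot, unsigned int *rem,
			      unsigned int srch, unsigned int srcl,
			      unsigned int div)
{
	unsigned long long num = ((unsigned long long) srch << 32) | srcl;

	/* like divu.l, this is only well defined for div != 0 and a
	   quotient that fits in 32 bits */
	*quot = (unsigned int) (num / div);
	*rem = (unsigned int) (num % div);
}
#endif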

static inline void fp_multiplymant(union fp_mant128 *dest, struct fp_ext *src1,
				   struct fp_ext *src2)
{
	union fp_mant64 temp;

	fp_mul64(dest->m32[0], dest->m32[1], src1->mant.m32[0], src2->mant.m32[0]);
	fp_mul64(dest->m32[2], dest->m32[3], src1->mant.m32[1], src2->mant.m32[1]);

	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[0], src2->mant.m32[1]);
	fp_addx96(dest, temp);

	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[1], src2->mant.m32[0]);
	fp_addx96(dest, temp);
}
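
/* For reference: fp_multiplymant() above is the usual schoolbook 64x64->128
 * multiply: the high*high and low*low products fill the four result words
 * and the two cross products are then added in at a 32-bit offset via
 * fp_addx96().  Written with native 64-bit arithmetic the same result looks
 * like the sketch below (made-up name, dest[0] is the most significant word
 * as in union fp_mant128; not used by the emulator).
 */
#if 0
static inline void demo_mul128(unsigned int dest[4],
			       unsigned long long a, unsigned long long b)
{
	unsigned int ah = a >> 32, al = (unsigned int) a;
	unsigned int bh = b >> 32, bl = (unsigned int) b;
	unsigned long long lo = (unsigned long long) al * bl;
	unsigned long long hi = (unsigned long long) ah * bh;
	unsigned long long mid1 = (unsigned long long) al * bh;
	unsigned long long mid2 = (unsigned long long) ah * bl;
	unsigned long long mid = mid1 + mid2;

	if (mid < mid1)			/* the cross products carried */
		hi += 1ULL << 32;

	hi += mid >> 32;		/* upper half of the cross products */
	mid <<= 32;			/* lower half, aligned to the low word */
	lo += mid;
	if (lo < mid)			/* carry into the high half */
		hi++;

	dest[0] = hi >> 32;
	dest[1] = (unsigned int) hi;
	dest[2] = lo >> 32;
	dest[3] = (unsigned int) lo;
}
#endif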

static inline void fp_dividemant(union fp_mant128 *dest, struct fp_ext *src,
				 struct fp_ext *div)
{
	union fp_mant128 tmp;
	union fp_mant64 tmp64;
	unsigned long *mantp = dest->m32;
	unsigned long fix, rem, first, dummy;
	int i;

	/* the algorithm below requires the dividend (src) to be smaller
	   than div; both have the high bit set */
	if (src->mant.m64 >= div->mant.m64) {
		fp_sub64(src->mant, div->mant);
		*mantp = 1;
	} else
		*mantp = 0;
	mantp++;

	/* The basic idea behind this algorithm: we can't divide two 64-bit
	   numbers (AB/CD) directly, but we can calculate AB/C0.  That
	   quotient is off by a factor of C0/CD, so we multiply the first
	   result by this fix factor; after that the result is nearly
	   correct and only a few corrections are needed. */

	/* C0/CD could be precalculated, but that is a 64-bit division
	   again.  We make it easier by first dividing through C, which
	   gives 10/1D; after a single shift the value fits into 32 bits. */
	fix = 0x80000000;
	dummy = div->mant.m32[1] / div->mant.m32[0] + 1;
	dummy = (dummy >> 1) | fix;
	fp_div64(fix, dummy, fix, 0, dummy);
	fix--;

	for (i = 0; i < 3; i++, mantp++) {
		if (src->mant.m32[0] == div->mant.m32[0]) {
			fp_div64(first, rem, 0, src->mant.m32[1], div->mant.m32[0]);

			fp_mul64(*mantp, dummy, first, fix);
			*mantp += fix;
		} else {
			fp_div64(first, rem, src->mant.m32[0], src->mant.m32[1], div->mant.m32[0]);

			fp_mul64(*mantp, dummy, first, fix);
		}

		fp_mul64(tmp.m32[0], tmp.m32[1], div->mant.m32[0], first - *mantp);
		fp_add64(tmp.m32[0], tmp.m32[1], 0, rem);
		tmp.m32[2] = 0;

		fp_mul64(tmp64.m32[0], tmp64.m32[1], *mantp, div->mant.m32[1]);
		fp_sub96c(tmp, 0, tmp64.m32[0], tmp64.m32[1]);

		src->mant.m32[0] = tmp.m32[1];
		src->mant.m32[1] = tmp.m32[2];

		while (!fp_sub96c(tmp, 0, div->mant.m32[0], div->mant.m32[1])) {
			src->mant.m32[0] = tmp.m32[1];
			src->mant.m32[1] = tmp.m32[2];
			*mantp += 1;
		}
	}
}
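
/* For reference: fp_dividemant() above produces the quotient of the two
 * 64-bit mantissas as a 128-bit fixed-point value (dest->m32[0] holds the
 * integer bit), one 32-bit digit per iteration, using divu.l plus the
 * precomputed fix factor described in the comments.  A much slower but
 * easier-to-verify way to obtain the same kind of quotient is plain
 * restoring division, one bit at a time, as sketched below (made-up name,
 * remainder handling omitted, not the emulator's algorithm).
 */
#if 0
static inline void demo_dividemant(unsigned int dest[4],
				   unsigned long long src,
				   unsigned long long div)
{
	unsigned long long rem = src;
	unsigned int carry = 0;		/* 65th bit of the remainder */
	int i;

	dest[0] = dest[1] = dest[2] = dest[3] = 0;

	/* both mantissas have their top bit set, so src < 2 * div and the
	   quotient bit at position 96 is the integer bit of src / div */
	for (i = 96; i >= 0; i--) {
		if (carry || rem >= div) {
			rem -= div;	/* the 65th bit cancels in the subtract */
			dest[3 - i / 32] |= 1U << (i % 32);
		}
		carry = rem >> 63;	/* remember the bit shifted out */
		rem <<= 1;
	}
}
#endif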

#if 0
static inline unsigned int fp_fls128(union fp_mant128 *src)
{
	unsigned long data;
	unsigned int res, off;

	if ((data = src->m32[0]))
		off = 0;
	else if ((data = src->m32[1]))
		off = 32;
	else if ((data = src->m32[2]))
		off = 64;
	else if ((data = src->m32[3]))
		off = 96;
	else
		return 128;

	asm ("bfffo %1{#0,#32},%0" : "=d" (res) : "dm" (data));
	return res + off;
}

static inline void fp_shiftmant128(union fp_mant128 *src, int shift)
{
	unsigned long sticky;

	switch (shift) {
	case 0:
		return;
	case 1:
		asm volatile ("lsl.l #1,%0"
			: "=d" (src->m32[3]) : "0" (src->m32[3]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (src->m32[2]) : "0" (src->m32[2]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (src->m32[1]) : "0" (src->m32[1]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (src->m32[0]) : "0" (src->m32[0]));
		return;
	case 2 ... 31:
		src->m32[0] = (src->m32[0] << shift) | (src->m32[1] >> (32 - shift));
		src->m32[1] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
		src->m32[2] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
		src->m32[3] = (src->m32[3] << shift);
		return;
	case 32 ... 63:
		shift -= 32;
		src->m32[0] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
		src->m32[1] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
		src->m32[2] = (src->m32[3] << shift);
		src->m32[3] = 0;
		return;
	case 64 ... 95:
		shift -= 64;
		src->m32[0] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
		src->m32[1] = (src->m32[3] << shift);
		src->m32[2] = src->m32[3] = 0;
		return;
	case 96 ... 127:
		shift -= 96;
		src->m32[0] = (src->m32[3] << shift);
		src->m32[1] = src->m32[2] = src->m32[3] = 0;
		return;
	case -31 ... -1:
		shift = -shift;
		sticky = 0;
		if (src->m32[3] << (32 - shift))
			sticky = 1;
		src->m32[3] = (src->m32[3] >> shift) | (src->m32[2] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift));
		src->m32[1] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
		src->m32[0] = (src->m32[0] >> shift);
		return;
	case -63 ... -32:
		shift = -shift - 32;
		sticky = 0;
		if ((src->m32[2] << (32 - shift)) || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
		src->m32[1] = (src->m32[0] >> shift);
		src->m32[0] = 0;
		return;
	case -95 ... -64:
		shift = -shift - 64;
		sticky = 0;
		if ((src->m32[1] << (32 - shift)) || src->m32[2] || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[0] >> shift);
		src->m32[1] = src->m32[0] = 0;
		return;
	case -127 ... -96:
		shift = -shift - 96;
		sticky = 0;
		if ((src->m32[0] << (32 - shift)) || src->m32[1] || src->m32[2] || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[0] >> shift) | sticky;
		src->m32[2] = src->m32[1] = src->m32[0] = 0;
		return;
	}

	if (shift < 0 && (src->m32[0] || src->m32[1] || src->m32[2] || src->m32[3]))
		src->m32[3] = 1;
	else
		src->m32[3] = 0;
	src->m32[2] = 0;
	src->m32[1] = 0;
	src->m32[0] = 0;
}
#endif

static inline void fp_putmant128(struct fp_ext *dest, union fp_mant128 *src,
				 int shift)
{
	unsigned long tmp;

	switch (shift) {
	case 0:
		dest->mant.m64 = src->m64[0];
		dest->lowmant = src->m32[2] >> 24;
		if (src->m32[3] || (src->m32[2] << 8))
			dest->lowmant |= 1;
		break;
	case 1:
		asm volatile ("lsl.l #1,%0"
			: "=d" (tmp) : "0" (src->m32[2]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (dest->mant.m32[1]) : "0" (src->m32[1]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (dest->mant.m32[0]) : "0" (src->m32[0]));
		dest->lowmant = tmp >> 24;
		if (src->m32[3] || (tmp << 8))
			dest->lowmant |= 1;
		break;
	case 31:
		asm volatile ("lsr.l #1,%1; roxr.l #1,%0"
			: "=d" (dest->mant.m32[0])
			: "d" (src->m32[0]), "0" (src->m32[1]));
		asm volatile ("roxr.l #1,%0"
			: "=d" (dest->mant.m32[1]) : "0" (src->m32[2]));
		asm volatile ("roxr.l #1,%0"
			: "=d" (tmp) : "0" (src->m32[3]));
		dest->lowmant = tmp >> 24;
		if (src->m32[3] << 7)
			dest->lowmant |= 1;
		break;
	case 32:
		dest->mant.m32[0] = src->m32[1];
		dest->mant.m32[1] = src->m32[2];
		dest->lowmant = src->m32[3] >> 24;
		if (src->m32[3] << 8)
			dest->lowmant |= 1;
		break;
	}
}
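
/* For reference: fp_putmant128() above packs a 128-bit intermediate mantissa
 * back into the 64-bit mantissa plus the 8-bit lowmant guard byte, after
 * shifting it left by 0, 1, 31 or 32 bits so that the leading one lands in
 * the top bit; whatever does not fit is collapsed into lowmant's sticky LSB.
 * For the shift == 0 case the effect is equivalent to the portable sketch
 * below (made-up name, not used by the emulator).
 */
#if 0
static inline void demo_putmant128(unsigned long long *mant, unsigned char *low,
				   const unsigned int src[4])
{
	/* src[0] is the most significant word, as in union fp_mant128 */
	*mant = ((unsigned long long) src[0] << 32) | src[1];
	*low = src[2] >> 24;			/* next 8 bits -> guard byte */
	if ((src[2] << 8) || src[3])		/* everything below is sticky */
		*low |= 1;
}
#endif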

#if 0 /* old code... */
static inline int fls(unsigned int a)
{
	int r;

	asm volatile ("bfffo %1{#0,#32},%0"
		      : "=d" (r) : "md" (a));
	return r;
}

/* fls = "find last set" (cf. ffs(3)) */
static inline int fls128(const int128 a)
{
	if (a[MSW128])
		return fls(a[MSW128]);
	if (a[NMSW128])
		return fls(a[NMSW128]) + 32;
	/* XXX: it probably never gets beyond this point in actual
	   use, but that's indicative of a more general problem in the
	   algorithm (i.e. as per the actual 68881 implementation, we
	   really only need at most 67 bits of precision [plus
	   overflow]) so I'm not going to fix it. */
	if (a[NLSW128])
		return fls(a[NLSW128]) + 64;
	if (a[LSW128])
		return fls(a[LSW128]) + 96;
	else
		return -1;
}

static inline int zerop128(const int128 a)
{
	return !(a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}

static inline int nonzerop128(const int128 a)
{
	return (a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}

/* Addition and subtraction */
/* Do these in "pure" assembly, because "extended" asm is unmanageable
   here */
static inline void add128(const int128 a, int128 b)
{
	/* rotating carry flags */
	unsigned int carry[2];

	carry[0] = a[LSW128] > (0xffffffff - b[LSW128]);
	b[LSW128] += a[LSW128];

	carry[1] = a[NLSW128] > (0xffffffff - b[NLSW128] - carry[0]);
	b[NLSW128] = a[NLSW128] + b[NLSW128] + carry[0];

	carry[0] = a[NMSW128] > (0xffffffff - b[NMSW128] - carry[1]);
	b[NMSW128] = a[NMSW128] + b[NMSW128] + carry[1];

	b[MSW128] = a[MSW128] + b[MSW128] + carry[0];
}

/* Note: assembler semantics: "b -= a" */
static inline void sub128(const int128 a, int128 b)
{
	/* rotating borrow flags */
	unsigned int borrow[2];

	borrow[0] = b[LSW128] < a[LSW128];
	b[LSW128] -= a[LSW128];

	borrow[1] = b[NLSW128] < a[NLSW128] + borrow[0];
	b[NLSW128] = b[NLSW128] - a[NLSW128] - borrow[0];

	borrow[0] = b[NMSW128] < a[NMSW128] + borrow[1];
	b[NMSW128] = b[NMSW128] - a[NMSW128] - borrow[1];

	b[MSW128] = b[MSW128] - a[MSW128] - borrow[0];
}

/* Poor man's 64-bit expanding multiply */
static inline void mul64(unsigned long long a, unsigned long long b, int128 c)
{
	unsigned long long acc;
	int128 acc128;

	zero128(acc128);
	zero128(c);

	/* first the low words */
	if (LO_WORD(a) && LO_WORD(b)) {
		acc = (long long) LO_WORD(a) * LO_WORD(b);
		c[NLSW128] = HI_WORD(acc);
		c[LSW128] = LO_WORD(acc);
	}
	/* Next the high words */
	if (HI_WORD(a) && HI_WORD(b)) {
		acc = (long long) HI_WORD(a) * HI_WORD(b);
		c[MSW128] = HI_WORD(acc);
		c[NMSW128] = LO_WORD(acc);
	}
	/* The middle words */
	if (LO_WORD(a) && HI_WORD(b)) {
		acc = (long long) LO_WORD(a) * HI_WORD(b);
		acc128[NMSW128] = HI_WORD(acc);
		acc128[NLSW128] = LO_WORD(acc);
		add128(acc128, c);
	}
	/* The first and last words */
	if (HI_WORD(a) && LO_WORD(b)) {
		acc = (long long) HI_WORD(a) * LO_WORD(b);
		acc128[NMSW128] = HI_WORD(acc);
		acc128[NLSW128] = LO_WORD(acc);
		add128(acc128, c);
	}
}

/* Note: unsigned */
static inline int cmp128(int128 a, int128 b)
{
	if (a[MSW128] < b[MSW128])
		return -1;
	if (a[MSW128] > b[MSW128])
		return 1;
	if (a[NMSW128] < b[NMSW128])
		return -1;
	if (a[NMSW128] > b[NMSW128])
		return 1;
	if (a[NLSW128] < b[NLSW128])
		return -1;
	if (a[NLSW128] > b[NLSW128])
		return 1;

	return (signed) a[LSW128] - b[LSW128];
}

inline void div128(int128 a, int128 b, int128 c)
{
	int128 mask;

	/* Algorithm:

	   Shift the divisor until it's at least as big as the
	   dividend, keeping track of the position to which we've
	   shifted it, i.e. the power of 2 which we've multiplied it
	   by.

	   Then, for this power of 2 (the mask) and every smaller one,
	   whenever the dividend is at least as big as the raised
	   divisor, subtract the raised divisor from the dividend and
	   add the mask to the quotient.  Then halve the mask and the
	   raised divisor (i.e. shift one place to the right).  Lather,
	   rinse, and repeat, until there are no more powers of 2 left. */

	/* FIXME: needless to say, there's room for improvement here too. */

	/* Shift up */
	/* XXX: since it just has to be "at least as big", we can
	   probably eliminate this horribly wasteful loop.  I will
	   have to prove this first, though */
	set128(0, 0, 0, 1, mask);
	while (cmp128(b, a) < 0 && !btsthi128(b)) {
		lslone128(b);
		lslone128(mask);
	}

	/* Shift down */
	zero128(c);
	do {
		if (cmp128(a, b) >= 0) {
			sub128(b, a);
			add128(mask, c);
		}
		lsrone128(mask);
		lsrone128(b);
	} while (nonzerop128(mask));

	/* The remainder is in a... */
}
#endif

#endif	/* MULTI_ARITH_H */