1 /* Software floating-point emulation.
2    Basic two-word fraction declaration and manipulation.
3    Copyright (C) 1997-2022 Free Software Foundation, Inc.
4    This file is part of the GNU C Library.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    In addition to the permissions in the GNU Lesser General Public
12    License, the Free Software Foundation gives you unlimited
13    permission to link the compiled version of this file into
14    combinations with other programs, and to distribute those
15    combinations without any restriction coming from the use of this
16    file.  (The Lesser General Public License restrictions do apply in
17    other respects; for example, they cover modification of the file,
18    and distribution when not linked into a combine executable.)
19 
20    The GNU C Library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24 
25    You should have received a copy of the GNU Lesser General Public
26    License along with the GNU C Library; if not, see
27    <https://www.gnu.org/licenses/>.  */
28 
29 #ifndef SOFT_FP_OP_2_H
30 #define SOFT_FP_OP_2_H	1
31 
32 #define _FP_FRAC_DECL_2(X)				\
33   _FP_W_TYPE X##_f0 _FP_ZERO_INIT, X##_f1 _FP_ZERO_INIT
34 #define _FP_FRAC_COPY_2(D, S)	(D##_f0 = S##_f0, D##_f1 = S##_f1)
35 #define _FP_FRAC_SET_2(X, I)	__FP_FRAC_SET_2 (X, I)
36 #define _FP_FRAC_HIGH_2(X)	(X##_f1)
37 #define _FP_FRAC_LOW_2(X)	(X##_f0)
38 #define _FP_FRAC_WORD_2(X, w)	(X##_f##w)
39 
40 #define _FP_FRAC_SLL_2(X, N)						\
41   (void) (((N) < _FP_W_TYPE_SIZE)					\
42 	  ? ({								\
43 	      if (__builtin_constant_p (N) && (N) == 1)			\
44 		{							\
45 		  X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE) (X##_f0)) < 0); \
46 		  X##_f0 += X##_f0;					\
47 		}							\
48 	      else							\
49 		{							\
50 		  X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
51 		  X##_f0 <<= (N);					\
52 		}							\
53 	      0;							\
54 	    })								\
55 	  : ({								\
56 	      X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE);		\
57 	      X##_f0 = 0;						\
58 	    }))
59 
60 
61 #define _FP_FRAC_SRL_2(X, N)						\
62   (void) (((N) < _FP_W_TYPE_SIZE)					\
63 	  ? ({								\
64 	      X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N)); \
65 	      X##_f1 >>= (N);						\
66 	    })								\
67 	  : ({								\
68 	      X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE);		\
69 	      X##_f1 = 0;						\
70 	    }))
71 
72 /* Right shift with sticky-lsb.  */
73 #define _FP_FRAC_SRST_2(X, S, N, sz)					\
74   (void) (((N) < _FP_W_TYPE_SIZE)					\
75 	  ? ({								\
76 	      S = (__builtin_constant_p (N) && (N) == 1			\
77 		   ? X##_f0 & 1						\
78 		   : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0);		\
79 	      X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \
80 	      X##_f1 >>= (N);						\
81 	    })								\
82 	  : ({								\
83 	      S = ((((N) == _FP_W_TYPE_SIZE				\
84 		     ? 0						\
85 		     : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))		\
86 		    | X##_f0) != 0);					\
87 	      X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE));		\
88 	      X##_f1 = 0;						\
89 	    }))
90 
91 #define _FP_FRAC_SRS_2(X, N, sz)					\
92   (void) (((N) < _FP_W_TYPE_SIZE)					\
93 	  ? ({								\
94 	      X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) \
95 			| (__builtin_constant_p (N) && (N) == 1		\
96 			   ? X##_f0 & 1					\
97 			   : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \
98 	      X##_f1 >>= (N);						\
99 	    })								\
100 	  : ({								\
101 	      X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE)		\
102 			| ((((N) == _FP_W_TYPE_SIZE			\
103 			     ? 0					\
104 			     : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))	\
105 			    | X##_f0) != 0));				\
106 	      X##_f1 = 0;						\
107 	    }))
108 
109 #define _FP_FRAC_ADDI_2(X, I)	\
110   __FP_FRAC_ADDI_2 (X##_f1, X##_f0, I)
111 
112 #define _FP_FRAC_ADD_2(R, X, Y)	\
113   __FP_FRAC_ADD_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
114 
115 #define _FP_FRAC_SUB_2(R, X, Y)	\
116   __FP_FRAC_SUB_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
117 
118 #define _FP_FRAC_DEC_2(X, Y)	\
119   __FP_FRAC_DEC_2 (X##_f1, X##_f0, Y##_f1, Y##_f0)
120 
121 #define _FP_FRAC_CLZ_2(R, X)			\
122   do						\
123     {						\
124       if (X##_f1)				\
125 	__FP_CLZ ((R), X##_f1);			\
126       else					\
127 	{					\
128 	  __FP_CLZ ((R), X##_f0);		\
129 	  (R) += _FP_W_TYPE_SIZE;		\
130 	}					\
131     }						\
132   while (0)
133 
134 /* Predicates.  */
135 #define _FP_FRAC_NEGP_2(X)	((_FP_WS_TYPE) X##_f1 < 0)
136 #define _FP_FRAC_ZEROP_2(X)	((X##_f1 | X##_f0) == 0)
137 #define _FP_FRAC_OVERP_2(fs, X)	(_FP_FRAC_HIGH_##fs (X) & _FP_OVERFLOW_##fs)
138 #define _FP_FRAC_CLEAR_OVERP_2(fs, X)	(_FP_FRAC_HIGH_##fs (X) &= ~_FP_OVERFLOW_##fs)
139 #define _FP_FRAC_HIGHBIT_DW_2(fs, X)	\
140   (_FP_FRAC_HIGH_DW_##fs (X) & _FP_HIGHBIT_DW_##fs)
141 #define _FP_FRAC_EQ_2(X, Y)	(X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
142 #define _FP_FRAC_GT_2(X, Y)	\
143   (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
144 #define _FP_FRAC_GE_2(X, Y)	\
145   (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0))
146 
147 #define _FP_ZEROFRAC_2		0, 0
148 #define _FP_MINFRAC_2		0, 1
149 #define _FP_MAXFRAC_2		(~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0)
150 
151 /* Internals.  */
152 
153 #define __FP_FRAC_SET_2(X, I1, I0)	(X##_f0 = I0, X##_f1 = I1)
154 
155 #define __FP_CLZ_2(R, xh, xl)			\
156   do						\
157     {						\
158       if (xh)					\
159 	__FP_CLZ ((R), xh);			\
160       else					\
161 	{					\
162 	  __FP_CLZ ((R), xl);			\
163 	  (R) += _FP_W_TYPE_SIZE;		\
164 	}					\
165     }						\
166   while (0)
167 
168 #if 0
169 
170 # ifndef __FP_FRAC_ADDI_2
171 #  define __FP_FRAC_ADDI_2(xh, xl, i)	\
172   (xh += ((xl += i) < i))
173 # endif
174 # ifndef __FP_FRAC_ADD_2
175 #  define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl)	\
176   (rh = xh + yh + ((rl = xl + yl) < xl))
177 # endif
178 # ifndef __FP_FRAC_SUB_2
179 #  define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl)	\
180   (rh = xh - yh - ((rl = xl - yl) > xl))
181 # endif
182 # ifndef __FP_FRAC_DEC_2
183 #  define __FP_FRAC_DEC_2(xh, xl, yh, yl)		\
184   do							\
185     {							\
186       UWtype __FP_FRAC_DEC_2_t = xl;			\
187       xh -= yh + ((xl -= yl) > __FP_FRAC_DEC_2_t);	\
188     }							\
189   while (0)
190 # endif
191 
192 #else
193 
194 # undef __FP_FRAC_ADDI_2
195 # define __FP_FRAC_ADDI_2(xh, xl, i)	add_ssaaaa (xh, xl, xh, xl, 0, i)
196 # undef __FP_FRAC_ADD_2
197 # define __FP_FRAC_ADD_2		add_ssaaaa
198 # undef __FP_FRAC_SUB_2
199 # define __FP_FRAC_SUB_2		sub_ddmmss
200 # undef __FP_FRAC_DEC_2
201 # define __FP_FRAC_DEC_2(xh, xl, yh, yl)	\
202   sub_ddmmss (xh, xl, xh, xl, yh, yl)
203 
204 #endif
205 
206 /* Unpack the raw bits of a native fp value.  Do not classify or
207    normalize the data.  */
208 
209 #define _FP_UNPACK_RAW_2(fs, X, val)			\
210   do							\
211     {							\
212       union _FP_UNION_##fs _FP_UNPACK_RAW_2_flo;	\
213       _FP_UNPACK_RAW_2_flo.flt = (val);			\
214 							\
215       X##_f0 = _FP_UNPACK_RAW_2_flo.bits.frac0;		\
216       X##_f1 = _FP_UNPACK_RAW_2_flo.bits.frac1;		\
217       X##_e  = _FP_UNPACK_RAW_2_flo.bits.exp;		\
218       X##_s  = _FP_UNPACK_RAW_2_flo.bits.sign;		\
219     }							\
220   while (0)
221 
222 #define _FP_UNPACK_RAW_2_P(fs, X, val)			\
223   do							\
224     {							\
225       union _FP_UNION_##fs *_FP_UNPACK_RAW_2_P_flo	\
226 	= (union _FP_UNION_##fs *) (val);		\
227 							\
228       X##_f0 = _FP_UNPACK_RAW_2_P_flo->bits.frac0;	\
229       X##_f1 = _FP_UNPACK_RAW_2_P_flo->bits.frac1;	\
230       X##_e  = _FP_UNPACK_RAW_2_P_flo->bits.exp;	\
231       X##_s  = _FP_UNPACK_RAW_2_P_flo->bits.sign;	\
232     }							\
233   while (0)
234 
235 
236 /* Repack the raw bits of a native fp value.  */
237 
238 #define _FP_PACK_RAW_2(fs, val, X)		\
239   do						\
240     {						\
241       union _FP_UNION_##fs _FP_PACK_RAW_2_flo;	\
242 						\
243       _FP_PACK_RAW_2_flo.bits.frac0 = X##_f0;	\
244       _FP_PACK_RAW_2_flo.bits.frac1 = X##_f1;	\
245       _FP_PACK_RAW_2_flo.bits.exp   = X##_e;	\
246       _FP_PACK_RAW_2_flo.bits.sign  = X##_s;	\
247 						\
248       (val) = _FP_PACK_RAW_2_flo.flt;		\
249     }						\
250   while (0)
251 
252 #define _FP_PACK_RAW_2_P(fs, val, X)			\
253   do							\
254     {							\
255       union _FP_UNION_##fs *_FP_PACK_RAW_2_P_flo	\
256 	= (union _FP_UNION_##fs *) (val);		\
257 							\
258       _FP_PACK_RAW_2_P_flo->bits.frac0 = X##_f0;	\
259       _FP_PACK_RAW_2_P_flo->bits.frac1 = X##_f1;	\
260       _FP_PACK_RAW_2_P_flo->bits.exp   = X##_e;		\
261       _FP_PACK_RAW_2_P_flo->bits.sign  = X##_s;		\
262     }							\
263   while (0)
264 
265 
266 /* Multiplication algorithms: */
267 
268 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */
269 
270 #define _FP_MUL_MEAT_DW_2_wide(wfracbits, R, X, Y, doit)		\
271   do									\
272     {									\
273       _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_b);			\
274       _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_c);			\
275 									\
276       doit (_FP_FRAC_WORD_4 (R, 1), _FP_FRAC_WORD_4 (R, 0),		\
277 	    X##_f0, Y##_f0);						\
278       doit (_FP_MUL_MEAT_DW_2_wide_b_f1, _FP_MUL_MEAT_DW_2_wide_b_f0,	\
279 	    X##_f0, Y##_f1);						\
280       doit (_FP_MUL_MEAT_DW_2_wide_c_f1, _FP_MUL_MEAT_DW_2_wide_c_f0,	\
281 	    X##_f1, Y##_f0);						\
282       doit (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),		\
283 	    X##_f1, Y##_f1);						\
284 									\
285       __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
286 		       _FP_FRAC_WORD_4 (R, 1), 0,			\
287 		       _FP_MUL_MEAT_DW_2_wide_b_f1,			\
288 		       _FP_MUL_MEAT_DW_2_wide_b_f0,			\
289 		       _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
290 		       _FP_FRAC_WORD_4 (R, 1));				\
291       __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
292 		       _FP_FRAC_WORD_4 (R, 1), 0,			\
293 		       _FP_MUL_MEAT_DW_2_wide_c_f1,			\
294 		       _FP_MUL_MEAT_DW_2_wide_c_f0,			\
295 		       _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
296 		       _FP_FRAC_WORD_4 (R, 1));				\
297     }									\
298   while (0)
299 
300 #define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)			\
301   do									\
302     {									\
303       _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_z);				\
304 									\
305       _FP_MUL_MEAT_DW_2_wide ((wfracbits), _FP_MUL_MEAT_2_wide_z,	\
306 			      X, Y, doit);				\
307 									\
308       /* Normalize since we know where the msb of the multiplicands	\
309 	 were (bit B), we know that the msb of the of the product is	\
310 	 at either 2B or 2B-1.  */					\
311       _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_z, (wfracbits)-1,		\
312 		      2*(wfracbits));					\
313       R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 0);		\
314       R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 1);		\
315     }									\
316   while (0)
317 
318 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.
319    Do only 3 multiplications instead of four. This one is for machines
320    where multiplication is much more expensive than subtraction.  */
321 
322 #define _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, R, X, Y, doit)		\
323   do									\
324     {									\
325       _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_b);			\
326       _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_c);			\
327       _FP_W_TYPE _FP_MUL_MEAT_DW_2_wide_3mul_d;				\
328       int _FP_MUL_MEAT_DW_2_wide_3mul_c1;				\
329       int _FP_MUL_MEAT_DW_2_wide_3mul_c2;				\
330 									\
331       _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 = X##_f0 + X##_f1;		\
332       _FP_MUL_MEAT_DW_2_wide_3mul_c1					\
333 	= _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 < X##_f0;			\
334       _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 = Y##_f0 + Y##_f1;		\
335       _FP_MUL_MEAT_DW_2_wide_3mul_c2					\
336 	= _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 < Y##_f0;			\
337       doit (_FP_MUL_MEAT_DW_2_wide_3mul_d, _FP_FRAC_WORD_4 (R, 0),	\
338 	    X##_f0, Y##_f0);						\
339       doit (_FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1),		\
340 	    _FP_MUL_MEAT_DW_2_wide_3mul_b_f0,				\
341 	    _FP_MUL_MEAT_DW_2_wide_3mul_b_f1);				\
342       doit (_FP_MUL_MEAT_DW_2_wide_3mul_c_f1,				\
343 	    _FP_MUL_MEAT_DW_2_wide_3mul_c_f0, X##_f1, Y##_f1);		\
344 									\
345       _FP_MUL_MEAT_DW_2_wide_3mul_b_f0					\
346 	&= -_FP_MUL_MEAT_DW_2_wide_3mul_c2;				\
347       _FP_MUL_MEAT_DW_2_wide_3mul_b_f1					\
348 	&= -_FP_MUL_MEAT_DW_2_wide_3mul_c1;				\
349       __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
350 		       _FP_FRAC_WORD_4 (R, 1),				\
351 		       (_FP_MUL_MEAT_DW_2_wide_3mul_c1			\
352 			& _FP_MUL_MEAT_DW_2_wide_3mul_c2), 0,		\
353 		       _FP_MUL_MEAT_DW_2_wide_3mul_d,			\
354 		       0, _FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1)); \
355       __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
356 			_FP_MUL_MEAT_DW_2_wide_3mul_b_f0);		\
357       __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
358 			_FP_MUL_MEAT_DW_2_wide_3mul_b_f1);		\
359       __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
360 		       _FP_FRAC_WORD_4 (R, 1),				\
361 		       0, _FP_MUL_MEAT_DW_2_wide_3mul_d,		\
362 		       _FP_FRAC_WORD_4 (R, 0));				\
363       __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
364 		       _FP_FRAC_WORD_4 (R, 1), 0,			\
365 		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f1,		\
366 		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f0);		\
367       __FP_FRAC_ADD_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
368 		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f1,		\
369 		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f0,		\
370 		       _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2));	\
371     }									\
372   while (0)
373 
374 #define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)		\
375   do									\
376     {									\
377       _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_3mul_z);			\
378 									\
379       _FP_MUL_MEAT_DW_2_wide_3mul ((wfracbits),				\
380 				   _FP_MUL_MEAT_2_wide_3mul_z,		\
381 				   X, Y, doit);				\
382 									\
383       /* Normalize since we know where the msb of the multiplicands	\
384 	 were (bit B), we know that the msb of the of the product is	\
385 	 at either 2B or 2B-1.  */					\
386       _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_3mul_z,			\
387 		      (wfracbits)-1, 2*(wfracbits));			\
388       R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 0);		\
389       R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 1);		\
390     }									\
391   while (0)
392 
393 #define _FP_MUL_MEAT_DW_2_gmp(wfracbits, R, X, Y)	\
394   do							\
395     {							\
396       _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_x[2];		\
397       _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_y[2];		\
398       _FP_MUL_MEAT_DW_2_gmp_x[0] = X##_f0;		\
399       _FP_MUL_MEAT_DW_2_gmp_x[1] = X##_f1;		\
400       _FP_MUL_MEAT_DW_2_gmp_y[0] = Y##_f0;		\
401       _FP_MUL_MEAT_DW_2_gmp_y[1] = Y##_f1;		\
402 							\
403       mpn_mul_n (R##_f, _FP_MUL_MEAT_DW_2_gmp_x,	\
404 		 _FP_MUL_MEAT_DW_2_gmp_y, 2);		\
405     }							\
406   while (0)
407 
408 #define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)				\
409   do									\
410     {									\
411       _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_gmp_z);				\
412 									\
413       _FP_MUL_MEAT_DW_2_gmp ((wfracbits), _FP_MUL_MEAT_2_gmp_z, X, Y);	\
414 									\
415       /* Normalize since we know where the msb of the multiplicands	\
416 	 were (bit B), we know that the msb of the of the product is	\
417 	 at either 2B or 2B-1.  */					\
418       _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_gmp_z, (wfracbits)-1,		\
419 		      2*(wfracbits));					\
420       R##_f0 = _FP_MUL_MEAT_2_gmp_z_f[0];				\
421       R##_f1 = _FP_MUL_MEAT_2_gmp_z_f[1];				\
422     }									\
423   while (0)
424 
425 /* Do at most 120x120=240 bits multiplication using double floating
426    point multiplication.  This is useful if floating point
427    multiplication has much bigger throughput than integer multiply.
428    It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
429    between 106 and 120 only.
430    Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set.
431    SETFETZ is a macro which will disable all FPU exceptions and set rounding
432    towards zero,  RESETFE should optionally reset it back.  */
433 
434 #define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe) \
435   do									\
436     {									\
437       static const double _const[] =					\
438 	{								\
439 	  /* 2^-24 */ 5.9604644775390625e-08,				\
440 	  /* 2^-48 */ 3.5527136788005009e-15,				\
441 	  /* 2^-72 */ 2.1175823681357508e-22,				\
442 	  /* 2^-96 */ 1.2621774483536189e-29,				\
443 	  /* 2^28 */ 2.68435456e+08,					\
444 	  /* 2^4 */ 1.600000e+01,					\
445 	  /* 2^-20 */ 9.5367431640625e-07,				\
446 	  /* 2^-44 */ 5.6843418860808015e-14,				\
447 	  /* 2^-68 */ 3.3881317890172014e-21,				\
448 	  /* 2^-92 */ 2.0194839173657902e-28,				\
449 	  /* 2^-116 */ 1.2037062152420224e-35				\
450 	};								\
451       double _a240, _b240, _c240, _d240, _e240, _f240,			\
452 	_g240, _h240, _i240, _j240, _k240;				\
453       union { double d; UDItype i; } _l240, _m240, _n240, _o240,	\
454 				       _p240, _q240, _r240, _s240;	\
455       UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0;		\
456 									\
457       _FP_STATIC_ASSERT ((wfracbits) >= 106 && (wfracbits) <= 120,	\
458 			 "wfracbits out of range");			\
459 									\
460       setfetz;								\
461 									\
462       _e240 = (double) (long) (X##_f0 & 0xffffff);			\
463       _j240 = (double) (long) (Y##_f0 & 0xffffff);			\
464       _d240 = (double) (long) ((X##_f0 >> 24) & 0xffffff);		\
465       _i240 = (double) (long) ((Y##_f0 >> 24) & 0xffffff);		\
466       _c240 = (double) (long) (((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48)); \
467       _h240 = (double) (long) (((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48)); \
468       _b240 = (double) (long) ((X##_f1 >> 8) & 0xffffff);		\
469       _g240 = (double) (long) ((Y##_f1 >> 8) & 0xffffff);		\
470       _a240 = (double) (long) (X##_f1 >> 32);				\
471       _f240 = (double) (long) (Y##_f1 >> 32);				\
472       _e240 *= _const[3];						\
473       _j240 *= _const[3];						\
474       _d240 *= _const[2];						\
475       _i240 *= _const[2];						\
476       _c240 *= _const[1];						\
477       _h240 *= _const[1];						\
478       _b240 *= _const[0];						\
479       _g240 *= _const[0];						\
480       _s240.d =							      _e240*_j240; \
481       _r240.d =						_d240*_j240 + _e240*_i240; \
482       _q240.d =				  _c240*_j240 + _d240*_i240 + _e240*_h240; \
483       _p240.d =		    _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240; \
484       _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240; \
485       _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240;	\
486       _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240;		\
487       _l240.d = _a240*_g240 + _b240*_f240;				\
488       _k240 =   _a240*_f240;						\
489       _r240.d += _s240.d;						\
490       _q240.d += _r240.d;						\
491       _p240.d += _q240.d;						\
492       _o240.d += _p240.d;						\
493       _n240.d += _o240.d;						\
494       _m240.d += _n240.d;						\
495       _l240.d += _m240.d;						\
496       _k240 += _l240.d;							\
497       _s240.d -= ((_const[10]+_s240.d)-_const[10]);			\
498       _r240.d -= ((_const[9]+_r240.d)-_const[9]);			\
499       _q240.d -= ((_const[8]+_q240.d)-_const[8]);			\
500       _p240.d -= ((_const[7]+_p240.d)-_const[7]);			\
501       _o240.d += _const[7];						\
502       _n240.d += _const[6];						\
503       _m240.d += _const[5];						\
504       _l240.d += _const[4];						\
505       if (_s240.d != 0.0)						\
506 	_y240 = 1;							\
507       if (_r240.d != 0.0)						\
508 	_y240 = 1;							\
509       if (_q240.d != 0.0)						\
510 	_y240 = 1;							\
511       if (_p240.d != 0.0)						\
512 	_y240 = 1;							\
513       _t240 = (DItype) _k240;						\
514       _u240 = _l240.i;							\
515       _v240 = _m240.i;							\
516       _w240 = _n240.i;							\
517       _x240 = _o240.i;							\
518       R##_f1 = ((_t240 << (128 - (wfracbits - 1)))			\
519 		| ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104)));	\
520       R##_f0 = (((_u240 & 0xffffff) << (168 - (wfracbits - 1)))		\
521 		| ((_v240 & 0xffffff) << (144 - (wfracbits - 1)))	\
522 		| ((_w240 & 0xffffff) << (120 - (wfracbits - 1)))	\
523 		| ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96))	\
524 		| _y240);						\
525       resetfe;								\
526     }									\
527   while (0)
528 
529 /* Division algorithms: */
530 
531 #define _FP_DIV_MEAT_2_udiv(fs, R, X, Y)				\
532   do									\
533     {									\
534       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f2;				\
535       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f1;				\
536       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f0;				\
537       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f1;				\
538       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f0;				\
539       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f1;				\
540       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f0;				\
541       if (_FP_FRAC_GE_2 (X, Y))						\
542 	{								\
543 	  _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1 >> 1;			\
544 	  _FP_DIV_MEAT_2_udiv_n_f1					\
545 	    = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1;		\
546 	  _FP_DIV_MEAT_2_udiv_n_f0					\
547 	    = X##_f0 << (_FP_W_TYPE_SIZE - 1);				\
548 	}								\
549       else								\
550 	{								\
551 	  R##_e--;							\
552 	  _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1;				\
553 	  _FP_DIV_MEAT_2_udiv_n_f1 = X##_f0;				\
554 	  _FP_DIV_MEAT_2_udiv_n_f0 = 0;					\
555 	}								\
556 									\
557       /* Normalize, i.e. make the most significant bit of the		\
558 	 denominator set.  */						\
559       _FP_FRAC_SLL_2 (Y, _FP_WFRACXBITS_##fs);				\
560 									\
561       udiv_qrnnd (R##_f1, _FP_DIV_MEAT_2_udiv_r_f1,			\
562 		  _FP_DIV_MEAT_2_udiv_n_f2, _FP_DIV_MEAT_2_udiv_n_f1,	\
563 		  Y##_f1);						\
564       umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1, _FP_DIV_MEAT_2_udiv_m_f0,	\
565 		 R##_f1, Y##_f0);					\
566       _FP_DIV_MEAT_2_udiv_r_f0 = _FP_DIV_MEAT_2_udiv_n_f0;		\
567       if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m, _FP_DIV_MEAT_2_udiv_r))	\
568 	{								\
569 	  R##_f1--;							\
570 	  _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,			\
571 			  _FP_DIV_MEAT_2_udiv_r);			\
572 	  if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y)			\
573 	      && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,			\
574 				_FP_DIV_MEAT_2_udiv_r))			\
575 	    {								\
576 	      R##_f1--;							\
577 	      _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,			\
578 			      _FP_DIV_MEAT_2_udiv_r);			\
579 	    }								\
580 	}								\
581       _FP_FRAC_DEC_2 (_FP_DIV_MEAT_2_udiv_r, _FP_DIV_MEAT_2_udiv_m);	\
582 									\
583       if (_FP_DIV_MEAT_2_udiv_r_f1 == Y##_f1)				\
584 	{								\
585 	  /* This is a special case, not an optimization		\
586 	     (_FP_DIV_MEAT_2_udiv_r/Y##_f1 would not fit into UWtype).	\
587 	     As _FP_DIV_MEAT_2_udiv_r is guaranteed to be < Y,		\
588 	     R##_f0 can be either (UWtype)-1 or (UWtype)-2.  But as we	\
589 	     know what kind of bits it is (sticky, guard, round),	\
590 	     we don't care.  We also don't care what the reminder is,	\
591 	     because the guard bit will be set anyway.  -jj */		\
592 	  R##_f0 = -1;							\
593 	}								\
594       else								\
595 	{								\
596 	  udiv_qrnnd (R##_f0, _FP_DIV_MEAT_2_udiv_r_f1,			\
597 		      _FP_DIV_MEAT_2_udiv_r_f1,				\
598 		      _FP_DIV_MEAT_2_udiv_r_f0, Y##_f1);		\
599 	  umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1,				\
600 		     _FP_DIV_MEAT_2_udiv_m_f0, R##_f0, Y##_f0);		\
601 	  _FP_DIV_MEAT_2_udiv_r_f0 = 0;					\
602 	  if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,			\
603 			     _FP_DIV_MEAT_2_udiv_r))			\
604 	    {								\
605 	      R##_f0--;							\
606 	      _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,			\
607 			      _FP_DIV_MEAT_2_udiv_r);			\
608 	      if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y)		\
609 		  && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,		\
610 				    _FP_DIV_MEAT_2_udiv_r))		\
611 		{							\
612 		  R##_f0--;						\
613 		  _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,		\
614 				  _FP_DIV_MEAT_2_udiv_r);		\
615 		}							\
616 	    }								\
617 	  if (!_FP_FRAC_EQ_2 (_FP_DIV_MEAT_2_udiv_r,			\
618 			      _FP_DIV_MEAT_2_udiv_m))			\
619 	    R##_f0 |= _FP_WORK_STICKY;					\
620 	}								\
621     }									\
622   while (0)
623 
624 
625 /* Square root algorithms:
626    We have just one right now, maybe Newton approximation
627    should be added for those machines where division is fast.  */
628 
629 #define _FP_SQRT_MEAT_2(R, S, T, X, q)				\
630   do								\
631     {								\
632       while (q)							\
633 	{							\
634 	  T##_f1 = S##_f1 + (q);				\
635 	  if (T##_f1 <= X##_f1)					\
636 	    {							\
637 	      S##_f1 = T##_f1 + (q);				\
638 	      X##_f1 -= T##_f1;					\
639 	      R##_f1 += (q);					\
640 	    }							\
641 	  _FP_FRAC_SLL_2 (X, 1);				\
642 	  (q) >>= 1;						\
643 	}							\
644       (q) = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1);		\
645       while ((q) != _FP_WORK_ROUND)				\
646 	{							\
647 	  T##_f0 = S##_f0 + (q);				\
648 	  T##_f1 = S##_f1;					\
649 	  if (T##_f1 < X##_f1					\
650 	      || (T##_f1 == X##_f1 && T##_f0 <= X##_f0))	\
651 	    {							\
652 	      S##_f0 = T##_f0 + (q);				\
653 	      S##_f1 += (T##_f0 > S##_f0);			\
654 	      _FP_FRAC_DEC_2 (X, T);				\
655 	      R##_f0 += (q);					\
656 	    }							\
657 	  _FP_FRAC_SLL_2 (X, 1);				\
658 	  (q) >>= 1;						\
659 	}							\
660       if (X##_f0 | X##_f1)					\
661 	{							\
662 	  if (S##_f1 < X##_f1					\
663 	      || (S##_f1 == X##_f1 && S##_f0 < X##_f0))		\
664 	    R##_f0 |= _FP_WORK_ROUND;				\
665 	  R##_f0 |= _FP_WORK_STICKY;				\
666 	}							\
667     }								\
668   while (0)
669 
670 
671 /* Assembly/disassembly for converting to/from integral types.
672    No shifting or overflow handled here.  */
673 
674 #define _FP_FRAC_ASSEMBLE_2(r, X, rsize)	\
675   (void) (((rsize) <= _FP_W_TYPE_SIZE)		\
676 	  ? ({ (r) = X##_f0; })			\
677 	  : ({					\
678 	      (r) = X##_f1;			\
679 	      (r) <<= _FP_W_TYPE_SIZE;		\
680 	      (r) += X##_f0;			\
681 	    }))
682 
683 #define _FP_FRAC_DISASSEMBLE_2(X, r, rsize)	\
684   do						\
685     {						\
686       X##_f0 = (r);				\
687       X##_f1 = ((rsize) <= _FP_W_TYPE_SIZE	\
688 		? 0				\
689 		: (r) >> _FP_W_TYPE_SIZE);	\
690     }						\
691   while (0)
692 
693 /* Convert FP values between word sizes.  */
694 
695 #define _FP_FRAC_COPY_1_2(D, S)		(D##_f = S##_f0)
696 
697 #define _FP_FRAC_COPY_2_1(D, S)		((D##_f0 = S##_f), (D##_f1 = 0))
698 
699 #define _FP_FRAC_COPY_2_2(D, S)		_FP_FRAC_COPY_2 (D, S)
700 
701 #endif /* !SOFT_FP_OP_2_H */
702