1 /*
2  * Copyright (C) 2017 Denys Vlasenko
3  *
4  * Licensed under GPLv2, see file LICENSE in this source tree.
5  */
6 #include "tls.h"
7 
8 /* The file is taken almost verbatim from matrixssl-3-7-2b-open/crypto/math/.
9  * Changes are flagged with //bbox
10  */
11 
12 /**
13  *	@file    pstm_sqr_comba.c
14  *	@version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
15  *
16  *	Multiprecision Squaring with Comba technique.
17  */
18 /*
19  *	Copyright (c) 2013-2015 INSIDE Secure Corporation
20  *	Copyright (c) PeerSec Networks, 2002-2011
21  *	All Rights Reserved
22  *
23  *	The latest version of this code is available at http://www.matrixssl.org
24  *
25  *	This software is open source; you can redistribute it and/or modify
26  *	it under the terms of the GNU General Public License as published by
27  *	the Free Software Foundation; either version 2 of the License, or
28  *	(at your option) any later version.
29  *
30  *	This General Public License does NOT permit incorporating this software
31  *	into proprietary programs.  If you are unable to comply with the GPL, a
32  *	commercial license for this software may be purchased from INSIDE at
33  *	http://www.insidesecure.com/eng/Company/Locations
34  *
35  *	This program is distributed in WITHOUT ANY WARRANTY; without even the
36  *	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37  *	See the GNU General Public License for more details.
38  *
39  *	You should have received a copy of the GNU General Public License
40  *	along with this program; if not, write to the Free Software
41  *	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
42  *	http://www.gnu.org/copyleft/gpl.html
43  */
44 /******************************************************************************/
45 
46 //bbox
47 //#include "../cryptoApi.h"
48 #ifndef DISABLE_PSTM
49 
50 /******************************************************************************/
51 #if defined(PSTM_X86)
52 /* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
53 #if !defined(__GNUC__) || !defined(__i386__)
54 #error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
55 #endif
56 //#pragma message ("Using 32 bit x86 Assembly Optimizations")
57 
/*
 * Column-accumulator primitives for the comba squarer, 32-bit x86 flavour.
 *
 * Every macro operates on variables that must already be declared in the
 * expanding scope: c0, c1, c2 form a three-word accumulator (c0 is the
 * least significant word, carries ripple c0 -> c1 -> c2), and sc0, sc1, sc2
 * form a second accumulator used by the SC/AC/DB macros.
 * Each macro expansion already ends in ';'.
 */

/* Nothing to set up before a comba pass */
#define COMBA_START

/* Zero all three accumulator words */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* Store the low accumulator word c0 into x */
#define COMBA_STORE(x) \
   x = c0;

/* Store the middle accumulator word c1 into x */
#define COMBA_STORE2(x) \
   x = c1;

/* Advance to the next column: c0 = c1, c1 = c2, c2 = 0 */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* Nothing to tear down after a comba pass */
#define COMBA_FINI

/*
 * c2:c1:c0 += i * i
 * Operand i is loaded into %eax and multiplied by itself; the j argument
 * is not referenced by the assembly.
 */
#define SQRADD(i, j)                                      \
asm(                                            \
	 "movl  %6,%%eax     \n\t"                            \
	 "mull  %%eax        \n\t"                            \
	 "addl  %%eax,%0     \n\t"                            \
	 "adcl  %%edx,%1     \n\t"                            \
	 "adcl  $0,%2        \n\t"                            \
	 :"=rm"(c0), "=rm"(c1), "=rm"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");
	//bbox: ^^^ replaced "=r" with "=rm": %ebx is not available on shared build

/*
 * c2:c1:c0 += 2 * i * j
 * The product is computed once (%edx:%eax) and the add/adc sequence is
 * issued twice.
 */
#define SQRADD2(i, j)                                     \
asm(                                            \
	 "movl  %6,%%eax     \n\t"                            \
	 "mull  %7           \n\t"                            \
	 "addl  %%eax,%0     \n\t"                            \
	 "adcl  %%edx,%1     \n\t"                            \
	 "adcl  $0,%2        \n\t"                            \
	 "addl  %%eax,%0     \n\t"                            \
	 "adcl  %%edx,%1     \n\t"                            \
	 "adcl  $0,%2        \n\t"                            \
	 :"=rm"(c0), "=rm"(c1), "=rm"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");
	//bbox: ^^^ replaced "=r" with "=rm": %ebx is not available on shared build

/*
 * sc2:sc1:sc0 = i * j
 * sc0/sc1 receive the low/high product words, sc2 is zeroed with xorl.
 */
#define SQRADDSC(i, j)                                    \
asm(                                                     \
	 "movl  %6,%%eax     \n\t"                            \
	 "mull  %7           \n\t"                            \
	 "movl  %%eax,%0     \n\t"                            \
	 "movl  %%edx,%1     \n\t"                            \
	 "xorl  %2,%2        \n\t"                            \
	 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");

/* sc2:sc1:sc0 += i * j */
#define SQRADDAC(i, j)                                    \
asm(                                                     \
	 "movl  %6,%%eax     \n\t"                            \
	 "mull  %7           \n\t"                            \
	 "addl  %%eax,%0     \n\t"                            \
	 "adcl  %%edx,%1     \n\t"                            \
	 "adcl  $0,%2        \n\t"                            \
	 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");

/*
 * c2:c1:c0 += 2 * sc2:sc1:sc0
 * Operands %6, %7, %8 are sc0, sc1, sc2; the three-word add is issued twice.
 */
#define SQRADDDB                                          \
asm(                                                     \
	 "addl %6,%0         \n\t"                            \
	 "adcl %7,%1         \n\t"                            \
	 "adcl %8,%2         \n\t"                            \
	 "addl %6,%0         \n\t"                            \
	 "adcl %7,%1         \n\t"                            \
	 "adcl %8,%2         \n\t"                            \
	 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
124 
125 /******************************************************************************/
126 #elif defined(PSTM_X86_64)
127 /* x86-64 optimized */
128 #if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
129 #error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
130 #endif
131 //#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
132 
/*
 * Column-accumulator primitives for the comba squarer, x86-64 flavour.
 *
 * Every macro operates on variables that must already be declared in the
 * expanding scope: c0, c1, c2 form a three-word accumulator (c0 is the
 * least significant word, carries ripple c0 -> c1 -> c2), and sc0, sc1, sc2
 * form a second accumulator used by the SC/AC/DB macros.
 * Each macro expansion already ends in ';'.
 */

/* Nothing to set up before a comba pass */
#define COMBA_START

/* Zero all three accumulator words */
#define CLEAR_CARRY \
c0 = c1 = c2 = 0;

/* Store the low accumulator word c0 into x */
#define COMBA_STORE(x) \
x = c0;

/* Store the middle accumulator word c1 into x */
#define COMBA_STORE2(x) \
x = c1;

/* Advance to the next column: c0 = c1, c1 = c2, c2 = 0 */
#define CARRY_FORWARD \
do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* Nothing to tear down after a comba pass */
#define COMBA_FINI

/*
 * c2:c1:c0 += i * i
 * Operand i is loaded into %rax and multiplied by itself; the j argument
 * is not referenced by the assembly.
 */
#define SQRADD(i, j)                                     \
asm(                                                     \
	"movq  %6,%%rax     \n\t"                            \
	"mulq  %%rax        \n\t"                            \
	"addq  %%rax,%0     \n\t"                            \
	"adcq  %%rdx,%1     \n\t"                            \
	"adcq  $0,%2        \n\t"                            \
	:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");

/*
 * c2:c1:c0 += 2 * i * j
 * The product is computed once (%rdx:%rax) and the add/adc sequence is
 * issued twice.
 */
#define SQRADD2(i, j)                                    \
asm(                                                     \
	"movq  %6,%%rax     \n\t"                            \
	"mulq  %7           \n\t"                            \
	"addq  %%rax,%0     \n\t"                            \
	"adcq  %%rdx,%1     \n\t"                            \
	"adcq  $0,%2        \n\t"                            \
	"addq  %%rax,%0     \n\t"                            \
	"adcq  %%rdx,%1     \n\t"                            \
	"adcq  $0,%2        \n\t"                            \
	:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");

/*
 * sc2:sc1:sc0 = i * j
 * sc0/sc1 receive the low/high product words, sc2 is zeroed with xorq.
 */
#define SQRADDSC(i, j)                                   \
asm(                                                     \
	"movq  %6,%%rax     \n\t"                            \
	"mulq  %7           \n\t"                            \
	"movq  %%rax,%0     \n\t"                            \
	"movq  %%rdx,%1     \n\t"                            \
	"xorq  %2,%2        \n\t"                            \
	:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");

/* sc2:sc1:sc0 += i * j */
#define SQRADDAC(i, j)                                   \
asm(                                                     \
	"movq  %6,%%rax     \n\t"                            \
	"mulq  %7           \n\t"                            \
	"addq  %%rax,%0     \n\t"                            \
	"adcq  %%rdx,%1     \n\t"                            \
	"adcq  $0,%2        \n\t"                            \
	:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");

/*
 * c2:c1:c0 += 2 * sc2:sc1:sc0
 * Operands %6, %7, %8 are sc0, sc1, sc2; the three-word add is issued twice.
 */
#define SQRADDDB                                         \
asm(                                                     \
	"addq %6,%0         \n\t"                            \
	"adcq %7,%1         \n\t"                            \
	"adcq %8,%2         \n\t"                            \
	"addq %6,%0         \n\t"                            \
	"adcq %7,%1         \n\t"                            \
	"adcq %8,%2         \n\t"                            \
	:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
197 
198 /******************************************************************************/
199 #elif defined(PSTM_ARM)
200 /* ARM code */
201 //#pragma message ("Using 32 bit ARM Assembly Optimizations")
202 
/*
 * Column-accumulator primitives for the comba squarer, 32-bit ARM flavour.
 *
 * Every macro operates on variables that must already be declared in the
 * expanding scope: c0, c1, c2 form a three-word accumulator (c0 is the
 * least significant word, carries ripple c0 -> c1 -> c2), and sc0, sc1, sc2
 * form a second accumulator used by the SC/AC/DB macros.
 * r0 and r1 are used as scratch registers and listed as clobbers where the
 * assembly names them. Each macro expansion already ends in ';'.
 */

/* Nothing to set up before a comba pass */
#define COMBA_START

/* Zero all three accumulator words */
#define CLEAR_CARRY \
c0 = c1 = c2 = 0;

/* Store the low accumulator word c0 into x */
#define COMBA_STORE(x) \
x = c0;

/* Store the middle accumulator word c1 into x */
#define COMBA_STORE2(x) \
x = c1;

/* Advance to the next column: c0 = c1, c1 = c2, c2 = 0 */
#define CARRY_FORWARD \
do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* Nothing to tear down after a comba pass */
#define COMBA_FINI

/*
 * c2:c1:c0 += i * i
 * UMULL multiplies operand i by itself into r0 (low) / r1 (high); the j
 * argument is not referenced by the assembly.
 */
#define SQRADD(i, j)                                             \
asm(                                                             \
"  UMULL  r0,r1,%6,%6              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");

/*
 * c2:c1:c0 += 2 * i * j
 * For squaring, the off-diagonal terms are doubled: the product is computed
 * once and the add-with-carry sequence is issued twice.
 */
#define SQRADD2(i, j)                                            \
asm(                                                             \
"  UMULL  r0,r1,%6,%7              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");

/*
 * sc2:sc1:sc0 = i * j
 * UMULL writes the product straight into sc0/sc1; sc2 is zeroed by
 * subtracting it from itself.
 */
#define SQRADDSC(i, j)                                           \
asm(                                                             \
"  UMULL  %0,%1,%6,%7              \n\t"                         \
"  SUB    %2,%2,%2                 \n\t"                         \
:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "cc");

/* sc2:sc1:sc0 += i * j */
#define SQRADDAC(i, j)                                           \
asm(                                                             \
"  UMULL  r0,r1,%6,%7              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");

/*
 * c2:c1:c0 += 2 * sc2:sc1:sc0
 * Note the operand order differs from the x86 variants: here %3, %4, %5
 * are sc0, sc1, sc2 and the tied c0/c1/c2 inputs come last.
 */
#define SQRADDDB                                                 \
asm(                                                             \
"  ADDS  %0,%0,%3                     \n\t"                      \
"  ADCS  %1,%1,%4                     \n\t"                      \
"  ADC   %2,%2,%5                     \n\t"                      \
"  ADDS  %0,%0,%3                     \n\t"                      \
"  ADCS  %1,%1,%4                     \n\t"                      \
"  ADC   %2,%2,%5                     \n\t"                      \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
263 
264 /******************************************************************************/
265 #elif defined(PSTM_MIPS)
266 /* MIPS32 */
267 //#pragma message ("Using 32 bit MIPS Assembly Optimizations")
268 
/*
 * Column-accumulator primitives for the comba squarer, MIPS32 flavour.
 *
 * Every macro operates on variables that must already be declared in the
 * expanding scope: c0, c1, c2 form a three-word accumulator (c0 is the
 * least significant word, carries ripple c0 -> c1 -> c2), and sc0, sc1, sc2
 * form a second accumulator used by the SC/AC/DB macros.
 * The sequences below derive each carry with sltu: after "sum = sum + x",
 * "sltu t,sum,x" yields 1 when the addition wrapped.
 * Each macro expansion already ends in ';'.
 *
 * NOTE(review): multu writes the HI/LO registers, which do not appear in
 * any clobber list here - confirm this is acceptable for the compilers
 * this option is built with.
 */

/* Nothing to set up before a comba pass */
#define COMBA_START

/* Zero all three accumulator words */
#define CLEAR_CARRY \
c0 = c1 = c2 = 0;

/* Store the low accumulator word c0 into x */
#define COMBA_STORE(x) \
x = c0;

/* Store the middle accumulator word c1 into x */
#define COMBA_STORE2(x) \
x = c1;

/* Advance to the next column: c0 = c1, c1 = c2, c2 = 0 */
#define CARRY_FORWARD \
do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* Nothing to tear down after a comba pass */
#define COMBA_FINI

/*
 * c2:c1:c0 += i * i
 * Operand i is multiplied by itself; the j argument is not referenced by
 * the assembly. $12/$13 first hold the product words and are then reused
 * for the carries.
 */
#define SQRADD(i, j)               \
asm(                               \
	" multu  %6,%6          \n\t"  \
	" mflo   $12            \n\t"  \
	" mfhi   $13            \n\t"  \
	" addu    %0,%0,$12     \n\t"  \
	" sltu   $12,%0,$12     \n\t"  \
	" addu    %1,%1,$13     \n\t"  \
	" sltu   $13,%1,$13     \n\t"  \
	" addu    %1,%1,$12     \n\t"  \
	" sltu   $12,%1,$12     \n\t"  \
	" addu    %2,%2,$13     \n\t"  \
	" addu    %2,%2,$12     \n\t"  \
	:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13");

/*
 * c2:c1:c0 += 2 * i * j
 * For squaring, the off-diagonal terms are doubled. The product stays in
 * $12/$13 (carries go to $14/$15) so the same add sequence can run twice.
 */
#define SQRADD2(i, j)             \
asm(                              \
	" multu  %6,%7          \n\t" \
	" mflo   $12            \n\t" \
	" mfhi   $13            \n\t" \
	\
	" addu    %0,%0,$12     \n\t" \
	" sltu   $14,%0,$12     \n\t" \
	" addu    %1,%1,$13     \n\t" \
	" sltu   $15,%1,$13     \n\t" \
	" addu    %1,%1,$14     \n\t" \
	" sltu   $14,%1,$14     \n\t" \
	" addu    %2,%2,$15     \n\t" \
	" addu    %2,%2,$14     \n\t" \
	\
	" addu    %0,%0,$12     \n\t" \
	" sltu   $14,%0,$12     \n\t" \
	" addu    %1,%1,$13     \n\t" \
	" sltu   $15,%1,$13     \n\t" \
	" addu    %1,%1,$14     \n\t" \
	" sltu   $14,%1,$14     \n\t" \
	" addu    %2,%2,$15     \n\t" \
	" addu    %2,%2,$14     \n\t" \
	:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15");

/*
 * sc2:sc1:sc0 = i * j
 * The product words are moved straight into sc0/sc1; sc2 is zeroed by
 * xor-ing it with itself.
 */
#define SQRADDSC(i, j)             \
asm(                               \
	" multu  %6,%7          \n\t"  \
	" mflo   %0             \n\t"  \
	" mfhi   %1             \n\t"  \
	" xor    %2,%2,%2       \n\t"  \
	:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

/*
 * sc2:sc1:sc0 += i * j
 * Same instruction sequence as SQRADD, applied to the sc accumulator.
 * The clobber list also names $14 although the sequence only uses
 * $12 and $13.
 */
#define SQRADDAC(i, j)            \
asm(                              \
	" multu  %6,%7          \n\t" \
	" mflo   $12            \n\t" \
	" mfhi   $13            \n\t" \
	" addu    %0,%0,$12     \n\t" \
	" sltu   $12,%0,$12     \n\t" \
	" addu    %1,%1,$13     \n\t" \
	" sltu   $13,%1,$13     \n\t" \
	" addu    %1,%1,$12     \n\t" \
	" sltu   $12,%1,$12     \n\t" \
	" addu    %2,%2,$13     \n\t" \
	" addu    %2,%2,$12     \n\t" \
	:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14");

/*
 * c2:c1:c0 += 2 * sc2:sc1:sc0
 * Operands %3, %4, %5 are sc0, sc1, sc2; $10/$11 hold the carries.
 * The three-word add is issued twice.
 */
#define SQRADDDB                   \
asm(                               \
	" addu    %0,%0,%3       \n\t" \
	" sltu   $10,%0,%3       \n\t" \
	" addu    %1,%1,$10      \n\t" \
	" sltu   $10,%1,$10      \n\t" \
	" addu    %1,%1,%4       \n\t" \
	" sltu   $11,%1,%4       \n\t" \
	" addu    %2,%2,$10      \n\t" \
	" addu    %2,%2,$11      \n\t" \
	" addu    %2,%2,%5       \n\t" \
	\
	" addu    %0,%0,%3       \n\t" \
	" sltu   $10,%0,%3       \n\t" \
	" addu    %1,%1,$10      \n\t" \
	" sltu   $10,%1,$10      \n\t" \
	" addu    %1,%1,%4       \n\t" \
	" sltu   $11,%1,%4       \n\t" \
	" addu    %2,%2,$10      \n\t" \
	" addu    %2,%2,$11      \n\t" \
	" addu    %2,%2,%5       \n\t" \
	:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11");
372 
373 #else
374 /******************************************************************************/
#define PSTM_ISO
/*
 * Portable ISO C fallback, selected when none of the assembly flavours
 * above is enabled.
 *
 * Every macro operates on variables that must already be declared in the
 * expanding scope: c0, c1, c2 form a three-digit accumulator (c0 is the
 * least significant digit, carries ripple c0 -> c1 -> c2), and sc0, sc1,
 * sc2 form a second accumulator used by the SC/AC/DB macros. SQRADD2
 * additionally needs a pstm_word variable named tt.
 * Each macro expansion already ends in ';'.
 */

/* Start-of-pass hook: nothing to do in portable C */
#define COMBA_START

/* Reset the three-digit column accumulator */
#define CLEAR_CARRY \
	c0 = c1 = c2 = 0;

/* Emit the low accumulator digit c0 into x */
#define COMBA_STORE(x) \
	x = c0;

/* Emit the middle accumulator digit c1 into x */
#define COMBA_STORE2(x) \
	x = c1;

/* Advance one column: c1 slides into c0, c2 into c1, c2 is cleared */
#define CARRY_FORWARD \
	do { \
		c0 = c1; \
		c1 = c2; \
		c2 = 0; \
	} while (0);

/* End-of-pass hook: nothing to do in portable C */
#define COMBA_FINI

/* c2:c1:c0 += i * j  (callers pass the same digit for i and j) */
#define SQRADD(i, j) \
	do { \
		pstm_word sq_acc = c0 + ((pstm_word)i) * ((pstm_word)j); \
		c0 = (pstm_digit)sq_acc; \
		sq_acc = c1 + (sq_acc >> DIGIT_BIT); \
		c1 = (pstm_digit)sq_acc; \
		c2 += (pstm_digit)(sq_acc >> DIGIT_BIT); \
	} while (0);


/*
 * c2:c1:c0 += 2 * i * j
 * The product is formed once and folded into the accumulator twice,
 * using the caller-declared tt as the running sum.
 */
#define SQRADD2(i, j) \
	do { \
		pstm_word sq2_prod = ((pstm_word)i) * ((pstm_word)j); \
		tt = (pstm_word)c0 + sq2_prod; \
		c0 = (pstm_digit)tt; \
		tt = (pstm_word)c1 + (tt >> DIGIT_BIT); \
		c1 = (pstm_digit)tt; \
		c2 += (pstm_digit)(tt >> DIGIT_BIT); \
		tt = (pstm_word)c0 + sq2_prod; \
		c0 = (pstm_digit)tt; \
		tt = (pstm_word)c1 + (tt >> DIGIT_BIT); \
		c1 = (pstm_digit)tt; \
		c2 += (pstm_digit)(tt >> DIGIT_BIT); \
	} while (0);

/* sc2:sc1:sc0 = i * j  (starts a fresh secondary accumulation) */
#define SQRADDSC(i, j) \
	do { \
		pstm_word sc_prod = ((pstm_word)i) * ((pstm_word)j); \
		sc0 = (pstm_digit)sc_prod; \
		sc1 = (pstm_digit)(sc_prod >> DIGIT_BIT); \
		sc2 = 0; \
	} while (0);

/* sc2:sc1:sc0 += i * j */
#define SQRADDAC(i, j) \
	do { \
		pstm_word ac_acc = ((pstm_word)sc0) + ((pstm_word)i) * ((pstm_word)j); \
		sc0 = (pstm_digit)ac_acc; \
		ac_acc = ((pstm_word)sc1) + (ac_acc >> DIGIT_BIT); \
		sc1 = (pstm_digit)ac_acc; \
		sc2 += (pstm_digit)(ac_acc >> DIGIT_BIT); \
	} while (0);

/* c2:c1:c0 += 2 * sc2:sc1:sc0 */
#define SQRADDDB \
	do { \
		pstm_word db_acc = 2 * ((pstm_word)sc0) + ((pstm_word)c0); \
		c0 = (pstm_digit)db_acc; \
		db_acc = 2 * ((pstm_word)sc1) + c1 + (db_acc >> DIGIT_BIT); \
		c1 = (pstm_digit)db_acc; \
		c2 += sc2 + sc2 + (pstm_digit)(db_acc >> DIGIT_BIT); \
	} while (0);
437 
438 #endif /* ISO_C */
439 
440 /******************************************************************************/
441 /*
442 	Non-unrolled comba squarer
443  */
444 //bbox: pool unused
445 #define pstm_sqr_comba_gen(pool, A, B, paD, paDlen) \
446         pstm_sqr_comba_gen(      A, B, paD, paDlen)
pstm_sqr_comba_gen(psPool_t * pool,pstm_int * A,pstm_int * B,pstm_digit * paD,uint32 paDlen)447 static int32 pstm_sqr_comba_gen(psPool_t *pool, pstm_int *A, pstm_int *B,
448 			pstm_digit *paD, uint32 paDlen)
449 {
450 	int		paDfail, pa; //bbox: was int16
451 	int32       ix, iz;
452 	pstm_digit  c0, c1, c2, *dst;
453 #ifdef PSTM_ISO
454 	pstm_word   tt;
455 #endif
456 
457 	paDfail = 0;
458 	/* get size of output and trim */
459 	pa = A->used + A->used;
460 
461 	/* number of output digits to produce */
462 	COMBA_START;
463 	CLEAR_CARRY;
464 /*
465 	If b is not large enough grow it and continue
466 */
467 	if (B->alloc < pa) {
468 		if (pstm_grow(B, pa) != PSTM_OKAY) {
469 			return PS_MEM_FAIL;
470 		}
471 	}
472 	if (paD != NULL) {
473 		if (paDlen < (sizeof(pstm_digit) * pa)) {
474 			paDfail = 1; /* have a paD, but it's not big enough */
475 			dst = xzalloc(sizeof(pstm_digit) * pa);//bbox
476 		} else {
477 			dst = paD;
478 			memset(dst, 0x0, paDlen);
479 		}
480 	} else {
481 		dst = xzalloc(sizeof(pstm_digit) * pa);//bbox
482 	}
483 
484 	for (ix = 0; ix < pa; ix++) {
485 		int32      tx, ty, iy;
486 		pstm_digit *tmpy, *tmpx;
487 
488 		/* get offsets into the two bignums */
489 		ty = min(A->used-1, ix);
490 		tx = ix - ty;
491 
492 		/* setup temp aliases */
493 		tmpx = A->dp + tx;
494 		tmpy = A->dp + ty;
495 
496 /*
497 			This is the number of times the loop will iterate,
498 				while (tx++ < a->used && ty-- >= 0) { ... }
499 */
500 		iy = min(A->used-tx, ty+1);
501 
502 /*
503 		now for squaring tx can never equal ty. We halve the distance since
504 		they approach at a rate of 2x and we have to round because odd cases
505 		need to be executed
506 */
507 		iy = min(iy, (ty-tx+1)>>1);
508 
509 		/* forward carries */
510 		CARRY_FORWARD;
511 
512 		/* execute loop */
513 		for (iz = 0; iz < iy; iz++) {
514 			SQRADD2(*tmpx++, *tmpy--);
515 		}
516 
517 		/* even columns have the square term in them */
518 		if ((ix&1) == 0) {
519 			SQRADD(A->dp[ix>>1], A->dp[ix>>1]);
520 		}
521 
522 		/* store it */
523 		COMBA_STORE(dst[ix]);
524 	}
525 
526 	COMBA_FINI;
527 /*
528 	setup dest
529  */
530 	iz  = B->used;
531 	B->used = pa;
532 	{
533 		pstm_digit *tmpc;
534 		tmpc = B->dp;
535 		for (ix = 0; ix < pa; ix++) {
536 			*tmpc++ = dst[ix];
537 		}
538 		/*	clear unused digits (that existed in the old copy of c) */
539 		for (; ix < iz; ix++) {
540 			*tmpc++ = 0;
541 		}
542 	}
543 	pstm_clamp(B);
544 
545 	if ((paD == NULL) || paDfail == 1) {
546 		psFree(dst, pool);
547 	}
548 	return PS_SUCCESS;
549 }
550 
551 /******************************************************************************/
552 /*
553 	Unrolled Comba loop for 1024 bit keys
554  */
555 #ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
pstm_sqr_comba16(pstm_int * A,pstm_int * B)556 static int32 pstm_sqr_comba16(pstm_int *A, pstm_int *B)
557 {
558 	pstm_digit *a, b[32], c0, c1, c2, sc0, sc1, sc2;
559 #ifdef PSTM_ISO
560 	pstm_word   tt;
561 #endif
562 
563 	if (B->alloc < 32) {
564 		if (pstm_grow(B, 32) != PSTM_OKAY) {
565 			return PS_MEM_FAIL;
566 		}
567 	}
568 	a = A->dp;
569 	sc0 = sc1 = sc2 = 0;
570 
571 	COMBA_START;
572 
573    /* clear carries */
574    CLEAR_CARRY;
575 
576    /* output 0 */
577    SQRADD(a[0],a[0]);
578    COMBA_STORE(b[0]);
579 
580    /* output 1 */
581    CARRY_FORWARD;
582    SQRADD2(a[0], a[1]);
583    COMBA_STORE(b[1]);
584 
585    /* output 2 */
586    CARRY_FORWARD;
587    SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
588    COMBA_STORE(b[2]);
589 
590    /* output 3 */
591    CARRY_FORWARD;
592    SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
593    COMBA_STORE(b[3]);
594 
595    /* output 4 */
596    CARRY_FORWARD;
597    SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
598    COMBA_STORE(b[4]);
599 
600    /* output 5 */
601    CARRY_FORWARD;
602    SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
603    COMBA_STORE(b[5]);
604 
605    /* output 6 */
606    CARRY_FORWARD;
607    SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
608    COMBA_STORE(b[6]);
609 
610    /* output 7 */
611    CARRY_FORWARD;
612    SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
613    COMBA_STORE(b[7]);
614 
615    /* output 8 */
616    CARRY_FORWARD;
617    SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
618    COMBA_STORE(b[8]);
619 
620    /* output 9 */
621    CARRY_FORWARD;
622    SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
623    COMBA_STORE(b[9]);
624 
625    /* output 10 */
626    CARRY_FORWARD;
627    SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
628    COMBA_STORE(b[10]);
629 
630    /* output 11 */
631    CARRY_FORWARD;
632    SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
633    COMBA_STORE(b[11]);
634 
635    /* output 12 */
636    CARRY_FORWARD;
637    SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
638    COMBA_STORE(b[12]);
639 
640    /* output 13 */
641    CARRY_FORWARD;
642    SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
643    COMBA_STORE(b[13]);
644 
645    /* output 14 */
646    CARRY_FORWARD;
647    SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
648    COMBA_STORE(b[14]);
649 
650    /* output 15 */
651    CARRY_FORWARD;
652    SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
653    COMBA_STORE(b[15]);
654 
655    /* output 16 */
656    CARRY_FORWARD;
657    SQRADDSC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
658    COMBA_STORE(b[16]);
659 
660    /* output 17 */
661    CARRY_FORWARD;
662    SQRADDSC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
663    COMBA_STORE(b[17]);
664 
665    /* output 18 */
666    CARRY_FORWARD;
667    SQRADDSC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
668    COMBA_STORE(b[18]);
669 
670    /* output 19 */
671    CARRY_FORWARD;
672    SQRADDSC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
673    COMBA_STORE(b[19]);
674 
675    /* output 20 */
676    CARRY_FORWARD;
677    SQRADDSC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
678    COMBA_STORE(b[20]);
679 
680    /* output 21 */
681    CARRY_FORWARD;
682    SQRADDSC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
683    COMBA_STORE(b[21]);
684 
685    /* output 22 */
686    CARRY_FORWARD;
687    SQRADDSC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
688    COMBA_STORE(b[22]);
689 
690    /* output 23 */
691    CARRY_FORWARD;
692    SQRADDSC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
693    COMBA_STORE(b[23]);
694 
695    /* output 24 */
696    CARRY_FORWARD;
697    SQRADDSC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
698    COMBA_STORE(b[24]);
699 
700    /* output 25 */
701    CARRY_FORWARD;
702    SQRADDSC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
703    COMBA_STORE(b[25]);
704 
705    /* output 26 */
706    CARRY_FORWARD;
707    SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]);
708    COMBA_STORE(b[26]);
709 
710    /* output 27 */
711    CARRY_FORWARD;
712    SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]);
713    COMBA_STORE(b[27]);
714 
715    /* output 28 */
716    CARRY_FORWARD;
717    SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]);
718    COMBA_STORE(b[28]);
719 
720    /* output 29 */
721    CARRY_FORWARD;
722    SQRADD2(a[14], a[15]);
723    COMBA_STORE(b[29]);
724 
725    /* output 30 */
726    CARRY_FORWARD;
727    SQRADD(a[15], a[15]);
728    COMBA_STORE(b[30]);
729    COMBA_STORE2(b[31]);
730    COMBA_FINI;
731 
732    B->used = 32;
733    B->sign = PSTM_ZPOS;
734    memcpy(B->dp, b, 32 * sizeof(pstm_digit));
735    pstm_clamp(B);
736    return PSTM_OKAY;
737 }
738 #endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
739 
740 
741 #ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
pstm_sqr_comba32(pstm_int * A,pstm_int * B)742 static int32 pstm_sqr_comba32(pstm_int *A, pstm_int *B)
743 {
744    pstm_digit *a, b[64], c0, c1, c2, sc0, sc1, sc2;
745 #ifdef PSTM_ISO
746    pstm_word tt;
747 #endif
748 
749 	if (B->alloc < 64) {
750 		if (pstm_grow(B, 64) != PSTM_OKAY) {
751 			return PS_MEM_FAIL;
752 		}
753 	}
754 	sc0 = sc1 = sc2 = 0;
755    a = A->dp;
756    COMBA_START;
757 
758    /* clear carries */
759    CLEAR_CARRY;
760 
761    /* output 0 */
762    SQRADD(a[0],a[0]);
763    COMBA_STORE(b[0]);
764 
765    /* output 1 */
766    CARRY_FORWARD;
767    SQRADD2(a[0], a[1]);
768    COMBA_STORE(b[1]);
769 
770    /* output 2 */
771    CARRY_FORWARD;
772    SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
773    COMBA_STORE(b[2]);
774 
775    /* output 3 */
776    CARRY_FORWARD;
777    SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
778    COMBA_STORE(b[3]);
779 
780    /* output 4 */
781    CARRY_FORWARD;
782    SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
783    COMBA_STORE(b[4]);
784 
785    /* output 5 */
786    CARRY_FORWARD;
787    SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
788    COMBA_STORE(b[5]);
789 
790    /* output 6 */
791    CARRY_FORWARD;
792    SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
793    COMBA_STORE(b[6]);
794 
795    /* output 7 */
796    CARRY_FORWARD;
797    SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
798    COMBA_STORE(b[7]);
799 
800    /* output 8 */
801    CARRY_FORWARD;
802    SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
803    COMBA_STORE(b[8]);
804 
805    /* output 9 */
806    CARRY_FORWARD;
807    SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
808    COMBA_STORE(b[9]);
809 
810    /* output 10 */
811    CARRY_FORWARD;
812    SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
813    COMBA_STORE(b[10]);
814 
815    /* output 11 */
816    CARRY_FORWARD;
817    SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
818    COMBA_STORE(b[11]);
819 
820    /* output 12 */
821    CARRY_FORWARD;
822    SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
823    COMBA_STORE(b[12]);
824 
825    /* output 13 */
826    CARRY_FORWARD;
827    SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
828    COMBA_STORE(b[13]);
829 
830    /* output 14 */
831    CARRY_FORWARD;
832    SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
833    COMBA_STORE(b[14]);
834 
835    /* output 15 */
836    CARRY_FORWARD;
837    SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
838    COMBA_STORE(b[15]);
839 
840    /* output 16 */
841    CARRY_FORWARD;
842    SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
843    COMBA_STORE(b[16]);
844 
845    /* output 17 */
846    CARRY_FORWARD;
847    SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
848    COMBA_STORE(b[17]);
849 
850    /* output 18 */
851    CARRY_FORWARD;
852    SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
853    COMBA_STORE(b[18]);
854 
855    /* output 19 */
856    CARRY_FORWARD;
857    SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
858    COMBA_STORE(b[19]);
859 
860    /* output 20 */
861    CARRY_FORWARD;
862    SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
863    COMBA_STORE(b[20]);
864 
865    /* output 21 */
866    CARRY_FORWARD;
867    SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
868    COMBA_STORE(b[21]);
869 
870    /* output 22 */
871    CARRY_FORWARD;
872    SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
873    COMBA_STORE(b[22]);
874 
875    /* output 23 */
876    CARRY_FORWARD;
877    SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
878    COMBA_STORE(b[23]);
879 
880    /* output 24 */
881    CARRY_FORWARD;
882    SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
883    COMBA_STORE(b[24]);
884 
885    /* output 25 */
886    CARRY_FORWARD;
887    SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
888    COMBA_STORE(b[25]);
889 
890    /* output 26 */
891    CARRY_FORWARD;
892    SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]);
893    COMBA_STORE(b[26]);
894 
895    /* output 27 */
896    CARRY_FORWARD;
897    SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB;
898    COMBA_STORE(b[27]);
899 
900    /* output 28 */
901    CARRY_FORWARD;
902    SQRADDSC(a[0], a[28]); SQRADDAC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]);
903    COMBA_STORE(b[28]);
904 
905    /* output 29 */
906    CARRY_FORWARD;
907    SQRADDSC(a[0], a[29]); SQRADDAC(a[1], a[28]); SQRADDAC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB;
908    COMBA_STORE(b[29]);
909 
910    /* output 30 */
911    CARRY_FORWARD;
912    SQRADDSC(a[0], a[30]); SQRADDAC(a[1], a[29]); SQRADDAC(a[2], a[28]); SQRADDAC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]);
913    COMBA_STORE(b[30]);
914 
915    /* output 31 */
916    CARRY_FORWARD;
917    SQRADDSC(a[0], a[31]); SQRADDAC(a[1], a[30]); SQRADDAC(a[2], a[29]); SQRADDAC(a[3], a[28]); SQRADDAC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB;
918    COMBA_STORE(b[31]);
919 
920    /* output 32 */
921    CARRY_FORWARD;
922    SQRADDSC(a[1], a[31]); SQRADDAC(a[2], a[30]); SQRADDAC(a[3], a[29]); SQRADDAC(a[4], a[28]); SQRADDAC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]);
923    COMBA_STORE(b[32]);
924 
925    /* output 33 */
926    CARRY_FORWARD;
927    SQRADDSC(a[2], a[31]); SQRADDAC(a[3], a[30]); SQRADDAC(a[4], a[29]); SQRADDAC(a[5], a[28]); SQRADDAC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB;
928    COMBA_STORE(b[33]);
929 
930    /* output 34 */
931    CARRY_FORWARD;
932    SQRADDSC(a[3], a[31]); SQRADDAC(a[4], a[30]); SQRADDAC(a[5], a[29]); SQRADDAC(a[6], a[28]); SQRADDAC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]);
933    COMBA_STORE(b[34]);
934 
935    /* output 35 */
936    CARRY_FORWARD;
937    SQRADDSC(a[4], a[31]); SQRADDAC(a[5], a[30]); SQRADDAC(a[6], a[29]); SQRADDAC(a[7], a[28]); SQRADDAC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB;
938    COMBA_STORE(b[35]);
939 
940    /* output 36 */
941    CARRY_FORWARD;
942    SQRADDSC(a[5], a[31]); SQRADDAC(a[6], a[30]); SQRADDAC(a[7], a[29]); SQRADDAC(a[8], a[28]); SQRADDAC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]);
943    COMBA_STORE(b[36]);
944 
945    /* output 37 */
946    CARRY_FORWARD;
947    SQRADDSC(a[6], a[31]); SQRADDAC(a[7], a[30]); SQRADDAC(a[8], a[29]); SQRADDAC(a[9], a[28]); SQRADDAC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB;
948    COMBA_STORE(b[37]);
949 
950    /* output 38 */
951    CARRY_FORWARD;
952    SQRADDSC(a[7], a[31]); SQRADDAC(a[8], a[30]); SQRADDAC(a[9], a[29]); SQRADDAC(a[10], a[28]); SQRADDAC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]);
953    COMBA_STORE(b[38]);
954 
955    /* output 39 */
956    CARRY_FORWARD;
957    SQRADDSC(a[8], a[31]); SQRADDAC(a[9], a[30]); SQRADDAC(a[10], a[29]); SQRADDAC(a[11], a[28]); SQRADDAC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB;
958    COMBA_STORE(b[39]);
959 
960    /* output 40 */
961    CARRY_FORWARD;
962    SQRADDSC(a[9], a[31]); SQRADDAC(a[10], a[30]); SQRADDAC(a[11], a[29]); SQRADDAC(a[12], a[28]); SQRADDAC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]);
963    COMBA_STORE(b[40]);
964 
965    /* output 41 */
966    CARRY_FORWARD;
967    SQRADDSC(a[10], a[31]); SQRADDAC(a[11], a[30]); SQRADDAC(a[12], a[29]); SQRADDAC(a[13], a[28]); SQRADDAC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB;
968    COMBA_STORE(b[41]);
969 
970    /* output 42 */
971    CARRY_FORWARD;
972    SQRADDSC(a[11], a[31]); SQRADDAC(a[12], a[30]); SQRADDAC(a[13], a[29]); SQRADDAC(a[14], a[28]); SQRADDAC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]);
973    COMBA_STORE(b[42]);
974 
975    /* output 43 */
976    CARRY_FORWARD;
977    SQRADDSC(a[12], a[31]); SQRADDAC(a[13], a[30]); SQRADDAC(a[14], a[29]); SQRADDAC(a[15], a[28]); SQRADDAC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB;
978    COMBA_STORE(b[43]);
979 
980    /* output 44 */
981    CARRY_FORWARD;
982    SQRADDSC(a[13], a[31]); SQRADDAC(a[14], a[30]); SQRADDAC(a[15], a[29]); SQRADDAC(a[16], a[28]); SQRADDAC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]);
983    COMBA_STORE(b[44]);
984 
985    /* output 45 */
986    CARRY_FORWARD;
987    SQRADDSC(a[14], a[31]); SQRADDAC(a[15], a[30]); SQRADDAC(a[16], a[29]); SQRADDAC(a[17], a[28]); SQRADDAC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB;
988    COMBA_STORE(b[45]);
989 
990    /* output 46 */
991    CARRY_FORWARD;
992    SQRADDSC(a[15], a[31]); SQRADDAC(a[16], a[30]); SQRADDAC(a[17], a[29]); SQRADDAC(a[18], a[28]); SQRADDAC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]);
993    COMBA_STORE(b[46]);
994 
995    /* output 47 */
996    CARRY_FORWARD;
997    SQRADDSC(a[16], a[31]); SQRADDAC(a[17], a[30]); SQRADDAC(a[18], a[29]); SQRADDAC(a[19], a[28]); SQRADDAC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB;
998    COMBA_STORE(b[47]);
999 
1000    /* output 48 */
1001    CARRY_FORWARD;
1002    SQRADDSC(a[17], a[31]); SQRADDAC(a[18], a[30]); SQRADDAC(a[19], a[29]); SQRADDAC(a[20], a[28]); SQRADDAC(a[21], a[27]); SQRADDAC(a[22], a[26]); SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]);
1003    COMBA_STORE(b[48]);
1004 
1005    /* output 49 */
1006    CARRY_FORWARD;
1007    SQRADDSC(a[18], a[31]); SQRADDAC(a[19], a[30]); SQRADDAC(a[20], a[29]); SQRADDAC(a[21], a[28]); SQRADDAC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB;
1008    COMBA_STORE(b[49]);
1009 
1010    /* output 50 */
1011    CARRY_FORWARD;
1012    SQRADDSC(a[19], a[31]); SQRADDAC(a[20], a[30]); SQRADDAC(a[21], a[29]); SQRADDAC(a[22], a[28]); SQRADDAC(a[23], a[27]); SQRADDAC(a[24], a[26]); SQRADDDB; SQRADD(a[25], a[25]);
1013    COMBA_STORE(b[50]);
1014 
1015    /* output 51 */
1016    CARRY_FORWARD;
1017    SQRADDSC(a[20], a[31]); SQRADDAC(a[21], a[30]); SQRADDAC(a[22], a[29]); SQRADDAC(a[23], a[28]); SQRADDAC(a[24], a[27]); SQRADDAC(a[25], a[26]); SQRADDDB;
1018    COMBA_STORE(b[51]);
1019 
1020    /* output 52 */
1021    CARRY_FORWARD;
1022    SQRADDSC(a[21], a[31]); SQRADDAC(a[22], a[30]); SQRADDAC(a[23], a[29]); SQRADDAC(a[24], a[28]); SQRADDAC(a[25], a[27]); SQRADDDB; SQRADD(a[26], a[26]);
1023    COMBA_STORE(b[52]);
1024 
1025    /* output 53 */
1026    CARRY_FORWARD;
1027    SQRADDSC(a[22], a[31]); SQRADDAC(a[23], a[30]); SQRADDAC(a[24], a[29]); SQRADDAC(a[25], a[28]); SQRADDAC(a[26], a[27]); SQRADDDB;
1028    COMBA_STORE(b[53]);
1029 
1030    /* output 54 */
1031    CARRY_FORWARD;
1032    SQRADDSC(a[23], a[31]); SQRADDAC(a[24], a[30]); SQRADDAC(a[25], a[29]); SQRADDAC(a[26], a[28]); SQRADDDB; SQRADD(a[27], a[27]);
1033    COMBA_STORE(b[54]);
1034 
1035    /* output 55 */
1036    CARRY_FORWARD;
1037    SQRADDSC(a[24], a[31]); SQRADDAC(a[25], a[30]); SQRADDAC(a[26], a[29]); SQRADDAC(a[27], a[28]); SQRADDDB;
1038    COMBA_STORE(b[55]);
1039 
1040    /* output 56 */
1041    CARRY_FORWARD;
1042    SQRADDSC(a[25], a[31]); SQRADDAC(a[26], a[30]); SQRADDAC(a[27], a[29]); SQRADDDB; SQRADD(a[28], a[28]);
1043    COMBA_STORE(b[56]);
1044 
1045    /* output 57 */
1046    CARRY_FORWARD;
1047    SQRADDSC(a[26], a[31]); SQRADDAC(a[27], a[30]); SQRADDAC(a[28], a[29]); SQRADDDB;
1048    COMBA_STORE(b[57]);
1049 
1050    /* output 58 */
1051    CARRY_FORWARD;
1052    SQRADD2(a[27], a[31]); SQRADD2(a[28], a[30]); SQRADD(a[29], a[29]);
1053    COMBA_STORE(b[58]);
1054 
1055    /* output 59 */
1056    CARRY_FORWARD;
1057    SQRADD2(a[28], a[31]); SQRADD2(a[29], a[30]);
1058    COMBA_STORE(b[59]);
1059 
1060    /* output 60 */
1061    CARRY_FORWARD;
1062    SQRADD2(a[29], a[31]); SQRADD(a[30], a[30]);
1063    COMBA_STORE(b[60]);
1064 
1065    /* output 61 */
1066    CARRY_FORWARD;
1067    SQRADD2(a[30], a[31]);
1068    COMBA_STORE(b[61]);
1069 
1070    /* output 62 */
1071    CARRY_FORWARD;
1072    SQRADD(a[31], a[31]);
1073    COMBA_STORE(b[62]);
1074    COMBA_STORE2(b[63]);
1075    COMBA_FINI;
1076 
1077    B->used = 64;
1078    B->sign = PSTM_ZPOS;
1079    memcpy(B->dp, b, 64 * sizeof(pstm_digit));
1080    pstm_clamp(B);
1081    return PSTM_OKAY;
1082 }
1083 #endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
1084 
1085 /******************************************************************************/
1086 /*
1087  */
pstm_sqr_comba(psPool_t * pool,pstm_int * A,pstm_int * B,pstm_digit * paD,uint32 paDlen)1088 int32 FAST_FUNC pstm_sqr_comba(psPool_t *pool, pstm_int *A, pstm_int *B, pstm_digit *paD,
1089 		uint32 paDlen)
1090 {
1091 #ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
1092 	if (A->used == 16) {
1093 		return pstm_sqr_comba16(A, B);
1094 	} else {
1095 #ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
1096 		if (A->used == 32) {
1097 			return pstm_sqr_comba32(A, B);
1098 		}
1099 #endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
1100 		return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
1101 	}
1102 #else
1103 #ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
1104 	if (A->used == 32) {
1105 		return pstm_sqr_comba32(A, B);
1106 	}
1107 #endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
1108 	return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
1109 #endif
1110 }
1111 
1112 #endif /* DISABLE_PSTM */
1113 /******************************************************************************/
1114