1 /* vi: set sw=4 ts=4: */
2 /*
3  * Utility routines.
4  *
5  * Copyright (C) 2010 Denys Vlasenko
6  *
7  * Licensed under GPLv2 or later, see file LICENSE in this source tree.
8  */
9 #include "libbb.h"
10 
11 #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
12 
13 /* gcc 4.2.1 optimizes rotr64 better with inline than with macro
14  * (for rotX32, there is no difference). Why? My guess is that
15  * macro requires clever common subexpression elimination heuristics
16  * in gcc, while inline basically forces it to happen.
17  */
18 //#define rotl32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
rotl32(uint32_t x,unsigned n)19 static ALWAYS_INLINE uint32_t rotl32(uint32_t x, unsigned n)
20 {
21 	return (x << n) | (x >> (32 - n));
22 }
23 //#define rotr32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
rotr32(uint32_t x,unsigned n)24 static ALWAYS_INLINE uint32_t rotr32(uint32_t x, unsigned n)
25 {
26 	return (x >> n) | (x << (32 - n));
27 }
/* rotr64 is needed for sha512 only: */
29 //#define rotr64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
rotr64(uint64_t x,unsigned n)30 static ALWAYS_INLINE uint64_t rotr64(uint64_t x, unsigned n)
31 {
32 	return (x >> n) | (x << (64 - n));
33 }
34 
35 /* rotl64 only used for sha3 currently */
rotl64(uint64_t x,unsigned n)36 static ALWAYS_INLINE uint64_t rotl64(uint64_t x, unsigned n)
37 {
38 	return (x << n) | (x >> (64 - n));
39 }
40 
41 /* Process the remaining bytes in the buffer */
/* Finalize a 64-byte-block hash (shared by md5, sha1, sha256):
 * pad the tail of the data, append the 64-bit bit count, and run
 * the compression function over the final block(s).
 * swap_needed: nonzero means the bit counter must be byte-swapped
 * before storing (callers pass BB_BIG_ENDIAN for MD5's little-endian
 * length, BB_LITTLE_ENDIAN for SHA's big-endian length).
 */
static void FAST_FUNC common64_end(md5_ctx_t *ctx, int swap_needed)
{
	unsigned bufpos = ctx->total64 & 63;
	/* Pad the buffer to the next 64-byte boundary with 0x80,0,0,0... */
	ctx->wbuffer[bufpos++] = 0x80;

	/* This loop iterates either once or twice, no more, no less:
	 * twice only when fewer than 8 bytes remain after the 0x80 byte,
	 * forcing the length into an extra all-zero block. */
	while (1) {
		unsigned remaining = 64 - bufpos;
		memset(ctx->wbuffer + bufpos, 0, remaining);
		/* Do we have enough space for the length count? */
		if (remaining >= 8) {
			/* Store the 64-bit counter of bits in the buffer */
			uint64_t t = ctx->total64 << 3;
			if (swap_needed)
				t = bb_bswap_64(t);
			/* wbuffer is suitably aligned for this */
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
		}
		ctx->process_block(ctx);
		if (remaining >= 8)
			break;
		bufpos = 0;
	}
}
67 
68 
69 /*
70  * Compute MD5 checksum of strings according to the
71  * definition of MD5 in RFC 1321 from April 1992.
72  *
73  * Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
74  *
75  * Copyright (C) 1995-1999 Free Software Foundation, Inc.
76  * Copyright (C) 2001 Manuel Novoa III
77  * Copyright (C) 2003 Glenn L. McGrath
78  * Copyright (C) 2003 Erik Andersen
79  *
80  * Licensed under GPLv2 or later, see file LICENSE in this source tree.
81  */
82 
83 /* 0: fastest, 3: smallest */
84 #if CONFIG_MD5_SMALL < 0
85 # define MD5_SMALL 0
86 #elif CONFIG_MD5_SMALL > 3
87 # define MD5_SMALL 3
88 #else
89 # define MD5_SMALL CONFIG_MD5_SMALL
90 #endif
91 
92 /* These are the four functions used in the four steps of the MD5 algorithm
93  * and defined in the RFC 1321.  The first function is a little bit optimized
94  * (as found in Colin Plumbs public domain implementation).
95  * #define FF(b, c, d) ((b & c) | (~b & d))
96  */
97 #undef FF
98 #undef FG
99 #undef FH
100 #undef FI
101 #define FF(b, c, d) (d ^ (b & (c ^ d)))
102 #define FG(b, c, d) FF(d, b, c)
103 #define FH(b, c, d) (b ^ c ^ d)
104 #define FI(b, c, d) (c ^ (b | ~d))
105 
106 /* Hash a single block, 64 bytes long and 4-byte aligned */
/* Hash a single block, 64 bytes long and 4-byte aligned.
 * Consumes ctx->wbuffer and updates ctx->hash[0..3] in place.
 * Four code variants are selected by MD5_SMALL (0 fastest .. 3 smallest):
 *  0/1: fully/partially unrolled, 2: one loop per round, 3: single loop.
 */
static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
{
#if MD5_SMALL > 0
	/* Before we start, one word to the strange constants.
	   They are defined in RFC 1321 as
	   T[i] = (int)(2^32 * fabs(sin(i))), i=1..64
	 */
	static const uint32_t C_array[] ALIGN4 = {
		/* round 1 */
		0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
		0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
		0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
		0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
		/* round 2 */
		0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
		0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
		0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
		0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
		/* round 3 */
		0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
		0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
		0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x4881d05,
		0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
		/* round 4 */
		0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
		0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
		0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
		0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
	};
	/* Message-word index permutation per round (RFC 1321 steps);
	 * round 1 entries are needed only when round 1 is also looped. */
	static const char P_array[] ALIGN1 = {
# if MD5_SMALL > 1
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
# endif
		1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
		5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
		0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9  /* 4 */
	};
#endif
	uint32_t *words = (void*) ctx->wbuffer;
	uint32_t A = ctx->hash[0];
	uint32_t B = ctx->hash[1];
	uint32_t C = ctx->hash[2];
	uint32_t D = ctx->hash[3];

#if MD5_SMALL >= 2  /* 2 or 3 */

	/* Per-round left-rotation amounts, four per round */
	static const char S_array[] ALIGN1 = {
		7, 12, 17, 22,
		5, 9, 14, 20,
		4, 11, 16, 23,
		6, 10, 15, 21
	};
	const uint32_t *pc;
	const char *pp;
	const char *ps;
	int i;
	uint32_t temp;

	/* MD5 works on little-endian words; swap in place on BE hosts */
	if (BB_BIG_ENDIAN)
		for (i = 0; i < 16; i++)
			words[i] = SWAP_LE32(words[i]);

# if MD5_SMALL == 3
	pc = C_array;
	pp = P_array;
	ps = S_array - 4;  /* pre-decremented; advanced at i == 0 below */

	for (i = 0; i < 64; i++) {
		if ((i & 0x0f) == 0)
			ps += 4;  /* next round: next group of 4 shift amounts */
		temp = A;
		switch (i >> 4) {  /* round number 0..3 selects the mix function */
		case 0:
			temp += FF(B, C, D);
			break;
		case 1:
			temp += FG(B, C, D);
			break;
		case 2:
			temp += FH(B, C, D);
			break;
		default: /* case 3 */
			temp += FI(B, C, D);
		}
		temp += words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		/* Rotate the four state words for the next step */
		A = D;
		D = C;
		C = B;
		B = temp;
	}
# else  /* MD5_SMALL == 2 */
	pc = C_array;
	pp = P_array;
	ps = S_array;

	/* Round 1: FF */
	for (i = 0; i < 16; i++) {
		temp = A + FF(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	/* Round 2: FG */
	for (i = 0; i < 16; i++) {
		temp = A + FG(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	/* Round 3: FH */
	for (i = 0; i < 16; i++) {
		temp = A + FH(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	/* Round 4: FI */
	for (i = 0; i < 16; i++) {
		temp = A + FI(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
# endif
	/* Add checksum to the starting values */
	ctx->hash[0] += A;
	ctx->hash[1] += B;
	ctx->hash[2] += C;
	ctx->hash[3] += D;

#else  /* MD5_SMALL == 0 or 1 */

# if MD5_SMALL == 1
	const uint32_t *pc;
	const char *pp;
	int i;
# endif

	/* First round: using the given function, the context and a constant
	   the next context is computed.  Because the algorithm's processing
	   unit is a 32-bit word and it is determined to work on words in
	   little endian byte order we perhaps have to change the byte order
	   before the computation.  To reduce the work for the next steps
	   we save swapped words in WORDS array.  */
# undef OP
# define OP(a, b, c, d, s, T) \
	do { \
		a += FF(b, c, d) + (*words IF_BIG_ENDIAN(= SWAP_LE32(*words))) + T; \
		words++; \
		a = rotl32(a, s); \
		a += b; \
	} while (0)

	/* Round 1 */
# if MD5_SMALL == 1
	pc = C_array;
	for (i = 0; i < 4; i++) {
		OP(A, B, C, D, 7, *pc++);
		OP(D, A, B, C, 12, *pc++);
		OP(C, D, A, B, 17, *pc++);
		OP(B, C, D, A, 22, *pc++);
	}
# else
	OP(A, B, C, D, 7, 0xd76aa478);
	OP(D, A, B, C, 12, 0xe8c7b756);
	OP(C, D, A, B, 17, 0x242070db);
	OP(B, C, D, A, 22, 0xc1bdceee);
	OP(A, B, C, D, 7, 0xf57c0faf);
	OP(D, A, B, C, 12, 0x4787c62a);
	OP(C, D, A, B, 17, 0xa8304613);
	OP(B, C, D, A, 22, 0xfd469501);
	OP(A, B, C, D, 7, 0x698098d8);
	OP(D, A, B, C, 12, 0x8b44f7af);
	OP(C, D, A, B, 17, 0xffff5bb1);
	OP(B, C, D, A, 22, 0x895cd7be);
	OP(A, B, C, D, 7, 0x6b901122);
	OP(D, A, B, C, 12, 0xfd987193);
	OP(C, D, A, B, 17, 0xa679438e);
	OP(B, C, D, A, 22, 0x49b40821);
# endif
	/* Round 1 advanced 'words' past all 16 block words; rewind */
	words -= 16;

	/* For the second to fourth round we have the possibly swapped words
	   in WORDS.  Redefine the macro to take an additional first
	   argument specifying the function to use.  */
# undef OP
# define OP(f, a, b, c, d, k, s, T) \
	do { \
		a += f(b, c, d) + words[k] + T; \
		a = rotl32(a, s); \
		a += b; \
	} while (0)

	/* Round 2 */
# if MD5_SMALL == 1
	pp = P_array;
	for (i = 0; i < 4; i++) {
		OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
		OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
		OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
		OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
	}
# else
	OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
	OP(FG, D, A, B, C, 6, 9, 0xc040b340);
	OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
	OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
	OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
	OP(FG, D, A, B, C, 10, 9, 0x02441453);
	OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
	OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
	OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
	OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
	OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
	OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
	OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
	OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
	OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
	OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
# endif

	/* Round 3 */
# if MD5_SMALL == 1
	for (i = 0; i < 4; i++) {
		OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
		OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
		OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
		OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
	}
# else
	OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
	OP(FH, D, A, B, C, 8, 11, 0x8771f681);
	OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
	OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
	OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
	OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
	OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
	OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
	OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
	OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
	OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
	OP(FH, B, C, D, A, 6, 23, 0x04881d05);
	OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
	OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
	OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
	OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
# endif

	/* Round 4 */
# if MD5_SMALL == 1
	for (i = 0; i < 4; i++) {
		OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
		OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
		OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
		OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
	}
# else
	OP(FI, A, B, C, D, 0, 6, 0xf4292244);
	OP(FI, D, A, B, C, 7, 10, 0x432aff97);
	OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
	OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
	OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
	OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
	OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
	OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
	OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
	OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
	OP(FI, C, D, A, B, 6, 15, 0xa3014314);
	OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
	OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
	OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
	OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
# undef OP
# endif
	/* Add checksum to the starting values */
	ctx->hash[0] += A;
	ctx->hash[1] += B;
	ctx->hash[2] += C;
	ctx->hash[3] += D;
#endif
}
402 #undef FF
403 #undef FG
404 #undef FH
405 #undef FI
406 
407 /* Initialize structure containing state of computation.
408  * (RFC 1321, 3.3: Step 3)
409  */
md5_begin(md5_ctx_t * ctx)410 void FAST_FUNC md5_begin(md5_ctx_t *ctx)
411 {
412 	ctx->hash[0] = 0x67452301;
413 	ctx->hash[1] = 0xefcdab89;
414 	ctx->hash[2] = 0x98badcfe;
415 	ctx->hash[3] = 0x10325476;
416 	ctx->total64 = 0;
417 	ctx->process_block = md5_process_block64;
418 }
419 
420 /* Used also for sha1 and sha256 */
/* Feed len bytes of data into the hash state; buffered in 64-byte
 * chunks, running process_block each time the buffer fills.
 * Used also for sha1 and sha256 (their ctx layouts share this prefix,
 * and process_block is a function pointer set by *_begin).
 */
void FAST_FUNC md5_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
{
	/* Bytes already pending in wbuffer from a previous call */
	unsigned bufpos = ctx->total64 & 63;

	ctx->total64 += len;

	while (1) {
		unsigned remaining = 64 - bufpos;
		if (remaining > len)
			remaining = len;
		/* Copy data into aligned buffer */
		memcpy(ctx->wbuffer + bufpos, buffer, remaining);
		len -= remaining;
		buffer = (const char *)buffer + remaining;
		bufpos += remaining;

		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;"
		 * bufpos is at most 64 here; subtracting 64 gives 0 exactly
		 * when the buffer is full (otherwise it wraps, being unsigned,
		 * to a nonzero value and we break out). */
		bufpos -= 64;
		if (bufpos != 0)
			break;

		/* Buffer is filled up, process it */
		ctx->process_block(ctx);
		/*bufpos = 0; - already is */
	}
}
447 
/* Process the remaining bytes in the buffer and put result from CTX
 * in first 16 bytes following RESBUF.  The result is always in little
 * endian byte order, so that a byte-wise output yields the wanted
 * ASCII representation of the message digest.
 */
md5_end(md5_ctx_t * ctx,void * resbuf)453 unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
454 {
455 	/* MD5 stores total in LE, need to swap on BE arches: */
456 	common64_end(ctx, /*swap_needed:*/ BB_BIG_ENDIAN);
457 
458 	/* The MD5 result is in little endian byte order */
459 	if (BB_BIG_ENDIAN) {
460 		ctx->hash[0] = SWAP_LE32(ctx->hash[0]);
461 		ctx->hash[1] = SWAP_LE32(ctx->hash[1]);
462 		ctx->hash[2] = SWAP_LE32(ctx->hash[2]);
463 		ctx->hash[3] = SWAP_LE32(ctx->hash[3]);
464 	}
465 
466 	memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * 4);
467 	return sizeof(ctx->hash[0]) * 4;
468 }
469 
470 
471 /*
472  * SHA1 part is:
473  * Copyright 2007 Rob Landley <rob@landley.net>
474  *
475  * Based on the public domain SHA-1 in C by Steve Reid <steve@edmweb.com>
476  * from http://www.mirrors.wiretapped.net/security/cryptography/hashes/sha1/
477  *
478  * Licensed under GPLv2, see file LICENSE in this source tree.
479  *
480  * ---------------------------------------------------------------------------
481  *
482  * SHA256 and SHA512 parts are:
483  * Released into the Public Domain by Ulrich Drepper <drepper@redhat.com>.
484  * Shrank by Denys Vlasenko.
485  *
486  * ---------------------------------------------------------------------------
487  *
488  * The best way to test random blocksizes is to go to coreutils/md5_sha1_sum.c
489  * and replace "4096" with something like "2000 + time(NULL) % 2097",
490  * then rebuild and compare "shaNNNsum bigfile" results.
491  */
492 
/* SHA-1 compression function: hash one 64-byte block from ctx->wbuffer
 * into ctx->hash[0..4].  The message schedule is kept in a mirrored
 * 16-word rolling window (W[i] == W[i+16]) so W[cnt+13] etc. can be
 * read without masking each index.
 */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
	/* One round constant per group of 20 operations */
	static const uint32_t rconsts[] ALIGN4 = {
		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
	};
	int i, j;
	int cnt;
	uint32_t W[16+16];
	uint32_t a, b, c, d, e;

	/* On-stack work buffer frees up one register in the main loop
	 * which otherwise will be needed to hold ctx pointer */
	for (i = 0; i < 16; i++)
		W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);

	a = ctx->hash[0];
	b = ctx->hash[1];
	c = ctx->hash[2];
	d = ctx->hash[3];
	e = ctx->hash[4];

	/* 4 rounds of 20 operations each */
	cnt = 0;
	for (i = 0; i < 4; i++) {
		j = 19;
		do {
			uint32_t work;

			work = c ^ d;
			if (i == 0) {
				/* Round 1 mix: Ch(b,c,d) */
				work = (work & b) ^ d;
				if (j <= 3)
					goto ge16; /* ops 16..19 already need schedule expansion */
				/* Used to do SWAP_BE32 here, but this
				 * requires ctx (see comment above) */
				work += W[cnt];
			} else {
				if (i == 2)
					/* Round 3 mix: Maj(b,c,d) */
					work = ((b | c) & d) | (b & c);
				else /* i = 1 or 3: Parity(b,c,d) */
					work ^= b;
 ge16:
				/* Expand schedule in place, mirrored into both halves */
				W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
				work += W[cnt];
			}
			work += e + rotl32(a, 5) + rconsts[i];

			/* Rotate by one for next time */
			e = d;
			d = c;
			c = /* b = */ rotl32(b, 30);
			b = a;
			a = work;
			cnt = (cnt + 1) & 15;
		} while (--j >= 0);
	}

	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
}
556 
557 /* Constants for SHA512 from FIPS 180-2:4.2.3.
558  * SHA256 constants from FIPS 180-2:4.2.2
559  * are the most significant half of first 64 elements
560  * of the same array.
561  */
#undef K
#if NEED_SHA512
/* Full 64-bit constants are needed for sha512 */
typedef uint64_t sha_K_int;
# define K(v) v
#else
/* sha256 only uses the upper 32 bits of each constant;
 * store just those to halve the table size */
typedef uint32_t sha_K_int;
# define K(v) (uint32_t)(v >> 32)
#endif
static const sha_K_int sha_K[] ALIGN8 = {
	K(0x428a2f98d728ae22ULL), K(0x7137449123ef65cdULL),
	K(0xb5c0fbcfec4d3b2fULL), K(0xe9b5dba58189dbbcULL),
	K(0x3956c25bf348b538ULL), K(0x59f111f1b605d019ULL),
	K(0x923f82a4af194f9bULL), K(0xab1c5ed5da6d8118ULL),
	K(0xd807aa98a3030242ULL), K(0x12835b0145706fbeULL),
	K(0x243185be4ee4b28cULL), K(0x550c7dc3d5ffb4e2ULL),
	K(0x72be5d74f27b896fULL), K(0x80deb1fe3b1696b1ULL),
	K(0x9bdc06a725c71235ULL), K(0xc19bf174cf692694ULL),
	K(0xe49b69c19ef14ad2ULL), K(0xefbe4786384f25e3ULL),
	K(0x0fc19dc68b8cd5b5ULL), K(0x240ca1cc77ac9c65ULL),
	K(0x2de92c6f592b0275ULL), K(0x4a7484aa6ea6e483ULL),
	K(0x5cb0a9dcbd41fbd4ULL), K(0x76f988da831153b5ULL),
	K(0x983e5152ee66dfabULL), K(0xa831c66d2db43210ULL),
	K(0xb00327c898fb213fULL), K(0xbf597fc7beef0ee4ULL),
	K(0xc6e00bf33da88fc2ULL), K(0xd5a79147930aa725ULL),
	K(0x06ca6351e003826fULL), K(0x142929670a0e6e70ULL),
	K(0x27b70a8546d22ffcULL), K(0x2e1b21385c26c926ULL),
	K(0x4d2c6dfc5ac42aedULL), K(0x53380d139d95b3dfULL),
	K(0x650a73548baf63deULL), K(0x766a0abb3c77b2a8ULL),
	K(0x81c2c92e47edaee6ULL), K(0x92722c851482353bULL),
	K(0xa2bfe8a14cf10364ULL), K(0xa81a664bbc423001ULL),
	K(0xc24b8b70d0f89791ULL), K(0xc76c51a30654be30ULL),
	K(0xd192e819d6ef5218ULL), K(0xd69906245565a910ULL),
	K(0xf40e35855771202aULL), K(0x106aa07032bbd1b8ULL),
	K(0x19a4c116b8d2d0c8ULL), K(0x1e376c085141ab53ULL),
	K(0x2748774cdf8eeb99ULL), K(0x34b0bcb5e19b48a8ULL),
	K(0x391c0cb3c5c95a63ULL), K(0x4ed8aa4ae3418acbULL),
	K(0x5b9cca4f7763e373ULL), K(0x682e6ff3d6b2b8a3ULL),
	K(0x748f82ee5defb2fcULL), K(0x78a5636f43172f60ULL),
	K(0x84c87814a1f0ab72ULL), K(0x8cc702081a6439ecULL),
	K(0x90befffa23631e28ULL), K(0xa4506cebde82bde9ULL),
	K(0xbef9a3f7b2c67915ULL), K(0xc67178f2e372532bULL),
#if NEED_SHA512  /* [64]+ are used for sha512 only */
	K(0xca273eceea26619cULL), K(0xd186b8c721c0c207ULL),
	K(0xeada7dd6cde0eb1eULL), K(0xf57d4f7fee6ed178ULL),
	K(0x06f067aa72176fbaULL), K(0x0a637dc5a2c898a6ULL),
	K(0x113f9804bef90daeULL), K(0x1b710b35131c471bULL),
	K(0x28db77f523047d84ULL), K(0x32caab7b40c72493ULL),
	K(0x3c9ebe0a15c9bebcULL), K(0x431d67c49c100d4cULL),
	K(0x4cc5d4becb3e42b6ULL), K(0x597f299cfc657e2aULL),
	K(0x5fcb6fab3ad6faecULL), K(0x6c44198c4a475817ULL),
#endif
};
#undef K
615 
616 #undef Ch
617 #undef Maj
618 #undef S0
619 #undef S1
620 #undef R0
621 #undef R1
622 
/* SHA-256 compression function: hash one 64-byte block from
 * ctx->wbuffer into ctx->hash[0..7] (FIPS 180-2:6.2.2).
 */
static void FAST_FUNC sha256_process_block64(sha256_ctx_t *ctx)
{
	unsigned t;
	uint32_t W[64], a, b, c, d, e, f, g, h;
	const uint32_t *words = (uint32_t*) ctx->wbuffer;

	/* Operators defined in FIPS 180-2:4.1.2.  */
#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
#define S0(x) (rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22))
#define S1(x) (rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25))
#define R0(x) (rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3))
#define R1(x) (rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10))

	/* Compute the message schedule according to FIPS 180-2:6.2.2 step 2.  */
	for (t = 0; t < 16; ++t)
		W[t] = SWAP_BE32(words[t]);
	for (/*t = 16*/; t < 64; ++t)
		W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];

	a = ctx->hash[0];
	b = ctx->hash[1];
	c = ctx->hash[2];
	d = ctx->hash[3];
	e = ctx->hash[4];
	f = ctx->hash[5];
	g = ctx->hash[6];
	h = ctx->hash[7];

	/* The actual computation according to FIPS 180-2:6.2.2 step 3.  */
	for (t = 0; t < 64; ++t) {
		/* Need to fetch upper half of sha_K[t]
		 * (I hope compiler is clever enough to just fetch
		 * upper half)
		 */
		uint32_t K_t = NEED_SHA512 ? (sha_K[t] >> 32) : sha_K[t];
		uint32_t T1 = h + S1(e) + Ch(e, f, g) + K_t + W[t];
		uint32_t T2 = S0(a) + Maj(a, b, c);
		/* Shift the eight working variables down by one */
		h = g;
		g = f;
		f = e;
		e = d + T1;
		d = c;
		c = b;
		b = a;
		a = T1 + T2;
	}
#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1
	/* Add the starting values of the context according to FIPS 180-2:6.2.2
	   step 4.  */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
	ctx->hash[5] += f;
	ctx->hash[6] += g;
	ctx->hash[7] += h;
}
687 
688 #if NEED_SHA512
/* SHA-512 compression function: hash one 128-byte block from
 * ctx->wbuffer into ctx->hash[0..7] (FIPS 180-2:6.3.2).
 */
static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
{
	unsigned t;
	uint64_t W[80];
	/* On i386, having assignments here (not later as sha256 does)
	 * produces 99 bytes smaller code with gcc 4.3.1
	 */
	uint64_t a = ctx->hash[0];
	uint64_t b = ctx->hash[1];
	uint64_t c = ctx->hash[2];
	uint64_t d = ctx->hash[3];
	uint64_t e = ctx->hash[4];
	uint64_t f = ctx->hash[5];
	uint64_t g = ctx->hash[6];
	uint64_t h = ctx->hash[7];
	const uint64_t *words = (uint64_t*) ctx->wbuffer;

	/* Operators defined in FIPS 180-2:4.1.2.  */
#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
#define S0(x) (rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39))
#define S1(x) (rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41))
#define R0(x) (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7))
#define R1(x) (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6))

	/* Compute the message schedule according to FIPS 180-2:6.3.2 step 2.  */
	for (t = 0; t < 16; ++t)
		W[t] = SWAP_BE64(words[t]);
	for (/*t = 16*/; t < 80; ++t)
		W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];

	/* The actual computation according to FIPS 180-2:6.3.2 step 3.  */
	for (t = 0; t < 80; ++t) {
		uint64_t T1 = h + S1(e) + Ch(e, f, g) + sha_K[t] + W[t];
		uint64_t T2 = S0(a) + Maj(a, b, c);
		/* Shift the eight working variables down by one */
		h = g;
		g = f;
		f = e;
		e = d + T1;
		d = c;
		c = b;
		b = a;
		a = T1 + T2;
	}
#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1
	/* Add the starting values of the context according to FIPS 180-2:6.3.2
	   step 4.  */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
	ctx->hash[5] += f;
	ctx->hash[6] += g;
	ctx->hash[7] += h;
}
750 #endif /* NEED_SHA512 */
751 
sha1_begin(sha1_ctx_t * ctx)752 void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
753 {
754 	ctx->hash[0] = 0x67452301;
755 	ctx->hash[1] = 0xefcdab89;
756 	ctx->hash[2] = 0x98badcfe;
757 	ctx->hash[3] = 0x10325476;
758 	ctx->hash[4] = 0xc3d2e1f0;
759 	ctx->total64 = 0;
760 	ctx->process_block = sha1_process_block64;
761 }
762 
/* SHA-256 initial hash values (FIPS 180-2:5.3.2), prefixed by two
 * 32-bit zeros that overlay ctx->total64 so sha256_begin() can clear
 * the byte counter and load the IV with a single memcpy.
 * For SHA-512, these same words are the HIGH halves of the 64-bit IV
 * (combined with init512_lo below in sha512_begin()). */
static const uint32_t init256[] ALIGN4 = {
	0,
	0,
	0x6a09e667,
	0xbb67ae85,
	0x3c6ef372,
	0xa54ff53a,
	0x510e527f,
	0x9b05688c,
	0x1f83d9ab,
	0x5be0cd19,
};
#if NEED_SHA512
/* LOW 32-bit halves of the SHA-512 initial hash values
 * (FIPS 180-2:5.3.3); high halves are the init256[] entries above. */
static const uint32_t init512_lo[] ALIGN4 = {
	0,
	0,
	0xf3bcc908,
	0x84caa73b,
	0xfe94f82b,
	0x5f1d36f1,
	0xade682d1,
	0x2b3e6c1f,
	0xfb41bd6b,
	0x137e2179,
};
#endif /* NEED_SHA512 */
789 
790 // Note: SHA-384 is identical to SHA-512, except that initial hash values are
791 // 0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939,
792 // 0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4,
793 // and the output is constructed by omitting last two 64-bit words of it.
794 
795 /* Initialize structure containing state of computation.
796    (FIPS 180-2:5.3.2)  */
/* Initialize structure containing state of computation.
   (FIPS 180-2:5.3.2)  */
void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
{
	/* One memcpy covers both the counter and the IV: init256[] starts
	 * with two zero words that land on ctx->total64. */
	memcpy(&ctx->total64, init256, sizeof(init256));
	/*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
	ctx->process_block = sha256_process_block64;
}
803 
804 #if NEED_SHA512
805 /* Initialize structure containing state of computation.
806    (FIPS 180-2:5.3.3)  */
/* Initialize structure containing state of computation.
   (FIPS 180-2:5.3.3)
   Each 64-bit IV word is assembled from init256[] (high half) and
   init512_lo[] (low half); both tables start with two zero entries,
   which this loop uses to clear the 128-bit byte counter. */
void FAST_FUNC sha512_begin(sha512_ctx_t *ctx)
{
	int i;
	/* Two extra iterations zero out ctx->total64[2] */
	uint64_t *tp = ctx->total64;
	for (i = 0; i < 8 + 2; i++)
		tp[i] = ((uint64_t)(init256[i]) << 32) + init512_lo[i];
	/*ctx->total64[0] = ctx->total64[1] = 0; - already done */
}
816 
/* Feed len bytes of data into the SHA-512 state; buffered in
 * 128-byte chunks, running the compression function on each full block.
 */
void FAST_FUNC sha512_hash(sha512_ctx_t *ctx, const void *buffer, size_t len)
{
	unsigned bufpos = ctx->total64[0] & 127;
	unsigned remaining;

	/* First increment the byte count.  FIPS 180-2 specifies the possible
	   length of the file up to 2^128 _bits_.
	   We compute the number of _bytes_ and convert to bits later.  */
	ctx->total64[0] += len;
	if (ctx->total64[0] < len)
		ctx->total64[1]++;  /* carry into the high word on overflow */

	while (1) {
		remaining = 128 - bufpos;
		if (remaining > len)
			remaining = len;
		/* Copy data into aligned buffer */
		memcpy(ctx->wbuffer + bufpos, buffer, remaining);
		len -= remaining;
		buffer = (const char *)buffer + remaining;
		bufpos += remaining;

		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;"
		 * bufpos is at most 128 here; the unsigned subtraction yields
		 * 0 only when the buffer is exactly full. */
		bufpos -= 128;
		if (bufpos != 0)
			break;

		/* Buffer is filled up, process it */
		sha512_process_block128(ctx);
		/*bufpos = 0; - already is */
	}
}
849 #endif /* NEED_SHA512 */
850 
851 /* Used also for sha256 */
/* Finish a SHA-1 or SHA-256 computation: pad, append length, and
 * write the big-endian digest to resbuf (20 bytes for sha1,
 * 32 for sha256 — distinguished by the process_block pointer).
 * Returns the digest size in bytes.
 * Used also for sha256 */
unsigned FAST_FUNC sha1_end(sha1_ctx_t *ctx, void *resbuf)
{
	unsigned hash_size;

	/* SHA stores total in BE, need to swap on LE arches: */
	common64_end(ctx, /*swap_needed:*/ BB_LITTLE_ENDIAN);

	/* 5 words for sha1, 8 for sha256 */
	hash_size = (ctx->process_block == sha1_process_block64) ? 5 : 8;
	/* This way we do not impose alignment constraints on resbuf: */
	if (BB_LITTLE_ENDIAN) {
		unsigned i;
		for (i = 0; i < hash_size; ++i)
			ctx->hash[i] = SWAP_BE32(ctx->hash[i]);
	}
	hash_size *= sizeof(ctx->hash[0]);
	memcpy(resbuf, ctx->hash, hash_size);
	return hash_size;
}
870 
871 #if NEED_SHA512
/* Finish the SHA-512 computation: pad to a 128-byte boundary, append
 * the 128-bit big-endian bit count, and write the 64-byte digest to
 * resbuf.  Returns the digest size in bytes. */
unsigned FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
{
	unsigned bufpos = ctx->total64[0] & 127;

	/* Pad the buffer to the next 128-byte boundary with 0x80,0,0,0... */
	ctx->wbuffer[bufpos++] = 0x80;

	/* Iterates once, or twice if <16 bytes remain for the length */
	while (1) {
		unsigned remaining = 128 - bufpos;
		memset(ctx->wbuffer + bufpos, 0, remaining);
		if (remaining >= 16) {
			/* Store the 128-bit counter of bits in the buffer in BE format */
			uint64_t t;
			t = ctx->total64[0] << 3;
			t = SWAP_BE64(t);
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
			/* High word: bytes-to-bits shift carries in the top 3 bits
			 * of the low word */
			t = (ctx->total64[1] << 3) | (ctx->total64[0] >> 61);
			t = SWAP_BE64(t);
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
		}
		sha512_process_block128(ctx);
		if (remaining >= 16)
			break;
		bufpos = 0;
	}

	/* Digest is big-endian; swap words on LE hosts */
	if (BB_LITTLE_ENDIAN) {
		unsigned i;
		for (i = 0; i < ARRAY_SIZE(ctx->hash); ++i)
			ctx->hash[i] = SWAP_BE64(ctx->hash[i]);
	}
	memcpy(resbuf, ctx->hash, sizeof(ctx->hash));
	return sizeof(ctx->hash);
}
906 #endif /* NEED_SHA512 */
907 
908 
909 /*
910  * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
911  * Michael Peeters and Gilles Van Assche. For more information, feedback or
912  * questions, please refer to our website: http://keccak.noekeon.org/
913  *
914  * Implementation by Ronny Van Keer,
915  * hereby denoted as "the implementer".
916  *
917  * To the extent possible under law, the implementer has waived all copyright
918  * and related or neighboring rights to the source code in this file.
919  * http://creativecommons.org/publicdomain/zero/1.0/
920  *
921  * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
922  */
923 
924 #if CONFIG_SHA3_SMALL < 0
925 # define SHA3_SMALL 0
926 #elif CONFIG_SHA3_SMALL > 1
927 # define SHA3_SMALL 1
928 #else
929 # define SHA3_SMALL CONFIG_SHA3_SMALL
930 #endif
931 
932 #define OPTIMIZE_SHA3_FOR_32 0
933 /*
934  * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
935  * every 64-bit word of state[] can be split into two 32-bit words
936  * by even/odd bits. In this form, all rotations of sha3 round
937  * are 32-bit - and there are lots of them.
938  * However, it requires either splitting/combining state words
939  * before/after sha3 round (code does this now)
940  * or shuffling bits before xor'ing them into state and in sha3_end.
941  * Without shuffling, bit-slicing results in -130 bytes of code
942  * and marginal speedup (but of course it gives wrong result).
943  * With shuffling it works, but +260 code bytes, and slower.
944  * Disabled for now:
945  */
946 #if 0 /* LONG_MAX == 0x7fffffff */
947 # undef OPTIMIZE_SHA3_FOR_32
948 # define OPTIMIZE_SHA3_FOR_32 1
949 #endif
950 
951 #if OPTIMIZE_SHA3_FOR_32
952 /* This splits every 64-bit word into a pair of 32-bit words,
953  * even bits go into first word, odd bits go to second one.
954  * The conversion is done in-place.
955  */
/* This splits every 64-bit word into a pair of 32-bit words,
 * even bits go into first word, odd bits go to second one.
 * The conversion is done in-place over the 25-word sha3 state.
 */
static void split_halves(uint64_t *state)
{
	/* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
	uint32_t *s32 = (uint32_t*)state;
	uint32_t t, x0, x1;
	int i;
	for (i = 24; i >= 0; --i) {
		/* Perfect-unshuffle each 32-bit half: gather even bits into
		 * the low 16 bits and odd bits into the high 16 bits */
		x0 = s32[0];
		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
		x1 = s32[1];
		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
		/* Recombine: even-bit halves into word 0, odd-bit halves into word 1 */
		*s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
		*s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
	}
}
977 /* The reverse operation */
/* Inverse of split_halves(): re-interleave the even-bit and odd-bit
 * 32-bit words of each state lane back into a normal 64-bit word.
 * The conversion is done in-place.
 */
static void combine_halves(uint64_t *state)
{
	uint32_t *s32 = (uint32_t*)state;
	uint32_t t, x0, x1;
	int i;
	for (i = 24; i >= 0; --i) { /* one iteration per 64-bit state word */
		/* Regroup: x0 = (low halves of both words),
		 * x1 = (high halves of both words) */
		x0 = s32[0];
		x1 = s32[1];
		t = (x0 & 0x0000FFFF) | (x1 << 16);
		x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
		x0 = t;
		/* Perfect shuffle (reverse of the unshuffle in split_halves):
		 * the same four delta-swap steps applied in opposite order */
		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
		*s32++ = x0;
		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
		*s32++ = x1;
	}
}
1001 #endif
1002 
/*
 * The 24-round Keccak-f[1600] permutation, applied in-place to the
 * 25-lane (200-byte) state. In the crypto literature this function
 * is usually called Keccak-f().
 * Each round performs the Theta, Rho+Pi, Chi and Iota steps.
 */
static void sha3_process_block72(uint64_t *state)
{
	enum { NROUNDS = 24 };

#if OPTIMIZE_SHA3_FOR_32
	/* Bit-sliced variant: every 64-bit lane is kept as two 32-bit
	 * words (even bits / odd bits), so every 64-bit rotation becomes
	 * a pair of 32-bit rotations.
	 */
	/*
	static const uint32_t IOTA_CONST_0[NROUNDS] ALIGN4 = {
		0x00000001UL,
		0x00000000UL,
		0x00000000UL,
		0x00000000UL,
		0x00000001UL,
		0x00000001UL,
		0x00000001UL,
		0x00000001UL,
		0x00000000UL,
		0x00000000UL,
		0x00000001UL,
		0x00000000UL,
		0x00000001UL,
		0x00000001UL,
		0x00000001UL,
		0x00000001UL,
		0x00000000UL,
		0x00000000UL,
		0x00000000UL,
		0x00000000UL,
		0x00000001UL,
		0x00000000UL,
		0x00000001UL,
		0x00000000UL,
	};
	** bits are in lsb: 0101 0000 1111 0100 1111 0001
	*/
	/* The table above is 0/1 only, so it is compressed into one
	 * bit per round: */
	uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
	static const uint32_t IOTA_CONST_1[NROUNDS] ALIGN4 = {
		0x00000000UL,
		0x00000089UL,
		0x8000008bUL,
		0x80008080UL,
		0x0000008bUL,
		0x00008000UL,
		0x80008088UL,
		0x80000082UL,
		0x0000000bUL,
		0x0000000aUL,
		0x00008082UL,
		0x00008003UL,
		0x0000808bUL,
		0x8000000bUL,
		0x8000008aUL,
		0x80000081UL,
		0x80000081UL,
		0x80000008UL,
		0x00000083UL,
		0x80008003UL,
		0x80008088UL,
		0x80000088UL,
		0x00008000UL,
		0x80008082UL,
	};

	uint32_t *const s32 = (uint32_t*)state;
	unsigned round;

	/* Convert state to bit-sliced (even/odd split) form */
	split_halves(state);

	for (round = 0; round < NROUNDS; round++) {
		unsigned x;

		/* Theta: compute the XOR-parity of each column into BC[],
		 * then XOR a combination of two neighboring column parities
		 * into every lane. BC[] is duplicated (BC[x+10] = BC[x]) so
		 * that wrap-around indices need no "% 10".
		 */
		{
			uint32_t BC[20];
			for (x = 0; x < 10; ++x) {
				BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
			}
			for (x = 0; x < 10; x += 2) {
				uint32_t ta, tb;
				/* In split form, "rotate by 1" touches only
				 * the odd-bit word (and needs a 1-bit rotl32) */
				ta = BC[x+8] ^ rotl32(BC[x+3], 1);
				tb = BC[x+9] ^ BC[x+2];
				s32[x+0] ^= ta;
				s32[x+1] ^= tb;
				s32[x+10] ^= ta;
				s32[x+11] ^= tb;
				s32[x+20] ^= ta;
				s32[x+21] ^= tb;
				s32[x+30] ^= ta;
				s32[x+31] ^= tb;
				s32[x+40] ^= ta;
				s32[x+41] ^= tb;
			}
		}
		/* RhoPi: rotate each lane by its per-lane constant while
		 * moving it to its permuted position. A 64-bit rotation by
		 * ROT_CONST becomes two 32-bit rotations; an odd ROT_CONST
		 * additionally swaps the even/odd words.
		 */
		{
			uint32_t t0a,t0b, t1a,t1b;
			t1a = s32[1*2+0];
			t1b = s32[1*2+1];

#define RhoPi(PI_LANE, ROT_CONST) \
	t0a = s32[PI_LANE*2+0];\
	t0b = s32[PI_LANE*2+1];\
	if (ROT_CONST & 1) {\
		s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
		s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
	} else {\
		s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
		s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
	}\
	t1a = t0a; t1b = t0b;

			RhoPi(10, 1)
			RhoPi( 7, 3)
			RhoPi(11, 6)
			RhoPi(17,10)
			RhoPi(18,15)
			RhoPi( 3,21)
			RhoPi( 5,28)
			RhoPi(16,36)
			RhoPi( 8,45)
			RhoPi(21,55)
			RhoPi(24, 2)
			RhoPi( 4,14)
			RhoPi(15,27)
			RhoPi(23,41)
			RhoPi(19,56)
			RhoPi(13, 8)
			RhoPi(12,25)
			RhoPi( 2,43)
			RhoPi(20,62)
			RhoPi(14,18)
			RhoPi(22,39)
			RhoPi( 9,61)
			RhoPi( 6,20)
			RhoPi( 1,44)
#undef RhoPi
		}
		/* Chi: nonlinear step, lane ^= (~next & next-next) within
		 * each row. Done twice per iteration: once for the even-bit
		 * words, once for the odd-bit words of the same row.
		 */
		for (x = 0; x <= 40;) {
			uint32_t BC0, BC1, BC2, BC3, BC4;
			BC0 = s32[x + 0*2];
			BC1 = s32[x + 1*2];
			BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x++;
			BC0 = s32[x + 0*2];
			BC1 = s32[x + 1*2];
			BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x += 9;
		}
		/* Iota: XOR this round's constant into lane (0,0) */
		s32[0] ^= IOTA_CONST_0bits & 1;
		IOTA_CONST_0bits >>= 1;
		s32[1] ^= IOTA_CONST_1[round];
	}

	/* Convert state back from bit-sliced form */
	combine_halves(state);
#else
	/* Native 64-bit algorithm */
	static const uint16_t IOTA_CONST[NROUNDS] ALIGN2 = {
		/* Elements should be 64-bit, but top half is always zero
		 * or 0x80000000. We encode 63rd bits in a separate word below.
		 * Same is true for 31th bits, which lets us use 16-bit table
		 * instead of 64-bit. The speed penalty is lost in the noise.
		 */
		0x0001,
		0x8082,
		0x808a,
		0x8000,
		0x808b,
		0x0001,
		0x8081,
		0x8009,
		0x008a,
		0x0088,
		0x8009,
		0x000a,
		0x808b,
		0x008b,
		0x8089,
		0x8003,
		0x8002,
		0x0080,
		0x800a,
		0x000a,
		0x8081,
		0x8080,
		0x0001,
		0x8008,
	};
	/* bit for CONST[0] is in msb: 0011 0011 0000 0111 1101 1101 */
	const uint32_t IOTA_CONST_bit63 = (uint32_t)(0x3307dd00);
	/* bit for CONST[0] is in msb: 0001 0110 0011 1000 0001 1011 */
	const uint32_t IOTA_CONST_bit31 = (uint32_t)(0x16381b00);

	/* Per-lane rotation amounts for the Rho step */
	static const uint8_t ROT_CONST[24] ALIGN1 = {
		1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
		27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
	};
	/* Lane permutation order for the Pi step */
	static const uint8_t PI_LANE[24] ALIGN1 = {
		10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
		15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
	};
	/*static const uint8_t MOD5[10] ALIGN1 = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/

	unsigned x;
	unsigned round;

	/* The permutation is defined on little-endian lanes;
	 * byteswap on big-endian machines */
	if (BB_BIG_ENDIAN) {
		for (x = 0; x < 25; x++) {
			state[x] = SWAP_LE64(state[x]);
		}
	}

	for (round = 0; round < NROUNDS; ++round) {
		/* Theta: XOR each lane with a combination of two
		 * neighboring column parities */
		{
			uint64_t BC[10];
			for (x = 0; x < 5; ++x) {
				BC[x + 5] = BC[x] = state[x]
					^ state[x + 5] ^ state[x + 10]
					^ state[x + 15]	^ state[x + 20];
			}
			/* Using 2x5 vector above eliminates the need to use
			 * BC[MOD5[x+N]] trick below to fetch BC[(x+N) % 5],
			 * and the code is a bit _smaller_.
			 */
			for (x = 0; x < 5; ++x) {
				uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
				state[x] ^= temp;
				state[x + 5] ^= temp;
				state[x + 10] ^= temp;
				state[x + 15] ^= temp;
				state[x + 20] ^= temp;
			}
		}

		/* Rho Pi: rotate each lane by ROT_CONST[] while cycling
		 * the lanes along the PI_LANE[] permutation */
		if (SHA3_SMALL) {
			uint64_t t1 = state[1];
			for (x = 0; x < 24; ++x) {
				uint64_t t0 = state[PI_LANE[x]];
				state[PI_LANE[x]] = rotl64(t1, ROT_CONST[x]);
				t1 = t0;
			}
		} else {
			/* Especially large benefit for 32-bit arch (75% faster):
			 * 64-bit rotations by non-constant usually are SLOW on those.
			 * We resort to unrolling here.
			 * This optimizes out PI_LANE[] and ROT_CONST[],
			 * but generates 300-500 more bytes of code.
			 */
			uint64_t t0;
			uint64_t t1 = state[1];
#define RhoPi_twice(x) \
	t0 = state[PI_LANE[x  ]]; \
	state[PI_LANE[x  ]] = rotl64(t1, ROT_CONST[x  ]); \
	t1 = state[PI_LANE[x+1]]; \
	state[PI_LANE[x+1]] = rotl64(t0, ROT_CONST[x+1]);
			RhoPi_twice(0); RhoPi_twice(2);
			RhoPi_twice(4); RhoPi_twice(6);
			RhoPi_twice(8); RhoPi_twice(10);
			RhoPi_twice(12); RhoPi_twice(14);
			RhoPi_twice(16); RhoPi_twice(18);
			RhoPi_twice(20); RhoPi_twice(22);
#undef RhoPi_twice
		}
		/* Chi: nonlinear step, lane ^= (~next & next-next)
		 * within each 5-lane row */
# if LONG_MAX > 0x7fffffff
		for (x = 0; x <= 20; x += 5) {
			uint64_t BC0, BC1, BC2, BC3, BC4;
			BC0 = state[x + 0];
			BC1 = state[x + 1];
			BC2 = state[x + 2];
			state[x + 0] = BC0 ^ ((~BC1) & BC2);
			BC3 = state[x + 3];
			state[x + 1] = BC1 ^ ((~BC2) & BC3);
			BC4 = state[x + 4];
			state[x + 2] = BC2 ^ ((~BC3) & BC4);
			state[x + 3] = BC3 ^ ((~BC4) & BC0);
			state[x + 4] = BC4 ^ ((~BC0) & BC1);
		}
# else
		/* Reduced register pressure version
		 * for register-starved 32-bit arches
		 * (i386: -95 bytes, and it is _faster_)
		 */
		for (x = 0; x <= 40;) {
			uint32_t BC0, BC1, BC2, BC3, BC4;
			uint32_t *const s32 = (uint32_t*)state;
#  if SHA3_SMALL
 do_half:
#  endif
			BC0 = s32[x + 0*2];
			BC1 = s32[x + 1*2];
			BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x++;
#  if SHA3_SMALL
			if (x & 1)
				goto do_half;
			x += 8;
#  else
			BC0 = s32[x + 0*2];
			BC1 = s32[x + 1*2];
			BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x += 9;
#  endif
		}
# endif /* long is 32-bit */
		/* Iota: XOR this round's constant into lane (0,0),
		 * reassembling bits 31 and 63 from the packed bitmaps */
		state[0] ^= IOTA_CONST[round]
			| (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
			| (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
	}

	/* Swap back to native byte order on big-endian machines */
	if (BB_BIG_ENDIAN) {
		for (x = 0; x < 25; x++) {
			state[x] = SWAP_LE64(state[x]);
		}
	}
#endif
}
1354 
sha3_begin(sha3_ctx_t * ctx)1355 void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
1356 {
1357 	memset(ctx, 0, sizeof(*ctx));
1358 	/* SHA3-512, user can override */
1359 	ctx->input_block_bytes = (1600 - 512*2) / 8; /* 72 bytes */
1360 }
1361 
/*
 * Sponge "absorb" phase: XOR input bytes into the beginning of state[]
 * and run the Keccak-f permutation every time input_block_bytes
 * (the "rate") have been absorbed. Partial-block bytes remain XORed
 * into the state, counted by ctx->bytes_queued.
 */
void FAST_FUNC sha3_hash(sha3_ctx_t *ctx, const void *buffer, size_t len)
{
#if SHA3_SMALL
	const uint8_t *data = buffer;
	unsigned bufpos = ctx->bytes_queued;

	while (1) {
		/* Absorb at most up to the end of the current block */
		unsigned remaining = ctx->input_block_bytes - bufpos;
		if (remaining > len)
			remaining = len;
		len -= remaining;
		/* XOR data into buffer */
		while (remaining != 0) {
			uint8_t *buf = (uint8_t*)ctx->state;
			buf[bufpos] ^= *data++;
			bufpos++;
			remaining--;
		}

		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
		/* After the subtraction bufpos is 0 iff the block is full;
		 * otherwise it wraps (unsigned), and is fixed up below */
		bufpos -= ctx->input_block_bytes;
		if (bufpos != 0)
			break;

		/* Buffer is filled up, process it */
		sha3_process_block72(ctx->state);
		/*bufpos = 0; - already is */
	}
	/* Undo the last "bufpos -= input_block_bytes" */
	ctx->bytes_queued = bufpos + ctx->input_block_bytes;
#else
	/* +50 bytes code size, but a bit faster because of long-sized XORs */
	const uint8_t *data = buffer;
	unsigned bufpos = ctx->bytes_queued;
	unsigned iblk_bytes = ctx->input_block_bytes;

	/* If already data in queue, continue queuing first */
	if (bufpos != 0) {
		while (len != 0) {
			uint8_t *buf = (uint8_t*)ctx->state;
			buf[bufpos] ^= *data++;
			len--;
			bufpos++;
			if (bufpos == iblk_bytes) {
				bufpos = 0;
				goto do_block;
			}
		}
	}

	/* Absorb complete blocks */
	while (len >= iblk_bytes) {
		/* XOR data onto beginning of state[].
		 * We try to be efficient - operate one word at a time, not byte.
		 * Careful wrt unaligned access: can't just use "*(long*)data"!
		 */
		unsigned count = iblk_bytes / sizeof(long);
		long *buf = (long*)ctx->state;
		do {
			long v;
			/* unaligned-safe load of one long from data */
			move_from_unaligned_long(v, (long*)data);
			*buf++ ^= v;
			data += sizeof(long);
		} while (--count);
		len -= iblk_bytes;
 do_block:
		sha3_process_block72(ctx->state);
	}

	/* Queue remaining data bytes */
	while (len != 0) {
		uint8_t *buf = (uint8_t*)ctx->state;
		buf[bufpos] ^= *data++;
		bufpos++;
		len--;
	}

	ctx->bytes_queued = bufpos;
#endif
}
1441 
sha3_end(sha3_ctx_t * ctx,void * resbuf)1442 unsigned FAST_FUNC sha3_end(sha3_ctx_t *ctx, void *resbuf)
1443 {
1444 	/* Padding */
1445 	uint8_t *buf = (uint8_t*)ctx->state;
1446 	/*
1447 	 * Keccak block padding is: add 1 bit after last bit of input,
1448 	 * then add zero bits until the end of block, and add the last 1 bit
1449 	 * (the last bit in the block) - the "10*1" pattern.
1450 	 * SHA3 standard appends additional two bits, 01,  before that padding:
1451 	 *
1452 	 * SHA3-224(M) = KECCAK[448](M||01, 224)
1453 	 * SHA3-256(M) = KECCAK[512](M||01, 256)
1454 	 * SHA3-384(M) = KECCAK[768](M||01, 384)
1455 	 * SHA3-512(M) = KECCAK[1024](M||01, 512)
1456 	 * (M is the input, || is bit concatenation)
1457 	 *
1458 	 * The 6 below contains 01 "SHA3" bits and the first 1 "Keccak" bit:
1459 	 */
1460 	buf[ctx->bytes_queued]          ^= 6; /* bit pattern 00000110 */
1461 	buf[ctx->input_block_bytes - 1] ^= 0x80;
1462 
1463 	sha3_process_block72(ctx->state);
1464 
1465 	/* Output */
1466 	memcpy(resbuf, ctx->state, 64);
1467 	return 64;
1468 }
1469