/* vi: set sw=4 ts=4: */
/*
 * Utility routines.
 *
 * Copyright (C) 2010 Denys Vlasenko
 *
 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 */
#include "libbb.h"

#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)

/* gcc 4.2.1 optimizes rotr64 better with inline than with macro
 * (for rotX32, there is no difference). Why? My guess is that
 * macro requires clever common subexpression elimination heuristics
 * in gcc, while inline basically forces it to happen.
 */
//#define rotl32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
static ALWAYS_INLINE uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}
//#define rotr32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
static ALWAYS_INLINE uint32_t rotr32(uint32_t x, unsigned n)
{
	return (x >> n) | (x << (32 - n));
}
/* rotr64 is needed for sha512 only: */
//#define rotr64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
static ALWAYS_INLINE uint64_t rotr64(uint64_t x, unsigned n)
{
	return (x >> n) | (x << (64 - n));
}

/* rotl64 only used for sha3 currently */
static ALWAYS_INLINE uint64_t rotl64(uint64_t x, unsigned n)
{
	return (x << n) | (x >> (64 - n));
}
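
/* A quick sanity sketch (not compiled in): for 0 < n < 64, a left
 * rotation equals a right rotation by the complementary count, so the
 * two 64-bit helpers above can be cross-checked against each other.
 * Kept under "#if 0" so it never affects the build.
 */
#if 0
static void rot_selfcheck(void)
{
	uint64_t x = 0x0123456789abcdefULL;
	unsigned n;
	for (n = 1; n < 64; n++) {
		if (rotl64(x, n) != rotr64(x, 64 - n))
			bb_error_msg_and_die("rotl64/rotr64 mismatch");
	}
}
#endif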

/* Process the remaining bytes in the buffer */
static void FAST_FUNC common64_end(md5_ctx_t *ctx, int swap_needed)
{
	unsigned bufpos = ctx->total64 & 63;
	/* Pad the buffer to the next 64-byte boundary with 0x80,0,0,0... */
	ctx->wbuffer[bufpos++] = 0x80;

	/* This loop iterates either once or twice, no more, no less */
	while (1) {
		unsigned remaining = 64 - bufpos;
		memset(ctx->wbuffer + bufpos, 0, remaining);
		/* Do we have enough space for the length count? */
		if (remaining >= 8) {
			/* Store the 64-bit counter of bits in the buffer */
			uint64_t t = ctx->total64 << 3;
			if (swap_needed)
				t = bb_bswap_64(t);
			/* wbuffer is suitably aligned for this */
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
		}
		ctx->process_block(ctx);
		if (remaining >= 8)
			break;
		bufpos = 0;
	}
}
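
/* An illustration of the padding produced above: for a 3-byte message
 * "abc", bufpos is 3, so the single block handed to process_block() is
 *   'a' 'b' 'c' 0x80 0x00 ... 0x00 <8-byte bit count>
 * with the count 3*8 = 24 stored in the last 8 bytes. Only when fewer
 * than 8 bytes remain after the 0x80 byte (message length mod 64 is
 * 56..63) does the loop run twice, emitting an extra all-padding block.
 */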


/*
 * Compute MD5 checksum of strings according to the
 * definition of MD5 in RFC 1321 from April 1992.
 *
 * Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
 *
 * Copyright (C) 1995-1999 Free Software Foundation, Inc.
 * Copyright (C) 2001 Manuel Novoa III
 * Copyright (C) 2003 Glenn L. McGrath
 * Copyright (C) 2003 Erik Andersen
 *
 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 */

/* 0: fastest, 3: smallest */
#if CONFIG_MD5_SMALL < 0
# define MD5_SMALL 0
#elif CONFIG_MD5_SMALL > 3
# define MD5_SMALL 3
#else
# define MD5_SMALL CONFIG_MD5_SMALL
#endif

/* These are the four functions used in the four steps of the MD5 algorithm
 * and defined in RFC 1321. The first function is a little bit optimized
 * (as found in Colin Plumb's public domain implementation).
 * #define FF(b, c, d) ((b & c) | (~b & d))
 */
#undef FF
#undef FG
#undef FH
#undef FI
#define FF(b, c, d) (d ^ (b & (c ^ d)))
#define FG(b, c, d) FF(d, b, c)
#define FH(b, c, d) (b ^ c ^ d)
#define FI(b, c, d) (c ^ (b | ~d))
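
/* Why the optimized FF is safe: (d ^ (b & (c ^ d))) selects bits of c
 * where b is 1 and bits of d where b is 0, which is exactly
 * ((b & c) | (~b & d)) with one operation less. A throwaway check of
 * the identity (bitwise ops act per bit, so testing 0/1 inputs covers
 * every bit position); kept out of the build:
 */
#if 0
static void ff_selfcheck(void)
{
	uint32_t b, c, d;
	for (b = 0; b <= 1; b++)
	for (c = 0; c <= 1; c++)
	for (d = 0; d <= 1; d++) {
		if (FF(b, c, d) != ((b & c) | (~b & d)))
			bb_error_msg_and_die("FF identity broken");
	}
}
#endif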

/* Hash a single block, 64 bytes long and 4-byte aligned */
static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
{
#if MD5_SMALL > 0
	/* Before we start, a word about the strange-looking constants.
	   They are defined in RFC 1321 as
	   T[i] = (int)(2^32 * fabs(sin(i))), i=1..64
	 */
	static const uint32_t C_array[] ALIGN4 = {
		/* round 1 */
		0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
		0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
		0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
		0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
		/* round 2 */
		0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
		0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
		0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
		0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
		/* round 3 */
		0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
		0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
		0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
		0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
		/* round 4 */
		0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
		0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
		0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
		0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
	};
	static const char P_array[] ALIGN1 = {
# if MD5_SMALL > 1
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
# endif
		1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
		5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
		0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9 /* 4 */
	};
#endif
	uint32_t *words = (void*) ctx->wbuffer;
	uint32_t A = ctx->hash[0];
	uint32_t B = ctx->hash[1];
	uint32_t C = ctx->hash[2];
	uint32_t D = ctx->hash[3];

#if MD5_SMALL >= 2 /* 2 or 3 */

	static const char S_array[] ALIGN1 = {
		7, 12, 17, 22,
		5, 9, 14, 20,
		4, 11, 16, 23,
		6, 10, 15, 21
	};
	const uint32_t *pc;
	const char *pp;
	const char *ps;
	int i;
	uint32_t temp;

	if (BB_BIG_ENDIAN)
		for (i = 0; i < 16; i++)
			words[i] = SWAP_LE32(words[i]);

# if MD5_SMALL == 3
	pc = C_array;
	pp = P_array;
	ps = S_array - 4;

	for (i = 0; i < 64; i++) {
		if ((i & 0x0f) == 0)
			ps += 4;
		temp = A;
		switch (i >> 4) {
		case 0:
			temp += FF(B, C, D);
			break;
		case 1:
			temp += FG(B, C, D);
			break;
		case 2:
			temp += FH(B, C, D);
			break;
		default: /* case 3 */
			temp += FI(B, C, D);
		}
		temp += words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
# else  /* MD5_SMALL == 2 */
	pc = C_array;
	pp = P_array;
	ps = S_array;

	for (i = 0; i < 16; i++) {
		temp = A + FF(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FG(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FH(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FI(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
# endif
	/* Add checksum to the starting values */
	ctx->hash[0] += A;
	ctx->hash[1] += B;
	ctx->hash[2] += C;
	ctx->hash[3] += D;

#else  /* MD5_SMALL == 0 or 1 */

# if MD5_SMALL == 1
	const uint32_t *pc;
	const char *pp;
	int i;
# endif

	/* First round: using the given function, the context and a constant,
	   the next context is computed. Because the algorithm's processing
	   unit is a 32-bit word and it is defined to work on words in
	   little-endian byte order, we may have to change the byte order
	   before the computation. To reduce the work for the next steps
	   we save the swapped words in the WORDS array. */
# undef OP
# define OP(a, b, c, d, s, T) \
	do { \
		a += FF(b, c, d) + (*words IF_BIG_ENDIAN(= SWAP_LE32(*words))) + T; \
		words++; \
		a = rotl32(a, s); \
		a += b; \
	} while (0)

	/* Round 1 */
# if MD5_SMALL == 1
	pc = C_array;
	for (i = 0; i < 4; i++) {
		OP(A, B, C, D, 7, *pc++);
		OP(D, A, B, C, 12, *pc++);
		OP(C, D, A, B, 17, *pc++);
		OP(B, C, D, A, 22, *pc++);
	}
# else
	OP(A, B, C, D, 7, 0xd76aa478);
	OP(D, A, B, C, 12, 0xe8c7b756);
	OP(C, D, A, B, 17, 0x242070db);
	OP(B, C, D, A, 22, 0xc1bdceee);
	OP(A, B, C, D, 7, 0xf57c0faf);
	OP(D, A, B, C, 12, 0x4787c62a);
	OP(C, D, A, B, 17, 0xa8304613);
	OP(B, C, D, A, 22, 0xfd469501);
	OP(A, B, C, D, 7, 0x698098d8);
	OP(D, A, B, C, 12, 0x8b44f7af);
	OP(C, D, A, B, 17, 0xffff5bb1);
	OP(B, C, D, A, 22, 0x895cd7be);
	OP(A, B, C, D, 7, 0x6b901122);
	OP(D, A, B, C, 12, 0xfd987193);
	OP(C, D, A, B, 17, 0xa679438e);
	OP(B, C, D, A, 22, 0x49b40821);
# endif
	words -= 16;

	/* For the second to fourth round we have the possibly swapped words
	   in WORDS. Redefine the macro to take an additional first
	   argument specifying the function to use. */
# undef OP
# define OP(f, a, b, c, d, k, s, T) \
	do { \
		a += f(b, c, d) + words[k] + T; \
		a = rotl32(a, s); \
		a += b; \
	} while (0)

	/* Round 2 */
# if MD5_SMALL == 1
	pp = P_array;
	for (i = 0; i < 4; i++) {
		OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
		OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
		OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
		OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
	}
# else
	OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
	OP(FG, D, A, B, C, 6, 9, 0xc040b340);
	OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
	OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
	OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
	OP(FG, D, A, B, C, 10, 9, 0x02441453);
	OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
	OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
	OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
	OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
	OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
	OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
	OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
	OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
	OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
	OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
# endif

	/* Round 3 */
# if MD5_SMALL == 1
	for (i = 0; i < 4; i++) {
		OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
		OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
		OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
		OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
	}
# else
	OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
	OP(FH, D, A, B, C, 8, 11, 0x8771f681);
	OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
	OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
	OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
	OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
	OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
	OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
	OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
	OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
	OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
	OP(FH, B, C, D, A, 6, 23, 0x04881d05);
	OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
	OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
	OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
	OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
# endif

	/* Round 4 */
# if MD5_SMALL == 1
	for (i = 0; i < 4; i++) {
		OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
		OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
		OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
		OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
	}
# else
	OP(FI, A, B, C, D, 0, 6, 0xf4292244);
	OP(FI, D, A, B, C, 7, 10, 0x432aff97);
	OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
	OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
	OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
	OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
	OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
	OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
	OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
	OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
	OP(FI, C, D, A, B, 6, 15, 0xa3014314);
	OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
	OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
	OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
	OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
# undef OP
# endif
	/* Add checksum to the starting values */
	ctx->hash[0] += A;
	ctx->hash[1] += B;
	ctx->hash[2] += C;
	ctx->hash[3] += D;
#endif
}
#undef FF
#undef FG
#undef FH
#undef FI
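
/* For reference, a tiny throwaway generator for C_array above, taken
 * straight from the RFC 1321 definition (a standalone hosted program
 * linked with libm, not part of busybox):
 */
#if 0
#include <math.h>
#include <stdint.h>
#include <stdio.h>
int main(void)
{
	int i;
	for (i = 1; i <= 64; i++)
		printf("0x%08x,%c",
			(uint32_t)(4294967296.0 * fabs(sin((double)i))),
			(i & 3) == 0 ? '\n' : ' ');
	return 0;
}
#endif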

/* Initialize structure containing state of computation.
 * (RFC 1321, 3.3: Step 3)
 */
void FAST_FUNC md5_begin(md5_ctx_t *ctx)
{
	ctx->hash[0] = 0x67452301;
	ctx->hash[1] = 0xefcdab89;
	ctx->hash[2] = 0x98badcfe;
	ctx->hash[3] = 0x10325476;
	ctx->total64 = 0;
	ctx->process_block = md5_process_block64;
}

/* Used also for sha1 and sha256 */
void FAST_FUNC md5_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
{
	unsigned bufpos = ctx->total64 & 63;

	ctx->total64 += len;

	while (1) {
		unsigned remaining = 64 - bufpos;
		if (remaining > len)
			remaining = len;
		/* Copy data into aligned buffer */
		memcpy(ctx->wbuffer + bufpos, buffer, remaining);
		len -= remaining;
		buffer = (const char *)buffer + remaining;
		bufpos += remaining;

		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
		bufpos -= 64;
		if (bufpos != 0)
			break;

		/* Buffer is filled up, process it */
		ctx->process_block(ctx);
		/*bufpos = 0; - already is */
	}
}

/* Process the remaining bytes in the buffer and put the result from CTX
 * in the first 16 bytes following RESBUF. The result is always in little
 * endian byte order, so that a byte-wise output yields the wanted
 * ASCII representation of the message digest.
 */
unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
{
	/* MD5 stores total in LE, need to swap on BE arches: */
	common64_end(ctx, /*swap_needed:*/ BB_BIG_ENDIAN);

	/* The MD5 result is in little endian byte order */
	if (BB_BIG_ENDIAN) {
		ctx->hash[0] = SWAP_LE32(ctx->hash[0]);
		ctx->hash[1] = SWAP_LE32(ctx->hash[1]);
		ctx->hash[2] = SWAP_LE32(ctx->hash[2]);
		ctx->hash[3] = SWAP_LE32(ctx->hash[3]);
	}

	memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * 4);
	return sizeof(ctx->hash[0]) * 4;
}
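
/* Sketch of typical use of the API above (the hex string is the
 * well-known RFC 1321 test vector for "abc"); not compiled in:
 */
#if 0
static void md5_example(void)
{
	md5_ctx_t ctx;
	uint8_t digest[16];
	md5_begin(&ctx);
	md5_hash(&ctx, "abc", 3);
	md5_end(&ctx, digest);
	/* digest now holds 900150983cd24fb0d6963f7d28e17f72 */
}
#endif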


/*
 * SHA1 part is:
 * Copyright 2007 Rob Landley <rob@landley.net>
 *
 * Based on the public domain SHA-1 in C by Steve Reid <steve@edmweb.com>
 * from http://www.mirrors.wiretapped.net/security/cryptography/hashes/sha1/
 *
 * Licensed under GPLv2, see file LICENSE in this source tree.
 *
 * ---------------------------------------------------------------------------
 *
 * SHA256 and SHA512 parts are:
 * Released into the Public Domain by Ulrich Drepper <drepper@redhat.com>.
 * Shrank by Denys Vlasenko.
 *
 * ---------------------------------------------------------------------------
 *
 * The best way to test random blocksizes is to go to coreutils/md5_sha1_sum.c
 * and replace "4096" with something like "2000 + time(NULL) % 2097",
 * then rebuild and compare "shaNNNsum bigfile" results.
 */

static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
	static const uint32_t rconsts[] ALIGN4 = {
		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
	};
	int i, j;
	int cnt;
	uint32_t W[16+16];
	uint32_t a, b, c, d, e;

	/* On-stack work buffer frees up one register in the main loop
	 * which otherwise will be needed to hold ctx pointer */
	for (i = 0; i < 16; i++)
		W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);

	a = ctx->hash[0];
	b = ctx->hash[1];
	c = ctx->hash[2];
	d = ctx->hash[3];
	e = ctx->hash[4];

	/* 4 rounds of 20 operations each */
	cnt = 0;
	for (i = 0; i < 4; i++) {
		j = 19;
		do {
			uint32_t work;

			work = c ^ d;
			if (i == 0) {
				work = (work & b) ^ d;
				if (j <= 3)
					goto ge16;
				/* Used to do SWAP_BE32 here, but this
				 * requires ctx (see comment above) */
				work += W[cnt];
			} else {
				if (i == 2)
					work = ((b | c) & d) | (b & c);
				else /* i = 1 or 3 */
					work ^= b;
 ge16:
				W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
				work += W[cnt];
			}
			work += e + rotl32(a, 5) + rconsts[i];

			/* Rotate by one for next time */
			e = d;
			d = c;
			c = /* b = */ rotl32(b, 30);
			b = a;
			a = work;
			cnt = (cnt + 1) & 15;
		} while (--j >= 0);
	}

	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
}

/* Constants for SHA512 from FIPS 180-2:4.2.3.
 * The SHA256 constants from FIPS 180-2:4.2.2
 * are the most significant halves of the first 64 elements
 * of the same array.
 */
#undef K
#if NEED_SHA512
typedef uint64_t sha_K_int;
# define K(v) v
#else
typedef uint32_t sha_K_int;
# define K(v) (uint32_t)(v >> 32)
#endif
static const sha_K_int sha_K[] ALIGN8 = {
	K(0x428a2f98d728ae22ULL), K(0x7137449123ef65cdULL),
	K(0xb5c0fbcfec4d3b2fULL), K(0xe9b5dba58189dbbcULL),
	K(0x3956c25bf348b538ULL), K(0x59f111f1b605d019ULL),
	K(0x923f82a4af194f9bULL), K(0xab1c5ed5da6d8118ULL),
	K(0xd807aa98a3030242ULL), K(0x12835b0145706fbeULL),
	K(0x243185be4ee4b28cULL), K(0x550c7dc3d5ffb4e2ULL),
	K(0x72be5d74f27b896fULL), K(0x80deb1fe3b1696b1ULL),
	K(0x9bdc06a725c71235ULL), K(0xc19bf174cf692694ULL),
	K(0xe49b69c19ef14ad2ULL), K(0xefbe4786384f25e3ULL),
	K(0x0fc19dc68b8cd5b5ULL), K(0x240ca1cc77ac9c65ULL),
	K(0x2de92c6f592b0275ULL), K(0x4a7484aa6ea6e483ULL),
	K(0x5cb0a9dcbd41fbd4ULL), K(0x76f988da831153b5ULL),
	K(0x983e5152ee66dfabULL), K(0xa831c66d2db43210ULL),
	K(0xb00327c898fb213fULL), K(0xbf597fc7beef0ee4ULL),
	K(0xc6e00bf33da88fc2ULL), K(0xd5a79147930aa725ULL),
	K(0x06ca6351e003826fULL), K(0x142929670a0e6e70ULL),
	K(0x27b70a8546d22ffcULL), K(0x2e1b21385c26c926ULL),
	K(0x4d2c6dfc5ac42aedULL), K(0x53380d139d95b3dfULL),
	K(0x650a73548baf63deULL), K(0x766a0abb3c77b2a8ULL),
	K(0x81c2c92e47edaee6ULL), K(0x92722c851482353bULL),
	K(0xa2bfe8a14cf10364ULL), K(0xa81a664bbc423001ULL),
	K(0xc24b8b70d0f89791ULL), K(0xc76c51a30654be30ULL),
	K(0xd192e819d6ef5218ULL), K(0xd69906245565a910ULL),
	K(0xf40e35855771202aULL), K(0x106aa07032bbd1b8ULL),
	K(0x19a4c116b8d2d0c8ULL), K(0x1e376c085141ab53ULL),
	K(0x2748774cdf8eeb99ULL), K(0x34b0bcb5e19b48a8ULL),
	K(0x391c0cb3c5c95a63ULL), K(0x4ed8aa4ae3418acbULL),
	K(0x5b9cca4f7763e373ULL), K(0x682e6ff3d6b2b8a3ULL),
	K(0x748f82ee5defb2fcULL), K(0x78a5636f43172f60ULL),
	K(0x84c87814a1f0ab72ULL), K(0x8cc702081a6439ecULL),
	K(0x90befffa23631e28ULL), K(0xa4506cebde82bde9ULL),
	K(0xbef9a3f7b2c67915ULL), K(0xc67178f2e372532bULL),
#if NEED_SHA512 /* [64]+ are used for sha512 only */
	K(0xca273eceea26619cULL), K(0xd186b8c721c0c207ULL),
	K(0xeada7dd6cde0eb1eULL), K(0xf57d4f7fee6ed178ULL),
	K(0x06f067aa72176fbaULL), K(0x0a637dc5a2c898a6ULL),
	K(0x113f9804bef90daeULL), K(0x1b710b35131c471bULL),
	K(0x28db77f523047d84ULL), K(0x32caab7b40c72493ULL),
	K(0x3c9ebe0a15c9bebcULL), K(0x431d67c49c100d4cULL),
	K(0x4cc5d4becb3e42b6ULL), K(0x597f299cfc657e2aULL),
	K(0x5fcb6fab3ad6faecULL), K(0x6c44198c4a475817ULL),
#endif
};
#undef K
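
/* A worked example of what K(v) produces: in a 32-bit-only build
 * (NEED_SHA512 == 0), sha_K[0] is the upper half of 0x428a2f98d728ae22,
 * i.e. 0x428a2f98, which is exactly the first SHA256 round constant
 * from FIPS 180-2.
 */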

#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1

static void FAST_FUNC sha256_process_block64(sha256_ctx_t *ctx)
{
	unsigned t;
	uint32_t W[64], a, b, c, d, e, f, g, h;
	const uint32_t *words = (uint32_t*) ctx->wbuffer;

	/* Operators defined in FIPS 180-2:4.1.2. */
#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
#define S0(x) (rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22))
#define S1(x) (rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25))
#define R0(x) (rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3))
#define R1(x) (rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10))

	/* Compute the message schedule according to FIPS 180-2:6.2.2 step 2. */
	for (t = 0; t < 16; ++t)
		W[t] = SWAP_BE32(words[t]);
	for (/*t = 16*/; t < 64; ++t)
		W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];

	a = ctx->hash[0];
	b = ctx->hash[1];
	c = ctx->hash[2];
	d = ctx->hash[3];
	e = ctx->hash[4];
	f = ctx->hash[5];
	g = ctx->hash[6];
	h = ctx->hash[7];

	/* The actual computation according to FIPS 180-2:6.2.2 step 3. */
	for (t = 0; t < 64; ++t) {
		/* Need to fetch upper half of sha_K[t]
		 * (I hope compiler is clever enough to just fetch
		 * upper half)
		 */
		uint32_t K_t = NEED_SHA512 ? (sha_K[t] >> 32) : sha_K[t];
		uint32_t T1 = h + S1(e) + Ch(e, f, g) + K_t + W[t];
		uint32_t T2 = S0(a) + Maj(a, b, c);
		h = g;
		g = f;
		f = e;
		e = d + T1;
		d = c;
		c = b;
		b = a;
		a = T1 + T2;
	}
#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1
	/* Add the starting values of the context according to FIPS 180-2:6.2.2
	   step 4. */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
	ctx->hash[5] += f;
	ctx->hash[6] += g;
	ctx->hash[7] += h;
}

#if NEED_SHA512
static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
{
	unsigned t;
	uint64_t W[80];
	/* On i386, having assignments here (not later as sha256 does)
	 * produces 99 bytes smaller code with gcc 4.3.1
	 */
	uint64_t a = ctx->hash[0];
	uint64_t b = ctx->hash[1];
	uint64_t c = ctx->hash[2];
	uint64_t d = ctx->hash[3];
	uint64_t e = ctx->hash[4];
	uint64_t f = ctx->hash[5];
	uint64_t g = ctx->hash[6];
	uint64_t h = ctx->hash[7];
	const uint64_t *words = (uint64_t*) ctx->wbuffer;

	/* Operators defined in FIPS 180-2:4.1.2. */
#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
#define S0(x) (rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39))
#define S1(x) (rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41))
#define R0(x) (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7))
#define R1(x) (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6))

	/* Compute the message schedule according to FIPS 180-2:6.3.2 step 2. */
	for (t = 0; t < 16; ++t)
		W[t] = SWAP_BE64(words[t]);
	for (/*t = 16*/; t < 80; ++t)
		W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];

	/* The actual computation according to FIPS 180-2:6.3.2 step 3. */
	for (t = 0; t < 80; ++t) {
		uint64_t T1 = h + S1(e) + Ch(e, f, g) + sha_K[t] + W[t];
		uint64_t T2 = S0(a) + Maj(a, b, c);
		h = g;
		g = f;
		f = e;
		e = d + T1;
		d = c;
		c = b;
		b = a;
		a = T1 + T2;
	}
#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1
	/* Add the starting values of the context according to FIPS 180-2:6.3.2
	   step 4. */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
	ctx->hash[5] += f;
	ctx->hash[6] += g;
	ctx->hash[7] += h;
}
#endif /* NEED_SHA512 */

void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
{
	ctx->hash[0] = 0x67452301;
	ctx->hash[1] = 0xefcdab89;
	ctx->hash[2] = 0x98badcfe;
	ctx->hash[3] = 0x10325476;
	ctx->hash[4] = 0xc3d2e1f0;
	ctx->total64 = 0;
	ctx->process_block = sha1_process_block64;
}

static const uint32_t init256[] ALIGN4 = {
	0,
	0,
	0x6a09e667,
	0xbb67ae85,
	0x3c6ef372,
	0xa54ff53a,
	0x510e527f,
	0x9b05688c,
	0x1f83d9ab,
	0x5be0cd19,
};
#if NEED_SHA512
static const uint32_t init512_lo[] ALIGN4 = {
	0,
	0,
	0xf3bcc908,
	0x84caa73b,
	0xfe94f82b,
	0x5f1d36f1,
	0xade682d1,
	0x2b3e6c1f,
	0xfb41bd6b,
	0x137e2179,
};
#endif /* NEED_SHA512 */

// Note: SHA-384 is identical to SHA-512, except that the initial hash values are
// 0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939,
// 0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4,
// and the output is constructed by omitting the last two 64-bit words of it.
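
/* A hypothetical sha384_begin() built on the note above could look like
 * the sketch below (not part of busybox; it assumes a context with the
 * sha512_ctx_t layout and that callers truncate the 64-byte result of
 * sha512_end() to 48 bytes):
 */
#if 0
static const uint64_t init384[8] = {
	0xcbbb9d5dc1059ed8ULL, 0x629a292a367cd507ULL,
	0x9159015a3070dd17ULL, 0x152fecd8f70e5939ULL,
	0x67332667ffc00b31ULL, 0x8eb44a8768581511ULL,
	0xdb0c2e0d64f98fa7ULL, 0x47b5481dbefa4fa4ULL,
};
static void sha384_begin(sha512_ctx_t *ctx)
{
	memcpy(ctx->hash, init384, sizeof(init384));
	ctx->total64[0] = ctx->total64[1] = 0;
}
#endif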

/* Initialize structure containing state of computation.
   (FIPS 180-2:5.3.2) */
void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
{
	memcpy(&ctx->total64, init256, sizeof(init256));
	/*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
	ctx->process_block = sha256_process_block64;
}

#if NEED_SHA512
/* Initialize structure containing state of computation.
   (FIPS 180-2:5.3.3) */
void FAST_FUNC sha512_begin(sha512_ctx_t *ctx)
{
	int i;
	/* Two extra iterations zero out ctx->total64[2] */
	uint64_t *tp = ctx->total64;
	for (i = 0; i < 8 + 2; i++)
		tp[i] = ((uint64_t)(init256[i]) << 32) + init512_lo[i];
	/*ctx->total64[0] = ctx->total64[1] = 0; - already done */
}

void FAST_FUNC sha512_hash(sha512_ctx_t *ctx, const void *buffer, size_t len)
{
	unsigned bufpos = ctx->total64[0] & 127;
	unsigned remaining;

	/* First increment the byte count. FIPS 180-2 specifies the possible
	   length of the file up to 2^128 _bits_.
	   We compute the number of _bytes_ and convert to bits later. */
	ctx->total64[0] += len;
	if (ctx->total64[0] < len)
		ctx->total64[1]++;

	while (1) {
		remaining = 128 - bufpos;
		if (remaining > len)
			remaining = len;
		/* Copy data into aligned buffer */
		memcpy(ctx->wbuffer + bufpos, buffer, remaining);
		len -= remaining;
		buffer = (const char *)buffer + remaining;
		bufpos += remaining;

		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
		bufpos -= 128;
		if (bufpos != 0)
			break;

		/* Buffer is filled up, process it */
		sha512_process_block128(ctx);
		/*bufpos = 0; - already is */
	}
}
#endif /* NEED_SHA512 */

/* Used also for sha256 */
unsigned FAST_FUNC sha1_end(sha1_ctx_t *ctx, void *resbuf)
{
	unsigned hash_size;

	/* SHA stores total in BE, need to swap on LE arches: */
	common64_end(ctx, /*swap_needed:*/ BB_LITTLE_ENDIAN);

	hash_size = (ctx->process_block == sha1_process_block64) ? 5 : 8;
	/* This way we do not impose alignment constraints on resbuf: */
	if (BB_LITTLE_ENDIAN) {
		unsigned i;
		for (i = 0; i < hash_size; ++i)
			ctx->hash[i] = SWAP_BE32(ctx->hash[i]);
	}
	hash_size *= sizeof(ctx->hash[0]);
	memcpy(resbuf, ctx->hash, hash_size);
	return hash_size;
}
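
/* Sketch of typical use. sha1 and sha256 share the update and
 * finalization helpers with md5 (see the "Used also for..." comments),
 * so md5_hash()/sha1_end() are called directly here, assuming the
 * context typedefs alias the same 64-byte-block structure as those
 * comments imply; sha1_end() picks the digest width from
 * ctx->process_block. The hex string is the FIPS 180-2 "abc" vector:
 */
#if 0
static void sha1_example(void)
{
	sha1_ctx_t ctx;
	uint8_t digest[20];
	sha1_begin(&ctx); /* use sha256_begin + a 32-byte buffer for sha256 */
	md5_hash(&ctx, "abc", 3);
	sha1_end(&ctx, digest);
	/* digest now holds a9993e364706816aba3e25717850c26c9cd0d89d */
}
#endif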

#if NEED_SHA512
unsigned FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
{
	unsigned bufpos = ctx->total64[0] & 127;

	/* Pad the buffer to the next 128-byte boundary with 0x80,0,0,0... */
	ctx->wbuffer[bufpos++] = 0x80;

	while (1) {
		unsigned remaining = 128 - bufpos;
		memset(ctx->wbuffer + bufpos, 0, remaining);
		if (remaining >= 16) {
			/* Store the 128-bit counter of bits in the buffer in BE format */
			uint64_t t;
			t = ctx->total64[0] << 3;
			t = SWAP_BE64(t);
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
			t = (ctx->total64[1] << 3) | (ctx->total64[0] >> 61);
			t = SWAP_BE64(t);
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
		}
		sha512_process_block128(ctx);
		if (remaining >= 16)
			break;
		bufpos = 0;
	}

	if (BB_LITTLE_ENDIAN) {
		unsigned i;
		for (i = 0; i < ARRAY_SIZE(ctx->hash); ++i)
			ctx->hash[i] = SWAP_BE64(ctx->hash[i]);
	}
	memcpy(resbuf, ctx->hash, sizeof(ctx->hash));
	return sizeof(ctx->hash);
}
#endif /* NEED_SHA512 */


/*
 * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
 * Michael Peeters and Gilles Van Assche. For more information, feedback or
 * questions, please refer to our website: http://keccak.noekeon.org/
 *
 * Implementation by Ronny Van Keer,
 * hereby denoted as "the implementer".
 *
 * To the extent possible under law, the implementer has waived all copyright
 * and related or neighboring rights to the source code in this file.
 * http://creativecommons.org/publicdomain/zero/1.0/
 *
 * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
 */

#if CONFIG_SHA3_SMALL < 0
# define SHA3_SMALL 0
#elif CONFIG_SHA3_SMALL > 1
# define SHA3_SMALL 1
#else
# define SHA3_SMALL CONFIG_SHA3_SMALL
#endif

#define OPTIMIZE_SHA3_FOR_32 0
/*
 * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
 * every 64-bit word of state[] can be split into two 32-bit words
 * by even/odd bits. In this form, all rotations of the sha3 round
 * are 32-bit - and there are lots of them.
 * However, it requires either splitting/combining the state words
 * before/after the sha3 round (the code does this now)
 * or shuffling bits before xor'ing them into the state and in sha3_end.
 * Without shuffling, bit-slicing saves 130 bytes of code with a
 * marginal speedup (but of course it gives the wrong result).
 * With shuffling it works, but costs 260 extra code bytes, and is slower.
 * Disabled for now:
 */
#if 0 /* LONG_MAX == 0x7fffffff */
# undef OPTIMIZE_SHA3_FOR_32
# define OPTIMIZE_SHA3_FOR_32 1
#endif

#if OPTIMIZE_SHA3_FOR_32
/* This splits every 64-bit word into a pair of 32-bit words,
 * even bits go into the first word, odd bits go to the second one.
 * The conversion is done in-place.
 */
static void split_halves(uint64_t *state)
{
	/* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
	uint32_t *s32 = (uint32_t*)state;
	uint32_t t, x0, x1;
	int i;
	for (i = 24; i >= 0; --i) {
		x0 = s32[0];
		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
		x1 = s32[1];
		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
		*s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
		*s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
	}
}
/* The reverse operation */
static void combine_halves(uint64_t *state)
{
	uint32_t *s32 = (uint32_t*)state;
	uint32_t t, x0, x1;
	int i;
	for (i = 24; i >= 0; --i) {
		x0 = s32[0];
		x1 = s32[1];
		t = (x0 & 0x0000FFFF) | (x1 << 16);
		x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
		x0 = t;
		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
		*s32++ = x0;
		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
		*s32++ = x1;
	}
}
#endif
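
/* Since split_halves() is a bijection with combine_halves() as its
 * inverse, the pair must round-trip. A throwaway check of that property
 * (assumes a bit-sliced build where the two functions exist; never
 * compiled in):
 */
#if 0
static void split_combine_selfcheck(void)
{
	uint64_t st[25], copy[25];
	unsigned i;
	for (i = 0; i < 25; i++)
		copy[i] = st[i] = 0x0123456789abcdefULL + i;
	split_halves(st);
	combine_halves(st);
	if (memcmp(st, copy, sizeof(st)) != 0)
		bb_error_msg_and_die("split/combine does not round-trip");
}
#endif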

/*
 * In the crypto literature this function is usually called Keccak-f().
 */
static void sha3_process_block72(uint64_t *state)
{
	enum { NROUNDS = 24 };

#if OPTIMIZE_SHA3_FOR_32
	/*
	static const uint32_t IOTA_CONST_0[NROUNDS] ALIGN4 = {
		0x00000001UL,
		0x00000000UL,
		0x00000000UL,
		0x00000000UL,
		0x00000001UL,
		0x00000001UL,
		0x00000001UL,
		0x00000001UL,
		0x00000000UL,
		0x00000000UL,
		0x00000001UL,
		0x00000000UL,
		0x00000001UL,
		0x00000001UL,
		0x00000001UL,
		0x00000001UL,
		0x00000000UL,
		0x00000000UL,
		0x00000000UL,
		0x00000000UL,
		0x00000001UL,
		0x00000000UL,
		0x00000001UL,
		0x00000000UL,
	};
	** bits are in lsb: 0101 0000 1111 0100 1111 0001
	*/
	uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
	static const uint32_t IOTA_CONST_1[NROUNDS] ALIGN4 = {
		0x00000000UL,
		0x00000089UL,
		0x8000008bUL,
		0x80008080UL,
		0x0000008bUL,
		0x00008000UL,
		0x80008088UL,
		0x80000082UL,
		0x0000000bUL,
		0x0000000aUL,
		0x00008082UL,
		0x00008003UL,
		0x0000808bUL,
		0x8000000bUL,
		0x8000008aUL,
		0x80000081UL,
		0x80000081UL,
		0x80000008UL,
		0x00000083UL,
		0x80008003UL,
		0x80008088UL,
		0x80000088UL,
		0x00008000UL,
		0x80008082UL,
	};

	uint32_t *const s32 = (uint32_t*)state;
	unsigned round;

	split_halves(state);

	for (round = 0; round < NROUNDS; round++) {
		unsigned x;

		/* Theta */
		{
			uint32_t BC[20];
			for (x = 0; x < 10; ++x) {
				BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
			}
			for (x = 0; x < 10; x += 2) {
				uint32_t ta, tb;
				ta = BC[x+8] ^ rotl32(BC[x+3], 1);
				tb = BC[x+9] ^ BC[x+2];
				s32[x+0] ^= ta;
				s32[x+1] ^= tb;
				s32[x+10] ^= ta;
				s32[x+11] ^= tb;
				s32[x+20] ^= ta;
				s32[x+21] ^= tb;
				s32[x+30] ^= ta;
				s32[x+31] ^= tb;
				s32[x+40] ^= ta;
				s32[x+41] ^= tb;
			}
		}
		/* RhoPi */
		{
			uint32_t t0a,t0b, t1a,t1b;
			t1a = s32[1*2+0];
			t1b = s32[1*2+1];

#define RhoPi(PI_LANE, ROT_CONST) \
	t0a = s32[PI_LANE*2+0];\
	t0b = s32[PI_LANE*2+1];\
	if (ROT_CONST & 1) {\
		s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
		s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
	} else {\
		s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
		s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
	}\
	t1a = t0a; t1b = t0b;

			RhoPi(10, 1)
			RhoPi( 7, 3)
			RhoPi(11, 6)
			RhoPi(17,10)
			RhoPi(18,15)
			RhoPi( 3,21)
			RhoPi( 5,28)
			RhoPi(16,36)
			RhoPi( 8,45)
			RhoPi(21,55)
			RhoPi(24, 2)
			RhoPi( 4,14)
			RhoPi(15,27)
			RhoPi(23,41)
			RhoPi(19,56)
			RhoPi(13, 8)
			RhoPi(12,25)
			RhoPi( 2,43)
			RhoPi(20,62)
			RhoPi(14,18)
			RhoPi(22,39)
			RhoPi( 9,61)
			RhoPi( 6,20)
			RhoPi( 1,44)
#undef RhoPi
		}
		/* Chi */
		for (x = 0; x <= 40;) {
			uint32_t BC0, BC1, BC2, BC3, BC4;
			BC0 = s32[x + 0*2];
			BC1 = s32[x + 1*2];
			BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x++;
			BC0 = s32[x + 0*2];
			BC1 = s32[x + 1*2];
			BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x += 9;
		}
		/* Iota */
		s32[0] ^= IOTA_CONST_0bits & 1;
		IOTA_CONST_0bits >>= 1;
		s32[1] ^= IOTA_CONST_1[round];
	}

	combine_halves(state);
#else
	/* Native 64-bit algorithm */
	static const uint16_t IOTA_CONST[NROUNDS] ALIGN2 = {
	/* Elements should be 64-bit, but the top half is always zero
	 * or 0x80000000. We encode the 63rd bits in a separate word below.
	 * The same is true for the 31st bits, which lets us use a 16-bit table
	 * instead of a 64-bit one. The speed penalty is lost in the noise.
	 */
	0x0001,
	0x8082,
	0x808a,
	0x8000,
	0x808b,
	0x0001,
	0x8081,
	0x8009,
	0x008a,
	0x0088,
	0x8009,
	0x000a,
	0x808b,
	0x008b,
	0x8089,
	0x8003,
	0x8002,
	0x0080,
	0x800a,
	0x000a,
	0x8081,
	0x8080,
	0x0001,
	0x8008,
	};
	/* bit for CONST[0] is in msb: 0011 0011 0000 0111 1101 1101 */
	const uint32_t IOTA_CONST_bit63 = (uint32_t)(0x3307dd00);
	/* bit for CONST[0] is in msb: 0001 0110 0011 1000 0001 1011 */
	const uint32_t IOTA_CONST_bit31 = (uint32_t)(0x16381b00);

	static const uint8_t ROT_CONST[24] ALIGN1 = {
		1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
		27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
	};
	static const uint8_t PI_LANE[24] ALIGN1 = {
		10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
		15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
	};
	/*static const uint8_t MOD5[10] ALIGN1 = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/

	unsigned x;
	unsigned round;

	if (BB_BIG_ENDIAN) {
		for (x = 0; x < 25; x++) {
			state[x] = SWAP_LE64(state[x]);
		}
	}

	for (round = 0; round < NROUNDS; ++round) {
		/* Theta */
		{
			uint64_t BC[10];
			for (x = 0; x < 5; ++x) {
				BC[x + 5] = BC[x] = state[x]
					^ state[x + 5] ^ state[x + 10]
					^ state[x + 15] ^ state[x + 20];
			}
			/* Using 2x5 vector above eliminates the need to use
			 * BC[MOD5[x+N]] trick below to fetch BC[(x+N) % 5],
			 * and the code is a bit _smaller_.
			 */
			for (x = 0; x < 5; ++x) {
				uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
				state[x] ^= temp;
				state[x + 5] ^= temp;
				state[x + 10] ^= temp;
				state[x + 15] ^= temp;
				state[x + 20] ^= temp;
			}
		}

		/* Rho Pi */
		if (SHA3_SMALL) {
			uint64_t t1 = state[1];
			for (x = 0; x < 24; ++x) {
				uint64_t t0 = state[PI_LANE[x]];
				state[PI_LANE[x]] = rotl64(t1, ROT_CONST[x]);
				t1 = t0;
			}
		} else {
			/* Especially large benefit for 32-bit arch (75% faster):
			 * 64-bit rotations by non-constant usually are SLOW on those.
			 * We resort to unrolling here.
			 * This optimizes out PI_LANE[] and ROT_CONST[],
			 * but generates 300-500 more bytes of code.
			 */
			uint64_t t0;
			uint64_t t1 = state[1];
#define RhoPi_twice(x) \
	t0 = state[PI_LANE[x  ]]; \
	state[PI_LANE[x  ]] = rotl64(t1, ROT_CONST[x  ]); \
	t1 = state[PI_LANE[x+1]]; \
	state[PI_LANE[x+1]] = rotl64(t0, ROT_CONST[x+1]);
			RhoPi_twice(0); RhoPi_twice(2);
			RhoPi_twice(4); RhoPi_twice(6);
			RhoPi_twice(8); RhoPi_twice(10);
			RhoPi_twice(12); RhoPi_twice(14);
			RhoPi_twice(16); RhoPi_twice(18);
			RhoPi_twice(20); RhoPi_twice(22);
#undef RhoPi_twice
		}
		/* Chi */
# if LONG_MAX > 0x7fffffff
		for (x = 0; x <= 20; x += 5) {
			uint64_t BC0, BC1, BC2, BC3, BC4;
			BC0 = state[x + 0];
			BC1 = state[x + 1];
			BC2 = state[x + 2];
			state[x + 0] = BC0 ^ ((~BC1) & BC2);
			BC3 = state[x + 3];
			state[x + 1] = BC1 ^ ((~BC2) & BC3);
			BC4 = state[x + 4];
			state[x + 2] = BC2 ^ ((~BC3) & BC4);
			state[x + 3] = BC3 ^ ((~BC4) & BC0);
			state[x + 4] = BC4 ^ ((~BC0) & BC1);
		}
# else
		/* Reduced register pressure version
		 * for register-starved 32-bit arches
		 * (i386: -95 bytes, and it is _faster_)
		 */
		for (x = 0; x <= 40;) {
			uint32_t BC0, BC1, BC2, BC3, BC4;
			uint32_t *const s32 = (uint32_t*)state;
# if SHA3_SMALL
 do_half:
# endif
			BC0 = s32[x + 0*2];
			BC1 = s32[x + 1*2];
			BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x++;
# if SHA3_SMALL
			if (x & 1)
				goto do_half;
			x += 8;
# else
			BC0 = s32[x + 0*2];
			BC1 = s32[x + 1*2];
			BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x += 9;
# endif
		}
# endif /* long is 32-bit */
		/* Iota */
		state[0] ^= IOTA_CONST[round]
			| (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
			| (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
	}

	if (BB_BIG_ENDIAN) {
		for (x = 0; x < 25; x++) {
			state[x] = SWAP_LE64(state[x]);
		}
	}
#endif
}

void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
{
	memset(ctx, 0, sizeof(*ctx));
	/* SHA3-512, user can override */
	ctx->input_block_bytes = (1600 - 512*2) / 8; /* 72 bytes */
}

void FAST_FUNC sha3_hash(sha3_ctx_t *ctx, const void *buffer, size_t len)
{
#if SHA3_SMALL
	const uint8_t *data = buffer;
	unsigned bufpos = ctx->bytes_queued;

	while (1) {
		unsigned remaining = ctx->input_block_bytes - bufpos;
		if (remaining > len)
			remaining = len;
		len -= remaining;
		/* XOR data into buffer */
		while (remaining != 0) {
			uint8_t *buf = (uint8_t*)ctx->state;
			buf[bufpos] ^= *data++;
			bufpos++;
			remaining--;
		}

		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
		bufpos -= ctx->input_block_bytes;
		if (bufpos != 0)
			break;

		/* Buffer is filled up, process it */
		sha3_process_block72(ctx->state);
		/*bufpos = 0; - already is */
	}
	ctx->bytes_queued = bufpos + ctx->input_block_bytes;
#else
	/* +50 bytes of code size, but a bit faster because of long-sized XORs */
	const uint8_t *data = buffer;
	unsigned bufpos = ctx->bytes_queued;
	unsigned iblk_bytes = ctx->input_block_bytes;

	/* If there already is data in the queue, continue queuing first */
	if (bufpos != 0) {
		while (len != 0) {
			uint8_t *buf = (uint8_t*)ctx->state;
			buf[bufpos] ^= *data++;
			len--;
			bufpos++;
			if (bufpos == iblk_bytes) {
				bufpos = 0;
				goto do_block;
			}
		}
	}

	/* Absorb complete blocks */
	while (len >= iblk_bytes) {
		/* XOR data onto the beginning of state[].
		 * We try to be efficient - operate one word at a time, not byte.
		 * Careful wrt unaligned access: can't just use "*(long*)data"!
		 */
		unsigned count = iblk_bytes / sizeof(long);
		long *buf = (long*)ctx->state;
		do {
			long v;
			move_from_unaligned_long(v, (long*)data);
			*buf++ ^= v;
			data += sizeof(long);
		} while (--count);
		len -= iblk_bytes;
 do_block:
		sha3_process_block72(ctx->state);
	}

	/* Queue remaining data bytes */
	while (len != 0) {
		uint8_t *buf = (uint8_t*)ctx->state;
		buf[bufpos] ^= *data++;
		bufpos++;
		len--;
	}

	ctx->bytes_queued = bufpos;
#endif
}

unsigned FAST_FUNC sha3_end(sha3_ctx_t *ctx, void *resbuf)
{
	/* Padding */
	uint8_t *buf = (uint8_t*)ctx->state;
	/*
	 * Keccak block padding is: add a 1 bit after the last bit of input,
	 * then add zero bits until the end of the block, and add the last 1 bit
	 * (the last bit in the block) - the "10*1" pattern.
	 * The SHA3 standard appends two additional bits, 01, before that padding:
	 *
	 * SHA3-224(M) = KECCAK[448](M||01, 224)
	 * SHA3-256(M) = KECCAK[512](M||01, 256)
	 * SHA3-384(M) = KECCAK[768](M||01, 384)
	 * SHA3-512(M) = KECCAK[1024](M||01, 512)
	 * (M is the input, || is bit concatenation)
	 *
	 * The 6 below contains the 01 "SHA3" bits and the first 1 "Keccak" bit:
	 */
	buf[ctx->bytes_queued] ^= 6; /* bit pattern 00000110 */
	buf[ctx->input_block_bytes - 1] ^= 0x80;

	sha3_process_block72(ctx->state);

	/* Output */
	memcpy(resbuf, ctx->state, 64);
	return 64;
}
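
/* Sketch of typical use. sha3_begin() defaults to SHA3-512; a caller
 * wanting SHA3-256 would override input_block_bytes with the SHA3-256
 * rate, (1600 - 256*2) / 8 = 136 bytes (how the shaNNNsum applets select
 * the variant is outside this file). The hex string is the standard
 * SHA3-512 "abc" test vector:
 */
#if 0
static void sha3_example(void)
{
	sha3_ctx_t ctx;
	uint8_t digest[64];
	sha3_begin(&ctx);
	/* ctx.input_block_bytes = (1600 - 256*2) / 8; - for SHA3-256 */
	sha3_hash(&ctx, "abc", 3);
	sha3_end(&ctx, digest);
	/* digest now holds b751850b1a57168a5693cd924b6b096e08f621827444f70d
	 *                  884f5d0240d2712e10e116e9192af3c91a7ec57647e39340
	 *                  57340b4cf408d5a56592f8274eec53f0
	 */
}
#endif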