1 /* gf128mul.c - GF(2^128) multiplication functions
2  *
3  * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
4  * Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
5  *
6  * Based on Dr Brian Gladman's (GPL'd) work published at
7  * http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
8  * See the original copyright notice below.
9  *
10  * This program is free software; you can redistribute it and/or modify it
11  * under the terms of the GNU General Public License as published by the Free
12  * Software Foundation; either version 2 of the License, or (at your option)
13  * any later version.
14  */
15 
16 /*
17  ---------------------------------------------------------------------------
18  Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
19 
20  LICENSE TERMS
21 
22  The free distribution and use of this software in both source and binary
23  form is allowed (with or without changes) provided that:
24 
25    1. distributions of this source code include the above copyright
26       notice, this list of conditions and the following disclaimer;
27 
28    2. distributions in binary form include the above copyright
29       notice, this list of conditions and the following disclaimer
30       in the documentation and/or other associated materials;
31 
32    3. the copyright holder's name is not used to endorse products
33       built using this software without specific written permission.
34 
35  ALTERNATIVELY, provided that this notice is retained in full, this product
36  may be distributed under the terms of the GNU General Public License (GPL),
37  in which case the provisions of the GPL apply INSTEAD OF those given above.
38 
39  DISCLAIMER
40 
41  This software is provided 'as is' with no explicit or implied warranties
42  in respect of its properties, including, but not limited to, correctness
43  and/or fitness for purpose.
44  ---------------------------------------------------------------------------
45  Issue 31/01/2006
46 
47  This file provides fast multiplication in GF(2^128) as required by several
48  cryptographic authentication modes
49 */
50 
51 #include <crypto/gf128mul.h>
52 #include <linux/kernel.h>
53 #include <linux/module.h>
54 #include <linux/slab.h>
55 
56 #define gf128mul_dat(q) { \
57 	q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
58 	q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
59 	q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
60 	q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
61 	q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
62 	q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
63 	q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
64 	q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
65 	q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
66 	q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
67 	q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
68 	q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
69 	q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
70 	q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
71 	q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
72 	q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
73 	q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
74 	q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
75 	q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
76 	q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
77 	q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
78 	q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
79 	q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
80 	q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
81 	q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
82 	q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
83 	q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
84 	q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
85 	q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
86 	q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
87 	q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
88 	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
89 }
90 
91 /*
92  * Given a value i in 0..255 as the byte overflow when a field element
93  * in GF(2^128) is multiplied by x^8, the following macro returns the
94  * 16-bit value that must be XOR-ed into the low-degree end of the
95  * product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1.
96  *
97  * There are two versions of the macro, and hence two tables: one for
98  * the "be" convention where the highest-order bit is the coefficient of
99  * the highest-degree polynomial term, and one for the "le" convention
100  * where the highest-order bit is the coefficient of the lowest-degree
101  * polynomial term.  In both cases the values are stored in CPU byte
102  * endianness such that the coefficients are ordered consistently across
103  * bytes, i.e. in the "be" table bits 15..0 of the stored value
104  * correspond to the coefficients of x^15..x^0, and in the "le" table
105  * bits 15..0 correspond to the coefficients of x^0..x^15.
106  *
107  * Therefore, provided that the appropriate byte endianness conversions
108  * are done by the multiplication functions (and these must be in place
109  * anyway to support both little endian and big endian CPUs), the "be"
110  * table can be used for multiplications of both "bbe" and "ble"
111  * elements, and the "le" table can be used for multiplications of both
112  * "lle" and "lbe" elements.
113  */
114 
115 #define xda_be(i) ( \
116 	(i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \
117 	(i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \
118 	(i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \
119 	(i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \
120 )
121 
122 #define xda_le(i) ( \
123 	(i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \
124 	(i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \
125 	(i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \
126 	(i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \
127 )
128 
129 static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
130 static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
131 
132 /*
133  * The following functions multiply a field element by x^8 in
134  * the polynomial field representation.  They use 64-bit word operations
135  * to gain speed but compensate for machine endianness and hence work
136  * correctly on both styles of machine.
137  */
138 
gf128mul_x8_lle(be128 * x)139 static void gf128mul_x8_lle(be128 *x)
140 {
141 	u64 a = be64_to_cpu(x->a);
142 	u64 b = be64_to_cpu(x->b);
143 	u64 _tt = gf128mul_table_le[b & 0xff];
144 
145 	x->b = cpu_to_be64((b >> 8) | (a << 56));
146 	x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
147 }
148 
gf128mul_x8_bbe(be128 * x)149 static void gf128mul_x8_bbe(be128 *x)
150 {
151 	u64 a = be64_to_cpu(x->a);
152 	u64 b = be64_to_cpu(x->b);
153 	u64 _tt = gf128mul_table_be[a >> 56];
154 
155 	x->a = cpu_to_be64((a << 8) | (b >> 56));
156 	x->b = cpu_to_be64((b << 8) ^ _tt);
157 }
158 
gf128mul_x8_ble(le128 * r,const le128 * x)159 void gf128mul_x8_ble(le128 *r, const le128 *x)
160 {
161 	u64 a = le64_to_cpu(x->a);
162 	u64 b = le64_to_cpu(x->b);
163 	u64 _tt = gf128mul_table_be[a >> 56];
164 
165 	r->a = cpu_to_le64((a << 8) | (b >> 56));
166 	r->b = cpu_to_le64((b << 8) ^ _tt);
167 }
168 EXPORT_SYMBOL(gf128mul_x8_ble);
169 
gf128mul_lle(be128 * r,const be128 * b)170 void gf128mul_lle(be128 *r, const be128 *b)
171 {
172 	be128 p[8];
173 	int i;
174 
175 	p[0] = *r;
176 	for (i = 0; i < 7; ++i)
177 		gf128mul_x_lle(&p[i + 1], &p[i]);
178 
179 	memset(r, 0, sizeof(*r));
180 	for (i = 0;;) {
181 		u8 ch = ((u8 *)b)[15 - i];
182 
183 		if (ch & 0x80)
184 			be128_xor(r, r, &p[0]);
185 		if (ch & 0x40)
186 			be128_xor(r, r, &p[1]);
187 		if (ch & 0x20)
188 			be128_xor(r, r, &p[2]);
189 		if (ch & 0x10)
190 			be128_xor(r, r, &p[3]);
191 		if (ch & 0x08)
192 			be128_xor(r, r, &p[4]);
193 		if (ch & 0x04)
194 			be128_xor(r, r, &p[5]);
195 		if (ch & 0x02)
196 			be128_xor(r, r, &p[6]);
197 		if (ch & 0x01)
198 			be128_xor(r, r, &p[7]);
199 
200 		if (++i >= 16)
201 			break;
202 
203 		gf128mul_x8_lle(r);
204 	}
205 }
206 EXPORT_SYMBOL(gf128mul_lle);
207 
gf128mul_bbe(be128 * r,const be128 * b)208 void gf128mul_bbe(be128 *r, const be128 *b)
209 {
210 	be128 p[8];
211 	int i;
212 
213 	p[0] = *r;
214 	for (i = 0; i < 7; ++i)
215 		gf128mul_x_bbe(&p[i + 1], &p[i]);
216 
217 	memset(r, 0, sizeof(*r));
218 	for (i = 0;;) {
219 		u8 ch = ((u8 *)b)[i];
220 
221 		if (ch & 0x80)
222 			be128_xor(r, r, &p[7]);
223 		if (ch & 0x40)
224 			be128_xor(r, r, &p[6]);
225 		if (ch & 0x20)
226 			be128_xor(r, r, &p[5]);
227 		if (ch & 0x10)
228 			be128_xor(r, r, &p[4]);
229 		if (ch & 0x08)
230 			be128_xor(r, r, &p[3]);
231 		if (ch & 0x04)
232 			be128_xor(r, r, &p[2]);
233 		if (ch & 0x02)
234 			be128_xor(r, r, &p[1]);
235 		if (ch & 0x01)
236 			be128_xor(r, r, &p[0]);
237 
238 		if (++i >= 16)
239 			break;
240 
241 		gf128mul_x8_bbe(r);
242 	}
243 }
244 EXPORT_SYMBOL(gf128mul_bbe);
245 
246 /*      This version uses 64k bytes of table space.
247     A 16 byte buffer has to be multiplied by a 16 byte key
248     value in GF(2^128).  If we consider a GF(2^128) value in
249     the buffer's lowest byte, we can construct a table of
250     the 256 16 byte values that result from the 256 values
251     of this byte.  This requires 4096 bytes. But we also
252     need tables for each of the 16 higher bytes in the
253     buffer as well, which makes 64 kbytes in total.
254 */
255 /* additional explanation
256  * t[0][BYTE] contains g*BYTE
257  * t[1][BYTE] contains g*x^8*BYTE
258  *  ..
259  * t[15][BYTE] contains g*x^120*BYTE */
gf128mul_init_64k_bbe(const be128 * g)260 struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g)
261 {
262 	struct gf128mul_64k *t;
263 	int i, j, k;
264 
265 	t = kzalloc(sizeof(*t), GFP_KERNEL);
266 	if (!t)
267 		goto out;
268 
269 	for (i = 0; i < 16; i++) {
270 		t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
271 		if (!t->t[i]) {
272 			gf128mul_free_64k(t);
273 			t = NULL;
274 			goto out;
275 		}
276 	}
277 
278 	t->t[0]->t[1] = *g;
279 	for (j = 1; j <= 64; j <<= 1)
280 		gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]);
281 
282 	for (i = 0;;) {
283 		for (j = 2; j < 256; j += j)
284 			for (k = 1; k < j; ++k)
285 				be128_xor(&t->t[i]->t[j + k],
286 					  &t->t[i]->t[j], &t->t[i]->t[k]);
287 
288 		if (++i >= 16)
289 			break;
290 
291 		for (j = 128; j > 0; j >>= 1) {
292 			t->t[i]->t[j] = t->t[i - 1]->t[j];
293 			gf128mul_x8_bbe(&t->t[i]->t[j]);
294 		}
295 	}
296 
297 out:
298 	return t;
299 }
300 EXPORT_SYMBOL(gf128mul_init_64k_bbe);
301 
gf128mul_free_64k(struct gf128mul_64k * t)302 void gf128mul_free_64k(struct gf128mul_64k *t)
303 {
304 	int i;
305 
306 	for (i = 0; i < 16; i++)
307 		kfree_sensitive(t->t[i]);
308 	kfree_sensitive(t);
309 }
310 EXPORT_SYMBOL(gf128mul_free_64k);
311 
gf128mul_64k_bbe(be128 * a,const struct gf128mul_64k * t)312 void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t)
313 {
314 	u8 *ap = (u8 *)a;
315 	be128 r[1];
316 	int i;
317 
318 	*r = t->t[0]->t[ap[15]];
319 	for (i = 1; i < 16; ++i)
320 		be128_xor(r, r, &t->t[i]->t[ap[15 - i]]);
321 	*a = *r;
322 }
323 EXPORT_SYMBOL(gf128mul_64k_bbe);
324 
325 /*      This version uses 4k bytes of table space.
326     A 16 byte buffer has to be multiplied by a 16 byte key
327     value in GF(2^128).  If we consider a GF(2^128) value in a
328     single byte, we can construct a table of the 256 16 byte
329     values that result from the 256 values of this byte.
330     This requires 4096 bytes. If we take the highest byte in
331     the buffer and use this table to get the result, we then
332     have to multiply by x^120 to get the final value. For the
333     next highest byte the result has to be multiplied by x^112
334     and so on. But we can do this by accumulating the result
335     in an accumulator starting with the result for the top
336     byte.  We repeatedly multiply the accumulator value by
337     x^8 and then add in (i.e. xor) the 16 bytes of the next
338     lower byte in the buffer, stopping when we reach the
339     lowest byte. This requires a 4096 byte table.
340 */
gf128mul_init_4k_lle(const be128 * g)341 struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g)
342 {
343 	struct gf128mul_4k *t;
344 	int j, k;
345 
346 	t = kzalloc(sizeof(*t), GFP_KERNEL);
347 	if (!t)
348 		goto out;
349 
350 	t->t[128] = *g;
351 	for (j = 64; j > 0; j >>= 1)
352 		gf128mul_x_lle(&t->t[j], &t->t[j+j]);
353 
354 	for (j = 2; j < 256; j += j)
355 		for (k = 1; k < j; ++k)
356 			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
357 
358 out:
359 	return t;
360 }
361 EXPORT_SYMBOL(gf128mul_init_4k_lle);
362 
gf128mul_init_4k_bbe(const be128 * g)363 struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g)
364 {
365 	struct gf128mul_4k *t;
366 	int j, k;
367 
368 	t = kzalloc(sizeof(*t), GFP_KERNEL);
369 	if (!t)
370 		goto out;
371 
372 	t->t[1] = *g;
373 	for (j = 1; j <= 64; j <<= 1)
374 		gf128mul_x_bbe(&t->t[j + j], &t->t[j]);
375 
376 	for (j = 2; j < 256; j += j)
377 		for (k = 1; k < j; ++k)
378 			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
379 
380 out:
381 	return t;
382 }
383 EXPORT_SYMBOL(gf128mul_init_4k_bbe);
384 
gf128mul_4k_lle(be128 * a,const struct gf128mul_4k * t)385 void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t)
386 {
387 	u8 *ap = (u8 *)a;
388 	be128 r[1];
389 	int i = 15;
390 
391 	*r = t->t[ap[15]];
392 	while (i--) {
393 		gf128mul_x8_lle(r);
394 		be128_xor(r, r, &t->t[ap[i]]);
395 	}
396 	*a = *r;
397 }
398 EXPORT_SYMBOL(gf128mul_4k_lle);
399 
gf128mul_4k_bbe(be128 * a,const struct gf128mul_4k * t)400 void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t)
401 {
402 	u8 *ap = (u8 *)a;
403 	be128 r[1];
404 	int i = 0;
405 
406 	*r = t->t[ap[0]];
407 	while (++i < 16) {
408 		gf128mul_x8_bbe(r);
409 		be128_xor(r, r, &t->t[ap[i]]);
410 	}
411 	*a = *r;
412 }
413 EXPORT_SYMBOL(gf128mul_4k_bbe);
414 
415 MODULE_LICENSE("GPL");
416 MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
417