1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
4 *
5 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
6 */
7
8/* included by aes-ce.S and aes-neon.S */
9
10	.text
11	.align		4
12
/*
 * MAX_STRIDE is the number of AES blocks that the interleaved code paths in
 * this file process per loop iteration. It defaults to 4 unless it has
 * already been defined before this point (this file is #included by
 * aes-ce.S and aes-neon.S).
 */
#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

/*
 * ST4(...) and ST5(...) wrap a source line so that it is only emitted for
 * the matching stride: with MAX_STRIDE == 4, ST4() expands to its argument
 * and ST5() expands to nothing; for any other value the roles are swapped.
 */
#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
24
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	/*
	 * File-local helper that expands the encrypt_block4x macro (not defined
	 * in this file) on v0-v3. The callers in this file place four input
	 * blocks in v0-v3, reach this helper with bl and consume the results
	 * from v0-v3 afterwards. w3 and x2 are the rounds / rk registers of
	 * those callers; x8 and w7 are handed to the macro as well - presumably
	 * as scratch registers, confirm against the macro definition.
	 */
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)
29
SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	/*
	 * File-local helper that expands the decrypt_block4x macro (not defined
	 * in this file) on v0-v3. The callers in this file place four input
	 * blocks in v0-v3, reach this helper with bl and consume the results
	 * from v0-v3 afterwards. w3 and x2 are the rounds / rk registers of
	 * those callers; x8 and w7 are handed to the macro as well - presumably
	 * as scratch registers, confirm against the macro definition.
	 */
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)
34
35#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	/*
	 * Five-block variant, only assembled when MAX_STRIDE == 5: expands the
	 * encrypt_block5x macro (not defined in this file) on v0-v4. Callers
	 * load the blocks into v0-v4 and read the results from the same
	 * registers. x8 and w7 are handed to the macro together with w3 / x2 -
	 * presumably as scratch registers, confirm against the macro definition.
	 */
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)
40
SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	/*
	 * Five-block variant, only assembled when MAX_STRIDE == 5: expands the
	 * decrypt_block5x macro (not defined in this file) on v0-v4. Callers
	 * load the blocks into v0-v4 and read the results from the same
	 * registers. x8 and w7 are handed to the macro together with w3 / x2 -
	 * presumably as scratch registers, confirm against the macro definition.
	 */
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
45#endif
46
47	/*
48	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
49	 *		   int blocks)
50	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
51	 *		   int blocks)
52	 */
53
AES_FUNC_START(aes_ecb_encrypt)
	/*
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks
	 *
	 * A frame record is set up because the bulk path below reaches the
	 * interleaved block helper with bl, which overwrites x30.
	 */
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x5

	/* Bulk path: MAX_STRIDE blocks per pass for as long as enough remain. */
.Lecbenc_bulk:
	subs		w4, w4, #MAX_STRIDE
	b.mi		.Lecbenc_rest
	ld1		{v0.16b-v3.16b}, [x1], #64	/* first 4 pt blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_encrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.Lecbenc_bulk

	/* Undo the last subtraction: fewer than MAX_STRIDE blocks are left. */
.Lecbenc_rest:
	adds		w4, w4, #MAX_STRIDE
	b.eq		.Lecbenc_done

	/* Remainder path: one block per pass until the count reaches zero. */
.Lecbenc_single:
	ld1		{v0.16b}, [x1], #16
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lecbenc_single

.Lecbenc_done:
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_encrypt)
83
84
AES_FUNC_START(aes_ecb_decrypt)
	/*
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks
	 *
	 * A frame record is set up because the bulk path below reaches the
	 * interleaved block helper with bl, which overwrites x30.
	 */
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	dec_prepare	w3, x2, x5

	/* Bulk path: MAX_STRIDE blocks per pass for as long as enough remain. */
.Lecbdec_bulk:
	subs		w4, w4, #MAX_STRIDE
	b.mi		.Lecbdec_rest
	ld1		{v0.16b-v3.16b}, [x1], #64	/* first 4 ct blocks */
ST4(	bl		aes_decrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_decrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.Lecbdec_bulk

	/* Undo the last subtraction: fewer than MAX_STRIDE blocks are left. */
.Lecbdec_rest:
	adds		w4, w4, #MAX_STRIDE
	b.eq		.Lecbdec_done

	/* Remainder path: one block per pass until the count reaches zero. */
.Lecbdec_single:
	ld1		{v0.16b}, [x1], #16
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lecbdec_single

.Lecbdec_done:
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_decrypt)
114
115
116	/*
117	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
118	 *		   int blocks, u8 iv[])
119	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
120	 *		   int blocks, u8 iv[])
121	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
122	 *			 int rounds, int blocks, u8 iv[],
123	 *			 u32 const rk2[]);
124	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
125	 *			 int rounds, int blocks, u8 iv[],
126	 *			 u32 const rk2[]);
127	 */
128
AES_FUNC_START(aes_essiv_cbc_encrypt)
	/*
	 * ESSIV entry point (x6 = rk2): the IV read from iv[] is first
	 * encrypted under rk2 with a fixed round count of 14. enc_switch_key
	 * (defined outside this file) is then invoked with the rk / rounds
	 * registers x2 / w3 - presumably to make rk the active key - before
	 * control joins the common CBC code below.
	 */
	ld1		{v4.16b}, [x5]			/* v4 = iv */

	mov		w8, #14				/* rounds used with rk2 */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6
	b		.Lcbcenc_quad

AES_FUNC_START(aes_cbc_encrypt)
	/*
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv
	 *
	 * v4 carries the chaining value across blocks. CBC encryption is
	 * serial, so the four-block path only batches the loads and stores;
	 * every block is still encrypted on its own. No bl is used, hence no
	 * frame record.
	 */
	ld1		{v4.16b}, [x5]			/* v4 = iv */
	enc_prepare	w3, x2, x6

.Lcbcenc_quad:
	subs		w4, w4, #4
	b.mi		.Lcbcenc_rest
	ld1		{v0.16b-v3.16b}, [x1], #64	/* four pt blocks */
	eor		v0.16b, v0.16b, v4.16b		/* chain from iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v1.16b, v1.16b, v0.16b		/* chain from ct 0 */
	encrypt_block	v1, w3, x2, x6, w7
	eor		v2.16b, v2.16b, v1.16b		/* chain from ct 1 */
	encrypt_block	v2, w3, x2, x6, w7
	eor		v3.16b, v3.16b, v2.16b		/* chain from ct 2 */
	encrypt_block	v3, w3, x2, x6, w7
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v3.16b			/* ct 3 is the next iv */
	b		.Lcbcenc_quad

	/* Undo the last subtraction: fewer than four blocks are left. */
.Lcbcenc_rest:
	adds		w4, w4, #4
	b.eq		.Lcbcenc_done

	/* One block per pass; v4 is both the chaining value and the result. */
.Lcbcenc_single:
	ld1		{v0.16b}, [x1], #16
	eor		v4.16b, v4.16b, v0.16b
	encrypt_block	v4, w3, x2, x6, w7
	st1		{v4.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lcbcenc_single

.Lcbcenc_done:
	st1		{v4.16b}, [x5]			/* hand the iv back */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)
172
AES_FUNC_START(aes_essiv_cbc_decrypt)
	/*
	 * ESSIV entry point (x6 = rk2): the IV read from iv[] is encrypted
	 * under rk2 with a fixed round count of 14, after which control joins
	 * the plain CBC decryption code at .Lessivcbcdecstart.
	 */
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b		.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	/*
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv
	 *
	 * cbciv (a register alias defined outside this file) carries the
	 * chaining value across blocks and is written back to iv[] on exit.
	 * The frame record is needed because the bulk path uses bl.
	 */
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lcbcdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	/*
	 * The helper replaces v0-v4 with the decrypted blocks, so ct 0-2 are
	 * kept in v5-v7 for the chaining xor. ct 3 and ct 4 are re-read from
	 * memory afterwards (x1 is rewound by 32 bytes): ct 3 goes to v5 once
	 * its previous contents have been consumed, ct 4 becomes the new cbciv
	 * once the old value has been xor-ed into v0.
	 */
	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
	mov		v5.16b, v0.16b
	mov		v6.16b, v1.16b
	mov		v7.16b, v2.16b
	bl		aes_decrypt_block5x
	sub		x1, x1, #32
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v5.16b
#else
	/*
	 * Same scheme for four blocks: ct 0-2 are kept in v4-v6, and ct 3 is
	 * re-read from memory (x1 is rewound by 16 bytes) as the new cbciv
	 * once the old value has been xor-ed into v0.
	 */
	mov		v4.16b, v0.16b
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	bl		aes_decrypt_block4x
	sub		x1, x1, #16
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
#endif
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LcbcdecloopNx
.Lcbcdec1x:
	/* undo the last subtraction: fewer than MAX_STRIDE blocks are left */
	adds		w4, w4, #MAX_STRIDE
	beq		.Lcbcdecout
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov		cbciv.16b, v1.16b		/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	st1		{cbciv.16b}, [x5]		/* return iv */
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)
243
244
245	/*
246	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
247	 *		       int rounds, int bytes, u8 const iv[])
248	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
249	 *		       int rounds, int bytes, u8 const iv[])
250	 */
251
AES_FUNC_START(aes_cbc_cts_encrypt)
	/*
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, x4 = bytes, x5 = iv
	 *
	 * Encrypts one full block followed by a final block of n = bytes - 16
	 * bytes using ciphertext stealing, with 16-byte accesses that overlap
	 * in memory instead of byte-wise copies.
	 * NOTE(review): the table offsets below only stay inside
	 * .Lcts_permute_table for 0 <= n <= 16, so callers presumably pass
	 * 16 < bytes <= 32 - confirm.
	 */
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16			/* x4 = n */
	add		x9, x8, #32
	add		x8, x8, x4			/* table + n */
	sub		x9, x9, x4			/* table + 32 - n */
	ld1		{v3.16b}, [x8]			/* moves bytes 0..n-1 to the top */
	ld1		{v4.16b}, [x9]			/* moves the top n bytes to 0..n-1 */

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]			/* last 16 bytes of the input */

	ld1		{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl		v1.16b, {v1.16b}, v4.16b	/* final block, zero padded */
	encrypt_block	v0, w3, x2, x6, w7

	eor		v1.16b, v1.16b, v0.16b		/* chain with first ct block */
	tbl		v0.16b, {v0.16b}, v3.16b	/* first n ct bytes, shifted up */
	encrypt_block	v1, w3, x2, x6, w7

	/*
	 * The truncated block is written first at out + n so that its n
	 * meaningful bytes end up at out[16 .. 16 + n); the full block stored
	 * at out[0 .. 16) afterwards overwrites the zero bytes in front of them.
	 */
	add		x4, x0, x4
	st1		{v0.16b}, [x4]			/* overlapping stores */
	st1		{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)
280
AES_FUNC_START(aes_cbc_cts_decrypt)
	/*
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, x4 = bytes, x5 = iv
	 *
	 * Inverse of aes_cbc_cts_encrypt: decrypts one full block followed by
	 * a final block of n = bytes - 16 bytes, using overlapping 16-byte
	 * accesses.
	 * NOTE(review): the table offsets below only stay inside
	 * .Lcts_permute_table for 0 <= n <= 16, so callers presumably pass
	 * 16 < bytes <= 32 - confirm.
	 */
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16			/* x4 = n */
	add		x9, x8, #32
	add		x8, x8, x4			/* table + n */
	sub		x9, x9, x4			/* table + 32 - n */
	ld1		{v3.16b}, [x8]			/* moves bytes 0..n-1 to the top */
	ld1		{v4.16b}, [x9]			/* moves the top n bytes to 0..n-1 */

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]			/* last 16 bytes of the input */

	ld1		{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl		v2.16b, {v0.16b}, v3.16b	/* first n bytes, shifted up */
	eor		v2.16b, v2.16b, v1.16b		/* top n bytes = final pt bytes */

	/*
	 * tbx only replaces bytes 0..n-1 of v0 with the final ciphertext
	 * bytes; the remaining bytes of the intermediate block are kept.
	 */
	tbx		v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */

	/*
	 * Store the shifted vector at out + n first, then the full block at
	 * out, so that only the top n bytes of v2 survive at out[16 .. 16 + n).
	 */
	add		x4, x0, x4
	st1		{v2.16b}, [x4]			/* overlapping stores */
	st1		{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)
309
	.section	".rodata", "a"
	.align		6
	/*
	 * 48-byte index table for tbl/tbx: 16 x 0xff, the identity indices
	 * 0x0 .. 0xf, then 16 x 0xff again. The users in this file load 16
	 * bytes (32 in .Lctrtail1x) starting at a data-dependent offset and use
	 * them as index vectors, which shifts a block up or down by a number of
	 * byte positions. An index of 0xff is out of range for a one-register
	 * table, so tbl writes zero to that byte and tbx leaves it unchanged.
	 */
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
320
321
322	/*
323	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
324	 *		   int bytes, u8 ctr[])
325	 */
326
AES_FUNC_START(aes_ctr_encrypt)
	/*
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = bytes, x5 = ctr
	 *
	 * vctr (a register alias defined outside this file) holds the counter
	 * block. x12 mirrors bytes 8-15 of it in byte-reversed form so that the
	 * counter can be advanced with scalar arithmetic; a carry out of those
	 * 64 bits is propagated into bytes 0-7 by the out-of-line code in
	 * subsection 1.
	 *
	 * NOTE(review): .Lctrtail1x steps x0 / x1 backwards so that its 16-byte
	 * accesses end at the end of the data. For a request shorter than 16
	 * bytes this touches memory below in[] / out[]; callers presumably
	 * guarantee that this memory is accessible - confirm.
	 * NOTE(review): bytes == 0 also ends up in .Lctrtail1x; callers
	 * presumably never pass it - confirm.
	 */
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x12
	ld1		{vctr.16b}, [x5]

	umov		x12, vctr.d[1]		/* keep swabbed ctr in reg */
	rev		x12, x12

.LctrloopNx:
	/*
	 * w7 = number of counter values used up by this pass, i.e.
	 *      min((bytes + 15) / 16, MAX_STRIDE)
	 * w4 = bytes left after a full stride; a negative value means that
	 *      this pass is the final, partial one (see the tbnz below)
	 */
	add		w7, w4, #15
	sub		w4, w4, #MAX_STRIDE << 4
	lsr		w7, w7, #4
	mov		w8, #MAX_STRIDE
	cmp		w7, w8
	csel		w7, w7, w8, lt
	adds		x12, x12, x7		/* carry set: low 64 bits wrapped */

	/* start all keystream inputs as copies of the current counter block */
	mov		v0.16b, vctr.16b
	mov		v1.16b, vctr.16b
	mov		v2.16b, vctr.16b
	mov		v3.16b, vctr.16b
ST5(	mov		v4.16b, vctr.16b		)
	bcs		0f			/* flags still from the adds */

	.subsection	1
	/* apply carry to outgoing counter */
0:	umov		x8, vctr.d[0]
	rev		x8, x8
	add		x8, x8, #1
	rev		x8, x8
	ins		vctr.d[0], x8

	/*
	 * apply carry to N counter blocks for N := x12
	 *
	 * Each "bti c; mov" pair below is 8 bytes, so branching to
	 * 1f - 8 * x12 executes only the last x12 mov instructions. bti c is
	 * the landing pad for the indirect branch through x16.
	 */
	cbz		x12, 2f
	adr		x16, 1f
	sub		x16, x16, x12, lsl #3
	br		x16
	bti		c
	mov		v0.d[0], vctr.d[0]
	bti		c
	mov		v1.d[0], vctr.d[0]
	bti		c
	mov		v2.d[0], vctr.d[0]
	bti		c
	mov		v3.d[0], vctr.d[0]
ST5(	bti		c				)
ST5(	mov		v4.d[0], vctr.d[0]		)
1:	b		2f
	.previous

	/*
	 * Write the advanced low half back into vctr, and give v1 .. v3/v4 the
	 * low halves x12 - (MAX_STRIDE - 1) .. x12 - 1. v0 keeps the low half
	 * it was copied with above.
	 */
2:	rev		x7, x12
	ins		vctr.d[1], x7
	sub		x7, x12, #MAX_STRIDE - 1
	sub		x8, x12, #MAX_STRIDE - 2
	sub		x9, x12, #MAX_STRIDE - 3
	rev		x7, x7
	rev		x8, x8
	mov		v1.d[1], x7
	rev		x9, x9
ST5(	sub		x10, x12, #MAX_STRIDE - 4	)
	mov		v2.d[1], x8
ST5(	rev		x10, x10			)
	mov		v3.d[1], x9
ST5(	mov		v4.d[1], x10			)
	tbnz		w4, #31, .Lctrtail	/* w4 < 0: less than a full stride */

	/*
	 * Full stride: the first three input blocks are loaded before the
	 * call, the remaining one (two for MAX_STRIDE == 5) are loaded into
	 * v5 (v5-v6) after the previous contents have been consumed.
	 */
	ld1		{v5.16b-v7.16b}, [x1], #48
ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)
	eor		v0.16b, v5.16b, v0.16b
ST4(	ld1		{v5.16b}, [x1], #16		)
	eor		v1.16b, v6.16b, v1.16b
ST5(	ld1		{v5.16b-v6.16b}, [x1], #32	)
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
ST5(	eor		v4.16b, v6.16b, v4.16b		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	cbz		w4, .Lctrout
	b		.LctrloopNx

.Lctrout:
	st1		{vctr.16b}, [x5]	/* return next CTR value */
	ldp		x29, x30, [sp], #16
	ret

.Lctrtail:
	/* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
	mov		x16, #16
	ands		x6, x4, #0xf		/* x6 = remaining bytes % 16 */
	csel		x13, x6, x16, ne	/* x13 = size of last block, 1..16 */

	/*
	 * x14 / x15 / x16 are post-increment strides for the loads and stores
	 * below: 16 if more than 64 / 48 / 32 bytes remain, otherwise 0, in
	 * which case the following access hits the same address again.
	 */
ST5(	cmp		w4, #64 - (MAX_STRIDE << 4)	)
ST5(	csel		x14, x16, xzr, gt		)
	cmp		w4, #48 - (MAX_STRIDE << 4)
	csel		x15, x16, xzr, gt
	cmp		w4, #32 - (MAX_STRIDE << 4)
	csel		x16, x16, xzr, gt
	cmp		w4, #16 - (MAX_STRIDE << 4)

	adr_l		x12, .Lcts_permute_table
	add		x12, x12, x13
	ble		.Lctrtail1x		/* flags from the last cmp: <= 16 bytes */

ST5(	ld1		{v5.16b}, [x1], x14		)
	ld1		{v6.16b}, [x1], x15
	ld1		{v7.16b}, [x1], x16

ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)

	/*
	 * v8 = last but one block, v9 = the 16 bytes that end at the end of the
	 * input (overlapping v8 when x13 < 16), v10 = permute vector that
	 * shifts the last keystream block so that it lines up with v9.
	 */
	ld1		{v8.16b}, [x1], x13
	ld1		{v9.16b}, [x1]
	ld1		{v10.16b}, [x12]

ST4(	eor		v6.16b, v6.16b, v0.16b		)
ST4(	eor		v7.16b, v7.16b, v1.16b		)
ST4(	tbl		v3.16b, {v3.16b}, v10.16b	)
ST4(	eor		v8.16b, v8.16b, v2.16b		)
ST4(	eor		v9.16b, v9.16b, v3.16b		)

ST5(	eor		v5.16b, v5.16b, v0.16b		)
ST5(	eor		v6.16b, v6.16b, v1.16b		)
ST5(	tbl		v4.16b, {v4.16b}, v10.16b	)
ST5(	eor		v7.16b, v7.16b, v2.16b		)
ST5(	eor		v8.16b, v8.16b, v3.16b		)
ST5(	eor		v9.16b, v9.16b, v4.16b		)

	/* v9 is stored before v8 so that v8 wins where the two overlap */
ST5(	st1		{v5.16b}, [x0], x14		)
	st1		{v6.16b}, [x0], x15
	st1		{v7.16b}, [x0], x16
	add		x13, x13, x0
	st1		{v9.16b}, [x13]		// overlapping stores
	st1		{v8.16b}, [x0]
	b		.Lctrout

.Lctrtail1x:
	/*
	 * At most 16 bytes remain. The flags are still those of the last cmp
	 * in .Lctrtail: eq means exactly 16 bytes (x6 == 0, pointers stay put);
	 * otherwise x6 becomes (remaining - 16) and both pointers move back so
	 * that the 16-byte accesses end at the end of the data.
	 */
	sub		x7, x6, #16
	csel		x6, x6, x7, eq
	add		x1, x1, x6
	add		x0, x0, x6
	ld1		{v5.16b}, [x1]
	ld1		{v6.16b}, [x0]		/* current output, for the merge below */
ST5(	mov		v3.16b, v4.16b			)
	encrypt_block	v3, w3, x2, x8, w7
	/*
	 * v10 shifts the keystream to the top of the vector. v11 is turned
	 * into a byte mask by the arithmetic shift (0xff entries stay 0xff,
	 * index entries become 0), and bif keeps the new bytes where the mask
	 * is set while taking the previous output bytes from v6 elsewhere.
	 */
	ld1		{v10.16b-v11.16b}, [x12]
	tbl		v3.16b, {v3.16b}, v10.16b
	sshr		v11.16b, v11.16b, #7
	eor		v5.16b, v5.16b, v3.16b
	bif		v5.16b, v6.16b, v11.16b
	st1		{v5.16b}, [x0]
	b		.Lctrout
AES_FUNC_END(aes_ctr_encrypt)
481
482
483	/*
484	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
485	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
486	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
487	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
488	 */
489
	/*
	 * next_tweak out, in, tmp
	 *
	 * Computes the next XTS tweak: \out = \in shifted left by one bit as a
	 * 128-bit value, with the constant from xtsmask folded in when a bit is
	 * shifted out. \tmp is clobbered; \out may be the same register as \in.
	 * xtsmask must have been set up with xts_load_mask (64-bit lane 0 = 0x1,
	 * lane 1 = 0x87).
	 *
	 * sshr/and: per 64-bit lane, pick the lane's mask value if its top bit
	 *           is set, otherwise 0
	 * add:      shift each 64-bit lane left by one
	 * ext:      swap the two lanes of \tmp, so the carry out of the low lane
	 *           (0x1) lands in the high lane and the reduction constant
	 *           (0x87) for a carry out of the high lane lands in the low lane
	 */
	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm
497
	/*
	 * xts_load_mask tmp
	 *
	 * Builds the constant used by next_tweak in xtsmask (a register alias
	 * defined outside this file); \tmp is clobbered. The two movi
	 * instructions produce the 32-bit lanes { 1, 1, 0, 0 } and
	 * { 0x87, 0x87, 0, 0 }; uzp1 keeps the even lanes of both, giving
	 * { 1, 0, 0x87, 0 }, i.e. 64-bit lane 0 = 0x1 and lane 1 = 0x87.
	 */
	.macro		xts_load_mask, tmp
	movi		xtsmask.2s, #0x1
	movi		\tmp\().2s, #0x87
	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
503
AES_FUNC_START(aes_xts_encrypt)
	/*
	 * x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = bytes, x5 = rk2,
	 * x6 = iv, w7 = first
	 *
	 * v4 holds the current tweak and is written back to iv[] on exit. When
	 * first is non-zero the tweak read from iv[] is encrypted under rk2
	 * before use; otherwise it is advanced with next_tweak. A byte count
	 * that is not a multiple of 16 is handled with ciphertext stealing
	 * (.Lxtsenccts / .LxtsencctsNx).
	 *
	 * xts_cts_skip_tw, xts_reload_mask and enc_switch_key are macros
	 * defined outside this file.
	 */
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	cbz		w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8
	b		.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs		w4, w4, #64
	bmi		.Lxtsenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	/* tweaks for the four blocks: v4, v5, v6, v7 */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsencret
	xts_reload_mask	v8
	b		.LxtsencloopNx
.Lxtsenc1x:
	adds		w4, w4, #64
	/*
	 * Zero here is only possible when the function was entered with
	 * bytes == 0 (a completed 4-block pass leaves via the cbz above).
	 * Nothing has been encrypted in that case and v0 was never loaded, so
	 * skip the store at .Lxtsencout and only write back the tweak.
	 */
	beq		.Lxtsencret
	subs		w4, w4, #16
	bmi		.LxtsencctsNx		/* < 16 bytes left after 4x passes */
.Lxtsencloop:
	ld1		{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor		v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	cbz		w4, .Lxtsencout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8		/* does not touch the flags */
	bmi		.Lxtsenccts		/* a partial block follows */
	st1		{v0.16b}, [x0], #16
	b		.Lxtsencloop
.Lxtsencout:
	st1		{v0.16b}, [x0]
.Lxtsencret:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

.LxtsencctsNx:
	/*
	 * The previous full block came out of the 4-block path: take it from
	 * v3 and step x0 back to where it was stored.
	 */
	mov		v0.16b, v3.16b
	sub		x0, x0, #16
.Lxtsenccts:
	/*
	 * Ciphertext stealing. v0 = ciphertext of the last full block, whose
	 * output slot is at x0; w4 = (size of the final partial block) - 16.
	 */
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	/*
	 * v2 = leading bytes of the previous ciphertext, shifted up so that
	 * they land in the final partial output block; v0 = final input bytes
	 * merged over the start of the previous ciphertext, which then goes
	 * through one more encryption via .Lxtsencctsout (w4 = 0 makes that
	 * pass store the result at x0 and finish).
	 */
	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b
	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)
590
AES_FUNC_START(aes_xts_decrypt)
	/*
	 * x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = bytes, x5 = rk2,
	 * x6 = iv, w7 = first
	 *
	 * v4 holds the current tweak and is written back to iv[] on exit. The
	 * tweak itself is always produced with the encryption direction
	 * (enc_prepare / encrypt_block under rk2); the data blocks use
	 * dec_prepare / decrypt_block under rk1.
	 *
	 * xts_cts_skip_tw and xts_reload_mask are macros defined outside this
	 * file.
	 */
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	/*
	 * subtract 16 bytes if we are doing CTS
	 *
	 * This makes the block loops stop one full block early, so that the
	 * last full block and the partial block are handled together in
	 * .Lxtsdeccts.
	 */
	sub		w8, w4, #0x10
	tst		w4, #0xf
	csel		w4, w4, w8, eq

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz		w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b		.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs		w4, w4, #64
	bmi		.Lxtsdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	/* tweaks for the four blocks: v4, v5, v6, v7 */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsdecout
	xts_reload_mask	v8
	b		.LxtsdecloopNx
.Lxtsdec1x:
	adds		w4, w4, #64
	beq		.Lxtsdecout
	subs		w4, w4, #16
.Lxtsdecloop:
	ld1		{v0.16b}, [x1], #16
	/*
	 * The flags are those of the preceding subs (neither ld1 nor
	 * next_tweak changes them). Negative means the block just loaded is
	 * the last full block of a CTS request.
	 */
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	eor		v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	cbz		w4, .Lxtsdecout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

.Lxtsdeccts:
	/*
	 * Ciphertext stealing. v0 = last full ciphertext block, x0 = its output
	 * slot, w4 = (size of the final partial block) - 16. The last full
	 * block is decrypted with the following tweak (v5), and the block
	 * rebuilt from the partial input is then decrypted with the current
	 * tweak (v4) via .Lxtsdecctsout.
	 */
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]		/* must precede decrypt_block: x8 is reused there */
	ld1		{v3.16b}, [x9]

	eor		v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v5.16b

	/*
	 * v2 = leading bytes of the decrypted block, shifted up so that they
	 * land in the final partial output block; v0 = final input bytes
	 * merged over the start of the decrypted block.
	 */
	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b

	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr			/* makes .Lxtsdecctsout finish after one block */
	b		.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)
684
685	/*
686	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
687	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
688	 */
AES_FUNC_START(aes_mac_update)
	/*
	 * x0 = in, x1 = rk, w2 = rounds, w3 = blocks, x4 = dg,
	 * w5 = enc_before, w6 = enc_after
	 *
	 * Folds blocks into the digest in v0: each block is xor-ed into v0 and
	 * v0 is then encrypted. The encryption after the very last block is
	 * only performed when enc_after is non-zero. Returns in w0 the number
	 * of blocks that have not been consumed (non-zero only when the loop
	 * is left early through cond_yield, a macro defined outside this file
	 * that presumably branches to the given label when the caller should
	 * yield - confirm).
	 */
	ld1		{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz		w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8		/* enc_before: encrypt dg first */

.Lmacloop4x:
	subs		w3, w3, #4
	bmi		.Lmac1x
	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next 4 pt blocks */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v4.16b
	/*
	 * x5 = enc_after if this was the last block, all ones otherwise, so
	 * the final encryption is skipped only for the last block and only
	 * when enc_after is zero.
	 */
	cmp		w3, wzr
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1		{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8
	b		.Lmacloop4x
.Lmac1x:
	add		w3, w3, #4			/* undo the last subtraction */
.Lmacloop:
	cbz		w3, .Lmacout
	ld1		{v1.16b}, [x0], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	/* same last-block / enc_after selection as in the 4-block loop */
	subs		w3, w3, #1
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmacout

	/* NOTE(review): .Lmacenc is not referenced anywhere in the visible source */
.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmacloop

.Lmacout:
	st1		{v0.16b}, [x4]			/* return dg */
	mov		w0, w3				/* blocks left unprocessed */
	ret
AES_FUNC_END(aes_mac_update)
734