/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

/* The BLAKE2s initialization vector: the same constants as SHA-256. */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
/* pshufb mask rotating each 32-bit lane right by 16 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
/* pshufb mask rotating each 32-bit lane right by 8 bits. */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
/*
 * The ten BLAKE2s message schedule (SIGMA) rows, with the indices
 * pre-permuted to match the four-at-a-time gather order used by the
 * round loop below.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
/*
 * Message schedule for the AVX-512 path. Because vpermi2d reshuffles
 * the in-register message words every round, each row here is expressed
 * relative to the permutation already applied in the previous round.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */

.text
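/*
 * A hedged sketch of the assumed C prototype, inferred from the
 * register usage below (System V AMD64 ABI):
 *
 *	void blake2s_compress_ssse3(struct blake2s_state *state,
 *				    const u8 *block, size_t nblocks,
 *				    u32 inc);
 *
 * %rdi = state (h[0..7], then the t/f words at offset 0x20),
 * %rsi = message, %rdx = number of 64-byte blocks to compress,
 * %rcx = amount to increment the t counter by per block.
 */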
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop		/* nothing to do for nblocks == 0 */
	movdqu		(%rdi),%xmm0		/* row a: h[0..3] */
	movdqu		0x10(%rdi),%xmm1	/* row b: h[4..7] */
	movdqa		ROT16(%rip),%xmm12	/* rotate-by-16 pshufb mask */
	movdqa		ROR328(%rip),%xmm13	/* rotate-by-8 pshufb mask */
	movdqu		0x20(%rdi),%xmm14	/* t[0..1], f[0..1] */
	movq		%rcx,%xmm15		/* counter increment */
	leaq		SIGMA+0xa0(%rip),%r8	/* end of the ten SIGMA rows */
	jmp		.Lbeginofloop
	.align		32
.Lbeginofloop:
	movdqa		%xmm0,%xmm10		/* save h for the feed-forward */
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14		/* t += inc */
	movdqa		IV(%rip),%xmm2		/* row c: IV[0..3] */
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3	/* row d: IV[4..7] ^ (t, f) */
	leaq		SIGMA(%rip),%rcx
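/*
 * Each .Lroundloop iteration is one full BLAKE2s round: four column
 * G functions followed by four diagonal G functions, computed in
 * parallel with the state rows held in %xmm0..%xmm3. For reference,
 * one scalar G (notation only, not code present in this file):
 *
 *	a += b + m0; d = ror32(d ^ a, 16);
 *	c += d;      b = ror32(b ^ c, 12);
 *	a += b + m1; d = ror32(d ^ a, 8);
 *	c += d;      b = ror32(b ^ c, 7);
 */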
.Lroundloop:
	/* Gather the first four message words per the SIGMA row. */
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	/* First half of the column G: rotates by 16 and 12. */
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	/* Second half of the column G: rotates by 8 and 7. */
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Diagonalize so the diagonal Gs line up as columns. */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	/* First half of the diagonal G. */
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	/* Second half of the diagonal G. */
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Undiagonalize and advance to the next SIGMA row. */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	pxor		%xmm2,%xmm0		/* h ^= v[0..7] ^ v[8..15] */
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0		/* feed-forward with saved h */
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi		/* advance to the next block */
	decq		%rdx
	jnz		.Lbeginofloop
	movdqu		%xmm0,(%rdi)		/* store updated h and t/f */
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

#ifdef CONFIG_AS_AVX512
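/*
 * AVX-512VL variant. Assumed to take the same arguments as the SSSE3
 * version above; vprord supplies the 32-bit rotates directly and
 * vpermi2d performs the per-round message gathers.
 */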
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu		(%rdi),%xmm0		/* row a: h[0..3] */
	vmovdqu		0x10(%rdi),%xmm1	/* row b: h[4..7] */
	vmovdqu		0x20(%rdi),%xmm4	/* t[0..1], f[0..1] */
	vmovq		%rcx,%xmm5		/* counter increment */
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa		%xmm0,%xmm10		/* save h for the feed-forward */
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4	/* t += inc */
	vmovdqa		%xmm14,%xmm2		/* row c: IV[0..3] */
	vpxor		%xmm15,%xmm4,%xmm3	/* row d: IV[4..7] ^ (t, f) */
	vmovdqu		(%rsi),%ymm6		/* m[0..7] */
	vmovdqu		0x20(%rsi),%ymm7	/* m[8..15] */
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl		/* ten rounds */
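/*
 * Each round loads one SIGMA2 row as two ymm index vectors, gathers the
 * round's message words with vpermi2d, and keeps the permuted words in
 * %ymm6/%ymm7 so the next round's (relative) row applies on top of them.
 */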
.Lblake2s_compress_avx512_roundloop:
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8	/* gather this round's m words */
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6		/* keep for the next round */
	vmovdqa		%ymm9,%ymm7
	/* Column G functions: rotates by 16, 12, 8, 7. */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Diagonalize, run the diagonal Gs, then undiagonalize. */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	vpxor		%xmm10,%xmm0,%xmm0	/* h ^= v[0..7] ^ v[8..15] */
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	vmovdqu		%xmm0,(%rdi)		/* store updated h and t/f */
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */