// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */
10
/*
 * AVX2 implementation of RAID-6 syndrome functions
 *
 */
15
16 #include <linux/raid/pq.h>
17 #include "x86.h"
18
19 static const struct raid6_avx2_constants {
20 u64 x1d[4];
21 } raid6_avx2_constants __aligned(32) = {
22 { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
23 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
24 };
25
raid6_have_avx2(void)26 static int raid6_have_avx2(void)
27 {
28 return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
29 }
30
/*
 * Plain AVX2 implementation
 */
raid6_avx21_gen_syndrome(int disks,size_t bytes,void ** ptrs)34 static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
35 {
36 u8 **dptr = (u8 **)ptrs;
37 u8 *p, *q;
38 int d, z, z0;
39
40 z0 = disks - 3; /* Highest data disk */
41 p = dptr[z0+1]; /* XOR parity */
42 q = dptr[z0+2]; /* RS syndrome */
43
44 kernel_fpu_begin();
45
46 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
47 asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* Zero temp */
48
49 for (d = 0; d < bytes; d += 32) {
50 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
51 asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
52 asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
53 asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
54 asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
55 for (z = z0-2; z >= 0; z--) {
56 asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
57 asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
58 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
59 asm volatile("vpand %ymm0,%ymm5,%ymm5");
60 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
61 asm volatile("vpxor %ymm6,%ymm2,%ymm2");
62 asm volatile("vpxor %ymm6,%ymm4,%ymm4");
63 asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
64 }
65 asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
66 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
67 asm volatile("vpand %ymm0,%ymm5,%ymm5");
68 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
69 asm volatile("vpxor %ymm6,%ymm2,%ymm2");
70 asm volatile("vpxor %ymm6,%ymm4,%ymm4");
71
72 asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
73 asm volatile("vpxor %ymm2,%ymm2,%ymm2");
74 asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
75 asm volatile("vpxor %ymm4,%ymm4,%ymm4");
76 }
77
78 asm volatile("sfence" : : : "memory");
79 kernel_fpu_end();
80 }
81
raid6_avx21_xor_syndrome(int disks,int start,int stop,size_t bytes,void ** ptrs)82 static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
83 size_t bytes, void **ptrs)
84 {
85 u8 **dptr = (u8 **)ptrs;
86 u8 *p, *q;
87 int d, z, z0;
88
89 z0 = stop; /* P/Q right side optimization */
90 p = dptr[disks-2]; /* XOR parity */
91 q = dptr[disks-1]; /* RS syndrome */
92
93 kernel_fpu_begin();
94
95 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
96
97 for (d = 0 ; d < bytes ; d += 32) {
98 asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
99 asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
100 asm volatile("vpxor %ymm4,%ymm2,%ymm2");
101 /* P/Q data pages */
102 for (z = z0-1 ; z >= start ; z--) {
103 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
104 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
105 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
106 asm volatile("vpand %ymm0,%ymm5,%ymm5");
107 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
108 asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
109 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
110 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
111 }
112 /* P/Q left side optimization */
113 for (z = start-1 ; z >= 0 ; z--) {
114 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
115 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
116 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
117 asm volatile("vpand %ymm0,%ymm5,%ymm5");
118 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
119 }
120 asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
121 /* Don't use movntdq for r/w memory area < cache line */
122 asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
123 asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
124 }
125
126 asm volatile("sfence" : : : "memory");
127 kernel_fpu_end();
128 }
129
130 const struct raid6_calls raid6_avx2x1 = {
131 raid6_avx21_gen_syndrome,
132 raid6_avx21_xor_syndrome,
133 raid6_have_avx2,
134 "avx2x1",
135 .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */
136 };
137
/*
 * Unrolled-by-2 AVX2 implementation
 */
raid6_avx22_gen_syndrome(int disks,size_t bytes,void ** ptrs)141 static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
142 {
143 u8 **dptr = (u8 **)ptrs;
144 u8 *p, *q;
145 int d, z, z0;
146
147 z0 = disks - 3; /* Highest data disk */
148 p = dptr[z0+1]; /* XOR parity */
149 q = dptr[z0+2]; /* RS syndrome */
150
151 kernel_fpu_begin();
152
153 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
154 asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
155
156 /* We uniformly assume a single prefetch covers at least 32 bytes */
157 for (d = 0; d < bytes; d += 64) {
158 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
159 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
160 asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
161 asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
162 asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
163 asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
164 for (z = z0-1; z >= 0; z--) {
165 asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
166 asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
167 asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
168 asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
169 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
170 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
171 asm volatile("vpand %ymm0,%ymm5,%ymm5");
172 asm volatile("vpand %ymm0,%ymm7,%ymm7");
173 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
174 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
175 asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
176 asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
177 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
178 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
179 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
180 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
181 }
182 asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
183 asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
184 asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
185 asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
186 }
187
188 asm volatile("sfence" : : : "memory");
189 kernel_fpu_end();
190 }
191
raid6_avx22_xor_syndrome(int disks,int start,int stop,size_t bytes,void ** ptrs)192 static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
193 size_t bytes, void **ptrs)
194 {
195 u8 **dptr = (u8 **)ptrs;
196 u8 *p, *q;
197 int d, z, z0;
198
199 z0 = stop; /* P/Q right side optimization */
200 p = dptr[disks-2]; /* XOR parity */
201 q = dptr[disks-1]; /* RS syndrome */
202
203 kernel_fpu_begin();
204
205 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
206
207 for (d = 0 ; d < bytes ; d += 64) {
208 asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
209 asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
210 asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
211 asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
212 asm volatile("vpxor %ymm4,%ymm2,%ymm2");
213 asm volatile("vpxor %ymm6,%ymm3,%ymm3");
214 /* P/Q data pages */
215 for (z = z0-1 ; z >= start ; z--) {
216 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
217 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
218 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
219 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
220 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
221 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
222 asm volatile("vpand %ymm0,%ymm5,%ymm5");
223 asm volatile("vpand %ymm0,%ymm7,%ymm7");
224 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
225 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
226 asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
227 asm volatile("vmovdqa %0,%%ymm7"
228 :: "m" (dptr[z][d+32]));
229 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
230 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
231 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
232 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
233 }
234 /* P/Q left side optimization */
235 for (z = start-1 ; z >= 0 ; z--) {
236 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
237 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
238 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
239 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
240 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
241 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
242 asm volatile("vpand %ymm0,%ymm5,%ymm5");
243 asm volatile("vpand %ymm0,%ymm7,%ymm7");
244 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
245 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
246 }
247 asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
248 asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
249 /* Don't use movntdq for r/w memory area < cache line */
250 asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
251 asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
252 asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
253 asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
254 }
255
256 asm volatile("sfence" : : : "memory");
257 kernel_fpu_end();
258 }
259
260 const struct raid6_calls raid6_avx2x2 = {
261 raid6_avx22_gen_syndrome,
262 raid6_avx22_xor_syndrome,
263 raid6_have_avx2,
264 "avx2x2",
265 .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */
266 };
267
268 #ifdef CONFIG_X86_64
269
/*
 * Unrolled-by-4 AVX2 implementation
 */
raid6_avx24_gen_syndrome(int disks,size_t bytes,void ** ptrs)273 static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
274 {
275 u8 **dptr = (u8 **)ptrs;
276 u8 *p, *q;
277 int d, z, z0;
278
279 z0 = disks - 3; /* Highest data disk */
280 p = dptr[z0+1]; /* XOR parity */
281 q = dptr[z0+2]; /* RS syndrome */
282
283 kernel_fpu_begin();
284
285 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
286 asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
287 asm volatile("vpxor %ymm2,%ymm2,%ymm2"); /* P[0] */
288 asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* P[1] */
289 asm volatile("vpxor %ymm4,%ymm4,%ymm4"); /* Q[0] */
290 asm volatile("vpxor %ymm6,%ymm6,%ymm6"); /* Q[1] */
291 asm volatile("vpxor %ymm10,%ymm10,%ymm10"); /* P[2] */
292 asm volatile("vpxor %ymm11,%ymm11,%ymm11"); /* P[3] */
293 asm volatile("vpxor %ymm12,%ymm12,%ymm12"); /* Q[2] */
294 asm volatile("vpxor %ymm14,%ymm14,%ymm14"); /* Q[3] */
295
296 for (d = 0; d < bytes; d += 128) {
297 for (z = z0; z >= 0; z--) {
298 asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
299 asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
300 asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
301 asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
302 asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
303 asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
304 asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
305 asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
306 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
307 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
308 asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
309 asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
310 asm volatile("vpand %ymm0,%ymm5,%ymm5");
311 asm volatile("vpand %ymm0,%ymm7,%ymm7");
312 asm volatile("vpand %ymm0,%ymm13,%ymm13");
313 asm volatile("vpand %ymm0,%ymm15,%ymm15");
314 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
315 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
316 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
317 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
318 asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
319 asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
320 asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
321 asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
322 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
323 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
324 asm volatile("vpxor %ymm13,%ymm10,%ymm10");
325 asm volatile("vpxor %ymm15,%ymm11,%ymm11");
326 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
327 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
328 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
329 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
330 }
331 asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
332 asm volatile("vpxor %ymm2,%ymm2,%ymm2");
333 asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
334 asm volatile("vpxor %ymm3,%ymm3,%ymm3");
335 asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
336 asm volatile("vpxor %ymm10,%ymm10,%ymm10");
337 asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
338 asm volatile("vpxor %ymm11,%ymm11,%ymm11");
339 asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
340 asm volatile("vpxor %ymm4,%ymm4,%ymm4");
341 asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
342 asm volatile("vpxor %ymm6,%ymm6,%ymm6");
343 asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
344 asm volatile("vpxor %ymm12,%ymm12,%ymm12");
345 asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
346 asm volatile("vpxor %ymm14,%ymm14,%ymm14");
347 }
348
349 asm volatile("sfence" : : : "memory");
350 kernel_fpu_end();
351 }
352
raid6_avx24_xor_syndrome(int disks,int start,int stop,size_t bytes,void ** ptrs)353 static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
354 size_t bytes, void **ptrs)
355 {
356 u8 **dptr = (u8 **)ptrs;
357 u8 *p, *q;
358 int d, z, z0;
359
360 z0 = stop; /* P/Q right side optimization */
361 p = dptr[disks-2]; /* XOR parity */
362 q = dptr[disks-1]; /* RS syndrome */
363
364 kernel_fpu_begin();
365
366 asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
367
368 for (d = 0 ; d < bytes ; d += 128) {
369 asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
370 asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
371 asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
372 asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
373 asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
374 asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
375 asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
376 asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
377 asm volatile("vpxor %ymm4,%ymm2,%ymm2");
378 asm volatile("vpxor %ymm6,%ymm3,%ymm3");
379 asm volatile("vpxor %ymm12,%ymm10,%ymm10");
380 asm volatile("vpxor %ymm14,%ymm11,%ymm11");
381 /* P/Q data pages */
382 for (z = z0-1 ; z >= start ; z--) {
383 asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
384 asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
385 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
386 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
387 asm volatile("vpxor %ymm13,%ymm13,%ymm13");
388 asm volatile("vpxor %ymm15,%ymm15,%ymm15");
389 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
390 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
391 asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
392 asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
393 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
394 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
395 asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
396 asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
397 asm volatile("vpand %ymm0,%ymm5,%ymm5");
398 asm volatile("vpand %ymm0,%ymm7,%ymm7");
399 asm volatile("vpand %ymm0,%ymm13,%ymm13");
400 asm volatile("vpand %ymm0,%ymm15,%ymm15");
401 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
402 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
403 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
404 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
405 asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
406 asm volatile("vmovdqa %0,%%ymm7"
407 :: "m" (dptr[z][d+32]));
408 asm volatile("vmovdqa %0,%%ymm13"
409 :: "m" (dptr[z][d+64]));
410 asm volatile("vmovdqa %0,%%ymm15"
411 :: "m" (dptr[z][d+96]));
412 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
413 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
414 asm volatile("vpxor %ymm13,%ymm10,%ymm10");
415 asm volatile("vpxor %ymm15,%ymm11,%ymm11");
416 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
417 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
418 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
419 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
420 }
421 asm volatile("prefetchnta %0" :: "m" (q[d]));
422 asm volatile("prefetchnta %0" :: "m" (q[d+64]));
423 /* P/Q left side optimization */
424 for (z = start-1 ; z >= 0 ; z--) {
425 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
426 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
427 asm volatile("vpxor %ymm13,%ymm13,%ymm13");
428 asm volatile("vpxor %ymm15,%ymm15,%ymm15");
429 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
430 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
431 asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
432 asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
433 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
434 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
435 asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
436 asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
437 asm volatile("vpand %ymm0,%ymm5,%ymm5");
438 asm volatile("vpand %ymm0,%ymm7,%ymm7");
439 asm volatile("vpand %ymm0,%ymm13,%ymm13");
440 asm volatile("vpand %ymm0,%ymm15,%ymm15");
441 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
442 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
443 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
444 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
445 }
446 asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
447 asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
448 asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
449 asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
450 asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
451 asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
452 asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
453 asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
454 asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
455 asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
456 asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
457 asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
458 }
459 asm volatile("sfence" : : : "memory");
460 kernel_fpu_end();
461 }
462
463 const struct raid6_calls raid6_avx2x4 = {
464 raid6_avx24_gen_syndrome,
465 raid6_avx24_xor_syndrome,
466 raid6_have_avx2,
467 "avx2x4",
468 .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */
469 };
470 #endif /* CONFIG_X86_64 */
471