/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>

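/*
 * Each BLOCK(i, reg) expansion below handles one 32-byte chunk in
 * %ymm<reg>.  BLOCK4() strings four of them together and BLOCK16()
 * emits sixteen, cycling through %ymm0-%ymm3, so a single BLOCK16()
 * covers 16 * 32 = 512 bytes.  That is why the functions below compute
 * "lines" as bytes >> 9 and advance every pointer by 512 per iteration.
 */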
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

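/*
 * xor_avx_2 - XOR one source buffer into the destination: p0 ^= p1.
 *
 * Each BLOCK loads 32 bytes of p1 into a ymm register, XORs in the
 * matching 32 bytes of p0 with vxorps and stores the result back to p0.
 * The loop runs between kernel_fpu_begin()/kernel_fpu_end() so the ymm
 * registers may be clobbered safely.  vmovdqa requires 32-byte-aligned
 * buffers, and any tail shorter than 512 bytes is not processed, so
 * callers are expected to pass a multiple of 512 bytes.
 */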
static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

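/* As xor_avx_2(), but folds two source buffers in: p0 ^= p1 ^ p2. */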
static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

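/* Three source buffers: p0 ^= p1 ^ p2 ^ p3. */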
static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16();

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

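/* Four source buffers: p0 ^= p1 ^ p2 ^ p3 ^ p4. */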
static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
	     const unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3,
	     const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

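/* Plug the AVX routines into the generic xor template framework. */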
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

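/*
 * AVX_XOR_SPEED feeds this template into the generic xor_speed()
 * benchmark, and AVX_SELECT() overrides whichever template benchmarked
 * fastest with the AVX one.  Both check X86_FEATURE_OSXSAVE in addition
 * to X86_FEATURE_AVX, since AVX is only usable once the OS has enabled
 * extended (ymm) state management via XSAVE.
 */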
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

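/*
 * Illustrative sketch only - the real call sites live outside this
 * header.  The generic x86 xor code typically wires these macros up
 * roughly as follows, so that the templates are benchmarked once and
 * the AVX one is preferred whenever it is available:
 *
 *	#define XOR_TRY_TEMPLATES		AVX_XOR_SPEED
 *	#define XOR_SELECT_TEMPLATE(FASTEST)	AVX_SELECT(FASTEST)
 *
 * With the "avx" template selected, an xor_blocks() call with a single
 * source buffer ends up in xor_avx_2(bytes, dest, src), and so on for
 * the three-, four- and five-buffer variants.
 */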
#endif