/*
 * include/asm-x86_64/xor.h
 *
 * Optimized RAID-5 checksumming functions for SSE and 64-bit integer
 * registers.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 */

typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
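/* Four of these (the xmm_save[4] arrays in the functions below) hold
   the contents of %xmm0-%xmm3 across the SSE loops. */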

/* We don't use gcc to save the XMM registers, because there is no easy
   way to tell it to do a clts before the register saving. */
#define XMMS_SAVE				\
	asm volatile (			\
		"movq %%cr0,%0		;\n\t"	\
		"clts			;\n\t"	\
		"movups %%xmm0,(%1)	;\n\t"	\
		"movups %%xmm1,0x10(%1)	;\n\t"	\
		"movups %%xmm2,0x20(%1)	;\n\t"	\
		"movups %%xmm3,0x30(%1)	;\n\t"	\
		: "=&r" (cr0)			\
		: "r" (xmm_save)		\
		: "memory")

#define XMMS_RESTORE				\
	asm volatile (			\
		"sfence			;\n\t"	\
		"movups (%1),%%xmm0	;\n\t"	\
		"movups 0x10(%1),%%xmm1	;\n\t"	\
		"movups 0x20(%1),%%xmm2	;\n\t"	\
		"movups 0x30(%1),%%xmm3	;\n\t"	\
		"movq	%0,%%cr0	;\n\t"	\
		:				\
		: "r" (cr0), "r" (xmm_save)	\
		: "memory")
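
/*
 * XMMS_SAVE and XMMS_RESTORE expect locals "unsigned long cr0" and
 * "xmm_store_t xmm_save[4]" in the enclosing scope.  clts clears
 * CR0.TS so the XMM registers can be touched in kernel context
 * without taking a device-not-available fault; writing the saved
 * value back to %cr0 restores TS to whatever it was.  The sfence in
 * XMMS_RESTORE makes the weakly-ordered movntdq stores issued by the
 * loops below globally visible before the function returns.
 */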

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"320+16*("#x")"
#define	PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x,y)		"       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x,y)		"       movntdq %%xmm"#y",   "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%[p6])		;\n"
#define XO1(x,y)	"       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x,y)	"       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x,y)	"       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x,y)	"       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define XO5(x,y)	"       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"	;\n"
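
/*
 * Each LD/XO/ST instruction moves 16 bytes; a BLOCK below processes
 * 4*16 = 64 bytes in %xmm0-%xmm3, and each loop iteration runs two
 * BLOCKs, i.e. 128 bytes (hence lines = bytes >> 7).  prefetchnta
 * pulls source data in 320 bytes ahead of use, the staircase
 * indentation tracks which xmm register an instruction uses, and
 * movntdq streams the result back to p1 past the cache.
 */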

static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned int lines = bytes >> 7;
	unsigned long cr0;
	xmm_store_t xmm_save[4];

	XMMS_SAVE;

	asm volatile (
#undef BLOCK
#define BLOCK(i) \
		LD(i,0)					\
			LD(i+1,1)			\
		PF1(i)					\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF0(i+4)				\
		XO1(i,0)				\
			XO1(i+1,1)			\
		ST(i,0)					\
			ST(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)

	" .p2align 4			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)

	"       decl %[cnt]\n"
	"       leaq 128(%[p1]),%[p1]\n"
	"       leaq 128(%[p2]),%[p2]\n"
	"       jnz 1b\n"
	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
	:
	: "memory");

	XMMS_RESTORE;
}
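
/*
 * All of these routines share one calling convention: the result is
 * accumulated in place into p1 (p1 ^= p2 [^ p3 ...]), and bytes must
 * be a nonzero multiple of 128 here (64 for the integer versions).
 */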

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned int lines = bytes >> 7;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile (
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
		LD(i,0)					\
			LD(i+1,1)			\
		XO1(i,0)				\
			XO1(i+1,1)			\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF2(i)					\
		PF0(i+4)				\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		XO2(i,0)				\
			XO2(i+1,1)			\
		ST(i,0)					\
			ST(i+1,1)			\
				XO2(i+2,2)		\
					XO2(i+3,3)	\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)

	" .p2align 4			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)

	"       decl %[cnt]\n"
	"       leaq 128(%[p1]),%[p1]\n"
	"       leaq 128(%[p2]),%[p2]\n"
	"       leaq 128(%[p3]),%[p3]\n"
	"       jnz  1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned int lines = bytes >> 7;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile (
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
		LD(i,0)					\
			LD(i+1,1)			\
		XO1(i,0)				\
			XO1(i+1,1)			\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF2(i)					\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		PF3(i)					\
		PF0(i+4)				\
		XO2(i,0)				\
			XO2(i+1,1)			\
				XO2(i+2,2)		\
					XO2(i+3,3)	\
		XO3(i,0)				\
			XO3(i+1,1)			\
		ST(i,0)					\
			ST(i+1,1)			\
				XO3(i+2,2)		\
					XO3(i+3,3)	\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)

	" .p2align 4			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)

	"       decl %[cnt]\n"
	"       leaq 128(%[p1]),%[p1]\n"
	"       leaq 128(%[p2]),%[p2]\n"
	"       leaq 128(%[p3]),%[p3]\n"
	"       leaq 128(%[p4]),%[p4]\n"
	"       jnz  1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned int lines = bytes >> 7;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile (
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
		LD(i,0)					\
			LD(i+1,1)			\
		XO1(i,0)				\
			XO1(i+1,1)			\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF2(i)					\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		PF3(i)					\
		XO2(i,0)				\
			XO2(i+1,1)			\
				XO2(i+2,2)		\
					XO2(i+3,3)	\
		PF4(i)					\
		PF0(i+4)				\
		XO3(i,0)				\
			XO3(i+1,1)			\
				XO3(i+2,2)		\
					XO3(i+3,3)	\
		XO4(i,0)				\
			XO4(i+1,1)			\
		ST(i,0)					\
			ST(i+1,1)			\
				XO4(i+2,2)		\
					XO4(i+3,3)	\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)

	" .p2align 4			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)

	"       decl %[cnt]\n"
	"       leaq 128(%[p1]),%[p1]\n"
	"       leaq 128(%[p2]),%[p2]\n"
	"       leaq 128(%[p3]),%[p3]\n"
	"       leaq 128(%[p4]),%[p4]\n"
	"       leaq 128(%[p5]),%[p5]\n"
	"       jnz  1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
	  [p5] "+r" (p5)
	:
	: "memory");

	XMMS_RESTORE;
}

#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define STORE_NTI(x,mem) __builtin_ia32_movnti(&(mem), (x))
#else
#define STORE_NTI(x,mem)  asm("movnti %1,%0" : "=m" (mem) : "r" (x))
#endif
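
/*
 * movnti is a non-temporal (cache-bypassing) 64-bit integer store,
 * used so the XOR buffers don't flush useful data out of the cache.
 * On gcc 3.3+ the builtin form is used so the compiler can schedule
 * the store itself instead of working around an asm statement.
 */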


static void
xor_64regs_stream_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		register long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/*  ... in bursts, if possible.		*/
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		__builtin_prefetch(p1 + 5*64, 0, 0);
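		/*
		 * Note: p1 is an unsigned long *, so the arithmetic here
		 * is in 8-byte units; p1 + 5*64 prefetches 5*64*8 = 2560
		 * bytes ahead.  The same holds for the prefetches below.
		 */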
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		__builtin_prefetch(p2 + 5*64, 0, 0);
		STORE_NTI(d0, p1[0]);
		STORE_NTI(d1, p1[1]);
		STORE_NTI(d2, p1[2]);
		STORE_NTI(d3, p1[3]);
		STORE_NTI(d4, p1[4]);
		STORE_NTI(d5, p1[5]);
		STORE_NTI(d6, p1[6]);
		STORE_NTI(d7, p1[7]);
		p1 += 8;
		p2 += 8;
	} while (--lines > 0);
}

static void
xor_64regs_stream_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	    unsigned long *p3)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		register long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/*  ... in bursts, if possible.		*/
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		__builtin_prefetch(p1 + 5*64, 0, 0);
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		__builtin_prefetch(p2 + 5*64, 0, 0);
		d0 ^= p3[0];
		d1 ^= p3[1];
		d2 ^= p3[2];
		d3 ^= p3[3];
		d4 ^= p3[4];
		d5 ^= p3[5];
		d6 ^= p3[6];
		d7 ^= p3[7];
		__builtin_prefetch(p3 + 5*64, 0, 0);
		STORE_NTI(d0, p1[0]);
		STORE_NTI(d1, p1[1]);
		STORE_NTI(d2, p1[2]);
		STORE_NTI(d3, p1[3]);
		STORE_NTI(d4, p1[4]);
		STORE_NTI(d5, p1[5]);
		STORE_NTI(d6, p1[6]);
		STORE_NTI(d7, p1[7]);
		p1 += 8;
		p2 += 8;
		p3 += 8;
	} while (--lines > 0);
}

static void
xor_64regs_stream_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	    unsigned long *p3, unsigned long *p4)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		register long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/*  ... in bursts, if possible.		*/
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		__builtin_prefetch(p1 + 5*64, 0, 0);
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		__builtin_prefetch(p2 + 5*64, 0, 0);
		d0 ^= p3[0];
		d1 ^= p3[1];
		d2 ^= p3[2];
		d3 ^= p3[3];
		d4 ^= p3[4];
		d5 ^= p3[5];
		d6 ^= p3[6];
		d7 ^= p3[7];
		__builtin_prefetch(p3 + 5*64, 0, 0);
		d0 ^= p4[0];
		d1 ^= p4[1];
		d2 ^= p4[2];
		d3 ^= p4[3];
		d4 ^= p4[4];
		d5 ^= p4[5];
		d6 ^= p4[6];
		d7 ^= p4[7];
		__builtin_prefetch(p4 + 5*64, 0, 0);
		STORE_NTI(d0, p1[0]);
		STORE_NTI(d1, p1[1]);
		STORE_NTI(d2, p1[2]);
		STORE_NTI(d3, p1[3]);
		STORE_NTI(d4, p1[4]);
		STORE_NTI(d5, p1[5]);
		STORE_NTI(d6, p1[6]);
		STORE_NTI(d7, p1[7]);
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
	} while (--lines > 0);
}

static void
xor_64regs_stream_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	    unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		register long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/*  ... in bursts, if possible.		*/
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		__builtin_prefetch(p1 + 5*64, 0, 0);
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		__builtin_prefetch(p2 + 5*64, 0, 0);
		d0 ^= p3[0];
		d1 ^= p3[1];
		d2 ^= p3[2];
		d3 ^= p3[3];
		d4 ^= p3[4];
		d5 ^= p3[5];
		d6 ^= p3[6];
		d7 ^= p3[7];
		__builtin_prefetch(p3 + 5*64, 0, 0);
		d0 ^= p4[0];
		d1 ^= p4[1];
		d2 ^= p4[2];
		d3 ^= p4[3];
		d4 ^= p4[4];
		d5 ^= p4[5];
		d6 ^= p4[6];
		d7 ^= p4[7];
		__builtin_prefetch(p4 + 5*64, 0, 0);
		d0 ^= p5[0];
		d1 ^= p5[1];
		d2 ^= p5[2];
		d3 ^= p5[3];
		d4 ^= p5[4];
		d5 ^= p5[5];
		d6 ^= p5[6];
		d7 ^= p5[7];
		__builtin_prefetch(p5 + 5*64, 0, 0);
		STORE_NTI(d0, p1[0]);
		STORE_NTI(d1, p1[1]);
		STORE_NTI(d2, p1[2]);
		STORE_NTI(d3, p1[3]);
		STORE_NTI(d4, p1[4]);
		STORE_NTI(d5, p1[5]);
		STORE_NTI(d6, p1[6]);
		STORE_NTI(d7, p1[7]);
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
		p5 += 8;
	} while (--lines > 0);
}


static struct xor_block_template xor_block_sse = {
	name: "128byte sse streaming",
	do_2: xor_sse_2,
	do_3: xor_sse_3,
	do_4: xor_sse_4,
	do_5: xor_sse_5,
};

static struct xor_block_template xor_block_64regs_stream = {
	name: "64byte int streaming",
	do_2: xor_64regs_stream_2,
	do_3: xor_64regs_stream_3,
	do_4: xor_64regs_stream_4,
	do_5: xor_64regs_stream_5,
};

/* AK: the speed test is useless: it only tests the cache-hot case. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
	do {						\
		xor_speed(&xor_block_sse);		\
		xor_speed(&xor_block_64regs_stream);	\
	} while (0)

#define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST)
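
/*
 * Usage sketch (illustrative only, not part of this header): the RAID
 * core times the templates above via XOR_TRY_TEMPLATES/xor_speed() and
 * then dispatches through the winner, e.g.
 *
 *	fastest->do_3(PAGE_SIZE, (unsigned long *)dst,
 *		      (unsigned long *)src1, (unsigned long *)src2);
 *
 * which XORs src1 and src2 into dst in place over PAGE_SIZE bytes.
 */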