/*
 * include/asm-i386/xor.h
 *
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

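/*
 * The MMX registers alias the x87 FPU register file, so kernel-mode MMX
 * use must not clobber user FPU state.  FPU_SAVE clears CR0.TS (clts) if
 * the FPU was not already in use, then saves the full legacy x87 image
 * with fsave (108 bytes, hence the fpu_save[108] buffers below);
 * FPU_RESTORE puts the image back and re-sets TS via stts() when the
 * task was not using the FPU.
 */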
#define FPU_SAVE							\
  do {									\
	if (!(current->flags & PF_USEDFPU))				\
		__asm__ __volatile__ (" clts;\n");			\
	__asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0]));	\
  } while (0)

#define FPU_RESTORE							\
  do {									\
	__asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0]));		\
	if (!(current->flags & PF_USEDFPU))				\
		stts();							\
  } while (0)

#define LD(x,y)		"       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x,y)		"       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x,y)	"       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x,y)	"       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x,y)	"       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x,y)	"       pxor   8*("#x")(%5), %%mm"#y"   ;\n"

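/*
 * LD(x,y) loads the x-th quadword of the destination (%1) into MMX
 * register y; ST(x,y) stores it back; XOn(x,y) XORs in the x-th quadword
 * of source operand n+1.  The BLOCK() macros below interleave four such
 * register streams (the staircase indentation tracks the four lanes) so
 * that independent loads, pxors and stores overlap and memory latency is
 * hidden.  Each loop iteration handles 128 bytes, hence lines = bytes >> 7.
 */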
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 7;
	char fpu_save[108];

	FPU_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	LD(i,0)					\
		LD(i+1,1)			\
			LD(i+2,2)		\
				LD(i+3,3)	\
	XO1(i,0)				\
	ST(i,0)					\
		XO1(i+1,1)			\
		ST(i+1,1)			\
			XO1(i+2,2)		\
			ST(i+2,2)		\
				XO1(i+3,3)	\
				ST(i+3,3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	FPU_RESTORE;
}

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3)
{
	unsigned long lines = bytes >> 7;
	char fpu_save[108];

	FPU_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	LD(i,0)					\
		LD(i+1,1)			\
			LD(i+2,2)		\
				LD(i+3,3)	\
	XO1(i,0)				\
		XO1(i+1,1)			\
			XO1(i+2,2)		\
				XO1(i+3,3)	\
	XO2(i,0)				\
	ST(i,0)					\
		XO2(i+1,1)			\
		ST(i+1,1)			\
			XO2(i+2,2)		\
			ST(i+2,2)		\
				XO2(i+3,3)	\
				ST(i+3,3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	FPU_RESTORE;
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 7;
	char fpu_save[108];

	FPU_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	LD(i,0)					\
		LD(i+1,1)			\
			LD(i+2,2)		\
				LD(i+3,3)	\
	XO1(i,0)				\
		XO1(i+1,1)			\
			XO1(i+2,2)		\
				XO1(i+3,3)	\
	XO2(i,0)				\
		XO2(i+1,1)			\
			XO2(i+2,2)		\
				XO2(i+3,3)	\
	XO3(i,0)				\
	ST(i,0)					\
		XO3(i+1,1)			\
		ST(i+1,1)			\
			XO3(i+2,2)		\
			ST(i+2,2)		\
				XO3(i+3,3)	\
				ST(i+3,3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       addl $128, %4         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	FPU_RESTORE;
}

static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 7;
	char fpu_save[108];

	FPU_SAVE;

	/* Need to save/restore p4/p5 manually, otherwise gcc's 10-operand
	   limit for asm statements is exceeded (a "+" operand counts as
	   both an input and an output). */
	__asm__ __volatile__ (
		"  pushl %4\n"
		"  pushl %5\n"
#undef BLOCK
#define BLOCK(i) \
	LD(i,0)					\
		LD(i+1,1)			\
			LD(i+2,2)		\
				LD(i+3,3)	\
	XO1(i,0)				\
		XO1(i+1,1)			\
			XO1(i+2,2)		\
				XO1(i+3,3)	\
	XO2(i,0)				\
		XO2(i+1,1)			\
			XO2(i+2,2)		\
				XO2(i+3,3)	\
	XO3(i,0)				\
		XO3(i+1,1)			\
			XO3(i+2,2)		\
				XO3(i+3,3)	\
	XO4(i,0)				\
	ST(i,0)					\
		XO4(i+1,1)			\
		ST(i+1,1)			\
			XO4(i+2,2)		\
			ST(i+2,2)		\
				XO4(i+3,3)	\
				ST(i+3,3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       addl $128, %4         ;\n"
	"       addl $128, %5         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	"	popl %5\n"
	"	popl %4\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	FPU_RESTORE;
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

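/*
 * The p5_mmx variants below process 64 bytes per iteration
 * (lines = bytes >> 6) and are scheduled by hand so that independent
 * movq/pxor pairs sit next to each other, which suits the original
 * Pentium's dual (U/V) pipelines better than the macro-generated PII
 * ordering above.
 */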
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 6;
	char fpu_save[108];

	FPU_SAVE;

	__asm__ __volatile__ (
	" .align 32	             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	FPU_RESTORE;
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3)
{
	unsigned long lines = bytes >> 6;
	char fpu_save[108];

	FPU_SAVE;

	__asm__ __volatile__ (
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory" );

	FPU_RESTORE;
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 6;
	char fpu_save[108];

	FPU_SAVE;

	__asm__ __volatile__ (
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor   (%4), %%mm0   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       pxor  8(%4), %%mm1   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       pxor 16(%4), %%mm2   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 24(%4), %%mm3   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       pxor 32(%4), %%mm4   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       pxor 40(%4), %%mm5   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%4), %%mm6   ;\n"
	"       pxor 56(%4), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       addl $64, %4         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	FPU_RESTORE;
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 6;
	char fpu_save[108];

	FPU_SAVE;

	/* Need to save p4/p5 manually so as not to exceed gcc's 10-operand
	   limit for asm statements. */
	__asm__ __volatile__ (
	"	pushl %4\n"
	"	pushl %5\n"
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       pxor   (%4), %%mm0   ;\n"
	"       pxor  8(%4), %%mm1   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       pxor   (%5), %%mm0   ;\n"
	"       pxor  8(%5), %%mm1   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor 16(%4), %%mm2   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       pxor 16(%5), %%mm2   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 24(%4), %%mm3   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 24(%5), %%mm3   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%4), %%mm4   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       pxor 32(%5), %%mm4   ;\n"
	"       pxor 40(%4), %%mm5   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       pxor 40(%5), %%mm5   ;\n"
	"       pxor 48(%4), %%mm6   ;\n"
	"       pxor 56(%4), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%5), %%mm6   ;\n"
	"       pxor 56(%5), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       addl $64, %4         ;\n"
	"       addl $64, %5         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	"	popl %5\n"
	"	popl %4\n"
	: "+g" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	FPU_RESTORE;
}

static struct xor_block_template xor_block_pII_mmx = {
	name: "pII_mmx",
	do_2: xor_pII_mmx_2,
	do_3: xor_pII_mmx_3,
	do_4: xor_pII_mmx_4,
	do_5: xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
	name: "p5_mmx",
	do_2: xor_p5_mmx_2,
	do_3: xor_p5_mmx_3,
	do_4: xor_p5_mmx_4,
	do_5: xor_p5_mmx_5,
};

#undef FPU_SAVE
#undef FPU_RESTORE

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

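/*
 * SSE register state is not covered by fsave/frstor, so these routines
 * save and restore the four %xmm registers they use by hand into a
 * 16-byte-aligned buffer.  CR0 is captured first so that its TS bit can
 * be restored afterwards; clts must clear TS before any SSE instruction
 * is executed.  The sfence in XMMS_RESTORE orders the XOR loop's stores
 * before the saved register state is reloaded.
 */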
#define XMMS_SAVE				\
	__asm__ __volatile__ ( 			\
		"movl %%cr0,%0		;\n\t"	\
		"clts			;\n\t"	\
		"movups %%xmm0,(%1)	;\n\t"	\
		"movups %%xmm1,0x10(%1)	;\n\t"	\
		"movups %%xmm2,0x20(%1)	;\n\t"	\
		"movups %%xmm3,0x30(%1)	;\n\t"	\
		: "=&r" (cr0)			\
		: "r" (xmm_save) 		\
		: "memory")

#define XMMS_RESTORE				\
	__asm__ __volatile__ ( 			\
		"sfence			;\n\t"	\
		"movups (%1),%%xmm0	;\n\t"	\
		"movups 0x10(%1),%%xmm1	;\n\t"	\
		"movups 0x20(%1),%%xmm2	;\n\t"	\
		"movups 0x30(%1),%%xmm3	;\n\t"	\
		"movl 	%0,%%cr0	;\n\t"	\
		:				\
		: "r" (cr0), "r" (xmm_save)	\
		: "memory")

#define ALIGN16 __attribute__((aligned(16)))

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define	PF0(x)		"	prefetchnta "PF_OFFS(x)"(%1)		;\n"
#define LD(x,y)		"       movaps   "OFFS(x)"(%1), %%xmm"#y"	;\n"
#define ST(x,y)		"       movaps %%xmm"#y",   "OFFS(x)"(%1)	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%2)		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%3)		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%4)		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%5)		;\n"
#define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%6)		;\n"
#define XO1(x,y)	"       xorps   "OFFS(x)"(%2), %%xmm"#y"	;\n"
#define XO2(x,y)	"       xorps   "OFFS(x)"(%3), %%xmm"#y"	;\n"
#define XO3(x,y)	"       xorps   "OFFS(x)"(%4), %%xmm"#y"	;\n"
#define XO4(x,y)	"       xorps   "OFFS(x)"(%5), %%xmm"#y"	;\n"
#define XO5(x,y)	"       xorps   "OFFS(x)"(%6), %%xmm"#y"	;\n"

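/*
 * The SSE loops move 16 bytes per movaps/xorps and 256 bytes per loop
 * iteration (lines = bytes >> 8).  PFn(x) issues a prefetchnta for
 * operand n+1 with a 256-byte lookahead, pulling the streamed data in
 * with a non-temporal hint so that it displaces as little of the cache
 * as possible.
 */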
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
		LD(i,0)					\
			LD(i+1,1)			\
		PF1(i)					\
				PF1(i+2)		\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF0(i+4)				\
				PF0(i+6)		\
		XO1(i,0)				\
			XO1(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		ST(i,0)					\
			ST(i+1,1)			\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i+2)		\
		LD(i,0)					\
			LD(i+1,1)			\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF2(i)					\
				PF2(i+2)		\
		PF0(i+4)				\
				PF0(i+6)		\
		XO1(i,0)				\
			XO1(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		XO2(i,0)				\
			XO2(i+1,1)			\
				XO2(i+2,2)		\
					XO2(i+3,3)	\
		ST(i,0)					\
			ST(i+1,1)			\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       addl $256, %3           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory" );

	XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i+2)		\
		LD(i,0)					\
			LD(i+1,1)			\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF2(i)					\
				PF2(i+2)		\
		XO1(i,0)				\
			XO1(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		PF3(i)					\
				PF3(i+2)		\
		PF0(i+4)				\
				PF0(i+6)		\
		XO2(i,0)				\
			XO2(i+1,1)			\
				XO2(i+2,2)		\
					XO2(i+3,3)	\
		XO3(i,0)				\
			XO3(i+1,1)			\
				XO3(i+2,2)		\
					XO3(i+3,3)	\
		ST(i,0)					\
			ST(i+1,1)			\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       addl $256, %3           ;\n"
	"       addl $256, %4           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory" );

	XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	/* Need to save p4/p5 manually so as not to exceed gcc's 10-operand
	   limit for asm statements. */
	__asm__ __volatile__ (
		" pushl %4\n"
		" pushl %5\n"
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i+2)		\
		LD(i,0)					\
			LD(i+1,1)			\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF2(i)					\
				PF2(i+2)		\
		XO1(i,0)				\
			XO1(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		PF3(i)					\
				PF3(i+2)		\
		XO2(i,0)				\
			XO2(i+1,1)			\
				XO2(i+2,2)		\
					XO2(i+3,3)	\
		PF4(i)					\
				PF4(i+2)		\
		PF0(i+4)				\
				PF0(i+6)		\
		XO3(i,0)				\
			XO3(i+1,1)			\
				XO3(i+2,2)		\
					XO3(i+3,3)	\
		XO4(i,0)				\
			XO4(i+1,1)			\
				XO4(i+2,2)		\
					XO4(i+3,3)	\
		ST(i,0)					\
			ST(i+1,1)			\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       addl $256, %3           ;\n"
	"       addl $256, %4           ;\n"
	"       addl $256, %5           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	"	popl %5\n"
	"	popl %4\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	XMMS_RESTORE;
}

static struct xor_block_template xor_block_pIII_sse = {
	name: "pIII_sse",
	do_2: xor_sse_2,
	do_3: xor_sse_3,
	do_4: xor_sse_4,
	do_5: xor_sse_5,
};
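/*
 * Callers go through the xor_block_template function pointers rather
 * than calling the routines above directly.  As an illustrative sketch
 * only (the buffer names here are hypothetical, not from this file),
 * XORing two page-sized, suitably aligned buffers b1 ^= b2 via the SSE
 * template would look like:
 *
 *	xor_block_pIII_sse.do_2(PAGE_SIZE, b1, b2);
 *
 * The SSE routines require 16-byte-aligned buffers (movaps) and sizes
 * that are a multiple of 256 bytes; the MMX routines need multiples of
 * 128 (pII) or 64 (p5) bytes.
 */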

/* Also try the generic routines.  */
#include <asm-generic/xor.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
	do {						\
		xor_speed(&xor_block_8regs);		\
		xor_speed(&xor_block_32regs);		\
		if (cpu_has_xmm)			\
			xor_speed(&xor_block_pIII_sse);	\
		if (md_cpu_has_mmx()) {			\
			xor_speed(&xor_block_pII_mmx);	\
			xor_speed(&xor_block_p5_mmx);	\
		}					\
	} while (0)
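/*
 * xor_speed() is provided by the code that includes this header (the
 * RAID driver's calibration path); at initialization it benchmarks each
 * registered template and would normally select the fastest one, subject
 * to the override below.
 */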

/* We force the use of the SSE xor block because it can write around the
   L2 cache.  We may also be able to load into the L1 cache only, depending
   on how the CPU deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)