1 /*
2 * include/asm-i386/xor.h
3 *
4 * Optimized RAID-5 checksumming functions for MMX and SSE.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2, or (at your option)
9 * any later version.
10 *
11 * You should have received a copy of the GNU General Public License
12 * (for example /usr/src/linux/COPYING); if not, write to the Free
13 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
14 */
15
16 /*
17 * High-speed RAID5 checksumming functions utilizing MMX instructions.
18 * Copyright (C) 1998 Ingo Molnar.
19 */
20
/*
 * Save the FPU context so the MMX registers can be clobbered.
 * If the current task was not using the FPU (PF_USEDFPU clear),
 * clear CR0.TS first with clts so the fsave below does not trap.
 * Expansion site must provide a char fpu_save[108] buffer (the
 * size of an fsave image).
 * NOTE(review): assumes no preemption/interrupt uses the FPU between
 * FPU_SAVE and FPU_RESTORE -- confirm against the calling context.
 */
#define FPU_SAVE \
do { \
if (!(current->flags & PF_USEDFPU)) \
__asm__ __volatile__ (" clts;\n"); \
__asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0])); \
} while (0)
27
/*
 * Counterpart of FPU_SAVE: reload the saved FPU image from fpu_save[]
 * and, if the task was not an FPU user, set CR0.TS again via stts()
 * so the next FPU use by anyone re-traps as expected.
 */
#define FPU_RESTORE \
do { \
__asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0])); \
if (!(current->flags & PF_USEDFPU)) \
stts(); \
} while (0)
34
/*
 * MMX operand helpers for the xor_pII_mmx_* loops:
 *   x = quadword (8-byte) index within the current line,
 *   y = MMX register number (mm0..mm3 are used below).
 * %1 is the destination buffer p1; %2..%5 are source buffers p2..p5.
 * LD loads from p1, ST stores back to p1, XOn XORs source n+1 in.
 */
#define LD(x,y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x,y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x,y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x,y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
41
42
/*
 * p1[i] ^= p2[i] over 'bytes' bytes, 128 bytes per loop pass using
 * MMX registers mm0-mm3 (PII-scheduled, unrolled via BLOCK).
 * Assumes bytes is a non-zero multiple of 128 -- presumably guaranteed
 * by the RAID xor caller; lines == 0 would wrap the decl/jnz loop.
 */
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 7;	/* number of 128-byte lines */
	char fpu_save[108];			/* fsave image for FPU_SAVE */

	FPU_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
	/* One BLOCK handles 4 quadwords; 4 BLOCKs = 128 bytes/pass. */
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	XO1(i,0) \
	ST(i,0) \
	XO1(i+1,1) \
	ST(i+1,1) \
	XO1(i+2,2) \
	ST(i+2,2) \
	XO1(i+3,3) \
	ST(i+3,3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	/* advance both pointers one line and loop until lines == 0 */
	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	FPU_RESTORE;
}
86
/*
 * p1[i] ^= p2[i] ^ p3[i] over 'bytes' bytes, 128 bytes per loop pass
 * (mm0-mm3).  Same line-size and alignment assumptions as
 * xor_pII_mmx_2.
 */
static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3)
{
	unsigned long lines = bytes >> 7;	/* number of 128-byte lines */
	char fpu_save[108];			/* fsave image for FPU_SAVE */

	FPU_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
	/* load 4 quads from p1, XOR in p2, then p3, store back to p1 */
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	XO2(i,0) \
	ST(i,0) \
	XO2(i+1,1) \
	ST(i+1,1) \
	XO2(i+2,2) \
	ST(i+2,2) \
	XO2(i+3,3) \
	ST(i+3,3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	FPU_RESTORE;
}
136
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] over 'bytes' bytes, 128 bytes per
 * loop pass (mm0-mm3).  Same assumptions as xor_pII_mmx_2.
 */
static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 7;	/* number of 128-byte lines */
	char fpu_save[108];			/* fsave image for FPU_SAVE */

	FPU_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
	/* load from p1, XOR in p2, p3, p4 in turn, store back to p1 */
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	XO2(i,0) \
	XO2(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	XO3(i,0) \
	ST(i,0) \
	XO3(i+1,1) \
	ST(i+1,1) \
	XO3(i+2,2) \
	ST(i+2,2) \
	XO3(i+3,3) \
	ST(i+3,3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	FPU_RESTORE;
}
191
192
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] ^ p5[i] over 'bytes' bytes, 128 bytes
 * per loop pass (mm0-mm3).  Same assumptions as xor_pII_mmx_2.
 *
 * p4/p5 are plain "r" inputs; the asm itself modifies them (addl) but
 * preserves their original values with the pushl/popl bracketing, so
 * gcc's view of the inputs stays consistent.
 */
static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 7;	/* number of 128-byte lines */
	char fpu_save[108];			/* fsave image for FPU_SAVE */

	FPU_SAVE;

	/* need to save/restore p4/p5 manually otherwise gcc's 10 argument
	   limit gets exceeded (+ counts as two arguments) */
	__asm__ __volatile__ (
	" pushl %4\n"
	" pushl %5\n"
#undef BLOCK
	/* load from p1, XOR in p2..p5 in turn, store back to p1 */
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	XO2(i,0) \
	XO2(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	XO3(i,0) \
	XO3(i+1,1) \
	XO3(i+2,2) \
	XO3(i+3,3) \
	XO4(i,0) \
	ST(i,0) \
	XO4(i+1,1) \
	ST(i+1,1) \
	XO4(i+2,2) \
	ST(i+2,2) \
	XO4(i+3,3) \
	ST(i+3,3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" addl $128, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	" popl %5\n"
	" popl %4\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	FPU_RESTORE;
}
258
/* The MMX helper macros are local to the functions above; drop them so
 * the SSE section below can reuse LD/ST/XOn with different bodies. */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
266
/*
 * p1[i] ^= p2[i] over 'bytes' bytes, 64 bytes per loop pass using
 * all eight MMX registers, with loads/stores interleaved for Pentium
 * (p5) pairing.  Assumes bytes is a non-zero multiple of 64.
 */
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 6;	/* number of 64-byte lines */
	char fpu_save[108];			/* fsave image for FPU_SAVE */

	FPU_SAVE;

	__asm__ __volatile__ (
	" .align 32 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	FPU_RESTORE;
}
314
/*
 * p1[i] ^= p2[i] ^ p3[i] over 'bytes' bytes, 64 bytes per loop pass
 * (Pentium-scheduled MMX, mm0-mm7).  Assumes bytes is a non-zero
 * multiple of 64.
 */
static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3)
{
	unsigned long lines = bytes >> 6;	/* number of 64-byte lines */
	char fpu_save[108];			/* fsave image for FPU_SAVE */

	FPU_SAVE;

	__asm__ __volatile__ (
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory" );

	FPU_RESTORE;
}
372
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] over 'bytes' bytes, 64 bytes per loop
 * pass (Pentium-scheduled MMX, mm0-mm7).  Assumes bytes is a non-zero
 * multiple of 64.
 */
static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 6;	/* number of 64-byte lines */
	char fpu_save[108];			/* fsave image for FPU_SAVE */

	FPU_SAVE;

	__asm__ __volatile__ (
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	FPU_RESTORE;
}
439
440 static void
xor_p5_mmx_5(unsigned long bytes,unsigned long * p1,unsigned long * p2,unsigned long * p3,unsigned long * p4,unsigned long * p5)441 xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
442 unsigned long *p3, unsigned long *p4, unsigned long *p5)
443 {
444 unsigned long lines = bytes >> 6;
445 char fpu_save[108];
446
447 FPU_SAVE;
448
449 /* need to save p4/p5 manually to not exceed gcc's 10 argument limit */
450 __asm__ __volatile__ (
451 " pushl %4\n"
452 " pushl %5\n"
453 " .align 32,0x90 ;\n"
454 " 1: ;\n"
455 " movq (%1), %%mm0 ;\n"
456 " movq 8(%1), %%mm1 ;\n"
457 " pxor (%2), %%mm0 ;\n"
458 " pxor 8(%2), %%mm1 ;\n"
459 " movq 16(%1), %%mm2 ;\n"
460 " pxor (%3), %%mm0 ;\n"
461 " pxor 8(%3), %%mm1 ;\n"
462 " pxor 16(%2), %%mm2 ;\n"
463 " pxor (%4), %%mm0 ;\n"
464 " pxor 8(%4), %%mm1 ;\n"
465 " pxor 16(%3), %%mm2 ;\n"
466 " movq 24(%1), %%mm3 ;\n"
467 " pxor (%5), %%mm0 ;\n"
468 " pxor 8(%5), %%mm1 ;\n"
469 " movq %%mm0, (%1) ;\n"
470 " pxor 16(%4), %%mm2 ;\n"
471 " pxor 24(%2), %%mm3 ;\n"
472 " movq %%mm1, 8(%1) ;\n"
473 " pxor 16(%5), %%mm2 ;\n"
474 " pxor 24(%3), %%mm3 ;\n"
475 " movq 32(%1), %%mm4 ;\n"
476 " movq %%mm2, 16(%1) ;\n"
477 " pxor 24(%4), %%mm3 ;\n"
478 " pxor 32(%2), %%mm4 ;\n"
479 " movq 40(%1), %%mm5 ;\n"
480 " pxor 24(%5), %%mm3 ;\n"
481 " pxor 32(%3), %%mm4 ;\n"
482 " pxor 40(%2), %%mm5 ;\n"
483 " movq %%mm3, 24(%1) ;\n"
484 " pxor 32(%4), %%mm4 ;\n"
485 " pxor 40(%3), %%mm5 ;\n"
486 " movq 48(%1), %%mm6 ;\n"
487 " movq 56(%1), %%mm7 ;\n"
488 " pxor 32(%5), %%mm4 ;\n"
489 " pxor 40(%4), %%mm5 ;\n"
490 " pxor 48(%2), %%mm6 ;\n"
491 " pxor 56(%2), %%mm7 ;\n"
492 " movq %%mm4, 32(%1) ;\n"
493 " pxor 48(%3), %%mm6 ;\n"
494 " pxor 56(%3), %%mm7 ;\n"
495 " pxor 40(%5), %%mm5 ;\n"
496 " pxor 48(%4), %%mm6 ;\n"
497 " pxor 56(%4), %%mm7 ;\n"
498 " movq %%mm5, 40(%1) ;\n"
499 " pxor 48(%5), %%mm6 ;\n"
500 " pxor 56(%5), %%mm7 ;\n"
501 " movq %%mm6, 48(%1) ;\n"
502 " movq %%mm7, 56(%1) ;\n"
503
504 " addl $64, %1 ;\n"
505 " addl $64, %2 ;\n"
506 " addl $64, %3 ;\n"
507 " addl $64, %4 ;\n"
508 " addl $64, %5 ;\n"
509 " decl %0 ;\n"
510 " jnz 1b ;\n"
511 " popl %5\n"
512 " popl %4\n"
513 : "+g" (lines),
514 "+r" (p1), "+r" (p2), "+r" (p3)
515 : "r" (p4), "r" (p5)
516 : "memory");
517
518 FPU_RESTORE;
519 }
520
521 static struct xor_block_template xor_block_pII_mmx = {
522 name: "pII_mmx",
523 do_2: xor_pII_mmx_2,
524 do_3: xor_pII_mmx_3,
525 do_4: xor_pII_mmx_4,
526 do_5: xor_pII_mmx_5,
527 };
528
529 static struct xor_block_template xor_block_p5_mmx = {
530 name: "p5_mmx",
531 do_2: xor_p5_mmx_2,
532 do_3: xor_p5_mmx_3,
533 do_4: xor_p5_mmx_4,
534 do_5: xor_p5_mmx_5,
535 };
536
/* FPU save/restore macros are only needed by the MMX section above. */
#undef FPU_SAVE
#undef FPU_RESTORE
539
540 /*
541 * Cache avoiding checksumming functions utilizing KNI instructions
542 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
543 */
544
/*
 * Save CR0 (into local 'cr0') and xmm0-xmm3 (into local 'xmm_save[]',
 * 16-byte aligned, 64 bytes) so the SSE loops below may clobber them.
 * clts clears CR0.TS so the SSE instructions do not trap; XMMS_RESTORE
 * puts the original CR0 back.  Both locals must exist at the expansion
 * site.  movups is used, so xmm_save alignment is not strictly
 * required by the instruction, only by the ALIGN16 declaration.
 */
#define XMMS_SAVE \
	__asm__ __volatile__ ( \
		"movl %%cr0,%0 ;\n\t" \
		"clts ;\n\t" \
		"movups %%xmm0,(%1) ;\n\t" \
		"movups %%xmm1,0x10(%1) ;\n\t" \
		"movups %%xmm2,0x20(%1) ;\n\t" \
		"movups %%xmm3,0x30(%1) ;\n\t" \
		: "=&r" (cr0) \
		: "r" (xmm_save) \
		: "memory")
556
/*
 * Counterpart of XMMS_SAVE: drain the non-temporal stores with sfence,
 * reload xmm0-xmm3 from xmm_save[], and restore the saved CR0 value
 * (re-establishing the original TS state).
 */
#define XMMS_RESTORE \
	__asm__ __volatile__ ( \
		"sfence ;\n\t" \
		"movups (%1),%%xmm0 ;\n\t" \
		"movups 0x10(%1),%%xmm1 ;\n\t" \
		"movups 0x20(%1),%%xmm2 ;\n\t" \
		"movups 0x30(%1),%%xmm3 ;\n\t" \
		"movl %0,%%cr0 ;\n\t" \
		: \
		: "r" (cr0), "r" (xmm_save) \
		: "memory")
568
#define ALIGN16 __attribute__((aligned(16)))

/*
 * SSE operand helpers for xor_sse_*:
 *   x = 16-byte-slot index within the current line, y = xmm register.
 * OFFS is the byte offset of slot x; PF_OFFS adds 256 so PFn prefetch
 * 256 bytes (one full line) ahead of the current position.
 * %1 is destination p1; %2..%6 are sources.  movaps requires the
 * buffers to be 16-byte aligned.  prefetchnta keeps the streamed
 * source data out of the cache hierarchy (non-temporal hint).
 */
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
#define LD(x,y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
#define XO1(x,y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
#define XO2(x,y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
#define XO3(x,y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
#define XO4(x,y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
#define XO5(x,y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
586
587
/*
 * p1[i] ^= p2[i] over 'bytes' bytes using SSE (xmm0-xmm3), 256 bytes
 * per loop pass, prefetching one line ahead with prefetchnta.
 * Assumes bytes is a non-zero multiple of 256 and both buffers are
 * 16-byte aligned (movaps).
 */
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte lines */
	char xmm_save[16*4] ALIGN16;		/* xmm0-xmm3 spill area */
	int cr0;				/* CR0 shadow for XMMS_SAVE */

	XMMS_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
	/* 4 x 16 bytes per BLOCK; prefetches interleaved with the XORs */
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	PF1(i) \
	PF1(i+2) \
	LD(i+2,2) \
	LD(i+3,3) \
	PF0(i+4) \
	PF0(i+6) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	ST(i,0) \
	ST(i+1,1) \
	ST(i+2,2) \
	ST(i+3,3) \


	/* warm up the prefetch of the first destination line */
	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $256, %1 ;\n"
	" addl $256, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	XMMS_RESTORE;
}
640
/*
 * p1[i] ^= p2[i] ^ p3[i] over 'bytes' bytes using SSE, 256 bytes per
 * loop pass with prefetchnta one line ahead on every stream.  Same
 * size/alignment assumptions as xor_sse_2.
 */
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte lines */
	char xmm_save[16*4] ALIGN16;		/* xmm0-xmm3 spill area */
	int cr0;				/* CR0 shadow for XMMS_SAVE */

	XMMS_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	PF1(i) \
	PF1(i+2) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	PF2(i) \
	PF2(i+2) \
	PF0(i+4) \
	PF0(i+6) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	XO2(i,0) \
	XO2(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	ST(i,0) \
	ST(i+1,1) \
	ST(i+2,2) \
	ST(i+3,3) \


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $256, %1 ;\n"
	" addl $256, %2 ;\n"
	" addl $256, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r"(p2), "+r"(p3)
	:
	: "memory" );

	XMMS_RESTORE;
}
701
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] over 'bytes' bytes using SSE, 256
 * bytes per loop pass with prefetchnta one line ahead on every stream.
 * Same size/alignment assumptions as xor_sse_2.
 */
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte lines */
	char xmm_save[16*4] ALIGN16;		/* xmm0-xmm3 spill area */
	int cr0;				/* CR0 shadow for XMMS_SAVE */

	XMMS_SAVE;

	__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
	PF1(i) \
	PF1(i+2) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	PF2(i) \
	PF2(i+2) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	PF3(i) \
	PF3(i+2) \
	PF0(i+4) \
	PF0(i+6) \
	XO2(i,0) \
	XO2(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	XO3(i,0) \
	XO3(i+1,1) \
	XO3(i+2,2) \
	XO3(i+3,3) \
	ST(i,0) \
	ST(i+1,1) \
	ST(i+2,2) \
	ST(i+3,3) \


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $256, %1 ;\n"
	" addl $256, %2 ;\n"
	" addl $256, %3 ;\n"
	" addl $256, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory" );

	XMMS_RESTORE;
}
769
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] ^ p5[i] over 'bytes' bytes using SSE,
 * 256 bytes per loop pass with prefetchnta one line ahead on every
 * stream.  Same size/alignment assumptions as xor_sse_2.
 *
 * As in xor_pII_mmx_5, p4/p5 are "r" inputs modified inside the asm
 * but preserved by the pushl/popl bracketing.
 */
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte lines */
	char xmm_save[16*4] ALIGN16;		/* xmm0-xmm3 spill area */
	int cr0;				/* CR0 shadow for XMMS_SAVE */

	XMMS_SAVE;

	/* need to save p4/p5 manually to not exceed gcc's 10 argument limit */
	__asm__ __volatile__ (
	" pushl %4\n"
	" pushl %5\n"
#undef BLOCK
#define BLOCK(i) \
	PF1(i) \
	PF1(i+2) \
	LD(i,0) \
	LD(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	PF2(i) \
	PF2(i+2) \
	XO1(i,0) \
	XO1(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	PF3(i) \
	PF3(i+2) \
	XO2(i,0) \
	XO2(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	PF4(i) \
	PF4(i+2) \
	PF0(i+4) \
	PF0(i+6) \
	XO3(i,0) \
	XO3(i+1,1) \
	XO3(i+2,2) \
	XO3(i+3,3) \
	XO4(i,0) \
	XO4(i+1,1) \
	XO4(i+2,2) \
	XO4(i+3,3) \
	ST(i,0) \
	ST(i+1,1) \
	ST(i+2,2) \
	ST(i+3,3) \


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $256, %1 ;\n"
	" addl $256, %2 ;\n"
	" addl $256, %3 ;\n"
	" addl $256, %4 ;\n"
	" addl $256, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	" popl %5\n"
	" popl %4\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	XMMS_RESTORE;
}
849
850 static struct xor_block_template xor_block_pIII_sse = {
851 name: "pIII_sse",
852 do_2: xor_sse_2,
853 do_3: xor_sse_3,
854 do_4: xor_sse_4,
855 do_5: xor_sse_5,
856 };
857
858 /* Also try the generic routines. */
859 #include <asm-generic/xor.h>
860
#undef XOR_TRY_TEMPLATES
/*
 * Benchmark every template usable on this CPU: the generic ones
 * always, SSE when the CPU has XMM support, and both MMX variants
 * when MMX is present.  xor_speed() is provided by the RAID xor
 * framework that includes this header.
 */
#define XOR_TRY_TEMPLATES \
	do { \
		xor_speed(&xor_block_8regs); \
		xor_speed(&xor_block_32regs); \
		if (cpu_has_xmm) \
			xor_speed(&xor_block_pIII_sse); \
		if (md_cpu_has_mmx()) { \
			xor_speed(&xor_block_pII_mmx); \
			xor_speed(&xor_block_p5_mmx); \
		} \
	} while (0)
873
874 /* We force the use of the SSE xor block because it can write around L2.
875 We may also be able to load into the L1 only depending on how the cpu
876 deals with a load to a line that is being prefetched. */
/* Override the benchmark result: always pick the SSE template when
 * available (see the rationale comment above), else keep FASTEST. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
879