/*
 * include/asm-x86_64/xor.h
 *
 * Optimized RAID-5 checksumming functions for SSE and streaming integer registers.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 */

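/* One 16-byte aligned save slot; the xmm_save[4] arrays below hold
   %xmm0-%xmm3 while the kernel borrows them. */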
typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;

/* We don't use gcc to save the XMM registers, because there is no easy
   way to tell it to do a clts before the register save. */
#define XMMS_SAVE \
	asm volatile ( \
		"movq %%cr0,%0 ;\n\t" \
		"clts ;\n\t" \
		"movups %%xmm0,(%1) ;\n\t" \
		"movups %%xmm1,0x10(%1) ;\n\t" \
		"movups %%xmm2,0x20(%1) ;\n\t" \
		"movups %%xmm3,0x30(%1) ;\n\t" \
		: "=&r" (cr0) \
		: "r" (xmm_save) \
		: "memory")

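/* The sfence makes the loop's non-temporal stores globally visible
   before %xmm0-%xmm3 are reloaded and the saved CR0 (with a possibly
   set TS bit) is put back. */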
#define XMMS_RESTORE \
	asm volatile ( \
		"sfence ;\n\t" \
		"movups (%1),%%xmm0 ;\n\t" \
		"movups 0x10(%1),%%xmm1 ;\n\t" \
		"movups 0x20(%1),%%xmm2 ;\n\t" \
		"movups 0x30(%1),%%xmm3 ;\n\t" \
		"movq %0,%%cr0 ;\n\t" \
		: \
		: "r" (cr0), "r" (xmm_save) \
		: "memory")

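/*
 * Building blocks for the unrolled loops below: LD/ST move one 16-byte
 * line between source 1 (p1) and an XMM register (ST uses movntdq, a
 * non-temporal store), XOn xors in the matching line of source n+1, and
 * PFn issues prefetchnta 320 bytes ahead in source n+1 so the streams
 * arrive early without displacing useful cache lines.
 */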
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "320+16*("#x")"
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x,y) " movntdq %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"

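/*
 * xor_sse_{2..5}: xor two to five blocks together into p1.  Each loop
 * iteration runs two BLOCKs of four 16-byte lines, i.e. 128 bytes,
 * which is why the line count is bytes >> 7 and every pointer steps by
 * 128 per pass.
 */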
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned int lines = bytes >> 7;
	unsigned long cr0;
	xmm_store_t xmm_save[4];

	XMMS_SAVE;

	asm volatile (
#undef BLOCK
#define BLOCK(i) \
	LD(i,0) \
	LD(i+1,1) \
	PF1(i) \
	LD(i+2,2) \
	LD(i+3,3) \
	PF0(i+4) \
	XO1(i,0) \
	XO1(i+1,1) \
	ST(i,0) \
	ST(i+1,1) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	ST(i+2,2) \
	ST(i+3,3)

	PF0(0)

	" .p2align 4 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)

	" decl %[cnt]\n"
	" leaq 128(%[p1]),%[p1]\n"
	" leaq 128(%[p2]),%[p2]\n"
	" jnz 1b\n"
	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned int lines = bytes >> 7;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile (
#undef BLOCK
#define BLOCK(i) \
	PF1(i) \
	LD(i,0) \
	LD(i+1,1) \
	XO1(i,0) \
	XO1(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	PF2(i) \
	PF0(i+4) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	XO2(i,0) \
	XO2(i+1,1) \
	ST(i,0) \
	ST(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	ST(i+2,2) \
	ST(i+3,3)

	PF0(0)

	" .p2align 4 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)

	" decl %[cnt]\n"
	" leaq 128(%[p1]),%[p1]\n"
	" leaq 128(%[p2]),%[p2]\n"
	" leaq 128(%[p3]),%[p3]\n"
	" jnz 1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned int lines = bytes >> 7;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile (
#undef BLOCK
#define BLOCK(i) \
	PF1(i) \
	LD(i,0) \
	LD(i+1,1) \
	XO1(i,0) \
	XO1(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	PF2(i) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	PF3(i) \
	PF0(i+4) \
	XO2(i,0) \
	XO2(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	XO3(i,0) \
	XO3(i+1,1) \
	ST(i,0) \
	ST(i+1,1) \
	XO3(i+2,2) \
	XO3(i+3,3) \
	ST(i+2,2) \
	ST(i+3,3)

	PF0(0)

	" .p2align 4 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)

	" decl %[cnt]\n"
	" leaq 128(%[p1]),%[p1]\n"
	" leaq 128(%[p2]),%[p2]\n"
	" leaq 128(%[p3]),%[p3]\n"
	" leaq 128(%[p4]),%[p4]\n"
	" jnz 1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned int lines = bytes >> 7;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile (
#undef BLOCK
#define BLOCK(i) \
	PF1(i) \
	LD(i,0) \
	LD(i+1,1) \
	XO1(i,0) \
	XO1(i+1,1) \
	LD(i+2,2) \
	LD(i+3,3) \
	PF2(i) \
	XO1(i+2,2) \
	XO1(i+3,3) \
	PF3(i) \
	XO2(i,0) \
	XO2(i+1,1) \
	XO2(i+2,2) \
	XO2(i+3,3) \
	PF4(i) \
	PF0(i+4) \
	XO3(i,0) \
	XO3(i+1,1) \
	XO3(i+2,2) \
	XO3(i+3,3) \
	XO4(i,0) \
	XO4(i+1,1) \
	ST(i,0) \
	ST(i+1,1) \
	XO4(i+2,2) \
	XO4(i+3,3) \
	ST(i+2,2) \
	ST(i+3,3)

	PF0(0)

	" .p2align 4 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)

	" decl %[cnt]\n"
	" leaq 128(%[p1]),%[p1]\n"
	" leaq 128(%[p2]),%[p2]\n"
	" leaq 128(%[p3]),%[p3]\n"
	" leaq 128(%[p4]),%[p4]\n"
	" leaq 128(%[p5]),%[p5]\n"
	" jnz 1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
	  [p5] "+r" (p5)
	:
	: "memory");

	XMMS_RESTORE;
}

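/*
 * STORE_NTI(x, mem) is a non-temporal 64-bit store (movnti): the result
 * bypasses the cache on its way to memory.  gcc 3.3 and later expose
 * movnti as a builtin; older compilers get inline asm.
 */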
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define STORE_NTI(x,mem) __builtin_ia32_movnti(&(mem), (x))
#else
#define STORE_NTI(x,mem) asm("movnti %1,%0" : "=m" (mem) : "r" (x))
#endif

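/*
 * Integer-register fallback: each iteration xors one 64-byte cache line
 * (eight longs) from every source into p1, prefetching ahead in each
 * stream and writing the result back with non-temporal stores.
 */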
static void
xor_64regs_stream_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		register long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers */
		d1 = p1[1];	/* ... in bursts, if possible. */
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		__builtin_prefetch(p1 + 5*64, 0, 0);
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		__builtin_prefetch(p2 + 5*64, 0, 0);
		STORE_NTI(d0, p1[0]);
		STORE_NTI(d1, p1[1]);
		STORE_NTI(d2, p1[2]);
		STORE_NTI(d3, p1[3]);
		STORE_NTI(d4, p1[4]);
		STORE_NTI(d5, p1[5]);
		STORE_NTI(d6, p1[6]);
		STORE_NTI(d7, p1[7]);
		p1 += 8;
		p2 += 8;
	} while (--lines > 0);
}

static void
xor_64regs_stream_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
		    unsigned long *p3)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		register long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers */
		d1 = p1[1];	/* ... in bursts, if possible. */
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		__builtin_prefetch(p1 + 5*64, 0, 0);
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		__builtin_prefetch(p2 + 5*64, 0, 0);
		d0 ^= p3[0];
		d1 ^= p3[1];
		d2 ^= p3[2];
		d3 ^= p3[3];
		d4 ^= p3[4];
		d5 ^= p3[5];
		d6 ^= p3[6];
		d7 ^= p3[7];
		__builtin_prefetch(p3 + 5*64, 0, 0);
		STORE_NTI(d0, p1[0]);
		STORE_NTI(d1, p1[1]);
		STORE_NTI(d2, p1[2]);
		STORE_NTI(d3, p1[3]);
		STORE_NTI(d4, p1[4]);
		STORE_NTI(d5, p1[5]);
		STORE_NTI(d6, p1[6]);
		STORE_NTI(d7, p1[7]);
		p1 += 8;
		p2 += 8;
		p3 += 8;
	} while (--lines > 0);
}

static void
xor_64regs_stream_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
		    unsigned long *p3, unsigned long *p4)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		register long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers */
		d1 = p1[1];	/* ... in bursts, if possible. */
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		__builtin_prefetch(p1 + 5*64, 0, 0);
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		__builtin_prefetch(p2 + 5*64, 0, 0);
		d0 ^= p3[0];
		d1 ^= p3[1];
		d2 ^= p3[2];
		d3 ^= p3[3];
		d4 ^= p3[4];
		d5 ^= p3[5];
		d6 ^= p3[6];
		d7 ^= p3[7];
		__builtin_prefetch(p3 + 5*64, 0, 0);
		d0 ^= p4[0];
		d1 ^= p4[1];
		d2 ^= p4[2];
		d3 ^= p4[3];
		d4 ^= p4[4];
		d5 ^= p4[5];
		d6 ^= p4[6];
		d7 ^= p4[7];
		__builtin_prefetch(p4 + 5*64, 0, 0);
		STORE_NTI(d0, p1[0]);
		STORE_NTI(d1, p1[1]);
		STORE_NTI(d2, p1[2]);
		STORE_NTI(d3, p1[3]);
		STORE_NTI(d4, p1[4]);
		STORE_NTI(d5, p1[5]);
		STORE_NTI(d6, p1[6]);
		STORE_NTI(d7, p1[7]);
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
	} while (--lines > 0);
}

static void
xor_64regs_stream_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
		    unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		register long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers */
		d1 = p1[1];	/* ... in bursts, if possible. */
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		__builtin_prefetch(p1 + 5*64, 0, 0);
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		__builtin_prefetch(p2 + 5*64, 0, 0);
		d0 ^= p3[0];
		d1 ^= p3[1];
		d2 ^= p3[2];
		d3 ^= p3[3];
		d4 ^= p3[4];
		d5 ^= p3[5];
		d6 ^= p3[6];
		d7 ^= p3[7];
		__builtin_prefetch(p3 + 5*64, 0, 0);
		d0 ^= p4[0];
		d1 ^= p4[1];
		d2 ^= p4[2];
		d3 ^= p4[3];
		d4 ^= p4[4];
		d5 ^= p4[5];
		d6 ^= p4[6];
		d7 ^= p4[7];
		__builtin_prefetch(p4 + 5*64, 0, 0);
		d0 ^= p5[0];
		d1 ^= p5[1];
		d2 ^= p5[2];
		d3 ^= p5[3];
		d4 ^= p5[4];
		d5 ^= p5[5];
		d6 ^= p5[6];
		d7 ^= p5[7];
		__builtin_prefetch(p5 + 5*64, 0, 0);
		STORE_NTI(d0, p1[0]);
		STORE_NTI(d1, p1[1]);
		STORE_NTI(d2, p1[2]);
		STORE_NTI(d3, p1[3]);
		STORE_NTI(d4, p1[4]);
		STORE_NTI(d5, p1[5]);
		STORE_NTI(d6, p1[6]);
		STORE_NTI(d7, p1[7]);
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
		p5 += 8;
	} while (--lines > 0);
}


static struct xor_block_template xor_block_sse = {
	.name = "128byte sse streaming",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};

static struct xor_block_template xor_block_64regs_stream = {
	.name = "64byte int streaming",
	.do_2 = xor_64regs_stream_2,
	.do_3 = xor_64regs_stream_3,
	.do_4 = xor_64regs_stream_4,
	.do_5 = xor_64regs_stream_5,
};

/* AK: the speed test is useless: it only tests the cache-hot case */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
	do { \
		xor_speed(&xor_block_sse); \
		xor_speed(&xor_block_64regs_stream); \
	} while (0)

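/* Use whichever template the boot-time benchmark declared fastest. */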
#define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST)
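
/*
 * Rough usage sketch (illustrative only, not defined in this file): the
 * generic xor code runs XOR_TRY_TEMPLATES once to benchmark the
 * candidates, keeps the winner via XOR_SELECT_TEMPLATE, and then
 * dispatches through its hooks, e.g.
 *
 *	fastest->do_2(bytes, (unsigned long *)dest, (unsigned long *)src);
 *
 * which leaves dest ^= src.  bytes must be a whole number of 128-byte
 * (sse) or 64-byte (64regs) chunks; the RAID code passes PAGE_SIZE.
 */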