1 #include <linux/config.h>
2 #include <linux/types.h>
3 #include <linux/string.h>
4 #include <linux/sched.h>
5
6 #include <asm/i387.h>
7 #include <asm/hardirq.h>
8
9
/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQs. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code-neutral filler for the short jump:
 *		"leal ebx, [ebx]" is apparently best for K6-2, but Cyrix ??
 *	We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */
28
/*
 * _mmx_memcpy - copy @len bytes from @from to @to through the MMX registers.
 *
 * When in_interrupt() is true the whole copy is handed to __memcpy() and
 * the MMX unit is not touched.
 * NOTE(review): presumably because FPU/MMX state must not be used from
 * interrupt context here - confirm against kernel_fpu_begin().
 *
 * Otherwise the bulk is moved in 64-byte chunks between kernel_fpu_begin()
 * and kernel_fpu_end(), and the remaining (len & 63) bytes are finished
 * with __memcpy().
 *
 * Returns the original value of @to on the MMX path, or the return value
 * of __memcpy() on the interrupt path.
 */
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (in_interrupt())
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin();

	/*
	 * Prefetch hints for source offsets 0, 64, 128, 192 and 256.
	 * Label 1 is paired with the fixup at label 3 in __ex_table; the
	 * fixup stores the bytes EB 1A (a short jmp over the remaining
	 * 26 bytes of prefetches) on top of the first prefetch and then
	 * continues at label 2.
	 * NOTE(review): this relies on the kernel's exception-table fault
	 * handling and on the exact instruction lengths - do not reorder.
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		" prefetch 64(%0)\n"
		" prefetch 128(%0)\n"
		" prefetch 192(%0)\n"
		" prefetch 256(%0)\n"
		"2: \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		" jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		" .align 4\n"
		" .long 1b, 3b\n"
		".previous"
		: : "r" (from) );


	/*
	 * While more than five chunks remain, copy one 64-byte chunk and
	 * prefetch 320 bytes (five chunks) ahead of it.  The fixup here
	 * overwrites the prefetch with EB 05 (short jmp over 5 bytes).
	 */
	for(; i>5; i--)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		" movq 8(%0), %%mm1\n"
		" movq 16(%0), %%mm2\n"
		" movq 24(%0), %%mm3\n"
		" movq %%mm0, (%1)\n"
		" movq %%mm1, 8(%1)\n"
		" movq %%mm2, 16(%1)\n"
		" movq %%mm3, 24(%1)\n"
		" movq 32(%0), %%mm0\n"
		" movq 40(%0), %%mm1\n"
		" movq 48(%0), %%mm2\n"
		" movq 56(%0), %%mm3\n"
		" movq %%mm0, 32(%1)\n"
		" movq %%mm1, 40(%1)\n"
		" movq %%mm2, 48(%1)\n"
		" movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		" jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		" .align 4\n"
		" .long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}

	/* Remaining (at most five) chunks: same copy, no look-ahead prefetch. */
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
		" movq (%0), %%mm0\n"
		" movq 8(%0), %%mm1\n"
		" movq 16(%0), %%mm2\n"
		" movq 24(%0), %%mm3\n"
		" movq %%mm0, (%1)\n"
		" movq %%mm1, 8(%1)\n"
		" movq %%mm2, 16(%1)\n"
		" movq %%mm3, 24(%1)\n"
		" movq 32(%0), %%mm0\n"
		" movq 40(%0), %%mm1\n"
		" movq 48(%0), %%mm2\n"
		" movq 56(%0), %%mm3\n"
		" movq %%mm0, 32(%1)\n"
		" movq %%mm1, 40(%1)\n"
		" movq %%mm2, 48(%1)\n"
		" movq %%mm3, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	/*
	 *	Now do the tail of the block
	 */
	__memcpy(to, from, len&63);
	kernel_fpu_end();
	return p;
}
123
124 #ifdef CONFIG_MK7
125
126 /*
127 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
128 * other MMX using processors do not.
129 */
130
/*
 * fast_clear_page - zero one 4096-byte page (CONFIG_MK7 variant).
 *
 * Clears mm0 once and then stores it with movntq, 64 bytes per loop
 * iteration, between kernel_fpu_begin() and kernel_fpu_end().
 */
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	/* mm0 = 0; it is the store source for every movntq below. */
	__asm__ __volatile__ (
		" pxor %%mm0, %%mm0\n" : :
	);

	for(i=0;i<4096/64;i++)
	{
		__asm__ __volatile__ (
		" movntq %%mm0, (%0)\n"
		" movntq %%mm0, 8(%0)\n"
		" movntq %%mm0, 16(%0)\n"
		" movntq %%mm0, 24(%0)\n"
		" movntq %%mm0, 32(%0)\n"
		" movntq %%mm0, 40(%0)\n"
		" movntq %%mm0, 48(%0)\n"
		" movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page+=64;
	}
	/* since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again.
	 */
	__asm__ __volatile__ (
		" sfence \n" : :
	);
	kernel_fpu_end();
}
163
/*
 * fast_copy_page - copy one 4096-byte page from @from to @to
 * (CONFIG_MK7 variant).
 *
 * Each 64-byte chunk is loaded with movq into mm0..mm7 and written with
 * movntq.  The whole copy runs between kernel_fpu_begin() and
 * kernel_fpu_end().
 */
static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/* maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	/*
	 * Prefetch hints for source offsets 0..256.  Label 1 is paired
	 * with the fixup at label 3 in __ex_table; the fixup overwrites the
	 * first prefetch with EB 1A (short jmp over the remaining 26 bytes)
	 * and continues at label 2.
	 * NOTE(review): depends on the kernel's exception-table handling
	 * and on exact instruction lengths - do not reorder.
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		" prefetch 64(%0)\n"
		" prefetch 128(%0)\n"
		" prefetch 192(%0)\n"
		" prefetch 256(%0)\n"
		"2: \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		" jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		" .align 4\n"
		" .long 1b, 3b\n"
		".previous"
		: : "r" (from) );

	/*
	 * First (4096-320)/64 chunks: copy and prefetch 320 bytes ahead.
	 * Stopping here keeps every prefetch address inside the source page.
	 */
	for(i=0; i<(4096-320)/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		" movntq %%mm0, (%1)\n"
		" movq 8(%0), %%mm1\n"
		" movntq %%mm1, 8(%1)\n"
		" movq 16(%0), %%mm2\n"
		" movntq %%mm2, 16(%1)\n"
		" movq 24(%0), %%mm3\n"
		" movntq %%mm3, 24(%1)\n"
		" movq 32(%0), %%mm4\n"
		" movntq %%mm4, 32(%1)\n"
		" movq 40(%0), %%mm5\n"
		" movntq %%mm5, 40(%1)\n"
		" movq 48(%0), %%mm6\n"
		" movntq %%mm6, 48(%1)\n"
		" movq 56(%0), %%mm7\n"
		" movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		" jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		" .align 4\n"
		" .long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	/* Last five chunks: same copy without the look-ahead prefetch. */
	for(i=(4096-320)/64; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		" movntq %%mm0, (%1)\n"
		" movq 8(%0), %%mm1\n"
		" movntq %%mm1, 8(%1)\n"
		" movq 16(%0), %%mm2\n"
		" movntq %%mm2, 16(%1)\n"
		" movq 24(%0), %%mm3\n"
		" movntq %%mm3, 24(%1)\n"
		" movq 32(%0), %%mm4\n"
		" movntq %%mm4, 32(%1)\n"
		" movq 40(%0), %%mm5\n"
		" movntq %%mm5, 40(%1)\n"
		" movq 48(%0), %%mm6\n"
		" movntq %%mm6, 48(%1)\n"
		" movq 56(%0), %%mm7\n"
		" movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	/* since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again.
	 */
	__asm__ __volatile__ (
		" sfence \n" : :
	);
	kernel_fpu_end();
}
253
254 #else
255
256 /*
257 * Generic MMX implementation without K7 specific streaming
258 */
259
/*
 * fast_clear_page - zero one 4096-byte page (generic MMX variant).
 *
 * Clears mm0 once and then stores it with movq, 128 bytes per loop
 * iteration, between kernel_fpu_begin() and kernel_fpu_end().
 */
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	/* mm0 = 0; it is the store source for every movq below. */
	__asm__ __volatile__ (
		" pxor %%mm0, %%mm0\n" : :
	);

	for(i=0;i<4096/128;i++)
	{
		__asm__ __volatile__ (
		" movq %%mm0, (%0)\n"
		" movq %%mm0, 8(%0)\n"
		" movq %%mm0, 16(%0)\n"
		" movq %%mm0, 24(%0)\n"
		" movq %%mm0, 32(%0)\n"
		" movq %%mm0, 40(%0)\n"
		" movq %%mm0, 48(%0)\n"
		" movq %%mm0, 56(%0)\n"
		" movq %%mm0, 64(%0)\n"
		" movq %%mm0, 72(%0)\n"
		" movq %%mm0, 80(%0)\n"
		" movq %%mm0, 88(%0)\n"
		" movq %%mm0, 96(%0)\n"
		" movq %%mm0, 104(%0)\n"
		" movq %%mm0, 112(%0)\n"
		" movq %%mm0, 120(%0)\n"
		: : "r" (page) : "memory");
		page+=128;
	}

	kernel_fpu_end();
}
295
/*
 * fast_copy_page - copy one 4096-byte page from @from to @to
 * (generic MMX variant).
 *
 * Each 64-byte chunk is moved with movq through mm0..mm3 in two halves.
 * The whole copy runs between kernel_fpu_begin() and kernel_fpu_end().
 *
 * The look-ahead prefetch is only issued for the first (4096-320)/64
 * chunks, so that no prefetch address lies past the end of the source
 * page; the final five chunks are copied without it.
 */
static void fast_copy_page(void *to, void *from)
{
	int i;


	kernel_fpu_begin();

	/*
	 * Prefetch hints for source offsets 0..256.  Label 1 is paired
	 * with the fixup at label 3 in __ex_table; the fixup overwrites the
	 * first prefetch with EB 1A (short jmp over the remaining 26 bytes)
	 * and continues at label 2.
	 * NOTE(review): depends on the kernel's exception-table handling
	 * and on exact instruction lengths - do not reorder.
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		" prefetch 64(%0)\n"
		" prefetch 128(%0)\n"
		" prefetch 192(%0)\n"
		" prefetch 256(%0)\n"
		"2: \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		" jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		" .align 4\n"
		" .long 1b, 3b\n"
		".previous"
		: : "r" (from) );

	/* Chunks whose 320-byte look-ahead still falls inside the page. */
	for(i=0; i<(4096-320)/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		" movq 8(%0), %%mm1\n"
		" movq 16(%0), %%mm2\n"
		" movq 24(%0), %%mm3\n"
		" movq %%mm0, (%1)\n"
		" movq %%mm1, 8(%1)\n"
		" movq %%mm2, 16(%1)\n"
		" movq %%mm3, 24(%1)\n"
		" movq 32(%0), %%mm0\n"
		" movq 40(%0), %%mm1\n"
		" movq 48(%0), %%mm2\n"
		" movq 56(%0), %%mm3\n"
		" movq %%mm0, 32(%1)\n"
		" movq %%mm1, 40(%1)\n"
		" movq %%mm2, 48(%1)\n"
		" movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		" jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		" .align 4\n"
		" .long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}

	/* Last five chunks: identical copy, no prefetch beyond the page. */
	for(i=(4096-320)/64; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		" movq (%0), %%mm0\n"
		" movq 8(%0), %%mm1\n"
		" movq 16(%0), %%mm2\n"
		" movq 24(%0), %%mm3\n"
		" movq %%mm0, (%1)\n"
		" movq %%mm1, 8(%1)\n"
		" movq %%mm2, 16(%1)\n"
		" movq %%mm3, 24(%1)\n"
		" movq 32(%0), %%mm0\n"
		" movq 40(%0), %%mm1\n"
		" movq 48(%0), %%mm2\n"
		" movq 56(%0), %%mm3\n"
		" movq %%mm0, 32(%1)\n"
		" movq %%mm1, 40(%1)\n"
		" movq %%mm2, 48(%1)\n"
		" movq %%mm3, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	kernel_fpu_end();
}
354
355
356 #endif
357
358 /*
359 * Favour MMX for page clear and copy.
360 */
361
/*
 * slow_zero_page - zero one 4096-byte page with "rep ; stosl".
 *
 * Stores 1024 zero longwords starting at @page.  d0 and d1 only exist to
 * tell the compiler that the count and destination registers are
 * modified by the string instruction; they are long so that each has the
 * same width as the input tied to it.
 */
static void slow_zero_page(void * page)
{
	long d0, d1;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"
		: "=&c" (d0), "=&D" (d1)
		: "a" (0), "1" (page), "0" (1024L)
		: "memory");
}
372
/*
 * mmx_clear_page - zero one page, using MMX when allowed.
 *
 * Uses slow_zero_page() when in_interrupt() is true and
 * fast_clear_page() otherwise.
 */
void mmx_clear_page(void * page)
{
	if (in_interrupt())
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
380
/*
 * slow_copy_page - copy one 4096-byte page with "rep ; movsl".
 *
 * Moves 1024 longwords from @from to @to.  d0, d1 and d2 only exist to
 * tell the compiler that the count, destination and source registers are
 * modified by the string instruction; they are long so that each has the
 * same width as the input tied to it.
 */
static void slow_copy_page(void *to, void *from)
{
	long d0, d1, d2;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024L), "1" ((long) to), "2" ((long) from)
		: "memory");
}
391
392
/*
 * mmx_copy_page - copy one page from @from to @to, using MMX when allowed.
 *
 * Uses slow_copy_page() when in_interrupt() is true and
 * fast_copy_page() otherwise.
 */
void mmx_copy_page(void *to, void *from)
{
	if (in_interrupt())
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
400