#include <linux/config.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/sched.h>

#include <asm/i387.h>
#include <asm/hardirq.h>


/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQs. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code-neutral filler for the short jump
 *		leal ebx,[ebx] is apparently best for K6-2, but Cyrix ??
 *	We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution tried.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */

void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (in_interrupt())
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin();

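	/*
	 * The MMX registers alias the x87 FPU state, so that state has
	 * to be saved (kernel_fpu_begin()) before %mm0-%mm3 are
	 * clobbered below; this is also why in_interrupt() falls back
	 * to plain __memcpy() above.  The asm block that follows warms
	 * the cache by prefetching the start of the source (offsets 0
	 * through 256) before the copy loop begins.
	 */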
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from) );
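	/*
	 * A note on the .fixup/__ex_table pairs used above and in the
	 * loops below: if the prefetch at label 1 faults, the exception
	 * table entry points the fault handler at label 3, which
	 * overwrites the start of the prefetch sequence with a two-byte
	 * short jmp (opcode 0xEB plus an 8-bit displacement, stored
	 * little-endian: 0x1AEB skips the remaining 26 bytes, 0x05EB
	 * skips 5) and then resumes at label 2.  The prefetches stay
	 * patched out from then on, on CPUs where they trap.
	 *
	 * The first loop below copies 64 bytes per iteration through
	 * %mm0-%mm3, keeping the prefetch 320 bytes (five blocks)
	 * ahead; it stops while up to five blocks remain so the
	 * prefetch never reads past the end of the source.
	 */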


	for(; i>5; i--)
	{
		__asm__ __volatile__ (
		"1:  prefetch 320(%0)\n"
		"2:  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}

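	/*
	 * Copy the remaining (at most five) 64-byte blocks without
	 * prefetching, so we do not prefetch beyond the source data.
	 */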
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
		"  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	/*
	 *	Now do the tail of the block
	 */
	__memcpy(to, from, len&63);
	kernel_fpu_end();
	return p;
}

#ifdef CONFIG_MK7

/*
 *	The K7 has streaming cache-bypass load/store instructions. The
 *	Cyrix III, K6 and other MMX-using processors do not.
 */

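/*
 * The code in this branch uses movntq, a non-temporal store that
 * writes straight to memory without allocating the data in the cache,
 * plus sfence to order those stores.  CPUs that lack movntq get the
 * plain movq versions after the #else below.
 */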
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

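	/*
	 * %mm0 is now all zeroes; stream it out with non-temporal
	 * stores, 64 bytes per iteration (4096/64 = 64 iterations
	 * for the page).
	 */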
	for(i=0;i<4096/64;i++)
	{
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page+=64;
	}
	/* Since movntq is weakly ordered, an "sfence" is needed to make
	 * the stores ordered again.
	 */
	__asm__ __volatile__ (
		"  sfence \n" : :
	);
	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/* maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from) );

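	/*
	 * Main loop: 64 bytes per iteration, loading with movq and
	 * storing with movntq so the destination page does not displace
	 * useful cache lines; the prefetch stays 320 bytes ahead.
	 * (4096-320)/64 = 59 iterations here, leaving the last 320
	 * bytes for the non-prefetching loop that follows.
	 */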
	for(i=0; i<(4096-320)/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
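	/*
	 * Copy the final 320 bytes (five blocks) the same way, but
	 * without prefetching past the end of the source page.
	 */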
	for(i=(4096-320)/64; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	/* Since movntq is weakly ordered, an "sfence" is needed to make
	 * the stores ordered again.
	 */
	__asm__ __volatile__ (
		"  sfence \n" : :
	);
	kernel_fpu_end();
}

#else

/*
 *	Generic MMX implementation without the K7-specific streaming stores
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

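	/*
	 * Plain movq stores go through the cache here; clear 128 bytes
	 * per iteration (4096/128 = 32 iterations for the page).
	 */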
	for(i=0;i<4096/128;i++)
	{
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
		: : "r" (page) : "memory");
		page+=128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from) );

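	/*
	 * 4096/64 = 64 iterations of 64 bytes, with the prefetch kept
	 * 320 bytes ahead.  The last few prefetches reach past the end
	 * of the source page; prefetch is only a hint and normally does
	 * not fault, and the fixup below handles CPUs where it does.
	 */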
	for(i=0; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	kernel_fpu_end();
}


#endif

/*
 *	Favour MMX for page clear and copy.
 */

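/*
 * Interrupt-context fallback: a plain "rep ; stosl" of 1024 dwords
 * (4096 bytes) that never touches the FPU/MMX state, used by
 * mmx_clear_page() below when called from an interrupt.
 */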
static void slow_zero_page(void * page)
{
	int d0, d1;
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"
		: "=&c" (d0), "=&D" (d1)
		: "a" (0), "1" (page), "0" (1024)
		: "memory");
}

void mmx_clear_page(void * page)
{
	if(in_interrupt())
		slow_zero_page(page);
	else
		fast_clear_page(page);
}

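/*
 * "rep ; movsl" of 1024 dwords (4096 bytes): the interrupt-safe
 * fallback used by mmx_copy_page() below.
 */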
static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}


void mmx_copy_page(void *to, void *from)
{
	if(in_interrupt())
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}