/* memcpy with SSSE3 and REP string.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
#include <sysdep.h>

/* Build this implementation only inside libc, and only for a shared
   build, a memmove build, or a build without multiarch selection.  */
#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

#include "asm-syntax.h"

/* Default entry-point names, used only when MEMCPY has not already been
   defined before this point.  */
#ifndef MEMCPY
# define MEMCPY		__memcpy_ssse3_rep
# define MEMCPY_CHK	__memcpy_chk_ssse3_rep
#endif

/* Offsets from ESP of the three stack arguments.  PARMS is the offset of
   the first argument; each following argument is 4 bytes further up.  */
#define DEST		PARMS
#define SRC		DEST+4
#define LEN		SRC+4
36
/* Unwind annotations for a 4-byte push of REG: the CFA moves 4 bytes
   further from ESP and REG is recorded as saved at offset 0 from ESP.  */
#define CFI_PUSH(REG)						\
  cfi_adjust_cfa_offset (4);					\
  cfi_rel_offset (REG, 0)

/* Unwind annotations for the matching 4-byte pop: the CFA moves back and
   REG is marked as holding its original value again.  */
#define CFI_POP(REG)						\
  cfi_adjust_cfa_offset (-4);					\
  cfi_restore (REG)

/* Push/pop REG together with the unwind annotations above.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)
47
/* Entry/exit sequences and jump-table dispatch, in two flavours.

   PIC:     EBX is pushed on entry (so the first argument is at 8(%esp))
	    and popped before every ret.  RETURN re-adds the push
	    annotation after the ret so that code placed after it is still
	    described as having EBX on the stack; RETURN_END does not.
	    Jump-table entries are stored as offsets relative to the table.
   non-PIC: nothing is pushed (first argument at 4(%esp)) and jump-table
	    entries are absolute addresses.  */
#ifdef PIC
# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register that contains
   the index into the jump table.  SCALE is the scale of INDEX.  On
   arrival at the target, EBX holds the target's own address.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    /* We first load PC into EBX.  */				\
    SETUP_PIC_REG(bx);						\
    /* Get the address of the jump table.  */			\
    addl	$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    addl	(%ebx,INDEX,SCALE), %ebx;			\
    /* We loaded the jump table.  Go.  */			\
    _CET_NOTRACK jmp *%ebx

/* Split form of the dispatch.  This first half adds (TABLE - .) to EBX,
   so it yields the address of TABLE only when EBX already holds the
   address of the instruction it expands to.  */
# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
    addl	$(TABLE - .), %ebx

/* Second half of the split dispatch: EBX must hold the address of the
   jump table; branch to entry INDEX*SCALE of it.  */
# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
    addl	(%ebx,INDEX,SCALE), %ebx;			\
    /* We loaded the jump table.  Go.  */			\
    _CET_NOTRACK jmp *%ebx
#else
# define PARMS		4
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register that contains the index into
   the jump table.  SCALE is the scale of INDEX.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    _CET_NOTRACK jmp *TABLE(,INDEX,SCALE)

/* Nothing to precompute when the table holds absolute addresses.  */
# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)

# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
    _CET_NOTRACK jmp *TABLE(,INDEX,SCALE)
#endif
94
/* Checked entry point, built for SHARED only.  Takes the same three stack
   arguments as MEMCPY plus a fourth word at 16(%esp), and jumps to
   __chk_fail when that fourth word is below the length at 12(%esp)
   (unsigned compare).  There is no ret: otherwise execution runs on into
   MEMCPY, which is emitted immediately after.  */
	.section .text.ssse3,"ax",@progbits
#ifdef SHARED
ENTRY (MEMCPY_CHK)
	movl	12(%esp), %eax		/* EAX = length argument.  */
	cmpl	%eax, 16(%esp)		/* Fourth argument - length.  */
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif
/* Entry point.  Stack arguments: DEST, SRC, LEN.
   Register roles from here on:
     ECX = byte count, EAX = source pointer, EDX = destination pointer.
   Counts of 48 bytes or more go to L(48bytesormore); smaller counts are
   dispatched through a jump table indexed by the count.  */
ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

#ifdef USE_AS_MEMMOVE
	/* memmove: choose the copy direction.  */
	cmp	%eax, %edx
	jb	L(copy_forward)		/* dst below src.  */
	je	L(fwd_write_0bytes)	/* dst == src: just return.  */
	cmp	$48, %ecx
	jb	L(bk_write_less48bytes)
	add	%ecx, %eax		/* EAX = src + len.  */
	cmp	%eax, %edx
	movl	SRC(%esp), %eax		/* Reload src; mov leaves the flags alone.  */
	jb	L(copy_backward)	/* dst below src + len: regions overlap.  */

L(copy_forward):
#endif
	cmp	$48, %ecx
	jae	L(48bytesormore)

L(fwd_write_less32bytes):
#ifndef USE_AS_MEMMOVE
	/* Compare only the low bytes of the two pointers; take the
	   backward table when the source's low byte is the smaller.  */
	cmp	%dl, %al
	jb	L(bk_write)
#endif
	/* The forward tail code addresses backwards from the end of each
	   buffer, so advance both pointers by the count first.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
#ifndef USE_AS_MEMMOVE
L(bk_write):
	/* The backward tail code addresses forwards from the start of each
	   buffer, so the pointers are left unchanged.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
#endif
137
/* Forward copy of 48 bytes or more (reached with ECX >= 48).
   Saves the first 16 source bytes in XMM0, then rebases everything onto
   the first 16-byte boundary strictly above the destination:
     ESI = original destination (XMM0 is stored there later),
     EDX = (dst & -16) + 16,
     EAX = source address that corresponds to EDX,
     ECX = bytes left to copy from EDX onwards.
   EDI and ESI are pushed here and stay on the stack for the code that
   follows.  Then selects L(large_page), L(shl_0) or an L(shl_table)
   entry.  */
	ALIGN (4)
L(48bytesormore):
	movdqu	(%eax), %xmm0
	PUSH (%edi)
	movl	%edx, %edi
	and	$-16, %edx
	PUSH (%esi)
	cfi_remember_state
	add	$16, %edx
	movl	%edi, %esi
	sub	%edx, %edi		/* EDI = dst - aligned dst, i.e. -(1..16).  */
	add	%edi, %ecx
	sub	%edi, %eax

	/* Compare the remaining count with half the shared cache size.  */
#ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
#else
# ifdef PIC
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_shared_cache_size_half, %ecx
# endif
#endif

	mov	%eax, %edi		/* mov keeps the flags of the cmp above.  */
	jae	L(large_page)
	and	$0xf, %edi		/* EDI = source offset within 16 bytes.  */
	jz	L(shl_0)

	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
171
/* Forward copy when source and destination are both 16-byte aligned
   (taken when EAX & 15 is zero).  Stores the saved head (XMM0) at the
   original destination in ESI, then, for counts up to 127, copies 32
   bytes per step with aligned loads and stores.  Larger counts go to
   L(shl_0_gobble).  ECX is biased by -32 so that the borrow of each
   "sub $32" signals that fewer than 32 bytes remain after the step; the
   movdqa and lea instructions in between do not touch the flags.  */
	ALIGN (4)
L(shl_0):
	movdqu	%xmm0, (%esi)
	xor	%edi, %edi		/* EDI = running offset.  */
	cmp	$127, %ecx
	ja	L(shl_0_gobble)
	lea	-32(%ecx), %ecx
L(shl_0_loop):
	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
L(shl_0_end):
	lea	32(%ecx), %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	add	%edi, %eax		/* EAX = end of source.  */
	POP (%esi)
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
218
/* Aligned forward copy of more than 127 bytes.
   Computes a threshold of 7/8 of the data-cache half size in EDI (ESI is
   scratch).  Counts at or above it go to L(shl_0_gobble_mem_start);
   smaller counts are copied here, 128 bytes per iteration with aligned
   loads and stores, followed by optional 64-, 32- and 16-byte steps and a
   jump-table tail for the last (fewer than 16) bytes.

   The compile-time DATA_CACHE_SIZE_HALF branch loads the constant into
   EDI, exactly like the run-time branches load the variable, so that the
   threshold arithmetic below works on the cache size in every
   configuration.  */
	cfi_restore_state
	cfi_remember_state
L(shl_0_gobble):

#ifdef DATA_CACHE_SIZE_HALF
	mov	$DATA_CACHE_SIZE_HALF, %edi
#else
# ifdef PIC
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	mov	__x86_data_cache_size_half@GOTOFF(%ebx), %edi
# else
	mov	__x86_data_cache_size_half, %edi
# endif
#endif
	mov	%edi, %esi
	shr	$3, %esi
	sub	%esi, %edi		/* EDI = size - size / 8.  */
	cmp	%edi, %ecx
	jae	L(shl_0_gobble_mem_start)
	sub	$128, %ecx		/* Bias the count for the loop test.  */
	ALIGN (4)
L(shl_0_gobble_cache_loop):
	movdqa	(%eax), %xmm0
	movaps	0x10(%eax), %xmm1
	movaps	0x20(%eax), %xmm2
	movaps	0x30(%eax), %xmm3
	movaps	0x40(%eax), %xmm4
	movaps	0x50(%eax), %xmm5
	movaps	0x60(%eax), %xmm6
	movaps	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$128, %ecx		/* Sets the flags tested by the jae below.  */
	movdqa	%xmm0, (%edx)
	movaps	%xmm1, 0x10(%edx)
	movaps	%xmm2, 0x20(%edx)
	movaps	%xmm3, 0x30(%edx)
	movaps	%xmm4, 0x40(%edx)
	movaps	%xmm5, 0x50(%edx)
	movaps	%xmm6, 0x60(%edx)
	movaps	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	jae	L(shl_0_gobble_cache_loop)
	add	$0x80, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	cmp	$0x40, %ecx
	jb	L(shl_0_cache_less_64bytes)

	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1

	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)

	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax

	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx
L(shl_0_cache_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_cache_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx
L(shl_0_cache_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_cache_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx
L(shl_0_cache_less_16bytes):
	add	%ecx, %edx		/* EDX = end of destination.  */
	add	%ecx, %eax		/* EAX = end of source.  */
	POP (%esi)
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
305
/* Aligned forward copy for counts at or above the data-cache threshold.
   When the low bytes of the source and destination pointers are equal
   the copy is handed to L(copy_page_by_rep).  Otherwise 128 bytes are
   copied per iteration with aligned loads and stores, with prefetchnta
   hints issued 0x1c0 and 0x280 bytes ahead on both buffers, followed by
   optional 64-, 32- and 16-byte steps and a jump-table tail for the last
   (fewer than 16) bytes.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_0_gobble_mem_start):
	cmp	%al, %dl
	je	L(copy_page_by_rep)
	sub	$128, %ecx		/* Bias the count for the loop test.  */
L(shl_0_gobble_mem_loop):
	prefetchnta 0x1c0(%eax)
	prefetchnta 0x280(%eax)
	prefetchnta 0x1c0(%edx)
	prefetchnta 0x280(%edx)

	movdqa	(%eax), %xmm0
	movaps	0x10(%eax), %xmm1
	movaps	0x20(%eax), %xmm2
	movaps	0x30(%eax), %xmm3
	movaps	0x40(%eax), %xmm4
	movaps	0x50(%eax), %xmm5
	movaps	0x60(%eax), %xmm6
	movaps	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$0x80, %ecx		/* Sets the flags tested by the jae below.  */
	movdqa	%xmm0, (%edx)
	movaps	%xmm1, 0x10(%edx)
	movaps	%xmm2, 0x20(%edx)
	movaps	%xmm3, 0x30(%edx)
	movaps	%xmm4, 0x40(%edx)
	movaps	%xmm5, 0x50(%edx)
	movaps	%xmm6, 0x60(%edx)
	movaps	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	jae	L(shl_0_gobble_mem_loop)
	add	$0x80, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	cmp	$0x40, %ecx
	jb	L(shl_0_mem_less_64bytes)

	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1

	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)

	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax

	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx
L(shl_0_mem_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_mem_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx
L(shl_0_mem_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_mem_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx
L(shl_0_mem_less_16bytes):
	add	%ecx, %edx		/* EDX = end of destination.  */
	add	%ecx, %eax		/* EAX = end of source.  */
	POP (%esi)
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
382
/* Forward copy for a source lying 1 byte past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $1 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_1):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$1, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_1_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$1, %xmm2, %xmm3
	palignr	$1, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_1_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$1, %xmm2, %xmm3
	palignr	$1, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_1_loop)

L(shl_1_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	1(%edi, %eax), %eax	/* EAX = end of source, re-adding the 1.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
427
/* Forward copy for a source lying 2 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $2 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_2):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$2, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_2_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$2, %xmm2, %xmm3
	palignr	$2, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_2_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$2, %xmm2, %xmm3
	palignr	$2, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_2_loop)

L(shl_2_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	2(%edi, %eax), %eax	/* EAX = end of source, re-adding the 2.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
472
/* Forward copy for a source lying 3 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $3 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_3):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$3, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_3_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$3, %xmm2, %xmm3
	palignr	$3, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_3_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$3, %xmm2, %xmm3
	palignr	$3, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_3_loop)

L(shl_3_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	3(%edi, %eax), %eax	/* EAX = end of source, re-adding the 3.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
517
/* Forward copy for a source lying 4 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $4 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_4):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$4, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_4_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$4, %xmm2, %xmm3
	palignr	$4, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_4_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$4, %xmm2, %xmm3
	palignr	$4, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_4_loop)

L(shl_4_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	4(%edi, %eax), %eax	/* EAX = end of source, re-adding the 4.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
562
/* Forward copy for a source lying 5 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $5 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_5):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$5, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_5_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$5, %xmm2, %xmm3
	palignr	$5, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_5_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$5, %xmm2, %xmm3
	palignr	$5, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_5_loop)

L(shl_5_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	5(%edi, %eax), %eax	/* EAX = end of source, re-adding the 5.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
607
/* Forward copy for a source lying 6 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $6 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_6):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$6, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_6_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_6_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_6_loop)

L(shl_6_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	6(%edi, %eax), %eax	/* EAX = end of source, re-adding the 6.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
652
/* Forward copy for a source lying 7 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $7 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_7):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$7, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_7_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_7_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_7_loop)

L(shl_7_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	7(%edi, %eax), %eax	/* EAX = end of source, re-adding the 7.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
697
/* Forward copy for a source lying 8 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $8 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_8):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$8, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_8_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_8_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_8_loop)

L(shl_8_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	8(%edi, %eax), %eax	/* EAX = end of source, re-adding the 8.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
742
/* Forward copy for a source lying 9 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $9 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_9):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$9, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_9_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_9_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_9_loop)

L(shl_9_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	9(%edi, %eax), %eax	/* EAX = end of source, re-adding the 9.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
787
/* Forward copy for a source lying 10 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $10 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_10):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$10, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_10_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_10_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_10_loop)

L(shl_10_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	10(%edi, %eax), %eax	/* EAX = end of source, re-adding the 10.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
832
/* Forward copy for a source lying 11 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $11 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_11):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$11, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_11_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_11_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_11_loop)

L(shl_11_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	11(%edi, %eax), %eax	/* EAX = end of source, re-adding the 11.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
877
/* Forward copy for a source lying 12 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $12 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_12):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$12, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_12_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_12_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_12_loop)

L(shl_12_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	12(%edi, %eax), %eax	/* EAX = end of source, re-adding the 12.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
922
/* Forward copy for a source lying 13 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $13 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_13):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$13, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_13_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_13_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_13_loop)

L(shl_13_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	13(%edi, %eax), %eax	/* EAX = end of source, re-adding the 13.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
967
/* Forward copy for a source lying 14 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $14 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_14):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$14, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_14_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_14_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_14_loop)

L(shl_14_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	14(%edi, %eax), %eax	/* EAX = end of source, re-adding the 14.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1012
/* Forward copy for a source lying 15 bytes past a 16-byte boundary while
   the destination in EDX is 16-byte aligned.  Every step loads two
   aligned 16-byte source blocks and uses palignr $15 to splice each with
   the block before it, giving two aligned 16-byte stores.
   On entry: ESI = original destination, XMM0 = first 16 source bytes,
   ECX = bytes left from EDX on, ESI and EDI still saved on the stack.
   Under PIC, EBX is expected to hold this label's address, as left by
   the jump-table dispatch.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(shl_15):
	/* PIC only: make EBX the tail table's address for the final jump.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$15, %eax		/* Back up to the aligned source block.  */
	movaps	(%eax), %xmm1		/* XMM1 = "previous" block for the first splice.  */
	xor	%edi, %edi		/* EDI = running offset.  */
	sub	$32, %ecx		/* Bias the count for the loop tests.  */
	movdqu	%xmm0, (%esi)		/* Store the unaligned head.  */
	POP (%esi)
L(shl_15_loop):

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* Keep the unshifted block for the next half.  */
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(shl_15_end)		/* Borrow from the sub above.  */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* Keep the unshifted block for the next half.  */
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_15_loop)

L(shl_15_end):
	add	$32, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = end of destination.  */
	lea	15(%edi, %eax), %eax	/* EAX = end of source, re-adding the 15.  */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1057
1058
/* Forward tails for byte counts that are multiples of 4 (44 down to 0).
   EAX and EDX point one past the last source/destination byte.  Each
   label copies one 4-byte word at a negative offset and falls through to
   the next smaller count.  ECX is used as scratch.  Returns the end of
   the destination (EDX) for mempcpy, otherwise the DEST argument.  */
	ALIGN (4)
L(fwd_write_44bytes):
	movl	-44(%eax), %ecx
	movl	%ecx, -44(%edx)
L(fwd_write_40bytes):
	movl	-40(%eax), %ecx
	movl	%ecx, -40(%edx)
L(fwd_write_36bytes):
	movl	-36(%eax), %ecx
	movl	%ecx, -36(%edx)
L(fwd_write_32bytes):
	movl	-32(%eax), %ecx
	movl	%ecx, -32(%edx)
L(fwd_write_28bytes):
	movl	-28(%eax), %ecx
	movl	%ecx, -28(%edx)
L(fwd_write_24bytes):
	movl	-24(%eax), %ecx
	movl	%ecx, -24(%edx)
L(fwd_write_20bytes):
	movl	-20(%eax), %ecx
	movl	%ecx, -20(%edx)
L(fwd_write_16bytes):
	movl	-16(%eax), %ecx
	movl	%ecx, -16(%edx)
L(fwd_write_12bytes):
	movl	-12(%eax), %ecx
	movl	%ecx, -12(%edx)
L(fwd_write_8bytes):
	movl	-8(%eax), %ecx
	movl	%ecx, -8(%edx)
L(fwd_write_4bytes):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
L(fwd_write_0bytes):
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN
1100
/* Forward tail for exactly 5 bytes: two overlapping 4-byte words, at
   offsets -5 and -4 from the end pointers in EAX/EDX.  Both loads are
   done before either store; EAX doubles as scratch for the second
   word.  */
	ALIGN (4)
L(fwd_write_5bytes):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN
1113
/* Forward tails for byte counts of the form 4k+1 (45 down to 9, and 1).
   EAX and EDX point one past the last source/destination byte.  Each
   label copies one 4-byte word at a negative offset and falls through.
   The 9-byte step copies the words at -9 and -5, leaving the final byte
   to the 1-byte step.  ECX is used as scratch.  */
	ALIGN (4)
L(fwd_write_45bytes):
	movl	-45(%eax), %ecx
	movl	%ecx, -45(%edx)
L(fwd_write_41bytes):
	movl	-41(%eax), %ecx
	movl	%ecx, -41(%edx)
L(fwd_write_37bytes):
	movl	-37(%eax), %ecx
	movl	%ecx, -37(%edx)
L(fwd_write_33bytes):
	movl	-33(%eax), %ecx
	movl	%ecx, -33(%edx)
L(fwd_write_29bytes):
	movl	-29(%eax), %ecx
	movl	%ecx, -29(%edx)
L(fwd_write_25bytes):
	movl	-25(%eax), %ecx
	movl	%ecx, -25(%edx)
L(fwd_write_21bytes):
	movl	-21(%eax), %ecx
	movl	%ecx, -21(%edx)
L(fwd_write_17bytes):
	movl	-17(%eax), %ecx
	movl	%ecx, -17(%edx)
L(fwd_write_13bytes):
	movl	-13(%eax), %ecx
	movl	%ecx, -13(%edx)
L(fwd_write_9bytes):
	movl	-9(%eax), %ecx
	movl	%ecx, -9(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
L(fwd_write_1bytes):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN
1156
/* Forward tails for byte counts of the form 4k+2 (46 down to 2).
   EAX and EDX point one past the last source/destination byte.  Each
   label copies one 4-byte word at a negative offset and falls through;
   the last step copies the final 2 bytes.  ECX is used as scratch.  */
	ALIGN (4)
L(fwd_write_46bytes):
	movl	-46(%eax), %ecx
	movl	%ecx, -46(%edx)
L(fwd_write_42bytes):
	movl	-42(%eax), %ecx
	movl	%ecx, -42(%edx)
L(fwd_write_38bytes):
	movl	-38(%eax), %ecx
	movl	%ecx, -38(%edx)
L(fwd_write_34bytes):
	movl	-34(%eax), %ecx
	movl	%ecx, -34(%edx)
L(fwd_write_30bytes):
	movl	-30(%eax), %ecx
	movl	%ecx, -30(%edx)
L(fwd_write_26bytes):
	movl	-26(%eax), %ecx
	movl	%ecx, -26(%edx)
L(fwd_write_22bytes):
	movl	-22(%eax), %ecx
	movl	%ecx, -22(%edx)
L(fwd_write_18bytes):
	movl	-18(%eax), %ecx
	movl	%ecx, -18(%edx)
L(fwd_write_14bytes):
	movl	-14(%eax), %ecx
	movl	%ecx, -14(%edx)
L(fwd_write_10bytes):
	movl	-10(%eax), %ecx
	movl	%ecx, -10(%edx)
L(fwd_write_6bytes):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
L(fwd_write_2bytes):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN
1200
/* Forward tails for byte counts of the form 4k+3 (47 down to 3).
   EAX and EDX point one past the last source/destination byte.  Each
   label copies one 4-byte word at a negative offset and falls through;
   the last step copies 2 bytes at -3 and 1 byte at -1, loading both
   before storing (EAX doubles as scratch for the byte).
   This return uses RETURN_END, so under PIC it does not re-add the push
   annotation for EBX: the code that follows sets its own unwind state
   with cfi_restore_state.  */
	ALIGN (4)
L(fwd_write_47bytes):
	movl	-47(%eax), %ecx
	movl	%ecx, -47(%edx)
L(fwd_write_43bytes):
	movl	-43(%eax), %ecx
	movl	%ecx, -43(%edx)
L(fwd_write_39bytes):
	movl	-39(%eax), %ecx
	movl	%ecx, -39(%edx)
L(fwd_write_35bytes):
	movl	-35(%eax), %ecx
	movl	%ecx, -35(%edx)
L(fwd_write_31bytes):
	movl	-31(%eax), %ecx
	movl	%ecx, -31(%edx)
L(fwd_write_27bytes):
	movl	-27(%eax), %ecx
	movl	%ecx, -27(%edx)
L(fwd_write_23bytes):
	movl	-23(%eax), %ecx
	movl	%ecx, -23(%edx)
L(fwd_write_19bytes):
	movl	-19(%eax), %ecx
	movl	%ecx, -19(%edx)
L(fwd_write_15bytes):
	movl	-15(%eax), %ecx
	movl	%ecx, -15(%edx)
L(fwd_write_11bytes):
	movl	-11(%eax), %ecx
	movl	%ecx, -11(%edx)
L(fwd_write_7bytes):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
L(fwd_write_3bytes):
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN_END
1246
/* Forward copy for counts at or above half the shared cache size.
   Stores the saved head (XMM0) at the original destination in ESI and
   the next 16 bytes at the aligned destination with a non-temporal
   store.  If the low bytes of the advanced source and destination
   pointers are then equal, the rest is handed to L(copy_page_by_rep).
   Otherwise ESI/EDI are popped and 128 bytes are copied per iteration
   with unaligned loads and non-temporal stores, followed by optional
   64- and 32-byte steps, an sfence, and a jump-table tail for the last
   (fewer than 32) bytes.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(large_page):
	movdqu	(%eax), %xmm1
	movdqu	%xmm0, (%esi)
	movntdq	%xmm1, (%edx)
	add	$0x10, %eax
	add	$0x10, %edx
	sub	$0x10, %ecx
	cmp	%al, %dl
	je	L(copy_page_by_rep)
L(large_page_loop_init):
	POP (%esi)
	sub	$0x80, %ecx		/* Bias the count for the loop test.  */
	POP (%edi)
L(large_page_loop):
	prefetchnta	0x1c0(%eax)
	prefetchnta	0x280(%eax)
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	movdqu	0x40(%eax), %xmm4
	movdqu	0x50(%eax), %xmm5
	movdqu	0x60(%eax), %xmm6
	movdqu	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	lfence
	sub	$0x80, %ecx		/* Sets the flags tested by the jae below.  */
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	movntdq	%xmm4, 0x40(%edx)
	movntdq	%xmm5, 0x50(%edx)
	movntdq	%xmm6, 0x60(%edx)
	movntdq	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx
	jae	L(large_page_loop)
	add	$0x80, %ecx		/* Undo the bias: ECX = leftover bytes.  */
	cmp	$0x40, %ecx
	jb	L(large_page_less_64bytes)

	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	lea	0x40(%eax), %eax

	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	lea	0x40(%edx), %edx
	sub	$0x40, %ecx
L(large_page_less_64bytes):
	cmp	$32, %ecx
	jb	L(large_page_less_32bytes)
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	lea	0x20(%eax), %eax
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	lea	0x20(%edx), %edx
	sub	$0x20, %ecx
L(large_page_less_32bytes):
	add	%ecx, %edx		/* EDX = end of destination.  */
	add	%ecx, %eax		/* EAX = end of source.  */
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
1318
/* Forward copy with "rep movsl".
   On entry: EAX = source, EDX = destination, ECX = byte count, and the
   caller's ESI/EDI are still saved on the stack.  Copies ECX / 4 dwords
   with the string instruction, then the 0-3 leftover bytes by hand.
   Returns DEST (DEST + LEN for mempcpy).  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(copy_page_by_rep):
	mov	%eax, %esi
	mov	%edx, %edi
	mov	%ecx, %edx
	shr	$2, %ecx		/* ECX = dword count.  */
	and	$3, %edx		/* EDX = leftover bytes; sets ZF.  */
	rep	movsl			/* Leaves the flags of the and alone.  */
	jz	L(copy_page_by_rep_exit)
	cmp	$2, %edx
	jb	L(copy_page_by_rep_left_1)
	movzwl	(%esi), %eax
	movw	%ax, (%edi)
	add	$2, %esi
	add	$2, %edi
	sub	$2, %edx
	jz	L(copy_page_by_rep_exit)
L(copy_page_by_rep_left_1):
	movzbl	(%esi), %eax
	movb	%al, (%edi)
L(copy_page_by_rep_exit):
	POP (%esi)
	POP (%edi)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN
1350
1351	ALIGN (4)
/* Jump-table targets of L(table_48_bytes_bwd) for remaining counts
   that are multiples of 4 (0-44).  %eax points at the lowest source
   byte still to be copied and %edx at the matching destination byte.
   Each entry copies the highest outstanding dword through %ecx and
   falls through to the entry for four bytes fewer, so the data is
   written from high addresses to low.  The chain ends by loading the
   return value: DEST, or DEST + LEN when built as mempcpy.  */
1352L(bk_write_44bytes):
1353	movl	40(%eax), %ecx
1354	movl	%ecx, 40(%edx)
1355L(bk_write_40bytes):
1356	movl	36(%eax), %ecx
1357	movl	%ecx, 36(%edx)
1358L(bk_write_36bytes):
1359	movl	32(%eax), %ecx
1360	movl	%ecx, 32(%edx)
1361L(bk_write_32bytes):
1362	movl	28(%eax), %ecx
1363	movl	%ecx, 28(%edx)
1364L(bk_write_28bytes):
1365	movl	24(%eax), %ecx
1366	movl	%ecx, 24(%edx)
1367L(bk_write_24bytes):
1368	movl	20(%eax), %ecx
1369	movl	%ecx, 20(%edx)
1370L(bk_write_20bytes):
1371	movl	16(%eax), %ecx
1372	movl	%ecx, 16(%edx)
1373L(bk_write_16bytes):
1374	movl	12(%eax), %ecx
1375	movl	%ecx, 12(%edx)
1376L(bk_write_12bytes):
1377	movl	8(%eax), %ecx
1378	movl	%ecx, 8(%edx)
1379L(bk_write_8bytes):
1380	movl	4(%eax), %ecx
1381	movl	%ecx, 4(%edx)
1382L(bk_write_4bytes):
1383	movl	(%eax), %ecx
1384	movl	%ecx, (%edx)
1385L(bk_write_0bytes):
	/* Nothing left to copy: load the return value and return.  */
1386	movl	DEST(%esp), %eax
1387#ifdef USE_AS_MEMPCPY
1388	movl	LEN(%esp), %ecx
1389	add	%ecx, %eax
1390#endif
1391	RETURN
1392
1393	ALIGN (4)
/* Jump-table targets of L(table_48_bytes_bwd) for remaining counts of
   the form 4k + 1 (1-45).  %eax points at the lowest source byte still
   to be copied and %edx at the matching destination byte.  Each entry
   copies the highest outstanding dword through %ecx and falls through
   to the entry for four bytes fewer; the last entry copies the single
   byte at offset 0 and returns DEST (DEST + LEN when built as
   mempcpy).  */
1394L(bk_write_45bytes):
1395	movl	41(%eax), %ecx
1396	movl	%ecx, 41(%edx)
1397L(bk_write_41bytes):
1398	movl	37(%eax), %ecx
1399	movl	%ecx, 37(%edx)
1400L(bk_write_37bytes):
1401	movl	33(%eax), %ecx
1402	movl	%ecx, 33(%edx)
1403L(bk_write_33bytes):
1404	movl	29(%eax), %ecx
1405	movl	%ecx, 29(%edx)
1406L(bk_write_29bytes):
1407	movl	25(%eax), %ecx
1408	movl	%ecx, 25(%edx)
1409L(bk_write_25bytes):
1410	movl	21(%eax), %ecx
1411	movl	%ecx, 21(%edx)
1412L(bk_write_21bytes):
1413	movl	17(%eax), %ecx
1414	movl	%ecx, 17(%edx)
1415L(bk_write_17bytes):
1416	movl	13(%eax), %ecx
1417	movl	%ecx, 13(%edx)
1418L(bk_write_13bytes):
1419	movl	9(%eax), %ecx
1420	movl	%ecx, 9(%edx)
1421L(bk_write_9bytes):
1422	movl	5(%eax), %ecx
1423	movl	%ecx, 5(%edx)
1424L(bk_write_5bytes):
1425	movl	1(%eax), %ecx
1426	movl	%ecx, 1(%edx)
1427L(bk_write_1bytes):
	/* Final byte at offset 0.  */
1428	movzbl	(%eax), %ecx
1429	movb	%cl, (%edx)
1430	movl	DEST(%esp), %eax
1431#ifdef USE_AS_MEMPCPY
1432	movl	LEN(%esp), %ecx
1433	add	%ecx, %eax
1434#endif
1435	RETURN
1436
1437	ALIGN (4)
/* Jump-table targets of L(table_48_bytes_bwd) for remaining counts of
   the form 4k + 2 (2-46).  %eax points at the lowest source byte still
   to be copied and %edx at the matching destination byte.  Each entry
   copies the highest outstanding dword through %ecx and falls through
   to the entry for four bytes fewer; the last entry copies the word at
   offset 0 and returns DEST (DEST + LEN when built as mempcpy).  */
1438L(bk_write_46bytes):
1439	movl	42(%eax), %ecx
1440	movl	%ecx, 42(%edx)
1441L(bk_write_42bytes):
1442	movl	38(%eax), %ecx
1443	movl	%ecx, 38(%edx)
1444L(bk_write_38bytes):
1445	movl	34(%eax), %ecx
1446	movl	%ecx, 34(%edx)
1447L(bk_write_34bytes):
1448	movl	30(%eax), %ecx
1449	movl	%ecx, 30(%edx)
1450L(bk_write_30bytes):
1451	movl	26(%eax), %ecx
1452	movl	%ecx, 26(%edx)
1453L(bk_write_26bytes):
1454	movl	22(%eax), %ecx
1455	movl	%ecx, 22(%edx)
1456L(bk_write_22bytes):
1457	movl	18(%eax), %ecx
1458	movl	%ecx, 18(%edx)
1459L(bk_write_18bytes):
1460	movl	14(%eax), %ecx
1461	movl	%ecx, 14(%edx)
1462L(bk_write_14bytes):
1463	movl	10(%eax), %ecx
1464	movl	%ecx, 10(%edx)
1465L(bk_write_10bytes):
1466	movl	6(%eax), %ecx
1467	movl	%ecx, 6(%edx)
1468L(bk_write_6bytes):
1469	movl	2(%eax), %ecx
1470	movl	%ecx, 2(%edx)
1471L(bk_write_2bytes):
	/* Final word at offset 0.  */
1472	movzwl	(%eax), %ecx
1473	movw	%cx, (%edx)
1474	movl	DEST(%esp), %eax
1475#ifdef USE_AS_MEMPCPY
1476	movl	LEN(%esp), %ecx
1477	add	%ecx, %eax
1478#endif
1479	RETURN
1480
1481	ALIGN (4)
/* Jump-table targets of L(table_48_bytes_bwd) for remaining counts of
   the form 4k + 3 (3-47).  %eax points at the lowest source byte still
   to be copied and %edx at the matching destination byte.  Each entry
   copies the highest outstanding dword through %ecx and falls through
   to the entry for four bytes fewer; the last entry copies the final
   three bytes and returns DEST (DEST + LEN when built as mempcpy).  */
1482L(bk_write_47bytes):
1483	movl	43(%eax), %ecx
1484	movl	%ecx, 43(%edx)
1485L(bk_write_43bytes):
1486	movl	39(%eax), %ecx
1487	movl	%ecx, 39(%edx)
1488L(bk_write_39bytes):
1489	movl	35(%eax), %ecx
1490	movl	%ecx, 35(%edx)
1491L(bk_write_35bytes):
1492	movl	31(%eax), %ecx
1493	movl	%ecx, 31(%edx)
1494L(bk_write_31bytes):
1495	movl	27(%eax), %ecx
1496	movl	%ecx, 27(%edx)
1497L(bk_write_27bytes):
1498	movl	23(%eax), %ecx
1499	movl	%ecx, 23(%edx)
1500L(bk_write_23bytes):
1501	movl	19(%eax), %ecx
1502	movl	%ecx, 19(%edx)
1503L(bk_write_19bytes):
1504	movl	15(%eax), %ecx
1505	movl	%ecx, 15(%edx)
1506L(bk_write_15bytes):
1507	movl	11(%eax), %ecx
1508	movl	%ecx, 11(%edx)
1509L(bk_write_11bytes):
1510	movl	7(%eax), %ecx
1511	movl	%ecx, 7(%edx)
1512L(bk_write_7bytes):
1513	movl	3(%eax), %ecx
1514	movl	%ecx, 3(%edx)
1515L(bk_write_3bytes):
	/* Final three bytes: the word at offset 1 is copied first, then
	   the byte at offset 0.  The byte is loaded into %eax itself,
	   which is fine because the source pointer is not used again.  */
1516	movzwl	1(%eax), %ecx
1517	movw	%cx, 1(%edx)
1518	movzbl	(%eax), %eax
1519	movb	%al, (%edx)
1520	movl	DEST(%esp), %eax
1521#ifdef USE_AS_MEMPCPY
1522	movl	LEN(%esp), %ecx
1523	add	%ecx, %eax
1524#endif
#ifdef USE_AS_MEMMOVE
	/* L(copy_backward) follows in the text section and is entered
	   with the entry-time register saves still on the stack.  RETURN
	   re-applies their CFI after the ret (in PIC builds, the pushed
	   %ebx); RETURN_END would leave the unwind info for the code
	   below describing a frame with %ebx already popped.  */
	RETURN
#else
	/* Last code of the function: no CFI needs to be re-established.  */
1525	RETURN_END
#endif
1526
1527
1528	.pushsection .rodata.ssse3,"a",@progbits
/* Jump tables, placed in their own read-only data section.  Every
   entry is a 4-byte .int built with JMPTBL (target, table); in PIC
   builds that is the offset of the target from the start of its table.
   BRANCH_TO_JMPTBL_ENTRY indexes a table with a scale of 4.  */
1529	ALIGN (2)
/* Forward tail handlers, indexed by the number of bytes left (0-47).  */
1530L(table_48bytes_fwd):
1531	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
1532	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
1533	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
1534	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
1535	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
1536	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
1537	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
1538	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
1539	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
1540	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
1541	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
1542	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
1543	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
1544	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
1545	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
1546	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
1547	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
1548	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
1549	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
1550	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
1551	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
1552	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
1553	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
1554	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
1555	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
1556	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
1557	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
1558	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
1559	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
1560	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
1561	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
1562	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
1563	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
1564	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
1565	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
1566	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
1567	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
1568	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
1569	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
1570	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
1571	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
1572	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
1573	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
1574	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
1575	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
1576	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
1577	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
1578	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
1579
1580	ALIGN (2)
/* Sixteen entries, L(shl_0) through L(shl_15).
   NOTE(review): presumably indexed by the source misalignment within a
   16-byte block -- confirm at the dispatch site.  */
1581L(shl_table):
1582	.int	JMPTBL (L(shl_0), L(shl_table))
1583	.int	JMPTBL (L(shl_1), L(shl_table))
1584	.int	JMPTBL (L(shl_2), L(shl_table))
1585	.int	JMPTBL (L(shl_3), L(shl_table))
1586	.int	JMPTBL (L(shl_4), L(shl_table))
1587	.int	JMPTBL (L(shl_5), L(shl_table))
1588	.int	JMPTBL (L(shl_6), L(shl_table))
1589	.int	JMPTBL (L(shl_7), L(shl_table))
1590	.int	JMPTBL (L(shl_8), L(shl_table))
1591	.int	JMPTBL (L(shl_9), L(shl_table))
1592	.int	JMPTBL (L(shl_10), L(shl_table))
1593	.int	JMPTBL (L(shl_11), L(shl_table))
1594	.int	JMPTBL (L(shl_12), L(shl_table))
1595	.int	JMPTBL (L(shl_13), L(shl_table))
1596	.int	JMPTBL (L(shl_14), L(shl_table))
1597	.int	JMPTBL (L(shl_15), L(shl_table))
1598
1599	ALIGN (2)
/* Backward tail handlers, indexed by the number of bytes left (0-47).  */
1600L(table_48_bytes_bwd):
1601	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
1602	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
1603	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
1604	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
1605	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
1606	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
1607	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
1608	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
1609	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
1610	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
1611	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
1612	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
1613	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
1614	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
1615	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
1616	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
1617	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
1618	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
1619	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
1620	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
1621	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
1622	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
1623	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
1624	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
1625	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
1626	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
1627	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
1628	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
1629	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
1630	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
1631	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
1632	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
1633	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
1634	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
1635	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
1636	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
1637	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
1638	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
1639	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
1640	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
1641	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
1642	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
1643	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
1644	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
1645	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
1646	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
1647	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
1648	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
1649
1650	.popsection
1651
1652#ifdef USE_AS_MEMMOVE
1653	ALIGN (4)
/* Backward copy, assembled only for memmove.  As used below, %eax is
   the source, %edx the destination and %ecx the byte count on entry.
   Both pointers are moved to one past the end of their buffers
   (%esi = source end, %edx = destination end) and the data is copied
   from high addresses to low.
   NOTE(review): presumably reached when the regions overlap with the
   destination above the source -- confirm at the branch that targets
   this label.  */
1654L(copy_backward):
1655	PUSH (%esi)
1656	movl	%eax, %esi
1657	add	%ecx, %edx
1658	add	%ecx, %esi
	/* Take the alignment path unless the destination end is already
	   4-byte aligned.  */
1659	testl	$0x3, %edx
1660	jnz	L(bk_align)
1661
1662L(bk_aligned_4):
1663	cmp	$64, %ecx
1664	jae	L(bk_write_more64bytes)
1665
1666L(bk_write_64bytesless):
1667	cmp	$32, %ecx
1668	jb	L(bk_write_less32bytes)
1669
1670L(bk_write_more32bytes):
1671	/* Copy 32 bytes at a time.  */
1672	sub	$32, %ecx
1673	movl	-4(%esi), %eax
1674	movl	%eax, -4(%edx)
1675	movl	-8(%esi), %eax
1676	movl	%eax, -8(%edx)
1677	movl	-12(%esi), %eax
1678	movl	%eax, -12(%edx)
1679	movl	-16(%esi), %eax
1680	movl	%eax, -16(%edx)
1681	movl	-20(%esi), %eax
1682	movl	%eax, -20(%edx)
1683	movl	-24(%esi), %eax
1684	movl	%eax, -24(%edx)
1685	movl	-28(%esi), %eax
1686	movl	%eax, -28(%edx)
1687	movl	-32(%esi), %eax
1688	movl	%eax, -32(%edx)
1689	sub	$32, %edx
1690	sub	$32, %esi
1691
1692L(bk_write_less32bytes):
	/* Turn the end pointers into pointers to the start of the %ecx
	   bytes that are left (%eax = source, %edx = destination),
	   restore %esi, and let the backward jump table finish the
	   copy.  */
1693	movl	%esi, %eax
1694	sub	%ecx, %edx
1695	sub	%ecx, %eax
1696	POP (%esi)
1697L(bk_write_less48bytes):
1698	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
1699
	/* The code below is entered by jumps taken while %esi is still
	   pushed, so re-apply the CFI that the POP above cancelled.  */
1700	CFI_PUSH (%esi)
1701	ALIGN (4)
/* The destination end is not 4-byte aligned.  Counts of 8 or less go
   straight to the jump table; otherwise copy one byte and/or one word
   so that %edx becomes 4-byte aligned.  */
1702L(bk_align):
1703	cmp	$8, %ecx
1704	jbe	L(bk_write_less32bytes)
1705	testl	$1, %edx
1706	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
	   then (EDX & 2) must be != 0.  */
1708	jz	L(bk_got2)
	/* Odd address: copy one byte to make %edx even.  */
1709	sub	$1, %esi
1710	sub	$1, %ecx
1711	sub	$1, %edx
1712	movzbl	(%esi), %eax
1713	movb	%al, (%edx)
1714
1715	testl	$2, %edx
1716	jz	L(bk_aligned_4)
1717
1718L(bk_got2):
	/* %edx is even but not a multiple of 4: copy one word.  */
1719	sub	$2, %esi
1720	sub	$2, %ecx
1721	sub	$2, %edx
1722	movzwl	(%esi), %eax
1723	movw	%ax, (%edx)
1724	jmp	L(bk_aligned_4)
1725
1726	ALIGN (4)
/* At least 64 bytes remain.  Copy up to three dwords so that the
   destination end becomes 16-byte aligned for the movdqa stores in
   L(bk_ssse3_cpy).  */
1727L(bk_write_more64bytes):
1728	/* Check alignment of last byte.  */
1729	testl	$15, %edx
1730	jz	L(bk_ssse3_cpy_pre)
1731
1732/* EDX is aligned 4 bytes, but not 16 bytes.  */
1733L(bk_ssse3_align):
1734	sub	$4, %esi
1735	sub	$4, %ecx
1736	sub	$4, %edx
1737	movl	(%esi), %eax
1738	movl	%eax, (%edx)
1739
1740	testl	$15, %edx
1741	jz	L(bk_ssse3_cpy_pre)
1742
1743	sub	$4, %esi
1744	sub	$4, %ecx
1745	sub	$4, %edx
1746	movl	(%esi), %eax
1747	movl	%eax, (%edx)
1748
1749	testl	$15, %edx
1750	jz	L(bk_ssse3_cpy_pre)
1751
1752	sub	$4, %esi
1753	sub	$4, %ecx
1754	sub	$4, %edx
1755	movl	(%esi), %eax
1756	movl	%eax, (%edx)
1757
1758L(bk_ssse3_cpy_pre):
	/* The alignment steps above consume at most 12 of the (at least)
	   64 bytes this path started with, so when fewer than 64 remain
	   there are still at least 52 and the 32-byte step is safe.  */
1759	cmp	$64, %ecx
1760	jb	L(bk_write_more32bytes)
1761
1762L(bk_ssse3_cpy):
	/* Main loop: 64 bytes per iteration, highest 16 bytes first, with
	   unaligned loads from the source and aligned stores to the
	   destination.  */
1763	sub	$64, %esi
1764	sub	$64, %ecx
1765	sub	$64, %edx
1766	movdqu	0x30(%esi), %xmm3
1767	movdqa	%xmm3, 0x30(%edx)
1768	movdqu	0x20(%esi), %xmm2
1769	movdqa	%xmm2, 0x20(%edx)
1770	movdqu	0x10(%esi), %xmm1
1771	movdqa	%xmm1, 0x10(%edx)
1772	movdqu	(%esi), %xmm0
1773	movdqa	%xmm0, (%edx)
1774	cmp	$64, %ecx
1775	jae	L(bk_ssse3_cpy)
1776	jmp	L(bk_write_64bytesless)
1777
1778#endif
1779
1780END (MEMCPY)
1781
1782#endif
1783