1/* memcpy with SSSE3
2   Copyright (C) 2010-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc) \
20    && (defined SHARED \
21	|| defined USE_AS_MEMMOVE \
22	|| !defined USE_MULTIARCH)
23
24# include <sysdep.h>
25# include "asm-syntax.h"
26
/* Default entry-point names.  They are only defined when MEMCPY is not
   defined already, presumably so that another file can define MEMCPY and
   MEMCPY_CHK first and reuse this code under different symbol names.  */
27# ifndef MEMCPY
28#  define MEMCPY		__memcpy_ssse3
29#  define MEMCPY_CHK	__memcpy_chk_ssse3
30# endif
31
/* Offsets from %esp of the three stack arguments, each 4 bytes after the
   previous one.  PARMS is the offset of the first argument; it is defined
   further down and differs between the PIC and non-PIC builds.  */
32# define DEST		PARMS
33# define SRC		DEST+4
34# define LEN		SRC+4
35
/* Unwind-info helpers.  CFI_PUSH describes a 4-byte push of REG: the CFA
   offset grows by 4 and REG is recorded at relative offset 0.  CFI_POP
   describes the matching pop: the CFA offset shrinks by 4 and REG is
   marked as restored.  Both expand to cfi_* macros only.  */
36# define CFI_PUSH(REG)		\
37  cfi_adjust_cfa_offset (4);		\
38  cfi_rel_offset (REG, 0)
39
40# define CFI_POP(REG)		\
41  cfi_adjust_cfa_offset (-4);		\
42  cfi_restore (REG)
43
/* Push or pop REG and emit the matching unwind annotation.  */
44# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
45# define POP(REG)	popl REG; CFI_POP (REG)
46
47# ifdef PIC
/* PIC build.  EBX serves as the PIC register, so ENTRANCE saves it and the
   arguments start 8 bytes above %esp instead of 4.  RETURN_END restores
   EBX and returns.  RETURN does the same and then re-emits the CFI for the
   saved EBX, so that the annotations are valid again for the code placed
   after the ret.  JMPTBL (I, B) is a table entry stored as the offset of
   I from the base B.  */
48#  define PARMS		8		/* Preserve EBX.  */
49#  define ENTRANCE	PUSH (%ebx);
50#  define RETURN_END	POP (%ebx); ret
51#  define RETURN		RETURN_END; CFI_PUSH (%ebx)
52#  define JMPTBL(I, B)	I - B
53
54/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
55	jump table with relative offsets.  INDEX is a register that contains the
56	index into the jump table.  SCALE is the scale of INDEX.  */
57
58#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
59    /* We first load PC into EBX.  */		\
60	SETUP_PIC_REG(bx);		\
61    /* Get the address of the jump table.  */		\
62	addl	$(TABLE - .), %ebx;		\
63    /* Get the entry and convert the relative offset to the		\
64	absolute	address.  */		\
65	addl	(%ebx, INDEX, SCALE), %ebx;		\
66    /* We loaded the jump table.  Go.  */		\
67	_CET_NOTRACK jmp *%ebx
68# else
69
/* Non-PIC build.  Nothing is pushed on entry, so the arguments start 4
   bytes above %esp (just past the return address), ENTRANCE is empty and
   both return macros are a plain ret.  JMPTBL (I, B) is the absolute
   address I; B is unused.  */
70#  define PARMS		4
71#  define ENTRANCE
72#  define RETURN_END	ret
73#  define RETURN		RETURN_END
74#  define JMPTBL(I, B)	I
75
76/* Branch to an entry in a jump table.  TABLE is a jump table with
77	absolute offsets.  INDEX is a register that contains the index into the
78	jump table.  SCALE is the scale of INDEX.  */
79
80#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
81	_CET_NOTRACK jmp *TABLE(, INDEX, SCALE)
82# endif
83
84	.section .text.ssse3,"ax",@progbits
85# ifdef SHARED
/* MEMCPY_CHK: checked entry point, assembled only when SHARED is defined.
   It reads two stack arguments: 12(%esp), the third argument (the length
   MEMCPY copies), and 16(%esp), a fourth argument (presumably the size of
   the destination object -- confirm against the __memcpy_chk prototype).
   When the fourth argument is below the length (unsigned compare) it
   jumps to __chk_fail.  There is no ret: otherwise execution runs off the
   end of this block into the code that follows it, i.e. MEMCPY.  */
86ENTRY (MEMCPY_CHK)
87	movl	12(%esp), %eax
	/* Flags are set from 16(%esp) - %eax.  */
88	cmpl	%eax, 16(%esp)
89	jb	HIDDEN_JUMPTARGET (__chk_fail)
90END (MEMCPY_CHK)
91# endif
/* MEMCPY entry.  Loads the three stack arguments and picks a strategy:
     %ecx = length, %eax = source, %edx = destination.
   Lengths of 48 bytes or more go to L(48bytesormore); shorter copies are
   finished by jumping through a table indexed by the length.  Under
   USE_AS_MEMMOVE an overlap test first chooses between the forward and
   the backward paths.  */
92ENTRY (MEMCPY)
93	ENTRANCE
94	movl	LEN(%esp), %ecx
95	movl	SRC(%esp), %eax
96	movl	DEST(%esp), %edx
97
98# ifdef USE_AS_MEMMOVE
	/* Flags come from dest - src.  dest below src: copy forward.  dest
	   equal to src: take the L(fwd_write_0bytes) exit, which is defined
	   outside this part of the file.  */
99	cmp	%eax, %edx
100	jb	L(copy_forward)
101	je	L(fwd_write_0bytes)
	/* dest is above src.  Lengths of 32 or more go on to the overlap
	   test; shorter ones jump to L(bk_write_less32bytes_2), which is
	   not visible here.  */
102	cmp	$32, %ecx
103	jae	L(memmove_bwd)
104	jmp	L(bk_write_less32bytes_2)
105
106	.p2align 4
107L(memmove_bwd):
	/* Compare dest with src + len.  %eax is reloaded with src before
	   the branch; movl leaves the flags of the cmp untouched.  dest
	   below src + len means the buffers overlap, so copy backward;
	   otherwise fall through into the forward copy.  */
108	add	%ecx, %eax
109	cmp	%eax, %edx
110	movl	SRC(%esp), %eax
111	jb	L(copy_backward)
112
113L(copy_forward):
114# endif
115	cmp	$48, %ecx
116	jae	L(48bytesormore)
117
	/* Reached with a length below 48, despite the "32" in the label.  */
118L(fwd_write_less32bytes):
119# ifndef USE_AS_MEMMOVE
	/* memcpy only: when the low byte of src is below the low byte of
	   dest (unsigned), dispatch through the backward table instead.  */
120	cmp	%dl, %al
121	jb	L(bk_write)
122# endif
	/* Move both pointers to the end of the buffers, then dispatch on the
	   length.  NOTE(review): the table entries are outside this part of
	   the file; presumably they address the data with negative offsets
	   from the end pointers -- confirm.  */
123	add	%ecx, %edx
124	add	%ecx, %eax
125	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
126# ifndef USE_AS_MEMMOVE
127	.p2align 4
128L(bk_write):
	/* Dispatch on the length with both pointers still at the start of
	   the buffers.  */
129	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
130# endif
131
/* L(48bytesormore): forward copy of 48 bytes or more.
   On entry %eax = source, %edx = destination, %ecx = length.
   The first 16 bytes are dealt with up front (copied at once for memcpy,
   only loaded into %xmm0 for memmove), the destination is rounded up to a
   16-byte boundary, and control then goes to L(large_page), L(shl_0) or
   an entry of L(shl_table) depending on the size and on the alignment of
   the source.  %edi is pushed here and stays on the stack for the code
   that is branched to.  */
132	.p2align 4
133L(48bytesormore):
134# ifndef USE_AS_MEMMOVE
	/* Copy the first 16 bytes as two 8-byte halves.  */
135	movlpd	(%eax), %xmm0
136	movlpd	8(%eax), %xmm1
137	movlpd	%xmm0, (%edx)
138	movlpd	%xmm1, 8(%edx)
139# else
	/* Only load the first 16 bytes; the L(shl_*) code stores them
	   later.  */
140	movdqu	(%eax), %xmm0
141# endif
	/* Round the destination up to the next 16-byte boundary (a full 16
	   bytes when it is already aligned).  %edi = old dest - new dest is
	   negative, so adding it to %ecx shrinks the count and subtracting
	   it from %eax advances the source by the same distance.  */
142	PUSH (%edi)
143	movl	%edx, %edi
144	and	$-16, %edx
145	add	$16, %edx
146	sub	%edx, %edi
147	add	%edi, %ecx
148	sub	%edi, %eax
149
	/* Compare the remaining count with half the shared cache size: a
	   build-time constant if SHARED_CACHE_SIZE_HALF is defined,
	   otherwise the variable __x86_shared_cache_size_half (addressed
	   through EBX in the PIC build).  */
150# ifdef SHARED_CACHE_SIZE_HALF
151	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
152# else
153#  ifdef PIC
154	SETUP_PIC_REG(bx)
155	add	$_GLOBAL_OFFSET_TABLE_, %ebx
156	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
157#  else
158	cmp	__x86_shared_cache_size_half, %ecx
159#  endif
160# endif
161
	/* mov keeps the flags of the cmp above: at or above the threshold
	   go to L(large_page), which is outside this part of the file.  */
162	mov	%eax, %edi
163	jae	L(large_page)
	/* %edi = offset of the source within its 16-byte block.  Zero means
	   the source is aligned as well; otherwise the offset selects an
	   entry of L(shl_table).  */
164	and	$0xf, %edi
165	jz	L(shl_0)
166	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
167
/* L(shl_0): source and destination are both 16-byte aligned.
   On entry %eax = source, %edx = destination, %ecx = bytes left, and the
   caller's %edi is still on the stack.  Counts above 127 go to
   L(shl_0_gobble); smaller ones are copied here in 32-byte steps and the
   tail is finished through L(table_48bytes_fwd_align).  */
168	.p2align 4
169L(shl_0):
170# ifdef USE_AS_MEMMOVE
	/* Store the 16 head bytes that L(48bytesormore) left in %xmm0 at
	   the original destination.  The argument slot is DEST+4 because
	   %edi has been pushed.  */
171	movl	DEST+4(%esp), %edi
172	movdqu	%xmm0, (%edi)
173# endif
	/* %edi = running offset into both buffers.  */
174	xor	%edi, %edi
175	cmp	$127, %ecx
176	ja	L(shl_0_gobble)
	/* Bias the count by 32 so that the borrow of each "sub $32" below
	   reports that fewer than 32 bytes remain.  */
177	lea	-32(%ecx), %ecx
178
	/* Four unrolled 32-byte steps.  There is no backward branch in the
	   visible code: each step leaves through jb on a borrow and the last
	   one falls through.  The jb tests the flags of the sub; movdqa and
	   lea in between do not change them.  */
179	.p2align 4
180L(shl_0_loop):
181	movdqa	(%eax, %edi), %xmm0
182	movdqa	16(%eax, %edi), %xmm1
183	sub	$32, %ecx
184	movdqa	%xmm0, (%edx, %edi)
185	movdqa	%xmm1, 16(%edx, %edi)
186	lea	32(%edi), %edi
187	jb	L(shl_0_end)
188
189	movdqa	(%eax, %edi), %xmm0
190	movdqa	16(%eax, %edi), %xmm1
191	sub	$32, %ecx
192	movdqa	%xmm0, (%edx, %edi)
193	movdqa	%xmm1, 16(%edx, %edi)
194	lea	32(%edi), %edi
195	jb	L(shl_0_end)
196
197	movdqa	(%eax, %edi), %xmm0
198	movdqa	16(%eax, %edi), %xmm1
199	sub	$32, %ecx
200	movdqa	%xmm0, (%edx, %edi)
201	movdqa	%xmm1, 16(%edx, %edi)
202	lea	32(%edi), %edi
203	jb	L(shl_0_end)
204
205	movdqa	(%eax, %edi), %xmm0
206	movdqa	16(%eax, %edi), %xmm1
207	sub	$32, %ecx
208	movdqa	%xmm0, (%edx, %edi)
209	movdqa	%xmm1, 16(%edx, %edi)
210	lea	32(%edi), %edi
211
212L(shl_0_end):
	/* Undo the bias (%ecx = tail byte count), move both pointers past
	   the end of the data, restore %edi and dispatch on the tail
	   count.  */
213	lea	32(%ecx), %ecx
214	add	%ecx, %edi
215	add	%edi, %edx
216	add	%edi, %eax
217	POP (%edi)
218	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
219
	/* Unwind info only: the POP above marked %edi as restored, but the
	   code that follows is entered with %edi still pushed.  */
220	CFI_PUSH (%edi)
221
/* L(shl_0_gobble): aligned source and destination, more than 127 bytes
   left (see the `ja' in L(shl_0)).  On entry %eax = source, %edx =
   destination, %ecx = bytes left, and the caller's %edi is on the stack.
   %edi is restored right away.  Counts at or above half the data cache
   size go to L(shl_0_gobble_mem_loop); smaller ones are copied here,
   128 bytes per iteration, followed by optional 64-, 32- and 16-byte
   steps and a table dispatch for the last 0..15 bytes.  */
222	.p2align 4
223L(shl_0_gobble):
224# ifdef DATA_CACHE_SIZE_HALF
225	cmp	$DATA_CACHE_SIZE_HALF, %ecx
226# else
227#  ifdef PIC
228	SETUP_PIC_REG(bx)
229	add	$_GLOBAL_OFFSET_TABLE_, %ebx
230	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
231#  else
232	cmp	__x86_data_cache_size_half, %ecx
233#  endif
234# endif
	/* popl and lea keep the flags of the cmp, which the jae consumes.
	   The count is biased by 128 for the loop test.  */
235	POP	(%edi)
236	lea	-128(%ecx), %ecx
237	jae	L(shl_0_gobble_mem_loop)
238
239	.p2align 4
240L(shl_0_gobble_cache_loop):
	/* Load 128 bytes into %xmm0-%xmm7, then store them.  The jae after
	   the stores tests the flags of the "sub $128"; movdqa and lea do
	   not change them.  */
241	movdqa	(%eax), %xmm0
242	movdqa	0x10(%eax), %xmm1
243	movdqa	0x20(%eax), %xmm2
244	movdqa	0x30(%eax), %xmm3
245	movdqa	0x40(%eax), %xmm4
246	movdqa	0x50(%eax), %xmm5
247	movdqa	0x60(%eax), %xmm6
248	movdqa	0x70(%eax), %xmm7
249	lea	0x80(%eax), %eax
250	sub	$128, %ecx
251	movdqa	%xmm0, (%edx)
252	movdqa	%xmm1, 0x10(%edx)
253	movdqa	%xmm2, 0x20(%edx)
254	movdqa	%xmm3, 0x30(%edx)
255	movdqa	%xmm4, 0x40(%edx)
256	movdqa	%xmm5, 0x50(%edx)
257	movdqa	%xmm6, 0x60(%edx)
258	movdqa	%xmm7, 0x70(%edx)
259	lea	0x80(%edx), %edx
260
261	jae	L(shl_0_gobble_cache_loop)
	/* Here %ecx = bytes left - 128.  The signed compare with -64 decides
	   whether at least 64 bytes remain; the lea removes the bias without
	   touching the flags.  */
262	cmp	$-0x40, %ecx
263	lea	0x80(%ecx), %ecx
264	jl	L(shl_0_cache_less_64bytes)
265
	/* Copy 64 bytes.  */
266	movdqa	(%eax), %xmm0
267	sub	$0x40, %ecx
268	movdqa	0x10(%eax), %xmm1
269	movdqa	%xmm0, (%edx)
270	movdqa	%xmm1, 0x10(%edx)
271	movdqa	0x20(%eax), %xmm0
272	movdqa	0x30(%eax), %xmm1
273	add	$0x40, %eax
274	movdqa	%xmm0, 0x20(%edx)
275	movdqa	%xmm1, 0x30(%edx)
276	add	$0x40, %edx
277
278L(shl_0_cache_less_64bytes):
	/* Copy 32 bytes if at least that many remain.  */
279	cmp	$0x20, %ecx
280	jb	L(shl_0_cache_less_32bytes)
281	movdqa	(%eax), %xmm0
282	sub	$0x20, %ecx
283	movdqa	0x10(%eax), %xmm1
284	add	$0x20, %eax
285	movdqa	%xmm0, (%edx)
286	movdqa	%xmm1, 0x10(%edx)
287	add	$0x20, %edx
288
289L(shl_0_cache_less_32bytes):
	/* Copy 16 bytes if at least that many remain.  */
290	cmp	$0x10, %ecx
291	jb	L(shl_0_cache_less_16bytes)
292	sub	$0x10, %ecx
293	movdqa	(%eax), %xmm0
294	add	$0x10, %eax
295	movdqa	%xmm0, (%edx)
296	add	$0x10, %edx
297
298L(shl_0_cache_less_16bytes):
	/* Fewer than 16 bytes remain: move both pointers past the end of
	   the data and dispatch on the count.  */
299	add	%ecx, %edx
300	add	%ecx, %eax
301	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
302
/* L(shl_0_gobble_mem_loop): aligned source and destination, with a count
   at or above half the data cache size (see the jae in L(shl_0_gobble)).
   On entry %eax = source, %edx = destination, %ecx = bytes left - 128,
   and %edi has already been restored.  Each iteration prefetches ahead
   in both buffers and copies 128 bytes; optional 64-, 32- and 16-byte
   steps and a table dispatch for the last 0..15 bytes follow.  */
303	.p2align 4
304L(shl_0_gobble_mem_loop):
305	prefetcht0 0x1c0(%eax)
306	prefetcht0 0x280(%eax)
307	prefetcht0 0x1c0(%edx)
308
	/* Load 128 bytes into %xmm0-%xmm7, then store them.  The jae after
	   the stores tests the flags of the "sub $0x80"; movdqa and lea do
	   not change them.  */
309	movdqa	(%eax), %xmm0
310	movdqa	0x10(%eax), %xmm1
311	movdqa	0x20(%eax), %xmm2
312	movdqa	0x30(%eax), %xmm3
313	movdqa	0x40(%eax), %xmm4
314	movdqa	0x50(%eax), %xmm5
315	movdqa	0x60(%eax), %xmm6
316	movdqa	0x70(%eax), %xmm7
317	lea	0x80(%eax), %eax
318	sub	$0x80, %ecx
319	movdqa	%xmm0, (%edx)
320	movdqa	%xmm1, 0x10(%edx)
321	movdqa	%xmm2, 0x20(%edx)
322	movdqa	%xmm3, 0x30(%edx)
323	movdqa	%xmm4, 0x40(%edx)
324	movdqa	%xmm5, 0x50(%edx)
325	movdqa	%xmm6, 0x60(%edx)
326	movdqa	%xmm7, 0x70(%edx)
327	lea	0x80(%edx), %edx
328
329	jae	L(shl_0_gobble_mem_loop)
	/* Here %ecx = bytes left - 128.  The signed compare with -64 decides
	   whether at least 64 bytes remain; the lea removes the bias without
	   touching the flags.  */
330	cmp	$-0x40, %ecx
331	lea	0x80(%ecx), %ecx
332	jl	L(shl_0_mem_less_64bytes)
333
	/* Copy 64 bytes.  */
334	movdqa	(%eax), %xmm0
335	sub	$0x40, %ecx
336	movdqa	0x10(%eax), %xmm1
337
338	movdqa	%xmm0, (%edx)
339	movdqa	%xmm1, 0x10(%edx)
340
341	movdqa	0x20(%eax), %xmm0
342	movdqa	0x30(%eax), %xmm1
343	add	$0x40, %eax
344
345	movdqa	%xmm0, 0x20(%edx)
346	movdqa	%xmm1, 0x30(%edx)
347	add	$0x40, %edx
348
349L(shl_0_mem_less_64bytes):
	/* Copy 32 bytes if at least that many remain.  */
350	cmp	$0x20, %ecx
351	jb	L(shl_0_mem_less_32bytes)
352	movdqa	(%eax), %xmm0
353	sub	$0x20, %ecx
354	movdqa	0x10(%eax), %xmm1
355	add	$0x20, %eax
356	movdqa	%xmm0, (%edx)
357	movdqa	%xmm1, 0x10(%edx)
358	add	$0x20, %edx
359
360L(shl_0_mem_less_32bytes):
	/* Copy 16 bytes if at least that many remain.  */
361	cmp	$0x10, %ecx
362	jb	L(shl_0_mem_less_16bytes)
363	sub	$0x10, %ecx
364	movdqa	(%eax), %xmm0
365	add	$0x10, %eax
366	movdqa	%xmm0, (%edx)
367	add	$0x10, %edx
368
369L(shl_0_mem_less_16bytes):
	/* Fewer than 16 bytes remain: move both pointers past the end of
	   the data and dispatch on the count.  */
370	add	%ecx, %edx
371	add	%ecx, %eax
372	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)

	/* Unwind info only: the POP in L(shl_0_gobble) marked %edi as
	   restored, but the code that follows (L(shl_1)) is entered with
	   %edi still pushed and pops it itself.  Without this annotation
	   the CFA would be 4 bytes off throughout that code.  */
	CFI_PUSH (%edi)
373
/* L(shl_1): forward copy for a source that lies 1 byte past a 16-byte
   boundary (the movaps offsets below require this) with a 16-byte
   aligned destination.  NOTE(review): presumably reached through entry 1
   of L(shl_table), which is outside this part of the file -- confirm.
   On entry %eax = source, %edx = destination, %ecx = bytes left, and the
   caller's %edi is still on the stack.  Aligned 16-byte blocks are
   loaded around the source and PALIGNR $1 joins each neighbouring pair
   into one destination-aligned block.  %xmm1 (alternating with %xmm4 in
   the no-prefetch loop) carries the latest source block to the next
   step.  */
374	.p2align 4
375L(shl_1):
	/* Prime %xmm1 with the aligned block holding the first source byte.
	   The memmove build also stores the 16 head bytes kept in %xmm0 at
	   the original destination (slot DEST+4: %edi is pushed).  */
376# ifndef USE_AS_MEMMOVE
377	movaps	-1(%eax), %xmm1
378# else
379	movl	DEST+4(%esp), %edi
380	movaps	-1(%eax), %xmm1
381	movdqu	%xmm0, (%edi)
382# endif
	/* Below half the data cache size use the loop without prefetches.  */
383# ifdef DATA_CACHE_SIZE_HALF
384	cmp	$DATA_CACHE_SIZE_HALF, %ecx
385# else
386#  ifdef PIC
387	SETUP_PIC_REG(bx)
388	add	$_GLOBAL_OFFSET_TABLE_, %ebx
389	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
390#  else
391	cmp	__x86_data_cache_size_half, %ecx
392#  endif
393# endif
394	jb L(sh_1_no_prefetch)
395
	/* Bias the count by 64 for the loop test.  */
396	lea	-64(%ecx), %ecx
397
	/* 64 bytes per iteration: four aligned loads, four PALIGNR merges,
	   four aligned stores.  %xmm7 keeps an unshifted copy of the last
	   block and becomes the next %xmm1.  */
398	.p2align 4
399L(Shl1LoopStart):
400	prefetcht0 0x1c0(%eax)
401	prefetcht0 0x1c0(%edx)
402	movaps	15(%eax), %xmm2
403	movaps	31(%eax), %xmm3
404	movaps	47(%eax), %xmm4
405	movaps	63(%eax), %xmm5
406	movaps	%xmm5, %xmm7
407	palignr	$1, %xmm4, %xmm5
408	palignr	$1, %xmm3, %xmm4
409	movaps	%xmm5, 48(%edx)
410	palignr	$1, %xmm2, %xmm3
411	lea	64(%eax), %eax
412	palignr	$1, %xmm1, %xmm2
413	movaps	%xmm4, 32(%edx)
414	movaps	%xmm3, 16(%edx)
415	movaps	%xmm7, %xmm1
416	movaps	%xmm2, (%edx)
417	lea	64(%edx), %edx
418	sub	$64, %ecx
419	ja	L(Shl1LoopStart)
420
421L(Shl1LoopLeave):
	/* %ecx = bytes left - 64.  With 32 bytes or fewer left,
	   L(shl_end_0) (not visible here) finishes; otherwise copy 32
	   more.  */
422	add	$32, %ecx
423	jle	L(shl_end_0)
424
425	movaps	15(%eax), %xmm2
426	movaps	31(%eax), %xmm3
427	palignr	$1, %xmm2, %xmm3
428	palignr	$1, %xmm1, %xmm2
429	movaps	%xmm2, (%edx)
430	movaps	%xmm3, 16(%edx)
	/* Move both pointers past the end of the data, restore %edi and
	   dispatch on the remaining count in %ecx.  */
431	lea	32(%edx, %ecx), %edx
432	lea	32(%eax, %ecx), %eax
433	POP (%edi)
434	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
435
	/* Unwind info only: the POP above marked %edi as restored, but the
	   code that follows is entered with %edi still pushed.  */
436	CFI_PUSH (%edi)
437
438	.p2align 4
439L(sh_1_no_prefetch):
	/* Bias the count by 32, rebase %eax onto the aligned block and use
	   %edi as the running offset into both buffers.  */
440	lea	-32(%ecx), %ecx
441	lea	-1(%eax), %eax
442	xor	%edi, %edi
443
444	.p2align 4
445L(sh_1_no_prefetch_loop):
	/* First half: 32 bytes, the newest block is kept in %xmm4.  The jb
	   tests the borrow of the sub; the SSE instructions and lea in
	   between leave the flags alone.  */
446	movdqa	16(%eax, %edi), %xmm2
447	sub	$32, %ecx
448	movdqa	32(%eax, %edi), %xmm3
449	movdqa	%xmm3, %xmm4
450	palignr	$1, %xmm2, %xmm3
451	palignr	$1, %xmm1, %xmm2
452	lea	32(%edi), %edi
453	movdqa	%xmm2, -32(%edx, %edi)
454	movdqa	%xmm3, -16(%edx, %edi)
455	jb	L(sh_1_end_no_prefetch_loop)
456
	/* Second half: same, with %xmm4 as the previous block and %xmm1
	   receiving the newest one.  */
457	movdqa	16(%eax, %edi), %xmm2
458	sub	$32, %ecx
459	movdqa	32(%eax, %edi), %xmm3
460	movdqa	%xmm3, %xmm1
461	palignr	$1, %xmm2, %xmm3
462	palignr	$1, %xmm4, %xmm2
463	lea	32(%edi), %edi
464	movdqa	%xmm2, -32(%edx, %edi)
465	movdqa	%xmm3, -16(%edx, %edi)
466	jae	L(sh_1_no_prefetch_loop)
467
468L(sh_1_end_no_prefetch_loop):
	/* Undo the bias (%ecx = tail byte count), move both pointers past
	   the end of the data (adding the 1 back to %eax), restore %edi and
	   dispatch on the tail count.  */
469	lea	32(%ecx), %ecx
470	add	%ecx, %edi
471	add	%edi, %edx
472	lea	1(%edi, %eax), %eax
473	POP	(%edi)
474	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
475
	/* Unwind info only: the code that follows is entered with %edi
	   still pushed.  */
476	CFI_PUSH (%edi)
477
/* L(shl_2): forward copy for a source that lies 2 bytes past a 16-byte
   boundary (the movaps offsets below require this) with a 16-byte
   aligned destination.  NOTE(review): presumably reached through entry 2
   of L(shl_table), which is outside this part of the file -- confirm.
   On entry %eax = source, %edx = destination, %ecx = bytes left, and the
   caller's %edi is still on the stack.  Aligned 16-byte blocks are
   loaded around the source and PALIGNR $2 joins each neighbouring pair
   into one destination-aligned block.  %xmm1 (alternating with %xmm4 in
   the no-prefetch loop) carries the latest source block to the next
   step.  */
478	.p2align 4
479L(shl_2):
	/* Prime %xmm1 with the aligned block holding the first source byte.
	   The memmove build also stores the 16 head bytes kept in %xmm0 at
	   the original destination (slot DEST+4: %edi is pushed).  */
480# ifndef USE_AS_MEMMOVE
481	movaps	-2(%eax), %xmm1
482# else
483	movl	DEST+4(%esp), %edi
484	movaps	-2(%eax), %xmm1
485	movdqu	%xmm0, (%edi)
486# endif
	/* Below half the data cache size use the loop without prefetches.  */
487# ifdef DATA_CACHE_SIZE_HALF
488	cmp	$DATA_CACHE_SIZE_HALF, %ecx
489# else
490#  ifdef PIC
491	SETUP_PIC_REG(bx)
492	add	$_GLOBAL_OFFSET_TABLE_, %ebx
493	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
494#  else
495	cmp	__x86_data_cache_size_half, %ecx
496#  endif
497# endif
498	jb L(sh_2_no_prefetch)
499
	/* Bias the count by 64 for the loop test.  */
500	lea	-64(%ecx), %ecx
501
	/* 64 bytes per iteration: four aligned loads, four PALIGNR merges,
	   four aligned stores.  %xmm7 keeps an unshifted copy of the last
	   block and becomes the next %xmm1.  */
502	.p2align 4
503L(Shl2LoopStart):
504	prefetcht0 0x1c0(%eax)
505	prefetcht0 0x1c0(%edx)
506	movaps	14(%eax), %xmm2
507	movaps	30(%eax), %xmm3
508	movaps	46(%eax), %xmm4
509	movaps	62(%eax), %xmm5
510	movaps	%xmm5, %xmm7
511	palignr	$2, %xmm4, %xmm5
512	palignr	$2, %xmm3, %xmm4
513	movaps	%xmm5, 48(%edx)
514	palignr	$2, %xmm2, %xmm3
515	lea	64(%eax), %eax
516	palignr	$2, %xmm1, %xmm2
517	movaps	%xmm4, 32(%edx)
518	movaps	%xmm3, 16(%edx)
519	movaps	%xmm7, %xmm1
520	movaps	%xmm2, (%edx)
521	lea	64(%edx), %edx
522	sub	$64, %ecx
523	ja	L(Shl2LoopStart)
524
525L(Shl2LoopLeave):
	/* %ecx = bytes left - 64.  With 32 bytes or fewer left,
	   L(shl_end_0) (not visible here) finishes; otherwise copy 32
	   more.  */
526	add	$32, %ecx
527	jle	L(shl_end_0)
528
529	movaps	14(%eax), %xmm2
530	movaps	30(%eax), %xmm3
531	palignr	$2, %xmm2, %xmm3
532	palignr	$2, %xmm1, %xmm2
533	movaps	%xmm2, (%edx)
534	movaps	%xmm3, 16(%edx)
	/* Move both pointers past the end of the data, restore %edi and
	   dispatch on the remaining count in %ecx.  */
535	lea	32(%edx, %ecx), %edx
536	lea	32(%eax, %ecx), %eax
537	POP (%edi)
538	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
539
	/* Unwind info only: the POP above marked %edi as restored, but the
	   code that follows is entered with %edi still pushed.  */
540	CFI_PUSH (%edi)
541
542	.p2align 4
543L(sh_2_no_prefetch):
	/* Bias the count by 32, rebase %eax onto the aligned block and use
	   %edi as the running offset into both buffers.  */
544	lea	-32(%ecx), %ecx
545	lea	-2(%eax), %eax
546	xor	%edi, %edi
547
548	.p2align 4
549L(sh_2_no_prefetch_loop):
	/* First half: 32 bytes, the newest block is kept in %xmm4.  The jb
	   tests the borrow of the sub; the SSE instructions and lea in
	   between leave the flags alone.  */
550	movdqa	16(%eax, %edi), %xmm2
551	sub	$32, %ecx
552	movdqa	32(%eax, %edi), %xmm3
553	movdqa	%xmm3, %xmm4
554	palignr	$2, %xmm2, %xmm3
555	palignr	$2, %xmm1, %xmm2
556	lea	32(%edi), %edi
557	movdqa	%xmm2, -32(%edx, %edi)
558	movdqa	%xmm3, -16(%edx, %edi)
559	jb	L(sh_2_end_no_prefetch_loop)
560
	/* Second half: same, with %xmm4 as the previous block and %xmm1
	   receiving the newest one.  */
561	movdqa	16(%eax, %edi), %xmm2
562	sub	$32, %ecx
563	movdqa	32(%eax, %edi), %xmm3
564	movdqa	%xmm3, %xmm1
565	palignr	$2, %xmm2, %xmm3
566	palignr	$2, %xmm4, %xmm2
567	lea	32(%edi), %edi
568	movdqa	%xmm2, -32(%edx, %edi)
569	movdqa	%xmm3, -16(%edx, %edi)
570	jae	L(sh_2_no_prefetch_loop)
571
572L(sh_2_end_no_prefetch_loop):
	/* Undo the bias (%ecx = tail byte count), move both pointers past
	   the end of the data (adding the 2 back to %eax), restore %edi and
	   dispatch on the tail count.  */
573	lea	32(%ecx), %ecx
574	add	%ecx, %edi
575	add	%edi, %edx
576	lea	2(%edi, %eax), %eax
577	POP	(%edi)
578	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
579
	/* Unwind info only: the code that follows is entered with %edi
	   still pushed.  */
580	CFI_PUSH (%edi)
581
/* L(shl_3): forward copy for a source that lies 3 bytes past a 16-byte
   boundary (the movaps offsets below require this) with a 16-byte
   aligned destination.  NOTE(review): presumably reached through entry 3
   of L(shl_table), which is outside this part of the file -- confirm.
   On entry %eax = source, %edx = destination, %ecx = bytes left, and the
   caller's %edi is still on the stack.  Aligned 16-byte blocks are
   loaded around the source and PALIGNR $3 joins each neighbouring pair
   into one destination-aligned block.  %xmm1 (alternating with %xmm4 in
   the no-prefetch loop) carries the latest source block to the next
   step.  */
582	.p2align 4
583L(shl_3):
	/* Prime %xmm1 with the aligned block holding the first source byte.
	   The memmove build also stores the 16 head bytes kept in %xmm0 at
	   the original destination (slot DEST+4: %edi is pushed).  */
584# ifndef USE_AS_MEMMOVE
585	movaps	-3(%eax), %xmm1
586# else
587	movl	DEST+4(%esp), %edi
588	movaps	-3(%eax), %xmm1
589	movdqu	%xmm0, (%edi)
590# endif
	/* Below half the data cache size use the loop without prefetches.  */
591# ifdef DATA_CACHE_SIZE_HALF
592	cmp	$DATA_CACHE_SIZE_HALF, %ecx
593# else
594#  ifdef PIC
595	SETUP_PIC_REG(bx)
596	add	$_GLOBAL_OFFSET_TABLE_, %ebx
597	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
598#  else
599	cmp	__x86_data_cache_size_half, %ecx
600#  endif
601# endif
602	jb L(sh_3_no_prefetch)
603
	/* Bias the count by 64 for the loop test.  */
604	lea	-64(%ecx), %ecx
605
	/* 64 bytes per iteration: four aligned loads, four PALIGNR merges,
	   four aligned stores.  %xmm7 keeps an unshifted copy of the last
	   block and becomes the next %xmm1.  */
606	.p2align 4
607L(Shl3LoopStart):
608	prefetcht0 0x1c0(%eax)
609	prefetcht0 0x1c0(%edx)
610	movaps	13(%eax), %xmm2
611	movaps	29(%eax), %xmm3
612	movaps	45(%eax), %xmm4
613	movaps	61(%eax), %xmm5
614	movaps	%xmm5, %xmm7
615	palignr	$3, %xmm4, %xmm5
616	palignr	$3, %xmm3, %xmm4
617	movaps	%xmm5, 48(%edx)
618	palignr	$3, %xmm2, %xmm3
619	lea	64(%eax), %eax
620	palignr	$3, %xmm1, %xmm2
621	movaps	%xmm4, 32(%edx)
622	movaps	%xmm3, 16(%edx)
623	movaps	%xmm7, %xmm1
624	movaps	%xmm2, (%edx)
625	lea	64(%edx), %edx
626	sub	$64, %ecx
627	ja	L(Shl3LoopStart)
628
629L(Shl3LoopLeave):
	/* %ecx = bytes left - 64.  With 32 bytes or fewer left,
	   L(shl_end_0) (not visible here) finishes; otherwise copy 32
	   more.  */
630	add	$32, %ecx
631	jle	L(shl_end_0)
632
633	movaps	13(%eax), %xmm2
634	movaps	29(%eax), %xmm3
635	palignr	$3, %xmm2, %xmm3
636	palignr	$3, %xmm1, %xmm2
637	movaps	%xmm2, (%edx)
638	movaps	%xmm3, 16(%edx)
	/* Move both pointers past the end of the data, restore %edi and
	   dispatch on the remaining count in %ecx.  */
639	lea	32(%edx, %ecx), %edx
640	lea	32(%eax, %ecx), %eax
641	POP (%edi)
642	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
643
	/* Unwind info only: the POP above marked %edi as restored, but the
	   code that follows is entered with %edi still pushed.  */
644	CFI_PUSH (%edi)
645
646	.p2align 4
647L(sh_3_no_prefetch):
	/* Bias the count by 32, rebase %eax onto the aligned block and use
	   %edi as the running offset into both buffers.  */
648	lea	-32(%ecx), %ecx
649	lea	-3(%eax), %eax
650	xor	%edi, %edi
651
652	.p2align 4
653L(sh_3_no_prefetch_loop):
	/* First half: 32 bytes, the newest block is kept in %xmm4.  The jb
	   tests the borrow of the sub; the SSE instructions and lea in
	   between leave the flags alone.  */
654	movdqa	16(%eax, %edi), %xmm2
655	sub	$32, %ecx
656	movdqa	32(%eax, %edi), %xmm3
657	movdqa	%xmm3, %xmm4
658	palignr	$3, %xmm2, %xmm3
659	palignr	$3, %xmm1, %xmm2
660	lea	32(%edi), %edi
661	movdqa	%xmm2, -32(%edx, %edi)
662	movdqa	%xmm3, -16(%edx, %edi)
663
664	jb	L(sh_3_end_no_prefetch_loop)
665
	/* Second half: same, with %xmm4 as the previous block and %xmm1
	   receiving the newest one.  */
666	movdqa	16(%eax, %edi), %xmm2
667	sub	$32, %ecx
668	movdqa	32(%eax, %edi), %xmm3
669	movdqa	%xmm3, %xmm1
670	palignr	$3, %xmm2, %xmm3
671	palignr	$3, %xmm4, %xmm2
672	lea	32(%edi), %edi
673	movdqa	%xmm2, -32(%edx, %edi)
674	movdqa	%xmm3, -16(%edx, %edi)
675
676	jae	L(sh_3_no_prefetch_loop)
677
678L(sh_3_end_no_prefetch_loop):
	/* Undo the bias (%ecx = tail byte count), move both pointers past
	   the end of the data (adding the 3 back to %eax), restore %edi and
	   dispatch on the tail count.  */
679	lea	32(%ecx), %ecx
680	add	%ecx, %edi
681	add	%edi, %edx
682	lea	3(%edi, %eax), %eax
683	POP	(%edi)
684	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
685
	/* Unwind info only: the code that follows is entered with %edi
	   still pushed.  */
686	CFI_PUSH (%edi)
687
/* L(shl_4): forward copy for a source that lies 4 bytes past a 16-byte
   boundary (the movaps offsets below require this) with a 16-byte
   aligned destination.  NOTE(review): presumably reached through entry 4
   of L(shl_table), which is outside this part of the file -- confirm.
   On entry %eax = source, %edx = destination, %ecx = bytes left, and the
   caller's %edi is still on the stack.  Aligned 16-byte blocks are
   loaded around the source and PALIGNR $4 joins each neighbouring pair
   into one destination-aligned block.  %xmm1 (alternating with %xmm4 in
   the no-prefetch loop) carries the latest source block to the next
   step.  */
688	.p2align 4
689L(shl_4):
	/* Prime %xmm1 with the aligned block holding the first source byte.
	   The memmove build also stores the 16 head bytes kept in %xmm0 at
	   the original destination (slot DEST+4: %edi is pushed).  */
690# ifndef USE_AS_MEMMOVE
691	movaps	-4(%eax), %xmm1
692# else
693	movl	DEST+4(%esp), %edi
694	movaps	-4(%eax), %xmm1
695	movdqu	%xmm0, (%edi)
696# endif
	/* Below half the data cache size use the loop without prefetches.  */
697# ifdef DATA_CACHE_SIZE_HALF
698	cmp	$DATA_CACHE_SIZE_HALF, %ecx
699# else
700#  ifdef PIC
701	SETUP_PIC_REG(bx)
702	add	$_GLOBAL_OFFSET_TABLE_, %ebx
703	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
704#  else
705	cmp	__x86_data_cache_size_half, %ecx
706#  endif
707# endif
708	jb L(sh_4_no_prefetch)
709
	/* Bias the count by 64 for the loop test.  */
710	lea	-64(%ecx), %ecx
711
	/* 64 bytes per iteration: four aligned loads, four PALIGNR merges,
	   four aligned stores.  %xmm7 keeps an unshifted copy of the last
	   block and becomes the next %xmm1.  */
712	.p2align 4
713L(Shl4LoopStart):
714	prefetcht0 0x1c0(%eax)
715	prefetcht0 0x1c0(%edx)
716	movaps	12(%eax), %xmm2
717	movaps	28(%eax), %xmm3
718	movaps	44(%eax), %xmm4
719	movaps	60(%eax), %xmm5
720	movaps	%xmm5, %xmm7
721	palignr	$4, %xmm4, %xmm5
722	palignr	$4, %xmm3, %xmm4
723	movaps	%xmm5, 48(%edx)
724	palignr	$4, %xmm2, %xmm3
725	lea	64(%eax), %eax
726	palignr	$4, %xmm1, %xmm2
727	movaps	%xmm4, 32(%edx)
728	movaps	%xmm3, 16(%edx)
729	movaps	%xmm7, %xmm1
730	movaps	%xmm2, (%edx)
731	lea	64(%edx), %edx
732	sub	$64, %ecx
733	ja	L(Shl4LoopStart)
734
735L(Shl4LoopLeave):
	/* %ecx = bytes left - 64.  With 32 bytes or fewer left,
	   L(shl_end_0) (not visible here) finishes; otherwise copy 32
	   more.  */
736	add	$32, %ecx
737	jle	L(shl_end_0)
738
739	movaps	12(%eax), %xmm2
740	movaps	28(%eax), %xmm3
741	palignr	$4, %xmm2, %xmm3
742	palignr	$4, %xmm1, %xmm2
743	movaps	%xmm2, (%edx)
744	movaps	%xmm3, 16(%edx)
	/* Move both pointers past the end of the data, restore %edi and
	   dispatch on the remaining count in %ecx.  */
745	lea	32(%edx, %ecx), %edx
746	lea	32(%eax, %ecx), %eax
747	POP (%edi)
748	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
749
	/* Unwind info only: the POP above marked %edi as restored, but the
	   code that follows is entered with %edi still pushed.  */
750	CFI_PUSH (%edi)
751
752	.p2align 4
753L(sh_4_no_prefetch):
	/* Bias the count by 32, rebase %eax onto the aligned block and use
	   %edi as the running offset into both buffers.  */
754	lea	-32(%ecx), %ecx
755	lea	-4(%eax), %eax
756	xor	%edi, %edi
757
758	.p2align 4
759L(sh_4_no_prefetch_loop):
	/* First half: 32 bytes, the newest block is kept in %xmm4.  The jb
	   tests the borrow of the sub; the SSE instructions and lea in
	   between leave the flags alone.  */
760	movdqa	16(%eax, %edi), %xmm2
761	sub	$32, %ecx
762	movdqa	32(%eax, %edi), %xmm3
763	movdqa	%xmm3, %xmm4
764	palignr	$4, %xmm2, %xmm3
765	palignr	$4, %xmm1, %xmm2
766	lea	32(%edi), %edi
767	movdqa	%xmm2, -32(%edx, %edi)
768	movdqa	%xmm3, -16(%edx, %edi)
769
770	jb	L(sh_4_end_no_prefetch_loop)
771
	/* Second half: same, with %xmm4 as the previous block and %xmm1
	   receiving the newest one.  */
772	movdqa	16(%eax, %edi), %xmm2
773	sub	$32, %ecx
774	movdqa	32(%eax, %edi), %xmm3
775	movdqa	%xmm3, %xmm1
776	palignr	$4, %xmm2, %xmm3
777	palignr	$4, %xmm4, %xmm2
778	lea	32(%edi), %edi
779	movdqa	%xmm2, -32(%edx, %edi)
780	movdqa	%xmm3, -16(%edx, %edi)
781
782	jae	L(sh_4_no_prefetch_loop)
783
784L(sh_4_end_no_prefetch_loop):
	/* Undo the bias (%ecx = tail byte count), move both pointers past
	   the end of the data (adding the 4 back to %eax), restore %edi and
	   dispatch on the tail count.  */
785	lea	32(%ecx), %ecx
786	add	%ecx, %edi
787	add	%edi, %edx
788	lea	4(%edi, %eax), %eax
789	POP	(%edi)
790	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
791
	/* Unwind info only: the code that follows is entered with %edi
	   still pushed.  */
792	CFI_PUSH (%edi)
793
/* L(shl_5): forward copy for a source that lies 5 bytes past a 16-byte
   boundary (the movaps offsets below require this) with a 16-byte
   aligned destination.  NOTE(review): presumably reached through entry 5
   of L(shl_table), which is outside this part of the file -- confirm.
   On entry %eax = source, %edx = destination, %ecx = bytes left, and the
   caller's %edi is still on the stack.  Aligned 16-byte blocks are
   loaded around the source and PALIGNR $5 joins each neighbouring pair
   into one destination-aligned block.  %xmm1 (alternating with %xmm4 in
   the no-prefetch loop) carries the latest source block to the next
   step.  */
794	.p2align 4
795L(shl_5):
	/* Prime %xmm1 with the aligned block holding the first source byte.
	   The memmove build also stores the 16 head bytes kept in %xmm0 at
	   the original destination (slot DEST+4: %edi is pushed).  */
796# ifndef USE_AS_MEMMOVE
797	movaps	-5(%eax), %xmm1
798# else
799	movl	DEST+4(%esp), %edi
800	movaps	-5(%eax), %xmm1
801	movdqu	%xmm0, (%edi)
802# endif
	/* Below half the data cache size use the loop without prefetches.  */
803# ifdef DATA_CACHE_SIZE_HALF
804	cmp	$DATA_CACHE_SIZE_HALF, %ecx
805# else
806#  ifdef PIC
807	SETUP_PIC_REG(bx)
808	add	$_GLOBAL_OFFSET_TABLE_, %ebx
809	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
810#  else
811	cmp	__x86_data_cache_size_half, %ecx
812#  endif
813# endif
814	jb L(sh_5_no_prefetch)
815
	/* Bias the count by 64 for the loop test.  */
816	lea	-64(%ecx), %ecx
817
	/* 64 bytes per iteration: four aligned loads, four PALIGNR merges,
	   four aligned stores.  %xmm7 keeps an unshifted copy of the last
	   block and becomes the next %xmm1.  */
818	.p2align 4
819L(Shl5LoopStart):
820	prefetcht0 0x1c0(%eax)
821	prefetcht0 0x1c0(%edx)
822	movaps	11(%eax), %xmm2
823	movaps	27(%eax), %xmm3
824	movaps	43(%eax), %xmm4
825	movaps	59(%eax), %xmm5
826	movaps	%xmm5, %xmm7
827	palignr	$5, %xmm4, %xmm5
828	palignr	$5, %xmm3, %xmm4
829	movaps	%xmm5, 48(%edx)
830	palignr	$5, %xmm2, %xmm3
831	lea	64(%eax), %eax
832	palignr	$5, %xmm1, %xmm2
833	movaps	%xmm4, 32(%edx)
834	movaps	%xmm3, 16(%edx)
835	movaps	%xmm7, %xmm1
836	movaps	%xmm2, (%edx)
837	lea	64(%edx), %edx
838	sub	$64, %ecx
839	ja	L(Shl5LoopStart)
840
841L(Shl5LoopLeave):
	/* %ecx = bytes left - 64.  With 32 bytes or fewer left,
	   L(shl_end_0) (not visible here) finishes; otherwise copy 32
	   more.  */
842	add	$32, %ecx
843	jle	L(shl_end_0)
844
845	movaps	11(%eax), %xmm2
846	movaps	27(%eax), %xmm3
847	palignr	$5, %xmm2, %xmm3
848	palignr	$5, %xmm1, %xmm2
849	movaps	%xmm2, (%edx)
850	movaps	%xmm3, 16(%edx)
	/* Move both pointers past the end of the data, restore %edi and
	   dispatch on the remaining count in %ecx.  */
851	lea	32(%edx, %ecx), %edx
852	lea	32(%eax, %ecx), %eax
853	POP (%edi)
854	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
855
	/* Unwind info only: the POP above marked %edi as restored, but the
	   code that follows is entered with %edi still pushed.  */
856	CFI_PUSH (%edi)
857
858	.p2align 4
859L(sh_5_no_prefetch):
	/* Bias the count by 32, rebase %eax onto the aligned block and use
	   %edi as the running offset into both buffers.  */
860	lea	-32(%ecx), %ecx
861	lea	-5(%eax), %eax
862	xor	%edi, %edi
863
864	.p2align 4
865L(sh_5_no_prefetch_loop):
	/* First half: 32 bytes, the newest block is kept in %xmm4.  The jb
	   tests the borrow of the sub; the SSE instructions and lea in
	   between leave the flags alone.  */
866	movdqa	16(%eax, %edi), %xmm2
867	sub	$32, %ecx
868	movdqa	32(%eax, %edi), %xmm3
869	movdqa	%xmm3, %xmm4
870	palignr	$5, %xmm2, %xmm3
871	palignr	$5, %xmm1, %xmm2
872	lea	32(%edi), %edi
873	movdqa	%xmm2, -32(%edx, %edi)
874	movdqa	%xmm3, -16(%edx, %edi)
875
876	jb	L(sh_5_end_no_prefetch_loop)
877
	/* Second half: same, with %xmm4 as the previous block and %xmm1
	   receiving the newest one.  */
878	movdqa	16(%eax, %edi), %xmm2
879	sub	$32, %ecx
880	movdqa	32(%eax, %edi), %xmm3
881	movdqa	%xmm3, %xmm1
882	palignr	$5, %xmm2, %xmm3
883	palignr	$5, %xmm4, %xmm2
884	lea	32(%edi), %edi
885	movdqa	%xmm2, -32(%edx, %edi)
886	movdqa	%xmm3, -16(%edx, %edi)
887
888	jae	L(sh_5_no_prefetch_loop)
889
890L(sh_5_end_no_prefetch_loop):
	/* Undo the bias (%ecx = tail byte count), move both pointers past
	   the end of the data (adding the 5 back to %eax), restore %edi and
	   dispatch on the tail count.  */
891	lea	32(%ecx), %ecx
892	add	%ecx, %edi
893	add	%edi, %edx
894	lea	5(%edi, %eax), %eax
895	POP	(%edi)
896	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
897
	/* Unwind info only: the code that follows is entered with %edi
	   still pushed.  */
898	CFI_PUSH (%edi)
899
/* L(shl_6): forward copy for a source that lies 6 bytes past a 16-byte
   boundary (the movaps offsets below require this) with a 16-byte
   aligned destination.  NOTE(review): presumably reached through entry 6
   of L(shl_table), which is outside this part of the file -- confirm.
   On entry %eax = source, %edx = destination, %ecx = bytes left, and the
   caller's %edi is still on the stack.  Aligned 16-byte blocks are
   loaded around the source and PALIGNR $6 joins each neighbouring pair
   into one destination-aligned block.  %xmm1 (alternating with %xmm4 in
   the no-prefetch loop) carries the latest source block to the next
   step.  */
900	.p2align 4
901L(shl_6):
	/* Prime %xmm1 with the aligned block holding the first source byte.
	   The memmove build also stores the 16 head bytes kept in %xmm0 at
	   the original destination (slot DEST+4: %edi is pushed).  */
902# ifndef USE_AS_MEMMOVE
903	movaps	-6(%eax), %xmm1
904# else
905	movl	DEST+4(%esp), %edi
906	movaps	-6(%eax), %xmm1
907	movdqu	%xmm0, (%edi)
908# endif
	/* Below half the data cache size use the loop without prefetches.  */
909# ifdef DATA_CACHE_SIZE_HALF
910	cmp	$DATA_CACHE_SIZE_HALF, %ecx
911# else
912#  ifdef PIC
913	SETUP_PIC_REG(bx)
914	add	$_GLOBAL_OFFSET_TABLE_, %ebx
915	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
916#  else
917	cmp	__x86_data_cache_size_half, %ecx
918#  endif
919# endif
920	jb L(sh_6_no_prefetch)
921
	/* Bias the count by 64 for the loop test.  */
922	lea	-64(%ecx), %ecx
923
	/* 64 bytes per iteration: four aligned loads, four PALIGNR merges,
	   four aligned stores.  %xmm7 keeps an unshifted copy of the last
	   block and becomes the next %xmm1.  */
924	.p2align 4
925L(Shl6LoopStart):
926	prefetcht0 0x1c0(%eax)
927	prefetcht0 0x1c0(%edx)
928	movaps	10(%eax), %xmm2
929	movaps	26(%eax), %xmm3
930	movaps	42(%eax), %xmm4
931	movaps	58(%eax), %xmm5
932	movaps	%xmm5, %xmm7
933	palignr	$6, %xmm4, %xmm5
934	palignr	$6, %xmm3, %xmm4
935	movaps	%xmm5, 48(%edx)
936	palignr	$6, %xmm2, %xmm3
937	lea	64(%eax), %eax
938	palignr	$6, %xmm1, %xmm2
939	movaps	%xmm4, 32(%edx)
940	movaps	%xmm3, 16(%edx)
941	movaps	%xmm7, %xmm1
942	movaps	%xmm2, (%edx)
943	lea	64(%edx), %edx
944	sub	$64, %ecx
945	ja	L(Shl6LoopStart)
946
947L(Shl6LoopLeave):
	/* %ecx = bytes left - 64.  With 32 bytes or fewer left,
	   L(shl_end_0) (not visible here) finishes; otherwise copy 32
	   more.  */
948	add	$32, %ecx
949	jle	L(shl_end_0)
950
951	movaps	10(%eax), %xmm2
952	movaps	26(%eax), %xmm3
953	palignr	$6, %xmm2, %xmm3
954	palignr	$6, %xmm1, %xmm2
955	movaps	%xmm2, (%edx)
956	movaps	%xmm3, 16(%edx)
	/* Move both pointers past the end of the data, restore %edi and
	   dispatch on the remaining count in %ecx.  */
957	lea	32(%edx, %ecx), %edx
958	lea	32(%eax, %ecx), %eax
959	POP (%edi)
960	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
961
	/* Unwind info only: the POP above marked %edi as restored, but the
	   code that follows is entered with %edi still pushed.  */
962	CFI_PUSH (%edi)
963
964	.p2align 4
965L(sh_6_no_prefetch):
	/* Bias the count by 32, rebase %eax onto the aligned block and use
	   %edi as the running offset into both buffers.  */
966	lea	-32(%ecx), %ecx
967	lea	-6(%eax), %eax
968	xor	%edi, %edi
969
970	.p2align 4
971L(sh_6_no_prefetch_loop):
	/* First half: 32 bytes, the newest block is kept in %xmm4.  The jb
	   tests the borrow of the sub; the SSE instructions and lea in
	   between leave the flags alone.  */
972	movdqa	16(%eax, %edi), %xmm2
973	sub	$32, %ecx
974	movdqa	32(%eax, %edi), %xmm3
975	movdqa	%xmm3, %xmm4
976	palignr	$6, %xmm2, %xmm3
977	palignr	$6, %xmm1, %xmm2
978	lea	32(%edi), %edi
979	movdqa	%xmm2, -32(%edx, %edi)
980	movdqa	%xmm3, -16(%edx, %edi)
981
982	jb	L(sh_6_end_no_prefetch_loop)
983
	/* Second half: same, with %xmm4 as the previous block and %xmm1
	   receiving the newest one.  */
984	movdqa	16(%eax, %edi), %xmm2
985	sub	$32, %ecx
986	movdqa	32(%eax, %edi), %xmm3
987	movdqa	%xmm3, %xmm1
988	palignr	$6, %xmm2, %xmm3
989	palignr	$6, %xmm4, %xmm2
990	lea	32(%edi), %edi
991	movdqa	%xmm2, -32(%edx, %edi)
992	movdqa	%xmm3, -16(%edx, %edi)
993
994	jae	L(sh_6_no_prefetch_loop)
995
996L(sh_6_end_no_prefetch_loop):
	/* Undo the bias (%ecx = tail byte count), move both pointers past
	   the end of the data (adding the 6 back to %eax), restore %edi and
	   dispatch on the tail count.  */
997	lea	32(%ecx), %ecx
998	add	%ecx, %edi
999	add	%edi, %edx
1000	lea	6(%edi, %eax), %eax
1001	POP	(%edi)
1002	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1003
	/* Unwind info only: the code that follows is entered with %edi
	   still pushed.  */
1004	CFI_PUSH (%edi)
1005
/* L(shl_7): forward copy for a source that lies 7 bytes past a 16-byte
   boundary (the movaps offsets below require this) with a 16-byte
   aligned destination.  NOTE(review): presumably reached through entry 7
   of L(shl_table), which is outside this part of the file -- confirm.
   On entry %eax = source, %edx = destination, %ecx = bytes left, and the
   caller's %edi is still on the stack.  Aligned 16-byte blocks are
   loaded around the source and PALIGNR $7 joins each neighbouring pair
   into one destination-aligned block.  %xmm1 (alternating with %xmm4 in
   the no-prefetch loop) carries the latest source block to the next
   step.  */
1006	.p2align 4
1007L(shl_7):
	/* Prime %xmm1 with the aligned block holding the first source byte.
	   The memmove build also stores the 16 head bytes kept in %xmm0 at
	   the original destination (slot DEST+4: %edi is pushed).  */
1008# ifndef USE_AS_MEMMOVE
1009	movaps	-7(%eax), %xmm1
1010# else
1011	movl	DEST+4(%esp), %edi
1012	movaps	-7(%eax), %xmm1
1013	movdqu	%xmm0, (%edi)
1014# endif
	/* Below half the data cache size use the loop without prefetches.  */
1015# ifdef DATA_CACHE_SIZE_HALF
1016	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1017# else
1018#  ifdef PIC
1019	SETUP_PIC_REG(bx)
1020	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1021	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1022#  else
1023	cmp	__x86_data_cache_size_half, %ecx
1024#  endif
1025# endif
1026	jb L(sh_7_no_prefetch)
1027
	/* Bias the count by 64 for the loop test.  */
1028	lea	-64(%ecx), %ecx
1029
	/* 64 bytes per iteration: four aligned loads, four PALIGNR merges,
	   four aligned stores.  %xmm7 keeps an unshifted copy of the last
	   block and becomes the next %xmm1.  */
1030	.p2align 4
1031L(Shl7LoopStart):
1032	prefetcht0 0x1c0(%eax)
1033	prefetcht0 0x1c0(%edx)
1034	movaps	9(%eax), %xmm2
1035	movaps	25(%eax), %xmm3
1036	movaps	41(%eax), %xmm4
1037	movaps	57(%eax), %xmm5
1038	movaps	%xmm5, %xmm7
1039	palignr	$7, %xmm4, %xmm5
1040	palignr	$7, %xmm3, %xmm4
1041	movaps	%xmm5, 48(%edx)
1042	palignr	$7, %xmm2, %xmm3
1043	lea	64(%eax), %eax
1044	palignr	$7, %xmm1, %xmm2
1045	movaps	%xmm4, 32(%edx)
1046	movaps	%xmm3, 16(%edx)
1047	movaps	%xmm7, %xmm1
1048	movaps	%xmm2, (%edx)
1049	lea	64(%edx), %edx
1050	sub	$64, %ecx
1051	ja	L(Shl7LoopStart)
1052
1053L(Shl7LoopLeave):
	/* %ecx = bytes left - 64.  With 32 bytes or fewer left,
	   L(shl_end_0) (not visible here) finishes; otherwise copy 32
	   more.  */
1054	add	$32, %ecx
1055	jle	L(shl_end_0)
1056
1057	movaps	9(%eax), %xmm2
1058	movaps	25(%eax), %xmm3
1059	palignr	$7, %xmm2, %xmm3
1060	palignr	$7, %xmm1, %xmm2
1061	movaps	%xmm2, (%edx)
1062	movaps	%xmm3, 16(%edx)
	/* Move both pointers past the end of the data, restore %edi and
	   dispatch on the remaining count in %ecx.  */
1063	lea	32(%edx, %ecx), %edx
1064	lea	32(%eax, %ecx), %eax
1065	POP (%edi)
1066	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1067
	/* Unwind info only: the POP above marked %edi as restored, but the
	   code that follows is entered with %edi still pushed.  */
1068	CFI_PUSH (%edi)
1069
1070	.p2align 4
1071L(sh_7_no_prefetch):
	/* Bias the count by 32, rebase %eax onto the aligned block and use
	   %edi as the running offset into both buffers.  */
1072	lea	-32(%ecx), %ecx
1073	lea	-7(%eax), %eax
1074	xor	%edi, %edi
1075
1076	.p2align 4
1077L(sh_7_no_prefetch_loop):
	/* First half: 32 bytes, the newest block is kept in %xmm4.  The jb
	   tests the borrow of the sub; the SSE instructions and lea in
	   between leave the flags alone.  */
1078	movdqa	16(%eax, %edi), %xmm2
1079	sub	$32, %ecx
1080	movdqa	32(%eax, %edi), %xmm3
1081	movdqa	%xmm3, %xmm4
1082	palignr	$7, %xmm2, %xmm3
1083	palignr	$7, %xmm1, %xmm2
1084	lea	32(%edi), %edi
1085	movdqa	%xmm2, -32(%edx, %edi)
1086	movdqa	%xmm3, -16(%edx, %edi)
1087	jb	L(sh_7_end_no_prefetch_loop)
1088
	/* Second half: same, with %xmm4 as the previous block and %xmm1
	   receiving the newest one.  */
1089	movdqa	16(%eax, %edi), %xmm2
1090	sub	$32, %ecx
1091	movdqa	32(%eax, %edi), %xmm3
1092	movdqa	%xmm3, %xmm1
1093	palignr	$7, %xmm2, %xmm3
1094	palignr	$7, %xmm4, %xmm2
1095	lea	32(%edi), %edi
1096	movdqa	%xmm2, -32(%edx, %edi)
1097	movdqa	%xmm3, -16(%edx, %edi)
1098	jae	L(sh_7_no_prefetch_loop)
1099
1100L(sh_7_end_no_prefetch_loop):
	/* Undo the bias (%ecx = tail byte count), move both pointers past
	   the end of the data (adding the 7 back to %eax), restore %edi and
	   dispatch on the tail count.  */
1101	lea	32(%ecx), %ecx
1102	add	%ecx, %edi
1103	add	%edi, %edx
1104	lea	7(%edi, %eax), %eax
1105	POP	(%edi)
1106	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1107
	/* Unwind info only: the code that follows is entered with %edi
	   still pushed.  */
1108	CFI_PUSH (%edi)
1109
/* L(shl_8): forward copy for a source that lies 8 bytes past a 16-byte
   boundary (the movaps offsets below require this) with a 16-byte
   aligned destination.  NOTE(review): presumably reached through entry 8
   of L(shl_table), which is outside this part of the file -- confirm.
   On entry %eax = source, %edx = destination, %ecx = bytes left, and the
   caller's %edi is still on the stack.  Aligned 16-byte blocks are
   loaded around the source and PALIGNR $8 joins each neighbouring pair
   into one destination-aligned block.  %xmm1 (alternating with %xmm4 in
   the no-prefetch loop) carries the latest source block to the next
   step.  */
1110	.p2align 4
1111L(shl_8):
	/* Prime %xmm1 with the aligned block holding the first source byte.
	   The memmove build also stores the 16 head bytes kept in %xmm0 at
	   the original destination (slot DEST+4: %edi is pushed).  */
1112# ifndef USE_AS_MEMMOVE
1113	movaps	-8(%eax), %xmm1
1114# else
1115	movl	DEST+4(%esp), %edi
1116	movaps	-8(%eax), %xmm1
1117	movdqu	%xmm0, (%edi)
1118# endif
	/* Below half the data cache size use the loop without prefetches.  */
1119# ifdef DATA_CACHE_SIZE_HALF
1120	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1121# else
1122#  ifdef PIC
1123	SETUP_PIC_REG(bx)
1124	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1125	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1126#  else
1127	cmp	__x86_data_cache_size_half, %ecx
1128#  endif
1129# endif
1130	jb L(sh_8_no_prefetch)
1131
	/* Bias the count by 64 for the loop test.  */
1132	lea	-64(%ecx), %ecx
1133
	/* 64 bytes per iteration: four aligned loads, four PALIGNR merges,
	   four aligned stores.  %xmm7 keeps an unshifted copy of the last
	   block and becomes the next %xmm1.  */
1134	.p2align 4
1135L(Shl8LoopStart):
1136	prefetcht0 0x1c0(%eax)
1137	prefetcht0 0x1c0(%edx)
1138	movaps	8(%eax), %xmm2
1139	movaps	24(%eax), %xmm3
1140	movaps	40(%eax), %xmm4
1141	movaps	56(%eax), %xmm5
1142	movaps	%xmm5, %xmm7
1143	palignr	$8, %xmm4, %xmm5
1144	palignr	$8, %xmm3, %xmm4
1145	movaps	%xmm5, 48(%edx)
1146	palignr	$8, %xmm2, %xmm3
1147	lea	64(%eax), %eax
1148	palignr	$8, %xmm1, %xmm2
1149	movaps	%xmm4, 32(%edx)
1150	movaps	%xmm3, 16(%edx)
1151	movaps	%xmm7, %xmm1
1152	movaps	%xmm2, (%edx)
1153	lea	64(%edx), %edx
1154	sub	$64, %ecx
1155	ja	L(Shl8LoopStart)
1156
	/* This label is spelled LoopLeave8, unlike the Shl<N>LoopLeave
	   labels of the neighbouring sections.  */
1157L(LoopLeave8):
	/* %ecx = bytes left - 64.  With 32 bytes or fewer left,
	   L(shl_end_0) (not visible here) finishes; otherwise copy 32
	   more.  */
1158	add	$32, %ecx
1159	jle	L(shl_end_0)
1160
1161	movaps	8(%eax), %xmm2
1162	movaps	24(%eax), %xmm3
1163	palignr	$8, %xmm2, %xmm3
1164	palignr	$8, %xmm1, %xmm2
1165	movaps	%xmm2, (%edx)
1166	movaps	%xmm3, 16(%edx)
	/* Move both pointers past the end of the data, restore %edi and
	   dispatch on the remaining count in %ecx.  */
1167	lea	32(%edx, %ecx), %edx
1168	lea	32(%eax, %ecx), %eax
1169	POP (%edi)
1170	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1171
	/* Unwind info only: the POP above marked %edi as restored, but the
	   code that follows is entered with %edi still pushed.  */
1172	CFI_PUSH (%edi)
1173
1174	.p2align 4
1175L(sh_8_no_prefetch):
	/* Bias the count by 32, rebase %eax onto the aligned block and use
	   %edi as the running offset into both buffers.  */
1176	lea	-32(%ecx), %ecx
1177	lea	-8(%eax), %eax
1178	xor	%edi, %edi
1179
1180	.p2align 4
1181L(sh_8_no_prefetch_loop):
	/* First half: 32 bytes, the newest block is kept in %xmm4.  The jb
	   tests the borrow of the sub; the SSE instructions and lea in
	   between leave the flags alone.  */
1182	movdqa	16(%eax, %edi), %xmm2
1183	sub	$32, %ecx
1184	movdqa	32(%eax, %edi), %xmm3
1185	movdqa	%xmm3, %xmm4
1186	palignr	$8, %xmm2, %xmm3
1187	palignr	$8, %xmm1, %xmm2
1188	lea	32(%edi), %edi
1189	movdqa	%xmm2, -32(%edx, %edi)
1190	movdqa	%xmm3, -16(%edx, %edi)
1191	jb	L(sh_8_end_no_prefetch_loop)
1192
	/* Second half: same, with %xmm4 as the previous block and %xmm1
	   receiving the newest one.  */
1193	movdqa	16(%eax, %edi), %xmm2
1194	sub	$32, %ecx
1195	movdqa	32(%eax, %edi), %xmm3
1196	movdqa	%xmm3, %xmm1
1197	palignr	$8, %xmm2, %xmm3
1198	palignr	$8, %xmm4, %xmm2
1199	lea	32(%edi), %edi
1200	movdqa	%xmm2, -32(%edx, %edi)
1201	movdqa	%xmm3, -16(%edx, %edi)
1202	jae	L(sh_8_no_prefetch_loop)
1203
1204L(sh_8_end_no_prefetch_loop):
	/* Undo the bias (%ecx = tail byte count), move both pointers past
	   the end of the data (adding the 8 back to %eax), restore %edi and
	   dispatch on the tail count.  */
1205	lea	32(%ecx), %ecx
1206	add	%ecx, %edi
1207	add	%edi, %edx
1208	lea	8(%edi, %eax), %eax
1209	POP	(%edi)
1210	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1211
	/* Unwind info only: the code that follows is entered with %edi
	   still pushed.  */
1212	CFI_PUSH (%edi)
1213
/* L(shl_9): forward copy for a source that lies 9 bytes past a 16-byte
   boundary (the movaps offsets below require this) with a 16-byte
   aligned destination.  NOTE(review): presumably reached through entry 9
   of L(shl_table), which is outside this part of the file -- confirm.
   On entry %eax = source, %edx = destination, %ecx = bytes left, and the
   caller's %edi is still on the stack.  Aligned 16-byte blocks are
   loaded around the source and PALIGNR $9 joins each neighbouring pair
   into one destination-aligned block.  %xmm1 (alternating with %xmm4 in
   the no-prefetch loop) carries the latest source block to the next
   step.  */
1214	.p2align 4
1215L(shl_9):
	/* Prime %xmm1 with the aligned block holding the first source byte.
	   The memmove build also stores the 16 head bytes kept in %xmm0 at
	   the original destination (slot DEST+4: %edi is pushed).  */
1216# ifndef USE_AS_MEMMOVE
1217	movaps	-9(%eax), %xmm1
1218# else
1219	movl	DEST+4(%esp), %edi
1220	movaps	-9(%eax), %xmm1
1221	movdqu	%xmm0, (%edi)
1222# endif
	/* Below half the data cache size use the loop without prefetches.  */
1223# ifdef DATA_CACHE_SIZE_HALF
1224	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1225# else
1226#  ifdef PIC
1227	SETUP_PIC_REG(bx)
1228	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1229	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1230#  else
1231	cmp	__x86_data_cache_size_half, %ecx
1232#  endif
1233# endif
1234	jb L(sh_9_no_prefetch)
1235
	/* Bias the count by 64 for the loop test.  */
1236	lea	-64(%ecx), %ecx
1237
	/* 64 bytes per iteration: four aligned loads, four PALIGNR merges,
	   four aligned stores.  %xmm7 keeps an unshifted copy of the last
	   block and becomes the next %xmm1.  */
1238	.p2align 4
1239L(Shl9LoopStart):
1240	prefetcht0 0x1c0(%eax)
1241	prefetcht0 0x1c0(%edx)
1242	movaps	7(%eax), %xmm2
1243	movaps	23(%eax), %xmm3
1244	movaps	39(%eax), %xmm4
1245	movaps	55(%eax), %xmm5
1246	movaps	%xmm5, %xmm7
1247	palignr	$9, %xmm4, %xmm5
1248	palignr	$9, %xmm3, %xmm4
1249	movaps	%xmm5, 48(%edx)
1250	palignr	$9, %xmm2, %xmm3
1251	lea	64(%eax), %eax
1252	palignr	$9, %xmm1, %xmm2
1253	movaps	%xmm4, 32(%edx)
1254	movaps	%xmm3, 16(%edx)
1255	movaps	%xmm7, %xmm1
1256	movaps	%xmm2, (%edx)
1257	lea	64(%edx), %edx
1258	sub	$64, %ecx
1259	ja	L(Shl9LoopStart)
1260
1261L(Shl9LoopLeave):
	/* %ecx = bytes left - 64.  With 32 bytes or fewer left,
	   L(shl_end_0) (not visible here) finishes; otherwise copy 32
	   more.  */
1262	add	$32, %ecx
1263	jle	L(shl_end_0)
1264
1265	movaps	7(%eax), %xmm2
1266	movaps	23(%eax), %xmm3
1267	palignr	$9, %xmm2, %xmm3
1268	palignr	$9, %xmm1, %xmm2
1269
1270	movaps	%xmm2, (%edx)
1271	movaps	%xmm3, 16(%edx)
	/* Move both pointers past the end of the data, restore %edi and
	   dispatch on the remaining count in %ecx.  */
1272	lea	32(%edx, %ecx), %edx
1273	lea	32(%eax, %ecx), %eax
1274	POP (%edi)
1275	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1276
	/* Unwind info only: the POP above marked %edi as restored, but the
	   code that follows is entered with %edi still pushed.  */
1277	CFI_PUSH (%edi)
1278
1279	.p2align 4
1280L(sh_9_no_prefetch):
	/* Bias the count by 32, rebase %eax onto the aligned block and use
	   %edi as the running offset into both buffers.  */
1281	lea	-32(%ecx), %ecx
1282	lea	-9(%eax), %eax
1283	xor	%edi, %edi
1284
1285	.p2align 4
1286L(sh_9_no_prefetch_loop):
	/* First half: 32 bytes, the newest block is kept in %xmm4.  The jb
	   tests the borrow of the sub; the SSE instructions and lea in
	   between leave the flags alone.  */
1287	movdqa	16(%eax, %edi), %xmm2
1288	sub	$32, %ecx
1289	movdqa	32(%eax, %edi), %xmm3
1290	movdqa	%xmm3, %xmm4
1291	palignr	$9, %xmm2, %xmm3
1292	palignr	$9, %xmm1, %xmm2
1293	lea	32(%edi), %edi
1294	movdqa	%xmm2, -32(%edx, %edi)
1295	movdqa	%xmm3, -16(%edx, %edi)
1296	jb	L(sh_9_end_no_prefetch_loop)
1297
	/* Second half: same, with %xmm4 as the previous block and %xmm1
	   receiving the newest one.  */
1298	movdqa	16(%eax, %edi), %xmm2
1299	sub	$32, %ecx
1300	movdqa	32(%eax, %edi), %xmm3
1301	movdqa	%xmm3, %xmm1
1302	palignr	$9, %xmm2, %xmm3
1303	palignr	$9, %xmm4, %xmm2
1304	lea	32(%edi), %edi
1305	movdqa	%xmm2, -32(%edx, %edi)
1306	movdqa	%xmm3, -16(%edx, %edi)
1307	jae	L(sh_9_no_prefetch_loop)
1308
1309L(sh_9_end_no_prefetch_loop):
	/* Undo the bias (%ecx = tail byte count), move both pointers past
	   the end of the data (adding the 9 back to %eax), restore %edi and
	   dispatch on the tail count.  */
1310	lea	32(%ecx), %ecx
1311	add	%ecx, %edi
1312	add	%edi, %edx
1313	lea	9(%edi, %eax), %eax
1314	POP	(%edi)
1315	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1316
	/* Unwind info only: the code that follows is entered with %edi
	   still pushed.  */
1317	CFI_PUSH (%edi)
1318
1319	.p2align 4
/* Forward copy for a source that is 10 bytes past a 16-byte boundary: all
   source reads are movaps at (16*k - 10)(%eax), and palignr $10 merges
   each pair of neighbouring blocks into one destination-aligned block.
   %eax = source, %edx = destination, %ecx = byte count; %xmm1 carries the
   previously loaded block.  Counts below the half-data-cache size branch
   to L(sh_10_no_prefetch); otherwise the 64-byte prefetching loop runs.  */
1320L(shl_10):
1321# ifndef USE_AS_MEMMOVE
1322	movaps	-10(%eax), %xmm1
1323# else
	/* memmove build: %edi is still saved on the stack, hence DEST+4.
	   NOTE(review): %xmm0 presumably holds the head of the source saved
	   before dispatch -- confirm at the call site.  */
1324	movl	DEST+4(%esp), %edi
1325	movaps	-10(%eax), %xmm1
1326	movdqu	%xmm0, (%edi)
1327# endif
1328# ifdef DATA_CACHE_SIZE_HALF
1329	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1330# else
1331#  ifdef PIC
1332	SETUP_PIC_REG(bx)
1333	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1334	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1335#  else
1336	cmp	__x86_data_cache_size_half, %ecx
1337#  endif
1338# endif
1339	jb L(sh_10_no_prefetch)
1340
	/* Bias the count so that "sub $64 / ja" leaves the loop once no more
	   than 64 bytes are left.  */
1341	lea	-64(%ecx), %ecx
1342
	/* 64 bytes per iteration.  %xmm7 keeps the last block unshifted so it
	   can become the carry (%xmm1) of the next iteration.  */
1343	.p2align 4
1344L(Shl10LoopStart):
1345	prefetcht0 0x1c0(%eax)
1346	prefetcht0 0x1c0(%edx)
1347	movaps	6(%eax), %xmm2
1348	movaps	22(%eax), %xmm3
1349	movaps	38(%eax), %xmm4
1350	movaps	54(%eax), %xmm5
1351	movaps	%xmm5, %xmm7
1352	palignr	$10, %xmm4, %xmm5
1353	palignr	$10, %xmm3, %xmm4
1354	movaps	%xmm5, 48(%edx)
1355	palignr	$10, %xmm2, %xmm3
1356	lea	64(%eax), %eax
1357	palignr	$10, %xmm1, %xmm2
1358	movaps	%xmm4, 32(%edx)
1359	movaps	%xmm3, 16(%edx)
1360	movaps	%xmm7, %xmm1
1361	movaps	%xmm2, (%edx)
1362	lea	64(%edx), %edx
1363	sub	$64, %ecx
1364	ja	L(Shl10LoopStart)
1365
	/* After +32: <= 0 means at most 32 bytes remain, handled by
	   L(shl_end_0); otherwise one more 32-byte piece is copied here.  */
1366L(Shl10LoopLeave):
1367	add	$32, %ecx
1368	jle	L(shl_end_0)
1369
1370	movaps	6(%eax), %xmm2
1371	movaps	22(%eax), %xmm3
1372	palignr	$10, %xmm2, %xmm3
1373	palignr	$10, %xmm1, %xmm2
1374
1375	movaps	%xmm2, (%edx)
1376	movaps	%xmm3, 16(%edx)
	/* Point both registers at the end of the data; %ecx is the number of
	   bytes still to copy and indexes the tail table.  */
1377	lea	32(%edx, %ecx), %edx
1378	lea	32(%eax, %ecx), %eax
1379	POP (%edi)
1380	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1381
	/* POP also emitted CFI_POP for %edi; the code placed after this point
	   pops %edi itself, so declare the register as saved again.  */
1382	CFI_PUSH (%edi)
1383
1384	.p2align 4
/* Shift-by-10 forward copy without prefetch, entered from L(shl_10) when
   the count is below the cache threshold.  %eax is rebased by -10 so the
   movdqa loads are at 16-byte multiples from it, and %edi becomes the
   running byte offset.  The loop is unrolled twice, 32 bytes per half;
   the carry block alternates between %xmm1 and %xmm4.  */
1385L(sh_10_no_prefetch):
1386	lea	-32(%ecx), %ecx
1387	lea	-10(%eax), %eax
1388	xor	%edi, %edi
1389
1390	.p2align 4
1391L(sh_10_no_prefetch_loop):
1392	movdqa	16(%eax, %edi), %xmm2
	/* The borrow from this sub is tested by the jb below; the SSE moves,
	   palignr and lea in between do not modify the flags.  */
1393	sub	$32, %ecx
1394	movdqa	32(%eax, %edi), %xmm3
1395	movdqa	%xmm3, %xmm4
1396	palignr	$10, %xmm2, %xmm3
1397	palignr	$10, %xmm1, %xmm2
1398	lea	32(%edi), %edi
1399	movdqa	%xmm2, -32(%edx, %edi)
1400	movdqa	%xmm3, -16(%edx, %edi)
1401	jb	L(sh_10_end_no_prefetch_loop)
1402
	/* Second half: same step with %xmm4 as carry, refilling %xmm1.  */
1403	movdqa	16(%eax, %edi), %xmm2
1404	sub	$32, %ecx
1405	movdqa	32(%eax, %edi), %xmm3
1406	movdqa	%xmm3, %xmm1
1407	palignr	$10, %xmm2, %xmm3
1408	palignr	$10, %xmm4, %xmm2
1409	lea	32(%edi), %edi
1410	movdqa	%xmm2, -32(%edx, %edi)
1411	movdqa	%xmm3, -16(%edx, %edi)
1412	jae	L(sh_10_no_prefetch_loop)
1413
	/* Undo the count bias, fold the leftover count into the offset and
	   move both pointers past the remaining data (the +10 undoes the
	   rebasing of %eax) before dispatching the tail.  */
1414L(sh_10_end_no_prefetch_loop):
1415	lea	32(%ecx), %ecx
1416	add	%ecx, %edi
1417	add	%edi, %edx
1418	lea	10(%edi, %eax), %eax
1419	POP	(%edi)
1420	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1421
	/* POP also emitted CFI_POP for %edi; the code placed after this point
	   pops %edi itself, so declare the register as saved again.  */
1422	CFI_PUSH (%edi)
1423
1424	.p2align 4
/* Forward copy for a source that is 11 bytes past a 16-byte boundary: all
   source reads are movaps at (16*k - 11)(%eax), and palignr $11 merges
   each pair of neighbouring blocks into one destination-aligned block.
   %eax = source, %edx = destination, %ecx = byte count; %xmm1 carries the
   previously loaded block.  Counts below the half-data-cache size branch
   to L(sh_11_no_prefetch); otherwise the 64-byte prefetching loop runs.  */
1425L(shl_11):
1426# ifndef USE_AS_MEMMOVE
1427	movaps	-11(%eax), %xmm1
1428# else
	/* memmove build: %edi is still saved on the stack, hence DEST+4.
	   NOTE(review): %xmm0 presumably holds the head of the source saved
	   before dispatch -- confirm at the call site.  */
1429	movl	DEST+4(%esp), %edi
1430	movaps	-11(%eax), %xmm1
1431	movdqu	%xmm0, (%edi)
1432# endif
1433# ifdef DATA_CACHE_SIZE_HALF
1434	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1435# else
1436#  ifdef PIC
1437	SETUP_PIC_REG(bx)
1438	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1439	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1440#  else
1441	cmp	__x86_data_cache_size_half, %ecx
1442#  endif
1443# endif
1444	jb L(sh_11_no_prefetch)
1445
	/* Bias the count so that "sub $64 / ja" leaves the loop once no more
	   than 64 bytes are left.  */
1446	lea	-64(%ecx), %ecx
1447
	/* 64 bytes per iteration.  %xmm7 keeps the last block unshifted so it
	   can become the carry (%xmm1) of the next iteration.  */
1448	.p2align 4
1449L(Shl11LoopStart):
1450	prefetcht0 0x1c0(%eax)
1451	prefetcht0 0x1c0(%edx)
1452	movaps	5(%eax), %xmm2
1453	movaps	21(%eax), %xmm3
1454	movaps	37(%eax), %xmm4
1455	movaps	53(%eax), %xmm5
1456	movaps	%xmm5, %xmm7
1457	palignr	$11, %xmm4, %xmm5
1458	palignr	$11, %xmm3, %xmm4
1459	movaps	%xmm5, 48(%edx)
1460	palignr	$11, %xmm2, %xmm3
1461	lea	64(%eax), %eax
1462	palignr	$11, %xmm1, %xmm2
1463	movaps	%xmm4, 32(%edx)
1464	movaps	%xmm3, 16(%edx)
1465	movaps	%xmm7, %xmm1
1466	movaps	%xmm2, (%edx)
1467	lea	64(%edx), %edx
1468	sub	$64, %ecx
1469	ja	L(Shl11LoopStart)
1470
	/* After +32: <= 0 means at most 32 bytes remain, handled by
	   L(shl_end_0); otherwise one more 32-byte piece is copied here.  */
1471L(Shl11LoopLeave):
1472	add	$32, %ecx
1473	jle	L(shl_end_0)
1474
1475	movaps	5(%eax), %xmm2
1476	movaps	21(%eax), %xmm3
1477	palignr	$11, %xmm2, %xmm3
1478	palignr	$11, %xmm1, %xmm2
1479
1480	movaps	%xmm2, (%edx)
1481	movaps	%xmm3, 16(%edx)
	/* Point both registers at the end of the data; %ecx is the number of
	   bytes still to copy and indexes the tail table.  */
1482	lea	32(%edx, %ecx), %edx
1483	lea	32(%eax, %ecx), %eax
1484	POP (%edi)
1485	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1486
	/* POP also emitted CFI_POP for %edi; the code placed after this point
	   pops %edi itself, so declare the register as saved again.  */
1487	CFI_PUSH (%edi)
1488
1489	.p2align 4
/* Shift-by-11 forward copy without prefetch, entered from L(shl_11) when
   the count is below the cache threshold.  %eax is rebased by -11 so the
   movdqa loads are at 16-byte multiples from it, and %edi becomes the
   running byte offset.  The loop is unrolled twice, 32 bytes per half;
   the carry block alternates between %xmm1 and %xmm4.  */
1490L(sh_11_no_prefetch):
1491	lea	-32(%ecx), %ecx
1492	lea	-11(%eax), %eax
1493	xor	%edi, %edi
1494
1495	.p2align 4
1496L(sh_11_no_prefetch_loop):
1497	movdqa	16(%eax, %edi), %xmm2
	/* The borrow from this sub is tested by the jb below; the SSE moves,
	   palignr and lea in between do not modify the flags.  */
1498	sub	$32, %ecx
1499	movdqa	32(%eax, %edi), %xmm3
1500	movdqa	%xmm3, %xmm4
1501	palignr	$11, %xmm2, %xmm3
1502	palignr	$11, %xmm1, %xmm2
1503	lea	32(%edi), %edi
1504	movdqa	%xmm2, -32(%edx, %edi)
1505	movdqa	%xmm3, -16(%edx, %edi)
1506	jb	L(sh_11_end_no_prefetch_loop)
1507
	/* Second half: same step with %xmm4 as carry, refilling %xmm1.  */
1508	movdqa	16(%eax, %edi), %xmm2
1509	sub	$32, %ecx
1510	movdqa	32(%eax, %edi), %xmm3
1511	movdqa	%xmm3, %xmm1
1512	palignr	$11, %xmm2, %xmm3
1513	palignr	$11, %xmm4, %xmm2
1514	lea	32(%edi), %edi
1515	movdqa	%xmm2, -32(%edx, %edi)
1516	movdqa	%xmm3, -16(%edx, %edi)
1517	jae	L(sh_11_no_prefetch_loop)
1518
	/* Undo the count bias, fold the leftover count into the offset and
	   move both pointers past the remaining data (the +11 undoes the
	   rebasing of %eax) before dispatching the tail.  */
1519L(sh_11_end_no_prefetch_loop):
1520	lea	32(%ecx), %ecx
1521	add	%ecx, %edi
1522	add	%edi, %edx
1523	lea	11(%edi, %eax), %eax
1524	POP	(%edi)
1525	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1526
	/* POP also emitted CFI_POP for %edi; the code placed after this point
	   pops %edi itself, so declare the register as saved again.  */
1527	CFI_PUSH (%edi)
1528
1529	.p2align 4
/* Forward copy for a source that is 12 bytes past a 16-byte boundary: all
   source reads are movaps at (16*k - 12)(%eax), and palignr $12 merges
   each pair of neighbouring blocks into one destination-aligned block.
   %eax = source, %edx = destination, %ecx = byte count; %xmm1 carries the
   previously loaded block.  Counts below the half-data-cache size branch
   to L(sh_12_no_prefetch); otherwise the 64-byte prefetching loop runs.  */
1530L(shl_12):
1531# ifndef USE_AS_MEMMOVE
1532	movaps	-12(%eax), %xmm1
1533# else
	/* memmove build: %edi is still saved on the stack, hence DEST+4.
	   NOTE(review): %xmm0 presumably holds the head of the source saved
	   before dispatch -- confirm at the call site.  */
1534	movl	DEST+4(%esp), %edi
1535	movaps	-12(%eax), %xmm1
1536	movdqu	%xmm0, (%edi)
1537# endif
1538# ifdef DATA_CACHE_SIZE_HALF
1539	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1540# else
1541#  ifdef PIC
1542	SETUP_PIC_REG(bx)
1543	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1544	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1545#  else
1546	cmp	__x86_data_cache_size_half, %ecx
1547#  endif
1548# endif
1549	jb L(sh_12_no_prefetch)
1550
	/* Bias the count so that "sub $64 / ja" leaves the loop once no more
	   than 64 bytes are left.  */
1551	lea	-64(%ecx), %ecx
1552
	/* 64 bytes per iteration.  %xmm7 keeps the last block unshifted so it
	   can become the carry (%xmm1) of the next iteration.  */
1553	.p2align 4
1554L(Shl12LoopStart):
1555	prefetcht0 0x1c0(%eax)
1556	prefetcht0 0x1c0(%edx)
1557	movaps	4(%eax), %xmm2
1558	movaps	20(%eax), %xmm3
1559	movaps	36(%eax), %xmm4
1560	movaps	52(%eax), %xmm5
1561	movaps	%xmm5, %xmm7
1562	palignr	$12, %xmm4, %xmm5
1563	palignr	$12, %xmm3, %xmm4
1564	movaps	%xmm5, 48(%edx)
1565	palignr	$12, %xmm2, %xmm3
1566	lea	64(%eax), %eax
1567	palignr	$12, %xmm1, %xmm2
1568	movaps	%xmm4, 32(%edx)
1569	movaps	%xmm3, 16(%edx)
1570	movaps	%xmm7, %xmm1
1571	movaps	%xmm2, (%edx)
1572	lea	64(%edx), %edx
1573	sub	$64, %ecx
1574	ja	L(Shl12LoopStart)
1575
	/* After +32: <= 0 means at most 32 bytes remain, handled by
	   L(shl_end_0); otherwise one more 32-byte piece is copied here.  */
1576L(Shl12LoopLeave):
1577	add	$32, %ecx
1578	jle	L(shl_end_0)
1579
1580	movaps	4(%eax), %xmm2
1581	movaps	20(%eax), %xmm3
1582	palignr	$12, %xmm2, %xmm3
1583	palignr	$12, %xmm1, %xmm2
1584
1585	movaps	%xmm2, (%edx)
1586	movaps	%xmm3, 16(%edx)
	/* Point both registers at the end of the data; %ecx is the number of
	   bytes still to copy and indexes the tail table.  */
1587	lea	32(%edx, %ecx), %edx
1588	lea	32(%eax, %ecx), %eax
1589	POP (%edi)
1590	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1591
	/* POP also emitted CFI_POP for %edi; the code placed after this point
	   pops %edi itself, so declare the register as saved again.  */
1592	CFI_PUSH (%edi)
1593
1594	.p2align 4
/* Shift-by-12 forward copy without prefetch, entered from L(shl_12) when
   the count is below the cache threshold.  %eax is rebased by -12 so the
   movdqa loads are at 16-byte multiples from it, and %edi becomes the
   running byte offset.  The loop is unrolled twice, 32 bytes per half;
   the carry block alternates between %xmm1 and %xmm4.  */
1595L(sh_12_no_prefetch):
1596	lea	-32(%ecx), %ecx
1597	lea	-12(%eax), %eax
1598	xor	%edi, %edi
1599
1600	.p2align 4
1601L(sh_12_no_prefetch_loop):
1602	movdqa	16(%eax, %edi), %xmm2
	/* The borrow from this sub is tested by the jb below; the SSE moves,
	   palignr and lea in between do not modify the flags.  */
1603	sub	$32, %ecx
1604	movdqa	32(%eax, %edi), %xmm3
1605	movdqa	%xmm3, %xmm4
1606	palignr	$12, %xmm2, %xmm3
1607	palignr	$12, %xmm1, %xmm2
1608	lea	32(%edi), %edi
1609	movdqa	%xmm2, -32(%edx, %edi)
1610	movdqa	%xmm3, -16(%edx, %edi)
1611	jb	L(sh_12_end_no_prefetch_loop)
1612
	/* Second half: same step with %xmm4 as carry, refilling %xmm1.  */
1613	movdqa	16(%eax, %edi), %xmm2
1614	sub	$32, %ecx
1615	movdqa	32(%eax, %edi), %xmm3
1616	movdqa	%xmm3, %xmm1
1617	palignr	$12, %xmm2, %xmm3
1618	palignr	$12, %xmm4, %xmm2
1619	lea	32(%edi), %edi
1620	movdqa	%xmm2, -32(%edx, %edi)
1621	movdqa	%xmm3, -16(%edx, %edi)
1622	jae	L(sh_12_no_prefetch_loop)
1623
	/* Undo the count bias, fold the leftover count into the offset and
	   move both pointers past the remaining data (the +12 undoes the
	   rebasing of %eax) before dispatching the tail.  */
1624L(sh_12_end_no_prefetch_loop):
1625	lea	32(%ecx), %ecx
1626	add	%ecx, %edi
1627	add	%edi, %edx
1628	lea	12(%edi, %eax), %eax
1629	POP	(%edi)
1630	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1631
	/* POP also emitted CFI_POP for %edi; the code placed after this point
	   pops %edi itself, so declare the register as saved again.  */
1632	CFI_PUSH (%edi)
1633
1634	.p2align 4
/* Forward copy for a source that is 13 bytes past a 16-byte boundary: all
   source reads are movaps at (16*k - 13)(%eax), and palignr $13 merges
   each pair of neighbouring blocks into one destination-aligned block.
   %eax = source, %edx = destination, %ecx = byte count; %xmm1 carries the
   previously loaded block.  Counts below the half-data-cache size branch
   to L(sh_13_no_prefetch); otherwise the 64-byte prefetching loop runs.  */
1635L(shl_13):
1636# ifndef USE_AS_MEMMOVE
1637	movaps	-13(%eax), %xmm1
1638# else
	/* memmove build: %edi is still saved on the stack, hence DEST+4.
	   NOTE(review): %xmm0 presumably holds the head of the source saved
	   before dispatch -- confirm at the call site.  */
1639	movl	DEST+4(%esp), %edi
1640	movaps	-13(%eax), %xmm1
1641	movdqu	%xmm0, (%edi)
1642# endif
1643# ifdef DATA_CACHE_SIZE_HALF
1644	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1645# else
1646#  ifdef PIC
1647	SETUP_PIC_REG(bx)
1648	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1649	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1650#  else
1651	cmp	__x86_data_cache_size_half, %ecx
1652#  endif
1653# endif
1654	jb L(sh_13_no_prefetch)
1655
	/* Bias the count so that "sub $64 / ja" leaves the loop once no more
	   than 64 bytes are left.  */
1656	lea	-64(%ecx), %ecx
1657
	/* 64 bytes per iteration.  %xmm7 keeps the last block unshifted so it
	   can become the carry (%xmm1) of the next iteration.  */
1658	.p2align 4
1659L(Shl13LoopStart):
1660	prefetcht0 0x1c0(%eax)
1661	prefetcht0 0x1c0(%edx)
1662	movaps	3(%eax), %xmm2
1663	movaps	19(%eax), %xmm3
1664	movaps	35(%eax), %xmm4
1665	movaps	51(%eax), %xmm5
1666	movaps	%xmm5, %xmm7
1667	palignr	$13, %xmm4, %xmm5
1668	palignr	$13, %xmm3, %xmm4
1669	movaps	%xmm5, 48(%edx)
1670	palignr	$13, %xmm2, %xmm3
1671	lea	64(%eax), %eax
1672	palignr	$13, %xmm1, %xmm2
1673	movaps	%xmm4, 32(%edx)
1674	movaps	%xmm3, 16(%edx)
1675	movaps	%xmm7, %xmm1
1676	movaps	%xmm2, (%edx)
1677	lea	64(%edx), %edx
1678	sub	$64, %ecx
1679	ja	L(Shl13LoopStart)
1680
	/* After +32: <= 0 means at most 32 bytes remain, handled by
	   L(shl_end_0); otherwise one more 32-byte piece is copied here.  */
1681L(Shl13LoopLeave):
1682	add	$32, %ecx
1683	jle	L(shl_end_0)
1684
1685	movaps	3(%eax), %xmm2
1686	movaps	19(%eax), %xmm3
1687	palignr	$13, %xmm2, %xmm3
1688	palignr	$13, %xmm1, %xmm2
1689
1690	movaps	%xmm2, (%edx)
1691	movaps	%xmm3, 16(%edx)
	/* Point both registers at the end of the data; %ecx is the number of
	   bytes still to copy and indexes the tail table.  */
1692	lea	32(%edx, %ecx), %edx
1693	lea	32(%eax, %ecx), %eax
1694	POP (%edi)
1695	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1696
	/* POP also emitted CFI_POP for %edi; the code placed after this point
	   pops %edi itself, so declare the register as saved again.  */
1697	CFI_PUSH (%edi)
1698
1699	.p2align 4
/* Shift-by-13 forward copy without prefetch, entered from L(shl_13) when
   the count is below the cache threshold.  %eax is rebased by -13 so the
   movdqa loads are at 16-byte multiples from it, and %edi becomes the
   running byte offset.  The loop is unrolled twice, 32 bytes per half;
   the carry block alternates between %xmm1 and %xmm4.  */
1700L(sh_13_no_prefetch):
1701	lea	-32(%ecx), %ecx
1702	lea	-13(%eax), %eax
1703	xor	%edi, %edi
1704
1705	.p2align 4
1706L(sh_13_no_prefetch_loop):
1707	movdqa	16(%eax, %edi), %xmm2
	/* The borrow from this sub is tested by the jb below; the SSE moves,
	   palignr and lea in between do not modify the flags.  */
1708	sub	$32, %ecx
1709	movdqa	32(%eax, %edi), %xmm3
1710	movdqa	%xmm3, %xmm4
1711	palignr	$13, %xmm2, %xmm3
1712	palignr	$13, %xmm1, %xmm2
1713	lea	32(%edi), %edi
1714	movdqa	%xmm2, -32(%edx, %edi)
1715	movdqa	%xmm3, -16(%edx, %edi)
1716	jb	L(sh_13_end_no_prefetch_loop)
1717
	/* Second half: same step with %xmm4 as carry, refilling %xmm1.  */
1718	movdqa	16(%eax, %edi), %xmm2
1719	sub	$32, %ecx
1720	movdqa	32(%eax, %edi), %xmm3
1721	movdqa	%xmm3, %xmm1
1722	palignr	$13, %xmm2, %xmm3
1723	palignr	$13, %xmm4, %xmm2
1724	lea	32(%edi), %edi
1725	movdqa	%xmm2, -32(%edx, %edi)
1726	movdqa	%xmm3, -16(%edx, %edi)
1727	jae	L(sh_13_no_prefetch_loop)
1728
	/* Undo the count bias, fold the leftover count into the offset and
	   move both pointers past the remaining data (the +13 undoes the
	   rebasing of %eax) before dispatching the tail.  */
1729L(sh_13_end_no_prefetch_loop):
1730	lea	32(%ecx), %ecx
1731	add	%ecx, %edi
1732	add	%edi, %edx
1733	lea	13(%edi, %eax), %eax
1734	POP	(%edi)
1735	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1736
	/* POP also emitted CFI_POP for %edi; the code placed after this point
	   pops %edi itself, so declare the register as saved again.  */
1737	CFI_PUSH (%edi)
1738
1739	.p2align 4
/* Forward copy for a source that is 14 bytes past a 16-byte boundary: all
   source reads are movaps at (16*k - 14)(%eax), and palignr $14 merges
   each pair of neighbouring blocks into one destination-aligned block.
   %eax = source, %edx = destination, %ecx = byte count; %xmm1 carries the
   previously loaded block.  Counts below the half-data-cache size branch
   to L(sh_14_no_prefetch); otherwise the 64-byte prefetching loop runs.  */
1740L(shl_14):
1741# ifndef USE_AS_MEMMOVE
1742	movaps	-14(%eax), %xmm1
1743# else
	/* memmove build: %edi is still saved on the stack, hence DEST+4.
	   NOTE(review): %xmm0 presumably holds the head of the source saved
	   before dispatch -- confirm at the call site.  */
1744	movl	DEST+4(%esp), %edi
1745	movaps	-14(%eax), %xmm1
1746	movdqu	%xmm0, (%edi)
1747# endif
1748# ifdef DATA_CACHE_SIZE_HALF
1749	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1750# else
1751#  ifdef PIC
1752	SETUP_PIC_REG(bx)
1753	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1754	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1755#  else
1756	cmp	__x86_data_cache_size_half, %ecx
1757#  endif
1758# endif
1759	jb L(sh_14_no_prefetch)
1760
	/* Bias the count so that "sub $64 / ja" leaves the loop once no more
	   than 64 bytes are left.  */
1761	lea	-64(%ecx), %ecx
1762
	/* 64 bytes per iteration.  %xmm7 keeps the last block unshifted so it
	   can become the carry (%xmm1) of the next iteration.  */
1763	.p2align 4
1764L(Shl14LoopStart):
1765	prefetcht0 0x1c0(%eax)
1766	prefetcht0 0x1c0(%edx)
1767	movaps	2(%eax), %xmm2
1768	movaps	18(%eax), %xmm3
1769	movaps	34(%eax), %xmm4
1770	movaps	50(%eax), %xmm5
1771	movaps	%xmm5, %xmm7
1772	palignr	$14, %xmm4, %xmm5
1773	palignr	$14, %xmm3, %xmm4
1774	movaps	%xmm5, 48(%edx)
1775	palignr	$14, %xmm2, %xmm3
1776	lea	64(%eax), %eax
1777	palignr	$14, %xmm1, %xmm2
1778	movaps	%xmm4, 32(%edx)
1779	movaps	%xmm3, 16(%edx)
1780	movaps	%xmm7, %xmm1
1781	movaps	%xmm2, (%edx)
1782	lea	64(%edx), %edx
1783	sub	$64, %ecx
1784	ja	L(Shl14LoopStart)
1785
	/* After +32: <= 0 means at most 32 bytes remain, handled by
	   L(shl_end_0); otherwise one more 32-byte piece is copied here.  */
1786L(Shl14LoopLeave):
1787	add	$32, %ecx
1788	jle	L(shl_end_0)
1789
1790	movaps	2(%eax), %xmm2
1791	movaps	18(%eax), %xmm3
1792	palignr	$14, %xmm2, %xmm3
1793	palignr	$14, %xmm1, %xmm2
1794
1795	movaps	%xmm2, (%edx)
1796	movaps	%xmm3, 16(%edx)
	/* Point both registers at the end of the data; %ecx is the number of
	   bytes still to copy and indexes the tail table.  */
1797	lea	32(%edx, %ecx), %edx
1798	lea	32(%eax, %ecx), %eax
1799	POP (%edi)
1800	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1801
	/* POP also emitted CFI_POP for %edi; the code placed after this point
	   pops %edi itself, so declare the register as saved again.  */
1802	CFI_PUSH (%edi)
1803
1804	.p2align 4
/* Shift-by-14 forward copy without prefetch, entered from L(shl_14) when
   the count is below the cache threshold.  %eax is rebased by -14 so the
   movdqa loads are at 16-byte multiples from it, and %edi becomes the
   running byte offset.  The loop is unrolled twice, 32 bytes per half;
   the carry block alternates between %xmm1 and %xmm4.  */
1805L(sh_14_no_prefetch):
1806	lea	-32(%ecx), %ecx
1807	lea	-14(%eax), %eax
1808	xor	%edi, %edi
1809
1810	.p2align 4
1811L(sh_14_no_prefetch_loop):
1812	movdqa	16(%eax, %edi), %xmm2
	/* The borrow from this sub is tested by the jb below; the SSE moves,
	   palignr and lea in between do not modify the flags.  */
1813	sub	$32, %ecx
1814	movdqa	32(%eax, %edi), %xmm3
1815	movdqa	%xmm3, %xmm4
1816	palignr	$14, %xmm2, %xmm3
1817	palignr	$14, %xmm1, %xmm2
1818	lea	32(%edi), %edi
1819	movdqa	%xmm2, -32(%edx, %edi)
1820	movdqa	%xmm3, -16(%edx, %edi)
1821	jb	L(sh_14_end_no_prefetch_loop)
1822
	/* Second half: same step with %xmm4 as carry, refilling %xmm1.  */
1823	movdqa	16(%eax, %edi), %xmm2
1824	sub	$32, %ecx
1825	movdqa	32(%eax, %edi), %xmm3
1826	movdqa	%xmm3, %xmm1
1827	palignr	$14, %xmm2, %xmm3
1828	palignr	$14, %xmm4, %xmm2
1829	lea	32(%edi), %edi
1830	movdqa	%xmm2, -32(%edx, %edi)
1831	movdqa	%xmm3, -16(%edx, %edi)
1832	jae	L(sh_14_no_prefetch_loop)
1833
	/* Undo the count bias, fold the leftover count into the offset and
	   move both pointers past the remaining data (the +14 undoes the
	   rebasing of %eax) before dispatching the tail.  */
1834L(sh_14_end_no_prefetch_loop):
1835	lea	32(%ecx), %ecx
1836	add	%ecx, %edi
1837	add	%edi, %edx
1838	lea	14(%edi, %eax), %eax
1839	POP	(%edi)
1840	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1841
	/* POP also emitted CFI_POP for %edi; the code placed after this point
	   pops %edi itself, so declare the register as saved again.  */
1842	CFI_PUSH (%edi)
1843
1844	.p2align 4
/* Forward copy for a source that is 15 bytes past a 16-byte boundary: all
   source reads are movaps at (16*k - 15)(%eax), and palignr $15 merges
   each pair of neighbouring blocks into one destination-aligned block.
   %eax = source, %edx = destination, %ecx = byte count; %xmm1 carries the
   previously loaded block.  Counts below the half-data-cache size branch
   to L(sh_15_no_prefetch); otherwise the 64-byte prefetching loop runs.  */
1845L(shl_15):
1846# ifndef USE_AS_MEMMOVE
1847	movaps	-15(%eax), %xmm1
1848# else
	/* memmove build: %edi is still saved on the stack, hence DEST+4.
	   NOTE(review): %xmm0 presumably holds the head of the source saved
	   before dispatch -- confirm at the call site.  */
1849	movl	DEST+4(%esp), %edi
1850	movaps	-15(%eax), %xmm1
1851	movdqu	%xmm0, (%edi)
1852# endif
1853# ifdef DATA_CACHE_SIZE_HALF
1854	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1855# else
1856#  ifdef PIC
1857	SETUP_PIC_REG(bx)
1858	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1859	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1860#  else
1861	cmp	__x86_data_cache_size_half, %ecx
1862#  endif
1863# endif
1864	jb L(sh_15_no_prefetch)
1865
	/* Bias the count so that "sub $64 / ja" leaves the loop once no more
	   than 64 bytes are left.  */
1866	lea	-64(%ecx), %ecx
1867
	/* 64 bytes per iteration.  %xmm7 keeps the last block unshifted so it
	   can become the carry (%xmm1) of the next iteration.  */
1868	.p2align 4
1869L(Shl15LoopStart):
1870	prefetcht0 0x1c0(%eax)
1871	prefetcht0 0x1c0(%edx)
1872	movaps	1(%eax), %xmm2
1873	movaps	17(%eax), %xmm3
1874	movaps	33(%eax), %xmm4
1875	movaps	49(%eax), %xmm5
1876	movaps	%xmm5, %xmm7
1877	palignr	$15, %xmm4, %xmm5
1878	palignr	$15, %xmm3, %xmm4
1879	movaps	%xmm5, 48(%edx)
1880	palignr	$15, %xmm2, %xmm3
1881	lea	64(%eax), %eax
1882	palignr	$15, %xmm1, %xmm2
1883	movaps	%xmm4, 32(%edx)
1884	movaps	%xmm3, 16(%edx)
1885	movaps	%xmm7, %xmm1
1886	movaps	%xmm2, (%edx)
1887	lea	64(%edx), %edx
1888	sub	$64, %ecx
1889	ja	L(Shl15LoopStart)
1890
	/* After +32: <= 0 means at most 32 bytes remain, handled by
	   L(shl_end_0); otherwise one more 32-byte piece is copied here.  */
1891L(Shl15LoopLeave):
1892	add	$32, %ecx
1893	jle	L(shl_end_0)
1894
1895	movaps	1(%eax), %xmm2
1896	movaps	17(%eax), %xmm3
1897	palignr	$15, %xmm2, %xmm3
1898	palignr	$15, %xmm1, %xmm2
1899
1900	movaps	%xmm2, (%edx)
1901	movaps	%xmm3, 16(%edx)
	/* Point both registers at the end of the data; %ecx is the number of
	   bytes still to copy and indexes the tail table.  */
1902	lea	32(%edx, %ecx), %edx
1903	lea	32(%eax, %ecx), %eax
1904	POP (%edi)
1905	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1906
	/* POP also emitted CFI_POP for %edi; the code placed after this point
	   pops %edi itself, so declare the register as saved again.  */
1907	CFI_PUSH (%edi)
1908
1909	.p2align 4
/* Shift-by-15 forward copy without prefetch, entered from L(shl_15) when
   the count is below the cache threshold.  %eax is rebased by -15 so the
   movdqa loads are at 16-byte multiples from it, and %edi becomes the
   running byte offset.  The loop is unrolled twice, 32 bytes per half;
   the carry block alternates between %xmm1 and %xmm4.  */
1910L(sh_15_no_prefetch):
1911	lea	-32(%ecx), %ecx
1912	lea	-15(%eax), %eax
1913	xor	%edi, %edi
1914
1915	.p2align 4
1916L(sh_15_no_prefetch_loop):
1917	movdqa	16(%eax, %edi), %xmm2
	/* The borrow from this sub is tested by the jb below; the SSE moves,
	   palignr and lea in between do not modify the flags.  */
1918	sub	$32, %ecx
1919	movdqa	32(%eax, %edi), %xmm3
1920	movdqa	%xmm3, %xmm4
1921	palignr	$15, %xmm2, %xmm3
1922	palignr	$15, %xmm1, %xmm2
1923	lea	32(%edi), %edi
1924	movdqa	%xmm2, -32(%edx, %edi)
1925	movdqa	%xmm3, -16(%edx, %edi)
1926	jb	L(sh_15_end_no_prefetch_loop)
1927
	/* Second half: same step with %xmm4 as carry, refilling %xmm1.  */
1928	movdqa	16(%eax, %edi), %xmm2
1929	sub	$32, %ecx
1930	movdqa	32(%eax, %edi), %xmm3
1931	movdqa	%xmm3, %xmm1
1932	palignr	$15, %xmm2, %xmm3
1933	palignr	$15, %xmm4, %xmm2
1934	lea	32(%edi), %edi
1935	movdqa	%xmm2, -32(%edx, %edi)
1936	movdqa	%xmm3, -16(%edx, %edi)
1937	jae	L(sh_15_no_prefetch_loop)
1938
	/* Undo the count bias, fold the leftover count into the offset and
	   move both pointers past the remaining data (the +15 undoes the
	   rebasing of %eax) before dispatching the tail.  */
1939L(sh_15_end_no_prefetch_loop):
1940	lea	32(%ecx), %ecx
1941	add	%ecx, %edi
1942	add	%edi, %edx
1943	lea	15(%edi, %eax), %eax
1944	POP	(%edi)
1945	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1946
	/* POP also emitted CFI_POP for %edi; L(shl_end_0), which follows, pops
	   %edi itself, so declare the register as saved again.  */
1947	CFI_PUSH (%edi)
1948
1949	.p2align 4
/* Common exit of the ShlNLoopLeave paths when their "add $32 / jle" test
   fails.  Adds 32 to %ecx a second time, advances %edx and %eax by the
   result so they point past the remaining data, restores %edi and
   dispatches the tail copy through L(table_48bytes_fwd) with %ecx as
   index.  */
1950L(shl_end_0):
1951	lea	32(%ecx), %ecx
1952	lea	(%edx, %ecx), %edx
1953	lea	(%eax, %ecx), %eax
1954	POP	(%edi)
1955	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1956
1957	.p2align 4
/* Forward tail copy for 44/36/28/20/12/4 bytes.  %eax and %edx point one
   past the end of the source and destination bytes still to copy
   (presumably reached through L(table_48bytes_fwd), defined outside this
   view).  Each label moves the 8 bytes that start N bytes before the end
   and falls through to the next shorter length; the last 4 bytes go
   through %ecx.  Returns the destination end for mempcpy, otherwise the
   DEST argument.  */
1958L(fwd_write_44bytes):
1959	movq	-44(%eax), %xmm0
1960	movq	%xmm0, -44(%edx)
1961L(fwd_write_36bytes):
1962	movq	-36(%eax), %xmm0
1963	movq	%xmm0, -36(%edx)
1964L(fwd_write_28bytes):
1965	movq	-28(%eax), %xmm0
1966	movq	%xmm0, -28(%edx)
1967L(fwd_write_20bytes):
1968	movq	-20(%eax), %xmm0
1969	movq	%xmm0, -20(%edx)
1970L(fwd_write_12bytes):
1971	movq	-12(%eax), %xmm0
1972	movq	%xmm0, -12(%edx)
1973L(fwd_write_4bytes):
1974	movl	-4(%eax), %ecx
1975	movl	%ecx, -4(%edx)
1976# ifdef USE_AS_MEMPCPY
1977	movl	%edx, %eax
1978# else
1979	movl	DEST(%esp), %eax
1980# endif
1981	RETURN
1982
1983	.p2align 4
/* Forward tail copy for 40/32/24/16/8/0 bytes.  %eax and %edx point one
   past the end of the source and destination bytes still to copy.  Each
   label moves the 8 bytes that start N bytes before the end and falls
   through to the next shorter length; the 0-byte entry only produces the
   return value (destination end for mempcpy, otherwise DEST).  */
1984L(fwd_write_40bytes):
1985	movq	-40(%eax), %xmm0
1986	movq	%xmm0, -40(%edx)
1987L(fwd_write_32bytes):
1988	movq	-32(%eax), %xmm0
1989	movq	%xmm0, -32(%edx)
1990L(fwd_write_24bytes):
1991	movq	-24(%eax), %xmm0
1992	movq	%xmm0, -24(%edx)
1993L(fwd_write_16bytes):
1994	movq	-16(%eax), %xmm0
1995	movq	%xmm0, -16(%edx)
1996L(fwd_write_8bytes):
1997	movq	-8(%eax), %xmm0
1998	movq	%xmm0, -8(%edx)
1999L(fwd_write_0bytes):
2000# ifdef USE_AS_MEMPCPY
2001	movl	%edx, %eax
2002# else
2003	movl	DEST(%esp), %eax
2004# endif
2005	RETURN
2006
2007	.p2align 4
/* Forward tail copy for exactly 5 bytes, done as two overlapping 4-byte
   moves.  %eax and %edx point one past the end of source and destination.
   Both loads are issued before either store, and %eax is reused as a data
   register once the last source read has been made.  Returns the
   destination end for mempcpy, otherwise DEST.  */
2008L(fwd_write_5bytes):
2009	movl	-5(%eax), %ecx
2010	movl	-4(%eax), %eax
2011	movl	%ecx, -5(%edx)
2012	movl	%eax, -4(%edx)
2013# ifdef USE_AS_MEMPCPY
2014	movl	%edx, %eax
2015# else
2016	movl	DEST(%esp), %eax
2017# endif
2018	RETURN
2019
2020	.p2align 4
/* Forward tail copy for 45/37/29/21/13 bytes.  %eax and %edx point one
   past the end of the source and destination bytes still to copy.  Each
   label moves the 8 bytes that start N bytes before the end and falls
   through; the final 5 bytes are a 4-byte move at -5 plus a byte move at
   -1.  Returns the destination end for mempcpy, otherwise DEST.  */
2021L(fwd_write_45bytes):
2022	movq	-45(%eax), %xmm0
2023	movq	%xmm0, -45(%edx)
2024L(fwd_write_37bytes):
2025	movq	-37(%eax), %xmm0
2026	movq	%xmm0, -37(%edx)
2027L(fwd_write_29bytes):
2028	movq	-29(%eax), %xmm0
2029	movq	%xmm0, -29(%edx)
2030L(fwd_write_21bytes):
2031	movq	-21(%eax), %xmm0
2032	movq	%xmm0, -21(%edx)
2033L(fwd_write_13bytes):
2034	movq	-13(%eax), %xmm0
2035	movq	%xmm0, -13(%edx)
2036	movl	-5(%eax), %ecx
2037	movl	%ecx, -5(%edx)
2038	movzbl	-1(%eax), %ecx
2039	movb	%cl, -1(%edx)
2040# ifdef USE_AS_MEMPCPY
2041	movl	%edx, %eax
2042# else
2043	movl	DEST(%esp), %eax
2044# endif
2045	RETURN
2046
2047	.p2align 4
/* Forward tail copy for 41/33/25/17/9/1 bytes.  %eax and %edx point one
   past the end of the source and destination bytes still to copy.  Each
   label moves the 8 bytes that start N bytes before the end and falls
   through; the last byte goes through %cl.  Returns the destination end
   for mempcpy, otherwise DEST.  */
2048L(fwd_write_41bytes):
2049	movq	-41(%eax), %xmm0
2050	movq	%xmm0, -41(%edx)
2051L(fwd_write_33bytes):
2052	movq	-33(%eax), %xmm0
2053	movq	%xmm0, -33(%edx)
2054L(fwd_write_25bytes):
2055	movq	-25(%eax), %xmm0
2056	movq	%xmm0, -25(%edx)
2057L(fwd_write_17bytes):
2058	movq	-17(%eax), %xmm0
2059	movq	%xmm0, -17(%edx)
2060L(fwd_write_9bytes):
2061	movq	-9(%eax), %xmm0
2062	movq	%xmm0, -9(%edx)
2063L(fwd_write_1bytes):
2064	movzbl	-1(%eax), %ecx
2065	movb	%cl, -1(%edx)
2066# ifdef USE_AS_MEMPCPY
2067	movl	%edx, %eax
2068# else
2069	movl	DEST(%esp), %eax
2070# endif
2071	RETURN
2072
2073	.p2align 4
/* Forward tail copy for 46/38/30/22/14/6 bytes.  %eax and %edx point one
   past the end of the source and destination bytes still to copy.  Each
   label moves the 8 bytes that start N bytes before the end and falls
   through; the final 6 bytes are a 4-byte move at -6 plus a 2-byte move
   at -2.  Returns the destination end for mempcpy, otherwise DEST.  */
2074L(fwd_write_46bytes):
2075	movq	-46(%eax), %xmm0
2076	movq	%xmm0, -46(%edx)
2077L(fwd_write_38bytes):
2078	movq	-38(%eax), %xmm0
2079	movq	%xmm0, -38(%edx)
2080L(fwd_write_30bytes):
2081	movq	-30(%eax), %xmm0
2082	movq	%xmm0, -30(%edx)
2083L(fwd_write_22bytes):
2084	movq	-22(%eax), %xmm0
2085	movq	%xmm0, -22(%edx)
2086L(fwd_write_14bytes):
2087	movq	-14(%eax), %xmm0
2088	movq	%xmm0, -14(%edx)
2089L(fwd_write_6bytes):
2090	movl	-6(%eax), %ecx
2091	movl	%ecx, -6(%edx)
2092	movzwl	-2(%eax), %ecx
2093	movw	%cx, -2(%edx)
2094# ifdef USE_AS_MEMPCPY
2095	movl	%edx, %eax
2096# else
2097	movl	DEST(%esp), %eax
2098# endif
2099	RETURN
2100
2101	.p2align 4
/* Forward tail copy for 42/34/26/18/10/2 bytes.  %eax and %edx point one
   past the end of the source and destination bytes still to copy.  Each
   label moves the 8 bytes that start N bytes before the end and falls
   through; the last 2 bytes go through %cx.  Returns the destination end
   for mempcpy, otherwise DEST.  */
2102L(fwd_write_42bytes):
2103	movq	-42(%eax), %xmm0
2104	movq	%xmm0, -42(%edx)
2105L(fwd_write_34bytes):
2106	movq	-34(%eax), %xmm0
2107	movq	%xmm0, -34(%edx)
2108L(fwd_write_26bytes):
2109	movq	-26(%eax), %xmm0
2110	movq	%xmm0, -26(%edx)
2111L(fwd_write_18bytes):
2112	movq	-18(%eax), %xmm0
2113	movq	%xmm0, -18(%edx)
2114L(fwd_write_10bytes):
2115	movq	-10(%eax), %xmm0
2116	movq	%xmm0, -10(%edx)
2117L(fwd_write_2bytes):
2118	movzwl	-2(%eax), %ecx
2119	movw	%cx, -2(%edx)
2120# ifdef USE_AS_MEMPCPY
2121	movl	%edx, %eax
2122# else
2123	movl	DEST(%esp), %eax
2124# endif
2125	RETURN
2126
2127	.p2align 4
/* Forward tail copy for 47/39/31/23/15/7 bytes.  %eax and %edx point one
   past the end of the source and destination bytes still to copy.  Each
   label moves the 8 bytes that start N bytes before the end and falls
   through; the final 7 bytes are moved as 4 + 2 + 1.  %eax is reused as a
   data register for the last byte once no further source read is needed.
   Returns the destination end for mempcpy, otherwise DEST.  */
2128L(fwd_write_47bytes):
2129	movq	-47(%eax), %xmm0
2130	movq	%xmm0, -47(%edx)
2131L(fwd_write_39bytes):
2132	movq	-39(%eax), %xmm0
2133	movq	%xmm0, -39(%edx)
2134L(fwd_write_31bytes):
2135	movq	-31(%eax), %xmm0
2136	movq	%xmm0, -31(%edx)
2137L(fwd_write_23bytes):
2138	movq	-23(%eax), %xmm0
2139	movq	%xmm0, -23(%edx)
2140L(fwd_write_15bytes):
2141	movq	-15(%eax), %xmm0
2142	movq	%xmm0, -15(%edx)
2143L(fwd_write_7bytes):
2144	movl	-7(%eax), %ecx
2145	movl	%ecx, -7(%edx)
2146	movzwl	-3(%eax), %ecx
2147	movzbl	-1(%eax), %eax
2148	movw	%cx, -3(%edx)
2149	movb	%al, -1(%edx)
2150# ifdef USE_AS_MEMPCPY
2151	movl	%edx, %eax
2152# else
2153	movl	DEST(%esp), %eax
2154# endif
2155	RETURN
2156
2157	.p2align 4
/* Forward tail copy for 43/35/27/19/11/3 bytes.  %eax and %edx point one
   past the end of the source and destination bytes still to copy.  Each
   label moves the 8 bytes that start N bytes before the end and falls
   through; the final 3 bytes are moved as 2 + 1, with %eax reused as a
   data register for the last byte.  Returns the destination end for
   mempcpy, otherwise DEST.  */
2158L(fwd_write_43bytes):
2159	movq	-43(%eax), %xmm0
2160	movq	%xmm0, -43(%edx)
2161L(fwd_write_35bytes):
2162	movq	-35(%eax), %xmm0
2163	movq	%xmm0, -35(%edx)
2164L(fwd_write_27bytes):
2165	movq	-27(%eax), %xmm0
2166	movq	%xmm0, -27(%edx)
2167L(fwd_write_19bytes):
2168	movq	-19(%eax), %xmm0
2169	movq	%xmm0, -19(%edx)
2170L(fwd_write_11bytes):
2171	movq	-11(%eax), %xmm0
2172	movq	%xmm0, -11(%edx)
2173L(fwd_write_3bytes):
2174	movzwl	-3(%eax), %ecx
2175	movzbl	-1(%eax), %eax
2176	movw	%cx, -3(%edx)
2177	movb	%al, -1(%edx)
2178# ifdef USE_AS_MEMPCPY
2179	movl	%edx, %eax
2180# else
2181	movl	DEST(%esp), %eax
2182# endif
2183	RETURN
2184
2185	.p2align 4
/* Aligned forward tail copy for 40/24/8/0 bytes.  %eax and %edx point one
   past the end of source and destination.  The 16-byte pieces use movdqa,
   so -40/-24 from both pointers must be 16-byte aligned (presumably
   guaranteed by the aligned-copy path that dispatches here; its table is
   outside this view).  Returns the destination end for mempcpy,
   otherwise DEST.  */
2186L(fwd_write_40bytes_align):
2187	movdqa	-40(%eax), %xmm0
2188	movdqa	%xmm0, -40(%edx)
2189L(fwd_write_24bytes_align):
2190	movdqa	-24(%eax), %xmm0
2191	movdqa	%xmm0, -24(%edx)
2192L(fwd_write_8bytes_align):
2193	movq	-8(%eax), %xmm0
2194	movq	%xmm0, -8(%edx)
2195L(fwd_write_0bytes_align):
2196# ifdef USE_AS_MEMPCPY
2197	movl	%edx, %eax
2198# else
2199	movl	DEST(%esp), %eax
2200# endif
2201	RETURN
2202
2203	.p2align 4
/* Aligned forward tail copy for 32/16 bytes using movdqa; %eax and %edx
   point one past the end of source and destination and must themselves
   be 16-byte aligned for these accesses.  Returns the destination end
   for mempcpy, otherwise DEST.  */
2204L(fwd_write_32bytes_align):
2205	movdqa	-32(%eax), %xmm0
2206	movdqa	%xmm0, -32(%edx)
2207L(fwd_write_16bytes_align):
2208	movdqa	-16(%eax), %xmm0
2209	movdqa	%xmm0, -16(%edx)
2210# ifdef USE_AS_MEMPCPY
2211	movl	%edx, %eax
2212# else
2213	movl	DEST(%esp), %eax
2214# endif
2215	RETURN
2216
2217	.p2align 4
/* Aligned-path forward tail copy for exactly 5 bytes, done as two
   overlapping 4-byte moves.  %eax and %edx point one past the end of
   source and destination; both loads precede both stores and %eax is
   reused as a data register.  Returns the destination end for mempcpy,
   otherwise DEST.  */
2218L(fwd_write_5bytes_align):
2219	movl	-5(%eax), %ecx
2220	movl	-4(%eax), %eax
2221	movl	%ecx, -5(%edx)
2222	movl	%eax, -4(%edx)
2223# ifdef USE_AS_MEMPCPY
2224	movl	%edx, %eax
2225# else
2226	movl	DEST(%esp), %eax
2227# endif
2228	RETURN
2229
2230	.p2align 4
/* Aligned forward tail copy for 45/29/13 bytes.  %eax and %edx point one
   past the end of source and destination.  The 16-byte pieces use movdqa
   (so -45/-29 from both pointers must be 16-byte aligned); the final 13
   bytes are moved as 8 + 4 + 1.  Returns the destination end for mempcpy,
   otherwise DEST.  */
2231L(fwd_write_45bytes_align):
2232	movdqa	-45(%eax), %xmm0
2233	movdqa	%xmm0, -45(%edx)
2234L(fwd_write_29bytes_align):
2235	movdqa	-29(%eax), %xmm0
2236	movdqa	%xmm0, -29(%edx)
2237L(fwd_write_13bytes_align):
2238	movq	-13(%eax), %xmm0
2239	movq	%xmm0, -13(%edx)
2240	movl	-5(%eax), %ecx
2241	movl	%ecx, -5(%edx)
2242	movzbl	-1(%eax), %ecx
2243	movb	%cl, -1(%edx)
2244# ifdef USE_AS_MEMPCPY
2245	movl	%edx, %eax
2246# else
2247	movl	DEST(%esp), %eax
2248# endif
2249	RETURN
2250
2251	.p2align 4
/* Aligned forward tail copy for 37/21 bytes.  %eax and %edx point one
   past the end of source and destination.  Two movdqa pieces (at -37 and
   -21, which must be 16-byte aligned) are followed by 4 + 1 bytes.
   Returns the destination end for mempcpy, otherwise DEST.  */
2252L(fwd_write_37bytes_align):
2253	movdqa	-37(%eax), %xmm0
2254	movdqa	%xmm0, -37(%edx)
2255L(fwd_write_21bytes_align):
2256	movdqa	-21(%eax), %xmm0
2257	movdqa	%xmm0, -21(%edx)
2258	movl	-5(%eax), %ecx
2259	movl	%ecx, -5(%edx)
2260	movzbl	-1(%eax), %ecx
2261	movb	%cl, -1(%edx)
2262# ifdef USE_AS_MEMPCPY
2263	movl	%edx, %eax
2264# else
2265	movl	DEST(%esp), %eax
2266# endif
2267	RETURN
2268
2269	.p2align 4
/* Aligned forward tail copy for 41/25/9/1 bytes.  %eax and %edx point one
   past the end of source and destination.  The 16-byte pieces use movdqa
   (so -41/-25 from both pointers must be 16-byte aligned), then 8 bytes
   and a final byte.  Returns the destination end for mempcpy, otherwise
   DEST.  */
2270L(fwd_write_41bytes_align):
2271	movdqa	-41(%eax), %xmm0
2272	movdqa	%xmm0, -41(%edx)
2273L(fwd_write_25bytes_align):
2274	movdqa	-25(%eax), %xmm0
2275	movdqa	%xmm0, -25(%edx)
2276L(fwd_write_9bytes_align):
2277	movq	-9(%eax), %xmm0
2278	movq	%xmm0, -9(%edx)
2279L(fwd_write_1bytes_align):
2280	movzbl	-1(%eax), %ecx
2281	movb	%cl, -1(%edx)
2282# ifdef USE_AS_MEMPCPY
2283	movl	%edx, %eax
2284# else
2285	movl	DEST(%esp), %eax
2286# endif
2287	RETURN
2288
2289	.p2align 4
/* Aligned forward tail copy for 33/17 bytes.  %eax and %edx point one
   past the end of source and destination.  Two movdqa pieces (at -33 and
   -17, which must be 16-byte aligned) are followed by the last byte.
   Returns the destination end for mempcpy, otherwise DEST.  */
2290L(fwd_write_33bytes_align):
2291	movdqa	-33(%eax), %xmm0
2292	movdqa	%xmm0, -33(%edx)
2293L(fwd_write_17bytes_align):
2294	movdqa	-17(%eax), %xmm0
2295	movdqa	%xmm0, -17(%edx)
2296	movzbl	-1(%eax), %ecx
2297	movb	%cl, -1(%edx)
2298# ifdef USE_AS_MEMPCPY
2299	movl	%edx, %eax
2300# else
2301	movl	DEST(%esp), %eax
2302# endif
2303	RETURN
2304
2305	.p2align 4
/* Aligned forward tail copy for 46/30/14/6 bytes.  %eax and %edx point
   one past the end of source and destination.  The 16-byte pieces use
   movdqa (so -46/-30 from both pointers must be 16-byte aligned), then
   8 bytes, then 4 + 2.  Returns the destination end for mempcpy,
   otherwise DEST.  */
2306L(fwd_write_46bytes_align):
2307	movdqa	-46(%eax), %xmm0
2308	movdqa	%xmm0, -46(%edx)
2309L(fwd_write_30bytes_align):
2310	movdqa	-30(%eax), %xmm0
2311	movdqa	%xmm0, -30(%edx)
2312L(fwd_write_14bytes_align):
2313	movq	-14(%eax), %xmm0
2314	movq	%xmm0, -14(%edx)
2315L(fwd_write_6bytes_align):
2316	movl	-6(%eax), %ecx
2317	movl	%ecx, -6(%edx)
2318	movzwl	-2(%eax), %ecx
2319	movw	%cx, -2(%edx)
2320# ifdef USE_AS_MEMPCPY
2321	movl	%edx, %eax
2322# else
2323	movl	DEST(%esp), %eax
2324# endif
2325	RETURN
2326
2327	.p2align 4
/* Aligned forward tail copy for 38/22 bytes.  %eax and %edx point one
   past the end of source and destination.  Two movdqa pieces (at -38 and
   -22, which must be 16-byte aligned) are followed by 4 + 2 bytes.
   Returns the destination end for mempcpy, otherwise DEST.  */
2328L(fwd_write_38bytes_align):
2329	movdqa	-38(%eax), %xmm0
2330	movdqa	%xmm0, -38(%edx)
2331L(fwd_write_22bytes_align):
2332	movdqa	-22(%eax), %xmm0
2333	movdqa	%xmm0, -22(%edx)
2334	movl	-6(%eax), %ecx
2335	movl	%ecx, -6(%edx)
2336	movzwl	-2(%eax), %ecx
2337	movw	%cx, -2(%edx)
2338# ifdef USE_AS_MEMPCPY
2339	movl	%edx, %eax
2340# else
2341	movl	DEST(%esp), %eax
2342# endif
2343	RETURN
2344
2345	.p2align 4
/* Aligned forward tail copy for 42/26/10/2 bytes.  %eax and %edx point
   one past the end of source and destination.  The 16-byte pieces use
   movdqa (so -42/-26 from both pointers must be 16-byte aligned), then
   8 bytes and a final 2 bytes.  Returns the destination end for mempcpy,
   otherwise DEST.  */
2346L(fwd_write_42bytes_align):
2347	movdqa	-42(%eax), %xmm0
2348	movdqa	%xmm0, -42(%edx)
2349L(fwd_write_26bytes_align):
2350	movdqa	-26(%eax), %xmm0
2351	movdqa	%xmm0, -26(%edx)
2352L(fwd_write_10bytes_align):
2353	movq	-10(%eax), %xmm0
2354	movq	%xmm0, -10(%edx)
2355L(fwd_write_2bytes_align):
2356	movzwl	-2(%eax), %ecx
2357	movw	%cx, -2(%edx)
2358# ifdef USE_AS_MEMPCPY
2359	movl	%edx, %eax
2360# else
2361	movl	DEST(%esp), %eax
2362# endif
2363	RETURN
2364
2365	.p2align 4
/* Aligned forward tail copy for 34/18 bytes.  %eax and %edx point one
   past the end of source and destination.  Two movdqa pieces (at -34 and
   -18, which must be 16-byte aligned) are followed by the last 2 bytes.
   Returns the destination end for mempcpy, otherwise DEST.  */
2366L(fwd_write_34bytes_align):
2367	movdqa	-34(%eax), %xmm0
2368	movdqa	%xmm0, -34(%edx)
2369L(fwd_write_18bytes_align):
2370	movdqa	-18(%eax), %xmm0
2371	movdqa	%xmm0, -18(%edx)
2372	movzwl	-2(%eax), %ecx
2373	movw	%cx, -2(%edx)
2374# ifdef USE_AS_MEMPCPY
2375	movl	%edx, %eax
2376# else
2377	movl	DEST(%esp), %eax
2378# endif
2379	RETURN
2380
2381	.p2align 4
/* Aligned forward tail copy for 47/31/15/7 bytes.  %eax and %edx point
   one past the end of source and destination.  The 16-byte pieces use
   movdqa (so -47/-31 from both pointers must be 16-byte aligned), then
   8 bytes, then 4 + 2 + 1 with %eax reused as a data register for the
   last byte.  Returns the destination end for mempcpy, otherwise DEST.  */
2382L(fwd_write_47bytes_align):
2383	movdqa	-47(%eax), %xmm0
2384	movdqa	%xmm0, -47(%edx)
2385L(fwd_write_31bytes_align):
2386	movdqa	-31(%eax), %xmm0
2387	movdqa	%xmm0, -31(%edx)
2388L(fwd_write_15bytes_align):
2389	movq	-15(%eax), %xmm0
2390	movq	%xmm0, -15(%edx)
2391L(fwd_write_7bytes_align):
2392	movl	-7(%eax), %ecx
2393	movl	%ecx, -7(%edx)
2394	movzwl	-3(%eax), %ecx
2395	movzbl	-1(%eax), %eax
2396	movw	%cx, -3(%edx)
2397	movb	%al, -1(%edx)
2398# ifdef USE_AS_MEMPCPY
2399	movl	%edx, %eax
2400# else
2401	movl	DEST(%esp), %eax
2402# endif
2403	RETURN
2404
2405	.p2align 4
/* Aligned forward tail copy for 39/23 bytes.  %eax and %edx point one
   past the end of source and destination.  Two movdqa pieces (at -39 and
   -23, which must be 16-byte aligned) are followed by 4 + 2 + 1 bytes,
   with %eax reused as a data register for the last byte.  Returns the
   destination end for mempcpy, otherwise DEST.  */
2406L(fwd_write_39bytes_align):
2407	movdqa	-39(%eax), %xmm0
2408	movdqa	%xmm0, -39(%edx)
2409L(fwd_write_23bytes_align):
2410	movdqa	-23(%eax), %xmm0
2411	movdqa	%xmm0, -23(%edx)
2412	movl	-7(%eax), %ecx
2413	movl	%ecx, -7(%edx)
2414	movzwl	-3(%eax), %ecx
2415	movzbl	-1(%eax), %eax
2416	movw	%cx, -3(%edx)
2417	movb	%al, -1(%edx)
2418# ifdef USE_AS_MEMPCPY
2419	movl	%edx, %eax
2420# else
2421	movl	DEST(%esp), %eax
2422# endif
2423	RETURN
2424
2425	.p2align 4
/* Aligned forward tail copy for 43/27/11/3 bytes.  %eax and %edx point
   one past the end of source and destination.  The 16-byte pieces use
   movdqa (so -43/-27 from both pointers must be 16-byte aligned), then
   8 bytes, then 2 + 1 with %eax reused as a data register for the last
   byte.  Returns the destination end for mempcpy, otherwise DEST.  */
2426L(fwd_write_43bytes_align):
2427	movdqa	-43(%eax), %xmm0
2428	movdqa	%xmm0, -43(%edx)
2429L(fwd_write_27bytes_align):
2430	movdqa	-27(%eax), %xmm0
2431	movdqa	%xmm0, -27(%edx)
2432L(fwd_write_11bytes_align):
2433	movq	-11(%eax), %xmm0
2434	movq	%xmm0, -11(%edx)
2435L(fwd_write_3bytes_align):
2436	movzwl	-3(%eax), %ecx
2437	movzbl	-1(%eax), %eax
2438	movw	%cx, -3(%edx)
2439	movb	%al, -1(%edx)
2440# ifdef USE_AS_MEMPCPY
2441	movl	%edx, %eax
2442# else
2443	movl	DEST(%esp), %eax
2444# endif
2445	RETURN
2446
2447	.p2align 4
/* Aligned forward tail copy for 35/19 bytes.  %eax and %edx point one
   past the end of source and destination.  Two movdqa pieces (at -35 and
   -19, which must be 16-byte aligned) are followed by 2 + 1 bytes, with
   %eax reused as a data register for the last byte.  Returns the
   destination end for mempcpy, otherwise DEST.  */
2448L(fwd_write_35bytes_align):
2449	movdqa	-35(%eax), %xmm0
2450	movdqa	%xmm0, -35(%edx)
2451L(fwd_write_19bytes_align):
2452	movdqa	-19(%eax), %xmm0
2453	movdqa	%xmm0, -19(%edx)
2454	movzwl	-3(%eax), %ecx
2455	movzbl	-1(%eax), %eax
2456	movw	%cx, -3(%edx)
2457	movb	%al, -1(%edx)
2458# ifdef USE_AS_MEMPCPY
2459	movl	%edx, %eax
2460# else
2461	movl	DEST(%esp), %eax
2462# endif
2463	RETURN
2464
2465	.p2align 4
/* Aligned forward tail copy for 44/28/12/4 bytes.  %eax and %edx point
   one past the end of source and destination.  The 16-byte pieces use
   movdqa (so -44/-28 from both pointers must be 16-byte aligned), then
   8 bytes and a final 4 bytes.  Returns the destination end for mempcpy,
   otherwise DEST.  */
2466L(fwd_write_44bytes_align):
2467	movdqa	-44(%eax), %xmm0
2468	movdqa	%xmm0, -44(%edx)
2469L(fwd_write_28bytes_align):
2470	movdqa	-28(%eax), %xmm0
2471	movdqa	%xmm0, -28(%edx)
2472L(fwd_write_12bytes_align):
2473	movq	-12(%eax), %xmm0
2474	movq	%xmm0, -12(%edx)
2475L(fwd_write_4bytes_align):
2476	movl	-4(%eax), %ecx
2477	movl	%ecx, -4(%edx)
2478# ifdef USE_AS_MEMPCPY
2479	movl	%edx, %eax
2480# else
2481	movl	DEST(%esp), %eax
2482# endif
2483	RETURN
2484
2485	.p2align 4
/* Aligned forward tail copy for 36/20 bytes.  %eax and %edx point one
   past the end of source and destination.  Two movdqa pieces (at -36 and
   -20, which must be 16-byte aligned) are followed by the last 4 bytes.
   Returns the destination end for mempcpy, otherwise DEST.  */
2486L(fwd_write_36bytes_align):
2487	movdqa	-36(%eax), %xmm0
2488	movdqa	%xmm0, -36(%edx)
2489L(fwd_write_20bytes_align):
2490	movdqa	-20(%eax), %xmm0
2491	movdqa	%xmm0, -20(%edx)
2492	movl	-4(%eax), %ecx
2493	movl	%ecx, -4(%edx)
2494# ifdef USE_AS_MEMPCPY
2495	movl	%edx, %eax
2496# else
2497	movl	DEST(%esp), %eax
2498# endif
	/* RETURN rather than RETURN_END: more code follows.  In PIC builds
	   RETURN_END leaves the CFI state as "nothing saved", so the
	   CFI_PUSH (%edi) below would describe a single saved register while
	   L(large_page) runs with both %ebx and %edi on the stack (it reads
	   DEST+4(%esp) and pops %edi).  RETURN emits the same pop/ret and
	   then re-declares %ebx, matching every other return site here.  */
2499	RETURN
2500
	/* L(large_page) is entered with %edi still saved on the stack.  */
2501	CFI_PUSH (%edi)
2502
2503	.p2align 4
/* Entry of the non-temporal copy path (the name suggests very large
   copies; the condition that selects it is outside this view).
   %eax = source, %edx = destination, %ecx = byte count.  Copies the first
   16 bytes with an unaligned load and a movntdq store (which requires
   %edx to be 16-byte aligned), then biases the count by 0x90: 16 for the
   bytes just copied plus 0x80 so the main loop can test the borrow of its
   "sub $0x80".  %edi is restored before the loop.  */
2504L(large_page):
2505	movdqu	(%eax), %xmm1
2506# ifdef USE_AS_MEMMOVE
	/* memmove build: %edi is still saved on the stack, hence DEST+4.
	   NOTE(review): %xmm0 presumably holds the head of the source saved
	   before dispatch -- confirm at the call site.  */
2507	movl	DEST+4(%esp), %edi
2508	movdqu	%xmm0, (%edi)
2509# endif
2510	lea	16(%eax), %eax
2511	movntdq	%xmm1, (%edx)
2512	lea	16(%edx), %edx
2513	lea	-0x90(%ecx), %ecx
2514	POP (%edi)
2515
2516	.p2align 4
/* Main loop of the non-temporal path: 128 bytes per iteration, unaligned
   loads into %xmm0-%xmm7 followed by movntdq stores to the 16-byte
   aligned destination.  After the loop, one optional 64-byte and one
   optional 32-byte step follow, then sfence orders the non-temporal
   stores and the remaining bytes are dispatched through
   L(table_48bytes_fwd) with %ecx as index.  */
2517L(large_page_loop):
2518	movdqu	(%eax), %xmm0
2519	movdqu	0x10(%eax), %xmm1
2520	movdqu	0x20(%eax), %xmm2
2521	movdqu	0x30(%eax), %xmm3
2522	movdqu	0x40(%eax), %xmm4
2523	movdqu	0x50(%eax), %xmm5
2524	movdqu	0x60(%eax), %xmm6
2525	movdqu	0x70(%eax), %xmm7
2526	lea	0x80(%eax), %eax
2527
	/* The borrow of this sub drives the jae below; movntdq and lea do
	   not modify the flags.  */
2528	sub	$0x80, %ecx
2529	movntdq	%xmm0, (%edx)
2530	movntdq	%xmm1, 0x10(%edx)
2531	movntdq	%xmm2, 0x20(%edx)
2532	movntdq	%xmm3, 0x30(%edx)
2533	movntdq	%xmm4, 0x40(%edx)
2534	movntdq	%xmm5, 0x50(%edx)
2535	movntdq	%xmm6, 0x60(%edx)
2536	movntdq	%xmm7, 0x70(%edx)
2537	lea	0x80(%edx), %edx
2538	jae	L(large_page_loop)
	/* %ecx is still biased by -0x80 here: comparing with -0x40 tests
	   whether at least 64 bytes remain.  The lea removes the bias without
	   disturbing the flags consumed by jl.  */
2539	cmp	$-0x40, %ecx
2540	lea	0x80(%ecx), %ecx
2541	jl	L(large_page_less_64bytes)
2542
2543	movdqu	(%eax), %xmm0
2544	movdqu	0x10(%eax), %xmm1
2545	movdqu	0x20(%eax), %xmm2
2546	movdqu	0x30(%eax), %xmm3
2547	lea	0x40(%eax), %eax
2548
2549	movntdq	%xmm0, (%edx)
2550	movntdq	%xmm1, 0x10(%edx)
2551	movntdq	%xmm2, 0x20(%edx)
2552	movntdq	%xmm3, 0x30(%edx)
2553	lea	0x40(%edx), %edx
2554	sub	$0x40, %ecx
2555L(large_page_less_64bytes):
2556	cmp	$32, %ecx
2557	jb	L(large_page_less_32bytes)
2558	movdqu	(%eax), %xmm0
2559	movdqu	0x10(%eax), %xmm1
2560	lea	0x20(%eax), %eax
2561	movntdq	%xmm0, (%edx)
2562	movntdq	%xmm1, 0x10(%edx)
2563	lea	0x20(%edx), %edx
2564	sub	$0x20, %ecx
	/* Fewer than 32 bytes left: move both pointers to the end of the data
	   and fence the non-temporal stores before the ordinary tail copy.  */
2565L(large_page_less_32bytes):
2566	add	%ecx, %edx
2567	add	%ecx, %eax
2568	sfence
2569	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
2570
2571	.p2align 4
/* Tail copy for 44/36/28/20/12/4/0 bytes addressed from the start of the
   data (the bk_ prefix suggests the backward-copy path; its dispatch
   table is outside this view).  %eax and %edx point at the first source
   and destination byte still to copy.  Each label moves the
   highest-addressed 8 bytes of what is left and falls through to the next
   shorter length.  Returns DEST, plus LEN for mempcpy.  */
2572L(bk_write_44bytes):
2573	movq	36(%eax), %xmm0
2574	movq	%xmm0, 36(%edx)
2575L(bk_write_36bytes):
2576	movq	28(%eax), %xmm0
2577	movq	%xmm0, 28(%edx)
2578L(bk_write_28bytes):
2579	movq	20(%eax), %xmm0
2580	movq	%xmm0, 20(%edx)
2581L(bk_write_20bytes):
2582	movq	12(%eax), %xmm0
2583	movq	%xmm0, 12(%edx)
2584L(bk_write_12bytes):
2585	movq	4(%eax), %xmm0
2586	movq	%xmm0, 4(%edx)
2587L(bk_write_4bytes):
2588	movl	(%eax), %ecx
2589	movl	%ecx, (%edx)
2590L(bk_write_0bytes):
2591	movl	DEST(%esp), %eax
2592# ifdef USE_AS_MEMPCPY
2593	movl	LEN(%esp), %ecx
2594	add	%ecx, %eax
2595# endif
2596	RETURN
2597
2598	.p2align 4
/* Tail copy for 40/32/24/16/8 bytes addressed from the start of the data
   (the bk_ prefix suggests the backward-copy path).  %eax and %edx point
   at the first source and destination byte still to copy.  Each label
   moves the highest-addressed 8 bytes of what is left and falls through
   to the next shorter length.  Returns DEST, plus LEN for mempcpy.  */
2599L(bk_write_40bytes):
2600	movq	32(%eax), %xmm0
2601	movq	%xmm0, 32(%edx)
2602L(bk_write_32bytes):
2603	movq	24(%eax), %xmm0
2604	movq	%xmm0, 24(%edx)
2605L(bk_write_24bytes):
2606	movq	16(%eax), %xmm0
2607	movq	%xmm0, 16(%edx)
2608L(bk_write_16bytes):
2609	movq	8(%eax), %xmm0
2610	movq	%xmm0, 8(%edx)
2611L(bk_write_8bytes):
2612	movq	(%eax), %xmm0
2613	movq	%xmm0, (%edx)
2614	movl	DEST(%esp), %eax
2615# ifdef USE_AS_MEMPCPY
2616	movl	LEN(%esp), %ecx
2617	add	%ecx, %eax
2618# endif
2619	RETURN
2620
2621	.p2align 4
/* Tail copy for 45/37/29/21/13/5/1 bytes addressed from the start of the
   data (the bk_ prefix suggests the backward-copy path).  %eax and %edx
   point at the first source and destination byte still to copy.  Each
   label moves the highest-addressed 8 bytes of what is left and falls
   through; the 5-byte entry moves 4 bytes at offset 1 and the 1-byte
   entry moves the first byte.  Returns DEST, plus LEN for mempcpy.  */
2622L(bk_write_45bytes):
2623	movq	37(%eax), %xmm0
2624	movq	%xmm0, 37(%edx)
2625L(bk_write_37bytes):
2626	movq	29(%eax), %xmm0
2627	movq	%xmm0, 29(%edx)
2628L(bk_write_29bytes):
2629	movq	21(%eax), %xmm0
2630	movq	%xmm0, 21(%edx)
2631L(bk_write_21bytes):
2632	movq	13(%eax), %xmm0
2633	movq	%xmm0, 13(%edx)
2634L(bk_write_13bytes):
2635	movq	5(%eax), %xmm0
2636	movq	%xmm0, 5(%edx)
2637L(bk_write_5bytes):
2638	movl	1(%eax), %ecx
2639	movl	%ecx, 1(%edx)
2640L(bk_write_1bytes):
2641	movzbl	(%eax), %ecx
2642	movb	%cl, (%edx)
2643	movl	DEST(%esp), %eax
2644# ifdef USE_AS_MEMPCPY
2645	movl	LEN(%esp), %ecx
2646	add	%ecx, %eax
2647# endif
2648	RETURN
2649
2650	.p2align 4
/* Tail copy for 41/33/25/17/9 bytes addressed from the start of the data
   (the bk_ prefix suggests the backward-copy path).  %eax and %edx point
   at the first source and destination byte still to copy.  Each label
   moves the highest-addressed 8 bytes of what is left and falls through;
   the first byte is moved last.  Returns DEST, plus LEN for mempcpy.  */
2651L(bk_write_41bytes):
2652	movq	33(%eax), %xmm0
2653	movq	%xmm0, 33(%edx)
2654L(bk_write_33bytes):
2655	movq	25(%eax), %xmm0
2656	movq	%xmm0, 25(%edx)
2657L(bk_write_25bytes):
2658	movq	17(%eax), %xmm0
2659	movq	%xmm0, 17(%edx)
2660L(bk_write_17bytes):
2661	movq	9(%eax), %xmm0
2662	movq	%xmm0, 9(%edx)
2663L(bk_write_9bytes):
2664	movq	1(%eax), %xmm0
2665	movq	%xmm0, 1(%edx)
2666	movzbl	(%eax), %ecx
2667	movb	%cl, (%edx)
2668	movl	DEST(%esp), %eax
2669# ifdef USE_AS_MEMPCPY
2670	movl	LEN(%esp), %ecx
2671	add	%ecx, %eax
2672# endif
2673	RETURN
2674
2675	.p2align 4
/* Tail copy for 46/38/30/22/14/6 bytes addressed from the start of the
   data (the bk_ prefix suggests the backward-copy path).  %eax and %edx
   point at the first source and destination byte still to copy.  Each
   label moves the highest-addressed 8 bytes of what is left and falls
   through; the first 6 bytes are moved as 4 bytes at offset 2 plus 2
   bytes at offset 0.  Returns DEST, plus LEN for mempcpy.  */
2676L(bk_write_46bytes):
2677	movq	38(%eax), %xmm0
2678	movq	%xmm0, 38(%edx)
2679L(bk_write_38bytes):
2680	movq	30(%eax), %xmm0
2681	movq	%xmm0, 30(%edx)
2682L(bk_write_30bytes):
2683	movq	22(%eax), %xmm0
2684	movq	%xmm0, 22(%edx)
2685L(bk_write_22bytes):
2686	movq	14(%eax), %xmm0
2687	movq	%xmm0, 14(%edx)
2688L(bk_write_14bytes):
2689	movq	6(%eax), %xmm0
2690	movq	%xmm0, 6(%edx)
2691L(bk_write_6bytes):
2692	movl	2(%eax), %ecx
2693	movl	%ecx, 2(%edx)
2694	movzwl	(%eax), %ecx
2695	movw	%cx, (%edx)
2696	movl	DEST(%esp), %eax
2697# ifdef USE_AS_MEMPCPY
2698	movl	LEN(%esp), %ecx
2699	add	%ecx, %eax
2700# endif
2701	RETURN
2702
	.p2align 4
/* Tail copies for 42, 34, 26, 18, 10 and 2 remaining bytes.  On entry
   EAX points at the first source byte and EDX at the first destination
   byte of the remainder.  The 8-byte steps cover offsets 2 and up,
   working from the highest address down; the 2-byte case copies the
   word at offset 0.  */
L(bk_write_42bytes):
	movq	34(%eax), %xmm0
	movq	%xmm0, 34(%edx)
L(bk_write_34bytes):
	movq	26(%eax), %xmm0
	movq	%xmm0, 26(%edx)
L(bk_write_26bytes):
	movq	18(%eax), %xmm0
	movq	%xmm0, 18(%edx)
L(bk_write_18bytes):
	movq	10(%eax), %xmm0
	movq	%xmm0, 10(%edx)
L(bk_write_10bytes):
	movq	2(%eax), %xmm0
	movq	%xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	/* Return DEST, or DEST + LEN for mempcpy.  */
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
	RETURN
2728
	.p2align 4
/* Tail copies for 47, 39, 31, 23, 15 and 7 remaining bytes.  On entry
   EAX points at the first source byte and EDX at the first destination
   byte of the remainder.  The 8-byte steps cover offsets 7 and up,
   working from the highest address down; the 7-byte case finishes with
   the dword at offset 3, the word at offset 1 and the byte at
   offset 0.  */
L(bk_write_47bytes):
	movq	39(%eax), %xmm0
	movq	%xmm0, 39(%edx)
L(bk_write_39bytes):
	movq	31(%eax), %xmm0
	movq	%xmm0, 31(%edx)
L(bk_write_31bytes):
	movq	23(%eax), %xmm0
	movq	%xmm0, 23(%edx)
L(bk_write_23bytes):
	movq	15(%eax), %xmm0
	movq	%xmm0, 15(%edx)
L(bk_write_15bytes):
	movq	7(%eax), %xmm0
	movq	%xmm0, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	/* Last load: the source pointer in EAX is not needed afterwards,
	   so EAX itself receives the byte.  */
	movzbl	(%eax), %eax
	movb	%al, (%edx)
	/* Return DEST, or DEST + LEN for mempcpy.  */
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
	RETURN
2758
2759	.p2align 4
2760L(bk_write_43bytes):
2761	movq	35(%eax), %xmm0
2762	movq	%xmm0, 35(%edx)
2763L(bk_write_35bytes):
2764	movq	27(%eax), %xmm0
2765	movq	%xmm0, 27(%edx)
2766L(bk_write_27bytes):
2767	movq	19(%eax), %xmm0
2768	movq	%xmm0, 19(%edx)
2769L(bk_write_19bytes):
2770	movq	11(%eax), %xmm0
2771	movq	%xmm0, 11(%edx)
2772L(bk_write_11bytes):
2773	movq	3(%eax), %xmm0
2774	movq	%xmm0, 3(%edx)
2775L(bk_write_3bytes):
2776	movzwl	1(%eax), %ecx
2777	movw	%cx, 1(%edx)
2778	movzbl	(%eax), %eax
2779	movb	%al, (%edx)
2780	movl	DEST(%esp), %eax
2781# ifdef USE_AS_MEMPCPY
2782	movl	LEN(%esp), %ecx
2783	add	%ecx, %eax
2784# endif
2785	RETURN_END
2786
2787
	.pushsection .rodata.ssse3,"a",@progbits
/* Jump table for the L(fwd_write_Nbytes) handlers, N = 0 .. 47.
   Entry N refers to the handler for N bytes; under PIC, JMPTBL (I, B)
   expands to I - B, so each entry is the handler's offset from the
   start of this table.
   NOTE(review): the code that dispatches through this table is outside
   this part of the file; presumably it indexes by byte count with a
   scale of 4, as the backward path does -- confirm.  */
	.p2align 2
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
2839
	.p2align 2
/* Jump table for the L(fwd_write_Nbytes_align) handlers, N = 0 .. 47.
   Entry N refers to the handler for N bytes; under PIC, JMPTBL (I, B)
   expands to I - B, so each entry is the handler's offset from the
   start of this table.
   NOTE(review): the handlers and the dispatch code are outside this
   part of the file; the "_align" suffix suggests these are the
   variants used once the pointers are known to be aligned --
   confirm.  */
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
2890
	.p2align 2
/* Jump table for the sixteen L(shl_N) routines, N = 0 .. 15.  Under
   PIC, JMPTBL (I, B) expands to I - B, so each entry is the routine's
   offset from the start of this table.
   NOTE(review): the routines and the dispatch code are outside this
   part of the file; presumably N is the source misalignment relative
   to a 16-byte boundary -- confirm.  */
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))
2909
	.p2align 2
/* Jump table for the L(bk_write_Nbytes) handlers, N = 0 .. 47.  The
   backward-copy path dispatches through it with the remaining byte
   count in ECX as the index and a scale of 4.  Under PIC,
   JMPTBL (I, B) expands to I - B, so each entry is the handler's
   offset from the start of this table.  */
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	/* End of the jump tables; switch back to the previous section.  */
	.popsection
2962
2963# ifdef USE_AS_MEMMOVE
	.p2align 4
/* Backward copy, working from the end of the buffers towards their
   start.  As used below, on entry EAX is the source address, EDX the
   destination address and ECX the byte count.  EDI is saved and then
   used as the source end pointer; EDX becomes the destination end
   pointer.  Both point just past the last byte still to be copied,
   and ECX counts the bytes remaining.  */
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	lea	(%ecx,%edx,1),%edx
	lea	(%ecx,%edi,1),%edi
	/* Take the alignment path unless the destination end is already
	   4-byte aligned.  */
	testl	$0x3, %edx
	jnz	L(bk_align)

/* The destination end is 4-byte aligned from here on.  */
L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

/* Fewer than 64 bytes remain.  */
L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy one 32-byte chunk as four 8-byte moves, from the highest
	   address down, then step both end pointers back by 32.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

/* Hand the remainder to the table-driven tail copies.  Those expect
   start pointers, so convert both end pointers by subtracting the
   remaining count: EAX = source start, EDX = destination start.
   EDI is restored before leaving.  */
L(bk_write_less32bytes):
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
/* Dispatch on the remaining byte count in ECX.  */
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
3002
	/* The code below runs with EDI still pushed.  The preceding
	   POP (%edi) was followed by a jump-table branch, so record the
	   push again for the unwind information only; no instruction is
	   emitted.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Reached when the destination end pointer in EDX is not 4-byte
   aligned.  With 8 bytes or fewer left, skip the alignment and go
   straight to the tail copy.  Otherwise copy one byte and/or one word
   backwards until EDX is 4-byte aligned, then rejoin
   L(bk_aligned_4).  */
L(bk_align):
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0 then
	   (EDX & 2) must be != 0 and a 2-byte step is what is needed.  */
	jz	L(bk_got2)
	/* EDX is odd: copy a single byte.  */
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	/* EDX is now even; it is 4-byte aligned unless bit 1 is set.  */
	testl	$2, %edx
	jz	L(bk_aligned_4)

/* Copy one word; afterwards EDX is 4-byte aligned.  */
L(bk_got2):
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)
3029
	.p2align 4
/* At least 64 bytes remain and the destination end pointer in EDX is
   4-byte aligned.  Bring EDX to 16-byte alignment, then copy 64 bytes
   per iteration.  EDI is the source end pointer and ECX the number of
   bytes remaining.  */
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes.  Copy up to three dwords
   backwards, stopping as soon as EDX is 16-byte aligned.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	/* Third dword: no test is needed afterwards, because a 4-byte
	   aligned address is at most three dwords above a 16-byte
	   boundary.  */
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

/* EDX is 16-byte aligned.  The alignment steps may have left fewer
   than 64 bytes; in that case use the 32-byte step instead of the
   loop.  */
L(bk_ssse3_cpy_pre):
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

	.p2align 4
/* Main loop: step both pointers back by 64 and copy 64 bytes with
   unaligned loads from the source (movdqu) and aligned stores to the
   destination (movdqa), from the highest 16-byte chunk down.  */
L(bk_ssse3_cpy):
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	/* Fewer than 64 bytes left: finish with the short paths.  */
	jmp	L(bk_write_64bytesless)
3082
3083# endif
3084
3085END (MEMCPY)
3086
3087#endif
3088