1/* memcmp with SSSE3, wmemcmp with SSSE3
2   Copyright (C) 2010-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
23# ifndef MEMCMP
24#  define MEMCMP		__memcmp_ssse3
25# endif
26
/* Unwind (CFI) annotations that accompany a 4-byte push: the CFA offset
   grows by 4 and REG is recorded through cfi_rel_offset at offset 0.  */
27# define CFI_PUSH(REG)	\
28	cfi_adjust_cfa_offset (4);	\
29	cfi_rel_offset (REG, 0)
30
/* Unwind (CFI) annotations that accompany a 4-byte pop: the CFA offset
   shrinks by 4 and REG is marked as restored.  */
31# define CFI_POP(REG)	\
32	cfi_adjust_cfa_offset (-4);	\
33	cfi_restore (REG)
34
/* Push / pop a register together with the matching CFI annotation.  */
35# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
36# define POP(REG)	popl REG; CFI_POP (REG)
37
/* Argument offsets, used as OFFSET(%esp) at function entry before any
   register has been pushed.  Each argument slot is 4 bytes; PARMS = 4
   presumably skips the return address.  */
38# define PARMS		4
39# define BLK1		PARMS
40# define BLK2		BLK1+4
41# define LEN		BLK2+4
/* RETURN_END pops %edi, %esi and %ebx (the reverse of the order in which
   L(48bytesormore) pushes them) and returns.  RETURN does the same and
   then re-establishes the remembered CFI state for the code that follows
   the `ret' in the file.  */
42# define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
43# define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state
44
45/* Warning!
46           wmemcmp has to use SIGNED comparison for elements.
47           memcmp has to use UNSIGNED comparison for elements.
48*/
49
50	atom_text_section
/* Entry point.  Loads LEN into %ecx, BLK1 into %eax and BLK2 into %edx,
   then selects a path by length:
     - %ecx >= 48          -> L(48bytesormore)
     - %ecx <= 1 (memcmp)  -> L(less1bytes)
     - otherwise           -> L(less48bytes), with %ebx pushed and both
                              pointers advanced by %ecx.  */
51ENTRY (MEMCMP)
52	movl	LEN(%esp), %ecx
53
54# ifdef USE_AS_WMEMCMP
	/* wmemcmp build: scale LEN by 4 (presumably element count -> byte
	   count); a zero length returns 0 immediately.  */
55	shl	$2, %ecx
56	test	%ecx, %ecx
57	jz	L(zero)
58# endif
59
60	movl	BLK1(%esp), %eax
61	cmp	$48, %ecx
	/* movl does not touch the flags, so the jae below still tests the
	   cmp against 48.  */
62	movl	BLK2(%esp), %edx
63	jae	L(48bytesormore)
64
65# ifndef USE_AS_WMEMCMP
	/* memcmp build: lengths 0 and 1 are handled without using %ebx.  */
66	cmp	$1, %ecx
67	jbe	L(less1bytes)
68# endif
69
70	PUSH	(%ebx)
	/* Move both pointers to the end of their buffers; L(less48bytes)
	   (defined later in the file) presumably indexes backwards from
	   there.  */
71	add	%ecx, %edx
72	add	%ecx, %eax
73	jmp	L(less48bytes)
74
	/* Placed after an unconditional jmp, so it only adjusts the CFI
	   annotation: the code that follows is entered from the branches
	   above, taken before %ebx was pushed.  */
75	CFI_POP	(%ebx)
76
77# ifndef USE_AS_WMEMCMP
78	.p2align 4
/* L(less1bytes): memcmp with a length of 0 or 1.  Reached from the entry
   code via `jbe' right after `cmp $1, %ecx', so the flags of that compare
   are still valid here.  Nothing has been pushed on this path.  */
79L(less1bytes):
	/* Below 1 means the length is 0: the result is 0.  */
80	jb	L(zero)
81	movb	(%eax), %cl
82	cmp	(%edx), %cl
83	je	L(zero)
	/* mov leaves the flags of the byte compare intact.  Return 1 when
	   the byte at (%eax) is above the byte at (%edx) as unsigned
	   values, otherwise negate to return -1.  */
84	mov	$1, %eax
85	ja	L(1bytesend)
86	neg	%eax
87L(1bytesend):
88	ret
89# endif
90
91	.p2align 4
/* L(zero): return 0.  Every branch to this label visible in this file is
   taken before any register has been pushed, so a plain `ret' is used.  */
92L(zero):
93	xor	%eax, %eax
94	ret
95
96	.p2align 4
/* L(48bytesormore): %ecx >= 48, %eax = BLK1, %edx = BLK2.
   1. Save %ebx, %esi, %edi and remember the CFI state for later
      cfi_restore_state directives.
   2. Compare the first 16 bytes with unaligned loads; on a mismatch go
      to L(less16bytes) with %edx = (equality mask - 0xffff) and
      %edi/%esi pointing 16 bytes past the start.
   3. Otherwise round %edi down to a 16-byte boundary, move %esi back by
      the same amount and add that amount to %ecx.
   4. Round %esi down as well and dispatch on its former low 4 bits
      (kept in %edx) to the matching L(shr_N) routine.  */
97L(48bytesormore):
98	PUSH	(%ebx)
99	PUSH	(%esi)
100	PUSH	(%edi)
101	cfi_remember_state
102	movdqu	(%eax), %xmm3
103	movdqu	(%edx), %xmm0
104	movl	%eax, %edi
105	movl	%edx, %esi
	/* Byte-wise equality; pmovmskb gathers one bit per byte into the
	   low 16 bits of %edx (bit set = bytes equal).  */
106	pcmpeqb	%xmm0, %xmm3
107	pmovmskb %xmm3, %edx
108	lea	16(%edi), %edi
109
	/* Zero only when all 16 bits were set, i.e. all 16 bytes equal.
	   The lea in between does not modify the flags.  */
110	sub	$0xffff, %edx
111	lea	16(%esi), %esi
112	jnz	L(less16bytes)
	/* %edx = low 4 bits of %edi; clearing them aligns %edi down.  %esi
	   moves back by the same distance and %ecx grows by it, so the
	   bytes that will be compared again are accounted for.  */
113	mov	%edi, %edx
114	and	$0xf, %edx
115	xor	%edx, %edi
116	sub	%edx, %esi
117	add	%edx, %ecx
	/* %edx = misalignment of %esi; 0 means both pointers are aligned.  */
118	mov	%esi, %edx
119	and	$0xf, %edx
120	jz	L(shr_0)
	/* Align %esi down; %edx keeps the shift for the L(shr_N) code.  */
121	xor	%edx, %esi
122
123# ifndef USE_AS_WMEMCMP
	/* memcmp build: any shift 1..15 is possible.  Compare chain split
	   in two halves (1..7 here, 8..15 below).  */
124	cmp	$8, %edx
125	jae	L(next_unaligned_table)
	/* NOTE(review): %edx == 0 already branched at the jz above, so this
	   test cannot be taken.  */
126	cmp	$0, %edx
127	je	L(shr_0)
128	cmp	$1, %edx
129	je	L(shr_1)
130	cmp	$2, %edx
131	je	L(shr_2)
132	cmp	$3, %edx
133	je	L(shr_3)
134	cmp	$4, %edx
135	je	L(shr_4)
136	cmp	$5, %edx
137	je	L(shr_5)
138	cmp	$6, %edx
139	je	L(shr_6)
140	jmp	L(shr_7)
141
142	.p2align 2
143L(next_unaligned_table):
144	cmp	$8, %edx
145	je	L(shr_8)
146	cmp	$9, %edx
147	je	L(shr_9)
148	cmp	$10, %edx
149	je	L(shr_10)
150	cmp	$11, %edx
151	je	L(shr_11)
152	cmp	$12, %edx
153	je	L(shr_12)
154	cmp	$13, %edx
155	je	L(shr_13)
156	cmp	$14, %edx
157	je	L(shr_14)
158	jmp	L(shr_15)
159# else
	/* wmemcmp build: only shifts 0, 4, 8 and 12 are dispatched; anything
	   else falls to L(shr_12).  Presumably both pointers are assumed to
	   be 4-byte aligned -- confirm.  The `cmp $0' test cannot be taken
	   for the same reason as in the memcmp build.  */
160	cmp	$0, %edx
161	je	L(shr_0)
162	cmp	$4, %edx
163	je	L(shr_4)
164	cmp	$8, %edx
165	je	L(shr_8)
166	jmp	L(shr_12)
167# endif
168
169	.p2align 4
/* L(shr_0): both %edi and %esi are 16-byte aligned, so aligned loads can
   be compared directly.  %ecx is the length as adjusted by
   L(48bytesormore).  %eax is cleared because L(first16bytes) adds %eax
   (the shift, 0 here) to %esi.  */
170L(shr_0):
171	cmp	$80, %ecx
172	jae	L(shr_0_gobble)
	/* Short case: exactly one 32-byte pair is compared here.  */
173	lea	-48(%ecx), %ecx
174	xor	%eax, %eax
	/* %xmm1 = equality of bytes 0..15 (also consulted by L(exit)),
	   %xmm2 = AND of both 16-byte results.  */
175	movaps	(%esi), %xmm1
176	pcmpeqb	(%edi), %xmm1
177	movaps	16(%esi), %xmm2
178	pcmpeqb	16(%edi), %xmm2
179	pand	%xmm1, %xmm2
180	pmovmskb %xmm2, %edx
181	add	$32, %edi
182	add	$32, %esi
183	sub	$0xffff, %edx
184	jnz	L(exit)
185
	/* All 32 bytes equal: %eax/%edx = current positions + %ecx, then
	   drop %edi/%esi (only %ebx stays pushed) and let L(less48bytes)
	   finish.  */
186	lea	(%ecx, %edi,1), %eax
187	lea	(%ecx, %esi,1), %edx
188	POP	(%edi)
189	POP	(%esi)
190	jmp	L(less48bytes)
191
192	cfi_restore_state
193	cfi_remember_state
194	.p2align 4
	/* Long case (%ecx >= 80 on entry to L(shr_0)): 32 bytes per loop
	   iteration, with the next pair loaded while the current one is
	   being tested.  */
195L(shr_0_gobble):
196	lea	-48(%ecx), %ecx
197	movdqa	(%esi), %xmm0
198	xor	%eax, %eax
199	pcmpeqb	(%edi), %xmm0
200	sub	$32, %ecx
201	movdqa	16(%esi), %xmm2
202	pcmpeqb	16(%edi), %xmm2
	/* On each entry %xmm0/%xmm2 hold the equality results of the
	   current pair.  `sub' sets CF when %ecx drops below zero and `sbb'
	   folds that borrow into the mask test, so ZF is set (loop again)
	   only if all 32 bytes matched and there was no borrow.  The SSE
	   instructions and lea in between leave the flags alone.  */
203L(shr_0_gobble_loop):
204	pand	%xmm0, %xmm2
205	sub	$32, %ecx
206	pmovmskb %xmm2, %edx
	/* Keep the first-half result of the pair being tested for L(exit).  */
207	movdqa	%xmm0, %xmm1
208	movdqa	32(%esi), %xmm0
209	movdqa	48(%esi), %xmm2
210	sbb	$0xffff, %edx
211	pcmpeqb	32(%edi), %xmm0
212	pcmpeqb	48(%edi), %xmm2
213	lea	32(%edi), %edi
214	lea	32(%esi), %esi
215	jz	L(shr_0_gobble_loop)
216
217	pand	%xmm0, %xmm2
	/* If %ecx went negative the loop ended on a borrow: remove the
	   extra 1 that sbb subtracted from %edx and give the 32 back to
	   %ecx.  */
218	cmp	$0, %ecx
219	jge	L(shr_0_gobble_loop_next)
220	inc	%edx
221	add	$32, %ecx
222L(shr_0_gobble_loop_next):
	/* Non-zero %edx: the pair just tested contains a difference.  */
223	test	%edx, %edx
224	jnz	L(exit)
225
	/* Test the pair that was already loaded into %xmm0/%xmm2.  */
226	pmovmskb %xmm2, %edx
227	movdqa	%xmm0, %xmm1
228	lea	32(%edi), %edi
229	lea	32(%esi), %esi
230	sub	$0xffff, %edx
231	jnz	L(exit)
	/* Everything so far is equal: hand the rest to L(less48bytes).  */
232	lea	(%ecx, %edi,1), %eax
233	lea	(%ecx, %esi,1), %edx
234	POP	(%edi)
235	POP	(%esi)
236	jmp	L(less48bytes)
237
238# ifndef USE_AS_WMEMCMP
239	cfi_restore_state
240	cfi_remember_state
241	.p2align 4
/* L(shr_1): %edi is 16-byte aligned and %esi has been rounded down, with
   the removed misalignment (1) still in %edx.  The bytes that pair with
   (%edi) therefore start at 1(%esi); palignr $1 builds each such
   16-byte window from two adjacent aligned loads.  %eax keeps the shift
   so that L(first16bytes) can add it back to %esi.  */
242L(shr_1):
243	cmp	$80, %ecx
	/* lea and mov leave the flags of the cmp intact for the jae.  */
244	lea	-48(%ecx), %ecx
245	mov	%edx, %eax
246	jae	L(shr_1_gobble)
247
	/* Short case: one 32-byte pair.  %xmm2 keeps the raw 16(%esi) data
	   so it can serve as the low half of the second window.  */
248	movdqa	16(%esi), %xmm1
249	movdqa	%xmm1, %xmm2
250	palignr	$1,(%esi), %xmm1
251	pcmpeqb	(%edi), %xmm1
252
253	movdqa	32(%esi), %xmm3
254	palignr	$1,%xmm2, %xmm3
255	pcmpeqb	16(%edi), %xmm3
256
	/* %xmm1 = result for the first 16 bytes (read by L(exit)),
	   %xmm3 = AND of both results.  */
257	pand	%xmm1, %xmm3
258	pmovmskb %xmm3, %edx
259	lea	32(%edi), %edi
260	lea	32(%esi), %esi
261	sub	$0xffff, %edx
262	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx, with the shift
	   re-applied to the second pointer; only %ebx stays pushed.  */
263	lea	(%ecx, %edi,1), %eax
264	lea	1(%ecx, %esi,1), %edx
265	POP	(%edi)
266	POP	(%esi)
267	jmp	L(less48bytes)
268
269	cfi_restore_state
270	cfi_remember_state
271	.p2align 4
	/* Long case: 32 bytes per iteration, next pair prepared while the
	   current one is tested.  */
272L(shr_1_gobble):
273	sub	$32, %ecx
274	movdqa	16(%esi), %xmm0
275	palignr	$1,(%esi), %xmm0
276	pcmpeqb	(%edi), %xmm0
277
278	movdqa	32(%esi), %xmm3
279	palignr	$1,16(%esi), %xmm3
280	pcmpeqb	16(%edi), %xmm3
281
	/* On each entry %xmm0/%xmm3 hold the equality results of the
	   current pair.  `sub' sets CF when %ecx drops below zero and `sbb'
	   folds that borrow into the mask test, so ZF is set (loop again)
	   only if all 32 bytes matched and there was no borrow.  The SSE
	   instructions and lea in between leave the flags alone.  */
282L(shr_1_gobble_loop):
283	pand	%xmm0, %xmm3
284	sub	$32, %ecx
285	pmovmskb %xmm3, %edx
286	movdqa	%xmm0, %xmm1
287
288	movdqa	64(%esi), %xmm3
289	palignr	$1,48(%esi), %xmm3
290	sbb	$0xffff, %edx
291	movdqa	48(%esi), %xmm0
292	palignr	$1,32(%esi), %xmm0
293	pcmpeqb	32(%edi), %xmm0
294	lea	32(%esi), %esi
295	pcmpeqb	48(%edi), %xmm3
296
297	lea	32(%edi), %edi
298	jz	L(shr_1_gobble_loop)
299	pand	%xmm0, %xmm3
300
	/* On a borrow (%ecx < 0) remove the extra 1 that sbb subtracted from
	   %edx and give the 32 back to %ecx.  */
301	cmp	$0, %ecx
302	jge	L(shr_1_gobble_next)
303	inc	%edx
304	add	$32, %ecx
305L(shr_1_gobble_next):
	/* Non-zero %edx: the pair just tested contains a difference.  */
306	test	%edx, %edx
307	jnz	L(exit)
308
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
309	pmovmskb %xmm3, %edx
310	movdqa	%xmm0, %xmm1
311	lea	32(%edi), %edi
312	lea	32(%esi), %esi
313	sub	$0xffff, %edx
314	jnz	L(exit)
315
	/* Everything so far is equal: hand the rest to L(less48bytes).  */
316	lea	(%ecx, %edi,1), %eax
317	lea	1(%ecx, %esi,1), %edx
318	POP	(%edi)
319	POP	(%esi)
320	jmp	L(less48bytes)
321
322
323	cfi_restore_state
324	cfi_remember_state
325	.p2align 4
/* L(shr_2): %edi is aligned, %esi was rounded down by 2 (%edx = 2 on
   entry).  palignr $2 builds each 16-byte window that pairs with %edi
   out of two aligned loads; %eax keeps the shift for L(first16bytes).  */
326L(shr_2):
327	cmp	$80, %ecx
328	lea	-48(%ecx), %ecx
329	mov	%edx, %eax
330	jae	L(shr_2_gobble)
331
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
332	movdqa	16(%esi), %xmm1
333	movdqa	%xmm1, %xmm2
334	palignr	$2,(%esi), %xmm1
335	pcmpeqb	(%edi), %xmm1
336
337	movdqa	32(%esi), %xmm3
338	palignr	$2,%xmm2, %xmm3
339	pcmpeqb	16(%edi), %xmm3
340
341	pand	%xmm1, %xmm3
342	pmovmskb %xmm3, %edx
343	lea	32(%edi), %edi
344	lea	32(%esi), %esi
345	sub	$0xffff, %edx
346	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
347	lea	(%ecx, %edi,1), %eax
348	lea	2(%ecx, %esi,1), %edx
349	POP	(%edi)
350	POP	(%esi)
351	jmp	L(less48bytes)
352
353	cfi_restore_state
354	cfi_remember_state
355	.p2align 4
356L(shr_2_gobble):
357	sub	$32, %ecx
358	movdqa	16(%esi), %xmm0
359	palignr	$2,(%esi), %xmm0
360	pcmpeqb	(%edi), %xmm0
361
362	movdqa	32(%esi), %xmm3
363	palignr	$2,16(%esi), %xmm3
364	pcmpeqb	16(%edi), %xmm3
365
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
366L(shr_2_gobble_loop):
367	pand	%xmm0, %xmm3
368	sub	$32, %ecx
369	pmovmskb %xmm3, %edx
370	movdqa	%xmm0, %xmm1
371
372	movdqa	64(%esi), %xmm3
373	palignr	$2,48(%esi), %xmm3
374	sbb	$0xffff, %edx
375	movdqa	48(%esi), %xmm0
376	palignr	$2,32(%esi), %xmm0
377	pcmpeqb	32(%edi), %xmm0
378	lea	32(%esi), %esi
379	pcmpeqb	48(%edi), %xmm3
380
381	lea	32(%edi), %edi
382	jz	L(shr_2_gobble_loop)
383	pand	%xmm0, %xmm3
384
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
385	cmp	$0, %ecx
386	jge	L(shr_2_gobble_next)
387	inc	%edx
388	add	$32, %ecx
389L(shr_2_gobble_next):
390	test	%edx, %edx
391	jnz	L(exit)
392
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
393	pmovmskb %xmm3, %edx
394	movdqa	%xmm0, %xmm1
395	lea	32(%edi), %edi
396	lea	32(%esi), %esi
397	sub	$0xffff, %edx
398	jnz	L(exit)
399
400	lea	(%ecx, %edi,1), %eax
401	lea	2(%ecx, %esi,1), %edx
402	POP	(%edi)
403	POP	(%esi)
404	jmp	L(less48bytes)
405
406	cfi_restore_state
407	cfi_remember_state
408	.p2align 4
/* L(shr_3): %edi is aligned, %esi was rounded down by 3 (%edx = 3 on
   entry).  palignr $3 builds each 16-byte window that pairs with %edi
   out of two aligned loads; %eax keeps the shift for L(first16bytes).  */
409L(shr_3):
410	cmp	$80, %ecx
411	lea	-48(%ecx), %ecx
412	mov	%edx, %eax
413	jae	L(shr_3_gobble)
414
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
415	movdqa	16(%esi), %xmm1
416	movdqa	%xmm1, %xmm2
417	palignr	$3,(%esi), %xmm1
418	pcmpeqb	(%edi), %xmm1
419
420	movdqa	32(%esi), %xmm3
421	palignr	$3,%xmm2, %xmm3
422	pcmpeqb	16(%edi), %xmm3
423
424	pand	%xmm1, %xmm3
425	pmovmskb %xmm3, %edx
426	lea	32(%edi), %edi
427	lea	32(%esi), %esi
428	sub	$0xffff, %edx
429	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
430	lea	(%ecx, %edi,1), %eax
431	lea	3(%ecx, %esi,1), %edx
432	POP	(%edi)
433	POP	(%esi)
434	jmp	L(less48bytes)
435
436	cfi_restore_state
437	cfi_remember_state
438	.p2align 4
439L(shr_3_gobble):
440	sub	$32, %ecx
441	movdqa	16(%esi), %xmm0
442	palignr	$3,(%esi), %xmm0
443	pcmpeqb	(%edi), %xmm0
444
445	movdqa	32(%esi), %xmm3
446	palignr	$3,16(%esi), %xmm3
447	pcmpeqb	16(%edi), %xmm3
448
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
449L(shr_3_gobble_loop):
450	pand	%xmm0, %xmm3
451	sub	$32, %ecx
452	pmovmskb %xmm3, %edx
453	movdqa	%xmm0, %xmm1
454
455	movdqa	64(%esi), %xmm3
456	palignr	$3,48(%esi), %xmm3
457	sbb	$0xffff, %edx
458	movdqa	48(%esi), %xmm0
459	palignr	$3,32(%esi), %xmm0
460	pcmpeqb	32(%edi), %xmm0
461	lea	32(%esi), %esi
462	pcmpeqb	48(%edi), %xmm3
463
464	lea	32(%edi), %edi
465	jz	L(shr_3_gobble_loop)
466	pand	%xmm0, %xmm3
467
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
468	cmp	$0, %ecx
469	jge	L(shr_3_gobble_next)
470	inc	%edx
471	add	$32, %ecx
472L(shr_3_gobble_next):
473	test	%edx, %edx
474	jnz	L(exit)
475
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
476	pmovmskb %xmm3, %edx
477	movdqa	%xmm0, %xmm1
478	lea	32(%edi), %edi
479	lea	32(%esi), %esi
480	sub	$0xffff, %edx
481	jnz	L(exit)
482
483	lea	(%ecx, %edi,1), %eax
484	lea	3(%ecx, %esi,1), %edx
485	POP	(%edi)
486	POP	(%esi)
487	jmp	L(less48bytes)
488# endif
489
490	cfi_restore_state
491	cfi_remember_state
492	.p2align 4
/* L(shr_4): %edi is aligned, %esi was rounded down by 4 (%edx = 4 on
   entry).  palignr $4 builds each 16-byte window that pairs with %edi
   out of two aligned loads; %eax keeps the shift for L(first16bytes).
   This path is assembled for both the memcmp and the wmemcmp build.  */
493L(shr_4):
494	cmp	$80, %ecx
495	lea	-48(%ecx), %ecx
496	mov	%edx, %eax
497	jae	L(shr_4_gobble)
498
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
499	movdqa	16(%esi), %xmm1
500	movdqa	%xmm1, %xmm2
501	palignr	$4,(%esi), %xmm1
502	pcmpeqb	(%edi), %xmm1
503
504	movdqa	32(%esi), %xmm3
505	palignr	$4,%xmm2, %xmm3
506	pcmpeqb	16(%edi), %xmm3
507
508	pand	%xmm1, %xmm3
509	pmovmskb %xmm3, %edx
510	lea	32(%edi), %edi
511	lea	32(%esi), %esi
512	sub	$0xffff, %edx
513	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
514	lea	(%ecx, %edi,1), %eax
515	lea	4(%ecx, %esi,1), %edx
516	POP	(%edi)
517	POP	(%esi)
518	jmp	L(less48bytes)
519
520	cfi_restore_state
521	cfi_remember_state
522	.p2align 4
523L(shr_4_gobble):
524	sub	$32, %ecx
525	movdqa	16(%esi), %xmm0
526	palignr	$4,(%esi), %xmm0
527	pcmpeqb	(%edi), %xmm0
528
529	movdqa	32(%esi), %xmm3
530	palignr	$4,16(%esi), %xmm3
531	pcmpeqb	16(%edi), %xmm3
532
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
533L(shr_4_gobble_loop):
534	pand	%xmm0, %xmm3
535	sub	$32, %ecx
536	pmovmskb %xmm3, %edx
537	movdqa	%xmm0, %xmm1
538
539	movdqa	64(%esi), %xmm3
540	palignr	$4,48(%esi), %xmm3
541	sbb	$0xffff, %edx
542	movdqa	48(%esi), %xmm0
543	palignr	$4,32(%esi), %xmm0
544	pcmpeqb	32(%edi), %xmm0
545	lea	32(%esi), %esi
546	pcmpeqb	48(%edi), %xmm3
547
548	lea	32(%edi), %edi
549	jz	L(shr_4_gobble_loop)
550	pand	%xmm0, %xmm3
551
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
552	cmp	$0, %ecx
553	jge	L(shr_4_gobble_next)
554	inc	%edx
555	add	$32, %ecx
556L(shr_4_gobble_next):
557	test	%edx, %edx
558	jnz	L(exit)
559
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
560	pmovmskb %xmm3, %edx
561	movdqa	%xmm0, %xmm1
562	lea	32(%edi), %edi
563	lea	32(%esi), %esi
564	sub	$0xffff, %edx
565	jnz	L(exit)
566
567	lea	(%ecx, %edi,1), %eax
568	lea	4(%ecx, %esi,1), %edx
569	POP	(%edi)
570	POP	(%esi)
571	jmp	L(less48bytes)
572
573# ifndef USE_AS_WMEMCMP
574	cfi_restore_state
575	cfi_remember_state
576	.p2align 4
/* L(shr_5): %edi is aligned, %esi was rounded down by 5 (%edx = 5 on
   entry).  palignr $5 builds each 16-byte window that pairs with %edi
   out of two aligned loads; %eax keeps the shift for L(first16bytes).  */
577L(shr_5):
578	cmp	$80, %ecx
579	lea	-48(%ecx), %ecx
580	mov	%edx, %eax
581	jae	L(shr_5_gobble)
582
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
583	movdqa	16(%esi), %xmm1
584	movdqa	%xmm1, %xmm2
585	palignr	$5,(%esi), %xmm1
586	pcmpeqb	(%edi), %xmm1
587
588	movdqa	32(%esi), %xmm3
589	palignr	$5,%xmm2, %xmm3
590	pcmpeqb	16(%edi), %xmm3
591
592	pand	%xmm1, %xmm3
593	pmovmskb %xmm3, %edx
594	lea	32(%edi), %edi
595	lea	32(%esi), %esi
596	sub	$0xffff, %edx
597	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
598	lea	(%ecx, %edi,1), %eax
599	lea	5(%ecx, %esi,1), %edx
600	POP	(%edi)
601	POP	(%esi)
602	jmp	L(less48bytes)
603
604	cfi_restore_state
605	cfi_remember_state
606	.p2align 4
607L(shr_5_gobble):
608	sub	$32, %ecx
609	movdqa	16(%esi), %xmm0
610	palignr	$5,(%esi), %xmm0
611	pcmpeqb	(%edi), %xmm0
612
613	movdqa	32(%esi), %xmm3
614	palignr	$5,16(%esi), %xmm3
615	pcmpeqb	16(%edi), %xmm3
616
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
617L(shr_5_gobble_loop):
618	pand	%xmm0, %xmm3
619	sub	$32, %ecx
620	pmovmskb %xmm3, %edx
621	movdqa	%xmm0, %xmm1
622
623	movdqa	64(%esi), %xmm3
624	palignr	$5,48(%esi), %xmm3
625	sbb	$0xffff, %edx
626	movdqa	48(%esi), %xmm0
627	palignr	$5,32(%esi), %xmm0
628	pcmpeqb	32(%edi), %xmm0
629	lea	32(%esi), %esi
630	pcmpeqb	48(%edi), %xmm3
631
632	lea	32(%edi), %edi
633	jz	L(shr_5_gobble_loop)
634	pand	%xmm0, %xmm3
635
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
636	cmp	$0, %ecx
637	jge	L(shr_5_gobble_next)
638	inc	%edx
639	add	$32, %ecx
640L(shr_5_gobble_next):
641	test	%edx, %edx
642	jnz	L(exit)
643
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
644	pmovmskb %xmm3, %edx
645	movdqa	%xmm0, %xmm1
646	lea	32(%edi), %edi
647	lea	32(%esi), %esi
648	sub	$0xffff, %edx
649	jnz	L(exit)
650
651	lea	(%ecx, %edi,1), %eax
652	lea	5(%ecx, %esi,1), %edx
653	POP	(%edi)
654	POP	(%esi)
655	jmp	L(less48bytes)
656
657	cfi_restore_state
658	cfi_remember_state
659	.p2align 4
/* L(shr_6): %edi is aligned, %esi was rounded down by 6 (%edx = 6 on
   entry).  palignr $6 builds each 16-byte window that pairs with %edi
   out of two aligned loads; %eax keeps the shift for L(first16bytes).  */
660L(shr_6):
661	cmp	$80, %ecx
662	lea	-48(%ecx), %ecx
663	mov	%edx, %eax
664	jae	L(shr_6_gobble)
665
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
666	movdqa	16(%esi), %xmm1
667	movdqa	%xmm1, %xmm2
668	palignr	$6,(%esi), %xmm1
669	pcmpeqb	(%edi), %xmm1
670
671	movdqa	32(%esi), %xmm3
672	palignr	$6,%xmm2, %xmm3
673	pcmpeqb	16(%edi), %xmm3
674
675	pand	%xmm1, %xmm3
676	pmovmskb %xmm3, %edx
677	lea	32(%edi), %edi
678	lea	32(%esi), %esi
679	sub	$0xffff, %edx
680	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
681	lea	(%ecx, %edi,1), %eax
682	lea	6(%ecx, %esi,1), %edx
683	POP	(%edi)
684	POP	(%esi)
685	jmp	L(less48bytes)
686
687	cfi_restore_state
688	cfi_remember_state
689	.p2align 4
690L(shr_6_gobble):
691	sub	$32, %ecx
692	movdqa	16(%esi), %xmm0
693	palignr	$6,(%esi), %xmm0
694	pcmpeqb	(%edi), %xmm0
695
696	movdqa	32(%esi), %xmm3
697	palignr	$6,16(%esi), %xmm3
698	pcmpeqb	16(%edi), %xmm3
699
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
700L(shr_6_gobble_loop):
701	pand	%xmm0, %xmm3
702	sub	$32, %ecx
703	pmovmskb %xmm3, %edx
704	movdqa	%xmm0, %xmm1
705
706	movdqa	64(%esi), %xmm3
707	palignr	$6,48(%esi), %xmm3
708	sbb	$0xffff, %edx
709	movdqa	48(%esi), %xmm0
710	palignr	$6,32(%esi), %xmm0
711	pcmpeqb	32(%edi), %xmm0
712	lea	32(%esi), %esi
713	pcmpeqb	48(%edi), %xmm3
714
715	lea	32(%edi), %edi
716	jz	L(shr_6_gobble_loop)
717	pand	%xmm0, %xmm3
718
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
719	cmp	$0, %ecx
720	jge	L(shr_6_gobble_next)
721	inc	%edx
722	add	$32, %ecx
723L(shr_6_gobble_next):
724	test	%edx, %edx
725	jnz	L(exit)
726
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
727	pmovmskb %xmm3, %edx
728	movdqa	%xmm0, %xmm1
729	lea	32(%edi), %edi
730	lea	32(%esi), %esi
731	sub	$0xffff, %edx
732	jnz	L(exit)
733
734	lea	(%ecx, %edi,1), %eax
735	lea	6(%ecx, %esi,1), %edx
736	POP	(%edi)
737	POP	(%esi)
738	jmp	L(less48bytes)
739
740	cfi_restore_state
741	cfi_remember_state
742	.p2align 4
/* L(shr_7): %edi is aligned, %esi was rounded down by 7 (%edx = 7 on
   entry).  palignr $7 builds each 16-byte window that pairs with %edi
   out of two aligned loads; %eax keeps the shift for L(first16bytes).  */
743L(shr_7):
744	cmp	$80, %ecx
745	lea	-48(%ecx), %ecx
746	mov	%edx, %eax
747	jae	L(shr_7_gobble)
748
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
749	movdqa	16(%esi), %xmm1
750	movdqa	%xmm1, %xmm2
751	palignr	$7,(%esi), %xmm1
752	pcmpeqb	(%edi), %xmm1
753
754	movdqa	32(%esi), %xmm3
755	palignr	$7,%xmm2, %xmm3
756	pcmpeqb	16(%edi), %xmm3
757
758	pand	%xmm1, %xmm3
759	pmovmskb %xmm3, %edx
760	lea	32(%edi), %edi
761	lea	32(%esi), %esi
762	sub	$0xffff, %edx
763	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
764	lea	(%ecx, %edi,1), %eax
765	lea	7(%ecx, %esi,1), %edx
766	POP	(%edi)
767	POP	(%esi)
768	jmp	L(less48bytes)
769
770	cfi_restore_state
771	cfi_remember_state
772	.p2align 4
773L(shr_7_gobble):
774	sub	$32, %ecx
775	movdqa	16(%esi), %xmm0
776	palignr	$7,(%esi), %xmm0
777	pcmpeqb	(%edi), %xmm0
778
779	movdqa	32(%esi), %xmm3
780	palignr	$7,16(%esi), %xmm3
781	pcmpeqb	16(%edi), %xmm3
782
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
783L(shr_7_gobble_loop):
784	pand	%xmm0, %xmm3
785	sub	$32, %ecx
786	pmovmskb %xmm3, %edx
787	movdqa	%xmm0, %xmm1
788
789	movdqa	64(%esi), %xmm3
790	palignr	$7,48(%esi), %xmm3
791	sbb	$0xffff, %edx
792	movdqa	48(%esi), %xmm0
793	palignr	$7,32(%esi), %xmm0
794	pcmpeqb	32(%edi), %xmm0
795	lea	32(%esi), %esi
796	pcmpeqb	48(%edi), %xmm3
797
798	lea	32(%edi), %edi
799	jz	L(shr_7_gobble_loop)
800	pand	%xmm0, %xmm3
801
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
802	cmp	$0, %ecx
803	jge	L(shr_7_gobble_next)
804	inc	%edx
805	add	$32, %ecx
806L(shr_7_gobble_next):
807	test	%edx, %edx
808	jnz	L(exit)
809
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
810	pmovmskb %xmm3, %edx
811	movdqa	%xmm0, %xmm1
812	lea	32(%edi), %edi
813	lea	32(%esi), %esi
814	sub	$0xffff, %edx
815	jnz	L(exit)
816
817	lea	(%ecx, %edi,1), %eax
818	lea	7(%ecx, %esi,1), %edx
819	POP	(%edi)
820	POP	(%esi)
821	jmp	L(less48bytes)
822# endif
823
824	cfi_restore_state
825	cfi_remember_state
826	.p2align 4
/* L(shr_8): %edi is aligned, %esi was rounded down by 8 (%edx = 8 on
   entry).  palignr $8 builds each 16-byte window that pairs with %edi
   out of two aligned loads; %eax keeps the shift for L(first16bytes).
   This path is assembled for both the memcmp and the wmemcmp build.  */
827L(shr_8):
828	cmp	$80, %ecx
829	lea	-48(%ecx), %ecx
830	mov	%edx, %eax
831	jae	L(shr_8_gobble)
832
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
833	movdqa	16(%esi), %xmm1
834	movdqa	%xmm1, %xmm2
835	palignr	$8,(%esi), %xmm1
836	pcmpeqb	(%edi), %xmm1
837
838	movdqa	32(%esi), %xmm3
839	palignr	$8,%xmm2, %xmm3
840	pcmpeqb	16(%edi), %xmm3
841
842	pand	%xmm1, %xmm3
843	pmovmskb %xmm3, %edx
844	lea	32(%edi), %edi
845	lea	32(%esi), %esi
846	sub	$0xffff, %edx
847	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
848	lea	(%ecx, %edi,1), %eax
849	lea	8(%ecx, %esi,1), %edx
850	POP	(%edi)
851	POP	(%esi)
852	jmp	L(less48bytes)
853
854	cfi_restore_state
855	cfi_remember_state
856	.p2align 4
857L(shr_8_gobble):
858	sub	$32, %ecx
859	movdqa	16(%esi), %xmm0
860	palignr	$8,(%esi), %xmm0
861	pcmpeqb	(%edi), %xmm0
862
863	movdqa	32(%esi), %xmm3
864	palignr	$8,16(%esi), %xmm3
865	pcmpeqb	16(%edi), %xmm3
866
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
867L(shr_8_gobble_loop):
868	pand	%xmm0, %xmm3
869	sub	$32, %ecx
870	pmovmskb %xmm3, %edx
871	movdqa	%xmm0, %xmm1
872
873	movdqa	64(%esi), %xmm3
874	palignr	$8,48(%esi), %xmm3
875	sbb	$0xffff, %edx
876	movdqa	48(%esi), %xmm0
877	palignr	$8,32(%esi), %xmm0
878	pcmpeqb	32(%edi), %xmm0
879	lea	32(%esi), %esi
880	pcmpeqb	48(%edi), %xmm3
881
882	lea	32(%edi), %edi
883	jz	L(shr_8_gobble_loop)
884	pand	%xmm0, %xmm3
885
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
886	cmp	$0, %ecx
887	jge	L(shr_8_gobble_next)
888	inc	%edx
889	add	$32, %ecx
890L(shr_8_gobble_next):
891	test	%edx, %edx
892	jnz	L(exit)
893
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
894	pmovmskb %xmm3, %edx
895	movdqa	%xmm0, %xmm1
896	lea	32(%edi), %edi
897	lea	32(%esi), %esi
898	sub	$0xffff, %edx
899	jnz	L(exit)
900
901	lea	(%ecx, %edi,1), %eax
902	lea	8(%ecx, %esi,1), %edx
903	POP	(%edi)
904	POP	(%esi)
905	jmp	L(less48bytes)
906
907# ifndef USE_AS_WMEMCMP
908	cfi_restore_state
909	cfi_remember_state
910	.p2align 4
/* L(shr_9): %edi is aligned, %esi was rounded down by 9 (%edx = 9 on
   entry).  palignr $9 builds each 16-byte window that pairs with %edi
   out of two aligned loads; %eax keeps the shift for L(first16bytes).  */
911L(shr_9):
912	cmp	$80, %ecx
913	lea	-48(%ecx), %ecx
914	mov	%edx, %eax
915	jae	L(shr_9_gobble)
916
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
917	movdqa	16(%esi), %xmm1
918	movdqa	%xmm1, %xmm2
919	palignr	$9,(%esi), %xmm1
920	pcmpeqb	(%edi), %xmm1
921
922	movdqa	32(%esi), %xmm3
923	palignr	$9,%xmm2, %xmm3
924	pcmpeqb	16(%edi), %xmm3
925
926	pand	%xmm1, %xmm3
927	pmovmskb %xmm3, %edx
928	lea	32(%edi), %edi
929	lea	32(%esi), %esi
930	sub	$0xffff, %edx
931	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
932	lea	(%ecx, %edi,1), %eax
933	lea	9(%ecx, %esi,1), %edx
934	POP	(%edi)
935	POP	(%esi)
936	jmp	L(less48bytes)
937
938	cfi_restore_state
939	cfi_remember_state
940	.p2align 4
941L(shr_9_gobble):
942	sub	$32, %ecx
943	movdqa	16(%esi), %xmm0
944	palignr	$9,(%esi), %xmm0
945	pcmpeqb	(%edi), %xmm0
946
947	movdqa	32(%esi), %xmm3
948	palignr	$9,16(%esi), %xmm3
949	pcmpeqb	16(%edi), %xmm3
950
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
951L(shr_9_gobble_loop):
952	pand	%xmm0, %xmm3
953	sub	$32, %ecx
954	pmovmskb %xmm3, %edx
955	movdqa	%xmm0, %xmm1
956
957	movdqa	64(%esi), %xmm3
958	palignr	$9,48(%esi), %xmm3
959	sbb	$0xffff, %edx
960	movdqa	48(%esi), %xmm0
961	palignr	$9,32(%esi), %xmm0
962	pcmpeqb	32(%edi), %xmm0
963	lea	32(%esi), %esi
964	pcmpeqb	48(%edi), %xmm3
965
966	lea	32(%edi), %edi
967	jz	L(shr_9_gobble_loop)
968	pand	%xmm0, %xmm3
969
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
970	cmp	$0, %ecx
971	jge	L(shr_9_gobble_next)
972	inc	%edx
973	add	$32, %ecx
974L(shr_9_gobble_next):
975	test	%edx, %edx
976	jnz	L(exit)
977
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
978	pmovmskb %xmm3, %edx
979	movdqa	%xmm0, %xmm1
980	lea	32(%edi), %edi
981	lea	32(%esi), %esi
982	sub	$0xffff, %edx
983	jnz	L(exit)
984
985	lea	(%ecx, %edi,1), %eax
986	lea	9(%ecx, %esi,1), %edx
987	POP	(%edi)
988	POP	(%esi)
989	jmp	L(less48bytes)
990
991	cfi_restore_state
992	cfi_remember_state
993	.p2align 4
/* L(shr_10): %edi is aligned, %esi was rounded down by 10 (%edx = 10 on
   entry).  palignr $10 builds each 16-byte window that pairs with %edi
   out of two aligned loads; %eax keeps the shift for L(first16bytes).  */
994L(shr_10):
995	cmp	$80, %ecx
996	lea	-48(%ecx), %ecx
997	mov	%edx, %eax
998	jae	L(shr_10_gobble)
999
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
1000	movdqa	16(%esi), %xmm1
1001	movdqa	%xmm1, %xmm2
1002	palignr	$10, (%esi), %xmm1
1003	pcmpeqb	(%edi), %xmm1
1004
1005	movdqa	32(%esi), %xmm3
1006	palignr	$10,%xmm2, %xmm3
1007	pcmpeqb	16(%edi), %xmm3
1008
1009	pand	%xmm1, %xmm3
1010	pmovmskb %xmm3, %edx
1011	lea	32(%edi), %edi
1012	lea	32(%esi), %esi
1013	sub	$0xffff, %edx
1014	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
1015	lea	(%ecx, %edi,1), %eax
1016	lea	10(%ecx, %esi,1), %edx
1017	POP	(%edi)
1018	POP	(%esi)
1019	jmp	L(less48bytes)
1020
1021	cfi_restore_state
1022	cfi_remember_state
1023	.p2align 4
1024L(shr_10_gobble):
1025	sub	$32, %ecx
1026	movdqa	16(%esi), %xmm0
1027	palignr	$10, (%esi), %xmm0
1028	pcmpeqb	(%edi), %xmm0
1029
1030	movdqa	32(%esi), %xmm3
1031	palignr	$10, 16(%esi), %xmm3
1032	pcmpeqb	16(%edi), %xmm3
1033
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
1034L(shr_10_gobble_loop):
1035	pand	%xmm0, %xmm3
1036	sub	$32, %ecx
1037	pmovmskb %xmm3, %edx
1038	movdqa	%xmm0, %xmm1
1039
1040	movdqa	64(%esi), %xmm3
1041	palignr	$10,48(%esi), %xmm3
1042	sbb	$0xffff, %edx
1043	movdqa	48(%esi), %xmm0
1044	palignr	$10,32(%esi), %xmm0
1045	pcmpeqb	32(%edi), %xmm0
1046	lea	32(%esi), %esi
1047	pcmpeqb	48(%edi), %xmm3
1048
1049	lea	32(%edi), %edi
1050	jz	L(shr_10_gobble_loop)
1051	pand	%xmm0, %xmm3
1052
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
1053	cmp	$0, %ecx
1054	jge	L(shr_10_gobble_next)
1055	inc	%edx
1056	add	$32, %ecx
1057L(shr_10_gobble_next):
1058	test	%edx, %edx
1059	jnz	L(exit)
1060
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
1061	pmovmskb %xmm3, %edx
1062	movdqa	%xmm0, %xmm1
1063	lea	32(%edi), %edi
1064	lea	32(%esi), %esi
1065	sub	$0xffff, %edx
1066	jnz	L(exit)
1067
1068	lea	(%ecx, %edi,1), %eax
1069	lea	10(%ecx, %esi,1), %edx
1070	POP	(%edi)
1071	POP	(%esi)
1072	jmp	L(less48bytes)
1073
1074	cfi_restore_state
1075	cfi_remember_state
1076	.p2align 4
/* L(shr_11): %edi is aligned, %esi was rounded down by 11 (%edx = 11 on
   entry).  palignr $11 builds each 16-byte window that pairs with %edi
   out of two aligned loads; %eax keeps the shift for L(first16bytes).  */
1077L(shr_11):
1078	cmp	$80, %ecx
1079	lea	-48(%ecx), %ecx
1080	mov	%edx, %eax
1081	jae	L(shr_11_gobble)
1082
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
1083	movdqa	16(%esi), %xmm1
1084	movdqa	%xmm1, %xmm2
1085	palignr	$11, (%esi), %xmm1
1086	pcmpeqb	(%edi), %xmm1
1087
1088	movdqa	32(%esi), %xmm3
1089	palignr	$11, %xmm2, %xmm3
1090	pcmpeqb	16(%edi), %xmm3
1091
1092	pand	%xmm1, %xmm3
1093	pmovmskb %xmm3, %edx
1094	lea	32(%edi), %edi
1095	lea	32(%esi), %esi
1096	sub	$0xffff, %edx
1097	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
1098	lea	(%ecx, %edi,1), %eax
1099	lea	11(%ecx, %esi,1), %edx
1100	POP	(%edi)
1101	POP	(%esi)
1102	jmp	L(less48bytes)
1103
1104	cfi_restore_state
1105	cfi_remember_state
1106	.p2align 4
1107L(shr_11_gobble):
1108	sub	$32, %ecx
1109	movdqa	16(%esi), %xmm0
1110	palignr	$11, (%esi), %xmm0
1111	pcmpeqb	(%edi), %xmm0
1112
1113	movdqa	32(%esi), %xmm3
1114	palignr	$11, 16(%esi), %xmm3
1115	pcmpeqb	16(%edi), %xmm3
1116
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
1117L(shr_11_gobble_loop):
1118	pand	%xmm0, %xmm3
1119	sub	$32, %ecx
1120	pmovmskb %xmm3, %edx
1121	movdqa	%xmm0, %xmm1
1122
1123	movdqa	64(%esi), %xmm3
1124	palignr	$11,48(%esi), %xmm3
1125	sbb	$0xffff, %edx
1126	movdqa	48(%esi), %xmm0
1127	palignr	$11,32(%esi), %xmm0
1128	pcmpeqb	32(%edi), %xmm0
1129	lea	32(%esi), %esi
1130	pcmpeqb	48(%edi), %xmm3
1131
1132	lea	32(%edi), %edi
1133	jz	L(shr_11_gobble_loop)
1134	pand	%xmm0, %xmm3
1135
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
1136	cmp	$0, %ecx
1137	jge	L(shr_11_gobble_next)
1138	inc	%edx
1139	add	$32, %ecx
1140L(shr_11_gobble_next):
1141	test	%edx, %edx
1142	jnz	L(exit)
1143
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
1144	pmovmskb %xmm3, %edx
1145	movdqa	%xmm0, %xmm1
1146	lea	32(%edi), %edi
1147	lea	32(%esi), %esi
1148	sub	$0xffff, %edx
1149	jnz	L(exit)
1150
1151	lea	(%ecx, %edi,1), %eax
1152	lea	11(%ecx, %esi,1), %edx
1153	POP	(%edi)
1154	POP	(%esi)
1155	jmp	L(less48bytes)
1156# endif
1157
1158	cfi_restore_state
1159	cfi_remember_state
1160	.p2align 4
/* L(shr_12): %edi is aligned, %esi was rounded down by 12 (%edx = 12 on
   entry in the memcmp build).  palignr $12 builds each 16-byte window
   that pairs with %edi out of two aligned loads; %eax keeps the value of
   %edx for L(first16bytes).
   NOTE(review): in the wmemcmp build this is also the fall-through
   target of the dispatch for every shift other than 0, 4 and 8.  */
1161L(shr_12):
1162	cmp	$80, %ecx
1163	lea	-48(%ecx), %ecx
1164	mov	%edx, %eax
1165	jae	L(shr_12_gobble)
1166
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
1167	movdqa	16(%esi), %xmm1
1168	movdqa	%xmm1, %xmm2
1169	palignr	$12, (%esi), %xmm1
1170	pcmpeqb	(%edi), %xmm1
1171
1172	movdqa	32(%esi), %xmm3
1173	palignr	$12, %xmm2, %xmm3
1174	pcmpeqb	16(%edi), %xmm3
1175
1176	pand	%xmm1, %xmm3
1177	pmovmskb %xmm3, %edx
1178	lea	32(%edi), %edi
1179	lea	32(%esi), %esi
1180	sub	$0xffff, %edx
1181	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
1182	lea	(%ecx, %edi,1), %eax
1183	lea	12(%ecx, %esi,1), %edx
1184	POP	(%edi)
1185	POP	(%esi)
1186	jmp	L(less48bytes)
1187
1188	cfi_restore_state
1189	cfi_remember_state
1190	.p2align 4
1191L(shr_12_gobble):
1192	sub	$32, %ecx
1193	movdqa	16(%esi), %xmm0
1194	palignr	$12, (%esi), %xmm0
1195	pcmpeqb	(%edi), %xmm0
1196
1197	movdqa	32(%esi), %xmm3
1198	palignr	$12, 16(%esi), %xmm3
1199	pcmpeqb	16(%edi), %xmm3
1200
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
1201L(shr_12_gobble_loop):
1202	pand	%xmm0, %xmm3
1203	sub	$32, %ecx
1204	pmovmskb %xmm3, %edx
1205	movdqa	%xmm0, %xmm1
1206
1207	movdqa	64(%esi), %xmm3
1208	palignr	$12,48(%esi), %xmm3
1209	sbb	$0xffff, %edx
1210	movdqa	48(%esi), %xmm0
1211	palignr	$12,32(%esi), %xmm0
1212	pcmpeqb	32(%edi), %xmm0
1213	lea	32(%esi), %esi
1214	pcmpeqb	48(%edi), %xmm3
1215
1216	lea	32(%edi), %edi
1217	jz	L(shr_12_gobble_loop)
1218	pand	%xmm0, %xmm3
1219
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
1220	cmp	$0, %ecx
1221	jge	L(shr_12_gobble_next)
1222	inc	%edx
1223	add	$32, %ecx
1224L(shr_12_gobble_next):
1225	test	%edx, %edx
1226	jnz	L(exit)
1227
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
1228	pmovmskb %xmm3, %edx
1229	movdqa	%xmm0, %xmm1
1230	lea	32(%edi), %edi
1231	lea	32(%esi), %esi
1232	sub	$0xffff, %edx
1233	jnz	L(exit)
1234
1235	lea	(%ecx, %edi,1), %eax
1236	lea	12(%ecx, %esi,1), %edx
1237	POP	(%edi)
1238	POP	(%esi)
1239	jmp	L(less48bytes)
1240
1241# ifndef USE_AS_WMEMCMP
1242	cfi_restore_state
1243	cfi_remember_state
1244	.p2align 4
/* L(shr_13): %edi is aligned, %esi was rounded down by 13 (%edx = 13 on
   entry).  palignr $13 builds each 16-byte window that pairs with %edi
   out of two aligned loads; %eax keeps the shift for L(first16bytes).  */
1245L(shr_13):
1246	cmp	$80, %ecx
1247	lea	-48(%ecx), %ecx
1248	mov	%edx, %eax
1249	jae	L(shr_13_gobble)
1250
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
1251	movdqa	16(%esi), %xmm1
1252	movdqa	%xmm1, %xmm2
1253	palignr	$13, (%esi), %xmm1
1254	pcmpeqb	(%edi), %xmm1
1255
1256	movdqa	32(%esi), %xmm3
1257	palignr	$13, %xmm2, %xmm3
1258	pcmpeqb	16(%edi), %xmm3
1259
1260	pand	%xmm1, %xmm3
1261	pmovmskb %xmm3, %edx
1262	lea	32(%edi), %edi
1263	lea	32(%esi), %esi
1264	sub	$0xffff, %edx
1265	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
1266	lea	(%ecx, %edi,1), %eax
1267	lea	13(%ecx, %esi,1), %edx
1268	POP	(%edi)
1269	POP	(%esi)
1270	jmp	L(less48bytes)
1271
1272	cfi_restore_state
1273	cfi_remember_state
1274	.p2align 4
1275L(shr_13_gobble):
1276	sub	$32, %ecx
1277	movdqa	16(%esi), %xmm0
1278	palignr	$13, (%esi), %xmm0
1279	pcmpeqb	(%edi), %xmm0
1280
1281	movdqa	32(%esi), %xmm3
1282	palignr	$13, 16(%esi), %xmm3
1283	pcmpeqb	16(%edi), %xmm3
1284
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
1285L(shr_13_gobble_loop):
1286	pand	%xmm0, %xmm3
1287	sub	$32, %ecx
1288	pmovmskb %xmm3, %edx
1289	movdqa	%xmm0, %xmm1
1290
1291	movdqa	64(%esi), %xmm3
1292	palignr	$13,48(%esi), %xmm3
1293	sbb	$0xffff, %edx
1294	movdqa	48(%esi), %xmm0
1295	palignr	$13,32(%esi), %xmm0
1296	pcmpeqb	32(%edi), %xmm0
1297	lea	32(%esi), %esi
1298	pcmpeqb	48(%edi), %xmm3
1299
1300	lea	32(%edi), %edi
1301	jz	L(shr_13_gobble_loop)
1302	pand	%xmm0, %xmm3
1303
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
1304	cmp	$0, %ecx
1305	jge	L(shr_13_gobble_next)
1306	inc	%edx
1307	add	$32, %ecx
1308L(shr_13_gobble_next):
1309	test	%edx, %edx
1310	jnz	L(exit)
1311
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
1312	pmovmskb %xmm3, %edx
1313	movdqa	%xmm0, %xmm1
1314	lea	32(%edi), %edi
1315	lea	32(%esi), %esi
1316	sub	$0xffff, %edx
1317	jnz	L(exit)
1318
1319	lea	(%ecx, %edi,1), %eax
1320	lea	13(%ecx, %esi,1), %edx
1321	POP	(%edi)
1322	POP	(%esi)
1323	jmp	L(less48bytes)
1324
1325	cfi_restore_state
1326	cfi_remember_state
1327	.p2align 4
/* L(shr_14): %edi is aligned, %esi was rounded down by 14 (%edx = 14 on
   entry).  palignr $14 builds each 16-byte window that pairs with %edi
   out of two aligned loads; %eax keeps the shift for L(first16bytes).  */
1328L(shr_14):
1329	cmp	$80, %ecx
1330	lea	-48(%ecx), %ecx
1331	mov	%edx, %eax
1332	jae	L(shr_14_gobble)
1333
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
1334	movdqa	16(%esi), %xmm1
1335	movdqa	%xmm1, %xmm2
1336	palignr	$14, (%esi), %xmm1
1337	pcmpeqb	(%edi), %xmm1
1338
1339	movdqa	32(%esi), %xmm3
1340	palignr	$14, %xmm2, %xmm3
1341	pcmpeqb	16(%edi), %xmm3
1342
1343	pand	%xmm1, %xmm3
1344	pmovmskb %xmm3, %edx
1345	lea	32(%edi), %edi
1346	lea	32(%esi), %esi
1347	sub	$0xffff, %edx
1348	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
1349	lea	(%ecx, %edi,1), %eax
1350	lea	14(%ecx, %esi,1), %edx
1351	POP	(%edi)
1352	POP	(%esi)
1353	jmp	L(less48bytes)
1354
1355	cfi_restore_state
1356	cfi_remember_state
1357	.p2align 4
1358L(shr_14_gobble):
1359	sub	$32, %ecx
1360	movdqa	16(%esi), %xmm0
1361	palignr	$14, (%esi), %xmm0
1362	pcmpeqb	(%edi), %xmm0
1363
1364	movdqa	32(%esi), %xmm3
1365	palignr	$14, 16(%esi), %xmm3
1366	pcmpeqb	16(%edi), %xmm3
1367
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
1368L(shr_14_gobble_loop):
1369	pand	%xmm0, %xmm3
1370	sub	$32, %ecx
1371	pmovmskb %xmm3, %edx
1372	movdqa	%xmm0, %xmm1
1373
1374	movdqa	64(%esi), %xmm3
1375	palignr	$14,48(%esi), %xmm3
1376	sbb	$0xffff, %edx
1377	movdqa	48(%esi), %xmm0
1378	palignr	$14,32(%esi), %xmm0
1379	pcmpeqb	32(%edi), %xmm0
1380	lea	32(%esi), %esi
1381	pcmpeqb	48(%edi), %xmm3
1382
1383	lea	32(%edi), %edi
1384	jz	L(shr_14_gobble_loop)
1385	pand	%xmm0, %xmm3
1386
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
1387	cmp	$0, %ecx
1388	jge	L(shr_14_gobble_next)
1389	inc	%edx
1390	add	$32, %ecx
1391L(shr_14_gobble_next):
1392	test	%edx, %edx
1393	jnz	L(exit)
1394
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
1395	pmovmskb %xmm3, %edx
1396	movdqa	%xmm0, %xmm1
1397	lea	32(%edi), %edi
1398	lea	32(%esi), %esi
1399	sub	$0xffff, %edx
1400	jnz	L(exit)
1401
1402	lea	(%ecx, %edi,1), %eax
1403	lea	14(%ecx, %esi,1), %edx
1404	POP	(%edi)
1405	POP	(%esi)
1406	jmp	L(less48bytes)
1407
1408	cfi_restore_state
1409	cfi_remember_state
1410	.p2align 4
/* L(shr_15): %edi is aligned, %esi was rounded down by 15 (the dispatch
   jumps here when %edx matched none of 8..14).  palignr $15 builds each
   16-byte window that pairs with %edi out of two aligned loads; %eax
   keeps the shift for L(first16bytes).  */
1411L(shr_15):
1412	cmp	$80, %ecx
1413	lea	-48(%ecx), %ecx
1414	mov	%edx, %eax
1415	jae	L(shr_15_gobble)
1416
	/* Short case: one 32-byte pair; %xmm1 keeps the first-half result
	   for L(exit).  */
1417	movdqa	16(%esi), %xmm1
1418	movdqa	%xmm1, %xmm2
1419	palignr	$15, (%esi), %xmm1
1420	pcmpeqb	(%edi), %xmm1
1421
1422	movdqa	32(%esi), %xmm3
1423	palignr	$15, %xmm2, %xmm3
1424	pcmpeqb	16(%edi), %xmm3
1425
1426	pand	%xmm1, %xmm3
1427	pmovmskb %xmm3, %edx
1428	lea	32(%edi), %edi
1429	lea	32(%esi), %esi
1430	sub	$0xffff, %edx
1431	jnz	L(exit)
	/* All equal: %eax/%edx = current positions + %ecx (shift re-applied
	   to %edx) for L(less48bytes).  */
1432	lea	(%ecx, %edi,1), %eax
1433	lea	15(%ecx, %esi,1), %edx
1434	POP	(%edi)
1435	POP	(%esi)
1436	jmp	L(less48bytes)
1437
1438	cfi_restore_state
1439	cfi_remember_state
1440	.p2align 4
1441L(shr_15_gobble):
1442	sub	$32, %ecx
1443	movdqa	16(%esi), %xmm0
1444	palignr	$15, (%esi), %xmm0
1445	pcmpeqb	(%edi), %xmm0
1446
1447	movdqa	32(%esi), %xmm3
1448	palignr	$15, 16(%esi), %xmm3
1449	pcmpeqb	16(%edi), %xmm3
1450
	/* Loop again only while the pair matched and `sub' did not borrow;
	   sbb folds the borrow into the mask test.  */
1451L(shr_15_gobble_loop):
1452	pand	%xmm0, %xmm3
1453	sub	$32, %ecx
1454	pmovmskb %xmm3, %edx
1455	movdqa	%xmm0, %xmm1
1456
1457	movdqa	64(%esi), %xmm3
1458	palignr	$15,48(%esi), %xmm3
1459	sbb	$0xffff, %edx
1460	movdqa	48(%esi), %xmm0
1461	palignr	$15,32(%esi), %xmm0
1462	pcmpeqb	32(%edi), %xmm0
1463	lea	32(%esi), %esi
1464	pcmpeqb	48(%edi), %xmm3
1465
1466	lea	32(%edi), %edi
1467	jz	L(shr_15_gobble_loop)
1468	pand	%xmm0, %xmm3
1469
	/* On a borrow (%ecx < 0) undo its effect on %edx and %ecx.  */
1470	cmp	$0, %ecx
1471	jge	L(shr_15_gobble_next)
1472	inc	%edx
1473	add	$32, %ecx
1474L(shr_15_gobble_next):
1475	test	%edx, %edx
1476	jnz	L(exit)
1477
	/* Test the pair that was already prepared in %xmm0/%xmm3.  */
1478	pmovmskb %xmm3, %edx
1479	movdqa	%xmm0, %xmm1
1480	lea	32(%edi), %edi
1481	lea	32(%esi), %esi
1482	sub	$0xffff, %edx
1483	jnz	L(exit)
1484
1485	lea	(%ecx, %edi,1), %eax
1486	lea	15(%ecx, %esi,1), %edx
1487	POP	(%edi)
1488	POP	(%esi)
1489	jmp	L(less48bytes)
1490# endif
1491
1492	cfi_restore_state
1493	cfi_remember_state
1494	.p2align 4
1495L(exit):
1496	pmovmskb %xmm1, %ebx
1497	sub	$0xffff, %ebx
1498	jz	L(first16bytes)
1499	lea	-16(%esi), %esi
1500	lea	-16(%edi), %edi
1501	mov	%ebx, %edx
1502
1503L(first16bytes):
1504	add	%eax, %esi
1505L(less16bytes):
1506
1507# ifndef USE_AS_WMEMCMP
1508	test	%dl, %dl
1509	jz	L(next_24_bytes)
1510
1511	test	$0x01, %dl
1512	jnz	L(Byte16)
1513
1514	test	$0x02, %dl
1515	jnz	L(Byte17)
1516
1517	test	$0x04, %dl
1518	jnz	L(Byte18)
1519
1520	test	$0x08, %dl
1521	jnz	L(Byte19)
1522
1523	test	$0x10, %dl
1524	jnz	L(Byte20)
1525
1526	test	$0x20, %dl
1527	jnz	L(Byte21)
1528
1529	test	$0x40, %dl
1530	jnz	L(Byte22)
1531L(Byte23):
1532	movzbl	-9(%edi), %eax
1533	movzbl	-9(%esi), %edx
1534	sub	%edx, %eax
1535	RETURN
1536
/* L(Byte16) .. L(Byte22): per-byte result handlers for memcmp.  Each one
   zero-extends the byte at a fixed negative offset from %edi and from
   %esi (-16 for L(Byte16) up to -10 for L(Byte22)) and returns
   %edi-byte minus %esi-byte in %eax.  RETURN pops %edi, %esi and %ebx
   before the ret and then resets the CFI state for the code that
   follows.  */
1537	.p2align 4
1538L(Byte16):
1539	movzbl	-16(%edi), %eax
1540	movzbl	-16(%esi), %edx
1541	sub	%edx, %eax
1542	RETURN
1543
1544	.p2align 4
1545L(Byte17):
1546	movzbl	-15(%edi), %eax
1547	movzbl	-15(%esi), %edx
1548	sub	%edx, %eax
1549	RETURN
1550
1551	.p2align 4
1552L(Byte18):
1553	movzbl	-14(%edi), %eax
1554	movzbl	-14(%esi), %edx
1555	sub	%edx, %eax
1556	RETURN
1557
1558	.p2align 4
1559L(Byte19):
1560	movzbl	-13(%edi), %eax
1561	movzbl	-13(%esi), %edx
1562	sub	%edx, %eax
1563	RETURN
1564
1565	.p2align 4
1566L(Byte20):
1567	movzbl	-12(%edi), %eax
1568	movzbl	-12(%esi), %edx
1569	sub	%edx, %eax
1570	RETURN
1571
1572	.p2align 4
1573L(Byte21):
1574	movzbl	-11(%edi), %eax
1575	movzbl	-11(%esi), %edx
1576	sub	%edx, %eax
1577	RETURN
1578
1579	.p2align 4
1580L(Byte22):
1581	movzbl	-10(%edi), %eax
1582	movzbl	-10(%esi), %edx
1583	sub	%edx, %eax
1584	RETURN
1585
/* L(next_24_bytes): reached from L(less16bytes) when %dl is zero, i.e. the
   first differing byte is one of bytes 8-15 and its bit is in %dh.
   Both pointers are moved forward by 8 so that the L(Byte16)..L(Byte22)
   handlers (offsets -16 .. -10) can be reused for bits 0-6 of %dh.  */
1586	.p2align 4
1587L(next_24_bytes):
1588	lea	8(%edi), %edi
1589	lea	8(%esi), %esi
1590	test	$0x01, %dh
1591	jnz	L(Byte16)
1592
1593	test	$0x02, %dh
1594	jnz	L(Byte17)
1595
1596	test	$0x04, %dh
1597	jnz	L(Byte18)
1598
1599	test	$0x08, %dh
1600	jnz	L(Byte19)
1601
1602	test	$0x10, %dh
1603	jnz	L(Byte20)
1604
1605	test	$0x20, %dh
1606	jnz	L(Byte21)
1607
1608	test	$0x40, %dh
1609	jnz	L(Byte22)
1610
	/* None of bits 0-6 matched: take the byte at offset -9 from the
	   advanced pointers.  RETURN_END is RETURN without the trailing
	   cfi_restore_state / cfi_remember_state pair.  */
1611	.p2align 4
1612L(Byte31):
1613	movzbl	-9(%edi), %eax
1614	movzbl	-9(%esi), %edx
1615	sub	%edx, %eax
1616	RETURN_END
1617# else
1618
1619/* special for wmemcmp */
/* wmemcmp tail of L(less16bytes): the 16 bytes that end at %edi / %esi
   hold four 4-byte elements.  %dl / %dh (low and high byte of the biased
   mask in %edx) select the element that contains the first mismatch;
   that element is then compared as a signed dword and L(nequal) turns
   the flags into +1 / -1.

   NOTE(review): when one of the cmp instructions below reports equality,
   RETURN runs with %eax still holding the element just loaded, not the
   0 written by the xor.  The mask bits that select each path indicate a
   differing byte inside that element, so these fall-throughs look
   unreachable -- confirm.  */
1620	xor	%eax, %eax
	/* %dl == 0: nothing differs in bytes 0-7, look at elements 2 and 3.  */
1621	test	%dl, %dl
1622	jz	L(next_two_double_words)
	/* Low nibble of %dl covers element 0 (bytes 0-3).  */
1623	and	$15, %dl
1624	jz	L(second_double_word)
1625	mov	-16(%edi), %eax
1626	cmp	-16(%esi), %eax
1627	jne	L(nequal)
1628	RETURN
1629
1630	.p2align 4
1631L(second_double_word):
1632	mov	-12(%edi), %eax
1633	cmp	-12(%esi), %eax
1634	jne	L(nequal)
1635	RETURN
1636
1637	.p2align 4
1638L(next_two_double_words):
	/* Low nibble of %dh covers element 2 (bytes 8-11).  */
1639	and	$15, %dh
1640	jz	L(fourth_double_word)
1641	mov	-8(%edi), %eax
1642	cmp	-8(%esi), %eax
1643	jne	L(nequal)
1644	RETURN
1645
1646	.p2align 4
1647L(fourth_double_word):
1648	mov	-4(%edi), %eax
1649	cmp	-4(%esi), %eax
1650	jne	L(nequal)
1651	RETURN
1652
	/* Entered with the flags of "cmp <%esi element>, <%edi element>".
	   mov does not write EFLAGS, so jg still tests that compare: signed
	   greater returns 1, otherwise the 1 is negated to -1.  */
1653	.p2align 4
1654L(nequal):
1655	mov	$1, %eax
1656	jg	L(nequal_bigger)
1657	neg	%eax
1658	RETURN
1659
1660	.p2align 4
1661L(nequal_bigger):
1662	RETURN_END
1663# endif
1664
/* From here on the code runs with only %ebx pushed (the entry path does
   PUSH (%ebx) before jumping to L(less48bytes)).  CFI_PUSH emits unwind
   directives only, no instruction.  */
1665	CFI_PUSH (%ebx)
1666
/* L(more8bytes): length dispatch for %ecx >= 8 (reached from
   L(less48bytes)).  Lengths of 16 and above are passed on; otherwise
   jump to the L(<n>bytes) label for the exact length.  Those labels read
   at negative offsets from %eax / %edx.  */
1667	.p2align 4
1668L(more8bytes):
1669	cmp	$16, %ecx
1670	jae	L(more16bytes)
1671	cmp	$8, %ecx
1672	je	L(8bytes)
1673# ifndef USE_AS_WMEMCMP
1674	cmp	$9, %ecx
1675	je	L(9bytes)
1676	cmp	$10, %ecx
1677	je	L(10bytes)
1678	cmp	$11, %ecx
1679	je	L(11bytes)
1680	cmp	$12, %ecx
1681	je	L(12bytes)
1682	cmp	$13, %ecx
1683	je	L(13bytes)
1684	cmp	$14, %ecx
1685	je	L(14bytes)
	/* Only 15 is left in the range 8..15.  */
1686	jmp	L(15bytes)
1687# else
	/* wmemcmp scales the element count by 4 at entry, so 12 is the only
	   other length expected in this range.  */
1688	jmp	L(12bytes)
1689# endif
1690
/* L(more16bytes): length dispatch for %ecx >= 16.  Lengths of 24 and
   above are passed on to L(more24bytes); 16..23 jump to the matching
   L(<n>bytes) label.  */
1691	.p2align 4
1692L(more16bytes):
1693	cmp	$24, %ecx
1694	jae	L(more24bytes)
1695	cmp	$16, %ecx
1696	je	L(16bytes)
1697# ifndef USE_AS_WMEMCMP
1698	cmp	$17, %ecx
1699	je	L(17bytes)
1700	cmp	$18, %ecx
1701	je	L(18bytes)
1702	cmp	$19, %ecx
1703	je	L(19bytes)
1704	cmp	$20, %ecx
1705	je	L(20bytes)
1706	cmp	$21, %ecx
1707	je	L(21bytes)
1708	cmp	$22, %ecx
1709	je	L(22bytes)
	/* Only 23 is left in the range 16..23.  */
1710	jmp	L(23bytes)
1711# else
	/* wmemcmp lengths are multiples of 4 (scaled at entry), so 20 is the
	   only other length expected in this range.  */
1712	jmp	L(20bytes)
1713# endif
1714
/* L(more24bytes): length dispatch for %ecx >= 24.  Lengths of 32 and
   above are passed on to L(more32bytes); 24..31 jump to the matching
   L(<n>bytes) label.  */
1715	.p2align 4
1716L(more24bytes):
1717	cmp	$32, %ecx
1718	jae	L(more32bytes)
1719	cmp	$24, %ecx
1720	je	L(24bytes)
1721# ifndef USE_AS_WMEMCMP
1722	cmp	$25, %ecx
1723	je	L(25bytes)
1724	cmp	$26, %ecx
1725	je	L(26bytes)
1726	cmp	$27, %ecx
1727	je	L(27bytes)
1728	cmp	$28, %ecx
1729	je	L(28bytes)
1730	cmp	$29, %ecx
1731	je	L(29bytes)
1732	cmp	$30, %ecx
1733	je	L(30bytes)
	/* Only 31 is left in the range 24..31.  */
1734	jmp	L(31bytes)
1735# else
	/* wmemcmp lengths are multiples of 4 (scaled at entry), so 28 is the
	   only other length expected in this range.  */
1736	jmp	L(28bytes)
1737# endif
1738
/* L(more32bytes): length dispatch for %ecx >= 32.  Lengths of 40 and
   above are passed on to L(more40bytes); 32..39 jump to the matching
   L(<n>bytes) label.  */
1739	.p2align 4
1740L(more32bytes):
1741	cmp	$40, %ecx
1742	jae	L(more40bytes)
1743	cmp	$32, %ecx
1744	je	L(32bytes)
1745# ifndef USE_AS_WMEMCMP
1746	cmp	$33, %ecx
1747	je	L(33bytes)
1748	cmp	$34, %ecx
1749	je	L(34bytes)
1750	cmp	$35, %ecx
1751	je	L(35bytes)
1752	cmp	$36, %ecx
1753	je	L(36bytes)
1754	cmp	$37, %ecx
1755	je	L(37bytes)
1756	cmp	$38, %ecx
1757	je	L(38bytes)
	/* Only 39 is left in the range 32..39.  */
1758	jmp	L(39bytes)
1759# else
	/* wmemcmp lengths are multiples of 4 (scaled at entry), so 36 is the
	   only other length expected in this range.  */
1760	jmp	L(36bytes)
1761# endif
1762
/* L(less48bytes): compare the last %ecx bytes of both buffers.
   In:  %ecx = number of bytes left to compare
        %eax = one past the last byte of the first buffer
        %edx = one past the last byte of the second buffer
        %ebx is pushed and is popped by the code that returns.
   The function entry jumps here for lengths below 48, and the gobble
   loops jump here for their remainder.  Lengths of 8 and above go to
   L(more8bytes); shorter ones are dispatched here.  */
1763	.p2align 4
1764L(less48bytes):
1765	cmp	$8, %ecx
1766	jae	L(more8bytes)
1767# ifndef USE_AS_WMEMCMP
1768	cmp	$2, %ecx
1769	je	L(2bytes)
1770	cmp	$3, %ecx
1771	je	L(3bytes)
1772	cmp	$4, %ecx
1773	je	L(4bytes)
1774	cmp	$5, %ecx
1775	je	L(5bytes)
1776	cmp	$6, %ecx
1777	je	L(6bytes)
	/* Everything else below 8 lands here.  The function entry filters
	   lengths 0 and 1 before jumping to L(less48bytes).
	   NOTE(review): a jump from elsewhere with %ecx < 2 would also
	   take L(7bytes) -- confirm that is intended for those callers.  */
1778	jmp	L(7bytes)
1779# else
	/* wmemcmp: any length below 8 is handled as a single element.  */
1780	jmp	L(4bytes)
1781# endif
1782
/* L(more40bytes): length dispatch for %ecx >= 40 (reached from
   L(more32bytes)).  Exactly 40 goes to L(40bytes).  The remaining
   lengths are handled by the code that follows this label: an explicit
   compare ladder for memcmp, or a fall-through into L(44bytes) for
   wmemcmp.  */
1783	.p2align 4
1784L(more40bytes):
1785	cmp	$40, %ecx
1786	je	L(40bytes)
1787# ifndef USE_AS_WMEMCMP
	/* memcmp only: continuation of L(more40bytes) for lengths 41..47.
	   Each exact length jumps to its L(<n>bytes) label; 47 is the one
	   left after 41..46 have been tested.  */
1788	cmp	$41, %ecx
1789	je	L(41bytes)
1790	cmp	$42, %ecx
1791	je	L(42bytes)
1792	cmp	$43, %ecx
1793	je	L(43bytes)
1794	cmp	$44, %ecx
1795	je	L(44bytes)
1796	cmp	$45, %ecx
1797	je	L(45bytes)
1798	cmp	$46, %ecx
1799	je	L(46bytes)
1800	jmp	L(47bytes)
1801
/* memcmp, lengths that are multiples of 4 (44 down to 4).  Each label
   compares one dword at a negative offset from the end pointers
   (%eax = first buffer, %edx = second buffer) and falls through to the
   next shorter length.  A differing pair is handed to L(find_diff) with
   the first buffer's dword in %ecx and the second buffer's in %ebx.  */
1802	.p2align 4
1803L(44bytes):
1804	mov	-44(%eax), %ecx
1805	mov	-44(%edx), %ebx
1806	cmp	%ebx, %ecx
1807	jne	L(find_diff)
1808L(40bytes):
1809	mov	-40(%eax), %ecx
1810	mov	-40(%edx), %ebx
1811	cmp	%ebx, %ecx
1812	jne	L(find_diff)
1813L(36bytes):
1814	mov	-36(%eax), %ecx
1815	mov	-36(%edx), %ebx
1816	cmp	%ebx, %ecx
1817	jne	L(find_diff)
1818L(32bytes):
1819	mov	-32(%eax), %ecx
1820	mov	-32(%edx), %ebx
1821	cmp	%ebx, %ecx
1822	jne	L(find_diff)
1823L(28bytes):
1824	mov	-28(%eax), %ecx
1825	mov	-28(%edx), %ebx
1826	cmp	%ebx, %ecx
1827	jne	L(find_diff)
1828L(24bytes):
1829	mov	-24(%eax), %ecx
1830	mov	-24(%edx), %ebx
1831	cmp	%ebx, %ecx
1832	jne	L(find_diff)
1833L(20bytes):
1834	mov	-20(%eax), %ecx
1835	mov	-20(%edx), %ebx
1836	cmp	%ebx, %ecx
1837	jne	L(find_diff)
1838L(16bytes):
1839	mov	-16(%eax), %ecx
1840	mov	-16(%edx), %ebx
1841	cmp	%ebx, %ecx
1842	jne	L(find_diff)
1843L(12bytes):
1844	mov	-12(%eax), %ecx
1845	mov	-12(%edx), %ebx
1846	cmp	%ebx, %ecx
1847	jne	L(find_diff)
1848L(8bytes):
1849	mov	-8(%eax), %ecx
1850	mov	-8(%edx), %ebx
1851	cmp	%ebx, %ecx
1852	jne	L(find_diff)
1853L(4bytes):
1854	mov	-4(%eax), %ecx
1855	mov	-4(%edx), %ebx
1856	cmp	%ebx, %ecx
	/* mov does not write EFLAGS, so the jne still tests the cmp above;
	   %eax = 0 is the result when the last dword matches as well.  */
1857	mov	$0, %eax
1858	jne	L(find_diff)
1859	POP	(%ebx)
1860	ret
	/* Directives only: the labels after this ret are again entered
	   with %ebx pushed.  */
1861	CFI_PUSH (%ebx)
1862# else
/* wmemcmp, 11 elements down to 1 (44 .. 4 bytes).  Each label loads one
   element of the first buffer (end pointer %eax) into %ecx and compares
   it directly with the matching element of the second buffer (end
   pointer %edx), falling through to the next shorter length.  On a
   mismatch the flags of that cmp are consumed by L(find_diff), which
   uses a signed condition.  */
1863	.p2align 4
1864L(44bytes):
1865	mov	-44(%eax), %ecx
1866	cmp	-44(%edx), %ecx
1867	jne	L(find_diff)
1868L(40bytes):
1869	mov	-40(%eax), %ecx
1870	cmp	-40(%edx), %ecx
1871	jne	L(find_diff)
1872L(36bytes):
1873	mov	-36(%eax), %ecx
1874	cmp	-36(%edx), %ecx
1875	jne	L(find_diff)
1876L(32bytes):
1877	mov	-32(%eax), %ecx
1878	cmp	-32(%edx), %ecx
1879	jne	L(find_diff)
1880L(28bytes):
1881	mov	-28(%eax), %ecx
1882	cmp	-28(%edx), %ecx
1883	jne	L(find_diff)
1884L(24bytes):
1885	mov	-24(%eax), %ecx
1886	cmp	-24(%edx), %ecx
1887	jne	L(find_diff)
1888L(20bytes):
1889	mov	-20(%eax), %ecx
1890	cmp	-20(%edx), %ecx
1891	jne	L(find_diff)
1892L(16bytes):
1893	mov	-16(%eax), %ecx
1894	cmp	-16(%edx), %ecx
1895	jne	L(find_diff)
1896L(12bytes):
1897	mov	-12(%eax), %ecx
1898	cmp	-12(%edx), %ecx
1899	jne	L(find_diff)
1900L(8bytes):
1901	mov	-8(%eax), %ecx
1902	cmp	-8(%edx), %ecx
1903	jne	L(find_diff)
1904L(4bytes):
1905	mov	-4(%eax), %ecx
	/* Zero the result before the cmp: xor writes EFLAGS, so it has to
	   come ahead of the compare whose flags the jne tests.  */
1906	xor	%eax, %eax
1907	cmp	-4(%edx), %ecx
1908	jne	L(find_diff)
1909	POP	(%ebx)
1910	ret
	/* Directives only: the labels after this ret are again entered
	   with %ebx pushed.  */
1911	CFI_PUSH (%ebx)
1912# endif
1913
1914# ifndef USE_AS_WMEMCMP
1915
/* memcmp, lengths of the form 4k+1 (45 down to 5).  Each label compares
   one dword at a negative offset from the end pointers %eax / %edx and
   falls through to the next shorter length; a differing pair goes to
   L(find_diff) in %ecx / %ebx.  After the dword at -5 the single last
   byte at -1 is compared.  */
1916	.p2align 4
1917L(45bytes):
1918	mov	-45(%eax), %ecx
1919	mov	-45(%edx), %ebx
1920	cmp	%ebx, %ecx
1921	jne	L(find_diff)
1922L(41bytes):
1923	mov	-41(%eax), %ecx
1924	mov	-41(%edx), %ebx
1925	cmp	%ebx, %ecx
1926	jne	L(find_diff)
1927L(37bytes):
1928	mov	-37(%eax), %ecx
1929	mov	-37(%edx), %ebx
1930	cmp	%ebx, %ecx
1931	jne	L(find_diff)
1932L(33bytes):
1933	mov	-33(%eax), %ecx
1934	mov	-33(%edx), %ebx
1935	cmp	%ebx, %ecx
1936	jne	L(find_diff)
1937L(29bytes):
1938	mov	-29(%eax), %ecx
1939	mov	-29(%edx), %ebx
1940	cmp	%ebx, %ecx
1941	jne	L(find_diff)
1942L(25bytes):
1943	mov	-25(%eax), %ecx
1944	mov	-25(%edx), %ebx
1945	cmp	%ebx, %ecx
1946	jne	L(find_diff)
1947L(21bytes):
1948	mov	-21(%eax), %ecx
1949	mov	-21(%edx), %ebx
1950	cmp	%ebx, %ecx
1951	jne	L(find_diff)
1952L(17bytes):
1953	mov	-17(%eax), %ecx
1954	mov	-17(%edx), %ebx
1955	cmp	%ebx, %ecx
1956	jne	L(find_diff)
1957L(13bytes):
1958	mov	-13(%eax), %ecx
1959	mov	-13(%edx), %ebx
1960	cmp	%ebx, %ecx
1961	jne	L(find_diff)
1962L(9bytes):
1963	mov	-9(%eax), %ecx
1964	mov	-9(%edx), %ebx
1965	cmp	%ebx, %ecx
1966	jne	L(find_diff)
1967L(5bytes):
1968	mov	-5(%eax), %ecx
1969	mov	-5(%edx), %ebx
1970	cmp	%ebx, %ecx
1971	jne	L(find_diff)
	/* Last byte.  The mov of 0 into %eax does not write EFLAGS, so the
	   jne (and the ja at L(end)) still see this byte compare.  */
1972	movzbl	-1(%eax), %ecx
1973	cmp	-1(%edx), %cl
1974	mov	$0, %eax
1975	jne	L(end)
1976	POP	(%ebx)
1977	ret
	/* Directives only: the labels after this ret are again entered
	   with %ebx pushed.  */
1978	CFI_PUSH (%ebx)
1979
/* memcmp, lengths of the form 4k+2 (46 down to 6, then 2).  Each label
   compares one dword at a negative offset from the end pointers
   %eax / %edx and falls through to the next shorter length; a differing
   pair goes to L(find_diff) in %ecx / %ebx.  L(2bytes) compares the
   final two bytes one at a time.  */
1980	.p2align 4
1981L(46bytes):
1982	mov	-46(%eax), %ecx
1983	mov	-46(%edx), %ebx
1984	cmp	%ebx, %ecx
1985	jne	L(find_diff)
1986L(42bytes):
1987	mov	-42(%eax), %ecx
1988	mov	-42(%edx), %ebx
1989	cmp	%ebx, %ecx
1990	jne	L(find_diff)
1991L(38bytes):
1992	mov	-38(%eax), %ecx
1993	mov	-38(%edx), %ebx
1994	cmp	%ebx, %ecx
1995	jne	L(find_diff)
1996L(34bytes):
1997	mov	-34(%eax), %ecx
1998	mov	-34(%edx), %ebx
1999	cmp	%ebx, %ecx
2000	jne	L(find_diff)
2001L(30bytes):
2002	mov	-30(%eax), %ecx
2003	mov	-30(%edx), %ebx
2004	cmp	%ebx, %ecx
2005	jne	L(find_diff)
2006L(26bytes):
2007	mov	-26(%eax), %ecx
2008	mov	-26(%edx), %ebx
2009	cmp	%ebx, %ecx
2010	jne	L(find_diff)
2011L(22bytes):
2012	mov	-22(%eax), %ecx
2013	mov	-22(%edx), %ebx
2014	cmp	%ebx, %ecx
2015	jne	L(find_diff)
2016L(18bytes):
2017	mov	-18(%eax), %ecx
2018	mov	-18(%edx), %ebx
2019	cmp	%ebx, %ecx
2020	jne	L(find_diff)
2021L(14bytes):
2022	mov	-14(%eax), %ecx
2023	mov	-14(%edx), %ebx
2024	cmp	%ebx, %ecx
2025	jne	L(find_diff)
2026L(10bytes):
2027	mov	-10(%eax), %ecx
2028	mov	-10(%edx), %ebx
2029	cmp	%ebx, %ecx
2030	jne	L(find_diff)
2031L(6bytes):
2032	mov	-6(%eax), %ecx
2033	mov	-6(%edx), %ebx
2034	cmp	%ebx, %ecx
2035	jne	L(find_diff)
2036L(2bytes):
	/* Load the last two bytes of each buffer.  %cl / %bl hold the byte
	   at -2 (lower address), %ch / %bh the byte at -1.  */
2037	movzwl	-2(%eax), %ecx
2038	movzwl	-2(%edx), %ebx
2039	cmp	%bl, %cl
2040	jne	L(end)
2041	cmp	%bh, %ch
	/* mov does not write EFLAGS; the jne still tests the cmp above.  */
2042	mov	$0, %eax
2043	jne	L(end)
2044	POP	(%ebx)
2045	ret
	/* Directives only: the labels after this ret are again entered
	   with %ebx pushed.  */
2046	CFI_PUSH (%ebx)
2047
/* memcmp, lengths of the form 4k+3 (47 down to 7, then 3).  Each label
   compares one dword at a negative offset from the end pointers
   %eax / %edx and falls through to the next shorter length; a differing
   pair goes to L(find_diff) in %ecx / %ebx.  L(3bytes) compares the
   final three bytes.  */
2048	.p2align 4
2049L(47bytes):
2050	movl	-47(%eax), %ecx
2051	movl	-47(%edx), %ebx
2052	cmp	%ebx, %ecx
2053	jne	L(find_diff)
2054L(43bytes):
2055	movl	-43(%eax), %ecx
2056	movl	-43(%edx), %ebx
2057	cmp	%ebx, %ecx
2058	jne	L(find_diff)
2059L(39bytes):
2060	movl	-39(%eax), %ecx
2061	movl	-39(%edx), %ebx
2062	cmp	%ebx, %ecx
2063	jne	L(find_diff)
2064L(35bytes):
2065	movl	-35(%eax), %ecx
2066	movl	-35(%edx), %ebx
2067	cmp	%ebx, %ecx
2068	jne	L(find_diff)
2069L(31bytes):
2070	movl	-31(%eax), %ecx
2071	movl	-31(%edx), %ebx
2072	cmp	%ebx, %ecx
2073	jne	L(find_diff)
2074L(27bytes):
2075	movl	-27(%eax), %ecx
2076	movl	-27(%edx), %ebx
2077	cmp	%ebx, %ecx
2078	jne	L(find_diff)
2079L(23bytes):
2080	movl	-23(%eax), %ecx
2081	movl	-23(%edx), %ebx
2082	cmp	%ebx, %ecx
2083	jne	L(find_diff)
2084L(19bytes):
2085	movl	-19(%eax), %ecx
2086	movl	-19(%edx), %ebx
2087	cmp	%ebx, %ecx
2088	jne	L(find_diff)
2089L(15bytes):
2090	movl	-15(%eax), %ecx
2091	movl	-15(%edx), %ebx
2092	cmp	%ebx, %ecx
2093	jne	L(find_diff)
2094L(11bytes):
2095	movl	-11(%eax), %ecx
2096	movl	-11(%edx), %ebx
2097	cmp	%ebx, %ecx
2098	jne	L(find_diff)
2099L(7bytes):
2100	movl	-7(%eax), %ecx
2101	movl	-7(%edx), %ebx
2102	cmp	%ebx, %ecx
2103	jne	L(find_diff)
2104L(3bytes):
	/* Bytes at -3 and -2 as one zero-extended word per buffer.  The
	   low byte (lower address) is compared first; once it is equal the
	   16-bit compare is decided by the byte at -2.  */
2105	movzwl	-3(%eax), %ecx
2106	movzwl	-3(%edx), %ebx
2107	cmpb	%bl, %cl
2108	jne	L(end)
2109	cmp	%bx, %cx
2110	jne	L(end)
	/* Last byte.  %eax is overwritten here, so the first buffer's end
	   pointer is gone after this load.  The mov of 0 does not write
	   EFLAGS, so the jne still tests the cmpb.  */
2111	movzbl	-1(%eax), %eax
2112	cmpb	-1(%edx), %al
2113	mov	$0, %eax
2114	jne	L(end)
2115	POP	(%ebx)
2116	ret
	/* Directives only: the labels after this ret are again entered
	   with %ebx pushed.  */
2117	CFI_PUSH (%ebx)
2118
/* L(find_diff) (memcmp): %ecx and %ebx hold two dwords, loaded from the
   first and second buffer respectively, that compared unequal.  Find the
   first differing byte in memory order (lowest byte of the register
   first) and leave the flags of that compare for L(end).
   L(end): turn the flags of the last compare into the result, 1 when the
   first buffer's byte is above (unsigned) the second's, else -1.  */
2119	.p2align 4
2120L(find_diff):
	/* Byte 0, then bytes 0-1 as a word: with byte 0 equal, the word
	   compare is decided by byte 1.  */
2121	cmpb	%bl, %cl
2122	jne	L(end)
2123	cmp	%bx, %cx
2124	jne	L(end)
	/* Same two steps for bytes 2 and 3.  */
2125	shr	$16,%ecx
2126	shr	$16,%ebx
2127	cmp	%bl, %cl
2128	jne	L(end)
	/* The flags of this last compare are consumed by L(end), which
	   execution falls into.  */
2129	cmp	%bx, %cx
2130
2131	.p2align 4
2132L(end):
	/* Neither popl nor mov writes EFLAGS, so the ja below still tests
	   the compare that led here.  */
2133	POP	(%ebx)
2134	mov	$1, %eax
2135	ja	L(bigger)
2136	neg	%eax
2137L(bigger):
2138	ret
2139# else
2140
2141/* for wmemcmp */
/* L(find_diff) (wmemcmp): entered with the flags of a
   "cmp <second buffer element>, <first buffer element>" that reported
   inequality.  Restores %ebx and returns 1 when the first buffer's
   element is greater as a signed value, otherwise -1.  popl and mov do
   not write EFLAGS, so the jg still tests that compare.  */
2142	.p2align 4
2143L(find_diff):
2144	POP	(%ebx)
2145	mov	$1, %eax
2146	jg	L(find_diff_bigger)
2147	neg	%eax
2148	ret
2149
2150	.p2align 4
2151L(find_diff_bigger):
2152	ret
2153
2154# endif
2155END (MEMCMP)
2156#endif
2157