/* memcmp with SSE4.2, wmemcmp with SSE4.2
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
#if IS_IN (libc)

# include <sysdep.h>

/* Default name of the entry point.  MEMCMP may be predefined before this
   file is preprocessed to build the routine under another name; the
   USE_AS_WMEMCMP macro tested throughout selects the wide-character
   variant.  */
# ifndef MEMCMP
#  define MEMCMP	__memcmp_sse4_2
# endif
26
/* Unwind annotation for a 4-byte push of REG: the CFA moves by 4 and
   REG is recorded as saved at offset 0 from the stack pointer.  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)
30
/* Unwind annotation for a 4-byte pop of REG: the CFA moves back by 4
   and REG is marked as restored.  */
# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)
34
/* Push/pop a 32-bit register together with the matching unwind
   annotation.  */
# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)
37
/* Offsets from ESP of the three arguments as read at function entry,
   before EBX is pushed.  */
# define PARMS	4
# define BLK1	PARMS
# define BLK2	BLK1 + 4
# define LEN	BLK2 + 4
/* Restore EBX and return.  The trailing CFI_PUSH puts the unwind state
   back to "EBX is saved on the stack" for the code that textually
   follows the RETURN and is entered with EBX still pushed.  */
# define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)
43
44
# ifdef PIC
/* In PIC code a jump-table entry I is stored as an offset relative to
   the table base B.  */
#  define JMPTBL(I, B)	I - B

/* Load an entry of a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register that contains
   the index into the jump table.  SCALE is the scale of INDEX.
   EBX is overwritten.  */

#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
/* We first load PC into EBX.  */	\
	SETUP_PIC_REG(bx);	\
/* Get the address of the jump table.  */	\
	addl	$(TABLE - .), %ebx;	\
/* Get the entry and convert the relative offset to the	\
   absolute address.  */	\
	addl	(%ebx,INDEX,SCALE), %ebx;	\
/* EBX now holds the absolute address of the target.  Go.  */	\
	_CET_NOTRACK jmp *%ebx
# else
/* In non-PIC code a jump-table entry is the target label itself.  */
#  define JMPTBL(I, B)	I

/* Branch through an entry of a jump table.  TABLE is a jump table
   whose entries are the target labels.  INDEX is a register that
   contains the index into the jump table.  SCALE is the scale of
   INDEX.  */
#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
	_CET_NOTRACK jmp *TABLE(,INDEX,SCALE)
# endif
71
72
/* Warning!
           wmemcmp has to use SIGNED comparison for elements.
           memcmp has to use UNSIGNED comparison for elements.
*/
77
/* Entry point.  The three arguments are read from the stack:
     EAX = first buffer  (BLK1)
     EDX = second buffer (BLK2)
     ECX = length        (LEN); with USE_AS_WMEMCMP it is multiplied
	   by 4 so that ECX is a byte count from here on.
   XMM0 is cleared and used as the all-zero operand of every ptest.
   The result is returned in EAX.  EBX is used as scratch and is saved
   with PUSH before use and restored on every exit path.  */
	.section .text.sse4.2,"ax",@progbits
ENTRY (MEMCMP)
	movl	BLK1(%esp), %eax
	movl	BLK2(%esp), %edx
	movl	LEN(%esp), %ecx

# ifdef USE_AS_WMEMCMP
	/* Element count -> byte count; nothing to compare if it is 0.  */
	shl	$2, %ecx
	test	%ecx, %ecx
	jz	L(return0)
# else
	/* Lengths 0 and 1 are handled without touching EBX.  The flags
	   of this compare are consumed again at L(less1bytes).  */
	cmp	$1, %ecx
	jbe	L(less1bytes)
# endif

	pxor	%xmm0, %xmm0
	cmp	$64, %ecx
	ja	L(64bytesormore)
	cmp	$8, %ecx

	/* pushl leaves the flags of "cmp $8, %ecx" intact.  The memcmp
	   short path uses EBX, so it is saved before the branch; the
	   wmemcmp short path does not, so it is saved after.  */
# ifndef USE_AS_WMEMCMP
	PUSH	(%ebx)
	jb	L(less8bytes)
# else
	jb	L(less8bytes)
	PUSH	(%ebx)
# endif

	/* 8..64 bytes: point EAX/EDX just past the end of each buffer
	   and dispatch on the byte count.  The L(Nbytes) handlers
	   address the data with negative offsets from these pointers.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
109
# ifndef USE_AS_WMEMCMP
/* memcmp, 2 <= ECX < 8, EBX already pushed.  Compare one byte at a
   time from the start of the buffers; ECX is checked after each byte
   from the second one on, so at most 7 bytes are read.  */
	.p2align 4
L(less8bytes):
	mov	(%eax), %bl
	cmpb	(%edx), %bl
	jne	L(nonzero)

	mov	1(%eax), %bl
	cmpb	1(%edx), %bl
	jne	L(nonzero)

	cmp	$2, %ecx
	jz	L(0bytes)

	mov	2(%eax), %bl
	cmpb	2(%edx), %bl
	jne	L(nonzero)

	cmp	$3, %ecx
	jz	L(0bytes)

	mov	3(%eax), %bl
	cmpb	3(%edx), %bl
	jne	L(nonzero)

	cmp	$4, %ecx
	jz	L(0bytes)

	mov	4(%eax), %bl
	cmpb	4(%edx), %bl
	jne	L(nonzero)

	cmp	$5, %ecx
	jz	L(0bytes)

	mov	5(%eax), %bl
	cmpb	5(%edx), %bl
	jne	L(nonzero)

	cmp	$6, %ecx
	jz	L(0bytes)

	mov	6(%eax), %bl
	cmpb	6(%edx), %bl
	je	L(0bytes)

/* The flags still describe the unsigned byte compare (first buffer
   minus second buffer); popl and mov do not change them.  Return 1 if
   the byte of the first buffer is above, otherwise -1.  */
L(nonzero):
	POP	(%ebx)
	mov	$1, %eax
	ja	L(above)
	neg	%eax
L(above):
	ret
	/* The code that follows is entered with EBX still pushed.  */
	CFI_PUSH (%ebx)
# endif
165
/* Nothing left to compare and no difference found: restore EBX and
   return 0.  Also the jump-table target for a remaining count of 0.  */
	.p2align 4
L(0bytes):
	POP	(%ebx)
	xor	%eax, %eax
	ret
171
# ifdef USE_AS_WMEMCMP

/* for wmemcmp, case N == 1 */

/* Entered with EBX not pushed.  Compare the single 32-bit element with
   a signed compare: return 0 if equal, 1 if the element of the first
   buffer is greater, otherwise -1.  */
	.p2align 4
L(less8bytes):
	mov	(%eax), %ecx
	cmp	(%edx), %ecx
	je	L(return0)
	/* mov does not alter the flags of the compare above.  */
	mov	$1, %eax
	jg	L(find_diff_bigger)
	neg	%eax
	ret

	.p2align 4
L(find_diff_bigger):
	ret

/* Return 0 without touching the stack (EBX was never pushed).  */
	.p2align 4
L(return0):
	xor	%eax, %eax
	ret
# endif
195
# ifndef USE_AS_WMEMCMP
/* memcmp with ECX <= 1, EBX not pushed.  The flags are still those of
   "cmp $1, %ecx" at the entry: below means the length is 0.  For a
   length of 1 the result is the difference of the two zero-extended
   bytes.  */
	.p2align 4
L(less1bytes):
	jb	L(0bytesend)
	movzbl	(%eax), %eax
	movzbl	(%edx), %edx
	sub	%edx, %eax
	ret

	.p2align 4
L(0bytesend):
	xor	%eax, %eax
	ret
# endif
/* More than 64 bytes.  EBX is saved and then holds the byte count minus
   64; ECX holds the constant step 64.  Each iteration XORs four
   unaligned 16-byte chunks of the two buffers.  XMM0 is zero, so
   "ptest %xmm2, %xmm0" sets CF only when XMM2 is all zero; jnc is
   therefore taken when the chunks differ.  */
	.p2align 4
L(64bytesormore):
	PUSH	(%ebx)
	mov	%ecx, %ebx
	mov	$64, %ecx
	sub	$64, %ebx
L(64bytesormore_loop):
	movdqu	(%eax), %xmm1
	movdqu	(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_16diff)

	movdqu	16(%eax), %xmm1
	movdqu	16(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_32diff)

	movdqu	32(%eax), %xmm1
	movdqu	32(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_48diff)

	movdqu	48(%eax), %xmm1
	movdqu	48(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_64diff)
	/* Advance both pointers by 64 and loop while the subtraction
	   does not borrow, i.e. while at least 64 bytes remain.  */
	add	%ecx, %eax
	add	%ecx, %edx
	sub	%ecx, %ebx
	jae	L(64bytesormore_loop)
	/* ECX = 64 + EBX = number of bytes left (below 64).  Point
	   EAX/EDX past the end of the buffers and dispatch on it.  */
	add	%ebx, %ecx
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
248
# ifdef USE_AS_WMEMCMP

/* Label needs only for table_64bytes filling */
L(unreal_case):
/* no code here */

# endif
/* A 16-byte chunk of the current 64-byte block differs.  ECX is 64 on
   entry; the fall-through subtractions turn it into the offset of the
   end of the differing chunk (16, 32, 48 or 64).  EAX/EDX are advanced
   to that end, and execution falls through into L(16bytes), which
   examines the 16 bytes just before the pointers.  */
	.p2align 4
L(find_16diff):
	sub	$16, %ecx
L(find_32diff):
	sub	$16, %ecx
L(find_48diff):
	sub	$16, %ecx
L(find_64diff):
	add	%ecx, %edx
	add	%ecx, %eax
266
/* L(16bytes)/L(12bytes)/L(8bytes)/L(4bytes): compare the last 16, 12, 8
   or 4 bytes before EAX/EDX one dword at a time, falling through from
   one label to the next.  On a mismatch L(find_diff) is entered with
   the flags of the dword compare and, in the memcmp build, with the two
   dwords in ECX (first buffer) and EBX (second buffer).  "mov $0, %eax"
   is used for the zero result because it leaves those flags intact.  */
# ifndef USE_AS_WMEMCMP
	.p2align 4
L(16bytes):
	mov	-16(%eax), %ecx
	mov	-16(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(12bytes):
	mov	-12(%eax), %ecx
	mov	-12(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(8bytes):
	mov	-8(%eax), %ecx
	mov	-8(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(4bytes):
	mov	-4(%eax), %ecx
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
	mov	$0, %eax
	jne	L(find_diff)
	RETURN
# else
	.p2align 4
L(16bytes):
	mov	-16(%eax), %ecx
	cmp	-16(%edx), %ecx
	jne	L(find_diff)
L(12bytes):
	mov	-12(%eax), %ecx
	cmp	-12(%edx), %ecx
	jne	L(find_diff)
L(8bytes):
	mov	-8(%eax), %ecx
	cmp	-8(%edx), %ecx
	jne	L(find_diff)
L(4bytes):
	mov	-4(%eax), %ecx
	cmp	-4(%edx), %ecx
	mov	$0, %eax
	jne	L(find_diff)
	RETURN
# endif
312
# ifndef USE_AS_WMEMCMP
/* Tail handlers for byte counts of the form 4k+1, 4k+2 and 4k+3 up to
   51 (memcmp only).  Every L(Nbytes) label is entered with EAX/EDX
   pointing just past the end of the buffers, EBX pushed and XMM0 zero,
   and falls through to the handler for the next smaller count of the
   same chain.  16-byte chunks are tested with pxor/ptest; when one
   differs, EBX holds its (negative) offset and L(less16bytes) locates
   the difference.  Smaller pieces are compared as dwords (mismatch ->
   L(find_diff), operands in ECX/EBX) and finally as bytes (mismatch ->
   L(end), which only needs the flags).  */
	.p2align 4
L(49bytes):
	movdqu	-49(%eax), %xmm1
	movdqu	-49(%edx), %xmm2
	mov	$-49, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(33bytes):
	movdqu	-33(%eax), %xmm1
	movdqu	-33(%edx), %xmm2
	mov	$-33, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(17bytes):
	mov	-17(%eax), %ecx
	mov	-17(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(13bytes):
	mov	-13(%eax), %ecx
	mov	-13(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(9bytes):
	mov	-9(%eax), %ecx
	mov	-9(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(5bytes):
	mov	-5(%eax), %ecx
	mov	-5(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	/* Last byte.  */
	movzbl	-1(%eax), %ecx
	cmp	-1(%edx), %cl
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(50bytes):
	mov	$-50, %ebx
	movdqu	-50(%eax), %xmm1
	movdqu	-50(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(34bytes):
	mov	$-34, %ebx
	movdqu	-34(%eax), %xmm1
	movdqu	-34(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(18bytes):
	mov	-18(%eax), %ecx
	mov	-18(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(14bytes):
	mov	-14(%eax), %ecx
	mov	-14(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(10bytes):
	mov	-10(%eax), %ecx
	mov	-10(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(6bytes):
	mov	-6(%eax), %ecx
	mov	-6(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(2bytes):
	/* Last two bytes, compared in memory order: low byte first.  */
	movzwl	-2(%eax), %ecx
	movzwl	-2(%edx), %ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bh, %ch
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(51bytes):
	mov	$-51, %ebx
	movdqu	-51(%eax), %xmm1
	movdqu	-51(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(35bytes):
	mov	$-35, %ebx
	movdqu	-35(%eax), %xmm1
	movdqu	-35(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(19bytes):
	movl	-19(%eax), %ecx
	movl	-19(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(15bytes):
	movl	-15(%eax), %ecx
	movl	-15(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(11bytes):
	movl	-11(%eax), %ecx
	movl	-11(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(7bytes):
	movl	-7(%eax), %ecx
	movl	-7(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(3bytes):
	/* Bytes -3 and -2: low byte first, then the 16-bit compare
	   decides on the second byte because the low bytes are equal.  */
	movzwl	-3(%eax), %ecx
	movzwl	-3(%edx), %ebx
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
L(1bytes):
	movzbl	-1(%eax), %eax
	cmpb	-1(%edx), %al
	mov	$0, %eax
	jne	L(end)
	RETURN
# endif
/* Tail handlers for 52, 36 and 20 bytes (both variants).  Entered with
   EAX/EDX just past the end of the buffers, EBX pushed and XMM0 zero.
   Three 16-byte chunks are tested with pxor/ptest (EBX = offset of the
   chunk for L(less16bytes)), then the final dword is compared.  */
	.p2align 4
L(52bytes):
	movdqu	-52(%eax), %xmm1
	movdqu	-52(%edx), %xmm2
	mov	$-52, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(36bytes):
	movdqu	-36(%eax), %xmm1
	movdqu	-36(%edx), %xmm2
	mov	$-36, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(20bytes):
	movdqu	-20(%eax), %xmm1
	movdqu	-20(%edx), %xmm2
	mov	$-20, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	mov	-4(%eax), %ecx
# ifndef USE_AS_WMEMCMP
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
# else
	cmp	-4(%edx), %ecx
# endif
	/* mov keeps the flags of the compare for the jne below.  */
	mov	$0, %eax
	jne	L(find_diff)
	RETURN
481
# ifndef USE_AS_WMEMCMP
/* Tail handlers for 53/37/21, 54/38/22 and 55/39/23 bytes (memcmp
   only).  Entered with EAX/EDX just past the end of the buffers, EBX
   pushed and XMM0 zero.  Three 16-byte chunks are tested with
   pxor/ptest (EBX = offset of the chunk for L(less16bytes)); the
   remaining 5, 6 or 7 bytes are compared as one dword followed by
   byte-wise compares.  */
	.p2align 4
L(53bytes):
	movdqu	-53(%eax), %xmm1
	movdqu	-53(%edx), %xmm2
	mov	$-53, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(37bytes):
	mov	$-37, %ebx
	movdqu	-37(%eax), %xmm1
	movdqu	-37(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(21bytes):
	mov	$-21, %ebx
	movdqu	-21(%eax), %xmm1
	movdqu	-21(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	mov	-5(%eax), %ecx
	mov	-5(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzbl	-1(%eax), %ecx
	cmp	-1(%edx), %cl
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(54bytes):
	movdqu	-54(%eax), %xmm1
	movdqu	-54(%edx), %xmm2
	mov	$-54, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(38bytes):
	mov	$-38, %ebx
	movdqu	-38(%eax), %xmm1
	movdqu	-38(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(22bytes):
	mov	$-22, %ebx
	movdqu	-22(%eax), %xmm1
	movdqu	-22(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-6(%eax), %ecx
	mov	-6(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-2(%eax), %ecx
	movzwl	-2(%edx), %ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bh, %ch
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(55bytes):
	movdqu	-55(%eax), %xmm1
	movdqu	-55(%edx), %xmm2
	mov	$-55, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(39bytes):
	mov	$-39, %ebx
	movdqu	-39(%eax), %xmm1
	movdqu	-39(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(23bytes):
	mov	$-23, %ebx
	movdqu	-23(%eax), %xmm1
	movdqu	-23(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	movl	-7(%eax), %ecx
	movl	-7(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-3(%eax), %ecx
	movzwl	-3(%edx), %ebx
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
	movzbl	-1(%eax), %eax
	cmpb	-1(%edx), %al
	mov	$0, %eax
	jne	L(end)
	RETURN
# endif
/* Tail handlers for 56, 40 and 24 bytes (both variants).  Entered with
   EAX/EDX just past the end of the buffers, EBX pushed and XMM0 zero.
   Three 16-byte chunks are tested with pxor/ptest (EBX = offset of the
   chunk for L(less16bytes)), then the last two dwords are compared.  */
	.p2align 4
L(56bytes):
	movdqu	-56(%eax), %xmm1
	movdqu	-56(%edx), %xmm2
	mov	$-56, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(40bytes):
	mov	$-40, %ebx
	movdqu	-40(%eax), %xmm1
	movdqu	-40(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(24bytes):
	mov	$-24, %ebx
	movdqu	-24(%eax), %xmm1
	movdqu	-24(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-8(%eax), %ecx
# ifndef USE_AS_WMEMCMP
	mov	-8(%edx), %ebx
	cmp	%ebx, %ecx
# else
	cmp	-8(%edx), %ecx
# endif
	jne	L(find_diff)

	mov	-4(%eax), %ecx
# ifndef USE_AS_WMEMCMP
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
# else
	cmp	-4(%edx), %ecx
# endif
	/* mov keeps the flags of the compare for the jne below.  */
	mov	$0, %eax
	jne	L(find_diff)
	RETURN
631
# ifndef USE_AS_WMEMCMP
/* Tail handlers for 57/41/25, 58/42/26 and 59/43/27 bytes (memcmp
   only).  Entered with EAX/EDX just past the end of the buffers, EBX
   pushed and XMM0 zero.  Three 16-byte chunks are tested with
   pxor/ptest (EBX = offset of the chunk for L(less16bytes)); the
   remaining 9, 10 or 11 bytes are compared as two dwords followed by
   byte-wise compares.  */
	.p2align 4
L(57bytes):
	movdqu	-57(%eax), %xmm1
	movdqu	-57(%edx), %xmm2
	mov	$-57, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(41bytes):
	mov	$-41, %ebx
	movdqu	-41(%eax), %xmm1
	movdqu	-41(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(25bytes):
	mov	$-25, %ebx
	movdqu	-25(%eax), %xmm1
	movdqu	-25(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	mov	-9(%eax), %ecx
	mov	-9(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	mov	-5(%eax), %ecx
	mov	-5(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzbl	-1(%eax), %ecx
	cmp	-1(%edx), %cl
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(58bytes):
	movdqu	-58(%eax), %xmm1
	movdqu	-58(%edx), %xmm2
	mov	$-58, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(42bytes):
	mov	$-42, %ebx
	movdqu	-42(%eax), %xmm1
	movdqu	-42(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(26bytes):
	mov	$-26, %ebx
	movdqu	-26(%eax), %xmm1
	movdqu	-26(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-10(%eax), %ecx
	mov	-10(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	-6(%eax), %ecx
	mov	-6(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	movzwl	-2(%eax), %ecx
	movzwl	-2(%edx), %ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bh, %ch
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(59bytes):
	movdqu	-59(%eax), %xmm1
	movdqu	-59(%edx), %xmm2
	mov	$-59, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(43bytes):
	mov	$-43, %ebx
	movdqu	-43(%eax), %xmm1
	movdqu	-43(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(27bytes):
	mov	$-27, %ebx
	movdqu	-27(%eax), %xmm1
	movdqu	-27(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	movl	-11(%eax), %ecx
	movl	-11(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movl	-7(%eax), %ecx
	movl	-7(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-3(%eax), %ecx
	movzwl	-3(%edx), %ebx
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
	movzbl	-1(%eax), %eax
	cmpb	-1(%edx), %al
	mov	$0, %eax
	jne	L(end)
	RETURN
# endif
/* Tail handlers for 60, 44 and 28 bytes (both variants).  Entered with
   EAX/EDX just past the end of the buffers, EBX pushed and XMM0 zero.
   Three 16-byte chunks are tested with pxor/ptest (EBX = offset of the
   chunk for L(less16bytes)), then the last three dwords are
   compared.  */
	.p2align 4
L(60bytes):
	movdqu	-60(%eax), %xmm1
	movdqu	-60(%edx), %xmm2
	mov	$-60, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(44bytes):
	mov	$-44, %ebx
	movdqu	-44(%eax), %xmm1
	movdqu	-44(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(28bytes):
	mov	$-28, %ebx
	movdqu	-28(%eax), %xmm1
	movdqu	-28(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-12(%eax), %ecx
# ifndef USE_AS_WMEMCMP
	mov	-12(%edx), %ebx
	cmp	%ebx, %ecx
# else
	cmp	-12(%edx), %ecx
# endif
	jne	L(find_diff)

	mov	-8(%eax), %ecx
# ifndef USE_AS_WMEMCMP
	mov	-8(%edx), %ebx
	cmp	%ebx, %ecx
# else
	cmp	-8(%edx), %ecx
# endif
	jne	L(find_diff)

	mov	-4(%eax), %ecx
# ifndef USE_AS_WMEMCMP
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
# else
	cmp	-4(%edx), %ecx
# endif
	/* mov keeps the flags of the compare for the jne below.  */
	mov	$0, %eax
	jne	L(find_diff)
	RETURN
804
# ifndef USE_AS_WMEMCMP
/* Tail handlers for 61/45/29, 62/46/30 and 63/47/31 bytes (memcmp
   only).  Entered with EAX/EDX just past the end of the buffers, EBX
   pushed and XMM0 zero.  Three 16-byte chunks are tested with
   pxor/ptest (EBX = offset of the chunk for L(less16bytes)); the
   remaining 13, 14 or 15 bytes are compared as three dwords followed
   by byte-wise compares.  */
	.p2align 4
L(61bytes):
	movdqu	-61(%eax), %xmm1
	movdqu	-61(%edx), %xmm2
	mov	$-61, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(45bytes):
	mov	$-45, %ebx
	movdqu	-45(%eax), %xmm1
	movdqu	-45(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(29bytes):
	mov	$-29, %ebx
	movdqu	-29(%eax), %xmm1
	movdqu	-29(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-13(%eax), %ecx
	mov	-13(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	-9(%eax), %ecx
	mov	-9(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	-5(%eax), %ecx
	mov	-5(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzbl	-1(%eax), %ecx
	cmp	-1(%edx), %cl
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(62bytes):
	movdqu	-62(%eax), %xmm1
	movdqu	-62(%edx), %xmm2
	mov	$-62, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(46bytes):
	mov	$-46, %ebx
	movdqu	-46(%eax), %xmm1
	movdqu	-46(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(30bytes):
	mov	$-30, %ebx
	movdqu	-30(%eax), %xmm1
	movdqu	-30(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	mov	-14(%eax), %ecx
	mov	-14(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	mov	-10(%eax), %ecx
	mov	-10(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	mov	-6(%eax), %ecx
	mov	-6(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-2(%eax), %ecx
	movzwl	-2(%edx), %ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bh, %ch
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(63bytes):
	movdqu	-63(%eax), %xmm1
	movdqu	-63(%edx), %xmm2
	mov	$-63, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(47bytes):
	mov	$-47, %ebx
	movdqu	-47(%eax), %xmm1
	movdqu	-47(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(31bytes):
	mov	$-31, %ebx
	movdqu	-31(%eax), %xmm1
	movdqu	-31(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	movl	-15(%eax), %ecx
	movl	-15(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movl	-11(%eax), %ecx
	movl	-11(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movl	-7(%eax), %ecx
	movl	-7(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-3(%eax), %ecx
	movzwl	-3(%edx), %ebx
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
	movzbl	-1(%eax), %eax
	cmpb	-1(%edx), %al
	mov	$0, %eax
	jne	L(end)
	RETURN
# endif
939
/* Tail handlers for 64, 48 and 32 bytes (both variants).  Entered with
   EAX/EDX just past the end of the buffers, EBX pushed and XMM0 zero.
   Three 16-byte chunks are tested with pxor/ptest (EBX = offset of the
   chunk for L(less16bytes)), then the last 16 bytes are compared as
   four dwords.  */
	.p2align 4
L(64bytes):
	movdqu	-64(%eax), %xmm1
	movdqu	-64(%edx), %xmm2
	mov	$-64, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(48bytes):
	movdqu	-48(%eax), %xmm1
	movdqu	-48(%edx), %xmm2
	mov	$-48, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(32bytes):
	movdqu	-32(%eax), %xmm1
	movdqu	-32(%edx), %xmm2
	mov	$-32, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-16(%eax), %ecx
# ifndef USE_AS_WMEMCMP
	mov	-16(%edx), %ebx
	cmp	%ebx, %ecx
# else
	cmp	-16(%edx), %ecx
# endif
	jne	L(find_diff)

	mov	-12(%eax), %ecx
# ifndef USE_AS_WMEMCMP
	mov	-12(%edx), %ebx
	cmp	%ebx, %ecx
# else
	cmp	-12(%edx), %ecx
# endif
	jne	L(find_diff)

	mov	-8(%eax), %ecx
# ifndef USE_AS_WMEMCMP
	mov	-8(%edx), %ebx
	cmp	%ebx, %ecx
# else
	cmp	-8(%edx), %ecx
# endif
	jne	L(find_diff)

	mov	-4(%eax), %ecx
# ifndef USE_AS_WMEMCMP
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
# else
	cmp	-4(%edx), %ecx
# endif
	/* mov keeps the flags of the compare for the jne below.  */
	mov	$0, %eax
	jne	L(find_diff)
	RETURN
1000
/* A 16-byte chunk tested with ptest differs.  EBX holds the offset of
   that chunk relative to EAX/EDX; adding it makes both pointers address
   the start of the chunk, which is then compared one dword at a time.
   On a mismatch L(find_diff) is entered with the flags of the dword
   compare (and, for memcmp, the dwords in ECX/EBX).  */
# ifndef USE_AS_WMEMCMP
	.p2align 4
L(less16bytes):
	add	%ebx, %eax
	add	%ebx, %edx

	mov	(%eax), %ecx
	mov	(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	4(%eax), %ecx
	mov	4(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	8(%eax), %ecx
	mov	8(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	12(%eax), %ecx
	mov	12(%edx), %ebx
	cmp	%ebx, %ecx
	mov	$0, %eax
	jne	L(find_diff)
	RETURN
# else
	.p2align 4
L(less16bytes):
	add	%ebx, %eax
	add	%ebx, %edx

	mov	(%eax), %ecx
	cmp	(%edx), %ecx
	jne	L(find_diff)

	mov	4(%eax), %ecx
	cmp	4(%edx), %ecx
	jne	L(find_diff)

	mov	8(%eax), %ecx
	cmp	8(%edx), %ecx
	jne	L(find_diff)

	mov	12(%eax), %ecx
	cmp	12(%edx), %ecx

	mov	$0, %eax
	jne	L(find_diff)
	RETURN
# endif
1053
/* Turn a detected mismatch into the return value and leave.

   memcmp: ECX and EBX hold the differing dwords of the first and the
   second buffer.  The bytes are examined in memory order (lowest byte
   first); each 16-bit compare is only reached when the low bytes are
   equal, so its result is decided by the higher byte.  L(end) is also
   entered directly by the byte-wise tails with the flags of their own
   compare.  The unsigned condition "ja" selects 1, otherwise -1.

   wmemcmp: the flags are those of the signed dword compare done by the
   caller; "jg" selects 1, otherwise -1.

   popl and mov do not modify the flags.  */
	.p2align 4
L(find_diff):
# ifndef USE_AS_WMEMCMP
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
	shr	$16,%ecx
	shr	$16,%ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
L(end):
	POP	(%ebx)
	mov	$1, %eax
	ja	L(bigger)
	neg	%eax
L(bigger):
	ret
# else
	POP	(%ebx)
	mov	$1, %eax
	jg	L(bigger)
	neg	%eax
	ret

	.p2align 4
L(bigger):
	ret
# endif
END (MEMCMP)
1085
/* Jump table indexed by the number of bytes left to compare (0..64);
   it is used through BRANCH_TO_JMPTBL_ENTRY with a scale of 4.  In the
   wmemcmp build the byte count is always a multiple of 4, so the other
   slots are filled with L(unreal_case).  */
	.section .rodata.sse4.2,"a",@progbits
	.p2align 2
	.type	L(table_64bytes), @object
# ifndef USE_AS_WMEMCMP
L(table_64bytes):
	.int	JMPTBL (L(0bytes), L(table_64bytes))
	.int	JMPTBL (L(1bytes), L(table_64bytes))
	.int	JMPTBL (L(2bytes), L(table_64bytes))
	.int	JMPTBL (L(3bytes), L(table_64bytes))
	.int	JMPTBL (L(4bytes), L(table_64bytes))
	.int	JMPTBL (L(5bytes), L(table_64bytes))
	.int	JMPTBL (L(6bytes), L(table_64bytes))
	.int	JMPTBL (L(7bytes), L(table_64bytes))
	.int	JMPTBL (L(8bytes), L(table_64bytes))
	.int	JMPTBL (L(9bytes), L(table_64bytes))
	.int	JMPTBL (L(10bytes), L(table_64bytes))
	.int	JMPTBL (L(11bytes), L(table_64bytes))
	.int	JMPTBL (L(12bytes), L(table_64bytes))
	.int	JMPTBL (L(13bytes), L(table_64bytes))
	.int	JMPTBL (L(14bytes), L(table_64bytes))
	.int	JMPTBL (L(15bytes), L(table_64bytes))
	.int	JMPTBL (L(16bytes), L(table_64bytes))
	.int	JMPTBL (L(17bytes), L(table_64bytes))
	.int	JMPTBL (L(18bytes), L(table_64bytes))
	.int	JMPTBL (L(19bytes), L(table_64bytes))
	.int	JMPTBL (L(20bytes), L(table_64bytes))
	.int	JMPTBL (L(21bytes), L(table_64bytes))
	.int	JMPTBL (L(22bytes), L(table_64bytes))
	.int	JMPTBL (L(23bytes), L(table_64bytes))
	.int	JMPTBL (L(24bytes), L(table_64bytes))
	.int	JMPTBL (L(25bytes), L(table_64bytes))
	.int	JMPTBL (L(26bytes), L(table_64bytes))
	.int	JMPTBL (L(27bytes), L(table_64bytes))
	.int	JMPTBL (L(28bytes), L(table_64bytes))
	.int	JMPTBL (L(29bytes), L(table_64bytes))
	.int	JMPTBL (L(30bytes), L(table_64bytes))
	.int	JMPTBL (L(31bytes), L(table_64bytes))
	.int	JMPTBL (L(32bytes), L(table_64bytes))
	.int	JMPTBL (L(33bytes), L(table_64bytes))
	.int	JMPTBL (L(34bytes), L(table_64bytes))
	.int	JMPTBL (L(35bytes), L(table_64bytes))
	.int	JMPTBL (L(36bytes), L(table_64bytes))
	.int	JMPTBL (L(37bytes), L(table_64bytes))
	.int	JMPTBL (L(38bytes), L(table_64bytes))
	.int	JMPTBL (L(39bytes), L(table_64bytes))
	.int	JMPTBL (L(40bytes), L(table_64bytes))
	.int	JMPTBL (L(41bytes), L(table_64bytes))
	.int	JMPTBL (L(42bytes), L(table_64bytes))
	.int	JMPTBL (L(43bytes), L(table_64bytes))
	.int	JMPTBL (L(44bytes), L(table_64bytes))
	.int	JMPTBL (L(45bytes), L(table_64bytes))
	.int	JMPTBL (L(46bytes), L(table_64bytes))
	.int	JMPTBL (L(47bytes), L(table_64bytes))
	.int	JMPTBL (L(48bytes), L(table_64bytes))
	.int	JMPTBL (L(49bytes), L(table_64bytes))
	.int	JMPTBL (L(50bytes), L(table_64bytes))
	.int	JMPTBL (L(51bytes), L(table_64bytes))
	.int	JMPTBL (L(52bytes), L(table_64bytes))
	.int	JMPTBL (L(53bytes), L(table_64bytes))
	.int	JMPTBL (L(54bytes), L(table_64bytes))
	.int	JMPTBL (L(55bytes), L(table_64bytes))
	.int	JMPTBL (L(56bytes), L(table_64bytes))
	.int	JMPTBL (L(57bytes), L(table_64bytes))
	.int	JMPTBL (L(58bytes), L(table_64bytes))
	.int	JMPTBL (L(59bytes), L(table_64bytes))
	.int	JMPTBL (L(60bytes), L(table_64bytes))
	.int	JMPTBL (L(61bytes), L(table_64bytes))
	.int	JMPTBL (L(62bytes), L(table_64bytes))
	.int	JMPTBL (L(63bytes), L(table_64bytes))
	.int	JMPTBL (L(64bytes), L(table_64bytes))
# else
L(table_64bytes):
	.int	JMPTBL (L(0bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(4bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(8bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(12bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(16bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(20bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(24bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(28bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(32bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(36bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(40bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(44bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(48bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(52bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(56bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(60bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(64bytes), L(table_64bytes))
# endif
#endif