1/* wcscmp optimized with SSE2.
2   Copyright (C) 2018-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <isa-level.h>
20
/* ISA level >= 2 because there are no wcscmp-sse4 implementations.  */
22#if ISA_SHOULD_BUILD (2)
23# include <sysdep.h>
24
25/* Needed to get right name.  */
26# define USE_AS_WCSCMP
27# define STRCMP_ISA	_sse2
28# include "strcmp-naming.h"
29
/* Note: wcscmp uses signed comparison, not unsigned as in the strcmp function.  */
31
32	.text
ENTRY (STRCMP)
	/* Compare the wide strings at %rdi (s1) and %rsi (s2).  The result
	   is returned in %eax as 0, 1 or -1.  Up to 16 bytes (four wide
	   characters) are compared per SSE2 step.

	   The code below classifies each pointer by its offset inside a
	   64-byte block and jumps to the loop written for that pair:
	     %edx = s1 & 63,  %cl = s1 & 15
	     %eax = s2 & 63,  %ch = low byte of s2 (reduced to s2 & 15 by
	                      the continue_* dispatchers).  */
	pxor	%xmm0, %xmm0		/* xmm0 = 0, reference for L'\0' tests */
	movl	%edi, %edx
	movl	%esi, %eax
	movb	%dl, %cl
	movb	%al, %ch
	andl	$63, %edx		/* s1 offset within its 64-byte block */
	andl	$63, %eax		/* s2 offset within its 64-byte block */
	andb	$15, %cl
	jz	L(continue_00)		/* s1 is 16-byte aligned */
	cmpl	$16, %edx
	jb	L(continue_0)		/* s1 offset in 1..15 */
	cmpl	$32, %edx
	jb	L(continue_16)		/* s1 offset in 17..31 */
	cmpl	$48, %edx
	jb	L(continue_32)		/* s1 offset in 33..47 */
	/* Otherwise the s1 offset is in 49..63: fall through.  */
52
L(continue_48):
	/* s1 starts in bytes 49..63 of its 64-byte block; select the loop
	   matching the s2 offset held in %eax.  */
	andb	$15, %ch
	jz	L(continue_48_00)	/* s2 is 16-byte aligned */
	cmpl	$16, %eax
	jb	L(continue_0_48)
	cmpl	$32, %eax
	jb	L(continue_16_48)
	cmpl	$48, %eax
	jb	L(continue_32_48)

	.p2align 4
L(continue_48_48):
	/* 64 bytes per iteration: the first four wide characters are
	   compared one at a time, the remaining twelve with three 16-byte
	   vector steps.  */
	movl	(%rsi), %ecx
	cmpl	%ecx, (%rdi)		/* flags = s1[0] - s2[0], signed */
	jne	L(nequal)
	testl	%ecx, %ecx
	jz	L(equal)		/* equal and L'\0': strings match */

	movl	4(%rsi), %ecx
	cmpl	%ecx, 4(%rdi)
	jne	L(nequal)
	testl	%ecx, %ecx
	jz	L(equal)

	movl	8(%rsi), %ecx
	cmpl	%ecx, 8(%rdi)
	jne	L(nequal)
	testl	%ecx, %ecx
	jz	L(equal)

	movl	12(%rsi), %ecx
	cmpl	%ecx, 12(%rdi)
	jne	L(nequal)
	testl	%ecx, %ecx
	jz	L(equal)

	/* Bytes 16..31.  */
	movdqu	16(%rdi), %xmm1
	movdqu	16(%rsi), %xmm2
	pcmpeqd	%xmm1, %xmm0		/* null lanes of s1 */
	pcmpeqd	%xmm2, %xmm1		/* lanes where s1 == s2 */
	psubb	%xmm0, %xmm1		/* sign bit stays only on equal, non-null lanes */
	pmovmskb %xmm1, %edx
	subl	$0xffff, %edx		/* zero iff all four lanes are equal and non-null */
	jnz	L(less4_double_words_16)

	/* Bytes 32..47.  */
	movdqu	32(%rdi), %xmm1
	movdqu	32(%rsi), %xmm2
	pcmpeqd	%xmm1, %xmm0		/* null lanes of s1 */
	pcmpeqd	%xmm2, %xmm1		/* lanes where s1 == s2 */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	subl	$0xffff, %edx
	jnz	L(less4_double_words_32)

	/* Bytes 48..63.  */
	movdqu	48(%rdi), %xmm1
	movdqu	48(%rsi), %xmm2
	pcmpeqd	%xmm1, %xmm0		/* null lanes of s1 */
	pcmpeqd	%xmm2, %xmm1		/* lanes where s1 == s2 */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	subl	$0xffff, %edx
	jnz	L(less4_double_words_48)

	addq	$64, %rdi
	addq	$64, %rsi
	jmp	L(continue_48_48)
119
L(continue_0):
	/* s1 starts in bytes 1..15 of its 64-byte block; select the loop
	   matching the s2 offset held in %eax.  */
	andb	$15, %ch
	jz	L(continue_0_00)	/* s2 is 16-byte aligned */
	cmpl	$16, %eax
	jb	L(continue_0_0)
	cmpl	$32, %eax
	jb	L(continue_0_16)
	cmpl	$48, %eax
	jb	L(continue_0_32)

	.p2align 4
L(continue_0_48):
	/* One string starts in bytes 1..15 of its 64-byte block, the other
	   in bytes 49..63.  Per 64-byte iteration the first and the last
	   16 bytes are compared one wide character at a time (presumably
	   because those are the chunks that can straddle a 64-byte
	   boundary); the middle 32 bytes use vector steps.  */
	movl	(%rsi), %ecx
	cmpl	%ecx, (%rdi)		/* flags = s1[0] - s2[0], signed */
	jne	L(nequal)
	testl	%ecx, %ecx
	jz	L(equal)

	movl	4(%rsi), %ecx
	cmpl	%ecx, 4(%rdi)
	jne	L(nequal)
	testl	%ecx, %ecx
	jz	L(equal)

	movl	8(%rsi), %ecx
	cmpl	%ecx, 8(%rdi)
	jne	L(nequal)
	testl	%ecx, %ecx
	jz	L(equal)

	movl	12(%rsi), %ecx
	cmpl	%ecx, 12(%rdi)
	jne	L(nequal)
	testl	%ecx, %ecx
	jz	L(equal)

	/* Bytes 16..31.  */
	movdqu	16(%rdi), %xmm1
	movdqu	16(%rsi), %xmm2
	pcmpeqd	%xmm1, %xmm0		/* null lanes of s1 */
	pcmpeqd	%xmm2, %xmm1		/* lanes where s1 == s2 */
	psubb	%xmm0, %xmm1		/* sign bit stays only on equal, non-null lanes */
	pmovmskb %xmm1, %edx
	subl	$0xffff, %edx		/* zero iff all four lanes are equal and non-null */
	jnz	L(less4_double_words_16)

	/* Bytes 32..47.  */
	movdqu	32(%rdi), %xmm1
	movdqu	32(%rsi), %xmm2
	pcmpeqd	%xmm1, %xmm0		/* null lanes of s1 */
	pcmpeqd	%xmm2, %xmm1		/* lanes where s1 == s2 */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	subl	$0xffff, %edx
	jnz	L(less4_double_words_32)

	/* Bytes 48..63, one wide character at a time.  */
	movl	48(%rsi), %ecx
	cmpl	%ecx, 48(%rdi)
	jne	L(nequal)
	testl	%ecx, %ecx
	jz	L(equal)

	movl	52(%rsi), %ecx
	cmpl	%ecx, 52(%rdi)
	jne	L(nequal)
	testl	%ecx, %ecx
	jz	L(equal)

	movl	56(%rsi), %ecx
	cmpl	%ecx, 56(%rdi)
	jne	L(nequal)
	testl	%ecx, %ecx
	jz	L(equal)

	movl	60(%rsi), %ecx
	cmpl	%ecx, 60(%rdi)
	jne	L(nequal)
	testl	%ecx, %ecx
	jz	L(equal)

	addq	$64, %rdi
	addq	$64, %rsi
	jmp	L(continue_0_48)
201
202	.p2align 4
203L(continue_00):
204	and	$15, %ch
205	jz	L(continue_00_00)
206	cmp	$16, %eax
207	jb	L(continue_00_0)
208	cmp	$32, %eax
209	jb	L(continue_00_16)
210	cmp	$48, %eax
211	jb	L(continue_00_32)
212
213	.p2align 4
214L(continue_00_48):
215	pcmpeqd	(%rdi), %xmm0
216	mov	(%rdi), %eax
217	pmovmskb %xmm0, %ecx
218	test	%ecx, %ecx
219	jnz	L(less4_double_words1)
220
221	cmp	(%rsi), %eax
222	jne	L(nequal)
223
224	mov	4(%rdi), %eax
225	cmp	4(%rsi), %eax
226	jne	L(nequal)
227
228	mov	8(%rdi), %eax
229	cmp	8(%rsi), %eax
230	jne	L(nequal)
231
232	mov	12(%rdi), %eax
233	cmp	12(%rsi), %eax
234	jne	L(nequal)
235
236	movdqu	16(%rsi), %xmm2
237	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
238	pcmpeqd	16(%rdi), %xmm2		/* compare first 4 double_words for equality */
239	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
240	pmovmskb %xmm2, %edx
241	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
242	jnz	L(less4_double_words_16)
243
244	movdqu	32(%rsi), %xmm2
245	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
246	pcmpeqd	32(%rdi), %xmm2		/* compare first 4 double_words for equality */
247	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
248	pmovmskb %xmm2, %edx
249	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
250	jnz	L(less4_double_words_32)
251
252	movdqu	48(%rsi), %xmm2
253	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
254	pcmpeqd	48(%rdi), %xmm2		/* compare first 4 double_words for equality */
255	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
256	pmovmskb %xmm2, %edx
257	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
258	jnz	L(less4_double_words_48)
259
260	add	$64, %rsi
261	add	$64, %rdi
262	jmp	L(continue_00_48)
263
264	.p2align 4
265L(continue_32):
266	and	$15, %ch
267	jz	L(continue_32_00)
268	cmp	$16, %eax
269	jb	L(continue_0_32)
270	cmp	$32, %eax
271	jb	L(continue_16_32)
272	cmp	$48, %eax
273	jb	L(continue_32_32)
274
275	.p2align 4
276L(continue_32_48):
277	mov	(%rsi), %ecx
278	cmp	%ecx, (%rdi)
279	jne	L(nequal)
280	test	%ecx, %ecx
281	jz	L(equal)
282
283	mov	4(%rsi), %ecx
284	cmp	%ecx, 4(%rdi)
285	jne	L(nequal)
286	test	%ecx, %ecx
287	jz	L(equal)
288
289	mov	8(%rsi), %ecx
290	cmp	%ecx, 8(%rdi)
291	jne	L(nequal)
292	test	%ecx, %ecx
293	jz	L(equal)
294
295	mov	12(%rsi), %ecx
296	cmp	%ecx, 12(%rdi)
297	jne	L(nequal)
298	test	%ecx, %ecx
299	jz	L(equal)
300
301	mov	16(%rsi), %ecx
302	cmp	%ecx, 16(%rdi)
303	jne	L(nequal)
304	test	%ecx, %ecx
305	jz	L(equal)
306
307	mov	20(%rsi), %ecx
308	cmp	%ecx, 20(%rdi)
309	jne	L(nequal)
310	test	%ecx, %ecx
311	jz	L(equal)
312
313	mov	24(%rsi), %ecx
314	cmp	%ecx, 24(%rdi)
315	jne	L(nequal)
316	test	%ecx, %ecx
317	jz	L(equal)
318
319	mov	28(%rsi), %ecx
320	cmp	%ecx, 28(%rdi)
321	jne	L(nequal)
322	test	%ecx, %ecx
323	jz	L(equal)
324
325	movdqu	32(%rdi), %xmm1
326	movdqu	32(%rsi), %xmm2
327	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
328	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
329	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
330	pmovmskb %xmm1, %edx
331	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
332	jnz	L(less4_double_words_32)
333
334	movdqu	48(%rdi), %xmm1
335	movdqu	48(%rsi), %xmm2
336	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
337	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
338	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
339	pmovmskb %xmm1, %edx
340	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
341	jnz	L(less4_double_words_48)
342
343	add	$64, %rsi
344	add	$64, %rdi
345	jmp	L(continue_32_48)
346
347	.p2align 4
348L(continue_16):
349	and	$15, %ch
350	jz	L(continue_16_00)
351	cmp	$16, %eax
352	jb	L(continue_0_16)
353	cmp	$32, %eax
354	jb	L(continue_16_16)
355	cmp	$48, %eax
356	jb	L(continue_16_32)
357
358	.p2align 4
359L(continue_16_48):
360	mov	(%rsi), %ecx
361	cmp	%ecx, (%rdi)
362	jne	L(nequal)
363	test	%ecx, %ecx
364	jz	L(equal)
365
366	mov	4(%rsi), %ecx
367	cmp	%ecx, 4(%rdi)
368	jne	L(nequal)
369	test	%ecx, %ecx
370	jz	L(equal)
371
372	mov	8(%rsi), %ecx
373	cmp	%ecx, 8(%rdi)
374	jne	L(nequal)
375	test	%ecx, %ecx
376	jz	L(equal)
377
378	mov	12(%rsi), %ecx
379	cmp	%ecx, 12(%rdi)
380	jne	L(nequal)
381	test	%ecx, %ecx
382	jz	L(equal)
383
384	movdqu	16(%rdi), %xmm1
385	movdqu	16(%rsi), %xmm2
386	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
387	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
388	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
389	pmovmskb %xmm1, %edx
390	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
391	jnz	L(less4_double_words_16)
392
393	mov	32(%rsi), %ecx
394	cmp	%ecx, 32(%rdi)
395	jne	L(nequal)
396	test	%ecx, %ecx
397	jz	L(equal)
398
399	mov	36(%rsi), %ecx
400	cmp	%ecx, 36(%rdi)
401	jne	L(nequal)
402	test	%ecx, %ecx
403	jz	L(equal)
404
405	mov	40(%rsi), %ecx
406	cmp	%ecx, 40(%rdi)
407	jne	L(nequal)
408	test	%ecx, %ecx
409	jz	L(equal)
410
411	mov	44(%rsi), %ecx
412	cmp	%ecx, 44(%rdi)
413	jne	L(nequal)
414	test	%ecx, %ecx
415	jz	L(equal)
416
417	movdqu	48(%rdi), %xmm1
418	movdqu	48(%rsi), %xmm2
419	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
420	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
421	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
422	pmovmskb %xmm1, %edx
423	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
424	jnz	L(less4_double_words_48)
425
426	add	$64, %rsi
427	add	$64, %rdi
428	jmp	L(continue_16_48)
429
430	.p2align 4
431L(continue_00_00):
432	movdqa	(%rdi), %xmm1
433	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
434	pcmpeqd	(%rsi), %xmm1		/* compare first 4 double_words for equality */
435	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
436	pmovmskb %xmm1, %edx
437	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
438	jnz	L(less4_double_words)
439
440	movdqa	16(%rdi), %xmm3
441	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
442	pcmpeqd	16(%rsi), %xmm3		/* compare first 4 double_words for equality */
443	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
444	pmovmskb %xmm3, %edx
445	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
446	jnz	L(less4_double_words_16)
447
448	movdqa	32(%rdi), %xmm5
449	pcmpeqd	%xmm5, %xmm0		/* Any null double_word? */
450	pcmpeqd	32(%rsi), %xmm5		/* compare first 4 double_words for equality */
451	psubb	%xmm0, %xmm5		/* packed sub of comparison results*/
452	pmovmskb %xmm5, %edx
453	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
454	jnz	L(less4_double_words_32)
455
456	movdqa	48(%rdi), %xmm1
457	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
458	pcmpeqd	48(%rsi), %xmm1		/* compare first 4 double_words for equality */
459	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
460	pmovmskb %xmm1, %edx
461	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
462	jnz	L(less4_double_words_48)
463
464	add	$64, %rsi
465	add	$64, %rdi
466	jmp	L(continue_00_00)
467
468	.p2align 4
469L(continue_00_32):
470	movdqu	(%rsi), %xmm2
471	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
472	pcmpeqd	(%rdi), %xmm2		/* compare first 4 double_words for equality */
473	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
474	pmovmskb %xmm2, %edx
475	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
476	jnz	L(less4_double_words)
477
478	add	$16, %rsi
479	add	$16, %rdi
480	jmp	L(continue_00_48)
481
482	.p2align 4
483L(continue_00_16):
484	movdqu	(%rsi), %xmm2
485	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
486	pcmpeqd	(%rdi), %xmm2		/* compare first 4 double_words for equality */
487	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
488	pmovmskb %xmm2, %edx
489	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
490	jnz	L(less4_double_words)
491
492	movdqu	16(%rsi), %xmm2
493	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
494	pcmpeqd	16(%rdi), %xmm2		/* compare first 4 double_words for equality */
495	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
496	pmovmskb %xmm2, %edx
497	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
498	jnz	L(less4_double_words_16)
499
500	add	$32, %rsi
501	add	$32, %rdi
502	jmp	L(continue_00_48)
503
504	.p2align 4
505L(continue_00_0):
506	movdqu	(%rsi), %xmm2
507	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
508	pcmpeqd	(%rdi), %xmm2		/* compare first 4 double_words for equality */
509	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
510	pmovmskb %xmm2, %edx
511	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
512	jnz	L(less4_double_words)
513
514	movdqu	16(%rsi), %xmm2
515	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
516	pcmpeqd	16(%rdi), %xmm2		/* compare first 4 double_words for equality */
517	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
518	pmovmskb %xmm2, %edx
519	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
520	jnz	L(less4_double_words_16)
521
522	movdqu	32(%rsi), %xmm2
523	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
524	pcmpeqd	32(%rdi), %xmm2		/* compare first 4 double_words for equality */
525	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
526	pmovmskb %xmm2, %edx
527	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
528	jnz	L(less4_double_words_32)
529
530	add	$48, %rsi
531	add	$48, %rdi
532	jmp	L(continue_00_48)
533
534	.p2align 4
535L(continue_48_00):
536	pcmpeqd	(%rsi), %xmm0
537	mov	(%rdi), %eax
538	pmovmskb %xmm0, %ecx
539	test	%ecx, %ecx
540	jnz	L(less4_double_words1)
541
542	cmp	(%rsi), %eax
543	jne	L(nequal)
544
545	mov	4(%rdi), %eax
546	cmp	4(%rsi), %eax
547	jne	L(nequal)
548
549	mov	8(%rdi), %eax
550	cmp	8(%rsi), %eax
551	jne	L(nequal)
552
553	mov	12(%rdi), %eax
554	cmp	12(%rsi), %eax
555	jne	L(nequal)
556
557	movdqu	16(%rdi), %xmm1
558	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
559	pcmpeqd	16(%rsi), %xmm1		/* compare first 4 double_words for equality */
560	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
561	pmovmskb %xmm1, %edx
562	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
563	jnz	L(less4_double_words_16)
564
565	movdqu	32(%rdi), %xmm1
566	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
567	pcmpeqd	32(%rsi), %xmm1		/* compare first 4 double_words for equality */
568	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
569	pmovmskb %xmm1, %edx
570	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
571	jnz	L(less4_double_words_32)
572
573	movdqu	48(%rdi), %xmm1
574	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
575	pcmpeqd	48(%rsi), %xmm1		/* compare first 4 double_words for equality */
576	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
577	pmovmskb %xmm1, %edx
578	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
579	jnz	L(less4_double_words_48)
580
581	add	$64, %rsi
582	add	$64, %rdi
583	jmp	L(continue_48_00)
584
585	.p2align 4
586L(continue_32_00):
587	movdqu	(%rdi), %xmm1
588	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
589	pcmpeqd	(%rsi), %xmm1		/* compare first 4 double_words for equality */
590	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
591	pmovmskb %xmm1, %edx
592	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
593	jnz	L(less4_double_words)
594
595	add	$16, %rsi
596	add	$16, %rdi
597	jmp	L(continue_48_00)
598
599	.p2align 4
600L(continue_16_00):
601	movdqu	(%rdi), %xmm1
602	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
603	pcmpeqd	(%rsi), %xmm1		/* compare first 4 double_words for equality */
604	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
605	pmovmskb %xmm1, %edx
606	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
607	jnz	L(less4_double_words)
608
609	movdqu	16(%rdi), %xmm1
610	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
611	pcmpeqd	16(%rsi), %xmm1		/* compare first 4 double_words for equality */
612	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
613	pmovmskb %xmm1, %edx
614	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
615	jnz	L(less4_double_words_16)
616
617	add	$32, %rsi
618	add	$32, %rdi
619	jmp	L(continue_48_00)
620
621	.p2align 4
622L(continue_0_00):
623	movdqu	(%rdi), %xmm1
624	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
625	pcmpeqd	(%rsi), %xmm1		/* compare first 4 double_words for equality */
626	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
627	pmovmskb %xmm1, %edx
628	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
629	jnz	L(less4_double_words)
630
631	movdqu	16(%rdi), %xmm1
632	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
633	pcmpeqd	16(%rsi), %xmm1		/* compare first 4 double_words for equality */
634	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
635	pmovmskb %xmm1, %edx
636	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
637	jnz	L(less4_double_words_16)
638
639	movdqu	32(%rdi), %xmm1
640	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
641	pcmpeqd	32(%rsi), %xmm1		/* compare first 4 double_words for equality */
642	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
643	pmovmskb %xmm1, %edx
644	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
645	jnz	L(less4_double_words_32)
646
647	add	$48, %rsi
648	add	$48, %rdi
649	jmp	L(continue_48_00)
650
651	.p2align 4
652L(continue_32_32):
653	movdqu	(%rdi), %xmm1
654	movdqu	(%rsi), %xmm2
655	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
656	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
657	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
658	pmovmskb %xmm1, %edx
659	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
660	jnz	L(less4_double_words)
661
662	add	$16, %rsi
663	add	$16, %rdi
664	jmp	L(continue_48_48)
665
666	.p2align 4
667L(continue_16_16):
668	movdqu	(%rdi), %xmm1
669	movdqu	(%rsi), %xmm2
670	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
671	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
672	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
673	pmovmskb %xmm1, %edx
674	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
675	jnz	L(less4_double_words)
676
677	movdqu	16(%rdi), %xmm3
678	movdqu	16(%rsi), %xmm4
679	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
680	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
681	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
682	pmovmskb %xmm3, %edx
683	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
684	jnz	L(less4_double_words_16)
685
686	add	$32, %rsi
687	add	$32, %rdi
688	jmp	L(continue_48_48)
689
690	.p2align 4
691L(continue_0_0):
692	movdqu	(%rdi), %xmm1
693	movdqu	(%rsi), %xmm2
694	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
695	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
696	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
697	pmovmskb %xmm1, %edx
698	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
699	jnz	L(less4_double_words)
700
701	movdqu	16(%rdi), %xmm3
702	movdqu	16(%rsi), %xmm4
703	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
704	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
705	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
706	pmovmskb %xmm3, %edx
707	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
708	jnz	L(less4_double_words_16)
709
710	movdqu	32(%rdi), %xmm1
711	movdqu	32(%rsi), %xmm2
712	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
713	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
714	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
715	pmovmskb %xmm1, %edx
716	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
717	jnz	L(less4_double_words_32)
718
719	add	$48, %rsi
720	add	$48, %rdi
721	jmp	L(continue_48_48)
722
723	.p2align 4
724L(continue_0_16):
725	movdqu	(%rdi), %xmm1
726	movdqu	(%rsi), %xmm2
727	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
728	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
729	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
730	pmovmskb %xmm1, %edx
731	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
732	jnz	L(less4_double_words)
733
734	movdqu	16(%rdi), %xmm1
735	movdqu	16(%rsi), %xmm2
736	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
737	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
738	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
739	pmovmskb %xmm1, %edx
740	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
741	jnz	L(less4_double_words_16)
742
743	add	$32, %rsi
744	add	$32, %rdi
745	jmp	L(continue_32_48)
746
747	.p2align 4
748L(continue_0_32):
749	movdqu	(%rdi), %xmm1
750	movdqu	(%rsi), %xmm2
751	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
752	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
753	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
754	pmovmskb %xmm1, %edx
755	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
756	jnz	L(less4_double_words)
757
758	add	$16, %rsi
759	add	$16, %rdi
760	jmp	L(continue_16_48)
761
762	.p2align 4
763L(continue_16_32):
764	movdqu	(%rdi), %xmm1
765	movdqu	(%rsi), %xmm2
766	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
767	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
768	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
769	pmovmskb %xmm1, %edx
770	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
771	jnz	L(less4_double_words)
772
773	add	$16, %rsi
774	add	$16, %rdi
775	jmp	L(continue_32_48)
776
777	.p2align 4
778L(less4_double_words1):
779	cmp	(%rsi), %eax
780	jne	L(nequal)
781	test	%eax, %eax
782	jz	L(equal)
783
784	mov	4(%rsi), %ecx
785	cmp	%ecx, 4(%rdi)
786	jne	L(nequal)
787	test	%ecx, %ecx
788	jz	L(equal)
789
790	mov	8(%rsi), %ecx
791	cmp	%ecx, 8(%rdi)
792	jne	L(nequal)
793	test	%ecx, %ecx
794	jz	L(equal)
795
796	mov	12(%rsi), %ecx
797	cmp	%ecx, 12(%rdi)
798	jne	L(nequal)
799	xor	%eax, %eax
800	ret
801
802	.p2align 4
803L(less4_double_words):
804	xor	%eax, %eax
805	test	%dl, %dl
806	jz	L(next_two_double_words)
807	and	$15, %dl
808	jz	L(second_double_word)
809	mov	(%rdi), %eax
810	cmp	(%rsi), %eax
811	jne	L(nequal)
812	ret
813
814	.p2align 4
815L(second_double_word):
816	mov	4(%rdi), %eax
817	cmp	4(%rsi), %eax
818	jne	L(nequal)
819	ret
820
821	.p2align 4
822L(next_two_double_words):
823	and	$15, %dh
824	jz	L(fourth_double_word)
825	mov	8(%rdi), %eax
826	cmp	8(%rsi), %eax
827	jne	L(nequal)
828	ret
829
830	.p2align 4
831L(fourth_double_word):
832	mov	12(%rdi), %eax
833	cmp	12(%rsi), %eax
834	jne	L(nequal)
835	ret
836
837	.p2align 4
838L(less4_double_words_16):
839	xor	%eax, %eax
840	test	%dl, %dl
841	jz	L(next_two_double_words_16)
842	and	$15, %dl
843	jz	L(second_double_word_16)
844	mov	16(%rdi), %eax
845	cmp	16(%rsi), %eax
846	jne	L(nequal)
847	ret
848
849	.p2align 4
850L(second_double_word_16):
851	mov	20(%rdi), %eax
852	cmp	20(%rsi), %eax
853	jne	L(nequal)
854	ret
855
856	.p2align 4
857L(next_two_double_words_16):
858	and	$15, %dh
859	jz	L(fourth_double_word_16)
860	mov	24(%rdi), %eax
861	cmp	24(%rsi), %eax
862	jne	L(nequal)
863	ret
864
865	.p2align 4
866L(fourth_double_word_16):
867	mov	28(%rdi), %eax
868	cmp	28(%rsi), %eax
869	jne	L(nequal)
870	ret
871
872	.p2align 4
873L(less4_double_words_32):
874	xor	%eax, %eax
875	test	%dl, %dl
876	jz	L(next_two_double_words_32)
877	and	$15, %dl
878	jz	L(second_double_word_32)
879	mov	32(%rdi), %eax
880	cmp	32(%rsi), %eax
881	jne	L(nequal)
882	ret
883
884	.p2align 4
885L(second_double_word_32):
886	mov	36(%rdi), %eax
887	cmp	36(%rsi), %eax
888	jne	L(nequal)
889	ret
890
891	.p2align 4
892L(next_two_double_words_32):
893	and	$15, %dh
894	jz	L(fourth_double_word_32)
895	mov	40(%rdi), %eax
896	cmp	40(%rsi), %eax
897	jne	L(nequal)
898	ret
899
900	.p2align 4
901L(fourth_double_word_32):
902	mov	44(%rdi), %eax
903	cmp	44(%rsi), %eax
904	jne	L(nequal)
905	ret
906
907	.p2align 4
908L(less4_double_words_48):
909	xor	%eax, %eax
910	test	%dl, %dl
911	jz	L(next_two_double_words_48)
912	and	$15, %dl
913	jz	L(second_double_word_48)
914	mov	48(%rdi), %eax
915	cmp	48(%rsi), %eax
916	jne	L(nequal)
917	ret
918
919	.p2align 4
920L(second_double_word_48):
921	mov	52(%rdi), %eax
922	cmp	52(%rsi), %eax
923	jne	L(nequal)
924	ret
925
926	.p2align 4
927L(next_two_double_words_48):
928	and	$15, %dh
929	jz	L(fourth_double_word_48)
930	mov	56(%rdi), %eax
931	cmp	56(%rsi), %eax
932	jne	L(nequal)
933	ret
934
935	.p2align 4
936L(fourth_double_word_48):
937	mov	60(%rdi), %eax
938	cmp	60(%rsi), %eax
939	jne	L(nequal)
940	ret
941
942	.p2align 4
943L(nequal):
944	mov	$1, %eax
945	jg	L(nequal_bigger)
946	neg	%eax
947
948L(nequal_bigger):
949	ret
950
951	.p2align 4
952L(equal):
953	xor	%rax, %rax
954	ret
955
956END (STRCMP)
957#endif
958