1/* wcscmp with SSE2
2   Copyright (C) 2011-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
/* Unwind-info bookkeeping for a 4-byte push of REG: grow the CFA offset
   by 4 and note REG at offset 0.  The cfi_* helpers are not defined in
   this file (presumably they come from <sysdep.h>).  */
23# define CFI_PUSH(REG)	\
24	cfi_adjust_cfa_offset (4);	\
25	cfi_rel_offset (REG, 0)
26
/* Unwind-info bookkeeping for a 4-byte pop of REG: shrink the CFA
   offset by 4 and mark REG as restored.  */
27# define CFI_POP(REG)	\
28	cfi_adjust_cfa_offset (-4);	\
29	cfi_restore (REG)
30
/* Push or pop REG and emit the matching unwind annotation.  */
31# define PUSH(REG) pushl REG; CFI_PUSH (REG)
32# define POP(REG) popl REG; CFI_POP (REG)
33
/* ENTRANCE saves %esi and %edi.  RETURN restores them in reverse order
   and returns; the CFI_PUSH pair after the ret re-declares both
   registers as pushed for whatever code follows the expansion.  */
34# define ENTRANCE PUSH(%esi); PUSH(%edi)
35# define RETURN  POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);
/* %esp-relative offsets of the two arguments.  They are only used at
   function entry, before ENTRANCE has pushed anything (the 4 presumably
   skips the return address).  */
36# define PARMS  4
37# define STR1  PARMS
38# define STR2  STR1+4
39
40/* Note: wcscmp uses signed comparison, not unsigned as in the strcmp function.  */
41
42	.text
43ENTRY (__wcscmp_sse2)
44/*
45	* This implementation uses SSE to compare up to 16 bytes at a time.
46*/
/* Both arguments are read from the stack: STR1 into %edx and STR2 into
   %eax.  Every exit path leaves the result in %eax: 0 when the strings
   are equal, otherwise 1 or -1 according to a signed comparison (jg) of
   the first differing 32-bit element, STR1 element versus STR2 element.  */
47	mov	STR1(%esp), %edx
48	mov	STR2(%esp), %eax
49
/* Compare the first four elements one at a time.  Nothing has been
   pushed yet, so these exits use L(neq)/L(eq), which return with a
   plain ret.  */
50	mov	(%eax), %ecx
51	cmp	%ecx, (%edx)
52	jne	L(neq)
53	test	%ecx, %ecx
54	jz	L(eq)
55
56	mov	4(%eax), %ecx
57	cmp	%ecx, 4(%edx)
58	jne	L(neq)
59	test	%ecx, %ecx
60	jz	L(eq)
61
62	mov	8(%eax), %ecx
63	cmp	%ecx, 8(%edx)
64	jne	L(neq)
65	test	%ecx, %ecx
66	jz	L(eq)
67
68	mov	12(%eax), %ecx
69	cmp	%ecx, 12(%edx)
70	jne	L(neq)
71	test	%ecx, %ecx
72	jz	L(eq)
73
/* From here on %esi/%edi are saved on the stack, so exits must go
   through the RETURN macro.  Both pointers step past the 16 bytes
   already compared.  */
74	ENTRANCE
75	add	$16, %eax
76	add	$16, %edx
77
/* %esi = cursor into STR2, %edi = cursor into STR1.  %ch/%cl keep the
   low address bytes for the 16-byte alignment tests; %eax/%edx are
   reduced to the offset inside a 64-byte block.  */
78	mov	%eax, %esi
79	mov	%edx, %edi
80	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
81	mov	%al, %ch
82	mov	%dl, %cl
83	and	$63, %eax		/* esi alignment in cache line */
84	and	$63, %edx		/* edi alignment in cache line */
/* First-level dispatch on %edi: 16-byte aligned -> L(continue_00);
   otherwise by 16-byte quarter of the 64-byte block (0-15, 16-31,
   32-47), with the 48-63 case falling through to L(continue_48).  */
85	and	$15, %cl
86	jz	L(continue_00)
87	cmp	$16, %edx
88	jb	L(continue_0)
89	cmp	$32, %edx
90	jb	L(continue_16)
91	cmp	$48, %edx
92	jb	L(continue_32)
93
/* Second-level dispatch for the case where %edi is not 16-byte aligned
   and its offset in the 64-byte block (%edx) is >= 48.  %ch holds the
   low address byte of %esi, %eax its offset in the 64-byte block.  */
94L(continue_48):
95	and	$15, %ch
96	jz	L(continue_48_00)
97	cmp	$16, %eax
98	jb	L(continue_0_48)
99	cmp	$32, %eax
100	jb	L(continue_16_48)
101	cmp	$48, %eax
102	jb	L(continue_32_48)
103
/* Main loop, 64 bytes per iteration: bytes 0-15 are compared element
   by element, bytes 16-63 with three unaligned SSE steps.
   NOTE(review): the scalar stretch presumably covers the span where a
   16-byte load would straddle a 64-byte boundary -- confirm.

   SSE step idiom (%xmm0 is expected to be all-zero before each step):
     pcmpeqd %xmm1, %xmm0   marks null elements of the %edi chunk
     pcmpeqd %xmm2, %xmm1   marks elements equal in both chunks
     psubb   %xmm0, %xmm1   clears the equal marks at null positions
   so the byte mask is 0xffff only if all four elements are equal and
   non-null; any other mask leaves the loop.  */
104	.p2align 4
105L(continue_48_48):
106	mov	(%esi), %ecx
107	cmp	%ecx, (%edi)
108	jne	L(nequal)
109	test	%ecx, %ecx
110	jz	L(equal)
111
112	mov	4(%esi), %ecx
113	cmp	%ecx, 4(%edi)
114	jne	L(nequal)
115	test	%ecx, %ecx
116	jz	L(equal)
117
118	mov	8(%esi), %ecx
119	cmp	%ecx, 8(%edi)
120	jne	L(nequal)
121	test	%ecx, %ecx
122	jz	L(equal)
123
124	mov	12(%esi), %ecx
125	cmp	%ecx, 12(%edi)
126	jne	L(nequal)
127	test	%ecx, %ecx
128	jz	L(equal)
129
130	movdqu	16(%edi), %xmm1
131	movdqu	16(%esi), %xmm2
132	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
133	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
134	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
135	pmovmskb %xmm1, %edx
136	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
137	jnz	L(less4_double_words_16)
138
139	movdqu	32(%edi), %xmm1
140	movdqu	32(%esi), %xmm2
141	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
142	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
143	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
144	pmovmskb %xmm1, %edx
145	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
146	jnz	L(less4_double_words_32)
147
148	movdqu	48(%edi), %xmm1
149	movdqu	48(%esi), %xmm2
150	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
151	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
152	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
153	pmovmskb %xmm1, %edx
154	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
155	jnz	L(less4_double_words_48)
156
/* All 64 bytes matched without a null: advance both cursors.  */
157	add	$64, %esi
158	add	$64, %edi
159	jmp	L(continue_48_48)
160
/* Second-level dispatch for the case where %edi is not 16-byte aligned
   and its offset in the 64-byte block (%edx) is below 16.  %ch holds
   the low address byte of %esi, %eax its offset in the 64-byte block.  */
161L(continue_0):
162	and	$15, %ch
163	jz	L(continue_0_00)
164	cmp	$16, %eax
165	jb	L(continue_0_0)
166	cmp	$32, %eax
167	jb	L(continue_0_16)
168	cmp	$48, %eax
169	jb	L(continue_0_32)
170
/* Main loop for one cursor in the 0-15 quarter and the other in the
   48-63 quarter (it is entered from both L(continue_0) and
   L(continue_48)).  Per 64-byte iteration: bytes 0-15 scalar, bytes
   16-47 with two SSE steps, bytes 48-63 scalar.
   NOTE(review): the scalar stretches presumably cover the spans where
   one cursor would need a 16-byte load across a 64-byte boundary --
   confirm.

   In each SSE step %xmm0 (expected all-zero beforehand) picks up the
   null elements, %xmm1 the equal elements, and psubb removes the equal
   marks at null positions; a mask other than 0xffff leaves the loop.  */
171	.p2align 4
172L(continue_0_48):
173	mov	(%esi), %ecx
174	cmp	%ecx, (%edi)
175	jne	L(nequal)
176	test	%ecx, %ecx
177	jz	L(equal)
178
179	mov	4(%esi), %ecx
180	cmp	%ecx, 4(%edi)
181	jne	L(nequal)
182	test	%ecx, %ecx
183	jz	L(equal)
184
185	mov	8(%esi), %ecx
186	cmp	%ecx, 8(%edi)
187	jne	L(nequal)
188	test	%ecx, %ecx
189	jz	L(equal)
190
191	mov	12(%esi), %ecx
192	cmp	%ecx, 12(%edi)
193	jne	L(nequal)
194	test	%ecx, %ecx
195	jz	L(equal)
196
197	movdqu	16(%edi), %xmm1
198	movdqu	16(%esi), %xmm2
199	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
200	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
201	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
202	pmovmskb %xmm1, %edx
203	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
204	jnz	L(less4_double_words_16)
205
206	movdqu	32(%edi), %xmm1
207	movdqu	32(%esi), %xmm2
208	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
209	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
210	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
211	pmovmskb %xmm1, %edx
212	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
213	jnz	L(less4_double_words_32)
214
/* Bytes 48-63: element-by-element again.  */
215	mov	48(%esi), %ecx
216	cmp	%ecx, 48(%edi)
217	jne	L(nequal)
218	test	%ecx, %ecx
219	jz	L(equal)
220
221	mov	52(%esi), %ecx
222	cmp	%ecx, 52(%edi)
223	jne	L(nequal)
224	test	%ecx, %ecx
225	jz	L(equal)
226
227	mov	56(%esi), %ecx
228	cmp	%ecx, 56(%edi)
229	jne	L(nequal)
230	test	%ecx, %ecx
231	jz	L(equal)
232
233	mov	60(%esi), %ecx
234	cmp	%ecx, 60(%edi)
235	jne	L(nequal)
236	test	%ecx, %ecx
237	jz	L(equal)
238
239	add	$64, %esi
240	add	$64, %edi
241	jmp	L(continue_0_48)
242
/* Second-level dispatch for the case where %edi is 16-byte aligned.
   %ch holds the low address byte of %esi, %eax its offset in the
   64-byte block.  */
243	.p2align 4
244L(continue_00):
245	and	$15, %ch
246	jz	L(continue_00_00)
247	cmp	$16, %eax
248	jb	L(continue_00_0)
249	cmp	$32, %eax
250	jb	L(continue_00_16)
251	cmp	$48, %eax
252	jb	L(continue_00_32)
253
/* Main loop for %edi 16-byte aligned and %esi in the 48-63 quarter of
   its 64-byte block.  The first 16 bytes are handled specially: pcmpeqd
   with a memory operand checks the %edi chunk for nulls (%xmm0 is
   expected to be all-zero beforehand).  If a null is present control
   goes to L(less4_double_words1) with element 0 of the %edi chunk in
   %eax; otherwise the four elements are compared one by one without
   further null tests.  */
254	.p2align 4
255L(continue_00_48):
256	pcmpeqd	(%edi), %xmm0
257	mov	(%edi), %eax
258	pmovmskb %xmm0, %ecx
259	test	%ecx, %ecx
260	jnz	L(less4_double_words1)
261
262	cmp	(%esi), %eax
263	jne	L(nequal)
264
265	mov	4(%edi), %eax
266	cmp	4(%esi), %eax
267	jne	L(nequal)
268
269	mov	8(%edi), %eax
270	cmp	8(%esi), %eax
271	jne	L(nequal)
272
273	mov	12(%edi), %eax
274	cmp	12(%esi), %eax
275	jne	L(nequal)
276
/* Bytes 16-63: three SSE steps.  Here the null check is made on the
   %esi chunk (loaded unaligned into %xmm2); the %edi chunk is used as
   a memory operand.  psubb removes the equal marks at null positions,
   so a mask other than 0xffff means a difference or a null.  */
277	movdqu	16(%esi), %xmm2
278	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
279	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
280	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
281	pmovmskb %xmm2, %edx
282	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
283	jnz	L(less4_double_words_16)
284
285	movdqu	32(%esi), %xmm2
286	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
287	pcmpeqd	32(%edi), %xmm2		/* compare first 4 double_words for equality */
288	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
289	pmovmskb %xmm2, %edx
290	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
291	jnz	L(less4_double_words_32)
292
293	movdqu	48(%esi), %xmm2
294	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
295	pcmpeqd	48(%edi), %xmm2		/* compare first 4 double_words for equality */
296	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
297	pmovmskb %xmm2, %edx
298	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
299	jnz	L(less4_double_words_48)
300
301	add	$64, %esi
302	add	$64, %edi
303	jmp	L(continue_00_48)
304
/* Second-level dispatch for the case where %edi is not 16-byte aligned
   and its offset in the 64-byte block is in 32-47.  %ch holds the low
   address byte of %esi, %eax its offset in the 64-byte block.  */
305	.p2align 4
306L(continue_32):
307	and	$15, %ch
308	jz	L(continue_32_00)
309	cmp	$16, %eax
310	jb	L(continue_0_32)
311	cmp	$32, %eax
312	jb	L(continue_16_32)
313	cmp	$48, %eax
314	jb	L(continue_32_32)
315
/* Main loop for one cursor in the 32-47 quarter and the other in the
   48-63 quarter.  Per 64-byte iteration: bytes 0-31 are compared
   element by element, bytes 32-63 with two unaligned SSE steps.
   NOTE(review): the scalar stretch presumably covers the spans where
   one cursor would need a 16-byte load across a 64-byte boundary --
   confirm.  */
316	.p2align 4
317L(continue_32_48):
318	mov	(%esi), %ecx
319	cmp	%ecx, (%edi)
320	jne	L(nequal)
321	test	%ecx, %ecx
322	jz	L(equal)
323
324	mov	4(%esi), %ecx
325	cmp	%ecx, 4(%edi)
326	jne	L(nequal)
327	test	%ecx, %ecx
328	jz	L(equal)
329
330	mov	8(%esi), %ecx
331	cmp	%ecx, 8(%edi)
332	jne	L(nequal)
333	test	%ecx, %ecx
334	jz	L(equal)
335
336	mov	12(%esi), %ecx
337	cmp	%ecx, 12(%edi)
338	jne	L(nequal)
339	test	%ecx, %ecx
340	jz	L(equal)
341
342	mov	16(%esi), %ecx
343	cmp	%ecx, 16(%edi)
344	jne	L(nequal)
345	test	%ecx, %ecx
346	jz	L(equal)
347
348	mov	20(%esi), %ecx
349	cmp	%ecx, 20(%edi)
350	jne	L(nequal)
351	test	%ecx, %ecx
352	jz	L(equal)
353
354	mov	24(%esi), %ecx
355	cmp	%ecx, 24(%edi)
356	jne	L(nequal)
357	test	%ecx, %ecx
358	jz	L(equal)
359
360	mov	28(%esi), %ecx
361	cmp	%ecx, 28(%edi)
362	jne	L(nequal)
363	test	%ecx, %ecx
364	jz	L(equal)
365
/* SSE steps: %xmm0 (expected all-zero beforehand) picks up the null
   elements, %xmm1 the equal elements, and psubb removes the equal
   marks at null positions; a mask other than 0xffff leaves the loop.  */
366	movdqu	32(%edi), %xmm1
367	movdqu	32(%esi), %xmm2
368	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
369	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
370	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
371	pmovmskb %xmm1, %edx
372	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
373	jnz	L(less4_double_words_32)
374
375	movdqu	48(%edi), %xmm1
376	movdqu	48(%esi), %xmm2
377	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
378	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
379	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
380	pmovmskb %xmm1, %edx
381	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
382	jnz	L(less4_double_words_48)
383
384	add	$64, %esi
385	add	$64, %edi
386	jmp	L(continue_32_48)
387
/* Second-level dispatch for the case where %edi is not 16-byte aligned
   and its offset in the 64-byte block is in 16-31.  %ch holds the low
   address byte of %esi, %eax its offset in the 64-byte block.  */
388	.p2align 4
389L(continue_16):
390	and	$15, %ch
391	jz	L(continue_16_00)
392	cmp	$16, %eax
393	jb	L(continue_0_16)
394	cmp	$32, %eax
395	jb	L(continue_16_16)
396	cmp	$48, %eax
397	jb	L(continue_16_32)
398
/* Main loop for one cursor in the 16-31 quarter and the other in the
   48-63 quarter.  Per 64-byte iteration: bytes 0-15 scalar, 16-31 SSE,
   32-47 scalar, 48-63 SSE.
   NOTE(review): the scalar stretches presumably cover the spans where
   one cursor would need a 16-byte load across a 64-byte boundary --
   confirm.  */
399	.p2align 4
400L(continue_16_48):
401	mov	(%esi), %ecx
402	cmp	%ecx, (%edi)
403	jne	L(nequal)
404	test	%ecx, %ecx
405	jz	L(equal)
406
407	mov	4(%esi), %ecx
408	cmp	%ecx, 4(%edi)
409	jne	L(nequal)
410	test	%ecx, %ecx
411	jz	L(equal)
412
413	mov	8(%esi), %ecx
414	cmp	%ecx, 8(%edi)
415	jne	L(nequal)
416	test	%ecx, %ecx
417	jz	L(equal)
418
419	mov	12(%esi), %ecx
420	cmp	%ecx, 12(%edi)
421	jne	L(nequal)
422	test	%ecx, %ecx
423	jz	L(equal)
424
/* SSE step: %xmm0 (expected all-zero beforehand) picks up the null
   elements, %xmm1 the equal elements, and psubb removes the equal
   marks at null positions; a mask other than 0xffff leaves the loop.  */
425	movdqu	16(%edi), %xmm1
426	movdqu	16(%esi), %xmm2
427	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
428	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
429	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
430	pmovmskb %xmm1, %edx
431	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
432	jnz	L(less4_double_words_16)
433
/* Bytes 32-47: element-by-element again.  */
434	mov	32(%esi), %ecx
435	cmp	%ecx, 32(%edi)
436	jne	L(nequal)
437	test	%ecx, %ecx
438	jz	L(equal)
439
440	mov	36(%esi), %ecx
441	cmp	%ecx, 36(%edi)
442	jne	L(nequal)
443	test	%ecx, %ecx
444	jz	L(equal)
445
446	mov	40(%esi), %ecx
447	cmp	%ecx, 40(%edi)
448	jne	L(nequal)
449	test	%ecx, %ecx
450	jz	L(equal)
451
452	mov	44(%esi), %ecx
453	cmp	%ecx, 44(%edi)
454	jne	L(nequal)
455	test	%ecx, %ecx
456	jz	L(equal)
457
458	movdqu	48(%edi), %xmm1
459	movdqu	48(%esi), %xmm2
460	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
461	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
462	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
463	pmovmskb %xmm1, %edx
464	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
465	jnz	L(less4_double_words_48)
466
467	add	$64, %esi
468	add	$64, %edi
469	jmp	L(continue_16_48)
470
/* Main loop for both cursors 16-byte aligned: four aligned SSE steps
   (movdqa from %edi, %esi chunk as memory operand) per 64-byte
   iteration, no scalar compares.  In each step %xmm0 (expected
   all-zero beforehand) picks up the null elements of the %edi chunk,
   the data register the equal elements, and psubb removes the equal
   marks at null positions; a mask other than 0xffff leaves the loop
   through the L(less4_double_words*) handler for that 16-byte offset.  */
471	.p2align 4
472L(continue_00_00):
473	movdqa	(%edi), %xmm1
474	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
475	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
476	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
477	pmovmskb %xmm1, %edx
478	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
479	jnz	L(less4_double_words)
480
481	movdqa	16(%edi), %xmm3
482	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
483	pcmpeqd	16(%esi), %xmm3		/* compare first 4 double_words for equality */
484	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
485	pmovmskb %xmm3, %edx
486	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
487	jnz	L(less4_double_words_16)
488
489	movdqa	32(%edi), %xmm5
490	pcmpeqd	%xmm5, %xmm0		/* Any null double_word? */
491	pcmpeqd	32(%esi), %xmm5		/* compare first 4 double_words for equality */
492	psubb	%xmm0, %xmm5		/* packed sub of comparison results*/
493	pmovmskb %xmm5, %edx
494	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
495	jnz	L(less4_double_words_32)
496
497	movdqa	48(%edi), %xmm1
498	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
499	pcmpeqd	48(%esi), %xmm1		/* compare first 4 double_words for equality */
500	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
501	pmovmskb %xmm1, %edx
502	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
503	jnz	L(less4_double_words_48)
504
505	add	$64, %esi
506	add	$64, %edi
507	jmp	L(continue_00_00)
508
/* Lead-in paths for %edi 16-byte aligned and %esi in the 32-47, 16-31
   or 0-15 quarter of its 64-byte block.  Each runs one, two or three
   SSE steps, then advances both cursors by 16, 32 or 48 bytes (which
   puts %esi in the 48-63 quarter) and joins the L(continue_00_48) loop.

   SSE step: the %esi chunk is loaded unaligned into %xmm2 and checked
   for nulls against %xmm0 (expected all-zero beforehand); the %edi
   chunk is a memory operand.  psubb removes the equal marks at null
   positions, so a mask other than 0xffff means a difference or a null.  */
509	.p2align 4
510L(continue_00_32):
511	movdqu	(%esi), %xmm2
512	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
513	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
514	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
515	pmovmskb %xmm2, %edx
516	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
517	jnz	L(less4_double_words)
518
519	add	$16, %esi
520	add	$16, %edi
521	jmp	L(continue_00_48)
522
523	.p2align 4
524L(continue_00_16):
525	movdqu	(%esi), %xmm2
526	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
527	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
528	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
529	pmovmskb %xmm2, %edx
530	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
531	jnz	L(less4_double_words)
532
533	movdqu	16(%esi), %xmm2
534	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
535	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
536	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
537	pmovmskb %xmm2, %edx
538	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
539	jnz	L(less4_double_words_16)
540
541	add	$32, %esi
542	add	$32, %edi
543	jmp	L(continue_00_48)
544
545	.p2align 4
546L(continue_00_0):
547	movdqu	(%esi), %xmm2
548	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
549	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
550	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
551	pmovmskb %xmm2, %edx
552	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
553	jnz	L(less4_double_words)
554
555	movdqu	16(%esi), %xmm2
556	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
557	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
558	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
559	pmovmskb %xmm2, %edx
560	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
561	jnz	L(less4_double_words_16)
562
563	movdqu	32(%esi), %xmm2
564	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
565	pcmpeqd	32(%edi), %xmm2		/* compare first 4 double_words for equality */
566	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
567	pmovmskb %xmm2, %edx
568	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
569	jnz	L(less4_double_words_32)
570
571	add	$48, %esi
572	add	$48, %edi
573	jmp	L(continue_00_48)
574
/* Main loop for %esi 16-byte aligned and %edi in the 48-63 quarter of
   its 64-byte block.  The first 16 bytes are handled specially: pcmpeqd
   with a memory operand checks the %esi chunk for nulls (%xmm0 is
   expected to be all-zero beforehand).  If a null is present control
   goes to L(less4_double_words1) with element 0 of the %edi chunk in
   %eax; otherwise the four elements are compared one by one without
   further null tests.  */
575	.p2align 4
576L(continue_48_00):
577	pcmpeqd	(%esi), %xmm0
578	mov	(%edi), %eax
579	pmovmskb %xmm0, %ecx
580	test	%ecx, %ecx
581	jnz	L(less4_double_words1)
582
583	cmp	(%esi), %eax
584	jne	L(nequal)
585
586	mov	4(%edi), %eax
587	cmp	4(%esi), %eax
588	jne	L(nequal)
589
590	mov	8(%edi), %eax
591	cmp	8(%esi), %eax
592	jne	L(nequal)
593
594	mov	12(%edi), %eax
595	cmp	12(%esi), %eax
596	jne	L(nequal)
597
/* Bytes 16-63: three SSE steps.  The %edi chunk is loaded unaligned
   into %xmm1 and checked for nulls; the %esi chunk is a memory operand.
   psubb removes the equal marks at null positions, so a mask other
   than 0xffff means a difference or a null.  */
598	movdqu	16(%edi), %xmm1
599	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
600	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
601	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
602	pmovmskb %xmm1, %edx
603	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
604	jnz	L(less4_double_words_16)
605
606	movdqu	32(%edi), %xmm1
607	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
608	pcmpeqd	32(%esi), %xmm1		/* compare first 4 double_words for equality */
609	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
610	pmovmskb %xmm1, %edx
611	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
612	jnz	L(less4_double_words_32)
613
614	movdqu	48(%edi), %xmm1
615	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
616	pcmpeqd	48(%esi), %xmm1		/* compare first 4 double_words for equality */
617	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
618	pmovmskb %xmm1, %edx
619	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
620	jnz	L(less4_double_words_48)
621
622	add	$64, %esi
623	add	$64, %edi
624	jmp	L(continue_48_00)
625
/* Lead-in paths for %esi 16-byte aligned and %edi in the 32-47, 16-31
   or 0-15 quarter of its 64-byte block.  Each runs one, two or three
   SSE steps, then advances both cursors by 16, 32 or 48 bytes (which
   puts %edi in the 48-63 quarter) and joins the L(continue_48_00) loop.

   SSE step: the %edi chunk is loaded unaligned into %xmm1 and checked
   for nulls against %xmm0 (expected all-zero beforehand); the %esi
   chunk is a memory operand.  psubb removes the equal marks at null
   positions, so a mask other than 0xffff means a difference or a null.  */
626	.p2align 4
627L(continue_32_00):
628	movdqu	(%edi), %xmm1
629	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
630	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
631	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
632	pmovmskb %xmm1, %edx
633	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
634	jnz	L(less4_double_words)
635
636	add	$16, %esi
637	add	$16, %edi
638	jmp	L(continue_48_00)
639
640	.p2align 4
641L(continue_16_00):
642	movdqu	(%edi), %xmm1
643	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
644	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
645	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
646	pmovmskb %xmm1, %edx
647	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
648	jnz	L(less4_double_words)
649
650	movdqu	16(%edi), %xmm1
651	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
652	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
653	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
654	pmovmskb %xmm1, %edx
655	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
656	jnz	L(less4_double_words_16)
657
658	add	$32, %esi
659	add	$32, %edi
660	jmp	L(continue_48_00)
661
662	.p2align 4
663L(continue_0_00):
664	movdqu	(%edi), %xmm1
665	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
666	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
667	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
668	pmovmskb %xmm1, %edx
669	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
670	jnz	L(less4_double_words)
671
672	movdqu	16(%edi), %xmm1
673	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
674	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
675	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
676	pmovmskb %xmm1, %edx
677	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
678	jnz	L(less4_double_words_16)
679
680	movdqu	32(%edi), %xmm1
681	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
682	pcmpeqd	32(%esi), %xmm1		/* compare first 4 double_words for equality */
683	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
684	pmovmskb %xmm1, %edx
685	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
686	jnz	L(less4_double_words_32)
687
688	add	$48, %esi
689	add	$48, %edi
690	jmp	L(continue_48_00)
691
/* Lead-in paths for both cursors unaligned and in the same quarter of
   their 64-byte blocks (32-47, 16-31 or 0-15).  Each runs one, two or
   three unaligned SSE steps, then advances both cursors by 16, 32 or
   48 bytes (which puts both in the 48-63 quarter) and joins the
   L(continue_48_48) loop.

   SSE step: %xmm0 (expected all-zero beforehand) picks up the null
   elements of the %edi chunk, the data register the equal elements,
   and psubb removes the equal marks at null positions; a mask other
   than 0xffff means a difference or a null.  */
692	.p2align 4
693L(continue_32_32):
694	movdqu	(%edi), %xmm1
695	movdqu	(%esi), %xmm2
696	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
697	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
698	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
699	pmovmskb %xmm1, %edx
700	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
701	jnz	L(less4_double_words)
702
703	add	$16, %esi
704	add	$16, %edi
705	jmp	L(continue_48_48)
706
707	.p2align 4
708L(continue_16_16):
709	movdqu	(%edi), %xmm1
710	movdqu	(%esi), %xmm2
711	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
712	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
713	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
714	pmovmskb %xmm1, %edx
715	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
716	jnz	L(less4_double_words)
717
718	movdqu	16(%edi), %xmm3
719	movdqu	16(%esi), %xmm4
720	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
721	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
722	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
723	pmovmskb %xmm3, %edx
724	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
725	jnz	L(less4_double_words_16)
726
727	add	$32, %esi
728	add	$32, %edi
729	jmp	L(continue_48_48)
730
731	.p2align 4
732L(continue_0_0):
733	movdqu	(%edi), %xmm1
734	movdqu	(%esi), %xmm2
735	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
736	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
737	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
738	pmovmskb %xmm1, %edx
739	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
740	jnz	L(less4_double_words)
741
742	movdqu	16(%edi), %xmm3
743	movdqu	16(%esi), %xmm4
744	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
745	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
746	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
747	pmovmskb %xmm3, %edx
748	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
749	jnz	L(less4_double_words_16)
750
751	movdqu	32(%edi), %xmm1
752	movdqu	32(%esi), %xmm2
753	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
754	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
755	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
756	pmovmskb %xmm1, %edx
757	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
758	jnz	L(less4_double_words_32)
759
760	add	$48, %esi
761	add	$48, %edi
762	jmp	L(continue_48_48)
763
/* Lead-in paths for both cursors unaligned and in different quarters
   of their 64-byte blocks, neither in the 48-63 quarter.  Each runs
   one or two unaligned SSE steps, then advances both cursors so that
   the further-along one lands in the 48-63 quarter and joins the
   matching loop:
     L(continue_0_16)  +32 -> L(continue_32_48)
     L(continue_0_32)  +16 -> L(continue_16_48)
     L(continue_16_32) +16 -> L(continue_32_48)

   SSE step: %xmm0 (expected all-zero beforehand) picks up the null
   elements of the %edi chunk, %xmm1 the equal elements, and psubb
   removes the equal marks at null positions; a mask other than 0xffff
   means a difference or a null.  */
764	.p2align 4
765L(continue_0_16):
766	movdqu	(%edi), %xmm1
767	movdqu	(%esi), %xmm2
768	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
769	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
770	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
771	pmovmskb %xmm1, %edx
772	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
773	jnz	L(less4_double_words)
774
775	movdqu	16(%edi), %xmm1
776	movdqu	16(%esi), %xmm2
777	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
778	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
779	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
780	pmovmskb %xmm1, %edx
781	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
782	jnz	L(less4_double_words_16)
783
784	add	$32, %esi
785	add	$32, %edi
786	jmp	L(continue_32_48)
787
788	.p2align 4
789L(continue_0_32):
790	movdqu	(%edi), %xmm1
791	movdqu	(%esi), %xmm2
792	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
793	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
794	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
795	pmovmskb %xmm1, %edx
796	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
797	jnz	L(less4_double_words)
798
799	add	$16, %esi
800	add	$16, %edi
801	jmp	L(continue_16_48)
802
803	.p2align 4
804L(continue_16_32):
805	movdqu	(%edi), %xmm1
806	movdqu	(%esi), %xmm2
807	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
808	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
809	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
810	pmovmskb %xmm1, %edx
811	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
812	jnz	L(less4_double_words)
813
814	add	$16, %esi
815	add	$16, %edi
816	jmp	L(continue_32_48)
817
/* Tail handler used by L(continue_00_48) and L(continue_48_00) once
   the aligned 16-byte chunk at offset 0 is known to contain a null
   element.  On entry %eax holds element 0 of the %edi chunk.  The
   elements are compared one by one; the first mismatch goes to
   L(nequal), the first matching null to L(equal).  */
818	.p2align 4
819L(less4_double_words1):
820	cmp	(%esi), %eax
821	jne	L(nequal)
822	test	%eax, %eax
823	jz	L(equal)
824
825	mov	4(%esi), %ecx
826	cmp	%ecx, 4(%edi)
827	jne	L(nequal)
828	test	%ecx, %ecx
829	jz	L(equal)
830
831	mov	8(%esi), %ecx
832	cmp	%ecx, 8(%edi)
833	jne	L(nequal)
834	test	%ecx, %ecx
835	jz	L(equal)
836
837	mov	12(%esi), %ecx
838	cmp	%ecx, 12(%edi)
839	jne	L(nequal)
/* Element 3 matches too.  No null test is made here: elements 0-2 were
   equal and non-null, so the null that brought us here must be this
   element, and the result is 0.  */
840	xor	%eax, %eax
841	RETURN
842
/* Tail handler for a failed SSE step on the chunk at offset 0.
   On entry %edx = byte mask - 0xffff, where the mask has four set bits
   for every element that is equal and non-null.  So:
     %dl == 0         elements 0 and 1 are fine, look at 2 or 3
     (%dl & 15) == 0  element 0 is fine, element 1 is the culprit
     (%dh & 15) == 0  element 2 is fine, element 3 is the culprit
   The culprit element is compared again: a mismatch goes to L(nequal)
   (which uses the flags of this cmp); if it matches, it was flagged
   because it is null, and the pre-zeroed %eax is returned.  */
843	.p2align 4
844L(less4_double_words):
845	xor	%eax, %eax
846	test	%dl, %dl
847	jz	L(next_two_double_words)
848	and	$15, %dl
849	jz	L(second_double_word)
850	mov	(%esi), %ecx
851	cmp	%ecx, (%edi)
852	jne	L(nequal)
853	RETURN
854
855	.p2align 4
856L(second_double_word):
857	mov	4(%esi), %ecx
858	cmp	%ecx, 4(%edi)
859	jne	L(nequal)
860	RETURN
861
862	.p2align 4
863L(next_two_double_words):
864	and	$15, %dh
865	jz	L(fourth_double_word)
866	mov	8(%esi), %ecx
867	cmp	%ecx, 8(%edi)
868	jne	L(nequal)
869	RETURN
870
871	.p2align 4
872L(fourth_double_word):
873	mov	12(%esi), %ecx
874	cmp	%ecx, 12(%edi)
875	jne	L(nequal)
876	RETURN
877
/* Tail handler for a failed SSE step on the chunk at offset 16
   (elements at byte offsets 16, 20, 24, 28).
   On entry %edx = byte mask - 0xffff, where the mask has four set bits
   for every element that is equal and non-null.  So:
     %dl == 0         elements 0 and 1 are fine, look at 2 or 3
     (%dl & 15) == 0  element 0 is fine, element 1 is the culprit
     (%dh & 15) == 0  element 2 is fine, element 3 is the culprit
   The culprit element is compared again: a mismatch goes to L(nequal)
   (which uses the flags of this cmp); if it matches, it was flagged
   because it is null, and the pre-zeroed %eax is returned.  */
878	.p2align 4
879L(less4_double_words_16):
880	xor	%eax, %eax
881	test	%dl, %dl
882	jz	L(next_two_double_words_16)
883	and	$15, %dl
884	jz	L(second_double_word_16)
885	mov	16(%esi), %ecx
886	cmp	%ecx, 16(%edi)
887	jne	L(nequal)
888	RETURN
889
890	.p2align 4
891L(second_double_word_16):
892	mov	20(%esi), %ecx
893	cmp	%ecx, 20(%edi)
894	jne	L(nequal)
895	RETURN
896
897	.p2align 4
898L(next_two_double_words_16):
899	and	$15, %dh
900	jz	L(fourth_double_word_16)
901	mov	24(%esi), %ecx
902	cmp	%ecx, 24(%edi)
903	jne	L(nequal)
904	RETURN
905
906	.p2align 4
907L(fourth_double_word_16):
908	mov	28(%esi), %ecx
909	cmp	%ecx, 28(%edi)
910	jne	L(nequal)
911	RETURN
912
/* Tail handler for a failed SSE step on the chunk at offset 32
   (elements at byte offsets 32, 36, 40, 44).
   On entry %edx = byte mask - 0xffff, where the mask has four set bits
   for every element that is equal and non-null.  So:
     %dl == 0         elements 0 and 1 are fine, look at 2 or 3
     (%dl & 15) == 0  element 0 is fine, element 1 is the culprit
     (%dh & 15) == 0  element 2 is fine, element 3 is the culprit
   The culprit element is compared again: a mismatch goes to L(nequal)
   (which uses the flags of this cmp); if it matches, it was flagged
   because it is null, and the pre-zeroed %eax is returned.  */
913	.p2align 4
914L(less4_double_words_32):
915	xor	%eax, %eax
916	test	%dl, %dl
917	jz	L(next_two_double_words_32)
918	and	$15, %dl
919	jz	L(second_double_word_32)
920	mov	32(%esi), %ecx
921	cmp	%ecx, 32(%edi)
922	jne	L(nequal)
923	RETURN
924
925	.p2align 4
926L(second_double_word_32):
927	mov	36(%esi), %ecx
928	cmp	%ecx, 36(%edi)
929	jne	L(nequal)
930	RETURN
931
932	.p2align 4
933L(next_two_double_words_32):
934	and	$15, %dh
935	jz	L(fourth_double_word_32)
936	mov	40(%esi), %ecx
937	cmp	%ecx, 40(%edi)
938	jne	L(nequal)
939	RETURN
940
941	.p2align 4
942L(fourth_double_word_32):
943	mov	44(%esi), %ecx
944	cmp	%ecx, 44(%edi)
945	jne	L(nequal)
946	RETURN
947
/* Tail handler for a failed SSE step on the chunk at offset 48
   (elements at byte offsets 48, 52, 56, 60).
   On entry %edx = byte mask - 0xffff, where the mask has four set bits
   for every element that is equal and non-null.  So:
     %dl == 0         elements 0 and 1 are fine, look at 2 or 3
     (%dl & 15) == 0  element 0 is fine, element 1 is the culprit
     (%dh & 15) == 0  element 2 is fine, element 3 is the culprit
   The culprit element is compared again: a mismatch goes to L(nequal)
   (which uses the flags of this cmp); if it matches, it was flagged
   because it is null, and the pre-zeroed %eax is returned.  */
948	.p2align 4
949L(less4_double_words_48):
950	xor	%eax, %eax
951	test	%dl, %dl
952	jz	L(next_two_double_words_48)
953	and	$15, %dl
954	jz	L(second_double_word_48)
955	mov	48(%esi), %ecx
956	cmp	%ecx, 48(%edi)
957	jne	L(nequal)
958	RETURN
959
960	.p2align 4
961L(second_double_word_48):
962	mov	52(%esi), %ecx
963	cmp	%ecx, 52(%edi)
964	jne	L(nequal)
965	RETURN
966
967	.p2align 4
968L(next_two_double_words_48):
969	and	$15, %dh
970	jz	L(fourth_double_word_48)
971	mov	56(%esi), %ecx
972	cmp	%ecx, 56(%edi)
973	jne	L(nequal)
974	RETURN
975
976	.p2align 4
977L(fourth_double_word_48):
978	mov	60(%esi), %ecx
979	cmp	%ecx, 60(%edi)
980	jne	L(nequal)
981	RETURN
982
/* Common exits.  L(nequal), L(return) and L(equal) are used after
   ENTRANCE and restore %esi/%edi through RETURN; L(neq) and L(eq) are
   used by the first four compares, before anything was pushed, and
   return with a plain ret.

   L(nequal) and L(neq) rely on the flags of the cmp that preceded the
   jump (mov does not change them): jg is taken when the STR1 element
   is greater than the STR2 element as signed values, giving 1;
   otherwise 1 is negated to -1.  */
983	.p2align 4
984L(nequal):
985	mov	$1, %eax
986	jg	L(return)
987	neg	%eax
988	RETURN
989
990	.p2align 4
991L(return):
992	RETURN
993
994	.p2align 4
995L(equal):
996	xorl	%eax, %eax
997	RETURN
998
/* Undo the unwind state left by the last RETURN expansion: the code
   below runs with %esi/%edi not pushed.  */
999	CFI_POP (%edi)
1000	CFI_POP (%esi)
1001
1002	.p2align 4
1003L(neq):
1004	mov	$1, %eax
1005	jg	L(neq_bigger)
1006	neg	%eax
1007
1008L(neq_bigger):
1009	ret
1010
1011	.p2align 4
1012L(eq):
1013	xorl	%eax, %eax
1014	ret
1015
1016END (__wcscmp_sse2)
1017#endif
1018