1/* strcmp optimized with SSE2.
2   Copyright (C) 2017-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <isa-level.h>
20
21/* Continue building at ISA level 2 as the strcmp-sse42 is not always
22   preferable for ISA level == 2 CPUs.  */
23#if ISA_SHOULD_BUILD (2)
24
25# define STRCMP_ISA	_sse2
26# include "strcmp-naming.h"
27
28# include <sysdep.h>
29
/* UPDATE_STRNCMP_COUNTER is (re)defined below according to the variant
   of the routine that is being built.  */
30# undef UPDATE_STRNCMP_COUNTER
31
/* LABEL(l) names a local label.  It maps onto L(l) unless the including
   file already supplied its own definition.  */
32# ifndef LABEL
33#  define LABEL(l) L(l)
34# endif
35
36# ifdef USE_AS_STRNCMP
37/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
38   if the new counter > the old one or is 0.  */
/* Expansion: %r9 = %rcx + %r11 - 16.  Control leaves through
   strcmp_exitz when %r9 is above the old %r11 (unsigned compare) or when
   %r9 is zero; otherwise %r9 becomes the new %r11.  Clobbers %r9 and the
   flags.  */
39#  define UPDATE_STRNCMP_COUNTER				\
40	/* calculate left number to compare */		\
41	lea	-16(%rcx, %r11), %r9;			\
42	cmp	%r9, %r11;				\
43	jb	LABEL(strcmp_exitz);			\
44	test	%r9, %r9;				\
45	je	LABEL(strcmp_exitz);			\
46	mov	%r9, %r11
47
48# elif defined USE_AS_STRCASECMP_L
49#  include "locale-defines.h"
50
/* No counter is maintained in this variant: the macro expands to
   nothing.  */
51#  define UPDATE_STRNCMP_COUNTER
52# elif defined USE_AS_STRNCASECMP_L
53#  include "locale-defines.h"
54
/* Same expansion as in the USE_AS_STRNCMP branch above.  */
55#  define UPDATE_STRNCMP_COUNTER				\
56	/* calculate left number to compare */		\
57	lea	-16(%rcx, %r11), %r9;			\
58	cmp	%r9, %r11;				\
59	jb	LABEL(strcmp_exitz);			\
60	test	%r9, %r9;				\
61	je	LABEL(strcmp_exitz);			\
62	mov	%r9, %r11
63# else
/* Plain strcmp build: no counter, the macro expands to nothing.  */
64#  define UPDATE_STRNCMP_COUNTER
65# endif
66
67	.text
68# ifdef USE_AS_STRCASECMP_L
69#  ifndef ENTRY2
70#   define ENTRY2(name) ENTRY (name)
71#   define END2(name) END (name)
72#  endif
73
/* Entry point that takes no locale argument: read the locale pointer
   stored in the TLS variable __libc_tsd_LOCALE (addressed through %fs)
   into %rdx, the register in which ENTRY (STRCMP) below reads the
   locale, and fall through into it.  */
74ENTRY2 (STRCASECMP)
75	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax	/* %rax = TLS offset of the variable */
76	mov	%fs:(%rax),%RDX_LP	/* locale pointer -> %rdx */
77
78	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
79	.p2align 4
80END2 (STRCASECMP)
81	/* FALLTHROUGH to strcasecmp_l.  */
82# elif defined USE_AS_STRNCASECMP_L
83#  ifndef ENTRY2
84#   define ENTRY2(name) ENTRY (name)
85#   define END2(name) END (name)
86#  endif
87
/* Same stub for the counted variant; here the locale pointer goes into
   %rcx, which is where ENTRY (STRCMP) below reads it in this build.  */
88ENTRY2 (STRCASECMP)
89	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax	/* %rax = TLS offset of the variable */
90	mov	%fs:(%rax),%RCX_LP	/* locale pointer -> %rcx */
91
92	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
93	.p2align 4
94END2 (STRCASECMP)
95	/* FALLTHROUGH to strncasecmp_l.  */
96# endif
97
/* Main entry point.  The strings are in %rdi and %rsi.  In the
   case-insensitive builds the locale argument (%rdx for the
   strcasecmp_l build, %rcx for the strncasecmp_l build) is inspected
   first and the *_nonascii routine (defined elsewhere) takes over when
   the locale's _NL_CTYPE_NONASCII_CASE value has bit 0 set.  */
98ENTRY (STRCMP)
99# ifdef USE_AS_STRCASECMP_L
100	/* We have to fall back on the C implementation for locales
101	   with encodings not matching ASCII for single bytes.  */
102#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
103	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP	/* %rax = LC_CTYPE entry of the locale */
104#  else
105	mov	(%rdx), %RAX_LP	/* same load, zero displacement */
106#  endif
107	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
108	jne	__strcasecmp_l_nonascii	/* bit set: tail-jump to the fallback */
109# elif defined USE_AS_STRNCASECMP_L
110	/* We have to fall back on the C implementation for locales
111	   with encodings not matching ASCII for single bytes.  */
112#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
113	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP	/* %rax = LC_CTYPE entry of the locale */
114#  else
115	mov	(%rcx), %RAX_LP	/* same load, zero displacement */
116#  endif
117	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
118	jne	__strncasecmp_l_nonascii	/* bit set: tail-jump to the fallback */
119# endif
120
121/*
122 * This implementation uses SSE to compare up to 16 bytes at a time.
123 */
/* Counted builds: a count of 0 goes straight to strcmp_exitz and a
   count of 1 to Byte0 (both defined outside this excerpt); any other
   count is copied into %r11, the running counter.  */
124# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
125	test	%RDX_LP, %RDX_LP
126	je	LABEL(strcmp_exitz)
127	cmp	$1, %RDX_LP
128	je	LABEL(Byte0)
129	mov	%RDX_LP, %R11_LP
130# endif
/* %ecx / %eax = low bits of the two string addresses; after the masking
   below they hold the offsets inside a 64-byte cache line.  */
131	mov	%esi, %ecx
132	mov	%edi, %eax
133/* Use 64bit AND here to avoid long NOP padding.  */
134	and	$0x3f, %rcx		/* rsi alignment in cache line */
135	and	$0x3f, %rax		/* rdi alignment in cache line */
/* Case-insensitive builds only: three 16-byte constants (every byte
   0x3f, 0x99 and 0x20 respectively) are emitted into a mergeable
   read-only section and loaded once into %xmm5-%xmm7.  The *_reg names
   defined here are what the TOLOWER macro further down operates on, so
   %xmm5-%xmm7 must stay untouched for the rest of the routine.  */
136# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
137	.section .rodata.cst16,"aM",@progbits,16
138	.align 16
139.Llcase_min:
140	.quad	0x3f3f3f3f3f3f3f3f
141	.quad	0x3f3f3f3f3f3f3f3f
142.Llcase_max:
143	.quad	0x9999999999999999
144	.quad	0x9999999999999999
145.Lcase_add:
146	.quad	0x2020202020202020
147	.quad	0x2020202020202020
148	.previous
149	movdqa	.Llcase_min(%rip), %xmm5
150#  define LCASE_MIN_reg %xmm5
151	movdqa	.Llcase_max(%rip), %xmm6
152#  define LCASE_MAX_reg %xmm6
153	movdqa	.Lcase_add(%rip), %xmm7
154#  define CASE_ADD_reg %xmm7
155# endif
/* Fast path for the first 16 bytes.  It is taken only when both strings
   start at most 0x30 bytes into their 64-byte cache line, so that a
   16-byte load from either of them stays inside that line.  */
156	cmp	$0x30, %ecx
157	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
158	cmp	$0x30, %eax
159	ja	LABEL(crosscache)	/* rdi: 16-byte load will cross cache line */
160	movlpd	(%rdi), %xmm1		/* low 8 bytes of the %rdi string */
161	movlpd	(%rsi), %xmm2		/* low 8 bytes of the %rsi string */
162	movhpd	8(%rdi), %xmm1		/* high 8 bytes of the %rdi string */
163	movhpd	8(%rsi), %xmm2		/* high 8 bytes of the %rsi string */
164# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* TOLOWER (reg1, reg2): per byte, add 0x3f and do a signed compare
   against 0x99.  Only bytes 0x41..0x5a ('A'..'Z') end up in 0x80..0x99,
   i.e. not greater than 0x99, so pandn leaves 0x20 exactly for those
   bytes and 0 elsewhere; that value is then added to the original byte.
   Clobbers %xmm8 and %xmm9.  */
165#   define TOLOWER(reg1, reg2) \
166	movdqa	LCASE_MIN_reg, %xmm8;					\
167	movdqa	LCASE_MIN_reg, %xmm9;					\
168	paddb	reg1, %xmm8;					\
169	paddb	reg2, %xmm9;					\
170	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
171	pcmpgtb	LCASE_MAX_reg, %xmm9;				\
172	pandn	CASE_ADD_reg, %xmm8;					\
173	pandn	CASE_ADD_reg, %xmm9;					\
174	paddb	%xmm8, reg1;					\
175	paddb	%xmm9, reg2
176	TOLOWER	(%xmm1, %xmm2)
177# else
/* Case-sensitive builds: TOLOWER expands to nothing.  */
178#  define TOLOWER(reg1, reg2)
179# endif
/* After psubb a byte keeps its high bit only if it was equal in both
   strings and not null, so the byte mask is 0xffff exactly when all 16
   bytes match and none of them terminates the string.  */
180	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
181	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
182	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
183	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
184	pmovmskb %xmm1, %edx
185	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
186	jnz	LABEL(less16bytes)	/* If not, find different value or null char */
187# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
188	sub	$16, %r11
189	jbe	LABEL(strcmp_exitz)	/* finish comparison */
190# endif
191	add	$16, %rsi		/* prepare to search next 16 bytes */
192	add	$16, %rdi		/* prepare to search next 16 bytes */
193
194	/*
195	 * Determine source and destination string offsets from 16-byte alignment.
196	 * Use relative offset difference between the two to determine which case
197	 * below to use.
198	 */
	/*
	 * On exit from this block: %rsi and %rdi are rounded down to a
	 * 16-byte boundary, %ecx and %eax hold the offsets of the string
	 * starts inside those blocks, %edx = 0xffff and %r8d is 0, or
	 * 0xffff when the two strings had to be swapped so that %ecx is
	 * the larger offset.  This code is also reached by falling through
	 * from the fast path above.
	 */
199	.p2align 4
200LABEL(crosscache):
201	and	$0xfffffffffffffff0, %rsi	/* force %rsi is 16 byte aligned */
202	and	$0xfffffffffffffff0, %rdi	/* force %rdi is 16 byte aligned */
203	mov	$0xffff, %edx			/* for equivalent offset */
204	xor	%r8d, %r8d
205	and	$0xf, %ecx			/* offset of rsi */
206	and	$0xf, %eax			/* offset of rdi */
207	cmp	%eax, %ecx
208	je	LABEL(ashr_0)			/* rsi and rdi relative offset same */
209	ja	LABEL(bigger)
210	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
211	xchg	%ecx, %eax
212	xchg	%rsi, %rdi
213LABEL(bigger):
214	lea	15(%rax), %r9
215	sub	%rcx, %r9			/* %r9 = 15 + %rax - %rcx, the table index */
216	lea	LABEL(unaligned_table)(%rip), %r10
217	movslq	(%r10, %r9,4), %r9		/* sign-extended 32-bit table entry */
218	lea	(%r10, %r9), %r10		/* entry is added to the table address */
219	_CET_NOTRACK jmp *%r10			/* jump to corresponding case */
220
221/*
222 * The following cases will be handled by ashr_0
223 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
224 *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
225 */
226	.p2align 4
227LABEL(ashr_0):
	/*
	 * Both strings start at the same offset (%cl) inside their aligned
	 * block, so the two blocks can be compared directly; the bytes in
	 * front of the string start are shifted out of the masks below.
	 */
228
229	movdqa	(%rsi), %xmm1
230	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
231	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
232# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
233	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
234# else
235	movdqa	(%rdi), %xmm2
236	TOLOWER (%xmm1, %xmm2)
237	pcmpeqb	%xmm2, %xmm1			/* compare 16 bytes for equality */
238# endif
239	psubb	%xmm0, %xmm1			/* packed sub of comparison results */
240	pmovmskb %xmm1, %r9d
241	shr	%cl, %edx			/* adjust 0xffff for offset */
242	shr	%cl, %r9d			/* adjust for 16-byte offset */
243	sub	%r9d, %edx
244	/*
245	 * edx equals r9d only if the (16 - rcx) bytes from the string start
246	 * to the end of the block are equal and no null char was seen.
247	 */
248	jne	LABEL(less32bytes)		/* mismatch or null char */
249	UPDATE_STRNCMP_COUNTER
250	mov	$16, %rcx
251	mov	$16, %r9
252	pxor	%xmm0, %xmm0			/* clear xmm0, may have changed above */
253
254	/*
255	 * Now both strings are aligned at 16-byte boundary. Loop over strings
256	 * checking 32-bytes per iteration.
257	 */
	/*
	 * %rcx is the byte index used for both loads.  %xmm0 is all-zero at
	 * the start of each 16-byte step: a step only falls through when its
	 * null mask came out empty.  The loop is left through LABEL(exit)
	 * or, in the counted builds, LABEL(strcmp_exitz), both defined
	 * outside this excerpt.
	 */
258	.p2align 4
259LABEL(loop_ashr_0):
260	movdqa	(%rsi, %rcx), %xmm1
261	movdqa	(%rdi, %rcx), %xmm2
262	TOLOWER (%xmm1, %xmm2)
263
264	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
265	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
266	psubb	%xmm0, %xmm1		/* high bit left only for equal, non-null bytes */
267	pmovmskb %xmm1, %edx
268	sub	$0xffff, %edx		/* zero iff all 16 bytes match, no null */
269	jnz	LABEL(exit)		/* mismatch or null char seen */
270
271# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
272	sub	$16, %r11
273	jbe	LABEL(strcmp_exitz)
274# endif
275	add	$16, %rcx
	/* Second, identical 16-byte step of the unrolled loop.  */
276	movdqa	(%rsi, %rcx), %xmm1
277	movdqa	(%rdi, %rcx), %xmm2
278	TOLOWER (%xmm1, %xmm2)
279
280	pcmpeqb	%xmm1, %xmm0
281	pcmpeqb	%xmm2, %xmm1
282	psubb	%xmm0, %xmm1
283	pmovmskb %xmm1, %edx
284	sub	$0xffff, %edx
285	jnz	LABEL(exit)
286# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
287	sub	$16, %r11
288	jbe	LABEL(strcmp_exitz)
289# endif
290	add	$16, %rcx
291	jmp	LABEL(loop_ashr_0)
292
293/*
294 * The following cases will be handled by ashr_1
295 * rcx(offset of rsi)  rax(offset of rdi)   relative offset	corresponding case
296 *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
297 */
298	.p2align 4
299LABEL(ashr_1):
	/*
	 * First, partial block.  Per the table above the offset of %rsi is
	 * 15 bytes larger than the offset of %rdi, so the %rdi block is
	 * moved up by 15 bytes to line its bytes up with the %rsi block.
	 */
300	pxor	%xmm0, %xmm0
301	movdqa	(%rdi), %xmm2
302	movdqa	(%rsi), %xmm1
303	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
304	pslldq	$15, %xmm2		/* shift first string to align with second */
305	TOLOWER (%xmm1, %xmm2)
306	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
307	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
308	pmovmskb %xmm2, %r9d
309	shr	%cl, %edx		/* adjust 0xffff for offset */
310	shr	%cl, %r9d		/* adjust for 16-byte offset */
311	sub	%r9d, %edx
312	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
313	movdqa	(%rdi), %xmm3		/* unshifted %rdi block, merged in the loop */
314	UPDATE_STRNCMP_COUNTER
315
316	pxor	%xmm0, %xmm0
317	mov	$16, %rcx		/* index for loads */
318	mov	$1, %r9d		/* byte position left over from less32bytes case */
319	/*
320	 * Setup %r10 value allows us to detect crossing a page boundary.
321	 * When %r10 goes positive we have crossed a page boundary and
322	 * need to do a nibble.
323	 */
324	lea	1(%rdi), %r10
325	and	$0xfff, %r10		/* offset into 4K page */
326	sub	$0x1000, %r10		/* subtract 4K pagesize */
327
	/*
	 * Main loop of the ashr_1 case, unrolled twice.  Each 16-byte step
	 * rebuilds the bytes of the %rdi string that correspond to the
	 * aligned %rsi block from the previous %rdi block (%xmm3, shifted
	 * down 1 byte) and the next one (%xmm2, shifted up 15 bytes).
	 * %xmm0 is all-zero at the start of each step: a step only falls
	 * through when its null mask came out empty.
	 */
328	.p2align 4
329LABEL(loop_ashr_1):
330	add	$16, %r10
331	jg	LABEL(nibble_ashr_1)	/* cross page boundary */
332
333LABEL(gobble_ashr_1):
334	movdqa	(%rsi, %rcx), %xmm1
335	movdqa	(%rdi, %rcx), %xmm2
336	movdqa	%xmm2, %xmm4		 /* store for next cycle */
337
338	psrldq	$1, %xmm3		/* leftover bytes of the previous block */
339	pslldq	$15, %xmm2		/* first byte of the new block */
340	por	%xmm3, %xmm2		/* merge into one 16byte value */
341
342	TOLOWER (%xmm1, %xmm2)
343
344	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
345	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
346	psubb	%xmm0, %xmm1		/* high bit left only for equal, non-null bytes */
347	pmovmskb %xmm1, %edx
348	sub	$0xffff, %edx		/* zero iff all 16 bytes match, no null */
349	jnz	LABEL(exit)
350
351# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
352	sub	$16, %r11
353	jbe	LABEL(strcmp_exitz)
354# endif
355	add	$16, %rcx
356	movdqa	%xmm4, %xmm3		/* new block becomes the previous one */
357
	/* Second, identical 16-byte step of the unrolled loop.  */
358	add	$16, %r10
359	jg	LABEL(nibble_ashr_1)	/* cross page boundary */
360
361	movdqa	(%rsi, %rcx), %xmm1
362	movdqa	(%rdi, %rcx), %xmm2
363	movdqa	%xmm2, %xmm4		/* store for next cycle */
364
365	psrldq	$1, %xmm3
366	pslldq	$15, %xmm2
367	por	%xmm3, %xmm2		/* merge into one 16byte value */
368
369	TOLOWER (%xmm1, %xmm2)
370
371	pcmpeqb	%xmm1, %xmm0
372	pcmpeqb	%xmm2, %xmm1
373	psubb	%xmm0, %xmm1
374	pmovmskb %xmm1, %edx
375	sub	$0xffff, %edx
376	jnz	LABEL(exit)
377
378# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
379	sub	$16, %r11
380	jbe	LABEL(strcmp_exitz)
381# endif
382	add	$16, %rcx
383	movdqa	%xmm4, %xmm3
384	jmp	LABEL(loop_ashr_1)
385
386	/*
387	 * Nibble avoids loads across page boundary. This is to avoid a potential
388	 * access into unmapped memory.
389	 */
	/*
	 * Entered from the loop when %r10 went positive.  Bytes 1..15 of the
	 * previous %rdi block (%xmm3) have not been compared yet; if one of
	 * them is null (or, in the counted builds, at most 15 bytes remain)
	 * the comparison is finished in the exit tail without loading the
	 * next %rdi block.  Otherwise %r10 is rewound by one page and the
	 * loop resumes at gobble_ashr_1.
	 */
390	.p2align 4
391LABEL(nibble_ashr_1):
392	pcmpeqb	%xmm3, %xmm0		 /* check nibble for null char */
393	pmovmskb %xmm0, %edx
394	test	$0xfffe, %edx		/* bits 1..15 = bytes 1..15 of %xmm3 */
395	jnz	LABEL(ashr_1_exittail)	/* find null char */
396
397# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
398	cmp	$15, %r11
399	jbe	LABEL(ashr_1_exittail)
400# endif
401
402	pxor	%xmm0, %xmm0
403	sub	$0x1000, %r10		/* subtract 4K from %r10 */
404	jmp	LABEL(gobble_ashr_1)
405
406	/*
407	 * Once find null char, determine if there is a string mismatch
408	 * before the null char.
409	 */
410	.p2align 4
411LABEL(ashr_1_exittail):
412	movdqa	(%rsi, %rcx), %xmm1
413	psrldq	$1, %xmm0		/* line the null mask up with %xmm1 */
414	psrldq	$1, %xmm3		/* line the leftover bytes up with %xmm1 */
415	jmp	LABEL(aftertail)	/* defined outside this excerpt */
416
417/*
418 * The following cases will be handled by ashr_2
419 * rcx(offset of rsi)  rax(offset of rdi)   relative offset   corresponding case
420 *        n(14~15)            n -14         1(15 +(n-14) - n)         ashr_2
421 */
422	.p2align 4
423LABEL(ashr_2):
	/*
	 * First, partial block.  Per the table above the offset of %rsi is
	 * 14 bytes larger than the offset of %rdi, so the %rdi block is
	 * moved up by 14 bytes to line its bytes up with the %rsi block.
	 */
424	pxor	%xmm0, %xmm0
425	movdqa	(%rdi), %xmm2
426	movdqa	(%rsi), %xmm1
427	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
428	pslldq	$14, %xmm2		/* align %rdi bytes with %rsi bytes */
429	TOLOWER (%xmm1, %xmm2)
430	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
431	psubb	%xmm0, %xmm2		/* high bit left only for equal, non-null bytes */
432	pmovmskb %xmm2, %r9d
433	shr	%cl, %edx		/* expected mask: 0xffff >> offset of %rsi */
434	shr	%cl, %r9d		/* drop bytes in front of the string start */
435	sub	%r9d, %edx		/* zero iff all remaining bytes match, no null */
436	jnz	LABEL(less32bytes)
437	movdqa	(%rdi), %xmm3		/* unshifted %rdi block, merged in the loop */
438	UPDATE_STRNCMP_COUNTER
439
440	pxor	%xmm0, %xmm0
441	mov	$16, %rcx	/* index for loads */
442	mov	$2, %r9d	/* byte position left over from less32bytes case */
443	/*
444	 * Setup %r10 value allows us to detect crossing a page boundary.
445	 * When %r10 goes positive we have crossed a page boundary and
446	 * need to do a nibble.
447	 */
448	lea	2(%rdi), %r10
449	and	$0xfff, %r10	/* offset into 4K page */
450	sub	$0x1000, %r10	/* subtract 4K pagesize */
451
	/*
	 * Main loop of the ashr_2 case, unrolled twice.  Each 16-byte step
	 * rebuilds the bytes of the %rdi string that correspond to the
	 * aligned %rsi block from the previous %rdi block (%xmm3, shifted
	 * down 2 bytes) and the next one (%xmm2, shifted up 14 bytes).
	 * %xmm0 is all-zero at the start of each step: a step only falls
	 * through when its null mask came out empty.
	 */
452	.p2align 4
453LABEL(loop_ashr_2):
454	add	$16, %r10
455	jg	LABEL(nibble_ashr_2)	/* cross page boundary */
456
457LABEL(gobble_ashr_2):
458	movdqa	(%rsi, %rcx), %xmm1
459	movdqa	(%rdi, %rcx), %xmm2
460	movdqa	%xmm2, %xmm4		/* store for next cycle */
461
462	psrldq	$2, %xmm3		/* leftover bytes of the previous block */
463	pslldq	$14, %xmm2		/* first bytes of the new block */
464	por	%xmm3, %xmm2		/* merge into one 16byte value */
465
466	TOLOWER (%xmm1, %xmm2)
467
468	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
469	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
470	psubb	%xmm0, %xmm1		/* high bit left only for equal, non-null bytes */
471	pmovmskb %xmm1, %edx
472	sub	$0xffff, %edx		/* zero iff all 16 bytes match, no null */
473	jnz	LABEL(exit)
474
475# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
476	sub	$16, %r11
477	jbe	LABEL(strcmp_exitz)
478# endif
479
480	add	$16, %rcx
481	movdqa	%xmm4, %xmm3		/* new block becomes the previous one */
482
	/* Second, identical 16-byte step of the unrolled loop.  */
483	add	$16, %r10
484	jg	LABEL(nibble_ashr_2)	/* cross page boundary */
485
486	movdqa	(%rsi, %rcx), %xmm1
487	movdqa	(%rdi, %rcx), %xmm2
488	movdqa	%xmm2, %xmm4
489
490	psrldq	$2, %xmm3
491	pslldq	$14, %xmm2
492	por	%xmm3, %xmm2		/* merge into one 16byte value */
493
494	TOLOWER (%xmm1, %xmm2)
495
496	pcmpeqb	%xmm1, %xmm0
497	pcmpeqb	%xmm2, %xmm1
498	psubb	%xmm0, %xmm1
499	pmovmskb %xmm1, %edx
500	sub	$0xffff, %edx
501	jnz	LABEL(exit)
502
503# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
504	sub	$16, %r11
505	jbe	LABEL(strcmp_exitz)
506# endif
507
508	add	$16, %rcx
509	movdqa	%xmm4, %xmm3
510	jmp	LABEL(loop_ashr_2)
511
	/*
	 * Entered from the loop when %r10 went positive.  Bytes 2..15 of the
	 * previous %rdi block (%xmm3) have not been compared yet; if one of
	 * them is null (or, in the counted builds, at most 14 bytes remain)
	 * the comparison is finished in the exit tail without loading the
	 * next %rdi block.  Otherwise %r10 is rewound by one page and the
	 * loop resumes at gobble_ashr_2.
	 */
512	.p2align 4
513LABEL(nibble_ashr_2):
514	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
515	pmovmskb %xmm0, %edx
516	test	$0xfffc, %edx		/* bits 2..15 = bytes 2..15 of %xmm3 */
517	jnz	LABEL(ashr_2_exittail)
518
519# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
520	cmp	$14, %r11
521	jbe	LABEL(ashr_2_exittail)
522# endif
523
524	pxor	%xmm0, %xmm0
525	sub	$0x1000, %r10		/* rewind the tracker by one 4K page */
526	jmp	LABEL(gobble_ashr_2)
527
	/* Exit tail: align the null mask and the leftover %rdi bytes with
	   the %rsi block, then continue at aftertail (outside this
	   excerpt).  */
528	.p2align 4
529LABEL(ashr_2_exittail):
530	movdqa	(%rsi, %rcx), %xmm1
531	psrldq	$2, %xmm0
532	psrldq	$2, %xmm3
533	jmp	LABEL(aftertail)
534
535/*
536 * The following cases will be handled by ashr_3
537 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
538 *        n(13~15)            n -13         2(15 +(n-13) - n)         ashr_3
539 */
540	.p2align 4
541LABEL(ashr_3):
	/*
	 * First, partial block.  Per the table above the offset of %rsi is
	 * 13 bytes larger than the offset of %rdi, so the %rdi block is
	 * moved up by 13 bytes to line its bytes up with the %rsi block.
	 */
542	pxor	%xmm0, %xmm0
543	movdqa	(%rdi), %xmm2
544	movdqa	(%rsi), %xmm1
545	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
546	pslldq	$13, %xmm2		/* align %rdi bytes with %rsi bytes */
547	TOLOWER (%xmm1, %xmm2)
548	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
549	psubb	%xmm0, %xmm2		/* high bit left only for equal, non-null bytes */
550	pmovmskb %xmm2, %r9d
551	shr	%cl, %edx		/* expected mask: 0xffff >> offset of %rsi */
552	shr	%cl, %r9d		/* drop bytes in front of the string start */
553	sub	%r9d, %edx		/* zero iff all remaining bytes match, no null */
554	jnz	LABEL(less32bytes)
555	movdqa	(%rdi), %xmm3		/* unshifted %rdi block, merged in the loop */
556
557	UPDATE_STRNCMP_COUNTER
558
559	pxor	%xmm0, %xmm0
560	mov	$16, %rcx	/* index for loads */
561	mov	$3, %r9d	/* byte position left over from less32bytes case */
562	/*
563	 * Setup %r10 value allows us to detect crossing a page boundary.
564	 * When %r10 goes positive we have crossed a page boundary and
565	 * need to do a nibble.
566	 */
567	lea	3(%rdi), %r10
568	and	$0xfff, %r10	/* offset into 4K page */
569	sub	$0x1000, %r10	/* subtract 4K pagesize */
570
	/*
	 * Main loop of the ashr_3 case, unrolled twice.  Each 16-byte step
	 * rebuilds the bytes of the %rdi string that correspond to the
	 * aligned %rsi block from the previous %rdi block (%xmm3, shifted
	 * down 3 bytes) and the next one (%xmm2, shifted up 13 bytes).
	 * %xmm0 is all-zero at the start of each step: a step only falls
	 * through when its null mask came out empty.
	 */
571	.p2align 4
572LABEL(loop_ashr_3):
573	add	$16, %r10
574	jg	LABEL(nibble_ashr_3)	/* cross page boundary */
575
576LABEL(gobble_ashr_3):
577	movdqa	(%rsi, %rcx), %xmm1
578	movdqa	(%rdi, %rcx), %xmm2
579	movdqa	%xmm2, %xmm4		/* store for next cycle */
580
581	psrldq	$3, %xmm3		/* leftover bytes of the previous block */
582	pslldq	$13, %xmm2		/* first bytes of the new block */
583	por	%xmm3, %xmm2		/* merge into one 16byte value */
584
585	TOLOWER (%xmm1, %xmm2)
586
587	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
588	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
589	psubb	%xmm0, %xmm1		/* high bit left only for equal, non-null bytes */
590	pmovmskb %xmm1, %edx
591	sub	$0xffff, %edx		/* zero iff all 16 bytes match, no null */
592	jnz	LABEL(exit)
593
594# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
595	sub	$16, %r11
596	jbe	LABEL(strcmp_exitz)
597# endif
598
599	add	$16, %rcx
600	movdqa	%xmm4, %xmm3		/* new block becomes the previous one */
601
	/* Second, identical 16-byte step of the unrolled loop.  */
602	add	$16, %r10
603	jg	LABEL(nibble_ashr_3)	/* cross page boundary */
604
605	movdqa	(%rsi, %rcx), %xmm1
606	movdqa	(%rdi, %rcx), %xmm2
607	movdqa	%xmm2, %xmm4
608
609	psrldq	$3, %xmm3
610	pslldq	$13, %xmm2
611	por	%xmm3, %xmm2		/* merge into one 16byte value */
612
613	TOLOWER (%xmm1, %xmm2)
614
615	pcmpeqb	%xmm1, %xmm0
616	pcmpeqb	%xmm2, %xmm1
617	psubb	%xmm0, %xmm1
618	pmovmskb %xmm1, %edx
619	sub	$0xffff, %edx
620	jnz	LABEL(exit)
621
622# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
623	sub	$16, %r11
624	jbe	LABEL(strcmp_exitz)
625# endif
626
627	add	$16, %rcx
628	movdqa	%xmm4, %xmm3
629	jmp	LABEL(loop_ashr_3)
630
	/*
	 * Entered from the loop when %r10 went positive.  Bytes 3..15 of the
	 * previous %rdi block (%xmm3) have not been compared yet; if one of
	 * them is null (or, in the counted builds, at most 13 bytes remain)
	 * the comparison is finished in the exit tail without loading the
	 * next %rdi block.  Otherwise %r10 is rewound by one page and the
	 * loop resumes at gobble_ashr_3.
	 */
631	.p2align 4
632LABEL(nibble_ashr_3):
633	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
634	pmovmskb %xmm0, %edx
635	test	$0xfff8, %edx		/* bits 3..15 = bytes 3..15 of %xmm3 */
636	jnz	LABEL(ashr_3_exittail)
637
638# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
639	cmp	$13, %r11
640	jbe	LABEL(ashr_3_exittail)
641# endif
642
643	pxor	%xmm0, %xmm0
644	sub	$0x1000, %r10		/* rewind the tracker by one 4K page */
645	jmp	LABEL(gobble_ashr_3)
646
	/* Exit tail: align the null mask and the leftover %rdi bytes with
	   the %rsi block, then continue at aftertail (outside this
	   excerpt).  */
647	.p2align 4
648LABEL(ashr_3_exittail):
649	movdqa	(%rsi, %rcx), %xmm1
650	psrldq	$3, %xmm0
651	psrldq	$3, %xmm3
652	jmp	LABEL(aftertail)
653
654/*
655 * The following cases will be handled by ashr_4
656 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
657 *        n(12~15)            n -12         3(15 +(n-12) - n)         ashr_4
658 */
659	.p2align 4
660LABEL(ashr_4):
	/*
	 * First, partial block.  Per the table above the offset of %rsi is
	 * 12 bytes larger than the offset of %rdi, so the %rdi block is
	 * moved up by 12 bytes to line its bytes up with the %rsi block.
	 */
661	pxor	%xmm0, %xmm0
662	movdqa	(%rdi), %xmm2
663	movdqa	(%rsi), %xmm1
664	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
665	pslldq	$12, %xmm2		/* align %rdi bytes with %rsi bytes */
666	TOLOWER (%xmm1, %xmm2)
667	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
668	psubb	%xmm0, %xmm2		/* high bit left only for equal, non-null bytes */
669	pmovmskb %xmm2, %r9d
670	shr	%cl, %edx		/* expected mask: 0xffff >> offset of %rsi */
671	shr	%cl, %r9d		/* drop bytes in front of the string start */
672	sub	%r9d, %edx		/* zero iff all remaining bytes match, no null */
673	jnz	LABEL(less32bytes)
674	movdqa	(%rdi), %xmm3		/* unshifted %rdi block, merged in the loop */
675
676	UPDATE_STRNCMP_COUNTER
677
678	pxor	%xmm0, %xmm0
679	mov	$16, %rcx	/* index for loads */
680	mov	$4, %r9d	/* byte position left over from less32bytes case */
681	/*
682	 * Setup %r10 value allows us to detect crossing a page boundary.
683	 * When %r10 goes positive we have crossed a page boundary and
684	 * need to do a nibble.
685	 */
686	lea	4(%rdi), %r10
687	and	$0xfff, %r10	/* offset into 4K page */
688	sub	$0x1000, %r10	/* subtract 4K pagesize */
689
	/*
	 * Main loop of the ashr_4 case, unrolled twice.  Each 16-byte step
	 * rebuilds the bytes of the %rdi string that correspond to the
	 * aligned %rsi block from the previous %rdi block (%xmm3, shifted
	 * down 4 bytes) and the next one (%xmm2, shifted up 12 bytes).
	 * %xmm0 is all-zero at the start of each step: a step only falls
	 * through when its null mask came out empty.
	 */
690	.p2align 4
691LABEL(loop_ashr_4):
692	add	$16, %r10
693	jg	LABEL(nibble_ashr_4)	/* cross page boundary */
694
695LABEL(gobble_ashr_4):
696	movdqa	(%rsi, %rcx), %xmm1
697	movdqa	(%rdi, %rcx), %xmm2
698	movdqa	%xmm2, %xmm4		/* store for next cycle */
699
700	psrldq	$4, %xmm3		/* leftover bytes of the previous block */
701	pslldq	$12, %xmm2		/* first bytes of the new block */
702	por	%xmm3, %xmm2		/* merge into one 16byte value */
703
704	TOLOWER (%xmm1, %xmm2)
705
706	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
707	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
708	psubb	%xmm0, %xmm1		/* high bit left only for equal, non-null bytes */
709	pmovmskb %xmm1, %edx
710	sub	$0xffff, %edx		/* zero iff all 16 bytes match, no null */
711	jnz	LABEL(exit)
712
713# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
714	sub	$16, %r11
715	jbe	LABEL(strcmp_exitz)
716# endif
717
718	add	$16, %rcx
719	movdqa	%xmm4, %xmm3		/* new block becomes the previous one */
720
	/* Second, identical 16-byte step of the unrolled loop.  */
721	add	$16, %r10
722	jg	LABEL(nibble_ashr_4)	/* cross page boundary */
723
724	movdqa	(%rsi, %rcx), %xmm1
725	movdqa	(%rdi, %rcx), %xmm2
726	movdqa	%xmm2, %xmm4
727
728	psrldq	$4, %xmm3
729	pslldq	$12, %xmm2
730	por	%xmm3, %xmm2		/* merge into one 16byte value */
731
732	TOLOWER (%xmm1, %xmm2)
733
734	pcmpeqb	%xmm1, %xmm0
735	pcmpeqb	%xmm2, %xmm1
736	psubb	%xmm0, %xmm1
737	pmovmskb %xmm1, %edx
738	sub	$0xffff, %edx
739	jnz	LABEL(exit)
740
741# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
742	sub	$16, %r11
743	jbe	LABEL(strcmp_exitz)
744# endif
745
746	add	$16, %rcx
747	movdqa	%xmm4, %xmm3
748	jmp	LABEL(loop_ashr_4)
749
	/*
	 * Entered from the loop when %r10 went positive.  Bytes 4..15 of the
	 * previous %rdi block (%xmm3) have not been compared yet; if one of
	 * them is null (or, in the counted builds, at most 12 bytes remain)
	 * the comparison is finished in the exit tail without loading the
	 * next %rdi block.  Otherwise %r10 is rewound by one page and the
	 * loop resumes at gobble_ashr_4.
	 */
750	.p2align 4
751LABEL(nibble_ashr_4):
752	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
753	pmovmskb %xmm0, %edx
754	test	$0xfff0, %edx		/* bits 4..15 = bytes 4..15 of %xmm3 */
755	jnz	LABEL(ashr_4_exittail)
756
757# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
758	cmp	$12, %r11
759	jbe	LABEL(ashr_4_exittail)
760# endif
761
762	pxor	%xmm0, %xmm0
763	sub	$0x1000, %r10		/* rewind the tracker by one 4K page */
764	jmp	LABEL(gobble_ashr_4)
765
	/* Exit tail: align the null mask and the leftover %rdi bytes with
	   the %rsi block, then continue at aftertail (outside this
	   excerpt).  */
766	.p2align 4
767LABEL(ashr_4_exittail):
768	movdqa	(%rsi, %rcx), %xmm1
769	psrldq	$4, %xmm0
770	psrldq	$4, %xmm3
771	jmp	LABEL(aftertail)
772
773/*
774 * The following cases will be handled by ashr_5
775 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
776 *        n(11~15)          n - 11		  4(15 +(n-11) - n)         ashr_5
777 */
778	.p2align 4
779LABEL(ashr_5):
	/*
	 * First, partial block.  Per the table above the offset of %rsi is
	 * 11 bytes larger than the offset of %rdi, so the %rdi block is
	 * moved up by 11 bytes to line its bytes up with the %rsi block.
	 */
780	pxor	%xmm0, %xmm0
781	movdqa	(%rdi), %xmm2
782	movdqa	(%rsi), %xmm1
783	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
784	pslldq	$11, %xmm2		/* align %rdi bytes with %rsi bytes */
785	TOLOWER (%xmm1, %xmm2)
786	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
787	psubb	%xmm0, %xmm2		/* high bit left only for equal, non-null bytes */
788	pmovmskb %xmm2, %r9d
789	shr	%cl, %edx		/* expected mask: 0xffff >> offset of %rsi */
790	shr	%cl, %r9d		/* drop bytes in front of the string start */
791	sub	%r9d, %edx		/* zero iff all remaining bytes match, no null */
792	jnz	LABEL(less32bytes)
793	movdqa	(%rdi), %xmm3		/* unshifted %rdi block, merged in the loop */
794
795	UPDATE_STRNCMP_COUNTER
796
797	pxor	%xmm0, %xmm0
798	mov	$16, %rcx	/* index for loads */
799	mov	$5, %r9d	/* byte position left over from less32bytes case */
800	/*
801	 * Setup %r10 value allows us to detect crossing a page boundary.
802	 * When %r10 goes positive we have crossed a page boundary and
803	 * need to do a nibble.
804	 */
805	lea	5(%rdi), %r10
806	and	$0xfff, %r10	/* offset into 4K page */
807	sub	$0x1000, %r10	/* subtract 4K pagesize */
808
	/*
	 * Main loop of the ashr_5 case, unrolled twice.  Each 16-byte step
	 * rebuilds the bytes of the %rdi string that correspond to the
	 * aligned %rsi block from the previous %rdi block (%xmm3, shifted
	 * down 5 bytes) and the next one (%xmm2, shifted up 11 bytes).
	 * %xmm0 is all-zero at the start of each step: a step only falls
	 * through when its null mask came out empty.
	 */
809	.p2align 4
810LABEL(loop_ashr_5):
811	add	$16, %r10
812	jg	LABEL(nibble_ashr_5)	/* cross page boundary */
813
814LABEL(gobble_ashr_5):
815	movdqa	(%rsi, %rcx), %xmm1
816	movdqa	(%rdi, %rcx), %xmm2
817	movdqa	%xmm2, %xmm4		/* store for next cycle */
818
819	psrldq	$5, %xmm3		/* leftover bytes of the previous block */
820	pslldq	$11, %xmm2		/* first bytes of the new block */
821	por	%xmm3, %xmm2		/* merge into one 16byte value */
822
823	TOLOWER (%xmm1, %xmm2)
824
825	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
826	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
827	psubb	%xmm0, %xmm1		/* high bit left only for equal, non-null bytes */
828	pmovmskb %xmm1, %edx
829	sub	$0xffff, %edx		/* zero iff all 16 bytes match, no null */
830	jnz	LABEL(exit)
831
832# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
833	sub	$16, %r11
834	jbe	LABEL(strcmp_exitz)
835# endif
836
837	add	$16, %rcx
838	movdqa	%xmm4, %xmm3		/* new block becomes the previous one */
839
	/* Second, identical 16-byte step of the unrolled loop.  */
840	add	$16, %r10
841	jg	LABEL(nibble_ashr_5)	/* cross page boundary */
842
843	movdqa	(%rsi, %rcx), %xmm1
844	movdqa	(%rdi, %rcx), %xmm2
845	movdqa	%xmm2, %xmm4
846
847	psrldq	$5, %xmm3
848	pslldq	$11, %xmm2
849	por	%xmm3, %xmm2		/* merge into one 16byte value */
850
851	TOLOWER (%xmm1, %xmm2)
852
853	pcmpeqb	%xmm1, %xmm0
854	pcmpeqb	%xmm2, %xmm1
855	psubb	%xmm0, %xmm1
856	pmovmskb %xmm1, %edx
857	sub	$0xffff, %edx
858	jnz	LABEL(exit)
859
860# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
861	sub	$16, %r11
862	jbe	LABEL(strcmp_exitz)
863# endif
864
865	add	$16, %rcx
866	movdqa	%xmm4, %xmm3
867	jmp	LABEL(loop_ashr_5)
868
	/*
	 * Entered from the loop when %r10 went positive.  Bytes 5..15 of the
	 * previous %rdi block (%xmm3) have not been compared yet; if one of
	 * them is null (or, in the counted builds, at most 11 bytes remain)
	 * the comparison is finished in the exit tail without loading the
	 * next %rdi block.  Otherwise %r10 is rewound by one page and the
	 * loop resumes at gobble_ashr_5.
	 */
869	.p2align 4
870LABEL(nibble_ashr_5):
871	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
872	pmovmskb %xmm0, %edx
873	test	$0xffe0, %edx		/* bits 5..15 = bytes 5..15 of %xmm3 */
874	jnz	LABEL(ashr_5_exittail)
875
876# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
877	cmp	$11, %r11
878	jbe	LABEL(ashr_5_exittail)
879# endif
880
881	pxor	%xmm0, %xmm0
882	sub	$0x1000, %r10		/* rewind the tracker by one 4K page */
883	jmp	LABEL(gobble_ashr_5)
884
	/* Exit tail: align the null mask and the leftover %rdi bytes with
	   the %rsi block, then continue at aftertail (outside this
	   excerpt).  */
885	.p2align 4
886LABEL(ashr_5_exittail):
887	movdqa	(%rsi, %rcx), %xmm1
888	psrldq	$5, %xmm0
889	psrldq	$5, %xmm3
890	jmp	LABEL(aftertail)
891
892/*
893 * The following cases will be handled by ashr_6
894 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
895 *        n(10~15)          n - 10		  5(15 +(n-10) - n)         ashr_6
896 */
897	.p2align 4
898LABEL(ashr_6):
	/*
	 * First, partial block.  Per the table above the offset of %rsi is
	 * 10 bytes larger than the offset of %rdi, so the %rdi block is
	 * moved up by 10 bytes to line its bytes up with the %rsi block.
	 */
899	pxor	%xmm0, %xmm0
900	movdqa	(%rdi), %xmm2
901	movdqa	(%rsi), %xmm1
902	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
903	pslldq	$10, %xmm2		/* align %rdi bytes with %rsi bytes */
904	TOLOWER (%xmm1, %xmm2)
905	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
906	psubb	%xmm0, %xmm2		/* high bit left only for equal, non-null bytes */
907	pmovmskb %xmm2, %r9d
908	shr	%cl, %edx		/* expected mask: 0xffff >> offset of %rsi */
909	shr	%cl, %r9d		/* drop bytes in front of the string start */
910	sub	%r9d, %edx		/* zero iff all remaining bytes match, no null */
911	jnz	LABEL(less32bytes)
912	movdqa	(%rdi), %xmm3		/* unshifted %rdi block, merged in the loop */
913
914	UPDATE_STRNCMP_COUNTER
915
916	pxor	%xmm0, %xmm0
917	mov	$16, %rcx	/* index for loads */
918	mov	$6, %r9d	/* byte position left over from less32bytes case */
919	/*
920	 * Setup %r10 value allows us to detect crossing a page boundary.
921	 * When %r10 goes positive we have crossed a page boundary and
922	 * need to do a nibble.
923	 */
924	lea	6(%rdi), %r10
925	and	$0xfff, %r10	/* offset into 4K page */
926	sub	$0x1000, %r10	/* subtract 4K pagesize */
927
	/*
	 * Main loop of the ashr_6 case, unrolled twice.  Each 16-byte step
	 * rebuilds the bytes of the %rdi string that correspond to the
	 * aligned %rsi block from the previous %rdi block (%xmm3, shifted
	 * down 6 bytes) and the next one (%xmm2, shifted up 10 bytes).
	 * %xmm0 is all-zero at the start of each step: a step only falls
	 * through when its null mask came out empty.
	 */
928	.p2align 4
929LABEL(loop_ashr_6):
930	add	$16, %r10
931	jg	LABEL(nibble_ashr_6)	/* cross page boundary */
932
933LABEL(gobble_ashr_6):
934	movdqa	(%rsi, %rcx), %xmm1
935	movdqa	(%rdi, %rcx), %xmm2
936	movdqa	%xmm2, %xmm4		/* store for next cycle */
937
938	psrldq	$6, %xmm3		/* leftover bytes of the previous block */
939	pslldq	$10, %xmm2		/* first bytes of the new block */
940	por	%xmm3, %xmm2		/* merge into one 16byte value */
941
942	TOLOWER (%xmm1, %xmm2)
943
944	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
945	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
946	psubb	%xmm0, %xmm1		/* high bit left only for equal, non-null bytes */
947	pmovmskb %xmm1, %edx
948	sub	$0xffff, %edx		/* zero iff all 16 bytes match, no null */
949	jnz	LABEL(exit)
950
951# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
952	sub	$16, %r11
953	jbe	LABEL(strcmp_exitz)
954# endif
955
956	add	$16, %rcx
957	movdqa	%xmm4, %xmm3		/* new block becomes the previous one */
958
	/* Second, identical 16-byte step of the unrolled loop.  */
959	add	$16, %r10
960	jg	LABEL(nibble_ashr_6)	/* cross page boundary */
961
962	movdqa	(%rsi, %rcx), %xmm1
963	movdqa	(%rdi, %rcx), %xmm2
964	movdqa	%xmm2, %xmm4
965
966	psrldq	$6, %xmm3
967	pslldq	$10, %xmm2
968	por	%xmm3, %xmm2		/* merge into one 16byte value */
969
970	TOLOWER (%xmm1, %xmm2)
971
972	pcmpeqb	%xmm1, %xmm0
973	pcmpeqb	%xmm2, %xmm1
974	psubb	%xmm0, %xmm1
975	pmovmskb %xmm1, %edx
976	sub	$0xffff, %edx
977	jnz	LABEL(exit)
978
979# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
980	sub	$16, %r11
981	jbe	LABEL(strcmp_exitz)
982# endif
983
984	add	$16, %rcx
985	movdqa	%xmm4, %xmm3
986	jmp	LABEL(loop_ashr_6)
987
	/*
	 * Entered from the loop when %r10 went positive.  Bytes 6..15 of the
	 * previous %rdi block (%xmm3) have not been compared yet; if one of
	 * them is null (or, in the counted builds, at most 10 bytes remain)
	 * the comparison is finished in the exit tail without loading the
	 * next %rdi block.  Otherwise %r10 is rewound by one page and the
	 * loop resumes at gobble_ashr_6.
	 */
988	.p2align 4
989LABEL(nibble_ashr_6):
990	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
991	pmovmskb %xmm0, %edx
992	test	$0xffc0, %edx		/* bits 6..15 = bytes 6..15 of %xmm3 */
993	jnz	LABEL(ashr_6_exittail)
994
995# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
996	cmp	$10, %r11
997	jbe	LABEL(ashr_6_exittail)
998# endif
999
1000	pxor	%xmm0, %xmm0
1001	sub	$0x1000, %r10		/* rewind the tracker by one 4K page */
1002	jmp	LABEL(gobble_ashr_6)
1003
	/* Exit tail: align the null mask and the leftover %rdi bytes with
	   the %rsi block, then continue at aftertail (outside this
	   excerpt).  */
1004	.p2align 4
1005LABEL(ashr_6_exittail):
1006	movdqa	(%rsi, %rcx), %xmm1
1007	psrldq	$6, %xmm0
1008	psrldq	$6, %xmm3
1009	jmp	LABEL(aftertail)
1010
1011/*
1012 * The following cases will be handled by ashr_7
1013 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
1014 *        n(9~15)          n - 9                6(15 +(n - 9) - n)         ashr_7
1015 */
1016	.p2align 4
1017LABEL(ashr_7):
	/*
	 * First, partial block.  Per the table above the offset of %rsi is
	 * 9 bytes larger than the offset of %rdi, so the %rdi block is
	 * moved up by 9 bytes to line its bytes up with the %rsi block.
	 */
1018	pxor	%xmm0, %xmm0
1019	movdqa	(%rdi), %xmm2
1020	movdqa	(%rsi), %xmm1
1021	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
1022	pslldq	$9, %xmm2		/* align %rdi bytes with %rsi bytes */
1023	TOLOWER (%xmm1, %xmm2)
1024	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
1025	psubb	%xmm0, %xmm2		/* high bit left only for equal, non-null bytes */
1026	pmovmskb %xmm2, %r9d
1027	shr	%cl, %edx		/* expected mask: 0xffff >> offset of %rsi */
1028	shr	%cl, %r9d		/* drop bytes in front of the string start */
1029	sub	%r9d, %edx		/* zero iff all remaining bytes match, no null */
1030	jnz	LABEL(less32bytes)
1031	movdqa	(%rdi), %xmm3		/* unshifted %rdi block, merged in the loop */
1032
1033	UPDATE_STRNCMP_COUNTER
1034
1035	pxor	%xmm0, %xmm0
1036	mov	$16, %rcx	/* index for loads */
1037	mov	$7, %r9d	/* byte position left over from less32bytes case */
1038	/*
1039	 * Setup %r10 value allows us to detect crossing a page boundary.
1040	 * When %r10 goes positive we have crossed a page boundary and
1041	 * need to do a nibble.
1042	 */
1043	lea	7(%rdi), %r10
1044	and	$0xfff, %r10	/* offset into 4K page */
1045	sub	$0x1000, %r10	/* subtract 4K pagesize */
1046
	/*
	 * Main loop of the ashr_7 case, unrolled twice.  Each 16-byte step
	 * rebuilds the bytes of the %rdi string that correspond to the
	 * aligned %rsi block from the previous %rdi block (%xmm3, shifted
	 * down 7 bytes) and the next one (%xmm2, shifted up 9 bytes).
	 * %xmm0 is all-zero at the start of each step: a step only falls
	 * through when its null mask came out empty.
	 */
1047	.p2align 4
1048LABEL(loop_ashr_7):
1049	add	$16, %r10
1050	jg	LABEL(nibble_ashr_7)	/* cross page boundary */
1051
1052LABEL(gobble_ashr_7):
1053	movdqa	(%rsi, %rcx), %xmm1
1054	movdqa	(%rdi, %rcx), %xmm2
1055	movdqa	%xmm2, %xmm4		/* store for next cycle */
1056
1057	psrldq	$7, %xmm3		/* leftover bytes of the previous block */
1058	pslldq	$9, %xmm2		/* first bytes of the new block */
1059	por	%xmm3, %xmm2		/* merge into one 16byte value */
1060
1061	TOLOWER (%xmm1, %xmm2)
1062
1063	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
1064	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
1065	psubb	%xmm0, %xmm1		/* high bit left only for equal, non-null bytes */
1066	pmovmskb %xmm1, %edx
1067	sub	$0xffff, %edx		/* zero iff all 16 bytes match, no null */
1068	jnz	LABEL(exit)
1069
1070# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1071	sub	$16, %r11
1072	jbe	LABEL(strcmp_exitz)
1073# endif
1074
1075	add	$16, %rcx
1076	movdqa	%xmm4, %xmm3		/* new block becomes the previous one */
1077
	/* Second, identical 16-byte step of the unrolled loop.  */
1078	add	$16, %r10
1079	jg	LABEL(nibble_ashr_7)	/* cross page boundary */
1080
1081	movdqa	(%rsi, %rcx), %xmm1
1082	movdqa	(%rdi, %rcx), %xmm2
1083	movdqa	%xmm2, %xmm4
1084
1085	psrldq	$7, %xmm3
1086	pslldq	$9, %xmm2
1087	por	%xmm3, %xmm2		/* merge into one 16byte value */
1088
1089	TOLOWER (%xmm1, %xmm2)
1090
1091	pcmpeqb	%xmm1, %xmm0
1092	pcmpeqb	%xmm2, %xmm1
1093	psubb	%xmm0, %xmm1
1094	pmovmskb %xmm1, %edx
1095	sub	$0xffff, %edx
1096	jnz	LABEL(exit)
1097
1098# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1099	sub	$16, %r11
1100	jbe	LABEL(strcmp_exitz)
1101# endif
1102
1103	add	$16, %rcx
1104	movdqa	%xmm4, %xmm3
1105	jmp	LABEL(loop_ashr_7)
1106
	/*
	 * Entered from the loop when %r10 went positive.  Bytes 7..15 of the
	 * previous %rdi block (%xmm3) have not been compared yet; if one of
	 * them is null (or, in the counted builds, at most 9 bytes remain)
	 * the comparison is finished in the exit tail without loading the
	 * next %rdi block.  Otherwise %r10 is rewound by one page and the
	 * loop resumes at gobble_ashr_7.
	 */
1107	.p2align 4
1108LABEL(nibble_ashr_7):
1109	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1110	pmovmskb %xmm0, %edx
1111	test	$0xff80, %edx		/* bits 7..15 = bytes 7..15 of %xmm3 */
1112	jnz	LABEL(ashr_7_exittail)
1113
1114# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1115	cmp	$9, %r11
1116	jbe	LABEL(ashr_7_exittail)
1117# endif
1118
1119	pxor	%xmm0, %xmm0
1120	sub	$0x1000, %r10		/* rewind the tracker by one 4K page */
1121	jmp	LABEL(gobble_ashr_7)
1122
	/* Exit tail: align the null mask and the leftover %rdi bytes with
	   the %rsi block, then continue at aftertail (outside this
	   excerpt).  */
1123	.p2align 4
1124LABEL(ashr_7_exittail):
1125	movdqa	(%rsi, %rcx), %xmm1
1126	psrldq	$7, %xmm0
1127	psrldq	$7, %xmm3
1128	jmp	LABEL(aftertail)
1129
1130/*
1131 *  The following cases will be handled by ashr_8
1132 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1133 *        n(8~15)          n - 8                7(15 +(n - 8) - n)         ashr_8
1134 */
1135	.p2align 4
1136LABEL(ashr_8):
	/*
	 * First, partial block.  Per the table above the offset of %rsi is
	 * 8 bytes larger than the offset of %rdi, so the %rdi block is
	 * moved up by 8 bytes to line its bytes up with the %rsi block.
	 */
1137	pxor	%xmm0, %xmm0
1138	movdqa	(%rdi), %xmm2
1139	movdqa	(%rsi), %xmm1
1140	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
1141	pslldq	$8, %xmm2		/* align %rdi bytes with %rsi bytes */
1142	TOLOWER (%xmm1, %xmm2)
1143	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
1144	psubb	%xmm0, %xmm2		/* high bit left only for equal, non-null bytes */
1145	pmovmskb %xmm2, %r9d
1146	shr	%cl, %edx		/* expected mask: 0xffff >> offset of %rsi */
1147	shr	%cl, %r9d		/* drop bytes in front of the string start */
1148	sub	%r9d, %edx		/* zero iff all remaining bytes match, no null */
1149	jnz	LABEL(less32bytes)
1150	movdqa	(%rdi), %xmm3		/* unshifted %rdi block, merged in the loop */
1151
1152	UPDATE_STRNCMP_COUNTER
1153
1154	pxor	%xmm0, %xmm0
1155	mov	$16, %rcx	/* index for loads */
1156	mov	$8, %r9d	/* byte position left over from less32bytes case */
1157	/*
1158	 * Setup %r10 value allows us to detect crossing a page boundary.
1159	 * When %r10 goes positive we have crossed a page boundary and
1160	 * need to do a nibble.
1161	 */
1162	lea	8(%rdi), %r10
1163	and	$0xfff, %r10	/* offset into 4K page */
1164	sub	$0x1000, %r10	/* subtract 4K pagesize */
1165
	/*
	 * Main loop of the ashr_8 case, unrolled twice.  Each 16-byte step
	 * rebuilds the bytes of the %rdi string that correspond to the
	 * aligned %rsi block from the previous %rdi block (%xmm3, shifted
	 * down 8 bytes) and the next one (%xmm2, shifted up 8 bytes).
	 * %xmm0 is all-zero at the start of each step: a step only falls
	 * through when its null mask came out empty.
	 */
1166	.p2align 4
1167LABEL(loop_ashr_8):
1168	add	$16, %r10
1169	jg	LABEL(nibble_ashr_8)	/* cross page boundary */
1170
1171LABEL(gobble_ashr_8):
1172	movdqa	(%rsi, %rcx), %xmm1
1173	movdqa	(%rdi, %rcx), %xmm2
1174	movdqa	%xmm2, %xmm4		/* store for next cycle */
1175
1176	psrldq	$8, %xmm3		/* leftover bytes of the previous block */
1177	pslldq	$8, %xmm2		/* first bytes of the new block */
1178	por	%xmm3, %xmm2		/* merge into one 16byte value */
1179
1180	TOLOWER (%xmm1, %xmm2)
1181
1182	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 has a null byte */
1183	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
1184	psubb	%xmm0, %xmm1		/* high bit left only for equal, non-null bytes */
1185	pmovmskb %xmm1, %edx
1186	sub	$0xffff, %edx		/* zero iff all 16 bytes match, no null */
1187	jnz	LABEL(exit)
1188
1189# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1190	sub	$16, %r11
1191	jbe	LABEL(strcmp_exitz)
1192# endif
1193
1194	add	$16, %rcx
1195	movdqa	%xmm4, %xmm3		/* new block becomes the previous one */
1196
	/* Second, identical 16-byte step of the unrolled loop.  */
1197	add	$16, %r10
1198	jg	LABEL(nibble_ashr_8)	/* cross page boundary */
1199
1200	movdqa	(%rsi, %rcx), %xmm1
1201	movdqa	(%rdi, %rcx), %xmm2
1202	movdqa	%xmm2, %xmm4
1203
1204	psrldq	$8, %xmm3
1205	pslldq	$8, %xmm2
1206	por	%xmm3, %xmm2		/* merge into one 16byte value */
1207
1208	TOLOWER (%xmm1, %xmm2)
1209
1210	pcmpeqb	%xmm1, %xmm0
1211	pcmpeqb	%xmm2, %xmm1
1212	psubb	%xmm0, %xmm1
1213	pmovmskb %xmm1, %edx
1214	sub	$0xffff, %edx
1215	jnz	LABEL(exit)
1216
1217# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1218	sub	$16, %r11
1219	jbe	LABEL(strcmp_exitz)
1220# endif
1221
1222	add	$16, %rcx
1223	movdqa	%xmm4, %xmm3
1224	jmp	LABEL(loop_ashr_8)
1225
1226	.p2align 4
1227LABEL(nibble_ashr_8):
1228	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1229	pmovmskb %xmm0, %edx
1230	test	$0xff00, %edx
1231	jnz	LABEL(ashr_8_exittail)
1232
1233# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1234	cmp	$8, %r11
1235	jbe	LABEL(ashr_8_exittail)
1236# endif
1237
1238	pxor	%xmm0, %xmm0
1239	sub	$0x1000, %r10
1240	jmp	LABEL(gobble_ashr_8)
1241
1242	.p2align 4
1243LABEL(ashr_8_exittail):
1244	movdqa	(%rsi, %rcx), %xmm1
1245	psrldq	$8, %xmm0
1246	psrldq	$8, %xmm3
1247	jmp	LABEL(aftertail)
1248
1249/*
1250 *  The following cases will be handled by ashr_9
1251 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1252 *        n(7~15)          n - 7                8(15 +(n - 7) - n)         ashr_9
1253 */
	.p2align 4
/* Relative shift 9.  Each step compares an aligned 16-byte block from
   (%rsi) against the 7 bytes left over from the previous aligned (%rdi)
   block followed by the first 9 bytes of the next one.
   %rcx/%cl, %edx, %rax, %r8d and (length-limited builds) %r11 are set up
   before this label, outside this section.  TOLOWER is a macro defined
   earlier in the file (presumably only active in the case-insensitive
   builds -- confirm there).  */
LABEL(ashr_9):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff in every lane where (%rsi) holds NUL */
	pslldq	$7, %xmm2		/* (%rdi) bytes 0..8 -> lanes 7..15, low lanes zeroed */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff in every matching lane */
	psubb	%xmm0, %xmm2		/* only "equal and not NUL" lanes keep bit 7 */
	pmovmskb %xmm2, %r9d
	/* Shift out the low %cl lanes of both masks (they carry no (%rdi)
	   data as long as %cl >= 7, which the table above suggests).  */
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx		/* nonzero: masks disagree, resolve in less32bytes */
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3		/* unshifted block becomes the "previous" data */

	/* In the strncmp/strncasecmp builds this recomputes the remaining
	   count in %r11 (using %rcx) and leaves via strcmp_exitz when it is
	   used up; see the definition near the top of the file.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$9, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	9(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
/* Main loop, unrolled twice; every half handles 16 bytes.  */
LABEL(loop_ashr_9):
	add	$16, %r10
	jg	LABEL(nibble_ashr_9)

LABEL(gobble_ashr_9):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw (%rdi) block for the next round */

	psrldq	$9, %xmm3		/* top 7 bytes of the previous block -> low lanes */
	pslldq	$7, %xmm2		/* low 9 bytes of the new block -> high lanes */
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0		/* NUL lanes of %xmm1 (%xmm0 is all zero here) */
	pcmpeqb	%xmm2, %xmm1		/* matching lanes */
	psubb	%xmm0, %xmm1		/* lanes where %xmm1 is NUL lose bit 7 */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 lanes matched and were non-NUL */
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes charged against the limit */
	jbe	LABEL(strcmp_exitz)	/* limit reached without a difference */
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* new block becomes the previous block */

	add	$16, %r10
	jg	LABEL(nibble_ashr_9)	/* cross page boundary */

	/* Second unrolled copy of the step above.  */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$9, %xmm3
	pslldq	$7, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* store for next cycle */
	jmp	LABEL(loop_ashr_9)

	.p2align 4
/* Taken when %r10 has gone positive.  Before loading another (%rdi)
   block, check whether the 7 pending bytes of %xmm3 already end the
   comparison; if so finish in ashr_9_exittail, which does not read
   (%rdi) again.  */
LABEL(nibble_ashr_9):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfe00, %edx		/* NUL in lanes 9..15, the bytes not yet compared? */
	jnz	LABEL(ashr_9_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$7, %r11		/* limit ends within the 7 pending bytes? */
	jbe	LABEL(ashr_9_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* rebase the counter for the next 4K page */
	jmp	LABEL(gobble_ashr_9)

	.p2align 4
/* Tail path: compare only the pending bytes of %xmm3 against the next
   (%rsi) block.  */
LABEL(ashr_9_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$9, %xmm0		/* bring the NUL mask ... */
	psrldq	$9, %xmm3		/* ... and the pending bytes down to lane 0 */
	jmp	LABEL(aftertail)
1367
1368/*
1369 *  The following cases will be handled by ashr_10
1370 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1371 *        n(6~15)          n - 6                9(15 +(n - 6) - n)         ashr_10
1372 */
	.p2align 4
/* Relative shift 10.  Each step compares an aligned 16-byte block from
   (%rsi) against the 6 bytes left over from the previous aligned (%rdi)
   block followed by the first 10 bytes of the next one.
   %rcx/%cl, %edx, %rax, %r8d and (length-limited builds) %r11 are set up
   before this label, outside this section.  TOLOWER is a macro defined
   earlier in the file (presumably only active in the case-insensitive
   builds -- confirm there).  */
LABEL(ashr_10):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff in every lane where (%rsi) holds NUL */
	pslldq	$6, %xmm2		/* (%rdi) bytes 0..9 -> lanes 6..15, low lanes zeroed */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff in every matching lane */
	psubb	%xmm0, %xmm2		/* only "equal and not NUL" lanes keep bit 7 */
	pmovmskb %xmm2, %r9d
	/* Shift out the low %cl lanes of both masks (they carry no (%rdi)
	   data as long as %cl >= 6, which the table above suggests).  */
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx		/* nonzero: masks disagree, resolve in less32bytes */
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3		/* unshifted block becomes the "previous" data */

	/* In the strncmp/strncasecmp builds this recomputes the remaining
	   count in %r11 (using %rcx) and leaves via strcmp_exitz when it is
	   used up; see the definition near the top of the file.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$10, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
/* Main loop, unrolled twice; every half handles 16 bytes.  */
LABEL(loop_ashr_10):
	add	$16, %r10
	jg	LABEL(nibble_ashr_10)

LABEL(gobble_ashr_10):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw (%rdi) block for the next round */

	psrldq	$10, %xmm3		/* top 6 bytes of the previous block -> low lanes */
	pslldq	$6, %xmm2		/* low 10 bytes of the new block -> high lanes */
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0		/* NUL lanes of %xmm1 (%xmm0 is all zero here) */
	pcmpeqb	%xmm2, %xmm1		/* matching lanes */
	psubb	%xmm0, %xmm1		/* lanes where %xmm1 is NUL lose bit 7 */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 lanes matched and were non-NUL */
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes charged against the limit */
	jbe	LABEL(strcmp_exitz)	/* limit reached without a difference */
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* new block becomes the previous block */

	add	$16, %r10
	jg	LABEL(nibble_ashr_10)	/* cross page boundary */

	/* Second unrolled copy of the step above.  */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$10, %xmm3
	pslldq	$6, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_10)

	.p2align 4
/* Taken when %r10 has gone positive.  Before loading another (%rdi)
   block, check whether the 6 pending bytes of %xmm3 already end the
   comparison; if so finish in ashr_10_exittail, which does not read
   (%rdi) again.  */
LABEL(nibble_ashr_10):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfc00, %edx		/* NUL in lanes 10..15, the bytes not yet compared? */
	jnz	LABEL(ashr_10_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$6, %r11		/* limit ends within the 6 pending bytes? */
	jbe	LABEL(ashr_10_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* rebase the counter for the next 4K page */
	jmp	LABEL(gobble_ashr_10)

	.p2align 4
/* Tail path: compare only the pending bytes of %xmm3 against the next
   (%rsi) block.  */
LABEL(ashr_10_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$10, %xmm0		/* bring the NUL mask ... */
	psrldq	$10, %xmm3		/* ... and the pending bytes down to lane 0 */
	jmp	LABEL(aftertail)
1486
1487/*
1488 *  The following cases will be handled by ashr_11
1489 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1490 *        n(5~15)          n - 5               10(15 +(n - 5) - n)         ashr_11
1491 */
	.p2align 4
/* Relative shift 11.  Each step compares an aligned 16-byte block from
   (%rsi) against the 5 bytes left over from the previous aligned (%rdi)
   block followed by the first 11 bytes of the next one.
   %rcx/%cl, %edx, %rax, %r8d and (length-limited builds) %r11 are set up
   before this label, outside this section.  TOLOWER is a macro defined
   earlier in the file (presumably only active in the case-insensitive
   builds -- confirm there).  */
LABEL(ashr_11):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff in every lane where (%rsi) holds NUL */
	pslldq	$5, %xmm2		/* (%rdi) bytes 0..10 -> lanes 5..15, low lanes zeroed */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff in every matching lane */
	psubb	%xmm0, %xmm2		/* only "equal and not NUL" lanes keep bit 7 */
	pmovmskb %xmm2, %r9d
	/* Shift out the low %cl lanes of both masks (they carry no (%rdi)
	   data as long as %cl >= 5, which the table above suggests).  */
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx		/* nonzero: masks disagree, resolve in less32bytes */
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3		/* unshifted block becomes the "previous" data */

	/* In the strncmp/strncasecmp builds this recomputes the remaining
	   count in %r11 (using %rcx) and leaves via strcmp_exitz when it is
	   used up; see the definition near the top of the file.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$11, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
/* Main loop, unrolled twice; every half handles 16 bytes.  */
LABEL(loop_ashr_11):
	add	$16, %r10
	jg	LABEL(nibble_ashr_11)

LABEL(gobble_ashr_11):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw (%rdi) block for the next round */

	psrldq	$11, %xmm3		/* top 5 bytes of the previous block -> low lanes */
	pslldq	$5, %xmm2		/* low 11 bytes of the new block -> high lanes */
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0		/* NUL lanes of %xmm1 (%xmm0 is all zero here) */
	pcmpeqb	%xmm2, %xmm1		/* matching lanes */
	psubb	%xmm0, %xmm1		/* lanes where %xmm1 is NUL lose bit 7 */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 lanes matched and were non-NUL */
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes charged against the limit */
	jbe	LABEL(strcmp_exitz)	/* limit reached without a difference */
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* new block becomes the previous block */

	add	$16, %r10
	jg	LABEL(nibble_ashr_11)	/* cross page boundary */

	/* Second unrolled copy of the step above.  */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$11, %xmm3
	pslldq	$5, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_11)

	.p2align 4
/* Taken when %r10 has gone positive.  Before loading another (%rdi)
   block, check whether the 5 pending bytes of %xmm3 already end the
   comparison; if so finish in ashr_11_exittail, which does not read
   (%rdi) again.  */
LABEL(nibble_ashr_11):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf800, %edx		/* NUL in lanes 11..15, the bytes not yet compared? */
	jnz	LABEL(ashr_11_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$5, %r11		/* limit ends within the 5 pending bytes? */
	jbe	LABEL(ashr_11_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* rebase the counter for the next 4K page */
	jmp	LABEL(gobble_ashr_11)

	.p2align 4
/* Tail path: compare only the pending bytes of %xmm3 against the next
   (%rsi) block.  */
LABEL(ashr_11_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$11, %xmm0		/* bring the NUL mask ... */
	psrldq	$11, %xmm3		/* ... and the pending bytes down to lane 0 */
	jmp	LABEL(aftertail)
1605
1606/*
1607 *  The following cases will be handled by ashr_12
1608 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1609 *        n(4~15)          n - 4                11(15 +(n - 4) - n)         ashr_12
1610 */
	.p2align 4
/* Relative shift 12.  Each step compares an aligned 16-byte block from
   (%rsi) against the 4 bytes left over from the previous aligned (%rdi)
   block followed by the first 12 bytes of the next one.
   %rcx/%cl, %edx, %rax, %r8d and (length-limited builds) %r11 are set up
   before this label, outside this section.  TOLOWER is a macro defined
   earlier in the file (presumably only active in the case-insensitive
   builds -- confirm there).  */
LABEL(ashr_12):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff in every lane where (%rsi) holds NUL */
	pslldq	$4, %xmm2		/* (%rdi) bytes 0..11 -> lanes 4..15, low lanes zeroed */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff in every matching lane */
	psubb	%xmm0, %xmm2		/* only "equal and not NUL" lanes keep bit 7 */
	pmovmskb %xmm2, %r9d
	/* Shift out the low %cl lanes of both masks (they carry no (%rdi)
	   data as long as %cl >= 4, which the table above suggests).  */
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx		/* nonzero: masks disagree, resolve in less32bytes */
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3		/* unshifted block becomes the "previous" data */

	/* In the strncmp/strncasecmp builds this recomputes the remaining
	   count in %r11 (using %rcx) and leaves via strcmp_exitz when it is
	   used up; see the definition near the top of the file.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$12, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
/* Main loop, unrolled twice; every half handles 16 bytes.  */
LABEL(loop_ashr_12):
	add	$16, %r10
	jg	LABEL(nibble_ashr_12)

LABEL(gobble_ashr_12):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw (%rdi) block for the next round */

	psrldq	$12, %xmm3		/* top 4 bytes of the previous block -> low lanes */
	pslldq	$4, %xmm2		/* low 12 bytes of the new block -> high lanes */
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0		/* NUL lanes of %xmm1 (%xmm0 is all zero here) */
	pcmpeqb	%xmm2, %xmm1		/* matching lanes */
	psubb	%xmm0, %xmm1		/* lanes where %xmm1 is NUL lose bit 7 */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 lanes matched and were non-NUL */
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes charged against the limit */
	jbe	LABEL(strcmp_exitz)	/* limit reached without a difference */
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* new block becomes the previous block */

	add	$16, %r10
	jg	LABEL(nibble_ashr_12)	/* cross page boundary */

	/* Second unrolled copy of the step above.  */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$12, %xmm3
	pslldq	$4, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_12)

	.p2align 4
/* Taken when %r10 has gone positive.  Before loading another (%rdi)
   block, check whether the 4 pending bytes of %xmm3 already end the
   comparison; if so finish in ashr_12_exittail, which does not read
   (%rdi) again.  */
LABEL(nibble_ashr_12):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf000, %edx		/* NUL in lanes 12..15, the bytes not yet compared? */
	jnz	LABEL(ashr_12_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$4, %r11		/* limit ends within the 4 pending bytes? */
	jbe	LABEL(ashr_12_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* rebase the counter for the next 4K page */
	jmp	LABEL(gobble_ashr_12)

	.p2align 4
/* Tail path: compare only the pending bytes of %xmm3 against the next
   (%rsi) block.  */
LABEL(ashr_12_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$12, %xmm0		/* bring the NUL mask ... */
	psrldq	$12, %xmm3		/* ... and the pending bytes down to lane 0 */
	jmp	LABEL(aftertail)
1724
1725/*
1726 *  The following cases will be handled by ashr_13
1727 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1728 *        n(3~15)          n - 3                12(15 +(n - 3) - n)         ashr_13
1729 */
	.p2align 4
/* Relative shift 13.  Each step compares an aligned 16-byte block from
   (%rsi) against the 3 bytes left over from the previous aligned (%rdi)
   block followed by the first 13 bytes of the next one.
   %rcx/%cl, %edx, %rax, %r8d and (length-limited builds) %r11 are set up
   before this label, outside this section.  TOLOWER is a macro defined
   earlier in the file (presumably only active in the case-insensitive
   builds -- confirm there).  */
LABEL(ashr_13):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff in every lane where (%rsi) holds NUL */
	pslldq	$3, %xmm2		/* (%rdi) bytes 0..12 -> lanes 3..15, low lanes zeroed */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff in every matching lane */
	psubb	%xmm0, %xmm2		/* only "equal and not NUL" lanes keep bit 7 */
	pmovmskb %xmm2, %r9d
	/* Shift out the low %cl lanes of both masks (they carry no (%rdi)
	   data as long as %cl >= 3, which the table above suggests).  */
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx		/* nonzero: masks disagree, resolve in less32bytes */
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3		/* unshifted block becomes the "previous" data */

	/* In the strncmp/strncasecmp builds this recomputes the remaining
	   count in %r11 (using %rcx) and leaves via strcmp_exitz when it is
	   used up; see the definition near the top of the file.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$13, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
/* Main loop, unrolled twice; every half handles 16 bytes.  */
LABEL(loop_ashr_13):
	add	$16, %r10
	jg	LABEL(nibble_ashr_13)

LABEL(gobble_ashr_13):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw (%rdi) block for the next round */

	psrldq	$13, %xmm3		/* top 3 bytes of the previous block -> low lanes */
	pslldq	$3, %xmm2		/* low 13 bytes of the new block -> high lanes */
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0		/* NUL lanes of %xmm1 (%xmm0 is all zero here) */
	pcmpeqb	%xmm2, %xmm1		/* matching lanes */
	psubb	%xmm0, %xmm1		/* lanes where %xmm1 is NUL lose bit 7 */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 lanes matched and were non-NUL */
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes charged against the limit */
	jbe	LABEL(strcmp_exitz)	/* limit reached without a difference */
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* new block becomes the previous block */

	add	$16, %r10
	jg	LABEL(nibble_ashr_13)	/* cross page boundary */

	/* Second unrolled copy of the step above.  */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$13, %xmm3
	pslldq	$3, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_13)

	.p2align 4
/* Taken when %r10 has gone positive.  Before loading another (%rdi)
   block, check whether the 3 pending bytes of %xmm3 already end the
   comparison; if so finish in ashr_13_exittail, which does not read
   (%rdi) again.  */
LABEL(nibble_ashr_13):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xe000, %edx		/* NUL in lanes 13..15, the bytes not yet compared? */
	jnz	LABEL(ashr_13_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$3, %r11		/* limit ends within the 3 pending bytes? */
	jbe	LABEL(ashr_13_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* rebase the counter for the next 4K page */
	jmp	LABEL(gobble_ashr_13)

	.p2align 4
/* Tail path: compare only the pending bytes of %xmm3 against the next
   (%rsi) block.  */
LABEL(ashr_13_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq  $13, %xmm0		/* bring the NUL mask ... */
	psrldq  $13, %xmm3		/* ... and the pending bytes down to lane 0 */
	jmp	LABEL(aftertail)
1843
1844/*
1845 *  The following cases will be handled by ashr_14
1846 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1847 *        n(2~15)          n - 2                13(15 +(n - 2) - n)         ashr_14
1848 */
1849	.p2align 4
1850LABEL(ashr_14):
1851	pxor	%xmm0, %xmm0
1852	movdqa	(%rdi), %xmm2
1853	movdqa	(%rsi), %xmm1
1854	pcmpeqb	%xmm1, %xmm0
1855	pslldq  $2, %xmm2
1856	TOLOWER (%xmm1, %xmm2)
1857	pcmpeqb	%xmm1, %xmm2
1858	psubb	%xmm0, %xmm2
1859	pmovmskb %xmm2, %r9d
1860	shr	%cl, %edx
1861	shr	%cl, %r9d
1862	sub	%r9d, %edx
1863	jnz	LABEL(less32bytes)
1864	movdqa	(%rdi), %xmm3
1865
1866	UPDATE_STRNCMP_COUNTER
1867
1868	pxor	%xmm0, %xmm0
1869	mov	$16, %rcx	/* index for loads */
1870	mov	$14, %r9d	/* byte position left over from less32bytes case */
1871	/*
1872	 * Setup %r10 value allows us to detect crossing a page boundary.
1873	 * When %r10 goes positive we have crossed a page boundary and
1874	 * need to do a nibble.
1875	 */
1876	lea	14(%rdi), %r10
1877	and	$0xfff, %r10	/* offset into 4K page */
1878	sub	$0x1000, %r10	/* subtract 4K pagesize */
1879
1880	.p2align 4
1881LABEL(loop_ashr_14):
1882	add	$16, %r10
1883	jg	LABEL(nibble_ashr_14)
1884
1885LABEL(gobble_ashr_14):
1886	movdqa	(%rsi, %rcx), %xmm1
1887	movdqa	(%rdi, %rcx), %xmm2
1888	movdqa	%xmm2, %xmm4
1889
1890	psrldq	$14, %xmm3
1891	pslldq	$2, %xmm2
1892	por	%xmm3, %xmm2		/* merge into one 16byte value */
1893
1894	TOLOWER (%xmm1, %xmm2)
1895
1896	pcmpeqb	%xmm1, %xmm0
1897	pcmpeqb	%xmm2, %xmm1
1898	psubb	%xmm0, %xmm1
1899	pmovmskb %xmm1, %edx
1900	sub	$0xffff, %edx
1901	jnz	LABEL(exit)
1902
1903# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1904	sub	$16, %r11
1905	jbe	LABEL(strcmp_exitz)
1906# endif
1907
1908	add	$16, %rcx
1909	movdqa	%xmm4, %xmm3
1910
1911	add	$16, %r10
1912	jg	LABEL(nibble_ashr_14)	/* cross page boundary */
1913
1914	movdqa	(%rsi, %rcx), %xmm1
1915	movdqa	(%rdi, %rcx), %xmm2
1916	movdqa	%xmm2, %xmm4
1917
1918	psrldq	$14, %xmm3
1919	pslldq	$2, %xmm2
1920	por	%xmm3, %xmm2		/* merge into one 16byte value */
1921
1922	TOLOWER (%xmm1, %xmm2)
1923
1924	pcmpeqb	%xmm1, %xmm0
1925	pcmpeqb	%xmm2, %xmm1
1926	psubb	%xmm0, %xmm1
1927	pmovmskb %xmm1, %edx
1928	sub	$0xffff, %edx
1929	jnz	LABEL(exit)
1930
1931# if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L
1932	sub	$16, %r11
1933	jbe	LABEL(strcmp_exitz)
1934# endif
1935
1936	add	$16, %rcx
1937	movdqa	%xmm4, %xmm3
1938	jmp	LABEL(loop_ashr_14)
1939
1940	.p2align 4
1941LABEL(nibble_ashr_14):
1942	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1943	pmovmskb %xmm0, %edx
1944	test	$0xc000, %edx
1945	jnz	LABEL(ashr_14_exittail)
1946
1947# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1948	cmp	$2, %r11
1949	jbe	LABEL(ashr_14_exittail)
1950# endif
1951
1952	pxor	%xmm0, %xmm0
1953	sub	$0x1000, %r10
1954	jmp	LABEL(gobble_ashr_14)
1955
1956	.p2align 4
1957LABEL(ashr_14_exittail):
1958	movdqa	(%rsi, %rcx), %xmm1
1959	psrldq	$14, %xmm0
1960	psrldq	$14, %xmm3
1961	jmp	LABEL(aftertail)
1962
1963/*
1964 *  The following cases will be handled by ashr_15
1965 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1966 *        n(1~15)          n - 1                14(15 +(n - 1) - n)         ashr_15
1967 */
	.p2align 4
/* Relative shift 15.  Each step compares an aligned 16-byte block from
   (%rsi) against the single byte left over from the previous aligned
   (%rdi) block followed by the first 15 bytes of the next one.
   %rcx/%cl, %edx, %rax, %r8d and (length-limited builds) %r11 are set up
   before this label, outside this section.  TOLOWER is a macro defined
   earlier in the file (presumably only active in the case-insensitive
   builds -- confirm there).  */
LABEL(ashr_15):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff in every lane where (%rsi) holds NUL */
	pslldq	$1, %xmm2		/* (%rdi) bytes 0..14 -> lanes 1..15, lane 0 zeroed */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff in every matching lane */
	psubb	%xmm0, %xmm2		/* only "equal and not NUL" lanes keep bit 7 */
	pmovmskb %xmm2, %r9d
	/* Shift out the low %cl lanes of both masks (they carry no (%rdi)
	   data as long as %cl >= 1, which the table above suggests).  */
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx		/* nonzero: masks disagree, resolve in less32bytes */
	jnz	LABEL(less32bytes)

	movdqa	(%rdi), %xmm3		/* unshifted block becomes the "previous" data */

	/* In the strncmp/strncasecmp builds this recomputes the remaining
	   count in %r11 (using %rcx) and leaves via strcmp_exitz when it is
	   used up; see the definition near the top of the file.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$15, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */

	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
/* Main loop, unrolled twice; every half handles 16 bytes.  */
LABEL(loop_ashr_15):
	add	$16, %r10
	jg	LABEL(nibble_ashr_15)

LABEL(gobble_ashr_15):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw (%rdi) block for the next round */

	psrldq	$15, %xmm3		/* top byte of the previous block -> lane 0 */
	pslldq	$1, %xmm2		/* low 15 bytes of the new block -> lanes 1..15 */
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0		/* NUL lanes of %xmm1 (%xmm0 is all zero here) */
	pcmpeqb	%xmm2, %xmm1		/* matching lanes */
	psubb	%xmm0, %xmm1		/* lanes where %xmm1 is NUL lose bit 7 */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 lanes matched and were non-NUL */
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes charged against the limit */
	jbe	LABEL(strcmp_exitz)	/* limit reached without a difference */
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* new block becomes the previous block */

	add	$16, %r10
	jg	LABEL(nibble_ashr_15)	/* cross page boundary */

	/* Second unrolled copy of the step above.  */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$15, %xmm3
	pslldq	$1, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_15)

	.p2align 4
/* Taken when %r10 has gone positive.  Before loading another (%rdi)
   block, check whether the one pending byte of %xmm3 already ends the
   comparison; if so finish in ashr_15_exittail, which does not read
   (%rdi) again.  */
LABEL(nibble_ashr_15):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0x8000, %edx		/* NUL in lane 15, the byte not yet compared? */
	jnz	LABEL(ashr_15_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmpq	$1, %r11		/* limit ends at the one pending byte? */
	jbe	LABEL(ashr_15_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* rebase the counter for the next 4K page */
	jmp	LABEL(gobble_ashr_15)

	.p2align 4
/* Tail path: compare only the pending byte of %xmm3 against the next
   (%rsi) block.  Unlike the other ashr_N_exittail labels this one has no
   jump; execution continues into the code that follows.  */
LABEL(ashr_15_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$15, %xmm3		/* pending byte down to lane 0 */
	psrldq	$15, %xmm0		/* NUL mask shifted the same way */
2082
	.p2align 4
/* Shared finish for the ashr_N_exittail paths.  Expects %xmm1 = next
   (%rsi) block, %xmm3 = pending (%rdi) bytes shifted down to lane 0 and
   %xmm0 = NUL mask shifted the same way.  Leaves a bit mask in %edx and
   continues into the code that follows.  */
LABEL(aftertail):
	TOLOWER (%xmm1, %xmm3)
	pcmpeqb	%xmm3, %xmm1		/* 0xff in every matching lane */
	psubb	%xmm0, %xmm1		/* lanes flagged in %xmm0 lose bit 7 */
	pmovmskb %xmm1, %edx
	not	%edx			/* set bits now mark lanes that are NOT "equal and non-NUL" */
2090
	.p2align 4
/* Turn block-relative state into byte pointers.  The gobble loops arrive
   here with %edx = mask - 0xffff; negating (0xffff - mask) keeps its
   lowest set bit, so the lowest set bit of %edx is the first lane whose
   mask bit was clear.  %r9 is the per-case shift and %rcx the load index,
   giving %rax = %r9 + %rcx - 16.  */
LABEL(exit):
	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
/* Entered directly from the ashr_N prologues with %rax and %rcx already
   prepared before those labels (outside this section).  */
LABEL(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	/* %r8d is set outside this section; nonzero means the two
	   pointers are swapped back before the result is computed.  */
	test	%r8d, %r8d
	jz	LABEL(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */
2100
	.p2align 4
/* Compute the return value from the first flagged lane: %eax = byte at
   (%rdi + index) minus byte at (%rsi + index), where index is the lowest
   set bit of %rdx.  */
LABEL(ret):
LABEL(less16bytes):
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	%rdx, %r11		/* is that index still inside the length limit? */
	jbe	LABEL(strcmp_exitz)	/* no: strings are equal within the limit */
# endif
	movzbl	(%rsi, %rdx), %ecx
	movzbl	(%rdi, %rdx), %eax

# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through a table of 4-byte entries (presumably the
	   C-locale tolower table, biased by 128 entries) before taking
	   the difference.  */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
# endif

	sub	%ecx, %eax
	ret
2121
/* Return 0.  Reached from the length-limit checks above when the count in
   %r11 is used up before any difference is found.  */
LABEL(strcmp_exitz):
	xor	%eax, %eax
	ret
2125
	.p2align 4
/* Return the difference of the very first bytes: %eax = byte at (%rdi)
   minus byte at (%rsi).  The jumps to this label are outside this
   section.  */
LABEL(Byte0):
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through a table of 4-byte entries (presumably the
	   C-locale tolower table, biased by 128 entries) before taking
	   the difference.  */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
# endif

	sub	%ecx, %eax
	ret
2139END (STRCMP)
2140
	.section .rodata,"a",@progbits
	.p2align 3
/* Dispatch table of 32-bit offsets, each relative to the start of the
   table itself.  Entries 0..14 point at ashr_1..ashr_15 and entry 15 at
   ashr_0.  The code that indexes it is outside this section.  */
LABEL(unaligned_table):
	.int	LABEL(ashr_1) - LABEL(unaligned_table)
	.int	LABEL(ashr_2) - LABEL(unaligned_table)
	.int	LABEL(ashr_3) - LABEL(unaligned_table)
	.int	LABEL(ashr_4) - LABEL(unaligned_table)
	.int	LABEL(ashr_5) - LABEL(unaligned_table)
	.int	LABEL(ashr_6) - LABEL(unaligned_table)
	.int	LABEL(ashr_7) - LABEL(unaligned_table)
	.int	LABEL(ashr_8) - LABEL(unaligned_table)
	.int	LABEL(ashr_9) - LABEL(unaligned_table)
	.int	LABEL(ashr_10) - LABEL(unaligned_table)
	.int	LABEL(ashr_11) - LABEL(unaligned_table)
	.int	LABEL(ashr_12) - LABEL(unaligned_table)
	.int	LABEL(ashr_13) - LABEL(unaligned_table)
	.int	LABEL(ashr_14) - LABEL(unaligned_table)
	.int	LABEL(ashr_15) - LABEL(unaligned_table)
	.int	LABEL(ashr_0) - LABEL(unaligned_table)
2160#endif
2161