1/* strcmp optimized with SSE4.2.
2   Copyright (C) 2017-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <isa-level.h>
20
21#if ISA_SHOULD_BUILD (2)
22
23# include <sysdep.h>
24
25# define STRCMP_ISA	_sse42
26# include "strcmp-naming.h"
27
28# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
29#  include "locale-defines.h"
30# endif
31
32# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
33/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
34   if the new counter > the old one or is 0.  */
/* UPDATE_STRNCMP_COUNTER charges the bytes covered by the first,
   partially used 16-byte block against the length counter:
     %r9 = %r11 + %rcx - 16, i.e. %r11 - (16 - %rcx)
   where, at the use sites in this file, %rcx is a string's byte offset
   inside its 16-byte block.  If the result wrapped around or is zero
   the macro branches to LABEL(strcmp_exitz); otherwise the result
   becomes the new %r11.  Clobbers %r9 and the flags.  */
35#  define UPDATE_STRNCMP_COUNTER				\
36	/* remaining number of bytes to compare */	\
37	lea	-16(%rcx, %r11), %r9;			\
38	cmp	%r9, %r11;				\
39	jb	LABEL(strcmp_exitz);			\
40	test	%r9, %r9;				\
41	je	LABEL(strcmp_exitz);			\
42	mov	%r9, %r11
43# else
/* Variants without a length limit keep no counter: expand to nothing.  */
44#  define UPDATE_STRNCMP_COUNTER
45# endif
46
47# define SECTION	sse4.2
/* SECTION is the suffix used in the ".text.SECTION" directive further
   down.  */
48
/* LABEL (l) pastes ".L" in front of a bare name, giving the ".L"-prefixed
   label form used for every internal jump target in this file.  */
49# define LABEL(l)	.L##l
50
51/* We use 0x1a:
52	_SIDD_SBYTE_OPS
53	| _SIDD_CMP_EQUAL_EACH
54	| _SIDD_NEGATIVE_POLARITY
55	| _SIDD_LEAST_SIGNIFICANT
56   on pcmpistri to find out if two 16byte data elements are the same
57   and the offset of the first different byte.  There are 4 cases:
58
59   1. Both 16-byte data elements are valid (no EOS) and identical.
60   2. Both 16-byte data elements contain EOS and are identical.
61   3. Both 16-byte data elements are valid and they differ at offset X.
62   4. At least one 16-byte data element has EOS at offset X.  The two
63      16-byte data elements must differ at or before offset X.
64
65   Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
66
67   case		ECX	CFlag	ZFlag	SFlag
68    1		16	  0	  0	  0
69    2		16	  0	  1	  1
70    3		 X	  1	  0	  0
71    4	       0 <= X	  1	 0/1	 0/1
72
73   We exit from the loop for cases 2, 3 and 4 with jbe which branches
74   when either CFlag or ZFlag is 1.  If CFlag == 0, we return 0 for
75   case 2.  */
76
77	/* Put all SSE 4.2 functions together.  */
78	.section .text.SECTION,"ax",@progbits
79	.align	16
80	.type	STRCMP, @function
81	.globl	STRCMP
82# ifdef USE_AS_STRCASECMP_L
	/* strcasecmp entry stub: read the thread-local __libc_tsd_LOCALE
	   value (TLS access through @gottpoff and %fs) into the third
	   argument register, pad to a 16-byte boundary and fall through
	   into STRCMP below.  */
83ENTRY (STRCASECMP)
84	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
85	mov	%fs:(%rax),%RDX_LP
86
87	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
88	.p2align 4
89END (STRCASECMP)
90	/* FALLTHROUGH to strcasecmp_l.  */
91# endif
92# ifdef USE_AS_STRNCASECMP_L
	/* strncasecmp entry stub: same sequence, but the locale goes into
	   the fourth argument register (%RCX_LP) because %rdx carries the
	   length in the length-limited variants.  */
93ENTRY (STRCASECMP)
94	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
95	mov	%fs:(%rax),%RCX_LP
96
97	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
98	.p2align 4
99END (STRCASECMP)
100	/* FALLTHROUGH to strncasecmp_l.  */
101# endif
102
103
104# define arg arg
/* NOTE(review): identity definition; no use of "arg" is visible in this
   part of the file -- confirm whether it is still needed.  */
105
/* STRCMP entry point.  %rdi and %rsi hold the two strings.  The
   length-limited variants take the length in %rdx; the case-insensitive
   variants take the locale pointer in %rdx (USE_AS_STRCASECMP_L) or in
   %rcx (USE_AS_STRNCASECMP_L).  */
106STRCMP:
107	cfi_startproc
108	_CET_ENDBR
109	CALL_MCOUNT
110
111/*
112 * This implementation uses SSE to compare up to 16 bytes at a time.
113 */
114# ifdef USE_AS_STRCASECMP_L
115	/* We have to fall back on the C implementation for locales
116	   with encodings not matching ASCII for single bytes.  */
	/* %rax = the locale's LC_CTYPE data pointer (locale in %rdx).  */
117#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
118	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
119#  else
120	mov	(%rdx), %RAX_LP
121#  endif
	/* Bit 0 of the _NL_CTYPE_NONASCII_CASE value selects the fallback.  */
122	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
123	jne	__strcasecmp_l_nonascii
124# endif
125# ifdef USE_AS_STRNCASECMP_L
126	/* We have to fall back on the C implementation for locales
127	   with encodings not matching ASCII for single bytes.  */
	/* Same check as for strcasecmp_l, with the locale in %rcx.  */
128#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
129	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
130#  else
131	mov	(%rcx), %RAX_LP
132#  endif
133	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
134	jne	__strncasecmp_l_nonascii
135# endif
136
137# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* Length-limited variants: a length of 0 goes straight to
	   LABEL(strcmp_exitz), a length of 1 to LABEL(Byte0) (both defined
	   outside this part of the file); otherwise the length is kept in
	   %r11 as the remaining-byte counter.  */
138	test	%RDX_LP, %RDX_LP
139	je	LABEL(strcmp_exitz)
140	cmp	$1, %RDX_LP
141	je	LABEL(Byte0)
142	mov	%RDX_LP, %R11_LP
143# endif
	/* %ecx / %eax = low address bits of %rsi / %rdi.  */
144	mov	%esi, %ecx
145	mov	%edi, %eax
146/* Use 64bit AND here to avoid long NOP padding.  */
147	and	$0x3f, %rcx		/* rsi alignment in cache line */
148	and	$0x3f, %rax		/* rdi alignment in cache line */
149# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* 16-byte constants for TOLOWER, emitted into .rodata.cst16;
	   .previous switches back to the code section.  They are then
	   loaded into %xmm4-%xmm6, which the *_reg macros name.  */
150	.section .rodata.cst16,"aM",@progbits,16
151	.align 16
152LABEL(lcase_min):
153	.quad	0x3f3f3f3f3f3f3f3f
154	.quad	0x3f3f3f3f3f3f3f3f
155LABEL(lcase_max):
156	.quad	0x9999999999999999
157	.quad	0x9999999999999999
158LABEL(case_add):
159	.quad	0x2020202020202020
160	.quad	0x2020202020202020
161	.previous
162	movdqa	LABEL(lcase_min)(%rip), %xmm4
163#  define LCASE_MIN_reg %xmm4
164	movdqa	LABEL(lcase_max)(%rip), %xmm5
165#  define LCASE_MAX_reg %xmm5
166	movdqa	LABEL(case_add)(%rip), %xmm6
167#  define CASE_ADD_reg %xmm6
168# endif
	/* An offset above 0x30 inside the 64-byte line means the 16 bytes
	   starting there extend past byte 63 of the line.  */
169	cmp	$0x30, %ecx
170	ja	LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
171	cmp	$0x30, %eax
172	ja	LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
	/* Neither load crosses a cache line: fetch the first 16 bytes of
	   both strings with unaligned loads.  */
173	movdqu	(%rdi), %xmm1
174	movdqu	(%rsi), %xmm2
175# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* TOLOWER (reg1, reg2): fold the ASCII letters 'A'..'Z' to lower case
   in both vector registers and leave every other byte unchanged.
   Per byte:
     t     = byte + 0x3f		(LCASE_MIN_reg)
     mask  = t > 0x99 as signed bytes	(LCASE_MAX_reg); false only for
	     byte in 0x41..0x5a, which t maps to 0x80..0x99
     byte += ~mask & 0x20		(CASE_ADD_reg)
   The two registers are processed in interleaved fashion.
   Clobbers %xmm7 and %xmm8.  */
176#  define TOLOWER(reg1, reg2) \
177	movdqa	LCASE_MIN_reg, %xmm7;					\
178	movdqa	LCASE_MIN_reg, %xmm8;					\
179	paddb	reg1, %xmm7;					\
180	paddb	reg2, %xmm8;					\
181	pcmpgtb	LCASE_MAX_reg, %xmm7;				\
182	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
183	pandn	CASE_ADD_reg, %xmm7;					\
184	pandn	CASE_ADD_reg, %xmm8;					\
185	paddb	%xmm7, reg1;					\
186	paddb	%xmm8, reg2
187
	/* Fold the two blocks just loaded from (%rdi) and (%rsi).  */
188	TOLOWER (%xmm1, %xmm2)
189# else
/* Case-sensitive variants: TOLOWER expands to nothing.  */
190#  define TOLOWER(reg1, reg2)
191# endif
192	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
193	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
194	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
	/* 0xff (equal) minus 0xff (NUL) clears the byte, so after the
	   subtraction a byte has its top bit set only where the two strings
	   are equal and the byte is not NUL.  */
195	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
196	pmovmskb %xmm1, %edx
197	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
198	jnz	LABEL(less16bytes)/* If not, find different value or null char */
199# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
200	sub	$16, %r11
201	jbe	LABEL(strcmp_exitz)/* finish comparison */
202# endif
	/* Adding 16 leaves the low four address bits unchanged, so %ecx and
	   %eax still yield the in-block offsets once masked with 0xf in the
	   code that follows.  */
203	add	$16, %rsi		/* prepare to search next 16 bytes */
204	add	$16, %rdi		/* prepare to search next 16 bytes */
205
206	/*
207	 * Determine source and destination string offsets from 16-byte
208	 * alignment.  Use relative offset difference between the two to
209	 * determine which case below to use.
210	 */
211	.p2align 4
212LABEL(crosscache):
	/* Switch to aligned 16-byte blocks.  %ecx / %eax become the offsets
	   of the two strings inside their blocks, and the code path is
	   chosen from the difference of those offsets.  */
213	and	$0xfffffffffffffff0, %rsi /* round %rsi down to a 16-byte boundary */
214	and	$0xfffffffffffffff0, %rdi /* round %rdi down to a 16-byte boundary */
215	mov	$0xffff, %edx		/* for equivalent offset */
216	xor	%r8d, %r8d
217	and	$0xf, %ecx		/* offset of rsi */
218	and	$0xf, %eax		/* offset of rdi */
219	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char check */
220	cmp	%eax, %ecx
221	je	LABEL(ashr_0)		/* rsi and rdi relative offset same */
222	ja	LABEL(bigger)
	/* Here %ecx < %eax: swap pointers and offsets so that %rsi/%ecx
	   always describe the string with the larger offset, and record the
	   swap as %r8d = 0xffff.  */
223	mov	%edx, %r8d		/* r8d is offset flag for exit tail */
224	xchg	%ecx, %eax
225	xchg	%rsi, %rdi
226LABEL(bigger):
227	movdqa	(%rdi), %xmm2
228	movdqa	(%rsi), %xmm1
	/* Table index = 15 + %rax - %rcx, which is 0..14 here because
	   %rcx > %rax.  Each entry of unaligned_table (defined outside this
	   part of the file) is read as a signed 32-bit value and added to
	   the table address to form the jump target.  */
229	lea	15(%rax), %r9
230	sub	%rcx, %r9
231	lea	LABEL(unaligned_table)(%rip), %r10
232	movslq	(%r10, %r9,4), %r9
233	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
234	lea	(%r10, %r9), %r10
235	_CET_NOTRACK jmp *%r10		/* jump to corresponding case */
236
237/*
238 * The following cases will be handled by ashr_0
239 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
240 *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
241 */
242	.p2align 4
243LABEL(ashr_0):
	/* Both strings have the same offset (%ecx) inside their 16-byte
	   blocks.  On entry %xmm0 is zero and %edx = 0xffff, as set up at
	   LABEL(crosscache).  */
244
245	movdqa	(%rsi), %xmm1
246	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
247# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
248	pcmpeqb	(%rdi), %xmm1		/* compare 16 bytes for equality */
249# else
250	movdqa	(%rdi), %xmm2
251	TOLOWER (%xmm1, %xmm2)
252	pcmpeqb	%xmm2, %xmm1		/* compare 16 bytes for equality */
253# endif
254	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
255	pmovmskb %xmm1, %r9d		/* bit i set: byte i equal and not NUL */
256	shr	%cl, %edx		/* adjust 0xffff for offset */
257	shr	%cl, %r9d		/* adjust for 16-byte offset */
258	sub	%r9d, %edx
259	/*
260	 * After the shifts %edx equals %r9d only if every byte from the
261	 * string start to the end of the block matched and none was a NUL.
262	 */
263	jne	LABEL(less32bytes)	/* mismatch or null char */
264	UPDATE_STRNCMP_COUNTER
265	mov	$16, %rcx
266	mov	$16, %r9
267
268	/*
269	 * Now both strings are aligned at 16-byte boundary. Loop over strings
270	 * checking 32-bytes per iteration.
271	 */
272	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
273	.p2align 4
274LABEL(ashr_0_use):
	/* First of two identical steps.  lea advances %rdx without touching
	   the flags, so jbe still tests the pcmpistri result (CF or ZF set:
	   mismatch or end of string).  */
275	movdqa	(%rdi,%rdx), %xmm0
276# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
277	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0
278# else
279	movdqa	(%rsi,%rdx), %xmm1
280	TOLOWER (%xmm0, %xmm1)
281	pcmpistri $0x1a, %xmm1, %xmm0
282# endif
283	lea	16(%rdx), %rdx
284	jbe	LABEL(ashr_0_exit_use)
285# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
286	sub	$16, %r11		/* 16 more bytes accounted for */
287	jbe	LABEL(strcmp_exitz)
288# endif
289
	/* Second step of the unrolled loop.  */
290	movdqa	(%rdi,%rdx), %xmm0
291# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
292	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0
293# else
294	movdqa	(%rsi,%rdx), %xmm1
295	TOLOWER (%xmm0, %xmm1)
296	pcmpistri $0x1a, %xmm1, %xmm0
297# endif
298	lea	16(%rdx), %rdx
299	jbe	LABEL(ashr_0_exit_use)
300# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
301	sub	$16, %r11
302	jbe	LABEL(strcmp_exitz)
303# endif
304	jmp	LABEL(ashr_0_use)
305
306
307	.p2align 4
308LABEL(ashr_0_exit_use):
	/* Reached from the ashr_0 loop with the pcmpistri flags intact and
	   %ecx = index reported by pcmpistri.  CF clear means the loop left
	   on ZF alone (case 2 of the table at the top of the file).  */
309	jnc	LABEL(strcmp_exitz)
310# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* Leave through strcmp_exitz when the index in %rcx is not below
	   the remaining count.  */
311	sub	%rcx, %r11
312	jbe	LABEL(strcmp_exitz)
313# endif
	/* %rdx was already advanced by 16 in the loop, so %rdx - 16 + %rcx
	   is the index of the byte reported by pcmpistri.  */
314	lea	-16(%rdx, %rcx), %rcx
315	movzbl	(%rdi, %rcx), %eax
316	movzbl	(%rsi, %rcx), %edx
317# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through the 4-byte-entry table at
	   _nl_C_LC_CTYPE_tolower + 128*4 before subtracting.  */
318	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
319	movl	(%rcx,%rax,4), %eax
320	movl	(%rcx,%rdx,4), %edx
321# endif
322	sub	%edx, %eax		/* return value: difference of the two bytes */
323	ret
324
325
326
327/*
328 * The following cases will be handled by ashr_1
329 * rcx(offset of rsi)  rax(offset of rdi)   relative offset	corresponding case
330 *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
331 */
332	.p2align 4
333LABEL(ashr_1):
	/* Case %rcx - %rax == 15 (see the case table above).  On entry, as
	   set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = aligned block
	   at (%rsi), %xmm2 = aligned block at (%rdi), %xmm0 = 0xff in every
	   byte where %xmm1 is NUL, %ecx = in-block offset of the %rsi
	   string, %edx = 0xffff.  */
334	pslldq	$15, %xmm2		/* shift first string to align with second */
335	TOLOWER (%xmm1, %xmm2)
336	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
337	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
338	pmovmskb %xmm2, %r9d		/* bit i set: byte i equal and not NUL */
339	shr	%cl, %edx		/* adjust 0xffff for offset */
340	shr	%cl, %r9d		/* adjust for 16-byte offset */
341	sub	%r9d, %edx		/* zero: all remaining bytes equal, no NUL */
342	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
	/* NOTE(review): %xmm3 is not read anywhere in the visible part of
	   this file.  */
343	movdqa	(%rdi), %xmm3
344	UPDATE_STRNCMP_COUNTER
345
346	mov	$16, %rcx		/* index for loads*/
347	mov	$1, %r9d		/* byte position left over from less32bytes case */
348	/*
349	 * Setup %r10 value allows us to detect crossing a page boundary.
350	 * When %r10 goes positive we have crossed a page boundary and
351	 * need to do a nibble.
352	 */
353	lea	1(%rdi), %r10
354	and	$0xfff, %r10		/* offset into 4K page */
355	sub	$0x1000, %r10		/* subtract 4K pagesize */
356	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
357
358	.p2align 4
359LABEL(loop_ashr_1_use):
	/* Main loop, unrolled twice.  %rdx is the running byte index; each
	   step compares the 16 bytes at %rsi + %rdx with the 16 bytes that
	   start 1 byte into the previous %rdi block.  */
360	add	$16, %r10
361	jg	LABEL(nibble_ashr_1_use)
362
363LABEL(nibble_ashr_1_restart_use):
364	movdqa	(%rdi, %rdx), %xmm0
365	palignr $1, -16(%rdi, %rdx), %xmm0	/* 16 bytes at %rdi+%rdx-16+1 */
366# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
367	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
368# else
369	movdqa	(%rsi,%rdx), %xmm1
370	TOLOWER (%xmm0, %xmm1)
371	pcmpistri $0x1a, %xmm1, %xmm0
372# endif
373	jbe	LABEL(exit_use)		/* CF or ZF set: mismatch or EOS */
374# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
375	sub	$16, %r11		/* 16 more bytes accounted for */
376	jbe	LABEL(strcmp_exitz)
377# endif
378
	/* Second, identical step of the unrolled loop.  */
379	add	$16, %rdx
380	add	$16, %r10
381	jg	LABEL(nibble_ashr_1_use)
382
383	movdqa	(%rdi, %rdx), %xmm0
384	palignr $1, -16(%rdi, %rdx), %xmm0
385# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
386	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
387# else
388	movdqa	(%rsi,%rdx), %xmm1
389	TOLOWER (%xmm0, %xmm1)
390	pcmpistri $0x1a, %xmm1, %xmm0
391# endif
392	jbe	LABEL(exit_use)
393# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
394	sub	$16, %r11
395	jbe	LABEL(strcmp_exitz)
396# endif
397	add	$16, %rdx
398	jmp	LABEL(loop_ashr_1_use)
399
400	.p2align 4
401LABEL(nibble_ashr_1_use):
	/* Page-boundary path.  Rewind %r10 by one page, then look only at
	   the bytes of the previous %rdi block that are still uncompared:
	   psrldq drops the low 1 byte and shifts zeros in at the top.
	   pcmpistri $0x3a (signed bytes, equal each, masked negative
	   polarity, least significant index) with the same register as both
	   operands leaves the index of the first NUL byte in %ecx.
	   %ecx > 14 means none of the 15 remaining bytes is NUL, so the loop
	   resumes at the restart label; otherwise control goes to
	   nibble_ashr_exit_use.  */
402	sub	$0x1000, %r10
403	movdqa	-16(%rdi, %rdx), %xmm0
404	psrldq	$1, %xmm0
405	pcmpistri      $0x3a,%xmm0, %xmm0
406# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The length limit is reached within these bytes.  */
407	cmp	%r11, %rcx
408	jae	LABEL(nibble_ashr_exit_use)
409# endif
410	cmp	$14, %ecx
411	ja	LABEL(nibble_ashr_1_restart_use)
412
413	jmp	LABEL(nibble_ashr_exit_use)
414
415/*
416 * The following cases will be handled by ashr_2
417 * rcx(offset of rsi)  rax(offset of rdi)   relative offset	corresponding case
418 *        n(14~15)            n -14         1(15 +(n-14) - n)         ashr_2
419 */
420	.p2align 4
421LABEL(ashr_2):
	/* Case %rcx - %rax == 14 (see the case table above).  On entry, as
	   set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = aligned block
	   at (%rsi), %xmm2 = aligned block at (%rdi), %xmm0 = 0xff in every
	   byte where %xmm1 is NUL, %ecx = in-block offset of the %rsi
	   string, %edx = 0xffff.  */
422	pslldq	$14, %xmm2		/* line %xmm2 up with %xmm1 */
423	TOLOWER (%xmm1, %xmm2)
424	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
425	psubb	%xmm0, %xmm2		/* cancel matches on NUL bytes */
426	pmovmskb %xmm2, %r9d		/* bit i set: byte i equal and not NUL */
427	shr	%cl, %edx		/* drop bits before the string start */
428	shr	%cl, %r9d
429	sub	%r9d, %edx		/* zero: all remaining bytes equal, no NUL */
430	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is not read anywhere in the visible part of
	   this file.  */
431	movdqa	(%rdi), %xmm3
432	UPDATE_STRNCMP_COUNTER
433
434	mov	$16, %rcx	/* index for loads */
435	mov	$2, %r9d	/* byte position left over from less32bytes case */
436	/*
437	 * Setup %r10 value allows us to detect crossing a page boundary.
438	 * When %r10 goes positive we have crossed a page boundary and
439	 * need to do a nibble.
440	 */
441	lea	2(%rdi), %r10
442	and	$0xfff, %r10	/* offset into 4K page */
443	sub	$0x1000, %r10	/* subtract 4K pagesize */
444	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
445
446	.p2align 4
447LABEL(loop_ashr_2_use):
	/* Main loop, unrolled twice.  %rdx is the running byte index; each
	   step compares the 16 bytes at %rsi + %rdx with the 16 bytes that
	   start 2 bytes into the previous %rdi block.  */
448	add	$16, %r10
449	jg	LABEL(nibble_ashr_2_use)
450
451LABEL(nibble_ashr_2_restart_use):
452	movdqa	(%rdi, %rdx), %xmm0
453	palignr $2, -16(%rdi, %rdx), %xmm0	/* 16 bytes at %rdi+%rdx-16+2 */
454# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
455	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
456# else
457	movdqa	(%rsi,%rdx), %xmm1
458	TOLOWER (%xmm0, %xmm1)
459	pcmpistri $0x1a, %xmm1, %xmm0
460# endif
461	jbe	LABEL(exit_use)		/* CF or ZF set: mismatch or EOS */
462# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
463	sub	$16, %r11		/* 16 more bytes accounted for */
464	jbe	LABEL(strcmp_exitz)
465# endif
466
	/* Second, identical step of the unrolled loop.  */
467	add	$16, %rdx
468	add	$16, %r10
469	jg	LABEL(nibble_ashr_2_use)
470
471	movdqa	(%rdi, %rdx), %xmm0
472	palignr $2, -16(%rdi, %rdx), %xmm0
473# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
474	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
475# else
476	movdqa	(%rsi,%rdx), %xmm1
477	TOLOWER (%xmm0, %xmm1)
478	pcmpistri $0x1a, %xmm1, %xmm0
479# endif
480	jbe	LABEL(exit_use)
481# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
482	sub	$16, %r11
483	jbe	LABEL(strcmp_exitz)
484# endif
485	add	$16, %rdx
486	jmp	LABEL(loop_ashr_2_use)
487
488	.p2align 4
489LABEL(nibble_ashr_2_use):
	/* Page-boundary path.  Rewind %r10 by one page, then look only at
	   the bytes of the previous %rdi block that are still uncompared:
	   psrldq drops the low 2 bytes and shifts zeros in at the top.
	   pcmpistri $0x3a with the same register as both operands leaves
	   the index of the first NUL byte in %ecx.  %ecx > 13 means none of
	   the 14 remaining bytes is NUL, so the loop resumes at the restart
	   label; otherwise control goes to nibble_ashr_exit_use.  */
490	sub	$0x1000, %r10
491	movdqa	-16(%rdi, %rdx), %xmm0
492	psrldq	$2, %xmm0
493	pcmpistri      $0x3a,%xmm0, %xmm0
494# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The length limit is reached within these bytes.  */
495	cmp	%r11, %rcx
496	jae	LABEL(nibble_ashr_exit_use)
497# endif
498	cmp	$13, %ecx
499	ja	LABEL(nibble_ashr_2_restart_use)
500
501	jmp	LABEL(nibble_ashr_exit_use)
502
503/*
504 * The following cases will be handled by ashr_3
505 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
506 *        n(13~15)            n -13         2(15 +(n-13) - n)         ashr_3
507 */
508	.p2align 4
509LABEL(ashr_3):
	/* Case %rcx - %rax == 13 (see the case table above).  On entry, as
	   set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = aligned block
	   at (%rsi), %xmm2 = aligned block at (%rdi), %xmm0 = 0xff in every
	   byte where %xmm1 is NUL, %ecx = in-block offset of the %rsi
	   string, %edx = 0xffff.  */
510	pslldq	$13, %xmm2		/* line %xmm2 up with %xmm1 */
511	TOLOWER (%xmm1, %xmm2)
512	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
513	psubb	%xmm0, %xmm2		/* cancel matches on NUL bytes */
514	pmovmskb %xmm2, %r9d		/* bit i set: byte i equal and not NUL */
515	shr	%cl, %edx		/* drop bits before the string start */
516	shr	%cl, %r9d
517	sub	%r9d, %edx		/* zero: all remaining bytes equal, no NUL */
518	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is not read anywhere in the visible part of
	   this file.  */
519	movdqa	(%rdi), %xmm3
520
521	UPDATE_STRNCMP_COUNTER
522
523	mov	$16, %rcx	/* index for loads */
524	mov	$3, %r9d	/* byte position left over from less32bytes case */
525	/*
526	 * Setup %r10 value allows us to detect crossing a page boundary.
527	 * When %r10 goes positive we have crossed a page boundary and
528	 * need to do a nibble.
529	 */
530	lea	3(%rdi), %r10
531	and	$0xfff, %r10	/* offset into 4K page */
532	sub	$0x1000, %r10	/* subtract 4K pagesize */
533	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
534
535LABEL(loop_ashr_3_use):
	/* Main loop, unrolled twice.  %rdx is the running byte index; each
	   step compares the 16 bytes at %rsi + %rdx with the 16 bytes that
	   start 3 bytes into the previous %rdi block.  Unlike the other
	   ashr_N loops in view, this loop head has no .p2align.  */
536	add	$16, %r10
537	jg	LABEL(nibble_ashr_3_use)
538
539LABEL(nibble_ashr_3_restart_use):
540	movdqa	(%rdi, %rdx), %xmm0
541	palignr $3, -16(%rdi, %rdx), %xmm0	/* 16 bytes at %rdi+%rdx-16+3 */
542# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
543	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
544# else
545	movdqa	(%rsi,%rdx), %xmm1
546	TOLOWER (%xmm0, %xmm1)
547	pcmpistri $0x1a, %xmm1, %xmm0
548# endif
549	jbe	LABEL(exit_use)		/* CF or ZF set: mismatch or EOS */
550# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
551	sub	$16, %r11		/* 16 more bytes accounted for */
552	jbe	LABEL(strcmp_exitz)
553# endif
554
	/* Second, identical step of the unrolled loop.  */
555	add	$16, %rdx
556	add	$16, %r10
557	jg	LABEL(nibble_ashr_3_use)
558
559	movdqa	(%rdi, %rdx), %xmm0
560	palignr $3, -16(%rdi, %rdx), %xmm0
561# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
562	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
563# else
564	movdqa	(%rsi,%rdx), %xmm1
565	TOLOWER (%xmm0, %xmm1)
566	pcmpistri $0x1a, %xmm1, %xmm0
567# endif
568	jbe	LABEL(exit_use)
569# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
570	sub	$16, %r11
571	jbe	LABEL(strcmp_exitz)
572# endif
573	add	$16, %rdx
574	jmp	LABEL(loop_ashr_3_use)
575
576	.p2align 4
577LABEL(nibble_ashr_3_use):
	/* Page-boundary path.  Rewind %r10 by one page, then look only at
	   the bytes of the previous %rdi block that are still uncompared:
	   psrldq drops the low 3 bytes and shifts zeros in at the top.
	   pcmpistri $0x3a with the same register as both operands leaves
	   the index of the first NUL byte in %ecx.  %ecx > 12 means none of
	   the 13 remaining bytes is NUL, so the loop resumes at the restart
	   label; otherwise control goes to nibble_ashr_exit_use.  */
578	sub	$0x1000, %r10
579	movdqa	-16(%rdi, %rdx), %xmm0
580	psrldq	$3, %xmm0
581	pcmpistri      $0x3a,%xmm0, %xmm0
582# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The length limit is reached within these bytes.  */
583	cmp	%r11, %rcx
584	jae	LABEL(nibble_ashr_exit_use)
585# endif
586	cmp	$12, %ecx
587	ja	LABEL(nibble_ashr_3_restart_use)
588
589	jmp	LABEL(nibble_ashr_exit_use)
590
591/*
592 * The following cases will be handled by ashr_4
593 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
594 *        n(12~15)            n -12         3(15 +(n-12) - n)         ashr_4
595 */
596	.p2align 4
597LABEL(ashr_4):
	/* Case %rcx - %rax == 12 (see the case table above).  On entry, as
	   set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = aligned block
	   at (%rsi), %xmm2 = aligned block at (%rdi), %xmm0 = 0xff in every
	   byte where %xmm1 is NUL, %ecx = in-block offset of the %rsi
	   string, %edx = 0xffff.  */
598	pslldq	$12, %xmm2		/* line %xmm2 up with %xmm1 */
599	TOLOWER (%xmm1, %xmm2)
600	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
601	psubb	%xmm0, %xmm2		/* cancel matches on NUL bytes */
602	pmovmskb %xmm2, %r9d		/* bit i set: byte i equal and not NUL */
603	shr	%cl, %edx		/* drop bits before the string start */
604	shr	%cl, %r9d
605	sub	%r9d, %edx		/* zero: all remaining bytes equal, no NUL */
606	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is not read anywhere in the visible part of
	   this file.  */
607	movdqa	(%rdi), %xmm3
608
609	UPDATE_STRNCMP_COUNTER
610
611	mov	$16, %rcx	/* index for loads */
612	mov	$4, %r9d	/* byte position left over from less32bytes case */
613	/*
614	 * Setup %r10 value allows us to detect crossing a page boundary.
615	 * When %r10 goes positive we have crossed a page boundary and
616	 * need to do a nibble.
617	 */
618	lea	4(%rdi), %r10
619	and	$0xfff, %r10	/* offset into 4K page */
620	sub	$0x1000, %r10	/* subtract 4K pagesize */
621	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
622
623	.p2align 4
624LABEL(loop_ashr_4_use):
	/* Main loop, unrolled twice.  %rdx is the running byte index; each
	   step compares the 16 bytes at %rsi + %rdx with the 16 bytes that
	   start 4 bytes into the previous %rdi block.  */
625	add	$16, %r10
626	jg	LABEL(nibble_ashr_4_use)
627
628LABEL(nibble_ashr_4_restart_use):
629	movdqa	(%rdi, %rdx), %xmm0
630	palignr $4, -16(%rdi, %rdx), %xmm0	/* 16 bytes at %rdi+%rdx-16+4 */
631# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
632	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
633# else
634	movdqa	(%rsi,%rdx), %xmm1
635	TOLOWER (%xmm0, %xmm1)
636	pcmpistri $0x1a, %xmm1, %xmm0
637# endif
638	jbe	LABEL(exit_use)		/* CF or ZF set: mismatch or EOS */
639# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
640	sub	$16, %r11		/* 16 more bytes accounted for */
641	jbe	LABEL(strcmp_exitz)
642# endif
643
	/* Second, identical step of the unrolled loop.  */
644	add	$16, %rdx
645	add	$16, %r10
646	jg	LABEL(nibble_ashr_4_use)
647
648	movdqa	(%rdi, %rdx), %xmm0
649	palignr $4, -16(%rdi, %rdx), %xmm0
650# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
651	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
652# else
653	movdqa	(%rsi,%rdx), %xmm1
654	TOLOWER (%xmm0, %xmm1)
655	pcmpistri $0x1a, %xmm1, %xmm0
656# endif
657	jbe	LABEL(exit_use)
658# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
659	sub	$16, %r11
660	jbe	LABEL(strcmp_exitz)
661# endif
662	add	$16, %rdx
663	jmp	LABEL(loop_ashr_4_use)
664
665	.p2align 4
666LABEL(nibble_ashr_4_use):
	/* Page-boundary path.  Rewind %r10 by one page, then look only at
	   the bytes of the previous %rdi block that are still uncompared:
	   psrldq drops the low 4 bytes and shifts zeros in at the top.
	   pcmpistri $0x3a with the same register as both operands leaves
	   the index of the first NUL byte in %ecx.  %ecx > 11 means none of
	   the 12 remaining bytes is NUL, so the loop resumes at the restart
	   label; otherwise control goes to nibble_ashr_exit_use.  */
667	sub	$0x1000, %r10
668	movdqa	-16(%rdi, %rdx), %xmm0
669	psrldq	$4, %xmm0
670	pcmpistri      $0x3a,%xmm0, %xmm0
671# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The length limit is reached within these bytes.  */
672	cmp	%r11, %rcx
673	jae	LABEL(nibble_ashr_exit_use)
674# endif
675	cmp	$11, %ecx
676	ja	LABEL(nibble_ashr_4_restart_use)
677
678	jmp	LABEL(nibble_ashr_exit_use)
679
680/*
681 * The following cases will be handled by ashr_5
682 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
683 *        n(11~15)          n - 11		  4(15 +(n-11) - n)         ashr_5
684 */
685	.p2align 4
686LABEL(ashr_5):
	/* Case %rcx - %rax == 11 (see the case table above).  On entry, as
	   set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = aligned block
	   at (%rsi), %xmm2 = aligned block at (%rdi), %xmm0 = 0xff in every
	   byte where %xmm1 is NUL, %ecx = in-block offset of the %rsi
	   string, %edx = 0xffff.  */
687	pslldq	$11, %xmm2		/* line %xmm2 up with %xmm1 */
688	TOLOWER (%xmm1, %xmm2)
689	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
690	psubb	%xmm0, %xmm2		/* cancel matches on NUL bytes */
691	pmovmskb %xmm2, %r9d		/* bit i set: byte i equal and not NUL */
692	shr	%cl, %edx		/* drop bits before the string start */
693	shr	%cl, %r9d
694	sub	%r9d, %edx		/* zero: all remaining bytes equal, no NUL */
695	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is not read anywhere in the visible part of
	   this file.  */
696	movdqa	(%rdi), %xmm3
697
698	UPDATE_STRNCMP_COUNTER
699
700	mov	$16, %rcx	/* index for loads */
701	mov	$5, %r9d	/* byte position left over from less32bytes case */
702	/*
703	 * Setup %r10 value allows us to detect crossing a page boundary.
704	 * When %r10 goes positive we have crossed a page boundary and
705	 * need to do a nibble.
706	 */
707	lea	5(%rdi), %r10
708	and	$0xfff, %r10	/* offset into 4K page */
709	sub	$0x1000, %r10	/* subtract 4K pagesize */
710	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
711
712	.p2align 4
713LABEL(loop_ashr_5_use):
	/* Main loop, unrolled twice.  %rdx is the running byte index; each
	   step compares the 16 bytes at %rsi + %rdx with the 16 bytes that
	   start 5 bytes into the previous %rdi block.  */
714	add	$16, %r10
715	jg	LABEL(nibble_ashr_5_use)
716
717LABEL(nibble_ashr_5_restart_use):
718	movdqa	(%rdi, %rdx), %xmm0
719	palignr $5, -16(%rdi, %rdx), %xmm0	/* 16 bytes at %rdi+%rdx-16+5 */
720# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
721	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
722# else
723	movdqa	(%rsi,%rdx), %xmm1
724	TOLOWER (%xmm0, %xmm1)
725	pcmpistri $0x1a, %xmm1, %xmm0
726# endif
727	jbe	LABEL(exit_use)		/* CF or ZF set: mismatch or EOS */
728# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
729	sub	$16, %r11		/* 16 more bytes accounted for */
730	jbe	LABEL(strcmp_exitz)
731# endif
732
	/* Second, identical step of the unrolled loop.  */
733	add	$16, %rdx
734	add	$16, %r10
735	jg	LABEL(nibble_ashr_5_use)
736
737	movdqa	(%rdi, %rdx), %xmm0
738
739	palignr $5, -16(%rdi, %rdx), %xmm0
740# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
741	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
742# else
743	movdqa	(%rsi,%rdx), %xmm1
744	TOLOWER (%xmm0, %xmm1)
745	pcmpistri $0x1a, %xmm1, %xmm0
746# endif
747	jbe	LABEL(exit_use)
748# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
749	sub	$16, %r11
750	jbe	LABEL(strcmp_exitz)
751# endif
752	add	$16, %rdx
753	jmp	LABEL(loop_ashr_5_use)
754
755	.p2align 4
756LABEL(nibble_ashr_5_use):
	/* Page-boundary path.  Rewind %r10 by one page, then look only at
	   the bytes of the previous %rdi block that are still uncompared:
	   psrldq drops the low 5 bytes and shifts zeros in at the top.
	   pcmpistri $0x3a with the same register as both operands leaves
	   the index of the first NUL byte in %ecx.  %ecx > 10 means none of
	   the 11 remaining bytes is NUL, so the loop resumes at the restart
	   label; otherwise control goes to nibble_ashr_exit_use.  */
757	sub	$0x1000, %r10
758	movdqa	-16(%rdi, %rdx), %xmm0
759	psrldq	$5, %xmm0
760	pcmpistri      $0x3a,%xmm0, %xmm0
761# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The length limit is reached within these bytes.  */
762	cmp	%r11, %rcx
763	jae	LABEL(nibble_ashr_exit_use)
764# endif
765	cmp	$10, %ecx
766	ja	LABEL(nibble_ashr_5_restart_use)
767
768	jmp	LABEL(nibble_ashr_exit_use)
769
770/*
771 * The following cases will be handled by ashr_6
772 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
773 *        n(10~15)          n - 10		  5(15 +(n-10) - n)         ashr_6
774 */
775	.p2align 4
776LABEL(ashr_6):
	/* Case %rcx - %rax == 10 (see the case table above).  On entry, as
	   set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = aligned block
	   at (%rsi), %xmm2 = aligned block at (%rdi), %xmm0 = 0xff in every
	   byte where %xmm1 is NUL, %ecx = in-block offset of the %rsi
	   string, %edx = 0xffff.  */
777	pslldq	$10, %xmm2		/* line %xmm2 up with %xmm1 */
778	TOLOWER (%xmm1, %xmm2)
779	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
780	psubb	%xmm0, %xmm2		/* cancel matches on NUL bytes */
781	pmovmskb %xmm2, %r9d		/* bit i set: byte i equal and not NUL */
782	shr	%cl, %edx		/* drop bits before the string start */
783	shr	%cl, %r9d
784	sub	%r9d, %edx		/* zero: all remaining bytes equal, no NUL */
785	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is not read anywhere in the visible part of
	   this file.  */
786	movdqa	(%rdi), %xmm3
787
788	UPDATE_STRNCMP_COUNTER
789
790	mov	$16, %rcx	/* index for loads */
791	mov	$6, %r9d	/* byte position left over from less32bytes case */
792	/*
793	 * Setup %r10 value allows us to detect crossing a page boundary.
794	 * When %r10 goes positive we have crossed a page boundary and
795	 * need to do a nibble.
796	 */
797	lea	6(%rdi), %r10
798	and	$0xfff, %r10	/* offset into 4K page */
799	sub	$0x1000, %r10	/* subtract 4K pagesize */
800	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
801
802	.p2align 4
803LABEL(loop_ashr_6_use):
	/* Main loop, unrolled twice.  %rdx is the running byte index; each
	   step compares the 16 bytes at %rsi + %rdx with the 16 bytes that
	   start 6 bytes into the previous %rdi block.  */
804	add	$16, %r10
805	jg	LABEL(nibble_ashr_6_use)
806
807LABEL(nibble_ashr_6_restart_use):
808	movdqa	(%rdi, %rdx), %xmm0
809	palignr $6, -16(%rdi, %rdx), %xmm0	/* 16 bytes at %rdi+%rdx-16+6 */
810# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
811	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
812# else
813	movdqa	(%rsi,%rdx), %xmm1
814	TOLOWER (%xmm0, %xmm1)
815	pcmpistri $0x1a, %xmm1, %xmm0
816# endif
817	jbe	LABEL(exit_use)		/* CF or ZF set: mismatch or EOS */
818# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
819	sub	$16, %r11		/* 16 more bytes accounted for */
820	jbe	LABEL(strcmp_exitz)
821# endif
822
	/* Second, identical step of the unrolled loop.  */
823	add	$16, %rdx
824	add	$16, %r10
825	jg	LABEL(nibble_ashr_6_use)
826
827	movdqa	(%rdi, %rdx), %xmm0
828	palignr $6, -16(%rdi, %rdx), %xmm0
829# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
830	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
831# else
832	movdqa	(%rsi,%rdx), %xmm1
833	TOLOWER (%xmm0, %xmm1)
834	pcmpistri $0x1a, %xmm1, %xmm0
835# endif
836	jbe	LABEL(exit_use)
837# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
838	sub	$16, %r11
839	jbe	LABEL(strcmp_exitz)
840# endif
841	add	$16, %rdx
842	jmp	LABEL(loop_ashr_6_use)
843
844	.p2align 4
845LABEL(nibble_ashr_6_use):
	/* Page-boundary path.  Rewind %r10 by one page, then look only at
	   the bytes of the previous %rdi block that are still uncompared:
	   psrldq drops the low 6 bytes and shifts zeros in at the top.
	   pcmpistri $0x3a with the same register as both operands leaves
	   the index of the first NUL byte in %ecx.  %ecx > 9 means none of
	   the 10 remaining bytes is NUL, so the loop resumes at the restart
	   label; otherwise control goes to nibble_ashr_exit_use.  */
846	sub	$0x1000, %r10
847	movdqa	-16(%rdi, %rdx), %xmm0
848	psrldq	$6, %xmm0
849	pcmpistri      $0x3a,%xmm0, %xmm0
850# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The length limit is reached within these bytes.  */
851	cmp	%r11, %rcx
852	jae	LABEL(nibble_ashr_exit_use)
853# endif
854	cmp	$9, %ecx
855	ja	LABEL(nibble_ashr_6_restart_use)
856
857	jmp	LABEL(nibble_ashr_exit_use)
858
859/*
860 * The following cases will be handled by ashr_7
861 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
862 *        n(9~15)          n - 9		  6(15 +(n - 9) - n)         ashr_7
863 */
864	.p2align 4
865LABEL(ashr_7):
	/* Case %rcx - %rax == 9 (see the case table above).  On entry, as
	   set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = aligned block
	   at (%rsi), %xmm2 = aligned block at (%rdi), %xmm0 = 0xff in every
	   byte where %xmm1 is NUL, %ecx = in-block offset of the %rsi
	   string, %edx = 0xffff.  */
866	pslldq	$9, %xmm2		/* line %xmm2 up with %xmm1 */
867	TOLOWER (%xmm1, %xmm2)
868	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
869	psubb	%xmm0, %xmm2		/* cancel matches on NUL bytes */
870	pmovmskb %xmm2, %r9d		/* bit i set: byte i equal and not NUL */
871	shr	%cl, %edx		/* drop bits before the string start */
872	shr	%cl, %r9d
873	sub	%r9d, %edx		/* zero: all remaining bytes equal, no NUL */
874	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is not read anywhere in the visible part of
	   this file.  */
875	movdqa	(%rdi), %xmm3
876
877	UPDATE_STRNCMP_COUNTER
878
879	mov	$16, %rcx	/* index for loads */
880	mov	$7, %r9d	/* byte position left over from less32bytes case */
881	/*
882	 * Setup %r10 value allows us to detect crossing a page boundary.
883	 * When %r10 goes positive we have crossed a page boundary and
884	 * need to do a nibble.
885	 */
886	lea	7(%rdi), %r10
887	and	$0xfff, %r10	/* offset into 4K page */
888	sub	$0x1000, %r10	/* subtract 4K pagesize */
889	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
890
891	.p2align 4
892LABEL(loop_ashr_7_use):
	/* Main loop, unrolled twice.  %rdx is the running byte index; each
	   step compares the 16 bytes at %rsi + %rdx with the 16 bytes that
	   start 7 bytes into the previous %rdi block.  */
893	add	$16, %r10
894	jg	LABEL(nibble_ashr_7_use)
895
896LABEL(nibble_ashr_7_restart_use):
897	movdqa	(%rdi, %rdx), %xmm0
898	palignr $7, -16(%rdi, %rdx), %xmm0	/* 16 bytes at %rdi+%rdx-16+7 */
899# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
900	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
901# else
902	movdqa	(%rsi,%rdx), %xmm1
903	TOLOWER (%xmm0, %xmm1)
904	pcmpistri $0x1a, %xmm1, %xmm0
905# endif
906	jbe	LABEL(exit_use)		/* CF or ZF set: mismatch or EOS */
907# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
908	sub	$16, %r11		/* 16 more bytes accounted for */
909	jbe	LABEL(strcmp_exitz)
910# endif
911
	/* Second, identical step of the unrolled loop.  */
912	add	$16, %rdx
913	add	$16, %r10
914	jg	LABEL(nibble_ashr_7_use)
915
916	movdqa	(%rdi, %rdx), %xmm0
917	palignr $7, -16(%rdi, %rdx), %xmm0
918# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
919	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
920# else
921	movdqa	(%rsi,%rdx), %xmm1
922	TOLOWER (%xmm0, %xmm1)
923	pcmpistri $0x1a, %xmm1, %xmm0
924# endif
925	jbe	LABEL(exit_use)
926# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
927	sub	$16, %r11
928	jbe	LABEL(strcmp_exitz)
929# endif
930	add	$16, %rdx
931	jmp	LABEL(loop_ashr_7_use)
932
933	.p2align 4
934LABEL(nibble_ashr_7_use):
	/* Page-boundary path.  Rewind %r10 by one page, then look only at
	   the bytes of the previous %rdi block that are still uncompared:
	   psrldq drops the low 7 bytes and shifts zeros in at the top.
	   pcmpistri $0x3a with the same register as both operands leaves
	   the index of the first NUL byte in %ecx.  %ecx > 8 means none of
	   the 9 remaining bytes is NUL, so the loop resumes at the restart
	   label; otherwise control goes to nibble_ashr_exit_use.  */
935	sub	$0x1000, %r10
936	movdqa	-16(%rdi, %rdx), %xmm0
937	psrldq	$7, %xmm0
938	pcmpistri      $0x3a,%xmm0, %xmm0
939# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The length limit is reached within these bytes.  */
940	cmp	%r11, %rcx
941	jae	LABEL(nibble_ashr_exit_use)
942# endif
943	cmp	$8, %ecx
944	ja	LABEL(nibble_ashr_7_restart_use)
945
946	jmp	LABEL(nibble_ashr_exit_use)
947
948/*
949 *  The following cases will be handled by ashr_8
950 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
951 *        n(8~15)          n - 8		  7(15 +(n - 8) - n)         ashr_8
952 */
953	.p2align 4
954LABEL(ashr_8):
	/* Case %rcx - %rax == 8 (see the case table above).  On entry, as
	   set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = aligned block
	   at (%rsi), %xmm2 = aligned block at (%rdi), %xmm0 = 0xff in every
	   byte where %xmm1 is NUL, %ecx = in-block offset of the %rsi
	   string, %edx = 0xffff.  */
955	pslldq	$8, %xmm2		/* line %xmm2 up with %xmm1 */
956	TOLOWER (%xmm1, %xmm2)
957	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
958	psubb	%xmm0, %xmm2		/* cancel matches on NUL bytes */
959	pmovmskb %xmm2, %r9d		/* bit i set: byte i equal and not NUL */
960	shr	%cl, %edx		/* drop bits before the string start */
961	shr	%cl, %r9d
962	sub	%r9d, %edx		/* zero: all remaining bytes equal, no NUL */
963	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is not read anywhere in the visible part of
	   this file.  */
964	movdqa	(%rdi), %xmm3
965
966	UPDATE_STRNCMP_COUNTER
967
968	mov	$16, %rcx	/* index for loads */
969	mov	$8, %r9d	/* byte position left over from less32bytes case */
970	/*
971	 * Setup %r10 value allows us to detect crossing a page boundary.
972	 * When %r10 goes positive we have crossed a page boundary and
973	 * need to do a nibble.
974	 */
975	lea	8(%rdi), %r10
976	and	$0xfff, %r10	/* offset into 4K page */
977	sub	$0x1000, %r10	/* subtract 4K pagesize */
978	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
979
980	.p2align 4
981LABEL(loop_ashr_8_use):
	/* Main loop, unrolled twice.  %rdx is the running byte index; each
	   step compares the 16 bytes at %rsi + %rdx with the 16 bytes that
	   start 8 bytes into the previous %rdi block.  */
982	add	$16, %r10
983	jg	LABEL(nibble_ashr_8_use)
984
985LABEL(nibble_ashr_8_restart_use):
986	movdqa	(%rdi, %rdx), %xmm0
987	palignr $8, -16(%rdi, %rdx), %xmm0	/* 16 bytes at %rdi+%rdx-16+8 */
988# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
989	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
990# else
991	movdqa	(%rsi,%rdx), %xmm1
992	TOLOWER (%xmm0, %xmm1)
993	pcmpistri $0x1a, %xmm1, %xmm0
994# endif
995	jbe	LABEL(exit_use)		/* CF or ZF set: mismatch or EOS */
996# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
997	sub	$16, %r11		/* 16 more bytes accounted for */
998	jbe	LABEL(strcmp_exitz)
999# endif
1000
	/* Second, identical step of the unrolled loop.  */
1001	add	$16, %rdx
1002	add	$16, %r10
1003	jg	LABEL(nibble_ashr_8_use)
1004
1005	movdqa	(%rdi, %rdx), %xmm0
1006	palignr $8, -16(%rdi, %rdx), %xmm0
1007# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1008	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1009# else
1010	movdqa	(%rsi,%rdx), %xmm1
1011	TOLOWER (%xmm0, %xmm1)
1012	pcmpistri $0x1a, %xmm1, %xmm0
1013# endif
1014	jbe	LABEL(exit_use)
1015# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1016	sub	$16, %r11
1017	jbe	LABEL(strcmp_exitz)
1018# endif
1019	add	$16, %rdx
1020	jmp	LABEL(loop_ashr_8_use)
1021
1022	.p2align 4
1023LABEL(nibble_ashr_8_use):
	/* Page-boundary path.  Rewind %r10 by one page, then look only at
	   the bytes of the previous %rdi block that are still uncompared:
	   psrldq drops the low 8 bytes and shifts zeros in at the top.
	   pcmpistri $0x3a with the same register as both operands leaves
	   the index of the first NUL byte in %ecx.  %ecx > 7 means none of
	   the 8 remaining bytes is NUL, so the loop resumes at the restart
	   label; otherwise control goes to nibble_ashr_exit_use.  */
1024	sub	$0x1000, %r10
1025	movdqa	-16(%rdi, %rdx), %xmm0
1026	psrldq	$8, %xmm0
1027	pcmpistri      $0x3a,%xmm0, %xmm0
1028# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The length limit is reached within these bytes.  */
1029	cmp	%r11, %rcx
1030	jae	LABEL(nibble_ashr_exit_use)
1031# endif
1032	cmp	$7, %ecx
1033	ja	LABEL(nibble_ashr_8_restart_use)
1034
1035	jmp	LABEL(nibble_ashr_exit_use)
1036
1037/*
1038 *  The following cases will be handled by ashr_9
1039 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1040 *        n(7~15)          n - 7		  8(15 +(n - 7) - n)         ashr_9
1041 */
1042	.p2align 4
1043LABEL(ashr_9):
	/* Case %rcx - %rax == 7 (see the case table above).  On entry, as
	   set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = aligned block
	   at (%rsi), %xmm2 = aligned block at (%rdi), %xmm0 = 0xff in every
	   byte where %xmm1 is NUL, %ecx = in-block offset of the %rsi
	   string, %edx = 0xffff.  */
1044	pslldq	$7, %xmm2		/* line %xmm2 up with %xmm1 */
1045	TOLOWER (%xmm1, %xmm2)
1046	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
1047	psubb	%xmm0, %xmm2		/* cancel matches on NUL bytes */
1048	pmovmskb %xmm2, %r9d		/* bit i set: byte i equal and not NUL */
1049	shr	%cl, %edx		/* drop bits before the string start */
1050	shr	%cl, %r9d
1051	sub	%r9d, %edx		/* zero: all remaining bytes equal, no NUL */
1052	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is not read anywhere in the visible part of
	   this file.  */
1053	movdqa	(%rdi), %xmm3
1054
1055	UPDATE_STRNCMP_COUNTER
1056
1057	mov	$16, %rcx	/* index for loads */
1058	mov	$9, %r9d	/* byte position left over from less32bytes case */
1059	/*
1060	 * Setup %r10 value allows us to detect crossing a page boundary.
1061	 * When %r10 goes positive we have crossed a page boundary and
1062	 * need to do a nibble.
1063	 */
1064	lea	9(%rdi), %r10
1065	and	$0xfff, %r10	/* offset into 4K page */
1066	sub	$0x1000, %r10	/* subtract 4K pagesize */
1067	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
1068
1069	.p2align 4
1070LABEL(loop_ashr_9_use):
	/* Main loop, unrolled twice.  %rdx is the running byte index; each
	   step compares the 16 bytes at %rsi + %rdx with the 16 bytes that
	   start 9 bytes into the previous %rdi block.  */
1071	add	$16, %r10
1072	jg	LABEL(nibble_ashr_9_use)
1073
1074LABEL(nibble_ashr_9_restart_use):
1075	movdqa	(%rdi, %rdx), %xmm0
1076
1077	palignr $9, -16(%rdi, %rdx), %xmm0	/* 16 bytes at %rdi+%rdx-16+9 */
1078# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1079	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1080# else
1081	movdqa	(%rsi,%rdx), %xmm1
1082	TOLOWER (%xmm0, %xmm1)
1083	pcmpistri $0x1a, %xmm1, %xmm0
1084# endif
1085	jbe	LABEL(exit_use)		/* CF or ZF set: mismatch or EOS */
1086# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1087	sub	$16, %r11		/* 16 more bytes accounted for */
1088	jbe	LABEL(strcmp_exitz)
1089# endif
1090
	/* Second, identical step of the unrolled loop.  */
1091	add	$16, %rdx
1092	add	$16, %r10
1093	jg	LABEL(nibble_ashr_9_use)
1094
1095	movdqa	(%rdi, %rdx), %xmm0
1096	palignr $9, -16(%rdi, %rdx), %xmm0
1097# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1098	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1099# else
1100	movdqa	(%rsi,%rdx), %xmm1
1101	TOLOWER (%xmm0, %xmm1)
1102	pcmpistri $0x1a, %xmm1, %xmm0
1103# endif
1104	jbe	LABEL(exit_use)
1105# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1106	sub	$16, %r11
1107	jbe	LABEL(strcmp_exitz)
1108# endif
1109	add	$16, %rdx
1110	jmp	LABEL(loop_ashr_9_use)
1111
1112	.p2align 4
1113LABEL(nibble_ashr_9_use):
	/* Page-boundary path.  Rewind %r10 by one page, then look only at
	   the bytes of the previous %rdi block that are still uncompared:
	   psrldq drops the low 9 bytes and shifts zeros in at the top.
	   pcmpistri $0x3a with the same register as both operands leaves
	   the index of the first NUL byte in %ecx.  %ecx > 6 means none of
	   the 7 remaining bytes is NUL, so the loop resumes at the restart
	   label; otherwise control goes to nibble_ashr_exit_use.  */
1114	sub	$0x1000, %r10
1115	movdqa	-16(%rdi, %rdx), %xmm0
1116	psrldq	$9, %xmm0
1117	pcmpistri      $0x3a,%xmm0, %xmm0
1118# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The length limit is reached within these bytes.  */
1119	cmp	%r11, %rcx
1120	jae	LABEL(nibble_ashr_exit_use)
1121# endif
1122	cmp	$6, %ecx
1123	ja	LABEL(nibble_ashr_9_restart_use)
1124
1125	jmp	LABEL(nibble_ashr_exit_use)
1126
1127/*
1128 *  The following cases will be handled by ashr_10
1129 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1130 *        n(6~15)          n - 6		  9(15 +(n - 6) - n)         ashr_10
1131 */
/*
 * Entry code first: the leading chunk is checked with pcmpeqb/pmovmskb and
 * control leaves through less32bytes when the shifted masks in %edx and
 * %r9d differ.  Otherwise execution falls into loop_ashr_10_use, which
 * rebuilds each 16-byte window of the %rdi side with palignr $10 from two
 * adjacent movdqa loads and compares it, via pcmpistri, with 16 bytes
 * loaded from the %rsi side.
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax and %r8d are prepared by code
 * earlier in the file, and TOLOWER is a macro defined there -- confirm
 * their exact meaning against that code.
 */
1132	.p2align 4
1133LABEL(ashr_10):
1134	pslldq	$6, %xmm2		/* byte shift left by 16 - 10 = 6 */
1135	TOLOWER (%xmm1, %xmm2)
1136	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where both are equal */
1137	psubb	%xmm0, %xmm2
1138	pmovmskb %xmm2, %r9d		/* top bit of every byte -> 16-bit mask */
1139	shr	%cl, %edx
1140	shr	%cl, %r9d
1141	sub	%r9d, %edx
1142	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
1143	movdqa	(%rdi), %xmm3
1144
	/*
	 * Expands to nothing unless built as strncmp/strncasecmp.  In those
	 * builds (see the macro near the top of the file) the counter %r11
	 * becomes %rcx + %r11 - 16, and strcmp_exitz is taken if that value
	 * is 0 or larger than the old counter.
	 */
1145	UPDATE_STRNCMP_COUNTER
1146
1147	mov	$16, %rcx	/* index for loads */
1148	mov	$10, %r9d	/* byte position left over from less32bytes case */
1149	/*
1150	 * Set up %r10 so that we can detect crossing a page boundary.
1151	 * When %r10 goes positive we have crossed a page boundary and
1152	 * need to do a nibble (see nibble_ashr_10_use below).
1153	 */
1154	lea	10(%rdi), %r10
1155	and	$0xfff, %r10	/* offset into 4K page */
1156	sub	$0x1000, %r10	/* subtract 4K pagesize */
1157	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
1158
	/*
	 * Main loop, unrolled twice.  %rdx is the offset applied to both %rdi
	 * and %rsi.  %r10 grows by 16 per 16 bytes compared; once it is
	 * positive, nibble_ashr_10_use runs before the next load from
	 * (%rdi,%rdx).  The jbe after each pcmpistri is taken when CF or ZF
	 * is set; the four possible outcomes are described in the 0x1a
	 * comment near the top of the file.
	 */
1159	.p2align 4
1160LABEL(loop_ashr_10_use):
1161	add	$16, %r10
1162	jg	LABEL(nibble_ashr_10_use)
1163
1164LABEL(nibble_ashr_10_restart_use):
1165	movdqa	(%rdi, %rdx), %xmm0
1166	palignr $10, -16(%rdi, %rdx), %xmm0	/* bytes 10..25 of the 32 at -16(%rdi,%rdx) */
1167# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1168	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1169# else
1170	movdqa	(%rsi,%rdx), %xmm1
1171	TOLOWER (%xmm0, %xmm1)
1172	pcmpistri $0x1a, %xmm1, %xmm0
1173# endif
1174	jbe	LABEL(exit_use)		/* pcmpistri set CF or ZF: leave the loop */
1175# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1176	sub	$16, %r11		/* 16 more bytes accounted for */
1177	jbe	LABEL(strcmp_exitz)	/* count used up: return 0 */
1178# endif
1179
1180	add	$16, %rdx
1181	add	$16, %r10
1182	jg	LABEL(nibble_ashr_10_use)
1183
	/* Second copy of the loop body.  */
1184	movdqa	(%rdi, %rdx), %xmm0
1185	palignr $10, -16(%rdi, %rdx), %xmm0
1186# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1187	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1188# else
1189	movdqa	(%rsi,%rdx), %xmm1
1190	TOLOWER (%xmm0, %xmm1)
1191	pcmpistri $0x1a, %xmm1, %xmm0
1192# endif
1193	jbe	LABEL(exit_use)
1194# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1195	sub	$16, %r11
1196	jbe	LABEL(strcmp_exitz)
1197# endif
1198	add	$16, %rdx
1199	jmp	LABEL(loop_ashr_10_use)
1200
1201	.p2align 4
1202LABEL(nibble_ashr_10_use):
	/*
	 * %r10 went positive.  Subtract another 4K from it, then inspect the
	 * 6 bytes of the previous %rdi chunk that have not been compared yet:
	 * psrldq $10 moves them to the low end of %xmm0 and zero-fills the
	 * rest.  pcmpistri $0x3a with %xmm0 as both operands leaves the index
	 * of the first NUL byte in %ecx.  An index above 5 means none of
	 * those 6 bytes is NUL, so the loop resumes at
	 * nibble_ashr_10_restart_use and loads the next chunk.  Otherwise the
	 * compare is finished in nibble_ashr_exit_use without that load.
	 */
1203	sub	$0x1000, %r10
1204	movdqa	-16(%rdi, %rdx), %xmm0
1205	psrldq	$10, %xmm0
1206	pcmpistri      $0x3a,%xmm0, %xmm0
1207# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1208	cmp	%r11, %rcx		/* NUL index vs. remaining count in %r11 */
1209	jae	LABEL(nibble_ashr_exit_use)	/* count ends first: no further load */
1210# endif
1211	cmp	$5, %ecx
1212	ja	LABEL(nibble_ashr_10_restart_use)
1213
1214	jmp	LABEL(nibble_ashr_exit_use)
1215
1216/*
1217 *  The following cases will be handled by ashr_11
1218 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1219 *        n(5~15)          n - 5		  10(15 +(n - 5) - n)         ashr_11
1220 */
/*
 * Entry code first: the leading chunk is checked with pcmpeqb/pmovmskb and
 * control leaves through less32bytes when the shifted masks in %edx and
 * %r9d differ.  Otherwise execution falls into loop_ashr_11_use, which
 * rebuilds each 16-byte window of the %rdi side with palignr $11 from two
 * adjacent movdqa loads and compares it, via pcmpistri, with 16 bytes
 * loaded from the %rsi side.
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax and %r8d are prepared by code
 * earlier in the file, and TOLOWER is a macro defined there -- confirm
 * their exact meaning against that code.
 */
1221	.p2align 4
1222LABEL(ashr_11):
1223	pslldq	$5, %xmm2		/* byte shift left by 16 - 11 = 5 */
1224	TOLOWER (%xmm1, %xmm2)
1225	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where both are equal */
1226	psubb	%xmm0, %xmm2
1227	pmovmskb %xmm2, %r9d		/* top bit of every byte -> 16-bit mask */
1228	shr	%cl, %edx
1229	shr	%cl, %r9d
1230	sub	%r9d, %edx
1231	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
1232	movdqa	(%rdi), %xmm3
1233
	/*
	 * Expands to nothing unless built as strncmp/strncasecmp.  In those
	 * builds (see the macro near the top of the file) the counter %r11
	 * becomes %rcx + %r11 - 16, and strcmp_exitz is taken if that value
	 * is 0 or larger than the old counter.
	 */
1234	UPDATE_STRNCMP_COUNTER
1235
1236	mov	$16, %rcx	/* index for loads */
1237	mov	$11, %r9d	/* byte position left over from less32bytes case */
1238	/*
1239	 * Set up %r10 so that we can detect crossing a page boundary.
1240	 * When %r10 goes positive we have crossed a page boundary and
1241	 * need to do a nibble (see nibble_ashr_11_use below).
1242	 */
1243	lea	11(%rdi), %r10
1244	and	$0xfff, %r10	/* offset into 4K page */
1245	sub	$0x1000, %r10	/* subtract 4K pagesize */
1246	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
1247
	/*
	 * Main loop, unrolled twice.  %rdx is the offset applied to both %rdi
	 * and %rsi.  %r10 grows by 16 per 16 bytes compared; once it is
	 * positive, nibble_ashr_11_use runs before the next load from
	 * (%rdi,%rdx).  The jbe after each pcmpistri is taken when CF or ZF
	 * is set; the four possible outcomes are described in the 0x1a
	 * comment near the top of the file.
	 */
1248	.p2align 4
1249LABEL(loop_ashr_11_use):
1250	add	$16, %r10
1251	jg	LABEL(nibble_ashr_11_use)
1252
1253LABEL(nibble_ashr_11_restart_use):
1254	movdqa	(%rdi, %rdx), %xmm0
1255	palignr $11, -16(%rdi, %rdx), %xmm0	/* bytes 11..26 of the 32 at -16(%rdi,%rdx) */
1256# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1257	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1258# else
1259	movdqa	(%rsi,%rdx), %xmm1
1260	TOLOWER (%xmm0, %xmm1)
1261	pcmpistri $0x1a, %xmm1, %xmm0
1262# endif
1263	jbe	LABEL(exit_use)		/* pcmpistri set CF or ZF: leave the loop */
1264# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1265	sub	$16, %r11		/* 16 more bytes accounted for */
1266	jbe	LABEL(strcmp_exitz)	/* count used up: return 0 */
1267# endif
1268
1269	add	$16, %rdx
1270	add	$16, %r10
1271	jg	LABEL(nibble_ashr_11_use)
1272
	/* Second copy of the loop body.  */
1273	movdqa	(%rdi, %rdx), %xmm0
1274	palignr $11, -16(%rdi, %rdx), %xmm0
1275# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1276	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1277# else
1278	movdqa	(%rsi,%rdx), %xmm1
1279	TOLOWER (%xmm0, %xmm1)
1280	pcmpistri $0x1a, %xmm1, %xmm0
1281# endif
1282	jbe	LABEL(exit_use)
1283# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1284	sub	$16, %r11
1285	jbe	LABEL(strcmp_exitz)
1286# endif
1287	add	$16, %rdx
1288	jmp	LABEL(loop_ashr_11_use)
1289
1290	.p2align 4
1291LABEL(nibble_ashr_11_use):
	/*
	 * %r10 went positive.  Subtract another 4K from it, then inspect the
	 * 5 bytes of the previous %rdi chunk that have not been compared yet:
	 * psrldq $11 moves them to the low end of %xmm0 and zero-fills the
	 * rest.  pcmpistri $0x3a with %xmm0 as both operands leaves the index
	 * of the first NUL byte in %ecx.  An index above 4 means none of
	 * those 5 bytes is NUL, so the loop resumes at
	 * nibble_ashr_11_restart_use and loads the next chunk.  Otherwise the
	 * compare is finished in nibble_ashr_exit_use without that load.
	 */
1292	sub	$0x1000, %r10
1293	movdqa	-16(%rdi, %rdx), %xmm0
1294	psrldq	$11, %xmm0
1295	pcmpistri      $0x3a,%xmm0, %xmm0
1296# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1297	cmp	%r11, %rcx		/* NUL index vs. remaining count in %r11 */
1298	jae	LABEL(nibble_ashr_exit_use)	/* count ends first: no further load */
1299# endif
1300	cmp	$4, %ecx
1301	ja	LABEL(nibble_ashr_11_restart_use)
1302
1303	jmp	LABEL(nibble_ashr_exit_use)
1304
1305/*
1306 *  The following cases will be handled by ashr_12
1307 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1308 *        n(4~15)          n - 4		  11(15 +(n - 4) - n)         ashr_12
1309 */
/*
 * Entry code first: the leading chunk is checked with pcmpeqb/pmovmskb and
 * control leaves through less32bytes when the shifted masks in %edx and
 * %r9d differ.  Otherwise execution falls into loop_ashr_12_use, which
 * rebuilds each 16-byte window of the %rdi side with palignr $12 from two
 * adjacent movdqa loads and compares it, via pcmpistri, with 16 bytes
 * loaded from the %rsi side.
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax and %r8d are prepared by code
 * earlier in the file, and TOLOWER is a macro defined there -- confirm
 * their exact meaning against that code.
 */
1310	.p2align 4
1311LABEL(ashr_12):
1312	pslldq	$4, %xmm2		/* byte shift left by 16 - 12 = 4 */
1313	TOLOWER (%xmm1, %xmm2)
1314	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where both are equal */
1315	psubb	%xmm0, %xmm2
1316	pmovmskb %xmm2, %r9d		/* top bit of every byte -> 16-bit mask */
1317	shr	%cl, %edx
1318	shr	%cl, %r9d
1319	sub	%r9d, %edx
1320	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
1321	movdqa	(%rdi), %xmm3
1322
	/*
	 * Expands to nothing unless built as strncmp/strncasecmp.  In those
	 * builds (see the macro near the top of the file) the counter %r11
	 * becomes %rcx + %r11 - 16, and strcmp_exitz is taken if that value
	 * is 0 or larger than the old counter.
	 */
1323	UPDATE_STRNCMP_COUNTER
1324
1325	mov	$16, %rcx	/* index for loads */
1326	mov	$12, %r9d	/* byte position left over from less32bytes case */
1327	/*
1328	 * Set up %r10 so that we can detect crossing a page boundary.
1329	 * When %r10 goes positive we have crossed a page boundary and
1330	 * need to do a nibble (see nibble_ashr_12_use below).
1331	 */
1332	lea	12(%rdi), %r10
1333	and	$0xfff, %r10	/* offset into 4K page */
1334	sub	$0x1000, %r10	/* subtract 4K pagesize */
1335	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
1336
	/*
	 * Main loop, unrolled twice.  %rdx is the offset applied to both %rdi
	 * and %rsi.  %r10 grows by 16 per 16 bytes compared; once it is
	 * positive, nibble_ashr_12_use runs before the next load from
	 * (%rdi,%rdx).  The jbe after each pcmpistri is taken when CF or ZF
	 * is set; the four possible outcomes are described in the 0x1a
	 * comment near the top of the file.
	 */
1337	.p2align 4
1338LABEL(loop_ashr_12_use):
1339	add	$16, %r10
1340	jg	LABEL(nibble_ashr_12_use)
1341
1342LABEL(nibble_ashr_12_restart_use):
1343	movdqa	(%rdi, %rdx), %xmm0
1344	palignr $12, -16(%rdi, %rdx), %xmm0	/* bytes 12..27 of the 32 at -16(%rdi,%rdx) */
1345# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1346	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1347# else
1348	movdqa	(%rsi,%rdx), %xmm1
1349	TOLOWER (%xmm0, %xmm1)
1350	pcmpistri $0x1a, %xmm1, %xmm0
1351# endif
1352	jbe	LABEL(exit_use)		/* pcmpistri set CF or ZF: leave the loop */
1353# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1354	sub	$16, %r11		/* 16 more bytes accounted for */
1355	jbe	LABEL(strcmp_exitz)	/* count used up: return 0 */
1356# endif
1357
1358	add	$16, %rdx
1359	add	$16, %r10
1360	jg	LABEL(nibble_ashr_12_use)
1361
	/* Second copy of the loop body.  */
1362	movdqa	(%rdi, %rdx), %xmm0
1363	palignr $12, -16(%rdi, %rdx), %xmm0
1364# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1365	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1366# else
1367	movdqa	(%rsi,%rdx), %xmm1
1368	TOLOWER (%xmm0, %xmm1)
1369	pcmpistri $0x1a, %xmm1, %xmm0
1370# endif
1371	jbe	LABEL(exit_use)
1372# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1373	sub	$16, %r11
1374	jbe	LABEL(strcmp_exitz)
1375# endif
1376	add	$16, %rdx
1377	jmp	LABEL(loop_ashr_12_use)
1378
1379	.p2align 4
1380LABEL(nibble_ashr_12_use):
	/*
	 * %r10 went positive.  Subtract another 4K from it, then inspect the
	 * 4 bytes of the previous %rdi chunk that have not been compared yet:
	 * psrldq $12 moves them to the low end of %xmm0 and zero-fills the
	 * rest.  pcmpistri $0x3a with %xmm0 as both operands leaves the index
	 * of the first NUL byte in %ecx.  An index above 3 means none of
	 * those 4 bytes is NUL, so the loop resumes at
	 * nibble_ashr_12_restart_use and loads the next chunk.  Otherwise the
	 * compare is finished in nibble_ashr_exit_use without that load.
	 */
1381	sub	$0x1000, %r10
1382	movdqa	-16(%rdi, %rdx), %xmm0
1383	psrldq	$12, %xmm0
1384	pcmpistri      $0x3a,%xmm0, %xmm0
1385# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1386	cmp	%r11, %rcx		/* NUL index vs. remaining count in %r11 */
1387	jae	LABEL(nibble_ashr_exit_use)	/* count ends first: no further load */
1388# endif
1389	cmp	$3, %ecx
1390	ja	LABEL(nibble_ashr_12_restart_use)
1391
1392	jmp	LABEL(nibble_ashr_exit_use)
1393
1394/*
1395 *  The following cases will be handled by ashr_13
1396 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1397 *        n(3~15)          n - 3		  12(15 +(n - 3) - n)         ashr_13
1398 */
/*
 * Entry code first: the leading chunk is checked with pcmpeqb/pmovmskb and
 * control leaves through less32bytes when the shifted masks in %edx and
 * %r9d differ.  Otherwise execution falls into loop_ashr_13_use, which
 * rebuilds each 16-byte window of the %rdi side with palignr $13 from two
 * adjacent movdqa loads and compares it, via pcmpistri, with 16 bytes
 * loaded from the %rsi side.
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax and %r8d are prepared by code
 * earlier in the file, and TOLOWER is a macro defined there -- confirm
 * their exact meaning against that code.
 */
1399	.p2align 4
1400LABEL(ashr_13):
1401	pslldq	$3, %xmm2		/* byte shift left by 16 - 13 = 3 */
1402	TOLOWER (%xmm1, %xmm2)
1403	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where both are equal */
1404	psubb	%xmm0, %xmm2
1405	pmovmskb %xmm2, %r9d		/* top bit of every byte -> 16-bit mask */
1406	shr	%cl, %edx
1407	shr	%cl, %r9d
1408	sub	%r9d, %edx
1409	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
1410	movdqa	(%rdi), %xmm3
1411
	/*
	 * Expands to nothing unless built as strncmp/strncasecmp.  In those
	 * builds (see the macro near the top of the file) the counter %r11
	 * becomes %rcx + %r11 - 16, and strcmp_exitz is taken if that value
	 * is 0 or larger than the old counter.
	 */
1412	UPDATE_STRNCMP_COUNTER
1413
1414	mov	$16, %rcx	/* index for loads */
1415	mov	$13, %r9d	/* byte position left over from less32bytes case */
1416	/*
1417	 * Set up %r10 so that we can detect crossing a page boundary.
1418	 * When %r10 goes positive we have crossed a page boundary and
1419	 * need to do a nibble (see nibble_ashr_13_use below).
1420	 */
1421	lea	13(%rdi), %r10
1422	and	$0xfff, %r10	/* offset into 4K page */
1423	sub	$0x1000, %r10	/* subtract 4K pagesize */
1424
1425	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
1426
	/*
	 * Main loop, unrolled twice.  %rdx is the offset applied to both %rdi
	 * and %rsi.  %r10 grows by 16 per 16 bytes compared; once it is
	 * positive, nibble_ashr_13_use runs before the next load from
	 * (%rdi,%rdx).  The jbe after each pcmpistri is taken when CF or ZF
	 * is set; the four possible outcomes are described in the 0x1a
	 * comment near the top of the file.
	 */
1427	.p2align 4
1428LABEL(loop_ashr_13_use):
1429	add	$16, %r10
1430	jg	LABEL(nibble_ashr_13_use)
1431
1432LABEL(nibble_ashr_13_restart_use):
1433	movdqa	(%rdi, %rdx), %xmm0
1434	palignr $13, -16(%rdi, %rdx), %xmm0	/* bytes 13..28 of the 32 at -16(%rdi,%rdx) */
1435# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1436	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1437# else
1438	movdqa	(%rsi,%rdx), %xmm1
1439	TOLOWER (%xmm0, %xmm1)
1440	pcmpistri $0x1a, %xmm1, %xmm0
1441# endif
1442	jbe	LABEL(exit_use)		/* pcmpistri set CF or ZF: leave the loop */
1443# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1444	sub	$16, %r11		/* 16 more bytes accounted for */
1445	jbe	LABEL(strcmp_exitz)	/* count used up: return 0 */
1446# endif
1447
1448	add	$16, %rdx
1449	add	$16, %r10
1450	jg	LABEL(nibble_ashr_13_use)
1451
	/* Second copy of the loop body.  */
1452	movdqa	(%rdi, %rdx), %xmm0
1453	palignr $13, -16(%rdi, %rdx), %xmm0
1454# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1455	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1456# else
1457	movdqa	(%rsi,%rdx), %xmm1
1458	TOLOWER (%xmm0, %xmm1)
1459	pcmpistri $0x1a, %xmm1, %xmm0
1460# endif
1461	jbe	LABEL(exit_use)
1462# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1463	sub	$16, %r11
1464	jbe	LABEL(strcmp_exitz)
1465# endif
1466	add	$16, %rdx
1467	jmp	LABEL(loop_ashr_13_use)
1468
1469	.p2align 4
1470LABEL(nibble_ashr_13_use):
	/*
	 * %r10 went positive.  Subtract another 4K from it, then inspect the
	 * 3 bytes of the previous %rdi chunk that have not been compared yet:
	 * psrldq $13 moves them to the low end of %xmm0 and zero-fills the
	 * rest.  pcmpistri $0x3a with %xmm0 as both operands leaves the index
	 * of the first NUL byte in %ecx.  An index above 2 means none of
	 * those 3 bytes is NUL, so the loop resumes at
	 * nibble_ashr_13_restart_use and loads the next chunk.  Otherwise the
	 * compare is finished in nibble_ashr_exit_use without that load.
	 */
1471	sub	$0x1000, %r10
1472	movdqa	-16(%rdi, %rdx), %xmm0
1473	psrldq	$13, %xmm0
1474	pcmpistri      $0x3a,%xmm0, %xmm0
1475# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1476	cmp	%r11, %rcx		/* NUL index vs. remaining count in %r11 */
1477	jae	LABEL(nibble_ashr_exit_use)	/* count ends first: no further load */
1478# endif
1479	cmp	$2, %ecx
1480	ja	LABEL(nibble_ashr_13_restart_use)
1481
1482	jmp	LABEL(nibble_ashr_exit_use)
1483
1484/*
1485 *  The following cases will be handled by ashr_14
1486 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1487 *        n(2~15)          n - 2		  13(15 +(n - 2) - n)         ashr_14
1488 */
/*
 * Entry code first: the leading chunk is checked with pcmpeqb/pmovmskb and
 * control leaves through less32bytes when the shifted masks in %edx and
 * %r9d differ.  Otherwise execution falls into loop_ashr_14_use, which
 * rebuilds each 16-byte window of the %rdi side with palignr $14 from two
 * adjacent movdqa loads and compares it, via pcmpistri, with 16 bytes
 * loaded from the %rsi side.
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax and %r8d are prepared by code
 * earlier in the file, and TOLOWER is a macro defined there -- confirm
 * their exact meaning against that code.
 */
1489	.p2align 4
1490LABEL(ashr_14):
1491	pslldq  $2, %xmm2		/* byte shift left by 16 - 14 = 2 */
1492	TOLOWER (%xmm1, %xmm2)
1493	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where both are equal */
1494	psubb	%xmm0, %xmm2
1495	pmovmskb %xmm2, %r9d		/* top bit of every byte -> 16-bit mask */
1496	shr	%cl, %edx
1497	shr	%cl, %r9d
1498	sub	%r9d, %edx
1499	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
1500	movdqa	(%rdi), %xmm3
1501
	/*
	 * Expands to nothing unless built as strncmp/strncasecmp.  In those
	 * builds (see the macro near the top of the file) the counter %r11
	 * becomes %rcx + %r11 - 16, and strcmp_exitz is taken if that value
	 * is 0 or larger than the old counter.
	 */
1502	UPDATE_STRNCMP_COUNTER
1503
1504	mov	$16, %rcx	/* index for loads */
1505	mov	$14, %r9d	/* byte position left over from less32bytes case */
1506	/*
1507	 * Set up %r10 so that we can detect crossing a page boundary.
1508	 * When %r10 goes positive we have crossed a page boundary and
1509	 * need to do a nibble (see nibble_ashr_14_use below).
1510	 */
1511	lea	14(%rdi), %r10
1512	and	$0xfff, %r10	/* offset into 4K page */
1513	sub	$0x1000, %r10	/* subtract 4K pagesize */
1514
1515	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
1516
	/*
	 * Main loop, unrolled twice.  %rdx is the offset applied to both %rdi
	 * and %rsi.  %r10 grows by 16 per 16 bytes compared; once it is
	 * positive, nibble_ashr_14_use runs before the next load from
	 * (%rdi,%rdx).  The jbe after each pcmpistri is taken when CF or ZF
	 * is set; the four possible outcomes are described in the 0x1a
	 * comment near the top of the file.
	 */
1517	.p2align 4
1518LABEL(loop_ashr_14_use):
1519	add	$16, %r10
1520	jg	LABEL(nibble_ashr_14_use)
1521
1522LABEL(nibble_ashr_14_restart_use):
1523	movdqa	(%rdi, %rdx), %xmm0
1524	palignr $14, -16(%rdi, %rdx), %xmm0	/* bytes 14..29 of the 32 at -16(%rdi,%rdx) */
1525# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1526	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1527# else
1528	movdqa	(%rsi,%rdx), %xmm1
1529	TOLOWER (%xmm0, %xmm1)
1530	pcmpistri $0x1a, %xmm1, %xmm0
1531# endif
1532	jbe	LABEL(exit_use)		/* pcmpistri set CF or ZF: leave the loop */
1533# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1534	sub	$16, %r11		/* 16 more bytes accounted for */
1535	jbe	LABEL(strcmp_exitz)	/* count used up: return 0 */
1536# endif
1537
1538	add	$16, %rdx
1539	add	$16, %r10
1540	jg	LABEL(nibble_ashr_14_use)
1541
	/* Second copy of the loop body.  */
1542	movdqa	(%rdi, %rdx), %xmm0
1543	palignr $14, -16(%rdi, %rdx), %xmm0
1544# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1545	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1546# else
1547	movdqa	(%rsi,%rdx), %xmm1
1548	TOLOWER (%xmm0, %xmm1)
1549	pcmpistri $0x1a, %xmm1, %xmm0
1550# endif
1551	jbe	LABEL(exit_use)
1552# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1553	sub	$16, %r11
1554	jbe	LABEL(strcmp_exitz)
1555# endif
1556	add	$16, %rdx
1557	jmp	LABEL(loop_ashr_14_use)
1558
1559	.p2align 4
1560LABEL(nibble_ashr_14_use):
	/*
	 * %r10 went positive.  Subtract another 4K from it, then inspect the
	 * 2 bytes of the previous %rdi chunk that have not been compared yet:
	 * psrldq $14 moves them to the low end of %xmm0 and zero-fills the
	 * rest.  pcmpistri $0x3a with %xmm0 as both operands leaves the index
	 * of the first NUL byte in %ecx.  An index above 1 means neither of
	 * those 2 bytes is NUL, so the loop resumes at
	 * nibble_ashr_14_restart_use and loads the next chunk.  Otherwise the
	 * compare is finished in nibble_ashr_exit_use without that load.
	 */
1561	sub	$0x1000, %r10
1562	movdqa	-16(%rdi, %rdx), %xmm0
1563	psrldq	$14, %xmm0
1564	pcmpistri      $0x3a,%xmm0, %xmm0
1565# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1566	cmp	%r11, %rcx		/* NUL index vs. remaining count in %r11 */
1567	jae	LABEL(nibble_ashr_exit_use)	/* count ends first: no further load */
1568# endif
1569	cmp	$1, %ecx
1570	ja	LABEL(nibble_ashr_14_restart_use)
1571
1572	jmp	LABEL(nibble_ashr_exit_use)
1573
1574/*
1575 *  The following cases will be handled by ashr_15
1576 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1577 *        n(1~15)          n - 1		  14(15 +(n - 1) - n)         ashr_15
1578 */
/*
 * Entry code first: the leading chunk is checked with pcmpeqb/pmovmskb and
 * control leaves through less32bytes when the shifted masks in %edx and
 * %r9d differ.  Otherwise execution falls into loop_ashr_15_use, which
 * rebuilds each 16-byte window of the %rdi side with palignr $15 from two
 * adjacent movdqa loads and compares it, via pcmpistri, with 16 bytes
 * loaded from the %rsi side.
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax and %r8d are prepared by code
 * earlier in the file, and TOLOWER is a macro defined there -- confirm
 * their exact meaning against that code.
 */
1579	.p2align 4
1580LABEL(ashr_15):
1581	pslldq	$1, %xmm2		/* byte shift left by 16 - 15 = 1 */
1582	TOLOWER (%xmm1, %xmm2)
1583	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where both are equal */
1584	psubb	%xmm0, %xmm2
1585	pmovmskb %xmm2, %r9d		/* top bit of every byte -> 16-bit mask */
1586	shr	%cl, %edx
1587	shr	%cl, %r9d
1588	sub	%r9d, %edx
1589	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
1590
1591	movdqa	(%rdi), %xmm3
1592
	/*
	 * Expands to nothing unless built as strncmp/strncasecmp.  In those
	 * builds (see the macro near the top of the file) the counter %r11
	 * becomes %rcx + %r11 - 16, and strcmp_exitz is taken if that value
	 * is 0 or larger than the old counter.
	 */
1593	UPDATE_STRNCMP_COUNTER
1594
1595	mov	$16, %rcx	/* index for loads */
1596	mov	$15, %r9d	/* byte position left over from less32bytes case */
1597	/*
1598	 * Set up %r10 so that we can detect crossing a page boundary.
1599	 * When %r10 goes positive we have crossed a page boundary and
1600	 * need to do a nibble (see nibble_ashr_15_use below).
1601	 */
1602	lea	15(%rdi), %r10
1603	and	$0xfff, %r10	/* offset into 4K page */
1604
1605	sub	$0x1000, %r10	/* subtract 4K pagesize */
1606
1607	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
1608
	/*
	 * Main loop, unrolled twice.  %rdx is the offset applied to both %rdi
	 * and %rsi.  %r10 grows by 16 per 16 bytes compared; once it is
	 * positive, nibble_ashr_15_use runs before the next load from
	 * (%rdi,%rdx).  The jbe after each pcmpistri is taken when CF or ZF
	 * is set; the four possible outcomes are described in the 0x1a
	 * comment near the top of the file.
	 */
1609	.p2align 4
1610LABEL(loop_ashr_15_use):
1611	add	$16, %r10
1612	jg	LABEL(nibble_ashr_15_use)
1613
1614LABEL(nibble_ashr_15_restart_use):
1615	movdqa	(%rdi, %rdx), %xmm0
1616	palignr $15, -16(%rdi, %rdx), %xmm0	/* bytes 15..30 of the 32 at -16(%rdi,%rdx) */
1617# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1618	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1619# else
1620	movdqa	(%rsi,%rdx), %xmm1
1621	TOLOWER (%xmm0, %xmm1)
1622	pcmpistri $0x1a, %xmm1, %xmm0
1623# endif
1624	jbe	LABEL(exit_use)		/* pcmpistri set CF or ZF: leave the loop */
1625# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1626	sub	$16, %r11		/* 16 more bytes accounted for */
1627	jbe	LABEL(strcmp_exitz)	/* count used up: return 0 */
1628# endif
1629
1630	add	$16, %rdx
1631	add	$16, %r10
1632	jg	LABEL(nibble_ashr_15_use)
1633
	/* Second copy of the loop body.  */
1634	movdqa	(%rdi, %rdx), %xmm0
1635	palignr $15, -16(%rdi, %rdx), %xmm0
1636# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1637	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1638# else
1639	movdqa	(%rsi,%rdx), %xmm1
1640	TOLOWER (%xmm0, %xmm1)
1641	pcmpistri $0x1a, %xmm1, %xmm0
1642# endif
1643	jbe	LABEL(exit_use)
1644# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1645	sub	$16, %r11
1646	jbe	LABEL(strcmp_exitz)
1647# endif
1648	add	$16, %rdx
1649	jmp	LABEL(loop_ashr_15_use)
1650
1651	.p2align 4
1652LABEL(nibble_ashr_15_use):
	/*
	 * %r10 went positive.  Subtract another 4K from it, then inspect the
	 * single byte of the previous %rdi chunk that has not been compared
	 * yet: psrldq $15 moves it to the low end of %xmm0 and zero-fills the
	 * rest.  pcmpistri $0x3a with %xmm0 as both operands leaves the index
	 * of the first NUL byte in %ecx.  A nonzero index means that byte is
	 * not NUL, so the loop resumes at nibble_ashr_15_restart_use and
	 * loads the next chunk.  Otherwise execution falls through into
	 * nibble_ashr_exit_use, which follows directly, and the compare is
	 * finished without that load.
	 */
1653	sub	$0x1000, %r10
1654	movdqa	-16(%rdi, %rdx), %xmm0
1655	psrldq	$15, %xmm0
1656	pcmpistri      $0x3a,%xmm0, %xmm0
1657# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1658	cmp	%r11, %rcx		/* NUL index vs. remaining count in %r11 */
1659	jae	LABEL(nibble_ashr_exit_use)	/* count ends first: no further load */
1660# endif
1661	cmp	$0, %ecx
1662	ja	LABEL(nibble_ashr_15_restart_use)
1663
1664LABEL(nibble_ashr_exit_use):
	/*
	 * Final compare for the nibble paths.  %xmm0 holds the zero-filled
	 * rest of the last %rdi chunk, as prepared by the nibble_ashr_N_use
	 * code, and is compared once more against 16 bytes at (%rsi,%rdx).
	 * Execution then falls through the alignment padding into exit_use,
	 * which consumes the flags and %ecx produced by this pcmpistri.
	 */
1665# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1666	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0
1667# else
1668	movdqa	(%rsi,%rdx), %xmm1
1669	TOLOWER (%xmm0, %xmm1)
1670	pcmpistri $0x1a, %xmm1, %xmm0
1671# endif
1672	.p2align 4
1673LABEL(exit_use):
	/*
	 * Exit shared by the loop_ashr_N_use loops.  On entry the flags and
	 * %ecx come from a pcmpistri $0x1a, %rdx is the loop offset and %r9
	 * the shift count N loaded by the ashr_N entry code.
	 *   CF clear: no differing byte was reported, so return 0.
	 *   CF set:   %rcx is the index of the first differing byte within
	 *             the window.  The length-limited variants return 0 if
	 *             that index is not below the remaining count in %r11.
	 * %rdi is moved to %rdi - 16 + %r9 so that (%rdi,%rdx) and (%rsi,%rdx)
	 * address the two bytes that were compared against each other.
	 */
1674	jnc	LABEL(strcmp_exitz)
1675# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1676	sub	%rcx, %r11
1677	jbe	LABEL(strcmp_exitz)
1678# endif
1679	add	%rcx, %rdx		/* offset of the differing byte */
1680	lea	-16(%rdi, %r9), %rdi
1681	movzbl	(%rdi, %rdx), %eax
1682	movzbl	(%rsi, %rdx), %edx
1683	test	%r8d, %r8d
1684	jz	LABEL(ret_use)
1685	xchg	%eax, %edx		/* %r8d nonzero: swap the two bytes back */
1686LABEL(ret_use):
1687# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/*
	 * Case-insensitive variants: map both bytes through the tolower
	 * table (4-byte entries, indexed by the zero-extended byte value).
	 */
1688	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
1689	movl	(%rcx,%rdx,4), %edx
1690	movl	(%rcx,%rax,4), %eax
1691# endif
1692
1693	sub	%edx, %eax		/* return value: difference of the two bytes */
1694	ret
1695
1696LABEL(less32bytes):
	/*
	 * Reached from the ashr_N entry code when the first chunk already
	 * decides the result.  %rax and %rcx are added to %rdi and %rsi to
	 * get back to the exact string addresses, the two pointers are
	 * swapped back if %r8d is nonzero, and execution continues at
	 * ret/less16bytes with the bit mask still in %rdx.
	 */
1697	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
1698	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
1699	test	%r8d, %r8d
1700	jz	LABEL(ret)
1701	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */
1702
1703	.p2align 4
1704LABEL(ret):
1705LABEL(less16bytes):
	/*
	 * %rdx holds a nonzero bit mask; the index of its lowest set bit is
	 * the offset of the byte pair that decides the result.  The
	 * length-limited variants return 0 if that offset is not below the
	 * remaining count in %r11.
	 * NOTE(review): other jumps to ret/less16bytes are outside this part
	 * of the file; confirm that they also arrive with %rdx nonzero.
	 */
1706	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
1707
1708# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1709	sub	%rdx, %r11
1710	jbe	LABEL(strcmp_exitz)
1711# endif
1712	movzbl	(%rsi, %rdx), %ecx
1713	movzbl	(%rdi, %rdx), %eax
1714
1715# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/*
	 * Case-insensitive variants: map both bytes through the tolower
	 * table (4-byte entries, indexed by the zero-extended byte value).
	 */
1716	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1717	movl	(%rdx,%rcx,4), %ecx
1718	movl	(%rdx,%rax,4), %eax
1719# endif
1720
1721	sub	%ecx, %eax		/* byte at (%rdi) side minus byte at (%rsi) side */
1722	ret
1723
1724LABEL(strcmp_exitz):
	/*
	 * Shared exit that returns 0 in %eax.  It is the target when no
	 * differing byte was found or, in the length-limited variants, when
	 * the count in %r11 ran out first.
	 */
1725	xor	%eax, %eax
1726	ret
1727
1728	.p2align 4
1729	// XXX Same as code above
1730LABEL(Byte0):
	/*
	 * Return the difference between the byte at (%rdi) and the byte at
	 * (%rsi), mapping both through the tolower table first in the
	 * case-insensitive variants.  This repeats the load/lookup/subtract
	 * sequence at the end of less16bytes with a fixed offset of 0.
	 * NOTE(review): the jumps to Byte0 are earlier in the file.
	 */
1731	movzbl	(%rsi), %ecx
1732	movzbl	(%rdi), %eax
1733
1734# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1735	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1736	movl	(%rdx,%rcx,4), %ecx
1737	movl	(%rdx,%rax,4), %eax
1738# endif
1739
1740	sub	%ecx, %eax
1741	ret
1742	cfi_endproc
1743	.size	STRCMP, .-STRCMP
1744
1745# undef UCLOW_reg
1746# undef UCHIGH_reg
1747# undef LCQWORD_reg
1748# undef TOLOWER
1749
1750	/* Put all SSE 4.2 functions together.  */
1751	.section .rodata.SECTION,"a",@progbits
1752	.p2align 3
1753LABEL(unaligned_table):
	/*
	 * Sixteen 32-bit entries, each the distance from this table to one
	 * of the ashr_N entry points: ashr_1 .. ashr_15 first, ashr_0 last.
	 * The entries are label differences rather than absolute addresses.
	 * NOTE(review): the dispatch code that indexes this table is earlier
	 * in the file; confirm the index computation there.
	 */
1754	.int	LABEL(ashr_1) - LABEL(unaligned_table)
1755	.int	LABEL(ashr_2) - LABEL(unaligned_table)
1756	.int	LABEL(ashr_3) - LABEL(unaligned_table)
1757	.int	LABEL(ashr_4) - LABEL(unaligned_table)
1758	.int	LABEL(ashr_5) - LABEL(unaligned_table)
1759	.int	LABEL(ashr_6) - LABEL(unaligned_table)
1760	.int	LABEL(ashr_7) - LABEL(unaligned_table)
1761	.int	LABEL(ashr_8) - LABEL(unaligned_table)
1762	.int	LABEL(ashr_9) - LABEL(unaligned_table)
1763	.int	LABEL(ashr_10) - LABEL(unaligned_table)
1764	.int	LABEL(ashr_11) - LABEL(unaligned_table)
1765	.int	LABEL(ashr_12) - LABEL(unaligned_table)
1766	.int	LABEL(ashr_13) - LABEL(unaligned_table)
1767	.int	LABEL(ashr_14) - LABEL(unaligned_table)
1768	.int	LABEL(ashr_15) - LABEL(unaligned_table)
1769	.int	LABEL(ashr_0) - LABEL(unaligned_table)
1770
1771# undef LABEL
1772# undef SECTION
1773# undef movdqa
1774# undef movdqu
1775# undef pmovmskb
1776# undef pcmpistri
1777# undef psubb
1778# undef pcmpeqb
1779# undef psrldq
1780# undef pslldq
1781# undef palignr
1782# undef pxor
1783# undef D
1784#endif
1785