/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# ifndef STRCMP_ISA
#  define STRCMP_ISA	_avx2
# endif

# include "strcmp-naming.h"

# include <sysdep.h>

# if defined USE_AS_STRCASECMP_L
#  include "locale-defines.h"
# endif

# ifndef STRCMP
#  define STRCMP	__strcmp_avx2
# endif

# define PAGE_SIZE	4096

	/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE	32

# define VMOVU	vmovdqu
# define VMOVA	vmovdqa

# ifdef USE_AS_WCSCMP
	/* Compare packed dwords.  */
#  define VPCMPEQ	vpcmpeqd
	/* Compare packed dwords and store minimum.  */
#  define VPMINU	vpminud
	/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR	4
# else
	/* Compare packed bytes.  */
#  define VPCMPEQ	vpcmpeqb
	/* Compare packed bytes and store minimum.  */
#  define VPMINU	vpminub
	/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR	1
# endif
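	/* Note: vpmovmskb always produces one mask bit per byte.  For
	   USE_AS_WCSCMP a differing or null dword therefore produces four
	   identical, 4-aligned mask bits, so the bit index returned by
	   tzcnt is already a byte offset (a multiple of 4) and can be used
	   to index the strings directly without scaling.  */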

# ifdef USE_AS_STRNCMP
#  define LOOP_REG	r9d
#  define LOOP_REG64	r9

#  define OFFSET_REG8	r9b
#  define OFFSET_REG	r9d
#  define OFFSET_REG64	r9
# else
#  define LOOP_REG	edx
#  define LOOP_REG64	rdx

#  define OFFSET_REG8	dl
#  define OFFSET_REG	edx
#  define OFFSET_REG64	rdx
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# if defined USE_AS_STRNCMP
#  define VEC_OFFSET	0
# else
#  define VEC_OFFSET	(-VEC_SIZE)
# endif

# ifdef USE_AS_STRCASECMP_L
#  define BYTE_LOOP_REG	OFFSET_REG
# else
#  define BYTE_LOOP_REG	ecx
# endif

# ifdef USE_AS_STRCASECMP_L
#  ifdef USE_AS_STRNCMP
#   define LOCALE_REG	rcx
#   define LOCALE_REG_LP	RCX_LP
#  else
#   define LOCALE_REG	rdx
#   define LOCALE_REG_LP	RDX_LP
#  endif
# endif

# define xmmZERO	xmm15
# define ymmZERO	ymm15

# define LCASE_MIN_ymm	%ymm10
# define LCASE_MAX_ymm	%ymm11
# define CASE_ADD_ymm	%ymm12

# define LCASE_MIN_xmm	%xmm10
# define LCASE_MAX_xmm	%xmm11
# define CASE_ADD_xmm	%xmm12

	/* r11 is never used elsewhere so this is safe to maintain.  */
# define TOLOWER_BASE	%r11

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# ifdef USE_AS_STRCASECMP_L
#  define REG(x, y) x ## y
#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)			\
	vpaddb	REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);				\
	vpaddb	REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);				\
	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);			\
	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);			\
	vpandn	REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);			\
	vpandn	REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);			\
	vpaddb	REG(%ext, 8), reg1_in, reg1_out;							\
	vpaddb	REG(%ext, 9), reg2_in, reg2_out
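
	/* How TOLOWER works: adding L(lcase_min) (0x3f in every byte) maps
	   the uppercase ASCII range 0x41..0x5a ('A'..'Z') onto 0x80..0x99,
	   which as signed bytes are exactly the values not greater than
	   L(lcase_max) (0x99).  vpcmpgtb therefore leaves 0x00 for
	   uppercase letters and 0xff for everything else; vpandn then
	   selects L(case_add) (0x20) only for the uppercase bytes, and the
	   final vpaddb adds it, converting them to lowercase.  */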

#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
#  define TOLOWER_ymm(...)	TOLOWER(__VA_ARGS__, ymm)
#  define TOLOWER_xmm(...)	TOLOWER(__VA_ARGS__, xmm)

#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)			\
	TOLOWER	(s1_reg, scratch_reg, s2_reg, s2_reg, ext);					\
	VPCMPEQ	scratch_reg, s2_reg, reg_out

#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)			\
	VMOVU	s2_mem, reg_out;											\
	CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)

#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)

#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)

# else
#  define TOLOWER_gpr(...)
#  define TOLOWER_ymm(...)
#  define TOLOWER_xmm(...)

#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)			\
	VPCMPEQ	s2_reg, s1_reg, reg_out

#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)

#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
# endif

/* Warning!
           wcscmp/wcsncmp have to use SIGNED comparison for elements.
           strcmp/strncmp have to use UNSIGNED comparison for elements.
*/

/* The main idea of the string comparison (byte or dword) using AVX2
   consists of comparing (VPCMPEQ) two ymm vectors.  The comparison
   operates on either packed bytes or dwords depending on USE_AS_WCSCMP.
   In order to check for the null char, the algorithm keeps track of the
   matched bytes/dwords, requiring two more AVX2 instructions (VPMINU
   and VPCMPEQ).  In general, the cost of comparing VEC_SIZE bytes (32
   bytes) is two VPCMPEQ and one VPMINU instruction, together with
   vmovdqu and testl instructions.  The main loop (away from the page
   boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
   same as strcmp, except that a maximum offset is tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned.  */
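
/* Illustrative only (not assembled): a rough C-intrinsics sketch of what
   a single VEC_SIZE step of the plain strcmp flavour computes.  The
   names p1, p2, etc. are ad-hoc and do not correspond to anything in
   this file.

	__m256i v1   = _mm256_loadu_si256 ((const __m256i *) p1);
	__m256i v2   = _mm256_loadu_si256 ((const __m256i *) p2);
	__m256i eq   = _mm256_cmpeq_epi8 (v1, v2);	      // VPCMPEQ
	__m256i nul  = _mm256_cmpeq_epi8 (v1, _mm256_setzero_si256 ());
	__m256i good = _mm256_andnot_si256 (nul, eq);	      // vpandn
	unsigned m   = (unsigned) _mm256_movemask_epi8 (good); // vpmovmskb
	if (m != 0xffffffffU)				      // incl; jz
	  {
	    unsigned i = __builtin_ctz (~m);		      // tzcnt
	    return (unsigned char) p1[i] - (unsigned char) p2[i];
	  }
	// Otherwise: all 32 bytes equal and no terminator; keep going.
*/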

	.section SECTION(.text), "ax", @progbits
	.align	16
	.type	STRCMP, @function
	.globl	STRCMP

# ifdef USE_AS_STRCASECMP_L
ENTRY (STRCASECMP)
	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
	mov	%fs:(%rax), %LOCALE_REG_LP

	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
	.p2align 4
END (STRCASECMP)
	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
# endif

	.p2align 4
STRCMP:
	cfi_startproc
	_CET_ENDBR
	CALL_MCOUNT

# if defined USE_AS_STRCASECMP_L
	/* We have to fall back on the C implementation for locales with
	   encodings not matching ASCII for single bytes.  */
#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
#  else
	mov	(%LOCALE_REG), %RAX_LP
#  endif
	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
	jne	STRCASECMP_L_NONASCII
	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
# endif

# ifdef USE_AS_STRNCMP
	/* Don't overwrite LOCALE_REG (rcx) until we have passed
	   L(one_or_less).  Otherwise we might use the wrong locale in
	   the OVERFLOW_STRCMP (strcasecmp_l).  */
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#  endif
	cmp	$1, %RDX_LP
	/* Signed comparison intentional.  We use this branch to also
	   test cases where length >= 2^63.  These very large sizes can be
	   handled with strcmp as there is no way for that length to
	   actually bound the buffer.  */
	jle	L(one_or_less)
#  ifdef USE_AS_WCSCMP
	movq	%rdx, %rcx

	/* Multiplying the length by sizeof(wchar_t) can overflow.  Check
	   if that is possible.  All cases where overflow is possible are
	   cases where the length is large enough that it can never bound
	   valid memory, so just use wcscmp.  */
	shrq	$56, %rcx
	jnz	OVERFLOW_STRCMP

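	/* The length argument is in wchar_t units; scale it to bytes so
	   all subsequent length checks can use byte counts.  */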
	leaq	(, %rdx, 4), %rdx
#  endif
# endif
	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
# if defined USE_AS_STRCASECMP_L
	.section .rodata.cst32, "aM", @progbits, 32
	.align	32
L(lcase_min):
	.quad	0x3f3f3f3f3f3f3f3f
	.quad	0x3f3f3f3f3f3f3f3f
	.quad	0x3f3f3f3f3f3f3f3f
	.quad	0x3f3f3f3f3f3f3f3f
L(lcase_max):
	.quad	0x9999999999999999
	.quad	0x9999999999999999
	.quad	0x9999999999999999
	.quad	0x9999999999999999
L(case_add):
	.quad	0x2020202020202020
	.quad	0x2020202020202020
	.quad	0x2020202020202020
	.quad	0x2020202020202020
	.previous

	vmovdqa	L(lcase_min)(%rip), LCASE_MIN_ymm
	vmovdqa	L(lcase_max)(%rip), LCASE_MAX_ymm
	vmovdqa	L(case_add)(%rip), CASE_ADD_ymm
# endif
	movl	%edi, %eax
	orl	%esi, %eax
	sall	$20, %eax
	/* Check if s1 or s2 may cross a page in the next 4x VEC loads.  */
	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
	ja	L(page_cross)
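	/* Note: the check above keeps only the page-offset bits of
	   (edi | esi).  Since (a | b) is never smaller than either a or b,
	   it can only overstate how close a pointer is to the page end:
	   real crossings are always taken, and false positives are
	   filtered again in L(page_cross).  */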

L(no_page_cross):
	/* Safe to compare 4x vectors.  */
	VMOVU	(%rdi), %ymm0
	/* 1s where s1 and s2 equal.  Just VPCMPEQ if it's not strcasecmp.
	   Otherwise converts ymm0 and the load from rsi to lowercase.
	   ymm2 is scratch and ymm1 is the return.  */
	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
	/* 1s at null CHAR.  */
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	/* 1s where s1 and s2 equal AND not null CHAR.  */
	vpandn	%ymm1, %ymm2, %ymm1

	/* All 1s -> keep going, any 0s -> return.  */
	vpmovmskb %ymm1, %ecx
# ifdef USE_AS_STRNCMP
	cmpq	$VEC_SIZE, %rdx
	jbe	L(vec_0_test_len)
# endif

	/* All 1s means everything matched.  incl will overflow to zero in
	   the all-equal case; otherwise the carry stops at the position of
	   the first mismatch.  */
	incl	%ecx
	jz	L(more_3x_vec)

	.p2align 4,, 4
L(return_vec_0):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	(%rsi, %rcx), %edx
	je	L(ret0)
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
# endif
L(ret0):
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 8
L(vec_0_test_len):
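	/* ecx has 1s where s1 and s2 match and are non-null.  Invert it so
	   set bits mark a mismatch or null terminator, then use bzhi to
	   clear every bit at or above the length so only differences
	   within the first rdx bytes are considered.  */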
	notl	%ecx
	bzhil	%edx, %ecx, %eax
	jnz	L(return_vec_0)
	/* Align if it will cross a fetch block.  */
	.p2align 4,, 2
L(ret_zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4,, 5
L(one_or_less):
#  ifdef USE_AS_STRCASECMP_L
	/* Set locale argument for strcasecmp.  */
	movq	%LOCALE_REG, %rdx
#  endif
	jb	L(ret_zero)
	/* 'nbe' covers the case where the length is negative (huge
	   unsigned value).  */
	jnbe	OVERFLOW_STRCMP
#  ifdef USE_AS_WCSCMP
	movl	(%rdi), %edx
	xorl	%eax, %eax
	cmpl	(%rsi), %edx
	je	L(ret1)
	setl	%al
	negl	%eax
	orl	$1, %eax
#  else
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
#  endif
L(ret1):
	ret
# endif

	.p2align 4,, 10
L(return_vec_1):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_STRNCMP
	/* rdx must be > VEC_SIZE, so it is safe to subtract without fear
	   of overflow.  */
	addq	$-VEC_SIZE, %rdx
	cmpq	%rcx, %rdx
	jbe	L(ret_zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	VEC_SIZE(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	VEC_SIZE(%rsi, %rcx), %edx
	je	L(ret2)
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	VEC_SIZE(%rdi, %rcx), %eax
	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
# endif
L(ret2):
	VZEROUPPER_RETURN

	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_3):
	salq	$32, %rcx
# endif

L(return_vec_2):
# ifndef USE_AS_STRNCMP
	tzcntl	%ecx, %ecx
# else
	tzcntq	%rcx, %rcx
	cmpq	%rcx, %rdx
	jbe	L(ret_zero)
# endif

# ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
	je	L(ret3)
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
# endif
L(ret3):
	VZEROUPPER_RETURN

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_3):
	tzcntl	%ecx, %ecx
#  ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
	je	L(ret4)
	setl	%al
	negl	%eax
	orl	$1, %eax
#  else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
#  endif
L(ret4):
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 10
L(more_3x_vec):
	/* Safe to compare 4x vectors.  */
	VMOVU	VEC_SIZE(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_1)

# ifdef USE_AS_STRNCMP
	subq	$(VEC_SIZE * 2), %rdx
	jbe	L(ret_zero)
# endif

	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_2)

	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_3)

# ifdef USE_AS_STRNCMP
	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(ret_zero)
# endif

# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.  */
	movl	$2, %r8d

# else
	xorl	%r8d, %r8d
# endif

	/* The prepare labels are various entry points from the page
	   cross logic.  */
L(prepare_loop):

# ifdef USE_AS_STRNCMP
	/* Store N + (VEC_SIZE * 4) and place the check at the beginning
	   of the loop.  */
	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
# endif
L(prepare_loop_no_len):

	/* Align s1 and adjust s2 accordingly.  */
	subq	%rdi, %rsi
	andq	$-(VEC_SIZE * 4), %rdi
	addq	%rdi, %rsi
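	/* rdi is rounded down to a 4 * VEC_SIZE boundary and rsi is moved
	   by the same amount, so the byte distance (rsi - rdi) is
	   unchanged and corresponding characters still line up.  */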

# ifdef USE_AS_STRNCMP
	subq	%rdi, %rdx
# endif

L(prepare_loop_aligned):
	/* eax stores distance from rsi to next page cross. These cases
	   need to be handled specially as the 4x loop could potentially
	   read memory past the length of s1 or s2 and across a page
	   boundary.  */
	movl	$-(VEC_SIZE * 4), %eax
	subl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax

	/* Loop 4x comparisons at a time.  */
	.p2align 4
L(loop):

	/* End condition for strncmp.  */
# ifdef USE_AS_STRNCMP
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(ret_zero)
# endif

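	/* Note: `subq $-(VEC_SIZE * 4)` is equivalent to adding
	   VEC_SIZE * 4; the negative constant fits in a sign-extended
	   8-bit immediate while +128 does not, so it encodes shorter.  */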
	subq	$-(VEC_SIZE * 4), %rdi
	subq	$-(VEC_SIZE * 4), %rsi

	/* Check if rsi loads will cross a page boundary.  */
	addl	$-(VEC_SIZE * 4), %eax
	jnb	L(page_cross_during_loop)

	/* Loop entry after handling page cross during loop.  */
L(loop_skip_page_cross_check):
	VMOVA	(VEC_SIZE * 0)(%rdi), %ymm0
	VMOVA	(VEC_SIZE * 1)(%rdi), %ymm2
	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6

	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
	CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)

	/* A CHAR becomes 0 where there is a mismatch or a null CHAR in
	   s1, and stays non-zero otherwise.  */
	vpand	%ymm0, %ymm1, %ymm1


	vpand	%ymm2, %ymm3, %ymm3
	vpand	%ymm4, %ymm5, %ymm5
	vpand	%ymm6, %ymm7, %ymm7

	VPMINU	%ymm1, %ymm3, %ymm3
	VPMINU	%ymm5, %ymm7, %ymm7

	/* Reduce all 0 CHARs for the 4x VEC into ymm7.  */
	VPMINU	%ymm3, %ymm7, %ymm7

	/* If any 0 CHAR then done.  */
	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
	vpmovmskb %ymm7, %LOOP_REG
	testl	%LOOP_REG, %LOOP_REG
	jz	L(loop)

	/* Find which VEC has the mismatch or end of string.  */
	VPCMPEQ	%ymm1, %ymmZERO, %ymm1
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_0_end)


	VPCMPEQ	%ymm3, %ymmZERO, %ymm3
	vpmovmskb %ymm3, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_1_end)

L(return_vec_2_3_end):
# ifdef USE_AS_STRNCMP
	subq	$(VEC_SIZE * 2), %rdx
	jbe	L(ret_zero_end)
# endif

	VPCMPEQ	%ymm5, %ymmZERO, %ymm5
	vpmovmskb %ymm5, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_2_end)

	/* LOOP_REG contains matches for null/mismatch from the loop.  If
	   VEC 0, 1, and 2 all have no null and no mismatches then the
	   mismatch must entirely be from VEC 3, which is fully
	   represented by LOOP_REG.  */
	tzcntl	%LOOP_REG, %LOOP_REG

# ifdef USE_AS_STRNCMP
	subl	$-(VEC_SIZE), %LOOP_REG
	cmpq	%LOOP_REG64, %rdx
	jbe	L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
	je	L(ret5)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret5):
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_end):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif


	/* The L(return_vec_N_end) labels differ from L(return_vec_N) in
	   that they use the value of `r8` to negate the return value.
	   This is because the page cross logic can swap `rdi` and
	   `rsi`.  */
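	/* Mechanically, for strcmp/strncmp the pair `xorl %r8d, %eax;
	   subl %r8d, %eax` computes (eax ^ r8) - r8, which is a no-op when
	   r8 is 0 and a two's complement negation when r8 is -1.  For
	   wcscmp, where only the sign of the result matters, r8 is 2 (not
	   swapped) or -4 (swapped), so the xor alone yields a result with
	   the correct sign that can never be zero.  */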
	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_1_end):
	salq	$32, %rcx
# endif
L(return_vec_0_end):
# ifndef USE_AS_STRNCMP
	tzcntl	%ecx, %ecx
# else
	tzcntq	%rcx, %rcx
	cmpq	%rcx, %rdx
	jbe	L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	(%rsi, %rcx), %edx
	je	L(ret6)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret6):
	VZEROUPPER_RETURN

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_1_end):
	tzcntl	%ecx, %ecx
#  ifdef USE_AS_WCSCMP
	movl	VEC_SIZE(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	VEC_SIZE(%rsi, %rcx), %edx
	je	L(ret7)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
#  else
	movzbl	VEC_SIZE(%rdi, %rcx), %eax
	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
#  endif
L(ret7):
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 10
L(return_vec_2_end):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %rdx
	jbe	L(ret_zero_page_cross)
# endif
# ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
	je	L(ret11)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret11):
	VZEROUPPER_RETURN


	/* Page cross in rsi in the next 4x VEC.  */

	/* TODO: Improve logic here.  */
	.p2align 4,, 10
L(page_cross_during_loop):
	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */

	/* Optimistically rsi and rdi are both aligned, in which case we
	   don't need any logic here.  */
	cmpl	$-(VEC_SIZE * 4), %eax
	/* Don't adjust eax before jumping back to the loop; that way we
	   will never hit the page cross case again.  */
	je	L(loop_skip_page_cross_check)

	/* Check if we can safely load a VEC.  */
	cmpl	$-(VEC_SIZE * 3), %eax
	jle	L(less_1x_vec_till_page_cross)

	VMOVA	(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_0_end)

	/* If distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
	cmpl	$-(VEC_SIZE * 2), %eax
	jg	L(more_2x_vec_till_page_cross)

	.p2align 4,, 4
L(less_1x_vec_till_page_cross):
	subl	$-(VEC_SIZE * 4), %eax
	/* Guaranteed safe to read from rdi - VEC_SIZE here.  The only
	   concerning case is the first iteration, if the incoming s1 was
	   near the start of a page and s2 near the end.  If s1 was near
	   the start of the page we already aligned to the nearest
	   VEC_SIZE * 4, so it is guaranteed safe to read back -VEC_SIZE.
	   If rdi is truly at the start of a page here, it means the
	   previous page (rdi - VEC_SIZE) has already been loaded earlier
	   so it must be valid.  */
	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
	CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx

	/* Mask of potentially valid bits.  The lower bits can come from
	   out-of-range comparisons (but they are safe regarding page
	   crosses).  */
	movl	$-1, %r10d
	shlxl	%esi, %r10d, %r10d
	notl	%ecx

# ifdef USE_AS_STRNCMP
	cmpq	%rax, %rdx
	jbe	L(return_page_cross_end_check)
# endif
	movl	%eax, %OFFSET_REG
	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax

	andl	%r10d, %ecx
	jz	L(loop_skip_page_cross_check)

	.p2align 4,, 3
L(return_page_cross_end):
	tzcntl	%ecx, %ecx

# ifdef USE_AS_STRNCMP
	leal	-VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
L(return_page_cross_cmp_mem):
# else
	addl	%OFFSET_REG, %ecx
# endif
# ifdef USE_AS_WCSCMP
	movl	VEC_OFFSET(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
	je	L(ret8)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret8):
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_page_cross_end_check):
	andl	%r10d, %ecx
	tzcntl	%ecx, %ecx
	leal	-VEC_SIZE(%rax, %rcx), %ecx
	cmpl	%ecx, %edx
	ja	L(return_page_cross_cmp_mem)
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif


	.p2align 4,, 10
L(more_2x_vec_till_page_cross):
	/* If there is more than 2x VEC until the page cross we will
	   complete a full loop iteration here.  */

	VMOVU	VEC_SIZE(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_1_end)

# ifdef USE_AS_STRNCMP
	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(ret_zero_in_loop_page_cross)
# endif

	subl	$-(VEC_SIZE * 4), %eax

	/* Safe to include comparisons from lower bytes.  */
	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_page_cross_0)

	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_page_cross_1)

# ifdef USE_AS_STRNCMP
	/* Must check the length here, as the length might preclude
	   reading the next page.  */
	cmpq	%rax, %rdx
	jbe	L(ret_zero_in_loop_page_cross)
# endif

	/* Finish the loop.  */
	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6

	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
	vpand	%ymm4, %ymm5, %ymm5
	vpand	%ymm6, %ymm7, %ymm7
	VPMINU	%ymm5, %ymm7, %ymm7
	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
	vpmovmskb %ymm7, %LOOP_REG
	testl	%LOOP_REG, %LOOP_REG
	jnz	L(return_vec_2_3_end)

	/* Best for code size to use an unconditional jmp here.  If this
	   case is hot it would be faster to duplicate the
	   L(return_vec_2_3_end) code as the fall-through and jump back
	   to the loop on the mismatch comparison.  */
	subq	$-(VEC_SIZE * 4), %rdi
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
# ifdef USE_AS_STRNCMP
	subq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_skip_page_cross_check)
L(ret_zero_in_loop_page_cross):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# else
	jmp	L(loop_skip_page_cross_check)
# endif


	.p2align 4,, 10
L(return_vec_page_cross_0):
	addl	$-VEC_SIZE, %eax
L(return_vec_page_cross_1):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_STRNCMP
	leal	-VEC_SIZE(%rax, %rcx), %ecx
	cmpq	%rcx, %rdx
	jbe	L(ret_zero_in_loop_page_cross)
# else
	addl	%eax, %ecx
# endif

# ifdef USE_AS_WCSCMP
	movl	VEC_OFFSET(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
	je	L(ret9)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret9):
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(page_cross):
# ifndef USE_AS_STRNCMP
	/* If both are VEC aligned we don't need any special logic here.
	   Only valid for strcmp, where the stop condition is guaranteed
	   to be reachable by just reading memory.  */
	testl	$((VEC_SIZE - 1) << 20), %eax
	jz	L(no_page_cross)
# endif

	movl	%edi, %eax
	movl	%esi, %ecx
	andl	$(PAGE_SIZE - 1), %eax
	andl	$(PAGE_SIZE - 1), %ecx

	xorl	%OFFSET_REG, %OFFSET_REG

	/* Check which is closer to page cross, s1 or s2.  */
	cmpl	%eax, %ecx
	jg	L(page_cross_s2)

	/* The previous page cross check has false positives.  Check for a
	   true positive, as the page cross logic is very expensive.  */
	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
	jbe	L(no_page_cross)

	/* Set r8 to not interfere with the normal return value (rdi and
	   rsi did not swap).  */
# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.  */
	movl	$2, %r8d
# else
	xorl	%r8d, %r8d
# endif

	/* Check if less than 1x VEC till page cross.  */
	subl	$(VEC_SIZE * 3), %eax
	jg	L(less_1x_vec_till_page)

	/* If more than 1x VEC till page cross, loop through safely
	   loadable memory until within 1x VEC of page cross.  */

	.p2align 4,, 10
L(page_cross_loop):

	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx

	jnz	L(check_ret_vec_page_cross)
	addl	$VEC_SIZE, %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross)
# endif
	addl	$VEC_SIZE, %eax
	jl	L(page_cross_loop)

	subl	%eax, %OFFSET_REG
	/* OFFSET_REG has the distance to the page cross - VEC_SIZE.
	   Guaranteed not to cross the page, so it is safe to load.  Since
	   we have already loaded at least 1 VEC from rsi it is also
	   guaranteed to be safe.  */

	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx

# ifdef USE_AS_STRNCMP
	leal	VEC_SIZE(%OFFSET_REG64), %eax
	cmpq	%rax, %rdx
	jbe	L(check_ret_vec_page_cross2)
	addq	%rdi, %rdx
# endif
	incl	%ecx
	jz	L(prepare_loop_no_len)

	.p2align 4,, 4
L(ret_vec_page_cross):
# ifndef USE_AS_STRNCMP
L(check_ret_vec_page_cross):
# endif
	tzcntl	%ecx, %ecx
	addl	%OFFSET_REG, %ecx
L(ret_vec_page_cross_cont):
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	(%rsi, %rcx), %edx
	je	L(ret12)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret12):
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 10
L(check_ret_vec_page_cross2):
	incl	%ecx
L(check_ret_vec_page_cross):
	tzcntl	%ecx, %ecx
	addl	%OFFSET_REG, %ecx
	cmpq	%rcx, %rdx
	ja	L(ret_vec_page_cross_cont)
	.p2align 4,, 2
L(ret_zero_page_cross):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 4
L(page_cross_s2):
	/* Ensure this is a true page cross.  */
	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
	jbe	L(no_page_cross)


	movl	%ecx, %eax
	movq	%rdi, %rcx
	movq	%rsi, %rdi
	movq	%rcx, %rsi

	/* Set r8 to negate the return value, as rdi and rsi were
	   swapped.  */
# ifdef USE_AS_WCSCMP
	movl	$-4, %r8d
# else
	movl	$-1, %r8d
# endif
	xorl	%OFFSET_REG, %OFFSET_REG

	/* Check if more than 1x VEC till page cross.  */
	subl	$(VEC_SIZE * 3), %eax
	jle	L(page_cross_loop)

	.p2align 4,, 6
L(less_1x_vec_till_page):
	/* Find largest load size we can use.  */
	cmpl	$16, %eax
	ja	L(less_16_till_page)

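	/* Only 16 bytes are loaded below, so only the low 16 mask bits are
	   meaningful (the VEX-encoded xmm operations zero the upper ymm
	   lanes, so the upper mask bits are 0).  `incw %cx` therefore sets
	   ZF only when all 16 of those bits are 1, i.e. 16 bytes equal and
	   no null.  The 8-byte and 4-byte paths below use `incb %cl` and
	   `subl $0xf, %ecx` in the same way.  */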
	VMOVU	(%rdi), %xmm0
	CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
	vpandn	%xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incw	%cx
	jnz	L(check_ret_vec_page_cross)
	movl	$16, %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
	subl	%eax, %OFFSET_REG
# else
	/* Explicit check for 16 byte alignment.  */
	subl	%eax, %OFFSET_REG
	jz	L(prepare_loop)
# endif

	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
	CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
	vpandn	%xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incw	%cx
	jnz	L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	addl	$16, %OFFSET_REG
	subq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
	subq	$-(VEC_SIZE * 4), %rdx

	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# else
	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# endif
	jmp	L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_page_cross_slow_case0):
	xorl	%eax, %eax
	ret
# endif


	.p2align 4,, 10
L(less_16_till_page):
	/* Find largest load size we can use.  */
	cmpl	$24, %eax
	ja	L(less_8_till_page)

	vmovq	(%rdi), %xmm0
	vmovq	(%rsi), %xmm1
	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn	%xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incb	%cl
	jnz	L(check_ret_vec_page_cross)


# ifdef USE_AS_STRNCMP
	cmpq	$8, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
# endif
	movl	$24, %OFFSET_REG
	/* Explicit check for 16 byte alignment.  */
	subl	%eax, %OFFSET_REG



	vmovq	(%rdi, %OFFSET_REG64), %xmm0
	vmovq	(%rsi, %OFFSET_REG64), %xmm1
	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn	%xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incb	%cl
	jnz	L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	addl	$8, %OFFSET_REG
	subq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
	subq	$-(VEC_SIZE * 4), %rdx

	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# else
	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# endif
	jmp	L(prepare_loop_aligned)


	.p2align 4,, 10
L(less_8_till_page):
# ifdef USE_AS_WCSCMP
	/* If using wchar then this is the only check before we reach
	   the page boundary.  */
	movl	(%rdi), %eax
	movl	(%rsi), %ecx
	cmpl	%ecx, %eax
	jnz	L(ret_less_8_wcs)
#  ifdef USE_AS_STRNCMP
	addq	%rdi, %rdx
	/* We already checked for len <= 1 so cannot hit that case
	   here.  */
#  endif
	testl	%eax, %eax
	jnz	L(prepare_loop_no_len)
	ret

	.p2align 4,, 8
L(ret_less_8_wcs):
	setl	%OFFSET_REG8
	negl	%OFFSET_REG
	movl	%OFFSET_REG, %eax
	xorl	%r8d, %eax
	ret

# else

	/* Find largest load size we can use.  */
	cmpl	$28, %eax
	ja	L(less_4_till_page)

	vmovd	(%rdi), %xmm0
	vmovd	(%rsi), %xmm1
	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn	%xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	subl	$0xf, %ecx
	jnz	L(check_ret_vec_page_cross)

#  ifdef USE_AS_STRNCMP
	cmpq	$4, %rdx
	jbe	L(ret_zero_page_cross_slow_case1)
#  endif
	movl	$28, %OFFSET_REG
	/* Explicit check for 16 byte alignment.  */
	subl	%eax, %OFFSET_REG



	vmovd	(%rdi, %OFFSET_REG64), %xmm0
	vmovd	(%rsi, %OFFSET_REG64), %xmm1
	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn	%xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	subl	$0xf, %ecx
	jnz	L(check_ret_vec_page_cross)

#  ifdef USE_AS_STRNCMP
	addl	$4, %OFFSET_REG
	subq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case1)
	subq	$-(VEC_SIZE * 4), %rdx

	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
#  else
	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
#  endif
	jmp	L(prepare_loop_aligned)

#  ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_page_cross_slow_case1):
	xorl	%eax, %eax
	ret
#  endif

	.p2align 4,, 10
L(less_4_till_page):
	subq	%rdi, %rsi
	/* Extremely slow byte comparison loop.  */
L(less_4_loop):
	movzbl	(%rdi), %eax
	movzbl	(%rsi, %rdi), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
	subl	%BYTE_LOOP_REG, %eax
	jnz	L(ret_less_4_loop)
	testl	%ecx, %ecx
	jz	L(ret_zero_4_loop)
#  ifdef USE_AS_STRNCMP
	decq	%rdx
	jz	L(ret_zero_4_loop)
#  endif
	incq	%rdi
	/* End condition is reaching the page boundary (rdi is
	   aligned).  */
	testl	$31, %edi
	jnz	L(less_4_loop)
	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
	addq	$-(VEC_SIZE * 4), %rdi
#  ifdef USE_AS_STRNCMP
	subq	$-(VEC_SIZE * 4), %rdx
#  endif
	jmp	L(prepare_loop_aligned)

L(ret_zero_4_loop):
	xorl	%eax, %eax
	ret
L(ret_less_4_loop):
	xorl	%r8d, %eax
	subl	%r8d, %eax
	ret
# endif
	cfi_endproc
	.size	STRCMP, .-STRCMP
#endif
